new-site/scripts/workers/amb_location_scraper.py
justin f8cd37ac8c Initial commit — Performance West telecom compliance platform
Includes: API (Express/TypeScript), Astro site, Python workers,
document generators, FCC compliance tools, Canada CRTC formation,
Ansible infrastructure, and deployment scripts.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-27 06:54:22 -05:00

519 lines
22 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
Anytime Mailbox Location Scraper (BC + ON)
Scrapes all BC and Ontario virtual mailbox locations from anytimemailbox.com,
extracts pricing, and upserts into the amb_locations PG table.
Deactivates sold-out locations. Detects price changes and sends admin alert.
Schedule: daily via cron (0 6 * * *)
Usage: python3 scripts/workers/amb_location_scraper.py
"""
import asyncio
import json
import logging
import os
import re
import smtplib
import sys
from datetime import datetime, timezone
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText
from typing import Optional
import psycopg2
from playwright.async_api import async_playwright
LOG = logging.getLogger("workers.amb_scraper")
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(name)s] %(levelname)s %(message)s")
DATABASE_URL = os.getenv("DATABASE_URL", "postgresql://pw:pw@localhost:5432/performancewest")
DOMAIN = os.getenv("DOMAIN", "performancewest.net")
SMTP_HOST = os.getenv("SMTP_HOST", "co.carrierone.com")
SMTP_PORT = int(os.getenv("SMTP_PORT", "587"))
SMTP_USER = os.getenv("SMTP_USER", "noreply@performancewest.net")
SMTP_PASS = os.getenv("SMTP_PASS", "")
SMTP_FROM = os.getenv("SMTP_FROM", "Performance West <noreply@performancewest.net>")
ADMIN_EMAIL = os.getenv("ADMIN_EMAIL", "ops@performancewest.net")
AMB_PROVINCE_URLS = {
"BC": "https://www.anytimemailbox.com/l/canada/british-columbia",
"ON": "https://www.anytimemailbox.com/l/canada/ontario",
}
def slugify(text: str) -> str:
    """Build a lowercase, hyphen-separated URL slug from *text*.

    Punctuation (including underscores) is dropped, runs of whitespace
    become single hyphens, and leading/trailing hyphens are trimmed.
    """
    cleaned = re.sub(r"[^a-z0-9\s-]", "", text.lower().strip())
    hyphenated = re.sub(r"[\s_]+", "-", cleaned)
    return re.sub(r"-+", "-", hyphenated).strip("-")
def parse_price_text(text: str) -> int:
    """Extract dollar amount from text like '$9.99/mo' or '$99/yr'. Returns cents.

    Returns 0 when no `$amount` pattern is present. Thousands separators
    are allowed ("$1,234.56").
    """
    match = re.search(r"\$\s?([\d,]+(?:\.\d{1,2})?)", text)
    if not match:
        return 0
    # round() instead of int(): float truncation loses a cent for values
    # like 8.29, where int(8.29 * 100) == 828.
    return round(float(match.group(1).replace(",", "")) * 100)
async def scrape_province_locations(province: str) -> list[dict]:
    """Scrape all Anytime Mailbox locations for a given province.

    Launches a headless Chromium, loads the province listing page, collects
    unique /s/... location URLs (with a selector-based fallback), then visits
    each one with _scrape_single_location.

    Args:
        province: key into AMB_PROVINCE_URLS ("BC" or "ON").

    Returns:
        List of location dicts from _scrape_single_location; empty when the
        province is unconfigured or nothing could be scraped.
    """
    url_page = AMB_PROVINCE_URLS.get(province)
    if not url_page:
        LOG.error("No AMB URL configured for province: %s", province)
        return []
    locations = []
    async with async_playwright() as pw:
        browser = await pw.chromium.launch(headless=True)
        page = await browser.new_page()
        LOG.info("[%s] Navigating to AMB page: %s", province, url_page)
        await page.goto(url_page, wait_until="domcontentloaded", timeout=60000)
        # Fixed settle delay — the listing is populated by client-side JS.
        await page.wait_for_timeout(3000)
        # AMB uses /s/city-address URLs for individual locations.
        location_urls_raw = await page.evaluate("""() => {
            const links = document.querySelectorAll('a[href]');
            return [...links]
                .map(a => a.href)
                .filter(h => h.includes('/s/') && !h.includes('#'));
        }""")
        # dict.fromkeys dedupes while preserving first-seen order.
        location_urls_raw = list(dict.fromkeys(location_urls_raw))  # dedupe
        LOG.info("[%s] Found %d raw /s/ location URLs", province, len(location_urls_raw))
        # Fallback: card-based approach when the plain anchor scan finds nothing
        # (AMB has changed its listing markup before).
        if not location_urls_raw:
            card_selectors = [
                ".location-card",
                "[data-testid='location-card']",
                ".LocationCard",
                f"a[href*='/l/canada/']",
                ".search-results-list a",
                "article a[href*='anytimemailbox.com/l/']",
            ]
            for sel in card_selectors:
                cards = await page.query_selector_all(sel)
                if cards:
                    LOG.info("[%s] Fallback: found %d cards with selector: %s", province, len(cards), sel)
                    for card in cards:
                        # The card itself may be the link, or it may wrap an
                        # inner /s/ anchor — prefer the inner one.
                        href = await card.get_attribute("href") or ""
                        inner_a = await card.query_selector("a[href*='/s/']")
                        if inner_a:
                            href = await inner_a.get_attribute("href") or ""
                        if href and "/s/" in href:
                            if not href.startswith("http"):
                                href = f"https://www.anytimemailbox.com{href}"
                            location_urls_raw.append(href)
                    # First selector that matched anything wins.
                    break
        location_urls = list(dict.fromkeys(u for u in location_urls_raw if "/s/" in u))
        LOG.info("[%s] Found %d unique location URLs to scrape", province, len(location_urls))
        # Visit each location page to get address + pricing. One bad page
        # must not abort the whole province run.
        for url in location_urls:
            try:
                loc = await _scrape_single_location(page, url, province)
                if loc:
                    locations.append(loc)
            except Exception as e:
                LOG.warning("[%s] Failed to scrape %s: %s", province, url, e)
        await browser.close()
    LOG.info("[%s] Scraped %d locations total", province, len(locations))
    return locations
async def _scrape_single_location(page, url: str, province: str = "BC") -> Optional[dict]:
    """Scrape a single AMB location page for address, pricing, and operator name.

    AMB BC pages show prices in CAD. We store the CAD values and convert
    to USD at display time using the daily Bank of Canada rate.

    The gb-block-layout-column elements contain plan cards with text like:
        "BronzeC$ 14.99 / month SelectC$ 169.99 / year Select..."

    operator_name is the legal business name of the mailbox operator at this
    location (e.g. "Regus", "iPostal1", "The UPS Store"). It appears in the
    page <title>, h1, or a prominent heading before the address block.

    Args:
        page: an already-open Playwright page (reused across locations).
        url: the location's /s/... detail-page URL.
        province: two-letter province code stored on the returned record.

    Returns:
        A dict matching the amb_locations columns, or None when no street
        address could be extracted.
    """
    await page.goto(url, wait_until="networkidle", timeout=30000)
    await page.wait_for_timeout(2000)  # let client-side rendering settle
    # Full visible body text — used for address, postal-code and price fallbacks.
    page_text = await page.inner_text("body")
    # ── Operator name ────────────────────────────────────────────────────────
    # AMB location pages have the operator/business name as the primary heading.
    # Strategy (in priority order):
    #   1. <h1> tag — most reliable
    #   2. <title> before " - Anytime Mailbox" suffix
    #   3. The line immediately before the street address in page_text
    #      (AMB shows: "Regus\n329 Howe St\n...")
    operator_name: Optional[str] = None
    try:
        # 1. h1 element
        h1_el = await page.query_selector("h1")
        if h1_el:
            h1_text = (await h1_el.inner_text()).strip()
            # Exclude generic headings that are just the address
            # (addresses start with a street number).
            if h1_text and not re.match(r"^\d+\s+", h1_text) and len(h1_text) < 80:
                operator_name = h1_text
        # 2. Page title: "Regus | 329 Howe St, Vancouver, BC | Anytime Mailbox"
        if not operator_name:
            title = await page.title()
            title_parts = re.split(r"\s*[\|\-]\s*", title)
            for part in title_parts:
                part = part.strip()
                if (part
                        and "anytime mailbox" not in part.lower()
                        and not re.match(r"^\d+\s+", part)
                        and len(part) < 60):
                    operator_name = part
                    break
        # 3. Line immediately before the street number in page_text
        if not operator_name:
            lines_text = [l.strip() for l in page_text.splitlines() if l.strip()]
            for idx, line in enumerate(lines_text):
                if re.match(r"^\d+\s+[\w]", line) and idx > 0:
                    candidate = lines_text[idx - 1]
                    # Must look like a business name: not all-caps noise, not a
                    # postal code (A1A...), not a price, and reasonably short.
                    if (candidate
                            and len(candidate) < 80
                            and not re.match(r"^[A-Z]\d[A-Z]", candidate)
                            and not re.search(r"C\$|\$\d", candidate)
                            and candidate.lower() not in ("your name", "name", "address", "company")):
                        operator_name = candidate
                        break
    except Exception as e:
        # Operator name is best-effort; the record is still usable without it.
        LOG.warning("Could not extract operator_name from %s: %s", url, e)
    if operator_name:
        LOG.info("  operator_name: %s", operator_name)
    else:
        LOG.warning("  Could not determine operator_name for %s", url)
    # Extract street address from page text
    # AMB pages show: "YOUR NAME\n702 Russell Ave\nB438 Unit #MAILBOX\nVancouver, BC V5P 3V6"
    addr_match = re.search(r"(\d+\s+[\w\s]+?(?:St|Ave|Dr|Rd|Blvd|Way|Drive|Street|Avenue|Road|Highway|Hwy)[\w\s]*?)[\n#]", page_text, re.IGNORECASE)
    if addr_match:
        address = addr_match.group(1).strip()
    else:
        # Fallback: extract from URL slug
        # e.g. "vancouver-5307-victoria-drive" → "5307 Victoria Drive"
        url_tail = url.rstrip("/").split("/")[-1]
        # Remove city prefix: split on dashes, keep everything from the
        # first all-digit group (the street number) onward.
        parts = url_tail.split("-")
        addr_parts = []
        found_digit = False
        for part in parts:
            if re.match(r"^\d+$", part):
                found_digit = True
            if found_digit:
                addr_parts.append(part)
        if addr_parts:
            address = " ".join(addr_parts).title()
        else:
            # No address anywhere — skip this location entirely.
            LOG.warning("Could not extract address from %s", url)
            return None
    # City from URL: /s/vancouver-... or /s/kelowna-...
    url_slug = url.rstrip("/").split("/")[-1]
    city_match = re.match(r"([a-z]+)", url_slug)
    # NOTE(review): "Vancouver" default fires for non-lowercase slugs even on
    # ON pages — confirm this fallback is acceptable for Ontario locations.
    city = city_match.group(1).title() if city_match else "Vancouver"
    # Postal code (Canadian A1A 1A1 format, space optional)
    postal_match = re.search(r"\b([A-Z]\d[A-Z]\s?\d[A-Z]\d)\b", page_text)
    postal_code = postal_match.group(1) if postal_match else ""
    # Pricing: extract from plan cards (gb-block-layout-column or similar)
    # Formats seen:
    #   "BronzeC$ 14.99 / month SelectC$ 169.99 / year Select..."
    #   "The 2026 PlanC$ 16.00 / month Select..." (single plan, no yearly)
    plan_texts = await page.evaluate("""() => {
        const cols = document.querySelectorAll('.gb-block-layout-column, [class*="plan"], [class*="price"]');
        return [...cols].map(c => c.textContent.replace(/[\\s]+/g, ' ').trim()).filter(t => /C\\$/.test(t));
    }""")
    monthly_cad_cents = 0
    yearly_cad_cents = 0
    # Priority: Bronze (non-promo) → any named plan → first plan with C$
    plan_priority = ["bronze", "basic", "starter", "standard", "the 2026", "silver"]
    for target in plan_priority:
        for pt in plan_texts:
            if target in pt.lower() and "promo" not in pt.lower():
                mo_match = re.search(r"C\$\s?([\d,]+(?:\.\d{1,2})?)\s*/\s*month", pt, re.IGNORECASE)
                if mo_match:
                    # NOTE(review): int() truncates — int(8.29 * 100) == 828;
                    # consider round() here and below.
                    monthly_cad_cents = int(float(mo_match.group(1).replace(",", "")) * 100)
                yr_match = re.search(r"C\$\s?([\d,]+(?:\.\d{1,2})?)\s*/\s*year", pt, re.IGNORECASE)
                if yr_match:
                    yearly_cad_cents = int(float(yr_match.group(1).replace(",", "")) * 100)
                if monthly_cad_cents or yearly_cad_cents:
                    break
        if monthly_cad_cents or yearly_cad_cents:
            break
    # Last resort: grab the first C$ price from the page
    if not monthly_cad_cents and not yearly_cad_cents:
        all_prices = re.findall(r"C\$\s?([\d,]+(?:\.\d{1,2})?)\s*/\s*(month|year)", page_text, re.IGNORECASE)
        for amount_str, period in all_prices:
            cents = int(float(amount_str.replace(",", "")) * 100)
            if period.lower() in ("month",) and not monthly_cad_cents:
                monthly_cad_cents = cents
            elif period.lower() in ("year",) and not yearly_cad_cents:
                yearly_cad_cents = cents
    # No yearly plan advertised: approximate as 12x monthly.
    if not yearly_cad_cents and monthly_cad_cents:
        yearly_cad_cents = monthly_cad_cents * 12
    # Convert CAD to USD using a fixed approximate rate (scraper stores USD)
    # The FX rate is updated daily by the API's fx.ts module; here we use a
    # conservative estimate. The order form will show the exact USD at order time.
    CAD_TO_USD = float(os.getenv("CAD_TO_USD_RATE", "0.72"))
    monthly_usd_cents = int(monthly_cad_cents * CAD_TO_USD)
    yearly_usd_cents = int(yearly_cad_cents * CAD_TO_USD)
    # Check mailbox availability — look for sold out / no availability indicators
    available_units = -1  # -1 = unknown
    avail_text = page_text.lower()
    if any(kw in avail_text for kw in ["sold out", "no mailboxes available", "currently unavailable", "waitlist", "no units available"]):
        available_units = 0
        LOG.warning("  %s: SOLD OUT — no mailboxes available", address)
    else:
        # Try to click into signup flow to count available unit numbers
        try:
            for sel in ['button:has-text("Select")', 'a:has-text("Select")']:
                btn = await page.query_selector(sel)
                if btn and await btn.is_visible():
                    await btn.click()
                    break
            await page.wait_for_timeout(2000)
            # Look for mailbox number dropdown/select
            unit_count = await page.evaluate("""() => {
                const selects = document.querySelectorAll('select');
                for (const sel of selects) {
                    const opts = [...sel.options].filter(o => o.value && o.value !== '');
                    if (opts.length > 0) return opts.length;
                }
                // Check for radio buttons or list items
                const radios = document.querySelectorAll('input[type="radio"][name*="mailbox"], input[type="radio"][name*="unit"]');
                if (radios.length > 0) return radios.length;
                return -1;
            }""")
            available_units = unit_count if isinstance(unit_count, int) else -1
        except Exception:
            pass  # Keep as unknown — availability counting is best-effort
    slug = slugify(f"{address}-{city}")
    LOG.info("  %s: %s, %s — C$%.2f/yr (US$%.2f/yr), C$%.2f/mo, units=%s",
             slug, address, city,
             yearly_cad_cents / 100, yearly_usd_cents / 100,
             monthly_cad_cents / 100,
             "sold_out" if available_units == 0 else str(available_units) if available_units > 0 else "unknown")
    return {
        "slug": slug,
        "name": address,
        "full_address": f"{address}, {city}, {province} {postal_code}",
        "city": city,
        "province": province,
        "postal_code": postal_code,
        "provider_url": url,
        "plan_name": "Bronze",
        "monthly_price_usd": monthly_usd_cents,   # cents, not dollars
        "yearly_price_usd": yearly_usd_cents,     # cents, not dollars
        "available_units": available_units,       # -1 unknown, 0 sold out, >0 counted
        "operator_name": operator_name,           # may be None
    }
def upsert_locations(locations: list[dict], province: str = "BC") -> list[dict]:
    """Upsert scraped locations into the amb_locations PG table.

    Existing rows (matched by slug) are updated in place; unseen slugs are
    inserted. Rows for *province* that were NOT present in this scrape are
    deactivated (is_active = FALSE) so they drop out of the order form.

    Args:
        locations: location dicts from _scrape_single_location.
        province: scope for the deactivation sweep — only this province's
            active rows can be deactivated when missing from the scrape.

    Returns:
        List of change dicts (price changes and sold-out transitions) for
        send_price_change_alert.
    """
    conn = psycopg2.connect(DATABASE_URL)
    now = datetime.now(timezone.utc)
    changes: list[dict] = []
    try:
        with conn.cursor() as cur:
            for loc in locations:
                # Fetch existing row so we can detect price/availability changes
                cur.execute("SELECT yearly_price_usd, monthly_price_usd, is_active FROM amb_locations WHERE slug = %s", (loc["slug"],))
                existing = cur.fetchone()
                if existing:
                    old_yearly, old_monthly, was_active = existing
                    price_changed = (old_yearly != loc["yearly_price_usd"] or old_monthly != loc["monthly_price_usd"])
                    # If no units available, mark as inactive so it doesn't show in order form
                    units = loc.get("available_units", -1)
                    should_be_active = units != 0  # 0 = sold out, -1 = unknown (keep active), >0 = has units
                    cur.execute("""
                        UPDATE amb_locations SET
                            name = %s, full_address = %s, city = %s, province = %s,
                            postal_code = %s, provider_url = %s, plan_name = %s,
                            monthly_price_usd = %s, yearly_price_usd = %s,
                            available_units = %s,
                            is_active = %s, last_scraped_at = %s,
                            price_changed_at = CASE WHEN %s THEN %s ELSE price_changed_at END,
                            operator_name = COALESCE(%s, operator_name),
                            updated_at = %s
                        WHERE slug = %s
                    """, (
                        loc["name"], loc["full_address"], loc["city"], loc["province"],
                        loc["postal_code"], loc["provider_url"], loc["plan_name"],
                        loc["monthly_price_usd"], loc["yearly_price_usd"],
                        units, should_be_active, now,
                        price_changed, now,
                        loc.get("operator_name"),
                        now, loc["slug"],
                    ))
                    # Sold-out transition: report separately from price changes.
                    if not should_be_active and was_active:
                        changes.append({
                            "slug": loc["slug"],
                            "name": loc["name"],
                            "old_yearly": old_yearly,
                            "new_yearly": loc["yearly_price_usd"],
                            "old_monthly": old_monthly,
                            "new_monthly": loc["monthly_price_usd"],
                            "sold_out": True,
                        })
                        LOG.warning("SOLD OUT: %s — no mailboxes available, deactivated", loc["slug"])
                    if price_changed:
                        changes.append({
                            "slug": loc["slug"],
                            "name": loc["name"],
                            "old_yearly": old_yearly,
                            "new_yearly": loc["yearly_price_usd"],
                            "old_monthly": old_monthly,
                            "new_monthly": loc["monthly_price_usd"],
                        })
                        # NOTE(review): raises TypeError if old_yearly is NULL in
                        # the DB — confirm the column is NOT NULL.
                        LOG.warning("PRICE CHANGE: %s — yearly $%d → $%d", loc["slug"], old_yearly // 100, loc["yearly_price_usd"] // 100)
                else:
                    units = loc.get("available_units", -1)
                    should_be_active = units != 0
                    cur.execute("""
                        INSERT INTO amb_locations (slug, name, full_address, city, province, postal_code,
                            provider_url, plan_name, monthly_price_usd, yearly_price_usd,
                            available_units, is_active, operator_name, last_scraped_at, created_at, updated_at)
                        VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
                    """, (
                        loc["slug"], loc["name"], loc["full_address"], loc["city"], loc["province"],
                        loc["postal_code"], loc["provider_url"], loc["plan_name"],
                        loc["monthly_price_usd"], loc["yearly_price_usd"],
                        units, should_be_active, loc.get("operator_name"), now, now, now,
                    ))
                    # Fix: original format string was "%s%s, %s" which ran the
                    # slug and name together with no separator.
                    LOG.info("NEW LOCATION: %s (%s), %s — $%d/yr", loc["slug"], loc["name"], loc["city"], loc["yearly_price_usd"] // 100)
            # Mark locations not seen in this scrape as inactive (scoped to province)
            scraped_slugs = [loc["slug"] for loc in locations]
            if scraped_slugs:
                # psycopg2 adapts the Python list to a PG array for != ALL(...)
                cur.execute(
                    "UPDATE amb_locations SET is_active = FALSE, updated_at = %s "
                    "WHERE slug != ALL(%s) AND province = %s AND is_active = TRUE",
                    (now, scraped_slugs, province),
                )
                deactivated = cur.rowcount
                if deactivated:
                    LOG.warning("[%s] Deactivated %d locations not found in scrape", province, deactivated)
        conn.commit()
    finally:
        conn.close()
    return changes
def send_price_change_alert(changes: list[dict]):
    """Send an admin email summarizing price changes and sold-out locations.

    Best-effort: silently skipped when there are no changes or when SMTP
    credentials are not configured (SMTP_PASS empty); send failures are
    logged, never raised.
    """
    if not changes or not SMTP_PASS:
        return
    lines = []
    for c in changes:
        # Sold-out entries (flagged by upsert_locations) get their own wording.
        if c.get("sold_out"):
            lines.append(f"  SOLD OUT: {c['name']} ({c['slug']}) — no mailboxes available, location deactivated")
        else:
            lines.append(
                f"  {c['name']} ({c['slug']}): "
                f"yearly ${c['old_yearly'] // 100} → ${c['new_yearly'] // 100}, "
                f"monthly ${c['old_monthly'] // 100} → ${c['new_monthly'] // 100}"
            )
    body = (
        f"Anytime Mailbox price changes detected on {datetime.now(timezone.utc).strftime('%Y-%m-%d')}:\n\n"
        + "\n".join(lines)
        + "\n\nPlease review and update any affected pending orders."
        + f"\n\nhttps://{DOMAIN}/admin"
    )
    msg = MIMEMultipart()
    msg["From"] = SMTP_FROM
    msg["To"] = ADMIN_EMAIL
    msg["Subject"] = f"[PW Alert] Anytime Mailbox price change — {len(changes)} location(s)"
    msg.attach(MIMEText(body, "plain"))
    try:
        # STARTTLS upgrade before authenticating; second ehlo() re-announces
        # capabilities over the encrypted channel.
        with smtplib.SMTP(SMTP_HOST, SMTP_PORT, timeout=30) as server:
            server.ehlo()
            server.starttls()
            server.ehlo()
            server.login(SMTP_USER, SMTP_PASS)
            # Envelope sender is SMTP_USER; the display From header is SMTP_FROM.
            server.sendmail(SMTP_USER, [ADMIN_EMAIL], msg.as_string())
        LOG.info("Sent price change alert to %s", ADMIN_EMAIL)
    except Exception as e:
        # Alerting must never crash the scrape run.
        LOG.error("Failed to send price change alert: %s", e)
async def main():
    """Run the full pipeline: scrape each province, upsert, then alert."""
    collected_changes: list[dict] = []
    processed = 0
    for prov in AMB_PROVINCE_URLS:
        banner = "=" * 50
        LOG.info(banner)
        LOG.info("Starting AMB %s location scrape", prov)
        LOG.info(banner)
        province_locations = await scrape_province_locations(prov)
        if not province_locations:
            # An empty result usually means AMB changed their markup or URLs.
            LOG.error("[%s] No locations scraped — check if AMB site changed or province page moved", prov)
            continue
        province_changes = upsert_locations(province_locations, prov)
        collected_changes.extend(province_changes)
        processed += len(province_locations)
        LOG.info("[%s] Processed %d locations, %d changes", prov, len(province_locations), len(province_changes))
    if collected_changes:
        LOG.warning("%d total price/availability changes detected", len(collected_changes))
        send_price_change_alert(collected_changes)
    else:
        LOG.info("No price or availability changes detected")
    LOG.info("AMB scrape complete: %d locations across %d provinces", processed, len(AMB_PROVINCE_URLS))
if __name__ == "__main__":
    # Script entry point — run one full scrape/upsert/alert cycle (cron: daily).
    asyncio.run(main())