Initial commit — Performance West telecom compliance platform

Includes: API (Express/TypeScript), Astro site, Python workers, document generators, FCC compliance tools, Canada CRTC formation, Ansible infrastructure, and deployment scripts. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-27 06:54:22 -05:00 · 2026-04-27 06:54:22 -05:00 · f8cd37ac8c
commit f8cd37ac8c
1823 changed files with 145167 additions and 0 deletions
--- a/scripts/workers/amb_location_scraper.py
+++ b/scripts/workers/amb_location_scraper.py
@ -0,0 +1,519 @@
+"""
+Anytime Mailbox Location Scraper (BC + ON)
+
+Scrapes all BC and Ontario virtual mailbox locations from anytimemailbox.com,
+extracts pricing, and upserts into the amb_locations PG table.
+
+Deactivates sold-out locations. Detects price changes and sends admin alert.
+
+Schedule: daily via cron (0 6 * * *)
+Usage:   python3 scripts/workers/amb_location_scraper.py
+"""
+
+import asyncio
+import json
+import logging
+import os
+import re
+import smtplib
+import sys
+from datetime import datetime, timezone
+from email.mime.multipart import MIMEMultipart
+from email.mime.text import MIMEText
+from typing import Optional
+
+import psycopg2
+from playwright.async_api import async_playwright
+
+# Module logger; basicConfig installs a root handler at import time so the
+# worker's output is timestamped when it is run as a script.
+LOG = logging.getLogger("workers.amb_scraper")
+logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(name)s] %(levelname)s %(message)s")
+
+# Connection string passed to psycopg2.connect() in upsert_locations().
+DATABASE_URL = os.getenv("DATABASE_URL", "postgresql://pw:pw@localhost:5432/performancewest")
+# Site domain; used to build the admin link in the alert email body.
+DOMAIN = os.getenv("DOMAIN", "performancewest.net")
+
+# SMTP settings used by send_price_change_alert(). An empty SMTP_PASS (the
+# default) makes that function return without sending anything.
+SMTP_HOST = os.getenv("SMTP_HOST", "co.carrierone.com")
+SMTP_PORT = int(os.getenv("SMTP_PORT", "587"))
+SMTP_USER = os.getenv("SMTP_USER", "noreply@performancewest.net")
+SMTP_PASS = os.getenv("SMTP_PASS", "")
+SMTP_FROM = os.getenv("SMTP_FROM", "Performance West <noreply@performancewest.net>")
+ADMIN_EMAIL = os.getenv("ADMIN_EMAIL", "ops@performancewest.net")
+
+# Province code -> AMB province listing page. main() iterates these keys, so
+# adding an entry here adds a province to the scrape.
+AMB_PROVINCE_URLS = {
+    "BC": "https://www.anytimemailbox.com/l/canada/british-columbia",
+    "ON": "https://www.anytimemailbox.com/l/canada/ontario",
+}
+
+
+def slugify(text: str) -> str:
+    """Return a lowercase, hyphen-separated slug built from *text*.
+
+    Everything except ASCII letters, digits, whitespace and hyphens is
+    dropped; whitespace runs become a single hyphen, repeated hyphens are
+    collapsed, and leading/trailing hyphens are removed.
+    """
+    cleaned = re.sub(r"[^a-z0-9\s-]", "", text.lower().strip())
+    hyphenated = re.sub(r"[\s_]+", "-", cleaned)
+    return re.sub(r"-+", "-", hyphenated).strip("-")
+
+
+def parse_price_text(text: str) -> int:
+    """Extract the first dollar amount from *text* and return it in cents.
+
+    Handles text such as '$9.99/mo', '$99/yr' or '$1,299.00'. Returns 0 when
+    no dollar amount is present.
+    """
+    match = re.search(r"\$\s?([\d,]+(?:\.\d{1,2})?)", text)
+    if not match:
+        return 0
+    digits = match.group(1).replace(",", "")
+    if not digits:
+        # The pattern can match a lone run of commas ("$,"); treat as no price.
+        return 0
+    # round() rather than int(): float("19.99") * 100 is 1998.9999999999998,
+    # which int() would truncate to 1998 and lose a cent.
+    return round(float(digits) * 100)
+
+
+async def scrape_province_locations(province: str) -> list[dict]:
+    """Scrape all Anytime Mailbox locations for a given province.
+
+    Loads the province listing page from AMB_PROVINCE_URLS, collects the
+    individual location URLs (those containing "/s/"), then visits each one
+    with _scrape_single_location().
+
+    Args:
+        province: Key into AMB_PROVINCE_URLS (e.g. "BC", "ON").
+
+    Returns:
+        One dict per successfully scraped location. Empty when the province
+        is not configured or no location could be scraped. A failure on a
+        single location page is logged and skipped; a failure loading the
+        province page itself propagates to the caller.
+    """
+    url_page = AMB_PROVINCE_URLS.get(province)
+    if not url_page:
+        LOG.error("No AMB URL configured for province: %s", province)
+        return []
+
+    locations = []
+
+    async with async_playwright() as pw:
+        browser = await pw.chromium.launch(headless=True)
+        # try/finally so the browser is closed even when navigation or
+        # evaluation raises (e.g. a timeout on the province page).
+        try:
+            page = await browser.new_page()
+
+            LOG.info("[%s] Navigating to AMB page: %s", province, url_page)
+            await page.goto(url_page, wait_until="domcontentloaded", timeout=60000)
+            # Fixed pause to give client-side rendering time to add the links.
+            await page.wait_for_timeout(3000)
+
+            # AMB uses /s/city-address URLs for individual locations.
+            location_urls_raw = await page.evaluate("""() => {
+            const links = document.querySelectorAll('a[href]');
+            return [...links]
+                .map(a => a.href)
+                .filter(h => h.includes('/s/') && !h.includes('#'));
+        }""")
+            location_urls_raw = list(dict.fromkeys(location_urls_raw))  # dedupe, keep order
+            LOG.info("[%s] Found %d raw /s/ location URLs", province, len(location_urls_raw))
+
+            # Fallback: card-based approach. The first selector that matches
+            # anything wins, even if none of its cards yields a /s/ link.
+            if not location_urls_raw:
+                card_selectors = [
+                    ".location-card",
+                    "[data-testid='location-card']",
+                    ".LocationCard",
+                    "a[href*='/l/canada/']",
+                    ".search-results-list a",
+                    "article a[href*='anytimemailbox.com/l/']",
+                ]
+                for sel in card_selectors:
+                    cards = await page.query_selector_all(sel)
+                    if cards:
+                        LOG.info("[%s] Fallback: found %d cards with selector: %s", province, len(cards), sel)
+                        for card in cards:
+                            href = await card.get_attribute("href") or ""
+                            # Prefer a nested /s/ link when the card is a container.
+                            inner_a = await card.query_selector("a[href*='/s/']")
+                            if inner_a:
+                                href = await inner_a.get_attribute("href") or ""
+                            if href and "/s/" in href:
+                                # get_attribute returns the raw attribute, which may be relative.
+                                if not href.startswith("http"):
+                                    href = f"https://www.anytimemailbox.com{href}"
+                                location_urls_raw.append(href)
+                        break
+
+            location_urls = list(dict.fromkeys(u for u in location_urls_raw if "/s/" in u))
+            LOG.info("[%s] Found %d unique location URLs to scrape", province, len(location_urls))
+
+            # Visit each location page to get address + pricing. The same page
+            # object is reused; one bad location must not abort the province.
+            for url in location_urls:
+                try:
+                    loc = await _scrape_single_location(page, url, province)
+                    if loc:
+                        locations.append(loc)
+                except Exception as e:
+                    LOG.warning("[%s] Failed to scrape %s: %s", province, url, e)
+        finally:
+            await browser.close()
+
+    LOG.info("[%s] Scraped %d locations total", province, len(locations))
+    return locations
+
+
+async def _scrape_single_location(page, url: str, province: str = "BC") -> Optional[dict]:
+    """Scrape a single AMB location page for address, pricing, availability and operator name.
+
+    Prices are read from the page as CAD ("C$ 14.99 / month") and converted
+    to USD cents here using the CAD_TO_USD_RATE environment variable
+    (default 0.72); the returned ``monthly_price_usd`` / ``yearly_price_usd``
+    values are integer USD cents.
+    The gb-block-layout-column elements contain plan cards with text like:
+      "BronzeC$ 14.99 / month SelectC$ 169.99 / year Select..."
+
+    operator_name is the business name of the mailbox operator at this
+    location (e.g. "Regus", "iPostal1", "The UPS Store"). It is looked for in
+    the h1, then the page <title>, then the line before the street address.
+
+    Args:
+        page: Playwright page object; it is navigated to *url* (and possibly
+            into the signup flow), so callers must not rely on its location.
+        url: Absolute URL of the location page.
+        province: Province code copied into the result and the full address.
+
+    Returns:
+        A dict with slug, name, full_address, city, province, postal_code,
+        provider_url, plan_name, monthly_price_usd, yearly_price_usd,
+        available_units (0 = sold out, -1 = unknown) and operator_name, or
+        None when no street address could be derived from the page or URL.
+    """
+
+    def _to_cents(amount: str) -> int:
+        # round(), not int(): float("19.99") * 100 == 1998.9999999999998 and
+        # truncating would store the price one cent low.
+        return round(float(amount.replace(",", "")) * 100)
+
+    await page.goto(url, wait_until="networkidle", timeout=30000)
+    await page.wait_for_timeout(2000)
+
+    # Extract address from the YOUR NAME / address block
+    page_text = await page.inner_text("body")
+
+    # ── Operator name ────────────────────────────────────────────────────────
+    # AMB location pages have the operator/business name as the primary heading.
+    # Strategy (in priority order):
+    #   1. <h1> tag — most reliable
+    #   2. <title> before " - Anytime Mailbox" suffix
+    #   3. The line immediately before the street address in page_text
+    #      (AMB shows: "Regus\n329 Howe St\n...")
+    operator_name: Optional[str] = None
+
+    try:
+        # 1. h1 element
+        h1_el = await page.query_selector("h1")
+        if h1_el:
+            h1_text = (await h1_el.inner_text()).strip()
+            # Exclude generic headings that are just the address
+            if h1_text and not re.match(r"^\d+\s+", h1_text) and len(h1_text) < 80:
+                operator_name = h1_text
+
+        # 2. Page title: "Regus | 329 Howe St, Vancouver, BC | Anytime Mailbox"
+        if not operator_name:
+            title = await page.title()
+            title_parts = re.split(r"\s*[\|\-–]\s*", title)
+            for part in title_parts:
+                part = part.strip()
+                if (part
+                    and "anytime mailbox" not in part.lower()
+                    and not re.match(r"^\d+\s+", part)
+                    and len(part) < 60):
+                    operator_name = part
+                    break
+
+        # 3. Line immediately before the street number in page_text
+        if not operator_name:
+            lines_text = [l.strip() for l in page_text.splitlines() if l.strip()]
+            for idx, line in enumerate(lines_text):
+                if re.match(r"^\d+\s+[\w]", line) and idx > 0:
+                    candidate = lines_text[idx - 1]
+                    # Must look like a business name: not a postal code, not
+                    # a price, not a form label, and reasonably short
+                    if (candidate
+                        and len(candidate) < 80
+                        and not re.match(r"^[A-Z]\d[A-Z]", candidate)
+                        and not re.search(r"C\$|\$\d", candidate)
+                        and candidate.lower() not in ("your name", "name", "address", "company")):
+                        operator_name = candidate
+                    # Only the first address-looking line is considered.
+                    break
+    except Exception as e:
+        # operator_name is optional; never fail the location because of it.
+        LOG.warning("Could not extract operator_name from %s: %s", url, e)
+
+    if operator_name:
+        LOG.info("  operator_name: %s", operator_name)
+    else:
+        LOG.warning("  Could not determine operator_name for %s", url)
+
+    # Extract street address from page text
+    # AMB pages show: "YOUR NAME\n702 Russell Ave\nB438 Unit #MAILBOX\nVancouver, BC V5P 3V6"
+    addr_match = re.search(r"(\d+\s+[\w\s]+?(?:St|Ave|Dr|Rd|Blvd|Way|Drive|Street|Avenue|Road|Highway|Hwy)[\w\s]*?)[\n#]", page_text, re.IGNORECASE)
+    if addr_match:
+        address = addr_match.group(1).strip()
+    else:
+        # Fallback: extract from URL slug
+        # e.g. "vancouver-5307-victoria-drive" → "5307 Victoria Drive"
+        url_tail = url.rstrip("/").split("/")[-1]
+        # Remove city prefix: split on dashes, keep everything from the first
+        # all-digit part onwards
+        parts = url_tail.split("-")
+        addr_parts = []
+        found_digit = False
+        for part in parts:
+            if re.match(r"^\d+$", part):
+                found_digit = True
+            if found_digit:
+                addr_parts.append(part)
+        if addr_parts:
+            address = " ".join(addr_parts).title()
+        else:
+            LOG.warning("Could not extract address from %s", url)
+            return None
+
+    # City from URL: /s/vancouver-... or /s/kelowna-...
+    # NOTE(review): only the first run of lowercase letters is taken, so a
+    # hyphenated city slug yields just its first word, and the fallback is
+    # "Vancouver" regardless of province. Left unchanged because the city is
+    # part of the stored slug.
+    url_slug = url.rstrip("/").split("/")[-1]
+    city_match = re.match(r"([a-z]+)", url_slug)
+    city = city_match.group(1).title() if city_match else "Vancouver"
+
+    # Postal code
+    postal_match = re.search(r"\b([A-Z]\d[A-Z]\s?\d[A-Z]\d)\b", page_text)
+    postal_code = postal_match.group(1) if postal_match else ""
+
+    # Pricing: extract from plan cards (gb-block-layout-column or similar)
+    # Formats seen:
+    #   "BronzeC$ 14.99 / month SelectC$ 169.99 / year Select..."
+    #   "The 2026 PlanC$ 16.00 / month Select..."  (single plan, no yearly)
+    plan_texts = await page.evaluate("""() => {
+        const cols = document.querySelectorAll('.gb-block-layout-column, [class*="plan"], [class*="price"]');
+        return [...cols].map(c => c.textContent.replace(/[\\s]+/g, ' ').trim()).filter(t => /C\\$/.test(t));
+    }""")
+
+    monthly_cad_cents = 0
+    yearly_cad_cents = 0
+
+    # Priority: Bronze (non-promo) → other known plan names, in this order
+    plan_priority = ["bronze", "basic", "starter", "standard", "the 2026", "silver"]
+
+    for target in plan_priority:
+        for pt in plan_texts:
+            if target in pt.lower() and "promo" not in pt.lower():
+                mo_match = re.search(r"C\$\s?([\d,]+(?:\.\d{1,2})?)\s*/\s*month", pt, re.IGNORECASE)
+                if mo_match:
+                    monthly_cad_cents = _to_cents(mo_match.group(1))
+                yr_match = re.search(r"C\$\s?([\d,]+(?:\.\d{1,2})?)\s*/\s*year", pt, re.IGNORECASE)
+                if yr_match:
+                    yearly_cad_cents = _to_cents(yr_match.group(1))
+                if monthly_cad_cents or yearly_cad_cents:
+                    break
+        if monthly_cad_cents or yearly_cad_cents:
+            break
+
+    # Last resort: grab the first C$ monthly and yearly price from the page
+    if not monthly_cad_cents and not yearly_cad_cents:
+        all_prices = re.findall(r"C\$\s?([\d,]+(?:\.\d{1,2})?)\s*/\s*(month|year)", page_text, re.IGNORECASE)
+        for amount_str, period in all_prices:
+            cents = _to_cents(amount_str)
+            if period.lower() in ("month",) and not monthly_cad_cents:
+                monthly_cad_cents = cents
+            elif period.lower() in ("year",) and not yearly_cad_cents:
+                yearly_cad_cents = cents
+
+    # No yearly plan advertised: derive it from twelve monthly payments.
+    if not yearly_cad_cents and monthly_cad_cents:
+        yearly_cad_cents = monthly_cad_cents * 12
+
+    # Convert CAD to USD using a fixed approximate rate (scraper stores USD)
+    # The FX rate is updated daily by the API's fx.ts module; here we use a
+    # conservative estimate. The order form will show the exact USD at order time.
+    CAD_TO_USD = float(os.getenv("CAD_TO_USD_RATE", "0.72"))
+    monthly_usd_cents = int(monthly_cad_cents * CAD_TO_USD)
+    yearly_usd_cents = int(yearly_cad_cents * CAD_TO_USD)
+
+    # Check mailbox availability — look for sold out / no availability indicators
+    available_units = -1  # -1 = unknown
+    avail_text = page_text.lower()
+    if any(kw in avail_text for kw in ["sold out", "no mailboxes available", "currently unavailable", "waitlist", "no units available"]):
+        available_units = 0
+        LOG.warning("  %s: SOLD OUT — no mailboxes available", address)
+    else:
+        # Try to click into signup flow to count available unit numbers
+        try:
+            for sel in ['button:has-text("Select")', 'a:has-text("Select")']:
+                btn = await page.query_selector(sel)
+                if btn and await btn.is_visible():
+                    await btn.click()
+                    break
+            await page.wait_for_timeout(2000)
+
+            # Look for mailbox number dropdown/select
+            unit_count = await page.evaluate("""() => {
+                const selects = document.querySelectorAll('select');
+                for (const sel of selects) {
+                    const opts = [...sel.options].filter(o => o.value && o.value !== '');
+                    if (opts.length > 0) return opts.length;
+                }
+                // Check for radio buttons or list items
+                const radios = document.querySelectorAll('input[type="radio"][name*="mailbox"], input[type="radio"][name*="unit"]');
+                if (radios.length > 0) return radios.length;
+                return -1;
+            }""")
+            available_units = unit_count if isinstance(unit_count, int) else -1
+        except Exception as e:
+            # Best effort: keep availability as unknown, but leave a trace.
+            LOG.debug("  Could not count available units for %s: %s", url, e)
+
+    slug = slugify(f"{address}-{city}")
+
+    LOG.info("  %s: %s, %s — C$%.2f/yr (US$%.2f/yr), C$%.2f/mo, units=%s",
+             slug, address, city,
+             yearly_cad_cents / 100, yearly_usd_cents / 100,
+             monthly_cad_cents / 100,
+             "sold_out" if available_units == 0 else str(available_units) if available_units > 0 else "unknown")
+
+    return {
+        "slug": slug,
+        "name": address,
+        "full_address": f"{address}, {city}, {province} {postal_code}",
+        "city": city,
+        "province": province,
+        "postal_code": postal_code,
+        "provider_url": url,
+        # NOTE(review): always "Bronze", even when the price came from another
+        # plan in plan_priority or from the last-resort page scan.
+        "plan_name": "Bronze",
+        "monthly_price_usd": monthly_usd_cents,
+        "yearly_price_usd": yearly_usd_cents,
+        "available_units": available_units,
+        "operator_name": operator_name,
+    }
+
+
+def upsert_locations(locations: list[dict], province: str = "BC") -> list[dict]:
+    """Upsert locations into PG. Returns list of price/availability changes.
+
+    Each location is matched on ``slug``: existing rows are updated, unknown
+    slugs are inserted. A location whose ``available_units`` is 0 is stored
+    as inactive. Afterwards, active rows of *province* whose slug was not in
+    this scrape are deactivated (skipped when *locations* is empty).
+
+    When a scrape yields no price at all (both monthly and yearly are 0) for
+    an existing row, the stored prices are kept instead of being overwritten
+    with 0 and no price change is reported.
+
+    Args:
+        locations: Dicts as returned by _scrape_single_location().
+        province: Province code used to scope the deactivation step.
+
+    Returns:
+        One dict per detected change with slug, name, old/new yearly and
+        monthly prices (USD cents); sold-out entries also carry
+        ``"sold_out": True``. A location can appear twice (sold out and
+        price changed).
+    """
+    conn = psycopg2.connect(DATABASE_URL)
+    now = datetime.now(timezone.utc)
+    changes = []
+
+    try:
+        with conn.cursor() as cur:
+            for loc in locations:
+                # 0 = sold out, -1 = unknown (keep active), >0 = has units
+                units = loc.get("available_units", -1)
+                should_be_active = units != 0
+                new_monthly = loc["monthly_price_usd"]
+                new_yearly = loc["yearly_price_usd"]
+
+                # Check existing
+                cur.execute("SELECT yearly_price_usd, monthly_price_usd, is_active FROM amb_locations WHERE slug = %s", (loc["slug"],))
+                existing = cur.fetchone()
+
+                if existing:
+                    old_yearly, old_monthly, was_active = existing
+
+                    if not new_monthly and not new_yearly:
+                        # The scraper found no price on the page (e.g. layout
+                        # change or sold-out page). Do not wipe good prices
+                        # with $0 or raise a bogus price-change alert.
+                        LOG.warning("NO PRICE SCRAPED: %s — keeping previously stored prices", loc["slug"])
+                        new_monthly, new_yearly = old_monthly, old_yearly
+
+                    price_changed = (old_yearly != new_yearly or old_monthly != new_monthly)
+
+                    cur.execute("""
+                        UPDATE amb_locations SET
+                            name = %s, full_address = %s, city = %s, province = %s,
+                            postal_code = %s, provider_url = %s, plan_name = %s,
+                            monthly_price_usd = %s, yearly_price_usd = %s,
+                            available_units = %s,
+                            is_active = %s, last_scraped_at = %s,
+                            price_changed_at = CASE WHEN %s THEN %s ELSE price_changed_at END,
+                            operator_name = COALESCE(%s, operator_name),
+                            updated_at = %s
+                        WHERE slug = %s
+                    """, (
+                        loc["name"], loc["full_address"], loc["city"], loc["province"],
+                        loc["postal_code"], loc["provider_url"], loc["plan_name"],
+                        new_monthly, new_yearly,
+                        units, should_be_active, now,
+                        price_changed, now,
+                        loc.get("operator_name"),
+                        now, loc["slug"],
+                    ))
+
+                    # If no units available, the row was just marked inactive
+                    # so it doesn't show in the order form; report the flip.
+                    if not should_be_active and was_active:
+                        changes.append({
+                            "slug": loc["slug"],
+                            "name": loc["name"],
+                            "old_yearly": old_yearly,
+                            "new_yearly": new_yearly,
+                            "old_monthly": old_monthly,
+                            "new_monthly": new_monthly,
+                            "sold_out": True,
+                        })
+                        LOG.warning("SOLD OUT: %s — no mailboxes available, deactivated", loc["slug"])
+
+                    if price_changed:
+                        changes.append({
+                            "slug": loc["slug"],
+                            "name": loc["name"],
+                            "old_yearly": old_yearly,
+                            "new_yearly": new_yearly,
+                            "old_monthly": old_monthly,
+                            "new_monthly": new_monthly,
+                        })
+                        # Show cents (a 1-cent change must be visible) and
+                        # tolerate NULL prices in older rows.
+                        LOG.warning("PRICE CHANGE: %s — yearly $%.2f → $%.2f", loc["slug"], (old_yearly or 0) / 100, (new_yearly or 0) / 100)
+                else:
+                    cur.execute("""
+                        INSERT INTO amb_locations (slug, name, full_address, city, province, postal_code,
+                            provider_url, plan_name, monthly_price_usd, yearly_price_usd,
+                            available_units, is_active, operator_name, last_scraped_at, created_at, updated_at)
+                        VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
+                    """, (
+                        loc["slug"], loc["name"], loc["full_address"], loc["city"], loc["province"],
+                        loc["postal_code"], loc["provider_url"], loc["plan_name"],
+                        new_monthly, new_yearly,
+                        units, should_be_active, loc.get("operator_name"), now, now, now,
+                    ))
+                    LOG.info("NEW LOCATION: %s — %s, %s — $%d/yr", loc["slug"], loc["name"], loc["city"], loc["yearly_price_usd"] // 100)
+
+            # Mark locations not seen in this scrape as inactive (scoped to province).
+            # Guarded so an empty scrape can never deactivate a whole province.
+            scraped_slugs = [loc["slug"] for loc in locations]
+            if scraped_slugs:
+                cur.execute(
+                    "UPDATE amb_locations SET is_active = FALSE, updated_at = %s "
+                    "WHERE slug != ALL(%s) AND province = %s AND is_active = TRUE",
+                    (now, scraped_slugs, province),
+                )
+                deactivated = cur.rowcount
+                if deactivated:
+                    LOG.warning("[%s] Deactivated %d locations not found in scrape", province, deactivated)
+
+            # Single commit: either the whole province is applied or nothing.
+            conn.commit()
+    finally:
+        conn.close()
+
+    return changes
+
+
+def send_price_change_alert(changes: list[dict]) -> None:
+    """Send admin email about price and availability changes.
+
+    Args:
+        changes: Change dicts as returned by upsert_locations(). Entries with
+            ``"sold_out": True`` are reported as sold out, all others as a
+            price change (amounts are USD cents).
+
+    Nothing is sent when *changes* is empty or SMTP_PASS is not configured.
+    SMTP failures are logged and swallowed so the scrape itself still
+    completes.
+    """
+    if not changes:
+        return
+    if not SMTP_PASS:
+        # Make the dropped alert visible instead of returning silently.
+        LOG.warning("SMTP_PASS not set — skipping alert email for %d change(s)", len(changes))
+        return
+
+    def _usd(cents) -> str:
+        # Keep the cents: with whole dollars a change from $143.98 to
+        # $143.99 would read "$143 → $143". None (NULL in the DB) is shown as n/a.
+        return "n/a" if cents is None else f"${cents / 100:.2f}"
+
+    lines = []
+    for c in changes:
+        if c.get("sold_out"):
+            lines.append(f"  SOLD OUT: {c['name']} ({c['slug']}) — no mailboxes available, location deactivated")
+        else:
+            lines.append(
+                f"  {c['name']} ({c['slug']}): "
+                f"yearly {_usd(c['old_yearly'])} → {_usd(c['new_yearly'])}, "
+                f"monthly {_usd(c['old_monthly'])} → {_usd(c['new_monthly'])}"
+            )
+
+    body = (
+        f"Anytime Mailbox price changes detected on {datetime.now(timezone.utc).strftime('%Y-%m-%d')}:\n\n"
+        + "\n".join(lines)
+        + "\n\nPlease review and update any affected pending orders."
+        + f"\n\nhttps://{DOMAIN}/admin"
+    )
+
+    msg = MIMEMultipart()
+    msg["From"] = SMTP_FROM
+    msg["To"] = ADMIN_EMAIL
+    msg["Subject"] = f"[PW Alert] Anytime Mailbox price change — {len(changes)} location(s)"
+    msg.attach(MIMEText(body, "plain"))
+
+    try:
+        with smtplib.SMTP(SMTP_HOST, SMTP_PORT, timeout=30) as server:
+            server.ehlo()
+            server.starttls()
+            # EHLO again: the server's capabilities are re-read after STARTTLS.
+            server.ehlo()
+            server.login(SMTP_USER, SMTP_PASS)
+            # Envelope sender is the authenticated user; the From header
+            # carries the display name from SMTP_FROM.
+            server.sendmail(SMTP_USER, [ADMIN_EMAIL], msg.as_string())
+        LOG.info("Sent price change alert to %s", ADMIN_EMAIL)
+    except Exception as e:
+        LOG.error("Failed to send price change alert: %s", e)
+
+
+async def main():
+    """Scrape every configured province, upsert the results and alert on changes.
+
+    Provinces are processed one after another. A province that fails (scrape
+    or database error) or yields no locations is logged and skipped, so the
+    remaining provinces still run and changes already committed for earlier
+    provinces are still emailed.
+    """
+    all_changes = []
+    total_locations = 0
+
+    for province in AMB_PROVINCE_URLS:
+        LOG.info("=" * 50)
+        LOG.info("Starting AMB %s location scrape", province)
+        LOG.info("=" * 50)
+
+        try:
+            locations = await scrape_province_locations(province)
+
+            if not locations:
+                LOG.error("[%s] No locations scraped — check if AMB site changed or province page moved", province)
+                continue
+
+            changes = upsert_locations(locations, province)
+        except Exception:
+            # Without this, one province's failure would abort the run and
+            # drop the alert for changes already written for other provinces.
+            LOG.exception("[%s] Scrape/upsert failed — continuing with remaining provinces", province)
+            continue
+
+        all_changes.extend(changes)
+        total_locations += len(locations)
+
+        LOG.info("[%s] Processed %d locations, %d changes", province, len(locations), len(changes))
+
+    if all_changes:
+        LOG.warning("%d total price/availability changes detected", len(all_changes))
+        send_price_change_alert(all_changes)
+    else:
+        LOG.info("No price or availability changes detected")
+
+    LOG.info("AMB scrape complete: %d locations across %d provinces", total_locations, len(AMB_PROVINCE_URLS))
+
+
+if __name__ == "__main__":
+    asyncio.run(main())