new-site/scripts/workers/amb_location_scraper.py
justin f8cd37ac8c Initial commit — Performance West telecom compliance platform
Includes: API (Express/TypeScript), Astro site, Python workers,
document generators, FCC compliance tools, Canada CRTC formation,
Ansible infrastructure, and deployment scripts.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-27 06:54:22 -05:00

519 lines
22 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
Anytime Mailbox Location Scraper (BC + ON)
Scrapes all BC and Ontario virtual mailbox locations from anytimemailbox.com,
extracts pricing, and upserts into the amb_locations PG table.
Deactivates sold-out locations. Detects price changes and sends admin alert.
Schedule: daily via cron (0 6 * * *)
Usage: python3 scripts/workers/amb_location_scraper.py
"""
import asyncio
import json
import logging
import os
import re
import smtplib
import sys
from datetime import datetime, timezone
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText
from typing import Optional
import psycopg2
from playwright.async_api import async_playwright
LOG = logging.getLogger("workers.amb_scraper")
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(name)s] %(levelname)s %(message)s")
DATABASE_URL = os.getenv("DATABASE_URL", "postgresql://pw:pw@localhost:5432/performancewest")
DOMAIN = os.getenv("DOMAIN", "performancewest.net")
SMTP_HOST = os.getenv("SMTP_HOST", "co.carrierone.com")
SMTP_PORT = int(os.getenv("SMTP_PORT", "587"))
SMTP_USER = os.getenv("SMTP_USER", "noreply@performancewest.net")
SMTP_PASS = os.getenv("SMTP_PASS", "")
SMTP_FROM = os.getenv("SMTP_FROM", "Performance West <noreply@performancewest.net>")
ADMIN_EMAIL = os.getenv("ADMIN_EMAIL", "ops@performancewest.net")
AMB_PROVINCE_URLS = {
"BC": "https://www.anytimemailbox.com/l/canada/british-columbia",
"ON": "https://www.anytimemailbox.com/l/canada/ontario",
}
def slugify(text: str) -> str:
    """Build a lowercase, hyphen-separated URL slug from *text*.

    Punctuation (including underscores) is dropped, runs of whitespace
    become single hyphens, and leading/trailing hyphens are trimmed.
    """
    cleaned = re.sub(r"[^a-z0-9\s-]", "", text.lower().strip())
    hyphenated = re.sub(r"[\s_]+", "-", cleaned)
    return re.sub(r"-+", "-", hyphenated).strip("-")
def parse_price_text(text: str) -> int:
    """Extract dollar amount from text like '$9.99/mo' or '$99/yr'. Returns cents.

    Returns 0 when no `$amount` pattern is present. Thousands separators
    are allowed ("$1,234.56").
    """
    match = re.search(r"\$\s?([\d,]+(?:\.\d{1,2})?)", text)
    if not match:
        return 0
    # round() instead of int(): float truncation loses a cent for values
    # like 8.29, where int(8.29 * 100) == 828.
    return round(float(match.group(1).replace(",", "")) * 100)
async def scrape_province_locations(province: str) -> list[dict]:
    """Scrape all Anytime Mailbox locations for a given province.

    Launches a headless Chromium, loads the province listing page, collects
    unique /s/... location URLs (with a selector-based fallback), then visits
    each one with _scrape_single_location.

    Args:
        province: key into AMB_PROVINCE_URLS ("BC" or "ON").

    Returns:
        List of location dicts from _scrape_single_location; empty when the
        province is unconfigured or nothing could be scraped.
    """
    url_page = AMB_PROVINCE_URLS.get(province)
    if not url_page:
        LOG.error("No AMB URL configured for province: %s", province)
        return []
    locations = []
    async with async_playwright() as pw:
        browser = await pw.chromium.launch(headless=True)
        page = await browser.new_page()
        LOG.info("[%s] Navigating to AMB page: %s", province, url_page)
        await page.goto(url_page, wait_until="domcontentloaded", timeout=60000)
        # Fixed settle delay — the listing is populated by client-side JS.
        await page.wait_for_timeout(3000)
        # AMB uses /s/city-address URLs for individual locations.
        location_urls_raw = await page.evaluate("""() => {
            const links = document.querySelectorAll('a[href]');
            return [...links]
                .map(a => a.href)
                .filter(h => h.includes('/s/') && !h.includes('#'));
        }""")
        # dict.fromkeys dedupes while preserving first-seen order.
        location_urls_raw = list(dict.fromkeys(location_urls_raw))  # dedupe
        LOG.info("[%s] Found %d raw /s/ location URLs", province, len(location_urls_raw))
        # Fallback: card-based approach when the plain anchor scan finds nothing
        # (AMB has changed its listing markup before).
        if not location_urls_raw:
            card_selectors = [
                ".location-card",
                "[data-testid='location-card']",
                ".LocationCard",
                f"a[href*='/l/canada/']",
                ".search-results-list a",
                "article a[href*='anytimemailbox.com/l/']",
            ]
            for sel in card_selectors:
                cards = await page.query_selector_all(sel)
                if cards:
                    LOG.info("[%s] Fallback: found %d cards with selector: %s", province, len(cards), sel)
                    for card in cards:
                        # The card itself may be the link, or it may wrap an
                        # inner /s/ anchor — prefer the inner one.
                        href = await card.get_attribute("href") or ""
                        inner_a = await card.query_selector("a[href*='/s/']")
                        if inner_a:
                            href = await inner_a.get_attribute("href") or ""
                        if href and "/s/" in href:
                            if not href.startswith("http"):
                                href = f"https://www.anytimemailbox.com{href}"
                            location_urls_raw.append(href)
                    # First selector that matched anything wins.
                    break
        location_urls = list(dict.fromkeys(u for u in location_urls_raw if "/s/" in u))
        LOG.info("[%s] Found %d unique location URLs to scrape", province, len(location_urls))
        # Visit each location page to get address + pricing. One bad page
        # must not abort the whole province run.
        for url in location_urls:
            try:
                loc = await _scrape_single_location(page, url, province)
                if loc:
                    locations.append(loc)
            except Exception as e:
                LOG.warning("[%s] Failed to scrape %s: %s", province, url, e)
        await browser.close()
    LOG.info("[%s] Scraped %d locations total", province, len(locations))
    return locations
async def _scrape_single_location(page, url: str, province: str = "BC") -> Optional[dict]:
    """Scrape a single AMB location page for address, pricing, and operator name.

    AMB BC pages show prices in CAD. We store the CAD values and convert
    to USD at display time using the daily Bank of Canada rate.

    The gb-block-layout-column elements contain plan cards with text like:
        "BronzeC$ 14.99 / month SelectC$ 169.99 / year Select..."

    operator_name is the legal business name of the mailbox operator at this
    location (e.g. "Regus", "iPostal1", "The UPS Store"). It appears in the
    page <title>, h1, or a prominent heading before the address block.

    Args:
        page: an already-open Playwright page (reused across locations).
        url: the location's /s/... detail-page URL.
        province: two-letter province code stored on the returned record.

    Returns:
        A dict matching the amb_locations columns, or None when no street
        address could be extracted.
    """
    await page.goto(url, wait_until="networkidle", timeout=30000)
    await page.wait_for_timeout(2000)  # let client-side rendering settle
    # Full visible body text — used for address, postal-code and price fallbacks.
    page_text = await page.inner_text("body")
    # ── Operator name ────────────────────────────────────────────────────────
    # AMB location pages have the operator/business name as the primary heading.
    # Strategy (in priority order):
    #   1. <h1> tag — most reliable
    #   2. <title> before " - Anytime Mailbox" suffix
    #   3. The line immediately before the street address in page_text
    #      (AMB shows: "Regus\n329 Howe St\n...")
    operator_name: Optional[str] = None
    try:
        # 1. h1 element
        h1_el = await page.query_selector("h1")
        if h1_el:
            h1_text = (await h1_el.inner_text()).strip()
            # Exclude generic headings that are just the address
            # (addresses start with a street number).
            if h1_text and not re.match(r"^\d+\s+", h1_text) and len(h1_text) < 80:
                operator_name = h1_text
        # 2. Page title: "Regus | 329 Howe St, Vancouver, BC | Anytime Mailbox"
        if not operator_name:
            title = await page.title()
            title_parts = re.split(r"\s*[\|\-]\s*", title)
            for part in title_parts:
                part = part.strip()
                if (part
                        and "anytime mailbox" not in part.lower()
                        and not re.match(r"^\d+\s+", part)
                        and len(part) < 60):
                    operator_name = part
                    break
        # 3. Line immediately before the street number in page_text
        if not operator_name:
            lines_text = [l.strip() for l in page_text.splitlines() if l.strip()]
            for idx, line in enumerate(lines_text):
                if re.match(r"^\d+\s+[\w]", line) and idx > 0:
                    candidate = lines_text[idx - 1]
                    # Must look like a business name: not all-caps noise, not a
                    # postal code (A1A...), not a price, and reasonably short.
                    if (candidate
                            and len(candidate) < 80
                            and not re.match(r"^[A-Z]\d[A-Z]", candidate)
                            and not re.search(r"C\$|\$\d", candidate)
                            and candidate.lower() not in ("your name", "name", "address", "company")):
                        operator_name = candidate
                        break
    except Exception as e:
        # Operator name is best-effort; the record is still usable without it.
        LOG.warning("Could not extract operator_name from %s: %s", url, e)
    if operator_name:
        LOG.info("  operator_name: %s", operator_name)
    else:
        LOG.warning("  Could not determine operator_name for %s", url)
    # Extract street address from page text
    # AMB pages show: "YOUR NAME\n702 Russell Ave\nB438 Unit #MAILBOX\nVancouver, BC V5P 3V6"
    addr_match = re.search(r"(\d+\s+[\w\s]+?(?:St|Ave|Dr|Rd|Blvd|Way|Drive|Street|Avenue|Road|Highway|Hwy)[\w\s]*?)[\n#]", page_text, re.IGNORECASE)
    if addr_match:
        address = addr_match.group(1).strip()
    else:
        # Fallback: extract from URL slug
        # e.g. "vancouver-5307-victoria-drive" → "5307 Victoria Drive"
        url_tail = url.rstrip("/").split("/")[-1]
        # Remove city prefix: split on dashes, keep everything from the
        # first all-digit group (the street number) onward.
        parts = url_tail.split("-")
        addr_parts = []
        found_digit = False
        for part in parts:
            if re.match(r"^\d+$", part):
                found_digit = True
            if found_digit:
                addr_parts.append(part)
        if addr_parts:
            address = " ".join(addr_parts).title()
        else:
            # No address anywhere — skip this location entirely.
            LOG.warning("Could not extract address from %s", url)
            return None
    # City from URL: /s/vancouver-... or /s/kelowna-...
    url_slug = url.rstrip("/").split("/")[-1]
    city_match = re.match(r"([a-z]+)", url_slug)
    # NOTE(review): "Vancouver" default fires for non-lowercase slugs even on
    # ON pages — confirm this fallback is acceptable for Ontario locations.
    city = city_match.group(1).title() if city_match else "Vancouver"
    # Postal code (Canadian A1A 1A1 format, space optional)
    postal_match = re.search(r"\b([A-Z]\d[A-Z]\s?\d[A-Z]\d)\b", page_text)
    postal_code = postal_match.group(1) if postal_match else ""
    # Pricing: extract from plan cards (gb-block-layout-column or similar)
    # Formats seen:
    #   "BronzeC$ 14.99 / month SelectC$ 169.99 / year Select..."
    #   "The 2026 PlanC$ 16.00 / month Select..." (single plan, no yearly)
    plan_texts = await page.evaluate("""() => {
        const cols = document.querySelectorAll('.gb-block-layout-column, [class*="plan"], [class*="price"]');
        return [...cols].map(c => c.textContent.replace(/[\\s]+/g, ' ').trim()).filter(t => /C\\$/.test(t));
    }""")
    monthly_cad_cents = 0
    yearly_cad_cents = 0
    # Priority: Bronze (non-promo) → any named plan → first plan with C$
    plan_priority = ["bronze", "basic", "starter", "standard", "the 2026", "silver"]
    for target in plan_priority:
        for pt in plan_texts:
            if target in pt.lower() and "promo" not in pt.lower():
                mo_match = re.search(r"C\$\s?([\d,]+(?:\.\d{1,2})?)\s*/\s*month", pt, re.IGNORECASE)
                if mo_match:
                    # NOTE(review): int() truncates — int(8.29 * 100) == 828;
                    # consider round() here and below.
                    monthly_cad_cents = int(float(mo_match.group(1).replace(",", "")) * 100)
                yr_match = re.search(r"C\$\s?([\d,]+(?:\.\d{1,2})?)\s*/\s*year", pt, re.IGNORECASE)
                if yr_match:
                    yearly_cad_cents = int(float(yr_match.group(1).replace(",", "")) * 100)
                if monthly_cad_cents or yearly_cad_cents:
                    break
        if monthly_cad_cents or yearly_cad_cents:
            break
    # Last resort: grab the first C$ price from the page
    if not monthly_cad_cents and not yearly_cad_cents:
        all_prices = re.findall(r"C\$\s?([\d,]+(?:\.\d{1,2})?)\s*/\s*(month|year)", page_text, re.IGNORECASE)
        for amount_str, period in all_prices:
            cents = int(float(amount_str.replace(",", "")) * 100)
            if period.lower() in ("month",) and not monthly_cad_cents:
                monthly_cad_cents = cents
            elif period.lower() in ("year",) and not yearly_cad_cents:
                yearly_cad_cents = cents
    # No yearly plan advertised: approximate as 12x monthly.
    if not yearly_cad_cents and monthly_cad_cents:
        yearly_cad_cents = monthly_cad_cents * 12
    # Convert CAD to USD using a fixed approximate rate (scraper stores USD)
    # The FX rate is updated daily by the API's fx.ts module; here we use a
    # conservative estimate. The order form will show the exact USD at order time.
    CAD_TO_USD = float(os.getenv("CAD_TO_USD_RATE", "0.72"))
    monthly_usd_cents = int(monthly_cad_cents * CAD_TO_USD)
    yearly_usd_cents = int(yearly_cad_cents * CAD_TO_USD)
    # Check mailbox availability — look for sold out / no availability indicators
    available_units = -1  # -1 = unknown
    avail_text = page_text.lower()
    if any(kw in avail_text for kw in ["sold out", "no mailboxes available", "currently unavailable", "waitlist", "no units available"]):
        available_units = 0
        LOG.warning("  %s: SOLD OUT — no mailboxes available", address)
    else:
        # Try to click into signup flow to count available unit numbers
        try:
            for sel in ['button:has-text("Select")', 'a:has-text("Select")']:
                btn = await page.query_selector(sel)
                if btn and await btn.is_visible():
                    await btn.click()
                    break
            await page.wait_for_timeout(2000)
            # Look for mailbox number dropdown/select
            unit_count = await page.evaluate("""() => {
                const selects = document.querySelectorAll('select');
                for (const sel of selects) {
                    const opts = [...sel.options].filter(o => o.value && o.value !== '');
                    if (opts.length > 0) return opts.length;
                }
                // Check for radio buttons or list items
                const radios = document.querySelectorAll('input[type="radio"][name*="mailbox"], input[type="radio"][name*="unit"]');
                if (radios.length > 0) return radios.length;
                return -1;
            }""")
            available_units = unit_count if isinstance(unit_count, int) else -1
        except Exception:
            pass  # Keep as unknown — availability counting is best-effort
    slug = slugify(f"{address}-{city}")
    LOG.info("  %s: %s, %s — C$%.2f/yr (US$%.2f/yr), C$%.2f/mo, units=%s",
             slug, address, city,
             yearly_cad_cents / 100, yearly_usd_cents / 100,
             monthly_cad_cents / 100,
             "sold_out" if available_units == 0 else str(available_units) if available_units > 0 else "unknown")
    return {
        "slug": slug,
        "name": address,
        "full_address": f"{address}, {city}, {province} {postal_code}",
        "city": city,
        "province": province,
        "postal_code": postal_code,
        "provider_url": url,
        "plan_name": "Bronze",
        "monthly_price_usd": monthly_usd_cents,   # cents, not dollars
        "yearly_price_usd": yearly_usd_cents,     # cents, not dollars
        "available_units": available_units,       # -1 unknown, 0 sold out, >0 counted
        "operator_name": operator_name,           # may be None
    }
def upsert_locations(locations: list[dict], province: str = "BC") -> list[dict]:
    """Upsert scraped locations into the amb_locations PG table.

    Existing rows (matched by slug) are updated in place; unseen slugs are
    inserted. Rows for *province* that were NOT present in this scrape are
    deactivated (is_active = FALSE) so they drop out of the order form.

    Args:
        locations: location dicts from _scrape_single_location.
        province: scope for the deactivation sweep — only this province's
            active rows can be deactivated when missing from the scrape.

    Returns:
        List of change dicts (price changes and sold-out transitions) for
        send_price_change_alert.
    """
    conn = psycopg2.connect(DATABASE_URL)
    now = datetime.now(timezone.utc)
    changes: list[dict] = []
    try:
        with conn.cursor() as cur:
            for loc in locations:
                # Fetch existing row so we can detect price/availability changes
                cur.execute("SELECT yearly_price_usd, monthly_price_usd, is_active FROM amb_locations WHERE slug = %s", (loc["slug"],))
                existing = cur.fetchone()
                if existing:
                    old_yearly, old_monthly, was_active = existing
                    price_changed = (old_yearly != loc["yearly_price_usd"] or old_monthly != loc["monthly_price_usd"])
                    # If no units available, mark as inactive so it doesn't show in order form
                    units = loc.get("available_units", -1)
                    should_be_active = units != 0  # 0 = sold out, -1 = unknown (keep active), >0 = has units
                    cur.execute("""
                        UPDATE amb_locations SET
                            name = %s, full_address = %s, city = %s, province = %s,
                            postal_code = %s, provider_url = %s, plan_name = %s,
                            monthly_price_usd = %s, yearly_price_usd = %s,
                            available_units = %s,
                            is_active = %s, last_scraped_at = %s,
                            price_changed_at = CASE WHEN %s THEN %s ELSE price_changed_at END,
                            operator_name = COALESCE(%s, operator_name),
                            updated_at = %s
                        WHERE slug = %s
                    """, (
                        loc["name"], loc["full_address"], loc["city"], loc["province"],
                        loc["postal_code"], loc["provider_url"], loc["plan_name"],
                        loc["monthly_price_usd"], loc["yearly_price_usd"],
                        units, should_be_active, now,
                        price_changed, now,
                        loc.get("operator_name"),
                        now, loc["slug"],
                    ))
                    # Sold-out transition: report separately from price changes.
                    if not should_be_active and was_active:
                        changes.append({
                            "slug": loc["slug"],
                            "name": loc["name"],
                            "old_yearly": old_yearly,
                            "new_yearly": loc["yearly_price_usd"],
                            "old_monthly": old_monthly,
                            "new_monthly": loc["monthly_price_usd"],
                            "sold_out": True,
                        })
                        LOG.warning("SOLD OUT: %s — no mailboxes available, deactivated", loc["slug"])
                    if price_changed:
                        changes.append({
                            "slug": loc["slug"],
                            "name": loc["name"],
                            "old_yearly": old_yearly,
                            "new_yearly": loc["yearly_price_usd"],
                            "old_monthly": old_monthly,
                            "new_monthly": loc["monthly_price_usd"],
                        })
                        # NOTE(review): raises TypeError if old_yearly is NULL in
                        # the DB — confirm the column is NOT NULL.
                        LOG.warning("PRICE CHANGE: %s — yearly $%d → $%d", loc["slug"], old_yearly // 100, loc["yearly_price_usd"] // 100)
                else:
                    units = loc.get("available_units", -1)
                    should_be_active = units != 0
                    cur.execute("""
                        INSERT INTO amb_locations (slug, name, full_address, city, province, postal_code,
                            provider_url, plan_name, monthly_price_usd, yearly_price_usd,
                            available_units, is_active, operator_name, last_scraped_at, created_at, updated_at)
                        VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
                    """, (
                        loc["slug"], loc["name"], loc["full_address"], loc["city"], loc["province"],
                        loc["postal_code"], loc["provider_url"], loc["plan_name"],
                        loc["monthly_price_usd"], loc["yearly_price_usd"],
                        units, should_be_active, loc.get("operator_name"), now, now, now,
                    ))
                    # Fix: original format string was "%s%s, %s" which ran the
                    # slug and name together with no separator.
                    LOG.info("NEW LOCATION: %s (%s), %s — $%d/yr", loc["slug"], loc["name"], loc["city"], loc["yearly_price_usd"] // 100)
            # Mark locations not seen in this scrape as inactive (scoped to province)
            scraped_slugs = [loc["slug"] for loc in locations]
            if scraped_slugs:
                # psycopg2 adapts the Python list to a PG array for != ALL(...)
                cur.execute(
                    "UPDATE amb_locations SET is_active = FALSE, updated_at = %s "
                    "WHERE slug != ALL(%s) AND province = %s AND is_active = TRUE",
                    (now, scraped_slugs, province),
                )
                deactivated = cur.rowcount
                if deactivated:
                    LOG.warning("[%s] Deactivated %d locations not found in scrape", province, deactivated)
        conn.commit()
    finally:
        conn.close()
    return changes
def send_price_change_alert(changes: list[dict]):
    """Send an admin email summarizing price changes and sold-out locations.

    Best-effort: silently skipped when there are no changes or when SMTP
    credentials are not configured (SMTP_PASS empty); send failures are
    logged, never raised.
    """
    if not changes or not SMTP_PASS:
        return
    lines = []
    for c in changes:
        # Sold-out entries (flagged by upsert_locations) get their own wording.
        if c.get("sold_out"):
            lines.append(f"  SOLD OUT: {c['name']} ({c['slug']}) — no mailboxes available, location deactivated")
        else:
            lines.append(
                f"  {c['name']} ({c['slug']}): "
                f"yearly ${c['old_yearly'] // 100} → ${c['new_yearly'] // 100}, "
                f"monthly ${c['old_monthly'] // 100} → ${c['new_monthly'] // 100}"
            )
    body = (
        f"Anytime Mailbox price changes detected on {datetime.now(timezone.utc).strftime('%Y-%m-%d')}:\n\n"
        + "\n".join(lines)
        + "\n\nPlease review and update any affected pending orders."
        + f"\n\nhttps://{DOMAIN}/admin"
    )
    msg = MIMEMultipart()
    msg["From"] = SMTP_FROM
    msg["To"] = ADMIN_EMAIL
    msg["Subject"] = f"[PW Alert] Anytime Mailbox price change — {len(changes)} location(s)"
    msg.attach(MIMEText(body, "plain"))
    try:
        # STARTTLS upgrade before authenticating; second ehlo() re-announces
        # capabilities over the encrypted channel.
        with smtplib.SMTP(SMTP_HOST, SMTP_PORT, timeout=30) as server:
            server.ehlo()
            server.starttls()
            server.ehlo()
            server.login(SMTP_USER, SMTP_PASS)
            # Envelope sender is SMTP_USER; the display From header is SMTP_FROM.
            server.sendmail(SMTP_USER, [ADMIN_EMAIL], msg.as_string())
        LOG.info("Sent price change alert to %s", ADMIN_EMAIL)
    except Exception as e:
        # Alerting must never crash the scrape run.
        LOG.error("Failed to send price change alert: %s", e)
async def main():
    """Run the full pipeline: scrape each province, upsert, then alert."""
    collected_changes: list[dict] = []
    processed = 0
    for prov in AMB_PROVINCE_URLS:
        banner = "=" * 50
        LOG.info(banner)
        LOG.info("Starting AMB %s location scrape", prov)
        LOG.info(banner)
        province_locations = await scrape_province_locations(prov)
        if not province_locations:
            # An empty result usually means AMB changed their markup or URLs.
            LOG.error("[%s] No locations scraped — check if AMB site changed or province page moved", prov)
            continue
        province_changes = upsert_locations(province_locations, prov)
        collected_changes.extend(province_changes)
        processed += len(province_locations)
        LOG.info("[%s] Processed %d locations, %d changes", prov, len(province_locations), len(province_changes))
    if collected_changes:
        LOG.warning("%d total price/availability changes detected", len(collected_changes))
        send_price_change_alert(collected_changes)
    else:
        LOG.info("No price or availability changes detected")
    LOG.info("AMB scrape complete: %d locations across %d provinces", processed, len(AMB_PROVINCE_URLS))
if __name__ == "__main__":
    # Script entry point — run one full scrape/upsert/alert cycle (cron: daily).
    asyncio.run(main())