new-site/scripts/workers/fcc_499_filer_scraper.py
justin f8cd37ac8c Initial commit — Performance West telecom compliance platform
Includes: API (Express/TypeScript), Astro site, Python workers,
document generators, FCC compliance tools, Canada CRTC formation,
Ansible infrastructure, and deployment scripts.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-27 06:54:22 -05:00

207 lines
7.7 KiB
Python

"""
FCC Form 499 Filer Database Scraper
Downloads the complete 499 filer registry from the FCC's public API and
upserts into the local fcc_499_filers table. This gives us FRN → entity
name lookups without hitting the FCC site in real time.
Source: https://apps.fcc.gov/cgb/form499/499results.cfm
Schedule: daily via cron (same as RMD scraper)
Usage: python3 scripts/workers/fcc_499_filer_scraper.py
"""
from __future__ import annotations
import logging
import os
import sys
import xml.etree.ElementTree as ET
from datetime import datetime, timezone
import psycopg2
LOG = logging.getLogger("workers.fcc_499_scraper")
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(name)s] %(levelname)s %(message)s")
DATABASE_URL = os.getenv("DATABASE_URL", "postgresql://pw:pw@localhost:5432/performancewest")
FILER_DB_URL = "https://apps.fcc.gov/cgb/form499/499results.cfm"
def fetch_filer_data() -> list[dict]:
    """Fetch the complete 499 filer database from the FCC.

    Tries the XML endpoint first; on any failure falls back to scraping
    the HTML search-results table with a broad (match-everything) query.

    Returns:
        List of normalized filer dicts (see _normalize_filer for the keys).
        Empty list when both strategies fail.
    """
    try:
        return _fetch_filers_xml()
    except Exception as e:
        LOG.warning("XML fetch/parse failed: %s — trying HTML fallback", e)
    try:
        return _fetch_filers_html()
    except Exception as html_err:
        LOG.error("HTML fallback also failed: %s", html_err)
        return []


def _canonical_tag(tag: str) -> str:
    """Lower-case an XML tag and fold spaces/hyphens into underscores."""
    return tag.lower().replace(" ", "_").replace("-", "_")


def _fetch_filers_xml() -> list[dict]:
    """Download and parse the XML form of the filer registry.

    The exact XML structure is not documented, so two shapes are tried:
    records tagged <filer>/<record>/<row> anywhere in the tree, then a
    flat list of children directly under the root.
    """
    import urllib.request

    url = f"{FILER_DB_URL}?XML=TRUE"
    LOG.info("Fetching 499 filer database from %s", url)
    req = urllib.request.Request(url, headers={
        "User-Agent": "PerformanceWest-ComplianceChecker/1.0",
        "Accept": "application/xml, text/xml",
    })
    with urllib.request.urlopen(req, timeout=120) as resp:
        data = resp.read()
    LOG.info("Downloaded %d bytes", len(data))
    root = ET.fromstring(data)

    filers: list[dict] = []
    # Shape 1: explicitly tagged record elements anywhere in the document.
    for filer_el in root.iter():
        if filer_el.tag.lower() in ("filer", "record", "row"):
            filer = {_canonical_tag(c.tag): (c.text or "").strip() for c in filer_el}
            if filer.get("legal_name") or filer.get("legalname") or filer.get("name"):
                filers.append(_normalize_filer(filer))
    # Shape 2: treat the document as a flat list of record children.
    if not filers:
        for child in root:
            filer = {_canonical_tag(f.tag): (f.text or "").strip() for f in child}
            if any(filer.values()):
                filers.append(_normalize_filer(filer))
    LOG.info("Parsed %d filers from XML", len(filers))
    return filers


def _fetch_filers_html() -> list[dict]:
    """Fallback: scrape the HTML search-results table.

    Expects rows of the form id | frn | name | state | service type; the
    regex is intentionally loose about attributes and whitespace.
    """
    import re
    import urllib.request

    search_url = f"{FILER_DB_URL}?state=&serviceType=&legalName=&frn=&d-16544-p=1"
    req = urllib.request.Request(search_url, headers={
        "User-Agent": "Mozilla/5.0 (compatible; PerformanceWest/1.0)",
    })
    with urllib.request.urlopen(req, timeout=60) as resp:
        html = resp.read().decode("utf-8", errors="replace")

    # Compiled once, outside the per-row loop.
    row_re = re.compile(
        r"<tr[^>]*>\s*<td[^>]*>(\d+)</td>\s*<td[^>]*>(\d*)</td>\s*<td[^>]*>(.*?)</td>\s*<td[^>]*>(.*?)</td>\s*<td[^>]*>(.*?)</td>",
        re.DOTALL,
    )
    filers: list[dict] = []
    for filer_id, frn, name, state, svc_type in row_re.findall(html):
        if not name.strip():
            continue
        # BUGFIX: strip markup from the state cell BEFORE truncating to two
        # chars — previously a tag like "<td>" could yield state "<t".
        state_txt = _clean_html(state)
        svc_txt = svc_type.strip()
        filers.append({
            "filer_id": filer_id.strip(),
            "frn": frn.strip() or None,
            "legal_name": _clean_html(name.strip()),
            "trade_name": None,
            "state": state_txt[:2] if state_txt else None,
            "service_type": _clean_html(svc_txt) if svc_txt else None,
            "holding_company": None,
            "status": "Active",
        })
    LOG.info("Parsed %d filers from HTML", len(filers))
    return filers
def _normalize_filer(raw: dict) -> dict:
"""Normalize field names from various XML/HTML formats."""
return {
"filer_id": raw.get("filerid") or raw.get("filer_id") or raw.get("id") or "",
"frn": raw.get("frn") or raw.get("coresid") or raw.get("fcc_registration_number") or None,
"legal_name": raw.get("legal_name") or raw.get("legalname") or raw.get("name") or "",
"trade_name": raw.get("trade_name") or raw.get("tradename") or raw.get("dba") or None,
"state": (raw.get("state") or "")[:2] or None,
"service_type": raw.get("service_type") or raw.get("servicetype") or raw.get("primary_service") or None,
"holding_company": raw.get("holding_company") or raw.get("holdingcompany") or None,
"status": raw.get("status") or "Active",
}
def _clean_html(text: str) -> str:
"""Remove HTML tags from text."""
import re
return re.sub(r"<[^>]+>", "", text).strip()
def upsert_filers(filers: list[dict]) -> int:
    """Upsert filers into the fcc_499_filers table.

    Rows missing a legal_name OR a filer_id are skipped: filer_id is the
    ON CONFLICT key, so an empty id would fold unrelated filers into one
    row (the original only guarded legal_name).

    Args:
        filers: normalized filer dicts as produced by fetch_filer_data().

    Returns:
        Number of rows actually written.
    """
    if not filers:
        return 0
    # Hoisted out of the loop; timestamps are carried once in the VALUES
    # tuple and reused via EXCLUDED in the conflict branch (the original
    # bound the same `now` twice more as extra parameters).
    sql = """
        INSERT INTO fcc_499_filers (filer_id, frn, legal_name, trade_name, state,
            service_type, holding_company, status, last_scraped_at, updated_at)
        VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
        ON CONFLICT (filer_id) DO UPDATE SET
            frn = COALESCE(EXCLUDED.frn, fcc_499_filers.frn),
            legal_name = EXCLUDED.legal_name,
            trade_name = COALESCE(EXCLUDED.trade_name, fcc_499_filers.trade_name),
            state = COALESCE(EXCLUDED.state, fcc_499_filers.state),
            service_type = COALESCE(EXCLUDED.service_type, fcc_499_filers.service_type),
            holding_company = COALESCE(EXCLUDED.holding_company, fcc_499_filers.holding_company),
            status = EXCLUDED.status,
            last_scraped_at = EXCLUDED.last_scraped_at,
            updated_at = EXCLUDED.updated_at
    """
    now = datetime.now(timezone.utc)
    count = 0
    conn = psycopg2.connect(DATABASE_URL)
    try:
        with conn.cursor() as cur:
            for f in filers:
                if not f.get("legal_name") or not f.get("filer_id"):
                    continue
                cur.execute(sql, (
                    f["filer_id"], f["frn"], f["legal_name"], f["trade_name"],
                    f["state"], f["service_type"], f["holding_company"],
                    f["status"], now, now,
                ))
                count += 1
        conn.commit()
    finally:
        # close() without commit() rolls back on error.
        conn.close()
    return count
def main():
    """Entry point: scrape the FCC 499 registry and sync it into Postgres."""
    LOG.info("=" * 50)
    LOG.info("FCC Form 499 Filer Database Scraper")
    LOG.info("=" * 50)
    filers = fetch_filer_data()
    if not filers:
        LOG.error("No filers fetched — check if FCC API changed")
        return
    count = upsert_filers(filers)
    LOG.info("Upserted %d filers into fcc_499_filers", count)
    # Post-run summary. try/finally added so the connection is closed even
    # when a query raises (the original leaked it on error).
    conn = psycopg2.connect(DATABASE_URL)
    try:
        with conn.cursor() as cur:
            cur.execute("SELECT COUNT(*) FROM fcc_499_filers")
            total = cur.fetchone()[0]
            cur.execute("SELECT COUNT(*) FROM fcc_499_filers WHERE frn IS NOT NULL")
            with_frn = cur.fetchone()[0]
    finally:
        conn.close()
    LOG.info("Total filers in DB: %d (%d with FRN)", total, with_frn)
    LOG.info("Scrape complete")
if __name__ == "__main__":
main()