""" FCC Form 499 Filer Database Scraper Downloads the complete 499 filer registry from the FCC's public API and upserts into the local fcc_499_filers table. This gives us FRN → entity name lookups without hitting the FCC site in real time. Source: https://apps.fcc.gov/cgb/form499/499results.cfm Schedule: daily via cron (same as RMD scraper) Usage: python3 scripts/workers/fcc_499_filer_scraper.py """ from __future__ import annotations import logging import os import sys import xml.etree.ElementTree as ET from datetime import datetime, timezone import psycopg2 LOG = logging.getLogger("workers.fcc_499_scraper") logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(name)s] %(levelname)s %(message)s") DATABASE_URL = os.getenv("DATABASE_URL", "postgresql://pw:pw@localhost:5432/performancewest") FILER_DB_URL = "https://apps.fcc.gov/cgb/form499/499results.cfm" def fetch_filer_data() -> list[dict]: """Fetch the complete 499 filer database from FCC. Tries XML format first, then falls back to HTML scraping. """ import urllib.request filers = [] # Try XML API url = f"{FILER_DB_URL}?XML=TRUE" LOG.info("Fetching 499 filer database from %s", url) try: req = urllib.request.Request(url, headers={ "User-Agent": "PerformanceWest-ComplianceChecker/1.0", "Accept": "application/xml, text/xml", }) with urllib.request.urlopen(req, timeout=120) as resp: data = resp.read() LOG.info("Downloaded %d bytes", len(data)) # Parse XML root = ET.fromstring(data) # The XML structure may vary — try common patterns for filer_el in root.iter(): if filer_el.tag.lower() in ("filer", "record", "row"): filer = {} for child in filer_el: tag = child.tag.lower().replace(" ", "_").replace("-", "_") filer[tag] = (child.text or "").strip() if filer.get("legal_name") or filer.get("legalname") or filer.get("name"): filers.append(_normalize_filer(filer)) # If no records found with that approach, try treating entire XML as flat list if not filers: for child in root: filer = {} for field in child: tag = field.tag.lower().replace(" ", "_").replace("-", "_") filer[tag] = (field.text or "").strip() if any(v for v in filer.values()): filers.append(_normalize_filer(filer)) LOG.info("Parsed %d filers from XML", len(filers)) except Exception as e: LOG.warning("XML fetch/parse failed: %s — trying HTML fallback", e) # Fallback: fetch HTML search with broad query try: search_url = f"{FILER_DB_URL}?state=&serviceType=&legalName=&frn=&d-16544-p=1" req = urllib.request.Request(search_url, headers={ "User-Agent": "Mozilla/5.0 (compatible; PerformanceWest/1.0)", }) with urllib.request.urlopen(req, timeout=60) as resp: html = resp.read().decode("utf-8", errors="replace") # Parse table rows from HTML import re rows = re.findall( r"]*>\s*]*>(\d+)\s*]*>(\d*)\s*]*>(.*?)\s*]*>(.*?)\s*]*>(.*?)", html, re.DOTALL, ) for filer_id, frn, name, state, svc_type in rows: if name.strip(): filers.append({ "filer_id": filer_id.strip(), "frn": frn.strip() if frn.strip() else None, "legal_name": _clean_html(name.strip()), "trade_name": None, "state": state.strip()[:2] if state.strip() else None, "service_type": _clean_html(svc_type.strip()) if svc_type.strip() else None, "holding_company": None, "status": "Active", }) LOG.info("Parsed %d filers from HTML", len(filers)) except Exception as html_err: LOG.error("HTML fallback also failed: %s", html_err) return filers def _normalize_filer(raw: dict) -> dict: """Normalize field names from various XML/HTML formats.""" return { "filer_id": raw.get("filerid") or raw.get("filer_id") or raw.get("id") or "", "frn": raw.get("frn") or raw.get("coresid") or raw.get("fcc_registration_number") or None, "legal_name": raw.get("legal_name") or raw.get("legalname") or raw.get("name") or "", "trade_name": raw.get("trade_name") or raw.get("tradename") or raw.get("dba") or None, "state": (raw.get("state") or "")[:2] or None, "service_type": raw.get("service_type") or raw.get("servicetype") or raw.get("primary_service") or None, "holding_company": raw.get("holding_company") or raw.get("holdingcompany") or None, "status": raw.get("status") or "Active", } def _clean_html(text: str) -> str: """Remove HTML tags from text.""" import re return re.sub(r"<[^>]+>", "", text).strip() def upsert_filers(filers: list[dict]) -> int: """Upsert filers into the fcc_499_filers table. Returns count of upserted rows.""" if not filers: return 0 conn = psycopg2.connect(DATABASE_URL) now = datetime.now(timezone.utc) count = 0 try: with conn.cursor() as cur: for f in filers: if not f.get("legal_name"): continue cur.execute(""" INSERT INTO fcc_499_filers (filer_id, frn, legal_name, trade_name, state, service_type, holding_company, status, last_scraped_at, updated_at) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s) ON CONFLICT (filer_id) DO UPDATE SET frn = COALESCE(EXCLUDED.frn, fcc_499_filers.frn), legal_name = EXCLUDED.legal_name, trade_name = COALESCE(EXCLUDED.trade_name, fcc_499_filers.trade_name), state = COALESCE(EXCLUDED.state, fcc_499_filers.state), service_type = COALESCE(EXCLUDED.service_type, fcc_499_filers.service_type), holding_company = COALESCE(EXCLUDED.holding_company, fcc_499_filers.holding_company), status = EXCLUDED.status, last_scraped_at = %s, updated_at = %s """, ( f["filer_id"], f["frn"], f["legal_name"], f["trade_name"], f["state"], f["service_type"], f["holding_company"], f["status"], now, now, now, now, )) count += 1 conn.commit() finally: conn.close() return count def main(): LOG.info("=" * 50) LOG.info("FCC Form 499 Filer Database Scraper") LOG.info("=" * 50) filers = fetch_filer_data() if not filers: LOG.error("No filers fetched — check if FCC API changed") return count = upsert_filers(filers) LOG.info("Upserted %d filers into fcc_499_filers", count) # Summary conn = psycopg2.connect(DATABASE_URL) with conn.cursor() as cur: cur.execute("SELECT COUNT(*) FROM fcc_499_filers") total = cur.fetchone()[0] cur.execute("SELECT COUNT(*) FROM fcc_499_filers WHERE frn IS NOT NULL") with_frn = cur.fetchone()[0] conn.close() LOG.info("Total filers in DB: %d (%d with FRN)", total, with_frn) LOG.info("Scrape complete") if __name__ == "__main__": main()