new-site/scripts/workers/fcc_499_filer_scraper.py
justin f8cd37ac8c Initial commit — Performance West telecom compliance platform
Includes: API (Express/TypeScript), Astro site, Python workers,
document generators, FCC compliance tools, Canada CRTC formation,
Ansible infrastructure, and deployment scripts.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-27 06:54:22 -05:00

207 lines
7.7 KiB
Python

"""
FCC Form 499 Filer Database Scraper
Downloads the complete 499 filer registry from the FCC's public API and
upserts into the local fcc_499_filers table. This gives us FRN → entity
name lookups without hitting the FCC site in real time.
Source: https://apps.fcc.gov/cgb/form499/499results.cfm
Schedule: daily via cron (same as RMD scraper)
Usage: python3 scripts/workers/fcc_499_filer_scraper.py
"""
from __future__ import annotations
import logging
import os
import sys
import xml.etree.ElementTree as ET
from datetime import datetime, timezone
import psycopg2
LOG = logging.getLogger("workers.fcc_499_scraper")
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(name)s] %(levelname)s %(message)s")
DATABASE_URL = os.getenv("DATABASE_URL", "postgresql://pw:pw@localhost:5432/performancewest")
FILER_DB_URL = "https://apps.fcc.gov/cgb/form499/499results.cfm"
def fetch_filer_data() -> list[dict]:
    """Fetch the complete 499 filer database from the FCC.

    Tries the XML endpoint first; on any failure falls back to scraping
    the HTML search-results table with a broad (match-everything) query.

    Returns:
        List of normalized filer dicts (see _normalize_filer for the keys).
        Empty list when both strategies fail.
    """
    try:
        return _fetch_filers_xml()
    except Exception as e:
        LOG.warning("XML fetch/parse failed: %s — trying HTML fallback", e)
    try:
        return _fetch_filers_html()
    except Exception as html_err:
        LOG.error("HTML fallback also failed: %s", html_err)
        return []


def _canonical_tag(tag: str) -> str:
    """Lower-case an XML tag and fold spaces/hyphens into underscores."""
    return tag.lower().replace(" ", "_").replace("-", "_")


def _fetch_filers_xml() -> list[dict]:
    """Download and parse the XML form of the filer registry.

    The exact XML structure is not documented, so two shapes are tried:
    records tagged <filer>/<record>/<row> anywhere in the tree, then a
    flat list of children directly under the root.
    """
    import urllib.request

    url = f"{FILER_DB_URL}?XML=TRUE"
    LOG.info("Fetching 499 filer database from %s", url)
    req = urllib.request.Request(url, headers={
        "User-Agent": "PerformanceWest-ComplianceChecker/1.0",
        "Accept": "application/xml, text/xml",
    })
    with urllib.request.urlopen(req, timeout=120) as resp:
        data = resp.read()
    LOG.info("Downloaded %d bytes", len(data))
    root = ET.fromstring(data)

    filers: list[dict] = []
    # Shape 1: explicitly tagged record elements anywhere in the document.
    for filer_el in root.iter():
        if filer_el.tag.lower() in ("filer", "record", "row"):
            filer = {_canonical_tag(c.tag): (c.text or "").strip() for c in filer_el}
            if filer.get("legal_name") or filer.get("legalname") or filer.get("name"):
                filers.append(_normalize_filer(filer))
    # Shape 2: treat the document as a flat list of record children.
    if not filers:
        for child in root:
            filer = {_canonical_tag(f.tag): (f.text or "").strip() for f in child}
            if any(filer.values()):
                filers.append(_normalize_filer(filer))
    LOG.info("Parsed %d filers from XML", len(filers))
    return filers


def _fetch_filers_html() -> list[dict]:
    """Fallback: scrape the HTML search-results table.

    Expects rows of the form id | frn | name | state | service type; the
    regex is intentionally loose about attributes and whitespace.
    """
    import re
    import urllib.request

    search_url = f"{FILER_DB_URL}?state=&serviceType=&legalName=&frn=&d-16544-p=1"
    req = urllib.request.Request(search_url, headers={
        "User-Agent": "Mozilla/5.0 (compatible; PerformanceWest/1.0)",
    })
    with urllib.request.urlopen(req, timeout=60) as resp:
        html = resp.read().decode("utf-8", errors="replace")

    # Compiled once, outside the per-row loop.
    row_re = re.compile(
        r"<tr[^>]*>\s*<td[^>]*>(\d+)</td>\s*<td[^>]*>(\d*)</td>\s*<td[^>]*>(.*?)</td>\s*<td[^>]*>(.*?)</td>\s*<td[^>]*>(.*?)</td>",
        re.DOTALL,
    )
    filers: list[dict] = []
    for filer_id, frn, name, state, svc_type in row_re.findall(html):
        if not name.strip():
            continue
        # BUGFIX: strip markup from the state cell BEFORE truncating to two
        # chars — previously a tag like "<td>" could yield state "<t".
        state_txt = _clean_html(state)
        svc_txt = svc_type.strip()
        filers.append({
            "filer_id": filer_id.strip(),
            "frn": frn.strip() or None,
            "legal_name": _clean_html(name.strip()),
            "trade_name": None,
            "state": state_txt[:2] if state_txt else None,
            "service_type": _clean_html(svc_txt) if svc_txt else None,
            "holding_company": None,
            "status": "Active",
        })
    LOG.info("Parsed %d filers from HTML", len(filers))
    return filers
def _normalize_filer(raw: dict) -> dict:
"""Normalize field names from various XML/HTML formats."""
return {
"filer_id": raw.get("filerid") or raw.get("filer_id") or raw.get("id") or "",
"frn": raw.get("frn") or raw.get("coresid") or raw.get("fcc_registration_number") or None,
"legal_name": raw.get("legal_name") or raw.get("legalname") or raw.get("name") or "",
"trade_name": raw.get("trade_name") or raw.get("tradename") or raw.get("dba") or None,
"state": (raw.get("state") or "")[:2] or None,
"service_type": raw.get("service_type") or raw.get("servicetype") or raw.get("primary_service") or None,
"holding_company": raw.get("holding_company") or raw.get("holdingcompany") or None,
"status": raw.get("status") or "Active",
}
def _clean_html(text: str) -> str:
"""Remove HTML tags from text."""
import re
return re.sub(r"<[^>]+>", "", text).strip()
def upsert_filers(filers: list[dict]) -> int:
    """Upsert filers into the fcc_499_filers table.

    Rows missing a legal_name OR a filer_id are skipped: filer_id is the
    ON CONFLICT key, so an empty id would fold unrelated filers into one
    row (the original only guarded legal_name).

    Args:
        filers: normalized filer dicts as produced by fetch_filer_data().

    Returns:
        Number of rows actually written.
    """
    if not filers:
        return 0
    # Hoisted out of the loop; timestamps are carried once in the VALUES
    # tuple and reused via EXCLUDED in the conflict branch (the original
    # bound the same `now` twice more as extra parameters).
    sql = """
        INSERT INTO fcc_499_filers (filer_id, frn, legal_name, trade_name, state,
            service_type, holding_company, status, last_scraped_at, updated_at)
        VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
        ON CONFLICT (filer_id) DO UPDATE SET
            frn = COALESCE(EXCLUDED.frn, fcc_499_filers.frn),
            legal_name = EXCLUDED.legal_name,
            trade_name = COALESCE(EXCLUDED.trade_name, fcc_499_filers.trade_name),
            state = COALESCE(EXCLUDED.state, fcc_499_filers.state),
            service_type = COALESCE(EXCLUDED.service_type, fcc_499_filers.service_type),
            holding_company = COALESCE(EXCLUDED.holding_company, fcc_499_filers.holding_company),
            status = EXCLUDED.status,
            last_scraped_at = EXCLUDED.last_scraped_at,
            updated_at = EXCLUDED.updated_at
    """
    now = datetime.now(timezone.utc)
    count = 0
    conn = psycopg2.connect(DATABASE_URL)
    try:
        with conn.cursor() as cur:
            for f in filers:
                if not f.get("legal_name") or not f.get("filer_id"):
                    continue
                cur.execute(sql, (
                    f["filer_id"], f["frn"], f["legal_name"], f["trade_name"],
                    f["state"], f["service_type"], f["holding_company"],
                    f["status"], now, now,
                ))
                count += 1
        conn.commit()
    finally:
        # close() without commit() rolls back on error.
        conn.close()
    return count
def main():
    """Entry point: scrape the FCC 499 registry and sync it into Postgres."""
    LOG.info("=" * 50)
    LOG.info("FCC Form 499 Filer Database Scraper")
    LOG.info("=" * 50)
    filers = fetch_filer_data()
    if not filers:
        LOG.error("No filers fetched — check if FCC API changed")
        return
    count = upsert_filers(filers)
    LOG.info("Upserted %d filers into fcc_499_filers", count)
    # Post-run summary. try/finally added so the connection is closed even
    # when a query raises (the original leaked it on error).
    conn = psycopg2.connect(DATABASE_URL)
    try:
        with conn.cursor() as cur:
            cur.execute("SELECT COUNT(*) FROM fcc_499_filers")
            total = cur.fetchone()[0]
            cur.execute("SELECT COUNT(*) FROM fcc_499_filers WHERE frn IS NOT NULL")
            with_frn = cur.fetchone()[0]
    finally:
        conn.close()
    LOG.info("Total filers in DB: %d (%d with FRN)", total, with_frn)
    LOG.info("Scrape complete")
if __name__ == "__main__":
main()