Includes: API (Express/TypeScript), Astro site, Python workers, document generators, FCC compliance tools, Canada CRTC formation, Ansible infrastructure, and deployment scripts. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
207 lines
7.7 KiB
Python
207 lines
7.7 KiB
Python
"""
|
|
FCC Form 499 Filer Database Scraper
|
|
|
|
Downloads the complete 499 filer registry from the FCC's public API and
|
|
upserts into the local fcc_499_filers table. This gives us FRN → entity
|
|
name lookups without hitting the FCC site in real time.
|
|
|
|
Source: https://apps.fcc.gov/cgb/form499/499results.cfm
|
|
Schedule: daily via cron (same as RMD scraper)
|
|
Usage: python3 scripts/workers/fcc_499_filer_scraper.py
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import logging
|
|
import os
|
|
import sys
|
|
import xml.etree.ElementTree as ET
|
|
from datetime import datetime, timezone
|
|
|
|
import psycopg2
|
|
|
|
# Module-level logger; named per the workers.* convention used by cron jobs.
LOG = logging.getLogger("workers.fcc_499_scraper")
# Configure root logging once at import time — this script runs standalone.
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(name)s] %(levelname)s %(message)s")

# Postgres DSN; overridable via env, defaults to the local dev database.
DATABASE_URL = os.getenv("DATABASE_URL", "postgresql://pw:pw@localhost:5432/performancewest")

# FCC Form 499 filer search endpoint (supports an XML=TRUE export mode).
FILER_DB_URL = "https://apps.fcc.gov/cgb/form499/499results.cfm"
|
|
|
|
|
|
def fetch_filer_data() -> list[dict]:
    """Fetch the complete 499 filer database from the FCC.

    Tries the XML export first; falls back to scraping the HTML search
    results if XML fails *or* yields no records.

    Returns:
        List of normalized filer dicts (keys as produced by
        ``_normalize_filer``). Empty list when both strategies fail.
    """
    filers = _fetch_filers_xml()
    # Bug fix: previously the HTML fallback ran only when the XML request
    # raised. An XML response that parsed cleanly but contained zero filer
    # records silently returned an empty list. Fall back whenever the XML
    # path produced nothing.
    if not filers:
        filers = _fetch_filers_html()
    return filers


def _element_to_dict(el) -> dict:
    """Flatten an XML element's children into {normalized_tag: text}."""
    return {
        child.tag.lower().replace(" ", "_").replace("-", "_"): (child.text or "").strip()
        for child in el
    }


def _fetch_filers_xml() -> list[dict]:
    """Download and parse the XML export. Returns whatever was parsed
    before any failure (possibly an empty list)."""
    import urllib.request

    filers: list[dict] = []
    url = f"{FILER_DB_URL}?XML=TRUE"
    LOG.info("Fetching 499 filer database from %s", url)

    try:
        req = urllib.request.Request(url, headers={
            "User-Agent": "PerformanceWest-ComplianceChecker/1.0",
            "Accept": "application/xml, text/xml",
        })
        with urllib.request.urlopen(req, timeout=120) as resp:
            data = resp.read()

        LOG.info("Downloaded %d bytes", len(data))

        root = ET.fromstring(data)

        # The XML structure may vary — first look for common record tags
        # anywhere in the tree.
        for filer_el in root.iter():
            if filer_el.tag.lower() in ("filer", "record", "row"):
                filer = _element_to_dict(filer_el)
                if filer.get("legal_name") or filer.get("legalname") or filer.get("name"):
                    filers.append(_normalize_filer(filer))

        # If no records matched those tags, treat the document root's
        # direct children as a flat list of rows.
        if not filers:
            for child in root:
                filer = _element_to_dict(child)
                if any(v for v in filer.values()):
                    filers.append(_normalize_filer(filer))

        LOG.info("Parsed %d filers from XML", len(filers))
    except Exception as e:
        # Broad catch is deliberate: any network/parse failure just means
        # we try the HTML path instead.
        LOG.warning("XML fetch/parse failed: %s — trying HTML fallback", e)

    return filers


def _fetch_filers_html() -> list[dict]:
    """Scrape the HTML search-results table. Returns [] on failure."""
    import re
    import urllib.request

    filers: list[dict] = []
    try:
        # Broad (empty-criteria) search; d-16544-p is the display-tag pager.
        search_url = f"{FILER_DB_URL}?state=&serviceType=&legalName=&frn=&d-16544-p=1"
        req = urllib.request.Request(search_url, headers={
            "User-Agent": "Mozilla/5.0 (compatible; PerformanceWest/1.0)",
        })
        with urllib.request.urlopen(req, timeout=60) as resp:
            html = resp.read().decode("utf-8", errors="replace")

        # Result rows: filer id, FRN (may be blank), legal name, state, service type.
        rows = re.findall(
            r"<tr[^>]*>\s*<td[^>]*>(\d+)</td>\s*<td[^>]*>(\d*)</td>\s*<td[^>]*>(.*?)</td>\s*<td[^>]*>(.*?)</td>\s*<td[^>]*>(.*?)</td>",
            html, re.DOTALL,
        )
        for filer_id, frn, name, state, svc_type in rows:
            if name.strip():
                filers.append({
                    "filer_id": filer_id.strip(),
                    "frn": frn.strip() if frn.strip() else None,
                    "legal_name": _clean_html(name.strip()),
                    "trade_name": None,
                    "state": state.strip()[:2] if state.strip() else None,
                    "service_type": _clean_html(svc_type.strip()) if svc_type.strip() else None,
                    "holding_company": None,
                    "status": "Active",
                })

        LOG.info("Parsed %d filers from HTML", len(filers))
    except Exception as html_err:
        LOG.error("HTML fallback also failed: %s", html_err)

    return filers
|
|
|
|
|
|
def _normalize_filer(raw: dict) -> dict:
|
|
"""Normalize field names from various XML/HTML formats."""
|
|
return {
|
|
"filer_id": raw.get("filerid") or raw.get("filer_id") or raw.get("id") or "",
|
|
"frn": raw.get("frn") or raw.get("coresid") or raw.get("fcc_registration_number") or None,
|
|
"legal_name": raw.get("legal_name") or raw.get("legalname") or raw.get("name") or "",
|
|
"trade_name": raw.get("trade_name") or raw.get("tradename") or raw.get("dba") or None,
|
|
"state": (raw.get("state") or "")[:2] or None,
|
|
"service_type": raw.get("service_type") or raw.get("servicetype") or raw.get("primary_service") or None,
|
|
"holding_company": raw.get("holding_company") or raw.get("holdingcompany") or None,
|
|
"status": raw.get("status") or "Active",
|
|
}
|
|
|
|
|
|
def _clean_html(text: str) -> str:
|
|
"""Remove HTML tags from text."""
|
|
import re
|
|
return re.sub(r"<[^>]+>", "", text).strip()
|
|
|
|
|
|
def upsert_filers(filers: list[dict]) -> int:
    """Upsert filers into the fcc_499_filers table.

    Args:
        filers: Normalized filer dicts (keys as produced by ``_normalize_filer``).

    Returns:
        Count of rows actually upserted. Rows missing a ``legal_name`` are
        skipped as before; rows missing a ``filer_id`` are now skipped too.

    Bug fix: ``filer_id`` is the ON CONFLICT key and ``_normalize_filer``
    defaults it to "" — previously every filer without an id collapsed into
    a single "" row, each overwriting the last.
    """
    if not filers:
        return 0

    conn = psycopg2.connect(DATABASE_URL)
    now = datetime.now(timezone.utc)
    count = 0

    try:
        with conn.cursor() as cur:
            for f in filers:
                if not f.get("legal_name") or not f.get("filer_id"):
                    continue
                # Update path reuses the inserted timestamps via EXCLUDED
                # instead of binding `now` a second time.
                cur.execute("""
                    INSERT INTO fcc_499_filers (filer_id, frn, legal_name, trade_name, state,
                        service_type, holding_company, status, last_scraped_at, updated_at)
                    VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
                    ON CONFLICT (filer_id) DO UPDATE SET
                        frn = COALESCE(EXCLUDED.frn, fcc_499_filers.frn),
                        legal_name = EXCLUDED.legal_name,
                        trade_name = COALESCE(EXCLUDED.trade_name, fcc_499_filers.trade_name),
                        state = COALESCE(EXCLUDED.state, fcc_499_filers.state),
                        service_type = COALESCE(EXCLUDED.service_type, fcc_499_filers.service_type),
                        holding_company = COALESCE(EXCLUDED.holding_company, fcc_499_filers.holding_company),
                        status = EXCLUDED.status,
                        last_scraped_at = EXCLUDED.last_scraped_at,
                        updated_at = EXCLUDED.updated_at
                """, (
                    f["filer_id"], f["frn"], f["legal_name"], f["trade_name"], f["state"],
                    f["service_type"], f["holding_company"], f["status"],
                    now, now,
                ))
                count += 1

        conn.commit()
    finally:
        # close() implicitly discards any uncommitted work if an error
        # occurred before commit().
        conn.close()

    return count
|
|
|
|
|
|
def main():
    """Entry point: fetch the 499 filer registry and upsert into Postgres."""
    LOG.info("=" * 50)
    LOG.info("FCC Form 499 Filer Database Scraper")
    LOG.info("=" * 50)

    filers = fetch_filer_data()

    if not filers:
        # Deliberately not raising: cron job logs the failure and exits.
        LOG.error("No filers fetched — check if FCC API changed")
        return

    count = upsert_filers(filers)
    LOG.info("Upserted %d filers into fcc_499_filers", count)

    # Summary stats. Bug fix: close the connection even when a SELECT
    # raises — previously a query failure leaked the connection.
    conn = psycopg2.connect(DATABASE_URL)
    try:
        with conn.cursor() as cur:
            cur.execute("SELECT COUNT(*) FROM fcc_499_filers")
            total = cur.fetchone()[0]
            cur.execute("SELECT COUNT(*) FROM fcc_499_filers WHERE frn IS NOT NULL")
            with_frn = cur.fetchone()[0]
    finally:
        conn.close()

    LOG.info("Total filers in DB: %d (%d with FRN)", total, with_frn)
    LOG.info("Scrape complete")


if __name__ == "__main__":
    main()
|