""" Import FCC Form 499 Filer Database Excel dump into PostgreSQL. Download the Excel file from https://apps.fcc.gov/cgb/form499/499a.cfm ("Excel Dump of all 499 filings") then run: python3 scripts/workers/fcc_499_filer_import.py /path/to/499_dump.xlsx Or copy to the server and run: scp 499_dump.xlsx deploy@207.174.124.71:/tmp/ ssh deploy@207.174.124.71 "cd /opt/performancewest && docker compose exec -T workers python3 /app/scripts/workers/fcc_499_filer_import.py /tmp/499_dump.xlsx" """ from __future__ import annotations import csv import logging import os import sys from datetime import datetime, timezone LOG = logging.getLogger("workers.fcc_499_import") logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(name)s] %(levelname)s %(message)s") DATABASE_URL = os.getenv("DATABASE_URL", "postgresql://pw:pw_dev_2026@207.174.124.71:5433/performancewest") def import_excel(path: str) -> int: """Import an Excel (.xlsx) file into fcc_499_filers table.""" try: import openpyxl except ImportError: LOG.error("openpyxl not installed. Install with: pip install openpyxl") sys.exit(1) LOG.info("Reading %s", path) wb = openpyxl.load_workbook(path, read_only=True, data_only=True) ws = wb.active rows = list(ws.iter_rows(values_only=True)) if len(rows) < 2: LOG.error("File has fewer than 2 rows — no data") return 0 # Normalize headers headers = [str(h or "").strip() for h in rows[0]] LOG.info("Headers (%d): %s", len(headers), headers[:10]) return _upsert_rows(headers, rows[1:]) def import_csv(path: str) -> int: """Import a CSV file into fcc_499_filers table.""" LOG.info("Reading %s", path) with open(path, "r", encoding="utf-8-sig") as f: reader = csv.reader(f) rows_list = list(reader) if len(rows_list) < 2: LOG.error("File has fewer than 2 rows — no data") return 0 headers = [h.strip() for h in rows_list[0]] LOG.info("Headers (%d): %s", len(headers), headers[:10]) return _upsert_rows(headers, rows_list[1:]) def _upsert_rows(headers: list[str], data_rows: list) -> int: """Upsert rows into fcc_499_filers. Returns count.""" import psycopg2 # Build a column map — the FCC Excel uses various header names col_map = {} for i, h in enumerate(headers): hl = h.lower().replace(" ", "_").replace("-", "_") if "filer" in hl and "id" in hl: col_map["filer_id"] = i elif hl in ("frn", "coresid", "fcc_registration_number", "cores_id"): col_map["frn"] = i elif "legal" in hl and "name" in hl: col_map["legal_name"] = i elif "trade" in hl and "name" in hl: col_map["trade_name"] = i elif hl == "state" or (hl == "hq_state" and "state" not in col_map): col_map["state"] = i elif "principal" in hl and "communication" in hl: col_map["service_type"] = i elif "service" in hl and "type" in hl: col_map["service_type"] = i elif "holding" in hl and "company" in hl: col_map["holding_company"] = i elif "operational" in hl and "status" in hl: col_map["status"] = i elif hl == "company_name" and "legal_name" not in col_map: col_map["legal_name"] = i LOG.info("Column map: %s", col_map) if "filer_id" not in col_map and "legal_name" not in col_map: LOG.error("Could not find filer_id or legal_name columns. Headers: %s", headers) return 0 conn = psycopg2.connect(DATABASE_URL) now = datetime.now(timezone.utc) count = 0 with conn.cursor() as cur: for row in data_rows: def get(field): idx = col_map.get(field) if idx is None or idx >= len(row): return None val = row[idx] return str(val).strip() if val is not None else None filer_id = get("filer_id") legal_name = get("legal_name") if not filer_id or not legal_name: continue frn = get("frn") if frn: frn = frn.replace(".0", "").strip() # Excel sometimes adds .0 if len(frn) < 10: frn = frn.zfill(10) cur.execute(""" INSERT INTO fcc_499_filers (filer_id, frn, legal_name, trade_name, state, service_type, holding_company, status, last_scraped_at, updated_at) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s) ON CONFLICT (filer_id) DO UPDATE SET frn = COALESCE(EXCLUDED.frn, fcc_499_filers.frn), legal_name = EXCLUDED.legal_name, trade_name = COALESCE(EXCLUDED.trade_name, fcc_499_filers.trade_name), state = COALESCE(EXCLUDED.state, fcc_499_filers.state), service_type = COALESCE(EXCLUDED.service_type, fcc_499_filers.service_type), holding_company = COALESCE(EXCLUDED.holding_company, fcc_499_filers.holding_company), status = COALESCE(EXCLUDED.status, fcc_499_filers.status), last_scraped_at = %s, updated_at = %s """, ( filer_id, frn or None, legal_name, get("trade_name"), (get("state") or "")[:2] or None, get("service_type"), get("holding_company"), get("status") or "Active", now, now, now, now, )) count += 1 if count % 1000 == 0: conn.commit() LOG.info(" … %d rows", count) conn.commit() # Report totals with conn.cursor() as cur: cur.execute("SELECT COUNT(*) FROM fcc_499_filers") total = cur.fetchone()[0] cur.execute("SELECT COUNT(*) FROM fcc_499_filers WHERE frn IS NOT NULL") with_frn = cur.fetchone()[0] conn.close() LOG.info("Import complete: %d rows upserted. Total in DB: %d (%d with FRN)", count, total, with_frn) return count def main(): if len(sys.argv) < 2: print("Usage: python3 scripts/workers/fcc_499_filer_import.py ") print() print("Download the Excel dump from: https://apps.fcc.gov/cgb/form499/499a.cfm") sys.exit(1) path = sys.argv[1] if not os.path.exists(path): LOG.error("File not found: %s", path) sys.exit(1) if path.endswith(".xlsx") or path.endswith(".xls"): count = import_excel(path) elif path.endswith(".csv"): count = import_csv(path) else: LOG.error("Unsupported file format. Use .xlsx, .xls, or .csv") sys.exit(1) if count == 0: sys.exit(1) if __name__ == "__main__": main()