""" NANPA area-code importer. One-shot (re-runnable) loader for `nanpa_area_codes`. Fetches the public NANPA "Geographic Area Code Number Report" — a CSV listing every active/assigned NANP area code and the geography it serves. Source: https://www.nationalnanpa.com/enas/geoAreaCodeNumberReport.do (the site exposes the data as CSV behind a simple GET; we mirror the fields we care about into our PG table.) Fallback: if the NANPA endpoint is unreachable, we have a bundled snapshot at scripts/data/nanpa_area_codes_snapshot.csv that we fall back to so classification still works in dev / offline environments. Coverage: - All 50 US states + DC + territories (PR, VI, GU, AS, MP) - Canada (for international classification) - Caribbean NANP jurisdictions (1-242 Bahamas, 1-246 Barbados, etc. — these are "NANP but not US" and classify as international) Usage: python -m scripts.workers.cdr_npa_importer python -m scripts.workers.cdr_npa_importer --snapshot-only # skip network """ from __future__ import annotations import argparse import csv import io import logging import os import sys import urllib.request from pathlib import Path from typing import Iterable import psycopg2 import psycopg2.extras log = logging.getLogger("cdr_npa_importer") logging.basicConfig( level=logging.INFO, format="%(asctime)s [%(levelname)s] %(name)s: %(message)s", handlers=[logging.StreamHandler(sys.stdout)], ) NANPA_URL = os.environ.get( "NANPA_CSV_URL", "https://www.nationalnanpa.com/enas/displayNpaCityReport.do" "?csv=Y", ) SNAPSHOT_PATH = ( Path(__file__).resolve().parent.parent / "data" / "nanpa_area_codes_snapshot.csv" ) DATABASE_URL = os.environ.get("DATABASE_URL", "") # NANP jurisdictions that are NOT the United States — classifying their # area codes as "international" is critical for correct 499-A splits. # Source: ITU-T E.164 + NANPA country-code assignments. _NANP_NON_US_BY_NPA: dict[str, tuple[str, str]] = { # NPA: (ISO-2 country, region label) "242": ("BS", "Bahamas"), "246": ("BB", "Barbados"), "264": ("AI", "Anguilla"), "268": ("AG", "Antigua and Barbuda"), "284": ("VG", "British Virgin Islands"), "340": ("VI", "US Virgin Islands"), # actually US territory — handled below "345": ("KY", "Cayman Islands"), "441": ("BM", "Bermuda"), "473": ("GD", "Grenada"), "649": ("TC", "Turks and Caicos"), "658": ("JM", "Jamaica"), "664": ("MS", "Montserrat"), "670": ("MP", "Northern Mariana Islands"), # US territory "671": ("GU", "Guam"), # US territory "684": ("AS", "American Samoa"), # US territory "721": ("SX", "Sint Maarten"), "758": ("LC", "Saint Lucia"), "767": ("DM", "Dominica"), "784": ("VC", "Saint Vincent and the Grenadines"), "787": ("PR", "Puerto Rico"), # US territory "809": ("DO", "Dominican Republic"), "829": ("DO", "Dominican Republic"), "849": ("DO", "Dominican Republic"), "868": ("TT", "Trinidad and Tobago"), "869": ("KN", "Saint Kitts and Nevis"), "876": ("JM", "Jamaica"), "939": ("PR", "Puerto Rico"), # US territory } # US territories (still country=US, not international): _US_TERRITORY_NPAS = {"340", "670", "671", "684", "787", "939"} def _fetch_csv(url: str) -> str | None: try: req = urllib.request.Request( url, headers={"User-Agent": "Mozilla/5.0 (PerformanceWest NPA Importer)"} ) with urllib.request.urlopen(req, timeout=30) as resp: return resp.read().decode("utf-8", errors="replace") except Exception as exc: log.warning("NANPA fetch failed (%s): %s — falling back to snapshot", url, exc) return None def _parse_nanpa_csv(csv_text: str) -> Iterable[dict]: """Yield normalized rows from the NANPA CSV. The public NANPA report varies slightly across the two endpoints (geoAreaCodeNumberReport vs. displayNpaCityReport). We accept either and key fields we care about: NPA, Location (state/province/country), Status. Rows without a 3-digit numeric NPA are skipped (header rows, footers, non-geographic codes like 500/700/800/900 toll-free). """ reader = csv.DictReader(io.StringIO(csv_text)) for row in reader: # Normalize common column-name variants npa = (row.get("NPA") or row.get("Area Code") or row.get("npa") or "").strip() if not npa.isdigit() or len(npa) != 3: continue location = ( row.get("Location") or row.get("Jurisdiction") or row.get("Service Area Description") or "" ).strip() status = (row.get("Status") or row.get("status") or "").strip().upper() if status in ("UNASSIGNED", "RETURNED", "RESERVED"): continue # Resolve country / state country, state, note = _resolve_geography(npa, location) yield { "npa": npa, "state": state, "country": country, "note": note or location, } def _resolve_geography(npa: str, location: str) -> tuple[str, str | None, str]: """Return (country_code, state_code_or_None, note).""" # Hardcoded NANP non-US lookup wins (covers Caribbean + territories) if npa in _NANP_NON_US_BY_NPA: country, name = _NANP_NON_US_BY_NPA[npa] if npa in _US_TERRITORY_NPAS: return "US", country, name # "PR" / "GU" / etc. as state return country, None, name # Canadian area codes — location typically includes the province name # or "Canada". Map the common ones. canadian_npas = { "204","226","236","249","250","263","289","306","343","354","365","367", "368","382","403","416","418","428","431","437","438","450","468","474", "506","514","519","548","579","581","584","587","600","604","613","639", "647","672","683","705","709","742","753","778","780","782","807","819", "825","867","873","879","902","905", } if npa in canadian_npas or "canada" in location.lower(): return "CA", None, location or "Canada" # Default: US. State is the first two-char token matching a USPS code. state = _extract_us_state(location) return "US", state, location # USPS state/territory codes — used to extract state from the free-text # "Location" column ("CALIFORNIA" / "New York" / "CA" / "NY"). _USPS_STATES = { "AL":"ALABAMA","AK":"ALASKA","AZ":"ARIZONA","AR":"ARKANSAS","CA":"CALIFORNIA", "CO":"COLORADO","CT":"CONNECTICUT","DE":"DELAWARE","DC":"DISTRICT OF COLUMBIA", "FL":"FLORIDA","GA":"GEORGIA","HI":"HAWAII","ID":"IDAHO","IL":"ILLINOIS", "IN":"INDIANA","IA":"IOWA","KS":"KANSAS","KY":"KENTUCKY","LA":"LOUISIANA", "ME":"MAINE","MD":"MARYLAND","MA":"MASSACHUSETTS","MI":"MICHIGAN","MN":"MINNESOTA", "MS":"MISSISSIPPI","MO":"MISSOURI","MT":"MONTANA","NE":"NEBRASKA","NV":"NEVADA", "NH":"NEW HAMPSHIRE","NJ":"NEW JERSEY","NM":"NEW MEXICO","NY":"NEW YORK", "NC":"NORTH CAROLINA","ND":"NORTH DAKOTA","OH":"OHIO","OK":"OKLAHOMA","OR":"OREGON", "PA":"PENNSYLVANIA","RI":"RHODE ISLAND","SC":"SOUTH CAROLINA","SD":"SOUTH DAKOTA", "TN":"TENNESSEE","TX":"TEXAS","UT":"UTAH","VT":"VERMONT","VA":"VIRGINIA", "WA":"WASHINGTON","WV":"WEST VIRGINIA","WI":"WISCONSIN","WY":"WYOMING", } def _extract_us_state(location: str) -> str | None: if not location: return None up = location.upper() # Try explicit 2-char token first for token in up.replace(",", " ").split(): if token in _USPS_STATES: return token # Then try full-name match for code, full_name in _USPS_STATES.items(): if full_name in up: return code return None def _load_snapshot() -> str | None: if not SNAPSHOT_PATH.exists(): return None return SNAPSHOT_PATH.read_text(encoding="utf-8", errors="replace") def import_rows(rows: Iterable[dict]) -> int: """Upsert into nanpa_area_codes. Returns count written.""" conn = psycopg2.connect(DATABASE_URL) count = 0 try: with conn.cursor() as cur: for row in rows: cur.execute( """ INSERT INTO nanpa_area_codes (npa, state, country, note, updated_at) VALUES (%s, %s, %s, %s, NOW()) ON CONFLICT (npa) DO UPDATE SET state = EXCLUDED.state, country = EXCLUDED.country, note = EXCLUDED.note, updated_at = NOW() """, (row["npa"], row["state"], row["country"], row.get("note")), ) count += 1 conn.commit() finally: conn.close() return count def main() -> None: parser = argparse.ArgumentParser(description="Import NANPA area codes.") parser.add_argument( "--snapshot-only", action="store_true", help="Skip the network fetch; use the bundled snapshot CSV only.", ) args = parser.parse_args() if not DATABASE_URL: log.error("DATABASE_URL not set — aborting") sys.exit(1) csv_text: str | None = None if not args.snapshot_only: csv_text = _fetch_csv(NANPA_URL) if csv_text is None: csv_text = _load_snapshot() if csv_text is None: log.error( "No NANPA data available — network fetch failed and no snapshot at %s", SNAPSHOT_PATH, ) sys.exit(1) rows = list(_parse_nanpa_csv(csv_text)) log.info("Parsed %d NPA rows from input", len(rows)) # Always ensure the hardcoded NANP non-US set is present even if the # public CSV misses it (the NANPA report sometimes omits rows). for npa, (country, name) in _NANP_NON_US_BY_NPA.items(): state = country if npa in _US_TERRITORY_NPAS else None country_code = "US" if npa in _US_TERRITORY_NPAS else country rows.append({"npa": npa, "state": state, "country": country_code, "note": name}) written = import_rows(rows) log.info("Upserted %d NPA records", written) if __name__ == "__main__": main()