new-site/scripts/workers/cdr_npa_importer.py
justin f8cd37ac8c Initial commit — Performance West telecom compliance platform
Includes: API (Express/TypeScript), Astro site, Python workers,
document generators, FCC compliance tools, Canada CRTC formation,
Ansible infrastructure, and deployment scripts.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-27 06:54:22 -05:00

274 lines
10 KiB
Python

"""
NANPA area-code importer.
One-shot (re-runnable) loader for `nanpa_area_codes`. Fetches the public
NANPA "Geographic Area Code Number Report" — a CSV listing every
active/assigned NANP area code and the geography it serves.
Source: https://www.nationalnanpa.com/enas/geoAreaCodeNumberReport.do
(the site exposes the data as CSV behind a simple GET; we mirror the
fields we care about into our PG table.)
Fallback: if the NANPA endpoint is unreachable, we have a bundled
snapshot at scripts/data/nanpa_area_codes_snapshot.csv that we fall
back to so classification still works in dev / offline environments.
Coverage:
- All 50 US states + DC + territories (PR, VI, GU, AS, MP)
- Canada (for international classification)
- Caribbean NANP jurisdictions (1-242 Bahamas, 1-246 Barbados, etc. —
these are "NANP but not US" and classify as international)
Usage:
python -m scripts.workers.cdr_npa_importer
python -m scripts.workers.cdr_npa_importer --snapshot-only # skip network
"""
from __future__ import annotations
import argparse
import csv
import io
import logging
import os
import sys
import urllib.request
from pathlib import Path
from typing import Iterable
import psycopg2
import psycopg2.extras
# Module logger. basicConfig() at import time is acceptable here because this
# module is a standalone CLI worker, not a library intended for import.
log = logging.getLogger("cdr_npa_importer")
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
    handlers=[logging.StreamHandler(sys.stdout)],
)

# Public NANPA CSV endpoint; overridable via NANPA_CSV_URL for tests/mirrors.
NANPA_URL: str = os.environ.get(
    "NANPA_CSV_URL",
    "https://www.nationalnanpa.com/enas/displayNpaCityReport.do"
    "?csv=Y",
)

# Bundled offline snapshot used when the network fetch fails.
# This file lives in scripts/workers/, so parent.parent resolves to scripts/.
SNAPSHOT_PATH = (
    Path(__file__).resolve().parent.parent / "data" / "nanpa_area_codes_snapshot.csv"
)

# Postgres DSN. Required at runtime — main() aborts with exit(1) when empty.
DATABASE_URL: str = os.environ.get("DATABASE_URL", "")
# NANP jurisdictions that are NOT the United States — classifying their
# area codes as "international" is critical for correct 499-A splits.
# Source: ITU-T E.164 + NANPA country-code assignments.
# NOTE: a handful of entries below ARE US territories; _resolve_geography
# re-classifies any NPA listed in _US_TERRITORY_NPAS back to country="US",
# with the territory code carried in the "state" slot.
_NANP_NON_US_BY_NPA: dict[str, tuple[str, str]] = {
    # NPA: (ISO-2 country, region label)
    "242": ("BS", "Bahamas"),
    "246": ("BB", "Barbados"),
    "264": ("AI", "Anguilla"),
    "268": ("AG", "Antigua and Barbuda"),
    "284": ("VG", "British Virgin Islands"),
    "340": ("VI", "US Virgin Islands"),  # actually US territory — handled below
    "345": ("KY", "Cayman Islands"),
    "441": ("BM", "Bermuda"),
    "473": ("GD", "Grenada"),
    "649": ("TC", "Turks and Caicos"),
    "658": ("JM", "Jamaica"),
    "664": ("MS", "Montserrat"),
    "670": ("MP", "Northern Mariana Islands"),  # US territory
    "671": ("GU", "Guam"),  # US territory
    "684": ("AS", "American Samoa"),  # US territory
    "721": ("SX", "Sint Maarten"),
    "758": ("LC", "Saint Lucia"),
    "767": ("DM", "Dominica"),
    "784": ("VC", "Saint Vincent and the Grenadines"),
    "787": ("PR", "Puerto Rico"),  # US territory
    "809": ("DO", "Dominican Republic"),
    "829": ("DO", "Dominican Republic"),
    "849": ("DO", "Dominican Republic"),
    "868": ("TT", "Trinidad and Tobago"),
    "869": ("KN", "Saint Kitts and Nevis"),
    "876": ("JM", "Jamaica"),
    "939": ("PR", "Puerto Rico"),  # US territory
}
# US territories (still country=US, not international):
_US_TERRITORY_NPAS: set[str] = {"340", "670", "671", "684", "787", "939"}
def _fetch_csv(url: str) -> str | None:
try:
req = urllib.request.Request(
url, headers={"User-Agent": "Mozilla/5.0 (PerformanceWest NPA Importer)"}
)
with urllib.request.urlopen(req, timeout=30) as resp:
return resp.read().decode("utf-8", errors="replace")
except Exception as exc:
log.warning("NANPA fetch failed (%s): %s — falling back to snapshot", url, exc)
return None
def _parse_nanpa_csv(csv_text: str) -> Iterable[dict]:
    """Yield normalized area-code rows parsed from the NANPA CSV text.

    The two public NANPA endpoints (geoAreaCodeNumberReport vs.
    displayNpaCityReport) use slightly different column headers, so each
    field is read from a list of known name variants. Rows are skipped
    when they lack a 3-digit numeric NPA (headers, footers, non-geographic
    codes) or carry an UNASSIGNED / RETURNED / RESERVED status.
    """

    def pick(record: dict, *names: str) -> str:
        # First non-empty value among the column-name variants, stripped.
        for name in names:
            value = record.get(name)
            if value:
                return value.strip()
        return ""

    reader = csv.DictReader(io.StringIO(csv_text))
    for record in reader:
        npa = pick(record, "NPA", "Area Code", "npa")
        if len(npa) != 3 or not npa.isdigit():
            continue
        status = pick(record, "Status", "status").upper()
        if status in ("UNASSIGNED", "RETURNED", "RESERVED"):
            continue
        location = pick(record, "Location", "Jurisdiction", "Service Area Description")
        country, state, note = _resolve_geography(npa, location)
        yield {
            "npa": npa,
            "state": state,
            "country": country,
            "note": note or location,
        }
def _resolve_geography(npa: str, location: str) -> tuple[str, str | None, str]:
    """Return (country_code, state_code_or_None, note) for one area code."""
    # The curated NANP table wins: it covers the Caribbean jurisdictions
    # plus the US territories (which must stay country="US").
    override = _NANP_NON_US_BY_NPA.get(npa)
    if override is not None:
        country, name = override
        if npa in _US_TERRITORY_NPAS:
            # Territory: country stays US, territory code rides in "state".
            return "US", country, name
        return country, None, name

    # Canadian NPAs — either a known code, or a location mentioning Canada.
    canadian_npas = {
        "204", "226", "236", "249", "250", "263", "289", "306", "343", "354",
        "365", "367", "368", "382", "403", "416", "418", "428", "431", "437",
        "438", "450", "468", "474", "506", "514", "519", "548", "579", "581",
        "584", "587", "600", "604", "613", "639", "647", "672", "683", "705",
        "709", "742", "753", "778", "780", "782", "807", "819", "825", "867",
        "873", "879", "902", "905",
    }
    if npa in canadian_npas or "canada" in location.lower():
        return "CA", None, location or "Canada"

    # Everything else defaults to US; try to pull a USPS state code out of
    # the free-text location column.
    return "US", _extract_us_state(location), location
# USPS state/territory codes — used to extract state from the free-text
# "Location" column ("CALIFORNIA" / "New York" / "CA" / "NY").
# Full names are stored uppercase because _extract_us_state compares
# against location.upper().
_USPS_STATES: dict[str, str] = {
    "AL":"ALABAMA","AK":"ALASKA","AZ":"ARIZONA","AR":"ARKANSAS","CA":"CALIFORNIA",
    "CO":"COLORADO","CT":"CONNECTICUT","DE":"DELAWARE","DC":"DISTRICT OF COLUMBIA",
    "FL":"FLORIDA","GA":"GEORGIA","HI":"HAWAII","ID":"IDAHO","IL":"ILLINOIS",
    "IN":"INDIANA","IA":"IOWA","KS":"KANSAS","KY":"KENTUCKY","LA":"LOUISIANA",
    "ME":"MAINE","MD":"MARYLAND","MA":"MASSACHUSETTS","MI":"MICHIGAN","MN":"MINNESOTA",
    "MS":"MISSISSIPPI","MO":"MISSOURI","MT":"MONTANA","NE":"NEBRASKA","NV":"NEVADA",
    "NH":"NEW HAMPSHIRE","NJ":"NEW JERSEY","NM":"NEW MEXICO","NY":"NEW YORK",
    "NC":"NORTH CAROLINA","ND":"NORTH DAKOTA","OH":"OHIO","OK":"OKLAHOMA","OR":"OREGON",
    "PA":"PENNSYLVANIA","RI":"RHODE ISLAND","SC":"SOUTH CAROLINA","SD":"SOUTH DAKOTA",
    "TN":"TENNESSEE","TX":"TEXAS","UT":"UTAH","VT":"VERMONT","VA":"VIRGINIA",
    "WA":"WASHINGTON","WV":"WEST VIRGINIA","WI":"WISCONSIN","WY":"WYOMING",
}
def _extract_us_state(location: str) -> str | None:
    """Extract a USPS state code from the free-text NANPA Location column.

    Accepts either an explicit two-letter token ("CA", "NY, USA") or a full
    state name ("CALIFORNIA", "New York"). Returns None when no state can
    be identified.
    """
    if not location:
        return None
    up = location.upper()
    # Try explicit 2-char token first
    for token in up.replace(",", " ").split():
        if token in _USPS_STATES:
            return token
    # Full-name substring match. BUGFIX: scan longer names first so that
    # e.g. "WEST VIRGINIA" maps to WV instead of matching the embedded
    # "VIRGINIA" and wrongly returning VA (dict insertion order previously
    # checked VA before WV; AR/KS only survived by alphabetical luck).
    for code, full_name in sorted(
        _USPS_STATES.items(), key=lambda item: len(item[1]), reverse=True
    ):
        if full_name in up:
            return code
    return None
def _load_snapshot() -> str | None:
    """Read the bundled snapshot CSV, or return None when it is absent.

    EAFP: read directly and catch FileNotFoundError rather than the racy
    exists()/read_text() pair, which also costs an extra stat() call.
    """
    try:
        return SNAPSHOT_PATH.read_text(encoding="utf-8", errors="replace")
    except FileNotFoundError:
        return None
def import_rows(rows: Iterable[dict]) -> int:
    """Upsert rows into nanpa_area_codes. Returns count written.

    Each row is a dict with keys "npa", "state", "country" and an optional
    "note". On any database error the transaction is rolled back and the
    exception re-raised; the connection is always closed.
    """
    upsert_sql = """
        INSERT INTO nanpa_area_codes (npa, state, country, note, updated_at)
        VALUES (%s, %s, %s, %s, NOW())
        ON CONFLICT (npa) DO UPDATE SET
            state = EXCLUDED.state,
            country = EXCLUDED.country,
            note = EXCLUDED.note,
            updated_at = NOW()
    """
    params = [
        (row["npa"], row["state"], row["country"], row.get("note")) for row in rows
    ]
    conn = psycopg2.connect(DATABASE_URL)
    try:
        with conn.cursor() as cur:
            # execute_batch groups statements into far fewer server
            # round-trips than one execute() per row — psycopg2.extras is
            # already imported at module level for exactly this purpose.
            psycopg2.extras.execute_batch(cur, upsert_sql, params)
        conn.commit()
    except Exception:
        conn.rollback()
        raise
    finally:
        conn.close()
    return len(params)
def main() -> None:
    """CLI entry point: fetch (or load snapshot), parse, and upsert NPAs."""
    parser = argparse.ArgumentParser(description="Import NANPA area codes.")
    parser.add_argument(
        "--snapshot-only",
        action="store_true",
        help="Skip the network fetch; use the bundled snapshot CSV only.",
    )
    opts = parser.parse_args()

    if not DATABASE_URL:
        log.error("DATABASE_URL not set — aborting")
        sys.exit(1)

    # Prefer the live NANPA feed unless told otherwise; fall back to the
    # bundled snapshot, and bail out only when neither source is usable.
    csv_text: str | None = None if opts.snapshot_only else _fetch_csv(NANPA_URL)
    if csv_text is None:
        csv_text = _load_snapshot()
    if csv_text is None:
        log.error(
            "No NANPA data available — network fetch failed and no snapshot at %s",
            SNAPSHOT_PATH,
        )
        sys.exit(1)

    rows = list(_parse_nanpa_csv(csv_text))
    log.info("Parsed %d NPA rows from input", len(rows))

    # Append the curated NANP non-US set unconditionally — the public CSV
    # sometimes omits rows, and appending last means these win the upsert.
    for npa, (country, name) in _NANP_NON_US_BY_NPA.items():
        is_territory = npa in _US_TERRITORY_NPAS
        rows.append(
            {
                "npa": npa,
                "state": country if is_territory else None,
                "country": "US" if is_territory else country,
                "note": name,
            }
        )

    written = import_rows(rows)
    log.info("Upserted %d NPA records", written)


if __name__ == "__main__":
    main()