new-site/scripts/workers/cdr_npa_importer.py
justin f8cd37ac8c Initial commit — Performance West telecom compliance platform
Includes: API (Express/TypeScript), Astro site, Python workers,
document generators, FCC compliance tools, Canada CRTC formation,
Ansible infrastructure, and deployment scripts.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-27 06:54:22 -05:00

274 lines
10 KiB
Python

"""
NANPA area-code importer.
One-shot (re-runnable) loader for `nanpa_area_codes`. Fetches the public
NANPA "Geographic Area Code Number Report" — a CSV listing every
active/assigned NANP area code and the geography it serves.
Source: https://www.nationalnanpa.com/enas/geoAreaCodeNumberReport.do
(the site exposes the data as CSV behind a simple GET; we mirror the
fields we care about into our PG table.)
Fallback: if the NANPA endpoint is unreachable, we have a bundled
snapshot at scripts/data/nanpa_area_codes_snapshot.csv that we fall
back to so classification still works in dev / offline environments.
Coverage:
- All 50 US states + DC + territories (PR, VI, GU, AS, MP)
- Canada (for international classification)
- Caribbean NANP jurisdictions (1-242 Bahamas, 1-246 Barbados, etc. —
these are "NANP but not US" and classify as international)
Usage:
python -m scripts.workers.cdr_npa_importer
python -m scripts.workers.cdr_npa_importer --snapshot-only # skip network
"""
from __future__ import annotations
import argparse
import csv
import io
import logging
import os
import sys
import urllib.request
from pathlib import Path
from typing import Iterable
import psycopg2
import psycopg2.extras
# Module logger. basicConfig() at import time is acceptable here because this
# module is a standalone CLI worker, not a library intended for import.
log = logging.getLogger("cdr_npa_importer")
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
    handlers=[logging.StreamHandler(sys.stdout)],
)

# Public NANPA CSV endpoint; overridable via NANPA_CSV_URL for tests/mirrors.
NANPA_URL: str = os.environ.get(
    "NANPA_CSV_URL",
    "https://www.nationalnanpa.com/enas/displayNpaCityReport.do"
    "?csv=Y",
)

# Bundled offline snapshot used when the network fetch fails.
# This file lives in scripts/workers/, so parent.parent resolves to scripts/.
SNAPSHOT_PATH = (
    Path(__file__).resolve().parent.parent / "data" / "nanpa_area_codes_snapshot.csv"
)

# Postgres DSN. Required at runtime — main() aborts with exit(1) when empty.
DATABASE_URL: str = os.environ.get("DATABASE_URL", "")
# NANP jurisdictions that are NOT the United States — classifying their
# area codes as "international" is critical for correct 499-A splits.
# Source: ITU-T E.164 + NANPA country-code assignments.
# NOTE: a handful of entries below ARE US territories; _resolve_geography
# re-classifies any NPA listed in _US_TERRITORY_NPAS back to country="US",
# with the territory code carried in the "state" slot.
_NANP_NON_US_BY_NPA: dict[str, tuple[str, str]] = {
    # NPA: (ISO-2 country, region label)
    "242": ("BS", "Bahamas"),
    "246": ("BB", "Barbados"),
    "264": ("AI", "Anguilla"),
    "268": ("AG", "Antigua and Barbuda"),
    "284": ("VG", "British Virgin Islands"),
    "340": ("VI", "US Virgin Islands"),  # actually US territory — handled below
    "345": ("KY", "Cayman Islands"),
    "441": ("BM", "Bermuda"),
    "473": ("GD", "Grenada"),
    "649": ("TC", "Turks and Caicos"),
    "658": ("JM", "Jamaica"),
    "664": ("MS", "Montserrat"),
    "670": ("MP", "Northern Mariana Islands"),  # US territory
    "671": ("GU", "Guam"),  # US territory
    "684": ("AS", "American Samoa"),  # US territory
    "721": ("SX", "Sint Maarten"),
    "758": ("LC", "Saint Lucia"),
    "767": ("DM", "Dominica"),
    "784": ("VC", "Saint Vincent and the Grenadines"),
    "787": ("PR", "Puerto Rico"),  # US territory
    "809": ("DO", "Dominican Republic"),
    "829": ("DO", "Dominican Republic"),
    "849": ("DO", "Dominican Republic"),
    "868": ("TT", "Trinidad and Tobago"),
    "869": ("KN", "Saint Kitts and Nevis"),
    "876": ("JM", "Jamaica"),
    "939": ("PR", "Puerto Rico"),  # US territory
}
# US territories (still country=US, not international):
_US_TERRITORY_NPAS: set[str] = {"340", "670", "671", "684", "787", "939"}
def _fetch_csv(url: str) -> str | None:
try:
req = urllib.request.Request(
url, headers={"User-Agent": "Mozilla/5.0 (PerformanceWest NPA Importer)"}
)
with urllib.request.urlopen(req, timeout=30) as resp:
return resp.read().decode("utf-8", errors="replace")
except Exception as exc:
log.warning("NANPA fetch failed (%s): %s — falling back to snapshot", url, exc)
return None
def _parse_nanpa_csv(csv_text: str) -> Iterable[dict]:
    """Yield normalized area-code rows parsed from the NANPA CSV text.

    The two public NANPA endpoints (geoAreaCodeNumberReport vs.
    displayNpaCityReport) use slightly different column headers, so each
    field is read from a list of known name variants. Rows are skipped
    when they lack a 3-digit numeric NPA (headers, footers, non-geographic
    codes) or carry an UNASSIGNED / RETURNED / RESERVED status.
    """

    def pick(record: dict, *names: str) -> str:
        # First non-empty value among the column-name variants, stripped.
        for name in names:
            value = record.get(name)
            if value:
                return value.strip()
        return ""

    reader = csv.DictReader(io.StringIO(csv_text))
    for record in reader:
        npa = pick(record, "NPA", "Area Code", "npa")
        if len(npa) != 3 or not npa.isdigit():
            continue
        status = pick(record, "Status", "status").upper()
        if status in ("UNASSIGNED", "RETURNED", "RESERVED"):
            continue
        location = pick(record, "Location", "Jurisdiction", "Service Area Description")
        country, state, note = _resolve_geography(npa, location)
        yield {
            "npa": npa,
            "state": state,
            "country": country,
            "note": note or location,
        }
def _resolve_geography(npa: str, location: str) -> tuple[str, str | None, str]:
    """Return (country_code, state_code_or_None, note) for one area code."""
    # The curated NANP table wins: it covers the Caribbean jurisdictions
    # plus the US territories (which must stay country="US").
    override = _NANP_NON_US_BY_NPA.get(npa)
    if override is not None:
        country, name = override
        if npa in _US_TERRITORY_NPAS:
            # Territory: country stays US, territory code rides in "state".
            return "US", country, name
        return country, None, name

    # Canadian NPAs — either a known code, or a location mentioning Canada.
    canadian_npas = {
        "204", "226", "236", "249", "250", "263", "289", "306", "343", "354",
        "365", "367", "368", "382", "403", "416", "418", "428", "431", "437",
        "438", "450", "468", "474", "506", "514", "519", "548", "579", "581",
        "584", "587", "600", "604", "613", "639", "647", "672", "683", "705",
        "709", "742", "753", "778", "780", "782", "807", "819", "825", "867",
        "873", "879", "902", "905",
    }
    if npa in canadian_npas or "canada" in location.lower():
        return "CA", None, location or "Canada"

    # Everything else defaults to US; try to pull a USPS state code out of
    # the free-text location column.
    return "US", _extract_us_state(location), location
# USPS state/territory codes — used to extract state from the free-text
# "Location" column ("CALIFORNIA" / "New York" / "CA" / "NY").
# Full names are stored uppercase because _extract_us_state compares
# against location.upper().
_USPS_STATES: dict[str, str] = {
    "AL":"ALABAMA","AK":"ALASKA","AZ":"ARIZONA","AR":"ARKANSAS","CA":"CALIFORNIA",
    "CO":"COLORADO","CT":"CONNECTICUT","DE":"DELAWARE","DC":"DISTRICT OF COLUMBIA",
    "FL":"FLORIDA","GA":"GEORGIA","HI":"HAWAII","ID":"IDAHO","IL":"ILLINOIS",
    "IN":"INDIANA","IA":"IOWA","KS":"KANSAS","KY":"KENTUCKY","LA":"LOUISIANA",
    "ME":"MAINE","MD":"MARYLAND","MA":"MASSACHUSETTS","MI":"MICHIGAN","MN":"MINNESOTA",
    "MS":"MISSISSIPPI","MO":"MISSOURI","MT":"MONTANA","NE":"NEBRASKA","NV":"NEVADA",
    "NH":"NEW HAMPSHIRE","NJ":"NEW JERSEY","NM":"NEW MEXICO","NY":"NEW YORK",
    "NC":"NORTH CAROLINA","ND":"NORTH DAKOTA","OH":"OHIO","OK":"OKLAHOMA","OR":"OREGON",
    "PA":"PENNSYLVANIA","RI":"RHODE ISLAND","SC":"SOUTH CAROLINA","SD":"SOUTH DAKOTA",
    "TN":"TENNESSEE","TX":"TEXAS","UT":"UTAH","VT":"VERMONT","VA":"VIRGINIA",
    "WA":"WASHINGTON","WV":"WEST VIRGINIA","WI":"WISCONSIN","WY":"WYOMING",
}
def _extract_us_state(location: str) -> str | None:
    """Extract a USPS state code from the free-text NANPA Location column.

    Accepts either an explicit two-letter token ("CA", "NY, USA") or a full
    state name ("CALIFORNIA", "New York"). Returns None when no state can
    be identified.
    """
    if not location:
        return None
    up = location.upper()
    # Try explicit 2-char token first
    for token in up.replace(",", " ").split():
        if token in _USPS_STATES:
            return token
    # Full-name substring match. BUGFIX: scan longer names first so that
    # e.g. "WEST VIRGINIA" maps to WV instead of matching the embedded
    # "VIRGINIA" and wrongly returning VA (dict insertion order previously
    # checked VA before WV; AR/KS only survived by alphabetical luck).
    for code, full_name in sorted(
        _USPS_STATES.items(), key=lambda item: len(item[1]), reverse=True
    ):
        if full_name in up:
            return code
    return None
def _load_snapshot() -> str | None:
    """Read the bundled snapshot CSV, or return None when it is absent.

    EAFP: read directly and catch FileNotFoundError rather than the racy
    exists()/read_text() pair, which also costs an extra stat() call.
    """
    try:
        return SNAPSHOT_PATH.read_text(encoding="utf-8", errors="replace")
    except FileNotFoundError:
        return None
def import_rows(rows: Iterable[dict]) -> int:
    """Upsert rows into nanpa_area_codes. Returns count written.

    Each row is a dict with keys "npa", "state", "country" and an optional
    "note". On any database error the transaction is rolled back and the
    exception re-raised; the connection is always closed.
    """
    upsert_sql = """
        INSERT INTO nanpa_area_codes (npa, state, country, note, updated_at)
        VALUES (%s, %s, %s, %s, NOW())
        ON CONFLICT (npa) DO UPDATE SET
            state = EXCLUDED.state,
            country = EXCLUDED.country,
            note = EXCLUDED.note,
            updated_at = NOW()
    """
    params = [
        (row["npa"], row["state"], row["country"], row.get("note")) for row in rows
    ]
    conn = psycopg2.connect(DATABASE_URL)
    try:
        with conn.cursor() as cur:
            # execute_batch groups statements into far fewer server
            # round-trips than one execute() per row — psycopg2.extras is
            # already imported at module level for exactly this purpose.
            psycopg2.extras.execute_batch(cur, upsert_sql, params)
        conn.commit()
    except Exception:
        conn.rollback()
        raise
    finally:
        conn.close()
    return len(params)
def main() -> None:
    """CLI entry point: fetch (or load snapshot), parse, and upsert NPAs."""
    parser = argparse.ArgumentParser(description="Import NANPA area codes.")
    parser.add_argument(
        "--snapshot-only",
        action="store_true",
        help="Skip the network fetch; use the bundled snapshot CSV only.",
    )
    opts = parser.parse_args()

    if not DATABASE_URL:
        log.error("DATABASE_URL not set — aborting")
        sys.exit(1)

    # Prefer the live NANPA feed unless told otherwise; fall back to the
    # bundled snapshot, and bail out only when neither source is usable.
    csv_text: str | None = None if opts.snapshot_only else _fetch_csv(NANPA_URL)
    if csv_text is None:
        csv_text = _load_snapshot()
    if csv_text is None:
        log.error(
            "No NANPA data available — network fetch failed and no snapshot at %s",
            SNAPSHOT_PATH,
        )
        sys.exit(1)

    rows = list(_parse_nanpa_csv(csv_text))
    log.info("Parsed %d NPA rows from input", len(rows))

    # Append the curated NANP non-US set unconditionally — the public CSV
    # sometimes omits rows, and appending last means these win the upsert.
    for npa, (country, name) in _NANP_NON_US_BY_NPA.items():
        is_territory = npa in _US_TERRITORY_NPAS
        rows.append(
            {
                "npa": npa,
                "state": country if is_territory else None,
                "country": "US" if is_territory else country,
                "note": name,
            }
        )

    written = import_rows(rows)
    log.info("Upserted %d NPA records", written)


if __name__ == "__main__":
    main()