# CMS/OIG exports are not always clean UTF-8 (stray latin-1 bytes like 0xa0).
# Decode leniently so a few bad bytes don't abort a multi-hundred-MB load.
_LATIN1_FALLBACK = "npi_latin1_fallback"


def _decode_invalid_as_latin1(exc):
    """Codec error handler: decode the undecodable bytes as latin-1 and resume.

    Only decoding errors are handled; any other codec error is re-raised.
    Returns the ``(replacement_text, resume_position)`` pair that the
    ``codecs`` error-handler protocol requires.
    """
    if not isinstance(exc, UnicodeDecodeError):
        raise exc
    # latin-1 maps every byte value to a code point, so this cannot fail.
    return exc.object[exc.start:exc.end].decode("latin-1"), exc.end


def open_csv(path):
    """Open a CSV export for reading as text, tolerating non-UTF-8 bytes.

    The file is decoded as ``utf-8-sig`` (a leading BOM is dropped) with
    ``newline=""`` so the ``csv`` module sees line endings untouched.
    Bytes that are not valid UTF-8 are decoded as latin-1 instead of being
    replaced with U+FFFD: that keeps the original character (0xe9 -> "é")
    and turns 0xa0 into a no-break space, which ``str.strip()`` removes, so
    a padded identifier still passes a later ``isdigit()`` check.

    Used as ``with open_csv(path) as f:`` by load_revalidation,
    load_exclusions and load_optout, each of which feeds the handle to
    ``csv.DictReader``.

    Args:
        path: Filesystem path of the CSV file.

    Returns:
        A text-mode file object; the caller is responsible for closing it
        (normally via ``with``).

    Raises:
        OSError: If the file cannot be opened.
    """
    import codecs  # local import: the file's import block is not visible here

    # Re-registering under the same name is harmless and keeps this helper
    # self-contained (no reliance on import-time side effects).
    codecs.register_error(_LATIN1_FALLBACK, _decode_invalid_as_latin1)
    return open(path, newline="", encoding="utf-8-sig", errors=_LATIN1_FALLBACK)