fix(npi): lenient CSV decoding in companion loader (CMS exports have stray latin-1 bytes)

This commit is contained in:
justin 2026-06-05 01:38:02 -05:00
parent 157c7a2571
commit e32193352b

View file

@ -27,6 +27,11 @@ from psycopg2.extras import execute_values
DATABASE_URL = os.environ.get("DATABASE_URL", "postgresql://pw:pw@localhost:5432/performancewest") DATABASE_URL = os.environ.get("DATABASE_URL", "postgresql://pw:pw@localhost:5432/performancewest")
# CMS/OIG exports are not always clean UTF-8 (stray latin-1 bytes like 0xa0).
# Decode leniently so a few bad bytes don't abort a multi-hundred-MB load.
def open_csv(path):
    """Open a CSV export for reading, tolerating stray non-UTF-8 bytes.

    The file is decoded as UTF-8 (a leading BOM is skipped via
    ``utf-8-sig``). Any byte that is not valid UTF-8 is decoded as latin-1
    instead of being turned into U+FFFD, so a stray 0xa0 becomes a real
    non-breaking space (which ``str.strip()`` removes) and accented
    characters survive. Substituting U+FFFD would leave a non-whitespace
    character in the field and make digit/length validation of otherwise
    good values fail.

    Args:
        path: Path of the CSV file to open.

    Returns:
        A text-mode file object opened with ``newline=""`` as the ``csv``
        module requires; usable as a context manager.

    Raises:
        OSError: If the file cannot be opened.
    """
    import codecs

    def _latin1_fallback(err):
        # Only decoding errors are expected here; anything else is a real bug.
        if not isinstance(err, UnicodeDecodeError):
            raise err
        # latin-1 maps every byte 1:1 to a code point, so this cannot fail.
        return err.object[err.start:err.end].decode("latin-1"), err.end

    # Re-registering under the same name is harmless, so no guard is needed.
    codecs.register_error("csv_latin1_fallback", _latin1_fallback)
    return open(
        path,
        newline="",
        encoding="utf-8-sig",
        errors="csv_latin1_fallback",
    )
def parse_date(s): def parse_date(s):
if not s: if not s:
@ -49,7 +54,7 @@ def clean_npi(s):
def load_revalidation(conn, path): def load_revalidation(conn, path):
rows = [] rows = []
with open(path, newline="", encoding="utf-8-sig") as f: with open_csv(path) as f:
for r in csv.DictReader(f): for r in csv.DictReader(f):
npi = (r.get("National Provider Identifier") or "").strip() npi = (r.get("National Provider Identifier") or "").strip()
if not (npi.isdigit() and len(npi) == 10): if not (npi.isdigit() and len(npi) == 10):
@ -84,7 +89,7 @@ def load_revalidation(conn, path):
def load_exclusions(conn, path): def load_exclusions(conn, path):
rows = [] rows = []
with open(path, newline="", encoding="utf-8-sig") as f: with open_csv(path) as f:
for r in csv.DictReader(f): for r in csv.DictReader(f):
rows.append(( rows.append((
clean_npi(r.get("NPI")), clean_npi(r.get("NPI")),
@ -114,7 +119,7 @@ def load_exclusions(conn, path):
def load_optout(conn, path): def load_optout(conn, path):
rows = [] rows = []
with open(path, newline="", encoding="utf-8-sig") as f: with open_csv(path) as f:
for r in csv.DictReader(f): for r in csv.DictReader(f):
npi = (r.get("npi") or r.get("NPI") or "").strip() npi = (r.get("npi") or r.get("NPI") or "").strip()
if not (npi.isdigit() and len(npi) == 10): if not (npi.isdigit() and len(npi) == 10):