hc: NPPES endpoint mailable-inbox harvester (institutional/consumer, HISP-filtered)

Extracts cold-mailable provider inboxes from the NPPES endpoint_pfile, dropping
Direct/HISP gateway domains (not deliverable from a normal MTA). From the
June 2026 NPPES file: 88,728 institutional + 19,355 consumer mailable
candidates. Institutional is the warmup-safe slice (consumer webmail is held
back -- those providers' aggressive spam filtering would hurt the warming IP).
This commit is contained in:
justin 2026-06-12 20:03:12 -05:00
parent a648ae6e0a
commit 51a287271f

View file

@ -0,0 +1,101 @@
#!/usr/bin/env python3
"""Harvest the cold-mailable NPPES endpoint inboxes from the endpoint_pfile.
Reads the NPPES endpoint_pfile, classifies each endpoint email with the shared
healthcare_email_streams.classify (so it stays consistent with the warmup
import), and writes only the COLD-MAILABLE streams (institutional + consumer)
to a CSV. Direct/HISP and invalid endpoints are dropped (they can't be
cold-emailed from a normal MTA). Reports the universe sizes.
Usage:
    python3 scripts/harvest_nppes_mailable.py ENDPOINT_PFILE.csv [OUT.csv]
OUT.csv is optional and defaults to nppes_mailable.csv.
"""
import csv
import sys
from collections import defaultdict
sys.path.insert(0, "/opt/performancewest/scripts")
sys.path.insert(0, "scripts")
from healthcare_email_streams import classify # noqa: E402
# Zero-based column positions within each endpoint_pfile row, in file order.
NPI_COL = 0            # provider NPI
ENDPOINT_TYPE_COL = 1  # endpoint type; not read by the code visible in this file
EMAIL_COL = 3          # endpoint value that is classified as an email address
# Direct Secure Messaging (HISP) domains are NOT cold-mailable from a normal MTA
# -- they route only inside DirectTrust and will fail/bounce. The stream
# classifier's "institutional" bucket leaks these (e.g. upmcdirect.com,
# *.providencedirect.org, *shdirect.org, epicdirect.promedica.org), so we filter
# them out here by the unmistakable HISP domain patterns.
_HISP_MARKERS = ("direct", "hisp", "secure", "directtrust")
def is_hisp_domain(domain: str) -> bool:
d = domain.lower()
# Any domain whose label contains a Direct/HISP marker word, or a known
# *.org/.com Direct gateway shape. "direct" as a substring catches the vast
# majority (xdirect.org, directX.com, *.providencedirect.org, etc.).
return any(m in d for m in _HISP_MARKERS)
# Common real consumer-inbox providers -- always genuinely mailable.
_CONSUMER_DOMAINS = {
"gmail.com", "yahoo.com", "outlook.com", "hotmail.com", "aol.com",
"icloud.com", "msn.com", "live.com", "comcast.net", "att.net",
"sbcglobal.net", "verizon.net", "me.com", "ymail.com", "protonmail.com",
}
def main():
    """Harvest cold-mailable endpoint emails and print a classification report.

    Command line: ``ENDPOINT_PFILE.csv [OUT.csv]``. The first argument is the
    NPPES endpoint file to read; the second is the CSV to write and defaults
    to ``nppes_mailable.csv``. Exits with a usage message when the input path
    is missing.

    Every row with a non-empty NPI and endpoint is passed to ``classify``.
    Rows classified as "institutional" or "consumer" are kept unless their
    domain matches ``is_hisp_domain``; kept rows are de-duplicated on
    (npi, lower-cased email) and written as ``npi,email,stream``. Per-stream
    counts and the 15 most frequent kept domains are printed to stdout.
    """
    if len(sys.argv) < 2:
        # Without this guard a missing argument surfaces as a bare IndexError.
        sys.exit("usage: harvest_nppes_mailable.py ENDPOINT_PFILE.csv [OUT.csv]")
    src = sys.argv[1]
    out = sys.argv[2] if len(sys.argv) > 2 else "nppes_mailable.csv"

    stats = defaultdict(int)    # stream label (plus "hisp_filtered") -> rows
    domains = defaultdict(int)  # kept domain -> number of kept rows
    seen = set()                # (npi, lower-cased email) pairs already kept
    mailable = []               # (npi, email, stream) rows to write

    # latin-1 maps every byte to a character, so decoding never fails.
    with open(src, newline="", encoding="latin-1") as f:
        reader = csv.reader(f)
        next(reader, None)  # skip the header row
        for row in reader:
            if len(row) <= EMAIL_COL:
                continue  # short row: no endpoint column
            npi = row[NPI_COL].strip().strip('"')
            ep = row[EMAIL_COL].strip().strip('"')
            if not npi or not ep:
                continue

            stream = classify(ep)
            # Counted before the HISP filter and de-duplication, so these
            # totals describe the raw classification of the file.
            stats[stream] += 1
            if stream not in ("institutional", "consumer"):
                continue

            dom = ep.rsplit("@", 1)[-1].lower()
            # Drop Direct/HISP gateways that leak into 'institutional'.
            if is_hisp_domain(dom):
                stats["hisp_filtered"] += 1
                continue

            key = (npi, ep.lower())
            if key in seen:
                continue
            seen.add(key)
            mailable.append((npi, ep, stream))
            domains[dom] += 1

    # Pin the output encoding: the input may carry non-ASCII characters, and
    # the locale default could fail to encode them or vary between hosts.
    with open(out, "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(["npi", "email", "stream"])
        writer.writerows(mailable)

    print("=== NPPES endpoint classification ===")
    for k in sorted(stats, key=lambda k: -stats[k]):
        print(f" {k:14} {stats[k]:>8,}")
    print()
    print(f"COLD-MAILABLE (institutional+consumer), deduped: {len(mailable):,}")
    print(f" -> wrote {out}")
    print()
    print("Top mailable domains:")
    for dom, n in sorted(domains.items(), key=lambda x: -x[1])[:15]:
        print(f" {dom:30} {n:>7,}")


if __name__ == "__main__":
    main()