hc: NPPES endpoint mailable-inbox harvester (institutional/consumer, HISP-filtered)
Extracts cold-mailable provider inboxes from the NPPES endpoint_pfile, dropping Direct/HISP gateway domains (not deliverable from a normal MTA). From the June 2026 NPPES file: 88,728 institutional + 19,355 consumer mailable candidates. Institutional is the warmup-safe slice (consumer webmail is held back -- aggressive filtering would hurt the warming IP).
This commit is contained in:
parent
a648ae6e0a
commit
51a287271f
1 changed files with 101 additions and 0 deletions
101
scripts/harvest_nppes_mailable.py
Normal file
101
scripts/harvest_nppes_mailable.py
Normal file
|
|
@ -0,0 +1,101 @@
|
|||
#!/usr/bin/env python3
|
||||
"""Harvest the cold-mailable NPPES endpoint inboxes from the endpoint_pfile.
|
||||
|
||||
Reads the NPPES endpoint_pfile, classifies each endpoint email with the shared
|
||||
healthcare_email_streams.classify (so it stays consistent with the warmup
|
||||
import), and writes only the COLD-MAILABLE streams (institutional + consumer)
|
||||
to a CSV. Direct/HISP and invalid endpoints are dropped (they can't be
|
||||
cold-emailed from a normal MTA). Reports the universe sizes.
|
||||
|
||||
Usage:
|
||||
    python3 scripts/harvest_nppes_mailable.py ENDPOINT_PFILE.csv [OUT.csv]
|
||||
"""
|
||||
import csv
|
||||
import sys
|
||||
from collections import defaultdict
|
||||
|
||||
sys.path.insert(0, "/opt/performancewest/scripts")
|
||||
sys.path.insert(0, "scripts")
|
||||
from healthcare_email_streams import classify # noqa: E402
|
||||
|
||||
# 0-based column positions within one endpoint_pfile row, in column order.
NPI_COL = 0  # read by main() as the provider NPI
# NOTE(review): ENDPOINT_TYPE_COL is not read by the code visible in this
# script -- confirm whether it is still needed.
ENDPOINT_TYPE_COL = 1
EMAIL_COL = 3  # read by main() as the endpoint (email) value
|
||||
|
||||
# Direct Secure Messaging (HISP) addresses only route inside DirectTrust, so a
# normal MTA cannot deliver to them -- they fail or bounce. The stream
# classifier's "institutional" bucket lets some of these through (for example
# upmcdirect.com, *.providencedirect.org, *shdirect.org and
# epicdirect.promedica.org), so they are screened out here by marker words
# that show up in HISP gateway domain names.
_HISP_MARKERS = (
    "direct",
    "hisp",
    "secure",
    "directtrust",
)


def is_hisp_domain(domain: str) -> bool:
    """Return True when *domain* contains any Direct/HISP marker word.

    The check is case-insensitive and is a plain substring test against the
    whole domain string, so a marker embedded anywhere in any label matches
    (xdirect.org, directX.com, *.providencedirect.org, ...).
    """
    lowered = domain.lower()
    for marker in _HISP_MARKERS:
        if marker in lowered:
            return True
    return False
|
||||
|
||||
|
||||
# Common real consumer-inbox providers -- always genuinely mailable.
# Kept in alphabetical order so additions are easy to spot in a diff.
# NOTE(review): nothing in the visible part of this script reads this set --
# confirm whether it is still needed.
_CONSUMER_DOMAINS = {
    "aol.com", "att.net", "comcast.net",
    "gmail.com", "hotmail.com", "icloud.com",
    "live.com", "me.com", "msn.com",
    "outlook.com", "protonmail.com", "sbcglobal.net",
    "verizon.net", "yahoo.com", "ymail.com",
}
|
||||
|
||||
|
||||
def main():
    """Harvest cold-mailable endpoint emails and print a summary report.

    Command line: ``ENDPOINT_PFILE.csv [OUT.csv]``. The output path defaults
    to ``nppes_mailable.csv``. When the input path is missing, a usage line
    is printed to stderr and the process exits with status 2.

    Every data row with a non-empty NPI and endpoint value is classified
    with ``classify``. Rows in the "institutional" or "consumer" stream are
    kept unless ``is_hisp_domain`` flags their domain, de-duplicated on
    (npi, lower-cased email), and written to the output CSV with the header
    ``npi,email,stream``. Per-stream counts and the top 15 kept domains are
    printed to stdout.
    """
    if len(sys.argv) < 2:
        # Fail with a readable message instead of an IndexError traceback.
        print(
            "usage: harvest_nppes_mailable.py ENDPOINT_PFILE.csv [OUT.csv]",
            file=sys.stderr,
        )
        sys.exit(2)

    src = sys.argv[1]
    out = sys.argv[2] if len(sys.argv) > 2 else "nppes_mailable.csv"

    stats = defaultdict(int)    # stream name (plus "hisp_filtered") -> rows
    domains = defaultdict(int)  # kept domain -> kept rows
    seen = set()                # (npi, lower-cased email) dedupe keys
    mailable = []               # (npi, email, stream) rows to write

    # latin-1 maps every byte to a character, so decoding never fails.
    with open(src, newline="", encoding="latin-1") as f:
        reader = csv.reader(f)
        next(reader, None)  # skip the header row
        for row in reader:
            if len(row) <= EMAIL_COL:
                continue  # short/malformed row: no endpoint column
            npi = row[NPI_COL].strip().strip('"')
            ep = row[EMAIL_COL].strip().strip('"')
            if not npi or not ep:
                continue

            stream = classify(ep)
            # Counted before the HISP filter, so the per-stream totals
            # describe the classifier's view of the whole file.
            stats[stream] += 1
            if stream not in ("institutional", "consumer"):
                continue

            dom = ep.rsplit("@", 1)[-1].lower()
            # Drop Direct/HISP gateways that leak into 'institutional'.
            if is_hisp_domain(dom):
                stats["hisp_filtered"] += 1
                continue

            key = (npi, ep.lower())
            if key in seen:
                continue
            seen.add(key)
            mailable.append((npi, ep, stream))
            domains[dom] += 1

    # Explicit UTF-8: the input may carry non-ASCII characters, and the
    # locale's default encoding could otherwise raise UnicodeEncodeError
    # only after the whole input has been scanned.
    with open(out, "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(["npi", "email", "stream"])
        writer.writerows(mailable)

    print("=== NPPES endpoint classification ===")
    for k in sorted(stats, key=lambda k: -stats[k]):
        print(f"  {k:14} {stats[k]:>8,}")
    print()
    print(f"COLD-MAILABLE (institutional+consumer), deduped: {len(mailable):,}")
    print(f"  -> wrote {out}")
    print()
    print("Top mailable domains:")
    for dom, n in sorted(domains.items(), key=lambda x: -x[1])[:15]:
        print(f"  {dom:30} {n:>7,}")


if __name__ == "__main__":
    main()
|
||||
Loading…
Add table
Add a link
Reference in a new issue