fix(npi): two-tier Direct/HISP classifier so real Direct-Primary-Care/counseling practices stay institutional (was wrongly parked); add classifier unit tests

This commit is contained in:
justin 2026-06-06 00:09:42 -05:00
parent c3b2c4e89a
commit 68333148e6
2 changed files with 98 additions and 4 deletions

View file

@ -28,13 +28,34 @@ EMAIL_RE = re.compile(r"^[^@\s]+@[^@\s]+\.[^@\s]+$")
# DirectTrust network and will NOT accept normal cold email. Substring match on
# the domain. Verified against the May 2026 endpoint_pfile top domains (catches
# direct*.org, *.medicity.net (NextGen HISP), Surescripts, Updox, MaxMD, etc.).
#
# Two tiers, because a naive substring "direct" wrongly parks real practices
# whose registrable domain merely CONTAINS the word (e.g. "Direct Primary Care"
# clinics like arthurdirectcare.com, valleydirectprimarycare.com, or counseling
# practices like newdirectionscounselingservices.com). HISP gateways instead
# put "direct"/"hisp" as a full DNS LABEL (direct.foo.org, hisp.foo.org).
#
#   DIRECT_VENDOR_MARKERS  unambiguous vendor/HISP tokens -> plain substring.
#   DIRECT_LABEL_WORDS     generic words -> only match as a whole DNS label or
#                          as a hyphen-delimited leading token of a label
#                          (e.g. "direct-xyz").
#
# Tier 1: vendor/HISP tokens matched as a plain substring of the lowercased
# domain. Entries must be lowercase and unique.
DIRECT_VENDOR_MARKERS: tuple[str, ...] = (
    "medicity.net", "surescripts", "updox", "maxmd",
    "secureexchange", "directaddress", "directplus", "ehrdirect",
    "mayoclinicmsg", "allscriptsdirect", "eclinicaldirect", "phicure",
    "directtrust", "secure-health", "directnppes", "nextgenshare",
    "cernerdirect", "directbygreenway", "directathenahealth",
    "epichosted", "medalliesdirect", "directak.net", "gadirect", "kydirect",
    "directmedgenehr", "emadirect", "compulinkdirect", "elationemr",
    "directwellstar", "directhisp", "gwaydirect", "directmail",
    "direct-ci", "direct-ehr", "direct-srhs",
)
# Tier 2: generic words that mark a Direct/HISP endpoint ONLY when they form a
# whole DNS label (or its hyphen-delimited leading token). Checked
# label-by-label, not as a raw substring, so "arthurdirectcare.com" /
# "newdirection*.com" stay institutional.
DIRECT_LABEL_WORDS: tuple[str, ...] = ("direct", "hisp")
# Back-compat alias for the old single-tier name. NOTE: it no longer contains
# the generic "direct"/"hisp" words; those now live in DIRECT_LABEL_WORDS.
DIRECT_MARKERS: tuple[str, ...] = DIRECT_VENDOR_MARKERS
# Consumer webmail: real inboxes a clinician reads, but reputation-sensitive
# (Gmail/Microsoft cold-mail heuristics). These ride the trucking-discipline
# (low-cap) stream, never the hot institutional one.
@ -62,7 +83,19 @@ def domain_of(email: str) -> str:
def is_direct_secure(domain: str) -> bool:
    """Return True when *domain* looks like a Direct/HISP messaging endpoint.

    Matching is case-insensitive and happens in two tiers:

    1. Any entry of ``DIRECT_VENDOR_MARKERS`` appearing anywhere in the
       domain (plain substring match).
    2. Any entry of ``DIRECT_LABEL_WORDS`` that is a whole dot-separated
       label of the domain, or that starts a label and is immediately
       followed by a hyphen (e.g. ``direct.foo.org``, ``direct-xyz.foo.org``).

    A generic word merely embedded in a label (``arthurdirectcare.com``,
    ``newdirectionscounseling.com``) does not match tier 2.
    """
    d = domain.lower()
    # Tier 1: unambiguous vendor/HISP tokens anywhere in the domain.
    if any(marker in d for marker in DIRECT_VENDOR_MARKERS):
        return True
    # Tier 2: generic words only count as a whole label or as the
    # hyphen-delimited leading token of one. str.startswith accepts a tuple,
    # so every "<word>-" prefix is checked in a single call per label.
    hyphen_prefixes = tuple(word + "-" for word in DIRECT_LABEL_WORDS)
    return any(
        label in DIRECT_LABEL_WORDS or label.startswith(hyphen_prefixes)
        for label in d.split(".")
    )
def is_consumer(domain: str) -> bool:

View file

@ -0,0 +1,61 @@
"""Unit tests for the healthcare email-stream classifier.
Run: python3 -m scripts.test_healthcare_email_streams (or pytest)
Guards the subtle case that motivated the two-tier DIRECT detection: a naive
substring "direct" match wrongly parked real "Direct Primary Care" / counseling
practices (registrable domain merely contains the word) into the undeliverable
HISP pile. Direct/HISP gateways instead use "direct"/"hisp" as a whole DNS label.
"""
from scripts.healthcare_email_streams import classify
# (email, expected stream) pairs fed to classify() by test_classify().
CASES = [
    # --- Practices whose domain merely contains "direct" -> institutional ---
    ("chelsea@arthurdirectcare.com", "institutional"),
    ("bassi@valleydirectprimarycare.com", "institutional"),
    ("megan@newdirectionscounselingservices.com", "institutional"),
    ("john@islanddirectprimarycare.com", "institutional"),
    ("kamlesh@mydirectcare.com", "institutional"),
    ("allison@truedirectioncounseling.com", "institutional"),
    ("marty@holtondirectcare.com", "institutional"),
    ("sbass@newdirectionsnonemergency.org", "institutional"),
    ("info@consumerdirectcare.com", "institutional"),
    ("x@rehabdirectives.com", "institutional"),
    # --- Genuine Direct/HISP gateways -> direct (parked) ---
    ("x@direct.novanthealth.org", "direct"),
    ("x@CarolinasHealthcareSystem.direct-ci.com", "direct"),
    ("x@cfp.directbygreenway.com", "direct"),
    ("x@foo.4693.direct.athenahealth.com", "direct"),
    ("x@directHISP.wakemed.org", "direct"),
    ("x@boss.directak.net", "direct"),
    ("x@hisp.bryanhealth.org", "direct"),
    ("x@ehrdirect.mayoclinicmsg.org", "direct"),
    ("x@directaddress.net", "direct"),
    ("x@negaidx.allscriptsdirect.net", "direct"),
    ("x@lickingmemorial.medicity.net", "direct"),
    ("x@foo.nextgenshare.com", "direct"),
    # --- Consumer / institutional / excluded / invalid ---
    ("drsmith@gmail.com", "consumer"),
    ("info@smallclinic.com", "institutional"),
    ("x@somehospital.va.gov", "excluded"),
    ("x@base.health.mil", "excluded"),
    ("not-an-email", "invalid"),
    ("", "invalid"),
]
def test_classify():
    """Run classify() over every entry in CASES and fail on any mismatch.

    All cases are evaluated before asserting, so one failure message lists
    every misclassified address together with the stream it got and the
    stream that was wanted.
    """
    outcomes = [(address, classify(address), wanted) for address, wanted in CASES]
    failures = [row for row in outcomes if row[1] != row[2]]
    report = "; ".join(f"{e} -> {g} (want {x})" for e, g, x in failures)
    assert not failures, "Misclassified: " + report
if __name__ == "__main__":
    # Script entry point: run the table-driven check, then report how many
    # cases were exercised (only reached when no assertion fired).
    test_classify()
    case_count = len(CASES)
    print(f"OK: all {case_count} classifier cases pass")