From 68333148e69bcc4b58a2be31e673312c6f03521d Mon Sep 17 00:00:00 2001
From: justin
Date: Sat, 6 Jun 2026 00:09:42 -0500
Subject: [PATCH] fix(npi): two-tier Direct/HISP classifier so real
 Direct-Primary-Care/counseling practices stay institutional (was wrongly
 parked); add classifier unit tests

---
 scripts/healthcare_email_streams.py      | 41 ++++++++++++++--
 scripts/test_healthcare_email_streams.py | 61 ++++++++++++++++++++++++
 2 files changed, 98 insertions(+), 4 deletions(-)
 create mode 100644 scripts/test_healthcare_email_streams.py

diff --git a/scripts/healthcare_email_streams.py b/scripts/healthcare_email_streams.py
index 342b7ab..7da6b3f 100644
--- a/scripts/healthcare_email_streams.py
+++ b/scripts/healthcare_email_streams.py
@@ -28,13 +28,34 @@ EMAIL_RE = re.compile(r"^[^@\s]+@[^@\s]+\.[^@\s]+$")
 # DirectTrust network and will NOT accept normal cold email. Substring match on
 # the domain. Verified against the May 2026 endpoint_pfile top domains (catches
 # direct*.org, *.medicity.net (NextGen HISP), Surescripts, Updox, MaxMD, etc.).
-DIRECT_MARKERS: tuple[str, ...] = (
-    "direct", "hisp", "medicity.net", "surescripts", "updox", "maxmd",
+#
+# Two tiers, because a naive substring "direct" wrongly parks real practices
+# whose registrable domain merely CONTAINS the word (e.g. "Direct Primary Care"
+# clinics like arthurdirectcare.com, valleydirectprimarycare.com, or counseling
+# practices like newdirectionscounselingservices.com). HISP gateways instead
+# use "direct"/"hisp" as a full DNS LABEL (direct.foo.org) or a vendor token.
+#
+# DIRECT_VENDOR_MARKERS  vendor/HISP-specific tokens -> plain substring match.
+# DIRECT_LABEL_WORDS     generic words -> only match as a whole DNS label or
+#                        as a hyphen-terminated label prefix (e.g. direct-ci).
+DIRECT_VENDOR_MARKERS: tuple[str, ...] = (
+    "medicity.net", "surescripts", "updox", "maxmd",
     "secureexchange", "directaddress", "directplus", "ehrdirect",
     "mayoclinicmsg", "allscriptsdirect", "eclinicaldirect", "phicure",
-    "directtrust", "secure-health", "directnppes",
+    "directtrust", "secure-health", "directnppes", "nextgenshare",
+    "cernerdirect", "directbygreenway", "directathenahealth",
+    "epichosted", "medalliesdirect", "directak.net", "gadirect", "kydirect",
+    "directmedgenehr", "emadirect", "compulinkdirect", "elationemr",
+    "directwellstar", "directhisp", "gwaydirect", "directmail",
+    "direct-ci", "direct-ehr", "direct-srhs",
 )
 
+# Generic words that mark a Direct/HISP endpoint ONLY when they form a whole DNS
+# label (or a hyphen-terminated prefix of one). Checked label-by-label, not as a
+# raw substring, so "arthurdirectcare.com" / "newdirection*.com" stay institutional.
+DIRECT_LABEL_WORDS: tuple[str, ...] = ("direct", "hisp")
+DIRECT_MARKERS = DIRECT_VENDOR_MARKERS  # back-compat alias (tier-1 tokens only)
+
 # Consumer webmail: real inboxes a clinician reads, but reputation-sensitive
 # (Gmail/Microsoft cold-mail heuristics). These ride the trucking-discipline
 # (low-cap) stream, never the hot institutional one.
@@ -62,7 +83,19 @@ def domain_of(email: str) -> str:
 
 def is_direct_secure(domain: str) -> bool:
     d = domain.lower()
-    return any(m in d for m in DIRECT_MARKERS)
+    # Tier 1: vendor/HISP tokens anywhere in the domain. NOTE(review): short
+    # tokens ("gadirect", "kydirect") also hit e.g. "omegadirectcare.com".
+    if any(m in d for m in DIRECT_VENDOR_MARKERS):
+        return True
+    # Tier 2: a generic word ("direct"/"hisp") only counts when it is a whole
+    # DNS label or a hyphen-terminated label prefix ("direct.foo.org",
+    # "hisp-east.foo.org"), NOT when merely embedded in a label such as
+    # "arthurdirectcare.com" or "newdirectionscounseling.com".
+    return any(
+        lab == w or lab.startswith(w + "-")
+        for lab in d.split(".")
+        for w in DIRECT_LABEL_WORDS
+    )
 
 
 def is_consumer(domain: str) -> bool:
diff --git a/scripts/test_healthcare_email_streams.py b/scripts/test_healthcare_email_streams.py
new file mode 100644
index 0000000..282a344
--- /dev/null
+++ b/scripts/test_healthcare_email_streams.py
@@ -0,0 +1,61 @@
+"""Unit tests for the healthcare email-stream classifier.
+
+Run: python3 -m scripts.test_healthcare_email_streams (or pytest)
+
+Guards the subtle case that motivated the two-tier DIRECT detection: a naive
+substring "direct" match wrongly parked real "Direct Primary Care" / counseling
+practices (registrable domain merely contains the word) into the undeliverable
+HISP pile. Direct/HISP gateways instead use "direct"/"hisp" as a whole DNS label.
+"""
+
+from scripts.healthcare_email_streams import classify
+
+CASES = [
+    # Real Direct-Primary-Care / counseling practices -> institutional
+    ("chelsea@arthurdirectcare.com", "institutional"),
+    ("bassi@valleydirectprimarycare.com", "institutional"),
+    ("megan@newdirectionscounselingservices.com", "institutional"),
+    ("john@islanddirectprimarycare.com", "institutional"),
+    ("kamlesh@mydirectcare.com", "institutional"),
+    ("allison@truedirectioncounseling.com", "institutional"),
+    ("marty@holtondirectcare.com", "institutional"),
+    ("sbass@newdirectionsnonemergency.org", "institutional"),
+    ("info@consumerdirectcare.com", "institutional"),
+    ("x@rehabdirectives.com", "institutional"),
+    # Genuine Direct/HISP gateways -> direct (parked)
+    ("x@direct.novanthealth.org", "direct"),
+    ("x@CarolinasHealthcareSystem.direct-ci.com", "direct"),
+    ("x@cfp.directbygreenway.com", "direct"),
+    ("x@foo.4693.direct.athenahealth.com", "direct"),
+    ("x@directHISP.wakemed.org", "direct"),
+    ("x@boss.directak.net", "direct"),
+    ("x@hisp.bryanhealth.org", "direct"),
+    ("x@ehrdirect.mayoclinicmsg.org", "direct"),
+    ("x@directaddress.net", "direct"),
+    ("x@negaidx.allscriptsdirect.net", "direct"),
+    ("x@lickingmemorial.medicity.net", "direct"),
+    ("x@foo.nextgenshare.com", "direct"),
+    # Consumer / institutional / excluded / invalid
+    ("drsmith@gmail.com", "consumer"),
+    ("info@smallclinic.com", "institutional"),
+    ("x@somehospital.va.gov", "excluded"),
+    ("x@base.health.mil", "excluded"),
+    ("not-an-email", "invalid"),
+    ("", "invalid"),
+]
+
+
+def test_classify():
+    failures = []
+    for email, expected in CASES:
+        got = classify(email)
+        if got != expected:
+            failures.append((email, got, expected))
+    assert not failures, "Misclassified: " + "; ".join(
+        f"{e} -> {g} (want {x})" for e, g, x in failures
+    )
+
+
+if __name__ == "__main__":
+    test_classify()
+    print(f"OK: all {len(CASES)} classifier cases pass")