fix(npi): two-tier Direct/HISP classifier so real Direct-Primary-Care/counseling practices stay institutional (was wrongly parked); add classifier unit tests

2026-06-06 00:09:42 -05:00 · 2026-06-06 00:09:42 -05:00 · 68333148e6
commit 68333148e6
parent c3b2c4e89a
2 changed files with 98 additions and 4 deletions
--- a/scripts/healthcare_email_streams.py
+++ b/scripts/healthcare_email_streams.py
@@ -28,13 +28,34 @@ EMAIL_RE = re.compile(r"^[^@\s]+@[^@\s]+\.[^@\s]+$")
 # DirectTrust network and will NOT accept normal cold email. Substring match on
 # the domain. Verified against the May 2026 endpoint_pfile top domains (catches
 # direct*.org, *.medicity.net (NextGen HISP), Surescripts, Updox, MaxMD, etc.).
-DIRECT_MARKERS: tuple[str, ...] = (
-    "direct", "hisp", "medicity.net", "surescripts", "updox", "maxmd",
+#
+# Two tiers, because a naive substring "direct" wrongly parks real practices
+# whose registrable domain merely CONTAINS the word (e.g. "Direct Primary Care"
+# clinics like arthurdirectcare.com, valleydirectprimarycare.com, or counseling
+# practices like newdirectionscounselingservices.com). HISP gateways instead
+# put "direct"/"hisp" as a full DNS LABEL (direct.foo.org, hisp.foo.org).
+# NOTE(review): short vendor tokens over-match too ("gadirect" in omegadirectcare.com).
+#   DIRECT_VENDOR_MARKERS  unambiguous vendor/HISP tokens -> plain substring.
+#   DIRECT_LABEL_WORDS     generic words -> only match as a whole DNS label or
+#                          as a label prefix followed by "-" (direct-ci, direct-ehr).
+DIRECT_VENDOR_MARKERS: tuple[str, ...] = (
+    "medicity.net", "surescripts", "updox", "maxmd",
    "secureexchange", "directaddress", "directplus", "ehrdirect",
    "mayoclinicmsg", "allscriptsdirect", "eclinicaldirect", "phicure",
-    "directtrust", "secure-health", "directnppes",
+    "directtrust", "secure-health", "directnppes", "nextgenshare",
+    "cernerdirect", "directbygreenway", "directathenahealth",
+    "epichosted", "medalliesdirect", "directak.net", "gadirect", "kydirect",
+    "directmedgenehr", "emadirect", "compulinkdirect", "elationemr",
+    "directwellstar", "directhisp", "gwaydirect", "directmail",
+    "direct-ci", "direct-ehr", "direct-srhs",
 )

+# Generic words that mark a Direct/HISP endpoint ONLY when they form a whole DNS
+# label or a label's hyphen-led prefix ("direct-ci"). Checked label-by-label, not
+# as a raw substring, so "arthurdirectcare.com" / "newdirection*.com" stay institutional.
+DIRECT_LABEL_WORDS: tuple[str, ...] = ("direct", "hisp")
+DIRECT_MARKERS = DIRECT_VENDOR_MARKERS  # back-compat alias; no longer holds "direct"/"hisp"
+
 # Consumer webmail: real inboxes a clinician reads, but reputation-sensitive
 # (Gmail/Microsoft cold-mail heuristics). These ride the trucking-discipline
 # (low-cap) stream, never the hot institutional one.
@@ -62,7 +83,19 @@ def domain_of(email: str) -> str:

 def is_direct_secure(domain: str) -> bool:
     d = domain.lower()
-    return any(m in d for m in DIRECT_MARKERS)
+    # Tier 1 -- vendor/HISP tokens match as raw substrings anywhere in the
+    # lower-cased domain.
+    if any(token in d for token in DIRECT_VENDOR_MARKERS):
+        return True
+    # Tier 2 -- generic words ("direct"/"hisp") count only as a whole DNS label
+    # or as a label's hyphen-led prefix ("direct.foo.org", "direct-x.foo.org").
+    # A word merely embedded in a label ("arthurdirectcare.com",
+    # "newdirectionscounseling.com") does not match.
+    return any(
+        label == word or label.startswith(word + "-")
+        for label in d.split(".")
+        for word in DIRECT_LABEL_WORDS
+    )


 def is_consumer(domain: str) -> bool:
--- a/scripts/test_healthcare_email_streams.py
+++ b/scripts/test_healthcare_email_streams.py
@@ -0,0 +1,61 @@
+"""Unit tests for the healthcare email-stream classifier.
+
+Run: python3 -m scripts.test_healthcare_email_streams   (or pytest)
+
+Guards the subtle case that motivated the two-tier DIRECT detection: a naive
+substring "direct" match wrongly parked real "Direct Primary Care" / counseling
+practices (registrable domain merely contains the word) into the undeliverable
+HISP pile. Direct/HISP gateways instead use "direct"/"hisp" as a whole DNS label.
+"""
+
+from scripts.healthcare_email_streams import classify
+
+CASES = [
+    # Real Direct-Primary-Care / counseling practices -> institutional
+    ("chelsea@arthurdirectcare.com", "institutional"),
+    ("bassi@valleydirectprimarycare.com", "institutional"),
+    ("megan@newdirectionscounselingservices.com", "institutional"),
+    ("john@islanddirectprimarycare.com", "institutional"),
+    ("kamlesh@mydirectcare.com", "institutional"),
+    ("allison@truedirectioncounseling.com", "institutional"),
+    ("marty@holtondirectcare.com", "institutional"),
+    ("sbass@newdirectionsnonemergency.org", "institutional"),
+    ("info@consumerdirectcare.com", "institutional"),
+    ("x@rehabdirectives.com", "institutional"),
+    # Genuine Direct/HISP gateways -> direct (parked)
+    ("x@direct.novanthealth.org", "direct"),
+    ("x@CarolinasHealthcareSystem.direct-ci.com", "direct"),
+    ("x@cfp.directbygreenway.com", "direct"),
+    ("x@foo.4693.direct.athenahealth.com", "direct"),
+    ("x@directHISP.wakemed.org", "direct"),
+    ("x@boss.directak.net", "direct"),
+    ("x@hisp.bryanhealth.org", "direct"),
+    ("x@ehrdirect.mayoclinicmsg.org", "direct"),
+    ("x@directaddress.net", "direct"),
+    ("x@negaidx.allscriptsdirect.net", "direct"),
+    ("x@lickingmemorial.medicity.net", "direct"),
+    ("x@foo.nextgenshare.com", "direct"),
+    # Consumer / institutional / excluded / invalid
+    ("drsmith@gmail.com", "consumer"),
+    ("info@smallclinic.com", "institutional"),
+    ("x@somehospital.va.gov", "excluded"),
+    ("x@base.health.mil", "excluded"),
+    ("not-an-email", "invalid"),
+    ("", "invalid"),
+]
+
+
+def test_classify():
+    """Assert that every address in CASES lands in its expected stream."""
+    # Collect every mismatch first so one run reports all regressions at once.
+    outcomes = ((email, classify(email), want) for email, want in CASES)
+    failures = [row for row in outcomes if row[1] != row[2]]
+    # Message format: "<email> -> <got> (want <expected>)", joined by "; ".
+    assert not failures, "Misclassified: " + "; ".join(
+        f"{e} -> {g} (want {x})" for e, g, x in failures
+    )
+
+
+if __name__ == "__main__":
+    test_classify()
+    print(f"OK: all {len(CASES)} classifier cases pass")