fix(npi): two-tier Direct/HISP classifier so real Direct-Primary-Care/counseling practices stay institutional (was wrongly parked); add classifier unit tests
This commit is contained in:
parent
c3b2c4e89a
commit
68333148e6
2 changed files with 98 additions and 4 deletions
|
|
@ -28,13 +28,34 @@ EMAIL_RE = re.compile(r"^[^@\s]+@[^@\s]+\.[^@\s]+$")
|
|||
# DirectTrust network and will NOT accept normal cold email. Substring match on
|
||||
# the domain. Verified against the May 2026 endpoint_pfile top domains (catches
|
||||
# direct*.org, *.medicity.net (NextGen HISP), Surescripts, Updox, MaxMD, etc.).
|
||||
DIRECT_MARKERS: tuple[str, ...] = (
|
||||
"direct", "hisp", "medicity.net", "surescripts", "updox", "maxmd",
|
||||
#
|
||||
# Two tiers, because a naive substring "direct" wrongly parks real practices
|
||||
# whose registrable domain merely CONTAINS the word (e.g. "Direct Primary Care"
|
||||
# clinics like arthurdirectcare.com, valleydirectprimarycare.com, or counseling
|
||||
# practices like newdirectionscounselingservices.com). HISP gateways instead
|
||||
# put "direct"/"hisp" as a full DNS LABEL (direct.foo.org, foo.directbygreenway.com).
|
||||
#
|
||||
# DIRECT_VENDOR_MARKERS unambiguous vendor/HISP tokens -> plain substring.
|
||||
# DIRECT_LABEL_WORDS generic words -> only match as a whole DNS label or
# as a hyphen-delimited leading token of one (direct-ci);
# fused forms such as "directhisp" are caught by the vendor tier.
|
||||
# Tier 1: vendor/HISP tokens treated as unambiguous. Any domain containing one
# of these as a raw substring is classified as a Direct/HISP endpoint.
#
# NOTE(review): a few of the shorter entries are still plain substrings of
# ordinary words -- e.g. "gadirect" occurs inside "omegadirectcare.com" and
# "kydirect" inside "skydirectcare.com". Confirm against real data whether
# these should move to a label-anchored check as well.
DIRECT_VENDOR_MARKERS: tuple[str, ...] = (
    "medicity.net", "surescripts", "updox", "maxmd",
    "secureexchange", "directaddress", "directplus", "ehrdirect",
    "mayoclinicmsg", "allscriptsdirect", "eclinicaldirect", "phicure",
    "directtrust", "secure-health", "directnppes", "nextgenshare",
    "cernerdirect", "directbygreenway", "directathenahealth",
    "epichosted", "medalliesdirect", "directak.net", "gadirect", "kydirect",
    "directmedgenehr", "emadirect", "compulinkdirect", "elationemr",
    "directwellstar", "directhisp", "gwaydirect", "directmail",
    "direct-ci", "direct-ehr", "direct-srhs",
)

# Tier 2: generic words that mark a Direct/HISP endpoint ONLY when they form a
# whole DNS label ("direct.foo.org") or a hyphen-delimited leading token of one
# ("direct-xyz.foo.org"). Checked label-by-label, not as a raw substring, so
# "arthurdirectcare.com" / "newdirection*.com" stay institutional.
DIRECT_LABEL_WORDS: tuple[str, ...] = ("direct", "hisp")

# Back-compat alias for code that still refers to the old single-tier name.
DIRECT_MARKERS = DIRECT_VENDOR_MARKERS
|
||||
|
||||
# Consumer webmail: real inboxes a clinician reads, but reputation-sensitive
|
||||
# (Gmail/Microsoft cold-mail heuristics). These ride the trucking-discipline
|
||||
# (low-cap) stream, never the hot institutional one.
|
||||
|
|
@ -62,7 +83,19 @@ def domain_of(email: str) -> str:
|
|||
|
||||
def is_direct_secure(domain: str) -> bool:
    """Return True when *domain* looks like a Direct/HISP secure-messaging endpoint.

    Matching is case-insensitive and runs in two tiers:

    1. Any entry of ``DIRECT_VENDOR_MARKERS`` appearing anywhere in the domain.
    2. Any entry of ``DIRECT_LABEL_WORDS`` appearing as a whole dot-separated
       label, or as the start of a label immediately followed by a hyphen.

    A generic word that is merely embedded in a longer label (for example
    "arthurdirectcare.com" or "newdirectionscounseling.com") does not match
    tier 2.
    """
    d = domain.lower()

    # Tier 1: unambiguous vendor/HISP tokens anywhere in the domain.
    if any(marker in d for marker in DIRECT_VENDOR_MARKERS):
        return True

    # Tier 2: generic words count only as a full label ("direct.foo.org") or
    # as a "word-" prefix of a label ("direct-xyz.foo.org").
    hyphen_prefixes = tuple(word + "-" for word in DIRECT_LABEL_WORDS)
    return any(
        label in DIRECT_LABEL_WORDS or label.startswith(hyphen_prefixes)
        for label in d.split(".")
    )
|
||||
|
||||
|
||||
def is_consumer(domain: str) -> bool:
|
||||
|
|
|
|||
61
scripts/test_healthcare_email_streams.py
Normal file
61
scripts/test_healthcare_email_streams.py
Normal file
|
|
@ -0,0 +1,61 @@
|
|||
"""Unit tests for the healthcare email-stream classifier.
|
||||
|
||||
Run: python3 -m scripts.test_healthcare_email_streams (or pytest)
|
||||
|
||||
Guards the subtle case that motivated the two-tier DIRECT detection: a naive
|
||||
substring "direct" match wrongly parked real "Direct Primary Care" / counseling
|
||||
practices (registrable domain merely contains the word) into the undeliverable
|
||||
HISP pile. Direct/HISP gateways instead use "direct"/"hisp" as a whole DNS label.
|
||||
"""
|
||||
|
||||
from scripts.healthcare_email_streams import classify
|
||||
|
||||
# (email, expected stream) pairs fed to classify() by test_classify().
CASES = [
    # Real Direct-Primary-Care / counseling practices -> institutional
    ("chelsea@arthurdirectcare.com", "institutional"),
    ("bassi@valleydirectprimarycare.com", "institutional"),
    ("megan@newdirectionscounselingservices.com", "institutional"),
    ("john@islanddirectprimarycare.com", "institutional"),
    ("kamlesh@mydirectcare.com", "institutional"),
    ("allison@truedirectioncounseling.com", "institutional"),
    ("marty@holtondirectcare.com", "institutional"),
    ("sbass@newdirectionsnonemergency.org", "institutional"),
    ("info@consumerdirectcare.com", "institutional"),
    ("x@rehabdirectives.com", "institutional"),
    # Genuine Direct/HISP gateways -> direct (parked)
    ("x@direct.novanthealth.org", "direct"),
    ("x@CarolinasHealthcareSystem.direct-ci.com", "direct"),
    ("x@cfp.directbygreenway.com", "direct"),
    ("x@foo.4693.direct.athenahealth.com", "direct"),
    ("x@directHISP.wakemed.org", "direct"),
    ("x@boss.directak.net", "direct"),
    ("x@hisp.bryanhealth.org", "direct"),
    ("x@ehrdirect.mayoclinicmsg.org", "direct"),
    ("x@directaddress.net", "direct"),
    ("x@negaidx.allscriptsdirect.net", "direct"),
    # Written out literally; same value the old .replace() expression produced.
    ("x@lickingmemorial.medicity.net", "direct"),
    ("x@foo.nextgenshare.com", "direct"),
    # Consumer / institutional / excluded / invalid
    ("drsmith@gmail.com", "consumer"),
    ("info@smallclinic.com", "institutional"),
    ("x@somehospital.va.gov", "excluded"),
    ("x@base.health.mil", "excluded"),
    ("not-an-email", "invalid"),
    ("", "invalid"),
]
|
||||
|
||||
|
||||
def test_classify():
    """Run every CASES entry through classify() and report all mismatches at once."""
    # Collect every mismatch rather than stopping at the first, so a single
    # run shows the full list of misclassified addresses.
    mismatches = [
        (address, actual, wanted)
        for address, wanted in CASES
        if (actual := classify(address)) != wanted
    ]
    assert not mismatches, "Misclassified: " + "; ".join(
        f"{e} -> {g} (want {x})" for e, g, x in mismatches
    )
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point for running without pytest: test_classify() raises
    # AssertionError on any mismatch, so the line below prints only on success.
    test_classify()
    print(f"OK: all {len(CASES)} classifier cases pass")
|
||||
Loading…
Add table
Add a link
Reference in a new issue