fix(npi): two-tier Direct/HISP classifier so real Direct-Primary-Care/counseling practices stay institutional (was wrongly parked); add classifier unit tests

This commit is contained in:
justin 2026-06-06 00:09:42 -05:00
parent c3b2c4e89a
commit 68333148e6
2 changed files with 98 additions and 4 deletions

View file

@ -28,13 +28,34 @@ EMAIL_RE = re.compile(r"^[^@\s]+@[^@\s]+\.[^@\s]+$")
# DirectTrust network and will NOT accept normal cold email. Substring match on
# the domain. Verified against the May 2026 endpoint_pfile top domains (catches
# direct*.org, *.medicity.net (NextGen HISP), Surescripts, Updox, MaxMD, etc.).
#
# Two tiers, because a naive substring "direct" wrongly parks real practices
# whose registrable domain merely CONTAINS the word (e.g. "Direct Primary Care"
# clinics like arthurdirectcare.com, valleydirectprimarycare.com, or counseling
# practices like newdirectionscounselingservices.com). HISP gateways instead
# either put "direct"/"hisp" in a DNS label of its own (direct.foo.org) or sit
# under a recognisable vendor domain (foo.directbygreenway.com).
#
#   DIRECT_VENDOR_MARKERS  unambiguous vendor/HISP tokens -> plain substring
#                          match anywhere in the domain.
#   DIRECT_LABEL_WORDS     generic words -> only match as a whole DNS label or
#                          as the leading "<word>-" token of a label
#                          (e.g. "direct-foo.example.org").
DIRECT_VENDOR_MARKERS: tuple[str, ...] = (
    "medicity.net", "surescripts", "updox", "maxmd",
    "secureexchange", "directaddress", "directplus", "ehrdirect",
    "mayoclinicmsg", "allscriptsdirect", "eclinicaldirect", "phicure",
    "directtrust", "secure-health", "directnppes", "nextgenshare",
    "cernerdirect", "directbygreenway", "directathenahealth",
    "epichosted", "medalliesdirect", "directak.net", "gadirect", "kydirect",
    "directmedgenehr", "emadirect", "compulinkdirect", "elationemr",
    "directwellstar", "directhisp", "gwaydirect", "directmail",
    "direct-ci", "direct-ehr", "direct-srhs",
)
# Generic words that mark a Direct/HISP endpoint ONLY when they form a whole DNS
# label (or the leading "<word>-" token of one). Checked label-by-label, not as
# a raw substring, so "arthurdirectcare.com" / "newdirection*.com" stay
# institutional.
DIRECT_LABEL_WORDS: tuple[str, ...] = ("direct", "hisp")
# Back-compat alias for code that still imports DIRECT_MARKERS. NOTE: unlike the
# old tuple it no longer contains the bare "direct"/"hisp" entries, so a raw
# substring scan over it is narrower than before; use is_direct_secure() for
# the full two-tier classification.
DIRECT_MARKERS = DIRECT_VENDOR_MARKERS
# Consumer webmail: real inboxes a clinician reads, but reputation-sensitive
# (Gmail/Microsoft cold-mail heuristics). These ride the trucking-discipline
# (low-cap) stream, never the hot institutional one.
@ -62,7 +83,19 @@ def domain_of(email: str) -> str:
def is_direct_secure(domain: str) -> bool:
    """Return True when *domain* looks like a Direct/HISP secure-messaging endpoint.

    Matching is case-insensitive and runs in two tiers:

    1. Any entry of ``DIRECT_VENDOR_MARKERS`` occurring anywhere in the domain
       (plain substring), e.g. ``foo.directplus.bar.com``.
    2. Any entry of ``DIRECT_LABEL_WORDS`` that is a whole dot-separated label
       or the leading ``"<word>-"`` token of one, e.g. ``direct.foo.org`` or
       ``direct-x.foo.org``.

    A generic word merely embedded in a label ("arthurdirectcare.com",
    "newdirectionscounseling.com") does not match tier 2.
    """
    d = domain.lower()
    # Tier 1: unambiguous vendor/HISP tokens anywhere in the domain.
    if any(marker in d for marker in DIRECT_VENDOR_MARKERS):
        return True
    # Tier 2: generic words count only as a full label or a "<word>-" prefix of
    # a label; str.startswith accepts a tuple, so one call covers every word.
    hyphen_prefixes = tuple(word + "-" for word in DIRECT_LABEL_WORDS)
    return any(
        label in DIRECT_LABEL_WORDS or label.startswith(hyphen_prefixes)
        for label in d.split(".")
    )
def is_consumer(domain: str) -> bool: