fix(npi): two-tier Direct/HISP classifier so real Direct-Primary-Care/counseling practices stay institutional (was wrongly parked); add classifier unit tests
This commit is contained in:
parent
c3b2c4e89a
commit
68333148e6
2 changed files with 98 additions and 4 deletions
|
|
@ -28,13 +28,34 @@ EMAIL_RE = re.compile(r"^[^@\s]+@[^@\s]+\.[^@\s]+$")
|
|||
# DirectTrust network and will NOT accept normal cold email. Substring match on
|
||||
# the domain. Verified against the May 2026 endpoint_pfile top domains (catches
|
||||
# direct*.org, *.medicity.net (NextGen HISP), Surescripts, Updox, MaxMD, etc.).
|
||||
DIRECT_MARKERS: tuple[str, ...] = (
|
||||
"direct", "hisp", "medicity.net", "surescripts", "updox", "maxmd",
|
||||
#
|
||||
# Two tiers, because a naive substring "direct" wrongly parks real practices
|
||||
# whose registrable domain merely CONTAINS the word (e.g. "Direct Primary Care"
|
||||
# clinics like arthurdirectcare.com, valleydirectprimarycare.com, or counseling
|
||||
# practices like newdirectionscounselingservices.com). HISP gateways instead
|
||||
# put "direct"/"hisp" as a full DNS LABEL (direct.foo.org, foo.directbygreenway.com).
|
||||
#
|
||||
# DIRECT_VENDOR_MARKERS unambiguous vendor/HISP tokens -> plain substring.
|
||||
# DIRECT_LABEL_WORDS generic words -> only match as a whole DNS label or
|
||||
# as the hyphen-delimited leading token of a label (e.g. direct-foo); un-hyphenated vendor forms such as directhisp are covered by DIRECT_VENDOR_MARKERS.
|
||||
# Unambiguous vendor/HISP tokens: matched as a plain substring anywhere in the
# domain. Each marker is listed exactly once.
DIRECT_VENDOR_MARKERS: tuple[str, ...] = (
    "medicity.net", "surescripts", "updox", "maxmd",
    "secureexchange", "directaddress", "directplus", "ehrdirect",
    "mayoclinicmsg", "allscriptsdirect", "eclinicaldirect", "phicure",
    "directtrust", "secure-health", "directnppes", "nextgenshare",
    "cernerdirect", "directbygreenway", "directathenahealth",
    "epichosted", "medalliesdirect", "directak.net", "gadirect", "kydirect",
    "directmedgenehr", "emadirect", "compulinkdirect", "elationemr",
    "directwellstar", "directhisp", "gwaydirect", "directmail",
    "direct-ci", "direct-ehr", "direct-srhs",
)

# Generic words that mark a Direct/HISP endpoint ONLY when they form a whole DNS
# label (or the hyphen-delimited leading token of one). Checked label-by-label,
# not as a raw substring, so "arthurdirectcare.com" / "newdirection*.com" stay
# institutional.
DIRECT_LABEL_WORDS: tuple[str, ...] = ("direct", "hisp")

# Back-compat alias: the old single-tier name now refers to the vendor tier only.
DIRECT_MARKERS = DIRECT_VENDOR_MARKERS
|
||||
|
||||
# Consumer webmail: real inboxes a clinician reads, but reputation-sensitive
|
||||
# (Gmail/Microsoft cold-mail heuristics). These ride the trucking-discipline
|
||||
# (low-cap) stream, never the hot institutional one.
|
||||
|
|
@ -62,7 +83,19 @@ def domain_of(email: str) -> str:
|
|||
|
||||
def is_direct_secure(domain: str) -> bool:
    """Return True when *domain* matches the Direct/HISP markers.

    The domain is lower-cased and then checked in two tiers:

    1. Vendor tier: any entry of ``DIRECT_VENDOR_MARKERS`` occurring as a
       substring anywhere in the domain.
    2. Label tier: any word in ``DIRECT_LABEL_WORDS`` that is an entire
       dot-separated label, or that starts a label and is immediately
       followed by a hyphen (e.g. ``direct.foo.org``, ``direct-east.foo.org``).

    A generic word merely embedded inside a longer label (for example
    ``arthurdirectcare.com`` or ``newdirectionscounseling.com``) does not
    match the label tier.
    """
    d = domain.lower()

    # Tier 1: unambiguous vendor/HISP tokens anywhere in the domain.
    if any(marker in d for marker in DIRECT_VENDOR_MARKERS):
        return True

    # Tier 2: generic words count only as a whole DNS label or as the
    # hyphen-delimited leading token of one, never as an embedded substring.
    return any(
        label == word or label.startswith(word + "-")
        for label in d.split(".")
        for word in DIRECT_LABEL_WORDS
    )
|
||||
|
||||
|
||||
def is_consumer(domain: str) -> bool:
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue