121 lines
5.2 KiB
Python
121 lines
5.2 KiB
Python
"""Healthcare email-stream segmentation.
|
|
|
|
Splits NPPES-endpoint emails into the three outbound streams used by the
dual-stream MTA design (see docs/healthcare-email-stream-plan.md):

  institutional  practice/clinic domains   -> HEALTHCARE HOT stream (own IPs+cap)
  consumer       gmail/outlook/icloud...   -> rides the TRUCKING consumer-discipline
                                              stream (low cap), NOT the hot one
  direct         DirectTrust / HISP        -> parked until DirectTrust signup
                                              (will not cold-deliver via SMTP)

Also drops a small set of non-prospect institutional giants (federal, big-box
pharmacy/retail) that are not our small-practice buyer and would only add
volume + complaint risk.

This is the single source of truth for the classification; both the list
builder and any campaign-import tooling import from here so the streams can
never drift.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import re
|
|
|
|
# Loose syntactic gate only: exactly one "@", no whitespace anywhere, and at
# least one dot with characters on both sides in the part after the "@".
EMAIL_RE: re.Pattern[str] = re.compile(r"^[^@\s]+@[^@\s]+\.[^@\s]+$")
|
|
|
|
# DirectTrust / HISP secure-messaging gateways. They route only inside the
# DirectTrust network and will NOT accept normal cold email. Verified against
# the May 2026 endpoint_pfile top domains (catches direct*.org, *.medicity.net
# (NextGen HISP), Surescripts, Updox, MaxMD, etc.).
#
# Detection is split into two tiers because a naive substring test for
# "direct" wrongly parks real practices whose registrable domain merely
# CONTAINS the word -- "Direct Primary Care" clinics such as
# arthurdirectcare.com or valleydirectprimarycare.com, and counseling
# practices such as newdirectionscounselingservices.com. HISP gateways instead
# carry "direct"/"hisp" as a full DNS LABEL (direct.foo.org).
#
#   DIRECT_VENDOR_MARKERS  vendor/HISP tokens         -> plain substring test.
#   DIRECT_LABEL_WORDS     generic words              -> whole DNS label, or the
#                          leading hyphen-delimited token of a label (direct-ci).
#
# NOTE(review): a few short tokens below can also occur inside unrelated names
# (e.g. "omegadirectcare.com" contains "gadirect") -- confirm that is acceptable.
DIRECT_VENDOR_MARKERS: tuple[str, ...] = (
    "medicity.net",
    "surescripts",
    "updox",
    "maxmd",
    "secureexchange",
    "directaddress",
    "directplus",
    "ehrdirect",
    "mayoclinicmsg",
    "allscriptsdirect",
    "eclinicaldirect",
    "phicure",
    "directtrust",
    "secure-health",
    "directnppes",
    "nextgenshare",
    "cernerdirect",
    "directbygreenway",
    "directathenahealth",
    "epichosted",
    "medalliesdirect",
    "directak.net",
    "gadirect",
    "kydirect",
    "directmedgenehr",
    "emadirect",
    "compulinkdirect",
    "elationemr",
    "directwellstar",
    "directhisp",
    "gwaydirect",
    "directmail",
    "direct-ci",
    "direct-ehr",
    "direct-srhs",
)
|
|
|
|
# Generic tokens that identify a Direct/HISP endpoint ONLY when they stand as
# an entire DNS label, or as the leading hyphen-delimited token of one. They
# are compared label by label, never as a raw substring, which keeps domains
# such as "arthurdirectcare.com" / "newdirection*.com" institutional.
DIRECT_LABEL_WORDS: tuple[str, ...] = (
    "direct",
    "hisp",
)

# Backward-compatible alias for the vendor-marker tuple.
DIRECT_MARKERS = DIRECT_VENDOR_MARKERS
|
|
|
|
# Consumer webmail providers: genuine inboxes that a clinician reads, but
# reputation-sensitive (Gmail/Microsoft cold-mail heuristics). Addresses on
# these domains ride the trucking-discipline (low-cap) stream, never the hot
# institutional one. is_consumer() tests exact membership in this set.
CONSUMER_WEBMAIL: frozenset[str] = frozenset(
    (
        "gmail.com",
        "yahoo.com",
        "hotmail.com",
        "outlook.com",
        "aol.com",
        "icloud.com",
        "comcast.net",
        "att.net",
        "sbcglobal.net",
        "me.com",
        "ymail.com",
        "live.com",
        "msn.com",
        "protonmail.com",
        "verizon.net",
        "mail.com",
        "gmx.com",
    )
)
|
|
|
|
# Institutional domains that are NOT our small-practice buyer: federal/military
# and big-box retail/pharmacy. They are kept out of the hot stream (low yield,
# high complaint/volume risk). is_institutional_excluded() tests a domain
# against every marker listed here.
INSTITUTIONAL_EXCLUDE_MARKERS: tuple[str, ...] = (
    # Federal / military
    "va.gov",
    "mail.mil",
    "health.mil",
    ".mil",
    ".gov",
    # Big-box pharmacy / retail
    "cvshealth.com",
    "walgreens.com",
    "wal-mart.com",
    "walmart.com",
)
|
|
|
|
|
|
def domain_of(email: str) -> str:
    """Return the normalized domain part of *email*, or "" when there is none.

    The domain is everything after the last "@", with surrounding whitespace
    and any trailing root dot(s) removed, lower-cased.

    Stripping the trailing dot matters for classification: "gmail.com." and
    "gmail.com" name the same host, but only the dot-less form can be found
    by an exact-match lookup against a set of known domains.

    Args:
        email: Raw address string. Falsy values (None, "") are tolerated.

    Returns:
        The lower-cased domain, or "" if *email* is falsy or contains no "@".
    """
    if not email or "@" not in email:
        return ""
    # rpartition splits on the LAST "@", so "a@b@c.org" yields "c.org".
    _, _, host = email.rpartition("@")
    return host.strip().rstrip(".").lower()
|
|
|
|
|
|
def is_direct_secure(domain: str) -> bool:
    """Return True when *domain* looks like a DirectTrust / HISP gateway.

    Two tiers are checked against the lower-cased domain:

    1. Any DIRECT_VENDOR_MARKERS entry appearing anywhere in the domain.
    2. Any dot-separated label that equals a DIRECT_LABEL_WORDS entry, or that
       starts with such an entry followed by a hyphen (e.g. "direct.foo.org",
       "direct-xyz.foo.org").

    A generic word merely embedded in a longer label -- the "direct" in
    "arthurdirectcare.com" or "newdirectionscounseling.com" -- does not
    satisfy tier 2.
    """
    lowered = domain.lower()

    # Tier 1: vendor/HISP tokens, plain substring test.
    for marker in DIRECT_VENDOR_MARKERS:
        if marker in lowered:
            return True

    # Tier 2: generic words, whole label or "<word>-" label prefix only.
    hyphen_prefixes = tuple(word + "-" for word in DIRECT_LABEL_WORDS)
    return any(
        label in DIRECT_LABEL_WORDS or label.startswith(hyphen_prefixes)
        for label in lowered.split(".")
    )
|
|
|
|
|
|
def is_consumer(domain: str) -> bool:
    """Return True when the lower-cased *domain* is exactly a CONSUMER_WEBMAIL entry."""
    normalized = domain.lower()
    return normalized in CONSUMER_WEBMAIL
|
|
|
|
|
|
def is_institutional_excluded(domain: str) -> bool:
    """Return True when *domain* belongs to a non-prospect institution.

    Each INSTITUTIONAL_EXCLUDE_MARKERS entry is matched as a run of whole DNS
    labels rather than as a raw substring. A raw substring test would let
    ".mil" exclude "mail.milfordclinic.com", ".gov" exclude
    "x.governorsclinic.com", and "walmart.com" exclude "notwalmart.com" --
    real practices that merely contain the marker text. With label-boundary
    matching, ".mil" still catches "us.af.mil", "va.gov" still catches
    "mail.va.gov", and "walmart.com" still catches "rx.walmart.com".

    Args:
        domain: Domain part of an address; case and a trailing dot are ignored.

    Returns:
        True if any marker matches on label boundaries, else False.
    """
    # Pad with dots so every label -- including the first and last -- is
    # delimited on both sides; a marker then only matches whole labels.
    padded = "." + domain.lower().strip(".") + "."
    for marker in INSTITUTIONAL_EXCLUDE_MARKERS:
        core = marker.strip(".")
        # An all-dots/empty marker would match everything; ignore it.
        if core and "." + core + "." in padded:
            return True
    return False
|
|
|
|
|
|
def classify(email: str) -> str:
    """Return one of: 'direct', 'consumer', 'institutional', 'excluded', 'invalid'.

    An address that fails EMAIL_RE (including None or "") is 'invalid'.
    Otherwise its domain is tested against the rules below in order and the
    first match wins; a domain matching none of them is 'institutional'.
    """
    if EMAIL_RE.match(email or "") is None:
        return "invalid"

    dom = domain_of(email)

    # Order matters: Direct/HISP gateways take precedence over consumer
    # webmail, which takes precedence over the institutional exclusions.
    rules = (
        (is_direct_secure, "direct"),
        (is_consumer, "consumer"),
        (is_institutional_excluded, "excluded"),
    )
    for predicate, stream in rules:
        if predicate(dom):
            return stream
    return "institutional"
|