"""Healthcare email-stream segmentation. Splits NPPES-endpoint emails into the three outbound streams used by the dual-stream MTA design (see docs/healthcare-email-stream-plan.md): institutional practice/clinic domains -> HEALTHCARE HOT stream (own IPs+cap) consumer gmail/outlook/icloud... -> rides the TRUCKING consumer-discipline stream (low cap), NOT the hot one direct DirectTrust / HISP -> parked until DirectTrust signup (will not cold-deliver via SMTP) Also drops a small set of non-prospect institutional giants (federal, big-box pharmacy/retail) that are not our small-practice buyer and would only add volume + complaint risk. This is the single source of truth for the classification; both the list builder and any campaign-import tooling import from here so the streams can never drift. """ from __future__ import annotations import re EMAIL_RE = re.compile(r"^[^@\s]+@[^@\s]+\.[^@\s]+$") # DirectTrust / HISP secure-messaging gateways. These route only inside the # DirectTrust network and will NOT accept normal cold email. Substring match on # the domain. Verified against the May 2026 endpoint_pfile top domains (catches # direct*.org, *.medicity.net (NextGen HISP), Surescripts, Updox, MaxMD, etc.). # # Two tiers, because a naive substring "direct" wrongly parks real practices # whose registrable domain merely CONTAINS the word (e.g. "Direct Primary Care" # clinics like arthurdirectcare.com, valleydirectprimarycare.com, or counseling # practices like newdirectionscounselingservices.com). HISP gateways instead # put "direct"/"hisp" as a full DNS LABEL (direct.foo.org, foo.directbygreenway.com). # # DIRECT_VENDOR_MARKERS unambiguous vendor/HISP tokens -> plain substring. # DIRECT_LABEL_WORDS generic words -> only match as a whole DNS label or # as the leading token of a label (direct-ci, directhisp). DIRECT_VENDOR_MARKERS: tuple[str, ...] = ( "medicity.net", "surescripts", "updox", "maxmd", "secureexchange", "directaddress", "directplus", "ehrdirect", "mayoclinicmsg", "allscriptsdirect", "eclinicaldirect", "phicure", "directtrust", "secure-health", "directnppes", "nextgenshare", "cernerdirect", "directbygreenway", "directathenahealth", "epichosted", "medalliesdirect", "directak.net", "gadirect", "kydirect", "directmedgenehr", "emadirect", "compulinkdirect", "elationemr", "directwellstar", "directhisp", "gwaydirect", "directmail", "direct-ci", "direct-ehr", "direct-srhs", ) # Generic words that mark a Direct/HISP endpoint ONLY when they form a whole DNS # label (or the leading token of one). Checked label-by-label, not as a raw # substring, so "arthurdirectcare.com" / "newdirection*.com" stay institutional. DIRECT_LABEL_WORDS: tuple[str, ...] = ("direct", "hisp") DIRECT_MARKERS = DIRECT_VENDOR_MARKERS # back-compat alias # Consumer webmail: real inboxes a clinician reads, but reputation-sensitive # (Gmail/Microsoft cold-mail heuristics). These ride the trucking-discipline # (low-cap) stream, never the hot institutional one. CONSUMER_WEBMAIL: frozenset[str] = frozenset({ "gmail.com", "yahoo.com", "hotmail.com", "outlook.com", "aol.com", "icloud.com", "comcast.net", "att.net", "sbcglobal.net", "me.com", "ymail.com", "live.com", "msn.com", "protonmail.com", "verizon.net", "mail.com", "gmx.com", }) # Institutional domains that are NOT our small-practice buyer: federal/military # and big-box retail/pharmacy. Exclude from the hot stream (low yield, high # complaint/volume risk). Substring match. INSTITUTIONAL_EXCLUDE_MARKERS: tuple[str, ...] = ( "va.gov", "mail.mil", "health.mil", ".mil", ".gov", "cvshealth.com", "walgreens.com", "wal-mart.com", "walmart.com", ) def domain_of(email: str) -> str: if "@" not in email: return "" return email.rsplit("@", 1)[-1].strip().lower() def is_direct_secure(domain: str) -> bool: d = domain.lower() # Tier 1: unambiguous vendor/HISP tokens anywhere in the domain. if any(m in d for m in DIRECT_VENDOR_MARKERS): return True # Tier 2: a generic word ("direct"/"hisp") only counts when it is a whole # DNS label or the leading token of one (e.g. "direct.foo.org", # "foo.directplus.bar.com"), NOT when embedded in the registrable domain # such as "arthurdirectcare.com" or "newdirectionscounseling.com". labels = d.split(".") for lab in labels: for w in DIRECT_LABEL_WORDS: if lab == w or lab.startswith(w + "-"): return True return False def is_consumer(domain: str) -> bool: return domain.lower() in CONSUMER_WEBMAIL def is_institutional_excluded(domain: str) -> bool: d = domain.lower() return any(m in d for m in INSTITUTIONAL_EXCLUDE_MARKERS) def classify(email: str) -> str: """Return one of: 'direct', 'consumer', 'institutional', 'excluded', 'invalid'.""" if not EMAIL_RE.match(email or ""): return "invalid" dom = domain_of(email) if is_direct_secure(dom): return "direct" if is_consumer(dom): return "consumer" if is_institutional_excluded(dom): return "excluded" return "institutional"