# new-site/scripts/healthcare_email_streams.py

"""Healthcare email-stream segmentation.

Splits NPPES-endpoint emails into the three outbound streams used by the
dual-stream MTA design (see docs/healthcare-email-stream-plan.md):

  institutional  practice/clinic domains -> HEALTHCARE HOT stream (own IPs+cap)
  consumer       gmail/outlook/icloud... -> rides the TRUCKING consumer-discipline
                                            stream (low cap), NOT the hot one
  direct         DirectTrust / HISP      -> parked until DirectTrust signup
                                            (will not cold-deliver via SMTP)

Also drops a small set of non-prospect institutional giants (federal, big-box
pharmacy/retail) that are not our small-practice buyer and would only add
volume + complaint risk.

This is the single source of truth for the classification; both the list
builder and any campaign-import tooling import from here so the streams can
never drift.
"""

from __future__ import annotations

import re

# Loose shape check, not RFC 5322 validation: a non-empty local part, exactly
# one "@", and a domain part containing at least one "." with characters on
# both sides of it; no whitespace is allowed inside the address.
# NOTE: "$" also matches just before a single trailing "\n", so when used with
# .match() an address such as "a@b.co\n" is accepted.
EMAIL_RE = re.compile(r"^[^@\s]+@[^@\s]+\.[^@\s]+$")

# DirectTrust / HISP secure-messaging gateways. These route only inside the
# DirectTrust network and will NOT accept normal cold email. Verified against
# the May 2026 endpoint_pfile top domains (catches direct*.org,
# *.medicity.net (NextGen HISP), Surescripts, Updox, MaxMD, etc.).
#
# is_direct_secure() tests each entry as a substring of the domain, so the
# generic "direct" entry already covers every longer entry that contains it;
# those are redundant for matching but name the known gateways explicitly.
DIRECT_MARKERS: tuple[str, ...] = (
    "direct",
    "hisp",
    "medicity.net",
    "surescripts",
    "updox",
    "maxmd",
    "secureexchange",
    "directaddress",
    "directplus",
    "ehrdirect",
    "mayoclinicmsg",
    "allscriptsdirect",
    "eclinicaldirect",
    "phicure",
    "directtrust",
    "secure-health",
    "directnppes",
)

# Consumer webmail: real inboxes a clinician reads, but reputation-sensitive
# (Gmail/Microsoft cold-mail heuristics). These ride the trucking-discipline
# (low-cap) stream, never the hot institutional one.
#
# is_consumer() tests membership by exact domain match, so alias domains that
# deliver into the same provider's mailboxes must be listed explicitly;
# otherwise they fall through to the later checks in classify().
CONSUMER_WEBMAIL: frozenset[str] = frozenset({
    # Google
    "gmail.com", "googlemail.com",
    # Yahoo and AOL
    "yahoo.com", "ymail.com", "rocketmail.com", "aol.com",
    # Microsoft
    "hotmail.com", "outlook.com", "live.com", "msn.com",
    # Apple
    "icloud.com", "me.com", "mac.com",
    # ISP mailboxes
    "comcast.net", "att.net", "sbcglobal.net", "bellsouth.net", "verizon.net",
    # Other webmail
    "protonmail.com", "proton.me", "pm.me", "mail.com", "gmx.com",
})

# Institutional domains that are NOT our small-practice buyer: federal/military
# and big-box retail/pharmacy. Exclude from the hot stream (low yield, high
# complaint/volume risk). How a marker is applied to a domain is defined by
# is_institutional_excluded().
#
# The specific va.gov / mail.mil / health.mil entries are already covered by
# the broader .gov / .mil entries; they are listed for readability.
INSTITUTIONAL_EXCLUDE_MARKERS: tuple[str, ...] = (
    # Federal / military
    "va.gov",
    "mail.mil",
    "health.mil",
    ".mil",
    ".gov",
    # Big-box pharmacy / retail
    "cvshealth.com",
    "walgreens.com",
    "wal-mart.com",
    "walmart.com",
)


def domain_of(email: str | None) -> str:
    """Return the lower-cased domain part of *email*, or "" if there is none.

    The domain is everything after the last "@". Surrounding whitespace and a
    trailing root dot ("gmail.com." -> "gmail.com") are removed, so that
    exact-match lookups against bare domain names are not defeated by the
    fully-qualified spelling of the same domain.

    Args:
        email: Address to inspect. ``None``, empty strings and strings
            without "@" are tolerated.

    Returns:
        The normalized domain, or "" when *email* is ``None``, empty, or
        contains no "@".
    """
    if not email or "@" not in email:
        return ""
    _, _, host = email.rpartition("@")
    # strip() first so a dot followed by whitespace/newline is still removed.
    return host.strip().rstrip(".").lower()


def is_direct_secure(domain: str) -> bool:
    """Return True if *domain* contains any DIRECT_MARKERS entry.

    The comparison is a case-insensitive substring test, so a marker matches
    anywhere inside the domain, not only on label boundaries.
    """
    lowered = domain.lower()
    for marker in DIRECT_MARKERS:
        if marker in lowered:
            return True
    return False


def is_consumer(domain: str) -> bool:
    """Return True if *domain* is exactly one of the CONSUMER_WEBMAIL domains.

    The match is case-insensitive but exact: subdomains of a listed domain
    are not treated as consumer webmail.
    """
    normalized = domain.lower()
    return normalized in CONSUMER_WEBMAIL


def is_institutional_excluded(
    domain: str, markers: tuple[str, ...] | None = None
) -> bool:
    """Return True when *domain* belongs to a non-prospect institution.

    Each marker is matched case-insensitively as a run of whole DNS labels
    rather than as a raw substring: ".mil" matches "army.mil" and
    "mail.army.mil" but not "email.millerclinic.com", and "walmart.com"
    matches "pharmacy.walmart.com" but not "notwalmart.com". A raw substring
    test would flag such look-alike practice domains as excluded.

    Args:
        domain: Domain to test (any case; a trailing root dot is ignored).
        markers: Markers to test against. Defaults to
            INSTITUTIONAL_EXCLUDE_MARKERS. Leading and trailing dots on a
            marker are ignored, and empty markers never match.

    Returns:
        True if any marker occurs in *domain* on label boundaries.
    """
    if markers is None:
        markers = INSTITUTIONAL_EXCLUDE_MARKERS
    # Dot-pad both ends so every label, including the first and the last, is
    # delimited by "."; a substring test on the padded form then can only
    # succeed on label boundaries.
    padded = f".{domain.lower().strip('.')}."
    for marker in markers:
        labels = marker.lower().strip(".")
        if labels and f".{labels}." in padded:
            return True
    return False


def classify(email: str) -> str:
    """Classify *email* into its outbound stream.

    Returns one of 'direct', 'consumer', 'institutional', 'excluded' or
    'invalid'. An address that fails EMAIL_RE (including ``None`` or an empty
    string) is 'invalid'. Otherwise the domain is tested in a fixed order and
    the first hit wins: DirectTrust/HISP gateway, then consumer webmail, then
    excluded institution. A domain that matches none of them is
    'institutional'.
    """
    if EMAIL_RE.match(email or "") is None:
        return "invalid"

    host = domain_of(email)
    # Order matters: earlier predicates take precedence over later ones.
    ordered_checks = (
        (is_direct_secure, "direct"),
        (is_consumer, "consumer"),
        (is_institutional_excluded, "excluded"),
    )
    for predicate, label in ordered_checks:
        if predicate(host):
            return label
    return "institutional"