fix(hc deliverability): MX-based Google-host exclusion during warmup

Found via live mail.log: Google-Workspace-hosted PRACTICE domains (custom
domains whose MX is aspmx.l.google.com, e.g. moosepharmacy.com, hc2kidney.com)
were getting hard 550-5.7.1 rejects from Google's cold-IP bulk filter -- exactly
the bounces that wreck a warming IP's reputation. The original google/non-google
split classified by the email's domain STRING, which can't see that a custom
domain silently uses Google Workspace; only an MX lookup reveals it (33% of our
domains, 228/689, are Google-hosted this way).

- hc_data_refresh.py: new MX classification (one lookup per unique domain via
  dnspython; the result is stored so the cron never has to re-resolve) writes an
  mx_provider=google/other flag into the master and propagates it into the
  channel CSVs (auto-adding the column). Pass --skip-mx for a fast status-only
  run.
- build_healthcare_campaigns_cron.py: warm_segment now drops mx_provider=google
  rows during warmup (HC_SKIP_GOOGLE=1 default; set 0 once IPs are warm). This is
  defense-in-depth -- correct regardless of which CSV the cron is pointed at.

Verified: today's sends (nongoogle CSV) had 0 Google bounces; on the
Google-containing week1_verified cohort, the guard cuts the revalidation
candidates from 82 to 8.
This commit is contained in:
justin 2026-06-08 03:32:12 -05:00
parent feb677f6ce
commit 54b92b1f06
2 changed files with 73 additions and 4 deletions

View file

@ -84,6 +84,20 @@ def _overdue_days(r: dict):
return None
# During warmup, hold out Google-Workspace-hosted domains: Google rejects bulk
# mail from cold/warming IPs hard (550-5.7.1), and those bounces wreck the
# warming reputation. The mx_provider flag is set by the weekly hc_data_refresh
# (an MX lookup, since a custom domain can silently use Google Workspace). Set
# HC_SKIP_GOOGLE=0 to lift this once the IPs are warm.
_SKIP_GOOGLE_OFF_VALUES = ("0", "false", "no")


def _skip_google_from_env(raw: str) -> bool:
    """Interpret an HC_SKIP_GOOGLE value; only an explicit "off" token disables.

    The comparison ignores case and surrounding whitespace, so ``False`` or
    ``NO`` lift the guard exactly like ``0`` does. Every other value --
    including the empty string -- keeps the guard on, which is the safe side
    while the IPs are still warming.
    """
    return raw.strip().lower() not in _SKIP_GOOGLE_OFF_VALUES


# Evaluated once at import time; the default ("1") keeps the guard enabled.
SKIP_GOOGLE = _skip_google_from_env(os.getenv("HC_SKIP_GOOGLE", "1"))


def _is_google_hosted(r: dict) -> bool:
    """Return True when row *r* must be held out of warmup as Google-hosted.

    Always False when SKIP_GOOGLE is off. Otherwise the row's ``mx_provider``
    value (missing or None is treated as empty) is compared against
    ``"google"``, ignoring case and surrounding whitespace.
    """
    if not SKIP_GOOGLE:
        return False
    return (r.get("mx_provider") or "").strip().lower() == "google"
def warmup_day() -> int:
try:
start = int(open(WARMUP_STAMP).read().strip())
@ -265,6 +279,7 @@ def warm_segment(seg_key: str, rows: list[dict], slice_n: int,
candidates = [r for r in rows
if r.get("email", "").strip()
and r["email"].strip().lower() not in imported
and not _is_google_hosted(r)
and row_matches(seg_key, r)]
todo = candidates[:slice_n]
print(f"[hc-cron] {seg_key}: candidates={len(candidates)} "

View file

@ -44,7 +44,7 @@ UA = "PerformanceWest-HCRefresh/1.0 (compliance@performancewest.net)"
# campaign cron's column expectations never change).
HEADER = ["npi", "email", "stream", "name", "specialty", "state",
"reval_due_date", "days_overdue", "reval_status",
"leie_excluded", "optout_ending"]
"leie_excluded", "optout_ending", "mx_provider"]
def log(*a):
@ -59,6 +59,42 @@ def http_json(url: str, timeout: int = 30):
# ── Source pulls ────────────────────────────────────────────────────────────
# MX hostname suffixes that indicate the domain is hosted by Google Workspace.
# Google rejects bulk mail from cold/warming IPs hard (550-5.7.1), so these must
# be held out of the warmup -- and the only reliable signal is the MX record,
# since a custom domain (e.g. practice.com) can silently use Google Workspace.
# Hosts such as aspmx.l.google.com are subdomains of "google.com", so they need
# no entry of their own.
_GOOGLE_MX_SUFFIXES = ("google.com", "googlemail.com")


def _is_google_mx_host(host: str) -> bool:
    """Return True if *host* equals a Google suffix or is a subdomain of one.

    Matching happens on DNS label boundaries: "aspmx.l.google.com" matches
    "google.com", whereas "mx.notgoogle.com" does not. A plain string-suffix
    test would wrongly flag the latter and hold a deliverable domain out.
    """
    host = host.rstrip(".").lower()
    return any(host == s or host.endswith("." + s) for s in _GOOGLE_MX_SUFFIXES)


def classify_mx(domain: str) -> str:
    """Return 'google' if the domain's MX is Google-hosted, else 'other'.

    Best-effort: DNS failures classify as 'other' (we don't want a transient
    resolver error to permanently exclude a deliverable domain).

    The domain counts as Google-hosted when at least one of its MX exchange
    hosts matches ``_GOOGLE_MX_SUFFIXES`` on a label boundary.
    """
    try:
        # Imported lazily so the module loads without dnspython.
        # NOTE(review): a missing dnspython install is swallowed by the broad
        # except below as well, which classifies every domain as 'other' --
        # confirm that silent fallback is acceptable.
        import dns.resolver  # type: ignore
        answers = dns.resolver.resolve(domain, "MX", lifetime=5)
        # Exchange names are rendered with a trailing root dot; strip it.
        hosts = [str(r.exchange).rstrip(".").lower() for r in answers]
    except Exception:
        return "other"
    return "google" if any(_is_google_mx_host(h) for h in hosts) else "other"
def classify_domains_mx(emails: list[str]) -> dict[str, str]:
    """Map each unique email domain -> 'google'/'other' via one MX lookup each.

    Domains are deduplicated (lower-cased, whitespace-stripped) before any
    lookup, so every distinct domain is resolved exactly once per call. The
    result lets the daily campaign cron skip Google-hosted addresses during
    warmup without re-resolving.

    Entries that are empty/None, contain no "@", or have nothing after the "@"
    are ignored: they carry no domain to resolve.

    Returns a dict keyed by domain; progress is logged every 100 domains.
    """
    domains: set[str] = set()
    for e in emails:
        if not e or "@" not in e:
            continue
        # Split on the FIRST "@" so the keys line up with how the caller in
        # this file derives the domain when it looks a row up in the result.
        d = e.split("@", 1)[1].strip().lower()
        if d:
            domains.add(d)

    out: dict[str, str] = {}
    total = len(domains)
    # Sorted for a deterministic lookup order (and stable progress logs).
    for i, d in enumerate(sorted(domains), 1):
        out[d] = classify_mx(d)
        if i % 100 == 0:
            log(f"  mx: classified {i}/{total} domains")
    return out
def sam_key() -> str | None:
t = os.getenv("SAM_GOV_API_KEY")
if t:
@ -206,6 +242,7 @@ def main() -> int:
help="crawl first N SAM exclusion pages for an NPI cross-flag (slow; default off)")
ap.add_argument("--skip-cms", action="store_true")
ap.add_argument("--skip-oig", action="store_true")
ap.add_argument("--skip-mx", action="store_true", help="skip MX (Google-host) classification")
args = ap.parse_args()
if not os.path.exists(args.master):
@ -239,9 +276,21 @@ def main() -> int:
excluded = leie | sam
today = datetime.date.today()
# MX classification (Google Workspace vs other) for the warmup deliverability
# guard. Done once per unique domain. Skippable for a fast status-only run.
mx_map = {}
if not args.skip_mx:
all_emails = [r.get("email", "") for r in rows]
mx_map = classify_domains_mx(all_emails)
n_google = sum(1 for v in mx_map.values() if v == "google")
log(f"mx: {len(mx_map)} domains classified; {n_google} Google-hosted")
refreshed = []
for r in rows:
npi = r["npi"].strip()
if mx_map:
dom = r.get("email", "").split("@", 1)[-1].strip().lower()
r["mx_provider"] = mx_map.get(dom, "other")
if not npi:
# No NPI to re-check; leave the row's existing status untouched.
refreshed.append(r)
@ -287,7 +336,7 @@ def main() -> int:
# optout_ending, which only the original list builder computes -- including
# it here would blank it and starve the compliance_bundle segment).
REFRESHED_FIELDS = ["reval_due_date", "days_overdue", "reval_status",
"leie_excluded", "name", "specialty", "state"]
"leie_excluded", "mx_provider", "name", "specialty", "state"]
by_email = {r["email"].strip().lower(): r for r in refreshed if r.get("email")}
channel_csvs = [os.path.join(args.out_dir, f) for f in
("hc_warmup_nongoogle.csv", "hc_warmup_google.csv",
@ -297,15 +346,20 @@ def main() -> int:
continue
with open(path, newline="") as f:
rdr = csv.DictReader(f)
cols = rdr.fieldnames or []
cols = list(rdr.fieldnames or [])
rows_ch = list(rdr)
# Add any refreshed field the channel CSV doesn't have yet (e.g. a newly
# introduced mx_provider column) so the cron can read it.
for fld in REFRESHED_FIELDS:
if fld not in cols:
cols.append(fld)
updated = 0
for r in rows_ch:
m = by_email.get(r.get("email", "").strip().lower())
if not m:
continue
for fld in REFRESHED_FIELDS:
if fld in cols and fld in m:
if fld in m:
r[fld] = m[fld]
updated += 1
write_atomic(path, rows_ch, cols)