diff --git a/scripts/build_trucking_campaigns.py b/scripts/build_trucking_campaigns.py index e1ec7e3..9084a34 100644 --- a/scripts/build_trucking_campaigns.py +++ b/scripts/build_trucking_campaigns.py @@ -989,6 +989,16 @@ def select_sendable_carriers( caps = mx_daily_caps(main_warmup_day()) per_op: dict = {} default_cap = caps.get("__default__", 50) + # Untagged (NULL mx_provider) safety cap. We can't exclude NULLs (the big-MX + # exclusion is MX-based, so an untagged Google/Yahoo domain would slip through), + # but we also shouldn't let a flood of freshly-imported, never-resolved domains + # dominate a run -- some are big/consumer operators we'd otherwise hold out. + # The pw-mx-tag cron drains the *sendable* untagged backlog fast (only ~3k + # distinct verified domains as of 2026-06-20, < one 20k/day run), so this is a + # between-runs safety net, not the primary gate. Generous enough to never starve + # the pool in normal operation. Tunable via MAIN_UNTAGGED_MX_CAP. + untagged_cap = int(os.getenv("MAIN_UNTAGGED_MX_CAP", str(max(quota, 200)))) + untagged_used = 0 MX_IDX = 5 # mx_provider is the 6th column from fetch_carriers # Warmup caps are small, but old audiences can contain many prior bounces or # unsubscribes. Scan beyond the quota while still bounding worst-case API calls. @@ -1007,23 +1017,30 @@ def select_sendable_carriers( continue seen_emails.add(email) # Per-MX-operator cap (reputation is per receiving operator). - # Untagged carriers (no mx_provider yet) are NOT capped here -- they - # would otherwise all collapse onto one __default__ bucket and starve - # the pool before tagging completes. The big-operator EXCLUSION in - # fetch_carriers already keeps Google/MS out during warmup; this cap - # bounds the KNOWN operators once tagging fills in. + # Tagged carriers are capped per operator; untagged (no mx_provider + # yet) are bounded by a single shared safety cap (untagged_cap) instead + # of being uncapped -- this stops a flood of unresolved domains (which + # could include big/consumer operators) from dominating a run, without + # starving the pool. The big-operator EXCLUSION in fetch_carriers keeps + # KNOWN Google/MS/consumer-MX out; the pw-mx-tag cron keeps NULL small. prov = (row[MX_IDX] or "").strip().lower() if len(row) > MX_IDX else "" if prov: cap = caps.get(prov, default_cap) if per_op.get(prov, 0) >= cap: skipped[f"mx_cap:{prov}"] = skipped.get(f"mx_cap:{prov}", 0) + 1 continue + else: + if untagged_used >= untagged_cap: + skipped["mx_cap:untagged"] = skipped.get("mx_cap:untagged", 0) + 1 + continue ok, reason = listmonk_sendable(email) if not ok: skipped[reason] = skipped.get(reason, 0) + 1 continue if prov: per_op[prov] = per_op.get(prov, 0) + 1 + else: + untagged_used += 1 selected.append(row) if len(selected) >= quota: break