# new-site/scripts/burner_list_verify.py

#!/usr/bin/env python3
"""Burner-domain list verification: write deliverability back to fmcsa_carriers.

The SMTP-probe verifier (email_verifier.py) can't tell which catch-all /
mx_unreachable addresses actually deliver. The only ground truth is a REAL send.
We do that from a disposable burner sending domain (NOT performancewest.net /
carrierone.com — see docs/campaign-deliverability-plan.md) so the inevitable
bounces never touch PW's reputation. This script reconciles that send:

  1. Scan the burner MTA's mail.log for messages FROM the burner sender.
  2. Any recipient that hard-bounced  -> fmcsa_carriers.email_verify_result =
     'hard_bounced'        (permanently excluded from PW campaigns).
  3. Any recipient that was DELIVERED (status=sent, no later bounce) and is not
     already smtp_valid     -> 'send_confirmed'  (proven deliverable; the PW
     campaign filter treats smtp_valid + send_confirmed as sendable).

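Idempotent: only upgrades rows whose current result is NULL or one of the
soft/unknown values listed in UPGRADABLE ('catch_all_*', 'mx_probe_blocked',
'mx_unreachable', 'smtp_temp_error', 'smtp_unknown_45x') to 'send_confirmed',
and only sets 'hard_bounced' on a real bounce. Never downgrades an
already-confirmed address except to mark a genuine bounce.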

Usage:
    python3 -m scripts.burner_list_verify --log /var/log/burner-mail.log
    python3 -m scripts.burner_list_verify --log mail.log --dry-run
"""
from __future__ import annotations

import argparse
import os
import re
import sys

import psycopg2

# Connection string for the database that holds fmcsa_carriers; handed verbatim
# to psycopg2.connect(). Empty string when the env var is unset.
DATABASE_URL = os.getenv("DATABASE_URL", "")

# Sender(s) used by the burner verification campaign. Override via env when the
# burner domain is provisioned (e.g. BURNER_SENDERS="verify@listcheck-xyz.com").
# Comma-separated; entries are trimmed and lower-cased, blanks are dropped.
BURNER_SENDERS = {
    s.strip().lower()
    for s in os.getenv("BURNER_SENDERS", "").split(",")
    if s.strip()
}

# Queue ID that follows the "postfix/<daemon>[pid]: " prefix. Two formats:
#   * short IDs (default): upper-case hexadecimal, e.g. 4F9D195432C
#   * long IDs (enable_long_queue_ids=yes): digits plus upper/lower-case
#     consonants, e.g. 3Ps0FS1Zhtz1PFjb
# Neither alphabet matches the "NOQUEUE" placeholder or words such as
# "warning", so those lines are not mistaken for queued messages.
QID_RE = re.compile(
    r"postfix/\w+\[\d+\]: "
    r"([0-9A-F]+|[0-9B-DF-HJ-NP-TV-Zb-df-hj-np-tv-z]+):"
)
# Envelope sender / recipient / delivery status fields of a log line.
FROM_RE = re.compile(r"from=<([^>]*)>")
TO_RE = re.compile(r"to=<([^>]*)>")
STATUS_RE = re.compile(r"status=(\w+)")

# Results we are allowed to UPGRADE to 'send_confirmed'. We never overwrite an
# explicit smtp_valid (already best) or a hard_bounced (worse signal wins).
# 'mx_probe_blocked' is the big-ISP pool (Comcast/AT&T/Verizon/etc.) the SMTP
# probe couldn't reach — these are the prime burner-verification targets.
UPGRADABLE = ("catch_all_domain", "catch_all_detected", "mx_probe_blocked",
              "mx_unreachable", "smtp_temp_error", "smtp_unknown_451",
              "smtp_unknown_450")


def scan_log(log_path: str) -> tuple[set[str], set[str]]:
    """Return (delivered_emails, bounced_emails) for burner-sender messages.

    Reads the Postfix log line by line. A queue ID is treated as a burner
    message once a ``from=<...>`` line names one of BURNER_SENDERS; after that,
    every ``to=<...> ... status=...`` line for the same queue ID is classified:
    ``status=bounced`` -> bounced, ``status=sent`` -> delivered. Other statuses
    (e.g. deferred) are ignored.

    Args:
        log_path: path of the mail.log to scan.

    Returns:
        Two sets of lower-cased recipient addresses. An address that bounced
        at any point is never reported as delivered. Both sets are empty (and
        an error is printed to stderr) when BURNER_SENDERS is not configured.

    Raises:
        OSError: if the log file cannot be opened.
    """
    if not BURNER_SENDERS:
        print("ERROR: set BURNER_SENDERS (e.g. verify@your-burner-domain.com)",
              file=sys.stderr)
        return set(), set()

    burner_qids: set[str] = set()
    delivered: set[str] = set()
    bounced: set[str] = set()

    # Explicit encoding keeps the scan independent of the process locale;
    # undecodable bytes are dropped rather than aborting the run.
    with open(log_path, encoding="utf-8", errors="ignore") as f:
        for line in f:
            qm = QID_RE.search(line)
            if not qm:
                continue
            qid = qm.group(1)

            tm = TO_RE.search(line)
            sm = STATUS_RE.search(line)
            is_delivery = bool(tm and sm)

            fm = FROM_RE.search(line)
            if fm:
                if fm.group(1).lower() in BURNER_SENDERS:
                    burner_qids.add(qid)
                elif not is_delivery:
                    # Postfix short queue IDs are reused. A sender line for a
                    # different envelope sender means this ID now belongs to an
                    # unrelated message. Delivery lines are exempt so quoted
                    # text in a remote reply cannot untrack a burner message.
                    burner_qids.discard(qid)

            if is_delivery:
                if qid in burner_qids:
                    rcpt = tm.group(1).lower()
                    status = sm.group(1).lower()
                    if status == "bounced":
                        bounced.add(rcpt)
                    elif status == "sent":
                        delivered.add(rcpt)
            elif line.rstrip().endswith(": removed"):
                # The message left the queue; a later reappearance of this ID
                # is a new message and must be re-identified by its sender.
                burner_qids.discard(qid)

    # A bounce anywhere wins over a "sent" (deferred-then-bounced).
    delivered -= bounced
    return delivered, bounced


def writeback(delivered: set[str], bounced: set[str], dry_run: bool = False) -> dict:
    """Apply send_confirmed / hard_bounced to fmcsa_carriers.

    Args:
        delivered: lower-cased addresses whose burner send was accepted.
        bounced: lower-cased addresses whose burner send hard-bounced.
        dry_run: when True, no database connection is opened; the returned
            counts are simply the number of addresses in each set.

    Returns:
        ``{"confirmed": n, "bounced": m}``. Outside dry-run these are the
        numbers of rows actually updated (summed ``cursor.rowcount``), so
        addresses that match no row or are already in the target state
        contribute 0.
    """
    stats = {"confirmed": 0, "bounced": 0}
    if not (delivered or bounced):
        return stats

    if dry_run:
        # A preview must not need a reachable database or a valid DATABASE_URL.
        stats["bounced"] = len(bounced)
        stats["confirmed"] = len(delivered)
        return stats

    conn = psycopg2.connect(DATABASE_URL)
    try:
        with conn.cursor() as cur:
            # Hard bounces: always mark (worst signal wins), excludes from PW sends.
            # IS DISTINCT FROM also matches NULL and skips rows already marked,
            # so re-running the script does not inflate the count.
            for email in bounced:
                cur.execute(
                    """UPDATE fmcsa_carriers
                          SET email_verify_result = 'hard_bounced',
                              email_verified = FALSE
                        WHERE lower(email_address) = %s
                          AND email_verify_result IS DISTINCT FROM 'hard_bounced'""",
                    (email,),
                )
                stats["bounced"] += cur.rowcount
            # Delivered: upgrade soft/unknown results to send_confirmed.
            # UPGRADABLE is passed as a tuple, which psycopg2 expands for IN.
            for email in delivered:
                cur.execute(
                    """UPDATE fmcsa_carriers
                          SET email_verify_result = 'send_confirmed',
                              email_verified = TRUE
                        WHERE lower(email_address) = %s
                          AND (email_verify_result IN %s OR email_verify_result IS NULL)""",
                    (email, UPGRADABLE),
                )
                stats["confirmed"] += cur.rowcount
        # Single commit after all updates; on any exception the connection is
        # closed below without committing.
        conn.commit()
    finally:
        conn.close()
    return stats


def main() -> int:
    """Command-line entry point: scan the burner log and write results back.

    Returns:
        Process exit status: 0 on success; 1 when the log file is missing or
        unreadable, or when BURNER_SENDERS is not configured.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--log", default="/var/log/burner-mail.log",
                    help="burner MTA mail.log to scan")
    ap.add_argument("--dry-run", action="store_true",
                    help="report counts without writing to the database")
    args = ap.parse_args()

    if not os.path.exists(args.log):
        print(f"log not found: {args.log}", file=sys.stderr)
        return 1
    if not BURNER_SENDERS:
        # Without a configured sender nothing can be attributed to the burner
        # campaign; fail loudly instead of reporting a successful empty run.
        print("ERROR: set BURNER_SENDERS (e.g. verify@your-burner-domain.com)",
              file=sys.stderr)
        return 1

    try:
        delivered, bounced = scan_log(args.log)
    except OSError as exc:
        # Path exists but cannot be read (directory, permissions, ...).
        print(f"cannot read log {args.log}: {exc}", file=sys.stderr)
        return 1
    print(f"burner scan: {len(delivered)} delivered, {len(bounced)} bounced")

    stats = writeback(delivered, bounced, dry_run=args.dry_run)
    tag = "[dry-run] " if args.dry_run else ""
    print(f"{tag}writeback: send_confirmed +{stats['confirmed']}, "
          f"hard_bounced +{stats['bounced']}")
    return 0


# Script entry: run main() and use its return value as the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())