diff --git a/infra/cron/pw-mail-reputation b/infra/cron/pw-mail-reputation
index 7b186c2..1525f00 100644
--- a/infra/cron/pw-mail-reputation
+++ b/infra/cron/pw-mail-reputation
@@ -9,4 +9,4 @@
 # warmup-tg-alert cron) and pipe it into the DB-connected workers container.
 # Runs at 06:10 UTC (before the 06:30 scrub + the 07:00-08:00 campaign builders)
 # so it captures the full prior day before logrotate.
-10 6 * * * deploy { sudo cat /var/log/mail.log /var/log/mail.log.1 2>/dev/null; sudo zcat /var/log/mail.log.2.gz 2>/dev/null; } | (cd /opt/performancewest && docker compose exec -T workers python3 -m scripts.mail_reputation_monitor --alert -) >> /var/log/pw-mail-reputation.log 2>&1
+10 6 * * * deploy { sudo cat /var/log/mail.log /var/log/mail.log.1 2>/dev/null; sudo zcat /var/log/mail.log.2.gz 2>/dev/null; } | (cd /opt/performancewest && docker compose exec -T workers python3 -m scripts.mail_reputation_monitor --alert -) >> /opt/performancewest/logs/pw-mail-reputation.log 2>&1
diff --git a/infra/cron/pw-warmup-tg-alert b/infra/cron/pw-warmup-tg-alert
index 420f803..be24d55 100644
--- a/infra/cron/pw-warmup-tg-alert
+++ b/infra/cron/pw-warmup-tg-alert
@@ -4,4 +4,4 @@
-# to /var/log/pw-warmup-healthcheck.log. Script: infra/monitoring/pw-warmup-tg-alert.sh
+# to /opt/performancewest/logs/pw-warmup-healthcheck.log. Script: infra/monitoring/pw-warmup-tg-alert.sh
 # -> /usr/local/bin/pw-warmup-tg-alert. Reads TELEGRAM_BOT_TOKEN/CHAT_ID from
 # /opt/performancewest/.env.
-0 20 * * * deploy /usr/local/bin/pw-warmup-tg-alert >> /var/log/pw-warmup-healthcheck.log 2>&1
+0 20 * * * deploy /usr/local/bin/pw-warmup-tg-alert >> /opt/performancewest/logs/pw-warmup-healthcheck.log 2>&1
diff --git a/infra/monitoring/pw-warmup-tg-alert.sh b/infra/monitoring/pw-warmup-tg-alert.sh
index fabed9d..728d8ef 100755
--- a/infra/monitoring/pw-warmup-tg-alert.sh
+++ b/infra/monitoring/pw-warmup-tg-alert.sh
@@ -4,7 +4,7 @@
 set -uo pipefail
 LOG=/var/log/mail.log
 TODAY=$(date '+%b %d')
-REPORT=/var/log/pw-warmup-healthcheck.log
+REPORT=/opt/performancewest/logs/pw-warmup-healthcheck.log
 MIN_DELIVERY=65
 MAX_SPAMBLOCK=150
 MIN_SENT=50
@@ -27,20 +27,9 @@ HBOUNCE=$(mlog 'hcout[0-9]/smtp' | grep -c 'status=bounced')
 HSPAM=$(mlog 'hcout[0-9]/smtp' | grep 'status=bounced' | grep -c '550-5.7.1')
 HTOTAL=$((HSENT + HBOUNCE)); HDEL=0
 [ "$HTOTAL" -gt 0 ] && HDEL=$(python3 -c "print(round(100*$HSENT/$HTOTAL))")
-# IP rehab pool (.91-.93 / rehab02-04) — recovering after the May 30-31 blast.
-# A recovering IP naturally bounces more on a cold list, so the rehab bounce
-# threshold is lenient; we only flag if it is alarmingly high (a sign the rehab
-# recipient quality regressed) or if a rehab IP lands on a DNSBL.
-RSENT=$(mlog 'rehab0[234]/smtp' | grep -c 'status=sent')
-RBOUNCE=$(mlog 'rehab0[234]/smtp' | grep -c 'status=bounced')
-RTOTAL=$((RSENT + RBOUNCE)); RDEL=0
-[ "$RTOTAL" -gt 0 ] && RDEL=$(python3 -c "print(round(100*$RSENT/$RTOTAL))")
-# DNSBL check for the rehab IPs (reuse Spamhaus ZEN — the one that matters most).
-RBL=''
-for ip in 91 92 93; do
-  hit=$(dig +short +time=3 +tries=1 ${ip}.124.174.207.zen.spamhaus.org 2>/dev/null | head -1)
-  [ -n "$hit" ] && RBL="${RBL}.${ip} on Spamhaus ZEN ($hit); "
-done
+# NOTE: the IP rehab pool (.91-.93 / rehab02-04) and the multi-IP rotation were
+# REMOVED 2026-06-23 (snowshoe cleanup, see docs/deliverability.md). Only the two
+# warm sending IPs remain: .94 (trucking / out05) and .107 (HC / hcout1).
 PROBLEMS=''
 if [ "$MSENT" -ge "$MIN_SENT" ]; then
   [ "$MDEL" -lt "$MIN_DELIVERY" ] && PROBLEMS="${PROBLEMS}- Main pool delivery ${MDEL}% (below ${MIN_DELIVERY}%)\n"
@@ -50,20 +39,24 @@ if [ "$HSENT" -ge "$MIN_SENT" ]; then
   [ "$HDEL" -lt "$MIN_DELIVERY" ] && PROBLEMS="${PROBLEMS}- HC stream delivery ${HDEL}%\n"
   [ "$HSPAM" -gt "$MAX_SPAMBLOCK" ] && PROBLEMS="${PROBLEMS}- HC stream spam/policy blocks: ${HSPAM}\n"
 fi
-# Rehab problems: DNSBL listing is always a problem; bounce >60% with real
-# volume means the recipient quality slipped (rehab should be on clean domains).
-[ -n "$RBL" ] && PROBLEMS="${PROBLEMS}- IP rehab DNSBL: ${RBL}\n"
-if [ "$RSENT" -ge 10 ] && [ "$RDEL" -lt 40 ]; then
-  PROBLEMS="${PROBLEMS}- IP rehab delivery ${RDEL}% (recipient quality slipped)\n"
-fi
+# DNSBL check for the two live sending IPs (.94 trucking, .107 HC). 8.8.8.8 is
+# blocked by Spamhaus (returns 127.255.255.254 = "open resolver"), so query
+# Control D (76.76.2.0), which returns real ZEN data; the grep keeps only
+# 127.0.0.x listing codes. Neustar is not queried here (manual cross-check only).
+RBL=''
+for ip in 94 107; do
+  ans=$(dig +short +time=3 +tries=1 ${ip}.124.174.207.zen.spamhaus.org @76.76.2.0 2>/dev/null | grep -E '^127\.0\.0\.' | head -1)
+  [ -n "$ans" ] && RBL="${RBL}.${ip} on Spamhaus ZEN ($ans); "
+done
+[ -n "$RBL" ] && PROBLEMS="${PROBLEMS}- Sending IP on DNSBL: ${RBL}\n"
 {
   echo "==== TG WARMUP CHECK $(date) ===="
-  echo "MAIN: sent=$MSENT bounced=$MBOUNCE delivery=${MDEL}% spamblock=$MSPAM"
-  echo "HC: sent=$HSENT bounced=$HBOUNCE delivery=${HDEL}% spamblock=$HSPAM"
-  echo "REHAB(.91-.93): sent=$RSENT bounced=$RBOUNCE delivery=${RDEL}% dnsbl=${RBL:-clean}"
+  echo "MAIN(.94): sent=$MSENT bounced=$MBOUNCE delivery=${MDEL}% spamblock=$MSPAM"
+  echo "HC(.107): sent=$HSENT bounced=$HBOUNCE delivery=${HDEL}% spamblock=$HSPAM"
+  echo "dnsbl: ${RBL:-clean}"
   echo "problems: ${PROBLEMS:-none}"
 } >> "$REPORT" 2>&1
 if [ -n "$PROBLEMS" ]; then
-  MSG=$(printf '⚠️ Performance West IP reputation alert (%s)\n\nMain pool: %d%% delivery, %d sent, %d bounced, %d spam-blocks\nHC stream: %d%% delivery, %d sent, %d spam-blocks\nRehab (.91-.93): %d%% delivery, %d sent, %d bounced (dnsbl: %s)\n\nIssues:\n%b' "$TODAY" "$MDEL" "$MSENT" "$MBOUNCE" "$MSPAM" "$HDEL" "$HSENT" "$HSPAM" "$RDEL" "$RSENT" "$RBOUNCE" "${RBL:-clean}" "$PROBLEMS")
+  MSG=$(printf '⚠️ Performance West IP reputation alert (%s)\n\nMain pool (.94): %d%% delivery, %d sent, %d bounced, %d spam-blocks\nHC stream (.107): %d%% delivery, %d sent, %d spam-blocks\nDNSBL: %s\n\nIssues:\n%b' "$TODAY" "$MDEL" "$MSENT" "$MBOUNCE" "$MSPAM" "$HDEL" "$HSENT" "$HSPAM" "${RBL:-clean}" "$PROBLEMS")
   tg "$MSG"
 fi
diff --git a/scripts/dmarc_report_parser.py b/scripts/dmarc_report_parser.py
index 5149ed4..19e1f72 100644
--- a/scripts/dmarc_report_parser.py
+++ b/scripts/dmarc_report_parser.py
@@ -86,6 +86,88 @@ def is_ours(ip: str) -> bool:
     return any(addr in net for net in OUR_NETS)
 
 
+# Reverse-DNS substrings that identify a LEGIT forwarder / recipient-side mail
+# security gateway. These re-send our mail from their own IP, which naturally
+# breaks SPF/DKIM alignment -> the forwarded copy "fails" DMARC. That is benign
+# (the ORIGINAL was already delivered+aligned; our p=reject only drops the
+# forwarded duplicate). We must NOT alert on these or the digest is pure noise.
+# Matched case-insensitively against the source IP's PTR record.
+#
+# Hints are plain substrings, so keep them specific: bare "cisco" would also
+# match geographic PTRs such as "*.sanfrancisco.<isp>.net", hence ".cisco.com".
+FORWARDER_PTR_HINTS = (
+    "inkyphishfence", "cloud-sec-av", "proofpoint", "pphosted", "ppe-hosted",
+    "mimecast", "barracuda", "messagelabs", "symanteccloud", "fireeyecloud",
+    "trendmicro", "mailcontrol", "forcepoint", ".cisco.com", "iphmx",  # Cisco ESA
+    "mxlogic", "mailprotect", "emailsrvr", "godaddy", "secureserver",
+    "outlook.com", "protection.outlook", "google.com", "googlemail",
+    "amazonses", "sendgrid", "mailgun", "mcsv.net", "mailchimp",
+    "fastmail", "messagingengine", "zoho", "mailroute", "spamh",
+    "antispamcloud", "mailspamprotection", "fortimail", "sophos",
+)
+
+_ptr_cache: dict[str, str] = {}
+_fcrdns_cache: dict[tuple[str, str], bool] = {}
+
+
+def reverse_dns(ip: str) -> str:
+    """Best-effort PTR lookup (cached). Empty string on failure.
+
+    The PTR is returned lower-cased and UNVERIFIED: whoever controls the IP's
+    reverse zone chooses it, so never trust it without forward confirmation.
+    """
+    import socket
+
+    if ip in _ptr_cache:
+        return _ptr_cache[ip]
+    try:
+        ptr = socket.gethostbyaddr(ip)[0].lower()
+    except (OSError, ValueError):
+        # OSError covers socket.herror/gaierror (no PTR, resolver failure);
+        # ValueError covers malformed input. Anything else is a real bug.
+        ptr = ""
+    _ptr_cache[ip] = ptr
+    return ptr
+
+
+def _ptr_forward_confirmed(ip: str, ptr: str) -> bool:
+    """True if *ptr* resolves back to *ip* (forward-confirmed reverse DNS).
+
+    Cached per (IP, PTR) pair. Lookup or parse failure = "not confirmed".
+    """
+    import ipaddress
+    import socket
+
+    if (ip, ptr) in _fcrdns_cache:
+        return _fcrdns_cache[(ip, ptr)]
+    confirmed = False
+    try:
+        wanted = ipaddress.ip_address(ip)
+        for info in socket.getaddrinfo(ptr, None):
+            # info[4][0] is the resolved address; drop any IPv6 "%scope" suffix.
+            if ipaddress.ip_address(info[4][0].split("%", 1)[0]) == wanted:
+                confirmed = True
+                break
+    except (OSError, ValueError):
+        confirmed = False
+    _fcrdns_cache[(ip, ptr)] = confirmed
+    return confirmed
+
+
+def is_known_forwarder(ip: str) -> bool:
+    """True if the IP's PTR looks like a legit forwarder / security gateway, so
+    DMARC failures from it are benign (forwarded mail, not spoofing).
+
+    The PTR must also resolve back to the same IP: a bare PTR is set by the
+    IP's owner, so a spoofer could otherwise name their host after a gateway
+    and have their failures silently tagged as benign.
+    """
+    ptr = reverse_dns(ip)
+    if not ptr or not any(hint in ptr for hint in FORWARDER_PTR_HINTS):
+        return False
+    return _ptr_forward_confirmed(ip, ptr)
+
+
 # ── attachment extraction ─────────────────────────────────────────────────────
 def extract_xml(payload: bytes, filename: str) -> bytes | None:
     """Decompress a DMARC report attachment to raw XML bytes."""
@@ -280,13 +362,23 @@ def summarize(conn, days: int = 7) -> tuple[str, list[str]]:
             continue
         pass_pct = round(100 * passed / total)
         ours = is_ours(ip)
-        tag = "ours" if ours else "EXTERNAL"
+        if ours:
+            tag = "ours"
+        elif failed > 0 and is_known_forwarder(ip):
+            tag = "fwd"  # legit forwarder / security gateway -- failures benign
+        else:
+            tag = "EXTERNAL"
         lines.append(f" {ip:<16} [{tag:<8}] total={total:<6} pass={pass_pct}% fail={failed}")
-        # Alerts: our IP failing alignment, OR an external IP sending as us at volume.
+        # Alert ONLY on genuinely actionable cases:
+        # 1. OUR OWN IP failing alignment = a real auth/config break we must fix.
+        # 2. An UNKNOWN external IP (not ours, not a recognized forwarder) sending
+        #    as us at high volume = possible spoofing. Recognized forwarders
+        #    (Proofpoint/Mimecast/Inky/etc. re-sending our mail) naturally fail
+        #    SPF/DKIM alignment and are filtered out -- they were the digest noise.
         if ours and pass_pct < 95 and total >= 20:
             problems.append(f"{ip} (ours): only {pass_pct}% DMARC pass ({failed}/{total} fail) -- alignment broken")
-        if not ours and failed >= 20:
-            problems.append(f"{ip} (EXTERNAL): {failed} failing msgs sending as us -- possible spoofing")
+        elif tag == "EXTERNAL" and failed >= 100:
+            problems.append(f"{ip} (EXTERNAL, PTR={reverse_dns(ip) or 'none'}): {failed} failing msgs sending as us -- possible spoofing")
 
     return "\n".join(lines), problems
 