new-site/scripts/ops/carbonio/nr_purge.sh
justin 2d220a273d ops(carbonio): add noreply@ mailbox auto-purge + daily cron
Server-side classifier for the noreply@performancewest.net Carbonio mailbox
(35,337 msgs, ~98.6% machine noise). Deletes bounces/auto-replies/ticket
auto-acks, keeps genuine human Re: replies + unsubscribes (move to Trash,
reversible).

Classifier precedence: unsubscribe guard > RFC3834 Auto-Submitted header >
machine From-address (localpart/strong-token/display-bot) > STRONG auto
subjects (deletes deceptive Re: auto-acks) > human Re: keep > broad auto-ack
subjects > default keep. Subjects RFC2047 MIME-decoded first.

Three-phase execution: Phase1 fast MAILER-DAEMON search-delete, Phase1.5 fast
search-delete of common auto classes (guarded against Re:/unsub), Phase2
header-classify the small remainder with KEEP-caching.

Validated 23/23 against hand-labelled live sample. Initial backfill reduced
35,337 -> 68 (67 human replies + 1 unsubscribe). Daily cron installed in root
crontab: 17 4 * * * --apply --days 3.
2026-06-21 04:55:50 -05:00

202 lines
12 KiB
Bash
Executable file

#!/bin/bash
# nr_purge.sh -- auto-purge noreply@ mailbox on Carbonio.
# Policy: DELETE bounces + ticket auto-acks + auto-replies; KEEP human replies + unsubscribes.
# Discriminator: RFC 3834 Auto-Submitted header (reliable; catches fake "Re:" auto-responders).
# Reversible: deletions MOVE to /Trash (not hard delete).
#
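# Modes:
# (no args) preview: classify most-recent $PREVIEW_N msgs, read-only, print decisions+survivors
# --preview N preview N most-recent
# --apply full three-phase purge (Phase1 bulk MAILER-DAEMON bounces, Phase1.5 search-delete of
#   common auto classes, Phase2 header-classify remainder)
# --apply --quick Phase1 only (bulk bounce delete), skip Phase1.5 and the header classify
# --days N (or env NR_DAYS=N) restrict the --apply searches to mail from the last N days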
# No `-e`: grep/zm legitimately return non-zero inside the pipelines and loops below.
set -uo pipefail
M="noreply@performancewest.net"
TRASH="/Trash"
PREVIEW_N=200
MODE="preview"; QUICK=0; DAYS="${NR_DAYS:-}"; DATEQ=""

# nr_die <message...>: print an error to stderr and exit with status 2.
nr_die(){ printf 'nr_purge: %s\n' "$*" >&2; exit 2; }

# nr_parse_args <argv...>: set MODE / QUICK / DAYS / PREVIEW_N / DATEQ from the command line.
# Fails closed: an unknown flag, a missing/empty --days value or a non-numeric value aborts,
# because a mistyped bound (e.g. "--apply --day 3") would otherwise purge the whole mailbox.
nr_parse_args(){
  while [ $# -gt 0 ]; do
    case "$1" in
      --apply) MODE="apply" ;;
      --quick) QUICK=1 ;;
      --days)
        { [ $# -ge 2 ] && [ -n "$2" ]; } || nr_die "--days requires a number of days"
        DAYS="$2"; shift ;;
      --preview)
        MODE="preview"
        # N is optional; only consume the next word when it is a number.
        if [ $# -ge 2 ] && [[ "$2" =~ ^[0-9]+$ ]]; then PREVIEW_N="$2"; shift; fi ;;
      *) nr_die "unknown argument: $1" ;;
    esac
    shift
  done
  # DAYS is interpolated into the search query, so it must be a plain integer.
  if [ -n "$DAYS" ] && ! [[ "$DAYS" =~ ^[0-9]+$ ]]; then
    nr_die "--days / NR_DAYS must be a whole number, got: $DAYS"
  fi
  [[ "$PREVIEW_N" =~ ^[1-9][0-9]*$ ]] || nr_die "--preview N must be a positive number, got: $PREVIEW_N"
  # Optional date bound appended to every --apply search (daily cron uses a small window;
  # the initial backfill leaves it blank = all mail).
  DATEQ=""
  if [ -n "$DAYS" ]; then DATEQ=" AND after:-${DAYS}day"; fi
}
nr_parse_args "$@"
TS=$(date +%Y%m%d_%H%M%S)
# mktemp instead of a guessable name: the log is created by a privileged cron job in /tmp.
LOG=$(mktemp "/tmp/nr_purge_${TS}_XXXXXX.log") || nr_die "cannot create log file under /tmp"
# zm <args...>: run zmmailbox with -z -m "$M" followed by the given arguments.
# stderr of the command is discarded; stdout and the exit status pass through.
zm() {
  zmmailbox -z -m "$M" "$@" 2>/dev/null
}
# ---- RFC 2047 MIME-header decode (handles =?utf-8?Q?..?= and ?B?..?=) ----
# Filter: reads header text on stdin, writes the decoded text on stdout.
# Decoding is delegated to perl's Encode ("MIME-Header"); perl diagnostics are dropped.
mime_decode() {
  perl -MEncode -CS -ne 'print Encode::decode("MIME-Header",$_)' \
    2>/dev/null
}
# All patterns below are POSIX EREs applied with `grep -E` to LOWERCASED input.
#
# Machine-sender address localparts (exact, lowercased): definitionally non-human.
# Matched against the localpart of the From address only (not display name) to avoid eating humans.
# The optional ([._+-].*)? tail also accepts tagged forms such as "bounce+abc" or "noreply-7".
FROM_MACHINE_RE='^(mailer-daemon|postmaster|auto-reply|autoreply|auto-responder|autoresponder|no-reply|noreply|donotreply|do-not-reply|bounce|bounces|mdaemon|odoobot|helpdesk|notification|notifications|notify|sysadmin|system|root|abuse)([._+-].*)?$'
# Strong machine tokens that may appear ANYWHERE in the localpart (no human address has these).
FROM_TOKEN_RE='noreply|no-reply|donotreply|do-not-reply|mailer-daemon|auto-reply|autoreply|autoresponse|auto-response|bounces|auth-results|postmaster'
# Display-name bots (substring, lowercased) that use human-ish addresses but are clearly automated.
FROM_DISPLAY_BOT_RE='odoobot|mail delivery (sub)?system|system administrator|microsoft outlook|mail administrator|postmaster'
# STRONG auto markers checked BEFORE the human "Re:" guard -- unambiguous machine subjects that
# no human types, so safe to delete even when wearing a "Re:" prefix (e.g. ticket auto-acks).
# "ticket no" must be followed by a number: the bare prefix also matched human subjects such as
# "Re: ticket not resolved" / "ticket now", which would be deleted ahead of the Re: guard.
# "ticket notification" is listed explicitly so that machine subject is still caught.
STRONG_AUTO_RE='new ticket|ticket created|ticket #|ticket no\.?[[:space:]:#]*[0-9]|ticket notification|ticket has been|has been (assigned|received|resolved|closed|created|opened|updated)|your request with id|request with id|we.?re on it|\(autoresponse\)|auto ?re:|automatic (reply|response)|auto-?response|out of office|out-of-office|authentication report|undeliverable|undelivered|delivery status notification|could ?n.?t be delivered|could not be delivered|message could ?n.?t be|failure notice|returned mail|welcome to .*help ?desk|new case notification'
# Broader auto-ack / bounce subject patterns (lowercased subject), checked AFTER the Re: guard,
# so they only ever apply to subjects that are not threaded replies/forwards.
SUBJ_AUTO_RE='has been (received|resolved|closed|updated|created|opened|assigned)|case (received|closed|resolved|notification)|ticket ?#|ticket no\.?|ticket has been|your ticket|new ticket|ticket created|your request with id|thanks,? we got your|we have received your|out of office|out-of-office|automatic reply|automatic response|auto[- ]?reply|autoreply|auto-?response|\(autoresponse\)|new case|message (recieved|received)|delivery (status notification|failure|has failed)|undelivered|undeliverable|failure notice|^failed:|returned mail|mail delivery|could not be delivered|could ?n.?t be delivered|delayed mail|invalid address|address not found|recipient (address )?rejected|new email address|quota|read-?receipt|priority opened|authentication report|help ?desk'
# _nr_from_value <header-block> -> value of the first From: header on one line.
# Continuation lines (those starting with space/tab) are joined with a single space, so an
# address folded onto the next line is not lost. CRs are stripped.
_nr_from_value(){
  printf '%s\n' "$1" | tr -d '\r' | awk '
    tolower($0) ~ /^from:/ {
      v = $0
      # (getline) > 0 stops at end of input instead of re-testing a stale line forever.
      while ((getline nxt) > 0 && nxt ~ /^[ \t]/) { sub(/^[ \t]+/, " ", nxt); v = v nxt }
      sub(/^[^:]*:[ \t]*/, "", v)
      print v
      exit
    }'
}
# from_localpart <header-block> -> echoes lowercased localpart of From address
# Prefers the mailbox inside <...>; only when there is none does it fall back to the first
# email-looking token, so an address quoted in the display name is not mistaken for the sender.
# Prints nothing when no address can be found.
from_localpart(){
  local from addr
  from=$(_nr_from_value "$1")
  addr=$(printf '%s\n' "$from" | grep -oE '<[^<>[:space:]]+@[^<>[:space:]]+>' | tail -1 | tr -d '<>')
  if [ -z "$addr" ]; then
    addr=$(printf '%s\n' "$from" | grep -oE '[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+' | head -1)
  fi
  if [ -n "$addr" ]; then printf '%s\n' "${addr%%@*}" | tr 'A-Z' 'a-z'; fi
}
# from_display <header-block> -> echoes the whole (unfolded) From value, lowercased
from_display(){ _nr_from_value "$1" | tr 'A-Z' 'a-z'; }
# decide <header-block> <decoded-subject> -> prints "KEEP <reason>" | "DEL <reason>"
# Precedence: (1) unsubscribe wins; (2) Auto-Submitted header; (3) machine From-address (exact
# localpart / strong token / display-bot); (4) STRONG auto subjects (delete even if "Re:");
# (5) genuine human Re:/Fwd:; (6) ticket-tag / broad auto-ack subjects; (7) default keep.
# Uses from_localpart / from_display and the *_RE pattern variables defined in this file.
decide(){
  local H="$1" subj="$2"
  local s as as_kw lp disp
  s=$(printf '%s' "$subj" | tr 'A-Z' 'a-z')
  # Header value, lowercased, with blanks AND a trailing CR removed: "no\r" must still read "no".
  as=$(printf '%s' "$H" | grep -iE '^Auto-Submitted:' | head -1 \
    | sed -E 's/^Auto-Submitted:[[:space:]]*//I' | tr 'A-Z' 'a-z' | tr -d ' \t\r')
  # RFC 3834 allows ";param=value" after the keyword; only the keyword decides.
  as_kw=${as%%;*}
  lp=$(from_localpart "$H"); disp=$(from_display "$H")
  # 1) compliance: unsubscribe/opt-out always KEEP (overrides every machine signal)
  if printf '%s' "$s" | grep -qE 'unsubscribe|opt[ -]?out|remove me|stop emailing'; then
    echo "KEEP unsubscribe"; return
  fi
  # 2) RFC 3834 Auto-Submitted present & != no -> machine
  if [ -n "$as_kw" ] && [ "$as_kw" != "no" ]; then echo "DEL auto-submitted=$as"; return; fi
  # 3) machine From-address (exact localpart, strong token anywhere, or display-name bot)
  if printf '%s' "$lp" | grep -qE "$FROM_MACHINE_RE"; then echo "DEL from-machine=$lp"; return; fi
  if printf '%s' "$lp" | grep -qE "$FROM_TOKEN_RE"; then echo "DEL from-token=$lp"; return; fi
  if printf '%s' "$disp" | grep -qE "$FROM_DISPLAY_BOT_RE"; then echo "DEL from-bot"; return; fi
  # 4) STRONG auto subjects: unambiguous machine markers, delete even if dressed as "Re:"
  if printf '%s' "$s" | grep -qE "$STRONG_AUTO_RE"; then echo "DEL strong-auto-subject"; return; fi
  # 5) genuine human threaded reply (auto ones already removed above).
  #    Tested on the lowercased subject so "re:", "FWD:", "Fw:" ... are all protected,
  #    followed by ":" or "[" (e.g. "Re[2]:").
  if printf '%s' "$s" | grep -qE '^[[:space:]]*(re|fwd?)(:|\[)'; then echo "KEEP human-reply"; return; fi
  # 6) ticket tag / broad auto-ack subject patterns
  if printf '%s' "$subj" | grep -qE '^[[:space:]]*\[##'; then echo "DEL ticket-tag"; return; fi
  if printf '%s' "$s" | grep -qE "$SUBJ_AUTO_RE"; then echo "DEL auto-ack-subject"; return; fi
  # 7) default: keep (human-safe)
  echo "KEEP default"
}
# _nr_headers <msg-id> -> the message's header block (everything through the first blank line).
# CRs are stripped first so a CRLF blank line still terminates the block and no header value
# carries a trailing \r into decide().
_nr_headers(){
  zm getRestURL "/?id=$1&fmt=rfc822" | tr -d '\r' | sed -n '1,/^$/p'
}
# _nr_subject: header block on stdin -> unfolded, MIME-decoded Subject value on stdout.
# The header name is matched with tolower() (works in any awk, unlike gawk's IGNORECASE), and
# the continuation loop checks getline's result: without that check a folded Subject that is
# the last line of input (the trailing blank line is stripped by $(...)) never terminates.
_nr_subject(){
  awk '
    tolower($0) ~ /^subject:/ {
      s = $0
      while ((getline nxt) > 0 && nxt ~ /^[ \t]/) { sub(/^[ \t]+/, " ", nxt); s = s nxt }
      print s
      exit
    }' | sed -E 's/^Subject:[[:space:]]*//I' | mime_decode
}
# fetch the joined+decoded Subject for a message id
get_subject(){ _nr_headers "$1" | _nr_subject; }
# ---- classify one message id -> prints "KEEP <reason>" or "DEL <reason>" ----
classify(){
  local H subj
  H=$(_nr_headers "$1")
  subj=$(printf '%s\n' "$H" | _nr_subject)
  decide "$H" "$subj"
}
# classify + emit decoded subject (single fetch) -> "<DECISION>\t<subject>"
# The subject is truncated to 70 characters for display.
classify2(){
  local H subj
  H=$(_nr_headers "$1")
  subj=$(printf '%s\n' "$H" | _nr_subject)
  printf '%s\t%s\n' "$(decide "$H" "$subj")" "$(printf '%s' "$subj" | cut -c1-70)"
}
# ids_for <search-query> -> message ids, one per line.
# Runs a message-type search (limit 1000) through zm, keeps the result rows that contain
# the word "mess", and prints the second whitespace-separated column of each.
ids_for() {
  local query="$1"
  zm search -l 1000 -t message "$query" \
    | grep -w mess \
    | awk '{print $2}'
}
move_to_trash(){ # stdin: ids one per line
local buf=() id n=0
while read -r id; do [ -z "$id" ] && continue; buf+=("$id"); n=$((n+1))
if [ ${#buf[@]} -ge 200 ]; then
local c="${buf[*]}"; zm moveMessage "${c// /,}" "$TRASH" >/dev/null; buf=(); fi
done
if [ ${#buf[@]} -gt 0 ]; then local c="${buf[*]}"; zm moveMessage "${c// /,}" "$TRASH" >/dev/null; fi
echo "$n"
}
# run_preview: read-only dry run over the $PREVIEW_N most recent inbox messages.
# Classifies each message with classify2, then prints (and appends to $LOG) the keep/delete
# counts, every message that would be kept, and the first 25 that would be deleted.
# Nothing is moved. Reads: PREVIEW_N TS LOG.
run_preview(){
  local ids id line d subj keep=0 del=0 survivors deletes
  # mktemp rather than a guessable /tmp name: these files are truncated/appended by a
  # privileged job, so a predictable path could be pre-planted.
  survivors=$(mktemp "${TMPDIR:-/tmp}/nr_survivors_${TS}_XXXXXX.txt") || return 1
  deletes=$(mktemp "${TMPDIR:-/tmp}/nr_deletes_${TS}_XXXXXX.txt") || return 1
  echo "=== PREVIEW (read-only) most-recent $PREVIEW_N ===" | tee -a "$LOG"
  ids=$(zm search -l "$PREVIEW_N" -t message "in:inbox" | grep -w mess | awk '{print $2}')
  for id in $ids; do
    line=$(classify2 "$id") # "<DECISION>\t<subject>"
    d=${line%%$'\t'*}; subj=${line#*$'\t'}
    if [[ "$d" == KEEP* ]]; then
      keep=$((keep+1)); echo "id=$id [$d] $subj" >> "$survivors"
    else
      del=$((del+1)); echo "id=$id [$d] $subj" >> "$deletes"
    fi
  done
  echo "kept=$keep deleted=$del" | tee -a "$LOG"
  echo "--- SURVIVORS (would KEEP) ---" | tee -a "$LOG"
  tee -a "$LOG" < "$survivors"
  echo "--- sample DELETES (first 25) ---" | tee -a "$LOG"
  head -25 "$deletes" | tee -a "$LOG"
  echo "full lists: $survivors $deletes" | tee -a "$LOG"
}
if [ "${MODE:-}" = "preview" ]; then
  run_preview || exit 1
  exit 0
fi
# ---- APPLY ----
# run_apply: move machine mail from the inbox to $TRASH in three phases, logging to $LOG.
#   Phase 1   search-delete everything from MAILER-DAEMON (unsubscribe/opt-out subjects excluded)
#   Phase 1.5 search-delete common auto classes, guarded against Re/Fwd/unsubscribe subjects
#   Phase 2   fetch the headers of what is left and let classify() decide per message
# With QUICK=1 only Phase 1 runs. $DATEQ (possibly empty) is appended to every search.
# Reads: TS TRASH LOG DATEQ QUICK.  Calls: ids_for, move_to_trash, classify.
run_apply(){
  local p1=0 p15=0 p2=0 passes n batch q guard_clause
  local seen_file ids id verdict delbuf newwork

  echo "=== APPLY purge $TS (move to $TRASH) ===" | tee -a "$LOG"

  # Phase 1: fast bulk bounce delete (MAILER-DAEMON = definitionally bounce), keep-guard on unsubscribe.
  # The search returns at most one page, so repeat until it comes back empty.
  echo "PHASE1 bulk bounces..." | tee -a "$LOG"
  passes=0
  while :; do
    batch=$(ids_for "in:inbox from:MAILER-DAEMON AND NOT (subject:unsubscribe OR subject:\"opt out\")$DATEQ")
    [ -z "${batch// }" ] && break
    n=$(printf '%s\n' "$batch" | move_to_trash); n=${n:-0}
    p1=$((p1+n)); echo " moved $n (cum $p1)" | tee -a "$LOG"
    [ "$n" -lt 1 ] && break   # nothing moved: stop instead of re-selecting the same ids
    passes=$((passes+1))
    if [ "$passes" -gt 200 ]; then echo " PHASE1 guard stop" | tee -a "$LOG"; break; fi
  done
  echo "PHASE1 done: $p1 bounces -> Trash" | tee -a "$LOG"
  if [ "$QUICK" = "1" ]; then echo "quick mode: stop after phase1" | tee -a "$LOG"; return 0; fi

  # Phase 1.5: fast SEARCH-based bulk delete of the common non-MAILER machine classes
  # (postmaster bounces, Undeliverable/Undelivered DSNs, OOO/automatic-reply). These are
  # matched server-side (no per-message fetch) and HARD-guarded so anything wearing a
  # genuine "Re:"/Fwd: or unsubscribe falls through to the accurate Phase 2 classifier.
  echo "PHASE1.5 fast search-delete of common auto classes..." | tee -a "$LOG"
  guard_clause='AND NOT (subject:Re OR subject:Fwd OR subject:unsubscribe OR subject:"opt out")'
  for q in \
    "in:inbox from:postmaster $guard_clause$DATEQ" \
    "in:inbox subject:Undeliverable $guard_clause$DATEQ" \
    "in:inbox subject:\"Undelivered Mail\" $guard_clause$DATEQ" \
    "in:inbox subject:\"automatic reply\" $guard_clause$DATEQ" \
    "in:inbox subject:\"out of office\" $guard_clause$DATEQ" \
    "in:inbox subject:\"failure notice\" $guard_clause$DATEQ" \
    "in:inbox subject:\"delivery status notification\" $guard_clause$DATEQ" \
  ; do
    passes=0
    while :; do
      batch=$(ids_for "$q")
      [ -z "${batch// }" ] && break
      n=$(printf '%s\n' "$batch" | move_to_trash); n=${n:-0}
      p15=$((p15+n)); echo " [$q] moved $n (cum $p15)" | tee -a "$LOG"
      [ "$n" -lt 1 ] && break
      passes=$((passes+1)); [ "$passes" -gt 50 ] && break
    done
  done
  echo "PHASE1.5 done: $p15 auto-class -> Trash" | tee -a "$LOG"

  # Phase 2: header-classify the remainder. Offset paging is unreliable on this box,
  # so we loop: classify the top page, delete its DELs, cache KEEPs as "seen" so we
  # don't re-fetch them next pass. Terminate when a page yields only already-seen KEEPs.
  # NOTE(review): because only the top page is ever requested, messages beyond the first
  # 1000 survivors are never reached in one run -- confirm that is acceptable.
  echo "PHASE2 header-classify remainder..." | tee -a "$LOG"
  seen_file=$(mktemp "${TMPDIR:-/tmp}/nr_seen_${TS}_XXXXXX.txt") || {
    echo "PHASE2 aborted: cannot create seen-cache file" | tee -a "$LOG"; return 1; }
  passes=0
  while :; do
    ids=$(ids_for "in:inbox AND NOT from:MAILER-DAEMON$DATEQ")
    [ -z "${ids// }" ] && break
    delbuf=""; newwork=0
    for id in $ids; do
      # -F: the id is a literal, not a pattern.  Already classified KEEP -> skip.
      grep -Fqx -- "$id" "$seen_file" && continue
      newwork=1
      verdict=$(classify "$id")
      if [[ "$verdict" == DEL* ]]; then delbuf+="$id"$'\n'; else echo "$id" >> "$seen_file"; fi
    done
    if [ -n "${delbuf// }" ]; then
      n=$(printf '%s' "$delbuf" | move_to_trash); n=${n:-0}
      p2=$((p2+n)); echo " page moved $n (cum $p2)" | tee -a "$LOG"
      # DELs are not cached, so if none could be moved the next pass would re-fetch them all.
      if [ "$n" -lt 1 ]; then echo " move failed, stop" | tee -a "$LOG"; break; fi
    fi
    # A page with no new (unseen) messages means everything left is cached-KEEP -> done.
    if [ "$newwork" = "0" ]; then echo " page all-seen-KEEP, stop" | tee -a "$LOG"; break; fi
    passes=$((passes+1))
    if [ "$passes" -gt 120 ]; then echo "guard stop" | tee -a "$LOG"; break; fi
  done
  echo "PHASE2 done: $p2 auto/ack -> Trash; survivors cached in $seen_file ($(wc -l < "$seen_file"))" | tee -a "$LOG"
  # The total covers all three phases (Phase 1.5 was previously left out of the sum).
  echo "TOTAL moved to Trash: $((p1+p15+p2))" | tee -a "$LOG"
}
# Preview mode exits earlier in the file, so by this point MODE is "apply".
if [ "${MODE:-}" = "apply" ]; then run_apply; fi