Server-side classifier for the noreply@performancewest.net Carbonio mailbox (35,337 msgs, ~98.6% machine noise). Deletes bounces/auto-replies/ticket auto-acks, keeps genuine human Re: replies + unsubscribes (move to Trash, reversible). Classifier precedence: unsubscribe guard > RFC3834 Auto-Submitted header > machine From-address (localpart/strong-token/display-bot) > STRONG auto subjects (deletes deceptive Re: auto-acks) > human Re: keep > broad auto-ack subjects > default keep. Subjects RFC2047 MIME-decoded first. Three-phase execution: Phase1 fast MAILER-DAEMON search-delete, Phase1.5 fast search-delete of common auto classes (guarded against Re:/unsub), Phase2 header-classify the small remainder with KEEP-caching. Validated 23/23 against hand-labelled live sample. Initial backfill reduced 35,337 -> 68 (67 human replies + 1 unsubscribe). Daily cron installed in root crontab: 17 4 * * * --apply --days 3.
202 lines
12 KiB
Bash
Executable file
202 lines
12 KiB
Bash
Executable file
#!/bin/bash
|
|
# nr_purge.sh -- auto-purge noreply@ mailbox on Carbonio.
|
|
# Policy: DELETE bounces + ticket auto-acks + auto-replies; KEEP human replies + unsubscribes.
|
|
# Discriminator: RFC 3834 Auto-Submitted header (reliable; catches fake "Re:" auto-responders).
|
|
# Reversible: deletions MOVE to /Trash (not hard delete).
|
|
#
|
|
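# Modes:
#   (no args)        preview: classify the most-recent $PREVIEW_N msgs, read-only, print decisions + survivors
#   --preview N      preview the N most-recent msgs
#   --apply          full purge in three phases (Phase 1 bulk MAILER-DAEMON bounces, Phase 1.5
#                    search-delete of common auto classes, Phase 2 header-classify the remainder)
#   --apply --quick  Phase 1 only (bulk bounce delete); skips Phase 1.5 and the header classify
#   --days N         restrict the apply-mode searches to the last N days (same as env NR_DAYS=N)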
|
|
# Deliberately no `-e`: the script relies on non-matching greps and failed
# zmmailbox calls being non-fatal (they simply yield empty results).
set -uo pipefail

M="noreply@performancewest.net"   # mailbox being purged
TRASH="/Trash"                    # deletions are moves into this folder
PREVIEW_N=200                     # default number of messages to preview
MODE="preview"; QUICK=0; DAYS="${NR_DAYS:-}"

# Argument parsing. An option value is only consumed when it does not itself
# look like another option, so "--preview --days 3" no longer swallows "--days".
while [ $# -gt 0 ]; do
  case "$1" in
    --apply) MODE="apply" ;;
    --quick) QUICK=1 ;;
    --days)
      if [ $# -ge 2 ] && [[ "$2" != --* ]]; then
        DAYS="$2"; shift
      else
        echo "nr_purge: --days requires a value" >&2
        exit 2
      fi
      ;;
    --preview)
      MODE="preview"; PREVIEW_N=200
      if [ $# -ge 2 ] && [[ "$2" != --* ]]; then
        [ -n "$2" ] && PREVIEW_N="$2"
        shift
      fi
      ;;
    *) echo "nr_purge: ignoring unknown argument: $1" >&2 ;;
  esac
  shift
done

# Both values are interpolated into zmmailbox arguments; zm() discards stderr,
# so a malformed value would otherwise turn the whole run into a silent no-op.
if ! [[ "$PREVIEW_N" =~ ^[1-9][0-9]*$ ]]; then
  echo "nr_purge: --preview expects a positive integer (got '$PREVIEW_N')" >&2
  exit 2
fi
if [ -n "$DAYS" ] && ! [[ "$DAYS" =~ ^[0-9]+$ ]]; then
  echo "nr_purge: --days/NR_DAYS expects a non-negative integer (got '$DAYS')" >&2
  exit 2
fi

# Optional date bound appended to the apply-mode search queries
# (daily cron uses a small window; initial run leaves blank=all)
DATEQ=""; [ -n "$DAYS" ] && DATEQ=" AND after:-${DAYS}day"

# Per-run log. mktemp avoids writing as root to a predictable /tmp path; if the
# template form is unsupported, fall back to the historical fixed name.
TS=$(date +%Y%m%d_%H%M%S)
LOG=$(mktemp "/tmp/nr_purge_${TS}_XXXXXX.log" 2>/dev/null) || LOG="/tmp/nr_purge_$TS.log"

# zm <zmmailbox-args...>: run zmmailbox against mailbox $M. stderr is discarded,
# so callers see a failure only as empty output / a non-zero status.
zm(){ zmmailbox -z -m "$M" "$@" 2>/dev/null; }
|
|
|
|
# ---- RFC 2047 MIME-header decode (handles =?utf-8?Q?..?= and ?B?..?=) ----
# stdin: raw header text; stdout: decoded text.
# If decoding is impossible (perl missing, MIME-Header decoder unavailable,
# or the decoder dies on the input) the raw text is passed through unchanged,
# so the subject rules never end up looking at an empty string.
# Empty input produces no output; otherwise output ends with a newline.
mime_decode(){
  local raw decoded
  raw=$(cat)
  [ -n "$raw" ] || return 0
  # eval{} keeps one undecodable line from aborting the whole decode.
  if decoded=$(printf '%s\n' "$raw" \
        | perl -MEncode -CS -ne 'my $d = eval { Encode::decode("MIME-Header", $_) }; print defined $d ? $d : $_' 2>/dev/null) \
      && [ -n "$decoded" ]; then
    printf '%s\n' "$decoded"
  else
    printf '%s\n' "$raw"
  fi
}
|
|
|
|
# All patterns below are POSIX EREs applied with `grep -E` to LOWERCASED input.

# Machine-sender address localparts (exact, lowercased): definitionally non-human.
# Matched against the localpart of the From address only (not display name) to avoid eating humans.
# An optional suffix introduced by . _ + or - is allowed (e.g. "bounce+abc123").
FROM_MACHINE_RE='^(mailer-daemon|postmaster|auto-reply|autoreply|auto-responder|autoresponder|no-reply|noreply|donotreply|do-not-reply|bounce|bounces|mdaemon|odoobot|helpdesk|notification|notifications|notify|sysadmin|system|root|abuse)([._+-].*)?$'

# Strong machine tokens that may appear ANYWHERE in the localpart (no human address has these).
FROM_TOKEN_RE='noreply|no-reply|donotreply|do-not-reply|mailer-daemon|auto-reply|autoreply|autoresponse|auto-response|bounces|auth-results|postmaster'

# Display-name bots (substring, lowercased) that use human-ish addresses but are clearly automated.
FROM_DISPLAY_BOT_RE='odoobot|mail delivery (sub)?system|system administrator|microsoft outlook|mail administrator|postmaster'

# STRONG auto markers checked BEFORE the human "Re:" guard -- unambiguous machine subjects that
# no human types, so safe to delete even when wearing a "Re:" prefix (e.g. ticket auto-acks).
# "ticket no" must be followed by '.', ':', '#' or a number: the former `ticket no\.?`
# also matched "ticket not working" / "ticket no longer needed" and would have
# deleted genuine human replies.
STRONG_AUTO_RE='new ticket|ticket created|ticket #|ticket no(\.|[:#]| ?#? ?[0-9])|ticket has been|has been (assigned|received|resolved|closed|created|opened|updated)|your request with id|request with id|we.?re on it|\(autoresponse\)|auto ?re:|automatic (reply|response)|auto-?response|out of office|out-of-office|authentication report|undeliverable|undelivered|delivery status notification|could ?n.?t be delivered|could not be delivered|message could ?n.?t be|failure notice|returned mail|welcome to .*help ?desk|new case notification'

# Broader auto-ack / bounce subject patterns (lowercased subject), checked AFTER the Re: guard.
# "quota"/"quotas" is matched as a whole word only, so "quotation ..." from a
# human is no longer classified as a mailbox-quota notice.
SUBJ_AUTO_RE='has been (received|resolved|closed|updated|created|opened|assigned)|case (received|closed|resolved|notification)|ticket ?#|ticket no(\.|[:#]| ?#? ?[0-9])|ticket has been|your ticket|new ticket|ticket created|your request with id|thanks,? we got your|we have received your|out of office|out-of-office|automatic reply|automatic response|auto[- ]?reply|autoreply|auto-?response|\(autoresponse\)|new case|message (recieved|received)|delivery (status notification|failure|has failed)|undelivered|undeliverable|failure notice|^failed:|returned mail|mail delivery|could not be delivered|could ?n.?t be delivered|delayed mail|invalid address|address not found|recipient (address )?rejected|new email address|(^|[^a-z])quotas?([^a-z]|$)|read-?receipt|priority opened|authentication report|help ?desk'
|
|
|
|
# _nr_from_value <header-block> -> value of the first From: header, unfolded
# (continuation lines joined with a single space), original case, CRs stripped.
# Prints nothing when the block has no From: header.
_nr_from_value(){
  printf '%s\n' "$1" | awk '
    { sub(/\r$/, "") }
    seen && /^[ \t]/ { sub(/^[ \t]+/, " "); val = val $0; next }
    seen { exit }
    tolower($0) ~ /^from:/ { val = $0; sub(/^[^:]*:[ \t]*/, "", val); seen = 1 }
    END { if (seen) print val }
  '
}

# from_localpart <header-block> -> echoes lowercased localpart of From address
# The address inside the LAST <...> pair is preferred, so an address-looking
# display name ("bob@x.com via Relay" <noreply@svc>) cannot mask the real
# sender; a bare address (no angle brackets) is used as fallback.
# Prints nothing when no address can be found.
from_localpart(){
  local value addr
  value=$(_nr_from_value "$1")
  addr=$(printf '%s\n' "$value" \
    | grep -oE '<[^<>[:space:]]+@[^<>[:space:]]+>' | tail -n 1 | tr -d '<>')
  if [ -z "$addr" ]; then
    addr=$(printf '%s\n' "$value" \
      | grep -oE '[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+' | head -n 1)
  fi
  [ -n "$addr" ] || return 0
  printf '%s\n' "${addr%%@*}" | tr 'A-Z' 'a-z'
}

# from_display <header-block> -> echoes the whole (unfolded) From value,
# lowercased; used for display-name substring matching.
from_display(){ _nr_from_value "$1" | tr 'A-Z' 'a-z'; }
|
|
|
|
# decide <header-block> <decoded-subject> -> prints "KEEP <reason>" | "DEL <reason>"
# Precedence: (1) unsubscribe wins; (2) Auto-Submitted header; (3) machine From-address (exact
# localpart / strong token / display-bot); (4) STRONG auto subjects (delete even if "Re:");
# (5) genuine human Re:/Fwd:; (6) ticket-tag / broad auto-ack subjects; (7) default keep.
#
# Reads globals FROM_MACHINE_RE, FROM_TOKEN_RE, FROM_DISPLAY_BOT_RE,
# STRONG_AUTO_RE, SUBJ_AUTO_RE and calls from_localpart / from_display.
# Matching uses here-strings rather than `printf | grep -q` so the result does
# not depend on pipeline exit statuses under `set -o pipefail`.
decide(){
  local H="$1" subj="$2"
  local s as lp disp
  s=$(printf '%s' "$subj" | tr 'A-Z' 'a-z')
  # Auto-Submitted keyword only: drop any ";param" tail, blanks and a stray CR,
  # so "no\r" or "no; x=y" is still recognised as "no".
  as=$(printf '%s\n' "$H" | grep -iE '^Auto-Submitted:' | head -n 1 \
        | sed -E 's/^[^:]*:[[:space:]]*//; s/;.*$//' | tr 'A-Z' 'a-z' | tr -d ' \t\r')
  lp=$(from_localpart "$H"); disp=$(from_display "$H")

  # 1) compliance: unsubscribe/opt-out always KEEP (overrides every machine signal)
  if grep -qE 'unsubscribe|opt[ -]?out|remove me|stop emailing' <<<"$s"; then
    echo "KEEP unsubscribe"; return
  fi

  # 2) RFC 3834 Auto-Submitted present & != no -> machine
  if [ -n "$as" ] && [ "$as" != "no" ]; then
    echo "DEL auto-submitted=$as"; return
  fi

  # 3) machine From-address (exact localpart, strong token anywhere, or display-name bot)
  if grep -qE -- "$FROM_MACHINE_RE" <<<"$lp"; then echo "DEL from-machine=$lp"; return; fi
  if grep -qE -- "$FROM_TOKEN_RE" <<<"$lp"; then echo "DEL from-token=$lp"; return; fi
  if grep -qE -- "$FROM_DISPLAY_BOT_RE" <<<"$disp"; then echo "DEL from-bot"; return; fi

  # 4) STRONG auto subjects: unambiguous machine markers, delete even if dressed as "Re:"
  if grep -qE -- "$STRONG_AUTO_RE" <<<"$s"; then echo "DEL strong-auto-subject"; return; fi

  # 5) genuine human threaded reply (auto ones already removed above).
  #    Tested on the lowercased subject so "re:", "FWD:", "fw:" etc. are all
  #    protected, not just the few spellings Re/RE/Fwd/Fw/FW.
  if grep -qE '^[[:space:]]*(re|fwd?)[:[]' <<<"$s"; then echo "KEEP human-reply"; return; fi

  # 6) ticket tag / broad auto-ack subject patterns
  if grep -qE '^[[:space:]]*\[##' <<<"$subj"; then echo "DEL ticket-tag"; return; fi
  if grep -qE -- "$SUBJ_AUTO_RE" <<<"$s"; then echo "DEL auto-ack-subject"; return; fi

  # 7) default: keep (human-safe)
  echo "KEEP default"
}
|
|
|
|
# _nr_headers <msg-id> -> header block of the message (everything before the
# first empty line). Trailing CRs are stripped first so CRLF-terminated
# messages end their header block at the right place.
_nr_headers(){
  zm getRestURL "/?id=$1&fmt=rfc822" \
    | awk '{ sub(/\r$/, "") } /^$/ { inbody = 1 } !inbody { print }'
}

# _nr_subject (stdin: header block) -> unfolded, MIME-decoded value of the
# first Subject: header; prints nothing if there is none.
# Line-driven on purpose: the previous `getline`-in-a-loop form never
# terminated when a folded Subject was the last header, because getline leaves
# its variable untouched at end of input. tolower() replaces the gawk-only
# IGNORECASE so the header name matches case-insensitively under any awk.
_nr_subject(){
  awk '
    { sub(/\r$/, "") }
    insubj && /^[ \t]/ { sub(/^[ \t]+/, " "); subj = subj $0; next }
    insubj { exit }
    tolower($0) ~ /^subject:/ { subj = $0; sub(/^[^:]*:[ \t]*/, "", subj); insubj = 1 }
    END { if (insubj) print subj }
  ' | mime_decode
}

# fetch the joined+decoded Subject for a message id
get_subject(){ _nr_headers "$1" | _nr_subject; }

# ---- classify one message id -> prints "KEEP <reason>" or "DEL <reason>" ----
classify(){
  local H subj
  H=$(_nr_headers "$1")
  subj=$(printf '%s\n' "$H" | _nr_subject)
  decide "$H" "$subj"
}

# classify + emit decoded subject (single fetch) -> "<DECISION>\t<subject>"
# The subject is truncated to 70 characters for display.
classify2(){
  local H subj
  H=$(_nr_headers "$1")
  subj=$(printf '%s\n' "$H" | _nr_subject)
  printf '%s\t%s\n' "$(decide "$H" "$subj")" "$(printf '%s' "$subj" | cut -c1-70)"
}
|
|
|
|
# ids_for <query> [limit] -> message ids matching a zmmailbox search, one per line.
#   $1  search query string
#   $2  maximum number of hits to request (default 1000)
# A result row is "<n>. <id> <type> ..."; only rows whose Type column is
# exactly "mess" are emitted, so the word "mess" appearing elsewhere on a line
# (sender, subject) can never cause a stray field to be returned as an id.
ids_for(){
  local query=$1 limit=${2:-1000}
  zm search -l "$limit" -t message "$query" | awk '$3 == "mess" { print $2 }'
}
|
|
|
|
move_to_trash(){ # stdin: ids one per line
|
|
local buf=() id n=0
|
|
while read -r id; do [ -z "$id" ] && continue; buf+=("$id"); n=$((n+1))
|
|
if [ ${#buf[@]} -ge 200 ]; then
|
|
local c="${buf[*]}"; zm moveMessage "${c// /,}" "$TRASH" >/dev/null; buf=(); fi
|
|
done
|
|
if [ ${#buf[@]} -gt 0 ]; then local c="${buf[*]}"; zm moveMessage "${c// /,}" "$TRASH" >/dev/null; fi
|
|
echo "$n"
|
|
}
|
|
|
|
# ---- PREVIEW: classify the most recent $PREVIEW_N inbox messages, change nothing ----
if [ "$MODE" = "preview" ]; then
  echo "=== PREVIEW (read-only) most-recent $PREVIEW_N ===" | tee -a "$LOG"
  IDS=$(zm search -l "$PREVIEW_N" -t message "in:inbox" | grep -w mess | awk '{print $2}')
  keep=0; del=0
  # Result lists are created with mktemp rather than truncating a predictable
  # /tmp path (this normally runs as root); the historical fixed names are only
  # used if mktemp cannot honour the template. The files are left in place
  # after the run for inspection.
  survivors=$(mktemp "/tmp/nr_survivors_${TS}_XXXXXX.txt" 2>/dev/null) || survivors="/tmp/nr_survivors_$TS.txt"
  deletes=$(mktemp "/tmp/nr_deletes_${TS}_XXXXXX.txt" 2>/dev/null) || deletes="/tmp/nr_deletes_$TS.txt"
  : > "$survivors"; : > "$deletes"
  for id in $IDS; do            # ids are whitespace-free tokens; splitting is intended
    line=$(classify2 "$id")     # "<DECISION>\t<subject>"
    d=${line%%$'\t'*}; subj=${line#*$'\t'}
    if [[ "$d" == KEEP* ]]; then
      keep=$((keep+1)); echo "id=$id [$d] $subj" >> "$survivors"
    else
      del=$((del+1)); echo "id=$id [$d] $subj" >> "$deletes"
    fi
  done
  echo "kept=$keep deleted=$del" | tee -a "$LOG"
  echo "--- SURVIVORS (would KEEP) ---" | tee -a "$LOG"
  tee -a "$LOG" < "$survivors"
  echo "--- sample DELETES (first 25) ---" | tee -a "$LOG"
  head -25 "$deletes" | tee -a "$LOG"
  exit 0
fi
|
|
|
|
# ---- APPLY ----
echo "=== APPLY purge $TS (move to $TRASH) ===" | tee -a "$LOG"

# Phase 1: fast bulk bounce delete (MAILER-DAEMON = definitionally bounce), keep-guard on unsubscribe
# The search is repeated until it comes back empty, because each search returns
# at most one page of ids. Three independent stop conditions protect the loop:
#   - the search returns exactly the ids of the previous pass (the move had no effect),
#   - move_to_trash reports fewer than 1 moved,
#   - more than 200 passes.
echo "PHASE1 bulk bounces..." | tee -a "$LOG"
p1=0; g1=0; prev_b1=""
while :; do
  B=$(ids_for "in:inbox from:MAILER-DAEMON AND NOT (subject:unsubscribe OR subject:\"opt out\")$DATEQ")
  [ -z "${B// }" ] && break
  if [ "$B" = "$prev_b1" ]; then
    echo "  PHASE1 no progress (same ids returned again), stop" | tee -a "$LOG"
    break
  fi
  prev_b1=$B
  n=$(printf '%s\n' "$B" | move_to_trash); n=${n:-0}   # empty output counts as 0
  p1=$((p1+n)); echo "  moved $n (cum $p1)" | tee -a "$LOG"
  [ "$n" -lt 1 ] && break
  g1=$((g1+1)); [ "$g1" -gt 200 ] && { echo "  PHASE1 guard stop" | tee -a "$LOG"; break; }
done
echo "PHASE1 done: $p1 bounces -> Trash" | tee -a "$LOG"
[ "$QUICK" = "1" ] && { echo "quick mode: stop after phase1" | tee -a "$LOG"; exit 0; }
|
|
|
|
# Phase 1.5: fast SEARCH-based bulk delete of the common non-MAILER machine classes
# (postmaster bounces, Undeliverable/Undelivered DSNs, OOO/automatic-reply). These are
# matched server-side (no per-message fetch) and HARD-guarded so anything wearing a
# genuine "Re:"/Fwd: or unsubscribe falls through to the accurate Phase 2 classifier.
# Each query is re-run until it returns nothing; a query is abandoned when it
# returns the same ids twice in a row (move had no effect), when nothing was
# moved, or after more than 50 passes.
echo "PHASE1.5 fast search-delete of common auto classes..." | tee -a "$LOG"
GUARD='AND NOT (subject:Re OR subject:Fwd OR subject:unsubscribe OR subject:"opt out")'
p15=0
for q in \
  "in:inbox from:postmaster $GUARD$DATEQ" \
  "in:inbox subject:Undeliverable $GUARD$DATEQ" \
  "in:inbox subject:\"Undelivered Mail\" $GUARD$DATEQ" \
  "in:inbox subject:\"automatic reply\" $GUARD$DATEQ" \
  "in:inbox subject:\"out of office\" $GUARD$DATEQ" \
  "in:inbox subject:\"failure notice\" $GUARD$DATEQ" \
  "in:inbox subject:\"delivery status notification\" $GUARD$DATEQ" \
; do
  qg=0; prev_b15=""
  while :; do
    B=$(ids_for "$q")
    [ -z "${B// }" ] && break
    if [ "$B" = "$prev_b15" ]; then
      echo "  [$q] no progress (same ids returned again), next query" | tee -a "$LOG"
      break
    fi
    prev_b15=$B
    n=$(printf '%s\n' "$B" | move_to_trash); n=${n:-0}   # empty output counts as 0
    p15=$((p15+n)); echo "  [$q] moved $n (cum $p15)" | tee -a "$LOG"
    [ "$n" -lt 1 ] && break
    qg=$((qg+1)); [ "$qg" -gt 50 ] && break
  done
done
echo "PHASE1.5 done: $p15 auto-class -> Trash" | tee -a "$LOG"
|
|
|
|
# Phase 2: header-classify the remainder. Offset paging is unreliable on this box,
# so we loop: classify the top page, delete its DELs, cache KEEPs as "seen" so we
# don't re-fetch them next pass. Terminate when a page yields only already-seen KEEPs.
# Consequence of re-reading only the top page (1000 hits): once that many KEEPs
# have accumulated, messages below them are never reached in this run.
echo "PHASE2 header-classify remainder..." | tee -a "$LOG"
p2=0; guard=0; prev_del=""
# KEEP cache for this run; mktemp avoids truncating a predictable /tmp path as
# root, falling back to the historical name if the template is not supported.
SEEN=$(mktemp "/tmp/nr_seen_${TS}_XXXXXX.txt" 2>/dev/null) || SEEN="/tmp/nr_seen_$TS.txt"
: > "$SEEN"
while :; do
  IDS=$(ids_for "in:inbox AND NOT from:MAILER-DAEMON$DATEQ")
  [ -z "${IDS// }" ] && break
  delbuf=""; newwork=0
  for id in $IDS; do                              # ids are whitespace-free tokens
    grep -qxF -- "$id" "$SEEN" && continue        # already classified KEEP, skip
    newwork=1
    d=$(classify "$id")
    if [[ "$d" == DEL* ]]; then delbuf+="$id"$'\n'; else echo "$id" >> "$SEEN"; fi
  done
  if [ -n "${delbuf// }" ]; then
    # The same DEL set as last pass means the move had no effect; stop instead
    # of re-fetching and re-classifying those messages until the guard trips.
    if [ "$delbuf" = "$prev_del" ]; then
      echo "  no progress (same DEL ids returned again), stop" | tee -a "$LOG"
      break
    fi
    prev_del=$delbuf
    n=$(printf '%s' "$delbuf" | move_to_trash); n=${n:-0}
    p2=$((p2+n)); echo "  page moved $n (cum $p2)" | tee -a "$LOG"
  fi
  # A page with no new (unseen) messages means everything left is cached-KEEP -> done.
  if [ "$newwork" = "0" ]; then echo "  page all-seen-KEEP, stop" | tee -a "$LOG"; break; fi
  guard=$((guard+1)); [ "$guard" -gt 120 ] && { echo "guard stop" | tee -a "$LOG"; break; }
done
echo "PHASE2 done: $p2 auto/ack -> Trash; survivors cached in $SEEN ($(wc -l < "$SEEN"))" | tee -a "$LOG"
# Grand total covers all three phases (the Phase 1.5 count was previously left out).
echo "TOTAL moved to Trash: $((${p1:-0}+${p15:-0}+p2))" | tee -a "$LOG"
|