From 4dc5690666d08c17283375ef76cceff14b5157f9 Mon Sep 17 00:00:00 2001 From: justin Date: Wed, 17 Jun 2026 20:26:01 -0500 Subject: [PATCH] infra: codify the email-campaign pipeline in Ansible (new mail-pipeline role) The entire outbound campaign pipeline lived ONLY on the host and was never in IaC -- a fresh rebuild would have silently shipped NO campaigns, NO IP warmup/ ramp, and NO bounce processing. New mail-pipeline role + deploy-mail-pipeline.yml playbook deploy it from the canonical repo copies: cron.d (infra/cron/): - pw-trucking-campaign-builder, pw-ifta-campaign, pw-ucr-campaign - pw-hc-campaign, pw-hc-nppes, pw-hc-refresh - pw-mta-warmup, pw-listmonk-rampcap, pw-hc-rampcap - pw-ip-rehab, pw-warmup-tg-alert helper scripts (-> /usr/local/bin): - pw-mta-warmup, pw-listmonk-rampcap, pw-hc-rampcap, pw-warmup-tg-alert - postfix-bounce-notify.sh, postfix-hc-bounce-notify.sh, listmonk-bounce-sync.py systemd services: - pw-bounce-watcher.service (was missing from repo), pw-hc-bounce-watcher.service Also creates the deploy-owned {{project_dir}}/logs dir (deploy can't write /var/log, so a missing dir made cron redirects fail). Added the 6 cron.d files that existed only on the host, the trucking bounce-watcher unit, and synced infra/cron/pw-hc-refresh to the live version (revalidation download + enrich steps). Role wired into site.yml after the mail (OpenDKIM) role. Part of the email-deliverability incident hardening. 
--- .../playbooks/deploy-mail-pipeline.yml | 11 ++ infra/ansible/playbooks/site.yml | 2 + .../roles/mail-pipeline/defaults/main.yml | 7 ++ .../roles/mail-pipeline/handlers/main.yml | 14 +++ .../roles/mail-pipeline/tasks/main.yml | 119 ++++++++++++++++++ infra/cron/pw-hc-rampcap | 5 + infra/cron/pw-hc-refresh | 11 +- infra/cron/pw-ifta-campaign | 5 + infra/cron/pw-listmonk-rampcap | 5 + infra/cron/pw-mta-warmup | 5 + infra/cron/pw-trucking-campaign-builder | 4 + infra/cron/pw-ucr-campaign | 4 + infra/systemd/pw-bounce-watcher.service | 13 ++ 13 files changed, 202 insertions(+), 3 deletions(-) create mode 100644 infra/ansible/playbooks/deploy-mail-pipeline.yml create mode 100644 infra/ansible/roles/mail-pipeline/defaults/main.yml create mode 100644 infra/ansible/roles/mail-pipeline/handlers/main.yml create mode 100644 infra/ansible/roles/mail-pipeline/tasks/main.yml create mode 100644 infra/cron/pw-hc-rampcap create mode 100644 infra/cron/pw-ifta-campaign create mode 100644 infra/cron/pw-listmonk-rampcap create mode 100644 infra/cron/pw-mta-warmup create mode 100644 infra/cron/pw-trucking-campaign-builder create mode 100644 infra/cron/pw-ucr-campaign create mode 100644 infra/systemd/pw-bounce-watcher.service diff --git a/infra/ansible/playbooks/deploy-mail-pipeline.yml b/infra/ansible/playbooks/deploy-mail-pipeline.yml new file mode 100644 index 0000000..9d3ad43 --- /dev/null +++ b/infra/ansible/playbooks/deploy-mail-pipeline.yml @@ -0,0 +1,11 @@ +--- +# Deploy only the mail-pipeline role (campaign crons, IP warmup/ramp helpers, +# bounce watchers). Run after changing anything under infra/cron, infra/postfix, +# infra/monitoring, infra/systemd, or scripts/*bounce*. 
+# +# Usage: ansible-playbook playbooks/deploy-mail-pipeline.yml -i inventory/hosts.yml --ask-vault-pass +- name: Deploy mail-pipeline (campaign crons + warmup + bounce watchers) + hosts: pw + become: true + roles: + - role: "{{ playbook_dir }}/../roles/mail-pipeline" diff --git a/infra/ansible/playbooks/site.yml b/infra/ansible/playbooks/site.yml index ada4b31..3e8aeed 100644 --- a/infra/ansible/playbooks/site.yml +++ b/infra/ansible/playbooks/site.yml @@ -16,6 +16,7 @@ # workers — Python job server + Ollama LLM # shkeeper — k3s + Helm + SHKeeper (crypto payments: BTC/ETH/USDC/Polygon/TRX/BNB/LTC) # mail — OpenDKIM signing for outbound Postfix mail (incl. Listmonk campaigns) +# mail-pipeline — campaign cron builders + IP warmup/ramp + bounce watchers # nginx — nginx + certbot TLS for all domains + fail2ban - name: Provision Performance West server @@ -33,6 +34,7 @@ - worker-crons - shkeeper - mail + - mail-pipeline - nginx - monitoring - security-updates diff --git a/infra/ansible/roles/mail-pipeline/defaults/main.yml b/infra/ansible/roles/mail-pipeline/defaults/main.yml new file mode 100644 index 0000000..4712e1f --- /dev/null +++ b/infra/ansible/roles/mail-pipeline/defaults/main.yml @@ -0,0 +1,7 @@ +--- +# mail-pipeline role defaults +# +# project_dir + deploy_user are normally provided by the common/app roles' +# group_vars; these defaults keep the role self-contained. 
+project_dir: /opt/performancewest +deploy_user: deploy diff --git a/infra/ansible/roles/mail-pipeline/handlers/main.yml b/infra/ansible/roles/mail-pipeline/handlers/main.yml new file mode 100644 index 0000000..21c9402 --- /dev/null +++ b/infra/ansible/roles/mail-pipeline/handlers/main.yml @@ -0,0 +1,14 @@ +--- +- name: Reload systemd + ansible.builtin.systemd: + daemon_reload: true + +- name: Restart pw-bounce-watcher + ansible.builtin.systemd: + name: pw-bounce-watcher.service + state: restarted + +- name: Restart pw-hc-bounce-watcher + ansible.builtin.systemd: + name: pw-hc-bounce-watcher.service + state: restarted diff --git a/infra/ansible/roles/mail-pipeline/tasks/main.yml b/infra/ansible/roles/mail-pipeline/tasks/main.yml new file mode 100644 index 0000000..0b478e0 --- /dev/null +++ b/infra/ansible/roles/mail-pipeline/tasks/main.yml @@ -0,0 +1,119 @@ +--- +# mail-pipeline role +# +# Codifies the outbound email-campaign pipeline that previously lived ONLY on +# the host (none of this was in IaC before -- a fresh rebuild would have silently +# shipped NO campaigns, NO IP warmup/ramp, and NO bounce processing): +# +# - /etc/cron.d/pw-* daily campaign builders + IP-warmup/ramp drivers +# - /usr/local/bin/pw-* warmup/ramp/healthcheck helper scripts +# - /usr/local/bin/postfix-*-bounce-notify.sh bounce watchers +# - pw-bounce-watcher / pw-hc-bounce-watcher systemd watcher services +# +# The campaign BUILDER logic (scripts/build_*.py) is synced with the app/workers +# code; this role only deploys the host-level glue (cron + helper scripts + +# services). The OpenDKIM signing + mail.log logrotate live in the `mail` role. + +# ── log + state dirs ──────────────────────────────────────────────────────── +# The deploy user CANNOT write /var/log, so the deploy-owned cron jobs log to +# /opt/performancewest/logs. A missing dir makes the `>>` redirect fail before +# the command runs (cron then mails the error to deploy@ -> self-bounce). 
+- name: Ensure deploy-owned cron log directory exists + ansible.builtin.file: + path: "{{ project_dir }}/logs" + state: directory + owner: "{{ deploy_user }}" + group: "{{ deploy_user }}" + mode: "0775" + +# ── warmup / ramp helper scripts (run as root: edit main.cf, restart cntrs) ── +- name: Deploy mail warmup/ramp/healthcheck helper scripts + ansible.builtin.copy: + src: "{{ playbook_dir }}/../../../{{ item.src }}" + dest: "/usr/local/bin/{{ item.dest }}" + owner: root + group: root + mode: "0755" + loop: + - { src: "infra/postfix/pw-mta-warmup.sh", dest: "pw-mta-warmup" } + - { src: "infra/postfix/pw-listmonk-rampcap.sh", dest: "pw-listmonk-rampcap" } + - { src: "infra/postfix/pw-hc-rampcap.sh", dest: "pw-hc-rampcap" } + - { src: "infra/monitoring/pw-warmup-tg-alert.sh", dest: "pw-warmup-tg-alert" } + +# ── bounce watchers (tail mail.log -> Listmonk bounce webhook) ────────────── +- name: Deploy bounce-watcher scripts + ansible.builtin.copy: + src: "{{ playbook_dir }}/../../../{{ item.src }}" + dest: "/usr/local/bin/{{ item.dest }}" + owner: root + group: root + mode: "0755" + loop: + - { src: "scripts/bounce-watcher.sh", dest: "postfix-bounce-notify.sh" } + - { src: "scripts/hc-bounce-watcher.sh", dest: "postfix-hc-bounce-notify.sh" } + notify: + - Restart pw-bounce-watcher + - Restart pw-hc-bounce-watcher + +- name: Deploy bounce-watcher systemd units + ansible.builtin.copy: + src: "{{ playbook_dir }}/../../../infra/systemd/{{ item }}" + dest: "/etc/systemd/system/{{ item }}" + owner: root + group: root + mode: "0644" + loop: + - pw-bounce-watcher.service + - pw-hc-bounce-watcher.service + notify: + - Reload systemd + - Restart pw-bounce-watcher + - Restart pw-hc-bounce-watcher + +- name: Enable + start bounce-watcher services + ansible.builtin.systemd: + name: "{{ item }}" + enabled: true + state: started + daemon_reload: true + loop: + - pw-bounce-watcher.service + - pw-hc-bounce-watcher.service + +# ── listmonk bounce-sync poller (host python, every 5 min via root 
crontab) ── +- name: Deploy listmonk bounce-sync poller + ansible.builtin.copy: + src: "{{ playbook_dir }}/../../../scripts/listmonk-bounce-sync.py" + dest: /usr/local/bin/listmonk-bounce-sync.py + owner: root + group: root + mode: "0755" + +- name: Schedule listmonk bounce-sync (root crontab, every 5 min) + ansible.builtin.cron: + name: listmonk-bounce-sync + minute: "*/5" + job: "/usr/bin/python3 /usr/local/bin/listmonk-bounce-sync.py >> /var/log/bounce-sync.log 2>&1" + +# ── campaign + warmup cron.d files ────────────────────────────────────────── +# These reference scripts/ in {{ project_dir }} and the docker compose stack, so +# they are deployed verbatim from infra/cron/ (the canonical, reviewed copies). All src paths in this role are repo-root-relative: playbook_dir is infra/ansible/playbooks, so the repo root is three levels up. +- name: Deploy campaign + warmup cron.d files + ansible.builtin.copy: + src: "{{ playbook_dir }}/../../../infra/cron/{{ item }}" + dest: "/etc/cron.d/{{ item }}" + owner: root + group: root + mode: "0644" + loop: + - pw-trucking-campaign-builder + - pw-ifta-campaign + - pw-ucr-campaign + - pw-hc-campaign + - pw-hc-nppes + - pw-hc-refresh + - pw-mta-warmup + - pw-listmonk-rampcap + - pw-hc-rampcap + - pw-ip-rehab + - pw-warmup-tg-alert diff --git a/infra/cron/pw-hc-rampcap b/infra/cron/pw-hc-rampcap new file mode 100644 index 0000000..35b52d0 --- /dev/null +++ b/infra/cron/pw-hc-rampcap @@ -0,0 +1,5 @@ +# Ramp the healthcare listmonk-hc hourly send cap in lockstep with the HC IP +# warmup, driven off the SEPARATE /etc/postfix/hc-warmup-start stamp and writing +# the SEPARATE listmonk_hc DB. Restarts listmonk-hc only when the cap changes. +# Helper: infra/postfix/pw-hc-rampcap.sh -> /usr/local/bin/pw-hc-rampcap. 
+20 7 * * * root /usr/local/bin/pw-hc-rampcap >> /var/log/pw-hc-rampcap.log 2>&1 diff --git a/infra/cron/pw-hc-refresh b/infra/cron/pw-hc-refresh index 64559a5..939c8de 100644 --- a/infra/cron/pw-hc-refresh +++ b/infra/cron/pw-hc-refresh @@ -8,6 +8,11 @@ # CMS data-lag window to ~2-3 days, so a provider who just completed their # revalidation stops being targeted faster (fewer "already done" replies). # Takes ~8 min. SAM is opt-in (--sam-pages); SAM exclusions rarely carry an NPI, -# so OIG LEIE is the NPI-bearing exclusion source. Then prune-only removes newly- -# Google-hosted and suppressed subscribers from the warmup lists. -0 6 * * 1,3,5 deploy cd /opt/performancewest && python3 -u scripts/hc_data_refresh.py >> /opt/performancewest/logs/pw-hc-refresh.log 2>&1 && python3 -u scripts/build_healthcare_campaigns_cron.py --prune-only >> /opt/performancewest/logs/pw-hc-refresh.log 2>&1 +# so OIG LEIE is the NPI-bearing exclusion source. Pipeline: +# 1. hc_data_refresh.py -- re-verify NPIs vs CMS/OIG + MX reclassify +# 2. download CMS revalidation_base.csv (institutional revalidation dates) +# 3. enrich_institutional_revalidation.py -- merge reval dates into the +# institutional CSV consumed by the pw-hc-nppes builder +# 4. 
build_healthcare_campaigns_cron.py --prune-only -- evict newly-Google- +# hosted + suppressed subscribers from the warmup lists +0 6 * * 1,3,5 deploy cd /opt/performancewest && python3 -u scripts/hc_data_refresh.py >> /opt/performancewest/logs/pw-hc-refresh.log 2>&1 && curl -fsS "https://data.cms.gov/sites/default/files/2026-05/96484587-20ec-4070-a4de-cd7de3ec0093/revalidation_base.csv" -o data/npi_build/revalidation_base.csv 2>>/opt/performancewest/logs/pw-hc-refresh.log && python3 -u scripts/enrich_institutional_revalidation.py data/hc_nppes_institutional_verified.csv data/npi_build/revalidation_base.csv data/hc_nppes_institutional_enriched.csv >> /opt/performancewest/logs/pw-hc-refresh.log 2>&1 && python3 -u scripts/build_healthcare_campaigns_cron.py --prune-only >> /opt/performancewest/logs/pw-hc-refresh.log 2>&1 diff --git a/infra/cron/pw-ifta-campaign b/infra/cron/pw-ifta-campaign new file mode 100644 index 0000000..15ee0e7 --- /dev/null +++ b/infra/cron/pw-ifta-campaign @@ -0,0 +1,5 @@ +# IFTA quarterly-return reminder. Runs Mon-Fri; the builder self-gates to the +# ~21-day-before-deadline window (Apr30/Jul31/Oct31/Jan31), so it only actually +# sends 4 times/year. Reuses the trucking sender plumbing + same-day coupon. +# CAMPAIGN_IFTA_QUARTERLY_ID is the source/base campaign to clone. +45 7 * * 1-5 deploy cd /opt/performancewest && docker compose exec -T -e CAMPAIGN_IFTA_QUARTERLY_ID=469 workers python3 -m scripts.build_ifta_quarterly_campaign --start-campaign >> /opt/performancewest/logs/pw-ifta-campaign.log 2>&1 diff --git a/infra/cron/pw-listmonk-rampcap b/infra/cron/pw-listmonk-rampcap new file mode 100644 index 0000000..4773b9a --- /dev/null +++ b/infra/cron/pw-listmonk-rampcap @@ -0,0 +1,5 @@ +# Ramp the trucking Listmonk hourly send cap (sliding window) in lockstep with +# the Postfix IP warmup, driven off /etc/postfix/pw-warmup-start. Restarts the +# listmonk container only when the cap changes. 
Helper: +# infra/postfix/pw-listmonk-rampcap.sh -> /usr/local/bin/pw-listmonk-rampcap. +20 7 * * * root /usr/local/bin/pw-listmonk-rampcap >> /var/log/pw-listmonk-rampcap.log 2>&1 diff --git a/infra/cron/pw-mta-warmup b/infra/cron/pw-mta-warmup new file mode 100644 index 0000000..2ae41c7 --- /dev/null +++ b/infra/cron/pw-mta-warmup @@ -0,0 +1,5 @@ +# Postfix outbound-IP warmup scheduler. Recomputes the active sending-IP +# rotation pool from the warmup start date (/etc/postfix/pw-warmup-start) and +# reloads Postfix only when it changes. Helper: infra/postfix/pw-mta-warmup.sh +# -> /usr/local/bin/pw-mta-warmup. Runs as root (edits main.cf + postfix reload). +17 7 * * * root /usr/local/bin/pw-mta-warmup >> /var/log/pw-mta-warmup.log 2>&1 diff --git a/infra/cron/pw-trucking-campaign-builder b/infra/cron/pw-trucking-campaign-builder new file mode 100644 index 0000000..1b4a459 --- /dev/null +++ b/infra/cron/pw-trucking-campaign-builder @@ -0,0 +1,4 @@ +# Build next day's trucking Listmonk campaigns daily at 08:00 UTC (3 AM EST). +# 4 TZ regions x {MCS-150 overdue, Inactive USDOT}. Runs inside the workers +# container; per-MX throttling + warmup ramp bound the actual volume. Logs to the deploy-owned logs dir (deploy cannot write /var/log). +0 8 * * * deploy cd /opt/performancewest && docker compose exec -T workers python3 -m scripts.build_trucking_campaigns >> /opt/performancewest/logs/pw-trucking-campaign-builder.log 2>&1 diff --git a/infra/cron/pw-ucr-campaign b/infra/cron/pw-ucr-campaign new file mode 100644 index 0000000..a21430a --- /dev/null +++ b/infra/cron/pw-ucr-campaign @@ -0,0 +1,4 @@ +# UCR annual-registration reminder. Runs Mon-Fri; the builder self-gates to the +# 30/12/4-business-day-before-Dec-31 touch windows, so it only sends ~3x/year. +# CAMPAIGN_UCR_ANNUAL_ID is the source/base campaign to clone. 
+50 7 * * 1-5 deploy cd /opt/performancewest && docker compose exec -T -e CAMPAIGN_UCR_ANNUAL_ID=473 workers python3 -m scripts.build_ucr_annual_campaign --start-campaign >> /opt/performancewest/logs/pw-ucr-campaign.log 2>&1 diff --git a/infra/systemd/pw-bounce-watcher.service b/infra/systemd/pw-bounce-watcher.service new file mode 100644 index 0000000..fffb8ed --- /dev/null +++ b/infra/systemd/pw-bounce-watcher.service @@ -0,0 +1,13 @@ +[Unit] +Description=Postfix bounce watcher -> Listmonk webhook +After=postfix.service +Wants=postfix.service + +[Service] +ExecStart=/usr/local/bin/postfix-bounce-notify.sh +Restart=always +RestartSec=10 +User=root + +[Install] +WantedBy=multi-user.target