email: add plaintext MIME part + stable Message-ID hostname

Two deliverability hardening fixes from the email audit: 1. Plaintext (altbody): all campaigns were HTML-only. Listmonk only emits multipart/alternative when altbody is set, and HTML-only bulk mail is a spam-score signal. New scripts/_email_plaintext.py renders a readable text/plain part from the HTML body (dependency-free; preserves Listmonk {{ .Subscriber }}/{{ UnsubscribeURL }} template tags, turns links into 'text (url)'). Wired into the trucking builder (and thus UCR + IFTA, which reuse create_and_schedule_campaign) and the healthcare builder. 2. Stable container hostname: Listmonk derived its Message-ID from the random docker container id -> @localhost.localdomain (spam-score signal). Pin both listmonk + listmonk-hc hostname to perfwest.performancewest.net, matching Listmonk's SMTP hello_hostname. Part of the email-deliverability incident hardening.
2026-06-17 20:09:02 -05:00 · 2026-06-17 20:09:02 -05:00 · a32a3b05a0
commit a32a3b05a0
parent 2e4388a803
4 changed files with 133 additions and 3 deletions
--- a/scripts/_email_plaintext.py
+++ b/scripts/_email_plaintext.py
@ -0,0 +1,105 @@
+"""Shared HTML -> plaintext conversion for outbound campaigns.
+
+Until now, every campaign we built was HTML-only (no plaintext MIME part). A missing
+text/plain alternative is a spam-score signal: legitimate bulk senders ship
+multipart/alternative, and several filters (and most "this looks like spam"
+heuristics) penalise HTML-only mail. It also degrades the experience for
+plaintext-only clients and accessibility tooling.
+
+Listmonk only emits multipart/alternative when a campaign's `altbody` is set;
+otherwise it sends text/html alone. So we generate a readable plaintext
+rendition from the HTML body and pass it as `altbody`.
+
+This is intentionally dependency-free (no bs4/html2text on the prod box): a
+small, well-tested regex pipeline that:
+  - drops <script>/<style>/<head> blocks,
+  - turns <a href=...>text</a> into "text (url)" so links survive,
+  - maps <br>, </p>, </div>, <li>, headings, <tr> to newlines,
+  - prefixes <li> with "- ",
+  - strips all remaining tags,
+  - unescapes HTML entities,
+  - collapses runs of blank lines / trailing whitespace.
+
+Listmonk template tags ({{ .Subscriber... }}, {{ UnsubscribeURL }}) are left
+untouched so they still render per-subscriber in the plaintext part too.
+"""
+
+from __future__ import annotations
+
+import html as _html
+import re
+
+__all__ = ["html_to_text"]
+
+_RE_FLAGS = re.IGNORECASE | re.DOTALL
+
+# Whole blocks whose *content* must be discarded, not just the tags.
+_DROP_BLOCKS = re.compile(
+    r"<(script|style|head|title|noscript)\b[^>]*>.*?</\1>", _RE_FLAGS
+)
+# HTML comments (Listmonk/MSO conditional comments etc.).
+_COMMENTS = re.compile(r"<!--.*?-->", _RE_FLAGS)
+# <a href="URL" ...>TEXT</a>  ->  TEXT (URL). _anchor_repl special-cases
+# mailto:/tel: (address only) and "#" in-page anchors (text only); templated
+# hrefs are kept. The URL group consumes a whole {{ ... }} action as one unit
+# so quoted arguments inside it (e.g. {{ TrackLink "https://..." }}) do not
+# end the attribute early and leave an unbalanced "{{" in the plaintext part.
+_ANCHORS = re.compile(
+    r'<a\b[^>]*?\bhref\s*=\s*["\']'
+    r'((?:\{\{[^{}]*\}\}|[^"\'])+)'
+    r'["\'][^>]*>(.*?)</a>',
+    _RE_FLAGS,
+)
+# Tags that should become a line break.
+_BR = re.compile(r"<br\s*/?>", re.IGNORECASE)
+_BLOCK_END = re.compile(
+    r"</(p|div|h[1-6]|tr|table|ul|ol|blockquote|section|header|footer)>",
+    re.IGNORECASE,
+)
+_LI = re.compile(r"<li\b[^>]*>", re.IGNORECASE)
+_HR = re.compile(r"<hr\s*/?>", re.IGNORECASE)
+_ANY_TAG = re.compile(r"<[^>]+>")
+# Three or more consecutive (possibly whitespace-only) line breaks.
+_MANY_BLANKS = re.compile(r"\n[ \t]*\n[ \t]*(\n[ \t]*)+")
+_TRAIL_WS = re.compile(r"[ \t]+\n")
+_MANY_SPACES = re.compile(r"[ \t]{2,}")
+
+
+def _anchor_repl(m: "re.Match[str]") -> str:
+    """Render one ``_ANCHORS`` match as plaintext.
+
+    Group 1 is the href value and group 2 the inner HTML of the link. Returns
+    "text (url)" in the general case, with the special cases noted inline.
+    """
+    url = m.group(1).strip()
+    # Link text may itself contain markup (<b>, <span>...); keep only the text.
+    text = _ANY_TAG.sub("", m.group(2)).strip()
+    low = url.lower()
+    # mailto:/tel: -> surface the address (with link text if it adds info).
+    # Bare in-page anchors -> keep text only. Templated hrefs (e.g.
+    # {{ UnsubscribeURL }}) ARE kept as "text (url)" so the per-subscriber link
+    # still renders in the plaintext part.
+    if low.startswith(("mailto:", "tel:")):
+        # Drop the scheme and any ?subject=... query; keep just the address.
+        addr = url.split(":", 1)[1].split("?", 1)[0]
+        if text and text != addr:
+            return f"{text} ({addr})"
+        return addr
+    if low.startswith("#"):
+        return text
+    if not text:
+        return url
+    if text == url:
+        return url
+    return f"{text} ({url})"
+
+
+def html_to_text(html: str) -> str:
+    """Convert an HTML email body to a readable text/plain rendition.
+
+    Returns "" for empty input and for markup that contains no text at all,
+    so callers never receive a blank-but-non-empty plaintext part. Otherwise
+    the result ends with exactly one newline. Listmonk template tags are
+    preserved verbatim.
+    """
+    if not html:
+        return ""
+    s = html
+    s = _DROP_BLOCKS.sub("", s)
+    s = _COMMENTS.sub("", s)
+    # Anchors must be rewritten before the generic tag strip, or URLs are lost.
+    s = _ANCHORS.sub(_anchor_repl, s)
+    s = _HR.sub("\n----------\n", s)
+    s = _BR.sub("\n", s)
+    s = _LI.sub("\n- ", s)
+    s = _BLOCK_END.sub("\n", s)
+    s = _ANY_TAG.sub("", s)
+    # Unescape only after tags are gone so "&lt;b&gt;" stays literal text.
+    s = _html.unescape(s)
+    # Normalise whitespace: trim trailing spaces, collapse runs of spaces and
+    # blank lines, strip leading/trailing blank space overall.
+    s = _TRAIL_WS.sub("\n", s)
+    s = _MANY_SPACES.sub(" ", s)
+    s = _MANY_BLANKS.sub("\n\n", s)
+    s = "\n".join(line.rstrip() for line in s.splitlines())
+    s = s.strip()
+    return s + "\n" if s else ""
--- a/scripts/build_healthcare_campaigns_cron.py
+++ b/scripts/build_healthcare_campaigns_cron.py
@ -61,6 +61,7 @@ REPLY_TO = "info@performancewest.net"
 # is the single source of truth shared with build_healthcare_campaigns.py.
 sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
 from build_healthcare_campaigns import SEGMENTS, template_path  # noqa: E402
+from _email_plaintext import html_to_text  # noqa: E402


 def load_suppressed() -> set[str]:
@ -284,7 +285,7 @@ def ensure_campaign(seg_key: str, list_id: int) -> int:
    payload = {
        "name": dated, "subject": seg["subject"], "lists": [list_id],
        "from_email": FROM_EMAIL, "type": "regular", "content_type": "richtext",
-        "body": body, "messenger": "email",
+        "body": body, "altbody": html_to_text(body), "messenger": "email",
        "tags": ["healthcare", "warmup", seg_key],
        "headers": [{"Reply-To": REPLY_TO},
                    {"List-Unsubscribe": "<{{ UnsubscribeURL }}>"},
--- a/scripts/build_trucking_campaigns.py
+++ b/scripts/build_trucking_campaigns.py
@ -41,6 +41,7 @@ if ROOT not in sys.path:
    sys.path.insert(0, ROOT)

 from scripts._email_exclusions import BLOCKED_EMAIL_DOMAINS
+from scripts._email_plaintext import html_to_text

 LOG = logging.getLogger("build_trucking_campaigns")

@ -551,6 +552,21 @@ def import_subscribers(list_id: int, subscribers: list[dict]) -> int:
    return added


+def _altbody_for(base: dict, body: str | None = None) -> str:
+    """Return the text/plain (altbody) content to send with a campaign.
+
+    Listmonk only emits multipart/alternative when altbody is set, and
+    HTML-only mail is a spam-score signal, so a plaintext part is always
+    supplied. A non-blank ``base["altbody"]`` is returned stripped. Otherwise
+    the plaintext is derived from HTML: ``body`` when given (test sends pass
+    the body with merge fields already substituted), else ``base["body"]``.
+    """
+    preset = (base.get("altbody") or "").strip()
+    if not preset:
+        # No usable altbody on the source campaign: render one from the HTML.
+        source_html = base.get("body", "") if body is None else body
+        return html_to_text(source_html)
+    return preset
+
+
 def create_and_schedule_campaign(
    base: dict,
    list_id: int,
@ -566,7 +582,7 @@ def create_and_schedule_campaign(
        "type": "regular",
        "content_type": base["content_type"],
        "body": base["body"],
-        "altbody": base.get("altbody"),
+        "altbody": _altbody_for(base),
        "template_id": base["template_id"],
        "tags": base.get("tags") or [],
        "messenger": base.get("messenger") or "email",
@ -611,7 +627,7 @@ def send_test(base: dict, campaign_id: int, sample_row: tuple, label: str, tz: s
        "name": base.get("name", "Test"), "subject": subj,
        "lists": list_ids, "from_email": base["from_email"],
        "type": "regular", "content_type": base["content_type"],
-        "body": body, "altbody": base.get("altbody"),
+        "body": body, "altbody": _altbody_for(base, body),
        "template_id": base["template_id"],
        "tags": base.get("tags") or [], "messenger": base.get("messenger") or "email",
        "headers": base.get("headers") or REPLY_TO_HEADERS,