# new-site/scripts/_email_plaintext.py

"""Shared HTML -> plaintext conversion for outbound campaigns.

Every campaign we built used to be HTML-only (no plaintext MIME part). A missing
text/plain alternative is a spam-score signal: legitimate bulk senders ship
multipart/alternative, and several filters (and most "this looks like spam"
heuristics) penalise HTML-only mail. It also degrades the experience for
plaintext-only clients and accessibility tooling.

Listmonk only emits multipart/alternative when a campaign's `altbody` is set;
otherwise it sends text/html alone. So we generate a readable plaintext
rendition from the HTML body and pass it as `altbody`.

This is intentionally dependency-free (no bs4/html2text on the prod box): a
small, well-tested regex pipeline that:
  - drops <script>/<style>/<head> blocks,
  - turns <a href=...>text</a> into "text (url)" so links survive,
  - maps <br>, </p>, </div>, <li>, headings, <tr> to newlines,
  - prefixes <li> with "- ",
  - strips all remaining tags,
  - unescapes HTML entities,
  - collapses runs of blank lines / trailing whitespace.

Listmonk template tags ({{ .Subscriber... }}, {{ UnsubscribeURL }}) are left
untouched so they still render per-subscriber in the plaintext part too.
"""

from __future__ import annotations

import html as _html
import re

__all__ = ["html_to_text"]

# Flags for patterns that must match case-insensitively AND span newlines.
_RE_FLAGS = re.IGNORECASE | re.DOTALL

# Whole blocks whose *content* must be discarded, not just the tags.
# The closing tag tolerates whitespace before ">" (e.g. "</style >").
_DROP_BLOCKS = re.compile(
    r"<(script|style|head|title|noscript)\b[^>]*>.*?</\1\s*>", _RE_FLAGS
)
# HTML comments (Listmonk/MSO conditional comments etc.).
_COMMENTS = re.compile(r"<!--.*?-->", _RE_FLAGS)
# <a href="URL" ...>TEXT</a>: groups 1-3 capture the href (double-quoted,
# single-quoted or unquoted form respectively), group 4 the inner HTML.
# How each kind of href (http, mailto:, tel:, "#...") is rendered is decided
# by the replacement callback, not by this pattern.
# The (?<![\w-]) guard stops "href" from matching inside another attribute
# name such as data-href=, which a plain \b would accept.
_ANCHORS = re.compile(
    r'<a\b[^>]*?(?<![\w-])href\s*=\s*'
    r'(?:"([^"]*)"|\'([^\']*)\'|([^\s">]+))'
    r'[^>]*>(.*?)</a\s*>',
    _RE_FLAGS,
)
# Tags that should become a line break. Attributes are allowed
# (<br clear="all">, <hr style="...">); \b keeps e.g. <brand> from matching.
_BR = re.compile(r"<br\b[^>]*>", re.IGNORECASE)
# Closing tags of block-level elements; optional whitespace before ">".
_BLOCK_END = re.compile(
    r"</(p|div|h[1-6]|tr|table|ul|ol|blockquote|section|header|footer)\s*>",
    re.IGNORECASE,
)
# Opening <li ...> tag (replaced by a newline plus a "- " bullet).
_LI = re.compile(r"<li\b[^>]*>", re.IGNORECASE)
# Horizontal rule, with or without attributes / self-closing slash.
_HR = re.compile(r"<hr\b[^>]*>", re.IGNORECASE)
# Any remaining tag; [^>] also matches newlines, so multi-line tags are covered.
_ANY_TAG = re.compile(r"<[^>]+>")
# Three or more consecutive newlines (optionally padded with spaces/tabs).
_MANY_BLANKS = re.compile(r"\n[ \t]*\n[ \t]*(\n[ \t]*)+")
# Spaces/tabs immediately before a newline.
_TRAIL_WS = re.compile(r"[ \t]+\n")
# Runs of two or more spaces/tabs.
_MANY_SPACES = re.compile(r"[ \t]{2,}")


def _anchor_repl(m: "re.Match[str]") -> str:
    """Render one ``_ANCHORS`` match as plaintext.

    Args:
        m: A match whose groups 1-3 hold the href (double-quoted,
            single-quoted or unquoted form; whichever alternative matched)
            and whose group 4 holds the inner HTML of the link.

    Returns:
        The link text alone when the href is empty or a bare in-page anchor
        ("#..."); the address, or "text (address)", for mailto:/tel: links;
        the URL alone when there is no link text or the text already is the
        URL; "text (url)" otherwise. Templated hrefs (e.g.
        {{ UnsubscribeURL }}) take the last form so the per-subscriber link
        still renders in the plaintext part.
    """
    url = (m.group(1) or m.group(2) or m.group(3) or "").strip()
    # Link text may itself contain markup (<b>, <img>, ...); keep only the text.
    text = _ANY_TAG.sub("", m.group(4)).strip()

    # href="" or "#section": there is no destination worth printing, and
    # emitting it would produce a dangling "text ()".
    if not url or url.startswith("#"):
        return text

    if url.lower().startswith(("mailto:", "tel:")):
        # Drop the scheme and any ?subject=... query; surface just the address.
        addr = url.split(":", 1)[1].split("?", 1)[0]
        if not addr:
            # e.g. "mailto:?subject=Hi" - nothing to show besides the text.
            return text
        if text and text != addr:
            return f"{text} ({addr})"
        return addr

    # No visible text (image-only link) or text that merely repeats the URL.
    if not text or text == url:
        return url
    return f"{text} ({url})"


def html_to_text(html: str) -> str:
    """Convert an HTML email body to a readable text/plain rendition.

    Listmonk template tags are preserved verbatim.

    Args:
        html: The HTML body. May be empty.

    Returns:
        "" when the input is empty or contains no visible text; otherwise
        the plaintext rendition terminated by exactly one newline.
    """
    if not html:
        return ""
    # The whitespace patterns below only know "\n"; normalise CRLF / bare CR
    # up front so Windows line endings cannot defeat blank-line collapsing.
    s = html.replace("\r\n", "\n").replace("\r", "\n")

    # Order matters: discard invisible content first, rewrite anchors while
    # their tags still exist, map structural tags to newlines, and only then
    # strip whatever markup is left.
    s = _DROP_BLOCKS.sub("", s)
    s = _COMMENTS.sub("", s)
    s = _ANCHORS.sub(_anchor_repl, s)
    s = _HR.sub("\n----------\n", s)
    s = _BR.sub("\n", s)
    s = _LI.sub("\n- ", s)
    s = _BLOCK_END.sub("\n", s)
    s = _ANY_TAG.sub("", s)
    s = _html.unescape(s)
    # &nbsp; unescapes to U+00A0, which the [ \t] patterns do not match;
    # spacer paragraphs like <p>&nbsp;</p> would otherwise survive as extra
    # blank lines. Plaintext has no use for non-breaking spaces anyway.
    s = s.replace("\xa0", " ")

    # Normalise whitespace: trim trailing spaces, collapse runs of spaces and
    # blank lines, strip leading/trailing blank space overall.
    s = _TRAIL_WS.sub("\n", s)
    s = _MANY_SPACES.sub(" ", s)
    s = _MANY_BLANKS.sub("\n\n", s)
    s = "\n".join(line.rstrip() for line in s.splitlines())
    text = s.strip()
    # Markup with no visible text yields "" (not a lone newline) so callers
    # can test the result for truthiness before setting an altbody.
    return f"{text}\n" if text else ""