From 466460112b52d565bf2ef9e5f22507172c1a67c3 Mon Sep 17 00:00:00 2001 From: justin Date: Wed, 17 Jun 2026 20:28:15 -0500 Subject: [PATCH] email: handle unquoted hrefs in plaintext converter + add tests The anchor regex only matched quoted hrefs; unquoted (href=URL) dropped the URL from the plaintext part. Now handles double/single/unquoted. Added scripts/test_email_plaintext.py (11 cases: link forms, mailto, template-tag preservation, tag stripping, entity unescape, blank-line collapse). --- scripts/_email_plaintext.py | 13 +++-- scripts/test_email_plaintext.py | 86 +++++++++++++++++++++++++++++++++ 2 files changed, 95 insertions(+), 4 deletions(-) create mode 100644 scripts/test_email_plaintext.py diff --git a/scripts/_email_plaintext.py b/scripts/_email_plaintext.py index 3fa7f99..4980d9d 100644 --- a/scripts/_email_plaintext.py +++ b/scripts/_email_plaintext.py @@ -39,9 +39,13 @@ _DROP_BLOCKS = re.compile( ) # HTML comments (Listmonk/MSO conditional comments etc.). _COMMENTS = re.compile(r"", _RE_FLAGS) -# TEXT -> TEXT (URL) (skip mailto:/tel:/anchors/templated) +# TEXT -> TEXT (URL) (skip mailto:/tel:/anchors) +# Handles quoted ("..." / '...') and unquoted (href=URL) hrefs. _ANCHORS = re.compile( - r']*?\bhref\s*=\s*["\']([^"\']+)["\'][^>]*>(.*?)', _RE_FLAGS + r']*?\bhref\s*=\s*' + r'(?:"([^"]*)"|\'([^\']*)\'|([^\s">]+))' + r'[^>]*>(.*?)', + _RE_FLAGS, ) # Tags that should become a line break. _BR = re.compile(r"", re.IGNORECASE) @@ -58,8 +62,9 @@ _MANY_SPACES = re.compile(r"[ \t]{2,}") def _anchor_repl(m: "re.Match[str]") -> str: - url = m.group(1).strip() - text = _ANY_TAG.sub("", m.group(2)).strip() + # href is whichever of the 3 alternatives matched (double/single/unquoted). + url = (m.group(1) or m.group(2) or m.group(3) or "").strip() + text = _ANY_TAG.sub("", m.group(4)).strip() low = url.lower() # mailto:/tel: -> surface the address (with link text if it adds info). # Bare in-page anchors -> keep text only. Templated hrefs (e.g. diff --git a/scripts/test_email_plaintext.py b/scripts/test_email_plaintext.py new file mode 100644 index 0000000..82432e6 --- /dev/null +++ b/scripts/test_email_plaintext.py @@ -0,0 +1,86 @@ +"""Tests for scripts/_email_plaintext.html_to_text (campaign altbody generation). + +Run: python3 -m pytest scripts/test_email_plaintext.py + or: python3 scripts/test_email_plaintext.py +""" + +from __future__ import annotations + +import os +import sys + +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +from scripts._email_plaintext import html_to_text # noqa: E402 + + +def test_empty_input(): + assert html_to_text("") == "" + assert html_to_text(None) == "" # type: ignore[arg-type] + + +def test_strips_script_style_head(): + html = "tHi" + out = html_to_text(html) + assert "color:red" not in out + assert "t" not in out.splitlines()[0] if out.splitlines() else True + assert out.strip() == "Hi" + + +def test_links_quoted_double(): + assert "Check (https://performancewest.net/m?dot=1)" in html_to_text( + 'Check' + ) + + +def test_links_quoted_single(): + assert "Check (https://x.io)" in html_to_text("Check") + + +def test_links_unquoted_with_attrs(): + assert "Check (https://x.io)" in html_to_text( + "Check" + ) + + +def test_mailto_surfaces_address(): + assert "Email us (info@performancewest.net)" in html_to_text( + 'Email us' + ) + + +def test_preserves_listmonk_template_tags(): + html = ( + "

Hello {{ .Subscriber.Attribs.company }}

" + 'unsubscribe' + ) + out = html_to_text(html) + assert "{{ .Subscriber.Attribs.company }}" in out + assert "{{ UnsubscribeURL }}" in out + + +def test_lists_become_dashes(): + out = html_to_text("
  • One
  • Two
") + assert "- One" in out and "- Two" in out + + +def test_no_tags_leak(): + html = "

A


B
C
" + out = html_to_text(html) + assert "<" not in out and ">" not in out + + +def test_entities_unescaped(): + assert "Tom & Jerry's" in html_to_text("

Tom & Jerry's

") + + +def test_collapses_blank_lines(): + out = html_to_text("

A

B

") + assert "\n\n\n" not in out + + +if __name__ == "__main__": + fns = [v for k, v in sorted(globals().items()) if k.startswith("test_")] + for fn in fns: + fn() + print(f"PASS {fn.__name__}") + print(f"\nAll {len(fns)} tests passed.")