email: handle unquoted hrefs in plaintext converter + add tests

The anchor regex only matched quoted hrefs, so an unquoted href (href=URL) lost its URL in the plaintext part. The regex now handles double-quoted, single-quoted, and unquoted hrefs. Added scripts/test_email_plaintext.py (11 cases: link forms, mailto, template-tag preservation, tag stripping, entity unescape, blank-line collapse).
2026-06-17 20:28:15 -05:00 · 2026-06-17 20:28:15 -05:00 · 466460112b
commit 466460112b
parent 4dc5690666
2 changed files with 95 additions and 4 deletions
--- a/scripts/_email_plaintext.py
+++ b/scripts/_email_plaintext.py
@@ -39,9 +39,13 @@ _DROP_BLOCKS = re.compile(
 )
 # HTML comments (Listmonk/MSO conditional comments etc.).
 _COMMENTS = re.compile(r"<!--.*?-->", _RE_FLAGS)
-# <a href="URL" ...>TEXT</a>  ->  TEXT (URL)   (skip mailto:/tel:/anchors/templated)
+# <a href="URL" ...>TEXT</a>  ->  TEXT (URL)   (mailto:/tel:/in-page anchors are special-cased in _anchor_repl)
+# Handles quoted ("..." / '...') and unquoted (href=URL) hrefs.
 _ANCHORS = re.compile(
-    r'<a\b[^>]*?\bhref\s*=\s*["\']([^"\']+)["\'][^>]*>(.*?)</a>', _RE_FLAGS
+    r'<a\b[^>]*?\bhref\s*=\s*'
+    r'(?:"([^"]*)"|\'([^\']*)\'|([^\s">]+))'
+    r'[^>]*>(.*?)</a>',
+    _RE_FLAGS,
 )
 # Tags that should become a line break.
 _BR = re.compile(r"<br\s*/?>", re.IGNORECASE)
@@ -58,8 +62,9 @@ _MANY_SPACES = re.compile(r"[ \t]{2,}")


 def _anchor_repl(m: "re.Match[str]") -> str:
-    url = m.group(1).strip()
-    text = _ANY_TAG.sub("", m.group(2)).strip()
+    # href is whichever of the 3 alternatives matched (double/single/unquoted).
+    url = (m.group(1) or m.group(2) or m.group(3) or "").strip()
+    text = _ANY_TAG.sub("", m.group(4)).strip()
    low = url.lower()
    # mailto:/tel: -> surface the address (with link text if it adds info).
    # Bare in-page anchors -> keep text only. Templated hrefs (e.g.
--- a/scripts/test_email_plaintext.py
+++ b/scripts/test_email_plaintext.py
@@ -0,0 +1,86 @@
+"""Tests for scripts/_email_plaintext.html_to_text (campaign altbody generation).
+
+Run: python3 -m pytest scripts/test_email_plaintext.py
+ or: python3 scripts/test_email_plaintext.py
+"""
+
+from __future__ import annotations
+
+import os
+import sys
+
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+from scripts._email_plaintext import html_to_text  # noqa: E402
+
+
+def test_empty_input():
+    assert html_to_text("") == ""
+    assert html_to_text(None) == ""  # type: ignore[arg-type]
+
+
+def test_strips_script_style_head():
+    html = "<head><style>.x{color:red}</style><title>t</title></head><body>Hi</body>"
+    out = html_to_text(html)
+    assert "color:red" not in out
+    assert "t" not in out.splitlines()[0] if out.splitlines() else True
+    assert out.strip() == "Hi"
+
+
+def test_links_quoted_double():
+    assert "Check (https://performancewest.net/m?dot=1)" in html_to_text(
+        '<a href="https://performancewest.net/m?dot=1">Check</a>'
+    )
+
+
+def test_links_quoted_single():
+    assert "Check (https://x.io)" in html_to_text("<a href='https://x.io'>Check</a>")
+
+
+def test_links_unquoted_with_attrs():
+    assert "Check (https://x.io)" in html_to_text(
+        "<a href=https://x.io target=_blank>Check</a>"
+    )
+
+
+def test_mailto_surfaces_address():
+    assert "Email us (info@performancewest.net)" in html_to_text(
+        '<a href="mailto:info@performancewest.net">Email us</a>'
+    )
+
+
+def test_preserves_listmonk_template_tags():
+    html = (
+        "<p>Hello {{ .Subscriber.Attribs.company }}</p>"
+        '<a href="{{ UnsubscribeURL }}">unsubscribe</a>'
+    )
+    out = html_to_text(html)
+    assert "{{ .Subscriber.Attribs.company }}" in out
+    assert "{{ UnsubscribeURL }}" in out
+
+
+def test_lists_become_dashes():
+    out = html_to_text("<ul><li>One</li><li>Two</li></ul>")
+    assert "- One" in out and "- Two" in out
+
+
+def test_no_tags_leak():
+    html = "<div><p>A</p><br><span>B</span></div><table><tr><td>C</td></tr></table>"
+    out = html_to_text(html)
+    assert "<" not in out and ">" not in out
+
+
+def test_entities_unescaped():
+    assert "Tom & Jerry's" in html_to_text("<p>Tom &amp; Jerry&#39;s</p>")
+
+
+def test_collapses_blank_lines():
+    out = html_to_text("<p>A</p><p></p><p></p><p>B</p>")
+    assert "\n\n\n" not in out
+
+
+if __name__ == "__main__":
+    fns = [v for k, v in sorted(globals().items()) if k.startswith("test_")]
+    for fn in fns:
+        fn()
+        print(f"PASS {fn.__name__}")
+    print(f"\nAll {len(fns)} tests passed.")