email: handle unquoted hrefs in plaintext converter + add tests

The anchor regex only matched quoted hrefs, so links with an unquoted href (href=URL) lost their URL in the plaintext part. It now handles double-quoted, single-quoted, and unquoted hrefs. Added scripts/test_email_plaintext.py (11 cases: link forms, mailto, template-tag preservation, tag stripping, entity unescape, blank-line collapse).
2026-06-17 20:28:15 -05:00 · 2026-06-17 20:28:15 -05:00 · 466460112b
commit 466460112b
parent 4dc5690666
2 changed files with 95 additions and 4 deletions
--- a/scripts/_email_plaintext.py
+++ b/scripts/_email_plaintext.py
@ -39,9 +39,13 @@ _DROP_BLOCKS = re.compile(
 )
 # HTML comments (Listmonk/MSO conditional comments etc.).
 _COMMENTS = re.compile(r"<!--.*?-->", _RE_FLAGS)
-# <a href="URL" ...>TEXT</a>  ->  TEXT (URL)   (skip mailto:/tel:/anchors/templated)
+# <a href="URL" ...>TEXT</a>  ->  TEXT (URL)   (skip mailto:/tel:/anchors)
 # Handles quoted ("..." / '...') and unquoted (href=URL) hrefs.
 _ANCHORS = re.compile(
-    r'<a\b[^>]*?\bhref\s*=\s*["\']([^"\']+)["\'][^>]*>(.*?)</a>', _RE_FLAGS
+    r'<a\b[^>]*?\bhref\s*=\s*'
    r'(?:"([^"]*)"|\'([^\']*)\'|([^\s">]+))'
    r'[^>]*>(.*?)</a>',
    _RE_FLAGS,
 )
 # Tags that should become a line break.
 _BR = re.compile(r"<br\s*/?>", re.IGNORECASE)
@ -58,8 +62,9 @@ _MANY_SPACES = re.compile(r"[ \t]{2,}")
 def _anchor_repl(m: "re.Match[str]") -> str:
-    url = m.group(1).strip()
+    # href is whichever of the 3 alternatives matched (double/single/unquoted).
-    text = _ANY_TAG.sub("", m.group(2)).strip()
+    url = (m.group(1) or m.group(2) or m.group(3) or "").strip()
    text = _ANY_TAG.sub("", m.group(4)).strip()
    low = url.lower()
    # mailto:/tel: -> surface the address (with link text if it adds info).
    # Bare in-page anchors -> keep text only. Templated hrefs (e.g.
--- a/scripts/test_email_plaintext.py
+++ b/scripts/test_email_plaintext.py
@ -0,0 +1,86 @@
 """Tests for scripts/_email_plaintext.html_to_text (campaign altbody generation).
 Run: python3 -m pytest scripts/test_email_plaintext.py
 or: python3 scripts/test_email_plaintext.py
 """
 from __future__ import annotations
 import os
 import sys
 sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 from scripts._email_plaintext import html_to_text  # noqa: E402
 def test_empty_input():
    """Both the empty string and ``None`` convert to an empty string."""
    for blank in ("", None):
        assert html_to_text(blank) == ""  # type: ignore[arg-type]
 def test_strips_script_style_head():
    """Content inside <head> (style rules, title) must not reach the output."""
    html = (
        "<head><style>.x{color:red}</style><title>Page title</title></head>"
        "<body>Hi</body>"
    )
    out = html_to_text(html)
    assert "color:red" not in out
    # Use a distinctive title string: probing for a single letter would also
    # match ordinary body text, and guarding the check behind a conditional
    # expression makes it vacuously true when the output is empty.
    assert "Page title" not in out
    assert out.strip() == "Hi"
 def test_links_quoted_double():
    """A double-quoted href is rendered as ``TEXT (URL)``."""
    markup = '<a href="https://performancewest.net/m?dot=1">Check</a>'
    rendered = html_to_text(markup)
    assert "Check (https://performancewest.net/m?dot=1)" in rendered
 def test_links_quoted_single():
    """A single-quoted href is rendered as ``TEXT (URL)``."""
    markup = "<a href='https://x.io'>Check</a>"
    rendered = html_to_text(markup)
    assert "Check (https://x.io)" in rendered
 def test_links_unquoted_with_attrs():
    """An unquoted href followed by another attribute still yields the URL."""
    markup = "<a href=https://x.io target=_blank>Check</a>"
    rendered = html_to_text(markup)
    assert "Check (https://x.io)" in rendered
 def test_mailto_surfaces_address():
    """A mailto: link shows the link text followed by the bare address."""
    markup = '<a href="mailto:info@performancewest.net">Email us</a>'
    rendered = html_to_text(markup)
    assert "Email us (info@performancewest.net)" in rendered
 def test_preserves_listmonk_template_tags():
    """``{{ ... }}`` template tags survive in body text and inside an href."""
    greeting = "<p>Hello {{ .Subscriber.Attribs.company }}</p>"
    unsubscribe = '<a href="{{ UnsubscribeURL }}">unsubscribe</a>'
    text = html_to_text(greeting + unsubscribe)
    expected_tags = (
        "{{ .Subscriber.Attribs.company }}",
        "{{ UnsubscribeURL }}",
    )
    for tag in expected_tags:
        assert tag in text
 def test_lists_become_dashes():
    """Each <li> item appears in the output prefixed with ``- ``."""
    text = html_to_text("<ul><li>One</li><li>Two</li></ul>")
    for bullet in ("- One", "- Two"):
        assert bullet in text
 def test_no_tags_leak():
    """No angle brackets remain after converting block, inline and table tags."""
    markup = (
        "<div><p>A</p><br><span>B</span></div>"
        "<table><tr><td>C</td></tr></table>"
    )
    text = html_to_text(markup)
    assert not any(bracket in text for bracket in "<>")
 def test_entities_unescaped():
    """Named and numeric character references are decoded in the output."""
    text = html_to_text("<p>Tom &amp; Jerry&#39;s</p>")
    assert "Tom & Jerry's" in text
 def test_collapses_blank_lines():
    """Empty paragraphs never produce three consecutive newlines."""
    text = html_to_text("<p>A</p><p></p><p></p><p>B</p>")
    triple_newline = "\n\n\n"
    assert triple_newline not in text
 if __name__ == "__main__":
    # Script mode: call every module global whose name starts with "test_",
    # in name order; the first failing assertion aborts the run.
    tests = [
        globals()[name] for name in sorted(globals()) if name.startswith("test_")
    ]
    for test in tests:
        test()
        print(f"PASS {test.__name__}")
    print(f"\nAll {len(tests)} tests passed.")