email: handle unquoted hrefs in plaintext converter + add tests

The anchor regex only matched quoted hrefs, so an unquoted href (href=URL)
caused the URL to be dropped from the plaintext part. It now handles
double-quoted, single-quoted, and unquoted hrefs. Added
scripts/test_email_plaintext.py (11 cases: link forms, mailto, template-tag
preservation, tag stripping, entity unescape, blank-line collapse).
This commit is contained in:
justin 2026-06-17 20:28:15 -05:00
parent 4dc5690666
commit 466460112b
2 changed files with 95 additions and 4 deletions

View file

@ -39,9 +39,13 @@ _DROP_BLOCKS = re.compile(
) )
# HTML comments (Listmonk/MSO conditional comments etc.). # HTML comments (Listmonk/MSO conditional comments etc.).
_COMMENTS = re.compile(r"<!--.*?-->", _RE_FLAGS) _COMMENTS = re.compile(r"<!--.*?-->", _RE_FLAGS)
# <a href="URL" ...>TEXT</a> -> TEXT (URL) (skip mailto:/tel:/anchors/templated) # <a href="URL" ...>TEXT</a> -> TEXT (URL) (skip mailto:/tel:/anchors)
# Handles quoted ("..." / '...') and unquoted (href=URL) hrefs.
_ANCHORS = re.compile( _ANCHORS = re.compile(
r'<a\b[^>]*?\bhref\s*=\s*["\']([^"\']+)["\'][^>]*>(.*?)</a>', _RE_FLAGS r'<a\b[^>]*?\bhref\s*=\s*'
r'(?:"([^"]*)"|\'([^\']*)\'|([^\s">]+))'
r'[^>]*>(.*?)</a>',
_RE_FLAGS,
) )
# Tags that should become a line break. # Tags that should become a line break.
_BR = re.compile(r"<br\s*/?>", re.IGNORECASE) _BR = re.compile(r"<br\s*/?>", re.IGNORECASE)
@ -58,8 +62,9 @@ _MANY_SPACES = re.compile(r"[ \t]{2,}")
def _anchor_repl(m: "re.Match[str]") -> str: def _anchor_repl(m: "re.Match[str]") -> str:
url = m.group(1).strip() # href is whichever of the 3 alternatives matched (double/single/unquoted).
text = _ANY_TAG.sub("", m.group(2)).strip() url = (m.group(1) or m.group(2) or m.group(3) or "").strip()
text = _ANY_TAG.sub("", m.group(4)).strip()
low = url.lower() low = url.lower()
# mailto:/tel: -> surface the address (with link text if it adds info). # mailto:/tel: -> surface the address (with link text if it adds info).
# Bare in-page anchors -> keep text only. Templated hrefs (e.g. # Bare in-page anchors -> keep text only. Templated hrefs (e.g.

View file

@ -0,0 +1,86 @@
"""Tests for scripts/_email_plaintext.html_to_text (campaign altbody generation).
Run: python3 -m pytest scripts/test_email_plaintext.py
or: python3 scripts/test_email_plaintext.py
"""
from __future__ import annotations
import os
import sys
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from scripts._email_plaintext import html_to_text # noqa: E402
def test_empty_input():
    """An empty string and ``None`` must both convert to an empty string."""
    for blank in ("", None):
        assert html_to_text(blank) == ""  # type: ignore[arg-type]
def test_strips_script_style_head():
    """Check that <head> content (style rules, title) never reaches the output.

    The title uses a distinctive sentinel string rather than a single letter,
    so a leak is detected unambiguously instead of depending on whether an
    arbitrary character happens to appear in the first output line.
    """
    html = (
        "<head><style>.x{color:red}</style><title>Sentinel Title</title></head>"
        "<body>Hi</body>"
    )
    out = html_to_text(html)
    # CSS rules from the <style> element must not leak into the text.
    assert "color:red" not in out
    # Neither must the <title> text.
    assert "Sentinel Title" not in out
    # Only the body text survives.
    assert out.strip() == "Hi"
def test_links_quoted_double():
    """A double-quoted href is rendered as ``TEXT (URL)``."""
    rendered = html_to_text('<a href="https://performancewest.net/m?dot=1">Check</a>')
    assert "Check (https://performancewest.net/m?dot=1)" in rendered
def test_links_quoted_single():
    """A single-quoted href is rendered as ``TEXT (URL)``."""
    rendered = html_to_text("<a href='https://x.io'>Check</a>")
    assert "Check (https://x.io)" in rendered
def test_links_unquoted_with_attrs():
    """An unquoted href followed by another attribute still yields ``TEXT (URL)``."""
    rendered = html_to_text("<a href=https://x.io target=_blank>Check</a>")
    assert "Check (https://x.io)" in rendered
def test_mailto_surfaces_address():
    """A mailto: link shows the bare address (no scheme) after the link text."""
    rendered = html_to_text('<a href="mailto:info@performancewest.net">Email us</a>')
    assert "Email us (info@performancewest.net)" in rendered
def test_preserves_listmonk_template_tags():
    """Template expressions in body text and in an href both appear in the output."""
    markup = (
        "<p>Hello {{ .Subscriber.Attribs.company }}</p>"
        '<a href="{{ UnsubscribeURL }}">unsubscribe</a>'
    )
    rendered = html_to_text(markup)
    # Check the in-text tag first, then the one carried by the href.
    for template_tag in ("{{ .Subscriber.Attribs.company }}", "{{ UnsubscribeURL }}"):
        assert template_tag in rendered
def test_lists_become_dashes():
    """Each <li> item shows up prefixed with ``- ``."""
    rendered = html_to_text("<ul><li>One</li><li>Two</li></ul>")
    assert all(bullet in rendered for bullet in ("- One", "- Two"))
def test_no_tags_leak():
    """No angle brackets remain after converting nested block/inline/table markup."""
    markup = "<div><p>A</p><br><span>B</span></div><table><tr><td>C</td></tr></table>"
    rendered = html_to_text(markup)
    assert not any(bracket in rendered for bracket in ("<", ">"))
def test_entities_unescaped():
    """Named (&amp;) and numeric (&#39;) entities are decoded to their characters."""
    rendered = html_to_text("<p>Tom &amp; Jerry&#39;s</p>")
    assert "Tom & Jerry's" in rendered
def test_collapses_blank_lines():
    """Empty paragraphs never produce three consecutive newlines in the output."""
    rendered = html_to_text("<p>A</p><p></p><p></p><p>B</p>")
    triple_newline = "\n\n\n"
    assert triple_newline not in rendered
if __name__ == "__main__":
    # Script mode: run every module-level ``test_*`` callable in name order.
    # A failing assertion propagates and aborts the run with a traceback.
    module_ns = globals()
    collected = [
        module_ns[name] for name in sorted(module_ns) if name.startswith("test_")
    ]
    for case in collected:
        case()
        print(f"PASS {case.__name__}")
    print(f"\nAll {len(collected)} tests passed.")