email: handle unquoted hrefs in plaintext converter + add tests
The anchor regex only matched quoted hrefs; unquoted (href=URL) dropped the URL from the plaintext part. Now handles double/single/unquoted. Added scripts/test_email_plaintext.py (11 cases: link forms, mailto, template-tag preservation, tag stripping, entity unescape, blank-line collapse).
This commit is contained in:
parent
4dc5690666
commit
466460112b
2 changed files with 95 additions and 4 deletions
|
|
@ -39,9 +39,13 @@ _DROP_BLOCKS = re.compile(
|
|||
)
|
||||
# HTML comments (Listmonk/MSO conditional comments etc.).
|
||||
_COMMENTS = re.compile(r"<!--.*?-->", _RE_FLAGS)
|
||||
# <a href="URL" ...>TEXT</a> -> TEXT (URL) (skip mailto:/tel:/anchors/templated)
|
||||
# <a href="URL" ...>TEXT</a> -> TEXT (URL) (skip mailto:/tel:/anchors)
|
||||
# Handles quoted ("..." / '...') and unquoted (href=URL) hrefs.
|
||||
_ANCHORS = re.compile(
|
||||
r'<a\b[^>]*?\bhref\s*=\s*["\']([^"\']+)["\'][^>]*>(.*?)</a>', _RE_FLAGS
|
||||
r'<a\b[^>]*?\bhref\s*=\s*'
|
||||
r'(?:"([^"]*)"|\'([^\']*)\'|([^\s">]+))'
|
||||
r'[^>]*>(.*?)</a>',
|
||||
_RE_FLAGS,
|
||||
)
|
||||
# Tags that should become a line break.
|
||||
_BR = re.compile(r"<br\s*/?>", re.IGNORECASE)
|
||||
|
|
@ -58,8 +62,9 @@ _MANY_SPACES = re.compile(r"[ \t]{2,}")
|
|||
|
||||
|
||||
def _anchor_repl(m: "re.Match[str]") -> str:
|
||||
url = m.group(1).strip()
|
||||
text = _ANY_TAG.sub("", m.group(2)).strip()
|
||||
# href is whichever of the 3 alternatives matched (double/single/unquoted).
|
||||
url = (m.group(1) or m.group(2) or m.group(3) or "").strip()
|
||||
text = _ANY_TAG.sub("", m.group(4)).strip()
|
||||
low = url.lower()
|
||||
# mailto:/tel: -> surface the address (with link text if it adds info).
|
||||
# Bare in-page anchors -> keep text only. Templated hrefs (e.g.
|
||||
|
|
|
|||
86
scripts/test_email_plaintext.py
Normal file
86
scripts/test_email_plaintext.py
Normal file
|
|
@ -0,0 +1,86 @@
|
|||
"""Tests for scripts/_email_plaintext.html_to_text (campaign altbody generation).
|
||||
|
||||
Run: python3 -m pytest scripts/test_email_plaintext.py
|
||||
or: python3 scripts/test_email_plaintext.py
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
import sys
|
||||
|
||||
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||
from scripts._email_plaintext import html_to_text # noqa: E402
|
||||
|
||||
|
||||
def test_empty_input():
    """Both the empty string and ``None`` convert to the empty string."""
    for blank in ("", None):
        assert html_to_text(blank) == ""  # type: ignore[arg-type]
|
||||
|
||||
|
||||
def test_strips_script_style_head():
    """Content inside <head> (style rules, title) must not reach the output."""
    html = "<head><style>.x{color:red}</style><title>t</title></head><body>Hi</body>"
    out = html_to_text(html)
    assert "color:red" not in out
    # Exact equality already proves the <title> text ("t") was dropped. The
    # former single-letter `"t" not in first line` check was implied by this
    # assertion and would misfire on any body copy that contains a "t".
    assert out.strip() == "Hi"
|
||||
|
||||
|
||||
def test_links_quoted_double():
    """A double-quoted href is rendered as ``TEXT (URL)``."""
    rendered = html_to_text('<a href="https://performancewest.net/m?dot=1">Check</a>')
    assert "Check (https://performancewest.net/m?dot=1)" in rendered
|
||||
|
||||
|
||||
def test_links_quoted_single():
    """A single-quoted href is rendered as ``TEXT (URL)``."""
    rendered = html_to_text("<a href='https://x.io'>Check</a>")
    assert "Check (https://x.io)" in rendered
|
||||
|
||||
|
||||
def test_links_unquoted_with_attrs():
    """An unquoted href followed by another attribute still yields the URL."""
    rendered = html_to_text("<a href=https://x.io target=_blank>Check</a>")
    assert "Check (https://x.io)" in rendered
|
||||
|
||||
|
||||
def test_mailto_surfaces_address():
    """A mailto: link shows the bare address after the link text."""
    rendered = html_to_text('<a href="mailto:info@performancewest.net">Email us</a>')
    assert "Email us (info@performancewest.net)" in rendered
|
||||
|
||||
|
||||
def test_preserves_listmonk_template_tags():
    """``{{ ... }}`` template tags survive in body text and in hrefs."""
    body = "<p>Hello {{ .Subscriber.Attribs.company }}</p>"
    link = '<a href="{{ UnsubscribeURL }}">unsubscribe</a>'
    out = html_to_text(body + link)
    for tag in ("{{ .Subscriber.Attribs.company }}", "{{ UnsubscribeURL }}"):
        assert tag in out
|
||||
|
||||
|
||||
def test_lists_become_dashes():
    """Each <li> is rendered as a ``- item`` entry."""
    out = html_to_text("<ul><li>One</li><li>Two</li></ul>")
    assert all(bullet in out for bullet in ("- One", "- Two"))
|
||||
|
||||
|
||||
def test_no_tags_leak():
    """No angle brackets remain after converting block, inline and table tags."""
    html = "<div><p>A</p><br><span>B</span></div><table><tr><td>C</td></tr></table>"
    out = html_to_text(html)
    assert not any(bracket in out for bracket in "<>")
|
||||
|
||||
|
||||
def test_entities_unescaped():
    """Character references in the HTML are decoded in the plaintext output."""
    # The input must actually contain entities (&amp;, &#39;) for this test to
    # exercise unescaping; a literal "&" and "'" would pass trivially.
    assert "Tom & Jerry's" in html_to_text("<p>Tom &amp; Jerry&#39;s</p>")
|
||||
|
||||
|
||||
def test_collapses_blank_lines():
    """Empty paragraphs never produce three or more consecutive newlines."""
    html = "<p>A</p><p></p><p></p><p>B</p>"
    assert "\n\n\n" not in html_to_text(html)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Standalone runner: call every module-level ``test_*`` object in name
    # order; the first failing assertion aborts the run with a traceback.
    tests = [obj for name, obj in sorted(globals().items()) if name.startswith("test_")]
    for test in tests:
        test()
        print(f"PASS {test.__name__}")
    print(f"\nAll {len(tests)} tests passed.")
|
||||
Loading…
Add table
Add a link
Reference in a new issue