email: handle unquoted hrefs in plaintext converter + add tests

The anchor regex only matched quoted hrefs, so an anchor with an unquoted href
(href=URL) lost its URL in the plaintext part. The regex now handles
double-quoted, single-quoted, and unquoted hrefs. Added
scripts/test_email_plaintext.py (11 cases: link forms, mailto, template-tag
preservation, tag stripping, entity unescape, blank-line collapse).
This commit is contained in:
justin 2026-06-17 20:28:15 -05:00
parent 4dc5690666
commit 466460112b
2 changed files with 95 additions and 4 deletions

View file

@ -39,9 +39,13 @@ _DROP_BLOCKS = re.compile(
)
# HTML comments (Listmonk/MSO conditional comments etc.).
_COMMENTS = re.compile(r"<!--.*?-->", _RE_FLAGS)
# <a href="URL" ...>TEXT</a> -> TEXT (URL) (skip mailto:/tel:/anchors/templated)
# <a href="URL" ...>TEXT</a> -> TEXT (URL) (skip mailto:/tel:/anchors)
# Handles quoted ("..." / '...') and unquoted (href=URL) hrefs.
_ANCHORS = re.compile(
r'<a\b[^>]*?\bhref\s*=\s*["\']([^"\']+)["\'][^>]*>(.*?)</a>', _RE_FLAGS
r'<a\b[^>]*?\bhref\s*=\s*'
r'(?:"([^"]*)"|\'([^\']*)\'|([^\s">]+))'
r'[^>]*>(.*?)</a>',
_RE_FLAGS,
)
# Tags that should become a line break.
_BR = re.compile(r"<br\s*/?>", re.IGNORECASE)
@ -58,8 +62,9 @@ _MANY_SPACES = re.compile(r"[ \t]{2,}")
def _anchor_repl(m: "re.Match[str]") -> str:
url = m.group(1).strip()
text = _ANY_TAG.sub("", m.group(2)).strip()
# href is whichever of the 3 alternatives matched (double/single/unquoted).
url = (m.group(1) or m.group(2) or m.group(3) or "").strip()
text = _ANY_TAG.sub("", m.group(4)).strip()
low = url.lower()
# mailto:/tel: -> surface the address (with link text if it adds info).
# Bare in-page anchors -> keep text only. Templated hrefs (e.g.