email: handle unquoted hrefs in plaintext converter + add tests
The anchor regex only matched quoted hrefs, so an unquoted href (href=URL) caused the URL to be dropped from the plaintext part. It now handles double-quoted, single-quoted, and unquoted hrefs. Added scripts/test_email_plaintext.py (11 cases: link forms, mailto, template-tag preservation, tag stripping, entity unescape, blank-line collapse).
This commit is contained in:
parent
4dc5690666
commit
466460112b
2 changed files with 95 additions and 4 deletions
|
|
@ -39,9 +39,13 @@ _DROP_BLOCKS = re.compile(
|
|||
)
|
||||
# HTML comments (Listmonk/MSO conditional comments etc.).
|
||||
_COMMENTS = re.compile(r"<!--.*?-->", _RE_FLAGS)
|
||||
# <a href="URL" ...>TEXT</a> -> TEXT (URL) (skip mailto:/tel:/anchors/templated)
|
||||
# <a href="URL" ...>TEXT</a> -> TEXT (URL) (skip mailto:/tel:/anchors)
|
||||
# Handles quoted ("..." / '...') and unquoted (href=URL) hrefs.
|
||||
_ANCHORS = re.compile(
|
||||
r'<a\b[^>]*?\bhref\s*=\s*["\']([^"\']+)["\'][^>]*>(.*?)</a>', _RE_FLAGS
|
||||
r'<a\b[^>]*?\bhref\s*=\s*'
|
||||
r'(?:"([^"]*)"|\'([^\']*)\'|([^\s">]+))'
|
||||
r'[^>]*>(.*?)</a>',
|
||||
_RE_FLAGS,
|
||||
)
|
||||
# Tags that should become a line break.
|
||||
_BR = re.compile(r"<br\s*/?>", re.IGNORECASE)
|
||||
|
|
@ -58,8 +62,9 @@ _MANY_SPACES = re.compile(r"[ \t]{2,}")
|
|||
|
||||
|
||||
def _anchor_repl(m: "re.Match[str]") -> str:
|
||||
url = m.group(1).strip()
|
||||
text = _ANY_TAG.sub("", m.group(2)).strip()
|
||||
# href is whichever of the 3 alternatives matched (double/single/unquoted).
|
||||
url = (m.group(1) or m.group(2) or m.group(3) or "").strip()
|
||||
text = _ANY_TAG.sub("", m.group(4)).strip()
|
||||
low = url.lower()
|
||||
# mailto:/tel: -> surface the address (with link text if it adds info).
|
||||
# Bare in-page anchors -> keep text only. Templated hrefs (e.g.
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue