mcs150: render verification harness + auto-generate appearance streams
- fill_mcs150 now uses auto_regenerate=True so pypdf writes appearance streams for every text field. Preview and Chrome ignore /NeedAppearances and were showing blank widgets over the values; generated /AP streams make the text render in all viewers.
- New verify_mcs150.py reads each widget's /AP /N appearance stream (the literal drawn glyphs) to confirm the expected values actually render, since the container has no OCR/raster tooling. Exits non-zero on any miss.
This commit is contained in:
parent
534f13e480
commit
79c460ef25
2 changed files with 122 additions and 14 deletions
|
|
@ -209,23 +209,19 @@ def fill_mcs150(intake: dict, order_number: str = "") -> str:
|
|||
checkbox_on["certifyBox"] = True
|
||||
|
||||
# ── Apply fields to PDF ──────────────────────────────────────────
|
||||
# Update text fields
|
||||
writer.update_page_form_field_values(
|
||||
writer.pages[0],
|
||||
{k: v for k, v in field_updates.items() if v},
|
||||
auto_regenerate=False,
|
||||
)
|
||||
|
||||
# For multi-page forms, try updating all pages
|
||||
for page_idx in range(len(writer.pages)):
|
||||
# Apply text-field values to every page. The fields live on pages 9-11 of
|
||||
# the official form, so we update across all pages (pypdf silently ignores
|
||||
# field names not present on a given page). auto_regenerate=True makes pypdf
|
||||
# build appearance streams from the values, so viewers that ignore
|
||||
# /NeedAppearances (Preview, Chrome) still render the text.
|
||||
text_values = {k: v for k, v in field_updates.items() if v}
|
||||
for page in writer.pages:
|
||||
try:
|
||||
writer.update_page_form_field_values(
|
||||
writer.pages[page_idx],
|
||||
{k: v for k, v in field_updates.items() if v},
|
||||
auto_regenerate=False,
|
||||
page, text_values, auto_regenerate=True,
|
||||
)
|
||||
except Exception:
|
||||
pass
|
||||
except Exception as exc:
|
||||
LOG.debug("Form field apply on page failed: %s", exc)
|
||||
|
||||
# Apply checkbox fields
|
||||
for field_name, checked in checkbox_on.items():
|
||||
|
|
|
|||
112
scripts/document_gen/templates/verify_mcs150.py
Normal file
112
scripts/document_gen/templates/verify_mcs150.py
Normal file
|
|
@ -0,0 +1,112 @@
|
|||
"""Verify an MCS-150 PDF actually RENDERS the expected values.
|
||||
|
||||
OCR tooling (tesseract/pdftoppm/fitz) is not available in the workers
|
||||
container, so instead of rasterising we verify what a viewer would draw by
|
||||
reading each text widget's *appearance stream* (/AP /N), which is the literal
|
||||
content the PDF renders. After fill_mcs150 with auto_regenerate=True, pypdf
|
||||
writes these streams, so their presence + content is ground truth that the
|
||||
value will be visible (not just sitting in /V under a blank widget).
|
||||
|
||||
Usage:
|
||||
python3 -m scripts.document_gen.templates.verify_mcs150 [DOT_NUMBER]
|
||||
|
||||
Exits non-zero if any expected value is missing from the rendered output.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
import sys
|
||||
|
||||
from pypdf import PdfReader
|
||||
|
||||
from .mcs150_pdf_filler import fill_mcs150
|
||||
|
||||
|
||||
def _appearance_text(reader: PdfReader) -> dict[str, str]:
    """Return ``{field_name: rendered text}`` taken from each widget's /AP /N stream.

    Every annotation on every page that carries a ``/T`` (field name) entry
    gets a key in the result. The value is the text drawn by the ``Tj`` /
    ``TJ`` operators of the widget's normal appearance stream, in stream
    order, with PDF literal-string escapes decoded and surrounding whitespace
    stripped. Widgets with no appearance stream, or whose stream cannot be
    read, map to ``""`` so the caller reports them as not rendered.

    If the same field name occurs on several widgets, the last one visited
    wins.

    Args:
        reader: An opened PDF; only ``reader.pages`` is used.

    Returns:
        Mapping of field name to the text its appearance stream draws.
    """
    out: dict[str, str] = {}
    for page in reader.pages:
        annots = page.get("/Annots")
        if not annots:
            continue
        for ref in annots:
            obj = ref.get_object()
            name = obj.get("/T")
            if name is None:
                # Not a named field widget; nothing to key the result on.
                continue
            field_name = str(name)
            out[field_name] = _widget_text(obj, field_name)
    return out


# One PDF literal string "(...)": backslash escapes plus one level of balanced,
# unescaped nested parentheses. Uses only non-capturing groups so findall()
# returns whole literals.
_LITERAL = rb"\((?:\\.|[^\\()]|\((?:\\.|[^\\()])*\))*\)"
_LITERAL_RE = re.compile(_LITERAL, re.DOTALL)

# Text-showing operators: "(string) Tj" or "[(string) kern (string) ...] TJ".
_SHOW_TEXT_RE = re.compile(
    rb"(?P<single>" + _LITERAL + rb")\s*Tj"
    rb"|\[(?P<array>(?:" + _LITERAL + rb"|[^\]()])*)\]\s*TJ",
    re.DOTALL,
)

# A backslash escape inside a literal string: octal code, line continuation,
# or a single escaped byte.
_ESCAPE_RE = re.compile(rb"\\([0-7]{1,3}|\r\n|.)", re.DOTALL)
_SIMPLE_ESCAPES = {
    b"n": b"\n",
    b"r": b"\r",
    b"t": b"\t",
    b"b": b"\b",
    b"f": b"\f",
}


def _unescape_match(match: re.Match) -> bytes:
    """Translate one backslash escape of a PDF literal string to raw bytes."""
    token = match.group(1)
    if token[:1] in b"01234567":
        # \ddd octal character code; high-order overflow is discarded.
        return bytes([int(token, 8) & 0xFF])
    if token in (b"\n", b"\r", b"\r\n"):
        # Backslash + end-of-line is a line continuation and draws nothing.
        return b""
    # \( \) \\ and unknown escapes fall through to the escaped byte itself.
    return _SIMPLE_ESCAPES.get(token, token)


def _decode_literal(literal: bytes) -> str:
    """Decode one ``(...)`` literal (parentheses included) to text."""
    body = _ESCAPE_RE.sub(_unescape_match, literal[1:-1])
    return body.decode("latin-1", "ignore")


def _shown_text(raw: bytes) -> str:
    """Concatenate, in stream order, the strings drawn by Tj / TJ in *raw*."""
    parts: list[str] = []
    for match in _SHOW_TEXT_RE.finditer(raw):
        single = match.group("single")
        if single is not None:
            literals = [single]
        else:
            # TJ arrays interleave strings with kerning numbers; keep strings.
            literals = _LITERAL_RE.findall(match.group("array"))
        parts.extend(_decode_literal(lit) for lit in literals)
    return "".join(parts)


def _widget_text(widget, field_name: str) -> str:
    """Return the text drawn by one widget's normal appearance, or ``""``."""
    ap = widget.get("/AP")
    if not ap:
        return ""
    normal = ap.get_object().get("/N")
    if normal is None:
        return ""
    try:
        stream = normal.get_object()
        if not hasattr(stream, "get_data"):
            # /N may be a sub-dictionary of appearance states (e.g. checkbox
            # on/off) instead of a content stream; there is no text to read.
            return ""
        raw = stream.get_data()
    except Exception as exc:
        # Best effort: an unreadable stream counts as "nothing rendered", but
        # say why so a resulting MISS can be diagnosed.
        print(
            f"WARN: could not read appearance stream for {field_name!r}: {exc}",
            file=sys.stderr,
        )
        return ""
    return _shown_text(raw).strip()
|
||||
|
||||
|
||||
def main() -> int:
    """Fill an MCS-150 for one DOT number and check the values actually render.

    The DOT number is taken from ``sys.argv[1]`` (default ``"1609564"``). The
    carrier record is fetched, used as the intake for ``fill_mcs150``, and the
    resulting PDF's appearance streams are compared against the intake values.
    Comparison is case-insensitive and ignores spaces; fields with no expected
    value are skipped.

    Returns:
        0 if every checked value is found in its widget's appearance stream,
        1 if at least one is missing, 2 if no carrier record was returned.
    """
    dot = sys.argv[1] if len(sys.argv) > 1 else "1609564"

    # Simulate enrichment by pulling the FMCSA census the same way the handler
    # does, then fill.
    from scripts.workers.services.mcs150_update import MCS150UpdateService

    svc = MCS150UpdateService()
    census = svc._fetch_carrier_record(dot)
    if not census:
        print(f"FAIL: no FMCSA census for DOT {dot}")
        return 2

    intake = dict(census)
    intake.setdefault("signer_name", "Test Signer")
    intake.setdefault("signer_title", "Owner")

    path = fill_mcs150(intake, order_number="CO-VERIFY")
    rendered = _appearance_text(PdfReader(path))

    # What we expect to be visible on the rendered form.
    checks = {
        "1bizName": intake.get("legal_name", ""),
        "3principalStreet": intake.get("address_street", ""),
        "4principalCity": intake.get("address_city", ""),
        "5principalState": intake.get("address_state", ""),
        "6principalZip": intake.get("address_zip", ""),
        "16usdotNumber": intake.get("dot_number", ""),
        "19irsNumber": intake.get("ein", ""),
        "certifyName": intake.get("signer_name", ""),
    }

    print(f"=== MCS-150 render verification for DOT {dot} ===")
    print(f"PDF: {path}")
    print(f"census fields: {sorted(census.keys())}\n")

    def _normalise(value: str) -> str:
        """Upper-case and drop spaces so layout differences don't matter."""
        return value.upper().replace(" ", "")

    failures = []
    for field, expected in checks.items():
        if not expected:
            continue
        # Record values are not guaranteed to be strings (a DOT number or ZIP
        # may be numeric); coerce so the comparison cannot raise.
        expected = str(expected)
        got = rendered.get(field, "")
        ok = _normalise(expected) in _normalise(got)
        flag = "OK " if ok else "MISS"
        print(f"  [{flag}] {field}: expected '{expected}' | rendered '{got}'")
        if not ok:
            failures.append(field)

    if failures:
        print(f"\nFAIL: {len(failures)} field(s) not rendered: {failures}")
        return 1
    print("\nPASS: all expected values render in their appearance streams.")
    return 0
|
||||
|
||||
|
||||
# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|
||||
Loading…
Add table
Add a link
Reference in a new issue