mcs150: render verification harness + auto-generate appearance streams

- fill_mcs150 now uses auto_regenerate=True so pypdf writes appearance
  streams for every text field. Preview/Chrome ignore /NeedAppearances and
  were showing blank widgets over the values; generated /AP streams make
  the text render in all viewers.
- New verify_mcs150.py reads each widget's /AP /N appearance stream (the
  literal drawn glyphs) to confirm expected values actually render, since
  the container has no OCR/raster tooling. Exits non-zero on any miss.
This commit is contained in:
justin 2026-06-10 12:37:25 -05:00
parent 534f13e480
commit 79c460ef25
2 changed files with 122 additions and 14 deletions

View file

@ -209,23 +209,19 @@ def fill_mcs150(intake: dict, order_number: str = "") -> str:
checkbox_on["certifyBox"] = True
# ── Apply fields to PDF ──────────────────────────────────────────
# Update text fields
writer.update_page_form_field_values(
writer.pages[0],
{k: v for k, v in field_updates.items() if v},
auto_regenerate=False,
)
# For multi-page forms, try updating all pages
for page_idx in range(len(writer.pages)):
# Apply text-field values to every page. The fields live on pages 9-11 of
# the official form, so we update across all pages (pypdf silently ignores
# field names not present on a given page). auto_regenerate=True makes pypdf
# build appearance streams from the values, so viewers that ignore
# /NeedAppearances (Preview, Chrome) still render the text.
text_values = {k: v for k, v in field_updates.items() if v}
for page in writer.pages:
try:
writer.update_page_form_field_values(
writer.pages[page_idx],
{k: v for k, v in field_updates.items() if v},
auto_regenerate=False,
page, text_values, auto_regenerate=True,
)
except Exception:
pass
except Exception as exc:
LOG.debug("Form field apply on page failed: %s", exc)
# Apply checkbox fields
for field_name, checked in checkbox_on.items():

View file

@ -0,0 +1,112 @@
"""Verify an MCS-150 PDF actually RENDERS the expected values.
OCR tooling (tesseract/pdftoppm/fitz) is not available in the workers
container, so instead of rasterising we verify what a viewer would draw by
reading each text widget's *appearance stream* (/AP /N), which is the literal
content the PDF renders. After fill_mcs150 with auto_regenerate=True, pypdf
writes these streams, so their presence + content is ground truth that the
value will be visible (not just sitting in /V under a blank widget).
Usage:
python3 -m scripts.document_gen.templates.verify_mcs150 [DOT_NUMBER]
Exits non-zero if any expected value is missing from the rendered output.
"""
from __future__ import annotations
import re
import sys
from pypdf import PdfReader
from .mcs150_pdf_filler import fill_mcs150
# A PDF literal string: "(" ... ")" whose body may contain backslash escapes
# and one level of balanced, unescaped nested parentheses.
_PDF_LITERAL = rb"\((?:\\.|[^\\()]|\((?:\\.|[^\\()])*\))*\)"
_LITERAL_RE = re.compile(_PDF_LITERAL, re.DOTALL)
# "(text) Tj" -- show a single string.
_TJ_RE = re.compile(rb"(" + _PDF_LITERAL + rb")\s*Tj", re.DOTALL)
# "[(te) -20 (xt)] TJ" -- show an array of strings with kerning adjustments.
_TJ_ARRAY_RE = re.compile(
    rb"\[((?:" + _PDF_LITERAL + rb"|[^\]()])*)\]\s*TJ", re.DOTALL
)
# Backslash escapes inside a literal string: octal code, line continuation,
# or a single escaped character.
_ESCAPE_RE = re.compile(rb"\\(?:([0-7]{1,3})|(\r\n|\r|\n)|(.))", re.DOTALL)
_ESCAPED_CHARS = {
    b"n": b"\n",
    b"r": b"\r",
    b"t": b"\t",
    b"b": b"\b",
    b"f": b"\f",
}


def _unescape_pdf_literal(body: bytes) -> bytes:
    """Decode the backslash escapes of a PDF literal string body."""

    def _replace(match: re.Match) -> bytes:
        octal, eol, char = match.groups()
        if octal is not None:
            # High-order overflow is ignored, so keep only the low byte.
            return bytes([int(octal.decode("ascii"), 8) & 0xFF])
        if eol is not None:
            # Backslash + end-of-line is a line continuation: emits nothing.
            return b""
        # Unknown escapes drop the backslash and keep the character, which
        # also covers "\(", "\)" and "\\".
        return _ESCAPED_CHARS.get(char, char)

    return _ESCAPE_RE.sub(_replace, body)


def _drawn_text(raw: bytes) -> str:
    """Return the text shown by the Tj / TJ operators of a content stream.

    Strings shown with ``Tj`` are collected first, then the strings inside
    ``TJ`` arrays. Hex strings (``<...> Tj``) are not decoded.
    """
    literals = [m.group(1) for m in _TJ_RE.finditer(raw)]
    for array in _TJ_ARRAY_RE.finditer(raw):
        literals.extend(_LITERAL_RE.findall(array.group(1)))
    # Each literal still carries its surrounding parentheses; drop them
    # before decoding the escapes.
    return "".join(
        _unescape_pdf_literal(literal[1:-1]).decode("latin-1")
        for literal in literals
    )


def _widget_text(widget, name) -> str:
    """Return the text drawn by one annotation's normal (/AP /N) appearance.

    Returns ``""`` when the annotation has no /AP or /N entry, when /N is
    not a stream (it has no ``get_data``), or when the stream cannot be
    read. Read failures are reported on stderr instead of being silently
    discarded.
    """
    ap = widget.get("/AP")
    if not ap:
        return ""
    normal = ap.get_object().get("/N")
    if normal is None:
        return ""
    try:
        stream = normal.get_object()
        get_data = getattr(stream, "get_data", None)
        if get_data is None:
            # /N is not a content stream (e.g. a dictionary of appearance
            # states), so there is no text to extract.
            return ""
        return _drawn_text(get_data())
    except Exception as exc:
        # Best-effort: one unreadable stream must not abort the whole scan,
        # but the reason should be visible to whoever runs the check.
        print(
            f"WARN: could not read appearance stream for {name}: {exc}",
            file=sys.stderr,
        )
        return ""


def _appearance_text(reader: PdfReader) -> dict[str, str]:
    """Return {field_name: rendered text from its /AP /N stream}.

    Every annotation on every page that carries a /T (field name) entry is
    included; annotations without /T are skipped. The value is the text
    drawn by the annotation's normal appearance stream with PDF string
    escapes decoded and surrounding whitespace stripped, or ``""`` when no
    text could be extracted.
    """
    out: dict[str, str] = {}
    for page in reader.pages:
        annots = page.get("/Annots")
        if not annots:
            continue
        for ref in annots:
            obj = ref.get_object()
            name = obj.get("/T")
            if name is None:
                continue
            out[str(name)] = _widget_text(obj, name).strip()
    return out
def main() -> int:
    """Fill an MCS-150 for one carrier and check that key values render.

    The DOT number is taken from ``sys.argv[1]`` and defaults to
    ``"1609564"``. The carrier record is fetched, used as the intake for
    ``fill_mcs150``, and each expected value is then looked up in the
    appearance-stream text of the matching field.

    Returns:
        0 when every non-empty expected value is found, 1 when at least one
        is missing from its field's rendered text, 2 when no census record
        could be fetched.
    """
    dot = sys.argv[1] if len(sys.argv) > 1 else "1609564"

    # Simulate enrichment by pulling the FMCSA census the same way the handler
    # does, then fill.
    from scripts.workers.services.mcs150_update import MCS150UpdateService

    svc = MCS150UpdateService()
    census = svc._fetch_carrier_record(dot)
    if not census:
        print(f"FAIL: no FMCSA census for DOT {dot}")
        return 2

    intake = dict(census)
    intake.setdefault("signer_name", "Test Signer")
    intake.setdefault("signer_title", "Owner")

    path = fill_mcs150(intake, order_number="CO-VERIFY")
    rendered = _appearance_text(PdfReader(path))

    # What we expect to be visible on the rendered form.
    checks = {
        "1bizName": intake.get("legal_name", ""),
        "3principalStreet": intake.get("address_street", ""),
        "4principalCity": intake.get("address_city", ""),
        "5principalState": intake.get("address_state", ""),
        "6principalZip": intake.get("address_zip", ""),
        "16usdotNumber": intake.get("dot_number", ""),
        "19irsNumber": intake.get("ein", ""),
        "certifyName": intake.get("signer_name", ""),
    }

    def _squash(value: object) -> str:
        # Compare case- and space-insensitively. str() keeps the check from
        # crashing when a census value is not a string (e.g. a numeric DOT
        # number or EIN).
        return str(value).upper().replace(" ", "")

    print(f"=== MCS-150 render verification for DOT {dot} ===")
    print(f"PDF: {path}")
    print(f"census fields: {sorted(census.keys())}\n")

    failures = []
    for field, expected in checks.items():
        if not expected:
            # Nothing to verify when the intake has no value for this field.
            continue
        got = rendered.get(field, "")
        ok = _squash(expected) in _squash(got)
        flag = "OK  " if ok else "MISS"
        print(f"  [{flag}] {field}: expected '{expected}' | rendered '{got}'")
        if not ok:
            failures.append(field)

    if failures:
        print(f"\nFAIL: {len(failures)} field(s) not rendered: {failures}")
        return 1
    print("\nPASS: all expected values render in their appearance streams.")
    return 0
if __name__ == "__main__":
    # Hand main()'s status code to the interpreter as the process exit code.
    raise SystemExit(main())