diff --git a/scripts/document_gen/templates/mcs150_pdf_filler.py b/scripts/document_gen/templates/mcs150_pdf_filler.py index f52936e..d3a72ed 100644 --- a/scripts/document_gen/templates/mcs150_pdf_filler.py +++ b/scripts/document_gen/templates/mcs150_pdf_filler.py @@ -209,23 +209,19 @@ def fill_mcs150(intake: dict, order_number: str = "") -> str: checkbox_on["certifyBox"] = True # ── Apply fields to PDF ────────────────────────────────────────── - # Update text fields - writer.update_page_form_field_values( - writer.pages[0], - {k: v for k, v in field_updates.items() if v}, - auto_regenerate=False, - ) - - # For multi-page forms, try updating all pages - for page_idx in range(len(writer.pages)): + # Apply text-field values to every page. The fields live on pages 9-11 of + # the official form, so we update across all pages (pypdf silently ignores + # field names not present on a given page). auto_regenerate=True makes pypdf + # build appearance streams from the values, so viewers that ignore + # /NeedAppearances (Preview, Chrome) still render the text. + text_values = {k: v for k, v in field_updates.items() if v} + for page in writer.pages: try: writer.update_page_form_field_values( - writer.pages[page_idx], - {k: v for k, v in field_updates.items() if v}, - auto_regenerate=False, + page, text_values, auto_regenerate=True, ) - except Exception: - pass + except Exception as exc: + LOG.debug("Form field apply on page failed: %s", exc) # Apply checkbox fields for field_name, checked in checkbox_on.items(): diff --git a/scripts/document_gen/templates/verify_mcs150.py b/scripts/document_gen/templates/verify_mcs150.py new file mode 100644 index 0000000..34fc3af --- /dev/null +++ b/scripts/document_gen/templates/verify_mcs150.py @@ -0,0 +1,112 @@ +"""Verify an MCS-150 PDF actually RENDERS the expected values. + +OCR tooling (tesseract/pdftoppm/fitz) is not available in the workers +container, so instead of rasterising we verify what a viewer would draw by +reading each text widget's *appearance stream* (/AP /N), which is the literal +content the PDF renders. After fill_mcs150 with auto_regenerate=True, pypdf +writes these streams, so their presence + content is ground truth that the +value will be visible (not just sitting in /V under a blank widget). + +Usage: + python3 -m scripts.document_gen.templates.verify_mcs150 [DOT_NUMBER] + +Exits non-zero if any expected value is missing from the rendered output. +""" +from __future__ import annotations + +import re +import sys + +from pypdf import PdfReader + +from .mcs150_pdf_filler import fill_mcs150 + + +def _appearance_text(reader: PdfReader) -> dict[str, str]: + """Return {field_name: rendered text from its /AP /N stream}.""" + out: dict[str, str] = {} + for page in reader.pages: + annots = page.get("/Annots") + if not annots: + continue + for ref in annots: + obj = ref.get_object() + name = obj.get("/T") + if name is None: + continue + ap = obj.get("/AP") + text = "" + if ap: + n = ap.get_object().get("/N") + if n is not None: + try: + raw = n.get_object().get_data() + # Tj / TJ text operators carry the drawn glyphs. + for m in re.findall(rb"\((.*?)\)\s*Tj", raw): + text += m.decode("latin-1", "ignore") + for m in re.findall(rb"\[(.*?)\]\s*TJ", raw): + for s in re.findall(rb"\((.*?)\)", m): + text += s.decode("latin-1", "ignore") + except Exception: + pass + out[str(name)] = text.strip() + return out + + +def main() -> int: + dot = sys.argv[1] if len(sys.argv) > 1 else "1609564" + + # Simulate enrichment by pulling the FMCSA census the same way the handler + # does, then fill. + from scripts.workers.services.mcs150_update import MCS150UpdateService + + svc = MCS150UpdateService() + census = svc._fetch_carrier_record(dot) + if not census: + print(f"FAIL: no FMCSA census for DOT {dot}") + return 2 + + intake = dict(census) + intake.setdefault("signer_name", "Test Signer") + intake.setdefault("signer_title", "Owner") + + path = fill_mcs150(intake, order_number="CO-VERIFY") + reader = PdfReader(path) + rendered = _appearance_text(reader) + + # What we expect to be visible on the rendered form. + checks = { + "1bizName": intake.get("legal_name", ""), + "3principalStreet": intake.get("address_street", ""), + "4principalCity": intake.get("address_city", ""), + "5principalState": intake.get("address_state", ""), + "6principalZip": intake.get("address_zip", ""), + "16usdotNumber": intake.get("dot_number", ""), + "19irsNumber": intake.get("ein", ""), + "certifyName": intake.get("signer_name", ""), + } + + print(f"=== MCS-150 render verification for DOT {dot} ===") + print(f"PDF: {path}") + print(f"census fields: {sorted(census.keys())}\n") + + failures = [] + for field, expected in checks.items(): + if not expected: + continue + got = rendered.get(field, "") + ok = expected.upper().replace(" ", "") in got.upper().replace(" ", "") + flag = "OK " if ok else "MISS" + print(f" [{flag}] {field}: expected '{expected}' | rendered '{got}'") + if not ok: + failures.append(field) + + if failures: + print(f"\nFAIL: {len(failures)} field(s) not rendered: {failures}") + return 1 + print("\nPASS: all expected values render in their appearance streams.") + return 0 + + +if __name__ == "__main__": + sys.exit(main())