"""Verify an MCS-150 PDF actually RENDERS the expected values.

OCR tooling (tesseract/pdftoppm/fitz) is not available in the workers
container, so instead of rasterising we verify what a viewer would draw by
reading each text widget's *appearance stream* (/AP /N), which is the literal
content the PDF renders. After fill_mcs150 with auto_regenerate=True, pypdf
writes these streams, so their presence + content is ground truth that the
value will be visible (not just sitting in /V under a blank widget).

Usage:
    python3 -m scripts.document_gen.templates.verify_mcs150 [DOT_NUMBER]

Exits non-zero if any expected value is missing from the rendered output.
"""

from __future__ import annotations

import re
import sys

from pypdf import PdfReader

from .mcs150_pdf_filler import fill_mcs150

# A PDF literal string: "(" ... ")" where the body may contain backslash
# escapes and one level of balanced, unescaped parentheses.
_PDF_LITERAL = rb"\((?:\\.|[^\\()]|\((?:\\.|[^\\()])*\))*\)"
_LITERAL_RE = re.compile(_PDF_LITERAL, re.DOTALL)

# Text-showing operators, matched in a single pass so that the extracted text
# keeps the order in which the stream draws it:
#   (string) Tj            -> group "single"
#   [(str) num (str)] TJ   -> group "array"
_SHOW_TEXT_RE = re.compile(
    rb"(?P<single>" + _PDF_LITERAL + rb")\s*Tj"
    rb"|\[(?P<array>(?:" + _PDF_LITERAL + rb"|[^\]()])*)\]\s*TJ",
    re.DOTALL,
)

# Backslash escapes inside a literal string: 1-3 octal digits, a line
# continuation, or a single escaped character.
_ESCAPE_RE = re.compile(rb"\\([0-7]{1,3}|\r\n|.)", re.DOTALL)
_SIMPLE_ESCAPES = {
    b"n": b"\n",
    b"r": b"\r",
    b"t": b"\t",
    b"b": b"\b",
    b"f": b"\f",
    b"(": b"(",
    b")": b")",
    b"\\": b"\\",
}
_LINE_CONTINUATIONS = (b"\r\n", b"\r", b"\n")


def _unescape_pdf_literal(body: bytes) -> bytes:
    """Decode the backslash escapes of a PDF literal string body.

    *body* is the content between the outer parentheses. Without this step a
    rendered value such as ``ACME (USA)`` is read back as ``ACME \\(USA\\)``
    and can never match the expected text.
    """

    def _replace(match: re.Match[bytes]) -> bytes:
        token = match.group(1)
        if token[:1] in b"01234567":
            # Octal character code; high-order overflow is discarded.
            return bytes([int(token, 8) & 0xFF])
        if token in _LINE_CONTINUATIONS:
            # Backslash + end-of-line continues the string on the next line.
            return b""
        # For an unrecognised escape only the backslash is dropped.
        return _SIMPLE_ESCAPES.get(token, token)

    return _ESCAPE_RE.sub(_replace, body)


def _shown_text(stream: bytes) -> str:
    """Return the text drawn by the Tj / TJ operators of a content stream.

    Strings are concatenated in stream order and decoded as latin-1. Hex
    strings (``<...>``) and the ``'`` / ``"`` operators are not extracted.
    """
    pieces: list[str] = []
    for shown in _SHOW_TEXT_RE.finditer(stream):
        single = shown.group("single")
        if single is not None:
            literals = [single]
        else:
            # Kerning numbers between the strings of a TJ array are skipped.
            literals = _LITERAL_RE.findall(shown.group("array"))
        for literal in literals:
            body = _unescape_pdf_literal(literal[1:-1])
            pieces.append(body.decode("latin-1", "ignore"))
    return "".join(pieces)


def _appearance_text(reader: PdfReader) -> dict[str, str]:
    """Return {field_name: rendered text from its /AP /N stream}.

    Every annotation carrying a /T name gets an entry. The value is an empty
    string when the annotation has no /AP /N stream, when the stream cannot be
    read (a warning is printed to stderr), or when it draws no text. If
    several annotations share a name, the last one visited wins.
    """
    rendered: dict[str, str] = {}
    for page in reader.pages:
        annots = page.get("/Annots")
        if not annots:
            continue
        for ref in annots:
            annot = ref.get_object()
            name = annot.get("/T")
            if name is None:
                continue
            text = ""
            appearance = annot.get("/AP")
            normal = appearance.get_object().get("/N") if appearance else None
            if normal is not None:
                try:
                    stream = normal.get_object().get_data()
                except Exception as exc:
                    # Best effort: an unreadable stream is reported and then
                    # treated as "nothing rendered" so the run can continue.
                    print(
                        f"WARN: could not read appearance stream for "
                        f"{name}: {exc}",
                        file=sys.stderr,
                    )
                else:
                    text = _shown_text(stream)
            rendered[str(name)] = text.strip()
    return rendered


def _normalize(value: str) -> str:
    """Upper-case *value* and drop spaces for a layout-insensitive compare."""
    return value.upper().replace(" ", "")


def main() -> int:
    """Fill an MCS-150 for one DOT number and check the rendered fields.

    The DOT number is taken from ``sys.argv[1]`` and defaults to "1609564".

    Returns:
        0 when every expected value is found in its appearance stream,
        1 when at least one is missing,
        2 when no census record could be fetched.
    """
    dot = sys.argv[1] if len(sys.argv) > 1 else "1609564"

    # Simulate enrichment by pulling the FMCSA census the same way the handler
    # does, then fill.
    from scripts.workers.services.mcs150_update import MCS150UpdateHandler

    svc = MCS150UpdateHandler()
    census = svc._fetch_carrier_record(dot)
    if not census:
        print(f"FAIL: no FMCSA census for DOT {dot}")
        return 2

    intake = dict(census)
    intake.setdefault("signer_name", "Test Signer")
    intake.setdefault("signer_title", "Owner")

    path = fill_mcs150(intake, order_number="CO-VERIFY")
    reader = PdfReader(path)
    rendered = _appearance_text(reader)

    # What we expect to be visible on the rendered form.
    checks = {
        "1bizName": intake.get("legal_name", ""),
        "3principalStreet": intake.get("address_street", ""),
        "4principalCity": intake.get("address_city", ""),
        "5principalState": intake.get("address_state", ""),
        "6principalZip": intake.get("address_zip", ""),
        "16usdotNumber": intake.get("dot_number", ""),
        "19irsNumber": intake.get("ein", ""),
        "certifyName": intake.get("signer_name", ""),
    }

    print(f"=== MCS-150 render verification for DOT {dot} ===")
    print(f"PDF: {path}")
    print(f"census fields: {sorted(census.keys())}\n")

    failures = []
    for field, raw_expected in checks.items():
        if not raw_expected:
            continue
        # The record may carry non-string values (e.g. a numeric DOT number);
        # compare on their text form instead of crashing on .upper().
        expected = str(raw_expected)
        got = rendered.get(field, "")
        ok = _normalize(expected) in _normalize(got)
        flag = "OK " if ok else "MISS"
        print(f" [{flag}] {field}: expected '{expected}' | rendered '{got}'")
        if not ok:
            failures.append(field)

    if failures:
        print(f"\nFAIL: {len(failures)} field(s) not rendered: {failures}")
        return 1

    print("\nPASS: all expected values render in their appearance streams.")
    return 0


if __name__ == "__main__":
    sys.exit(main())