"""
fcc_rmd_auditor.py — Audit existing FCC RMD filings for deficiencies.

Three analysis layers:
  Layer 1 — Structured data checks against local fcc_rmd table (fast, no network)
  Layer 2 — Download certification PDF from ServiceNow Attachment API (or Playwright)
  Layer 3 — Analyze PDF content for missing required sections (regex first, Ollama fallback)

Usage:
  # Audit a single FRN (structured + PDF):
  python -m workers.fcc_rmd_auditor --frn 0012345678

  # Structured checks only (no PDF download):
  python -m workers.fcc_rmd_auditor --frn 0012345678 --no-pdf

  # Batch audit (structured only, fast):
  python -m workers.fcc_rmd_auditor --batch --no-pdf

  # Batch with PDF (slow, ~2s/record):
  python -m workers.fcc_rmd_auditor --batch --limit 100

  # JSON output for piping:
  python -m workers.fcc_rmd_auditor --frn 0012345678 --json

Environment variables:
  DATABASE_URL    PostgreSQL connection string
"""

from __future__ import annotations

import argparse
import json
import logging
import os
import re
import sys
import tempfile
import time
from datetime import date, datetime, timezone
from typing import Optional
from urllib.parse import urlparse, parse_qs

import psycopg2
import psycopg2.extras
import requests

LOG = logging.getLogger("workers.fcc_rmd_auditor")
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(name)s] %(levelname)s %(message)s",
    stream=sys.stdout,
)

DATABASE_URL = os.environ.get("DATABASE_URL", "")
SERVICENOW_BASE = "https://fccprod.servicenowservices.com"
ATTACHMENT_API = f"{SERVICENOW_BASE}/api/now/attachment"
RMD_TABLE_NAME = "x_g_fmc_rmd_robocall_mitigation_database"
USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"
)


# ═══════════════════════════════════════════════════════════════════════
# Finding data structure
# ═══════════════════════════════════════════════════════════════════════

def _finding(
    check_id: str,
    severity: str,
    label: str,
    detail: str,
) -> dict:
    """Build one finding record.

    Returns a dict with the keys ``id``, ``severity``, ``label`` and
    ``detail``; this is the shape serialized to JSON and rendered in reports.
    """
    return {
        "id": check_id,
        "severity": severity,
        "label": label,
        "detail": detail,
    }


# Lower rank == more severe. Anything not listed (including "clean") ranks 99.
_SEVERITY_RANK = {"critical": 0, "major": 1, "minor": 2}


def _worst_severity(findings: list[dict]) -> str:
    """Return the most severe ``severity`` among *findings*.

    Returns ``"clean"`` when the list is empty or contains only severities
    that are not one of critical/major/minor.
    """
    worst = "clean"
    for f in findings:
        s = f["severity"]
        if _SEVERITY_RANK.get(s, 99) < _SEVERITY_RANK.get(worst, 99):
            worst = s
    return worst


# ═══════════════════════════════════════════════════════════════════════
# Layer 1 — Structured data checks
# ═══════════════════════════════════════════════════════════════════════

def _coerce_date(value) -> Optional[date]:
    """Convert a DB/CSV value to a ``date``; return None when it cannot be read.

    Accepts ``date``/``datetime`` objects and ``YYYY-MM-DD`` strings.
    """
    # datetime is a subclass of date, so it has to be tested first.
    if isinstance(value, datetime):
        return value.date()
    if isinstance(value, date):
        return value
    if isinstance(value, str):
        try:
            return datetime.strptime(value, "%Y-%m-%d").date()
        except ValueError:
            return None
    return None


def _check_structured(row: dict, removed_frns: set[str]) -> list[dict]:
    """Run rule-based checks against a single fcc_rmd row.

    Args:
        row: One fcc_rmd record as a dict. Keys read here: last_recertified,
            voice_service_provider, gateway_provider, intermediate_provider,
            implementation, frn, removed_from_rmd, red_light_status.
        removed_frns: FRNs present in the removal list, used when the row
            itself is not flagged as removed.

    Returns:
        A list of finding dicts (possibly empty). No network or DB access.
    """
    findings: list[dict] = []
    today = date.today()

    # ── Certification staleness ──────────────────────────────────────
    raw_recert = row.get("last_recertified")
    last_recert = _coerce_date(raw_recert)
    if last_recert is not None:
        # Whole-month difference; the day of month is deliberately ignored.
        months_ago = (today.year - last_recert.year) * 12 + (today.month - last_recert.month)
        if months_ago > 13:
            findings.append(_finding(
                "stale_cert", "critical",
                "Stale Certification",
                f"Last recertified {last_recert.isoformat()} — {months_ago} months ago. "
                "Annual recertification required by March 1.",
            ))
        elif months_ago >= 10:
            findings.append(_finding(
                "expiring_cert", "major",
                "Certification Expiring Soon",
                f"Last recertified {last_recert.isoformat()} — {months_ago} months ago. "
                "Recertification due within 3 months.",
            ))
    elif raw_recert:
        # A value exists but is not a usable date. Previously this case was
        # silently skipped (or crashed for non-string values).
        findings.append(_finding(
            "invalid_recert_date", "major",
            "Unparseable Recertification Date",
            f"last_recertified value {raw_recert!r} is not a valid YYYY-MM-DD date. "
            "Unable to determine certification currency.",
        ))
    else:
        findings.append(_finding(
            "no_recert_date", "major",
            "No Recertification Date",
            "No last_recertified date on file. Unable to determine certification currency.",
        ))

    # ── Provider classification ──────────────────────────────────────
    vsp = bool(row.get("voice_service_provider"))
    gw = bool(row.get("gateway_provider"))
    inter = bool(row.get("intermediate_provider"))

    if not vsp and not gw and not inter:
        findings.append(_finding(
            "no_classification", "critical",
            "No Provider Classification",
            "None of Voice Service Provider, Gateway, or Intermediate Provider "
            "is selected. At least one classification is required.",
        ))

    # Note: having all three (VSP + gateway + intermediate) is valid for
    # large carriers like Peerless Network. Only flag gateway + intermediate
    # without VSP, which is unusual.
    if gw and inter and not vsp:
        findings.append(_finding(
            "conflicting_classification", "minor",
            "Unusual Provider Classification",
            "Both Gateway Provider and Non-Gateway Intermediate Provider are "
            "selected without Voice Service Provider. Verify this is correct — "
            "most providers are one or the other.",
        ))

    # ── STIR/SHAKEN consistency ──────────────────────────────────────
    impl = (row.get("implementation") or "").lower()

    if vsp and not gw and not inter:
        # VSPs should implement STIR/SHAKEN unless exempt small carrier
        if "robocall mitigation" in impl and "partial" not in impl and "complete" not in impl:
            findings.append(_finding(
                "ss_vsp_no_shaken", "major",
                "VSP Without STIR/SHAKEN",
                "Voice Service Provider selected 'Robocall Mitigation Only' but "
                "VSPs are generally required to implement STIR/SHAKEN unless they "
                "qualify for the small carrier exemption.",
            ))

    if inter and not vsp and not gw:
        # Intermediate-only providers don't originate calls → can't do "complete" STIR/SHAKEN
        if "complete" in impl and "partial" not in impl:
            findings.append(_finding(
                "ss_intermediate_complete", "major",
                "Intermediate Provider Claims Complete STIR/SHAKEN",
                "Non-gateway intermediate provider claims Complete STIR/SHAKEN "
                "Implementation, but intermediates don't originate calls and "
                "therefore can't sign them with STIR/SHAKEN attestation.",
            ))

    if "partial" in impl:
        # Partial implementation should reference an upstream provider
        # We can't check this from structured data alone — flag as informational
        findings.append(_finding(
            "ss_partial_note", "minor",
            "Partial STIR/SHAKEN — Verify Upstream Provider",
            "Partial STIR/SHAKEN implementation declared. Ensure the filing "
            "names the upstream provider responsible for signing calls on "
            "non-SIP portions of the network.",
        ))

    # Note: contact_email/name are often NULL in our local DB because the RMD
    # CSV doesn't include them (requires separate scrape). Not a filing deficiency.

    # ── Removed from RMD ─────────────────────────────────────────────
    frn = (row.get("frn") or "").strip()
    if row.get("removed_from_rmd"):
        findings.append(_finding(
            "removed_from_rmd", "critical",
            "Removed from RMD",
            "This provider has been removed from the Robocall Mitigation Database. "
            "Downstream carriers are required to block their traffic.",
        ))
    elif frn and frn in removed_frns:
        findings.append(_finding(
            "removed_from_rmd", "critical",
            "Removed from RMD (enforcement action)",
            "This provider appears in the FCC's RMD removal list. "
            "A deficiency or enforcement action has been taken.",
        ))

    # ── Red light (CORES financial delinquency) ──────────────────────
    if row.get("red_light_status") == "red":
        findings.append(_finding(
            "red_light", "critical",
            "CORES Red Light Status",
            "FRN has outstanding delinquent debts to the FCC. "
            "Red-light status blocks certain filings and authorizations.",
        ))

    return findings


# ═══════════════════════════════════════════════════════════════════════
# Layer 2 — PDF download
# ═══════════════════════════════════════════════════════════════════════

def _extract_sys_id(filing_url: str) -> Optional[str]:
    """Return the ``sys_id`` query parameter of *filing_url*, or None."""
    if not filing_url:
        return None
    try:
        qs = parse_qs(urlparse(filing_url).query)
        ids = qs.get("sys_id", [])
        return ids[0] if ids else None
    except Exception:
        # Best-effort: a malformed URL simply means "no sys_id available".
        return None


def _find_attachment_id(obj):
    """Depth-first search of a decoded JSON value for ``attachments.pdf``.

    Returns the first non-empty ``pdf`` value found under a dict-valued
    ``attachments`` key, or None when there is none.
    """
    if isinstance(obj, dict):
        if "attachments" in obj and isinstance(obj["attachments"], dict):
            pdf_id = obj["attachments"].get("pdf", "")
            if pdf_id:
                return pdf_id
        for v in obj.values():
            r = _find_attachment_id(v)
            if r:
                return r
    elif isinstance(obj, list):
        for v in obj:
            r = _find_attachment_id(v)
            if r:
                return r
    return None


def _get_pdf_attachment_sys_id(record_sys_id: str) -> Optional[str]:
    """Query the SP page API (unauthenticated) to get the PDF attachment sys_id.

    Returns None when the request fails, the response is not JSON, or no
    attachment id is present in the widget tree. Failures are logged, never
    raised.
    """
    url = (
        f"{SERVICENOW_BASE}/api/now/sp/page"
        f"?id=rmd_form&table={RMD_TABLE_NAME}"
        f"&sys_id={record_sys_id}&view=sp&time=1"
        f"&portal_id=ac2856301b92681048c6ed7bbc4bcb27"
        f"&request_uri=%2Frmd"
    )
    try:
        resp = requests.get(
            url,
            headers={"User-Agent": USER_AGENT, "Accept": "application/json"},
            timeout=30,
        )
        resp.raise_for_status()
        data = resp.json()

        # Walk the nested widget structure to find attachments.pdf
        att_id = _find_attachment_id(data)
        if att_id:
            LOG.info("Found PDF attachment sys_id=%s for record=%s", att_id, record_sys_id)
        else:
            LOG.info("No PDF attachment found for record=%s", record_sys_id)
        return att_id

    except Exception as exc:
        LOG.warning("SP API query failed: %s", exc)
        return None


def _looks_like_html(head: bytes) -> bool:
    """True when the first 20 bytes (ignoring leading whitespace) start with ``<``."""
    return head[:20].lstrip().startswith(b"<")


def _download_pdf_via_playwright(
    record_sys_id: str, attachment_sys_id: str, dest_dir: str
) -> Optional[str]:
    """Use Playwright to download the PDF attachment from the RMD portal.

    ServiceNow attachment APIs require authentication, but the portal page
    renders publicly with the PDF embedded. We intercept the browser's
    network requests to capture the attachment download.

    Strategy, in order:
      1. Capture a 200 response whose URL contains *attachment_sys_id*.
      2. Click candidate attachment links and save the resulting download.
      3. Print the portal page itself to PDF (form data only, not the
         original document).

    Returns:
        Path of the file written inside *dest_dir*, or None when Playwright
        is not installed or the browser session fails.
    """
    try:
        from playwright.sync_api import sync_playwright
    except ImportError:
        LOG.warning("Playwright not available — skipping PDF download")
        return None

    dest_path = os.path.join(dest_dir, f"rmd_cert_{attachment_sys_id}.pdf")
    portal_url = (
        f"{SERVICENOW_BASE}/rmd"
        f"?id=rmd_form&table={RMD_TABLE_NAME}"
        f"&sys_id={record_sys_id}"
    )

    captured_content: list[bytes] = []

    try:
        with sync_playwright() as p:
            browser = p.chromium.launch(headless=True)
            try:
                ctx = browser.new_context(user_agent=USER_AGENT)

                # Intercept network requests for attachment downloads
                def on_response(response):
                    if attachment_sys_id not in response.url or response.status != 200:
                        return
                    try:
                        body = response.body()
                    except Exception as exc:
                        # Bodies of redirected/aborted responses are not
                        # retrievable; that is expected, so only debug-log.
                        LOG.debug("Could not read response body for %s: %s", response.url, exc)
                        return
                    if len(body) > 500 and not _looks_like_html(body):
                        captured_content.append(body)

                page = ctx.new_page()
                page.on("response", on_response)

                # Navigate to the filing page — the portal loads the PDF viewer
                page.goto(portal_url, wait_until="networkidle", timeout=45000)
                page.wait_for_timeout(3000)

                if captured_content:
                    with open(dest_path, "wb") as f:
                        f.write(captured_content[0])
                    LOG.info("Captured PDF from network: %s (%d bytes)",
                             dest_path, len(captured_content[0]))
                    return dest_path

                # Fallback: try clicking on the PDF link if visible
                pdf_links = page.query_selector_all(f"a[href*='{attachment_sys_id}']")
                if not pdf_links:
                    pdf_links = page.query_selector_all("a.attachment-link, a[data-type='pdf']")
                if not pdf_links:
                    pdf_links = page.query_selector_all("a[href*='sys_attachment']")

                for link in pdf_links:
                    try:
                        with page.expect_download(timeout=15000) as dl_info:
                            link.click()
                        download = dl_info.value
                        download.save_as(dest_path)
                        # Verify not HTML
                        with open(dest_path, "rb") as f:
                            header = f.read(20)
                        if not _looks_like_html(header):
                            LOG.info("Downloaded PDF via link click: %s", dest_path)
                            return dest_path
                    except Exception as exc:
                        LOG.debug("Link-click download attempt failed: %s", exc)
                        continue

                # Last resort: capture the entire page as a PDF
                # (won't have the original document content but captures the form data)
                page.pdf(path=dest_path, format="Letter")
                LOG.info("Captured filing page as PDF fallback: %s", dest_path)
                return dest_path
            finally:
                # Always release the browser, including on exception paths.
                try:
                    browser.close()
                except Exception as exc:
                    LOG.debug("browser.close() failed: %s", exc)

    except Exception as exc:
        LOG.warning("Playwright PDF download failed: %s", exc)
        return None


def download_certification_pdf(
    filing_url: Optional[str], sys_id: Optional[str], dest_dir: str
) -> Optional[str]:
    """Download the RMD certification document.

    1. Query SP page API (unauthenticated) to get the PDF attachment sys_id
    2. Download via Playwright (sys_attachment.do requires a session cookie)

    Args:
        filing_url: Portal URL of the filing; used only to derive *sys_id*
            when that is not supplied.
        sys_id: ServiceNow record sys_id, if already known.
        dest_dir: Existing directory the PDF is written into.

    Returns:
        Path of the downloaded file, or None when no sys_id / attachment is
        available or the download fails.
    """
    if not sys_id and filing_url:
        sys_id = _extract_sys_id(filing_url)
    if not sys_id:
        LOG.info("No sys_id available — cannot download PDF")
        return None

    # Step 1: Get the PDF attachment sys_id from the SP API
    att_sys_id = _get_pdf_attachment_sys_id(sys_id)
    if not att_sys_id:
        LOG.info("No PDF attachment for record sys_id=%s", sys_id)
        return None

    # Step 2: Download via Playwright (needs both record and attachment sys_ids)
    return _download_pdf_via_playwright(sys_id, att_sys_id, dest_dir)


# ═══════════════════════════════════════════════════════════════════════
# Layer 3 — PDF content analysis
# ═══════════════════════════════════════════════════════════════════════

# Required sections and their keyword indicators.
# Every keyword is passed to re.search() against the lower-cased document
# text, so entries are regular expressions (most are plain literals).
REQUIRED_SECTIONS: list[tuple[str, str, str, list[str]]] = [
    # (check_id, severity, label, keywords)
    (
        "missing_provider_id", "major",
        "Missing Provider Identification",
        [r"\bfrn\b", r"\b\d{10}\b", "fcc registration number", "registration number"],
    ),
    (
        "missing_classification", "major",
        "Missing Provider Classification",
        ["voice service provider", "gateway provider", "intermediate provider",
         "provider classification", "provider type"],
    ),
    (
        "missing_stir_shaken", "major",
        "Missing STIR/SHAKEN Details",
        ["stir/shaken", "stir-shaken", "stirshaken", "sti certificate",
         "sti-ca", "spc token", "attestation"],
    ),
    (
        "missing_mitigation", "major",
        "Missing Robocall Mitigation Program",
        ["robocall mitigation", "call blocking", "call analytics", "monitoring",
         "mitigation program", "call pattern"],
    ),
    (
        "missing_kyc", "major",
        "Missing KYC Procedures",
        ["know your customer", "know-your-customer", r"\bkyc\b", "customer vetting",
         "identity verif", "customer verification", "due diligence"],
    ),
    (
        "missing_traceback", "major",
        "Missing Traceback Commitment",
        ["traceback", r"\bitg\b", "industry traceback", "24 hour", "24-hour",
         "ustelecom"],
    ),
    (
        "missing_enforcement", "minor",
        "Missing Enforcement History Disclosure",
        ["enforcement", "citation", "forfeiture", "consent decree",
         "adverse finding", "no pending"],
    ),
    (
        "missing_recertification", "minor",
        "Missing Recertification Acknowledgment",
        ["recertif", "annual certification", "march 1", "march 2", "annually"],
    ),
    (
        "missing_material_change", "major",
        "Missing Material Change Update Commitment",
        ["material change", "10 business day", "10-business-day",
         "update.*within", "update.*filing", "promptly update"],
    ),
    (
        "missing_perjury", "minor",
        "Missing Perjury Declaration in Document",
        ["perjury", "penalty of perjury", "true and correct", "true, complete",
         "under penalty"],
    ),
    (
        "missing_dno", "minor",
        "Missing DNO List Reference",
        ["do-not-originate", r"\bdno\b", "do not originate", "dno list"],
    ),
]


def _extract_pdf_text(pdf_path: str) -> str:
    """Extract text from a PDF using pdfplumber.

    Returns the page texts joined by newlines, or ``""`` when pdfplumber is
    not installed or the file cannot be parsed (both cases are logged).
    """
    try:
        import pdfplumber
    except ImportError:
        LOG.warning("pdfplumber not installed — pip install pdfplumber")
        return ""

    try:
        with pdfplumber.open(pdf_path) as pdf:
            pages = [page.extract_text() or "" for page in pdf.pages]
        return "\n".join(pages)
    except Exception as exc:
        LOG.warning("PDF text extraction failed: %s", exc)
        return ""


def _check_pdf_regex(text: str) -> list[dict]:
    """Tier 1: regex/keyword matching for required sections.

    Returns one finding per REQUIRED_SECTIONS entry with no matching keyword,
    or a single ``pdf_empty`` finding when *text* is blank.
    """
    if not text.strip():
        return [_finding(
            "pdf_empty", "critical",
            "Empty or Unreadable PDF",
            "The certification document could not be read or contained no extractable text.",
        )]

    findings: list[dict] = []
    text_lower = text.lower()

    for check_id, severity, label, keywords in REQUIRED_SECTIONS:
        if any(re.search(kw, text_lower) for kw in keywords):
            continue
        # Hide regex-escaped keywords from the user-facing list. Built outside
        # the f-string: a backslash inside an f-string replacement field is a
        # SyntaxError before Python 3.12.
        shown = ", ".join(kw for kw in keywords if not kw.startswith("\\"))
        findings.append(_finding(
            check_id, severity, label,
            "No reference to this required section found in the certification document. "
            f"Searched for: {shown}.",
        ))

    return findings


# Corporate suffixes ignored when matching a business name against the PDF.
_CORP_SUFFIX_RE = re.compile(r"\b(?:inc|llc|llp|corp|co|ltd)\b\.?")


def _normalize_for_name_match(value: str) -> str:
    """Lower-case, drop corporate suffixes and punctuation, collapse whitespace.

    Applied to both the business name and the document text so "Acme, Inc."
    matches "ACME INC" even across a PDF line break.
    """
    stripped = _CORP_SUFFIX_RE.sub(" ", value.lower())
    stripped = re.sub(r"[^\w&]+", " ", stripped)
    return " ".join(stripped.split())


def _check_pdf_crossref(
    text: str, row: dict
) -> list[dict]:
    """Cross-reference structured data selections against PDF content.

    Checks the declared STIR/SHAKEN status, the most recent year mentioned in
    the document, and whether the business name appears in it.
    """
    findings: list[dict] = []
    text_lower = text.lower()
    impl = (row.get("implementation") or "").lower()

    # Check: PDF mentions a different STIR/SHAKEN status than structured data
    if "complete" in impl and "partial" not in impl:
        if "partial" in text_lower and "complete" not in text_lower:
            findings.append(_finding(
                "xref_ss_mismatch", "major",
                "STIR/SHAKEN Status Mismatch",
                "Structured data says 'Complete STIR/SHAKEN' but the uploaded "
                "certification document appears to reference 'partial' implementation.",
            ))
    elif "partial" in impl:
        if "complete implementation" in text_lower and "partial" not in text_lower:
            findings.append(_finding(
                "xref_ss_mismatch", "major",
                "STIR/SHAKEN Status Mismatch",
                "Structured data says 'Partial Implementation' but the uploaded "
                "certification document appears to reference 'complete' implementation.",
            ))

    # Check: document date is from a prior year
    years_in_doc = {int(y) for y in re.findall(r"\b(20\d{2})\b", text)}
    current_year = date.today().year
    if years_in_doc and max(years_in_doc) < current_year - 1:
        findings.append(_finding(
            "xref_old_document", "major",
            "Outdated Certification Document",
            f"The most recent year referenced in the document is {max(years_in_doc)}, "
            f"which is more than one year behind the current year ({current_year}). "
            f"The plan may not reflect current {current_year} requirements.",
        ))

    # Check: business name mismatch
    biz_name = (row.get("business_name") or "").lower().strip()
    if biz_name and len(biz_name) > 3:
        # Normalize: remove Inc., LLC, etc. and punctuation on both sides, so a
        # trailing "." or "," left by the suffix cannot cause a false mismatch.
        biz_core = _normalize_for_name_match(biz_name)
        if biz_core and biz_core not in _normalize_for_name_match(text):
            findings.append(_finding(
                "xref_name_mismatch", "minor",
                "Business Name Not Found in Document",
                f"The RMD business name '{row.get('business_name')}' was not found "
                "in the uploaded certification document. The document may belong "
                "to a different entity.",
            ))

    return findings


def _check_pdf_ollama(text: str) -> list[dict]:
    """Tier 2: Use Ollama LLM for ambiguous documents.

    Only called when regex finds < 6/11 sections or text is very short.

    Returns one ``llm_*`` finding per section the model reports as absent.
    Returns ``[]`` when the client is unavailable, the model is unreachable,
    the reply is not a JSON list, or the model reports nothing missing —
    callers cannot distinguish these cases.
    """
    try:
        from scripts.ollama_client import generate, start_tunnel, warmup
    except ImportError:
        LOG.warning("Ollama client not available — skipping LLM analysis")
        return []

    if not start_tunnel():
        LOG.warning("Ollama not reachable — skipping LLM analysis")
        return []

    warmup()

    # Truncate text to fit model context (qwen2.5:7b ~ 8K tokens)
    truncated = text[:6000]

    section_names = [label for _, _, label, _ in REQUIRED_SECTIONS]
    prompt = (
        "Analyze this FCC Robocall Mitigation Database (RMD) certification document.\n"
        "Determine which of these required sections are PRESENT or ABSENT:\n\n"
        + "\n".join(f"- {s}" for s in section_names)
        + "\n\n"
        "Document text:\n"
        "---\n"
        f"{truncated}\n"
        "---\n\n"
        "Respond in JSON format as a list of objects:\n"
        '[{"section": "...", "present": true/false, "reason": "brief explanation"}]\n'
        "Only output the JSON array, nothing else."
    )

    try:
        raw = generate(
            prompt,
            system="You are an FCC regulatory compliance auditor. Respond only in valid JSON.",
            max_tokens=1500,
            temperature=0.1,
        )
        # Try to parse JSON from the response
        # Strip markdown code fences if present (with or without a "json" tag)
        raw = re.sub(r"^```(?:json)?\s*", "", raw.strip())
        raw = re.sub(r"\s*```$", "", raw.strip())
        sections = json.loads(raw)

        if not isinstance(sections, list):
            LOG.warning("Ollama returned non-list JSON — ignoring")
            return []

        findings: list[dict] = []
        for item in sections:
            if not isinstance(item, dict):
                continue
            if item.get("present") is False:
                # Coerce to str so one odd value cannot discard every finding.
                section_name = str(item.get("section") or "Unknown Section")
                # Derive a stable id from the section label
                check_id = "llm_" + re.sub(r"[^a-z0-9]+", "_", section_name.lower()).strip("_")
                findings.append(_finding(
                    check_id, "major",
                    f"Missing: {section_name} (LLM analysis)",
                    item.get("reason", "Section not found in document."),
                ))
        return findings

    except Exception as exc:
        # Covers json.JSONDecodeError as well as client/transport errors.
        LOG.warning("Ollama analysis failed: %s", exc)
        return []


def analyze_pdf(
    pdf_path: str, row: dict, use_ollama: bool = True, text: Optional[str] = None
) -> list[dict]:
    """Full PDF analysis: extract text → regex → cross-reference → optional Ollama.

    Args:
        pdf_path: Path of the downloaded certification PDF.
        row: The fcc_rmd record, used for cross-reference checks.
        use_ollama: Allow the LLM tier when regex results look ambiguous.
        text: Already-extracted text of *pdf_path*. When None (the default)
            the text is extracted here; passing it avoids parsing twice.

    Returns:
        A list of finding dicts; a single ``pdf_empty`` finding when no text
        could be extracted.
    """
    if text is None:
        text = _extract_pdf_text(pdf_path)
    if not text.strip():
        return [_finding(
            "pdf_empty", "critical",
            "Empty or Unreadable PDF",
            "The certification document could not be read or contained no extractable text. "
            "The file may be scanned/image-based or corrupted.",
        )]

    findings: list[dict] = []

    # Tier 1: regex keyword matching
    regex_findings = _check_pdf_regex(text)
    regex_missing_count = len(regex_findings)

    # Tier 2: Ollama for ambiguous cases
    total_sections = len(REQUIRED_SECTIONS)
    sections_found = total_sections - regex_missing_count

    if use_ollama and (sections_found < 6 or len(text) < 500):
        LOG.info(
            "Regex found %d/%d sections (text=%d chars) — running Ollama analysis",
            sections_found, total_sections, len(text),
        )
        llm_findings = _check_pdf_ollama(text)
        # LLM findings replace regex findings when the LLM produced any;
        # an empty LLM result falls back to the regex findings.
        findings.extend(llm_findings if llm_findings else regex_findings)
    else:
        findings.extend(regex_findings)

    # Cross-reference: structured data vs PDF content
    findings.extend(_check_pdf_crossref(text, row))

    return findings


# ═══════════════════════════════════════════════════════════════════════
# Main audit function
# ═══════════════════════════════════════════════════════════════════════

def audit_single_filing(
    conn,
    *,
    frn: Optional[str] = None,
    rmd_number: Optional[str] = None,
    include_pdf: bool = True,
    use_ollama: bool = True,
) -> Optional[dict]:
    """Audit a single RMD filing. Returns result dict or None if not found.

    Args:
        conn: Open psycopg2 connection.
        frn: FRN to look up; takes precedence over *rmd_number*.
        rmd_number: RMD number to look up when *frn* is not given.
        include_pdf: Download and analyze the certification PDF.
        use_ollama: Allow the LLM tier during PDF analysis.

    Returns:
        The result dict (also upserted via ``_save_result``), or None when
        neither identifier is given or no matching row exists.
    """
    if not frn and not rmd_number:
        return None

    with conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
        # NOTE: LIMIT 1 without ORDER BY — which row is picked is up to the
        # database if an identifier matches more than one filing.
        if frn:
            cur.execute("SELECT * FROM fcc_rmd WHERE frn = %s LIMIT 1", (frn,))
        else:
            cur.execute("SELECT * FROM fcc_rmd WHERE rmd_number = %s LIMIT 1", (rmd_number,))

        row = cur.fetchone()
        if not row:
            LOG.info("No RMD record found for frn=%s rmd=%s", frn, rmd_number)
            return None
        row = dict(row)

        # Get set of removed FRNs for cross-reference
        cur.execute("SELECT frn FROM fcc_rmd_removed WHERE frn IS NOT NULL")
        removed_frns = {r["frn"] for r in cur.fetchall()}

    frn = row.get("frn", "")
    rmd_num = row.get("rmd_number", "")

    # Layer 1: structured checks
    structured = _check_structured(row, removed_frns)

    # Layer 2+3: PDF analysis
    pdf_findings: list[dict] = []
    pdf_downloaded = False
    pdf_text_length = 0

    if include_pdf:
        sys_id = row.get("servicenow_sys_id") or _extract_sys_id(row.get("filing_url", ""))
        filing_url = row.get("filing_url")

        # The PDF only needs to exist for the duration of the analysis.
        with tempfile.TemporaryDirectory(prefix="rmd_audit_") as tmpdir:
            pdf_path = download_certification_pdf(filing_url, sys_id, tmpdir)
            if pdf_path and os.path.exists(pdf_path):
                pdf_downloaded = True
                text = _extract_pdf_text(pdf_path)
                pdf_text_length = len(text)
                # Hand the text over so the PDF is parsed only once.
                pdf_findings = analyze_pdf(pdf_path, row, use_ollama=use_ollama, text=text)
            else:
                LOG.info("Could not download PDF for %s — structured checks only", rmd_num)

    # Assemble result
    all_findings = structured + pdf_findings
    total = len(all_findings)
    severity = _worst_severity(all_findings) if all_findings else "clean"

    result = {
        "rmd_number": rmd_num,
        "frn": frn,
        "business_name": row.get("business_name", ""),
        "total_deficiencies": total,
        "severity": severity,
        "structured_checks": structured,
        "structured_score": len(structured),
        "pdf_checks": pdf_findings if include_pdf else None,
        "pdf_score": len(pdf_findings) if include_pdf else None,
        "pdf_downloaded": pdf_downloaded,
        "pdf_text_length": pdf_text_length,
        "audited_at": datetime.now(timezone.utc).isoformat(),
    }

    # Cache in DB
    _save_result(conn, row.get("id"), result)

    return result


def _save_result(conn, fcc_rmd_id: Optional[int], result: dict) -> None:
    """Upsert the audit result into fcc_rmd_audit_results.

    Keyed on ``rmd_number``. On conflict the PDF columns keep their previous
    values when this run did not analyze a PDF (``pdf_checks`` is None).
    Commits on success; on failure logs a warning and rolls back. Never
    raises.
    """
    try:
        with conn.cursor() as cur:
            cur.execute(
                """
                INSERT INTO fcc_rmd_audit_results (
                    fcc_rmd_id, rmd_number, frn, business_name,
                    structured_checks, structured_score,
                    pdf_checks, pdf_score, pdf_downloaded, pdf_text_length,
                    total_deficiencies, severity,
                    audited_at, pdf_audited_at, updated_at
                ) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,now())
                ON CONFLICT (rmd_number) DO UPDATE SET
                    fcc_rmd_id = EXCLUDED.fcc_rmd_id,
                    frn = EXCLUDED.frn,
                    business_name = EXCLUDED.business_name,
                    structured_checks = EXCLUDED.structured_checks,
                    structured_score = EXCLUDED.structured_score,
                    pdf_checks = COALESCE(EXCLUDED.pdf_checks, fcc_rmd_audit_results.pdf_checks),
                    pdf_score = COALESCE(EXCLUDED.pdf_score, fcc_rmd_audit_results.pdf_score),
                    pdf_downloaded = EXCLUDED.pdf_downloaded OR fcc_rmd_audit_results.pdf_downloaded,
                    pdf_text_length = COALESCE(EXCLUDED.pdf_text_length, fcc_rmd_audit_results.pdf_text_length),
                    total_deficiencies = EXCLUDED.total_deficiencies,
                    severity = EXCLUDED.severity,
                    audited_at = EXCLUDED.audited_at,
                    pdf_audited_at = CASE WHEN EXCLUDED.pdf_checks IS NOT NULL
                        THEN now() ELSE fcc_rmd_audit_results.pdf_audited_at END,
                    updated_at = now()
                """,
                (
                    fcc_rmd_id,
                    result["rmd_number"],
                    result["frn"],
                    result["business_name"],
                    json.dumps(result["structured_checks"]),
                    result["structured_score"],
                    json.dumps(result["pdf_checks"]) if result.get("pdf_checks") is not None else None,
                    result.get("pdf_score"),
                    result.get("pdf_downloaded", False),
                    result.get("pdf_text_length", 0),
                    result["total_deficiencies"],
                    result["severity"],
                    result["audited_at"],
                    result["audited_at"] if result.get("pdf_checks") is not None else None,
                ),
            )
        conn.commit()
    except Exception as exc:
        LOG.warning("Failed to save audit result: %s", exc)
        try:
            conn.rollback()
        except Exception as rollback_exc:
            # A dead connection must not turn a best-effort cache write into a crash.
            LOG.debug("Rollback after failed save also failed: %s", rollback_exc)
# ═══════════════════════════════════════════════════════════════════════
# Batch mode
# ═══════════════════════════════════════════════════════════════════════

def run_batch(
    conn,
    *,
    include_pdf: bool = False,
    use_ollama: bool = True,
    limit: Optional[int] = None,
    offset: Optional[int] = None,
    year: Optional[int] = None,
    skip_recent: bool = True,
) -> dict:
    """Run structured checks on fcc_rmd records. Returns summary stats.

    Args:
        conn: Open psycopg2 connection.
        include_pdf: Also download and analyze each certification PDF
            (adds a 1 s pause per record as a rate limit).
        use_ollama: Allow the LLM tier during PDF analysis.
        limit: Maximum number of records; falsy means no limit.
        offset: Number of records to skip (ordered by rmd_number).
        year: Only audit rows with last_recertified on/after Jan 1 of *year*.
        skip_recent: Skip records audited within the last 7 days.

    Returns:
        Dict with the counters ``total``, ``clean``, ``minor``, ``major`` and
        ``critical``. Each per-record result is upserted via ``_save_result``.
    """
    clauses = [
        "SELECT * FROM fcc_rmd",
        "WHERE (removed_from_rmd = FALSE OR removed_from_rmd IS NULL)",
    ]
    params: list = []

    # All caller-supplied values are bound parameters, never string-built SQL.
    if year:
        clauses.append("AND last_recertified >= %s")
        params.append(f"{int(year):04d}-01-01")
    if skip_recent:
        # Skip records already audited in the last 7 days.
        # NOT EXISTS rather than NOT IN: NOT IN matches no rows at all if the
        # subquery ever yields a NULL rmd_number.
        clauses.append(
            "AND NOT EXISTS ("
            "SELECT 1 FROM fcc_rmd_audit_results ar "
            "WHERE ar.rmd_number = fcc_rmd.rmd_number "
            "AND ar.audited_at > NOW() - INTERVAL '7 days')"
        )
    clauses.append("ORDER BY rmd_number")
    if offset:
        clauses.append("OFFSET %s")
        params.append(int(offset))
    if limit:
        clauses.append("LIMIT %s")
        params.append(int(limit))

    with conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
        cur.execute(" ".join(clauses), params)
        rows = cur.fetchall()
        LOG.info("Batch auditing %d RMD records (pdf=%s)", len(rows), include_pdf)

        # Get removed FRNs once
        cur.execute("SELECT frn FROM fcc_rmd_removed WHERE frn IS NOT NULL")
        removed_frns = {r["frn"] for r in cur.fetchall()}

    stats = {"total": 0, "clean": 0, "minor": 0, "major": 0, "critical": 0}

    for i, row in enumerate(rows):
        row = dict(row)
        rmd_num = row.get("rmd_number", "")
        frn = row.get("frn", "")

        structured = _check_structured(row, removed_frns)

        pdf_findings: list[dict] = []
        pdf_downloaded = False
        pdf_text_length = 0

        if include_pdf:
            sys_id = row.get("servicenow_sys_id") or _extract_sys_id(row.get("filing_url", ""))
            with tempfile.TemporaryDirectory(prefix="rmd_audit_") as tmpdir:
                pdf_path = download_certification_pdf(row.get("filing_url"), sys_id, tmpdir)
                if pdf_path and os.path.exists(pdf_path):
                    pdf_downloaded = True
                    text = _extract_pdf_text(pdf_path)
                    pdf_text_length = len(text)
                    pdf_findings = analyze_pdf(pdf_path, row, use_ollama=use_ollama)
            time.sleep(1.0)  # Rate limit

        all_findings = structured + pdf_findings
        severity = _worst_severity(all_findings) if all_findings else "clean"

        result = {
            "rmd_number": rmd_num,
            "frn": frn,
            "business_name": row.get("business_name", ""),
            "total_deficiencies": len(all_findings),
            "severity": severity,
            "structured_checks": structured,
            "structured_score": len(structured),
            "pdf_checks": pdf_findings if include_pdf else None,
            "pdf_score": len(pdf_findings) if include_pdf else None,
            "pdf_downloaded": pdf_downloaded,
            "pdf_text_length": pdf_text_length,
            "audited_at": datetime.now(timezone.utc).isoformat(),
        }

        _save_result(conn, row.get("id"), result)

        stats["total"] += 1
        stats[severity] += 1

        if (i + 1) % 500 == 0:
            LOG.info("Progress: %d/%d processed", i + 1, len(rows))

    return stats


# ═══════════════════════════════════════════════════════════════════════
# CLI
# ═══════════════════════════════════════════════════════════════════════

def _format_report(result: dict) -> str:
    """Format audit result as human-readable text.

    Reads rmd_number, frn, business_name, severity, total_deficiencies,
    structured_checks, pdf_checks, pdf_downloaded and pdf_text_length.
    The PDF section always states what happened: findings listed, analysis
    skipped, download failed, or analyzed with nothing to report.
    """
    icons = {"critical": "!!!", "major": "!!", "minor": "!"}

    def render(findings: list) -> list:
        out = []
        for f in findings:
            out.append(f"  [{icons.get(f['severity'], '?')}] {f['label']}")
            out.append(f"        {f['detail']}")
        return out

    lines = [
        f"RMD Audit Report — {result['rmd_number']}",
        f"  FRN: {result['frn']}",
        f"  Business: {result['business_name']}",
        f"  Severity: {result['severity'].upper()}",
        f"  Total deficiencies: {result['total_deficiencies']}",
        "",
    ]

    if result["structured_checks"]:
        lines.append("Structured Data Checks:")
        lines.extend(render(result["structured_checks"]))
        lines.append("")

    pdf_checks = result.get("pdf_checks")
    text_len = result.get("pdf_text_length", 0)
    if pdf_checks:
        lines.append(f"PDF Content Analysis (text: {text_len} chars):")
        lines.extend(render(pdf_checks))
        lines.append("")
    elif pdf_checks is None:
        # None means PDF analysis was not requested for this run.
        lines.append("PDF: Not analyzed (skipped via --no-pdf)")
        lines.append("")
    elif not result.get("pdf_downloaded"):
        # Requested, but there was nothing to analyze.
        lines.append("PDF: Not analyzed (certification PDF could not be downloaded)")
        lines.append("")
    else:
        lines.append(f"PDF Content Analysis (text: {text_len} chars): no deficiencies found")
        lines.append("")

    if result["total_deficiencies"] == 0:
        lines.append("No deficiencies found.")

    return "\n".join(lines)


def main():
    """CLI entry point: parse arguments, run a single or batch audit, print it.

    Exits with status 1 when DATABASE_URL is unset or the requested filing
    does not exist. The DB connection is closed on every exit path.
    """
    parser = argparse.ArgumentParser(description="Audit FCC RMD filings for deficiencies")
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument("--frn", help="Audit a single FRN")
    group.add_argument("--rmd", help="Audit a single RMD number")
    group.add_argument("--batch", action="store_true", help="Batch audit all records")

    parser.add_argument("--no-pdf", action="store_true", help="Skip PDF download and analysis")
    parser.add_argument("--no-ollama", action="store_true", help="Skip Ollama LLM analysis")
    parser.add_argument("--limit", type=int, help="Limit batch to N records")
    parser.add_argument("--offset", type=int, help="Skip first N records (for parallel workers)")
    parser.add_argument("--year", type=int, help="Only audit filings from this year (e.g., 2026)")
    parser.add_argument("--no-skip-recent", action="store_true",
                        help="Re-audit even recently audited records")
    parser.add_argument("--json", action="store_true", help="Output as JSON")

    args = parser.parse_args()

    if not DATABASE_URL:
        LOG.error("DATABASE_URL not set")
        sys.exit(1)

    conn = psycopg2.connect(DATABASE_URL)
    try:
        if args.batch:
            stats = run_batch(
                conn,
                include_pdf=not args.no_pdf,
                use_ollama=not args.no_ollama,
                limit=args.limit,
                offset=args.offset,
                year=args.year,
                skip_recent=not args.no_skip_recent,
            )
            if args.json:
                print(json.dumps(stats, indent=2))
            else:
                print("\nBatch audit complete:")
                print(f"  Total:    {stats['total']}")
                print(f"  Clean:    {stats['clean']}")
                print(f"  Minor:    {stats['minor']}")
                print(f"  Major:    {stats['major']}")
                print(f"  Critical: {stats['critical']}")
        else:
            result = audit_single_filing(
                conn,
                frn=args.frn,
                rmd_number=args.rmd,
                include_pdf=not args.no_pdf,
                use_ollama=not args.no_ollama,
            )
            if not result:
                LOG.error("No RMD record found")
                sys.exit(1)

            if args.json:
                print(json.dumps(result, indent=2, default=str))
            else:
                print(_format_report(result))
    finally:
        # Runs for SystemExit as well, so the connection never leaks.
        conn.close()


if __name__ == "__main__":
    main()