Includes: API (Express/TypeScript), Astro site, Python workers, document generators, FCC compliance tools, Canada CRTC formation, Ansible infrastructure, and deployment scripts. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
46 lines
1.8 KiB
Python
46 lines
1.8 KiB
Python
"""Carrier invoice PDF adapter (v2 stub).
|
|
|
|
Real-world carrier invoices frequently arrive only as PDF — either as
|
|
structured text or scanned images. Extracting line-item revenue from
|
|
arbitrary carrier PDFs (CenturyLink, Verizon, AT&T, Lumen, Frontier,
|
|
Windstream, etc.) requires an OCR+layout-aware pipeline that is out of
|
|
scope for the v1 ICC ingester.
|
|
|
|
This class is a **registered stub** so that the upload endpoint can
|
|
accept the ``carrier_invoice_pdf`` source format and persist the blob
|
|
into MinIO. When the ingester dispatches to this adapter, iterating
``iter_rows`` raises ``NotImplementedError`` (the call itself only builds
a generator); the ingester catches that in its outer
|
|
try/except and marks the upload ``status='failed'`` with an explanatory
|
|
error message, letting customers see "awaiting v2 PDF parser" in the
|
|
portal without losing the file.
|
|
|
|
v2 plan
|
|
-------
|
|
* ``pdfplumber`` for text extraction + table reconstruction
|
|
* Vendor-specific anchor templates (e.g. "Total Switched Access Charges")
|
|
* Fall back to Anthropic Claude vision on scanned-image pages
|
|
* OCR output manually QA'd by the Accounting-Advisor role before
|
|
``rows_accepted`` is trusted
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import logging
|
|
from typing import Iterator
|
|
|
|
from .common import BaseICCAdapter, IccRevenueLine
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class CarrierInvoicePDFAdapter(BaseICCAdapter):
    """Placeholder adapter for the ``carrier_invoice_pdf`` source format.

    No PDF parsing is implemented here; iterating :meth:`iter_rows` always
    fails with ``NotImplementedError``.
    """

    SOURCE_FORMAT = "carrier_invoice_pdf"

    def iter_rows(self, local_path: str) -> Iterator[IccRevenueLine]:
        """Fail with ``NotImplementedError`` instead of producing rows.

        The trailing ``yield`` makes this a generator function, so calling
        it only builds a generator object; the exception is raised when
        that generator is first advanced.

        Args:
            local_path: Not read by this stub (presumably the local path of
                the uploaded PDF -- confirm against the caller).

        Raises:
            NotImplementedError: Always, on the first iteration step.
        """
        reason = (
            "carrier_invoice_pdf parsing is deferred to v2 — upload accepted "
            "and stored, but no line items have been extracted. See the "
            "module docstring for the planned OCR pipeline."
        )
        raise NotImplementedError(reason)
        # Never executed; its presence alone turns this into a generator
        # function.
        yield  # pragma: no cover
|