Initial commit — Performance West telecom compliance platform

Includes: API (Express/TypeScript), Astro site, Python workers, document generators, FCC compliance tools, Canada CRTC formation, Ansible infrastructure, and deployment scripts. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-27 06:54:22 -05:00 · 2026-04-27 06:54:22 -05:00 · f8cd37ac8c
commit f8cd37ac8c
1823 changed files with 145167 additions and 0 deletions
--- a/scripts/document_gen/pdf_converter.py
+++ b/scripts/document_gen/pdf_converter.py
@ -0,0 +1,285 @@
+"""
+DOCX → PDF conversion.
+
+Primary:  Windows Word VM via MinIO (pixel-perfect, no open ports required).
+Fallback: LibreOffice headless (70-80% fidelity, always available in container).
+
+MinIO transport protocol
+─────────────────────────
+  PUT  docx → {bucket}/to-convert/{job_id}.docx   (this module)
+  WAIT poll  → {bucket}/converted/{job_id}.pdf      (this module)
+  GET  pdf   ← {bucket}/converted/{job_id}.pdf      (this module)
+  DEL  docx  ← {bucket}/to-convert/{job_id}.docx    (docserver_worker.py; this module on timeout)
+  DEL  pdf   ← {bucket}/converted/{job_id}.pdf      (this module, after download)
+
+The Windows VM runs docserver_worker.py which:
+  1. Polls to-convert/ every 12 seconds
+  2. Downloads the DOCX, converts via Word COM, uploads the PDF to converted/
+  3. Deletes the source DOCX from to-convert/
+
+No HTTP server, no open ports, no SSH tunnel. Only MinIO is needed.
+
+Environment variables (same MinIO creds as the workers):
+  MINIO_ENDPOINT     — MinIO host (default: minio)
+  MINIO_PORT         — MinIO port (default: 9000)
+  MINIO_ACCESS_KEY   — access key
+  MINIO_SECRET_KEY   — secret key
+  MINIO_BUCKET       — bucket name (default: performancewest)
+  MINIO_SECURE       — connect to MinIO over TLS when "true" (default: false)
+  USE_DOCSERVER      — enable Word VM path (default: true)
+  DOCSERVER_TIMEOUT  — max seconds to wait for Word to produce the PDF (default: 120)
+"""
+
+from __future__ import annotations
+
+import io
+import logging
+import os
+import subprocess
+import time
+import uuid
+from pathlib import Path
+
+LOG = logging.getLogger("document_gen.pdf")
+
+# MinIO connection parameters, read once at import time from the environment
+# shared with the workers container.
+_MINIO_ENDPOINT = os.environ.get("MINIO_ENDPOINT", "minio")
+_MINIO_PORT = int(os.environ.get("MINIO_PORT", "9000"))
+_MINIO_ACCESS = os.environ.get("MINIO_ACCESS_KEY", "")
+_MINIO_SECRET = os.environ.get("MINIO_SECRET_KEY", "")
+_MINIO_BUCKET = os.environ.get("MINIO_BUCKET", "performancewest")
+_MINIO_SECURE = os.environ.get("MINIO_SECURE", "false").lower() == "true"
+
+# Word VM path: on/off switch and timings (both in seconds).
+USE_DOCSERVER = os.environ.get("USE_DOCSERVER", "true").lower() == "true"
+DOCSERVER_TIMEOUT = int(os.environ.get("DOCSERVER_TIMEOUT", "120"))
+_POLL_INTERVAL = 12  # pause between checks for the converted PDF
+
+# Object-key prefixes inside the bucket.
+_PREFIX_IN = "to-convert"   # DOCX uploads awaiting conversion
+_PREFIX_OUT = "converted"   # finished PDFs awaiting download
+
+
+def _minio_client():
+    """Build a MinIO client from the module-level connection settings.
+
+    The ``minio`` package is imported at call time rather than at module
+    import time.
+    """
+    import minio  # type: ignore
+
+    endpoint = f"{_MINIO_ENDPOINT}:{_MINIO_PORT}"
+    return minio.Minio(
+        endpoint,
+        access_key=_MINIO_ACCESS,
+        secret_key=_MINIO_SECRET,
+        secure=_MINIO_SECURE,
+    )
+
+
+# ── Public API ────────────────────────────────────────────────────────────────
+
+def convert_to_pdf(docx_path: str | Path, output_dir: str | Path | None = None) -> Path:
+    """Convert one DOCX file to PDF and return the PDF's path.
+
+    The Word VM (via MinIO) is attempted first when it is enabled and an
+    access key is configured; any exception from that path is logged as a
+    warning and the conversion is retried with LibreOffice headless.
+
+    Args:
+        docx_path:  Location of the .docx file on disk.
+        output_dir: Directory for the PDF; created if missing. Defaults to
+                    the directory containing the DOCX.
+
+    Returns:
+        Path to the generated PDF (same base name as the DOCX).
+
+    Raises:
+        FileNotFoundError: if ``docx_path`` does not exist.
+    """
+    source = Path(docx_path)
+    if not source.exists():
+        raise FileNotFoundError(f"DOCX not found: {source}")
+
+    target_dir = source.parent if not output_dir else Path(output_dir)
+    target_dir.mkdir(parents=True, exist_ok=True)
+    target_pdf = target_dir / source.with_suffix(".pdf").name
+
+    # Word VM disabled or no credentials: LibreOffice is the only backend.
+    if not (USE_DOCSERVER and _MINIO_ACCESS):
+        return _convert_via_libreoffice(source, target_pdf, target_dir)
+
+    try:
+        return _convert_via_minio(source, target_pdf)
+    except Exception as exc:
+        LOG.warning(
+            "Word VM via MinIO unavailable (%s) — falling back to LibreOffice", exc
+        )
+
+    return _convert_via_libreoffice(source, target_pdf, target_dir)
+
+
+def convert_batch(docx_paths: list[str | Path], output_dir: str | Path) -> list[Path]:
+    """Convert multiple DOCX files to PDFs.
+
+    When the Word VM path is enabled, all jobs are handed to
+    ``_batch_via_minio`` (which falls back per-file to LibreOffice). If that
+    batch call itself raises, every file is converted with LibreOffice
+    directly — the Word VM is NOT retried per file, so a broken VM does not
+    cost one full timeout per document.
+
+    When the Word VM path is disabled, each file goes through
+    ``convert_to_pdf``.
+
+    Args:
+        docx_paths: DOCX files to convert.
+        output_dir: Directory for the PDFs; created if missing.
+
+    Returns:
+        Paths of the PDFs that were produced. Files that failed to convert
+        are logged and omitted, so the list may be shorter than the input.
+    """
+    docx_paths = [Path(p) for p in docx_paths]
+    output_dir = Path(output_dir)
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    vm_failed = False
+    if USE_DOCSERVER and _MINIO_ACCESS and docx_paths:
+        try:
+            return _batch_via_minio(docx_paths, output_dir)
+        except Exception as exc:
+            vm_failed = True
+            LOG.warning("Batch via Word VM failed (%s) — converting one by one via LibreOffice", exc)
+
+    results = []
+    for docx_path in docx_paths:
+        try:
+            if vm_failed:
+                # The VM path just failed for the whole batch; skip it and do
+                # what the warning above promises.
+                pdf_path = output_dir / docx_path.with_suffix(".pdf").name
+                results.append(_convert_via_libreoffice(docx_path, pdf_path, output_dir))
+            else:
+                results.append(convert_to_pdf(docx_path, output_dir))
+        except Exception as exc:
+            LOG.error("Failed to convert %s: %s", docx_path.name, exc)
+    return results
+
+
+def health_check() -> dict:
+    """Return status of both conversion backends.
+
+    Returns:
+        A dict with these keys:
+          libreoffice        — True when ``libreoffice --version`` exits 0
+          libreoffice_error  — present only when the probe could not be run
+          docserver_minio    — True when MinIO answered a bucket query
+          minio_bucket       — bucket name, present only on MinIO success
+          minio_error        — present only when the MinIO query raised
+
+    Note:
+        ``docserver_minio`` only proves that the MinIO transport is
+        reachable. No worker heartbeat is inspected, so it does not show
+        that docserver_worker.py is actually running.
+    """
+    status: dict = {"libreoffice": False, "docserver_minio": False}
+
+    # LibreOffice
+    try:
+        r = subprocess.run(
+            ["libreoffice", "--version"],
+            capture_output=True, text=True, timeout=10,
+        )
+        status["libreoffice"] = r.returncode == 0
+    except (OSError, subprocess.SubprocessError) as exc:
+        # Binary missing / not executable (OSError) or probe timed out.
+        status["libreoffice_error"] = str(exc)
+
+    # Word VM transport — connectivity check against MinIO only. The return
+    # value of bucket_exists() is deliberately ignored: a missing bucket is
+    # created on first conversion, so only an exception counts as a failure.
+    if USE_DOCSERVER and _MINIO_ACCESS:
+        try:
+            mc = _minio_client()
+            mc.bucket_exists(_MINIO_BUCKET)
+            status["docserver_minio"] = True
+            status["minio_bucket"]    = _MINIO_BUCKET
+        except Exception as exc:
+            status["minio_error"] = str(exc)
+
+    return status
+
+
+# ── MinIO transport ───────────────────────────────────────────────────────────
+
+def _remove_quietly(mc, key: str) -> None:
+    """Best-effort delete of *key* from the conversion bucket; failures are only logged."""
+    try:
+        mc.remove_object(_MINIO_BUCKET, key)
+    except Exception as exc:  # cleanup must never mask the real outcome
+        LOG.debug("Could not remove minio://%s/%s: %s", _MINIO_BUCKET, key, exc)
+
+
+def _convert_via_minio(docx_path: Path, pdf_path: Path) -> Path:
+    """Upload DOCX to MinIO, wait for the Word VM to convert it, download PDF.
+
+    Atomic upload: the DOCX is first uploaded to a .tmp key, then renamed
+    (copy + delete) to the final key. This prevents the Windows worker from
+    downloading a partially-uploaded file. The staging object is removed
+    even when the copy fails.
+
+    Only a "NoSuchKey" answer from MinIO is treated as "PDF not ready yet".
+    Any other failure (authentication, network, download) is raised
+    immediately instead of being retried until the timeout.
+
+    Args:
+        docx_path: DOCX file to upload.
+        pdf_path:  Local path the converted PDF is written to.
+
+    Returns:
+        ``pdf_path`` once the PDF has been downloaded.
+
+    Raises:
+        TimeoutError: no PDF appeared within DOCSERVER_TIMEOUT seconds.
+        minio.error.S3Error: MinIO reported an error other than "NoSuchKey".
+    """
+    from minio.commonconfig import CopySource  # type: ignore
+    from minio.error import S3Error  # type: ignore
+
+    job_id   = uuid.uuid4().hex
+    tag      = job_id[:8]
+    tmp_key  = f"{_PREFIX_IN}/.tmp_{job_id}.docx"
+    in_key   = f"{_PREFIX_IN}/{job_id}.docx"
+    out_key  = f"{_PREFIX_OUT}/{job_id}.pdf"
+
+    mc = _minio_client()
+
+    # Ensure bucket exists
+    if not mc.bucket_exists(_MINIO_BUCKET):
+        mc.make_bucket(_MINIO_BUCKET)
+
+    # Upload DOCX to temp key first (invisible to worker — it ignores .tmp_ prefix)
+    LOG.info("[%s] Uploading %s → minio://%s/%s (staging)", tag, docx_path.name, _MINIO_BUCKET, tmp_key)
+    mc.fput_object(
+        _MINIO_BUCKET, tmp_key, str(docx_path),
+        content_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+        metadata={"x-amz-meta-source": docx_path.name},
+    )
+
+    # Rename: server-side copy tmp → final, then drop the staging object.
+    # The finally clause keeps a failed copy from leaking the staging key.
+    try:
+        mc.copy_object(
+            _MINIO_BUCKET, in_key,
+            CopySource(_MINIO_BUCKET, tmp_key),
+        )
+    finally:
+        _remove_quietly(mc, tmp_key)
+    LOG.info("[%s] Staged → minio://%s/%s (live)", tag, _MINIO_BUCKET, in_key)
+
+    # Poll for the converted PDF
+    deadline = time.monotonic() + DOCSERVER_TIMEOUT
+    LOG.info("[%s] Waiting for Word VM to convert (timeout=%ds)...", tag, DOCSERVER_TIMEOUT)
+
+    try:
+        while True:
+            try:
+                mc.stat_object(_MINIO_BUCKET, out_key)
+            except S3Error as exc:
+                if exc.code != "NoSuchKey":
+                    raise  # real failure, not "object not there yet"
+            else:
+                LOG.info("[%s] PDF ready — downloading", tag)
+                mc.fget_object(_MINIO_BUCKET, out_key, str(pdf_path))
+                # Clean up the converted output from MinIO
+                _remove_quietly(mc, out_key)
+                LOG.info("[%s] PDF written: %s (%d bytes)", tag, pdf_path.name, pdf_path.stat().st_size)
+                return pdf_path
+
+            remaining = deadline - time.monotonic()
+            if remaining <= 0:
+                break
+            # Never sleep past the deadline, so the last check happens at it.
+            time.sleep(min(_POLL_INTERVAL, remaining))
+    except Exception:
+        # Do not leave the request queued for the worker if we are giving up.
+        _remove_quietly(mc, in_key)
+        raise
+
+    # Timed out — clean up the orphaned DOCX and raise
+    _remove_quietly(mc, in_key)
+    raise TimeoutError(
+        f"Word VM did not convert {docx_path.name} within {DOCSERVER_TIMEOUT}s. "
+        f"Is docserver_worker.py running and connected to MinIO?"
+    )
+
+
+def _batch_via_minio(docx_paths: list[Path], output_dir: Path) -> list[Path]:
+    """Submit all DOCX files in parallel, collect results.
+
+    One daemon thread is started per file. Each thread tries the Word VM
+    first and falls back to LibreOffice for its own file if that fails.
+
+    All threads share a single deadline sized for the worst case of one
+    worker: the full MinIO wait (plus one poll interval of slack) followed
+    by the LibreOffice fallback. Only threads that have finished by then
+    contribute to the result; stragglers are logged.
+
+    Args:
+        docx_paths: DOCX files to convert.
+        output_dir: Existing directory the PDFs are written to.
+
+    Returns:
+        Paths of the PDFs produced, in input order, with failed or
+        unfinished items omitted.
+    """
+    import threading
+
+    if not docx_paths:
+        return []
+
+    results: list[Path | None] = [None] * len(docx_paths)
+
+    def _convert_one(idx: int, docx_path: Path) -> None:
+        pdf_path = output_dir / docx_path.with_suffix(".pdf").name
+        try:
+            results[idx] = _convert_via_minio(docx_path, pdf_path)
+        except Exception as exc:
+            LOG.error("Batch item %d (%s) failed: %s", idx, docx_path.name, exc)
+            # Fallback per-file
+            try:
+                results[idx] = _convert_via_libreoffice(docx_path, pdf_path, output_dir)
+            except Exception as lo_exc:
+                LOG.error("LibreOffice fallback also failed for %s: %s", docx_path.name, lo_exc)
+
+    threads = [
+        threading.Thread(target=_convert_one, args=(i, p), daemon=True)
+        for i, p in enumerate(docx_paths)
+    ]
+    for t in threads:
+        t.start()
+
+    # 120 s mirrors the subprocess timeout used by the LibreOffice fallback;
+    # the final 10 s is headroom for upload/download and thread scheduling.
+    deadline = time.monotonic() + DOCSERVER_TIMEOUT + _POLL_INTERVAL + 120 + 10
+    for t in threads:
+        t.join(timeout=max(0.0, deadline - time.monotonic()))
+
+    # Snapshot completion first, so a thread finishing after this point is
+    # reported consistently as unfinished.
+    finished = [not t.is_alive() for t in threads]
+    for idx, done in enumerate(finished):
+        if not done:
+            LOG.error(
+                "Batch item %d (%s) still running at deadline — omitted from results",
+                idx, docx_paths[idx].name,
+            )
+
+    return [r for r, done in zip(results, finished) if done and r is not None]
+
+
+# ── LibreOffice fallback ──────────────────────────────────────────────────────
+
+def _convert_via_libreoffice(docx_path: Path, pdf_path: Path, out_dir: Path) -> Path:
+    """Convert DOCX to PDF using LibreOffice headless (fallback).
+
+    Each invocation gets its own throw-away LibreOffice user profile, because
+    the batch path may run several fallbacks at the same time and instances
+    sharing one profile interfere with each other.
+
+    Args:
+        docx_path: DOCX file to convert.
+        pdf_path:  Where the PDF is expected to appear. It must be
+                   ``out_dir / <docx stem>.pdf``, which is where LibreOffice
+                   writes it.
+        out_dir:   Directory passed to LibreOffice as ``--outdir``.
+
+    Returns:
+        ``pdf_path`` once the file exists.
+
+    Raises:
+        RuntimeError: LibreOffice exited non-zero or produced no PDF.
+        subprocess.TimeoutExpired: conversion exceeded 120 seconds.
+    """
+    import shutil
+    import tempfile
+
+    LOG.info("Converting %s via LibreOffice headless...", docx_path.name)
+
+    # A PDF left over from an earlier run would satisfy the existence check
+    # below even if this run silently produced nothing.
+    try:
+        pdf_path.unlink()
+    except FileNotFoundError:
+        pass
+
+    profile_dir = Path(tempfile.mkdtemp(prefix="lo_profile_"))
+    cmd = [
+        "libreoffice", "--headless",
+        f"-env:UserInstallation={profile_dir.as_uri()}",
+        "--convert-to", "pdf",
+        "--outdir", str(out_dir),
+        str(docx_path),
+    ]
+    try:
+        result = subprocess.run(cmd, capture_output=True, text=True, timeout=120)
+    finally:
+        shutil.rmtree(profile_dir, ignore_errors=True)
+
+    if result.returncode != 0:
+        LOG.error("LibreOffice conversion failed: %s", result.stderr)
+        raise RuntimeError(f"LibreOffice failed: {result.stderr[:300]}")
+
+    if not pdf_path.exists():
+        raise RuntimeError(f"PDF not found at expected path after LibreOffice: {pdf_path}")
+
+    LOG.info("PDF created via LibreOffice: %s (%d bytes)", pdf_path.name, pdf_path.stat().st_size)
+    return pdf_path