Initial commit — Performance West telecom compliance platform
Includes: API (Express/TypeScript), Astro site, Python workers, document generators, FCC compliance tools, Canada CRTC formation, Ansible infrastructure, and deployment scripts. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
commit
f8cd37ac8c
1823 changed files with 145167 additions and 0 deletions
285
scripts/document_gen/pdf_converter.py
Normal file
285
scripts/document_gen/pdf_converter.py
Normal file
|
|
@ -0,0 +1,285 @@
|
|||
"""
|
||||
DOCX → PDF conversion.
|
||||
|
||||
Primary: Windows Word VM via MinIO (pixel-perfect, no open ports required).
|
||||
Fallback: LibreOffice headless (70-80% fidelity, always available in container).
|
||||
|
||||
MinIO transport protocol
|
||||
─────────────────────────
|
||||
PUT docx → {bucket}/to-convert/{job_id}.docx (this module)
|
||||
WAIT poll → {bucket}/converted/{job_id}.pdf (this module)
|
||||
GET pdf ← {bucket}/converted/{job_id}.pdf (this module)
|
||||
DEL docx ← {bucket}/to-convert/{job_id}.docx (docserver_worker.py)
|
||||
DEL pdf ← {bucket}/converted/{job_id}.pdf (this module, after download)
|
||||
|
||||
The Windows VM runs docserver_worker.py which:
|
||||
1. Polls to-convert/ every 12 seconds
|
||||
2. Downloads the DOCX, converts via Word COM, uploads the PDF to converted/
|
||||
3. Deletes the source DOCX from to-convert/
|
||||
|
||||
No HTTP server, no open ports, no SSH tunnel. Only MinIO is needed.
|
||||
|
||||
Environment variables (same MinIO creds as the workers):
|
||||
MINIO_ENDPOINT — MinIO host (default: minio)
|
||||
MINIO_PORT — MinIO port (default: 9000)
|
||||
MINIO_ACCESS_KEY — access key
|
||||
MINIO_SECRET_KEY — secret key
|
||||
MINIO_BUCKET — bucket name (default: performancewest)
MINIO_SECURE — connect to MinIO over TLS when "true" (default: false)
|
||||
USE_DOCSERVER — enable Word VM path (default: true)
|
||||
DOCSERVER_TIMEOUT — max seconds to wait for Word to produce the PDF (default: 120)
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import io
|
||||
import logging
|
||||
import os
|
||||
import subprocess
|
||||
import time
|
||||
import uuid
|
||||
from pathlib import Path
|
||||
|
||||
# Module-level logger for the PDF conversion helpers.
LOG = logging.getLogger("document_gen.pdf")

# ── MinIO connection (read once at import time from the environment) ──────────
_MINIO_ENDPOINT = os.environ.get("MINIO_ENDPOINT", "minio")
_MINIO_PORT = int(os.environ.get("MINIO_PORT", "9000"))
_MINIO_ACCESS = os.environ.get("MINIO_ACCESS_KEY", "")
_MINIO_SECRET = os.environ.get("MINIO_SECRET_KEY", "")
_MINIO_BUCKET = os.environ.get("MINIO_BUCKET", "performancewest")
# Only the literal string "true" (any letter case) switches TLS on.
_MINIO_SECURE = "true" == os.environ.get("MINIO_SECURE", "false").lower()

# ── Word VM ("docserver") behaviour ───────────────────────────────────────────
# Only the literal string "true" (any letter case) enables the Word VM path.
USE_DOCSERVER = "true" == os.environ.get("USE_DOCSERVER", "true").lower()
# Upper bound, in seconds, on the wait for a converted PDF.
DOCSERVER_TIMEOUT = int(os.environ.get("DOCSERVER_TIMEOUT", "120"))
# Seconds to sleep between checks for the converted PDF.
_POLL_INTERVAL = 12

# ── Object key prefixes inside the bucket ─────────────────────────────────────
_PREFIX_IN = "to-convert"  # DOCX files queued for conversion
_PREFIX_OUT = "converted"  # PDFs waiting to be downloaded
|
||||
|
||||
|
||||
def _minio_client():
    """Build a MinIO client from the module-level connection settings.

    The ``minio`` package is imported lazily, so importing this module does
    not require it to be installed.
    """
    from minio import Minio  # type: ignore

    endpoint = f"{_MINIO_ENDPOINT}:{_MINIO_PORT}"
    credentials = {"access_key": _MINIO_ACCESS, "secret_key": _MINIO_SECRET}
    return Minio(endpoint, secure=_MINIO_SECURE, **credentials)
|
||||
|
||||
|
||||
# ── Public API ────────────────────────────────────────────────────────────────
|
||||
|
||||
def convert_to_pdf(docx_path: str | Path, output_dir: str | Path | None = None) -> Path:
    """Render a single DOCX file as a PDF.

    The Word VM (reached through MinIO) is attempted first when it is enabled
    and an access key is configured; any failure on that path is logged and
    the conversion is retried with LibreOffice headless.

    Args:
        docx_path: Location of the .docx file on disk.
        output_dir: Directory that receives the PDF. When omitted (or falsy)
            the PDF is written next to the DOCX.

    Returns:
        Path of the PDF that was produced.

    Raises:
        FileNotFoundError: If ``docx_path`` does not exist.
    """
    source = Path(docx_path)
    if not source.exists():
        raise FileNotFoundError(f"DOCX not found: {source}")

    if output_dir:
        target_dir = Path(output_dir)
    else:
        target_dir = source.parent
    target_dir.mkdir(parents=True, exist_ok=True)
    target_pdf = target_dir / source.with_suffix(".pdf").name

    # Guard clause: without the Word VM path there is nothing to try first.
    if not (USE_DOCSERVER and _MINIO_ACCESS):
        return _convert_via_libreoffice(source, target_pdf, target_dir)

    try:
        return _convert_via_minio(source, target_pdf)
    except Exception as exc:
        LOG.warning(
            "Word VM via MinIO unavailable (%s) — falling back to LibreOffice", exc
        )

    return _convert_via_libreoffice(source, target_pdf, target_dir)
|
||||
|
||||
|
||||
def convert_batch(docx_paths: list[str | Path], output_dir: str | Path) -> list[Path]:
    """Render several DOCX files as PDFs inside ``output_dir``.

    When the Word VM path is enabled and there is at least one input, the
    whole batch is handed to ``_batch_via_minio``. If that call raises, or the
    Word VM path is not enabled, every file is passed to ``convert_to_pdf``
    in turn.

    Args:
        docx_paths: Input .docx files.
        output_dir: Directory that receives the PDFs (created if missing).

    Returns:
        Paths of the PDFs that were produced. Inputs whose conversion failed
        are logged and left out of the list.
    """
    sources = [Path(item) for item in docx_paths]
    dest = Path(output_dir)
    dest.mkdir(parents=True, exist_ok=True)

    if USE_DOCSERVER and _MINIO_ACCESS and sources:
        try:
            return _batch_via_minio(sources, dest)
        except Exception as exc:
            LOG.warning("Batch via Word VM failed (%s) — converting one by one via LibreOffice", exc)

    converted = []
    for source in sources:
        try:
            pdf = convert_to_pdf(source, dest)
        except Exception as exc:
            LOG.error("Failed to convert %s: %s", source.name, exc)
        else:
            converted.append(pdf)
    return converted
|
||||
|
||||
|
||||
def health_check() -> dict:
    """Report whether each conversion backend looks usable.

    This function is intended never to raise: probe failures are captured in
    the returned dict instead.

    Returns:
        A dict that always contains the booleans ``libreoffice`` and
        ``docserver_minio``. Additional keys:

        * ``minio_bucket`` — configured bucket name, set when the MinIO probe
          succeeded.
        * ``minio_error`` — text of the exception raised by the MinIO probe.
        * ``libreoffice_error`` — text of the exception raised while running
          ``libreoffice --version`` (binary missing, timeout, ...).
    """
    status: dict = {"libreoffice": False, "docserver_minio": False}

    # LibreOffice: the binary must start and exit cleanly.
    try:
        probe = subprocess.run(
            ["libreoffice", "--version"],
            capture_output=True, text=True, timeout=10,
        )
        status["libreoffice"] = probe.returncode == 0
    except Exception as exc:
        # Record the reason instead of hiding it, mirroring ``minio_error``.
        status["libreoffice_error"] = str(exc)

    # Word VM: only connectivity to MinIO is probed here. The boolean result
    # of bucket_exists() is deliberately ignored (a missing bucket is created
    # on demand by _convert_via_minio). Whether docserver_worker.py is
    # actually running on the VM is NOT verified by this check.
    if USE_DOCSERVER and _MINIO_ACCESS:
        try:
            mc = _minio_client()
            mc.bucket_exists(_MINIO_BUCKET)
            status["docserver_minio"] = True
            status["minio_bucket"] = _MINIO_BUCKET
        except Exception as exc:
            status["minio_error"] = str(exc)

    return status
|
||||
|
||||
|
||||
# ── MinIO transport ───────────────────────────────────────────────────────────
|
||||
|
||||
def _remove_quietly(mc, key: str) -> None:
    """Best-effort delete of ``key`` from the bucket; failures are only logged."""
    try:
        mc.remove_object(_MINIO_BUCKET, key)
    except Exception as exc:  # cleanup must never mask the real outcome
        LOG.debug("Could not remove minio://%s/%s: %s", _MINIO_BUCKET, key, exc)


def _convert_via_minio(docx_path: Path, pdf_path: Path) -> Path:
    """Upload a DOCX to MinIO, wait for the Word VM to convert it, download the PDF.

    Staged upload: the DOCX is first uploaded under a ``.tmp_`` key and then
    copied server-side to the final key, so the Windows worker (which is
    expected to ignore ``.tmp_`` keys) never picks up a partially uploaded
    file. The staging object is removed whether or not the copy succeeds.

    Polling: only a "NoSuchKey" response counts as "not converted yet". Any
    other failure (credentials, network, download, local disk) is raised
    immediately so the caller can fall back without waiting out the timeout.
    On every failure path the queued DOCX is removed on a best-effort basis so
    the worker does not convert a job nobody is waiting for.

    Args:
        docx_path: Existing .docx file to convert.
        pdf_path: Destination path for the downloaded PDF.

    Returns:
        ``pdf_path`` once the PDF has been written.

    Raises:
        TimeoutError: If no PDF appeared within ``DOCSERVER_TIMEOUT`` seconds.
        Exception: Any error raised by the MinIO client other than the
            object not existing yet.
    """
    from minio.commonconfig import CopySource  # type: ignore
    from minio.error import S3Error  # type: ignore

    job_id = uuid.uuid4().hex
    tag = job_id[:8]  # short id used in log lines
    tmp_key = f"{_PREFIX_IN}/.tmp_{job_id}.docx"
    in_key = f"{_PREFIX_IN}/{job_id}.docx"
    out_key = f"{_PREFIX_OUT}/{job_id}.pdf"

    mc = _minio_client()

    # Ensure bucket exists
    if not mc.bucket_exists(_MINIO_BUCKET):
        mc.make_bucket(_MINIO_BUCKET)

    # Upload under the staging key first, then publish with a server-side copy.
    LOG.info("[%s] Uploading %s → minio://%s/%s (staging)", tag, docx_path.name, _MINIO_BUCKET, tmp_key)
    mc.fput_object(
        _MINIO_BUCKET, tmp_key, str(docx_path),
        content_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        metadata={"x-amz-meta-source": docx_path.name},
    )
    try:
        mc.copy_object(
            _MINIO_BUCKET, in_key,
            CopySource(_MINIO_BUCKET, tmp_key),
        )
    finally:
        # Never leave the staging object behind, even when the copy failed.
        _remove_quietly(mc, tmp_key)
    LOG.info("[%s] Staged → minio://%s/%s (live)", tag, _MINIO_BUCKET, in_key)

    # Poll for the converted PDF
    deadline = time.monotonic() + DOCSERVER_TIMEOUT
    LOG.info("[%s] Waiting for Word VM to convert (timeout=%ds)...", tag, DOCSERVER_TIMEOUT)

    try:
        while True:
            try:
                mc.stat_object(_MINIO_BUCKET, out_key)
            except S3Error as exc:
                if exc.code != "NoSuchKey":
                    raise  # a real error, not "object not there yet"
            else:
                LOG.info("[%s] PDF ready — downloading", tag)
                mc.fget_object(_MINIO_BUCKET, out_key, str(pdf_path))
                _remove_quietly(mc, out_key)
                LOG.info("[%s] PDF written: %s (%d bytes)", tag, pdf_path.name, pdf_path.stat().st_size)
                return pdf_path

            remaining = deadline - time.monotonic()
            if remaining <= 0:
                break
            # Do not sleep past the deadline; the loop then checks one last time.
            time.sleep(min(_POLL_INTERVAL, remaining))
    except Exception:
        _remove_quietly(mc, in_key)
        raise

    # Timed out — clean up the orphaned DOCX and raise
    _remove_quietly(mc, in_key)
    raise TimeoutError(
        f"Word VM did not convert {docx_path.name} within {DOCSERVER_TIMEOUT}s. "
        f"Is docserver_worker.py running and connected to MinIO?"
    )
|
||||
|
||||
|
||||
def _batch_via_minio(docx_paths: list[Path], output_dir: Path) -> list[Path]:
    """Submit every DOCX to the Word VM in parallel, then collect the PDFs.

    Phase 1 starts one daemon thread per file; each thread runs only the
    MinIO/Word VM conversion. All threads share a single deadline.

    Phase 2 runs in the calling thread: every file that has no result yet
    (the Word VM failed, or its thread is still busy at the deadline) is
    converted with LibreOffice, one file at a time. Running the fallback here
    rather than inside the worker threads means a slow fallback can no longer
    outlive the join timeout and be dropped, and LibreOffice is never started
    concurrently by this function.

    Args:
        docx_paths: Input .docx files.
        output_dir: Existing directory that receives the PDFs.

    Returns:
        Paths of the PDFs that were produced, in input order. Files for which
        both backends failed are logged and omitted.
    """
    import threading

    pdf_paths = [output_dir / src.with_suffix(".pdf").name for src in docx_paths]
    results: list[Path | None] = [None] * len(docx_paths)

    def _convert_one(idx: int, docx_path: Path, pdf_path: Path) -> None:
        try:
            results[idx] = _convert_via_minio(docx_path, pdf_path)
        except Exception as exc:
            LOG.error("Batch item %d (%s) failed: %s", idx, docx_path.name, exc)

    threads = [
        threading.Thread(target=_convert_one, args=(i, src, pdf), daemon=True)
        for i, (src, pdf) in enumerate(zip(docx_paths, pdf_paths))
    ]
    for t in threads:
        t.start()

    # One shared deadline: the polling timeout, plus one poll interval in case
    # the last sleep overshoots, plus slack for upload/download.
    deadline = time.monotonic() + DOCSERVER_TIMEOUT + _POLL_INTERVAL + 10
    for t in threads:
        t.join(timeout=max(0.0, deadline - time.monotonic()))

    converted: list[Path] = []
    for idx, (src, pdf, thread) in enumerate(zip(docx_paths, pdf_paths, threads)):
        done = results[idx]  # read once; a late thread may still set it afterwards
        if done is None:
            if thread.is_alive():
                LOG.warning(
                    "Batch item %d (%s) still waiting on Word VM — using LibreOffice",
                    idx, src.name,
                )
            # Fallback per-file
            try:
                done = _convert_via_libreoffice(src, pdf, output_dir)
            except Exception as lo_exc:
                LOG.error("LibreOffice fallback also failed for %s: %s", src.name, lo_exc)
                continue
        converted.append(done)

    return converted
|
||||
|
||||
|
||||
# ── LibreOffice fallback ──────────────────────────────────────────────────────
|
||||
|
||||
def _convert_via_libreoffice(docx_path: Path, pdf_path: Path, out_dir: Path) -> Path:
    """Convert DOCX to PDF using LibreOffice headless (fallback).

    LibreOffice renders into a private scratch directory and the result is
    then moved to ``pdf_path``. This has two purposes:

    * A PDF left at ``pdf_path`` by an earlier run can no longer be mistaken
      for the output of this run when LibreOffice exits 0 without writing
      anything.
    * Each invocation gets its own user profile (``-env:UserInstallation``),
      so conversions running at the same time do not share profile state.

    Args:
        docx_path: Existing .docx file to convert.
        pdf_path: Final location of the PDF.
        out_dir: Output directory requested by the caller; created if missing.

    Returns:
        ``pdf_path`` once the PDF is in place.

    Raises:
        RuntimeError: If LibreOffice exits non-zero or produces no PDF.
        subprocess.TimeoutExpired: If the conversion exceeds 120 seconds.
    """
    import shutil
    import tempfile

    LOG.info("Converting %s via LibreOffice headless...", docx_path.name)

    scratch = Path(tempfile.mkdtemp(prefix="lo_convert_")).resolve()
    try:
        render_dir = scratch / "out"
        render_dir.mkdir()
        profile_uri = (scratch / "profile").as_uri()

        cmd = [
            "libreoffice", "--headless",
            f"-env:UserInstallation={profile_uri}",
            "--convert-to", "pdf",
            "--outdir", str(render_dir),
            str(docx_path),
        ]
        result = subprocess.run(cmd, capture_output=True, text=True, timeout=120)

        if result.returncode != 0:
            LOG.error("LibreOffice conversion failed: %s", result.stderr)
            raise RuntimeError(f"LibreOffice failed: {result.stderr[:300]}")

        # The scratch output dir started empty, so any PDF in it is from this run.
        produced = next(render_dir.glob("*.pdf"), None)
        if produced is None:
            raise RuntimeError(f"PDF not found at expected path after LibreOffice: {pdf_path}")

        out_dir.mkdir(parents=True, exist_ok=True)
        pdf_path.parent.mkdir(parents=True, exist_ok=True)
        shutil.move(str(produced), str(pdf_path))
    finally:
        # Best-effort cleanup; a lingering LibreOffice process must not turn a
        # finished conversion into a failure.
        shutil.rmtree(scratch, ignore_errors=True)

    LOG.info("PDF created via LibreOffice: %s (%d bytes)", pdf_path.name, pdf_path.stat().st_size)
    return pdf_path
|
||||
Loading…
Add table
Add a link
Reference in a new issue