new-site/scripts/document_gen/pdf_converter.py
justin f8cd37ac8c Initial commit — Performance West telecom compliance platform
Includes: API (Express/TypeScript), Astro site, Python workers,
document generators, FCC compliance tools, Canada CRTC formation,
Ansible infrastructure, and deployment scripts.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-27 06:54:22 -05:00

285 lines
11 KiB
Python

"""
DOCX → PDF conversion.
Primary: Windows Word VM via MinIO (pixel-perfect, no open ports required).
Fallback: LibreOffice headless (70-80% fidelity, always available in container).
MinIO transport protocol
─────────────────────────
PUT docx → {bucket}/to-convert/{job_id}.docx (this module)
WAIT poll → {bucket}/converted/{job_id}.pdf (this module)
GET pdf ← {bucket}/converted/{job_id}.pdf (this module)
DEL docx ← {bucket}/to-convert/{job_id}.docx (docserver_worker.py)
DEL pdf ← {bucket}/converted/{job_id}.pdf (this module, after download)
The Windows VM runs docserver_worker.py which:
1. Polls to-convert/ every 12 seconds
2. Downloads the DOCX, converts via Word COM, uploads the PDF to converted/
3. Deletes the source DOCX from to-convert/
No HTTP server, no open ports, no SSH tunnel. Only MinIO is needed.
Environment variables (same MinIO creds as the workers):
MINIO_ENDPOINT — MinIO host (default: minio)
MINIO_PORT — MinIO port (default: 9000)
MINIO_ACCESS_KEY — access key
MINIO_SECRET_KEY — secret key
MINIO_BUCKET — bucket name (default: performancewest)
MINIO_SECURE — "true" to pass secure=True to the MinIO client (default: false)
USE_DOCSERVER — enable Word VM path (default: true)
DOCSERVER_TIMEOUT — max seconds to wait for Word to produce the PDF (default: 120)
"""
from __future__ import annotations
import io
import logging
import os
import subprocess
import time
import uuid
from pathlib import Path
# Module-wide logger for the PDF conversion pipeline.
LOG = logging.getLogger("document_gen.pdf")

# MinIO connection settings, read once at import time from the environment
# (the same variables the workers container uses).
_MINIO_ENDPOINT = os.environ.get("MINIO_ENDPOINT", "minio")
_MINIO_PORT = int(os.environ.get("MINIO_PORT", "9000"))
_MINIO_ACCESS = os.environ.get("MINIO_ACCESS_KEY", "")
_MINIO_SECRET = os.environ.get("MINIO_SECRET_KEY", "")
_MINIO_BUCKET = os.environ.get("MINIO_BUCKET", "performancewest")
# Only the exact (case-insensitive) string "true" enables these flags.
_MINIO_SECURE = os.environ.get("MINIO_SECURE", "false").lower() == "true"
USE_DOCSERVER = os.environ.get("USE_DOCSERVER", "true").lower() == "true"

# Maximum number of seconds to wait for the Word VM to produce a PDF.
DOCSERVER_TIMEOUT = int(os.environ.get("DOCSERVER_TIMEOUT", "120"))
# Seconds to sleep between checks for the converted PDF.
_POLL_INTERVAL = 12

# Object-key prefixes inside the bucket.
_PREFIX_IN = "to-convert"  # DOCX files waiting to be processed
_PREFIX_OUT = "converted"  # PDF files ready for pickup
def _minio_client():
    """Build a MinIO client from the module-level connection settings.

    The ``minio`` package is imported inside the function, so it is only
    needed once a client is actually requested.
    """
    from minio import Minio  # type: ignore

    address = f"{_MINIO_ENDPOINT}:{_MINIO_PORT}"
    options = {
        "access_key": _MINIO_ACCESS,
        "secret_key": _MINIO_SECRET,
        "secure": _MINIO_SECURE,
    }
    return Minio(address, **options)
# ── Public API ────────────────────────────────────────────────────────────────
def convert_to_pdf(docx_path: str | Path, output_dir: str | Path | None = None) -> Path:
    """Convert one DOCX file to PDF and return the PDF's path.

    The Word VM (reached through MinIO) is attempted first when
    ``USE_DOCSERVER`` is on and a MinIO access key is configured. If that
    attempt raises for any reason, the error is logged as a warning and the
    LibreOffice headless converter is used instead.

    Args:
        docx_path: Location of the .docx file on disk.
        output_dir: Directory for the PDF; created if missing. Defaults to
            the directory containing the DOCX.

    Returns:
        Path of the generated PDF (same file name, ``.pdf`` suffix).

    Raises:
        FileNotFoundError: ``docx_path`` does not exist.
    """
    source = Path(docx_path)
    if not source.exists():
        raise FileNotFoundError(f"DOCX not found: {source}")

    target_dir = source.parent if not output_dir else Path(output_dir)
    target_dir.mkdir(parents=True, exist_ok=True)
    target_pdf = target_dir / source.with_suffix(".pdf").name

    vm_enabled = USE_DOCSERVER and _MINIO_ACCESS
    if not vm_enabled:
        return _convert_via_libreoffice(source, target_pdf, target_dir)

    try:
        return _convert_via_minio(source, target_pdf)
    except Exception as exc:
        # Any failure on the Word VM path is non-fatal: degrade to LibreOffice.
        LOG.warning(
            "Word VM via MinIO unavailable (%s) — falling back to LibreOffice", exc
        )
    return _convert_via_libreoffice(source, target_pdf, target_dir)
def convert_batch(docx_paths: list[str | Path], output_dir: str | Path) -> list[Path]:
    """Convert several DOCX files to PDFs in ``output_dir``.

    When the Word VM path is enabled and the input is non-empty, the whole
    batch is handed to ``_batch_via_minio``. If that is disabled or raises,
    each file is converted individually through ``convert_to_pdf`` (which
    itself may try the Word VM before LibreOffice). Files that fail
    individually are logged and omitted from the result.

    Args:
        docx_paths: DOCX files to convert.
        output_dir: Directory for the PDFs; created if missing.

    Returns:
        Paths of the PDFs that were produced.
    """
    sources = [Path(item) for item in docx_paths]
    target_dir = Path(output_dir)
    target_dir.mkdir(parents=True, exist_ok=True)

    if USE_DOCSERVER and _MINIO_ACCESS and sources:
        try:
            return _batch_via_minio(sources, target_dir)
        except Exception as exc:
            LOG.warning("Batch via Word VM failed (%s) — converting one by one via LibreOffice", exc)

    converted: list[Path] = []
    for source in sources:
        try:
            pdf = convert_to_pdf(source, target_dir)
        except Exception as exc:
            LOG.error("Failed to convert %s: %s", source.name, exc)
        else:
            converted.append(pdf)
    return converted
def health_check() -> dict:
    """Report whether each conversion backend looks usable.

    Returns:
        A dict with boolean ``libreoffice`` and ``docserver_minio`` entries.
        When the MinIO probe succeeds, ``minio_bucket`` is added; when it
        raises, ``minio_error`` carries the error text.
    """
    report: dict = {"libreoffice": False, "docserver_minio": False}

    # LibreOffice: usable if `libreoffice --version` exits with status 0.
    try:
        probe = subprocess.run(
            ["libreoffice", "--version"],
            capture_output=True, text=True, timeout=10,
        )
    except Exception:
        pass
    else:
        report["libreoffice"] = probe.returncode == 0

    if not (USE_DOCSERVER and _MINIO_ACCESS):
        return report

    # Word VM: only MinIO reachability is probed here. The bucket_exists()
    # result is ignored and no worker heartbeat is inspected, so a True value
    # means "MinIO answered", not "the Windows worker is running".
    try:
        _minio_client().bucket_exists(_MINIO_BUCKET)
    except Exception as exc:
        report["minio_error"] = str(exc)
    else:
        report["docserver_minio"] = True
        report["minio_bucket"] = _MINIO_BUCKET
    return report
# ── MinIO transport ───────────────────────────────────────────────────────────
def _convert_via_minio(docx_path: Path, pdf_path: Path) -> Path:
    """Upload a DOCX to MinIO, wait for the Word VM to convert it, download the PDF.

    Staged upload: the DOCX is first uploaded under a ``.tmp_`` key, then
    copied server-side to the live key and the staging object is removed.
    The intent (per the module's protocol notes) is that the Windows worker
    never sees a partially uploaded file; the worker itself is not part of
    this module.

    Args:
        docx_path: DOCX file on disk to convert.
        pdf_path: Where the downloaded PDF is written.

    Returns:
        ``pdf_path`` once the PDF has been downloaded.

    Raises:
        TimeoutError: no PDF appeared within ``DOCSERVER_TIMEOUT`` seconds.
        Exception: any MinIO or I/O error other than "converted object does
            not exist yet" is propagated immediately so the caller can fall
            back without waiting out the full timeout.
    """
    from minio.commonconfig import CopySource  # type: ignore
    from minio.error import S3Error  # type: ignore

    job_id = uuid.uuid4().hex
    tag = job_id[:8]  # short id used in log lines
    tmp_key = f"{_PREFIX_IN}/.tmp_{job_id}.docx"
    in_key = f"{_PREFIX_IN}/{job_id}.docx"
    out_key = f"{_PREFIX_OUT}/{job_id}.pdf"

    mc = _minio_client()

    # Ensure bucket exists
    if not mc.bucket_exists(_MINIO_BUCKET):
        mc.make_bucket(_MINIO_BUCKET)

    # Upload DOCX to the staging key first.
    LOG.info("[%s] Uploading %s → minio://%s/%s (staging)", tag, docx_path.name, _MINIO_BUCKET, tmp_key)
    mc.fput_object(
        _MINIO_BUCKET, tmp_key, str(docx_path),
        content_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        metadata={"x-amz-meta-source": docx_path.name},
    )

    # Promote staging → live with a server-side copy. The staging object is
    # removed whether or not the copy succeeds, so a failed copy cannot leak it.
    try:
        mc.copy_object(
            _MINIO_BUCKET, in_key,
            CopySource(_MINIO_BUCKET, tmp_key),
        )
    finally:
        try:
            mc.remove_object(_MINIO_BUCKET, tmp_key)
        except Exception as exc:
            LOG.warning("[%s] Could not remove staging object %s: %s", tag, tmp_key, exc)
    LOG.info("[%s] Staged → minio://%s/%s (live)", tag, _MINIO_BUCKET, in_key)

    # Poll for the converted PDF
    deadline = time.monotonic() + DOCSERVER_TIMEOUT
    LOG.info("[%s] Waiting for Word VM to convert (timeout=%ds)...", tag, DOCSERVER_TIMEOUT)
    delivered = False
    try:
        while True:
            try:
                mc.stat_object(_MINIO_BUCKET, out_key)
            except S3Error as exc:
                # Only "object does not exist" means "not converted yet".
                # Auth/bucket/server errors will not fix themselves by waiting.
                if exc.code != "NoSuchKey":
                    raise
            else:
                LOG.info("[%s] PDF ready — downloading", tag)
                mc.fget_object(_MINIO_BUCKET, out_key, str(pdf_path))
                delivered = True
                # Best-effort cleanup of the converted output in MinIO.
                try:
                    mc.remove_object(_MINIO_BUCKET, out_key)
                except Exception as exc:
                    LOG.debug("[%s] Could not remove %s: %s", tag, out_key, exc)
                LOG.info("[%s] PDF written: %s (%d bytes)", tag, pdf_path.name, pdf_path.stat().st_size)
                return pdf_path

            remaining = deadline - time.monotonic()
            if remaining <= 0:
                break
            # Never sleep past the deadline; the loop probes once more at it.
            time.sleep(min(_POLL_INTERVAL, remaining))
    finally:
        if not delivered:
            # Timed out or failed: withdraw the request so it is not left
            # queued under the live key (best effort).
            try:
                mc.remove_object(_MINIO_BUCKET, in_key)
            except Exception as exc:
                LOG.debug("[%s] Could not remove %s: %s", tag, in_key, exc)

    raise TimeoutError(
        f"Word VM did not convert {docx_path.name} within {DOCSERVER_TIMEOUT}s. "
        f"Is docserver_worker.py running and connected to MinIO?"
    )
def _batch_via_minio(docx_paths: list[Path], output_dir: Path) -> list[Path]:
    """Submit all DOCX files to the Word VM in parallel and collect the PDFs.

    One daemon thread per file runs ``_convert_via_minio``. All threads are
    joined against a single shared deadline. Every file that has no result by
    then (its VM attempt failed or is still running) is converted with
    LibreOffice in the calling thread, so the fallback is never cut short by
    the join timeout.

    Args:
        docx_paths: DOCX files to convert.
        output_dir: Directory the PDFs are written to.

    Returns:
        Paths of the PDFs that were produced, in input order; files for which
        both backends failed are omitted.
    """
    import threading

    pdf_paths = [output_dir / p.with_suffix(".pdf").name for p in docx_paths]
    results: list[Path | None] = [None] * len(docx_paths)

    def _convert_one(idx: int) -> None:
        # Each thread writes only its own slot, so no lock is needed.
        try:
            results[idx] = _convert_via_minio(docx_paths[idx], pdf_paths[idx])
        except Exception as exc:
            LOG.error("Batch item %d (%s) failed: %s", idx, docx_paths[idx].name, exc)

    threads = [
        threading.Thread(target=_convert_one, args=(i,), daemon=True)
        for i in range(len(docx_paths))
    ]
    for t in threads:
        t.start()

    # One deadline for the whole batch (the threads run concurrently), with
    # slack for a final poll interval plus upload/download time.
    deadline = time.monotonic() + DOCSERVER_TIMEOUT + _POLL_INTERVAL + 10
    for t in threads:
        t.join(timeout=max(0.0, deadline - time.monotonic()))

    converted: list[Path] = []
    for idx, (docx_path, pdf_path) in enumerate(zip(docx_paths, pdf_paths)):
        pdf = results[idx]
        if pdf is None:
            if threads[idx].is_alive():
                LOG.error(
                    "Batch item %d (%s) did not finish on the Word VM in time",
                    idx, docx_path.name,
                )
            # Fallback per-file
            try:
                pdf = _convert_via_libreoffice(docx_path, pdf_path, output_dir)
            except Exception as lo_exc:
                LOG.error("LibreOffice fallback also failed for %s: %s", docx_path.name, lo_exc)
                continue
        converted.append(pdf)
    return converted
# ── LibreOffice fallback ──────────────────────────────────────────────────────
def _convert_via_libreoffice(docx_path: Path, pdf_path: Path, out_dir: Path) -> Path:
    """Convert a DOCX to PDF with LibreOffice in headless mode (fallback path).

    Args:
        docx_path: DOCX file to convert.
        pdf_path: Path where the resulting PDF is expected to appear.
        out_dir: Directory passed to LibreOffice as ``--outdir``.

    Returns:
        ``pdf_path`` once the file exists.

    Raises:
        RuntimeError: LibreOffice exited non-zero, or exited zero without
            leaving a file at ``pdf_path``.
    """
    LOG.info("Converting %s via LibreOffice headless...", docx_path.name)

    proc = subprocess.run(
        [
            "libreoffice", "--headless",
            "--convert-to", "pdf",
            "--outdir", str(out_dir),
            str(docx_path),
        ],
        capture_output=True,
        text=True,
        timeout=120,
    )

    if proc.returncode != 0:
        LOG.error("LibreOffice conversion failed: %s", proc.stderr)
        raise RuntimeError(f"LibreOffice failed: {proc.stderr[:300]}")

    # A zero exit status alone is not trusted; the output file must exist.
    if pdf_path.exists():
        LOG.info("PDF created via LibreOffice: %s (%d bytes)", pdf_path.name, pdf_path.stat().st_size)
        return pdf_path

    raise RuntimeError(f"PDF not found at expected path after LibreOffice: {pdf_path}")