Includes: API (Express/TypeScript), Astro site, Python workers, document generators, FCC compliance tools, Canada CRTC formation, Ansible infrastructure, and deployment scripts. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
285 lines
11 KiB
Python
285 lines
11 KiB
Python
"""
|
|
DOCX → PDF conversion.
|
|
|
|
Primary: Windows Word VM via MinIO (pixel-perfect, no open ports required).
|
|
Fallback: LibreOffice headless (70-80% fidelity, always available in container).
|
|
|
|
MinIO transport protocol
|
|
─────────────────────────
|
|
PUT docx → {bucket}/to-convert/{job_id}.docx (this module)
|
|
WAIT poll → {bucket}/converted/{job_id}.pdf (this module)
|
|
GET pdf ← {bucket}/converted/{job_id}.pdf (this module)
|
|
DEL docx ← {bucket}/to-convert/{job_id}.docx (docserver_worker.py)
|
|
DEL pdf ← {bucket}/converted/{job_id}.pdf (this module, after download)
|
|
|
|
The Windows VM runs docserver_worker.py which:
|
|
1. Polls to-convert/ every 12 seconds
|
|
2. Downloads the DOCX, converts via Word COM, uploads the PDF to converted/
|
|
3. Deletes the source DOCX from to-convert/
|
|
|
|
No HTTP server, no open ports, no SSH tunnel. Only MinIO is needed.
|
|
|
|
Environment variables (same MinIO creds as the workers):
|
|
MINIO_ENDPOINT — MinIO host (default: minio)
|
|
MINIO_PORT — MinIO port (default: 9000)
|
|
MINIO_ACCESS_KEY — access key
|
|
MINIO_SECRET_KEY — secret key
|
|
MINIO_BUCKET — bucket name (default: performancewest)
|
|
USE_DOCSERVER — enable Word VM path (default: true)
|
|
DOCSERVER_TIMEOUT — max seconds to wait for Word to produce the PDF (default: 120)
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import io
|
|
import logging
|
|
import os
|
|
import subprocess
|
|
import time
|
|
import uuid
|
|
from pathlib import Path
|
|
|
|
# Module logger; every message from this file is emitted under this name.
LOG = logging.getLogger("document_gen.pdf")

# MinIO settings — inherited from the workers container env.
# All values are read once, at import time.
_MINIO_ENDPOINT = os.getenv("MINIO_ENDPOINT", "minio")
_MINIO_PORT = int(os.getenv("MINIO_PORT", "9000"))
# An empty access key disables the Word VM path in convert_to_pdf/convert_batch.
_MINIO_ACCESS = os.getenv("MINIO_ACCESS_KEY", "")
_MINIO_SECRET = os.getenv("MINIO_SECRET_KEY", "")
_MINIO_BUCKET = os.getenv("MINIO_BUCKET", "performancewest")
# Only the (case-insensitive) string "true" enables this; forwarded as `secure=`.
_MINIO_SECURE = os.getenv("MINIO_SECURE", "false").lower() == "true"

# Word VM switch: only the (case-insensitive) string "true" enables it.
USE_DOCSERVER = os.getenv("USE_DOCSERVER", "true").lower() == "true"
DOCSERVER_TIMEOUT = int(os.getenv("DOCSERVER_TIMEOUT", "120"))  # seconds
_POLL_INTERVAL = 12  # seconds between polls for the converted PDF

# MinIO key prefixes
_PREFIX_IN = "to-convert"  # docx files waiting to be processed
_PREFIX_OUT = "converted"  # pdf files ready for pickup
|
|
|
|
|
|
def _minio_client():
    """Build a MinIO client from the module-level connection settings.

    The ``minio`` package is imported inside the function, so it is only
    needed when a client is actually requested.
    """
    from minio import Minio  # type: ignore

    endpoint = f"{_MINIO_ENDPOINT}:{_MINIO_PORT}"
    credentials = {"access_key": _MINIO_ACCESS, "secret_key": _MINIO_SECRET}
    return Minio(endpoint, secure=_MINIO_SECURE, **credentials)
|
|
|
|
|
|
# ── Public API ────────────────────────────────────────────────────────────────
|
|
|
|
def convert_to_pdf(docx_path: str | Path, output_dir: str | Path | None = None) -> Path:
    """Render a single DOCX file as a PDF and return the PDF's path.

    The Word VM (reached through MinIO) is attempted first when it is enabled
    and an access key is configured. Any error on that path is logged as a
    warning and the conversion is redone with LibreOffice headless.

    Args:
        docx_path: Location of the .docx file on disk.
        output_dir: Directory for the PDF; a falsy value means "next to the
            DOCX". The directory is created if it does not exist.

    Returns:
        Path of the generated PDF (the DOCX file name with a ``.pdf`` suffix).

    Raises:
        FileNotFoundError: If ``docx_path`` does not exist.
    """
    source = Path(docx_path)
    if not source.exists():
        raise FileNotFoundError(f"DOCX not found: {source}")

    target_dir = Path(output_dir) if output_dir else source.parent
    target_dir.mkdir(parents=True, exist_ok=True)
    target_pdf = target_dir / source.with_suffix(".pdf").name

    word_vm_enabled = USE_DOCSERVER and _MINIO_ACCESS
    if not word_vm_enabled:
        return _convert_via_libreoffice(source, target_pdf, target_dir)

    try:
        return _convert_via_minio(source, target_pdf)
    except Exception as exc:
        LOG.warning(
            "Word VM via MinIO unavailable (%s) — falling back to LibreOffice", exc
        )

    return _convert_via_libreoffice(source, target_pdf, target_dir)
|
|
|
|
|
|
def convert_batch(docx_paths: list[str | Path], output_dir: str | Path) -> list[Path]:
    """Convert multiple DOCX files to PDFs.

    When the Word VM path is enabled, all files are handed to
    ``_batch_via_minio``. If that call itself raises, the Word VM is treated
    as unusable for this batch and every file is converted directly with
    LibreOffice. It is deliberately *not* retried through ``convert_to_pdf``,
    which would try the Word VM again and could block for up to
    ``DOCSERVER_TIMEOUT`` seconds per file.

    Args:
        docx_paths: DOCX files to convert.
        output_dir: Directory that receives the PDFs (created if missing).

    Returns:
        Paths of the PDFs that were produced. Files whose conversion failed
        are logged and omitted, so the list may be shorter than the input.
    """
    docx_paths = [Path(p) for p in docx_paths]
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    if not docx_paths:
        return []

    word_vm_failed = False
    if USE_DOCSERVER and _MINIO_ACCESS:
        try:
            return _batch_via_minio(docx_paths, output_dir)
        except Exception as exc:
            LOG.warning("Batch via Word VM failed (%s) — converting one by one via LibreOffice", exc)
            word_vm_failed = True

    results: list[Path] = []
    for docx_path in docx_paths:
        try:
            if word_vm_failed:
                # Skip the Word VM, which just failed for the whole batch.
                if not docx_path.exists():
                    raise FileNotFoundError(f"DOCX not found: {docx_path}")
                pdf_path = output_dir / docx_path.with_suffix(".pdf").name
                results.append(_convert_via_libreoffice(docx_path, pdf_path, output_dir))
            else:
                # Word VM path is disabled here, so this goes to LibreOffice.
                results.append(convert_to_pdf(docx_path, output_dir))
        except Exception as exc:
            LOG.error("Failed to convert %s: %s", docx_path.name, exc)
    return results
|
|
|
|
|
|
def health_check() -> dict:
    """Report which conversion backends currently respond.

    Returns:
        A dict with boolean ``libreoffice`` and ``docserver_minio`` entries.
        When the MinIO probe runs, the dict also carries either
        ``minio_bucket`` (probe succeeded) or ``minio_error`` (probe raised).
    """
    report: dict = {"libreoffice": False, "docserver_minio": False}

    # LibreOffice is healthy when `libreoffice --version` exits with 0.
    # Any failure to run the command simply leaves the flag False.
    try:
        probe = subprocess.run(
            ["libreoffice", "--version"],
            capture_output=True,
            text=True,
            timeout=10,
        )
    except Exception:
        pass
    else:
        report["libreoffice"] = probe.returncode == 0

    # The Word VM path is only probed when enabled and credentials are set.
    if not (USE_DOCSERVER and _MINIO_ACCESS):
        return report

    # Connectivity probe only: the call must not raise. Its boolean result
    # (whether the bucket exists) is intentionally not inspected, and the
    # Windows worker itself is not contacted.
    try:
        _minio_client().bucket_exists(_MINIO_BUCKET)
    except Exception as exc:
        report["minio_error"] = str(exc)
    else:
        report["docserver_minio"] = True
        report["minio_bucket"] = _MINIO_BUCKET

    return report
|
|
|
|
|
|
# ── MinIO transport ───────────────────────────────────────────────────────────
|
|
|
|
def _remove_object_quietly(mc, key: str) -> None:
    """Best-effort delete of *key* from the bucket; a failure is only logged."""
    try:
        mc.remove_object(_MINIO_BUCKET, key)
    except Exception as exc:
        LOG.debug("Could not remove minio://%s/%s: %s", _MINIO_BUCKET, key, exc)


def _convert_via_minio(docx_path: Path, pdf_path: Path) -> Path:
    """Upload DOCX to MinIO, wait for the Word VM to convert it, download PDF.

    Staged upload: the DOCX is first uploaded to a ``.tmp_`` key and then
    copied server-side to the final key, so the Windows worker never sees a
    partially-uploaded file. The staging key is removed afterwards, even if
    the copy fails.

    Polling: the output key is checked every ``_POLL_INTERVAL`` seconds until
    ``DOCSERVER_TIMEOUT`` elapses. A missing object is the normal "not ready
    yet" case. Any other error while polling or downloading is logged as a
    warning and polling continues, so a transient MinIO hiccup does not abort
    the job but no longer goes unnoticed. The final sleep is shortened so the
    function does not overshoot the deadline.

    Args:
        docx_path: DOCX file to convert.
        pdf_path: Where to write the downloaded PDF.

    Returns:
        ``pdf_path`` once the PDF has been downloaded.

    Raises:
        TimeoutError: If no PDF could be downloaded before the deadline.
        Exception: Errors from bucket setup or the upload propagate as-is.
    """
    from minio.commonconfig import CopySource  # type: ignore
    from minio.error import S3Error  # type: ignore

    job_id = uuid.uuid4().hex
    tag = job_id[:8]
    tmp_key = f"{_PREFIX_IN}/.tmp_{job_id}.docx"
    in_key = f"{_PREFIX_IN}/{job_id}.docx"
    out_key = f"{_PREFIX_OUT}/{job_id}.pdf"

    mc = _minio_client()

    # Ensure bucket exists
    if not mc.bucket_exists(_MINIO_BUCKET):
        mc.make_bucket(_MINIO_BUCKET)

    # Upload DOCX to the staging key first (the worker is expected to ignore
    # keys with the .tmp_ prefix).
    LOG.info("[%s] Uploading %s → minio://%s/%s (staging)", tag, docx_path.name, _MINIO_BUCKET, tmp_key)
    mc.fput_object(
        _MINIO_BUCKET, tmp_key, str(docx_path),
        content_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        metadata={"x-amz-meta-source": docx_path.name},
    )

    # Publish: server-side copy staging → final key. The staging object is
    # removed whether or not the copy succeeded, so it is never orphaned.
    try:
        mc.copy_object(
            _MINIO_BUCKET, in_key,
            CopySource(_MINIO_BUCKET, tmp_key),
        )
    finally:
        _remove_object_quietly(mc, tmp_key)
    LOG.info("[%s] Staged → minio://%s/%s (live)", tag, _MINIO_BUCKET, in_key)

    # Poll for the converted PDF
    deadline = time.monotonic() + DOCSERVER_TIMEOUT
    LOG.info("[%s] Waiting for Word VM to convert (timeout=%ds)...", tag, DOCSERVER_TIMEOUT)

    while True:
        pdf_ready = False
        try:
            mc.stat_object(_MINIO_BUCKET, out_key)
            pdf_ready = True
        except S3Error as exc:
            # "NoSuchKey" means the worker has not produced the PDF yet.
            if exc.code != "NoSuchKey":
                LOG.warning("[%s] MinIO error while polling for PDF: %s", tag, exc)
        except Exception as exc:
            LOG.warning("[%s] Could not poll MinIO for PDF: %s", tag, exc)

        if pdf_ready:
            LOG.info("[%s] PDF ready — downloading", tag)
            try:
                mc.fget_object(_MINIO_BUCKET, out_key, str(pdf_path))
            except Exception as exc:
                LOG.warning("[%s] PDF download failed, will retry: %s", tag, exc)
            else:
                # Clean up the converted output from MinIO
                _remove_object_quietly(mc, out_key)
                LOG.info("[%s] PDF written: %s (%d bytes)", tag, pdf_path.name, pdf_path.stat().st_size)
                return pdf_path

        remaining = deadline - time.monotonic()
        if remaining <= 0:
            break
        time.sleep(min(_POLL_INTERVAL, remaining))

    # Timed out — remove the queued DOCX so the worker does not pick it up
    # later, then raise.
    _remove_object_quietly(mc, in_key)
    raise TimeoutError(
        f"Word VM did not convert {docx_path.name} within {DOCSERVER_TIMEOUT}s. "
        "Is docserver_worker.py running and connected to MinIO?"
    )
|
|
|
|
|
|
def _batch_via_minio(docx_paths: list[Path], output_dir: Path) -> list[Path]:
    """Submit all DOCX files to the Word VM in parallel, then collect results.

    Phase 1 starts one daemon thread per file, each running
    ``_convert_via_minio``. Phase 2 runs in the calling thread: every file
    that has no result yet is converted with LibreOffice, one at a time.

    Keeping the fallback out of the worker threads has two effects:

    * the join timeout only has to cover the Word VM wait, so a file is no
      longer dropped because its thread was still busy with the LibreOffice
      fallback when the join expired;
    * LibreOffice is never launched several times at once (concurrent
      headless instances share one user profile by default, which is a
      common source of failed conversions — NOTE(review): confirm for the
      container image in use).

    Args:
        docx_paths: DOCX files to convert.
        output_dir: Existing directory that receives the PDFs.

    Returns:
        Paths of the PDFs that were produced, in input order. Files for which
        both backends failed are logged and omitted.
    """
    import threading

    if not docx_paths:
        return []

    pdf_paths = [output_dir / p.with_suffix(".pdf").name for p in docx_paths]
    results: list[Path | None] = [None] * len(docx_paths)

    def _word_vm_job(idx: int, docx_path: Path, pdf_path: Path) -> None:
        # Each thread writes only its own slot, so no lock is needed.
        try:
            results[idx] = _convert_via_minio(docx_path, pdf_path)
        except Exception as exc:
            LOG.error("Batch item %d (%s) failed: %s", idx, docx_path.name, exc)

    threads = [
        threading.Thread(target=_word_vm_job, args=(i, src, dst), daemon=True)
        for i, (src, dst) in enumerate(zip(docx_paths, pdf_paths))
    ]
    for t in threads:
        t.start()
    for t in threads:
        t.join(timeout=DOCSERVER_TIMEOUT + 10)

    # Fallback per-file, sequentially, for everything the Word VM did not deliver.
    for idx, (t, docx_path, pdf_path) in enumerate(zip(threads, docx_paths, pdf_paths)):
        if results[idx] is not None:
            continue
        if t.is_alive():
            LOG.warning(
                "Batch item %d (%s) is still waiting on the Word VM — using LibreOffice",
                idx, docx_path.name,
            )
        try:
            results[idx] = _convert_via_libreoffice(docx_path, pdf_path, output_dir)
        except Exception as lo_exc:
            LOG.error("LibreOffice fallback also failed for %s: %s", docx_path.name, lo_exc)

    return [r for r in results if r is not None]
|
|
|
|
|
|
# ── LibreOffice fallback ──────────────────────────────────────────────────────
|
|
|
|
def _convert_via_libreoffice(docx_path: Path, pdf_path: Path, out_dir: Path) -> Path:
    """Convert DOCX to PDF using LibreOffice headless (fallback).

    LibreOffice writes into a scratch directory created inside ``out_dir``;
    the PDF is moved to ``pdf_path`` only after it has actually been produced.
    A PDF left at ``pdf_path`` by an earlier run can therefore no longer make
    a conversion that exited 0 without writing output look successful, and an
    existing PDF is kept untouched when the conversion fails.

    Args:
        docx_path: DOCX file to convert.
        pdf_path: Final location of the PDF.
        out_dir: Output directory (created if missing); hosts the scratch
            directory so the final move stays within one directory tree.

    Returns:
        ``pdf_path``.

    Raises:
        RuntimeError: If LibreOffice exits non-zero or produces no PDF.
        subprocess.TimeoutExpired: If LibreOffice runs longer than 120 seconds.
    """
    import tempfile

    LOG.info("Converting %s via LibreOffice headless...", docx_path.name)

    out_dir.mkdir(parents=True, exist_ok=True)
    with tempfile.TemporaryDirectory(prefix=".lo-convert-", dir=out_dir) as scratch:
        cmd = [
            "libreoffice", "--headless",
            "--convert-to", "pdf",
            "--outdir", scratch,
            str(docx_path),
        ]
        result = subprocess.run(cmd, capture_output=True, text=True, timeout=120)

        if result.returncode != 0:
            LOG.error("LibreOffice conversion failed: %s", result.stderr)
            raise RuntimeError(f"LibreOffice failed: {result.stderr[:300]}")

        # LibreOffice names its output after the input file.
        produced = Path(scratch) / docx_path.with_suffix(".pdf").name
        if not produced.exists():
            raise RuntimeError(f"PDF not found at expected path after LibreOffice: {pdf_path}")

        # Move the fresh PDF into place, replacing any previous file.
        produced.replace(pdf_path)

    LOG.info("PDF created via LibreOffice: %s (%d bytes)", pdf_path.name, pdf_path.stat().st_size)
    return pdf_path
|