"""
DOCX → PDF conversion.

Primary:  Windows Word VM via MinIO (pixel-perfect, no open ports required).
Fallback: LibreOffice headless (70-80% fidelity, always available in container).

MinIO transport protocol
─────────────────────────
  PUT  docx → {bucket}/to-convert/{job_id}.docx   (this module)
  WAIT poll → {bucket}/converted/{job_id}.pdf     (this module)
  GET  pdf  ← {bucket}/converted/{job_id}.pdf     (this module)
  DEL  docx ← {bucket}/to-convert/{job_id}.docx   (docserver_worker.py; this module on timeout)
  DEL  pdf  ← {bucket}/converted/{job_id}.pdf     (this module, after download)

The Windows VM runs docserver_worker.py which:
  1. Polls to-convert/ every 12 seconds
  2. Downloads the DOCX, converts via Word COM, uploads the PDF to converted/
  3. Deletes the source DOCX from to-convert/

No HTTP server, no open ports, no SSH tunnel. Only MinIO is needed.

Environment variables (same MinIO creds as the workers):
  MINIO_ENDPOINT    — MinIO host (default: minio)
  MINIO_PORT        — MinIO port (default: 9000)
  MINIO_ACCESS_KEY  — access key (when empty, the Word VM path is skipped)
  MINIO_SECRET_KEY  — secret key
  MINIO_BUCKET      — bucket name (default: performancewest)
  MINIO_SECURE      — "true" sets the MinIO client's `secure` flag (default: false)
  USE_DOCSERVER     — enable Word VM path (default: true)
  DOCSERVER_TIMEOUT — max seconds to wait for Word to produce the PDF (default: 120)
"""
from __future__ import annotations

import io
import logging
import os
import subprocess
import time
import uuid
from pathlib import Path

LOG = logging.getLogger("document_gen.pdf")

# MinIO settings — inherited from the workers container env
_MINIO_ENDPOINT = os.getenv("MINIO_ENDPOINT", "minio")
# int() raises ValueError at import time if the variable is not numeric.
_MINIO_PORT = int(os.getenv("MINIO_PORT", "9000"))
_MINIO_ACCESS = os.getenv("MINIO_ACCESS_KEY", "")
_MINIO_SECRET = os.getenv("MINIO_SECRET_KEY", "")
_MINIO_BUCKET = os.getenv("MINIO_BUCKET", "performancewest")
# Boolean flags: only the literal string "true" (any letter case) enables them.
_MINIO_SECURE = os.getenv("MINIO_SECURE", "false").lower() == "true"

USE_DOCSERVER = os.getenv("USE_DOCSERVER", "true").lower() == "true"
DOCSERVER_TIMEOUT = int(os.getenv("DOCSERVER_TIMEOUT", "120"))  # seconds
_POLL_INTERVAL = 12  # seconds between polls for the converted PDF
# MinIO key prefixes _PREFIX_IN = "to-convert" # docx files waiting to be processed _PREFIX_OUT = "converted" # pdf files ready for pickup def _minio_client(): """Return a configured MinIO client.""" from minio import Minio # type: ignore return Minio( f"{_MINIO_ENDPOINT}:{_MINIO_PORT}", access_key=_MINIO_ACCESS, secret_key=_MINIO_SECRET, secure=_MINIO_SECURE, ) # ── Public API ──────────────────────────────────────────────────────────────── def convert_to_pdf(docx_path: str | Path, output_dir: str | Path | None = None) -> Path: """Convert a DOCX to PDF. Tries the Word VM via MinIO first (pixel-perfect). Falls back to LibreOffice headless if the VM is unavailable or slow. Args: docx_path: Path to the .docx file on disk output_dir: Where to write the PDF (defaults to same dir as docx) Returns: Path to the generated PDF file """ docx_path = Path(docx_path) if not docx_path.exists(): raise FileNotFoundError(f"DOCX not found: {docx_path}") out_dir = Path(output_dir) if output_dir else docx_path.parent out_dir.mkdir(parents=True, exist_ok=True) pdf_path = out_dir / docx_path.with_suffix(".pdf").name if USE_DOCSERVER and _MINIO_ACCESS: try: return _convert_via_minio(docx_path, pdf_path) except Exception as exc: LOG.warning( "Word VM via MinIO unavailable (%s) — falling back to LibreOffice", exc ) return _convert_via_libreoffice(docx_path, pdf_path, out_dir) def convert_batch(docx_paths: list[str | Path], output_dir: str | Path) -> list[Path]: """Convert multiple DOCX files to PDFs. Submits all jobs to the Word VM concurrently (each gets its own MinIO key), then collects results as they arrive. Falls back per-file to LibreOffice. 
""" docx_paths = [Path(p) for p in docx_paths] output_dir = Path(output_dir) output_dir.mkdir(parents=True, exist_ok=True) if USE_DOCSERVER and _MINIO_ACCESS and docx_paths: try: return _batch_via_minio(docx_paths, output_dir) except Exception as exc: LOG.warning("Batch via Word VM failed (%s) — converting one by one via LibreOffice", exc) results = [] for docx_path in docx_paths: try: results.append(convert_to_pdf(docx_path, output_dir)) except Exception as exc: LOG.error("Failed to convert %s: %s", docx_path.name, exc) return results def health_check() -> dict: """Return status of both conversion backends.""" status: dict = {"libreoffice": False, "docserver_minio": False} # LibreOffice try: r = subprocess.run( ["libreoffice", "--version"], capture_output=True, text=True, timeout=10, ) status["libreoffice"] = r.returncode == 0 except Exception: pass # Word VM — check if the MinIO bucket is accessible and if the worker # has recently touched a heartbeat object if USE_DOCSERVER and _MINIO_ACCESS: try: mc = _minio_client() mc.bucket_exists(_MINIO_BUCKET) # just checks connectivity status["docserver_minio"] = True status["minio_bucket"] = _MINIO_BUCKET except Exception as exc: status["minio_error"] = str(exc) return status # ── MinIO transport ─────────────────────────────────────────────────────────── def _convert_via_minio(docx_path: Path, pdf_path: Path) -> Path: """Upload DOCX to MinIO, wait for the Word VM to convert it, download PDF. Atomic upload: the DOCX is first uploaded to a .tmp key, then renamed (copy + delete) to the final key. This prevents the Windows worker from downloading a partially-uploaded file. 
""" from minio.commonconfig import CopySource # type: ignore job_id = str(uuid.uuid4()).replace("-", "") tmp_key = f"{_PREFIX_IN}/.tmp_{job_id}.docx" in_key = f"{_PREFIX_IN}/{job_id}.docx" out_key = f"{_PREFIX_OUT}/{job_id}.pdf" mc = _minio_client() # Ensure bucket exists if not mc.bucket_exists(_MINIO_BUCKET): mc.make_bucket(_MINIO_BUCKET) # Upload DOCX to temp key first (invisible to worker — it ignores .tmp_ prefix) LOG.info("[%s] Uploading %s → minio://%s/%s (staging)", job_id[:8], docx_path.name, _MINIO_BUCKET, tmp_key) mc.fput_object( _MINIO_BUCKET, tmp_key, str(docx_path), content_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document", metadata={"x-amz-meta-source": docx_path.name}, ) # Atomic rename: copy tmp → final, then delete tmp # MinIO copy_object is a server-side operation — the object appears # at the destination key atomically (no partial state visible) mc.copy_object( _MINIO_BUCKET, in_key, CopySource(_MINIO_BUCKET, tmp_key), ) mc.remove_object(_MINIO_BUCKET, tmp_key) LOG.info("[%s] Staged → minio://%s/%s (live)", job_id[:8], _MINIO_BUCKET, in_key) # Poll for the converted PDF deadline = time.monotonic() + DOCSERVER_TIMEOUT LOG.info("[%s] Waiting for Word VM to convert (timeout=%ds)...", job_id[:8], DOCSERVER_TIMEOUT) while time.monotonic() < deadline: try: mc.stat_object(_MINIO_BUCKET, out_key) # Object exists — download it LOG.info("[%s] PDF ready — downloading", job_id[:8]) mc.fget_object(_MINIO_BUCKET, out_key, str(pdf_path)) # Clean up the converted output from MinIO try: mc.remove_object(_MINIO_BUCKET, out_key) except Exception: pass LOG.info("[%s] PDF written: %s (%d bytes)", job_id[:8], pdf_path.name, pdf_path.stat().st_size) return pdf_path except Exception: # Object not there yet — keep waiting time.sleep(_POLL_INTERVAL) # Timed out — clean up the orphaned DOCX and raise try: mc.remove_object(_MINIO_BUCKET, in_key) except Exception: pass raise TimeoutError( f"Word VM did not convert {docx_path.name} within 
{DOCSERVER_TIMEOUT}s. " f"Is docserver_worker.py running and connected to MinIO?" ) def _batch_via_minio(docx_paths: list[Path], output_dir: Path) -> list[Path]: """Submit all DOCX files in parallel, collect results.""" import threading results: list[Path | None] = [None] * len(docx_paths) errors: list[str | None] = [None] * len(docx_paths) def _convert_one(idx: int, docx_path: Path) -> None: pdf_path = output_dir / docx_path.with_suffix(".pdf").name try: results[idx] = _convert_via_minio(docx_path, pdf_path) except Exception as exc: LOG.error("Batch item %d (%s) failed: %s", idx, docx_path.name, exc) errors[idx] = str(exc) # Fallback per-file try: results[idx] = _convert_via_libreoffice(docx_path, pdf_path, output_dir) except Exception as lo_exc: LOG.error("LibreOffice fallback also failed for %s: %s", docx_path.name, lo_exc) threads = [ threading.Thread(target=_convert_one, args=(i, p), daemon=True) for i, p in enumerate(docx_paths) ] for t in threads: t.start() for t in threads: t.join(timeout=DOCSERVER_TIMEOUT + 10) return [r for r in results if r is not None] # ── LibreOffice fallback ────────────────────────────────────────────────────── def _convert_via_libreoffice(docx_path: Path, pdf_path: Path, out_dir: Path) -> Path: """Convert DOCX to PDF using LibreOffice headless (fallback).""" LOG.info("Converting %s via LibreOffice headless...", docx_path.name) cmd = [ "libreoffice", "--headless", "--convert-to", "pdf", "--outdir", str(out_dir), str(docx_path), ] result = subprocess.run(cmd, capture_output=True, text=True, timeout=120) if result.returncode != 0: LOG.error("LibreOffice conversion failed: %s", result.stderr) raise RuntimeError(f"LibreOffice failed: {result.stderr[:300]}") if not pdf_path.exists(): raise RuntimeError(f"PDF not found at expected path after LibreOffice: {pdf_path}") LOG.info("PDF created via LibreOffice: %s (%d bytes)", pdf_path.name, pdf_path.stat().st_size) return pdf_path