# new-site/scripts/document_gen/pdf_converter.py

"""
DOCX → PDF conversion.

Primary:  Windows Word VM via MinIO (pixel-perfect, no open ports required).
Fallback: LibreOffice headless (70-80% fidelity, always available in container).

MinIO transport protocol
─────────────────────────
  PUT  docx → {bucket}/to-convert/{job_id}.docx   (this module)
  WAIT poll  → {bucket}/converted/{job_id}.pdf      (this module)
  GET  pdf   ← {bucket}/converted/{job_id}.pdf      (this module)
  DEL  docx  ← {bucket}/to-convert/{job_id}.docx    (docserver_worker.py)
  DEL  pdf   ← {bucket}/converted/{job_id}.pdf      (this module, after download)

The Windows VM runs docserver_worker.py which:
  1. Polls to-convert/ every 12 seconds
  2. Downloads the DOCX, converts via Word COM, uploads the PDF to converted/
  3. Deletes the source DOCX from to-convert/

No HTTP server, no open ports, no SSH tunnel. Only MinIO is needed.

Environment variables (same MinIO creds as the workers):
  MINIO_ENDPOINT     — MinIO host (default: minio)
  MINIO_PORT         — MinIO port (default: 9000)
  MINIO_ACCESS_KEY   — access key
  MINIO_SECRET_KEY   — secret key
  MINIO_BUCKET       — bucket name (default: performancewest)
  MINIO_SECURE       — "true" to connect to MinIO over TLS (default: false)
  USE_DOCSERVER      — enable Word VM path (default: true)
  DOCSERVER_TIMEOUT  — max seconds to wait for Word to produce the PDF (default: 120)
"""

from __future__ import annotations

import io
import logging
import os
import subprocess
import time
import uuid
from pathlib import Path

LOG = logging.getLogger("document_gen.pdf")

# MinIO connection settings. All of them are read from the process
# environment once, when this module is imported.
_MINIO_ENDPOINT = os.environ.get("MINIO_ENDPOINT", "minio")
_MINIO_PORT = int(os.environ.get("MINIO_PORT", "9000"))
_MINIO_ACCESS = os.environ.get("MINIO_ACCESS_KEY", "")
_MINIO_SECRET = os.environ.get("MINIO_SECRET_KEY", "")
_MINIO_BUCKET = os.environ.get("MINIO_BUCKET", "performancewest")
# Boolean flags are enabled only by the word "true" (any letter case).
_MINIO_SECURE = os.environ.get("MINIO_SECURE", "false").lower() == "true"

# Word VM path: on/off switch and how long to wait for a converted PDF.
USE_DOCSERVER = os.environ.get("USE_DOCSERVER", "true").lower() == "true"
DOCSERVER_TIMEOUT = int(os.environ.get("DOCSERVER_TIMEOUT", "120"))  # seconds
_POLL_INTERVAL = 12  # seconds between polls for the converted PDF

# Object key prefixes inside the bucket.
_PREFIX_IN = "to-convert"   # DOCX files waiting to be processed
_PREFIX_OUT = "converted"   # PDF files ready for pickup


def _minio_client():
    """Build a MinIO client from the module-level connection settings.

    The ``minio`` package is imported inside the function, so merely
    importing this module does not require it to be installed.
    """
    from minio import Minio  # type: ignore

    endpoint = f"{_MINIO_ENDPOINT}:{_MINIO_PORT}"
    credentials = {"access_key": _MINIO_ACCESS, "secret_key": _MINIO_SECRET}
    return Minio(endpoint, secure=_MINIO_SECURE, **credentials)


# ── Public API ────────────────────────────────────────────────────────────────

def convert_to_pdf(docx_path: str | Path, output_dir: str | Path | None = None) -> Path:
    """Convert a single DOCX file to PDF.

    The Word VM (via MinIO) is tried first when ``USE_DOCSERVER`` is on and
    a MinIO access key is configured. If that attempt raises, the failure is
    logged as a warning and LibreOffice headless is used instead.

    Args:
        docx_path:  Path to the .docx file on disk
        output_dir: Where to write the PDF (defaults to same dir as docx);
                    created if it does not exist

    Returns:
        Path to the generated PDF file

    Raises:
        FileNotFoundError: ``docx_path`` does not exist.
        Exception: whatever the LibreOffice fallback raises when it fails too.
    """
    source = Path(docx_path)
    if not source.exists():
        raise FileNotFoundError(f"DOCX not found: {source}")

    # A falsy output_dir (None or "") means "next to the source file".
    target_dir = source.parent if not output_dir else Path(output_dir)
    target_dir.mkdir(parents=True, exist_ok=True)
    target_pdf = target_dir / source.with_suffix(".pdf").name

    word_vm_enabled = bool(USE_DOCSERVER and _MINIO_ACCESS)
    if word_vm_enabled:
        try:
            return _convert_via_minio(source, target_pdf)
        except Exception as err:
            LOG.warning(
                "Word VM via MinIO unavailable (%s) — falling back to LibreOffice", err
            )

    return _convert_via_libreoffice(source, target_pdf, target_dir)


def convert_batch(docx_paths: list[str | Path], output_dir: str | Path) -> list[Path]:
    """Convert several DOCX files to PDFs in ``output_dir``.

    When the Word VM path is enabled and there is at least one input, the
    whole batch is handed to ``_batch_via_minio``. If that call raises (or
    the Word VM path is disabled), each file goes through ``convert_to_pdf``
    individually — note that ``convert_to_pdf`` itself may try the Word VM
    again before using LibreOffice.

    Returns:
        Paths of the PDFs that were produced. Files that failed are logged
        and left out, so the list can be shorter than ``docx_paths``.
    """
    sources = [Path(item) for item in docx_paths]
    destination = Path(output_dir)
    destination.mkdir(parents=True, exist_ok=True)

    if USE_DOCSERVER and _MINIO_ACCESS and sources:
        try:
            return _batch_via_minio(sources, destination)
        except Exception as exc:
            LOG.warning("Batch via Word VM failed (%s) — converting one by one via LibreOffice", exc)

    converted = []
    for source in sources:
        try:
            pdf = convert_to_pdf(source, destination)
        except Exception as exc:
            LOG.error("Failed to convert %s: %s", source.name, exc)
        else:
            converted.append(pdf)
    return converted


def health_check() -> dict:
    """Report whether each conversion backend looks usable.

    Returns:
        A dict that always has the boolean keys ``libreoffice`` and
        ``docserver_minio``. When the MinIO probe succeeds ``minio_bucket``
        is added; when it raises, ``minio_error`` holds the error text.
    """
    report: dict = {"libreoffice": False, "docserver_minio": False}

    # LibreOffice: usable if `libreoffice --version` exits with status 0.
    try:
        probe = subprocess.run(
            ["libreoffice", "--version"],
            capture_output=True, text=True, timeout=10,
        )
    except Exception:
        # Missing binary, timeout, etc. — the flag simply stays False.
        pass
    else:
        report["libreoffice"] = probe.returncode == 0

    # Word VM path: skipped entirely when disabled or no access key is set.
    if not (USE_DOCSERVER and _MINIO_ACCESS):
        return report

    try:
        client = _minio_client()
        # Connectivity probe only: the call's return value is not inspected,
        # and nothing here verifies that the Windows worker is running.
        client.bucket_exists(_MINIO_BUCKET)
    except Exception as exc:
        report["minio_error"] = str(exc)
    else:
        report["docserver_minio"] = True
        report["minio_bucket"] = _MINIO_BUCKET

    return report


# ── MinIO transport ───────────────────────────────────────────────────────────

def _convert_via_minio(docx_path: Path, pdf_path: Path) -> Path:
    """Upload DOCX to MinIO, wait for the Word VM to convert it, download PDF.

    Staged upload: the DOCX is first uploaded to a ``.tmp_`` key, then
    copied server-side to the final key and the staging key is deleted, so
    the final key only ever appears as a complete object.

    Polling: the output key is checked every ``_POLL_INTERVAL`` seconds
    (never sleeping past the deadline, with a last check at the deadline).
    A "NoSuchKey" answer just means "not converted yet". Any other polling
    error is logged and polling continues; if the deadline passes, the
    ``TimeoutError`` is chained to the last such error so the real cause is
    visible. A failure while downloading the finished PDF is raised as-is
    rather than being reported as a timeout.

    Args:
        docx_path: Existing .docx file to convert
        pdf_path:  Where the downloaded PDF is written

    Returns:
        ``pdf_path``

    Raises:
        TimeoutError: no PDF appeared within ``DOCSERVER_TIMEOUT`` seconds.
        Exception: errors from the upload, the server-side copy or the
            download propagate unchanged.
    """
    from minio.commonconfig import CopySource  # type: ignore
    from minio.error import S3Error  # type: ignore

    job_id = uuid.uuid4().hex
    tag = job_id[:8]  # short id used to correlate log lines
    tmp_key = f"{_PREFIX_IN}/.tmp_{job_id}.docx"
    in_key = f"{_PREFIX_IN}/{job_id}.docx"
    out_key = f"{_PREFIX_OUT}/{job_id}.pdf"

    mc = _minio_client()

    def _discard(key: str) -> None:
        """Best-effort delete; a failed cleanup must not mask the real outcome."""
        try:
            mc.remove_object(_MINIO_BUCKET, key)
        except Exception as exc:
            LOG.debug("[%s] Could not remove minio://%s/%s: %s", tag, _MINIO_BUCKET, key, exc)

    # Ensure bucket exists
    if not mc.bucket_exists(_MINIO_BUCKET):
        mc.make_bucket(_MINIO_BUCKET)

    # Upload to the staging key first (the worker is expected to skip
    # `.tmp_` keys — that behaviour lives in docserver_worker.py).
    LOG.info("[%s] Uploading %s → minio://%s/%s (staging)", tag, docx_path.name, _MINIO_BUCKET, tmp_key)
    mc.fput_object(
        _MINIO_BUCKET, tmp_key, str(docx_path),
        content_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        metadata={"x-amz-meta-source": docx_path.name},
    )

    # Server-side copy staging → final key. The staging object is removed
    # whether or not the copy succeeds, so a failed copy leaves no orphan.
    try:
        mc.copy_object(
            _MINIO_BUCKET, in_key,
            CopySource(_MINIO_BUCKET, tmp_key),
        )
    finally:
        _discard(tmp_key)
    LOG.info("[%s] Staged → minio://%s/%s (live)", tag, _MINIO_BUCKET, in_key)

    # Poll for the converted PDF
    deadline = time.monotonic() + DOCSERVER_TIMEOUT
    LOG.info("[%s] Waiting for Word VM to convert (timeout=%ds)...", tag, DOCSERVER_TIMEOUT)

    last_error: Exception | None = None
    while True:
        try:
            mc.stat_object(_MINIO_BUCKET, out_key)
            break
        except Exception as exc:
            not_ready = isinstance(exc, S3Error) and exc.code == "NoSuchKey"
            if not not_ready:
                # Keep polling (the error may be transient) but make it visible.
                last_error = exc
                LOG.warning("[%s] Error while polling for PDF: %s", tag, exc)

        remaining = deadline - time.monotonic()
        if remaining <= 0:
            # Timed out — withdraw the DOCX so the worker does not convert
            # a job nobody is waiting for any more.
            _discard(in_key)
            raise TimeoutError(
                f"Word VM did not convert {docx_path.name} within {DOCSERVER_TIMEOUT}s. "
                f"Is docserver_worker.py running and connected to MinIO?"
            ) from last_error
        time.sleep(min(_POLL_INTERVAL, remaining))

    LOG.info("[%s] PDF ready — downloading", tag)
    try:
        mc.fget_object(_MINIO_BUCKET, out_key, str(pdf_path))
    finally:
        # The job id is random, so nobody else will ever collect this
        # object: remove it whether or not the download worked.
        _discard(out_key)

    LOG.info("[%s] PDF written: %s (%d bytes)", tag, pdf_path.name, pdf_path.stat().st_size)
    return pdf_path


def _batch_via_minio(docx_paths: list[Path], output_dir: Path) -> list[Path]:
    """Submit all DOCX files to the Word VM in parallel, then collect results.

    One daemon thread per file runs ``_convert_via_minio``. Each thread is
    joined for at most ``DOCSERVER_TIMEOUT + 10`` seconds. Every file that
    has no result after the joins (its conversion raised, or its thread is
    still running) is then converted with LibreOffice, one at a time, in the
    calling thread.

    Running the fallback here rather than inside the worker threads matters:
    the join budget only covers the Word VM wait, so a fallback started
    inside a thread after a VM timeout could finish after the join gave up
    and its result would be dropped.

    Args:
        docx_paths: DOCX files to convert
        output_dir: Directory that receives ``<name>.pdf`` for each input

    Returns:
        Paths of the PDFs that were produced, in input order. Files for
        which both backends failed are logged and omitted.
    """
    import threading

    targets = [output_dir / p.with_suffix(".pdf").name for p in docx_paths]
    results: list[Path | None] = [None] * len(docx_paths)

    def _convert_one(idx: int) -> None:
        # Each thread writes only its own slot, so no lock is needed.
        try:
            results[idx] = _convert_via_minio(docx_paths[idx], targets[idx])
        except Exception as exc:
            LOG.error("Batch item %d (%s) failed: %s", idx, docx_paths[idx].name, exc)

    threads = [
        threading.Thread(target=_convert_one, args=(i,), daemon=True)
        for i in range(len(docx_paths))
    ]
    for t in threads:
        t.start()
    for t in threads:
        t.join(timeout=DOCSERVER_TIMEOUT + 10)

    # Fallback per-file, serially, for everything the Word VM did not deliver.
    for idx, (docx_path, pdf_path) in enumerate(zip(docx_paths, targets)):
        if results[idx] is not None:
            continue
        try:
            results[idx] = _convert_via_libreoffice(docx_path, pdf_path, output_dir)
        except Exception as lo_exc:
            LOG.error("LibreOffice fallback also failed for %s: %s", docx_path.name, lo_exc)

    return [r for r in results if r is not None]


# ── LibreOffice fallback ──────────────────────────────────────────────────────

def _convert_via_libreoffice(docx_path: Path, pdf_path: Path, out_dir: Path) -> Path:
    """Convert a DOCX to PDF by running LibreOffice in headless mode.

    Args:
        docx_path: Source .docx file
        pdf_path:  Path where the PDF is expected to appear afterwards
        out_dir:   Directory passed to LibreOffice as ``--outdir``

    Returns:
        ``pdf_path``

    Raises:
        RuntimeError: LibreOffice exited non-zero, or exited zero but
            ``pdf_path`` does not exist.
        subprocess.TimeoutExpired: the process ran longer than 120 seconds.
    """
    LOG.info("Converting %s via LibreOffice headless...", docx_path.name)

    proc = subprocess.run(
        [
            "libreoffice", "--headless",
            "--convert-to", "pdf",
            "--outdir", str(out_dir),
            str(docx_path),
        ],
        capture_output=True,
        text=True,
        timeout=120,
    )

    if proc.returncode != 0:
        stderr_text = proc.stderr
        LOG.error("LibreOffice conversion failed: %s", stderr_text)
        # Only the first 300 characters go into the exception message.
        raise RuntimeError(f"LibreOffice failed: {stderr_text[:300]}")

    # A zero exit status alone is not trusted: the output file must exist.
    if not pdf_path.exists():
        raise RuntimeError(f"PDF not found at expected path after LibreOffice: {pdf_path}")

    size_bytes = pdf_path.stat().st_size
    LOG.info("PDF created via LibreOffice: %s (%d bytes)", pdf_path.name, size_bytes)
    return pdf_path