Initial commit — Performance West telecom compliance platform
Includes: API (Express/TypeScript), Astro site, Python workers, document generators, FCC compliance tools, Canada CRTC formation, Ansible infrastructure, and deployment scripts. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
commit
f8cd37ac8c
1823 changed files with 145167 additions and 0 deletions
373
docserver/docserver_worker.py
Normal file
373
docserver/docserver_worker.py
Normal file
|
|
@ -0,0 +1,373 @@
|
|||
r"""
|
||||
Performance West -- Document Conversion Worker (Windows)
|
||||
|
||||
Polls a MinIO bucket for DOCX files, converts them to PDF using
|
||||
Microsoft Word COM automation, and drops the PDF back into MinIO.
|
||||
|
||||
No HTTP server, no open ports, no SSH tunnel required.
|
||||
The Windows VM only needs outbound HTTPS access to MinIO.
|
||||
|
||||
Protocol
|
||||
---------
|
||||
Input: minio://{bucket}/to-convert/{job_id}.docx
|
||||
Output: minio://{bucket}/converted/{job_id}.pdf
|
||||
Cleanup: deletes the input DOCX after successful conversion
|
||||
|
||||
The Linux pdf_converter.py polls converted/ until the PDF appears
|
||||
(up to DOCSERVER_TIMEOUT seconds), then downloads and removes it.
|
||||
|
||||
Heartbeat
|
||||
---------
|
||||
Every 60 seconds this worker writes a tiny heartbeat object:
|
||||
minio://{bucket}/docserver-heartbeat.json
|
||||
Content: {"status":"ok","word_version":"...","ts":"...","host":"..."}
|
||||
The health_check() in pdf_converter.py reads this to detect if the
|
||||
worker is alive without needing a network round-trip to the VM.
|
||||
|
||||
Setup
|
||||
-----
|
||||
1. Copy this file + requirements_windows.txt to C:\docserver\ on the Windows VM
|
||||
2. pip install -r C:\docserver\requirements_windows.txt
|
||||
3. Set the MinIO env vars (see docserver.env or pass via Task Scheduler)
|
||||
4. Run: python docserver_worker.py
|
||||
Or let install.ps1 register it as a Task Scheduler task
|
||||
|
||||
Environment variables
|
||||
---------------------
|
||||
MINIO_ENDPOINT -- MinIO host:port (e.g. minio.performancewest.net or IP:9000)
|
||||
MINIO_PORT -- MinIO port (default 9000)
|
||||
MINIO_ACCESS_KEY -- access key
|
||||
MINIO_SECRET_KEY -- secret key
|
||||
MINIO_BUCKET -- bucket (default: performancewest)
|
||||
MINIO_SECURE -- true/false (default: false for internal; true for external)
|
||||
POLL_INTERVAL -- seconds between polls (default: 12)
|
||||
HEARTBEAT_INTERVAL -- seconds between heartbeats (default: 60)
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import platform
|
||||
import shutil
|
||||
import socket
|
||||
import sys
|
||||
import tempfile
|
||||
import threading
|
||||
import time
|
||||
import uuid
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
|
||||
LOG = logging.getLogger("docserver_worker")
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format="%(asctime)s [%(levelname)s] %(message)s",
|
||||
handlers=[
|
||||
logging.StreamHandler(sys.stdout),
|
||||
logging.FileHandler(
|
||||
os.path.join(os.getenv("LOG_DIR", r"C:\docserver\logs"), "worker.log"),
|
||||
encoding="utf-8",
|
||||
),
|
||||
],
|
||||
)
|
||||
|
||||
# ── Configuration ─────────────────────────────────────────────────────────────
|
||||
|
||||
_ENDPOINT = os.getenv("MINIO_ENDPOINT", "minio.performancewest.net")
|
||||
_PORT = int(os.getenv("MINIO_PORT", "9000"))
|
||||
_ACCESS = os.getenv("MINIO_ACCESS_KEY", "")
|
||||
_SECRET = os.getenv("MINIO_SECRET_KEY", "")
|
||||
_BUCKET = os.getenv("MINIO_BUCKET", "performancewest")
|
||||
_SECURE = os.getenv("MINIO_SECURE", "false").lower() == "true"
|
||||
|
||||
_PREFIX_IN = "to-convert" # input: DOCX files from Linux
|
||||
_PREFIX_OUT = "converted" # output: PDF files for Linux to pick up
|
||||
|
||||
_POLL_INTERVAL = int(os.getenv("POLL_INTERVAL", "12"))
|
||||
_HEARTBEAT_INTERVAL = int(os.getenv("HEARTBEAT_INTERVAL", "60"))
|
||||
|
||||
# Word COM constants
|
||||
_WD_FORMAT_PDF = 17
|
||||
_WD_DO_NOT_SAVE_CHANGES = 0
|
||||
|
||||
# ── Word COM singleton ────────────────────────────────────────────────────────
|
||||
|
||||
_word_app = None
|
||||
_word_lock = threading.Lock()
|
||||
|
||||
|
||||
def _get_word():
|
||||
"""Return the Word COM application, creating it if necessary.
|
||||
|
||||
Retries up to 3 times with increasing delays to handle DCOM startup latency
|
||||
when running under SYSTEM via Task Scheduler (Session 0 + DCOM RunAs).
|
||||
"""
|
||||
global _word_app
|
||||
if _word_app is not None:
|
||||
try:
|
||||
_ = _word_app.Visible # probe — raises if Word died
|
||||
return _word_app
|
||||
except Exception:
|
||||
LOG.warning("Word COM instance died — restarting...")
|
||||
_word_app = None
|
||||
|
||||
import win32com.client # type: ignore
|
||||
import pythoncom # type: ignore
|
||||
|
||||
max_retries = 3
|
||||
for attempt in range(1, max_retries + 1):
|
||||
try:
|
||||
pythoncom.CoInitialize()
|
||||
_word_app = win32com.client.DispatchEx("Word.Application")
|
||||
if _word_app is None:
|
||||
raise RuntimeError("DispatchEx returned None")
|
||||
_word_app.Visible = False
|
||||
_word_app.DisplayAlerts = False
|
||||
_word_app.AutomationSecurity = 3 # msoAutomationSecurityForceDisable
|
||||
LOG.info("Word COM started — version %s", _word_app.Version)
|
||||
return _word_app
|
||||
except Exception as e:
|
||||
LOG.warning("Word COM init attempt %d/%d failed: %s", attempt, max_retries, e)
|
||||
_word_app = None
|
||||
if attempt < max_retries:
|
||||
delay = attempt * 10 # 10s, 20s
|
||||
LOG.info(" Retrying in %ds...", delay)
|
||||
time.sleep(delay)
|
||||
else:
|
||||
LOG.error("Word COM failed after %d attempts. Is DCOM configured? "
|
||||
"Run fix_dcom.bat as Administrator.", max_retries)
|
||||
raise
|
||||
|
||||
|
||||
def _quit_word():
|
||||
global _word_app
|
||||
if _word_app:
|
||||
try:
|
||||
_word_app.Quit()
|
||||
except Exception:
|
||||
pass
|
||||
_word_app = None
|
||||
|
||||
|
||||
def _convert_docx_to_pdf(docx_path: Path, pdf_path: Path) -> bool:
|
||||
"""Convert one DOCX to PDF via Word COM. Serialised by _word_lock."""
|
||||
with _word_lock:
|
||||
word = _get_word()
|
||||
doc = None
|
||||
try:
|
||||
doc = word.Documents.Open(
|
||||
str(docx_path.resolve()),
|
||||
ReadOnly=True,
|
||||
AddToRecentFiles=False,
|
||||
Visible=False,
|
||||
)
|
||||
doc.SaveAs2(str(pdf_path.resolve()), FileFormat=_WD_FORMAT_PDF)
|
||||
size = pdf_path.stat().st_size if pdf_path.exists() else 0
|
||||
LOG.info("Converted: %s → %s (%d bytes)", docx_path.name, pdf_path.name, size)
|
||||
return pdf_path.exists() and size > 0
|
||||
except Exception as exc:
|
||||
LOG.error("Conversion failed for %s: %s", docx_path.name, exc)
|
||||
return False
|
||||
finally:
|
||||
if doc:
|
||||
try:
|
||||
doc.Close(SaveChanges=_WD_DO_NOT_SAVE_CHANGES)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# ── MinIO helpers ─────────────────────────────────────────────────────────────
|
||||
|
||||
def _mc():
|
||||
from minio import Minio # type: ignore
|
||||
return Minio(
|
||||
f"{_ENDPOINT}:{_PORT}",
|
||||
access_key=_ACCESS,
|
||||
secret_key=_SECRET,
|
||||
secure=_SECURE,
|
||||
)
|
||||
|
||||
|
||||
def _ensure_bucket(mc) -> None:
|
||||
if not mc.bucket_exists(_BUCKET):
|
||||
mc.make_bucket(_BUCKET)
|
||||
LOG.info("Created bucket: %s", _BUCKET)
|
||||
|
||||
|
||||
def _list_pending(mc) -> list[str]:
|
||||
"""Return object names under to-convert/ that end in .docx.
|
||||
|
||||
Ignores .tmp_ prefixed files — those are still being uploaded atomically
|
||||
by the Linux side and are not ready for processing yet.
|
||||
"""
|
||||
try:
|
||||
objects = mc.list_objects(_BUCKET, prefix=f"{_PREFIX_IN}/", recursive=False)
|
||||
return [
|
||||
obj.object_name
|
||||
for obj in objects
|
||||
if obj.object_name.endswith(".docx")
|
||||
and "/.tmp_" not in obj.object_name
|
||||
]
|
||||
except Exception as exc:
|
||||
LOG.error("Failed to list pending jobs: %s", exc)
|
||||
return []
|
||||
|
||||
|
||||
# ── Main processing loop ──────────────────────────────────────────────────────
|
||||
|
||||
def _process_one(mc, in_key: str) -> None:
|
||||
"""Download one DOCX from MinIO, convert, upload the PDF, delete the DOCX."""
|
||||
job_id = Path(in_key).stem # e.g. "abc123"
|
||||
out_key = f"{_PREFIX_OUT}/{job_id}.pdf"
|
||||
|
||||
# Skip if the PDF is already there (duplicate poll before delete completed)
|
||||
try:
|
||||
mc.stat_object(_BUCKET, out_key)
|
||||
LOG.info("Job %s already converted — skipping", job_id[:8])
|
||||
return
|
||||
except Exception:
|
||||
pass # expected — PDF doesn't exist yet
|
||||
|
||||
work_dir = Path(tempfile.mkdtemp(prefix=f"docserver_{job_id[:8]}_"))
|
||||
docx_path = work_dir / f"{job_id}.docx"
|
||||
pdf_path = work_dir / f"{job_id}.pdf"
|
||||
|
||||
try:
|
||||
# 1. Download DOCX
|
||||
LOG.info("[%s] Downloading %s", job_id[:8], in_key)
|
||||
mc.fget_object(_BUCKET, in_key, str(docx_path))
|
||||
|
||||
# 2. Convert
|
||||
LOG.info("[%s] Converting via Word...", job_id[:8])
|
||||
t0 = time.monotonic()
|
||||
success = _convert_docx_to_pdf(docx_path, pdf_path)
|
||||
elapsed = time.monotonic() - t0
|
||||
|
||||
if not success:
|
||||
LOG.error("[%s] Conversion failed — leaving DOCX in to-convert/ for retry", job_id[:8])
|
||||
return
|
||||
|
||||
LOG.info("[%s] Converted in %.1fs", job_id[:8], elapsed)
|
||||
|
||||
# 3. Upload PDF to converted/ — atomic via tmp + rename
|
||||
# Upload to .tmp_ first, then server-side copy to final key.
|
||||
# Linux side polls stat_object(out_key) — it won't see the .tmp_.
|
||||
from minio.commonconfig import CopySource # type: ignore
|
||||
tmp_out = f"{_PREFIX_OUT}/.tmp_{job_id}.pdf"
|
||||
mc.fput_object(
|
||||
_BUCKET, tmp_out, str(pdf_path),
|
||||
content_type="application/pdf",
|
||||
metadata={
|
||||
"x-amz-meta-job-id": job_id,
|
||||
"x-amz-meta-elapsed": f"{elapsed:.1f}s",
|
||||
},
|
||||
)
|
||||
mc.copy_object(_BUCKET, out_key, CopySource(_BUCKET, tmp_out))
|
||||
mc.remove_object(_BUCKET, tmp_out)
|
||||
LOG.info("[%s] Uploaded PDF → minio://%s/%s (atomic)", job_id[:8], _BUCKET, out_key)
|
||||
|
||||
# 4. Delete the input DOCX so it doesn't get processed again
|
||||
mc.remove_object(_BUCKET, in_key)
|
||||
LOG.info("[%s] Removed input DOCX from to-convert/", job_id[:8])
|
||||
|
||||
except Exception as exc:
|
||||
LOG.error("[%s] Unexpected error processing %s: %s", job_id[:8], in_key, exc)
|
||||
finally:
|
||||
shutil.rmtree(work_dir, ignore_errors=True)
|
||||
|
||||
|
||||
def _heartbeat_loop(word_version: str) -> None:
|
||||
"""Write a heartbeat object to MinIO every HEARTBEAT_INTERVAL seconds."""
|
||||
mc = _mc()
|
||||
hostname = socket.gethostname()
|
||||
while True:
|
||||
try:
|
||||
payload = json.dumps({
|
||||
"status": "ok",
|
||||
"word_version": word_version,
|
||||
"host": hostname,
|
||||
"ts": datetime.now(timezone.utc).isoformat(),
|
||||
}).encode()
|
||||
mc.put_object(
|
||||
_BUCKET,
|
||||
"docserver-heartbeat.json",
|
||||
__import__("io").BytesIO(payload),
|
||||
length=len(payload),
|
||||
content_type="application/json",
|
||||
)
|
||||
except Exception as exc:
|
||||
LOG.warning("Heartbeat write failed: %s", exc)
|
||||
time.sleep(_HEARTBEAT_INTERVAL)
|
||||
|
||||
|
||||
def main() -> None:
|
||||
LOG.info("Performance West Document Conversion Worker starting...")
|
||||
LOG.info(" Python: %s", sys.version.split()[0])
|
||||
LOG.info(" Platform: %s", platform.platform())
|
||||
LOG.info(" MinIO: %s:%d / bucket=%s", _ENDPOINT, _PORT, _BUCKET)
|
||||
|
||||
# Log session/user info for debugging COM issues
|
||||
try:
|
||||
import getpass
|
||||
LOG.info(" User: %s", getpass.getuser())
|
||||
import ctypes
|
||||
session_id = ctypes.windll.kernel32.WTSGetActiveConsoleSessionId()
|
||||
LOG.info(" Session: %d (console)", session_id)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
if not _ACCESS or not _SECRET:
|
||||
LOG.error("MINIO_ACCESS_KEY / MINIO_SECRET_KEY not set -- cannot start")
|
||||
sys.exit(1)
|
||||
|
||||
# Verify Word is available before accepting work
|
||||
LOG.info("Initialising Word COM...")
|
||||
try:
|
||||
with _word_lock:
|
||||
word = _get_word()
|
||||
word_version = word.Version
|
||||
LOG.info("Word %s ready", word_version)
|
||||
except Exception as exc:
|
||||
LOG.error("Word COM failed to initialise: %s", exc)
|
||||
LOG.error("Fix: run fix_dcom.bat as Administrator, then reboot.")
|
||||
LOG.error("Or RDP in to create an interactive session, then the AtLogOn task will fire.")
|
||||
sys.exit(1)
|
||||
|
||||
# Verify MinIO connectivity
|
||||
LOG.info("Connecting to MinIO...")
|
||||
try:
|
||||
mc = _mc()
|
||||
_ensure_bucket(mc)
|
||||
LOG.info("MinIO connected — bucket '%s' ready", _BUCKET)
|
||||
except Exception as exc:
|
||||
LOG.error("MinIO connection failed: %s", exc)
|
||||
sys.exit(1)
|
||||
|
||||
# Start heartbeat background thread
|
||||
hb = threading.Thread(target=_heartbeat_loop, args=(word_version,), daemon=True)
|
||||
hb.start()
|
||||
LOG.info("Heartbeat thread started (interval=%ds)", _HEARTBEAT_INTERVAL)
|
||||
|
||||
LOG.info("Polling to-convert/ every %ds — waiting for jobs...", _POLL_INTERVAL)
|
||||
|
||||
try:
|
||||
while True:
|
||||
pending = _list_pending(mc)
|
||||
if pending:
|
||||
LOG.info("Found %d pending job(s)", len(pending))
|
||||
for key in pending:
|
||||
_process_one(mc, key)
|
||||
time.sleep(_POLL_INTERVAL)
|
||||
except KeyboardInterrupt:
|
||||
LOG.info("Shutting down...")
|
||||
finally:
|
||||
_quit_word()
|
||||
LOG.info("Worker stopped.")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
# Ensure log directory exists
|
||||
log_dir = Path(os.getenv("LOG_DIR", r"C:\docserver\logs"))
|
||||
log_dir.mkdir(parents=True, exist_ok=True)
|
||||
main()
|
||||
Loading…
Add table
Add a link
Reference in a new issue