def _connect_minio_forever(*, initial_delay=5, max_delay=120):
    """Build a MinIO client and verify the bucket, retrying until it succeeds.

    Each attempt calls ``_mc()`` and then ``_ensure_bucket()`` on the result.
    If either raises an ``Exception`` the failure is logged and the attempt is
    repeated after a pause. The pause doubles after every failure and is
    capped at ``max_delay`` (with the defaults: 5, 10, 20, 40, 80, 120, 120...).

    The worker used to ``sys.exit(1)`` on a connection error, so a single
    transient 502 from MinIO (or its reverse proxy) left it dead until a
    reboot. This function never gives up on its own; only exceptions that do
    not derive from ``Exception`` (e.g. ``KeyboardInterrupt``) escape the loop.

    Args:
        initial_delay: Seconds to wait after the first failed attempt.
        max_delay: Upper bound, in seconds, for the wait between attempts.

    Returns:
        The client returned by ``_mc()`` once ``_ensure_bucket()`` has
        succeeded with it.

    Raises:
        ValueError: If ``initial_delay`` or ``max_delay`` is negative.
    """
    # Validate up front: a negative value would otherwise only surface as a
    # ValueError from time.sleep() inside the except handler below.
    if initial_delay < 0 or max_delay < 0:
        raise ValueError("initial_delay and max_delay must be non-negative")

    delay = min(initial_delay, max_delay)
    attempt = 0
    while True:
        attempt += 1
        try:
            mc = _mc()
            _ensure_bucket(mc)
        except Exception as exc:
            # Include the traceback on the first failure only: it makes a
            # persistent misconfiguration diagnosable without flooding the
            # log on every subsequent retry.
            LOG.error(
                "MinIO connection failed (attempt %d): %s; retrying in %ds",
                attempt, exc, delay,
                exc_info=(attempt == 1),
            )
            time.sleep(delay)
            delay = min(delay * 2, max_delay)
        else:
            LOG.info("MinIO connected — bucket '%s' ready", _BUCKET)
            return mc
+ mc = _connect_minio_forever() # Start heartbeat background thread hb = threading.Thread(target=_heartbeat_loop, args=(word_version,), daemon=True) @@ -353,11 +372,17 @@ def main() -> None: try: while True: - pending = _list_pending(mc) - if pending: - LOG.info("Found %d pending job(s)", len(pending)) - for key in pending: - _process_one(mc, key) + try: + pending = _list_pending(mc) + if pending: + LOG.info("Found %d pending job(s)", len(pending)) + for key in pending: + _process_one(mc, key) + except Exception as exc: + # Never let a transient MinIO/network error kill the loop. + # Rebuild the client and keep going after a short pause. + LOG.error("Poll cycle failed (%s); reconnecting to MinIO...", exc) + mc = _connect_minio_forever() time.sleep(_POLL_INTERVAL) except KeyboardInterrupt: LOG.info("Shutting down...")