docserver: survive MinIO outages instead of exiting
The worker called sys.exit(1) on any MinIO connection error, so a single transient 502 from MinIO or its reverse proxy left it dead until a manual restart or reboot (its scheduled task only runs at system startup). It had been dead for ~5 weeks after a 502 on May 9.

- _connect_minio_forever(): retry the initial MinIO connect indefinitely with capped exponential backoff (5s..120s) instead of exiting.
- main loop: wrap each poll cycle; on any error, log, rebuild the client, and keep polling rather than crashing.

Verified on the box: normal DOCX->PDF still works (~11s e2e); a bogus endpoint now retries forever without ever calling sys.exit (which was the exact May 9 failure).
This commit is contained in:
parent
ef3b7a96f0
commit
7929413eeb
1 changed files with 39 additions and 14 deletions
|
|
@ -195,6 +195,29 @@ def _ensure_bucket(mc) -> None:
|
|||
LOG.info("Created bucket: %s", _BUCKET)
|
||||
|
||||
|
||||
def _connect_minio_forever():
    """Block until MinIO is reachable and return a ready-to-use client.

    Each attempt builds a client with ``_mc()`` and runs ``_ensure_bucket()``
    on it. If either step raises, the failure is logged and the attempt is
    repeated after a pause that starts at 5 seconds and doubles on every
    failure up to a 120-second ceiling. The function never gives up and never
    exits the process; it only returns once an attempt succeeds.

    This replaces an earlier ``sys.exit(1)`` on connection errors, which let
    one transient 502 from MinIO (or its reverse proxy) keep the worker dead
    until a reboot.

    Returns:
        The client object produced by ``_mc()`` after ``_ensure_bucket()``
        has succeeded on it.
    """
    backoff = 5  # seconds to wait before the next attempt
    tries = 0
    while True:
        tries += 1
        try:
            client = _mc()
            _ensure_bucket(client)
            LOG.info("MinIO connected — bucket '%s' ready", _BUCKET)
        except Exception as err:
            # Deliberately broad: any failure here means "not connected yet",
            # and the whole point is to wait it out instead of dying.
            LOG.error("MinIO connection failed (attempt %d): %s; retrying in %ds",
                      tries, err, backoff)
            time.sleep(backoff)
            # Double the wait, capped at two minutes: 5,10,20,40,80,120,120...
            backoff = min(backoff * 2, 120)
            continue
        return client
|
||||
|
||||
|
||||
def _list_pending(mc) -> list[str]:
|
||||
"""Return object names under to-convert/ that end in .docx.
|
||||
|
||||
|
|
@ -334,15 +357,11 @@ def main() -> None:
|
|||
LOG.error("Or RDP in to create an interactive session, then the AtLogOn task will fire.")
|
||||
sys.exit(1)
|
||||
|
||||
# Verify MinIO connectivity
|
||||
LOG.info("Connecting to MinIO...")
|
||||
try:
|
||||
mc = _mc()
|
||||
_ensure_bucket(mc)
|
||||
LOG.info("MinIO connected — bucket '%s' ready", _BUCKET)
|
||||
except Exception as exc:
|
||||
LOG.error("MinIO connection failed: %s", exc)
|
||||
sys.exit(1)
|
||||
# Connect to MinIO, retrying indefinitely. MinIO (or the nginx vhost in
|
||||
# front of it) can return transient 502s / be briefly unreachable; the
|
||||
# worker must wait it out rather than exit, otherwise it stays dead until a
|
||||
# reboot or a manual restart.
|
||||
mc = _connect_minio_forever()
|
||||
|
||||
# Start heartbeat background thread
|
||||
hb = threading.Thread(target=_heartbeat_loop, args=(word_version,), daemon=True)
|
||||
|
|
@ -353,11 +372,17 @@ def main() -> None:
|
|||
|
||||
try:
|
||||
while True:
|
||||
try:
|
||||
pending = _list_pending(mc)
|
||||
if pending:
|
||||
LOG.info("Found %d pending job(s)", len(pending))
|
||||
for key in pending:
|
||||
_process_one(mc, key)
|
||||
except Exception as exc:
|
||||
# Never let a transient MinIO/network error kill the loop.
|
||||
# Rebuild the client and keep going after a short pause.
|
||||
LOG.error("Poll cycle failed (%s); reconnecting to MinIO...", exc)
|
||||
mc = _connect_minio_forever()
|
||||
time.sleep(_POLL_INTERVAL)
|
||||
except KeyboardInterrupt:
|
||||
LOG.info("Shutting down...")
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue