docserver: survive MinIO outages instead of exiting
The worker called sys.exit(1) on any MinIO connection error, so a single transient 502 from MinIO or its reverse proxy left it dead until a manual restart or a reboot (its scheduled task only runs at system startup). It had been dead for ~5 weeks after a 502 on May 9.

- _connect_minio_forever(): retry the initial MinIO connect indefinitely with capped exponential backoff (5s..120s) instead of exiting.
- main loop: wrap each poll cycle; on any error, log, rebuild the client, and keep polling rather than crashing.

Verified on the box: normal DOCX->PDF conversion still works (~11s end to end); a bogus endpoint now retries forever without ever calling sys.exit (exiting there was the exact May 9 failure).
This commit is contained in:
parent
ef3b7a96f0
commit
7929413eeb
1 changed files with 39 additions and 14 deletions
|
|
@ -195,6 +195,29 @@ def _ensure_bucket(mc) -> None:
|
||||||
LOG.info("Created bucket: %s", _BUCKET)
|
LOG.info("Created bucket: %s", _BUCKET)
|
||||||
|
|
||||||
|
|
||||||
|
def _connect_minio_forever(*, initial_delay: int = 5, max_delay: int = 120):
    """Build a MinIO client and verify the bucket, retrying until it works.

    Each failed attempt is logged and followed by a sleep whose length
    doubles after every failure, starting at ``initial_delay`` seconds and
    capped at ``max_delay`` seconds (5, 10, 20, 40, 80, 120, 120, ... with
    the defaults).

    The worker used to ``sys.exit(1)`` on a connection error, so a single
    transient 502 from MinIO (or its reverse proxy) left it dead until a
    reboot. This function never gives up; it only returns once both the
    client construction and the bucket check have succeeded.

    Args:
        initial_delay: Seconds to wait after the first failed attempt.
        max_delay: Upper bound, in seconds, for the wait between attempts.

    Returns:
        The client object produced by ``_mc()`` after ``_ensure_bucket``
        accepted it.

    Raises:
        ValueError: If the delays do not satisfy
            ``0 < initial_delay <= max_delay``. A non-positive delay would
            turn the retry loop into a busy loop.
    """
    if not 0 < initial_delay <= max_delay:
        raise ValueError(
            "expected 0 < initial_delay <= max_delay, got "
            f"initial_delay={initial_delay!r}, max_delay={max_delay!r}"
        )

    delay = initial_delay
    attempt = 0
    while True:
        attempt += 1
        try:
            # Only these two calls talk to MinIO, so only they are retried.
            mc = _mc()
            _ensure_bucket(mc)
        except Exception as exc:
            # Deliberately broad: any failure here (network error, HTTP 502,
            # bad response) must be waited out rather than kill the worker.
            LOG.error("MinIO connection failed (attempt %d): %s; retrying in %ds",
                      attempt, exc, delay)
            time.sleep(delay)
            delay = min(delay * 2, max_delay)
        else:
            LOG.info("MinIO connected — bucket '%s' ready", _BUCKET)
            return mc
|
||||||
|
|
||||||
|
|
||||||
def _list_pending(mc) -> list[str]:
|
def _list_pending(mc) -> list[str]:
|
||||||
"""Return object names under to-convert/ that end in .docx.
|
"""Return object names under to-convert/ that end in .docx.
|
||||||
|
|
||||||
|
|
@ -334,15 +357,11 @@ def main() -> None:
|
||||||
LOG.error("Or RDP in to create an interactive session, then the AtLogOn task will fire.")
|
LOG.error("Or RDP in to create an interactive session, then the AtLogOn task will fire.")
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
|
|
||||||
# Verify MinIO connectivity
|
# Connect to MinIO, retrying indefinitely. MinIO (or the nginx vhost in
|
||||||
LOG.info("Connecting to MinIO...")
|
# front of it) can return transient 502s / be briefly unreachable; the
|
||||||
try:
|
# worker must wait it out rather than exit, otherwise it stays dead until a
|
||||||
mc = _mc()
|
# reboot or a manual restart.
|
||||||
_ensure_bucket(mc)
|
mc = _connect_minio_forever()
|
||||||
LOG.info("MinIO connected — bucket '%s' ready", _BUCKET)
|
|
||||||
except Exception as exc:
|
|
||||||
LOG.error("MinIO connection failed: %s", exc)
|
|
||||||
sys.exit(1)
|
|
||||||
|
|
||||||
# Start heartbeat background thread
|
# Start heartbeat background thread
|
||||||
hb = threading.Thread(target=_heartbeat_loop, args=(word_version,), daemon=True)
|
hb = threading.Thread(target=_heartbeat_loop, args=(word_version,), daemon=True)
|
||||||
|
|
@ -353,11 +372,17 @@ def main() -> None:
|
||||||
|
|
||||||
try:
|
try:
|
||||||
while True:
|
while True:
|
||||||
pending = _list_pending(mc)
|
try:
|
||||||
if pending:
|
pending = _list_pending(mc)
|
||||||
LOG.info("Found %d pending job(s)", len(pending))
|
if pending:
|
||||||
for key in pending:
|
LOG.info("Found %d pending job(s)", len(pending))
|
||||||
_process_one(mc, key)
|
for key in pending:
|
||||||
|
_process_one(mc, key)
|
||||||
|
except Exception as exc:
|
||||||
|
# Never let a transient MinIO/network error kill the loop.
|
||||||
|
# Rebuild the client and keep going after a short pause.
|
||||||
|
LOG.error("Poll cycle failed (%s); reconnecting to MinIO...", exc)
|
||||||
|
mc = _connect_minio_forever()
|
||||||
time.sleep(_POLL_INTERVAL)
|
time.sleep(_POLL_INTERVAL)
|
||||||
except KeyboardInterrupt:
|
except KeyboardInterrupt:
|
||||||
LOG.info("Shutting down...")
|
LOG.info("Shutting down...")
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue