diff --git a/infra/ansible/roles/docker/handlers/main.yml b/infra/ansible/roles/docker/handlers/main.yml
index 90d557d..b2c6059 100644
--- a/infra/ansible/roles/docker/handlers/main.yml
+++ b/infra/ansible/roles/docker/handlers/main.yml
@@ -4,6 +4,14 @@
     name: docker
     state: restarted
 
+# Sends SIGHUP so dockerd re-reads only the live-reloadable daemon.json keys
+# (debug, labels, registry-mirrors, ...) without bouncing running containers.
+# log-driver/log-opts are NOT among them -- those need "Restart docker".
+- name: Reload docker
+  ansible.builtin.systemd:
+    name: docker
+    state: reloaded
+
 - name: Reload systemd
   ansible.builtin.systemd:
     daemon_reload: true
diff --git a/infra/ansible/roles/docker/tasks/main.yml b/infra/ansible/roles/docker/tasks/main.yml
index 2f7f764..9b1ca7f 100644
--- a/infra/ansible/roles/docker/tasks/main.yml
+++ b/infra/ansible/roles/docker/tasks/main.yml
@@ -45,6 +45,28 @@
     update_cache: true
   notify: Restart docker
 
+# Cap container log growth. Without this, json-file logs are unbounded and a
+# single chatty container (e.g. forgejo at ~1GB) plus an orphaned backup dump
+# can fill / and crash Postgres (happened 2026-06-27). 50m x 3 = 150MB/container max.
+# log-driver/log-opts are not live-reloadable (SIGHUP ignores them), so a change
+# must restart the daemon. The new defaults only apply to containers created
+# afterwards; recreate existing containers to cap their logs.
+- name: Configure Docker daemon log rotation
+  ansible.builtin.copy:
+    dest: /etc/docker/daemon.json
+    owner: root
+    group: root
+    mode: "0644"
+    content: |
+      {
+        "log-driver": "json-file",
+        "log-opts": {
+          "max-size": "50m",
+          "max-file": "3"
+        }
+      }
+  notify: Restart docker
+
 - name: Ensure Docker service is enabled and started
   ansible.builtin.systemd:
     name: docker
diff --git a/infra/cron/pw-disk-space-alert b/infra/cron/pw-disk-space-alert
new file mode 100644
index 0000000..2c91421
--- /dev/null
+++ b/infra/cron/pw-disk-space-alert
@@ -0,0 +1,4 @@
+# Disk-space guardrail: alert via Telegram before / fills up and crashes
+# Postgres/Listmonk; auto-reclaims build cache + orphaned forgejo dump at CRIT.
+# Every 15 minutes. Runs as root (needs du on /var/lib/docker + docker socket).
+*/15 * * * * root /usr/local/bin/pw-disk-space-alert >> /var/log/pw-disk-space-alert.log 2>&1
diff --git a/infra/monitoring/pw-disk-space-alert.sh b/infra/monitoring/pw-disk-space-alert.sh
new file mode 100644
index 0000000..6d8f5f8
--- /dev/null
+++ b/infra/monitoring/pw-disk-space-alert.sh
@@ -0,0 +1,86 @@
+#!/bin/bash
+# Disk-space guardrail. Alerts via Telegram BEFORE the root filesystem fills up
+# and crashes Postgres/Listmonk (as happened 2026-06-27 when an orphaned 15GB
+# forgejo-dump.zip + uncapped Docker logs pushed / to 100% and Postgres
+# crash-looped on "No space left on device"). Silent on healthy days.
+#
+# Also proactively cleans the two things that caused that outage:
+#   1. orphaned /tmp/forgejo-dump.zip left behind by an interrupted backup
+#   2. Docker build cache (safe, fully regenerable)
+# ...but ONLY once usage crosses a soft threshold, so normal days are untouched.
+#
+# Schedule: every 15 min via cron (see infra/cron/pw-disk-space-alert).
+# Diagnostics go to stderr, which the cron entry appends to
+# /var/log/pw-disk-space-alert.log.
+set -uo pipefail
+
+ENV_FILE=/opt/performancewest/.env
+WARN_PCT=${PW_DISK_WARN_PCT:-80}  # Telegram warning at/above this %
+CRIT_PCT=${PW_DISK_CRIT_PCT:-88}  # auto-cleanup kicks in at/above this %
+MOUNT=/
+
+BOT=$(grep -E '^TELEGRAM_BOT_TOKEN=' "$ENV_FILE" 2>/dev/null | head -1 | cut -d= -f2-)
+CHAT=$(grep -E '^TELEGRAM_CHAT_ID=' "$ENV_FILE" 2>/dev/null | head -1 | cut -d= -f2-)
+
+# Send $1 to the configured Telegram chat. Best-effort: never aborts the run,
+# but reports a missing config or a failed send on stderr instead of hiding it.
+tg() {
+  if [ -z "${BOT:-}" ] || [ -z "${CHAT:-}" ]; then
+    echo "[$(date '+%F %T')] telegram not configured in $ENV_FILE; alert not sent" >&2
+    return 0
+  fi
+  # -f makes curl fail on HTTP errors (bad token, unknown chat) instead of exiting 0.
+  if ! curl -fs --max-time 10 "https://api.telegram.org/bot${BOT}/sendMessage" \
+       --data-urlencode "chat_id=${CHAT}" --data-urlencode "text=$1" >/dev/null 2>&1; then
+    echo "[$(date '+%F %T')] telegram send failed; alert not delivered" >&2
+  fi
+}
+
+# Print the used-space percentage of $MOUNT as bare digits (e.g. "83").
+usage_pct() { df --output=pcent "$MOUNT" | tail -1 | tr -dc '0-9'; }
+
+PCT=$(usage_pct)
+if [ -z "$PCT" ]; then
+  echo "[$(date '+%F %T')] could not read disk usage for $MOUNT" >&2
+  exit 0
+fi
+
+if [ "$PCT" -lt "$WARN_PCT" ]; then
+  exit 0  # healthy, stay silent
+fi
+
+FREED=""
+if [ "$PCT" -ge "$CRIT_PCT" ]; then
+  # 1. Drop any orphaned forgejo dump still sitting in the container's /tmp.
+  if docker exec performancewest-forgejo test -f /tmp/forgejo-dump.zip 2>/dev/null; then
+    docker exec performancewest-forgejo rm -f /tmp/forgejo-dump.zip 2>/dev/null \
+      && FREED="${FREED}orphaned forgejo-dump.zip; "
+  fi
+  # 2. Reclaim Docker build cache (safe, regenerable).
+  RECLAIMED=$(docker builder prune -af 2>/dev/null | grep -oE 'Total reclaimed space: .*' | head -1)
+  RECLAIMED=${RECLAIMED#Total reclaimed space: }
+  # Only claim a win when something was actually reclaimed.
+  case "$RECLAIMED" in
+    ""|0B) ;;
+    *) FREED="${FREED}build-cache (${RECLAIMED}); " ;;
+  esac
+fi
+
+# If the re-read fails, fall back to the first reading so the numeric test
+# below still works and the alert is still sent.
+NEWPCT=$(usage_pct)
+NEWPCT=${NEWPCT:-$PCT}
+AVAIL=$(df -h "$MOUNT" | tail -1 | awk '{print $4}')
+# The cron entry already runs this as root, so no sudo (it may be absent or demand a tty).
+TOPDIRS=$(du -shx /var/lib/docker /opt/backups /var/log 2>/dev/null | sort -rh | head -3 | awk '{printf "%s %s; ", $1, $2}')
+
+LEVEL="⚠️ WARNING"; [ "$NEWPCT" -ge "$CRIT_PCT" ] && LEVEL="🔴 CRITICAL"
+
+MSG=$(printf '%s Performance West disk space\n\nRoot %s: %s%% used (%s free)\n%s\nTop: %s' \
+  "$LEVEL" "$MOUNT" "$NEWPCT" "$AVAIL" \
+  "${FREED:+Auto-freed: ${FREED}now ${NEWPCT}%}" "$TOPDIRS")
+tg "$MSG"
+
+# Mirror to the same report log the warmup check uses, for history.
+echo "[$(date '+%F %T')] disk ${PCT}% -> ${NEWPCT}% (free ${AVAIL}) ${FREED:+freed: $FREED}" \
+  >> /opt/performancewest/logs/pw-disk-space-alert.log 2>&1