infra: disk-space guardrail + Docker log rotation (prevent disk-full crash)
On 2026-06-27 the root filesystem (/) filled to 100% and crash-looped Postgres
('No space left on device'), taking down Listmonk mid-send. Cause: an orphaned
15GB /tmp/forgejo-dump.zip (left behind by an interrupted backup) plus uncapped
Docker json-file logs (the forgejo container log alone was ~1GB), with NO disk
monitoring to warn first.
- pw-disk-space-alert.sh + cron (every 15m): Telegram warn at 80%, auto-reclaim
build cache + orphaned forgejo dump at 88%. Silent when healthy.
- ansible docker role: write /etc/docker/daemon.json with 50m x 3 log cap
(150MB/container max) + non-disruptive Reload docker handler.
This commit is contained in:
parent
bfdbf8f031
commit
e318f12e36
4 changed files with 92 additions and 0 deletions
|
|
@ -4,6 +4,13 @@
|
|||
name: docker
|
||||
state: restarted
|
||||
|
||||
# Non-disruptive: asks systemd to reload the docker unit (with Docker's stock
# unit file this sends SIGHUP to dockerd) without bouncing running containers,
# unlike "Restart docker".
#
# CAUTION: dockerd only re-reads a fixed set of daemon.json keys on reload
# (e.g. debug, labels, live-restore, registry-mirrors, insecure-registries,
# runtimes). "log-driver" and "log-opts" are NOT in that set: changes to them
# require a daemon restart and then apply only to containers created
# afterwards. Do not rely on this handler for log-rotation settings.
- name: Reload docker
  ansible.builtin.systemd:
    name: docker
    state: reloaded
|
||||
|
||||
- name: Reload systemd
|
||||
ansible.builtin.systemd:
|
||||
daemon_reload: true
|
||||
|
|
|
|||
|
|
@ -45,6 +45,25 @@
|
|||
update_cache: true
|
||||
notify: Restart docker
|
||||
|
||||
# Cap container log growth. Without this, json-file logs are unbounded and a
# single chatty container (e.g. forgejo at ~1GB) plus an orphaned backup dump
# can fill / and crash Postgres (happened 2026-06-27). 50m x 3 = 150MB/container max.
#
# Notes:
#   * "log-driver"/"log-opts" are not hot-reloadable: dockerd ignores them on
#     SIGHUP/reload, so this task must notify "Restart docker". The restart
#     only fires when daemon.json actually changes; schedule the first rollout
#     in a quiet window because it bounces running containers.
#   * The new defaults apply only to containers created after the restart.
#     Existing containers keep their old (uncapped) log config until they are
#     recreated (e.g. `docker compose up -d --force-recreate`).
#   * This task owns the whole of /etc/docker/daemon.json; any other daemon
#     settings must be added to the content below or they will be overwritten.
- name: Configure Docker daemon log rotation
  ansible.builtin.copy:
    dest: /etc/docker/daemon.json
    owner: root
    group: root
    mode: "0644"
    content: |
      {
        "log-driver": "json-file",
        "log-opts": {
          "max-size": "50m",
          "max-file": "3"
        }
      }
  notify: Restart docker
|
||||
|
||||
- name: Ensure Docker service is enabled and started
|
||||
ansible.builtin.systemd:
|
||||
name: docker
|
||||
|
|
|
|||
4
infra/cron/pw-disk-space-alert
Normal file
4
infra/cron/pw-disk-space-alert
Normal file
|
|
@ -0,0 +1,4 @@
|
|||
# pw-disk-space-alert: disk-space guardrail for the root filesystem.
# Sends a Telegram alert before / fills up and crashes Postgres/Listmonk, and
# at the CRIT threshold auto-reclaims the build cache + orphaned forgejo dump.
#
# Schedule: every 15 minutes.
# User:     root (needs du on /var/lib/docker + docker socket).
# Output:   stdout and stderr are appended to /var/log/pw-disk-space-alert.log.
*/15 * * * * root /usr/local/bin/pw-disk-space-alert >> /var/log/pw-disk-space-alert.log 2>&1
|
||||
62
infra/monitoring/pw-disk-space-alert.sh
Normal file
62
infra/monitoring/pw-disk-space-alert.sh
Normal file
|
|
@ -0,0 +1,62 @@
|
|||
#!/bin/bash
|
||||
# Disk-space guardrail. Alerts via Telegram BEFORE the root filesystem fills up
|
||||
# and crashes Postgres/Listmonk (as happened 2026-06-27 when an orphaned 15GB
|
||||
# forgejo-dump.zip + uncapped Docker logs pushed / to 100% and Postgres
|
||||
# crash-looped on "No space left on device"). Silent on healthy days.
|
||||
#
|
||||
# Also proactively cleans the two things that caused that outage:
|
||||
# 1. orphaned /tmp/forgejo-dump.zip left behind by an interrupted backup
|
||||
# 2. Docker build cache (safe, fully regenerable)
|
||||
# ...but ONLY once usage crosses a soft threshold, so normal days are untouched.
|
||||
#
|
||||
# Schedule: every 15 min via cron (see infra/cron/pw-disk-space-alert).
|
||||
# No `-e` on purpose: every step below is best-effort and the script must keep
# going (and still alert) when an individual command fails.
set -uo pipefail

ENV_FILE=/opt/performancewest/.env
WARN_PCT=${PW_DISK_WARN_PCT:-80}   # Telegram warning at/above this %
CRIT_PCT=${PW_DISK_CRIT_PCT:-88}   # auto-cleanup kicks in at/above this %
MOUNT=/

# A non-numeric override would make the integer tests further down error out
# instead of comparing, so fall back to the built-in defaults in that case.
[[ "$WARN_PCT" =~ ^[0-9]+$ ]] || WARN_PCT=80
[[ "$CRIT_PCT" =~ ^[0-9]+$ ]] || CRIT_PCT=88

#######################################
# Print the value of KEY from a dotenv-style file (first match wins).
# Strips a trailing carriage return and one pair of matching surrounding
# quotes, so `KEY="value"` and CRLF-terminated files yield a usable value.
# Globals:   ENV_FILE (read, default file)
# Arguments: $1 - key name; $2 - optional file path (defaults to $ENV_FILE)
# Outputs:   the value on stdout without trailing newline; empty if the key
#            or the file is missing
# Returns:   0 always
#######################################
_env_value() {
  local key=$1
  local file=${2:-$ENV_FILE}
  local line value

  line=$(grep -E -- "^${key}=" "$file" 2>/dev/null | head -n 1)
  value=${line#*=}          # everything after the first '='
  value=${value%$'\r'}      # tolerate CRLF line endings

  if [[ "$value" == \"*\" || "$value" == \'*\' ]]; then
    value=${value:1:${#value}-2}
  fi

  printf '%s' "$value"
  return 0
}

BOT=$(_env_value TELEGRAM_BOT_TOKEN)
CHAT=$(_env_value TELEGRAM_CHAT_ID)
|
||||
#######################################
# Send a plain-text message to the configured Telegram chat (best-effort).
# Globals:   BOT (read, bot token), CHAT (read, chat id)
# Arguments: $1 - message text
# Outputs:   one diagnostic line on stderr if the request fails
# Returns:   0 if sent or if BOT/CHAT is not configured; curl's exit status
#            otherwise (includes HTTP >= 400 thanks to --fail)
#######################################
tg() {
  [[ -n "${BOT:-}" && -n "${CHAT:-}" ]] || return 0

  local text=$1
  local rc=0

  # The request URL embeds the bot token, so it is fed to curl as a config
  # line on stdin (--config -) instead of argv, where it would be visible to
  # other local users via `ps`. --fail turns HTTP errors (e.g. 401 for a bad
  # token) into a non-zero exit instead of a silent "success".
  curl -fs --max-time 10 --config - \
    --data-urlencode "chat_id=${CHAT}" \
    --data-urlencode "text=${text}" \
    >/dev/null 2>&1 \
    <<<"url = \"https://api.telegram.org/bot${BOT}/sendMessage\"" || rc=$?

  if [[ "$rc" -ne 0 ]]; then
    printf 'pw-disk-space-alert: Telegram sendMessage failed (curl exit %s)\n' "$rc" >&2
  fi
  return "$rc"
}
|
||||
|
||||
# Print the used-space percentage of $MOUNT as bare digits (no '%', no
# newline), taken from the last line of `df --output=pcent`. Prints nothing
# if df yields no output.
usage_pct() {
  df --output=pcent "$MOUNT" \
    | tail -n 1 \
    | tr -cd '0-9'
}
|
||||
|
||||
# ---- Main flow --------------------------------------------------------------
# 1. Measure usage; exit silently when below WARN_PCT.
# 2. At/above CRIT_PCT, reclaim space (forgejo dump + Docker build cache).
# 3. Re-measure, send the Telegram alert, and append a history line.
# NOTE(review): there is no de-duplication, so an alert is sent on every run
# (every 15 min per the cron schedule) while usage stays at/above WARN_PCT.

PCT=$(usage_pct)
[ -n "$PCT" ] || exit 0   # df produced no usable figure; nothing to evaluate

if [ "$PCT" -lt "$WARN_PCT" ]; then
  exit 0   # healthy, stay silent
fi

FREED=""
if [ "$PCT" -ge "$CRIT_PCT" ]; then
  # 1. Drop any orphaned forgejo dump still sitting in the container's /tmp.
  # NOTE(review): `test -f` cannot tell an orphaned dump from one that a
  # running backup is still writing; confirm backups never overlap this job.
  if docker exec performancewest-forgejo test -f /tmp/forgejo-dump.zip 2>/dev/null; then
    if docker exec performancewest-forgejo rm -f /tmp/forgejo-dump.zip 2>/dev/null; then
      FREED="${FREED}orphaned forgejo-dump.zip; "
    fi
  fi

  # 2. Reclaim Docker build cache (safe, regenerable).
  RECLAIMED=$(docker builder prune -af 2>/dev/null \
    | grep -oE 'Total reclaimed space: .*' \
    | head -n 1)
  if [ -n "$RECLAIMED" ]; then
    FREED="${FREED}build-cache (${RECLAIMED#Total reclaimed space: }); "
  fi
fi

NEWPCT=$(usage_pct)
# If the second measurement fails, fall back to the first one so the integer
# comparison and the alert text below stay valid.
[ -n "$NEWPCT" ] || NEWPCT=$PCT
AVAIL=$(df -h "$MOUNT" | tail -n 1 | awk '{print $4}')

# Under cron this runs as root, where sudo is redundant. For manual non-root
# runs use `sudo -n` so a missing NOPASSWD rule fails fast instead of prompting.
if [ "$(id -u)" -eq 0 ]; then
  DU_CMD=(du)
else
  DU_CMD=(sudo -n du)
fi
TOPDIRS=$("${DU_CMD[@]}" -shx /var/lib/docker /opt/backups /var/log 2>/dev/null \
  | sort -rh \
  | head -n 3 \
  | awk '{printf "%s %s; ", $1, $2}')

LEVEL="⚠️ WARNING"
if [ "$NEWPCT" -ge "$CRIT_PCT" ]; then
  LEVEL="🔴 CRITICAL"
fi

MSG=$(printf '%s Performance West disk space\n\nRoot %s: %s%% used (%s free)\n%s\nTop: %s' \
  "$LEVEL" "$MOUNT" "$NEWPCT" "$AVAIL" \
  "${FREED:+Auto-freed: ${FREED}now ${NEWPCT}%}" "$TOPDIRS")
tg "$MSG"

# Mirror to the same report log the warmup check uses, for history.
# Best-effort: create the directory if it is missing; on a full disk the
# append itself may still fail, which must not abort the script.
LOG_FILE=/opt/performancewest/logs/pw-disk-space-alert.log
mkdir -p -- "${LOG_FILE%/*}" 2>/dev/null || true
printf '[%s] disk %s%% -> %s%% (free %s) %s\n' \
  "$(date '+%F %T')" "$PCT" "$NEWPCT" "$AVAIL" "${FREED:+freed: $FREED}" \
  >> "$LOG_FILE" 2>&1
|
||||
Loading…
Add table
Add a link
Reference in a new issue