new-site/deploy.sh
justin 7670608c1a fix(monitoring): render alertmanager.yml from template at deploy (fixes crash loop)
Alertmanager does not expand ${ENV} in its YAML, so the committed config with
${TELEGRAM_BOT_TOKEN}/${TELEGRAM_CHAT_ID} crash-looped it (line 24: cannot
unmarshal !!str `${TELEG...` into int64) - 11k+ restarts on prod, alerting dead.

- rename alertmanager.yml -> alertmanager.yml.template (keeps ${} placeholders)
- deploy.sh: envsubst the template into the (gitignored) alertmanager.yml from
  .env, scoped to the two TELEGRAM vars so the {{ }} Go-template message survives
- gitignore the rendered file (contains the bot token)
- warns if the vars are unset
2026-06-07 04:49:53 -05:00

106 lines
5.1 KiB
Bash
Executable file

#!/usr/bin/env bash
# Deploy the latest code from git, then rebuild and restart the containers.
#
# Usage:
#   ./deploy.sh               default set: site api workers proxy-relay listmonk-hc
#   ./deploy.sh site          only site
#   ./deploy.sh api           only api
#   ./deploy.sh erpnext       rebuild + migrate ERPNext, re-extract assets
#   ./deploy.sh api workers   a custom set
#
# Strict mode: stop on a failing command, on use of an unset variable, and when
# any stage of a pipeline fails.
set -o errexit -o nounset -o pipefail
# Later steps use paths relative to this directory.
cd /opt/performancewest
# Print the members of the space-separated service list $1 that have a build
# context, each followed by a single space. proxy-relay and listmonk-hc are
# upstream images (no build context) and are dropped.
#
# Pure bash on purpose: the previous `... | grep -vE ... | ...` pipeline exits 1
# when every requested service is filtered out, which under `set -euo pipefail`
# aborted the whole deploy (e.g. `./deploy.sh proxy-relay`).
buildable_services() {
  local svc kept=""
  # Unquoted on purpose: split the list on whitespace. Service names contain
  # no glob characters.
  for svc in $1; do
    case "$svc" in
      proxy-relay | listmonk-hc) ;;
      *) kept+="$svc " ;;
    esac
  done
  printf '%s' "$kept"
}

# Services to (re)start: the command-line arguments joined into one string, or
# the default set when none are given. "$*" rather than "$@" because a single
# space-separated string is what is wanted here.
SERVICES="${*:-site api workers proxy-relay listmonk-hc}"
# proxy-relay and listmonk-hc are upstream images (no build context). Build
# everything else, but keep them in the default `up` set so the healthcare
# proxy sidecar and the healthcare-stream Listmonk run.
# NB: listmonk-hc needs a one-time DB setup the first time it is deployed:
# docker compose exec api-postgres psql -U pw -d postgres -c 'CREATE DATABASE listmonk_hc OWNER pw;'
# docker compose run --rm --entrypoint /bin/sh listmonk-hc -c './listmonk --install --idempotent --yes --config /listmonk/config.toml'
# then configure its 3 SMTP servers (hc ports 2526/2527/2528). See
# docs/healthcare-email-stream-plan.md.
BUILD_SERVICES="$(buildable_services "$SERVICES")"
printf '%s\n' "=== Pulling latest from git ==="
git pull origin main

# The site header has one source of truth, site/src/partials/nav.html. The sync
# script rewrites each static page's <nav> block from it so the Services
# dropdown is the same on the static site, the Astro order pages, and dev. It
# is idempotent and changes nothing when already in sync (see
# scripts/sync_nav.py).
printf '\n%s\n' "=== Syncing canonical site header (Services dropdown) ==="
python3 scripts/sync_nav.py
# Render the Alertmanager config from its template. Alertmanager does NOT expand
# ${ENV} placeholders in its YAML, so the raw template (with ${TELEGRAM_BOT_TOKEN}
# / ${TELEGRAM_CHAT_ID}) crash-loops it ("cannot unmarshal !!str `${TELEG...`").
# We substitute the real values here from .env at deploy time. Only those two
# vars are expanded so Alertmanager's own {{ }} Go-template message is untouched.
#
# Does nothing when the template is absent. Otherwise exports the variables
# defined in ./.env (if present), writes monitoring/alertmanager.yml, and warns
# on stderr when a Telegram variable is unset or empty. The config is rendered
# either way, so a missing variable does not abort the deploy.
render_alertmanager_config() {
  [ -f monitoring/alertmanager.yml.template ] || return 0

  # Source .env before declaring locals so its assignments stay global.
  set -a
  if [ -f .env ]; then
    . ./.env
  fi
  set +a

  # envsubst replaces an unset variable with an empty string, so a leftover
  # "${TELEGRAM" in the output cannot be used to detect missing values; check
  # the variables themselves.
  local name missing=""
  for name in TELEGRAM_BOT_TOKEN TELEGRAM_CHAT_ID; do
    if [ -z "${!name:-}" ]; then
      missing="${missing:+$missing/}$name"
    fi
  done

  # Written in place (no temp file + rename), same as before.
  envsubst '${TELEGRAM_BOT_TOKEN} ${TELEGRAM_CHAT_ID}' \
    < monitoring/alertmanager.yml.template > monitoring/alertmanager.yml

  if [ -n "$missing" ]; then
    echo "WARN: ${missing} not set in .env; Alertmanager will crash-loop." >&2
  elif grep -q '\${TELEGRAM' monitoring/alertmanager.yml; then
    # A placeholder other than the two expanded above is still in the output.
    echo "WARN: unexpanded \${TELEGRAM...} placeholder left in monitoring/alertmanager.yml; Alertmanager will crash-loop." >&2
  fi
}

echo ""
echo "=== Rendering monitoring/alertmanager.yml from template ==="
render_alertmanager_config
printf '\n%s\n' "=== Building: $SERVICES ==="
# The ERPNext image bakes in the custom Frappe apps, so they have to be staged
# from the repo into the build context (erpnext/<app>/) before building.
# Otherwise `docker compose build erpnext` would pick up stale app copies and
# silently ship old code (e.g. the set-password controller rename would never
# take effect).
if [[ " $SERVICES " == *" erpnext "* ]]; then
  echo "--- staging custom Frappe apps ---"
  bash erpnext/build.sh
fi
# Build only when BUILD_SERVICES holds something other than spaces.
if [[ -n "${BUILD_SERVICES// /}" ]]; then
  # Unquoted on purpose: one argument per service name.
  docker compose build $BUILD_SERVICES
fi
printf '\n%s\n' "=== Restarting: $SERVICES ==="
# Unquoted on purpose: one argument per service name.
docker compose up -d $SERVICES
# ── ERPNext: migrate, then ALWAYS re-extract the host asset copy ─────────────
# Frappe emits content-hashed asset filenames; an ERPNext rebuild/migrate
# changes the hashes. If we don't re-sync the host copy that nginx serves for
# portal.performancewest.net, every asset 404s and the portal loses all CSS.
# So any time erpnext is (re)built we run bench migrate + re-extract assets.

# Run `bench migrate` in the erpnext container: first against the
# performancewest.net site, then without --site as a fallback. Best-effort:
# always returns 0 so the asset re-extraction still runs, but when both
# attempts fail a warning goes to stderr instead of the failure being
# silently discarded.
erpnext_migrate() {
  docker compose exec -T erpnext bench --site performancewest.net migrate && return 0
  docker compose exec -T erpnext bench migrate && return 0
  echo "WARN: ERPNext bench migrate failed (with and without --site); continuing." >&2
  return 0
}

case " $SERVICES " in
  *" erpnext "*)
    echo ""
    echo "=== ERPNext: bench migrate ==="
    erpnext_migrate
    echo ""
    echo "=== ERPNext: re-extracting static assets for the portal ==="
    sudo ./extract-erpnext-assets.sh
    ;;
esac
printf '\n%s\n' "=== Clearing nginx cache ==="
# Best-effort: errors from either command are hidden and never fail the deploy.
# NOTE(review): the /var/cache/nginx/* glob is expanded by the invoking user's
# shell before sudo runs. If that user cannot list the directory, the pattern
# stays literal and nothing is removed. Confirm the deploy user can read it.
sudo rm -rf /var/cache/nginx/* 2>/dev/null || :
sudo nginx -s reload 2>/dev/null || :
# ── Portal asset drift guard ────────────────────────────────────────────────
# Safety net that runs on EVERY deploy. Take the first login.bundle.*.css name
# listed in the container's assets.json; if that file is missing from the host
# copy under /opt/erpnext-assets, the portal CSS is broken, so heal it by
# re-extracting. This covers drift from any source (out-of-band ERPNext
# restarts, image pulls, etc.). Skipped when the container does not exist.
if docker inspect performancewest-erpnext-1 &>/dev/null; then
  # A failed lookup leaves LOGIN_HASH empty rather than failing the deploy.
  LOGIN_HASH="$(
    docker exec performancewest-erpnext-1 sh -c \
      "grep -o 'login.bundle.[A-Z0-9]*.css' /home/frappe/frappe-bench/sites/assets/assets.json | head -1" \
      2>/dev/null || true
  )"
  if [[ -n "$LOGIN_HASH" && ! -f "/opt/erpnext-assets/assets/frappe/dist/css/${LOGIN_HASH}" ]]; then
    printf '\n%s\n' "=== Portal asset drift detected (${LOGIN_HASH} missing) — re-extracting ==="
    sudo ./extract-erpnext-assets.sh
  fi
fi
echo ""
echo "=== Done ==="
git log --oneline -1
# Informational summary only. This is the script's last command and pipefail is
# on, so if `head` closes the pipe before `docker compose ps` has finished
# writing, the pipeline can return non-zero and become the exit status of an
# otherwise successful deploy. Keep the summary from deciding the result.
docker compose ps --format "table {{.Name}}\t{{.Status}}" | head -10 || true