new-site/deploy.sh
justin ab9491be6a fix(deploy): hard-reset to origin/main + assert HEAD advanced (stop silent strands)
deploy.sh used 'git pull origin main', which silently ABORTS when the tracked
tree is dirty (generated site files, or any drift), stranding new commits on an
old checkout — this bit us twice today (prod stuck at b125d46 while origin had
the COC work). Replaced with:
  git fetch origin main && git reset --hard origin/main
The deploy box is a pure mirror of origin (all real changes land via git), so a
hard reset is safe and untracked files (data/*, .secrets/) are preserved. Added
a post-reset assertion that HEAD == origin/main and exits 1 loudly otherwise, so
a strand can never again be masked by a '| tail' in the caller.
2026-06-16 09:25:11 -05:00

144 lines
7.3 KiB
Bash
Executable file

#!/usr/bin/env bash
# Deploy latest code from git and rebuild containers.
#
# Usage:
#   ./deploy.sh              rebuilds site, api, workers; the default set also
#                            (re)starts proxy-relay and listmonk-hc
#   ./deploy.sh site         rebuilds only site
#   ./deploy.sh api          rebuilds only api
#   ./deploy.sh erpnext      rebuild + migrate ERPNext, re-extract assets
#   ./deploy.sh api workers  rebuild a custom set
#
# Strict mode: abort on any failing command (-e), on use of an unset variable
# (-u), and when any stage of a pipeline fails (pipefail).
set -euo pipefail
# Every relative path used below (scripts/, monitoring/, .env, erpnext/, ...)
# is resolved against this directory.
cd /opt/performancewest
# Space-separated names of the compose services to deploy: the command-line
# arguments, or the default set when none are given. "$*" is used because the
# value is one string; it joins the arguments with a space (default IFS).
SERVICES="${*:-site api workers proxy-relay listmonk-hc}"

# proxy-relay and listmonk-hc are upstream images (no build context). Build
# everything else, but always include them in the `up` set so the healthcare
# proxy sidecar and the healthcare-stream Listmonk run.
# NB: listmonk-hc needs a one-time DB setup the first time it is deployed:
#   docker compose exec api-postgres psql -U pw -d postgres -c 'CREATE DATABASE listmonk_hc OWNER pw;'
#   docker compose run --rm --entrypoint /bin/sh listmonk-hc -c './listmonk --install --idempotent --yes --config /listmonk/config.toml'
# then configure its 3 SMTP servers (hc ports 2526/2527/2528). See
# docs/healthcare-email-stream-plan.md.

#######################################
# Print the services that have a build context.
# Arguments: $1 - space-separated service names
# Outputs:   every name other than proxy-relay / listmonk-hc, each followed
#            by one space; nothing at all when every name is filtered out
# Returns:   0 on success (an empty result is a success); non-zero only when
#            a pipeline stage genuinely fails
#######################################
buildable_services() {
  # grep exits 1 when it selects no lines. Under `set -o pipefail` that fails
  # the pipeline, and under `set -e` the failed assignment below would abort
  # the whole deploy whenever only upstream-image services were requested
  # (e.g. `./deploy.sh proxy-relay`). Status 1 is therefore mapped to success;
  # any other non-zero grep status is still reported as a failure.
  printf '%s\n' "$1" \
    | tr ' ' '\n' \
    | { grep -vE '^(proxy-relay|listmonk-hc)$' || [ "$?" -eq 1 ]; } \
    | tr '\n' ' '
}

# May be empty (or blank); the build step is expected to skip in that case.
BUILD_SERVICES="$(buildable_services "$SERVICES")"
printf '%s\n' "=== Pulling latest from git ==="

# Later deploy steps (sync_nav.py, gen-service-catalog.py) rewrite generated
# files under site/public and site/src in place, so the tree is left dirty
# after every deploy. Discard those generated edits first. Only the generated
# paths are touched, and a failure here is deliberately ignored.
git checkout -- site/public site/src 2>/dev/null || true

# Mirror origin instead of merging: fetch the remote tip, then hard-reset the
# tracked tree onto it. The deploy box is a pure mirror of origin (all real
# changes land via git), so any tracked-file drift is generated or stale and is
# thrown away rather than being allowed to block the update. Untracked files
# (data/*, .secrets/) are preserved by the reset.
git fetch origin main
git reset --hard origin/main

# Post-condition: HEAD must now be the just-fetched origin tip. If it is not,
# stop with a non-zero status and a message on stderr so the failure cannot be
# hidden by a `| tail` in whatever invoked this script.
LOCAL_HEAD="$(git rev-parse HEAD)"
ORIGIN_HEAD="$(git rev-parse origin/main)"
[[ "$LOCAL_HEAD" == "$ORIGIN_HEAD" ]] || {
  echo "FATAL: working tree is at $LOCAL_HEAD but origin/main is $ORIGIN_HEAD — deploy aborting." >&2
  exit 1
}
echo "Deploying commit $LOCAL_HEAD"
# Site header: scripts/sync_nav.py rewrites the <nav> block of every static
# page from the single canonical copy in site/src/partials/nav.html, keeping
# the Services dropdown identical across the static site, the Astro order
# pages, and dev. It is idempotent and changes nothing when already in sync.
printf '\n%s\n' "=== Syncing canonical site header (Services dropdown) ==="
python3 scripts/sync_nav.py

# Service pricing: the API catalog (api/src/service-catalog.ts) is the
# authority, because it is what checkout charges. The site's docker build
# context is ./site only and cannot read ../api, so
# site/src/lib/service-catalog.generated.ts is produced here on the host before
# the docker build; displayed prices therefore equal charged prices. Python is
# used because the prod box has python3 but not node (same as sync_nav.py).
printf '\n%s\n' "=== Generating site service catalog from API source ==="
python3 scripts/gen-service-catalog.py
# Under `set -e` a non-zero exit from the drift check stops the deploy here.
python3 scripts/check-service-catalog-drift.py
# Alertmanager does NOT expand ${ENV} placeholders in its YAML: started on the
# raw template (with ${TELEGRAM_BOT_TOKEN} / ${TELEGRAM_CHAT_ID}) it crash-loops
# ("cannot unmarshal !!str `${TELEG...`"). So the config is rendered here at
# deploy time with the real values taken from .env. Only those two variables
# are substituted, which leaves Alertmanager's own {{ }} Go-template message
# untouched.
# NB: the two keys are extracted individually instead of sourcing .env, because
# .env holds values with shell-hostile characters (e.g. SMTP_PASS) that break
# `. ./.env`.
printf '\n%s\n' "=== Rendering monitoring/alertmanager.yml from template ==="
if [[ -f monitoring/alertmanager.yml.template ]]; then
  # Print the value of the first `<key>=` line in .env for key $1; prints
  # nothing when the key is absent.
  get_env() {
    local key="$1"
    sed -n "s/^${key}=//p" .env | head -n1
  }

  TELEGRAM_BOT_TOKEN="$(get_env TELEGRAM_BOT_TOKEN)"
  TELEGRAM_CHAT_ID="$(get_env TELEGRAM_CHAT_ID)"
  export TELEGRAM_BOT_TOKEN TELEGRAM_CHAT_ID

  # The single quotes are intentional: envsubst must receive the literal
  # variable names as the list of placeholders it is allowed to expand.
  envsubst '${TELEGRAM_BOT_TOKEN} ${TELEGRAM_CHAT_ID}' \
    < monitoring/alertmanager.yml.template \
    > monitoring/alertmanager.yml

  # Warn (without aborting) if a placeholder survived rendering or either
  # value came back empty.
  if grep -q '\${TELEGRAM' monitoring/alertmanager.yml \
    || [[ -z "$TELEGRAM_BOT_TOKEN" || -z "$TELEGRAM_CHAT_ID" ]]; then
    echo "WARN: TELEGRAM_BOT_TOKEN/TELEGRAM_CHAT_ID missing in .env; Alertmanager will crash-loop." >&2
  fi
fi
printf '\n%s\n' "=== Building: $SERVICES ==="

# The ERPNext image bakes in the custom Frappe apps, so they have to be staged
# from the repo into the build context (erpnext/<app>/) before building.
# Otherwise `docker compose build erpnext` would use stale app copies and
# silently ship old code (e.g. the set-password controller rename would never
# take effect).
if [[ " $SERVICES " == *" erpnext "* ]]; then
  echo "--- staging custom Frappe apps ---"
  bash erpnext/build.sh
fi

# Build only when at least one buildable service remains. $BUILD_SERVICES is
# left unquoted on purpose so each service name becomes its own argument.
if [[ -n "${BUILD_SERVICES// }" ]]; then
  docker compose build $BUILD_SERVICES
fi

printf '\n%s\n' "=== Restarting: $SERVICES ==="
# Unquoted for the same reason: one argument per service name.
docker compose up -d $SERVICES
# ── ERPNext: migrate, then ALWAYS re-extract the host asset copy ─────────────
# Frappe emits content-hashed asset filenames; an ERPNext rebuild/migrate
# changes the hashes. If we don't re-sync the host copy that nginx serves for
# portal.performancewest.net, every asset 404s and the portal loses all CSS.
# So any time erpnext is (re)built we run bench migrate + re-extract assets.
case " $SERVICES " in
  *" erpnext "*)
    echo ""
    echo "=== ERPNext: bench migrate ==="
    # Try the site-scoped migrate first and fall back to a plain
    # `bench migrate`. Migration stays best-effort so the asset re-extraction
    # below always runs, but a failure of BOTH attempts is now reported on
    # stderr instead of being silently discarded.
    if ! docker compose exec -T erpnext bench --site performancewest.net migrate \
      && ! docker compose exec -T erpnext bench migrate; then
      echo "WARN: ERPNext bench migrate failed (site-scoped and fallback attempts); continuing with asset re-extraction." >&2
    fi
    echo ""
    echo "=== ERPNext: re-extracting static assets for the portal ==="
    sudo ./extract-erpnext-assets.sh
    ;;
esac
printf '\n%s\n' "=== Clearing nginx cache ==="
# Both steps are best-effort: stderr is discarded and a non-zero status is
# ignored, so neither can abort the deploy.
sudo rm -rf /var/cache/nginx/* 2>/dev/null || :
sudo nginx -s reload 2>/dev/null || :
# ── Portal asset drift guard ────────────────────────────────────────────────
# Runs on EVERY deploy as a cheap safety net. If the manifest inside the
# ERPNext container names a login CSS bundle that does not exist in the host
# copy, the portal CSS is broken; heal it by re-extracting the assets. This
# covers drift from any source (out-of-band ERPNext restarts, image pulls,
# etc.).
if docker inspect performancewest-erpnext-1 >/dev/null 2>&1; then
  # First login.bundle.<HASH>.css name found in the container's assets.json;
  # left empty if the lookup fails for any reason.
  LOGIN_HASH="$(
    docker exec performancewest-erpnext-1 sh -c \
      "grep -o 'login.bundle.[A-Z0-9]*.css' /home/frappe/frappe-bench/sites/assets/assets.json | head -1" \
      2>/dev/null || true
  )"
  if [[ -n "$LOGIN_HASH" && ! -f "/opt/erpnext-assets/assets/frappe/dist/css/${LOGIN_HASH}" ]]; then
    printf '\n%s\n' "=== Portal asset drift detected (${LOGIN_HASH} missing) — re-extracting ==="
    sudo ./extract-erpnext-assets.sh
  fi
fi
echo ""
echo "=== Done ==="
# Show the commit that is now checked out.
git log --oneline -1
# Show at most the first 10 lines of container status. `sed -n '1,10p'` reads
# its input through to EOF, unlike `head -10`, which may exit as soon as it has
# its 10 lines. An early exit can kill `docker compose ps` with SIGPIPE; under
# `set -o pipefail` that makes this pipeline — the script's last command —
# non-zero, so an otherwise successful deploy would report failure. A genuine
# `docker compose ps` error still propagates.
docker compose ps --format "table {{.Name}}\t{{.Status}}" | sed -n '1,10p'