Add deep service health monitoring for all PW dependencies

Each service gets its own Prometheus probe verifying actual functionality:
- API: /status endpoint (checks DB connectivity, returns 503 if down)
- Workers: /health endpoint (job server responsive)
- ERPNext: API method call (MariaDB + Redis + app all working)
- MinIO: /minio/health/live (storage accessible)
- Listmonk: /api/health (email service + DB)
- Ollama: root endpoint (LLM inference available)
- Umami: /api/heartbeat (analytics tracking)
- Forgejo: root page (git server accessible)
- PostgreSQL: pg_up metric from postgres-exporter
- All HTTPS endpoints: SSL + reachability from outside

Service-specific alerts with context:
- API down = DB may be unreachable
- Workers down = compliance orders not processing
- ERPNext down = CRM inaccessible
- MinIO down = document storage unavailable

Custom Grafana dashboard: "Performance West — Services Overview"
- Service status grid (UP/DOWN with colors)
- Response time charts (internal + HTTPS)
- SSL certificate expiry gauges
- Container CPU/memory per service
- PostgreSQL connections, nginx req/s, active alerts

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
justin 2026-05-01 03:30:23 -05:00
parent cc463a662f
commit 2f9005693e
3 changed files with 547 additions and 87 deletions

View file

@ -1,38 +1,137 @@
groups: groups:
# ══════════════════════════════════════════════════════════════════════ # ══════════════════════════════════════════════════════════════════════
# Service Down Alerts # Performance West Core Services
# ══════════════════════════════════════════════════════════════════════ # ══════════════════════════════════════════════════════════════════════
- name: service_down - name: pw_services
rules: rules:
- alert: EndpointDown - alert: PW_API_Down
expr: probe_success{job="blackbox_http"} == 0 expr: probe_success{job="pw_api_prod"} == 0
for: 1m
labels:
severity: critical
service: api
annotations:
summary: "Prod API is DOWN"
description: "API /status endpoint failed — database may be unreachable. Check api + api-postgres containers."
- alert: PW_API_Dev_Down
expr: probe_success{job="pw_api_dev"} == 0
for: 3m
labels:
severity: warning
service: api-dev
annotations:
summary: "Dev API is DOWN"
description: "Dev API /status endpoint unreachable."
- alert: PW_Site_Down
expr: probe_success{job="pw_site_prod"} == 0
for: 1m
labels:
severity: critical
service: site
annotations:
summary: "Prod website is DOWN"
description: "performancewest.net static site is not responding."
- alert: PW_Workers_Down
expr: probe_success{job="pw_workers"} == 0
for: 2m
labels:
severity: critical
service: workers
annotations:
summary: "Workers job server is DOWN"
description: "Python workers /health endpoint failed. Compliance orders, formation filings, and cron jobs are not being processed."
- alert: PW_ERPNext_Down
expr: probe_success{job="pw_erpnext"} == 0
for: 2m
labels:
severity: critical
service: erpnext
annotations:
summary: "ERPNext CRM is DOWN"
description: "ERPNext API is unreachable. Sales orders, invoices, and customer records are inaccessible."
- alert: PW_MinIO_Down
expr: probe_success{job="pw_minio"} == 0
for: 2m
labels:
severity: critical
service: minio
annotations:
summary: "MinIO object storage is DOWN"
description: "MinIO health check failed. Document uploads, RMD packets, and file storage are unavailable."
- alert: PW_Listmonk_Down
expr: probe_success{job="pw_listmonk"} == 0
for: 5m
labels:
severity: warning
service: listmonk
annotations:
summary: "Listmonk email service is DOWN"
description: "Listmonk health endpoint failed. Email campaigns and subscriber management are unavailable."
- alert: PW_Ollama_Down
expr: probe_success{job="pw_ollama"} == 0
for: 5m
labels:
severity: warning
service: ollama
annotations:
summary: "Ollama LLM is DOWN"
description: "Ollama not responding. AI-powered document analysis in workers will fall back to regex."
- alert: PW_Umami_Down
expr: probe_success{job="pw_umami"} == 0
for: 5m
labels:
severity: warning
service: umami
annotations:
summary: "Umami analytics is DOWN"
description: "Analytics tracking endpoint unreachable. Site analytics not being recorded."
- alert: PW_Forgejo_Down
expr: probe_success{job="pw_forgejo"} == 0
for: 5m
labels:
severity: warning
service: forgejo
annotations:
summary: "Forgejo git server is DOWN"
description: "Git server unreachable. Code deployments will fail."
# ══════════════════════════════════════════════════════════════════════
# External HTTPS Endpoints (SSL + reachability)
# ══════════════════════════════════════════════════════════════════════
- name: external_endpoints
rules:
- alert: HTTPS_Endpoint_Down
expr: probe_success{job="blackbox_https"} == 0
for: 2m for: 2m
labels: labels:
severity: critical severity: critical
annotations: annotations:
summary: "{{ $labels.instance }} is DOWN" summary: "{{ $labels.instance }} is DOWN"
description: "HTTP probe failed for {{ $labels.instance }} for more than 2 minutes." description: "HTTPS probe failed for {{ $labels.instance }}. Check nginx, DNS, and SSL cert."
- alert: TCPPortDown - alert: TCP_Port_Down
expr: probe_success{job="blackbox_tcp"} == 0 expr: probe_success{job="blackbox_tcp"} == 0
for: 1m for: 1m
labels: labels:
severity: critical severity: critical
annotations: annotations:
summary: "TCP port {{ $labels.instance }} is DOWN" summary: "TCP port {{ $labels.instance }} is DOWN"
description: "TCP connection failed to {{ $labels.instance }} for more than 1 minute." description: "Database or cache port unreachable."
- alert: ContainerDown
expr: |
absent(container_last_seen{name=~"performancewest-(api|site|workers|erpnext|listmonk|minio|umami)-1"})
or time() - container_last_seen{name=~"performancewest-(api|site|workers|erpnext|listmonk|minio|umami)-1"} > 60
for: 1m
labels:
severity: critical
annotations:
summary: "Container {{ $labels.name }} is DOWN"
description: "Docker container {{ $labels.name }} has not been seen for more than 1 minute."
# ══════════════════════════════════════════════════════════════════════
# Container Health
# ══════════════════════════════════════════════════════════════════════
- name: containers
rules:
- alert: ContainerRestarting - alert: ContainerRestarting
expr: increase(container_start_time_seconds{name=~"performancewest-.*"}[15m]) > 2 expr: increase(container_start_time_seconds{name=~"performancewest-.*"}[15m]) > 2
for: 5m for: 5m
@ -40,10 +139,28 @@ groups:
severity: warning severity: warning
annotations: annotations:
summary: "Container {{ $labels.name }} is restart-looping" summary: "Container {{ $labels.name }} is restart-looping"
description: "Container {{ $labels.name }} has restarted more than 2 times in 15 minutes." description: "{{ $labels.name }} has restarted more than 2 times in 15 minutes."
- alert: ContainerHighCPU
  # rate() over CPU-seconds gives "cores in use"; multiplying by 100 turns it
  # into a percentage of ONE core, so the threshold of 80 means 0.8 of a core
  # for each matching performancewest-* container series.
  expr: >-
    rate(container_cpu_usage_seconds_total{name=~"performancewest-.*"}[5m])
    * 100 > 80
  # Must hold for the full 10 minutes before the alert fires.
  for: 10m
  labels:
    severity: warning
  annotations:
    summary: "Container {{ $labels.name }} high CPU ({{ $value | printf \"%.0f\" }}%)"
    description: "Container CPU usage above 80% for 10 minutes."
- alert: ContainerHighMemory
  # Memory used as a percentage of the container's configured memory limit.
  #
  # cAdvisor reports container_spec_memory_limit_bytes as 0 for containers
  # that run without a memory limit. Dividing by that 0 yields +Inf, which is
  # always "> 85", so the alert would fire permanently for every unlimited
  # container. The "> 0" filter drops those series before the division, so
  # only containers that actually have a limit are evaluated.
  expr: >-
    container_memory_usage_bytes{name=~"performancewest-.*"}
    / (container_spec_memory_limit_bytes{name=~"performancewest-.*"} > 0)
    * 100 > 85
  # Must hold for 5 minutes so short spikes do not notify.
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: "Container {{ $labels.name }} high memory ({{ $value | printf \"%.0f\" }}%)"
    description: "Container using more than 85% of its memory limit."
# ══════════════════════════════════════════════════════════════════════ # ══════════════════════════════════════════════════════════════════════
# Host Resource Alerts # Host Resources
# ══════════════════════════════════════════════════════════════════════ # ══════════════════════════════════════════════════════════════════════
- name: host_resources - name: host_resources
rules: rules:
@ -53,8 +170,7 @@ groups:
labels: labels:
severity: warning severity: warning
annotations: annotations:
summary: "High CPU usage ({{ $value | printf \"%.1f\" }}%)" summary: "High CPU ({{ $value | printf \"%.1f\" }}%)"
description: "CPU usage has been above 85% for 10 minutes."
- alert: HighMemory - alert: HighMemory
expr: (1 - node_memory_AvailableBytes / node_memory_MemTotalBytes) * 100 > 90 expr: (1 - node_memory_AvailableBytes / node_memory_MemTotalBytes) * 100 > 90
@ -62,8 +178,7 @@ groups:
labels: labels:
severity: warning severity: warning
annotations: annotations:
summary: "High memory usage ({{ $value | printf \"%.1f\" }}%)" summary: "High memory ({{ $value | printf \"%.1f\" }}%)"
description: "Memory usage has been above 90% for 5 minutes."
- alert: DiskSpaceLow - alert: DiskSpaceLow
expr: (1 - node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"}) * 100 > 80 expr: (1 - node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"}) * 100 > 80
@ -71,8 +186,7 @@ groups:
labels: labels:
severity: warning severity: warning
annotations: annotations:
summary: "Disk usage high ({{ $value | printf \"%.1f\" }}%)" summary: "Disk {{ $value | printf \"%.1f\" }}% full"
description: "Root filesystem is more than 80% full."
- alert: DiskSpaceCritical - alert: DiskSpaceCritical
expr: (1 - node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"}) * 100 > 92 expr: (1 - node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"}) * 100 > 92
@ -80,20 +194,10 @@ groups:
labels: labels:
severity: critical severity: critical
annotations: annotations:
summary: "Disk usage CRITICAL ({{ $value | printf \"%.1f\" }}%)" summary: "DISK CRITICAL {{ $value | printf \"%.1f\" }}% full"
description: "Root filesystem is more than 92% full. Immediate action required."
- alert: HighLoadAverage
expr: node_load15 > 8
for: 10m
labels:
severity: warning
annotations:
summary: "High load average ({{ $value | printf \"%.1f\" }})"
description: "15-minute load average has been above 8 for 10 minutes."
# ══════════════════════════════════════════════════════════════════════ # ══════════════════════════════════════════════════════════════════════
# Database Alerts # Database
# ══════════════════════════════════════════════════════════════════════ # ══════════════════════════════════════════════════════════════════════
- name: database - name: database
rules: rules:
@ -104,7 +208,6 @@ groups:
severity: critical severity: critical
annotations: annotations:
summary: "PostgreSQL is DOWN" summary: "PostgreSQL is DOWN"
description: "PostgreSQL exporter cannot connect to the database."
- alert: PostgresHighConnections - alert: PostgresHighConnections
expr: pg_stat_activity_count > 80 expr: pg_stat_activity_count > 80
@ -112,45 +215,57 @@ groups:
labels: labels:
severity: warning severity: warning
annotations: annotations:
summary: "High PostgreSQL connections ({{ $value }})" summary: "PostgreSQL {{ $value }} active connections"
description: "PostgreSQL active connections exceeding 80."
# ══════════════════════════════════════════════════════════════════════ - alert: PostgresSlowQueries
# SSL Certificate Alerts expr: pg_stat_activity_max_tx_duration > 300
# ══════════════════════════════════════════════════════════════════════
- name: ssl
rules:
- alert: SSLCertExpiringSoon
expr: probe_ssl_earliest_cert_expiry - time() < 14 * 24 * 3600
for: 1h
labels:
severity: warning
annotations:
summary: "SSL cert expiring in < 14 days for {{ $labels.instance }}"
description: "Certificate for {{ $labels.instance }} expires in {{ $value | humanizeDuration }}."
- alert: SSLCertExpiryCritical
expr: probe_ssl_earliest_cert_expiry - time() < 3 * 24 * 3600
for: 10m
labels:
severity: critical
annotations:
summary: "SSL cert expiring in < 3 days for {{ $labels.instance }}"
description: "Certificate for {{ $labels.instance }} expires in {{ $value | humanizeDuration }}. Certbot renewal may be broken."
# ══════════════════════════════════════════════════════════════════════
# Response Time Alerts
# ══════════════════════════════════════════════════════════════════════
- name: latency
rules:
- alert: SlowHTTPResponse
expr: probe_http_duration_seconds{phase="transfer", job="blackbox_http"} > 5
for: 5m for: 5m
labels: labels:
severity: warning severity: warning
annotations: annotations:
summary: "Slow response from {{ $labels.instance }} ({{ $value | printf \"%.1f\" }}s)" summary: "PostgreSQL query running > 5 minutes"
description: "HTTP response time exceeds 5 seconds for {{ $labels.instance }}."
# ══════════════════════════════════════════════════════════════════════
# SSL Certificates
# ══════════════════════════════════════════════════════════════════════
- name: ssl
rules:
- alert: SSLCertExpiringSoon
expr: probe_ssl_earliest_cert_expiry{job="blackbox_https"} - time() < 14 * 24 * 3600
for: 1h
labels:
severity: warning
annotations:
summary: "SSL cert for {{ $labels.instance }} expires in < 14 days"
- alert: SSLCertExpiryCritical
expr: probe_ssl_earliest_cert_expiry{job="blackbox_https"} - time() < 3 * 24 * 3600
for: 10m
labels:
severity: critical
annotations:
summary: "SSL cert for {{ $labels.instance }} expires in < 3 DAYS"
# ══════════════════════════════════════════════════════════════════════
# Response Time
# ══════════════════════════════════════════════════════════════════════
- name: latency
rules:
- alert: APISlowResponse
expr: probe_duration_seconds{job="pw_api_prod"} > 5
for: 5m
labels:
severity: warning
annotations:
summary: "API responding slowly ({{ $value | printf \"%.1f\" }}s)"
- alert: SiteSlowResponse
expr: probe_duration_seconds{job="pw_site_prod"} > 3
for: 5m
labels:
severity: warning
annotations:
summary: "Website responding slowly ({{ $value | printf \"%.1f\" }}s)"
- alert: HighNginx5xxRate - alert: HighNginx5xxRate
expr: rate(nginx_http_requests_total{status=~"5.."}[5m]) > 0.5 expr: rate(nginx_http_requests_total{status=~"5.."}[5m]) > 0.5
@ -159,4 +274,3 @@ groups:
severity: warning severity: warning
annotations: annotations:
summary: "High nginx 5xx error rate" summary: "High nginx 5xx error rate"
description: "More than 0.5 req/s returning 5xx errors."

View file

@ -44,23 +44,22 @@ scrape_configs:
static_configs: static_configs:
- targets: ["nginx-exporter:9113"] - targets: ["nginx-exporter:9113"]
# ── Blackbox probes (HTTP endpoint monitoring) ───────────────────── # ══════════════════════════════════════════════════════════════════════
- job_name: blackbox_http # Performance West Service Health Probes
# Each probe verifies the service is FUNCTIONAL, not just responding
# ══════════════════════════════════════════════════════════════════════
# ── Prod API + DB (returns 503 if DB unreachable) ──────────────────
- job_name: pw_api_prod
metrics_path: /probe metrics_path: /probe
params: params:
module: [http_2xx] module: [http_2xx]
static_configs: static_configs:
- targets: - targets:
- https://performancewest.net - http://api:3001/api/v1/status
- https://api.performancewest.net/api/v1/fcc/search?q=test labels:
- https://dev.performancewest.net service: api
- https://api.dev.performancewest.net/api/v1/fcc/search?q=test env: prod
- https://crm.performancewest.net
- https://lists.performancewest.net
- https://analytics.performancewest.net
- http://minio:9000/minio/health/live
- https://crypto.performancewest.net
- https://pay.performancewest.net
relabel_configs: relabel_configs:
- source_labels: [__address__] - source_labels: [__address__]
target_label: __param_target target_label: __param_target
@ -69,7 +68,203 @@ scrape_configs:
- target_label: __address__ - target_label: __address__
replacement: blackbox-exporter:9115 replacement: blackbox-exporter:9115
# ── Blackbox TCP probes (port monitoring) ────────────────────────── # ── Dev API + DB ───────────────────────────────────────────────────
# Blackbox HTTP probe of the dev API status URL (module http_2xx).
- job_name: pw_api_dev
metrics_path: /probe
params:
module: [http_2xx]
static_configs:
# NOTE(review): the URL below is fetched by the exporter at
# blackbox-exporter:9115, not by Prometheus, so host.docker.internal must
# resolve from inside that container — confirm it has a host-gateway mapping.
- targets:
- http://host.docker.internal:3002/api/v1/status
labels:
service: api
env: dev
relabel_configs:
# Hand the listed URL to the exporter as the ?target= query parameter ...
- source_labels: [__address__]
target_label: __param_target
# ... keep that URL as the instance label on the resulting series ...
- source_labels: [__param_target]
target_label: instance
# ... and point the actual scrape at the blackbox exporter.
- target_label: __address__
replacement: blackbox-exporter:9115
# ── Prod Site (Astro static) ───────────────────────────────────────
- job_name: pw_site_prod
metrics_path: /probe
params:
module: [http_2xx]
static_configs:
- targets:
- http://site:80/
labels:
service: site
env: prod
relabel_configs:
- source_labels: [__address__]
target_label: __param_target
- source_labels: [__param_target]
target_label: instance
- target_label: __address__
replacement: blackbox-exporter:9115
# ── Workers (Python job server) ────────────────────────────────────
- job_name: pw_workers
metrics_path: /probe
params:
module: [http_2xx]
static_configs:
- targets:
- http://workers:8090/health
labels:
service: workers
env: prod
relabel_configs:
- source_labels: [__address__]
target_label: __param_target
- source_labels: [__param_target]
target_label: instance
- target_label: __address__
replacement: blackbox-exporter:9115
# ── ERPNext CRM ────────────────────────────────────────────────────
# Blackbox HTTP probe that calls an ERPNext API method (module http_2xx).
- job_name: pw_erpnext
metrics_path: /probe
params:
module: [http_2xx]
static_configs:
# NOTE(review): frappe.client.get_count is presumably not callable by an
# anonymous (Guest) user; if the http_2xx module sends no credentials the
# request may be answered with 403 and the probe would report ERPNext as
# down even when it is healthy — confirm against a live probe result.
- targets:
- http://erpnext:8000/api/method/frappe.client.get_count?doctype=Customer
labels:
service: erpnext
env: prod
relabel_configs:
# Hand the listed URL to the exporter as the ?target= query parameter ...
- source_labels: [__address__]
target_label: __param_target
# ... keep that URL as the instance label on the resulting series ...
- source_labels: [__param_target]
target_label: instance
# ... and point the actual scrape at the blackbox exporter.
- target_label: __address__
replacement: blackbox-exporter:9115
# ── MinIO object storage ───────────────────────────────────────────
- job_name: pw_minio
metrics_path: /probe
params:
module: [http_2xx]
static_configs:
- targets:
- http://minio:9000/minio/health/live
labels:
service: minio
env: prod
relabel_configs:
- source_labels: [__address__]
target_label: __param_target
- source_labels: [__param_target]
target_label: instance
- target_label: __address__
replacement: blackbox-exporter:9115
# ── Listmonk email marketing ───────────────────────────────────────
- job_name: pw_listmonk
metrics_path: /probe
params:
module: [http_2xx]
static_configs:
- targets:
- http://listmonk:9000/api/health
labels:
service: listmonk
env: prod
relabel_configs:
- source_labels: [__address__]
target_label: __param_target
- source_labels: [__param_target]
target_label: instance
- target_label: __address__
replacement: blackbox-exporter:9115
# ── Ollama LLM ────────────────────────────────────────────────────
- job_name: pw_ollama
metrics_path: /probe
params:
module: [http_2xx]
static_configs:
- targets:
- http://ollama:11434/
labels:
service: ollama
env: prod
relabel_configs:
- source_labels: [__address__]
target_label: __param_target
- source_labels: [__param_target]
target_label: instance
- target_label: __address__
replacement: blackbox-exporter:9115
# ── Umami analytics ────────────────────────────────────────────────
- job_name: pw_umami
metrics_path: /probe
params:
module: [http_2xx]
static_configs:
- targets:
- http://umami:3000/api/heartbeat
labels:
service: umami
env: prod
relabel_configs:
- source_labels: [__address__]
target_label: __param_target
- source_labels: [__param_target]
target_label: instance
- target_label: __address__
replacement: blackbox-exporter:9115
# ── Forgejo git server ─────────────────────────────────────────────
# Blackbox HTTP probe of the Forgejo root page (module http_2xx).
- job_name: pw_forgejo
metrics_path: /probe
params:
module: [http_2xx]
static_configs:
# NOTE(review): the URL below is fetched by the exporter at
# blackbox-exporter:9115, not by Prometheus, so host.docker.internal must
# resolve from inside that container — confirm it has a host-gateway mapping.
- targets:
- http://host.docker.internal:3030/
labels:
service: forgejo
env: prod
relabel_configs:
# Hand the listed URL to the exporter as the ?target= query parameter ...
- source_labels: [__address__]
target_label: __param_target
# ... keep that URL as the instance label on the resulting series ...
- source_labels: [__param_target]
target_label: instance
# ... and point the actual scrape at the blackbox exporter.
- target_label: __address__
replacement: blackbox-exporter:9115
# ══════════════════════════════════════════════════════════════════════
# External-facing HTTPS probes (SSL + reachability from outside)
# ══════════════════════════════════════════════════════════════════════
- job_name: blackbox_https
metrics_path: /probe
params:
module: [http_2xx]
static_configs:
- targets:
- https://performancewest.net
- https://api.performancewest.net/api/v1/status
- https://dev.performancewest.net
- https://crm.performancewest.net
- https://lists.performancewest.net
- https://analytics.performancewest.net
- https://monitoring.performancewest.net
- https://crypto.performancewest.net
relabel_configs:
- source_labels: [__address__]
target_label: __param_target
- source_labels: [__param_target]
target_label: instance
- target_label: __address__
replacement: blackbox-exporter:9115
# ── TCP port probes (databases, caches) ────────────────────────────
- job_name: blackbox_tcp - job_name: blackbox_tcp
metrics_path: /probe metrics_path: /probe
params: params:

View file

@ -0,0 +1,151 @@
{
"dashboard": {
"id": null,
"uid": null,
"title": "Performance West — Services Overview",
"tags": ["performancewest", "services"],
"timezone": "browser",
"refresh": "30s",
"time": { "from": "now-1h", "to": "now" },
"panels": [
{
"type": "stat",
"title": "Service Status",
"gridPos": { "h": 4, "w": 24, "x": 0, "y": 0 },
"fieldConfig": {
"defaults": {
"mappings": [
{ "options": { "0": { "text": "DOWN", "color": "red" } }, "type": "value" },
{ "options": { "1": { "text": "UP", "color": "green" } }, "type": "value" }
],
"thresholds": { "steps": [{ "value": null, "color": "red" }, { "value": 1, "color": "green" }] }
},
"overrides": []
},
"options": { "reduceOptions": { "calcs": ["lastNotNull"] }, "graphMode": "none", "colorMode": "background", "textMode": "auto" },
"targets": [
{ "expr": "probe_success{job='pw_api_prod'}", "legendFormat": "API", "datasource": { "type": "prometheus" } },
{ "expr": "probe_success{job='pw_site_prod'}", "legendFormat": "Site", "datasource": { "type": "prometheus" } },
{ "expr": "probe_success{job='pw_workers'}", "legendFormat": "Workers", "datasource": { "type": "prometheus" } },
{ "expr": "probe_success{job='pw_erpnext'}", "legendFormat": "ERPNext", "datasource": { "type": "prometheus" } },
{ "expr": "probe_success{job='pw_minio'}", "legendFormat": "MinIO", "datasource": { "type": "prometheus" } },
{ "expr": "probe_success{job='pw_listmonk'}", "legendFormat": "Listmonk", "datasource": { "type": "prometheus" } },
{ "expr": "probe_success{job='pw_ollama'}", "legendFormat": "Ollama", "datasource": { "type": "prometheus" } },
{ "expr": "probe_success{job='pw_umami'}", "legendFormat": "Umami", "datasource": { "type": "prometheus" } },
{ "expr": "probe_success{job='pw_forgejo'}", "legendFormat": "Forgejo", "datasource": { "type": "prometheus" } },
{ "expr": "pg_up", "legendFormat": "PostgreSQL", "datasource": { "type": "prometheus" } }
]
},
{
"type": "timeseries",
"title": "Service Response Time",
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 4 },
"fieldConfig": {
"defaults": { "unit": "s", "thresholds": { "steps": [{ "value": null, "color": "green" }, { "value": 1, "color": "yellow" }, { "value": 3, "color": "red" }] } }
},
"targets": [
{ "expr": "probe_duration_seconds{job='pw_api_prod'}", "legendFormat": "API", "datasource": { "type": "prometheus" } },
{ "expr": "probe_duration_seconds{job='pw_site_prod'}", "legendFormat": "Site", "datasource": { "type": "prometheus" } },
{ "expr": "probe_duration_seconds{job='pw_workers'}", "legendFormat": "Workers", "datasource": { "type": "prometheus" } },
{ "expr": "probe_duration_seconds{job='pw_erpnext'}", "legendFormat": "ERPNext", "datasource": { "type": "prometheus" } },
{ "expr": "probe_duration_seconds{job='pw_minio'}", "legendFormat": "MinIO", "datasource": { "type": "prometheus" } }
]
},
{
"type": "timeseries",
"title": "HTTPS Endpoint Response Time",
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 4 },
"fieldConfig": {
"defaults": { "unit": "s", "thresholds": { "steps": [{ "value": null, "color": "green" }, { "value": 2, "color": "yellow" }, { "value": 5, "color": "red" }] } }
},
"targets": [
{ "expr": "probe_duration_seconds{job='blackbox_https'}", "legendFormat": "{{ instance }}", "datasource": { "type": "prometheus" } }
]
},
{
"type": "stat",
"title": "HTTPS Endpoints",
"gridPos": { "h": 4, "w": 24, "x": 0, "y": 12 },
"fieldConfig": {
"defaults": {
"mappings": [
{ "options": { "0": { "text": "DOWN", "color": "red" } }, "type": "value" },
{ "options": { "1": { "text": "UP", "color": "green" } }, "type": "value" }
],
"thresholds": { "steps": [{ "value": null, "color": "red" }, { "value": 1, "color": "green" }] }
}
},
"options": { "reduceOptions": { "calcs": ["lastNotNull"] }, "graphMode": "none", "colorMode": "background" },
"targets": [
{ "expr": "probe_success{job='blackbox_https'}", "legendFormat": "{{ instance }}", "datasource": { "type": "prometheus" } }
]
},
{
"type": "gauge",
"title": "SSL Certificate Days Remaining",
"gridPos": { "h": 6, "w": 24, "x": 0, "y": 16 },
"fieldConfig": {
"defaults": {
"unit": "d",
"min": 0, "max": 90,
"thresholds": { "steps": [{ "value": null, "color": "red" }, { "value": 7, "color": "orange" }, { "value": 14, "color": "yellow" }, { "value": 30, "color": "green" }] }
}
},
"targets": [
{ "expr": "(probe_ssl_earliest_cert_expiry{job='blackbox_https'} - time()) / 86400", "legendFormat": "{{ instance }}", "datasource": { "type": "prometheus" } }
]
},
{
"type": "timeseries",
"title": "Container CPU Usage",
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 22 },
"fieldConfig": { "defaults": { "unit": "percentunit" } },
"targets": [
{ "expr": "rate(container_cpu_usage_seconds_total{name=~'performancewest-(api|site|workers|erpnext|minio|listmonk|ollama)-1'}[5m])", "legendFormat": "{{ name }}", "datasource": { "type": "prometheus" } }
]
},
{
"type": "timeseries",
"title": "Container Memory Usage",
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 22 },
"fieldConfig": { "defaults": { "unit": "bytes" } },
"targets": [
{ "expr": "container_memory_usage_bytes{name=~'performancewest-(api|site|workers|erpnext|minio|listmonk|ollama|api-postgres)-1'}", "legendFormat": "{{ name }}", "datasource": { "type": "prometheus" } }
]
},
{
  "type": "stat",
  "title": "PostgreSQL",
  "description": "Total PostgreSQL connections, summed over every database and connection state reported by the exporter, so the panel shows one number instead of one tile per series.",
  "gridPos": { "h": 4, "w": 8, "x": 0, "y": 30 },
  "targets": [
    { "expr": "sum(pg_stat_activity_count)", "legendFormat": "Active Connections", "datasource": { "type": "prometheus" } }
  ]
},
{
  "type": "stat",
  "title": "nginx Requests/sec",
  "description": "Overall nginx request rate over the last 5 minutes, summed across all nginx_http_requests_total series so a single req/s figure is shown.",
  "gridPos": { "h": 4, "w": 8, "x": 8, "y": 30 },
  "fieldConfig": { "defaults": { "unit": "reqps" } },
  "targets": [
    { "expr": "sum(rate(nginx_http_requests_total[5m]))", "legendFormat": "req/s", "datasource": { "type": "prometheus" } }
  ]
},
{
  "type": "stat",
  "title": "Active Alerts",
  "description": "Number of alerts currently firing. Each ALERTS series has the value 1, so they are counted to make the 1/3 thresholds meaningful; 'or vector(0)' shows 0 instead of 'No data' when nothing is firing.",
  "gridPos": { "h": 4, "w": 8, "x": 16, "y": 30 },
  "fieldConfig": {
    "defaults": {
      "thresholds": { "steps": [{ "value": null, "color": "green" }, { "value": 1, "color": "orange" }, { "value": 3, "color": "red" }] }
    }
  },
  "targets": [
    { "expr": "count(ALERTS{alertstate='firing'}) or vector(0)", "legendFormat": "Firing", "datasource": { "type": "prometheus" } }
  ]
}
],
"schemaVersion": 39
},
"overwrite": true,
"folderId": 0
}