Each service gets its own Prometheus probe verifying actual functionality:
- API: /status endpoint (checks DB connectivity, returns 503 if down)
- Workers: /health endpoint (job server responsive)
- ERPNext: API method call (MariaDB + Redis + app all working)
- MinIO: /minio/health/live (storage accessible)
- Listmonk: /api/health (email service + DB)
- Ollama: root endpoint (LLM inference available)
- Umami: /api/heartbeat (analytics tracking)
- Forgejo: root page (git server accessible)
- PostgreSQL: pg_up metric from postgres-exporter
- All HTTPS endpoints: SSL + reachability from outside

Service-specific alerts with context:
- API down = DB may be unreachable
- Workers down = compliance orders not processing
- ERPNext down = CRM inaccessible
- MinIO down = document storage unavailable

Custom Grafana dashboard: "Performance West — Services Overview"
- Service status grid (UP/DOWN with colors)
- Response time charts (internal + HTTPS)
- SSL certificate expiry gauges
- Container CPU/memory per service
- PostgreSQL connections, nginx req/s, active alerts

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
151 lines
7.2 KiB
JSON
151 lines
7.2 KiB
JSON
{
  "dashboard": {
    "id": null,
    "uid": null,
    "title": "Performance West — Services Overview",
    "tags": ["performancewest", "services"],
    "timezone": "browser",
    "refresh": "30s",
    "time": { "from": "now-1h", "to": "now" },
    "panels": [
      {
        "type": "stat",
        "title": "Service Status",
        "gridPos": { "h": 4, "w": 24, "x": 0, "y": 0 },
        "fieldConfig": {
          "defaults": {
            "mappings": [
              { "options": { "0": { "text": "DOWN", "color": "red" } }, "type": "value" },
              { "options": { "1": { "text": "UP", "color": "green" } }, "type": "value" }
            ],
            "thresholds": { "steps": [{ "value": null, "color": "red" }, { "value": 1, "color": "green" }] }
          },
          "overrides": []
        },
        "options": { "reduceOptions": { "calcs": ["lastNotNull"] }, "graphMode": "none", "colorMode": "background", "textMode": "auto" },
        "targets": [
          { "expr": "probe_success{job='pw_api_prod'}", "legendFormat": "API", "datasource": { "type": "prometheus" } },
          { "expr": "probe_success{job='pw_site_prod'}", "legendFormat": "Site", "datasource": { "type": "prometheus" } },
          { "expr": "probe_success{job='pw_workers'}", "legendFormat": "Workers", "datasource": { "type": "prometheus" } },
          { "expr": "probe_success{job='pw_erpnext'}", "legendFormat": "ERPNext", "datasource": { "type": "prometheus" } },
          { "expr": "probe_success{job='pw_minio'}", "legendFormat": "MinIO", "datasource": { "type": "prometheus" } },
          { "expr": "probe_success{job='pw_listmonk'}", "legendFormat": "Listmonk", "datasource": { "type": "prometheus" } },
          { "expr": "probe_success{job='pw_ollama'}", "legendFormat": "Ollama", "datasource": { "type": "prometheus" } },
          { "expr": "probe_success{job='pw_umami'}", "legendFormat": "Umami", "datasource": { "type": "prometheus" } },
          { "expr": "probe_success{job='pw_forgejo'}", "legendFormat": "Forgejo", "datasource": { "type": "prometheus" } },
          { "expr": "pg_up", "legendFormat": "PostgreSQL", "datasource": { "type": "prometheus" } }
        ]
      },
      {
        "type": "timeseries",
        "title": "Service Response Time",
        "gridPos": { "h": 8, "w": 12, "x": 0, "y": 4 },
        "fieldConfig": {
          "defaults": { "unit": "s", "thresholds": { "steps": [{ "value": null, "color": "green" }, { "value": 1, "color": "yellow" }, { "value": 3, "color": "red" }] } }
        },
        "targets": [
          { "expr": "probe_duration_seconds{job='pw_api_prod'}", "legendFormat": "API", "datasource": { "type": "prometheus" } },
          { "expr": "probe_duration_seconds{job='pw_site_prod'}", "legendFormat": "Site", "datasource": { "type": "prometheus" } },
          { "expr": "probe_duration_seconds{job='pw_workers'}", "legendFormat": "Workers", "datasource": { "type": "prometheus" } },
          { "expr": "probe_duration_seconds{job='pw_erpnext'}", "legendFormat": "ERPNext", "datasource": { "type": "prometheus" } },
          { "expr": "probe_duration_seconds{job='pw_minio'}", "legendFormat": "MinIO", "datasource": { "type": "prometheus" } }
        ]
      },
      {
        "type": "timeseries",
        "title": "HTTPS Endpoint Response Time",
        "gridPos": { "h": 8, "w": 12, "x": 12, "y": 4 },
        "fieldConfig": {
          "defaults": { "unit": "s", "thresholds": { "steps": [{ "value": null, "color": "green" }, { "value": 2, "color": "yellow" }, { "value": 5, "color": "red" }] } }
        },
        "targets": [
          { "expr": "probe_duration_seconds{job='blackbox_https'}", "legendFormat": "{{ instance }}", "datasource": { "type": "prometheus" } }
        ]
      },
      {
        "type": "stat",
        "title": "HTTPS Endpoints",
        "gridPos": { "h": 4, "w": 24, "x": 0, "y": 12 },
        "fieldConfig": {
          "defaults": {
            "mappings": [
              { "options": { "0": { "text": "DOWN", "color": "red" } }, "type": "value" },
              { "options": { "1": { "text": "UP", "color": "green" } }, "type": "value" }
            ],
            "thresholds": { "steps": [{ "value": null, "color": "red" }, { "value": 1, "color": "green" }] }
          }
        },
        "options": { "reduceOptions": { "calcs": ["lastNotNull"] }, "graphMode": "none", "colorMode": "background" },
        "targets": [
          { "expr": "probe_success{job='blackbox_https'}", "legendFormat": "{{ instance }}", "datasource": { "type": "prometheus" } }
        ]
      },
      {
        "type": "gauge",
        "title": "SSL Certificate Days Remaining",
        "gridPos": { "h": 6, "w": 24, "x": 0, "y": 16 },
        "fieldConfig": {
          "defaults": {
            "unit": "d",
            "min": 0, "max": 90,
            "thresholds": { "steps": [{ "value": null, "color": "red" }, { "value": 7, "color": "orange" }, { "value": 14, "color": "yellow" }, { "value": 30, "color": "green" }] }
          }
        },
        "targets": [
          { "expr": "(probe_ssl_earliest_cert_expiry{job='blackbox_https'} - time()) / 86400", "legendFormat": "{{ instance }}", "datasource": { "type": "prometheus" } }
        ]
      },
      {
        "type": "timeseries",
        "title": "Container CPU Usage",
        "gridPos": { "h": 8, "w": 12, "x": 0, "y": 22 },
        "fieldConfig": { "defaults": { "unit": "percentunit" } },
        "targets": [
          { "expr": "sum by (name) (rate(container_cpu_usage_seconds_total{name=~'performancewest-(api|site|workers|erpnext|minio|listmonk|ollama|api-postgres)-1'}[5m]))", "legendFormat": "{{ name }}", "datasource": { "type": "prometheus" } }
        ]
      },
      {
        "type": "timeseries",
        "title": "Container Memory Usage",
        "gridPos": { "h": 8, "w": 12, "x": 12, "y": 22 },
        "fieldConfig": { "defaults": { "unit": "bytes" } },
        "targets": [
          { "expr": "container_memory_usage_bytes{name=~'performancewest-(api|site|workers|erpnext|minio|listmonk|ollama|api-postgres)-1'}", "legendFormat": "{{ name }}", "datasource": { "type": "prometheus" } }
        ]
      },
      {
        "type": "stat",
        "title": "PostgreSQL",
        "gridPos": { "h": 4, "w": 8, "x": 0, "y": 30 },
        "targets": [
          { "expr": "sum(pg_stat_activity_count)", "legendFormat": "Active Connections", "datasource": { "type": "prometheus" } }
        ]
      },
      {
        "type": "stat",
        "title": "nginx Requests/sec",
        "gridPos": { "h": 4, "w": 8, "x": 8, "y": 30 },
        "fieldConfig": { "defaults": { "unit": "reqps" } },
        "targets": [
          { "expr": "rate(nginx_http_requests_total[5m])", "legendFormat": "req/s", "datasource": { "type": "prometheus" } }
        ]
      },
      {
        "type": "stat",
        "title": "Active Alerts",
        "gridPos": { "h": 4, "w": 8, "x": 16, "y": 30 },
        "fieldConfig": {
          "defaults": {
            "thresholds": { "steps": [{ "value": null, "color": "green" }, { "value": 1, "color": "orange" }, { "value": 3, "color": "red" }] }
          }
        },
        "targets": [
          { "expr": "count(ALERTS{alertstate='firing'}) or vector(0)", "legendFormat": "Firing", "datasource": { "type": "prometheus" } }
        ]
      }
    ],
    "schemaVersion": 39
  },
  "overwrite": true,
  "folderId": 0
}