From a4a5500bfc4a7c25bd903c9e794cbbb68c87b7d3 Mon Sep 17 00:00:00 2001
From: justin <justin@liquidator.optimal-reality.com>
Date: Fri, 1 May 2026 02:08:39 -0500
Subject: [PATCH] Add Prometheus + Grafana + Alertmanager monitoring stack

Full observability stack with Telegram alerting:

Components:
- Prometheus: metrics collection, 90-day retention
- Grafana: dashboards at monitoring.performancewest.net
- Alertmanager: routes alerts to Telegram bot
- node-exporter: OS metrics (CPU, RAM, disk, network)
- cAdvisor: container metrics (CPU, memory, restarts)
- postgres-exporter: PostgreSQL connection/query metrics
- nginx-exporter: request rate, 5xx errors, connections
- blackbox-exporter: HTTP/TCP endpoint probing + SSL cert checks

Alert rules:
- Service down (HTTP probe, TCP port, container missing)
- Container restart loops
- High CPU/memory/disk/load
- PostgreSQL down or high connections
- SSL cert expiring (14d warning, 3d critical)
- Slow HTTP responses, high 5xx rate

Blackbox probes all public endpoints:
  performancewest.net, api, dev, crm, lists, analytics,
  minio, crypto, pay

Telegram alerts: critical=1h repeat, warning=6h repeat,
  auto-resolve notifications

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 docker-compose.yml                            |  99 +++++++++++
 infra/ansible/ansible.cfg                     |   4 +
 infra/ansible/inventory/group_vars/all.yml    |   1 +
 infra/ansible/playbooks/site.yml              |   1 +
 .../roles/monitoring/defaults/main.yml        |  13 ++
 .../roles/monitoring/handlers/main.yml        |   5 +
 infra/ansible/roles/monitoring/tasks/main.yml |  91 ++++++++++
 .../nginx/templates/pw-monitoring-tls.conf.j2 |  58 +++++++
 monitoring/alert_rules.yml                    | 162 ++++++++++++++++++
 monitoring/alertmanager.yml                   |  40 +++++
 monitoring/blackbox.yml                       |  15 ++
 monitoring/grafana-datasources.yml            |   9 +
 monitoring/prometheus.yml                     |  83 +++++++++
 13 files changed, 581 insertions(+)
 create mode 100644 infra/ansible/ansible.cfg
 create mode 100644 infra/ansible/roles/monitoring/defaults/main.yml
 create mode 100644 infra/ansible/roles/monitoring/handlers/main.yml
 create mode 100644 infra/ansible/roles/monitoring/tasks/main.yml
 create mode 100644 infra/ansible/roles/nginx/templates/pw-monitoring-tls.conf.j2
 create mode 100644 monitoring/alert_rules.yml
 create mode 100644 monitoring/alertmanager.yml
 create mode 100644 monitoring/blackbox.yml
 create mode 100644 monitoring/grafana-datasources.yml
 create mode 100644 monitoring/prometheus.yml

diff --git a/docker-compose.yml b/docker-compose.yml
index f8b4297..10ab469 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -285,6 +285,103 @@ services:
       - umami-pgdata:/var/lib/postgresql/data
     restart: unless-stopped
 
+  # ── Monitoring Stack ────────────────────────────────────────────────
+  prometheus:
+    image: prom/prometheus:latest
+    restart: unless-stopped
+    command:
+      - "--config.file=/etc/prometheus/prometheus.yml"
+      - "--storage.tsdb.retention.time=90d"  # keep 90 days of samples
+      - "--web.enable-lifecycle"
+    ports:
+      - "127.0.0.1:9090:9090"  # published on loopback only
+    volumes:
+      - "./monitoring/prometheus.yml:/etc/prometheus/prometheus.yml:ro"
+      - "./monitoring/alert_rules.yml:/etc/prometheus/alert_rules.yml:ro"
+      - "prometheus-data:/prometheus"
+
+  grafana:
+    image: grafana/grafana:latest
+    ports:
+      - "127.0.0.1:3200:3000"  # loopback only; the nginx vhost proxies to 127.0.0.1:3200
+    environment:
+      - GF_SECURITY_ADMIN_USER=${GRAFANA_ADMIN_USER:-admin}
+      - GF_SECURITY_ADMIN_PASSWORD=${GRAFANA_ADMIN_PASSWORD:-pw_grafana_2026}  # NOTE(review): committed fallback password applies whenever the variable is unset
+      - GF_SERVER_ROOT_URL=https://monitoring.performancewest.net
+      - GF_SERVER_DOMAIN=monitoring.performancewest.net
+      - GF_SMTP_ENABLED=true
+      - GF_SMTP_HOST=${SMTP_HOST}:${SMTP_PORT}
+      - GF_SMTP_USER=${SMTP_USER}
+      - GF_SMTP_PASSWORD=${SMTP_PASS}
+      - GF_SMTP_FROM_ADDRESS=noreply@performancewest.net
+      - GF_USERS_ALLOW_SIGN_UP=false
+      - GF_AUTH_ANONYMOUS_ENABLED=false
+    volumes:
+      - grafana-data:/var/lib/grafana
+      - ./monitoring/grafana-datasources.yml:/etc/grafana/provisioning/datasources/datasources.yml:ro
+    depends_on:  # start order only; the datasource points at http://prometheus:9090
+      - prometheus
+    restart: unless-stopped
+
+  alertmanager:
+    image: prom/alertmanager:latest
+    ports:
+      - "127.0.0.1:9093:9093"  # published on loopback only
+    volumes:
+      - ./monitoring/alertmanager.yml:/etc/alertmanager/alertmanager.yml:ro
+    command:
+      - --config.file=/etc/alertmanager/alertmanager.yml
+      - --storage.path=/alertmanager
+    environment:  # NOTE(review): alertmanager.yml refers to these as ${...}; confirm Alertmanager substitutes env vars when loading its config
+      - TELEGRAM_BOT_TOKEN=${TELEGRAM_BOT_TOKEN}
+      - TELEGRAM_CHAT_ID=${TELEGRAM_CHAT_ID}
+    restart: unless-stopped
+
+  node-exporter:
+    image: prom/node-exporter:latest
+    command:
+      - --path.rootfs=/host  # host root is bind-mounted at /host below
+      - --collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)  # "$$" is Compose's escape for a literal "$"
+    volumes:
+      - /:/host:ro,rslave
+    pid: host  # share the host PID namespace
+    restart: unless-stopped
+
+  cadvisor:
+    image: gcr.io/cadvisor/cadvisor:latest
+    volumes:  # every host path is mounted read-only
+      - /:/rootfs:ro
+      - /var/run:/var/run:ro
+      - /sys:/sys:ro
+      - /var/lib/docker/:/var/lib/docker:ro
+      - /dev/disk/:/dev/disk:ro
+    devices:
+      - /dev/kmsg
+    privileged: true  # NOTE(review): full host privileges; confirm whether a narrower set of capabilities would suffice
+    restart: unless-stopped
+
+  postgres-exporter:
+    image: prometheuscommunity/postgres-exporter:latest
+    environment:  # NOTE(review): the DSN falls back to a committed password when DB_PASSWORD is unset
+      - DATA_SOURCE_NAME=postgresql://pw:${DB_PASSWORD:-pw_dev_2026}@api-postgres:5432/performancewest?sslmode=disable
+    depends_on:
+      - api-postgres
+    restart: unless-stopped
+
+  nginx-exporter:
+    image: nginx/nginx-prometheus-exporter:latest
+    command:
+      - --nginx.scrape-uri=http://host.docker.internal:80/nginx_status  # double dash is accepted by old and new exporter releases
+    extra_hosts:
+      - "host.docker.internal:host-gateway"  # lets the container reach nginx running on the Docker host
+    restart: unless-stopped
+
+  blackbox-exporter:
+    image: prom/blackbox-exporter:latest
+    restart: unless-stopped
+    volumes:  # probe module definitions, mounted read-only
+      - "./monitoring/blackbox.yml:/etc/blackbox_exporter/config.yml:ro"
+
 volumes:
   api-pgdata:
   worker-data:
@@ -297,3 +394,5 @@ volumes:
   erpnext-mariadb-data:
   listmonk-uploads:
   umami-pgdata:
+  prometheus-data:
+  grafana-data:
diff --git a/infra/ansible/ansible.cfg b/infra/ansible/ansible.cfg
new file mode 100644
index 0000000..7f78352
--- /dev/null
+++ b/infra/ansible/ansible.cfg
@@ -0,0 +1,4 @@
+[defaults]
+roles_path = ./roles
+inventory = ./inventory/hosts.yml
+host_key_checking = False
diff --git a/infra/ansible/inventory/group_vars/all.yml b/infra/ansible/inventory/group_vars/all.yml
index 83497a0..1e78060 100644
--- a/infra/ansible/inventory/group_vars/all.yml
+++ b/infra/ansible/inventory/group_vars/all.yml
@@ -14,6 +14,7 @@ shkeeper_domain: pay.performancewest.net
 shkeeper_admin_domain: crypto.performancewest.net
 minio_domain: minio.performancewest.net
 minio_console_domain: minio-console.performancewest.net
+monitoring_domain: monitoring.performancewest.net
 # Windows DocServer VM (connects to MinIO externally for DOCX→PDF conversion)
 docserver_ip: 108.181.102.34
 
diff --git a/infra/ansible/playbooks/site.yml b/infra/ansible/playbooks/site.yml
index c66031c..02500fc 100644
--- a/infra/ansible/playbooks/site.yml
+++ b/infra/ansible/playbooks/site.yml
@@ -32,4 +32,5 @@
     - worker-crons
     - shkeeper
     - nginx
+    - monitoring
     - security-updates
diff --git a/infra/ansible/roles/monitoring/defaults/main.yml b/infra/ansible/roles/monitoring/defaults/main.yml
new file mode 100644
index 0000000..9540dec
--- /dev/null
+++ b/infra/ansible/roles/monitoring/defaults/main.yml
@@ -0,0 +1,13 @@
+---
+monitoring_domain: monitoring.performancewest.net
+grafana_port: 3200  # host-side port published on 127.0.0.1 by docker-compose.yml
+prometheus_port: 9090
+alertmanager_port: 9093
+
+# Telegram bot credentials for alerts, read from vault_* variables; default to empty strings.
+telegram_bot_token: "{{ vault_telegram_bot_token | default('') }}"
+telegram_chat_id: "{{ vault_telegram_chat_id | default('') }}"
+
+# Grafana admin credentials, read from vault_* variables. NOTE(review): the password fallback is committed in plain text.
+grafana_admin_user: "{{ vault_grafana_admin_user | default('admin') }}"
+grafana_admin_password: "{{ vault_grafana_admin_password | default('pw_grafana_2026') }}"
diff --git a/infra/ansible/roles/monitoring/handlers/main.yml b/infra/ansible/roles/monitoring/handlers/main.yml
new file mode 100644
index 0000000..7419154
--- /dev/null
+++ b/infra/ansible/roles/monitoring/handlers/main.yml
@@ -0,0 +1,5 @@
+---
+- name: Reload nginx
+  ansible.builtin.systemd:
+    name: nginx
+    state: reloaded
diff --git a/infra/ansible/roles/monitoring/tasks/main.yml b/infra/ansible/roles/monitoring/tasks/main.yml
new file mode 100644
index 0000000..f7a5aa6
--- /dev/null
+++ b/infra/ansible/roles/monitoring/tasks/main.yml
@@ -0,0 +1,91 @@
+---
+# ══════════════════════════════════════════════════════════════════════════════
+# Monitoring Role — Prometheus + Grafana + Alertmanager + Telegram
+# ══════════════════════════════════════════════════════════════════════════════
+
+# ── 1. nginx stub_status for nginx-exporter ──────────────────────────
+- name: Enable nginx stub_status endpoint  # scraped by nginx-exporter at http://host.docker.internal:80/nginx_status
+  ansible.builtin.copy:  # server_name lists the exporter's Host header so this block is selected explicitly
+    content: |
+      server {
+          listen 80;
+          server_name 127.0.0.1 host.docker.internal;
+          location /nginx_status {
+              stub_status;
+              allow 127.0.0.1;
+              allow 172.16.0.0/12;
+              deny all;
+          }
+      }
+    dest: /etc/nginx/conf.d/stub-status.conf
+    owner: root
+    group: root
+    mode: "0644"
+  notify: Reload nginx
+
+# ── 2. Deploy nginx config for monitoring.performancewest.net ────────
+- name: Deploy Grafana nginx config
+  ansible.builtin.template:
+    src: ../../nginx/templates/pw-monitoring-tls.conf.j2  # template is kept under the nginx role
+    dest: /etc/nginx/sites-available/pw-monitoring.conf
+    owner: root
+    group: root
+    mode: "0644"
+  notify: Reload nginx  # handler is defined in this role's handlers/main.yml
+
+- name: Enable Grafana nginx config
+  ansible.builtin.file:
+    state: link  # sites-enabled entry is a symlink to sites-available
+    dest: "/etc/nginx/sites-enabled/pw-monitoring.conf"
+    src: "/etc/nginx/sites-available/pw-monitoring.conf"
+  notify: Reload nginx
+
+# ── 3. Obtain TLS certificate ────────────────────────────────────────
+- name: Check if monitoring cert exists
+  ansible.builtin.stat:
+    path: /etc/letsencrypt/live/{{ monitoring_domain }}/fullchain.pem
+  register: monitoring_cert  # consulted by the certbot task below
+
+- name: Obtain Let's Encrypt cert for monitoring domain  # NOTE(review): the nginx reload is a deferred handler; confirm the webroot challenge is already served for this host
+  ansible.builtin.command:  # certbot_webroot and certbot_email are not defined in this role; presumably supplied elsewhere
+    cmd: >
+      certbot certonly --webroot -w {{ certbot_webroot }}
+      -d {{ monitoring_domain }}
+      --non-interactive --agree-tos
+      --email {{ certbot_email }}
+  when: not monitoring_cert.stat.exists  # skip when fullchain.pem is already present
+  notify: Reload nginx
+
+# ── 4. Set env vars for Telegram in .env ─────────────────────────────
+- name: Ensure Telegram vars in .env  # also writes the Grafana admin credentials read by docker-compose.yml
+  ansible.builtin.lineinfile:
+    path: "{{ project_dir }}/.env"
+    regexp: "^{{ item.key }}="  # replace an existing KEY= line rather than appending a duplicate
+    line: "{{ item.key }}={{ item.value }}"
+    state: present
+  loop:
+    - { key: "TELEGRAM_BOT_TOKEN", value: "{{ telegram_bot_token }}" }
+    - { key: "TELEGRAM_CHAT_ID", value: "{{ telegram_chat_id }}" }
+    - { key: "GRAFANA_ADMIN_USER", value: "{{ grafana_admin_user }}" }
+    - { key: "GRAFANA_ADMIN_PASSWORD", value: "{{ grafana_admin_password }}" }
+  when: item.value | string | length > 0  # evaluated per item: Grafana credentials no longer depend on the Telegram token
+  no_log: true  # values are secrets; keep them out of task output
+
+# ── 5. UFW rules ─────────────────────────────────────────────────────
+- name: Allow Grafana from localhost only
+  community.general.ufw:
+    comment: "Grafana (via nginx)"
+    rule: allow
+    proto: tcp
+    from_ip: "127.0.0.1"
+    port: "{{ grafana_port }}"
+
+# ── 6. Start monitoring stack ────────────────────────────────────────
+- name: Start monitoring containers
+  # chdir already runs the command inside project_dir, so no shell or `cd` is needed.
+  ansible.builtin.command:
+    cmd: >
+      docker compose up -d prometheus grafana alertmanager
+      node-exporter cadvisor postgres-exporter nginx-exporter blackbox-exporter
+    chdir: "{{ project_dir }}"
+  changed_when: true  # compose output is not inspected, so every run reports "changed"
diff --git a/infra/ansible/roles/nginx/templates/pw-monitoring-tls.conf.j2 b/infra/ansible/roles/nginx/templates/pw-monitoring-tls.conf.j2
new file mode 100644
index 0000000..c4078d5
--- /dev/null
+++ b/infra/ansible/roles/nginx/templates/pw-monitoring-tls.conf.j2
@@ -0,0 +1,58 @@
+# {{ ansible_managed }}
+# HTTPS config for monitoring.performancewest.net (Grafana)
+
+# Redirect HTTP -> HTTPS
+server {
+    listen 80;
+    server_name {{ monitoring_domain }};  # same variable the role uses for the certificate
+
+    location /.well-known/acme-challenge/ {
+        root {{ certbot_webroot }};
+    }
+
+    location / {
+        return 301 https://{{ monitoring_domain }}$request_uri;
+    }
+}
+
+# Grafana dashboard
+server {
+    listen 443 ssl;
+    http2 on;
+    server_name {{ monitoring_domain }};
+
+    ssl_certificate     /etc/letsencrypt/live/{{ monitoring_domain }}/fullchain.pem;
+    ssl_certificate_key /etc/letsencrypt/live/{{ monitoring_domain }}/privkey.pem;
+    ssl_protocols       TLSv1.2 TLSv1.3;
+    ssl_ciphers         HIGH:!aNULL:!MD5;
+    ssl_prefer_server_ciphers on;
+    ssl_session_cache   shared:SSL:10m;
+    ssl_session_timeout 10m;
+
+    add_header Strict-Transport-Security "max-age=31536000; includeSubDomains; preload" always;
+
+    include /etc/nginx/snippets/pw-security.conf;
+
+    client_max_body_size 10m;
+
+    location / {
+        proxy_pass http://127.0.0.1:3200;  # Grafana's loopback port from docker-compose.yml
+        proxy_set_header Host $host;
+        proxy_set_header X-Real-IP $remote_addr;
+        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
+        proxy_set_header X-Forwarded-Proto $scheme;
+
+        # WebSocket for Grafana Live
+        proxy_http_version 1.1;
+        proxy_set_header Upgrade $http_upgrade;
+        proxy_set_header Connection "upgrade";
+
+        proxy_connect_timeout 10s;
+        proxy_send_timeout 60s;
+        proxy_read_timeout 60s;
+    }
+
+    location /.well-known/acme-challenge/ {
+        root {{ certbot_webroot }};
+    }
+}
diff --git a/monitoring/alert_rules.yml b/monitoring/alert_rules.yml
new file mode 100644
index 0000000..8f84920
--- /dev/null
+++ b/monitoring/alert_rules.yml
@@ -0,0 +1,162 @@
+groups:
+  # ══════════════════════════════════════════════════════════════════════
+  # Service Down Alerts
+  # ══════════════════════════════════════════════════════════════════════
+  - name: service_down
+    rules:
+      - alert: EndpointDown
+        expr: 'probe_success{job="blackbox_http"} == 0'
+        for: 2m
+        annotations:
+          summary: '{{ $labels.instance }} is DOWN'
+          description: 'HTTP probe failed for {{ $labels.instance }} for more than 2 minutes.'
+        labels:
+          severity: critical
+
+      - alert: TCPPortDown
+        expr: 'probe_success{job="blackbox_tcp"} == 0'
+        for: 1m
+        annotations:
+          summary: 'TCP port {{ $labels.instance }} is DOWN'
+          description: 'TCP connection failed to {{ $labels.instance }} for more than 1 minute.'
+        labels:
+          severity: critical
+
+      - alert: ContainerDown  # NOTE(review): absent() with a regex matcher returns no `name` label and only fires when every listed container is missing; confirm intent
+        expr: |
+          absent(container_last_seen{name=~"performancewest-(api|site|workers|erpnext|listmonk|minio|umami)-1"})
+          or time() - container_last_seen{name=~"performancewest-(api|site|workers|erpnext|listmonk|minio|umami)-1"} > 60
+        for: 1m
+        labels:
+          severity: critical
+        annotations:
+          summary: "Container {{ $labels.name }} is DOWN"
+          description: "Docker container {{ $labels.name }} has not been seen for more than 1 minute."
+
+      - alert: ContainerRestarting  # the start time is a timestamp gauge, so count its changes rather than its increase
+        expr: changes(container_start_time_seconds{name=~"performancewest-.*"}[15m]) > 2
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Container {{ $labels.name }} is restart-looping"
+          description: "Container {{ $labels.name }} has restarted more than 2 times in 15 minutes."
+
+  # ══════════════════════════════════════════════════════════════════════
+  # Host Resource Alerts
+  # ══════════════════════════════════════════════════════════════════════
+  - name: host_resources
+    rules:
+      - alert: HighCPU
+        expr: '100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 85'
+        for: 10m
+        annotations:
+          summary: 'High CPU usage ({{ $value | printf "%.1f" }}%)'
+          description: 'CPU usage has been above 85% for 10 minutes.'
+        labels:
+          severity: warning
+
+      - alert: HighMemory  # node_exporter names these metrics node_memory_<Field>_bytes
+        expr: (1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100 > 90
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "High memory usage ({{ $value | printf \"%.1f\" }}%)"
+          description: "Memory usage has been above 90% for 5 minutes."
+
+      - alert: DiskSpaceLow
+        expr: '(1 - node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"}) * 100 > 80'
+        for: 5m
+        annotations:
+          summary: 'Disk usage high ({{ $value | printf "%.1f" }}%)'
+          description: 'Root filesystem is more than 80% full.'
+        labels:
+          severity: warning
+
+      - alert: DiskSpaceCritical
+        expr: '(1 - node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"}) * 100 > 92'
+        for: 2m
+        annotations:
+          summary: 'Disk usage CRITICAL ({{ $value | printf "%.1f" }}%)'
+          description: 'Root filesystem is more than 92% full. Immediate action required.'
+        labels:
+          severity: critical
+
+      - alert: HighLoadAverage
+        expr: 'node_load15 > 8'
+        for: 10m
+        annotations:
+          summary: 'High load average ({{ $value | printf "%.1f" }})'
+          description: '15-minute load average has been above 8 for 10 minutes.'
+        labels:
+          severity: warning
+
+  # ══════════════════════════════════════════════════════════════════════
+  # Database Alerts
+  # ══════════════════════════════════════════════════════════════════════
+  - name: database
+    rules:
+      - alert: PostgresDown
+        expr: 'pg_up == 0'
+        for: 1m
+        annotations:
+          summary: 'PostgreSQL is DOWN'
+          description: 'PostgreSQL exporter cannot connect to the database.'
+        labels:
+          severity: critical
+
+      - alert: PostgresHighConnections  # the metric is one series per database/state, so sum before comparing
+        expr: sum by (instance) (pg_stat_activity_count) > 80
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "High PostgreSQL connections ({{ $value }})"
+          description: "Total PostgreSQL connections across all databases and states exceed 80."
+
+  # ══════════════════════════════════════════════════════════════════════
+  # SSL Certificate Alerts
+  # ══════════════════════════════════════════════════════════════════════
+  - name: ssl
+    rules:
+      - alert: SSLCertExpiringSoon
+        expr: 'probe_ssl_earliest_cert_expiry - time() < 14 * 24 * 3600'
+        for: 1h
+        annotations:
+          summary: 'SSL cert expiring in < 14 days for {{ $labels.instance }}'
+          description: 'Certificate for {{ $labels.instance }} expires in {{ $value | humanizeDuration }}.'
+        labels:
+          severity: warning
+
+      - alert: SSLCertExpiryCritical
+        expr: 'probe_ssl_earliest_cert_expiry - time() < 3 * 24 * 3600'
+        for: 10m
+        annotations:
+          summary: 'SSL cert expiring in < 3 days for {{ $labels.instance }}'
+          description: 'Certificate for {{ $labels.instance }} expires in {{ $value | humanizeDuration }}. Certbot renewal may be broken.'
+        labels:
+          severity: critical
+
+  # ══════════════════════════════════════════════════════════════════════
+  # Response Time Alerts
+  # ══════════════════════════════════════════════════════════════════════
+  - name: latency
+    rules:
+      - alert: SlowHTTPResponse  # total probe duration, not just the body-transfer phase
+        expr: probe_duration_seconds{job="blackbox_http"} > 5
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Slow response from {{ $labels.instance }} ({{ $value | printf \"%.1f\" }}s)"
+          description: "HTTP response time exceeds 5 seconds for {{ $labels.instance }}."
+
+      - alert: HighNginx5xxRate  # NOTE(review): confirm nginx-exporter exposes a `status` label; a stub_status scrape normally has no per-status counters
+        expr: rate(nginx_http_requests_total{status=~"5.."}[5m]) > 0.5
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "High nginx 5xx error rate"
+          description: "More than 0.5 req/s returning 5xx errors."
diff --git a/monitoring/alertmanager.yml b/monitoring/alertmanager.yml
new file mode 100644
index 0000000..82907c2
--- /dev/null
+++ b/monitoring/alertmanager.yml
@@ -0,0 +1,40 @@
+global:
+  resolve_timeout: 5m
+
+route:
+  receiver: telegram
+  group_by: [alertname, instance]
+  group_wait: 30s
+  group_interval: 5m
+  repeat_interval: 4h  # applies to alerts that match neither child route
+  routes:
+    - matchers:  # `matchers` supersedes the deprecated `match`
+        - severity="critical"
+      receiver: telegram
+      repeat_interval: 1h
+    - matchers:
+        - severity="warning"
+      receiver: telegram
+      repeat_interval: 6h
+
+receivers:
+  - name: telegram
+    telegram_configs:
+      - bot_token: "${TELEGRAM_BOT_TOKEN}"  # NOTE(review): confirm something substitutes this placeholder; Alertmanager is not known to expand ${VAR} itself
+        chat_id: ${TELEGRAM_CHAT_ID}  # NOTE(review): chat_id is documented as an integer; an unsubstituted placeholder is a string
+        parse_mode: HTML
+        message: |
+          {{ if eq .Status "firing" }}🔴{{ else }}✅{{ end }} <b>{{ .Status | toUpper }}</b>
+          {{ range .Alerts }}
+          <b>{{ .Labels.alertname }}</b>
+          {{ .Annotations.summary }}
+          {{ if .Annotations.description }}<i>{{ .Annotations.description }}</i>{{ end }}
+          {{ end }}
+          <code>Server: pw-server | {{ .ExternalURL }}</code>
+
+inhibit_rules:
+  - source_matchers:  # `*_matchers` supersede the deprecated `source_match` / `target_match`
+      - severity="critical"
+    target_matchers:
+      - severity="warning"
+    equal: [alertname, instance]  # NOTE(review): warning/critical rule pairs use different alertnames, so confirm this ever matches
diff --git a/monitoring/blackbox.yml b/monitoring/blackbox.yml
new file mode 100644
index 0000000..ab91b22
--- /dev/null
+++ b/monitoring/blackbox.yml
@@ -0,0 +1,15 @@
+modules:
+  http_2xx:  # referenced by the blackbox_http job in monitoring/prometheus.yml
+    prober: http
+    timeout: 10s
+    http:
+      valid_http_versions: ["HTTP/1.1", "HTTP/2.0"]
+      valid_status_codes: [200, 301, 302]
+      follow_redirects: true
+      preferred_ip_protocol: ip4
+      tls_config:
+        insecure_skip_verify: false  # certificate verification stays enabled
+
+  tcp_connect:  # referenced by the blackbox_tcp job in monitoring/prometheus.yml
+    prober: tcp
+    timeout: 5s
diff --git a/monitoring/grafana-datasources.yml b/monitoring/grafana-datasources.yml
new file mode 100644
index 0000000..bb009bb
--- /dev/null
+++ b/monitoring/grafana-datasources.yml
@@ -0,0 +1,9 @@
+apiVersion: 1
+
+datasources:
+  - name: Prometheus
+    type: prometheus
+    access: proxy
+    url: http://prometheus:9090  # Compose service name and container port
+    isDefault: true
+    editable: false
diff --git a/monitoring/prometheus.yml b/monitoring/prometheus.yml
new file mode 100644
index 0000000..88c2dcc
--- /dev/null
+++ b/monitoring/prometheus.yml
@@ -0,0 +1,83 @@
+global:
+  scrape_interval: 30s
+  evaluation_interval: 30s
+
+rule_files:
+  - /etc/prometheus/alert_rules.yml
+
+alerting:
+  alertmanagers:
+    - static_configs:
+        - targets:
+            - alertmanager:9093
+
+scrape_configs:
+  # ── Prometheus self-monitoring ──────────────────────────────────────
+  - job_name: prometheus
+    static_configs:
+      - targets: ["localhost:9090"]
+
+  # ── Host OS metrics (node_exporter) ────────────────────────────────
+  - job_name: node
+    static_configs:
+      - targets: ["node-exporter:9100"]
+
+  # ── Docker container metrics (cAdvisor) ────────────────────────────
+  - job_name: cadvisor
+    static_configs:
+      - targets: ["cadvisor:8080"]
+
+  # ── PostgreSQL (prod) ──────────────────────────────────────────────
+  - job_name: postgres_prod
+    static_configs:
+      - targets: ["postgres-exporter:9187"]
+        labels:
+          instance: prod
+
+  # ── nginx ──────────────────────────────────────────────────────────
+  - job_name: nginx
+    static_configs:
+      - targets: ["nginx-exporter:9113"]
+
+  # ── Blackbox probes (HTTP endpoint monitoring) ─────────────────────
+  - job_name: blackbox_http
+    metrics_path: /probe
+    params:
+      module: [http_2xx]  # module defined in monitoring/blackbox.yml
+    static_configs:
+      - targets:
+          - https://performancewest.net
+          - https://api.performancewest.net/api/v1/fcc/search?q=test
+          - https://dev.performancewest.net
+          - https://api.dev.performancewest.net/api/v1/fcc/search?q=test
+          - https://crm.performancewest.net
+          - https://lists.performancewest.net
+          - https://analytics.performancewest.net
+          - https://minio.performancewest.net/minio/health/live
+          - https://crypto.performancewest.net
+          - https://pay.performancewest.net
+    relabel_configs:
+      - source_labels: [__address__]
+        target_label: __param_target  # pass each listed URL as the ?target= query parameter
+      - source_labels: [__param_target]
+        target_label: instance  # alerts then show the probed URL as `instance`
+      - target_label: __address__
+        replacement: blackbox-exporter:9115  # the scrape itself goes to the exporter
+
+  # ── Blackbox TCP probes (port monitoring) ──────────────────────────
+  - job_name: blackbox_tcp
+    metrics_path: /probe
+    params:
+      module: [tcp_connect]  # module defined in monitoring/blackbox.yml
+    static_configs:
+      - targets:
+          - api-postgres:5432
+          - erpnext-mariadb:3306
+          - erpnext-redis:6379
+    relabel_configs:
+      - source_labels: [__address__]
+        target_label: __param_target  # pass each host:port as the ?target= query parameter
+      - source_labels: [__param_target]
+        target_label: instance  # alerts then show the probed host:port as `instance`
+      - target_label: __address__
+        replacement: blackbox-exporter:9115  # the scrape itself goes to the exporter