Add Prometheus metrics and monitoring for Redis

bmcdorman · bmcdorman · commit 290d2213b543 · 2025-10-09T07:43:05.000-07:00
- Add prom-client dependency for metrics
- Create /metrics endpoint exposing Prometheus metrics
- Track Redis connection status (database_redis_connection_status)
- Track Redis operation success/failure counts by operation type
- Update RedisCache to report connection events and operation metrics
- Add comprehensive monitoring documentation

Metrics exposed:
- database_redis_connection_status: 1=connected, 0=disconnected
- database_redis_operation_success_total: successful operations by type
- database_redis_operation_failures_total: failed operations by type
diff --git a/REDIS_MONITORING.md b/REDIS_MONITORING.md
@@ -0,0 +1,87 @@
+# Redis Monitoring and Alerting
+
+This document describes the Redis monitoring and alerting setup for the database service.
+
+## Overview
+
+The database service now includes comprehensive monitoring for Redis connection status and operation health. When Redis goes down, the application continues to function without caching, and alerts are triggered to notify operators.
+
+## Metrics
+
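+The following Redis metrics are exposed at `/metrics` (the same registry also declares `database_http_requests_total`, a general HTTP request counter that is not Redis-specific):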
+
+### `database_redis_connection_status`
+- **Type**: Gauge
+- **Values**: `1` (connected) or `0` (disconnected)
+- **Description**: Current Redis connection status
+
+### `database_redis_operation_success_total`
+- **Type**: Counter
+- **Labels**: `operation` (get, set, remove)
+- **Description**: Total number of successful Redis operations
+
+### `database_redis_operation_failures_total`
+- **Type**: Counter
+- **Labels**: `operation` (get, set, remove)
+- **Description**: Total number of failed Redis operations
+
+## Alerts
+
+### DatabaseRedisDown
+- **Severity**: Critical
+- **Condition**: Redis connection is down for more than 1 minute
+- **Description**: The database service has lost connection to Redis. Cache is unavailable but the service continues to operate.
+- **Action**: Check Redis pod status, network connectivity, and Redis logs.
+
+### DatabaseRedisOperationFailures
+- **Severity**: Warning
+- **Condition**: Redis operations failing at rate > 0.1/second for 2 minutes
+- **Description**: Redis operations are experiencing failures
+- **Action**: Check Redis health, network latency, and error logs.
+
+### DatabaseRedisHighFailureRate
+- **Severity**: Critical
+- **Condition**: Redis operations failing at rate > 1/second for 1 minute
+- **Description**: Critical failure rate - service is degraded
+- **Action**: Investigate immediately. Check Redis status, restart if necessary.
+
+## Grafana Dashboard
+
+A dedicated Grafana dashboard "Database Redis Monitoring" provides:
+
+1. **Redis Connection Status** - Real-time connection state
+2. **Operation Success Rate** - Rate of successful operations by type
+3. **Operation Failure Rate** - Rate of failed operations by type
+4. **Success Rate %** - Overall success percentage
+5. **Connection History** - Timeline of connection up/down events
+
+Import the dashboard from: `Simulator/grafana-dashboards/database-redis-monitoring.json`
+
+## Deployment
+
+The monitoring stack is deployed automatically with the database Helm chart:
+
+- **ServiceMonitor**: Scrapes `/metrics` endpoint every 30 seconds
+- **PrometheusRule**: Defines alert rules
+- **Service**: Labeled for Prometheus discovery
+
+## Testing Alerting
+
+To test the alerting system:
+
+1. Deploy to staging environment
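+2. Stop Redis and keep it down for more than 1 minute: `kubectl delete pod -l app=redis` (if a controller recreates the pod immediately, scale that workload to 0 replicas instead so the outage outlasts the alert window)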
+3. Verify that `/metrics` reports `database_redis_connection_status 0`
+4. Wait 1 minute for `DatabaseRedisDown` alert to fire
+5. Check Alertmanager UI for active alerts
+6. Restart Redis and verify recovery
+
+## Configuration
+
+Alert routing and notification channels are configured in Alertmanager. Ensure the following labels are routed appropriately:
+
+- `severity: critical` → PagerDuty / immediate notifications
+- `severity: warning` → Slack / email notifications
+- `component: database`
+- `service: redis`
+
diff --git a/package.json b/package.json
@@ -16,7 +16,8 @@
     "@google-cloud/storage": "^6.9.3",
     "fastify": "^4.9.2",
     "firebase-admin": "^11.2.0",
-    "ioredis": "^5.2.3"
+    "ioredis": "^5.2.3",
+    "prom-client": "^14.2.0"
   },
   "devDependencies": {
     "@types/argparse": "^2.0.10",
diff --git a/src/RedisCache.ts b/src/RedisCache.ts
@@ -2,6 +2,7 @@ import Cache from './Cache';
 
 import Redis, { RedisOptions } from 'ioredis';
 import Selector from './model/Selector';
+import { redisConnectionGauge, redisFailureCounter, redisSuccessCounter } from './metrics';
 
 class RedisCache implements Cache {
   private static DEFAULT_TTL = 60 * 60 * 24 * 7;
@@ -22,8 +23,29 @@ class RedisCache implements Cache {
       enableOfflineQueue: false,   // Don't queue commands when disconnected
     });
     
+    this.redis_.on('connect', () => {
+      console.log('Redis connected');
+      redisConnectionGauge.set(1);
+    });
+
+    this.redis_.on('ready', () => {
+      console.log('Redis ready');
+      redisConnectionGauge.set(1);
+    });
+
     this.redis_.on('error', (err) => {
       console.error('Redis error (app will continue without cache):', err.message);
+      redisConnectionGauge.set(0);
+    });
+
+    this.redis_.on('close', () => {
+      console.warn('Redis connection closed');
+      redisConnectionGauge.set(0);
+    });
+
+    this.redis_.on('end', () => {
+      console.warn('Redis connection ended');
+      redisConnectionGauge.set(0);
     });
   }
 
@@ -34,10 +56,12 @@ class RedisCache implements Cache {
   async get(selector: Selector): Promise<object | null> {
     try {
       const data = await this.redis_.get(RedisCache.key_(selector));
-      if (!data) return null;
-      return JSON.parse(data);
+      const value = data ? JSON.parse(data) : null; // parse first: a corrupt payload must count as one failure, not as success + failure
+      redisSuccessCounter.inc({ operation: 'get' }); // reached only when both the GET and the parse succeeded (cache miss included)
+      return value;
     } catch (err) {
       console.error('Redis GET failed, continuing without cache:', err);
+      redisFailureCounter.inc({ operation: 'get' }); // error is swallowed below; callers just see a cache miss
       return null;
     }
   }
@@ -46,19 +70,24 @@ class RedisCache implements Cache {
     try {
       if (!value) {
         await this.redis_.del(RedisCache.key_(selector));
+        redisSuccessCounter.inc({ operation: 'set' });
         return;
       }
       await this.redis_.setex(RedisCache.key_(selector), RedisCache.DEFAULT_TTL, JSON.stringify(value));
+      redisSuccessCounter.inc({ operation: 'set' });
     } catch (err) {
       console.error('Redis SET failed, continuing without cache:', err);
+      redisFailureCounter.inc({ operation: 'set' });
     }
   }
 
   async remove(selector: Selector): Promise<void> {
     try {
       await this.redis_.del(RedisCache.key_(selector));
+      redisSuccessCounter.inc({ operation: 'remove' }); // reached only if the awaited DEL did not throw
     } catch (err) {
       console.error('Redis DEL failed, continuing without cache:', err);
+      redisFailureCounter.inc({ operation: 'remove' }); // error is not rethrown; this counter and the log line are the only signals
     }
   }
 }
diff --git a/src/index.ts b/src/index.ts
@@ -12,6 +12,7 @@ import authorize, { AuthorizeResult } from './authorize';
 import { CHALLENGE_COMPLETION_COLLECTION, USER_COLLECTION } from './model/constants';
 
 import bigStore from './big-store';
+import { register as metricsRegister } from './metrics';
 
 const UNAUTHORIZED_RESULT = { message: 'Unauthorized' };
 const NOT_FOUND_RESULT = { message: 'Not Found' };
@@ -47,6 +48,12 @@ app.get('/', async (request, reply) => {
   reply.send({ database: 'alive' });
 });
 
+// Prometheus scrape endpoint: responds with the serialized contents of the custom metrics registry.
+app.get('/metrics', async (request, reply) => { // NOTE(review): no authenticate() call here, unlike '/:collection/:id' — confirm unauthenticated access is intended
+  reply.header('Content-Type', metricsRegister.contentType);
+  reply.send(await metricsRegister.metrics());
+});
+
 app.get('/:collection/:id', async (request, reply) => {
   const token = await authenticate(request);
 
diff --git a/src/metrics.ts b/src/metrics.ts
@@ -0,0 +1,39 @@
+import { Registry, Gauge, Counter } from 'prom-client';
+
+// Dedicated registry: every metric below registers on it, and the /metrics route in index.ts serializes it.
+export const register = new Registry();
+
+// Redis connection status gauge (1 = connected, 0 = disconnected); RedisCache sets it from its ioredis event handlers.
+export const redisConnectionGauge = new Gauge({
+  name: 'database_redis_connection_status',
+  help: 'Redis connection status (1 = connected, 0 = disconnected)',
+  registers: [register],
+});
+
+// Failed Redis operations; RedisCache increments it from the catch blocks of get/set/remove.
+export const redisFailureCounter = new Counter({
+  name: 'database_redis_operation_failures_total',
+  help: 'Total number of failed Redis operations',
+  labelNames: ['operation'], // 'get', 'set', 'remove'
+  registers: [register],
+});
+
+// Successful Redis operations, labelled by 'operation' with the same values as the failure counter.
+export const redisSuccessCounter = new Counter({
+  name: 'database_redis_operation_success_total',
+  help: 'Total number of successful Redis operations',
+  labelNames: ['operation'],
+  registers: [register],
+});
+
+// HTTP request counter. NOTE(review): nothing visible in this change increments it — confirm a request hook is wired up elsewhere.
+export const httpRequestCounter = new Counter({
+  name: 'database_http_requests_total',
+  help: 'Total number of HTTP requests',
+  labelNames: ['method', 'route', 'status_code'],
+  registers: [register],
+});
+
+// Report 0 (disconnected) from module load until RedisCache sees a 'connect' or 'ready' event and sets 1.
+redisConnectionGauge.set(0);
+