Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 12 additions & 3 deletions alerting/rules.yml
Original file line number Diff line number Diff line change
@@ -1,8 +1,17 @@
groups:
- name: WebServerRecording
  rules:
    # Recording rules for response time and errors.

    # Average request duration in seconds over the last 5 minutes:
    # rate of the duration sum divided by rate of the request count,
    # evaluated per matching label set.
    # NOTE(review): the `job:` prefix suggests job-level aggregation, but no
    # aggregation is applied here; expression left unchanged - confirm intent.
    - record: job:flask_request_duration:avg_rate5m
      expr: rate(flask_http_request_duration_seconds_sum[5m]) / rate(flask_http_request_duration_seconds_count[5m])

    # Fraction of requests answered with a 5xx status over the last 5 minutes.
    # Both sides are summed by job before dividing: binary operators match
    # series one-to-one on identical label sets, so without aggregation the
    # `status` label makes each 5xx series divide only by itself (ratio 1
    # whenever any 5xx traffic exists) instead of by the total request rate.
    - record: job:flask_requests:error_rate5m
      expr: |
        sum by (job) (rate(flask_http_request_total{status=~"5.."}[5m]))
        /
        sum by (job) (rate(flask_http_request_total[5m]))

- name: WebServerAlerts
rules:
- alert: HighResponseTime
expr: rate(http_request_duration_seconds_sum[5m]) / rate(http_request_duration_seconds_count[5m]) > 0.5
expr: job:flask_request_duration:avg_rate5m > 0.5
for: 5m
labels:
severity: warning
Expand All @@ -11,7 +20,7 @@ groups:
description: Average response time is above 500ms for 5 minutes

- alert: HighErrorRate
expr: rate(http_requests_total{status_code=~"5.."}[5m]) / rate(http_requests_total[5m]) > 0.05
expr: job:flask_requests:error_rate5m > 0.05
for: 5m
labels:
severity: critical
Expand All @@ -25,5 +34,5 @@ groups:
labels:
severity: critical
annotations:
summary: Service {{ $labels.instance }} down
summary: Service {{ $labels.instance }} is down
description: Service has been down for more than 1 minute