Skip to content

Commit 2dcce29

Browse files
committed
Added support for per-node alerts for CPU utilization, mem_used_percent, and root EBS volume disk_used_percent; optimized alarms
1 parent a3139f9 commit 2dcce29

File tree

5 files changed

+176
-75
lines changed

5 files changed

+176
-75
lines changed

CHANGELOG.md

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,11 @@
11
# GraphDB AWS Terraform Module Changelog
22

33
## 2.6.0
4-
4+
* Added support for monitoring the mem_used_percent metric via a CloudWatch alarm and dashboard per node.
5+
* Added support for monitoring the disk_used_percent metric for the root EBS volume via a CloudWatch alarm per node.
6+
* Added support for notifications when an alarm transitions from the ALARM state back to the OK state.
7+
* Changed the comparison operator to use GreaterThanOrEqualToThreshold for all alarms.
8+
* Added the treat_missing_data option to all alarms and set it to notBreaching.
59
* Added support for changing the volume size of the instance root EBS volume.
610

711
## 2.5.0

modules/monitoring/alarms.tf

Lines changed: 92 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
# Alarms
22

33
# Attempting to recover metric filter
4+
45
resource "aws_cloudwatch_log_metric_filter" "graphdb_attempting_to_recover_metric_filter" {
56
count = var.graphdb_node_count > 1 ? 1 : 0
67

@@ -25,14 +26,16 @@ resource "aws_cloudwatch_metric_alarm" "graphdb_attempting_to_recover_alarm" {
2526

2627
alarm_name = "al-${var.resource_name_prefix}-attempting-recover"
2728
alarm_description = "Attempting to recover through snapshot replication"
28-
comparison_operator = "GreaterThanThreshold"
29+
comparison_operator = "GreaterThanOrEqualToThreshold"
2930
metric_name = aws_cloudwatch_log_metric_filter.graphdb_attempting_to_recover_metric_filter[0].metric_transformation[0].name
3031
namespace = aws_cloudwatch_log_metric_filter.graphdb_attempting_to_recover_metric_filter[0].metric_transformation[0].namespace
3132
period = var.cloudwatch_period
3233
statistic = "SampleCount"
3334
evaluation_periods = var.cloudwatch_evaluation_periods
3435
threshold = "0"
3536
alarm_actions = [aws_sns_topic.graphdb_sns_topic.arn]
37+
ok_actions = [aws_sns_topic.graphdb_sns_topic.arn]
38+
treat_missing_data = "notBreaching"
3639

3740
depends_on = [aws_cloudwatch_log_metric_filter.graphdb_attempting_to_recover_metric_filter[0]]
3841
}
@@ -59,20 +62,23 @@ resource "aws_cloudwatch_log_metric_filter" "graphdb_low_disk_space_metric_filte
5962
resource "aws_cloudwatch_metric_alarm" "graphdb_low_disk_space_alarm" {
6063
alarm_name = "al-${var.resource_name_prefix}-low-disk-space"
6164
alarm_description = "Low Disk Space"
62-
comparison_operator = "GreaterThanThreshold"
65+
comparison_operator = "GreaterThanOrEqualToThreshold"
6366
metric_name = aws_cloudwatch_log_metric_filter.graphdb_low_disk_space_metric_filter.metric_transformation[0].name
6467
namespace = aws_cloudwatch_log_metric_filter.graphdb_low_disk_space_metric_filter.metric_transformation[0].namespace
6568
period = var.cloudwatch_period
6669
statistic = "SampleCount"
6770
evaluation_periods = var.cloudwatch_evaluation_periods
6871
threshold = "0"
6972
alarm_actions = [aws_sns_topic.graphdb_sns_topic.arn]
73+
ok_actions = [aws_sns_topic.graphdb_sns_topic.arn]
74+
treat_missing_data = "notBreaching"
7075

7176
depends_on = [aws_cloudwatch_log_metric_filter.graphdb_low_disk_space_metric_filter]
7277
}
7378

7479
locals {
7580
# Builds a list of instance hostnames
81+
7682
instance_hostnames = [
7783
for i in range(1, var.graphdb_node_count + 1) :
7884
var.route53_zone_dns_name != null ?
@@ -86,13 +92,15 @@ resource "aws_cloudwatch_metric_alarm" "heap_usage_alarm" {
8692

8793
alarm_name = "al-${var.resource_name_prefix}-heap-memory-usage-${each.key}"
8894
alarm_description = "Triggers if ${each.key}'s heap usage exceeds threshold of its total memory"
89-
comparison_operator = "GreaterThanThreshold"
95+
comparison_operator = "GreaterThanOrEqualToThreshold"
9096
threshold = var.graphdb_memory_utilization_threshold
9197
evaluation_periods = 1
92-
treat_missing_data = "missing"
98+
treat_missing_data = "notBreaching"
9399
alarm_actions = [aws_sns_topic.graphdb_sns_topic.arn]
100+
ok_actions = [aws_sns_topic.graphdb_sns_topic.arn]
94101

95102
# Define the metric query for heap used memory
103+
96104
metric_query {
97105
id = "m1"
98106
metric {
@@ -108,6 +116,7 @@ resource "aws_cloudwatch_metric_alarm" "heap_usage_alarm" {
108116
}
109117

110118
# Defines the metric query for total memory
119+
111120
metric_query {
112121
id = "m2"
113122
metric {
@@ -123,6 +132,7 @@ resource "aws_cloudwatch_metric_alarm" "heap_usage_alarm" {
123132
}
124133

125134
# Defines the expression to calculate heap usage percentage
135+
126136
metric_query {
127137
id = "e1"
128138
expression = "(m1 / m2) * 100"
@@ -131,24 +141,58 @@ resource "aws_cloudwatch_metric_alarm" "heap_usage_alarm" {
131141
}
132142
}
133143

134-
# Alarm for CPU Utilization for Autoscaling Group
144+
# Alarm for Memory Used Percent per node
135145

136-
resource "aws_cloudwatch_metric_alarm" "graphdb_cpu_utilization" {
137-
alarm_name = "al-${var.resource_name_prefix}-cpu-utilization"
138-
alarm_description = "Alarm will trigger if CPU utilization is above 80%"
139-
comparison_operator = "GreaterThanThreshold"
146+
resource "aws_cloudwatch_metric_alarm" "mem_used_percent_per_instance" {
147+
for_each = local.id_to_name
148+
149+
alarm_name = "al-${var.resource_name_prefix}-${each.value}-memory-used-percent"
150+
alarm_description = "mem_used_percent on ${each.value} >= ${var.graphdb_memory_utilization_threshold}%"
151+
namespace = "CWAgent"
152+
metric_name = "mem_used_percent"
153+
unit = "Percent"
154+
statistic = "Maximum"
155+
period = var.cloudwatch_period
140156
evaluation_periods = var.cloudwatch_evaluation_periods
157+
comparison_operator = "GreaterThanOrEqualToThreshold"
158+
threshold = var.graphdb_memory_utilization_threshold
159+
treat_missing_data = "notBreaching"
160+
alarm_actions = [aws_sns_topic.graphdb_sns_topic.arn]
161+
ok_actions = [aws_sns_topic.graphdb_sns_topic.arn]
162+
163+
dimensions = {
164+
AutoScalingGroupName = var.resource_name_prefix
165+
ImageId = data.aws_instance.by_id[each.key].ami
166+
InstanceId = each.key
167+
InstanceType = data.aws_instance.by_id[each.key].instance_type
168+
}
169+
170+
tags = {
171+
InstanceId = each.key
172+
}
173+
}
174+
175+
# Alarm for CPU Utilization for Node
176+
177+
resource "aws_cloudwatch_metric_alarm" "graphdb_cpu_utilization_per_instance" {
178+
for_each = local.id_to_name
179+
180+
alarm_name = "al-${var.resource_name_prefix}-${each.value}-cpu-utilization"
181+
alarm_description = "CPU utilization on ${each.value} >= ${var.cloudwatch_cpu_utilization_threshold}%"
182+
namespace = "AWS/EC2"
183+
metric_name = "CPUUtilization"
184+
unit = "Percent"
185+
statistic = "Average"
141186
period = var.cloudwatch_period
142-
statistic = "Maximum"
187+
evaluation_periods = var.cloudwatch_evaluation_periods
188+
comparison_operator = "GreaterThanOrEqualToThreshold"
143189
threshold = var.cloudwatch_cpu_utilization_threshold
190+
treat_missing_data = "notBreaching"
144191
alarm_actions = [aws_sns_topic.graphdb_sns_topic.arn]
145-
146-
metric_name = "CPUUtilization"
147-
namespace = "AWS/EC2"
148-
unit = "Percent"
192+
ok_actions = [aws_sns_topic.graphdb_sns_topic.arn]
149193

150194
dimensions = {
151-
AutoScalingGroupName = var.resource_name_prefix
195+
InstanceId = each.key
152196
}
153197
}
154198

@@ -162,9 +206,10 @@ resource "aws_cloudwatch_metric_alarm" "graphdb_nodes_disconnected" {
162206
evaluation_periods = var.cloudwatch_evaluation_periods
163207
datapoints_to_alarm = 1
164208
threshold = 0
165-
comparison_operator = "GreaterThanThreshold"
166-
treat_missing_data = "missing"
209+
comparison_operator = "GreaterThanOrEqualToThreshold"
210+
treat_missing_data = "notBreaching"
167211
alarm_actions = [aws_sns_topic.graphdb_sns_topic.arn]
212+
ok_actions = [aws_sns_topic.graphdb_sns_topic.arn]
168213

169214
metric_query {
170215
id = "q1"
@@ -174,3 +219,33 @@ resource "aws_cloudwatch_metric_alarm" "graphdb_nodes_disconnected" {
174219
period = var.cloudwatch_period
175220
}
176221
}
222+
223+
# Alarm for Root Disk Used Percent per node
224+
225+
resource "aws_cloudwatch_metric_alarm" "graphdb_root_disk_used_percent_exact" {
226+
for_each = local.id_to_name
227+
228+
alarm_name = "al-${var.resource_name_prefix}-${each.value}-root-ebs-volume-used-percent"
229+
alarm_description = "Root filesystem used percent on ${each.value} >= 80%"
230+
namespace = "CWAgent"
231+
metric_name = "disk_used_percent"
232+
unit = "Percent"
233+
statistic = "Maximum"
234+
period = var.cloudwatch_period
235+
evaluation_periods = var.cloudwatch_evaluation_periods
236+
comparison_operator = "GreaterThanOrEqualToThreshold"
237+
threshold = 80
238+
treat_missing_data = "notBreaching"
239+
alarm_actions = [aws_sns_topic.graphdb_sns_topic.arn]
240+
ok_actions = [aws_sns_topic.graphdb_sns_topic.arn]
241+
242+
dimensions = {
243+
AutoScalingGroupName = var.resource_name_prefix
244+
InstanceId = each.key
245+
ImageId = data.aws_instance.by_id[each.key].ami
246+
InstanceType = data.aws_instance.by_id[each.key].instance_type
247+
path = "/"
248+
device = "nvme0n1p1"
249+
fstype = "ext4"
250+
}
251+
}

modules/monitoring/cloudwatch_agent_config.json.tpl

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -117,7 +117,8 @@
117117
"metrics": {
118118
"aggregation_dimensions": [
119119
[
120-
"AutoScalingGroupName"
120+
"AutoScalingGroupName",
121+
"InstanceId"
121122
]
122123
],
123124
"append_dimensions": {

modules/monitoring/data.tf

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
data "aws_instances" "asg_members" {
2+
filter {
3+
name = "tag:aws:autoscaling:groupName"
4+
values = [var.resource_name_prefix]
5+
}
6+
}
7+
8+
data "aws_instance" "by_id" {
9+
for_each = toset(data.aws_instances.asg_members.ids)
10+
instance_id = each.value
11+
}
12+
13+
locals {
14+
id_to_name = {
15+
for id, inst in data.aws_instance.by_id :
16+
id => coalesce(try(inst.tags["Name"], null), id)
17+
}
18+
}

modules/monitoring/graphdb_dashboard.json

Lines changed: 59 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -80,62 +80,64 @@
8080
"title": "GraphDB Memory Used % for the Auto Scaling Group"
8181
}
8282
},
83-
{
84-
"height": 6,
85-
"width": 6,
86-
"y": 6,
87-
"x": 6,
88-
"type": "metric",
89-
"properties": {
90-
"metrics": [
91-
[ { "expression": "SELECT AVG(graphdb_data_dir_free) FROM \"${resource_name_prefix}\" GROUP BY host", "label": "Query1", "id": "q1", "region": "${aws_region}" } ]
92-
],
93-
"region": "${aws_region}",
94-
"stacked": false,
95-
"view": "timeSeries",
96-
"period": 300,
97-
"stat": "Average",
98-
"yAxis": {
99-
"left": {
100-
"label": "Gigabytes",
101-
"showUnits": false
102-
},
103-
"right": {
104-
"label": "",
105-
"showUnits": false
106-
}
107-
},
108-
"title": "GraphDB Data Dir Free per instance"
109-
}
110-
},
111-
{
112-
"height": 6,
113-
"width": 6,
114-
"y": 6,
115-
"x": 0,
116-
"type": "metric",
117-
"properties": {
118-
"metrics": [
119-
[ { "expression": "SELECT AVG(graphdb_data_dir_used) FROM \"${resource_name_prefix}\" GROUP BY host", "label": "Query1", "id": "q1", "region": "${aws_region}" } ]
120-
],
121-
"region": "${aws_region}",
122-
"stacked": false,
123-
"view": "timeSeries",
124-
"period": 300,
125-
"stat": "Average",
126-
"yAxis": {
127-
"left": {
128-
"label": "Gigabytes",
129-
"showUnits": false
130-
},
131-
"right": {
132-
"label": "",
133-
"showUnits": false
134-
}
135-
},
136-
"title": "GraphDB Data Dir Used per instance"
137-
}
138-
},
83+
{
84+
"height": 6,
85+
"width": 6,
86+
"y": 6,
87+
"x": 6,
88+
"type": "metric",
89+
"properties": {
90+
"metrics": [
91+
[ { "expression": "SELECT AVG(graphdb_data_dir_free) FROM \"${resource_name_prefix}\" GROUP BY host", "label": "Query1", "id": "q1", "region": "${aws_region}" } ]
92+
],
93+
"region": "${aws_region}",
94+
"stacked": false,
95+
"view": "timeSeries",
96+
"period": 300,
97+
"stat": "Average",
98+
"yAxis": {
99+
"left": {
100+
"min": 0,
101+
"label": "Gigabytes",
102+
"showUnits": false
103+
},
104+
"right": {
105+
"label": "",
106+
"showUnits": false
107+
}
108+
},
109+
"title": "GraphDB Data Dir Free per instance"
110+
}
111+
},
112+
{
113+
"height": 6,
114+
"width": 6,
115+
"y": 6,
116+
"x": 0,
117+
"type": "metric",
118+
"properties": {
119+
"metrics": [
120+
[ { "expression": "SELECT AVG(graphdb_data_dir_used) FROM \"${resource_name_prefix}\" GROUP BY host", "label": "Query1", "id": "q1", "region": "${aws_region}" } ]
121+
],
122+
"region": "${aws_region}",
123+
"stacked": false,
124+
"view": "timeSeries",
125+
"period": 300,
126+
"stat": "Average",
127+
"yAxis": {
128+
"left": {
129+
"min": 0,
130+
"label": "Gigabytes",
131+
"showUnits": false
132+
},
133+
"right": {
134+
"label": "",
135+
"showUnits": false
136+
}
137+
},
138+
"title": "GraphDB Data Dir Used per instance"
139+
}
140+
},
139141
{
140142
"height": 6,
141143
"width": 6,
@@ -153,6 +155,7 @@
153155
"stat": "Average",
154156
"yAxis": {
155157
"left": {
158+
"min": 0,
156159
"label": "Count",
157160
"showUnits": false
158161
},

0 commit comments

Comments
 (0)