Skip to content

Commit 140fa3a

Browse files
committed
Added support for alerts per node for cpu utilization, mem_used_percent, and root ebs volume disk_used_percent, Optimized alarms
1 parent 96677f5 commit 140fa3a

File tree

4 files changed

+82
-37
lines changed

4 files changed

+82
-37
lines changed

CHANGELOG.md

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,11 @@
11
# GraphDB AWS Terraform Module Changelog
22

3-
## 2.5.1
4-
* Added support for monitoring mem_used_percent metric via Cloudwatch alarm and dashboard.
5-
* Added support for monitoring the disk_used_percent and disk_free metric via Cloudwatch alarm and dashboard.
3+
## 2.6.0
4+
* Added support for monitoring mem_used_percent metric via Cloudwatch alarm and dashboard per node.
5+
* Added support for monitoring the disk_used_percent metric for the root EBS volume via Cloudwatch alarm per node.
6+
* Added support for alarm notifications when an alarm transitions from the ALARM state back to the OK state.
7+
* Changed the comparison operator to use GreaterThanOrEqualToThreshold for all alarms.
8+
* Added the treat_missing_data option to all alarms and set it to notBreaching.
69

710
## 2.5.0
811
* Added support for managing Route 53 hosted zones and DNS records via external_dns_records module.

modules/monitoring/alarms.tf

Lines changed: 56 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
# Alarms
22

33
# Attempting to recover metric filter
4+
45
resource "aws_cloudwatch_log_metric_filter" "graphdb_attempting_to_recover_metric_filter" {
56
count = var.graphdb_node_count > 1 ? 1 : 0
67

@@ -77,6 +78,7 @@ resource "aws_cloudwatch_metric_alarm" "graphdb_low_disk_space_alarm" {
7778

7879
locals {
7980
# Builds a list of instance hostnames
81+
8082
instance_hostnames = [
8183
for i in range(1, var.graphdb_node_count + 1) :
8284
var.route53_zone_dns_name != null ?
@@ -98,6 +100,7 @@ resource "aws_cloudwatch_metric_alarm" "heap_usage_alarm" {
98100
ok_actions = [aws_sns_topic.graphdb_sns_topic.arn]
99101

100102
# Define the metric query for heap used memory
103+
101104
metric_query {
102105
id = "m1"
103106
metric {
@@ -113,6 +116,7 @@ resource "aws_cloudwatch_metric_alarm" "heap_usage_alarm" {
113116
}
114117

115118
# Defines the metric query for total memory
119+
116120
metric_query {
117121
id = "m2"
118122
metric {
@@ -128,6 +132,7 @@ resource "aws_cloudwatch_metric_alarm" "heap_usage_alarm" {
128132
}
129133

130134
# Defines the expression to calculate heap usage percentage
135+
131136
metric_query {
132137
id = "e1"
133138
expression = "(m1 / m2) * 100"
@@ -136,49 +141,58 @@ resource "aws_cloudwatch_metric_alarm" "heap_usage_alarm" {
136141
}
137142
}
138143

139-
# Alarm for Memory Used Percent
144+
# Alarm for Memory Used Percent per node
140145

141-
resource "aws_cloudwatch_metric_alarm" "graphdb_memory_used_percent" {
142-
alarm_name = "al-${var.resource_name_prefix}-memory-used-percent"
143-
alarm_description = "Alarm will trigger if Memory used percent is above 80%"
144-
comparison_operator = "GreaterThanOrEqualToThreshold"
145-
evaluation_periods = var.cloudwatch_evaluation_periods
146-
period = var.cloudwatch_period
146+
resource "aws_cloudwatch_metric_alarm" "mem_used_percent_per_instance" {
147+
for_each = local.id_to_name
148+
149+
alarm_name = "al-${var.resource_name_prefix}-${each.value}-memory-used-percent"
150+
alarm_description = "mem_used_percent on ${each.value} >= ${var.graphdb_memory_utilization_threshold}%"
151+
namespace = "CWAgent"
152+
metric_name = "mem_used_percent"
153+
unit = "Percent"
147154
statistic = "Maximum"
155+
period = var.cloudwatch_period
156+
evaluation_periods = var.cloudwatch_evaluation_periods
157+
comparison_operator = "GreaterThanOrEqualToThreshold"
148158
threshold = var.graphdb_memory_utilization_threshold
159+
treat_missing_data = "notBreaching"
149160
alarm_actions = [aws_sns_topic.graphdb_sns_topic.arn]
150161
ok_actions = [aws_sns_topic.graphdb_sns_topic.arn]
151-
treat_missing_data = "notBreaching"
152-
153-
metric_name = "mem_used_percent"
154-
namespace = "CWAgent"
155-
unit = "Percent"
156162

157163
dimensions = {
158164
AutoScalingGroupName = var.resource_name_prefix
165+
ImageId = data.aws_instance.by_id[each.key].ami
166+
InstanceId = each.key
167+
InstanceType = data.aws_instance.by_id[each.key].instance_type
168+
}
169+
170+
tags = {
171+
InstanceId = each.key
159172
}
160173
}
161174

162-
# Alarm for CPU Utilization for Autoscaling Group
175+
# Alarm for CPU Utilization for Node
163176

164-
resource "aws_cloudwatch_metric_alarm" "graphdb_cpu_utilization" {
165-
alarm_name = "al-${var.resource_name_prefix}-cpu-utilization"
166-
alarm_description = "Alarm will trigger if CPU utilization is above 80%"
167-
comparison_operator = "GreaterThanOrEqualToThreshold"
168-
evaluation_periods = var.cloudwatch_evaluation_periods
177+
resource "aws_cloudwatch_metric_alarm" "graphdb_cpu_utilization_per_instance" {
178+
for_each = local.id_to_name
179+
180+
alarm_name = "al-${var.resource_name_prefix}-${each.value}-cpu-utilization"
181+
alarm_description = "CPU utilization on ${each.value} >= ${var.cloudwatch_cpu_utilization_threshold}%"
182+
namespace = "AWS/EC2"
183+
metric_name = "CPUUtilization"
184+
unit = "Percent"
185+
statistic = "Average"
169186
period = var.cloudwatch_period
170-
statistic = "Maximum"
187+
evaluation_periods = var.cloudwatch_evaluation_periods
188+
comparison_operator = "GreaterThanOrEqualToThreshold"
171189
threshold = var.cloudwatch_cpu_utilization_threshold
190+
treat_missing_data = "notBreaching"
172191
alarm_actions = [aws_sns_topic.graphdb_sns_topic.arn]
173192
ok_actions = [aws_sns_topic.graphdb_sns_topic.arn]
174-
treat_missing_data = "notBreaching"
175-
176-
metric_name = "CPUUtilization"
177-
namespace = "AWS/EC2"
178-
unit = "Percent"
179193

180194
dimensions = {
181-
AutoScalingGroupName = var.resource_name_prefix
195+
InstanceId = each.key
182196
}
183197
}
184198

@@ -206,23 +220,32 @@ resource "aws_cloudwatch_metric_alarm" "graphdb_nodes_disconnected" {
206220
}
207221
}
208222

209-
resource "aws_cloudwatch_metric_alarm" "graphdb_root_disk_used_percent" {
210-
alarm_name = "al-${var.resource_name_prefix}-root-disk-used-percent"
211-
alarm_description = "Triggers when root filesystem used percent is above 90%"
212-
comparison_operator = "GreaterThanOrEqualToThreshold"
213-
metric_name = "disk_used_percent"
223+
# Alarm for Root Disk Used Percent per node
224+
225+
resource "aws_cloudwatch_metric_alarm" "graphdb_root_disk_used_percent_exact" {
226+
for_each = local.id_to_name
227+
228+
alarm_name = "al-${var.resource_name_prefix}-${each.value}-root-ebs-volume-used-percent"
229+
alarm_description = "Root filesystem used percent on ${each.value} >= 80%"
214230
namespace = "CWAgent"
231+
metric_name = "disk_used_percent"
232+
unit = "Percent"
215233
statistic = "Maximum"
216234
period = var.cloudwatch_period
217235
evaluation_periods = var.cloudwatch_evaluation_periods
218-
threshold = 90
219-
unit = "Percent"
236+
comparison_operator = "GreaterThanOrEqualToThreshold"
237+
threshold = 80
238+
treat_missing_data = "notBreaching"
220239
alarm_actions = [aws_sns_topic.graphdb_sns_topic.arn]
221240
ok_actions = [aws_sns_topic.graphdb_sns_topic.arn]
222-
treat_missing_data = "notBreaching"
223241

224242
dimensions = {
225243
AutoScalingGroupName = var.resource_name_prefix
244+
InstanceId = each.key
245+
ImageId = data.aws_instance.by_id[each.key].ami
246+
InstanceType = data.aws_instance.by_id[each.key].instance_type
226247
path = "/"
248+
device = "nvme0n1p1"
249+
fstype = "ext4"
227250
}
228251
}

modules/monitoring/cloudwatch_agent_config.json.tpl

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -117,7 +117,8 @@
117117
"metrics": {
118118
"aggregation_dimensions": [
119119
[
120-
"AutoScalingGroupName"
120+
"AutoScalingGroupName",
121+
"InstanceId"
121122
]
122123
],
123124
"append_dimensions": {

modules/monitoring/data.tf

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
data "aws_instances" "asg_members" {
2+
filter {
3+
name = "tag:aws:autoscaling:groupName"
4+
values = [var.resource_name_prefix]
5+
}
6+
}
7+
8+
data "aws_instance" "by_id" {
9+
for_each = toset(data.aws_instances.asg_members.ids)
10+
instance_id = each.value
11+
}
12+
13+
locals {
14+
id_to_name = {
15+
for id, inst in data.aws_instance.by_id :
16+
id => coalesce(try(inst.tags["Name"], null), id)
17+
}
18+
}

0 commit comments

Comments
 (0)