11# Alarms
22
33# Attempting to recover metric filter
4+
45resource "aws_cloudwatch_log_metric_filter" "graphdb_attempting_to_recover_metric_filter" {
56 count = var. graphdb_node_count > 1 ? 1 : 0
67
@@ -77,6 +78,7 @@ resource "aws_cloudwatch_metric_alarm" "graphdb_low_disk_space_alarm" {
7778
7879locals {
7980 # Builds a list of instance hostnames
81+
8082 instance_hostnames = [
8183 for i in range (1 , var. graphdb_node_count + 1 ) :
8284 var . route53_zone_dns_name != null ?
@@ -98,6 +100,7 @@ resource "aws_cloudwatch_metric_alarm" "heap_usage_alarm" {
98100 ok_actions = [aws_sns_topic . graphdb_sns_topic . arn ]
99101
100102 # Define the metric query for heap used memory
103+
101104 metric_query {
102105 id = " m1"
103106 metric {
@@ -113,6 +116,7 @@ resource "aws_cloudwatch_metric_alarm" "heap_usage_alarm" {
113116 }
114117
115118 # Defines the metric query for total memory
119+
116120 metric_query {
117121 id = " m2"
118122 metric {
@@ -128,6 +132,7 @@ resource "aws_cloudwatch_metric_alarm" "heap_usage_alarm" {
128132 }
129133
130134 # Defines the expression to calculate heap usage percentage
135+
131136 metric_query {
132137 id = " e1"
133138 expression = " (m1 / m2) * 100"
@@ -136,49 +141,58 @@ resource "aws_cloudwatch_metric_alarm" "heap_usage_alarm" {
136141 }
137142}
138143
139- # Alarm for Memory Used Percent
144+ # Alarm for Memory Used Percent per node
140145
141- resource "aws_cloudwatch_metric_alarm" "graphdb_memory_used_percent" {
142- alarm_name = " al-${ var . resource_name_prefix } -memory-used-percent"
143- alarm_description = " Alarm will trigger if Memory used percent is above 80%"
144- comparison_operator = " GreaterThanOrEqualToThreshold"
145- evaluation_periods = var. cloudwatch_evaluation_periods
146- period = var. cloudwatch_period
146+ resource "aws_cloudwatch_metric_alarm" "mem_used_percent_per_instance" {
147+ for_each = local. id_to_name
148+
149+ alarm_name = " al-${ var . resource_name_prefix } -${ each . value } -memory-used-percent"
150+ alarm_description = " mem_used_percent on ${ each . value } >= ${ var . graphdb_memory_utilization_threshold } %"
151+ namespace = " CWAgent"
152+ metric_name = " mem_used_percent"
153+ unit = " Percent"
147154 statistic = " Maximum"
155+ period = var. cloudwatch_period
156+ evaluation_periods = var. cloudwatch_evaluation_periods
157+ comparison_operator = " GreaterThanOrEqualToThreshold"
148158 threshold = var. graphdb_memory_utilization_threshold
159+ treat_missing_data = " notBreaching"
149160 alarm_actions = [aws_sns_topic . graphdb_sns_topic . arn ]
150161 ok_actions = [aws_sns_topic . graphdb_sns_topic . arn ]
151- treat_missing_data = " notBreaching"
152-
153- metric_name = " mem_used_percent"
154- namespace = " CWAgent"
155- unit = " Percent"
156162
157163 dimensions = {
158164 AutoScalingGroupName = var.resource_name_prefix
165+ ImageId = data.aws_instance.by_id[each.key].ami
166+ InstanceId = each.key
167+ InstanceType = data.aws_instance.by_id[each.key].instance_type
168+ }
169+
170+ tags = {
171+ InstanceId = each.key
159172 }
160173}
161174
162- # Alarm for CPU Utilization for Autoscaling Group
175+ # Alarm for CPU Utilization for Node
163176
164- resource "aws_cloudwatch_metric_alarm" "graphdb_cpu_utilization" {
165- alarm_name = " al-${ var . resource_name_prefix } -cpu-utilization"
166- alarm_description = " Alarm will trigger if CPU utilization is above 80%"
167- comparison_operator = " GreaterThanOrEqualToThreshold"
168- evaluation_periods = var. cloudwatch_evaluation_periods
177+ resource "aws_cloudwatch_metric_alarm" "graphdb_cpu_utilization_per_instance" {
178+ for_each = local. id_to_name
179+
180+ alarm_name = " al-${ var . resource_name_prefix } -${ each . value } -cpu-utilization"
181+ alarm_description = " CPU utilization on ${ each . value } >= ${ var . cloudwatch_cpu_utilization_threshold } %"
182+ namespace = " AWS/EC2"
183+ metric_name = " CPUUtilization"
184+ unit = " Percent"
185+ statistic = " Average"
169186 period = var. cloudwatch_period
170- statistic = " Maximum"
187+ evaluation_periods = var. cloudwatch_evaluation_periods
188+ comparison_operator = " GreaterThanOrEqualToThreshold"
171189 threshold = var. cloudwatch_cpu_utilization_threshold
190+ treat_missing_data = " notBreaching"
172191 alarm_actions = [aws_sns_topic . graphdb_sns_topic . arn ]
173192 ok_actions = [aws_sns_topic . graphdb_sns_topic . arn ]
174- treat_missing_data = " notBreaching"
175-
176- metric_name = " CPUUtilization"
177- namespace = " AWS/EC2"
178- unit = " Percent"
179193
180194 dimensions = {
181- AutoScalingGroupName = var.resource_name_prefix
195+ InstanceId = each.key
182196 }
183197}
184198
@@ -206,23 +220,32 @@ resource "aws_cloudwatch_metric_alarm" "graphdb_nodes_disconnected" {
206220 }
207221}
208222
209- resource "aws_cloudwatch_metric_alarm" "graphdb_root_disk_used_percent" {
210- alarm_name = " al-${ var . resource_name_prefix } -root-disk-used-percent"
211- alarm_description = " Triggers when root filesystem used percent is above 90%"
212- comparison_operator = " GreaterThanOrEqualToThreshold"
213- metric_name = " disk_used_percent"
223+ # Alarm for Root Disk Used Percent per node
224+
225+ resource "aws_cloudwatch_metric_alarm" "graphdb_root_disk_used_percent_exact" {
226+ for_each = local. id_to_name
227+
228+ alarm_name = " al-${ var . resource_name_prefix } -${ each . value } -root-ebs-volume-used-percent"
229+ alarm_description = " Root filesystem used percent on ${ each . value } >= 80%"
214230 namespace = " CWAgent"
231+ metric_name = " disk_used_percent"
232+ unit = " Percent"
215233 statistic = " Maximum"
216234 period = var. cloudwatch_period
217235 evaluation_periods = var. cloudwatch_evaluation_periods
218- threshold = 90
219- unit = " Percent"
236+ comparison_operator = " GreaterThanOrEqualToThreshold"
237+ threshold = 80
238+ treat_missing_data = " notBreaching"
220239 alarm_actions = [aws_sns_topic . graphdb_sns_topic . arn ]
221240 ok_actions = [aws_sns_topic . graphdb_sns_topic . arn ]
222- treat_missing_data = " notBreaching"
223241
224242 dimensions = {
225243 AutoScalingGroupName = var.resource_name_prefix
244+ InstanceId = each.key
245+ ImageId = data.aws_instance.by_id[each.key].ami
246+ InstanceType = data.aws_instance.by_id[each.key].instance_type
226247 path = " /"
248+ device = " nvme0n1p1"
249+ fstype = " ext4"
227250 }
228251}
0 commit comments