11# Alarms
22
33# Attempting to recover metric filter
4+
45resource "aws_cloudwatch_log_metric_filter" "graphdb_attempting_to_recover_metric_filter" {
56 count = var. graphdb_node_count > 1 ? 1 : 0
67
@@ -25,14 +26,16 @@ resource "aws_cloudwatch_metric_alarm" "graphdb_attempting_to_recover_alarm" {
2526
2627 alarm_name = " al-${ var . resource_name_prefix } -attempting-recover"
2728 alarm_description = " Attempting to recover through snapshot replication"
28- comparison_operator = " GreaterThanThreshold "
29+ comparison_operator = " GreaterThanOrEqualToThreshold "
2930 metric_name = aws_cloudwatch_log_metric_filter. graphdb_attempting_to_recover_metric_filter [0 ]. metric_transformation [0 ]. name
3031 namespace = aws_cloudwatch_log_metric_filter. graphdb_attempting_to_recover_metric_filter [0 ]. metric_transformation [0 ]. namespace
3132 period = var. cloudwatch_period
3233 statistic = " SampleCount"
3334 evaluation_periods = var. cloudwatch_evaluation_periods
3435 threshold = " 0"
3536 alarm_actions = [aws_sns_topic . graphdb_sns_topic . arn ]
37+ ok_actions = [aws_sns_topic . graphdb_sns_topic . arn ]
38+ treat_missing_data = " notBreaching"
3639
3740 depends_on = [aws_cloudwatch_log_metric_filter . graphdb_attempting_to_recover_metric_filter [0 ]]
3841}
@@ -59,20 +62,23 @@ resource "aws_cloudwatch_log_metric_filter" "graphdb_low_disk_space_metric_filte
# Alarm raised from the low-disk-space log metric filter.
#
# The metric name and namespace are read from the first metric_transformation
# of aws_cloudwatch_log_metric_filter.graphdb_low_disk_space_metric_filter, so
# the alarm always follows whatever that filter publishes.
#
# NOTE(review): the alarm evaluates SampleCount >= 0, so any datapoint in a
# period satisfies the comparison. If the metric filter publishes a
# default_value for non-matching periods, this would alarm on those too -
# confirm against the filter definition.
resource "aws_cloudwatch_metric_alarm" "graphdb_low_disk_space_alarm" {
  alarm_name          = "al-${var.resource_name_prefix}-low-disk-space"
  alarm_description   = "Low Disk Space"
  comparison_operator = "GreaterThanOrEqualToThreshold"
  metric_name         = aws_cloudwatch_log_metric_filter.graphdb_low_disk_space_metric_filter.metric_transformation[0].name
  namespace           = aws_cloudwatch_log_metric_filter.graphdb_low_disk_space_metric_filter.metric_transformation[0].namespace
  period              = var.cloudwatch_period
  statistic           = "SampleCount"
  evaluation_periods  = var.cloudwatch_evaluation_periods
  threshold           = 0

  # Notify the same SNS topic both when the alarm fires and when it recovers.
  alarm_actions = [aws_sns_topic.graphdb_sns_topic.arn]
  ok_actions    = [aws_sns_topic.graphdb_sns_topic.arn]

  # Periods without datapoints are evaluated as "not breaching", so the alarm
  # returns to OK instead of moving to INSUFFICIENT_DATA.
  treat_missing_data = "notBreaching"

  depends_on = [aws_cloudwatch_log_metric_filter.graphdb_low_disk_space_metric_filter]
}
7378
7479locals {
7580 # Builds a list of instance hostnames
81+
7682 instance_hostnames = [
7783 for i in range (1 , var. graphdb_node_count + 1 ) :
7884 var . route53_zone_dns_name != null ?
@@ -86,13 +92,15 @@ resource "aws_cloudwatch_metric_alarm" "heap_usage_alarm" {
8692
8793 alarm_name = " al-${ var . resource_name_prefix } -heap-memory-usage-${ each . key } "
8894 alarm_description = " Triggers if ${ each . key } 's heap usage exceeds threshold of its total memory"
89- comparison_operator = " GreaterThanThreshold "
95+ comparison_operator = " GreaterThanOrEqualToThreshold "
9096 threshold = var. graphdb_memory_utilization_threshold
9197 evaluation_periods = 1
92- treat_missing_data = " missing "
98+ treat_missing_data = " notBreaching "
9399 alarm_actions = [aws_sns_topic . graphdb_sns_topic . arn ]
100+ ok_actions = [aws_sns_topic . graphdb_sns_topic . arn ]
94101
95102 # Define the metric query for heap used memory
103+
96104 metric_query {
97105 id = " m1"
98106 metric {
@@ -108,6 +116,7 @@ resource "aws_cloudwatch_metric_alarm" "heap_usage_alarm" {
108116 }
109117
110118 # Defines the metric query for total memory
119+
111120 metric_query {
112121 id = " m2"
113122 metric {
@@ -123,6 +132,7 @@ resource "aws_cloudwatch_metric_alarm" "heap_usage_alarm" {
123132 }
124133
125134 # Defines the expression to calculate heap usage percentage
135+
126136 metric_query {
127137 id = " e1"
128138 expression = " (m1 / m2) * 100"
@@ -131,24 +141,58 @@ resource "aws_cloudwatch_metric_alarm" "heap_usage_alarm" {
131141 }
132142}
133143
134- # Alarm for CPU Utilization for Autoscaling Group
144+ # Alarm for Memory Used Percent per node
135145
136- resource "aws_cloudwatch_metric_alarm" "graphdb_cpu_utilization" {
137- alarm_name = " al-${ var . resource_name_prefix } -cpu-utilization"
138- alarm_description = " Alarm will trigger if CPU utilization is above 80%"
139- comparison_operator = " GreaterThanThreshold"
# Per-instance alarm on the "mem_used_percent" metric in the CWAgent namespace.
#
# One alarm is created for every entry of local.id_to_name: each.key is used
# as the InstanceId dimension and each.value is embedded in the alarm name and
# description (presumably a map of instance ID => node name - confirm against
# the local's definition).
resource "aws_cloudwatch_metric_alarm" "mem_used_percent_per_instance" {
  for_each = local.id_to_name

  alarm_name          = "al-${var.resource_name_prefix}-${each.value}-memory-used-percent"
  alarm_description   = "mem_used_percent on ${each.value} >= ${var.graphdb_memory_utilization_threshold}%"
  namespace           = "CWAgent"
  metric_name         = "mem_used_percent"
  unit                = "Percent"
  statistic           = "Maximum"
  period              = var.cloudwatch_period
  evaluation_periods  = var.cloudwatch_evaluation_periods
  comparison_operator = "GreaterThanOrEqualToThreshold"
  threshold           = var.graphdb_memory_utilization_threshold

  # Periods without datapoints are evaluated as "not breaching".
  treat_missing_data = "notBreaching"

  # Notify the same SNS topic both when the alarm fires and when it recovers.
  alarm_actions = [aws_sns_topic.graphdb_sns_topic.arn]
  ok_actions    = [aws_sns_topic.graphdb_sns_topic.arn]

  # NOTE(review): CloudWatch only matches a metric when the dimension set is
  # exactly the one the metric was published with. This assumes the agent
  # publishes these four dimensions and that the Auto Scaling group is named
  # var.resource_name_prefix - confirm against the agent configuration.
  dimensions = {
    AutoScalingGroupName = var.resource_name_prefix
    ImageId              = data.aws_instance.by_id[each.key].ami
    InstanceId           = each.key
    InstanceType         = data.aws_instance.by_id[each.key].instance_type
  }

  tags = {
    InstanceId = each.key
  }
}
174+
175+ # Alarm for CPU Utilization for Node
176+
# Per-instance alarm on the AWS/EC2 "CPUUtilization" metric.
#
# One alarm is created for every entry of local.id_to_name: each.key is used
# as the InstanceId dimension and each.value is embedded in the alarm name and
# description (presumably a map of instance ID => node name - confirm against
# the local's definition).
resource "aws_cloudwatch_metric_alarm" "graphdb_cpu_utilization_per_instance" {
  for_each = local.id_to_name

  alarm_name          = "al-${var.resource_name_prefix}-${each.value}-cpu-utilization"
  alarm_description   = "CPU utilization on ${each.value} >= ${var.cloudwatch_cpu_utilization_threshold}%"
  namespace           = "AWS/EC2"
  metric_name         = "CPUUtilization"
  unit                = "Percent"
  statistic           = "Average"
  period              = var.cloudwatch_period
  evaluation_periods  = var.cloudwatch_evaluation_periods
  comparison_operator = "GreaterThanOrEqualToThreshold"
  threshold           = var.cloudwatch_cpu_utilization_threshold

  # Periods without datapoints are evaluated as "not breaching".
  treat_missing_data = "notBreaching"

  # Notify the same SNS topic both when the alarm fires and when it recovers.
  alarm_actions = [aws_sns_topic.graphdb_sns_topic.arn]
  ok_actions    = [aws_sns_topic.graphdb_sns_topic.arn]

  # Scope the metric to the single instance this alarm was created for.
  dimensions = {
    InstanceId = each.key
  }
}
154198
@@ -162,9 +206,10 @@ resource "aws_cloudwatch_metric_alarm" "graphdb_nodes_disconnected" {
162206 evaluation_periods = var. cloudwatch_evaluation_periods
163207 datapoints_to_alarm = 1
164208 threshold = 0
165- comparison_operator = " GreaterThanThreshold "
166- treat_missing_data = " missing "
209+ comparison_operator = " GreaterThanOrEqualToThreshold "
210+ treat_missing_data = " notBreaching "
167211 alarm_actions = [aws_sns_topic . graphdb_sns_topic . arn ]
212+ ok_actions = [aws_sns_topic . graphdb_sns_topic . arn ]
168213
169214 metric_query {
170215 id = " q1"
@@ -174,3 +219,33 @@ resource "aws_cloudwatch_metric_alarm" "graphdb_nodes_disconnected" {
174219 period = var. cloudwatch_period
175220 }
176221}
222+
223+ # Alarm for Root Disk Used Percent per node
224+
# Per-instance alarm on the "disk_used_percent" metric in the CWAgent
# namespace, restricted to the "/" path.
#
# One alarm is created for every entry of local.id_to_name: each.key is used
# as the InstanceId dimension and each.value is embedded in the alarm name and
# description (presumably a map of instance ID => node name - confirm against
# the local's definition).
resource "aws_cloudwatch_metric_alarm" "graphdb_root_disk_used_percent_exact" {
  for_each = local.id_to_name

  alarm_name = "al-${var.resource_name_prefix}-${each.value}-root-ebs-volume-used-percent"

  # The "80%" in the description is a literal; keep it in sync with
  # `threshold` below when changing either one.
  alarm_description   = "Root filesystem used percent on ${each.value} >= 80%"
  namespace           = "CWAgent"
  metric_name         = "disk_used_percent"
  unit                = "Percent"
  statistic           = "Maximum"
  period              = var.cloudwatch_period
  evaluation_periods  = var.cloudwatch_evaluation_periods
  comparison_operator = "GreaterThanOrEqualToThreshold"
  threshold           = 80

  # Periods without datapoints are evaluated as "not breaching".
  treat_missing_data = "notBreaching"

  # Notify the same SNS topic both when the alarm fires and when it recovers.
  alarm_actions = [aws_sns_topic.graphdb_sns_topic.arn]
  ok_actions    = [aws_sns_topic.graphdb_sns_topic.arn]

  # NOTE(review): CloudWatch only matches a metric when the dimension set is
  # exactly the one the metric was published with. The device and fstype
  # values are hard-coded here; with treat_missing_data = "notBreaching" a
  # mismatch (different root device name or filesystem on another AMI or
  # instance type) would leave the alarm silently in OK - confirm these match
  # what the agent reports on the deployed nodes.
  dimensions = {
    AutoScalingGroupName = var.resource_name_prefix
    InstanceId           = each.key
    ImageId              = data.aws_instance.by_id[each.key].ami
    InstanceType         = data.aws_instance.by_id[each.key].instance_type
    path                 = "/"
    device               = "nvme0n1p1"
    fstype               = "ext4"
  }
}
0 commit comments