Added per-node alarms for CPU utilization, mem_used_percent, and root EBS volume disk_used_percent; optimized alarms

simonzhekoff · simonzhekoff · commit 2dcce2942398 · 2025-11-07T15:00:28.000+02:00
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,7 +1,11 @@
 # GraphDB AWS Terraform Module Changelog
 
 ## 2.6.0
-
+* Added support for monitoring the mem_used_percent metric via a CloudWatch alarm and dashboard per node.
+* Added support for monitoring the disk_used_percent metric for the root EBS volume via a CloudWatch alarm per node.
+* Added support for notifications when an alarm transitions from the ALARM state back to the OK state.
+* Changed the comparison operator to use GreaterThanOrEqualToThreshold for all alarms.
+* Added the treat_missing_data option to all alarms and set it to notBreaching.
 * Added support for changing the volume size of the instance root EBS volume.
 
 ## 2.5.0
diff --git a/modules/monitoring/alarms.tf b/modules/monitoring/alarms.tf
@@ -1,6 +1,7 @@
 # Alarms
 
 # Attempting to recover metric filter
+
 resource "aws_cloudwatch_log_metric_filter" "graphdb_attempting_to_recover_metric_filter" {
   count = var.graphdb_node_count > 1 ? 1 : 0
 
@@ -25,14 +26,16 @@ resource "aws_cloudwatch_metric_alarm" "graphdb_attempting_to_recover_alarm" {
 
   alarm_name          = "al-${var.resource_name_prefix}-attempting-recover"
   alarm_description   = "Attempting to recover through snapshot replication"
-  comparison_operator = "GreaterThanThreshold"
+  comparison_operator = "GreaterThanOrEqualToThreshold"
   metric_name         = aws_cloudwatch_log_metric_filter.graphdb_attempting_to_recover_metric_filter[0].metric_transformation[0].name
   namespace           = aws_cloudwatch_log_metric_filter.graphdb_attempting_to_recover_metric_filter[0].metric_transformation[0].namespace
   period              = var.cloudwatch_period
   statistic           = "SampleCount"
   evaluation_periods  = var.cloudwatch_evaluation_periods
   threshold           = "0"
   alarm_actions       = [aws_sns_topic.graphdb_sns_topic.arn]
+  ok_actions          = [aws_sns_topic.graphdb_sns_topic.arn]
+  treat_missing_data  = "notBreaching"
 
   depends_on = [aws_cloudwatch_log_metric_filter.graphdb_attempting_to_recover_metric_filter[0]]
 }
@@ -59,20 +62,23 @@ resource "aws_cloudwatch_log_metric_filter" "graphdb_low_disk_space_metric_filte
 resource "aws_cloudwatch_metric_alarm" "graphdb_low_disk_space_alarm" {
   alarm_name          = "al-${var.resource_name_prefix}-low-disk-space"
   alarm_description   = "Low Disk Space"
-  comparison_operator = "GreaterThanThreshold"
+  comparison_operator = "GreaterThanOrEqualToThreshold"
   metric_name         = aws_cloudwatch_log_metric_filter.graphdb_low_disk_space_metric_filter.metric_transformation[0].name
   namespace           = aws_cloudwatch_log_metric_filter.graphdb_low_disk_space_metric_filter.metric_transformation[0].namespace
   period              = var.cloudwatch_period
   statistic           = "SampleCount"
   evaluation_periods  = var.cloudwatch_evaluation_periods
   threshold           = "0"
   alarm_actions       = [aws_sns_topic.graphdb_sns_topic.arn]
+  ok_actions          = [aws_sns_topic.graphdb_sns_topic.arn]
+  treat_missing_data  = "notBreaching"
 
   depends_on = [aws_cloudwatch_log_metric_filter.graphdb_low_disk_space_metric_filter]
 }
 
 locals {
   # Builds a list of instance hostnames
+
   instance_hostnames = [
     for i in range(1, var.graphdb_node_count + 1) :
     var.route53_zone_dns_name != null ?
@@ -86,13 +92,15 @@ resource "aws_cloudwatch_metric_alarm" "heap_usage_alarm" {
 
   alarm_name          = "al-${var.resource_name_prefix}-heap-memory-usage-${each.key}"
   alarm_description   = "Triggers if ${each.key}'s heap usage exceeds threshold of its total memory"
-  comparison_operator = "GreaterThanThreshold"
+  comparison_operator = "GreaterThanOrEqualToThreshold"
   threshold           = var.graphdb_memory_utilization_threshold
   evaluation_periods  = 1
-  treat_missing_data  = "missing"
+  treat_missing_data  = "notBreaching"
   alarm_actions       = [aws_sns_topic.graphdb_sns_topic.arn]
+  ok_actions          = [aws_sns_topic.graphdb_sns_topic.arn]
 
   # Define the metric query for heap used memory
+
   metric_query {
     id = "m1"
     metric {
@@ -108,6 +116,7 @@ resource "aws_cloudwatch_metric_alarm" "heap_usage_alarm" {
   }
 
   # Defines the metric query for total memory
+
   metric_query {
     id = "m2"
     metric {
@@ -123,6 +132,7 @@ resource "aws_cloudwatch_metric_alarm" "heap_usage_alarm" {
   }
 
   # Defines the expression to calculate heap usage percentage
+
   metric_query {
     id          = "e1"
     expression  = "(m1 / m2) * 100"
@@ -131,24 +141,58 @@ resource "aws_cloudwatch_metric_alarm" "heap_usage_alarm" {
   }
 }
 
-# Alarm for CPU Utilization for Autoscaling Group
+# Alarm for Memory Used Percent per node
 
-resource "aws_cloudwatch_metric_alarm" "graphdb_cpu_utilization" {
-  alarm_name          = "al-${var.resource_name_prefix}-cpu-utilization"
-  alarm_description   = "Alarm will trigger if CPU utilization is above 80%"
-  comparison_operator = "GreaterThanThreshold"
+resource "aws_cloudwatch_metric_alarm" "mem_used_percent_per_instance" { # One memory-usage alarm per instance in local.id_to_name.
+  for_each = local.id_to_name # each.key = instance ID, each.value = display name
+
+  alarm_name          = "al-${var.resource_name_prefix}-${each.value}-memory-used-percent"
+  alarm_description   = "mem_used_percent on ${each.value} >= ${var.graphdb_memory_utilization_threshold}%"
+  namespace           = "CWAgent"
+  metric_name         = "mem_used_percent"
+  unit                = "Percent"
+  statistic           = "Maximum"
+  period              = var.cloudwatch_period
   evaluation_periods  = var.cloudwatch_evaluation_periods
+  comparison_operator = "GreaterThanOrEqualToThreshold"
+  threshold           = var.graphdb_memory_utilization_threshold # Same variable the heap_usage_alarm uses as its threshold.
+  treat_missing_data  = "notBreaching" # Periods without datapoints are treated as not breaching.
+  alarm_actions       = [aws_sns_topic.graphdb_sns_topic.arn]
+  ok_actions          = [aws_sns_topic.graphdb_sns_topic.arn] # Also notify the same SNS topic on return to OK.
+
+  dimensions = { # NOTE(review): presumably must match the dimension set the agent publishes — confirm against the agent config.
+    AutoScalingGroupName = var.resource_name_prefix
+    ImageId              = data.aws_instance.by_id[each.key].ami
+    InstanceId           = each.key
+    InstanceType         = data.aws_instance.by_id[each.key].instance_type
+  }
+
+  tags = {
+    InstanceId = each.key
+  }
+}
+
+# Alarm for CPU Utilization for Node
+
+resource "aws_cloudwatch_metric_alarm" "graphdb_cpu_utilization_per_instance" {
+  for_each = local.id_to_name
+
+  alarm_name          = "al-${var.resource_name_prefix}-${each.value}-cpu-utilization"
+  alarm_description   = "CPU utilization on ${each.value} >= ${var.cloudwatch_cpu_utilization_threshold}%"
+  namespace           = "AWS/EC2"
+  metric_name         = "CPUUtilization"
+  unit                = "Percent"
+  statistic           = "Average"
   period              = var.cloudwatch_period
-  statistic           = "Maximum"
+  evaluation_periods  = var.cloudwatch_evaluation_periods
+  comparison_operator = "GreaterThanOrEqualToThreshold"
   threshold           = var.cloudwatch_cpu_utilization_threshold
+  treat_missing_data  = "notBreaching"
   alarm_actions       = [aws_sns_topic.graphdb_sns_topic.arn]
-
-  metric_name = "CPUUtilization"
-  namespace   = "AWS/EC2"
-  unit        = "Percent"
+  ok_actions          = [aws_sns_topic.graphdb_sns_topic.arn]
 
   dimensions = {
-    AutoScalingGroupName = var.resource_name_prefix
+    InstanceId = each.key
   }
 }
 
@@ -162,9 +206,10 @@ resource "aws_cloudwatch_metric_alarm" "graphdb_nodes_disconnected" {
   evaluation_periods  = var.cloudwatch_evaluation_periods
   datapoints_to_alarm = 1
   threshold           = 0
-  comparison_operator = "GreaterThanThreshold"
-  treat_missing_data  = "missing"
+  comparison_operator = "GreaterThanOrEqualToThreshold"
+  treat_missing_data  = "notBreaching"
   alarm_actions       = [aws_sns_topic.graphdb_sns_topic.arn]
+  ok_actions          = [aws_sns_topic.graphdb_sns_topic.arn]
 
   metric_query {
     id          = "q1"
@@ -174,3 +219,33 @@ resource "aws_cloudwatch_metric_alarm" "graphdb_nodes_disconnected" {
     period      = var.cloudwatch_period
   }
 }
+
+# Alarm for Root Disk Used Percent per node
+
+resource "aws_cloudwatch_metric_alarm" "graphdb_root_disk_used_percent_exact" { # One root-filesystem usage alarm per instance in local.id_to_name.
+  for_each = local.id_to_name # each.key = instance ID, each.value = display name
+
+  alarm_name          = "al-${var.resource_name_prefix}-${each.value}-root-ebs-volume-used-percent"
+  alarm_description   = "Root filesystem used percent on ${each.value} >= 80%"
+  namespace           = "CWAgent"
+  metric_name         = "disk_used_percent"
+  unit                = "Percent"
+  statistic           = "Maximum"
+  period              = var.cloudwatch_period
+  evaluation_periods  = var.cloudwatch_evaluation_periods
+  comparison_operator = "GreaterThanOrEqualToThreshold"
+  threshold           = 80 # Fixed value; keep in sync with the "80%" written in alarm_description.
+  treat_missing_data  = "notBreaching" # Periods without datapoints are treated as not breaching.
+  alarm_actions       = [aws_sns_topic.graphdb_sns_topic.arn]
+  ok_actions          = [aws_sns_topic.graphdb_sns_topic.arn] # Also notify the same SNS topic on return to OK.
+
+  dimensions = { # NOTE(review): presumably must match the dimension set the agent publishes — confirm against the agent config.
+    AutoScalingGroupName = var.resource_name_prefix
+    InstanceId           = each.key
+    ImageId              = data.aws_instance.by_id[each.key].ami
+    InstanceType         = data.aws_instance.by_id[each.key].instance_type
+    path                 = "/"
+    device               = "nvme0n1p1" # NOTE(review): hard-coded device name — confirm it matches what the instances report.
+    fstype               = "ext4" # NOTE(review): hard-coded filesystem type — confirm against the AMI in use.
+  }
+}
diff --git a/modules/monitoring/cloudwatch_agent_config.json.tpl b/modules/monitoring/cloudwatch_agent_config.json.tpl
@@ -117,7 +117,8 @@
     "metrics": {
         "aggregation_dimensions": [
             [
-                "AutoScalingGroupName"
+                "AutoScalingGroupName",
+                "InstanceId"
             ]
         ],
         "append_dimensions": {
diff --git a/modules/monitoring/data.tf b/modules/monitoring/data.tf
@@ -0,0 +1,18 @@
+data "aws_instances" "asg_members" { # Looks up instances by their Auto Scaling group name tag.
+  filter {
+    name   = "tag:aws:autoscaling:groupName" # Tag key used for the match.
+    values = [var.resource_name_prefix] # The alarms pass this same value as the AutoScalingGroupName dimension.
+  }
+}
+
+data "aws_instance" "by_id" { # Per-instance details (ami, instance_type, tags) read by the alarms and by local.id_to_name.
+  for_each    = toset(data.aws_instances.asg_members.ids) # One lookup per instance ID found by asg_members.
+  instance_id = each.value
+}
+
+locals { # id_to_name maps each instance ID to the display name used in per-node alarm names.
+  id_to_name = {
+    for id, inst in data.aws_instance.by_id :
+    id => coalesce(try(inst.tags["Name"], null), id) # Use the Name tag; fall back to the instance ID when no Name is available.
+  }
+}
diff --git a/modules/monitoring/graphdb_dashboard.json b/modules/monitoring/graphdb_dashboard.json
@@ -80,62 +80,64 @@
             "title": "GraphDB Memory Used % for the Auto Scaling Group"
         }
     },
-      {
-          "height": 6,
-          "width": 6,
-          "y": 6,
-          "x": 6,
-          "type": "metric",
-          "properties": {
-              "metrics": [
-                  [ { "expression": "SELECT AVG(graphdb_data_dir_free) FROM \"${resource_name_prefix}\" GROUP BY host", "label": "Query1", "id": "q1", "region": "${aws_region}" } ]
-              ],
-              "region": "${aws_region}",
-              "stacked": false,
-              "view": "timeSeries",
-              "period": 300,
-              "stat": "Average",
-              "yAxis": {
-                  "left": {
-                      "label": "Gigabytes",
-                      "showUnits": false
-                  },
-                  "right": {
-                      "label": "",
-                      "showUnits": false
-                  }
-              },
-              "title": "GraphDB Data Dir Free per instance"
-          }
-      },
-      {
-          "height": 6,
-          "width": 6,
-          "y": 6,
-          "x": 0,
-          "type": "metric",
-          "properties": {
-              "metrics": [
-                  [ { "expression": "SELECT AVG(graphdb_data_dir_used) FROM \"${resource_name_prefix}\" GROUP BY host", "label": "Query1", "id": "q1", "region": "${aws_region}" } ]
-              ],
-              "region": "${aws_region}",
-              "stacked": false,
-              "view": "timeSeries",
-              "period": 300,
-              "stat": "Average",
-              "yAxis": {
-                  "left": {
-                      "label": "Gigabytes",
-                      "showUnits": false
-                  },
-                  "right": {
-                      "label": "",
-                      "showUnits": false
-                  }
-              },
-              "title": "GraphDB Data Dir Used per instance"
-          }
-      },
+     {
+       "height": 6,
+       "width": 6,
+       "y": 6,
+       "x": 6,
+       "type": "metric",
+       "properties": {
+           "metrics": [
+               [ { "expression": "SELECT AVG(graphdb_data_dir_free) FROM \"${resource_name_prefix}\" GROUP BY host", "label": "Query1", "id": "q1", "region": "${aws_region}" } ]
+           ],
+           "region": "${aws_region}",
+           "stacked": false,
+           "view": "timeSeries",
+           "period": 300,
+           "stat": "Average",
+           "yAxis": {
+               "left": {
+                   "min": 0,
+                   "label": "Gigabytes",
+                   "showUnits": false
+               },
+               "right": {
+                   "label": "",
+                   "showUnits": false
+               }
+           },
+           "title": "GraphDB Data Dir Free per instance"
+       }
+     },
+     {
+       "height": 6,
+       "width": 6,
+       "y": 6,
+       "x": 0,
+       "type": "metric",
+       "properties": {
+           "metrics": [
+               [ { "expression": "SELECT AVG(graphdb_data_dir_used) FROM \"${resource_name_prefix}\" GROUP BY host", "label": "Query1", "id": "q1", "region": "${aws_region}" } ]
+           ],
+           "region": "${aws_region}",
+           "stacked": false,
+           "view": "timeSeries",
+           "period": 300,
+           "stat": "Average",
+           "yAxis": {
+               "left": {
+                   "min": 0,
+                   "label": "Gigabytes",
+                   "showUnits": false
+               },
+               "right": {
+                   "label": "",
+                   "showUnits": false
+               }
+           },
+           "title": "GraphDB Data Dir Used per instance"
+       }
+     },
       {
           "height": 6,
           "width": 6,
@@ -153,6 +155,7 @@
               "stat": "Average",
               "yAxis": {
                   "left": {
+                      "min": 0,
                       "label": "Count",
                       "showUnits": false
                   },

Original file line number	Diff line number	Diff line change
`@@ -117,7 +117,8 @@`
`117`	`117`	`"metrics": {`
`118`	`118`	`"aggregation_dimensions": [`
`119`	`119`	`[`
`120`		`- "AutoScalingGroupName"`
	`120`	`+ "AutoScalingGroupName",`
	`121`	`+ "InstanceId"`
`121`	`122`	`]`
`122`	`123`	`],`
`123`	`124`	`"append_dimensions": {`