Added support for per-node alerts for CPU utilization, mem_used_percent, and root EBS volume disk_used_percent; optimized alarms

simonzhekoff · simonzhekoff · commit 140fa3a190d5 · 2025-11-07T14:54:30.000+02:00
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,8 +1,11 @@
 # GraphDB AWS Terraform Module Changelog
 
-## 2.5.1
-* Added support for monitoring mem_used_percent metric via Cloudwatch alarm and dashboard.
-* Added support for monitoring the disk_used_percent and disk_free metric via Cloudwatch alarm and dashboard.
+## 2.6.0
+* Added support for monitoring mem_used_percent metric via Cloudwatch alarm and dashboard per node.
+* Added support for monitoring the disk_used_percent metric for the root EBS volume via Cloudwatch alarm per node.
+* Added support for notifications when an alarm transitions from the ALARM state back to the OK state.
+* Changed the comparison operator to use GreaterThanOrEqualToThreshold for all alarms.
+* Added treat missing data option for all alarms and set it to notBreaching.
 
 ## 2.5.0
 * Added support for managing Route 53 hosted zones and DNS records via external_dns_records module.
diff --git a/modules/monitoring/alarms.tf b/modules/monitoring/alarms.tf
@@ -1,6 +1,7 @@
 # Alarms
 
 # Attempting to recover metric filter
+
 resource "aws_cloudwatch_log_metric_filter" "graphdb_attempting_to_recover_metric_filter" {
   count = var.graphdb_node_count > 1 ? 1 : 0
 
@@ -77,6 +78,7 @@ resource "aws_cloudwatch_metric_alarm" "graphdb_low_disk_space_alarm" {
 
 locals {
   # Builds a list of instance hostnames
+
   instance_hostnames = [
     for i in range(1, var.graphdb_node_count + 1) :
     var.route53_zone_dns_name != null ?
@@ -98,6 +100,7 @@ resource "aws_cloudwatch_metric_alarm" "heap_usage_alarm" {
   ok_actions          = [aws_sns_topic.graphdb_sns_topic.arn]
 
   # Define the metric query for heap used memory
+
   metric_query {
     id = "m1"
     metric {
@@ -113,6 +116,7 @@ resource "aws_cloudwatch_metric_alarm" "heap_usage_alarm" {
   }
 
   # Defines the metric query for total memory
+
   metric_query {
     id = "m2"
     metric {
@@ -128,6 +132,7 @@ resource "aws_cloudwatch_metric_alarm" "heap_usage_alarm" {
   }
 
   # Defines the expression to calculate heap usage percentage
+
   metric_query {
     id          = "e1"
     expression  = "(m1 / m2) * 100"
@@ -136,49 +141,58 @@ resource "aws_cloudwatch_metric_alarm" "heap_usage_alarm" {
   }
 }
 
-# Alarm for Memory Used Percent
+# Alarm for Memory Used Percent per node
 
-resource "aws_cloudwatch_metric_alarm" "graphdb_memory_used_percent" {
-  alarm_name          = "al-${var.resource_name_prefix}-memory-used-percent"
-  alarm_description   = "Alarm will trigger if Memory used percent is above 80%"
-  comparison_operator = "GreaterThanOrEqualToThreshold"
-  evaluation_periods  = var.cloudwatch_evaluation_periods
-  period              = var.cloudwatch_period
+resource "aws_cloudwatch_metric_alarm" "mem_used_percent_per_instance" { # per-node replacement for the single ASG-wide memory alarm
+  for_each = local.id_to_name # one alarm per entry: each.key = instance ID, each.value = display name (Name tag or instance ID)
+
+  alarm_name          = "al-${var.resource_name_prefix}-${each.value}-memory-used-percent"
+  alarm_description   = "mem_used_percent on ${each.value} >= ${var.graphdb_memory_utilization_threshold}%"
+  namespace           = "CWAgent"
+  metric_name         = "mem_used_percent"
+  unit                = "Percent"
   statistic           = "Maximum"
+  period              = var.cloudwatch_period
+  evaluation_periods  = var.cloudwatch_evaluation_periods
+  comparison_operator = "GreaterThanOrEqualToThreshold"
   threshold           = var.graphdb_memory_utilization_threshold
+  treat_missing_data  = "notBreaching" # NOTE(review): if the dimensions below match no published metric, the alarm would presumably stay quiet forever - confirm data arrives
   alarm_actions       = [aws_sns_topic.graphdb_sns_topic.arn]
   ok_actions          = [aws_sns_topic.graphdb_sns_topic.arn]
-  treat_missing_data  = "notBreaching"
-
-  metric_name = "mem_used_percent"
-  namespace   = "CWAgent"
-  unit        = "Percent"
 
   dimensions = {
     AutoScalingGroupName = var.resource_name_prefix
+    ImageId              = data.aws_instance.by_id[each.key].ami # NOTE(review): presumably this dimension set must equal the agent's append_dimensions exactly - confirm against the agent config
+    InstanceId           = each.key
+    InstanceType         = data.aws_instance.by_id[each.key].instance_type
+  }
+
+  tags = {
+    InstanceId = each.key # tags the alarm with the instance it watches
   }
 }
 
-# Alarm for CPU Utilization for Autoscaling Group
+# Alarm for CPU Utilization for Node
 
-resource "aws_cloudwatch_metric_alarm" "graphdb_cpu_utilization" {
-  alarm_name          = "al-${var.resource_name_prefix}-cpu-utilization"
-  alarm_description   = "Alarm will trigger if CPU utilization is above 80%"
-  comparison_operator = "GreaterThanOrEqualToThreshold"
-  evaluation_periods  = var.cloudwatch_evaluation_periods
+resource "aws_cloudwatch_metric_alarm" "graphdb_cpu_utilization_per_instance" { # per-node replacement for the single ASG-wide CPU alarm
+  for_each = local.id_to_name # one alarm per entry: each.key = instance ID, each.value = display name (Name tag or instance ID)
+
+  alarm_name          = "al-${var.resource_name_prefix}-${each.value}-cpu-utilization"
+  alarm_description   = "CPU utilization on ${each.value} >= ${var.cloudwatch_cpu_utilization_threshold}%"
+  namespace           = "AWS/EC2"
+  metric_name         = "CPUUtilization"
+  unit                = "Percent"
+  statistic           = "Average" # the ASG-wide alarm this replaces used "Maximum"
   period              = var.cloudwatch_period
-  statistic           = "Maximum"
+  evaluation_periods  = var.cloudwatch_evaluation_periods
+  comparison_operator = "GreaterThanOrEqualToThreshold"
   threshold           = var.cloudwatch_cpu_utilization_threshold
+  treat_missing_data  = "notBreaching"
   alarm_actions       = [aws_sns_topic.graphdb_sns_topic.arn]
   ok_actions          = [aws_sns_topic.graphdb_sns_topic.arn]
-  treat_missing_data  = "notBreaching"
-
-  metric_name = "CPUUtilization"
-  namespace   = "AWS/EC2"
-  unit        = "Percent"
 
   dimensions = {
-    AutoScalingGroupName = var.resource_name_prefix
+    InstanceId = each.key # scoped to one instance; replaces the AutoScalingGroupName dimension
   }
 }
 
@@ -206,23 +220,32 @@ resource "aws_cloudwatch_metric_alarm" "graphdb_nodes_disconnected" {
   }
 }
 
-resource "aws_cloudwatch_metric_alarm" "graphdb_root_disk_used_percent" {
-  alarm_name          = "al-${var.resource_name_prefix}-root-disk-used-percent"
-  alarm_description   = "Triggers when root filesystem used percent is above 90%"
-  comparison_operator = "GreaterThanOrEqualToThreshold"
-  metric_name         = "disk_used_percent"
+# Alarm for Root Disk Used Percent per node
+
+resource "aws_cloudwatch_metric_alarm" "graphdb_root_disk_used_percent_exact" {
+  for_each = local.id_to_name # one alarm per entry: each.key = instance ID, each.value = display name (Name tag or instance ID)
+
+  alarm_name          = "al-${var.resource_name_prefix}-${each.value}-root-ebs-volume-used-percent"
+  alarm_description   = "Root filesystem used percent on ${each.value} >= 80%" # the literal 80 duplicates threshold below; keep both in sync
   namespace           = "CWAgent"
+  metric_name         = "disk_used_percent"
+  unit                = "Percent"
   statistic           = "Maximum"
   period              = var.cloudwatch_period
   evaluation_periods  = var.cloudwatch_evaluation_periods
-  threshold           = 90
-  unit                = "Percent"
+  comparison_operator = "GreaterThanOrEqualToThreshold"
+  threshold           = 80 # lowered from 90; hard-coded, whereas the memory and CPU alarms take their thresholds from variables
+  treat_missing_data  = "notBreaching" # NOTE(review): if the dimensions below match no published metric, the alarm would presumably stay quiet forever - confirm data arrives
   alarm_actions       = [aws_sns_topic.graphdb_sns_topic.arn]
   ok_actions          = [aws_sns_topic.graphdb_sns_topic.arn]
-  treat_missing_data  = "notBreaching"
 
   dimensions = {
     AutoScalingGroupName = var.resource_name_prefix
+    InstanceId           = each.key
+    ImageId              = data.aws_instance.by_id[each.key].ami
+    InstanceType         = data.aws_instance.by_id[each.key].instance_type
     path                 = "/"
+    device               = "nvme0n1p1" # NOTE(review): hard-coded root device name - presumably differs on other instance families/AMIs; confirm
+    fstype               = "ext4" # NOTE(review): hard-coded filesystem type - confirm it matches the root filesystem of the AMI in use
   }
 }
diff --git a/modules/monitoring/cloudwatch_agent_config.json.tpl b/modules/monitoring/cloudwatch_agent_config.json.tpl
@@ -117,7 +117,8 @@
     "metrics": {
         "aggregation_dimensions": [
             [
-                "AutoScalingGroupName"
+                "AutoScalingGroupName",
+                "InstanceId"
             ]
         ],
         "append_dimensions": {
diff --git a/modules/monitoring/data.tf b/modules/monitoring/data.tf
@@ -0,0 +1,18 @@
+data "aws_instances" "asg_members" { # finds the instances whose aws:autoscaling:groupName tag equals var.resource_name_prefix
+  filter {
+    name   = "tag:aws:autoscaling:groupName"
+    values = [var.resource_name_prefix] # NOTE(review): assumes the Auto Scaling group is named exactly var.resource_name_prefix - confirm
+  }
+} # NOTE(review): presumably reflects only instances that exist when Terraform runs; nodes replaced later may need a re-apply to get alarms - confirm
+
+data "aws_instance" "by_id" { # per-instance lookup; the per-node alarms read .ami and .instance_type from it, local.id_to_name reads tags
+  for_each    = toset(data.aws_instances.asg_members.ids) # keyed by instance ID
+  instance_id = each.value
+}
+
+locals {
+  id_to_name = { # instance ID => display name; the per-node alarms iterate over this map and embed the value in alarm names
+    for id, inst in data.aws_instance.by_id :
+    id => coalesce(try(inst.tags["Name"], null), id) # Name tag when available, otherwise the instance ID itself
+  } # NOTE(review): if several instances share one Name tag, the alarm names built from it would be identical - confirm names are unique per node
+}

Original file line number	Diff line number	Diff line change
`@@ -117,7 +117,8 @@`
`117`	`117`	`"metrics": {`
`118`	`118`	`"aggregation_dimensions": [`
`119`	`119`	`[`
`120`		`- "AutoScalingGroupName"`
	`120`	`+ "AutoScalingGroupName",`
	`121`	`+ "InstanceId"`
`121`	`122`	`]`
`122`	`123`	`],`
`123`	`124`	`"append_dimensions": {`