Ontotext-AD · simonzhekoff · Nov 12, 2025 · Oct 23, 2025
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,7 +1,10 @@
 # GraphDB AWS Terraform Module Changelog
 
 ## 2.6.0
-
+* Added per-ASG CloudWatch alarms for disk_used_percent on the root EBS volume and for mem_used_percent.
+* Added notifications (ok_actions) when an alarm transitions from the ALARM state back to the OK state.
+* Changed the comparison operator to use GreaterThanOrEqualToThreshold for most of the alarms.
+* Added the treat_missing_data option to all alarms, set according to the type of the alarm.
 * Added support for changing the volume size of the instance root EBS volume.
 
 ## 2.5.0

diff --git a/modules/monitoring/alarms.tf b/modules/monitoring/alarms.tf
@@ -1,6 +1,7 @@
 # Alarms
 
 # Attempting to recover metric filter
+
 resource "aws_cloudwatch_log_metric_filter" "graphdb_attempting_to_recover_metric_filter" {
   count = var.graphdb_node_count > 1 ? 1 : 0
 
@@ -33,14 +34,16 @@ resource "aws_cloudwatch_metric_alarm" "graphdb_attempting_to_recover_alarm" {
   evaluation_periods  = var.cloudwatch_evaluation_periods
   threshold           = "0"
   alarm_actions       = [aws_sns_topic.graphdb_sns_topic.arn]
+  ok_actions          = [aws_sns_topic.graphdb_sns_topic.arn]
+  treat_missing_data  = "missing"
 
   depends_on = [aws_cloudwatch_log_metric_filter.graphdb_attempting_to_recover_metric_filter[0]]
 }
 
 # Log filter for low disk space messages in the logs
 
 resource "aws_cloudwatch_log_metric_filter" "graphdb_low_disk_space_metric_filter" {
-  name           = "al-${var.resource_name_prefix}-low-disk-space"
+  name           = "al-${var.resource_name_prefix}-low-disk-space-GraphDB-disk"
   pattern        = "No space left on the device"
   log_group_name = aws_cloudwatch_log_group.graphdb_log_group.name
 
@@ -57,7 +60,7 @@ resource "aws_cloudwatch_log_metric_filter" "graphdb_low_disk_space_metric_filte
 # Alarm based on metric filter for Low Disk Space messages in the logs
 
 resource "aws_cloudwatch_metric_alarm" "graphdb_low_disk_space_alarm" {
-  alarm_name          = "al-${var.resource_name_prefix}-low-disk-space"
+  alarm_name          = "al-${var.resource_name_prefix}-low-disk-space-GraphDB-disk"
   alarm_description   = "Low Disk Space"
   comparison_operator = "GreaterThanThreshold"
   metric_name         = aws_cloudwatch_log_metric_filter.graphdb_low_disk_space_metric_filter.metric_transformation[0].name
@@ -67,12 +70,15 @@ resource "aws_cloudwatch_metric_alarm" "graphdb_low_disk_space_alarm" {
   evaluation_periods  = var.cloudwatch_evaluation_periods
   threshold           = "0"
   alarm_actions       = [aws_sns_topic.graphdb_sns_topic.arn]
+  ok_actions          = [aws_sns_topic.graphdb_sns_topic.arn]
+  treat_missing_data  = "missing"
 
   depends_on = [aws_cloudwatch_log_metric_filter.graphdb_low_disk_space_metric_filter]
 }
 
 locals {
   # Builds a list of instance hostnames
+
   instance_hostnames = [
     for i in range(1, var.graphdb_node_count + 1) :
     var.route53_zone_dns_name != null ?
@@ -86,13 +92,15 @@ resource "aws_cloudwatch_metric_alarm" "heap_usage_alarm" {
 
   alarm_name          = "al-${var.resource_name_prefix}-heap-memory-usage-${each.key}"
   alarm_description   = "Triggers if ${each.key}'s heap usage exceeds threshold of its total memory"
-  comparison_operator = "GreaterThanThreshold"
+  comparison_operator = "GreaterThanOrEqualToThreshold"
   threshold           = var.graphdb_memory_utilization_threshold
   evaluation_periods  = 1
   treat_missing_data  = "missing"
   alarm_actions       = [aws_sns_topic.graphdb_sns_topic.arn]
+  ok_actions          = [aws_sns_topic.graphdb_sns_topic.arn]
 
   # Define the metric query for heap used memory
+
   metric_query {
     id = "m1"
     metric {
@@ -108,6 +116,7 @@ resource "aws_cloudwatch_metric_alarm" "heap_usage_alarm" {
   }
 
   # Defines the metric query for total memory
+
   metric_query {
     id = "m2"
     metric {
@@ -123,6 +132,7 @@ resource "aws_cloudwatch_metric_alarm" "heap_usage_alarm" {
   }
 
   # Defines the expression to calculate heap usage percentage
+
   metric_query {
     id          = "e1"
     expression  = "(m1 / m2) * 100"
@@ -131,28 +141,52 @@ resource "aws_cloudwatch_metric_alarm" "heap_usage_alarm" {
   }
 }
 
-# Alarm for CPU Utilization for Autoscaling Group
+# Alarm for ASG Memory Used Percent
 
-resource "aws_cloudwatch_metric_alarm" "graphdb_cpu_utilization" {
-  alarm_name          = "al-${var.resource_name_prefix}-cpu-utilization"
-  alarm_description   = "Alarm will trigger if CPU utilization is above 80%"
-  comparison_operator = "GreaterThanThreshold"
+resource "aws_cloudwatch_metric_alarm" "asg_mem_used_percent" { # ASG-level memory alarm, notifies the GraphDB SNS topic
+  alarm_name          = "al-${var.resource_name_prefix}-asg-mem-used-percent"
+  alarm_description   = "ASG mem_used_percent >= ${var.graphdb_memory_utilization_threshold}%"
+  namespace           = "CWAgent" # presumably the namespace the CloudWatch agent publishes to — confirm against the agent config
+  metric_name         = "mem_used_percent"
+  statistic           = "Average"
+  unit                = "Percent"
+  period              = var.cloudwatch_period
   evaluation_periods  = var.cloudwatch_evaluation_periods
+  comparison_operator = "GreaterThanOrEqualToThreshold"
+  threshold           = var.graphdb_memory_utilization_threshold # same variable that drives the per-node heap usage alarm
+  treat_missing_data  = "missing" # "missing" is also the CloudWatch default behaviour
+  alarm_actions       = [aws_sns_topic.graphdb_sns_topic.arn]
+  ok_actions          = [aws_sns_topic.graphdb_sns_topic.arn] # recovery to OK notifies the same SNS topic as the alarm
+
+  dimensions = {
+    AutoScalingGroupName = var.resource_name_prefix # assumes the ASG is named exactly like the prefix — TODO confirm
+  }
+}
+
+# Alarm for ASG CPU Utilization
+
+resource "aws_cloudwatch_metric_alarm" "asg_cpu_utilization" { # NOTE(review): renamed from graphdb_cpu_utilization; expect the existing alarm to be replaced
+  alarm_name          = "al-${var.resource_name_prefix}-asg-cpu-utilization"
+  alarm_description   = "ASG average CPU >= ${var.cloudwatch_cpu_utilization_threshold}%"
+  namespace           = "AWS/EC2"
+  metric_name         = "CPUUtilization"
+  statistic           = "Average" # was "Maximum" before this change
+  unit                = "Percent"
   period              = var.cloudwatch_period
-  statistic           = "Maximum"
+  evaluation_periods  = var.cloudwatch_evaluation_periods
+  comparison_operator = "GreaterThanOrEqualToThreshold" # was "GreaterThanThreshold" before this change
   threshold           = var.cloudwatch_cpu_utilization_threshold
+  treat_missing_data  = "missing" # "missing" is also the CloudWatch default behaviour
   alarm_actions       = [aws_sns_topic.graphdb_sns_topic.arn]
-
-  metric_name = "CPUUtilization"
-  namespace   = "AWS/EC2"
-  unit        = "Percent"
+  ok_actions          = [aws_sns_topic.graphdb_sns_topic.arn] # recovery to OK notifies the same SNS topic as the alarm
 
   dimensions = {
     AutoScalingGroupName = var.resource_name_prefix
   }
 }
 
 # Alarm for nodes disconnected
+
 resource "aws_cloudwatch_metric_alarm" "graphdb_nodes_disconnected" {
   count = var.graphdb_node_count > 1 ? 1 : 0
 
@@ -165,6 +199,7 @@ resource "aws_cloudwatch_metric_alarm" "graphdb_nodes_disconnected" {
   comparison_operator = "GreaterThanThreshold"
   treat_missing_data  = "missing"
   alarm_actions       = [aws_sns_topic.graphdb_sns_topic.arn]
+  ok_actions          = [aws_sns_topic.graphdb_sns_topic.arn]
 
   metric_query {
     id          = "q1"
@@ -174,3 +209,25 @@ resource "aws_cloudwatch_metric_alarm" "graphdb_nodes_disconnected" {
     period      = var.cloudwatch_period
   }
 }
+
+# Alarm for ASG Root Disk Used Percent
+
+resource "aws_cloudwatch_metric_alarm" "asg_root_disk_used_percent" { # ASG-level disk usage alarm, notifies the GraphDB SNS topic
+  alarm_name          = "al-${var.resource_name_prefix}-asg-root-disk-used-percent"
+  alarm_description   = "ASG disk_used_percent on / >= 80%"
+  namespace           = "CWAgent"
+  metric_name         = "disk_used_percent" # NOTE(review): no "path" dimension is set below — confirm this series only covers "/"
+  statistic           = "Average"
+  unit                = "Percent"
+  period              = var.cloudwatch_period
+  evaluation_periods  = var.cloudwatch_evaluation_periods
+  comparison_operator = "GreaterThanOrEqualToThreshold"
+  threshold           = 80 # hard-coded; must stay in sync with the 80% written in alarm_description
+  treat_missing_data  = "missing" # "missing" is also the CloudWatch default behaviour
+  alarm_actions       = [aws_sns_topic.graphdb_sns_topic.arn]
+  ok_actions          = [aws_sns_topic.graphdb_sns_topic.arn] # recovery to OK notifies the same SNS topic as the alarm
+
+  dimensions = {
+    AutoScalingGroupName = var.resource_name_prefix # assumes the ASG is named exactly like the prefix — TODO confirm
+  }
+}
diff --git a/modules/monitoring/cloudwatch_agent_config.json.tpl b/modules/monitoring/cloudwatch_agent_config.json.tpl
@@ -121,9 +121,7 @@
             ]
         ],
         "append_dimensions": {
-         "ImageId": "$${aws:ImageId}",
          "InstanceId": "$${aws:InstanceId}",
-         "InstanceType": "$${aws:InstanceType}",
          "AutoScalingGroupName": "$${aws:AutoScalingGroupName}"
         },
         "metrics_collected": {

diff --git a/modules/monitoring/data.tf b/modules/monitoring/data.tf
@@ -0,0 +1,18 @@
+data "aws_instances" "asg_members" { # looks up instances by their Auto Scaling group tag
+  filter {
+    name   = "tag:aws:autoscaling:groupName" # presumably the tag Auto Scaling puts on instances it launches — confirm
+    values = [var.resource_name_prefix] # assumes the ASG is named exactly like the prefix — TODO confirm
+  }
+}
+
+data "aws_instance" "by_id" { # one lookup per instance ID returned by asg_members
+  for_each    = toset(data.aws_instances.asg_members.ids)
+  instance_id = each.value
+}
+
+locals {
+  id_to_name = { # map of instance ID => display name
+    for id, inst in data.aws_instance.by_id :
+    id => coalesce(try(inst.tags["Name"], null), id) # falls back to the instance ID when the Name tag is missing or empty
+  }
+}
diff --git a/modules/monitoring/graphdb_dashboard.json b/modules/monitoring/graphdb_dashboard.json
@@ -80,62 +80,64 @@
             "title": "GraphDB Memory Used % for the Auto Scaling Group"
         }
     },
-      {
-          "height": 6,
-          "width": 6,
-          "y": 6,
-          "x": 6,
-          "type": "metric",
-          "properties": {
-              "metrics": [
-                  [ { "expression": "SELECT AVG(graphdb_data_dir_free) FROM \"${resource_name_prefix}\" GROUP BY host", "label": "Query1", "id": "q1", "region": "${aws_region}" } ]
-              ],
-              "region": "${aws_region}",
-              "stacked": false,
-              "view": "timeSeries",
-              "period": 300,
-              "stat": "Average",
-              "yAxis": {
-                  "left": {
-                      "label": "Gigabytes",
-                      "showUnits": false
-                  },
-                  "right": {
-                      "label": "",
-                      "showUnits": false
-                  }
-              },
-              "title": "GraphDB Data Dir Free per instance"
-          }
-      },
-      {
-          "height": 6,
-          "width": 6,
-          "y": 6,
-          "x": 0,
-          "type": "metric",
-          "properties": {
-              "metrics": [
-                  [ { "expression": "SELECT AVG(graphdb_data_dir_used) FROM \"${resource_name_prefix}\" GROUP BY host", "label": "Query1", "id": "q1", "region": "${aws_region}" } ]
-              ],
-              "region": "${aws_region}",
-              "stacked": false,
-              "view": "timeSeries",
-              "period": 300,
-              "stat": "Average",
-              "yAxis": {
-                  "left": {
-                      "label": "Gigabytes",
-                      "showUnits": false
-                  },
-                  "right": {
-                      "label": "",
-                      "showUnits": false
-                  }
-              },
-              "title": "GraphDB Data Dir Used per instance"
-          }
-      },
+     {
+       "height": 6,
+       "width": 6,
+       "y": 6,
+       "x": 6,
+       "type": "metric",
+       "properties": {
+           "metrics": [
+               [ { "expression": "SELECT AVG(graphdb_data_dir_free) FROM \"${resource_name_prefix}\" GROUP BY host", "label": "Query1", "id": "q1", "region": "${aws_region}" } ]
+           ],
+           "region": "${aws_region}",
+           "stacked": false,
+           "view": "timeSeries",
+           "period": 300,
+           "stat": "Average",
+           "yAxis": {
+               "left": {
+                   "min": 0,
+                   "label": "Gigabytes",
+                   "showUnits": false
+               },
+               "right": {
+                   "label": "",
+                   "showUnits": false
+               }
+           },
+           "title": "GraphDB Data Dir Free per instance"
+       }
+     },
+     {
+       "height": 6,
+       "width": 6,
+       "y": 6,
+       "x": 0,
+       "type": "metric",
+       "properties": {
+           "metrics": [
+               [ { "expression": "SELECT AVG(graphdb_data_dir_used) FROM \"${resource_name_prefix}\" GROUP BY host", "label": "Query1", "id": "q1", "region": "${aws_region}" } ]
+           ],
+           "region": "${aws_region}",
+           "stacked": false,
+           "view": "timeSeries",
+           "period": 300,
+           "stat": "Average",
+           "yAxis": {
+               "left": {
+                   "min": 0,
+                   "label": "Gigabytes",
+                   "showUnits": false
+               },
+               "right": {
+                   "label": "",
+                   "showUnits": false
+               }
+           },
+           "title": "GraphDB Data Dir Used per instance"
+       }
+     },
       {
           "height": 6,
           "width": 6,
@@ -153,6 +155,7 @@
               "stat": "Average",
               "yAxis": {
                   "left": {
+                      "min": 0,
                       "label": "Count",
                       "showUnits": false
                   },