diff --git a/CHANGELOG.md b/CHANGELOG.md index e31d18d..cd1a4b0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,7 +1,10 @@ # GraphDB AWS Terraform Module Changelog ## 2.6.0 - +* Added support for monitoring disk_used_percent for the root EBS volume and mem_used_percent via CloudWatch alarms per ASG. +* Added OK actions so that a notification is also sent when an alarm transitions from the ALARM state back to the OK state. +* Changed the comparison operator to use GreaterThanOrEqualToThreshold for most of the alarms. +* Added the treat_missing_data option to all alarms and set it based on the type of the alarm. * Added support for changing the volume size of the instance root EBS volume. ## 2.5.0 diff --git a/modules/monitoring/alarms.tf b/modules/monitoring/alarms.tf index ee0c1b6..3d36ae9 100644 --- a/modules/monitoring/alarms.tf +++ b/modules/monitoring/alarms.tf @@ -1,6 +1,7 @@ # Alarms # Attempting to recover metric filter + resource "aws_cloudwatch_log_metric_filter" "graphdb_attempting_to_recover_metric_filter" { count = var.graphdb_node_count > 1 ? 
1 : 0 @@ -33,6 +34,8 @@ resource "aws_cloudwatch_metric_alarm" "graphdb_attempting_to_recover_alarm" { evaluation_periods = var.cloudwatch_evaluation_periods threshold = "0" alarm_actions = [aws_sns_topic.graphdb_sns_topic.arn] + ok_actions = [aws_sns_topic.graphdb_sns_topic.arn] + treat_missing_data = "missing" depends_on = [aws_cloudwatch_log_metric_filter.graphdb_attempting_to_recover_metric_filter[0]] } @@ -40,7 +43,7 @@ resource "aws_cloudwatch_metric_alarm" "graphdb_attempting_to_recover_alarm" { # Log filter for low disk space messages in the logs resource "aws_cloudwatch_log_metric_filter" "graphdb_low_disk_space_metric_filter" { - name = "al-${var.resource_name_prefix}-low-disk-space" + name = "al-${var.resource_name_prefix}-low-disk-space-GraphDB-disk" pattern = "No space left on the device" log_group_name = aws_cloudwatch_log_group.graphdb_log_group.name @@ -57,7 +60,7 @@ resource "aws_cloudwatch_log_metric_filter" "graphdb_low_disk_space_metric_filte # Alarm based on metric filter for Low Disk Space messages in the logs resource "aws_cloudwatch_metric_alarm" "graphdb_low_disk_space_alarm" { - alarm_name = "al-${var.resource_name_prefix}-low-disk-space" + alarm_name = "al-${var.resource_name_prefix}-low-disk-space-GraphDB-disk" alarm_description = "Low Disk Space" comparison_operator = "GreaterThanThreshold" metric_name = aws_cloudwatch_log_metric_filter.graphdb_low_disk_space_metric_filter.metric_transformation[0].name @@ -67,12 +70,15 @@ resource "aws_cloudwatch_metric_alarm" "graphdb_low_disk_space_alarm" { evaluation_periods = var.cloudwatch_evaluation_periods threshold = "0" alarm_actions = [aws_sns_topic.graphdb_sns_topic.arn] + ok_actions = [aws_sns_topic.graphdb_sns_topic.arn] + treat_missing_data = "missing" depends_on = [aws_cloudwatch_log_metric_filter.graphdb_low_disk_space_metric_filter] } locals { # Builds a list of instance hostnames + instance_hostnames = [ for i in range(1, var.graphdb_node_count + 1) : var.route53_zone_dns_name != null 
? @@ -86,13 +92,15 @@ resource "aws_cloudwatch_metric_alarm" "heap_usage_alarm" { alarm_name = "al-${var.resource_name_prefix}-heap-memory-usage-${each.key}" alarm_description = "Triggers if ${each.key}'s heap usage exceeds threshold of its total memory" - comparison_operator = "GreaterThanThreshold" + comparison_operator = "GreaterThanOrEqualToThreshold" threshold = var.graphdb_memory_utilization_threshold evaluation_periods = 1 treat_missing_data = "missing" alarm_actions = [aws_sns_topic.graphdb_sns_topic.arn] + ok_actions = [aws_sns_topic.graphdb_sns_topic.arn] # Define the metric query for heap used memory + metric_query { id = "m1" metric { @@ -108,6 +116,7 @@ resource "aws_cloudwatch_metric_alarm" "heap_usage_alarm" { } # Defines the metric query for total memory + metric_query { id = "m2" metric { @@ -123,6 +132,7 @@ resource "aws_cloudwatch_metric_alarm" "heap_usage_alarm" { } # Defines the expression to calculate heap usage percentage + metric_query { id = "e1" expression = "(m1 / m2) * 100" @@ -131,21 +141,44 @@ resource "aws_cloudwatch_metric_alarm" "heap_usage_alarm" { } } -# Alarm for CPU Utilization for Autoscaling Group +# Alarm for ASG Memory Used Percent -resource "aws_cloudwatch_metric_alarm" "graphdb_cpu_utilization" { - alarm_name = "al-${var.resource_name_prefix}-cpu-utilization" - alarm_description = "Alarm will trigger if CPU utilization is above 80%" - comparison_operator = "GreaterThanThreshold" +resource "aws_cloudwatch_metric_alarm" "asg_mem_used_percent" { + alarm_name = "al-${var.resource_name_prefix}-asg-mem-used-percent" + alarm_description = "ASG mem_used_percent >= ${var.graphdb_memory_utilization_threshold}%" + namespace = "CWAgent" + metric_name = "mem_used_percent" + statistic = "Average" + unit = "Percent" + period = var.cloudwatch_period evaluation_periods = var.cloudwatch_evaluation_periods + comparison_operator = "GreaterThanOrEqualToThreshold" + threshold = var.graphdb_memory_utilization_threshold + treat_missing_data = 
"missing" + alarm_actions = [aws_sns_topic.graphdb_sns_topic.arn] + ok_actions = [aws_sns_topic.graphdb_sns_topic.arn] + + dimensions = { + AutoScalingGroupName = var.resource_name_prefix + } +} + +# Alarm for ASG CPU Utilization + +resource "aws_cloudwatch_metric_alarm" "asg_cpu_utilization" { + alarm_name = "al-${var.resource_name_prefix}-asg-cpu-utilization" + alarm_description = "ASG average CPU >= ${var.cloudwatch_cpu_utilization_threshold}%" + namespace = "AWS/EC2" + metric_name = "CPUUtilization" + statistic = "Average" + unit = "Percent" period = var.cloudwatch_period - statistic = "Maximum" + evaluation_periods = var.cloudwatch_evaluation_periods + comparison_operator = "GreaterThanOrEqualToThreshold" threshold = var.cloudwatch_cpu_utilization_threshold + treat_missing_data = "missing" alarm_actions = [aws_sns_topic.graphdb_sns_topic.arn] - - metric_name = "CPUUtilization" - namespace = "AWS/EC2" - unit = "Percent" + ok_actions = [aws_sns_topic.graphdb_sns_topic.arn] dimensions = { AutoScalingGroupName = var.resource_name_prefix @@ -153,6 +186,7 @@ resource "aws_cloudwatch_metric_alarm" "graphdb_cpu_utilization" { } # Alarm for nodes disconnected + resource "aws_cloudwatch_metric_alarm" "graphdb_nodes_disconnected" { count = var.graphdb_node_count > 1 ? 
1 : 0 @@ -165,6 +199,7 @@ resource "aws_cloudwatch_metric_alarm" "graphdb_nodes_disconnected" { comparison_operator = "GreaterThanThreshold" treat_missing_data = "missing" alarm_actions = [aws_sns_topic.graphdb_sns_topic.arn] + ok_actions = [aws_sns_topic.graphdb_sns_topic.arn] metric_query { id = "q1" @@ -174,3 +209,25 @@ resource "aws_cloudwatch_metric_alarm" "graphdb_nodes_disconnected" { period = var.cloudwatch_period } } + +# Alarm for ASG Root Disk Used Percent + +resource "aws_cloudwatch_metric_alarm" "asg_root_disk_used_percent" { + alarm_name = "al-${var.resource_name_prefix}-asg-root-disk-used-percent" + alarm_description = "ASG disk_used_percent on / >= 80%" + namespace = "CWAgent" + metric_name = "disk_used_percent" + statistic = "Average" + unit = "Percent" + period = var.cloudwatch_period + evaluation_periods = var.cloudwatch_evaluation_periods + comparison_operator = "GreaterThanOrEqualToThreshold" + threshold = 80 + treat_missing_data = "missing" + alarm_actions = [aws_sns_topic.graphdb_sns_topic.arn] + ok_actions = [aws_sns_topic.graphdb_sns_topic.arn] + + dimensions = { + AutoScalingGroupName = var.resource_name_prefix + } +} diff --git a/modules/monitoring/cloudwatch_agent_config.json.tpl b/modules/monitoring/cloudwatch_agent_config.json.tpl index c8068be..2472ce1 100644 --- a/modules/monitoring/cloudwatch_agent_config.json.tpl +++ b/modules/monitoring/cloudwatch_agent_config.json.tpl @@ -121,9 +121,7 @@ ] ], "append_dimensions": { - "ImageId": "$${aws:ImageId}", "InstanceId": "$${aws:InstanceId}", - "InstanceType": "$${aws:InstanceType}", "AutoScalingGroupName": "$${aws:AutoScalingGroupName}" }, "metrics_collected": { diff --git a/modules/monitoring/data.tf b/modules/monitoring/data.tf new file mode 100644 index 0000000..fb77e62 --- /dev/null +++ b/modules/monitoring/data.tf @@ -0,0 +1,18 @@ +data "aws_instances" "asg_members" { + filter { + name = "tag:aws:autoscaling:groupName" + values = [var.resource_name_prefix] + } +} + +data 
"aws_instance" "by_id" { + for_each = toset(data.aws_instances.asg_members.ids) + instance_id = each.value +} + +locals { + id_to_name = { + for id, inst in data.aws_instance.by_id : + id => coalesce(try(inst.tags["Name"], null), id) + } +} diff --git a/modules/monitoring/graphdb_dashboard.json b/modules/monitoring/graphdb_dashboard.json index 2e7a5b7..f481c79 100644 --- a/modules/monitoring/graphdb_dashboard.json +++ b/modules/monitoring/graphdb_dashboard.json @@ -80,62 +80,64 @@ "title": "GraphDB Memory Used % for the Auto Scaling Group" } }, - { - "height": 6, - "width": 6, - "y": 6, - "x": 6, - "type": "metric", - "properties": { - "metrics": [ - [ { "expression": "SELECT AVG(graphdb_data_dir_free) FROM \"${resource_name_prefix}\" GROUP BY host", "label": "Query1", "id": "q1", "region": "${aws_region}" } ] - ], - "region": "${aws_region}", - "stacked": false, - "view": "timeSeries", - "period": 300, - "stat": "Average", - "yAxis": { - "left": { - "label": "Gigabytes", - "showUnits": false - }, - "right": { - "label": "", - "showUnits": false - } - }, - "title": "GraphDB Data Dir Free per instance" - } - }, - { - "height": 6, - "width": 6, - "y": 6, - "x": 0, - "type": "metric", - "properties": { - "metrics": [ - [ { "expression": "SELECT AVG(graphdb_data_dir_used) FROM \"${resource_name_prefix}\" GROUP BY host", "label": "Query1", "id": "q1", "region": "${aws_region}" } ] - ], - "region": "${aws_region}", - "stacked": false, - "view": "timeSeries", - "period": 300, - "stat": "Average", - "yAxis": { - "left": { - "label": "Gigabytes", - "showUnits": false - }, - "right": { - "label": "", - "showUnits": false - } - }, - "title": "GraphDB Data Dir Used per instance" - } - }, + { + "height": 6, + "width": 6, + "y": 6, + "x": 6, + "type": "metric", + "properties": { + "metrics": [ + [ { "expression": "SELECT AVG(graphdb_data_dir_free) FROM \"${resource_name_prefix}\" GROUP BY host", "label": "Query1", "id": "q1", "region": "${aws_region}" } ] + ], + "region": 
"${aws_region}", + "stacked": false, + "view": "timeSeries", + "period": 300, + "stat": "Average", + "yAxis": { + "left": { + "min": 0, + "label": "Gigabytes", + "showUnits": false + }, + "right": { + "label": "", + "showUnits": false + } + }, + "title": "GraphDB Data Dir Free per instance" + } + }, + { + "height": 6, + "width": 6, + "y": 6, + "x": 0, + "type": "metric", + "properties": { + "metrics": [ + [ { "expression": "SELECT AVG(graphdb_data_dir_used) FROM \"${resource_name_prefix}\" GROUP BY host", "label": "Query1", "id": "q1", "region": "${aws_region}" } ] + ], + "region": "${aws_region}", + "stacked": false, + "view": "timeSeries", + "period": 300, + "stat": "Average", + "yAxis": { + "left": { + "min": 0, + "label": "Gigabytes", + "showUnits": false + }, + "right": { + "label": "", + "showUnits": false + } + }, + "title": "GraphDB Data Dir Used per instance" + } + }, { "height": 6, "width": 6, @@ -153,6 +155,7 @@ "stat": "Average", "yAxis": { "left": { + "min": 0, "label": "Count", "showUnits": false },