Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,7 +1,10 @@
# GraphDB AWS Terraform Module Changelog

## 2.6.0

* Added CloudWatch alarms per ASG for `disk_used_percent` on the root EBS volume and for `mem_used_percent`.
* Added notifications for when an alarm transitions from the ALARM state back to the OK state.
* Changed the comparison operator to use GreaterThanOrEqualToThreshold for most of the alarms.
* Added the `treat_missing_data` option to all alarms, set according to the type of each alarm.
* Added support for changing the volume size of the instance root EBS volume.

## 2.5.0
Expand Down
83 changes: 70 additions & 13 deletions modules/monitoring/alarms.tf
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# Alarms

# Attempting to recover metric filter

resource "aws_cloudwatch_log_metric_filter" "graphdb_attempting_to_recover_metric_filter" {
count = var.graphdb_node_count > 1 ? 1 : 0

Expand Down Expand Up @@ -33,14 +34,16 @@ resource "aws_cloudwatch_metric_alarm" "graphdb_attempting_to_recover_alarm" {
evaluation_periods = var.cloudwatch_evaluation_periods
threshold = "0"
alarm_actions = [aws_sns_topic.graphdb_sns_topic.arn]
ok_actions = [aws_sns_topic.graphdb_sns_topic.arn]
treat_missing_data = "missing"

depends_on = [aws_cloudwatch_log_metric_filter.graphdb_attempting_to_recover_metric_filter[0]]
}

# Log filter for low disk space messages in the logs

resource "aws_cloudwatch_log_metric_filter" "graphdb_low_disk_space_metric_filter" {
name = "al-${var.resource_name_prefix}-low-disk-space"
name = "al-${var.resource_name_prefix}-low-disk-space-GraphDB-disk"
pattern = "No space left on the device"
log_group_name = aws_cloudwatch_log_group.graphdb_log_group.name

Expand All @@ -57,7 +60,7 @@ resource "aws_cloudwatch_log_metric_filter" "graphdb_low_disk_space_metric_filte
# Alarm based on metric filter for Low Disk Space messages in the logs

resource "aws_cloudwatch_metric_alarm" "graphdb_low_disk_space_alarm" {
alarm_name = "al-${var.resource_name_prefix}-low-disk-space"
alarm_name = "al-${var.resource_name_prefix}-low-disk-space-GraphDB-disk"
alarm_description = "Low Disk Space"
comparison_operator = "GreaterThanThreshold"
metric_name = aws_cloudwatch_log_metric_filter.graphdb_low_disk_space_metric_filter.metric_transformation[0].name
Expand All @@ -67,12 +70,15 @@ resource "aws_cloudwatch_metric_alarm" "graphdb_low_disk_space_alarm" {
evaluation_periods = var.cloudwatch_evaluation_periods
threshold = "0"
alarm_actions = [aws_sns_topic.graphdb_sns_topic.arn]
ok_actions = [aws_sns_topic.graphdb_sns_topic.arn]
treat_missing_data = "missing"

depends_on = [aws_cloudwatch_log_metric_filter.graphdb_low_disk_space_metric_filter]
}

locals {
# Builds a list of instance hostnames

instance_hostnames = [
for i in range(1, var.graphdb_node_count + 1) :
var.route53_zone_dns_name != null ?
Expand All @@ -86,13 +92,15 @@ resource "aws_cloudwatch_metric_alarm" "heap_usage_alarm" {

alarm_name = "al-${var.resource_name_prefix}-heap-memory-usage-${each.key}"
alarm_description = "Triggers if ${each.key}'s heap usage exceeds threshold of its total memory"
comparison_operator = "GreaterThanThreshold"
comparison_operator = "GreaterThanOrEqualToThreshold"
threshold = var.graphdb_memory_utilization_threshold
evaluation_periods = 1
treat_missing_data = "missing"
alarm_actions = [aws_sns_topic.graphdb_sns_topic.arn]
ok_actions = [aws_sns_topic.graphdb_sns_topic.arn]

# Define the metric query for heap used memory

metric_query {
id = "m1"
metric {
Expand All @@ -108,6 +116,7 @@ resource "aws_cloudwatch_metric_alarm" "heap_usage_alarm" {
}

# Defines the metric query for total memory

metric_query {
id = "m2"
metric {
Expand All @@ -123,6 +132,7 @@ resource "aws_cloudwatch_metric_alarm" "heap_usage_alarm" {
}

# Defines the expression to calculate heap usage percentage

metric_query {
id = "e1"
expression = "(m1 / m2) * 100"
Expand All @@ -131,28 +141,52 @@ resource "aws_cloudwatch_metric_alarm" "heap_usage_alarm" {
}
}

# Alarm for CPU Utilization for Autoscaling Group
# Alarm for ASG Memory Used Percent

resource "aws_cloudwatch_metric_alarm" "graphdb_cpu_utilization" {
alarm_name = "al-${var.resource_name_prefix}-cpu-utilization"
alarm_description = "Alarm will trigger if CPU utilization is above 80%"
comparison_operator = "GreaterThanThreshold"
resource "aws_cloudwatch_metric_alarm" "asg_mem_used_percent" {
  # Identity of the alarm; the description embeds the configured threshold.
  alarm_name        = "al-${var.resource_name_prefix}-asg-mem-used-percent"
  alarm_description = "ASG mem_used_percent >= ${var.graphdb_memory_utilization_threshold}%"

  # Metric under evaluation: mem_used_percent from the CWAgent namespace,
  # selected by the AutoScalingGroupName dimension only.
  metric_name = "mem_used_percent"
  namespace   = "CWAgent"
  unit        = "Percent"

  dimensions = {
    AutoScalingGroupName = var.resource_name_prefix
  }

  # Evaluation: the per-period average is compared (>=) against the threshold.
  # The same threshold variable is used by the heap usage alarm in this module.
  statistic           = "Average"
  period              = var.cloudwatch_period
  evaluation_periods  = var.cloudwatch_evaluation_periods
  threshold           = var.graphdb_memory_utilization_threshold
  comparison_operator = "GreaterThanOrEqualToThreshold"
  treat_missing_data  = "missing"

  # Notifications: the same SNS topic receives both the ALARM and the OK transition.
  ok_actions    = [aws_sns_topic.graphdb_sns_topic.arn]
  alarm_actions = [aws_sns_topic.graphdb_sns_topic.arn]
}

# Alarm for ASG CPU Utilization

resource "aws_cloudwatch_metric_alarm" "asg_cpu_utilization" {
# Alarm on the AWS/EC2 CPUUtilization metric selected by the AutoScalingGroupName
# dimension. The description embeds the configured CPU threshold.
alarm_name = "al-${var.resource_name_prefix}-asg-cpu-utilization"
alarm_description = "ASG average CPU >= ${var.cloudwatch_cpu_utilization_threshold}%"
namespace = "AWS/EC2"
metric_name = "CPUUtilization"
statistic = "Average"
unit = "Percent"
period = var.cloudwatch_period
# NOTE(review): `statistic` is already set to "Average" a few lines above; this
# second assignment with a different value looks like residue of the previous
# version of the alarm — confirm which value is intended and keep only one.
statistic = "Maximum"
evaluation_periods = var.cloudwatch_evaluation_periods
comparison_operator = "GreaterThanOrEqualToThreshold"
threshold = var.cloudwatch_cpu_utilization_threshold
treat_missing_data = "missing"
# The same SNS topic is used for alarm_actions here and ok_actions below.
alarm_actions = [aws_sns_topic.graphdb_sns_topic.arn]

# NOTE(review): metric_name, namespace and unit below repeat the values already
# set earlier in this resource — presumably leftover lines; confirm and drop
# the duplicates.
metric_name = "CPUUtilization"
namespace = "AWS/EC2"
unit = "Percent"
ok_actions = [aws_sns_topic.graphdb_sns_topic.arn]

# var.resource_name_prefix is used as the Auto Scaling group name here.
dimensions = {
AutoScalingGroupName = var.resource_name_prefix
}
}

# Alarm for nodes disconnected

resource "aws_cloudwatch_metric_alarm" "graphdb_nodes_disconnected" {
count = var.graphdb_node_count > 1 ? 1 : 0

Expand All @@ -165,6 +199,7 @@ resource "aws_cloudwatch_metric_alarm" "graphdb_nodes_disconnected" {
comparison_operator = "GreaterThanThreshold"
treat_missing_data = "missing"
alarm_actions = [aws_sns_topic.graphdb_sns_topic.arn]
ok_actions = [aws_sns_topic.graphdb_sns_topic.arn]

metric_query {
id = "q1"
Expand All @@ -174,3 +209,25 @@ resource "aws_cloudwatch_metric_alarm" "graphdb_nodes_disconnected" {
period = var.cloudwatch_period
}
}

# Alarm for ASG Root Disk Used Percent

resource "aws_cloudwatch_metric_alarm" "asg_root_disk_used_percent" {
  # Identity of the alarm. The "80%" in the description is a literal and must be
  # kept in step with `threshold` below by hand.
  alarm_name        = "al-${var.resource_name_prefix}-asg-root-disk-used-percent"
  alarm_description = "ASG disk_used_percent on / >= 80%"

  # Metric under evaluation: disk_used_percent from the CWAgent namespace,
  # selected by the AutoScalingGroupName dimension only.
  # NOTE(review): the description says "on /", but no path/device dimension is
  # set here — confirm the agent publishes an ASG-level series that covers only
  # the root volume.
  metric_name = "disk_used_percent"
  namespace   = "CWAgent"
  unit        = "Percent"

  dimensions = {
    AutoScalingGroupName = var.resource_name_prefix
  }

  # Evaluation: the per-period average is compared (>=) against a fixed 80.
  statistic           = "Average"
  period              = var.cloudwatch_period
  evaluation_periods  = var.cloudwatch_evaluation_periods
  threshold           = 80
  comparison_operator = "GreaterThanOrEqualToThreshold"
  treat_missing_data  = "missing"

  # Notifications: the same SNS topic receives both the ALARM and the OK transition.
  ok_actions    = [aws_sns_topic.graphdb_sns_topic.arn]
  alarm_actions = [aws_sns_topic.graphdb_sns_topic.arn]
}
2 changes: 0 additions & 2 deletions modules/monitoring/cloudwatch_agent_config.json.tpl
Original file line number Diff line number Diff line change
Expand Up @@ -121,9 +121,7 @@
]
],
"append_dimensions": {
"ImageId": "$${aws:ImageId}",
"InstanceId": "$${aws:InstanceId}",
"InstanceType": "$${aws:InstanceType}",
"AutoScalingGroupName": "$${aws:AutoScalingGroupName}"
},
"metrics_collected": {
Expand Down
18 changes: 18 additions & 0 deletions modules/monitoring/data.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# Looks up the EC2 instances whose aws:autoscaling:groupName tag equals
# var.resource_name_prefix.
# NOTE(review): presumably the Auto Scaling group is named exactly
# var.resource_name_prefix — verify against the module that creates it.
data "aws_instances" "asg_members" {
  filter {
    values = [var.resource_name_prefix]
    name   = "tag:aws:autoscaling:groupName"
  }
}

# Reads each instance returned by data.aws_instances.asg_members, one lookup
# per instance ID, so that per-instance attributes such as tags are available.
data "aws_instance" "by_id" {
  # With a set, each.key and each.value are the same string (the instance ID).
  for_each = toset(data.aws_instances.asg_members.ids)

  instance_id = each.key
}

locals {
  # Map of instance ID => display name. The display name is the instance's
  # "Name" tag; when the tag cannot be read (try() yields null) or is empty
  # (coalesce() skips empty strings), the instance ID itself is used.
  id_to_name = {
    for instance_id, instance in data.aws_instance.by_id :
    instance_id => coalesce(
      try(instance.tags["Name"], null),
      instance_id
    )
  }
}
115 changes: 59 additions & 56 deletions modules/monitoring/graphdb_dashboard.json
Original file line number Diff line number Diff line change
Expand Up @@ -80,62 +80,64 @@
"title": "GraphDB Memory Used % for the Auto Scaling Group"
}
},
{
"height": 6,
"width": 6,
"y": 6,
"x": 6,
"type": "metric",
"properties": {
"metrics": [
[ { "expression": "SELECT AVG(graphdb_data_dir_free) FROM \"${resource_name_prefix}\" GROUP BY host", "label": "Query1", "id": "q1", "region": "${aws_region}" } ]
],
"region": "${aws_region}",
"stacked": false,
"view": "timeSeries",
"period": 300,
"stat": "Average",
"yAxis": {
"left": {
"label": "Gigabytes",
"showUnits": false
},
"right": {
"label": "",
"showUnits": false
}
},
"title": "GraphDB Data Dir Free per instance"
}
},
{
"height": 6,
"width": 6,
"y": 6,
"x": 0,
"type": "metric",
"properties": {
"metrics": [
[ { "expression": "SELECT AVG(graphdb_data_dir_used) FROM \"${resource_name_prefix}\" GROUP BY host", "label": "Query1", "id": "q1", "region": "${aws_region}" } ]
],
"region": "${aws_region}",
"stacked": false,
"view": "timeSeries",
"period": 300,
"stat": "Average",
"yAxis": {
"left": {
"label": "Gigabytes",
"showUnits": false
},
"right": {
"label": "",
"showUnits": false
}
},
"title": "GraphDB Data Dir Used per instance"
}
},
{
"height": 6,
"width": 6,
"y": 6,
"x": 6,
"type": "metric",
"properties": {
"metrics": [
[ { "expression": "SELECT AVG(graphdb_data_dir_free) FROM \"${resource_name_prefix}\" GROUP BY host", "label": "Query1", "id": "q1", "region": "${aws_region}" } ]
],
"region": "${aws_region}",
"stacked": false,
"view": "timeSeries",
"period": 300,
"stat": "Average",
"yAxis": {
"left": {
"min": 0,
"label": "Gigabytes",
"showUnits": false
},
"right": {
"label": "",
"showUnits": false
}
},
"title": "GraphDB Data Dir Free per instance"
}
},
{
"height": 6,
"width": 6,
"y": 6,
"x": 0,
"type": "metric",
"properties": {
"metrics": [
[ { "expression": "SELECT AVG(graphdb_data_dir_used) FROM \"${resource_name_prefix}\" GROUP BY host", "label": "Query1", "id": "q1", "region": "${aws_region}" } ]
],
"region": "${aws_region}",
"stacked": false,
"view": "timeSeries",
"period": 300,
"stat": "Average",
"yAxis": {
"left": {
"min": 0,
"label": "Gigabytes",
"showUnits": false
},
"right": {
"label": "",
"showUnits": false
}
},
"title": "GraphDB Data Dir Used per instance"
}
},
{
"height": 6,
"width": 6,
Expand All @@ -153,6 +155,7 @@
"stat": "Average",
"yAxis": {
"left": {
"min": 0,
"label": "Count",
"showUnits": false
},
Expand Down
Loading