Skip to content

Commit

Permalink
Merge pull request #3628 from ministryofjustice/DPR2-121
Browse files Browse the repository at this point in the history
CloudWatch Alarm Reusable Module - Add Alarms
  • Loading branch information
harichintala1 authored Oct 13, 2023
2 parents 5ac8015 + badfa92 commit 73e423c
Show file tree
Hide file tree
Showing 7 changed files with 511 additions and 5 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -69,13 +69,14 @@
"include_dbuilder_gw_vpclink": false,
"enable_dbuilder_serverless_gw": true,
"enable_dbuilder_apikey": true,
"enable_slack_alerts": false,
"enable_slack_alerts": true,
"enable_pagerduty_alerts": false,
"enable_transfer_component_lambda": true,
"create_transfer_component_lambda_layer": true,
"setup_fake_data_dms_instance": true,
"enable_fake_data_dms_replication_task": true,
"setup_sonatype_secrets": true
"setup_sonatype_secrets": true,
"setup_cw_alarms": true
},
"test": {
"project_short_id": "dpr",
Expand Down Expand Up @@ -152,7 +153,8 @@
"create_transfer_component_lambda_layer": true,
"setup_fake_data_dms_instance": false,
"enable_fake_data_dms_replication_task": false,
"setup_sonatype_secrets": false
"setup_sonatype_secrets": false,
"setup_cw_alarms": true
},
"preproduction": {
"project_short_id": "dpr",
Expand Down Expand Up @@ -229,7 +231,8 @@
"create_transfer_component_lambda_layer": true,
"setup_fake_data_dms_instance": false,
"enable_fake_data_dms_replication_task": false,
"setup_sonatype_secrets": false
"setup_sonatype_secrets": false,
"setup_cw_alarms": false
},
"production": {
"project_short_id": "dpr",
Expand Down Expand Up @@ -306,7 +309,8 @@
"create_transfer_component_lambda_layer": true,
"setup_fake_data_dms_instance": false,
"enable_fake_data_dms_replication_task": false,
"setup_sonatype_secrets": false
"setup_sonatype_secrets": false,
"setup_cw_alarms": false
}
}
}
3 changes: 3 additions & 0 deletions terraform/environments/digital-prison-reporting/locals.tf
Original file line number Diff line number Diff line change
Expand Up @@ -176,6 +176,9 @@ locals {
create_transfercomp_lambda_layer = local.application_data.accounts[local.environment].create_transfer_component_lambda_layer
lambda_transfercomp_layer_name = "${local.project}-redhift-jdbc-dependency-layer"

# Enable CW alarms
enable_cw_alarm = local.application_data.accounts[local.environment].setup_cw_alarms

# Sonatype Secrets
setup_sonatype_secrets = local.application_data.accounts[local.environment].setup_sonatype_secrets

Expand Down
290 changes: 290 additions & 0 deletions terraform/environments/digital-prison-reporting/metric_alarms.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,290 @@
## Monitoring Amazon Redshift, https://docs.aws.amazon.com/redshift/latest/mgmt/metrics-listing.html
# Alarm - "Redshift Health Status"
# Indicates the health of the cluster. Every minute the cluster connects to its database and performs a simple query.
# If it is able to perform this operation successfully, the cluster is considered healthy.
# Otherwise, the cluster is unhealthy. An unhealthy status can occur when the cluster database is under extremely heavy load or if there is a configuration problem with a database on the cluster.
module "dpr_redshift_health_status_check" {
source = "./modules/cw_alarm"
create_metric_alarm = local.enable_cw_alarm

alarm_name = "dpr-redshift-health-status"
alarm_description = "ATTENTION: DPR Redshift HealthStatus Monitor, Please investigate Redshift Errors !"
comparison_operator = "LessThanThreshold"
evaluation_periods = 1 # Boolean
threshold = 1
period = 60

namespace = "AWS/Redshift"
metric_name = "HealthStatus"
statistic = "Maximum"

alarm_actions = [module.notifications_sns.sns_topic_arn]
}


## Monitoring AWS DMS tasks, https://docs.aws.amazon.com/dms/latest/userguide/CHAP_Monitoring.html
# Alarm - "DMS Stop Monitor"
module "dpr_dms_stoptask_check" {
source = "./modules/cw_alarm"
create_metric_alarm = local.enable_cw_alarm

alarm_name = "dpr-dms-stop-task"
alarm_description = "ATTENTION: DPR DMS Replication Stop Monitor, Please investigate DMS Replication Task Errors !"
comparison_operator = "GreaterThanThreshold"
threshold = 0 # Boolean
period = 30
evaluation_periods = 1

dimensions = {
"Class" = "None"
"Resource" = "StopReplicationTask"
"Service" = "Database Migration Service"
"Type" = "API"
}

namespace = "AWS/Usage"
metric_name = "CallCount"
statistic = "Maximum"

alarm_actions = [module.notifications_sns.sns_topic_arn]
}

# Alarm - "DMS Start Monitor"
module "dpr_dms_starttask_check" {
source = "./modules/cw_alarm"
create_metric_alarm = local.enable_cw_alarm

alarm_name = "dpr-dms-start-task"
alarm_description = "ATTENTION: DPR DMS Replication Start Monitor, Please investigate DMS Replication Task Errors !"
comparison_operator = "GreaterThanThreshold"
threshold = 0 # Boolean
period = 30
evaluation_periods = 1

dimensions = {
"Class" = "None"
"Resource" = "StartReplicationTask"
"Service" = "Database Migration Service"
"Type" = "API"
}

namespace = "AWS/Usage"
metric_name = "CallCount"
statistic = "Maximum"

alarm_actions = [module.notifications_sns.sns_topic_arn]
}

# Alarm - "DMS CPU Utilization Monitor"
module "dpr_dms_cpu_utils_check" {
source = "./modules/cw_alarm"
create_metric_alarm = local.enable_cw_alarm

alarm_name = "dpr-dms-cpu-utilization"
alarm_description = "ATTENTION: DPR DMS Instance CPU Monitor, Please investigate High CPU Utilization for DMS Instance !"
comparison_operator = "GreaterThanThreshold"
period = 300
evaluation_periods = 1
threshold = 80 # 80% CPU

dimensions = {
"ReplicationInstanceIdentifier" = module.dms_nomis_ingestor.dms_instance_name
}

namespace = "AWS/DMS"
metric_name = "CPUUtilization"
statistic = "Average"

alarm_actions = [module.notifications_sns.sns_topic_arn]
}

# Alarm - "DMS FreeMemory Monitor"
module "dpr_dms_free_memory_check" {
source = "./modules/cw_alarm"
create_metric_alarm = local.enable_cw_alarm

alarm_name = "dpr-dms-free-memory"
alarm_description = "ATTENTION: DPR DMS Instance FreeMemory Monitor, Please investigate FreeMemory is Below 1Gb DMS Instance !"
comparison_operator = "LessThanThreshold"
period = 300
evaluation_periods = 1
threshold = 1000000000 # 1Gb

dimensions = {
"ReplicationInstanceIdentifier" = module.dms_nomis_ingestor.dms_instance_name
}

namespace = "AWS/DMS"
metric_name = "FreeMemory"
statistic = "Average"

alarm_actions = [module.notifications_sns.sns_topic_arn]
}

# Alarm - "DMS FreeableMemory Monitor"
module "dpr_dms_freeable_memory_check" {
source = "./modules/cw_alarm"
create_metric_alarm = local.enable_cw_alarm

alarm_name = "dpr-dms-freeable-memory"
alarm_description = "ATTENTION: DPR DMS Instance Freeable Memory Monitor, Please investigate low FreeableMemory for Nomis DMS Instance !"
comparison_operator = "LessThanOrEqualToThreshold"
period = 300
evaluation_periods = 1
threshold = 1000000000 # 1Gb

dimensions = {
"ReplicationInstanceIdentifier" = module.dms_nomis_ingestor.dms_instance_name
}

namespace = "AWS/DMS"
metric_name = "FreeableMemory"
statistic = "Average"

alarm_actions = [module.notifications_sns.sns_topic_arn]
}

# Alarm - "DMS SWAP Usage Monitor"
# https://repost.aws/knowledge-center/dms-swap-files-consuming-space
module "dpr_dms_swap_usage_check" {
source = "./modules/cw_alarm"
create_metric_alarm = local.enable_cw_alarm

alarm_name = "dpr-dms-swap-usage"
alarm_description = "ATTENTION: DPR DMS Instance SWAP Usage Monitor, Please investigate SWAP Usage is Above 0.75 Gb for DMS Instance!"
comparison_operator = "GreaterThanThreshold"
period = 300
evaluation_periods = 1
threshold = 750000000 # 0.75Gb

dimensions = {
"ReplicationInstanceIdentifier" = module.dms_nomis_ingestor.dms_instance_name
}

namespace = "AWS/DMS"
metric_name = "SwapUsage"
statistic = "Average"

alarm_actions = [module.notifications_sns.sns_topic_arn]
}

# Alarm - "DMS Network Transmit Throughput Monitor"
# https://repost.aws/knowledge-center/dms-swap-files-consuming-space
module "dpr_dms_network_transmit_throughput" {
source = "./modules/cw_alarm"
create_metric_alarm = local.enable_cw_alarm

alarm_name = "dpr-dms-network-transmit-throughput"
alarm_description = "ATTENTION: DPR DMS Instance Network Throughput Monitor, Please investigate Network Transmit Throughput is below Threshold 1000 Bytes!"
comparison_operator = "LessThanThreshold"
period = 300
evaluation_periods = 1
threshold = 10 # 10 Bytes

namespace = "AWS/DMS"
metric_name = "NetworkTransmitThroughput"
statistic = "Average"

alarm_actions = [module.notifications_sns.sns_topic_arn]
}

# Alarm - "DMS Network Receive Throughput Monitor"
# https://repost.aws/knowledge-center/dms-swap-files-consuming-space
module "dpr_dms_network_receive_throughput" {
source = "./modules/cw_alarm"
create_metric_alarm = local.enable_cw_alarm

alarm_name = "dpr-dms-network-receive-throughput"
alarm_description = "ATTENTION: DPR DMS Instance Network Throughput Monitor, Please investigate Network Receive Throughput is below Threshold 10 Bytes!"
comparison_operator = "LessThanThreshold"
period = 300
evaluation_periods = 1
threshold = 10 # 10 Bytes

namespace = "AWS/DMS"
metric_name = "NetworkReceiveThroughput"
statistic = "Average"

alarm_actions = [module.notifications_sns.sns_topic_arn]
}

# DMS, CDCLatencySource
# The gap, in seconds, between the last event captured from the source endpoint and current system time stamp of the AWS DMS instance.
# CDCLatencySource represents the latency between source and replication instance.
# High CDCLatencySource means the process of capturing changes from source is delayed.
module "dpr_dms_cdc_source_latency" {
source = "./modules/cw_alarm"
create_metric_alarm = local.enable_cw_alarm

alarm_name = "dpr-dms-cdc-source-latency"
alarm_description = "ATTENTION: P1 Incident: DPR DMS CDC Source Latency, Please investigate CDC Source Latency for Oracle Nomis is greater than 60 mins !"
comparison_operator = "GreaterThanThreshold"
period = 900
evaluation_periods = 1
threshold = 3600 # 60 mins

dimensions = {
"ReplicationInstanceIdentifier" = module.dms_nomis_ingestor.dms_instance_name
"ReplicationTaskIdentifier" = module.dms_nomis_ingestor.dms_replication_task_name
}

namespace = "AWS/DMS"
metric_name = "CDCLatencySource"
statistic = "Average"

alarm_actions = [module.notifications_sns.sns_topic_arn]
}

# DMS, CDCLatencyTarget
# The gap, in seconds, between the first event timestamp waiting to commit on the target and the current timestamp of the AWS DMS instance.
# Target latency is the difference between the replication instance server time and the oldest unconfirmed event id forwarded to a target component.
module "dpr_dms_cdc_target_latency" {
source = "./modules/cw_alarm"
create_metric_alarm = local.enable_cw_alarm

alarm_name = "dpr-dms-cdc-target-latency"
alarm_description = "ATTENTION: P1 Incident: DPR DMS CDC Target Latency, Please investigate CDC Target Latency for Oracle Nomis is greater than 60 mins !"
comparison_operator = "GreaterThanThreshold"
period = 900
evaluation_periods = 1
threshold = 3600 # 60 mins

dimensions = {
"ReplicationInstanceIdentifier" = module.dms_nomis_ingestor.dms_instance_name
"ReplicationTaskIdentifier" = module.dms_nomis_ingestor.dms_replication_task_name
}

namespace = "AWS/DMS"
metric_name = "CDCLatencyTarget"
statistic = "Average"

alarm_actions = [module.notifications_sns.sns_topic_arn]
}

# DMS CDCIncomingChanges,
# The total number of change events at a point-in-time that are waiting to be applied to the target.
# Note that this is not the same as a measure of the transaction change rate of the source endpoint.
# A large number for this metric usually indicates AWS DMS is unable to apply captured changes in a timely manner,
# thus causing high target latency.
module "dpr_dms_cdc_incoming_events" {
source = "./modules/cw_alarm"
create_metric_alarm = local.enable_cw_alarm

alarm_name = "dpr-dms-cdc-incoming-events"
alarm_description = "ATTENTION: P1 Incident: DPR DMS CDC Incoming Events Alert, Please investigate CDC Incoming Events are waiting to be applied for Oracle Nomis !"
comparison_operator = "GreaterThanThreshold"
period = 60
evaluation_periods = 1
threshold = 100 # 100 events

dimensions = {
"ReplicationInstanceIdentifier" = module.dms_nomis_ingestor.dms_instance_name
"ReplicationTaskIdentifier" = module.dms_nomis_ingestor.dms_replication_task_name
}

namespace = "AWS/DMS"
metric_name = "CDCIncomingChanges"
statistic = "Maximum"

alarm_actions = [module.notifications_sns.sns_topic_arn]
}
Loading

0 comments on commit 73e423c

Please sign in to comment.