diff --git a/terraform/environments/digital-prison-reporting/application_variables.json b/terraform/environments/digital-prison-reporting/application_variables.json
index f3d886f486d..466dbd923a2 100644
--- a/terraform/environments/digital-prison-reporting/application_variables.json
+++ b/terraform/environments/digital-prison-reporting/application_variables.json
@@ -69,13 +69,14 @@
       "include_dbuilder_gw_vpclink": false,
       "enable_dbuilder_serverless_gw": true,
       "enable_dbuilder_apikey": true,
-      "enable_slack_alerts": false,
+      "enable_slack_alerts": true,
       "enable_pagerduty_alerts": false,
       "enable_transfer_component_lambda": true,
       "create_transfer_component_lambda_layer": true,
       "setup_fake_data_dms_instance": true,
       "enable_fake_data_dms_replication_task": true,
-      "setup_sonatype_secrets": true
+      "setup_sonatype_secrets": true,
+      "setup_cw_alarms": true
     },
     "test": {
       "project_short_id": "dpr",
@@ -152,7 +153,8 @@
       "create_transfer_component_lambda_layer": true,
       "setup_fake_data_dms_instance": false,
       "enable_fake_data_dms_replication_task": false,
-      "setup_sonatype_secrets": false
+      "setup_sonatype_secrets": false,
+      "setup_cw_alarms": true
     },
     "preproduction": {
       "project_short_id": "dpr",
@@ -229,7 +231,8 @@
       "create_transfer_component_lambda_layer": true,
       "setup_fake_data_dms_instance": false,
       "enable_fake_data_dms_replication_task": false,
-      "setup_sonatype_secrets": false
+      "setup_sonatype_secrets": false,
+      "setup_cw_alarms": false
     },
     "production": {
       "project_short_id": "dpr",
@@ -306,7 +309,8 @@
       "create_transfer_component_lambda_layer": true,
       "setup_fake_data_dms_instance": false,
       "enable_fake_data_dms_replication_task": false,
-      "setup_sonatype_secrets": false
+      "setup_sonatype_secrets": false,
+      "setup_cw_alarms": false
     }
   }
 }
diff --git a/terraform/environments/digital-prison-reporting/locals.tf b/terraform/environments/digital-prison-reporting/locals.tf
index 89154f9957a..01fcc0bff5a 100644
--- a/terraform/environments/digital-prison-reporting/locals.tf
+++ b/terraform/environments/digital-prison-reporting/locals.tf
@@ -176,6 +176,9 @@ locals {
   create_transfercomp_lambda_layer = local.application_data.accounts[local.environment].create_transfer_component_lambda_layer
   lambda_transfercomp_layer_name   = "${local.project}-redhift-jdbc-dependency-layer"
 
+  # Enable CW alarms
+  enable_cw_alarm = local.application_data.accounts[local.environment].setup_cw_alarms
+
   # Sonatype Secrets
   setup_sonatype_secrets = local.application_data.accounts[local.environment].setup_sonatype_secrets
 
diff --git a/terraform/environments/digital-prison-reporting/metric_alarms.tf b/terraform/environments/digital-prison-reporting/metric_alarms.tf
new file mode 100644
index 00000000000..aea17c284e0
--- /dev/null
+++ b/terraform/environments/digital-prison-reporting/metric_alarms.tf
@@ -0,0 +1,303 @@
+## Monitoring Amazon Redshift, https://docs.aws.amazon.com/redshift/latest/mgmt/metrics-listing.html
+# Alarm - "Redshift Health Status"
+# Indicates the health of the cluster. Every minute the cluster connects to its database and performs a simple query.
+# If it is able to perform this operation successfully, the cluster is considered healthy.
+# Otherwise, the cluster is unhealthy. An unhealthy status can occur when the cluster database is under extremely heavy load or if there is a configuration problem with a database on the cluster.
+# NOTE(review): no dimensions are passed, so this alarm targets a dimensionless HealthStatus metric.
+# Redshift appears to publish HealthStatus per ClusterIdentifier; confirm and add
+# dimensions = { "ClusterIdentifier" = <cluster id> }, otherwise the alarm may never receive data.
+module "dpr_redshift_health_status_check" {
+  source              = "./modules/cw_alarm"
+  create_metric_alarm = local.enable_cw_alarm
+
+  alarm_name          = "dpr-redshift-health-status"
+  alarm_description   = "ATTENTION: DPR Redshift HealthStatus Monitor, Please investigate Redshift Errors !"
+  comparison_operator = "LessThanThreshold"
+  evaluation_periods  = 1
+  threshold           = 1 # HealthStatus is reported as 1 (healthy) or 0 (unhealthy)
+  period              = 60
+
+  namespace   = "AWS/Redshift"
+  metric_name = "HealthStatus"
+  statistic   = "Maximum"
+
+  alarm_actions = [module.notifications_sns.sns_topic_arn]
+}
+
+
+## Monitoring AWS DMS tasks, https://docs.aws.amazon.com/dms/latest/userguide/CHAP_Monitoring.html
+# Alarm - "DMS Stop Monitor"
+module "dpr_dms_stoptask_check" {
+  source              = "./modules/cw_alarm"
+  create_metric_alarm = local.enable_cw_alarm
+
+  alarm_name          = "dpr-dms-stop-task"
+  alarm_description   = "ATTENTION: DPR DMS Replication Stop Monitor, Please investigate DMS Replication Task Errors !"
+  comparison_operator = "GreaterThanThreshold"
+  threshold           = 0  # alarm on any StopReplicationTask API call
+  period              = 60 # 10/30s periods are only valid for high-resolution metrics; AWS/Usage is 1-minute
+  evaluation_periods  = 1
+
+  dimensions = {
+    "Class"    = "None"
+    "Resource" = "StopReplicationTask"
+    "Service"  = "Database Migration Service"
+    "Type"     = "API"
+  }
+
+  namespace   = "AWS/Usage"
+  metric_name = "CallCount"
+  statistic   = "Maximum"
+
+  alarm_actions = [module.notifications_sns.sns_topic_arn]
+}
+
+# Alarm - "DMS Start Monitor"
+module "dpr_dms_starttask_check" {
+  source              = "./modules/cw_alarm"
+  create_metric_alarm = local.enable_cw_alarm
+
+  alarm_name          = "dpr-dms-start-task"
+  alarm_description   = "ATTENTION: DPR DMS Replication Start Monitor, Please investigate DMS Replication Task Errors !"
+  comparison_operator = "GreaterThanThreshold"
+  threshold           = 0  # alarm on any StartReplicationTask API call
+  period              = 60 # 10/30s periods are only valid for high-resolution metrics; AWS/Usage is 1-minute
+  evaluation_periods  = 1
+
+  dimensions = {
+    "Class"    = "None"
+    "Resource" = "StartReplicationTask"
+    "Service"  = "Database Migration Service"
+    "Type"     = "API"
+  }
+
+  namespace   = "AWS/Usage"
+  metric_name = "CallCount"
+  statistic   = "Maximum"
+
+  alarm_actions = [module.notifications_sns.sns_topic_arn]
+}
+
+# Alarm - "DMS CPU Utilization Monitor"
+module "dpr_dms_cpu_utils_check" {
+  source              = "./modules/cw_alarm"
+  create_metric_alarm = local.enable_cw_alarm
+
+  alarm_name          = "dpr-dms-cpu-utilization"
+  alarm_description   = "ATTENTION: DPR DMS Instance CPU Monitor, Please investigate High CPU Utilization for DMS Instance !"
+  comparison_operator = "GreaterThanThreshold"
+  period              = 300
+  evaluation_periods  = 1
+  threshold           = 80 # 80% CPU
+
+  dimensions = {
+    "ReplicationInstanceIdentifier" = module.dms_nomis_ingestor.dms_instance_name
+  }
+
+  namespace   = "AWS/DMS"
+  metric_name = "CPUUtilization"
+  statistic   = "Average"
+
+  alarm_actions = [module.notifications_sns.sns_topic_arn]
+}
+
+# Alarm - "DMS FreeMemory Monitor"
+module "dpr_dms_free_memory_check" {
+  source              = "./modules/cw_alarm"
+  create_metric_alarm = local.enable_cw_alarm
+
+  alarm_name          = "dpr-dms-free-memory"
+  alarm_description   = "ATTENTION: DPR DMS Instance FreeMemory Monitor, Please investigate FreeMemory is Below 1Gb DMS Instance !"
+  comparison_operator = "LessThanThreshold"
+  period              = 300
+  evaluation_periods  = 1
+  threshold           = 1000000000 # 1Gb
+
+  dimensions = {
+    "ReplicationInstanceIdentifier" = module.dms_nomis_ingestor.dms_instance_name
+  }
+
+  namespace   = "AWS/DMS"
+  metric_name = "FreeMemory"
+  statistic   = "Average"
+
+  alarm_actions = [module.notifications_sns.sns_topic_arn]
+}
+
+# Alarm - "DMS FreeableMemory Monitor"
+module "dpr_dms_freeable_memory_check" {
+  source              = "./modules/cw_alarm"
+  create_metric_alarm = local.enable_cw_alarm
+
+  alarm_name          = "dpr-dms-freeable-memory"
+  alarm_description   = "ATTENTION: DPR DMS Instance Freeable Memory Monitor, Please investigate low FreeableMemory for Nomis DMS Instance !"
+  comparison_operator = "LessThanOrEqualToThreshold"
+  period              = 300
+  evaluation_periods  = 1
+  threshold           = 1000000000 # 1Gb
+
+  dimensions = {
+    "ReplicationInstanceIdentifier" = module.dms_nomis_ingestor.dms_instance_name
+  }
+
+  namespace   = "AWS/DMS"
+  metric_name = "FreeableMemory"
+  statistic   = "Average"
+
+  alarm_actions = [module.notifications_sns.sns_topic_arn]
+}
+
+# Alarm - "DMS SWAP Usage Monitor"
+# https://repost.aws/knowledge-center/dms-swap-files-consuming-space
+module "dpr_dms_swap_usage_check" {
+  source              = "./modules/cw_alarm"
+  create_metric_alarm = local.enable_cw_alarm
+
+  alarm_name          = "dpr-dms-swap-usage"
+  alarm_description   = "ATTENTION: DPR DMS Instance SWAP Usage Monitor, Please investigate SWAP Usage is Above 0.75 Gb for DMS Instance!"
+  comparison_operator = "GreaterThanThreshold"
+  period              = 300
+  evaluation_periods  = 1
+  threshold           = 750000000 # 0.75Gb
+
+  dimensions = {
+    "ReplicationInstanceIdentifier" = module.dms_nomis_ingestor.dms_instance_name
+  }
+
+  namespace   = "AWS/DMS"
+  metric_name = "SwapUsage"
+  statistic   = "Average"
+
+  alarm_actions = [module.notifications_sns.sns_topic_arn]
+}
+
+# Alarm - "DMS Network Transmit Throughput Monitor"
+# https://docs.aws.amazon.com/dms/latest/userguide/CHAP_Monitoring.html
+module "dpr_dms_network_transmit_throughput" {
+  source              = "./modules/cw_alarm"
+  create_metric_alarm = local.enable_cw_alarm
+
+  alarm_name          = "dpr-dms-network-transmit-throughput"
+  alarm_description   = "ATTENTION: DPR DMS Instance Network Throughput Monitor, Please investigate Network Transmit Throughput is below Threshold 10 Bytes!"
+  comparison_operator = "LessThanThreshold"
+  period              = 300
+  evaluation_periods  = 1
+  threshold           = 10 # 10 Bytes
+
+  dimensions = {
+    "ReplicationInstanceIdentifier" = module.dms_nomis_ingestor.dms_instance_name
+  }
+
+  namespace   = "AWS/DMS"
+  metric_name = "NetworkTransmitThroughput"
+  statistic   = "Average"
+
+  alarm_actions = [module.notifications_sns.sns_topic_arn]
+}
+
+# Alarm - "DMS Network Receive Throughput Monitor"
+# https://docs.aws.amazon.com/dms/latest/userguide/CHAP_Monitoring.html
+module "dpr_dms_network_receive_throughput" {
+  source              = "./modules/cw_alarm"
+  create_metric_alarm = local.enable_cw_alarm
+
+  alarm_name          = "dpr-dms-network-receive-throughput"
+  alarm_description   = "ATTENTION: DPR DMS Instance Network Throughput Monitor, Please investigate Network Receive Throughput is below Threshold 10 Bytes!"
+  comparison_operator = "LessThanThreshold"
+  period              = 300
+  evaluation_periods  = 1
+  threshold           = 10 # 10 Bytes
+
+  dimensions = {
+    "ReplicationInstanceIdentifier" = module.dms_nomis_ingestor.dms_instance_name
+  }
+
+  namespace   = "AWS/DMS"
+  metric_name = "NetworkReceiveThroughput"
+  statistic   = "Average"
+
+  alarm_actions = [module.notifications_sns.sns_topic_arn]
+}
+
+# DMS, CDCLatencySource
+# The gap, in seconds, between the last event captured from the source endpoint and current system time stamp of the AWS DMS instance.
+# CDCLatencySource represents the latency between source and replication instance.
+# High CDCLatencySource means the process of capturing changes from source is delayed.
+# NOTE(review): applies to all three CDC alarms in this file - CloudWatch may key ReplicationTaskIdentifier on the
+# task's resource ID (last segment of the task ARN) rather than the task name; confirm dms_replication_task_name matches.
+module "dpr_dms_cdc_source_latency" {
+  source              = "./modules/cw_alarm"
+  create_metric_alarm = local.enable_cw_alarm
+
+  alarm_name          = "dpr-dms-cdc-source-latency"
+  alarm_description   = "ATTENTION: P1 Incident: DPR DMS CDC Source Latency, Please investigate CDC Source Latency for Oracle Nomis is greater than 60 mins !"
+  comparison_operator = "GreaterThanThreshold"
+  period              = 900
+  evaluation_periods  = 1
+  threshold           = 3600 # 60 mins
+
+  dimensions = {
+    "ReplicationInstanceIdentifier" = module.dms_nomis_ingestor.dms_instance_name
+    "ReplicationTaskIdentifier"     = module.dms_nomis_ingestor.dms_replication_task_name
+  }
+
+  namespace   = "AWS/DMS"
+  metric_name = "CDCLatencySource"
+  statistic   = "Average"
+
+  alarm_actions = [module.notifications_sns.sns_topic_arn]
+}
+
+# DMS, CDCLatencyTarget
+# The gap, in seconds, between the first event timestamp waiting to commit on the target and the current timestamp of the AWS DMS instance.
+# Target latency is the difference between the replication instance server time and the oldest unconfirmed event id forwarded to a target component.
+module "dpr_dms_cdc_target_latency" {
+  source              = "./modules/cw_alarm"
+  create_metric_alarm = local.enable_cw_alarm
+
+  alarm_name          = "dpr-dms-cdc-target-latency"
+  alarm_description   = "ATTENTION: P1 Incident: DPR DMS CDC Target Latency, Please investigate CDC Target Latency for Oracle Nomis is greater than 60 mins !"
+  comparison_operator = "GreaterThanThreshold"
+  period              = 900
+  evaluation_periods  = 1
+  threshold           = 3600 # 60 mins
+
+  dimensions = {
+    "ReplicationInstanceIdentifier" = module.dms_nomis_ingestor.dms_instance_name
+    "ReplicationTaskIdentifier"     = module.dms_nomis_ingestor.dms_replication_task_name
+  }
+
+  namespace   = "AWS/DMS"
+  metric_name = "CDCLatencyTarget"
+  statistic   = "Average"
+
+  alarm_actions = [module.notifications_sns.sns_topic_arn]
+}
+
+# DMS CDCIncomingChanges,
+# The total number of change events at a point-in-time that are waiting to be applied to the target.
+# Note that this is not the same as a measure of the transaction change rate of the source endpoint.
+# A large number for this metric usually indicates AWS DMS is unable to apply captured changes in a timely manner,
+# thus causing high target latency.
+module "dpr_dms_cdc_incoming_events" {
+  source              = "./modules/cw_alarm"
+  create_metric_alarm = local.enable_cw_alarm
+
+  alarm_name          = "dpr-dms-cdc-incoming-events"
+  alarm_description   = "ATTENTION: P1 Incident: DPR DMS CDC Incoming Events Alert, Please investigate CDC Incoming Events are waiting to be applied for Oracle Nomis !"
+  comparison_operator = "GreaterThanThreshold"
+  period              = 60
+  evaluation_periods  = 1
+  threshold           = 100 # 100 events
+
+  dimensions = {
+    "ReplicationInstanceIdentifier" = module.dms_nomis_ingestor.dms_instance_name
+    "ReplicationTaskIdentifier"     = module.dms_nomis_ingestor.dms_replication_task_name
+  }
+
+  namespace   = "AWS/DMS"
+  metric_name = "CDCIncomingChanges"
+  statistic   = "Maximum"
+
+  alarm_actions = [module.notifications_sns.sns_topic_arn]
+}
diff --git a/terraform/environments/digital-prison-reporting/modules/cw_alarm/main.tf b/terraform/environments/digital-prison-reporting/modules/cw_alarm/main.tf
new file mode 100644
index 00000000000..a4f5201df77
--- /dev/null
+++ b/terraform/environments/digital-prison-reporting/modules/cw_alarm/main.tf
@@ -0,0 +1,58 @@
+# https://docs.aws.amazon.com/AmazonCloudWatch/latest/monitoring/AlarmThatSendsEmail.html
+resource "aws_cloudwatch_metric_alarm" "this" {
+  count = var.create_metric_alarm ? 1 : 0
+
+  alarm_name        = var.alarm_name
+  alarm_description = var.alarm_description
+  actions_enabled   = var.actions_enabled
+
+  alarm_actions             = var.alarm_actions
+  ok_actions                = var.ok_actions
+  insufficient_data_actions = var.insufficient_data_actions
+
+  comparison_operator = var.comparison_operator
+  evaluation_periods  = var.evaluation_periods
+  threshold           = var.threshold
+  unit                = var.unit
+
+  datapoints_to_alarm                   = var.datapoints_to_alarm
+  treat_missing_data                    = var.treat_missing_data
+  evaluate_low_sample_count_percentiles = var.evaluate_low_sample_count_percentiles
+
+  # conflicts with metric_query
+  metric_name        = var.metric_name
+  namespace          = var.namespace
+  period             = var.period
+  statistic          = var.statistic
+  extended_statistic = var.extended_statistic
+
+  dimensions = var.dimensions
+
+  # conflicts with metric_name
+  dynamic "metric_query" {
+    for_each = var.metric_query
+    content {
+      id          = lookup(metric_query.value, "id")
+      account_id  = lookup(metric_query.value, "account_id", null)
+      label       = lookup(metric_query.value, "label", null)
+      return_data = lookup(metric_query.value, "return_data", null)
+      expression  = lookup(metric_query.value, "expression", null)
+      period      = lookup(metric_query.value, "period", null)
+
+      dynamic "metric" {
+        for_each = lookup(metric_query.value, "metric", [])
+        content {
+          metric_name = lookup(metric.value, "metric_name")
+          namespace   = lookup(metric.value, "namespace")
+          period      = lookup(metric.value, "period")
+          stat        = lookup(metric.value, "stat")
+          unit        = lookup(metric.value, "unit", null)
+          dimensions  = lookup(metric.value, "dimensions", null)
+        }
+      }
+    }
+  }
+  threshold_metric_id = var.threshold_metric_id
+
+  tags = var.tags
+}
\ No newline at end of file
diff --git a/terraform/environments/digital-prison-reporting/modules/cw_alarm/outputs.tf b/terraform/environments/digital-prison-reporting/modules/cw_alarm/outputs.tf
new file mode 100644
index 00000000000..ed0104901f1
--- /dev/null
+++ b/terraform/environments/digital-prison-reporting/modules/cw_alarm/outputs.tf
@@ -0,0 +1,9 @@
+output "cloudwatch_metric_alarm_arn" {
+  description = "The ARN of the Cloudwatch metric alarm."
+  value       = try(aws_cloudwatch_metric_alarm.this[0].arn, "")
+}
+
+output "cloudwatch_metric_alarm_id" {
+  description = "The ID of the Cloudwatch metric alarm."
+  value       = try(aws_cloudwatch_metric_alarm.this[0].id, "")
+}
\ No newline at end of file
diff --git a/terraform/environments/digital-prison-reporting/modules/cw_alarm/variables.tf b/terraform/environments/digital-prison-reporting/modules/cw_alarm/variables.tf
new file mode 100644
index 00000000000..cd08ac8803e
--- /dev/null
+++ b/terraform/environments/digital-prison-reporting/modules/cw_alarm/variables.tf
@@ -0,0 +1,134 @@
+variable "create_metric_alarm" {
+  description = "Whether to create the Cloudwatch metric alarm"
+  type        = bool
+  default     = true
+}
+
+variable "alarm_name" {
+  description = "The descriptive name for the alarm. This name must be unique within the user's AWS account."
+  type        = string
+}
+
+variable "alarm_description" {
+  description = "The description for the alarm."
+  type        = string
+  default     = null
+}
+
+variable "comparison_operator" {
+  description = "The arithmetic operation to use when comparing the specified Statistic and Threshold. The specified Statistic value is used as the first operand. Either of the following is supported: GreaterThanOrEqualToThreshold, GreaterThanThreshold, LessThanThreshold, LessThanOrEqualToThreshold."
+  type        = string
+}
+
+variable "evaluation_periods" {
+  description = "The number of periods over which data is compared to the specified threshold."
+  type        = number
+}
+
+variable "threshold" {
+  description = "The value against which the specified statistic is compared."
+  type        = number
+  default     = null
+}
+
+variable "threshold_metric_id" {
+  description = "If this is an alarm based on an anomaly detection model, make this value match the ID of the ANOMALY_DETECTION_BAND function."
+  type        = string
+  default     = null
+}
+
+variable "unit" {
+  description = "The unit for the alarm's associated metric."
+  type        = string
+  default     = null
+}
+
+variable "metric_name" {
+  description = "The name for the alarm's associated metric. See docs for supported metrics."
+  type        = string
+  default     = null
+}
+
+variable "namespace" {
+  description = "The namespace for the alarm's associated metric. See docs for the list of namespaces. See docs for supported metrics."
+  type        = string
+  default     = null
+}
+
+variable "period" {
+  description = "The period in seconds over which the specified statistic is applied."
+  type        = number
+  default     = null
+}
+
+variable "statistic" {
+  description = "The statistic to apply to the alarm's associated metric. Either of the following is supported: SampleCount, Average, Sum, Minimum, Maximum"
+  type        = string
+  default     = null
+}
+
+variable "actions_enabled" {
+  description = "Indicates whether or not actions should be executed during any changes to the alarm's state. Defaults to true."
+  type        = bool
+  default     = true
+}
+
+variable "datapoints_to_alarm" {
+  description = "The number of datapoints that must be breaching to trigger the alarm."
+  type        = number
+  default     = null
+}
+
+variable "dimensions" {
+  description = "The dimensions for the alarm's associated metric."
+  type        = any
+  default     = null
+}
+
+variable "alarm_actions" {
+  description = "The list of actions to execute when this alarm transitions into an ALARM state from any other state. Each action is specified as an Amazon Resource Name (ARN)."
+  type        = list(string)
+  default     = null
+}
+
+variable "insufficient_data_actions" {
+  description = "The list of actions to execute when this alarm transitions into an INSUFFICIENT_DATA state from any other state. Each action is specified as an Amazon Resource Name (ARN)."
+  type        = list(string)
+  default     = null
+}
+
+variable "ok_actions" {
+  description = "The list of actions to execute when this alarm transitions into an OK state from any other state. Each action is specified as an Amazon Resource Name (ARN)."
+  type        = list(string)
+  default     = null
+}
+
+variable "extended_statistic" {
+  description = "The percentile statistic for the metric associated with the alarm. Specify a value between p0.0 and p100."
+  type        = string
+  default     = null
+}
+
+variable "treat_missing_data" {
+  description = "Sets how this alarm is to handle missing data points. The following values are supported: missing, ignore, breaching and notBreaching."
+  type        = string
+  default     = "missing"
+}
+
+variable "evaluate_low_sample_count_percentiles" {
+  description = "Used only for alarms based on percentiles. If you specify ignore, the alarm state will not change during periods with too few data points to be statistically significant. If you specify evaluate or omit this parameter, the alarm will always be evaluated and possibly change state no matter how many data points are available. The following values are supported: ignore, and evaluate."
+  type        = string
+  default     = null
+}
+
+variable "metric_query" {
+  description = "Enables you to create an alarm based on a metric math expression. You may specify at most 20."
+  type        = any
+  default     = []
+}
+
+variable "tags" {
+  description = "A mapping of tags to assign to all resources"
+  type        = map(string)
+  default     = {}
+}
\ No newline at end of file
diff --git a/terraform/environments/digital-prison-reporting/modules/dms/outputs.tf b/terraform/environments/digital-prison-reporting/modules/dms/outputs.tf
index 14ff26bbbb7..f9e6e56a46c 100644
--- a/terraform/environments/digital-prison-reporting/modules/dms/outputs.tf
+++ b/terraform/environments/digital-prison-reporting/modules/dms/outputs.tf
@@ -1,3 +1,11 @@
 output "dms_subnet_ids" {
   value = var.subnet_ids
 }
+
+output "dms_instance_name" {
+  value = var.name
+}
+
+output "dms_replication_task_name" {
+  value = "${var.project_id}-dms-task-${var.short_name}-${var.dms_source_name}-${var.dms_target_name}"
+}
\ No newline at end of file