From 04d527af69d31ddad2bd012f5cde4381716c9217 Mon Sep 17 00:00:00 2001 From: Bill Buchan Date: Thu, 19 Sep 2024 15:36:44 +0100 Subject: [PATCH 1/5] Initial Version of Cloudwatch Alarms --- .../delius-core/locals_development.tf | 1 + .../delius-core/locals_preproduction.tf | 1 + .../environments/delius-core/locals_stage.tf | 1 + .../environments/delius-core/locals_test.tf | 1 + .../delius-core/main_development.tf | 9 +- .../delius-core/main_preproduction.tf | 18 +-- .../environments/delius-core/main_test.tf | 9 +- .../components/dms/cloudwatch-alarms.tf | 107 ++++++++++++++++++ .../modules/components/dms/versions.tf | 2 +- .../modules/delius_environment/dms.tf | 9 +- .../modules/delius_environment/versions.tf | 2 +- 11 files changed, 138 insertions(+), 22 deletions(-) create mode 100644 terraform/environments/delius-core/modules/components/dms/cloudwatch-alarms.tf diff --git a/terraform/environments/delius-core/locals_development.tf b/terraform/environments/delius-core/locals_development.tf index 3c82a5fa023..2d25b4a6983 100644 --- a/terraform/environments/delius-core/locals_development.tf +++ b/terraform/environments/delius-core/locals_development.tf @@ -135,5 +135,6 @@ locals { user_target_endpoint = { write_database = "DMDNDA" } + is-production = local.is-production } } diff --git a/terraform/environments/delius-core/locals_preproduction.tf b/terraform/environments/delius-core/locals_preproduction.tf index 1e79fd58dcf..f56eeae0209 100644 --- a/terraform/environments/delius-core/locals_preproduction.tf +++ b/terraform/environments/delius-core/locals_preproduction.tf @@ -135,6 +135,7 @@ locals { user_target_endpoint = { write_database = "PRENDA" } + is-production = local.is-production } } diff --git a/terraform/environments/delius-core/locals_stage.tf b/terraform/environments/delius-core/locals_stage.tf index 9e262c1d3aa..51d80ee6787 100644 --- a/terraform/environments/delius-core/locals_stage.tf +++ b/terraform/environments/delius-core/locals_stage.tf @@ -135,5 +135,6 @@ locals { user_target_endpoint = { write_database = "STGNDA" } + is-production = local.is-production } } diff --git a/terraform/environments/delius-core/locals_test.tf b/terraform/environments/delius-core/locals_test.tf index ff99d26378f..ce3fbde94be 100644 --- a/terraform/environments/delius-core/locals_test.tf +++ b/terraform/environments/delius-core/locals_test.tf @@ -132,5 +132,6 @@ locals { read_database = "TSTNDA" } user_target_endpoint = {} + is-production = local.is-production } } diff --git a/terraform/environments/delius-core/main_development.tf b/terraform/environments/delius-core/main_development.tf index 5a72e27fb1a..a70a9a6965a 100644 --- a/terraform/environments/delius-core/main_development.tf +++ b/terraform/environments/delius-core/main_development.tf @@ -8,10 +8,11 @@ module "environment_dev" { count = local.is-development ? 1 : 0 providers = { - aws = aws - aws.bucket-replication = aws - aws.core-vpc = aws.core-vpc - aws.core-network-services = aws.core-network-services + aws = aws + aws.bucket-replication = aws + aws.core-vpc = aws.core-vpc + aws.core-network-services = aws.core-network-services + aws.modernisation-platform = aws.modernisation-platform } env_name = "dev" diff --git a/terraform/environments/delius-core/main_preproduction.tf b/terraform/environments/delius-core/main_preproduction.tf index 3d627a7d16b..4f76d1069f8 100644 --- a/terraform/environments/delius-core/main_preproduction.tf +++ b/terraform/environments/delius-core/main_preproduction.tf @@ -9,10 +9,11 @@ module "environment_stage" { count = local.is-preproduction ? 1 : 0 providers = { - aws = aws - aws.bucket-replication = aws - aws.core-vpc = aws.core-vpc - aws.core-network-services = aws.core-network-services + aws = aws + aws.bucket-replication = aws + aws.core-vpc = aws.core-vpc + aws.core-network-services = aws.core-network-services + aws.modernisation-platform = aws.modernisation-platform } env_name = "stage" @@ -47,10 +48,11 @@ module "environment_preprod" { count = local.is-preproduction ? 1 : 0 providers = { - aws = aws - aws.bucket-replication = aws - aws.core-vpc = aws.core-vpc - aws.core-network-services = aws.core-network-services + aws = aws + aws.bucket-replication = aws + aws.core-vpc = aws.core-vpc + aws.core-network-services = aws.core-network-services + aws.modernisation-platform = aws.modernisation-platform } env_name = "preprod" diff --git a/terraform/environments/delius-core/main_test.tf b/terraform/environments/delius-core/main_test.tf index 1a6c5809edd..00f095ff188 100644 --- a/terraform/environments/delius-core/main_test.tf +++ b/terraform/environments/delius-core/main_test.tf @@ -9,10 +9,11 @@ module "environment_test" { count = local.is-test ? 1 : 0 providers = { - aws = aws - aws.bucket-replication = aws - aws.core-vpc = aws.core-vpc - aws.core-network-services = aws.core-network-services + aws = aws + aws.bucket-replication = aws + aws.core-vpc = aws.core-vpc + aws.core-network-services = aws.core-network-services + aws.modernisation-platform = aws.modernisation-platform } env_name = "test" diff --git a/terraform/environments/delius-core/modules/components/dms/cloudwatch-alarms.tf b/terraform/environments/delius-core/modules/components/dms/cloudwatch-alarms.tf new file mode 100644 index 00000000000..ee3dd5ea8ee --- /dev/null +++ b/terraform/environments/delius-core/modules/components/dms/cloudwatch-alarms.tf @@ -0,0 +1,107 @@ +# SNS topic for monitoring to send alarms to +resource "aws_sns_topic" "dms_alerting" { + name = "delius-dms-alerting" + kms_master_key_id = var.account_config.kms_keys.general_shared +} + +locals { + aws_dms_replication_tasks = merge( + try(var.dms_config.user_target_endpoint.write_database, null) == null ? {} : { + (aws_dms_replication_task.user_inbound_replication[0].replication_task_arn) = aws_dms_replication_task.user_inbound_replication[0].replication_task_id + }, + {for k in keys(local.client_account_map) : + (aws_dms_replication_task.business_interaction_inbound_replication[k].replication_task_arn) => aws_dms_replication_task.business_interaction_inbound_replication[k].replication_task_id + }, + {for k in keys(local.client_account_map) : + (aws_dms_replication_task.audited_interaction_inbound_replication[k].replication_task_arn) => aws_dms_replication_task.audited_interaction_inbound_replication[k].replication_task_id + }, + {for k in keys(local.client_account_map) : + (aws_dms_replication_task.audited_interaction_checksum_inbound_replication[k].replication_task_arn) => aws_dms_replication_task.audited_interaction_checksum_inbound_replication[k].replication_task_id + }, + try(var.dms_config.audit_source_endpoint.read_database, null) == null ? {} : { + (aws_dms_replication_task.audited_interaction_outbound_replication[0].replication_task_arn) = aws_dms_replication_task.audited_interaction_outbound_replication[0].replication_task_id + }, + {for k in keys(local.client_account_map) : + (aws_dms_replication_task.user_outbound_replication[k].replication_task_arn) => aws_dms_replication_task.user_outbound_replication[k].replication_task_id + }, + try(var.dms_config.audit_source_endpoint.read_database, null) == null ? {} : { + (aws_dms_replication_task.business_interaction_outbound_replication[0].replication_task_arn) = aws_dms_replication_task.business_interaction_outbound_replication[0].replication_task_id + }, + try(var.dms_config.audit_source_endpoint.read_database, null) == null ? {} : { + (aws_dms_replication_task.audited_interaction_checksum_outbound_replication[0].replication_task_arn) = aws_dms_replication_task.audited_interaction_checksum_outbound_replication[0].replication_task_id + }, + ) +} + +resource "aws_cloudwatch_metric_alarm" "dms_cdc_latency_source" { + for_each = local.aws_dms_replication_tasks + alarm_name = "dms-cdc-latency-source-${each.value}" + alarm_description = "High CDC source latency for dms replication task for ${each.value}" + namespace = "AWS/DMS" + statistic = "Average" + metric_name = "CDCLatencySource" + comparison_operator = "GreaterThanThreshold" + threshold = 10 + evaluation_periods = 2 + period = 30 + actions_enabled = true + alarm_actions = [aws_sns_topic.dms_alerting.arn] + ok_actions = [aws_sns_topic.dms_alerting.arn] + dimensions = { + ReplicationInstanceIdentifier = aws_dms_replication_instance.dms_replication_instance.replication_instance_id + ReplicationTaskIdentifier = each.key + } + tags = var.tags +} + +resource "aws_cloudwatch_metric_alarm" "dms_cdc_latency_target" { + for_each = local.aws_dms_replication_tasks + alarm_name = "dms-cdc-latency-target-${each.value}" + alarm_description = "High CDC target latency for dms replication task for ${each.value}" + namespace = "AWS/DMS" + statistic = "Average" + metric_name = "CDCLatencyTarget" + comparison_operator = "GreaterThanThreshold" + threshold = 10 + evaluation_periods = 2 + period = 30 + actions_enabled = true + alarm_actions = [aws_sns_topic.dms_alerting.arn] + ok_actions = [aws_sns_topic.dms_alerting.arn] + dimensions = { + ReplicationInstanceIdentifier = aws_dms_replication_instance.dms_replication_instance.replication_instance_id + ReplicationTaskIdentifier = each.key + } + tags = var.tags +} + +# Pager duty integration + +# Get the map of pagerduty integration keys from the modernisation platform account +data "aws_secretsmanager_secret" "pagerduty_integration_keys" { + provider = aws.modernisation-platform + name = "pagerduty_integration_keys" +} + +data "aws_secretsmanager_secret_version" "pagerduty_integration_keys" { + provider = aws.modernisation-platform + secret_id = data.aws_secretsmanager_secret.pagerduty_integration_keys.id +} + +# Add a local to get the keys +locals { + pagerduty_integration_keys = jsondecode(data.aws_secretsmanager_secret_version.pagerduty_integration_keys.secret_string) + integration_key_lookup = var.dms_config.is-production ? "delius_oracle_prod_alarms" : "delius_oracle_nonprod_alarms" +} + +# link the sns topic to the service +# Non-Prod alerts channel: #delius-aws-oracle-dev-alerts +# Prod alerts channel: #delius-aws-oracle-prod-alerts +module "pagerduty_core_alerts" { + depends_on = [ + aws_sns_topic.dms_alerting + ] + source = "github.com/ministryofjustice/modernisation-platform-terraform-pagerduty-integration?ref=v2.0.0" + sns_topics = [aws_sns_topic.dms_alerting.name] + pagerduty_integration_key = local.pagerduty_integration_keys[local.integration_key_lookup] +} diff --git a/terraform/environments/delius-core/modules/components/dms/versions.tf b/terraform/environments/delius-core/modules/components/dms/versions.tf index 7e2c7df783a..d032acc847c 100644 --- a/terraform/environments/delius-core/modules/components/dms/versions.tf +++ b/terraform/environments/delius-core/modules/components/dms/versions.tf @@ -3,7 +3,7 @@ terraform { aws = { source = "hashicorp/aws" version = "~> 5.0" - configuration_aliases = [aws.core-vpc, aws.core-network-services, aws.bucket-replication] + configuration_aliases = [aws.core-vpc, aws.core-network-services, aws.bucket-replication, aws.modernisation-platform] } archive = { source = "hashicorp/archive" diff --git a/terraform/environments/delius-core/modules/delius_environment/dms.tf b/terraform/environments/delius-core/modules/delius_environment/dms.tf index 55d5a5f355d..03845c1240c 100644 --- a/terraform/environments/delius-core/modules/delius_environment/dms.tf +++ b/terraform/environments/delius-core/modules/delius_environment/dms.tf @@ -13,10 +13,11 @@ module "dms" { env_name_to_dms_config_map = var.env_name_to_dms_config_map providers = { - aws = aws - aws.bucket-replication = aws - aws.core-vpc = aws - aws.core-network-services = aws + aws = aws + aws.bucket-replication = aws + aws.core-vpc = aws + aws.core-network-services = aws + aws.modernisation-platform = aws.modernisation-platform } } diff --git a/terraform/environments/delius-core/modules/delius_environment/versions.tf b/terraform/environments/delius-core/modules/delius_environment/versions.tf index e5dc6ba202c..904687f9704 100644 --- a/terraform/environments/delius-core/modules/delius_environment/versions.tf +++ b/terraform/environments/delius-core/modules/delius_environment/versions.tf @@ -3,7 +3,7 @@ terraform { aws = { source = "hashicorp/aws" version = "~> 5.0" - configuration_aliases = [aws.bucket-replication, aws.core-vpc, aws.core-network-services] + configuration_aliases = [aws.bucket-replication, aws.core-vpc, aws.core-network-services, aws.modernisation-platform] } archive = { source = "hashicorp/archive" From 7640ea669a46c8bdeec6e7d918955b3a901b61fd Mon Sep 17 00:00:00 2001 From: Bill Buchan Date: Thu, 19 Sep 2024 16:30:15 +0100 Subject: [PATCH 2/5] Get the final part of the replication Id --- .../delius-core/modules/components/dms/cloudwatch-alarms.tf | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/terraform/environments/delius-core/modules/components/dms/cloudwatch-alarms.tf b/terraform/environments/delius-core/modules/components/dms/cloudwatch-alarms.tf index ee3dd5ea8ee..9fb6acca014 100644 --- a/terraform/environments/delius-core/modules/components/dms/cloudwatch-alarms.tf +++ b/terraform/environments/delius-core/modules/components/dms/cloudwatch-alarms.tf @@ -49,7 +49,8 @@ resource "aws_cloudwatch_metric_alarm" "dms_cdc_latency_source" { ok_actions = [aws_sns_topic.dms_alerting.arn] dimensions = { ReplicationInstanceIdentifier = aws_dms_replication_instance.dms_replication_instance.replication_instance_id - ReplicationTaskIdentifier = each.key + # We only need to final element of the replication task ID (after the last :) + ReplicationTaskIdentifier = split(":", each.value)[length(split(":", each.value)) - 1] } tags = var.tags } @@ -70,7 +71,8 @@ resource "aws_cloudwatch_metric_alarm" "dms_cdc_latency_target" { ok_actions = [aws_sns_topic.dms_alerting.arn] dimensions = { ReplicationInstanceIdentifier = aws_dms_replication_instance.dms_replication_instance.replication_instance_id - ReplicationTaskIdentifier = each.key + # We only need to final element of the replication task ID (after the last :) + ReplicationTaskIdentifier = split(":", each.value)[length(split(":", each.value)) - 1] } tags = var.tags } From 54797e8e0aa9ffa92df208527ee404fe4e809400 Mon Sep 17 00:00:00 2001 From: Bill Buchan Date: Thu, 19 Sep 2024 17:26:22 +0100 Subject: [PATCH 3/5] Replication Task Identifier is in the key --- .../delius-core/modules/components/dms/cloudwatch-alarms.tf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/terraform/environments/delius-core/modules/components/dms/cloudwatch-alarms.tf b/terraform/environments/delius-core/modules/components/dms/cloudwatch-alarms.tf index 9fb6acca014..32a5e6a561d 100644 --- a/terraform/environments/delius-core/modules/components/dms/cloudwatch-alarms.tf +++ b/terraform/environments/delius-core/modules/components/dms/cloudwatch-alarms.tf @@ -50,7 +50,7 @@ resource "aws_cloudwatch_metric_alarm" "dms_cdc_latency_source" { dimensions = { ReplicationInstanceIdentifier = aws_dms_replication_instance.dms_replication_instance.replication_instance_id # We only need to final element of the replication task ID (after the last :) - ReplicationTaskIdentifier = split(":", each.value)[length(split(":", each.value)) - 1] + ReplicationTaskIdentifier = split(":", each.key)[length(split(":", each.key)) - 1] } tags = var.tags } @@ -72,7 +72,7 @@ resource "aws_cloudwatch_metric_alarm" "dms_cdc_latency_target" { dimensions = { ReplicationInstanceIdentifier = aws_dms_replication_instance.dms_replication_instance.replication_instance_id # We only need to final element of the replication task ID (after the last :) - ReplicationTaskIdentifier = split(":", each.value)[length(split(":", each.value)) - 1] + ReplicationTaskIdentifier = split(":", each.key)[length(split(":", each.key)) - 1] } tags = var.tags } From efd3c4ea5fb60cf3091c950eba523313904f804f Mon Sep 17 00:00:00 2001 From: Bill Buchan Date: Fri, 20 Sep 2024 10:05:33 +0100 Subject: [PATCH 4/5] Temporarily reduce the threshold for testing --- .../delius-core/modules/components/dms/cloudwatch-alarms.tf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/terraform/environments/delius-core/modules/components/dms/cloudwatch-alarms.tf b/terraform/environments/delius-core/modules/components/dms/cloudwatch-alarms.tf index 32a5e6a561d..9ae3c8e5e40 100644 --- a/terraform/environments/delius-core/modules/components/dms/cloudwatch-alarms.tf +++ b/terraform/environments/delius-core/modules/components/dms/cloudwatch-alarms.tf @@ -41,7 +41,7 @@ resource "aws_cloudwatch_metric_alarm" "dms_cdc_latency_source" { statistic = "Average" metric_name = "CDCLatencySource" comparison_operator = "GreaterThanThreshold" - threshold = 10 + threshold = 1 evaluation_periods = 2 period = 30 actions_enabled = true @@ -63,7 +63,7 @@ resource "aws_cloudwatch_metric_alarm" "dms_cdc_latency_target" { statistic = "Average" metric_name = "CDCLatencyTarget" comparison_operator = "GreaterThanThreshold" - threshold = 10 + threshold = 1 evaluation_periods = 2 period = 30 actions_enabled = true From ab8cf6d27cab77184b30d6b09cf6ec8c47b652d2 Mon Sep 17 00:00:00 2001 From: Bill Buchan Date: Fri, 20 Sep 2024 11:07:42 +0100 Subject: [PATCH 5/5] Reset sensible threshold --- .../delius-core/modules/components/dms/cloudwatch-alarms.tf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/terraform/environments/delius-core/modules/components/dms/cloudwatch-alarms.tf b/terraform/environments/delius-core/modules/components/dms/cloudwatch-alarms.tf index 9ae3c8e5e40..32a5e6a561d 100644 --- a/terraform/environments/delius-core/modules/components/dms/cloudwatch-alarms.tf +++ b/terraform/environments/delius-core/modules/components/dms/cloudwatch-alarms.tf @@ -41,7 +41,7 @@ resource "aws_cloudwatch_metric_alarm" "dms_cdc_latency_source" { statistic = "Average" metric_name = "CDCLatencySource" comparison_operator = "GreaterThanThreshold" - threshold = 1 + threshold = 10 evaluation_periods = 2 period = 30 actions_enabled = true @@ -63,7 +63,7 @@ resource "aws_cloudwatch_metric_alarm" "dms_cdc_latency_target" { statistic = "Average" metric_name = "CDCLatencyTarget" comparison_operator = "GreaterThanThreshold" - threshold = 1 + threshold = 10 evaluation_periods = 2 period = 30 actions_enabled = true