Switched from the ecs_managed metric for detecting when a task fails, to EventBridge. Seems a lot more stable, and can turn off ECS managed scaling now.
Cameronsplaze · Nov 29, 2024 · dfa628e · dfa628e
1 parent 2a8269d
commit dfa628e
Show file tree

Hide file tree

Showing 8 changed files with 208 additions and 135 deletions.
diff --git a/ContainerManager/leaf_stack/NestedStacks/AsgStateChangeHook.py b/ContainerManager/leaf_stack/NestedStacks/AsgStateChangeHook.py
@@ -8,6 +8,7 @@
     Duration,
     RemovalPolicy,
     aws_lambda,
+    aws_sns as sns,
     aws_iam as iam,
     aws_logs as logs,
     aws_ecs as ecs,
@@ -30,10 +31,13 @@ def __init__(
         self,
         scope: Construct,
         container_id: str,
+        container_url: str,
         domain_stack: DomainStack,
         ecs_cluster: ecs.Cluster,
         ec2_service: ecs.Ec2Service,
         auto_scaling_group: autoscaling.AutoScalingGroup,
+        base_stack_sns_topic: sns.Topic,
+        leaf_stack_sns_topic: sns.Topic,
         **kwargs,
     ) -> None:
         super().__init__(scope, "AsgStateChangeHook", **kwargs)
@@ -126,29 +130,56 @@ def __init__(
             )
         )
 
-
         ## EventBridge Rule: This is actually what hooks the Lambda to the ASG/Instance.
         #    Needed to keep the management in sync with whether a container is running.
         # https://docs.aws.amazon.com/cdk/api/v2/docs/aws-cdk-lib.aws_events.Rule.html
-        self.rule_asg_state_change_trigger = events.Rule(
+        message_up = events.RuleTargetInput.from_text(
+            f"Container for '{container_id}' is starting up! Connect to it at: '{container_url}'.",
+        )
+        self.rule_asg_state_change_trigger_up = events.Rule(
             self,
-            "AsgStateChangeTrigger",
-            rule_name=f"{container_id}-rule-ASG-StateChange-hook",
+            "AsgStateChangeTrigger-Up",
+            rule_name=f"{container_id_alpha}-rule-ASG-StateChange-spin-up",
             description="Trigger Lambda whenever the ASG state changes, to keep DNS in sync",
             # https://docs.aws.amazon.com/cdk/api/v2/docs/aws-cdk-lib.aws_events.EventPattern.html
             event_pattern=events.EventPattern(
                 source=["aws.autoscaling"],
                 # "EC2 Instance Launch Successful" -> FINISHES spinning up (has an ip now)
+                detail_type=["EC2 Instance Launch Successful"],
+                detail={
+                    "AutoScalingGroupName": [auto_scaling_group.auto_scaling_group_name],
+                },
+            ),
+            targets=[
+                # https://docs.aws.amazon.com/cdk/api/v2/docs/aws-cdk-lib.aws_events_targets.LambdaFunction.html
+                events_targets.LambdaFunction(self.lambda_asg_state_change_hook),
+                # https://docs.aws.amazon.com/cdk/api/v2/docs/aws-cdk-lib.aws_events_targets.SnsTopic.html
+                events_targets.SnsTopic(base_stack_sns_topic, message=message_up),
+                events_targets.SnsTopic(leaf_stack_sns_topic, message=message_up),
+            ],
+        )
+        message_down = events.RuleTargetInput.from_text(f"Container for '{container_id}' has stopped.")
+        self.rule_asg_state_change_trigger_down = events.Rule(
+            self,
+            "AsgStateChangeTrigger-Down",
+            rule_name=f"{container_id_alpha}-rule-ASG-StateChange-spin-down",
+            description="Trigger Lambda whenever the ASG state changes, to keep DNS in sync",
+            # https://docs.aws.amazon.com/cdk/api/v2/docs/aws-cdk-lib.aws_events.EventPattern.html
+            event_pattern=events.EventPattern(
+                source=["aws.autoscaling"],
                 # "EC2 Instance-terminate Lifecycle Action" -> STARTS to spin down (shorter
                 #                          wait time than "EC2 Instance Terminate Successful").
-                detail_type=["EC2 Instance Launch Successful", "EC2 Instance-terminate Lifecycle Action"],
+                detail_type=["EC2 Instance-terminate Lifecycle Action"],
                 detail={
                     "AutoScalingGroupName": [auto_scaling_group.auto_scaling_group_name],
                 },
             ),
             targets=[
                 # https://docs.aws.amazon.com/cdk/api/v2/docs/aws-cdk-lib.aws_events_targets.LambdaFunction.html
                 events_targets.LambdaFunction(self.lambda_asg_state_change_hook),
+                # https://docs.aws.amazon.com/cdk/api/v2/docs/aws-cdk-lib.aws_events_targets.SnsTopic.html
+                events_targets.SnsTopic(base_stack_sns_topic, message=message_down),
+                events_targets.SnsTopic(leaf_stack_sns_topic, message=message_down),
             ],
         )
 

diff --git a/ContainerManager/leaf_stack/NestedStacks/Dashboard.py b/ContainerManager/leaf_stack/NestedStacks/Dashboard.py
@@ -52,6 +52,7 @@ def __init__(
         metric_asg_lambda_invocation_count = asg_state_change_hook_nested_stack.lambda_asg_state_change_hook.metric_invocations(
             unit=cloudwatch.Unit.COUNT,
             statistic="Maximum",
+            period=Duration.minutes(1),
         )
 
 
@@ -90,8 +91,6 @@ def __init__(
                 width=12,
                 right=[metric_asg_lambda_invocation_count],
                 legend_position=cloudwatch.LegendPosition.RIGHT,
-                period=Duration.minutes(1),
-                statistic="Maximum",
             ),
 
             ### Show the number of instances, to see when it starts/stops:
@@ -114,7 +113,7 @@ def __init__(
                 alarms=[
                     watchdog_nested_stack.alarm_asg_instance_left_up,
                     watchdog_nested_stack.alarm_container_activity,
-                    # watchdog_nested_stack.alarm_capacity_provider,
+                    watchdog_nested_stack.alarm_break_crash_loop_count,
                 ],
             ),
 
@@ -131,7 +130,6 @@ def __init__(
             # https://docs.aws.amazon.com/cdk/api/v2/docs/aws-cdk-lib.aws_cloudwatch.GraphWidget.html
             cloudwatch.GraphWidget(
                 title="(ASG) All Network Traffic",
-                # Only show up to an hour ago:
                 height=6,
                 width=12,
                 right=[
@@ -158,14 +156,14 @@ def __init__(
                 alarm=watchdog_nested_stack.alarm_container_activity,
             ),
 
-            # ## Capacity Provider Alarm:
-            # # https://docs.aws.amazon.com/cdk/api/v2/docs/aws-cdk-lib.aws_cloudwatch.AlarmWidget.html
-            # cloudwatch.AlarmWidget(
-            #     title=f"Alarm: {watchdog_nested_stack.alarm_capacity_provider.alarm_name}",
-            #     width=6,
-            #     height=5,
-            #     alarm=watchdog_nested_stack.alarm_capacity_provider,
-            # ),
+            ## Crash Loop Alarm:
+            # https://docs.aws.amazon.com/cdk/api/v2/docs/aws-cdk-lib.aws_cloudwatch.AlarmWidget.html
+            cloudwatch.AlarmWidget(
+                title=f"Alarm: {watchdog_nested_stack.alarm_break_crash_loop_count.alarm_name}",
+                width=6,
+                height=5,
+                alarm=watchdog_nested_stack.alarm_break_crash_loop_count,
+            ),
 
             ## Show the Container Logs:
             # https://docs.aws.amazon.com/cdk/api/v2/docs/aws-cdk-lib.aws_cloudwatch.LogQueryWidget.html

diff --git a/ContainerManager/leaf_stack/NestedStacks/EcsAsg.py b/ContainerManager/leaf_stack/NestedStacks/EcsAsg.py
@@ -193,10 +193,10 @@ def __init__(
             ## Since the instances don't live long, this doesn't do anything, and
             # the lambda to spin down the system will trigger TWICE when going down.
             enable_managed_draining=False,
-            ## We need the `CapacityProviderReservation` metric to know when to kill the ec2 instance
-            # if the container exists on startup. (Otherwise the task infinite loops, and since it
-            # successfully started FIRST, the circuit breaker won't stop it).
-            enable_managed_scaling=True,
+            ## We directly manage the ASG, that's how this architecture is designed.
+            # And since we'll only ever have 1 or 0 instances, we don't need this. Save on
+            # cloudwatch api calls, and clean up the console instead.
+            enable_managed_scaling=False,
         )
 
         self.ecs_cluster.add_asg_capacity_provider(self.capacity_provider)
@@ -213,81 +213,19 @@ def __init__(
             cluster=self.ecs_cluster,
             task_definition=task_definition,
             desired_count=0,
-            circuit_breaker={
-                "rollback": False # Don't keep trying to restart the container if it fails
-            },
+            ## We use the 'spin-down-asg-on-error' lambda to take care of circuit breaker-like
+            ## logic. If we *just* spun down the task, the instance would still be running.
+            ## That'd both charge money, and not let the system "spin back up/reset".
+            # circuit_breaker={
+            #     "rollback": False # Don't keep trying to restart the container if it fails
+            # },
             capacity_provider_strategies=[capacity_provider_strategy],
             ### Puts each task in a particular group, on a different instance:
             ### (Not sure if we want this. Only will ever have one instance, and adds complexity)
             # placement_constraints=[ecs.PlacementConstraint.distinct_instances()],
             # placement_strategies=[ecs.PlacementStrategy.spread_across_instances()],
         )
 
-
-        ##########################
-        ### Notification Stuff ###
-        ##########################
-
-        ## EventBridge Rule: Send notification to user when ECS Task spins up or down:
-        # https://docs.aws.amazon.com/cdk/api/v2/docs/aws-cdk-lib.aws_events.Rule.html
-        message = events.RuleTargetInput.from_text("\n".join([
-            f"Container for '{container_id}' has started!",
-            f"Connect to it at: '{container_url}'.",
-        ]))
-        self.rule_notify_up = events.Rule(
-            self,
-            "RuleNotifyUp",
-            rule_name=f"{container_id}-rule-notify-up",
-            description="Let user know when system finishes spinning UP",
-            # https://docs.aws.amazon.com/cdk/api/v2/docs/aws-cdk-lib.aws_events.EventPattern.html
-            event_pattern=events.EventPattern(
-                source=["aws.ecs"],
-                detail_type=["ECS Task State Change"],
-                detail={
-                    "clusterArn": [self.ecs_cluster.cluster_arn],
-                    # You only care if the TASK starts, or the INSTANCE stops:
-                    "lastStatus": ["RUNNING"],
-                    "desiredStatus": ["RUNNING"],
-                },
-            ),
-            targets=[
-                # https://docs.aws.amazon.com/cdk/api/v2/docs/aws-cdk-lib.aws_events_targets.SnsTopic.html
-                events_targets.SnsTopic(
-                    base_stack_sns_topic,
-                    message=message,
-                ),
-                events_targets.SnsTopic(
-                    leaf_stack_sns_topic,
-                    message=message,
-                ),
-            ],
-        )
-
-        ## Same thing, but notify user when task spins down finally:
-        ##   (Can't combine with above target, since we care about different 'detail_type'.
-        ##    Don't want to spam the user sadly.)
-        # https://docs.aws.amazon.com/cdk/api/v2/docs/aws-cdk-lib.aws_events.Rule.html
-        message = events.RuleTargetInput.from_text(f"Container for '{container_id}' has stopped.")
-        self.rule_notify_down = events.Rule(
-            self,
-            "RuleNotifyDown",
-            rule_name=f"{container_id}-rule-notify-down",
-            description="Let user know when system finishes spinning down",
-            # https://docs.aws.amazon.com/cdk/api/v2/docs/aws-cdk-lib.aws_events.EventPattern.html
-            event_pattern=events.EventPattern(
-                source=["aws.autoscaling"],
-                detail_type=["EC2 Instance-terminate Lifecycle Action"],
-                detail={
-                    "AutoScalingGroupName": [self.auto_scaling_group.auto_scaling_group_name],
-                },
-            ),
-            targets=[
-                # https://docs.aws.amazon.com/cdk/api/v2/docs/aws-cdk-lib.aws_events_targets.SnsTopic.html
-                events_targets.SnsTopic(base_stack_sns_topic, message=message),
-                events_targets.SnsTopic(leaf_stack_sns_topic, message=message),
-            ],
-        )
-
         #####################
         ### cdk_nag stuff ###
         #####################