Skip to content

Commit

Permalink
Switched from ecs_managed metric detecting when task fails, to event …
Browse files Browse the repository at this point in the history
…bridge. Seems a lot more stable, and can turn off ecs managed service now
  • Loading branch information
Cameronsplaze committed Nov 29, 2024
1 parent 2a8269d commit dfa628e
Show file tree
Hide file tree
Showing 8 changed files with 208 additions and 135 deletions.
41 changes: 36 additions & 5 deletions ContainerManager/leaf_stack/NestedStacks/AsgStateChangeHook.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
Duration,
RemovalPolicy,
aws_lambda,
aws_sns as sns,
aws_iam as iam,
aws_logs as logs,
aws_ecs as ecs,
Expand All @@ -30,10 +31,13 @@ def __init__(
self,
scope: Construct,
container_id: str,
container_url: str,
domain_stack: DomainStack,
ecs_cluster: ecs.Cluster,
ec2_service: ecs.Ec2Service,
auto_scaling_group: autoscaling.AutoScalingGroup,
base_stack_sns_topic: sns.Topic,
leaf_stack_sns_topic: sns.Topic,
**kwargs,
) -> None:
super().__init__(scope, "AsgStateChangeHook", **kwargs)
Expand Down Expand Up @@ -126,29 +130,56 @@ def __init__(
)
)


## EventBridge Rule: This is actually what hooks the Lambda to the ASG/Instance.
# Needed to keep the management in sync with whether a container is running.
# https://docs.aws.amazon.com/cdk/api/v2/docs/aws-cdk-lib.aws_events.Rule.html
self.rule_asg_state_change_trigger = events.Rule(
message_up = events.RuleTargetInput.from_text(
f"Container for '{container_id}' is starting up! Connect to it at: '{container_url}'.",
)
self.rule_asg_state_change_trigger_up = events.Rule(
self,
"AsgStateChangeTrigger",
rule_name=f"{container_id}-rule-ASG-StateChange-hook",
"AsgStateChangeTrigger-Up",
rule_name=f"{container_id_alpha}-rule-ASG-StateChange-spin-up",
description="Trigger Lambda whenever the ASG state changes, to keep DNS in sync",
# https://docs.aws.amazon.com/cdk/api/v2/docs/aws-cdk-lib.aws_events.EventPattern.html
event_pattern=events.EventPattern(
source=["aws.autoscaling"],
# "EC2 Instance Launch Successful" -> FINISHES spinning up (has an ip now)
detail_type=["EC2 Instance Launch Successful"],
detail={
"AutoScalingGroupName": [auto_scaling_group.auto_scaling_group_name],
},
),
targets=[
# https://docs.aws.amazon.com/cdk/api/v2/docs/aws-cdk-lib.aws_events_targets.LambdaFunction.html
events_targets.LambdaFunction(self.lambda_asg_state_change_hook),
# https://docs.aws.amazon.com/cdk/api/v2/docs/aws-cdk-lib.aws_events_targets.SnsTopic.html
events_targets.SnsTopic(base_stack_sns_topic, message=message_up),
events_targets.SnsTopic(leaf_stack_sns_topic, message=message_up),
],
)
message_down = events.RuleTargetInput.from_text(f"Container for '{container_id}' has stopped.")
self.rule_asg_state_change_trigger_down = events.Rule(
self,
"AsgStateChangeTrigger-Down",
rule_name=f"{container_id_alpha}-rule-ASG-StateChange-spin-down",
description="Trigger Lambda whenever the ASG state changes, to keep DNS in sync",
# https://docs.aws.amazon.com/cdk/api/v2/docs/aws-cdk-lib.aws_events.EventPattern.html
event_pattern=events.EventPattern(
source=["aws.autoscaling"],
# "EC2 Instance-terminate Lifecycle Action" -> STARTS to spin down (shorter
# wait time than "EC2 Instance Terminate Successful").
detail_type=["EC2 Instance Launch Successful", "EC2 Instance-terminate Lifecycle Action"],
detail_type=["EC2 Instance-terminate Lifecycle Action"],
detail={
"AutoScalingGroupName": [auto_scaling_group.auto_scaling_group_name],
},
),
targets=[
# https://docs.aws.amazon.com/cdk/api/v2/docs/aws-cdk-lib.aws_events_targets.LambdaFunction.html
events_targets.LambdaFunction(self.lambda_asg_state_change_hook),
# https://docs.aws.amazon.com/cdk/api/v2/docs/aws-cdk-lib.aws_events_targets.SnsTopic.html
events_targets.SnsTopic(base_stack_sns_topic, message=message_down),
events_targets.SnsTopic(leaf_stack_sns_topic, message=message_down),
],
)

Expand Down
22 changes: 10 additions & 12 deletions ContainerManager/leaf_stack/NestedStacks/Dashboard.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ def __init__(
metric_asg_lambda_invocation_count = asg_state_change_hook_nested_stack.lambda_asg_state_change_hook.metric_invocations(
unit=cloudwatch.Unit.COUNT,
statistic="Maximum",
period=Duration.minutes(1),
)


Expand Down Expand Up @@ -90,8 +91,6 @@ def __init__(
width=12,
right=[metric_asg_lambda_invocation_count],
legend_position=cloudwatch.LegendPosition.RIGHT,
period=Duration.minutes(1),
statistic="Maximum",
),

### Show the number of instances, to see when it starts/stops:
Expand All @@ -114,7 +113,7 @@ def __init__(
alarms=[
watchdog_nested_stack.alarm_asg_instance_left_up,
watchdog_nested_stack.alarm_container_activity,
# watchdog_nested_stack.alarm_capacity_provider,
watchdog_nested_stack.alarm_break_crash_loop_count,
],
),

Expand All @@ -131,7 +130,6 @@ def __init__(
# https://docs.aws.amazon.com/cdk/api/v2/docs/aws-cdk-lib.aws_cloudwatch.GraphWidget.html
cloudwatch.GraphWidget(
title="(ASG) All Network Traffic",
# Only show up to an hour ago:
height=6,
width=12,
right=[
Expand All @@ -158,14 +156,14 @@ def __init__(
alarm=watchdog_nested_stack.alarm_container_activity,
),

# ## Capacity Provider Alarm:
# # https://docs.aws.amazon.com/cdk/api/v2/docs/aws-cdk-lib.aws_cloudwatch.AlarmWidget.html
# cloudwatch.AlarmWidget(
# title=f"Alarm: {watchdog_nested_stack.alarm_capacity_provider.alarm_name}",
# width=6,
# height=5,
# alarm=watchdog_nested_stack.alarm_capacity_provider,
# ),
## Crash Loop Alarm:
# https://docs.aws.amazon.com/cdk/api/v2/docs/aws-cdk-lib.aws_cloudwatch.AlarmWidget.html
cloudwatch.AlarmWidget(
title=f"Alarm: {watchdog_nested_stack.alarm_break_crash_loop_count.alarm_name}",
width=6,
height=5,
alarm=watchdog_nested_stack.alarm_break_crash_loop_count,
),

## Show the Container Logs:
# https://docs.aws.amazon.com/cdk/api/v2/docs/aws-cdk-lib.aws_cloudwatch.LogQueryWidget.html
Expand Down
82 changes: 10 additions & 72 deletions ContainerManager/leaf_stack/NestedStacks/EcsAsg.py
Original file line number Diff line number Diff line change
Expand Up @@ -193,10 +193,10 @@ def __init__(
## Since the instances don't live long, this doesn't do anything, and
# the lambda to spin down the system will trigger TWICE when going down.
enable_managed_draining=False,
## We need the `CapacityProviderReservation` metric to know when to kill the ec2 instance
# if the container exits on startup. (Otherwise the task infinite loops, and since it
# successfully started FIRST, the circuit breaker won't stop it).
enable_managed_scaling=True,
## We directly manage the ASG, that's how this architecture is designed.
# And since we'll only ever have 1 or 0 instances, we don't need this. Save on
# cloudwatch api calls, and clean up the console instead.
enable_managed_scaling=False,
)

self.ecs_cluster.add_asg_capacity_provider(self.capacity_provider)
Expand All @@ -213,81 +213,19 @@ def __init__(
cluster=self.ecs_cluster,
task_definition=task_definition,
desired_count=0,
circuit_breaker={
"rollback": False # Don't keep trying to restart the container if it fails
},
## We use the 'spin-down-asg-on-error' lambda to take care of circuit breaker-like
## logic. If we *just* spun down the task, the instance would still be running.
## That'd both charge money, and not let the system "spin back up/reset".
# circuit_breaker={
# "rollback": False # Don't keep trying to restart the container if it fails
# },
capacity_provider_strategies=[capacity_provider_strategy],
### Puts each task in a particular group, on a different instance:
### (Not sure if we want this. We will only ever have one instance, and it adds complexity)
# placement_constraints=[ecs.PlacementConstraint.distinct_instances()],
# placement_strategies=[ecs.PlacementStrategy.spread_across_instances()],
)


##########################
### Notification Stuff ###
##########################

## EventBridge Rule: Send notification to user when ECS Task spins up or down:
# https://docs.aws.amazon.com/cdk/api/v2/docs/aws-cdk-lib.aws_events.Rule.html
message = events.RuleTargetInput.from_text("\n".join([
f"Container for '{container_id}' has started!",
f"Connect to it at: '{container_url}'.",
]))
self.rule_notify_up = events.Rule(
self,
"RuleNotifyUp",
rule_name=f"{container_id}-rule-notify-up",
description="Let user know when system finishes spinning UP",
# https://docs.aws.amazon.com/cdk/api/v2/docs/aws-cdk-lib.aws_events.EventPattern.html
event_pattern=events.EventPattern(
source=["aws.ecs"],
detail_type=["ECS Task State Change"],
detail={
"clusterArn": [self.ecs_cluster.cluster_arn],
# You only care if the TASK starts, or the INSTANCE stops:
"lastStatus": ["RUNNING"],
"desiredStatus": ["RUNNING"],
},
),
targets=[
# https://docs.aws.amazon.com/cdk/api/v2/docs/aws-cdk-lib.aws_events_targets.SnsTopic.html
events_targets.SnsTopic(
base_stack_sns_topic,
message=message,
),
events_targets.SnsTopic(
leaf_stack_sns_topic,
message=message,
),
],
)

## Same thing, but notify user when task spins down finally:
## (Can't combine with above target, since we care about different 'detail_type'.
## Don't want to spam the user sadly.)
# https://docs.aws.amazon.com/cdk/api/v2/docs/aws-cdk-lib.aws_events.Rule.html
message = events.RuleTargetInput.from_text(f"Container for '{container_id}' has stopped.")
self.rule_notify_down = events.Rule(
self,
"RuleNotifyDown",
rule_name=f"{container_id}-rule-notify-down",
description="Let user know when system finishes spinning down",
# https://docs.aws.amazon.com/cdk/api/v2/docs/aws-cdk-lib.aws_events.EventPattern.html
event_pattern=events.EventPattern(
source=["aws.autoscaling"],
detail_type=["EC2 Instance-terminate Lifecycle Action"],
detail={
"AutoScalingGroupName": [self.auto_scaling_group.auto_scaling_group_name],
},
),
targets=[
# https://docs.aws.amazon.com/cdk/api/v2/docs/aws-cdk-lib.aws_events_targets.SnsTopic.html
events_targets.SnsTopic(base_stack_sns_topic, message=message),
events_targets.SnsTopic(leaf_stack_sns_topic, message=message),
],
)

#####################
### cdk_nag stuff ###
#####################
Expand Down
Loading

0 comments on commit dfa628e

Please sign in to comment.