From a382166b0c800dca97b2bc01a734f6e1a84e4189 Mon Sep 17 00:00:00 2001 From: Timothy Clarke Date: Mon, 8 Mar 2021 15:32:13 +0000 Subject: [PATCH 1/2] Roundrobin DNS for a shared hostname --- README.md | 34 +++++-- lambda/multihost/multihost.py | 166 ++++++++++++++++++++++++++++++++++ main.tf | 161 ++++++++++++++++++++------------- outputs.tf | 7 +- variables.tf | 2 +- 5 files changed, 299 insertions(+), 71 deletions(-) create mode 100644 lambda/multihost/multihost.py diff --git a/README.md b/README.md index 55d8f0a..1b1380e 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ ## Purpose -This Terraform module sets up everything necessary for dynamically setting hostnames following a certain pattern on instances spawned by AWS Auto Scaling Groups (ASGs). +This Terraform module sets up everything necessary for dynamically setting hostnames following a certain pattern on instances spawned by AWS Auto Scaling Groups (ASGs). Learn more about our motivation to build this module in [this blog post](https://underthehood.meltwater.com/blog/2020/02/07/dynamic-route53-records-for-aws-auto-scaling-groups-with-terraform/). 
@@ -14,11 +14,13 @@ Learn more about our motivation to build this module in [this blog post](https:/ - [Terraform AWS provider](https://github.com/terraform-providers/terraform-provider-aws) 2.0+ ## Usage +`vpc_name` below is the DNS zone name +### Per instance names Create an ASG and set the `asg:hostname_pattern` tag for example like this: ``` -asg-test-#instanceid.asg-handler-vpc.testing@Z3QP9GZSRL8IVA +asg-test-#instanceid.example.com@Z3QP9GZSRL8IVA ``` Could be interpolated in Terraform like this: @@ -26,12 +28,30 @@ Could be interpolated in Terraform like this: ```hcl tag { key = "asg:hostname_pattern" - value = "${var.hostname_prefix}-#instanceid.${var.vpc_name}.testing@${var.internal_zone_id}" + value = format("%s-#instanceid.%s@%s", var.hostname_prefix, var.vpc_name, var.internal_zone_id) propagate_at_launch = true } ``` - + +### Single DNS name for the entire ASG +Primary use of this is to manage round robin DNS. Keep in mind that you should still use health checks for any custom loadbalancers. +Following from the examples above we omit the `-#instanceid` portion. e.g. `mail-servers.example.com@ABCDEFGHIJ123` and use the key `asg:multihost_pattern` +Also note the ASG lifecycle_hook should use `notification_target_arn = module.autoscale_dns.multihost_handling_sns_topic_arn` + +While you could have a host in both the `-#instanceid` and single DNS name, it is advised against doing this as both handlers will attempt to rename the instance. The DNS should be fine, the instance name will be in an unknown state. 
+ +```hcl +tag { + key = "asg:multihost_pattern" + value = format("%s.%s@%s", var.hostname_prefix, var.vpc_name, var.internal_zone_id) + propagate_at_launch = true +} +``` + + +### Common Once you have your ASG set up, you can just invoke this module and point to it: +`use_public_ip` defaults to false ```hcl module "clever_name_autoscale_dns" { source = "meltwater/asg-dns-handler/aws" @@ -39,7 +59,7 @@ module "clever_name_autoscale_dns" { # use_public_ip = true autoscale_handler_unique_identifier = "clever_name" autoscale_route53zone_arn = "ABCDEFGHIJ123" - vpc_name = "my_vpc" + vpc_name = "example.com" } ``` @@ -113,7 +133,7 @@ resource "aws_autoscaling_group" "my_asg" { module "autoscale_dns" { source = "meltwater/asg-dns-handler/aws" version = "x.y.z" - + autoscale_handler_unique_identifier = "my_asg_handler" autoscale_route53zone_arn = var.internal_zone_id vpc_name = var.vpc_name @@ -122,7 +142,7 @@ module "autoscale_dns" { ## Difference between Lifecycle action -Lifecycle_hook can have `CONTINUE` or `ABANDON` as default_result. By setting default_result to `ABANDON` will terminate the instance if the lambda function fails to update the DNS record as required. `Complete_lifecycle_action` in lambda function returns `LifecycleActionResult` as `CONTINUE` on success to Lifecycle_hook. But if lambda function fails, Lifecycle_hook doesn't get any response from `Complete_lifecycle_action` which results in timeout and terminates the instance. +Lifecycle_hook can have `CONTINUE` or `ABANDON` as default_result. By setting default_result to `ABANDON` will terminate the instance if the lambda function fails to update the DNS record as required. `Complete_lifecycle_action` in lambda function returns `LifecycleActionResult` as `CONTINUE` on success to Lifecycle_hook. But if lambda function fails, Lifecycle_hook doesn't get any response from `Complete_lifecycle_action` which results in timeout and terminates the instance. 
At the conclusion of a lifecycle hook, the result is either ABANDON or CONTINUE. If the instance is launching, CONTINUE indicates that your actions were successful, and that the instance can be put into service. Otherwise, ABANDON indicates that your custom actions were unsuccessful, and that the instance can be terminated. diff --git a/lambda/multihost/multihost.py b/lambda/multihost/multihost.py new file mode 100644 index 0000000..0852805 --- /dev/null +++ b/lambda/multihost/multihost.py @@ -0,0 +1,166 @@ +import json +import logging +import boto3 +import sys +import os + +logger = logging.getLogger() +logger.setLevel(logging.INFO) + +autoscaling = boto3.client('autoscaling') +ec2 = boto3.client('ec2') +route53 = boto3.client('route53') + +HOSTNAME_TAG_NAME = "asg:multihost_pattern" + +LIFECYCLE_KEY = "LifecycleHookName" +ASG_KEY = "AutoScalingGroupName" + +# Fetches IP of an instance via EC2 API +def fetch_ip_from_ec2(instance_id): + logger.info("Fetching IP for instance-id: %s", instance_id) + ec2_response = ec2.describe_instances(InstanceIds=[instance_id]) + if 'use_public_ip' in os.environ and os.environ['use_public_ip'] == "true": + ip_address = ec2_response['Reservations'][0]['Instances'][0]['PublicIpAddress'] + logger.info("Found public IP for instance-id %s: %s", instance_id, ip_address) + else: + ip_address = ec2_response['Reservations'][0]['Instances'][0]['PrivateIpAddress'] + logger.info("Found private IP for instance-id %s: %s", instance_id, ip_address) + + return ip_address + +# Fetches IP of an instance via route53 API +def fetch_ip_from_route53(hostname, zone_id): + logger.info("Fetching IP for hostname: %s", hostname) + + ip_address = route53.list_resource_record_sets( + HostedZoneId=zone_id, + StartRecordName=hostname, + StartRecordType='A', + MaxItems='1' + )['ResourceRecordSets'][0]['ResourceRecords'][0]['Value'] + + logger.info("Found IP for hostname %s: %s", hostname, ip_address) + + return ip_address + +# Fetches relevant tags from ASG +# 
Returns tuple of hostname_pattern, zone_id +def fetch_tag_metadata(asg_name): + logger.info("Fetching tags for ASG: %s", asg_name) + + tag_value = autoscaling.describe_tags( + Filters=[ + {'Name': 'auto-scaling-group','Values': [asg_name]}, + {'Name': 'key','Values': [HOSTNAME_TAG_NAME]} + ], + MaxRecords=1 + )['Tags'][0]['Value'] + + logger.info("Found tags for ASG %s: %s", asg_name, tag_value) + + return tag_value.split("@") + +# Updates the name tag of an instance +def update_name_tag(instance_id, hostname): + tag_name = hostname.split('.')[0] + logger.info("Updating name tag for instance-id %s with: %s", instance_id, tag_name) + ec2.create_tags( + Resources = [ + instance_id + ], + Tags = [ + { + 'Key': 'Name', + 'Value': tag_name + } + ] + ) + +# Updates a Route53 record +def update_record(zone_id, ips, hostname): + if len(ips) == 0: + ips.append({'Value': fetch_ip_from_route53(hostname, zone_id)}) + operation = 'DELETE' + else: + operation = 'UPSERT' + logger.info("Changing record with %s for %s -> %s in %s", operation, hostname, ips, zone_id) + route53.change_resource_record_sets( + HostedZoneId=zone_id, + ChangeBatch={ + 'Changes': [ + { + 'Action': operation, + 'ResourceRecordSet': { + 'Name': hostname, + 'Type': 'A', + 'TTL': 300, + 'ResourceRecords': ips + } + } + ] + } + ) + +def process_asg(autoScalingGroupName, hostname): + # Iterate through the instance group: Put IP addresses into a list and update the instance names to match the group. 
+ ips = [] + # IP's is a list of dictionaries [{'Value': ipAddr1},{'Value': ipAddr2}] eg [{'Value':'127.0.0.1'}] + for instance in autoscaling.describe_auto_scaling_groups(AutoScalingGroupNames=[autoScalingGroupName])['AutoScalingGroups'][0]['Instances']: + ips.append({'Value': fetch_ip_from_ec2(instance['InstanceId']) }) + update_name_tag(instance['InstanceId'], hostname) + return ips + + + # Processes a scaling event + # Builds a hostname from tag metadata, fetches a IP, and updates records accordingly +def process_message(message): + if 'LifecycleTransition' not in message: + logger.info("Processing %s event", message['Event']) + return + logger.info("Processing %s event", message['LifecycleTransition']) + + if message['LifecycleTransition'] not in ("autoscaling:EC2_INSTANCE_LAUNCHING","autoscaling:EC2_INSTANCE_TERMINATING", "autoscaling:EC2_INSTANCE_LAUNCH_ERROR"): + logger.error("Encountered unknown event type: %s", message['LifecycleTransition']) + + asg_name = message['AutoScalingGroupName'] + instance_id = message['EC2InstanceId'] + + hostname, zone_id = fetch_tag_metadata(asg_name) + + ipAddrs = process_asg(asg_name, hostname) + update_record(zone_id, ipAddrs, hostname) + +# Picks out the message from a SNS message and deserializes it +def process_record(record): + process_message(json.loads(record['Sns']['Message'])) + +# Main handler where the SNS events end up to +# Events are bulked up, so process each Record individually +def lambda_handler(event, context): + logger.info("Processing SNS event: " + json.dumps(event)) + + for record in event['Records']: + process_record(record) + +# Finish the asg lifecycle operation by sending a continue result + logger.info("Finishing ASG action") + message = json.loads(record['Sns']['Message']) + if LIFECYCLE_KEY in message and ASG_KEY in message : + response = autoscaling.complete_lifecycle_action ( + LifecycleHookName = message['LifecycleHookName'], + AutoScalingGroupName = message['AutoScalingGroupName'], + 
+ InstanceId = message['EC2InstanceId'], + LifecycleActionToken = message['LifecycleActionToken'], + LifecycleActionResult = 'CONTINUE' + ) + logger.info("ASG action complete: %s", response) + else : + logger.error("No valid JSON message") + +# if invoked manually, assume someone pipes in a event json +if __name__ == "__main__": + logging.basicConfig() + + lambda_handler(json.load(sys.stdin), None) + diff --git a/main.tf b/main.tf index 04413ca..6b6d369 100644 --- a/main.tf +++ b/main.tf @@ -1,77 +1,74 @@ resource "aws_sns_topic" "autoscale_handling" { - name = "${var.vpc_name}-${var.autoscale_handler_unique_identifier}" + name = format("%s-%s", var.vpc_name, var.autoscale_handler_unique_identifier) +} + +resource "aws_sns_topic" "autoscale_multihost_handling" { + name = format("%s-%s-multi", var.vpc_name, var.autoscale_handler_unique_identifier) } resource "aws_iam_role_policy" "autoscale_handling" { - name = "${var.vpc_name}-${var.autoscale_handler_unique_identifier}" + name = format("%s-%s", var.vpc_name, var.autoscale_handler_unique_identifier) role = aws_iam_role.autoscale_handling.name - policy = < Date: Tue, 9 Mar 2021 13:47:38 +0000 Subject: [PATCH 2/2] Addressed all errors I could generate --- README.md | 8 +++++ lambda/multihost/multihost.py | 55 ++++++++++++++++++++++++++--------- 2 files changed, 50 insertions(+), 13 deletions(-) diff --git a/README.md b/README.md index 1b1380e..3d19683 100644 --- a/README.md +++ b/README.md @@ -40,6 +40,14 @@ Also note the ASG lifestyle_hook should use `notification_target_arn = module.au While you could have a host in both the `-#instanceid` and single DNS name, it is advised against doing this as both handlers will attempt to rename the instance. The DNS should be fine, the instance name will be in an unknown state. +**Constraints of running a pool** +If multiple events are happening in quick succession we may get into situations where later runs pick up instances that have not finished terminating. 
+In situations where multiple terminations are expected it may be better to change the logic from + * Scanning the ASG and building the IP list from there +to + * Grabbing the IP list from EC2 and stripping out the instance. This may require more information to be stored in TXT entries to map instance ID's to IP addresses +In general it is expected that on busy ASG's there will be residual IP's between scaling events + ```hcl tag { key = "asg:multihost_pattern" diff --git a/lambda/multihost/multihost.py b/lambda/multihost/multihost.py index 0852805..9dc4948 100644 --- a/lambda/multihost/multihost.py +++ b/lambda/multihost/multihost.py @@ -16,16 +16,32 @@ LIFECYCLE_KEY = "LifecycleHookName" ASG_KEY = "AutoScalingGroupName" +# Constraints of running a pool +# If multiple events are happening in quick succession we may get into situations where later runs pick up instances that have not finished terminating. +# In situations where multiple terminations are expected it may be better to change the logic from +# * Scanning the ASG and building the IP list from there +# to +# * Grabbing the IP list from EC2 and stripping out the instance. 
This may require more information to be stored in TXT entries to map instance ID's to IP addresses +# In general it is expected that on busy ASG's there will be residual IP's + # Fetches IP of an instance via EC2 API def fetch_ip_from_ec2(instance_id): logger.info("Fetching IP for instance-id: %s", instance_id) - ec2_response = ec2.describe_instances(InstanceIds=[instance_id]) - if 'use_public_ip' in os.environ and os.environ['use_public_ip'] == "true": - ip_address = ec2_response['Reservations'][0]['Instances'][0]['PublicIpAddress'] - logger.info("Found public IP for instance-id %s: %s", instance_id, ip_address) - else: - ip_address = ec2_response['Reservations'][0]['Instances'][0]['PrivateIpAddress'] - logger.info("Found private IP for instance-id %s: %s", instance_id, ip_address) + ip_address = None + ec2_response = ec2.describe_instances(InstanceIds=[instance_id])['Reservations'][0]['Instances'][0] + if ec2_response['State']['Name'] == 'running': + if 'use_public_ip' in os.environ and os.environ['use_public_ip'] == "true": + try: + ip_address = ec2_response['PublicIpAddress'] + logger.info("Found public IP for instance-id %s: %s", instance_id, ip_address) + except: + logger.info("No public IP for instance-id %s: %s", instance_id, ip_address) + else: + try: + ip_address = ec2_response['PrivateIpAddress'] + logger.info("Found private IP for instance-id %s: %s", instance_id, ip_address) + except: + logger.info("No private IP for instance-id %s: %s", instance_id, ip_address) return ip_address @@ -102,13 +118,21 @@ def update_record(zone_id, ips, hostname): } ) -def process_asg(autoScalingGroupName, hostname): +def process_asg(auto_scaling_group_name, hostname, ignore_instance): # Iterate through the instance group: Put IP addresses into a list and update the instance names to match the group. + # ignore_instance should only be provided if we are terminating an instance. 
ips = [] # IP's is a list of dictionaries [{'Value': ipAddr1},{'Value': ipAddr2}] eg [{'Value':'127.0.0.1'}] - for instance in autoscaling.describe_auto_scaling_groups(AutoScalingGroupNames=[autoScalingGroupName])['AutoScalingGroups'][0]['Instances']: - ips.append({'Value': fetch_ip_from_ec2(instance['InstanceId']) }) - update_name_tag(instance['InstanceId'], hostname) + if ignore_instance is None: + logger.info("Processing ASG %s", auto_scaling_group_name) + else: + logger.info("Ignoring instance-id %s while Processing ASG %s", ignore_instance, auto_scaling_group_name) + for instance in autoscaling.describe_auto_scaling_groups(AutoScalingGroupNames=[auto_scaling_group_name])['AutoScalingGroups'][0]['Instances']: + if ignore_instance != instance['InstanceId']: + ipAddr = fetch_ip_from_ec2(instance['InstanceId']) + if ipAddr is not None: + ips.append({'Value': ipAddr}) + update_name_tag(instance['InstanceId'], hostname) return ips @@ -126,10 +150,15 @@ def process_message(message): asg_name = message['AutoScalingGroupName'] instance_id = message['EC2InstanceId'] + ignore_instance = None + if message['LifecycleTransition'] == 'autoscaling:EC2_INSTANCE_TERMINATING': + ignore_instance = instance_id + logger.info("The following instance-id should be ignored %s", instance_id) + hostname, zone_id = fetch_tag_metadata(asg_name) - ipAddrs = process_asg(asg_name, hostname) - update_record(zone_id, ipAddrs, hostname) + ip_addrs = process_asg(asg_name, hostname, ignore_instance) + update_record(zone_id, ip_addrs, hostname) # Picks out the message from a SNS message and deserializes it def process_record(record):