diff --git a/images/apache_splash.png b/images/apache_splash.png deleted file mode 100644 index 1ed604b..0000000 Binary files a/images/apache_splash.png and /dev/null differ diff --git a/images/halloween_panda.png b/images/halloween_panda.png index 14103c2..3020a5d 100644 Binary files a/images/halloween_panda.png and b/images/halloween_panda.png differ diff --git a/images/init.png b/images/init.png index 312ef6f..d3c85e7 100644 Binary files a/images/init.png and b/images/init.png differ diff --git a/images/step01.png b/images/step01.png index 28759fe..adef670 100644 Binary files a/images/step01.png and b/images/step01.png differ diff --git a/images/step01_templates.png b/images/step01_templates.png new file mode 100644 index 0000000..67b15b8 Binary files /dev/null and b/images/step01_templates.png differ diff --git a/images/step01_web.png b/images/step01_web.png new file mode 100644 index 0000000..e0c7803 Binary files /dev/null and b/images/step01_web.png differ diff --git a/images/step02_tags.png b/images/step02_tags.png new file mode 100644 index 0000000..7b8231f Binary files /dev/null and b/images/step02_tags.png differ diff --git a/images/step02a.png b/images/step02a.png new file mode 100644 index 0000000..787b6b2 Binary files /dev/null and b/images/step02a.png differ diff --git a/images/step03.png b/images/step03.png new file mode 100644 index 0000000..a601723 Binary files /dev/null and b/images/step03.png differ diff --git a/steps/init/README.md b/steps/init/README.md index 1eea8be..ecb7a44 100644 --- a/steps/init/README.md +++ b/steps/init/README.md @@ -31,7 +31,6 @@ Most of the terraform files in this workshop use the same variables, so we'll cr ```hcl panda_name = "" -asset_bucket = "" ``` If you're running this as part of the live session, replace `` with the name we'll provide you with and `` with `630895193694-eu-west-2-oct-assets`, otherwise feel free to use your own details, or similar. 
@@ -49,38 +48,14 @@ If the command runs ok, we can then apply the changes with `terraform apply --au Once the command completes, you should see something like (although the actual values may be different): ```text -Apply complete! Resources: 14 added, 0 changed, 0 destroyed. +Apply complete! Resources: 20 added, 0 changed, 0 destroyed. Outputs: -public_subnet_details = { - "subnet-02b9d202802d4daa5" = { - "availability_zone" = "eu-west-2a" - "cidr_block" = "10.100.0.0/24" - } - "subnet-04cfa858a2cc3c3e6" = { - "availability_zone" = "eu-west-2c" - "cidr_block" = "10.100.2.0/24" - } - "subnet-092dbf5c60743a722" = { - "availability_zone" = "eu-west-2b" - "cidr_block" = "10.100.1.0/24" - } -} -public_subnet_ids = [ - "subnet-02b9d202802d4daa5", - "subnet-092dbf5c60743a722", - "subnet-04cfa858a2cc3c3e6", -] -subnet_cidr_ranges = [ - "10.100.0.0/24", - "10.100.1.0/24", - "10.100.2.0/24", -] -vpc_cidr_ranges = "10.100.0.0/16" -vpc_id = "vpc-026d7af20963b827b" -vpc_security_group_id = "sg-0a4d936e73c776c74c" +fis_iam_role = "funky_fis_iam_role" +fis_log_group = "/aws/fis/funky-fis-logs" ``` +:information_source: Make a note of the output values as we'll need these later. ## Sharing the configuration with other steps To allow the following steps in the workshop to use configuration we've just created, we'll store the details in a Terraform statefile `statefiles/base_config.tfstate`. 
diff --git a/steps/init/cloudwatch.tf b/steps/init/cloudwatch.tf new file mode 100644 index 0000000..e8c0cd8 --- /dev/null +++ b/steps/init/cloudwatch.tf @@ -0,0 +1,4 @@ +resource "aws_cloudwatch_log_group" "log_group" { + name = "/aws/fis/${var.panda_name}-fis-logs" + retention_in_days = 1 +} \ No newline at end of file diff --git a/steps/init/data.tf b/steps/init/data.tf index fe97a60..ec4c014 100644 --- a/steps/init/data.tf +++ b/steps/init/data.tf @@ -3,10 +3,4 @@ data "aws_availability_zones" "available_azs" { state = "available" } - -data "aws_caller_identity" "current" { -} - -data "aws_s3_bucket" "playground_assets" { - bucket = "${local.account_id}-${var.region}-oct-assets" -} \ No newline at end of file +data "aws_caller_identity" "current" {} diff --git a/steps/init/iam.tf b/steps/init/iam.tf new file mode 100644 index 0000000..40e0433 --- /dev/null +++ b/steps/init/iam.tf @@ -0,0 +1,53 @@ +resource "aws_iam_role" "fis_role" { + name = "${var.panda_name}_fis_iam_role" + assume_role_policy = jsonencode({ + Version = "2012-10-17", + Statement = [ + { + Effect = "Allow", + Principal = { + Service = [ + "fis.amazonaws.com", + "delivery.logs.amazonaws.com" + ] + }, + Action = "sts:AssumeRole" + } + ] + }) +} + +resource "aws_iam_policy_attachment" "ec2_fis_policy_attachment" { + name = "ec2_instance_policy_attachment_${local.resource_suffix}" + roles = [aws_iam_role.fis_role.name] + policy_arn = "arn:aws:iam::aws:policy/service-role/AWSFaultInjectionSimulatorEC2Access" +} + +resource "aws_iam_policy" "cloudwatch_logs_policy" { + name = "${var.panda_name}_cloudwatch_logs_policy" + description = "Policy to allow CloudWatch log group and log stream access" + policy = jsonencode({ + Version = "2012-10-17", + Statement = [ + { + Effect = "Allow", + Action = [ + "logs:DescribeLogGroups", + "logs:CreateLogDelivery", + "logs:CreateLogGroup", + "logs:CreateLogStream", + "logs:PutLogEvents" + ], + Resource = [ + "*" + ] + } + ] + }) +} + +resource 
"aws_iam_policy_attachment" "cloudwatch_logs_policy_attachment" { + name = "${var.panda_name}_cloudwatch_logs_policy_attachment" + roles = [aws_iam_role.fis_role.name] + policy_arn = aws_iam_policy.cloudwatch_logs_policy.arn +} diff --git a/steps/init/outputs.tf b/steps/init/outputs.tf index 3c31aba..15f6d4b 100644 --- a/steps/init/outputs.tf +++ b/steps/init/outputs.tf @@ -1,40 +1,9 @@ -output "asset_bucket_arn" { - description = "ARN of the asset bucket." - value = data.aws_s3_bucket.playground_assets.arn +output "fis_iam_role" { + description = "IAM Role for FIS." + value = aws_iam_role.fis_role.name } -output "asset_bucket_name" { - description = "name of the asset bucket." - value = data.aws_s3_bucket.playground_assets.id +output "fis_log_group" { + description = "Log Group for FIS." + value = aws_cloudwatch_log_group.log_group.name } - -output "public_subnet_details" { - description = "Details of the public subnets." - value = module.vpc.public_subnet_details -} - -output "public_subnet_ids" { - description = "IDs of the public subnets." - value = module.vpc.public_subnet_ids -} - -output "vpc_security_group_id" { - description = "ID of the VPC Security Group." - value = module.vpc.security_group_id -} - -output "subnet_cidr_ranges" { - description = "CIDR Range for Subnets." - value = local.subnet_cidr_ranges -} - -output "vpc_cidr_ranges" { - description = "CIDR Range for VPC." - value = local.vpc_cidr_range -} - -output "vpc_id" { - description = "ID of the VPC." - value = module.vpc.vpc_id -} - diff --git a/steps/init/variables.tf b/steps/init/variables.tf index f122aac..e73d4e8 100644 --- a/steps/init/variables.tf +++ b/steps/init/variables.tf @@ -1,9 +1,3 @@ - -variable "asset_bucket" { - description = "Name of the asset bucket." 
- type = string -} - variable "panda_name" { description = "My Panda Name" type = string @@ -28,5 +22,5 @@ locals { vpc_cidr_range = "10.${local.panda_number + local.cidr_offset}.0.0/16" subnet_cidr_ranges = [for i in range(length(data.aws_availability_zones.available_azs.names)) : "10.${local.panda_number + local.cidr_offset}.${i}.0/24"] - account_id = data.aws_caller_identity.current.account_id + # account_id = data.aws_caller_identity.current.account_id } \ No newline at end of file diff --git a/steps/modules/vpc/sg.tf b/steps/modules/vpc/sg.tf index 8cb8604..8ab4f3c 100644 --- a/steps/modules/vpc/sg.tf +++ b/steps/modules/vpc/sg.tf @@ -1,10 +1,19 @@ resource "aws_security_group" "ec2_sg" { - name = "step01_${var.resource_suffix}_sg" + name = "${var.resource_suffix}_sg" description = "Allow inbound traffic on port 80" vpc_id = aws_vpc.this.id } +resource "aws_vpc_security_group_ingress_rule" "allow_http_sg" { + security_group_id = aws_security_group.ec2_sg.id + + from_port = 80 + to_port = 80 + ip_protocol = "tcp" + referenced_security_group_id = aws_security_group.ec2_sg.id +} + resource "aws_vpc_security_group_ingress_rule" "allow_http" { security_group_id = aws_security_group.ec2_sg.id @@ -23,11 +32,20 @@ resource "aws_vpc_security_group_ingress_rule" "allow_https" { cidr_ipv4 = "0.0.0.0/0" } -resource "aws_vpc_security_group_egress_rule" "allow_https_connect" { +resource "aws_vpc_security_group_egress_rule" "allow_https" { security_group_id = aws_security_group.ec2_sg.id from_port = 443 to_port = 443 ip_protocol = "tcp" cidr_ipv4 = "0.0.0.0/0" +} + +resource "aws_vpc_security_group_egress_rule" "allow_http_sg" { + security_group_id = aws_security_group.ec2_sg.id + + from_port = 80 + to_port = 80 + ip_protocol = "tcp" + referenced_security_group_id = aws_security_group.ec2_sg.id } \ No newline at end of file diff --git a/steps/step01/README.md b/steps/step01/README.md index 3de57cc..422590f 100644 --- a/steps/step01/README.md +++ b/steps/step01/README.md @@ 
-1,44 +1,112 @@ -# Step 01 - Creating base infrastructure +# Step 01 - First experiment with FIS, a simple single server setup -For several steps in this workshop, we'll be using AWS. To get started, we need to create some base infrastructure. +To get started, we'll need to deploy a simple server in AWS. We'll then use FIS to simulate a failure on this server. -Firstly, we're going to create a Virtual Private Cloud (VPC) to isolate our resources, and to make it simpler when we come to test later. +Please be aware that we're deliberately keeping this configuration simple to reduce costs and complexity. In a live environment, we'd use private subnets and NAT gateways. +:warning: Before you start this step, make sure you've deployed the base infrastructure in the [init](../init/README.md) step. -```mermaid -architecture-beta - group api(logos:aws-lambda)[API] +## Pre-requisites +Before we go further with the step, we need to ensure that +1. We have some active AWS credentials, +1. We have deployed the base infrastructure in the [init](../init/README.md) step. +1. This deployment assumes there is a route53 hosted zone in place, pointing to the domain referenced in `variables.tf`. If you don't have a hosted zone/domain, ignore this and use the url with an IP, but be aware that the IP may change if the server is stopped and started again. - service db(logos:aws-aurora)[Database] in api - service disk1(logos:aws-glacier)[Storage] in api - service disk2(logos:aws-s3)[Storage] in api - service server(logos:aws-ec2)[Server] in api +## Deploying an environment to test +To deploy the environment, we need to run the Terraform code in the `step01` directory. - db:L -- R:server - disk1:T -- B:server - disk2:T -- B:db -``` +Firstly, we need to initialise our Terraform environment by running the command `terraform init`. This will download the required providers and setup an environment ready to use. 
+Once initialised, it's always good practice to check our Terraform code, and we can do this with the command `terraform plan --var-file ../common/common.tfvars`. This will show us what Terraform is going to do. -```mermaid -architecture-beta - group api(cloud)[API] +If the command runs ok, we can then apply the changes with `terraform apply --auto-approve --var-file ../common/common.tfvars`. This will create the infrastructure in the specified AWS account. - service db(internet)[Database] in api - service disk1(disk)[Storage] in api - service disk2(disk)[Storage] in api - service server(server)[Server] in api +Once the command completes, you should see something like (although the actual values may be different): - db:L -- R:server - disk1:T -- B:server - disk2:T -- B:db -``` +```text +Apply complete! Resources: 6 added, 0 changed, 0 destroyed. + +Outputs: + +ec2_details = { + "instance_az" = "eu-west-2a" + "instance_id" = "i-08ede8fe7eb507f58" + "instance_ip_url" = "http://18.170.32.169" + "instance_name" = "simple-funky" + "instance_named_url" = "http://simple-funky.devopsplayground.org" + "instance_subnet" = "subnet-0a1170859dd348b2b" +} -```mermaid -flowchart TD - A[Client] -->|Request| B(API) - B --> C{Condition} - C -->|Success| D[Response] - C -->|Error| E[Error message] ``` + +:information_source: Make a note of the `instance_id` in the output from your deployment as we'll need this shortly. + +## What have we deployed? +For this first step, we've deployed a simple EC2 instance running a web server. The instance is in a public subnet, and has a security group allowing inbound traffic on port 80. +The EC2 instance is configured to update route53 with the public IP address of the instance on every reboot, so we can access it via a domain name (if you have one available). 
+ +The infrastructure deployed is shown in the diagram below: + +![Simple server setup](../../images/step01.png) + + +With the server deployed, we can now test it by visiting the URL in the `instance_ip_url` output. This should show a simple web page that looks something like: + +![Web page](../../images/step01_web.png) + +(:warning: **Note**: it may take a few minutes for the server to become fully available) + + +## Creating our first experiment with FIS +Now that we have our server deployed, we can create our first experiment with FIS. For this we're going to simulate a failure on the server by stopping the instance. + +Firstly, we need to create an experiment template within FIS. These templates describe the actions that we're going to carry out in the experiment. To do this, +we need to access FIS in the AWS console. You can do this by navigating to the [FIS console](https://eu-west-2.console.aws.amazon.com/fis/home?region=eu-west-2#/home) and selecting the `Experiment templates` option from the left-hand menu, or by entering FIS in the AWS search bar, selecting FIS and then clicking on `Experiment templates`. + +Once in the `Experiment templates` page, click on the `Create experiment template` button. This will take you to the `Create experiment template` page. We're only going to +test against our account, so ensure that the `Account` option is selected, and click on `Confirm`. + +Next we'll define our experiment template with the following steps: + +1. Firstly, provide a description for the template, such as `Stop step01 EC2 instance`, and add a name, something including your panda name at the end to make it easier to identify. +1. In the `Actions` section, click on the `Add action` button. This will open a new section where we can define the action. + 1. In the name field, enter `stop-ec2`. + 1. Select `EC2` in the action type field, and then select `aws:ec2:stop-instances` in the action field. 
Depending on the action chosen, we'll see different options to complete. In this case, we're going to restart the server after 2 minutes. + 1. In the `Start instances after duration` field, enter 2 minutes. This will ensure that the instance is restarted after 2 minutes. + 1. Click on the `Save` button. +1. You'll see that in the `Targets` option it's automatically added a `Instances-Target` target. The target will depend upon the action chosen. +1. Click on `aws:ec2:instance` to specify which instance we want to stop + 1. In the `Resource IDs` field, either scroll or search in the dropdown using the instance id we noted earlier. + 1. Ensure that the field `Selection mode` contains the value `All`. + 1. Click on the `Save` button. +1. Scroll down and ensure that the `Use an existing IAM role` option is ticked, and choose the IAM role created in the init step (:exclamation: The role name should start with the `panda_name` value). +1. Click on the `Create an experiment template` button, and in the warning field, enter `create` and then click the `Create an experiment template` button. + +With this done, we should be able to see the experiment we just created in the `Experiment templates` page in the AWS console at https://eu-west-2.console.aws.amazon.com/fis/home?region=eu-west-2#ExperimentTemplates, looking something like: + +![Experiment template](../../images/step01_templates.png) + +## Running our first experiment +We now have a template we can use to initiate an experiment. From the templates page in the console, +tick the box next to the template you just created and then click the `Start Experiment` button. For now, after this, just click on the `Start Experiment` button. We'll then be prompted to confirm we want to start the experiment, so enter `start` in the field and click the `Start Experiment` button. + +This will open the experiment page, where we can see the progress of the experiment. 
If you open the [EC2 instances page](https://eu-west-2.console.aws.amazon.com/ec2/home?region=eu-west-2#Instances) in a new tab, you should be able to find the instance, and after a short while see that it's stopped. + +If you try to open the URL for the instance, you should see that it's no longer available. So our experiment has demonstrated that we don't have a resilient infrastructure. + +If you remember, we set the experiment to restart the instance after 2 minutes. After this time, refresh the EC2 instances page, check the public IP assigned to the instance, and access the URL via that IP; you should see that the instance is running again. + +If we open the `experiments` section in the AWS console, we can see our experiment listed with a state of `Completed`. If we click on the experiment id, we can re-open the experiment and review the details. + +Because we've created an experiment template, we can now run this experiment whenever we want to test the resilience of our infrastructure. + +## Cleaning up +Once you've completed the experiment, you can clean up the resources by running the command `terraform destroy --auto-approve --var-file ../common/common.tfvars`. This will remove the resources we've created in this step. + +
+
+ +--- +Now, please proceed to [step 2 README](../step02/README.md), or +Back to the main [README](../../README.md) file \ No newline at end of file diff --git a/steps/step01/data.tf b/steps/step01/data.tf new file mode 100644 index 0000000..0072d75 --- /dev/null +++ b/steps/step01/data.tf @@ -0,0 +1,28 @@ +data "terraform_remote_state" "base_config" { + backend = "local" + + config = { + path = "../../statefiles/base_config.tfstate" + } +} + +# identify availability zones in region +data "aws_availability_zones" "available_azs" { + state = "available" +} + +data "aws_ami" "amazon_linux_2" { + most_recent = true + + filter { + name = "name" + values = ["amzn2-ami-hvm-*"] + } + + filter { + name = "virtualization-type" + values = ["hvm"] + } + + owners = ["amazon"] +} diff --git a/steps/step01/ec2.tf b/steps/step01/ec2.tf new file mode 100644 index 0000000..1412dab --- /dev/null +++ b/steps/step01/ec2.tf @@ -0,0 +1,58 @@ +resource "aws_instance" "simple_ec2" { + ami = data.aws_ami.amazon_linux_2.id + instance_type = "t3.nano" + iam_instance_profile = aws_iam_instance_profile.ec2_instance_profile.name + + subnet_id = data.terraform_remote_state.base_config.outputs.public_subnet_ids[0] + vpc_security_group_ids = [data.terraform_remote_state.base_config.outputs.vpc_security_group_id] + associate_public_ip_address = true + + user_data = local.cloud_config + + metadata_options { + instance_metadata_tags = "enabled" + } + + tags = { + Name = "simple-${var.panda_name}" + Panda = var.panda_name + Experiment = "step01" + az = data.aws_availability_zones.available_azs.names[0] + Project = "${local.resource_suffix}" + } +} + +locals { + cloud_config = <<-END + #cloud-config + ${jsonencode({ + write_files = [ + { + path = "/run/myserver/template.html" + permissions = "0644" + owner = "root:root" + encoding = "b64" + content = filebase64("${path.module}/ec2_files/template.html") + }, + { + path = "/run/myserver/panda.png" + permissions = "0644" + owner = "root:root" + encoding = 
"b64" + content = filebase64("${path.module}/../../images/halloween_panda.png") + } + ], + runcmd = [ + "yum update -y", + "yum install -y httpd", + "systemctl start httpd", + "systemctl enable httpd", + "export ec2_id=$(curl -s http://169.254.169.254/latest/meta-data/instance-id)", + "export ec2_name=$(curl -s http://169.254.169.254/latest/meta-data/tags/instance/Name)", + "export ec2_az=$(curl -s http://169.254.169.254/latest/meta-data/tags/instance/az)", + "envsubst < /run/myserver/template.html > /var/www/html/index.html", + "mv /run/myserver/panda.png /var/www/html/panda.png" + ], +})} + END +} \ No newline at end of file diff --git a/steps/step01/ec2_files/template.html b/steps/step01/ec2_files/template.html new file mode 100644 index 0000000..6a4002a --- /dev/null +++ b/steps/step01/ec2_files/template.html @@ -0,0 +1,13 @@ + + + + + + Hello World! + + + Panda Logo +

Server ${ec2_name} (${ec2_id})

+

I'm running in Availability Zone ${ec2_az}

+ + \ No newline at end of file diff --git a/steps/step01/iam.tf b/steps/step01/iam.tf new file mode 100644 index 0000000..f6ea206 --- /dev/null +++ b/steps/step01/iam.tf @@ -0,0 +1,27 @@ +# create an iam role to act as ec2 instance profile +resource "aws_iam_role" "ec2_instance_role" { + name = "${local.experiment}_${local.resource_suffix}_iam_role" + assume_role_policy = jsonencode({ + Version = "2012-10-17", + Statement = [ + { + Effect = "Allow", + Principal = { + Service = "ec2.amazonaws.com" + }, + Action = "sts:AssumeRole" + } + ] + }) +} + +resource "aws_iam_policy_attachment" "ec2_instance_policy_attachment" { + name = "${local.experiment}ec2_instance_policy_attachment_${local.resource_suffix}" + roles = [aws_iam_role.ec2_instance_role.name] + policy_arn = "arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore" +} + +resource "aws_iam_instance_profile" "ec2_instance_profile" { + name = "${local.experiment}_instance_profile_${local.resource_suffix}" + role = aws_iam_role.ec2_instance_role.name +} diff --git a/steps/step01/main.tf b/steps/step01/main.tf new file mode 100644 index 0000000..5b6b336 --- /dev/null +++ b/steps/step01/main.tf @@ -0,0 +1,24 @@ +terraform { + required_providers { + aws = { + source = "hashicorp/aws" + version = ">=5.70.0" + } + } + + backend "local" { + path = "../../statefiles/step01_config.tfstate" + } +} + +provider "aws" { + region = var.region + + default_tags { + tags = { + Project = local.playground_name + Panda = var.panda_name + Experiment = local.experiment + } + } +} diff --git a/steps/step01/outputs.tf b/steps/step01/outputs.tf new file mode 100644 index 0000000..072bab6 --- /dev/null +++ b/steps/step01/outputs.tf @@ -0,0 +1,10 @@ +output "ec2_details" { + description = "Details of the EC2 instance." 
+ value = { + instance_id = aws_instance.simple_ec2.id + instance_name = lookup(aws_instance.simple_ec2.tags, "Name", "default") + instance_ip_url = "http://${aws_instance.simple_ec2.public_ip}" + instance_subnet = aws_instance.simple_ec2.subnet_id + instance_az = aws_instance.simple_ec2.availability_zone + } +} diff --git a/steps/step01/variables.tf b/steps/step01/variables.tf new file mode 100644 index 0000000..6906552 --- /dev/null +++ b/steps/step01/variables.tf @@ -0,0 +1,23 @@ +variable "panda_name" { + description = "My Panda Name" + type = string +} + + +variable "region" { + description = "AWS Region for deployment." + type = string + default = "eu-west-2" +} + + + +locals { + playground_name = "devops_playground_oct2024" + panda_name_list = ["funky", "sad"] + panda_number = index(local.panda_name_list, var.panda_name) + + resource_suffix = "${var.panda_name}_${local.playground_name}" + + experiment = "step01" +} \ No newline at end of file diff --git a/steps/step02/README.md b/steps/step02/README.md new file mode 100644 index 0000000..16d8802 --- /dev/null +++ b/steps/step02/README.md @@ -0,0 +1,81 @@ +# Step 02 - improving resilience. + +In [step01](../step01/README.md), we saw that a single server has very little resilience and if it fails, +we will lose access to whatever services are running on it. + +Often, people will jump to a multi-server setup to improve resilience, but sometime we might have cost challenges. +If we can live with short outages, a simple solution can be to place that single server in an auto-scaling group with +a minimum and maximum of 1 instances. This means that if the server dies, health-checks will fail and a +new server with the same configuration will be started. + +Whilst not completely necessary, at this point, it's worth adding a load balancer, so that we don't need to query the server to get the IP address everytime. 
+ +With that in mind, our new architecture will look like this: +![Step 02 infrastructure](../../images/step02a.png) + +## Creating the infrastructure. +This step will be similar to the previous one, but we need to make sure we're in the `step02` directory before running the command `terraform init`, and then +`terraform apply --auto-approve --var-file ../common/common.tfvars`. This will show the output as the infrastructure is created, completing with output looking +something like: + +``` +Apply complete! Resources: 6 added, 0 changed, 0 destroyed. + +Outputs: + +elb_url = "http://step02-funky-elb-746265935.eu-west-2.elb.amazonaws.com" +``` + +If you click on the url, you'll see a page similar to the first step. Make a note of the instance id, shown in the output, as we'll need this in the next step. + +## Creating a new template +Let's revisit the FIS Experiment templates in the AWS Console (either search for FIS, or use the link https://eu-west-2.console.aws.amazon.com/fis/home?region=eu-west-2#ExperimentTemplates). + +However, this time rather than using the instance id which isn't a great practice as it can change with autoscaling groups, let's look at a different option. + +When we create resources, we should really always be using tags to make it easier to identify resources - this might be for cost reporting purposes, or we might have scripts we use +to manage resources. In this case, we've setup the launch template for the auto-scaling group to add tags similar to below: + +![Instance tags](../../images/step02_tags.png) + +So this time, rather than specifying the instance id, we can specify the tag. Let's create a new template to stop the instance. + +Click on `Create an experiment template` and repeat the steps from the previous exercise as below: + +1. Firstly, provide a description for the template, such as `Stop step02 EC2 instance`, and add a name, something including your panda name at the end to make it easier to identify. +1. 
In the `Actions` section, click on the `Add action` button. This will open a new section where we can define the action. + 1. In the name field, enter `stop-ec2`. + 1. Select `EC2` in the action type field, and then select `aws:ec2:stop-instances` in the action field. However, this time don't select the option to restart instances as the autoscaling + group should take care of this for us. + 1. Click on the `Save` button. +1. You'll see that in the `Targets` option it's automatically added a `Instances-Target` target. The target will depend upon the action chosen. +1. Click on `aws:ec2:instance` to specify which instance we want to stop + 1. Click on the `Resource Tags, filters and parameters` option. + 1. We want to make sure we're only targeting our own instances from this experiment, so click on the `Add new tag` button, and enter `Panda` in the Key field and your panda name in the Value field. Then click on `Add new tag`, enter `Experiment` in the Key field and `step02` in the Value field. + 1. Ensure that the field `Selection mode` contains the value `All`. + 1. Click on the `Save` button. +1. Again, we'll use the role we created at the start, so click on `Use an existing IAM role` and select the role that starts with your panda name. +1. Click on the `Create an experiment template` button, and in the warning field, enter `create` and then click the `Create an experiment template` button. + +Before we run the experiment, let's look at the options we have available to us. If we look at the bottom of our experiment, we have a number of tabs that will tell us about the experiment. +1. **Actions** - this tells us the actions that will be taken in the experiment. +1. **Targets** - this tells us how we'll choose the resources to be affected by the experiment. +1. **Export** - generates a CLI command that we could use to generate the command via script. +1. **Tags** - lists any tags that will be applied for the experiment run. +1. 
**Timeline** - this gives an estimate of how long the experiment will take to run. +1. **Schedules** - this lets us set or list when the experiment will run, if we don't want it to happen on request. + +Now we have our new template, let's run the experiment. Click on `Start experiment`, and the same on the next page before entering `start` in the warning field and clicking `Start experiment`. + +If we review the instance page in the EC2 console, we'll see that the instance is stopping. If we refresh the load balancer page, we'll see that the page is no longer available. After a few +minutes, the health checks on the auto-scaling group will fail, and a new instance will be started. If we refresh the load balancer page again, we'll see that the page is available again. + +## Cleaning up +Once you've completed the experiment, you can clean up the resources by running the command `terraform destroy --auto-approve --var-file ../common/common.tfvars`. This will remove the resources we've created in this step. + +
+
+ +--- +Now, please proceed to [step 3 README](../step03/README.md), or +Back to the main [README](../../README.md) file \ No newline at end of file diff --git a/steps/step02/asg.tf b/steps/step02/asg.tf new file mode 100644 index 0000000..cc32281 --- /dev/null +++ b/steps/step02/asg.tf @@ -0,0 +1,87 @@ +resource "aws_launch_template" "this" { + name = "${local.experiment}_${local.resource_suffix}_lt" + image_id = data.aws_ami.amazon_linux_2.id + instance_type = "t3.nano" + + iam_instance_profile { + name = aws_iam_instance_profile.ec2_instance_profile.name + } + + user_data = base64encode(local.cloud_config) + + metadata_options { + instance_metadata_tags = "enabled" + } + + # vpc_security_group_ids = [data.terraform_remote_state.base_config.outputs.vpc_security_group_id] + + network_interfaces { + associate_public_ip_address = true + security_groups = [data.terraform_remote_state.base_config.outputs.vpc_security_group_id] + } + + tag_specifications { + resource_type = "instance" + tags = { + Name = "${var.panda_name}-${local.experiment}" + Project = "${local.resource_suffix}" + Panda = var.panda_name + Experiment = local.experiment + } + } +} + +resource "aws_autoscaling_group" "this" { + name = "${local.experiment}_${local.resource_suffix}_asg" + + launch_template { + id = aws_launch_template.this.id + version = "$Latest" + } + + vpc_zone_identifier = [data.terraform_remote_state.base_config.outputs.public_subnet_ids[0]] + + min_size = 1 + max_size = 1 + desired_capacity = 1 + + load_balancers = [aws_elb.this.id] + + health_check_type = "ELB" +} + +locals { + cloud_config = <<-END + #cloud-config + ${jsonencode({ + write_files = [ + { + path = "/run/myserver/template.html" + permissions = "0644" + owner = "root:root" + encoding = "b64" + content = filebase64("${path.module}/ec2_files/template.html") + }, + { + path = "/run/myserver/panda.png" + permissions = "0644" + owner = "root:root" + encoding = "b64" + content = 
filebase64("${path.module}/../../images/halloween_panda.png") + }, + ], + runcmd = [ + "yum update -y", + "yum install -y httpd", + "systemctl start httpd", + "systemctl enable httpd", + "export ec2_id=$(curl -s http://169.254.169.254/latest/meta-data/instance-id)", + "export ec2_name=$(curl -s http://169.254.169.254/latest/meta-data/tags/instance/Name)", + "export ec2_az=$(curl -s http://169.254.169.254/latest/meta-data/placement/availability-zone/)", + "envsubst < /run/myserver/template.html > /var/www/html/index.html", + "mv /run/myserver/panda.png /var/www/html/panda.png" + ], +})} + END +} + diff --git a/steps/step02/data.tf b/steps/step02/data.tf new file mode 100644 index 0000000..5ce91ac --- /dev/null +++ b/steps/step02/data.tf @@ -0,0 +1,29 @@ +data "terraform_remote_state" "base_config" { + backend = "local" + + config = { + path = "../../statefiles/base_config.tfstate" + } +} + +# identify availability zones in region +data "aws_availability_zones" "available_azs" { + state = "available" +} + +data "aws_ami" "amazon_linux_2" { + most_recent = true + + filter { + name = "name" + values = ["amzn2-ami-hvm-*"] + } + + filter { + name = "virtualization-type" + values = ["hvm"] + } + + owners = ["amazon"] +} + diff --git a/steps/step02/ec2_files/template.html b/steps/step02/ec2_files/template.html new file mode 100644 index 0000000..6a4002a --- /dev/null +++ b/steps/step02/ec2_files/template.html @@ -0,0 +1,13 @@ + + + + + + Hello World! + + + Panda Logo +

Server ${ec2_name} (${ec2_id})

+

I'm running in Availability Zone ${ec2_az}

+ + \ No newline at end of file diff --git a/steps/step02/elb.tf b/steps/step02/elb.tf new file mode 100644 index 0000000..4d9f6d4 --- /dev/null +++ b/steps/step02/elb.tf @@ -0,0 +1,25 @@ +resource "aws_elb" "this" { + name = replace("${local.experiment}-${var.panda_name}-elb", "_", "-") + subnets = data.terraform_remote_state.base_config.outputs.public_subnet_ids + + security_groups = [data.terraform_remote_state.base_config.outputs.vpc_security_group_id] + + listener { + instance_port = 80 + instance_protocol = "HTTP" + lb_port = 80 + lb_protocol = "HTTP" + } + + health_check { + target = "HTTP:80/" + interval = 30 + timeout = 5 + healthy_threshold = 2 + unhealthy_threshold = 2 + } + + tags = { + Name = "${local.experiment}_${local.resource_suffix}_elb" + } +} diff --git a/steps/step02/iam.tf b/steps/step02/iam.tf new file mode 100644 index 0000000..f6ea206 --- /dev/null +++ b/steps/step02/iam.tf @@ -0,0 +1,28 @@ +# create an iam role to act as ec2 instance profile +resource "aws_iam_role" "ec2_instance_role" { + name = "${local.experiment}_${local.resource_suffix}_iam_role" + assume_role_policy = jsonencode({ + Version = "2012-10-17", + Statement = [ + { + Effect = "Allow", + Principal = { + Service = "ec2.amazonaws.com" + }, + Action = "sts:AssumeRole" + } + ] + }) +} + +# attach the SSM managed policy to this role only; aws_iam_policy_attachment is exclusive and +# would detach the policy from every other role, user and group in the account +resource "aws_iam_role_policy_attachment" "ec2_instance_policy_attachment" { + role = aws_iam_role.ec2_instance_role.name + policy_arn = "arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore" +} + +resource "aws_iam_instance_profile" "ec2_instance_profile" { + name = "${local.experiment}_instance_profile_${local.resource_suffix}" + role = aws_iam_role.ec2_instance_role.name +} diff --git a/steps/step02/main.tf b/steps/step02/main.tf new file mode 100644 index 0000000..f6b503f --- /dev/null +++ b/steps/step02/main.tf @@ -0,0 +1,24 @@ +terraform { + required_providers { + aws = { + source = "hashicorp/aws" + 
version = ">=5.70.0" + } + } + + backend "local" { + path = "../../statefiles/step02_config.tfstate" + } +} + +provider "aws" { + region = var.region + + default_tags { + tags = { + Project = local.playground_name + Panda = var.panda_name + Experiment = local.experiment # same key casing as the instance tags in asg.tf + } + } +} diff --git a/steps/step02/outputs.tf b/steps/step02/outputs.tf new file mode 100644 index 0000000..af74d7e --- /dev/null +++ b/steps/step02/outputs.tf @@ -0,0 +1,4 @@ +output "elb_url" { + description = "URL of the ELB." + value = "http://${aws_elb.this.dns_name}" +} diff --git a/steps/step02/variables.tf b/steps/step02/variables.tf new file mode 100644 index 0000000..8ea22b3 --- /dev/null +++ b/steps/step02/variables.tf @@ -0,0 +1,26 @@ + +variable "panda_name" { + description = "My Panda Name" + type = string +} + + +variable "region" { + description = "AWS Region for deployment." + type = string + default = "eu-west-2" +} + + + +locals { + playground_name = "devops_playground_oct2024" + panda_name_list = ["funky", "sad"] + panda_number = index(local.panda_name_list, var.panda_name) + + resource_suffix = "${var.panda_name}_${local.playground_name}" + + domain_name = "devopsplayground.org" + + experiment = "step02" +} \ No newline at end of file diff --git a/steps/step03/README.md b/steps/step03/README.md new file mode 100644 index 0000000..f725e3f --- /dev/null +++ b/steps/step03/README.md @@ -0,0 +1,72 @@ +# Step 03 - Ensuring our service survives. + +In the previous experiment, we improved our resilience, cutting down the time it takes to recover from a failure. However, we can still improve this further by ensuring that our service can survive even +if some of the servers die or are stopped. + +## Building our infrastructure. +This time rather than having a single server in the autoscaling group, we're going to have multiple servers spread across multiple availability zones. This means that if an availability zone fails, we still have servers running in the other availability zones. 
+ +Our new architecture will look like this: + +![Step 03 infrastructure](../../images/step03.png) + +To deploy our infrastructure, make sure we're in the `step03` directory before running the command `terraform init`, and then build our service with the command `terraform apply --auto-approve --var-file ../common/common.tfvars`. This will show the output as the infrastructure is created, completing with output looking something like: + +``` +Apply complete! Resources: 7 added, 0 changed, 0 destroyed. + +Outputs: + +elb_url = "http://step03-funky-elb-2009186052.eu-west-2.elb.amazonaws.com" +``` + +As always, let's try opening that URL and make a note of the instance id. This time, if you refresh the page, you should see the instance changing (it might take a couple of refreshes). + +## Creating an experiment +This time, we're going to add a couple of new steps to the experiment. + +We're going to start similarly to the previous experiment, by creating a new template. Let's revisit the FIS Experiment templates in the AWS Console (either search for FIS, or use the link https://eu-west-2.console.aws.amazon.com/fis/home?region=eu-west-2#ExperimentTemplates) and then choose `Create an experiment template`. + +Let's go with these steps: +1. Specify that we're going to create an experiment targeting just our account. +1. Provide a description for the template, such as `Stop step03 EC2 instances`, and add a name, something including your panda name at the end to make it easier to identify. +1. Create an action; provide a name, and then choose to either stop or terminate the EC2 instances. Click `save`. +1. Click `Add action` again. Add a new name, then for the action, choose `aws:fis:wait`. Next specify that we'll start the action after our first one by choosing the name of the first step from the `Start after` dropdown. Set the duration to `3 minutes` and click `save`. +1. 
Finally, click `Add action` again, provide a new name, and again choose to either stop or terminate the EC2 instances, and set this to start after the wait action. Make sure the targets are set to the same as the first action. Click `save`. +1. Select the target; we're going to filter on our instance tags again, so choose `Resource tags, filters and parameters`, and add the tags `Panda` and `Experiment` again, this time using `step03` as the `Experiment` value. For the first run of the experiment, we're going to ask it to kill 2 of the 3 available servers, so change `Selection mode` to `count` and set the value to `2`. Click `save`. +1. Use the role we created at the start, so click on `Use an existing IAM role` and select the role that starts with your panda name. +1. This time, we're going to set a condition that will stop the experiment. We've already created an alarm that starts with the panda name that will alert if more than a certain number of errors are generated by the load balancer. So click on the dropdown next to `Stop conditions` and select the alarm that starts with your panda name. +1. Click on the `Create an experiment template` button, and in the warning field, enter `create` and then click the `Create an experiment template` button. + +So this time, we generated an experiment we can use to check that if we lose a couple of servers, our service will still be available. + +This time, for our alarm to generate an alert, we need to create traffic going through the load balancer. We'll need to run the following script on a server: +```bash +while : +do + http_code=$(curl -s -o /dev/null -w "%{http_code}" _load_balancer_url_) + if [ "$http_code" -eq 200 ]; then + printf "." + else + printf "X" + fi + sleep 0.5 +done +``` +Replace *_load_balancer_url_* with the URL of the load balancer (the `elb_url` output, including the `http://` prefix), and run the script in a terminal. 
+ +Let's run the experiment by clicking on `Start experiment`, and the same on the next page before entering `start` in the warning field and clicking `Start experiment`. + +This is going to test that we can survive two-thirds of the servers in our test service failing, and the experiment should complete in about 3 minutes. + +## Re-running our experiment +But how do we know whether the experiment is actually valid? Let's repeat it, but this time we'll stop all servers. + +Firstly, check the `instances` in the AWS console and make sure that all 3 of our instances are up and running as we'd expect. We can filter on our panda name to check +just our servers. + +Once we're happy we have all of the servers, let's head back to the `Experiment Templates` in FIS. Rather than creating a new experiment, let's modify the one we just ran. Find the template in the list, and click on the template ID. In the `Actions` dropdown at the top of the page, select `Update`. + +Click on the `Targets` section, and change the `Selection mode` to `All` or update the `Number of resources` to 3. Click `Save`. Scroll to the bottom of the template and click `Update an experiment template`. Once the template details are shown, click on `Start experiment` in the top right corner, and confirm you want to `Start`. + +If you've stopped the script we were running in the terminal, restart it, then in the AWS Console, restart the experiment. After 2 or 3 minutes, the alarm should trigger and stop the experiment. 
diff --git a/steps/step03/alarm.tf b/steps/step03/alarm.tf new file mode 100644 index 0000000..f0778c7 --- /dev/null +++ b/steps/step03/alarm.tf @@ -0,0 +1,19 @@ +# alarm on HTTP 5XX responses generated by the load balancer itself; the step03 README selects it as the FIS stop condition +# NOTE(review): AWS/ELB metrics are published at 1-minute intervals - confirm the 10 second period evaluates as intended +resource "aws_cloudwatch_metric_alarm" "lb_alarm" { + alarm_name = "${var.panda_name}-elb-alarm" + comparison_operator = "GreaterThanOrEqualToThreshold" + evaluation_periods = 4 + metric_name = "HTTPCode_ELB_5XX" + namespace = "AWS/ELB" + period = 10 + statistic = "Sum" + threshold = 10 + alarm_description = "This alarm monitors the number of HTTP 5XX errors generated by the load balancer." + datapoints_to_alarm = 2 + treat_missing_data = "notBreaching" + + dimensions = { + LoadBalancerName = aws_elb.this.name + } +} \ No newline at end of file diff --git a/steps/step03/asg.tf b/steps/step03/asg.tf new file mode 100644 index 0000000..2e1c8e2 --- /dev/null +++ b/steps/step03/asg.tf @@ -0,0 +1,85 @@ +resource "aws_launch_template" "this" { + name = "${local.experiment}_${local.resource_suffix}_lt" + image_id = data.aws_ami.amazon_linux_2.id + instance_type = "t3.nano" + + iam_instance_profile { + name = aws_iam_instance_profile.ec2_instance_profile.name + } + + user_data = base64encode(local.cloud_config) + + metadata_options { + instance_metadata_tags = "enabled" + } + + network_interfaces { + associate_public_ip_address = true + security_groups = [data.terraform_remote_state.base_config.outputs.vpc_security_group_id] + } + + tag_specifications { + resource_type = "instance" + tags = { + Name = "${var.panda_name}-${local.experiment}" + Project = local.resource_suffix + Panda = var.panda_name + Experiment = local.experiment + } + } +} + +resource "aws_autoscaling_group" "this" { + name = "${local.experiment}_${local.resource_suffix}_asg" + + launch_template { + id = aws_launch_template.this.id + version = "$Latest" + } + + vpc_zone_identifier = data.terraform_remote_state.base_config.outputs.public_subnet_ids + + min_size = 3 + max_size = 9 + desired_capacity = 3 + + load_balancers = [aws_elb.this.id] + + 
health_check_type = "ELB" +} + +locals { + cloud_config = <<-END + #cloud-config + ${jsonencode({ + write_files = [ + { + path = "/run/myserver/template.html" + permissions = "0644" + owner = "root:root" + encoding = "b64" + content = filebase64("${path.module}/ec2_files/template.html") + }, + { + path = "/run/myserver/panda.png" + permissions = "0644" + owner = "root:root" + encoding = "b64" + content = filebase64("${path.module}/../../images/halloween_panda.png") + }, + ], + runcmd = [ + "yum update -y", + "yum install -y httpd", + "systemctl start httpd", + "systemctl enable httpd", + "export ec2_id=$(curl -s http://169.254.169.254/latest/meta-data/instance-id)", + "export ec2_name=$(curl -s http://169.254.169.254/latest/meta-data/tags/instance/Name)", + "export ec2_az=$(curl -s http://169.254.169.254/latest/meta-data/placement/availability-zone/)", + "envsubst < /run/myserver/template.html > /var/www/html/index.html", + "mv /run/myserver/panda.png /var/www/html/panda.png" + ], +})} + END +} + diff --git a/steps/step03/data.tf b/steps/step03/data.tf new file mode 100644 index 0000000..5ce91ac --- /dev/null +++ b/steps/step03/data.tf @@ -0,0 +1,30 @@ +data "terraform_remote_state" "base_config" { + backend = "local" + + config = { + path = "../../statefiles/base_config.tfstate" + } +} + +# identify availability zones in region +data "aws_availability_zones" "available_azs" { + state = "available" +} + +# most recent Amazon Linux 2 HVM image; the x86_64 name pattern keeps arm64 builds from being picked for the t3.nano launch template +data "aws_ami" "amazon_linux_2" { + most_recent = true + + filter { + name = "name" + values = ["amzn2-ami-hvm-*-x86_64-gp2"] + } + + filter { + name = "virtualization-type" + values = ["hvm"] + } + + owners = ["amazon"] +} + diff --git a/steps/step03/ec2_files/template.html b/steps/step03/ec2_files/template.html new file mode 100644 index 0000000..6a4002a --- /dev/null +++ b/steps/step03/ec2_files/template.html @@ -0,0 +1,13 @@ + + + + + + Hello World! + + + Panda Logo +

Server ${ec2_name} (${ec2_id})

+

I'm running in Availability Zone ${ec2_az}

+ + \ No newline at end of file diff --git a/steps/step03/elb.tf b/steps/step03/elb.tf new file mode 100644 index 0000000..7bbb2ab --- /dev/null +++ b/steps/step03/elb.tf @@ -0,0 +1,25 @@ +resource "aws_elb" "this" { + name = replace("${local.experiment}-${var.panda_name}-elb", "_", "-") + subnets = data.terraform_remote_state.base_config.outputs.public_subnet_ids + + security_groups = [data.terraform_remote_state.base_config.outputs.vpc_security_group_id] + + listener { + instance_port = 80 + instance_protocol = "HTTP" + lb_port = 80 + lb_protocol = "HTTP" + } + + health_check { + target = "HTTP:80/" + interval = 5 + timeout = 2 + healthy_threshold = 2 + unhealthy_threshold = 2 + } + + tags = { + Name = "${local.experiment}_${local.resource_suffix}_elb" + } +} diff --git a/steps/step03/iam.tf b/steps/step03/iam.tf new file mode 100644 index 0000000..f6ea206 --- /dev/null +++ b/steps/step03/iam.tf @@ -0,0 +1,28 @@ +# create an iam role to act as ec2 instance profile +resource "aws_iam_role" "ec2_instance_role" { + name = "${local.experiment}_${local.resource_suffix}_iam_role" + assume_role_policy = jsonencode({ + Version = "2012-10-17", + Statement = [ + { + Effect = "Allow", + Principal = { + Service = "ec2.amazonaws.com" + }, + Action = "sts:AssumeRole" + } + ] + }) +} + +# attach the SSM managed policy to this role only; aws_iam_policy_attachment is exclusive and +# would detach the policy from every other role, user and group in the account +resource "aws_iam_role_policy_attachment" "ec2_instance_policy_attachment" { + role = aws_iam_role.ec2_instance_role.name + policy_arn = "arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore" +} + +resource "aws_iam_instance_profile" "ec2_instance_profile" { + name = "${local.experiment}_instance_profile_${local.resource_suffix}" + role = aws_iam_role.ec2_instance_role.name +} diff --git a/steps/step03/main.tf b/steps/step03/main.tf new file mode 100644 index 0000000..f6b503f --- /dev/null +++ b/steps/step03/main.tf @@ -0,0 +1,24 @@ +terraform { + required_providers { + aws = { + source = "hashicorp/aws" + 
version = ">=5.70.0" + } + } + + backend "local" { + path = "../../statefiles/step03_config.tfstate" # each step keeps its own state file so it cannot clobber step02's state + } +} + +provider "aws" { + region = var.region + + default_tags { + tags = { + Project = local.playground_name + Panda = var.panda_name + Experiment = local.experiment # same key casing as the instance tags in asg.tf + } + } +} diff --git a/steps/step03/outputs.tf b/steps/step03/outputs.tf new file mode 100644 index 0000000..af74d7e --- /dev/null +++ b/steps/step03/outputs.tf @@ -0,0 +1,4 @@ +output "elb_url" { + description = "URL of the ELB." + value = "http://${aws_elb.this.dns_name}" +} diff --git a/steps/step03/variables.tf b/steps/step03/variables.tf new file mode 100644 index 0000000..4e8be12 --- /dev/null +++ b/steps/step03/variables.tf @@ -0,0 +1,26 @@ + +variable "panda_name" { + description = "My Panda Name" + type = string +} + + +variable "region" { + description = "AWS Region for deployment." + type = string + default = "eu-west-2" +} + + + +locals { + playground_name = "devops_playground_oct2024" + panda_name_list = ["funky", "sad"] + panda_number = index(local.panda_name_list, var.panda_name) + + resource_suffix = "${var.panda_name}_${local.playground_name}" + + domain_name = "devopsplayground.org" + + experiment = "step03" +} \ No newline at end of file