Merge branch 'simonhanmer:dev' into dev

DevOpsPlayground · Oct 29, 2024 · e5b4ed1 · e5b4ed1
2 parents 2e33313 + 9422527
commit e5b4ed1
Show file tree

Hide file tree

Showing 43 changed files with 1,009 additions and 113 deletions.
diff --git a/images/apache_splash.png b/images/apache_splash.png
diff --git a/images/halloween_panda.png b/images/halloween_panda.png
diff --git a/images/init.png b/images/init.png
diff --git a/images/step01.png b/images/step01.png
diff --git a/images/step01_templates.png b/images/step01_templates.png
diff --git a/images/step01_web.png b/images/step01_web.png
diff --git a/images/step02_tags.png b/images/step02_tags.png
diff --git a/images/step02a.png b/images/step02a.png
diff --git a/images/step03.png b/images/step03.png
diff --git a/steps/init/README.md b/steps/init/README.md
@@ -31,7 +31,6 @@ Most of the terraform files in this workshop use the same variables, so we'll cr
 
 ```hcl
 panda_name = "<some_panda_name>"
-asset_bucket = "<asset_bucket_name>"
 ```
 
 If you're running this as part of the live session, replace `<some_panda_name>` with the name we'll provide you with and `<asset_bucket>` with `630895193694-eu-west-2-oct-assets`, otherwise feel free to use your own details, or similar.
@@ -49,38 +48,14 @@ If the command runs ok, we can then apply the changes with `terraform apply --au
 Once the command completes, you should see something like (although the actual values may  be different):
 
 ```text
-Apply complete! Resources: 14 added, 0 changed, 0 destroyed.
+Apply complete! Resources: 20 added, 0 changed, 0 destroyed.
 
 Outputs:
 
-public_subnet_details = {
-  "subnet-02b9d202802d4daa5" = {
-    "availability_zone" = "eu-west-2a"
-    "cidr_block" = "10.100.0.0/24"
-  }
-  "subnet-04cfa858a2cc3c3e6" = {
-    "availability_zone" = "eu-west-2c"
-    "cidr_block" = "10.100.2.0/24"
-  }
-  "subnet-092dbf5c60743a722" = {
-    "availability_zone" = "eu-west-2b"
-    "cidr_block" = "10.100.1.0/24"
-  }
-}
-public_subnet_ids = [
-  "subnet-02b9d202802d4daa5",
-  "subnet-092dbf5c60743a722",
-  "subnet-04cfa858a2cc3c3e6",
-]
-subnet_cidr_ranges = [
-  "10.100.0.0/24",
-  "10.100.1.0/24",
-  "10.100.2.0/24",
-]
-vpc_cidr_ranges = "10.100.0.0/16"
-vpc_id = "vpc-026d7af20963b827b"
-vpc_security_group_id = "sg-0a4d936e73c776c74c"
+fis_iam_role = "funky_fis_iam_role"
+fis_log_group = "/aws/fis/funky-fis-logs"
 ```
+:information_source: Make a note of the output values as we'll need these later.
 
 ## Sharing the configuration with other steps
 To allow the following steps in the workshop to use configuration we've just created, we'll store the details in a Terraform statefile `statefiles/base_config.tfstate`.

diff --git a/steps/init/cloudwatch.tf b/steps/init/cloudwatch.tf
@@ -0,0 +1,4 @@
+resource "aws_cloudwatch_log_group" "log_group" {
+  name              = "/aws/fis/${var.panda_name}-fis-logs"
+  retention_in_days = 1
+}
diff --git a/steps/init/data.tf b/steps/init/data.tf
@@ -3,10 +3,4 @@ data "aws_availability_zones" "available_azs" {
   state = "available"
 }
 
-
-data "aws_caller_identity" "current" {
-}
-
-data "aws_s3_bucket" "playground_assets" {
-  bucket = "${local.account_id}-${var.region}-oct-assets"
-}
+data "aws_caller_identity" "current" {}
diff --git a/steps/init/iam.tf b/steps/init/iam.tf
@@ -0,0 +1,53 @@
+resource "aws_iam_role" "fis_role" {
+  name = "${var.panda_name}_fis_iam_role"
+  assume_role_policy = jsonencode({
+    Version = "2012-10-17",
+    Statement = [
+      {
+        Effect = "Allow",
+        Principal = {
+          Service = [
+            "fis.amazonaws.com",
+            "delivery.logs.amazonaws.com"
+          ]
+        },
+        Action = "sts:AssumeRole"
+      }
+    ]
+  })
+}
+
+# Attach the AWS-managed FIS EC2 policy to this role only; aws_iam_policy_attachment is exclusive and would detach the policy from every other role in the account.
+resource "aws_iam_role_policy_attachment" "ec2_fis_policy_attachment" {
+  role       = aws_iam_role.fis_role.name
+  policy_arn = "arn:aws:iam::aws:policy/service-role/AWSFaultInjectionSimulatorEC2Access"
+}
+
+resource "aws_iam_policy" "cloudwatch_logs_policy" {
+  name        = "${var.panda_name}_cloudwatch_logs_policy"
+  description = "Policy to allow CloudWatch log group and log stream access"
+  policy = jsonencode({
+    Version = "2012-10-17",
+    Statement = [
+      {
+        Effect = "Allow",
+        Action = [
+          "logs:DescribeLogGroups",
+          "logs:CreateLogDelivery",
+          "logs:CreateLogGroup",
+          "logs:CreateLogStream",
+          "logs:PutLogEvents"
+        ],
+        Resource = [
+          "*"
+        ]
+      }
+    ]
+  })
+}
+
+resource "aws_iam_policy_attachment" "cloudwatch_logs_policy_attachment" {
+  name       = "${var.panda_name}_cloudwatch_logs_policy_attachment"
+  roles      = [aws_iam_role.fis_role.name]
+  policy_arn = aws_iam_policy.cloudwatch_logs_policy.arn
+}
diff --git a/steps/init/outputs.tf b/steps/init/outputs.tf
@@ -1,40 +1,9 @@
-output "asset_bucket_arn" {
-  description = "ARN of the asset bucket."
-  value       = data.aws_s3_bucket.playground_assets.arn
+output "fis_iam_role" {
+  description = "IAM Role for FIS."
+  value       = aws_iam_role.fis_role.name
 }
 
-output "asset_bucket_name" {
-  description = "name of the asset bucket."
-  value       = data.aws_s3_bucket.playground_assets.id
+output "fis_log_group" {
+  description = "Log Group for FIS."
+  value       = aws_cloudwatch_log_group.log_group.name
 }
-
-output "public_subnet_details" {
-  description = "Details of the public subnets."
-  value       = module.vpc.public_subnet_details
-}
-
-output "public_subnet_ids" {
-  description = "IDs of the public subnets."
-  value       = module.vpc.public_subnet_ids
-}
-
-output "vpc_security_group_id" {
-  description = "ID of the VPC Security Group."
-  value       = module.vpc.security_group_id
-}
-
-output "subnet_cidr_ranges" {
-  description = "CIDR Range for Subnets."
-  value       = local.subnet_cidr_ranges
-}
-
-output "vpc_cidr_ranges" {
-  description = "CIDR Range for VPC."
-  value       = local.vpc_cidr_range
-}
-
-output "vpc_id" {
-  description = "ID of the VPC."
-  value       = module.vpc.vpc_id
-}
-
diff --git a/steps/init/variables.tf b/steps/init/variables.tf
@@ -1,9 +1,3 @@
-
-variable "asset_bucket" {
-  description = "Name of the asset bucket."
-  type        = string
-}
-
 variable "panda_name" {
   description = "My Panda Name"
   type        = string
@@ -28,5 +22,5 @@ locals {
   vpc_cidr_range     = "10.${local.panda_number + local.cidr_offset}.0.0/16"
   subnet_cidr_ranges = [for i in range(length(data.aws_availability_zones.available_azs.names)) : "10.${local.panda_number + local.cidr_offset}.${i}.0/24"]
 
-  account_id = data.aws_caller_identity.current.account_id
+  # account_id = data.aws_caller_identity.current.account_id
 }
diff --git a/steps/modules/vpc/sg.tf b/steps/modules/vpc/sg.tf
@@ -1,10 +1,19 @@
 resource "aws_security_group" "ec2_sg" {
-  name        = "step01_${var.resource_suffix}_sg"
+  name        = "${var.resource_suffix}_sg"
   description = "Allow inbound traffic on port 80"
   vpc_id      = aws_vpc.this.id
 }
 
 
+resource "aws_vpc_security_group_ingress_rule" "allow_http_sg" {
+  security_group_id = aws_security_group.ec2_sg.id
+
+  from_port   = 80
+  to_port     = 80
+  ip_protocol = "tcp"
+  referenced_security_group_id = aws_security_group.ec2_sg.id
+}
+
 resource "aws_vpc_security_group_ingress_rule" "allow_http" {
   security_group_id = aws_security_group.ec2_sg.id
 
@@ -23,11 +32,20 @@ resource "aws_vpc_security_group_ingress_rule" "allow_https" {
   cidr_ipv4   = "0.0.0.0/0"
 }
 
-resource "aws_vpc_security_group_egress_rule" "allow_https_connect" {
+resource "aws_vpc_security_group_egress_rule" "allow_https" {
   security_group_id = aws_security_group.ec2_sg.id
 
   from_port   = 443
   to_port     = 443
   ip_protocol = "tcp"
   cidr_ipv4   = "0.0.0.0/0"
+}
+
+resource "aws_vpc_security_group_egress_rule" "allow_http_sg" {
+  security_group_id = aws_security_group.ec2_sg.id
+
+  from_port   = 80
+  to_port     = 80
+  ip_protocol = "tcp"
+  referenced_security_group_id = aws_security_group.ec2_sg.id
 }
diff --git a/steps/step01/README.md b/steps/step01/README.md
@@ -1,44 +1,112 @@
-# Step 01 - Creating base infrastructure
+# Step 01 - First experiment with FIS, a simple single server setup
 
-For several steps in this workshop, we'll be using AWS. To get started, we need to create some base infrastructure.
+To get started, we'll need to deploy a simple server in AWS. We'll then use FIS to simulate a failure on this server.
 
-Firstly, we're going to create a Virtual Private Cloud (VPC) to isolate our resources, and to make it simpler when we come to test later.
+Please be aware that we're deliberately keeping this configuration simple to reduce costs and complexity. In a live environment, we'd use private subnets and NAT gateways.
 
+:warning: Before you start this step, make sure you've deployed the base infrastructure in the [init](../init/README.md) step.
 
 
-```mermaid
-architecture-beta
-    group api(logos:aws-lambda)[API]
+## Pre-requisites
+Before we go further with the step, we need to ensure that 
+1. We have some active AWS credentials,
+1. We have deployed the base infrastructure in the [init](../init/README.md) step.
+1. This deployment assumes there is a route53 hosted zone in place, pointing to the domain referenced in `variables.tf`. If you don't have a hosted zone/domain, ignore this and use the url with an IP, but be aware that the IP may change if the server is stopped and started again.
 
-    service db(logos:aws-aurora)[Database] in api
-    service disk1(logos:aws-glacier)[Storage] in api
-    service disk2(logos:aws-s3)[Storage] in api
-    service server(logos:aws-ec2)[Server] in api
+## Deploying an environment to test
+To deploy the environment, we need to run the Terraform code in the `step01` directory. 
 
-    db:L -- R:server
-    disk1:T -- B:server
-    disk2:T -- B:db
-```
+Firstly, we need to initialise our Terraform environment by running the command `terraform init`. This will download the required providers and setup an environment ready to use.
 
+Once initialised, it's always good practice to check our Terraform code, and we can do this with the command `terraform plan --var-file ../common/common.tfvars`. This will show us what Terraform is going to do.
 
-```mermaid
-architecture-beta
-    group api(cloud)[API]
+If the command runs ok, we can then apply the changes with `terraform apply --auto-approve --var-file ../common/common.tfvars`. This will create the infrastructure in the specified AWS account.
 
-    service db(internet)[Database] in api
-    service disk1(disk)[Storage] in api
-    service disk2(disk)[Storage] in api
-    service server(server)[Server] in api
+Once the command completes, you should see something like (although the actual values may  be different):
 
-    db:L -- R:server
-    disk1:T -- B:server
-    disk2:T -- B:db
-```
+```text
+Apply complete! Resources: 6 added, 0 changed, 0 destroyed.
+
+Outputs:
+
+ec2_details = {
+  "instance_az" = "eu-west-2a"
+  "instance_id" = "i-08ede8fe7eb507f58"
+  "instance_ip_url" = "http://18.170.32.169"
+  "instance_name" = "simple-funky"
+  "instance_named_url" = "http://simple-funky.devopsplayground.org"
+  "instance_subnet" = "subnet-0a1170859dd348b2b"
+}
 
-```mermaid
-flowchart TD
-    A[Client] -->|Request| B(API)
-    B --> C{Condition}
-    C -->|Success| D[Response]
-    C -->|Error| E[Error message]
 ```
+
+:information_source: Make a note of the `instance_id` in the output from your deployment as we'll need this shortly.
+
+## What have we deployed?
+For this first step, we've deployed a simple EC2 instance running a web server. The instance is in a public subnet, and has a security group allowing inbound traffic on port 80. 
+The EC2 instance is configured to update route53 with the public IP address of the instance on every reboot, so we can access it via a domain name (if you have one available).
+
+The infrastructure deployed is shown in the diagram below:
+
+![Simple server setup](../../images/step01.png)
+
+
+With the server deployed, we can now test it by visiting the URL in the `instance_ip_url` output. This should show a simple web page that looks something like:
+
+![Web page](../../images/step01_web.png)
+
+(:warning: **Note**: it may take a few minutes for the server to become fully available)
+
+
+## Creating our first experiment with FIS
+Now that we have our server deployed, we can create our first experiment with FIS. For this we're going to simulate a failure on the server by stopping the instance.
+
+Firstly, we need to create an experiment template within FIS. These templates describe the actions that we're going to carry out in the experiment. To do this,
+we need to access FIS in the AWS console. You can do this by navigating to the [FIS console](https://eu-west-2.console.aws.amazon.com/fis/home?region=eu-west-2#/home) and selecting the `Experiment templates` option from the left-hand menu, or by entering FIS in the AWS search bar, selecting FIS and then clicking on `Experiment templates`.
+
+Once in the `Experiment templates` page, click on the `Create experiment template` button. This will take you to the `Create experiment template` page. We're only going to 
+test against our own account, so ensure that the `Account` option is selected, and click on `Confirm`.
+
+Next we'll define our experiment template with the following steps:
+
+1. Firstly, provide a description for the template, such as `Stop step01 EC2 instance`, and add a name, something including your panda name at the end to make it easier to identify.
+1. In the `Actions` section, click on the `Add action` button. This will open a new section where we can define the action. 
+    1. In the name field, enter `stop-ec2`.
+    1. Select `EC2` in the action type field, and then select `aws:ec2:stop-instances` in the action field. Depending on the action chosen, we'll see different options to complete. In this case, we're going to restart the server after 2 minutes.
+    1. In the `Start instances after duration` field, enter 2 minutes. This will ensure that the instance is restarted after 2 minutes.
+    1. Click on the `Save` button.
+1. You'll see that an `Instances-Target` target has been added automatically in the `Targets` section. The target type depends upon the action chosen.
+1. Click on `aws:ec2:instance` to specify which instance we want to stop
+    1. In the `Resource IDs` field, either scroll or search in the dropdown using the instance id we noted earlier.
+    1. Ensure that the field `Selection mode` contains the value `All`.
+    1. Click on the `Save` button.
+1. Scroll down and ensure that the `Use an existing IAM role` option is ticked, and choose the IAM role created in the init step (:exclamation: The role name should start with the `panda_name` value).
+1. Click on the `Create an experiment template` button, and in the warning field, enter `create` and then click the `Create an experiment template` button.
+
+With this done, we should be able to see the experiment we just created in the `Experiment templates` page in the AWS console at https://eu-west-2.console.aws.amazon.com/fis/home?region=eu-west-2#ExperimentTemplates, looking something like:
+
+![Experiment template](../../images/step01_templates.png)
+
+## Running our first experiment
+We now have a template we can use to initiate an experiment. From the templates page in the console, 
+tick the box next to the template you just created and then click the `Start Experiment` button. On the page that follows, we don't need to change anything for now, so just click on the `Start Experiment` button again. We'll then be prompted to confirm we want to start the experiment, so enter `start` in the field and click the `Start Experiment` button.
+
+This will open the experiment page, where we can see the progress of the experiment. If you open the [EC2 instances page](https://eu-west-2.console.aws.amazon.com/ec2/home?region=eu-west-2#Instances) in a new tab, you should be able to find the instance, and after a short while see that it's stopped.
+
+If you try to open the URL for the instance, you should see that it's no longer available. So our experiment has demonstrated that we don't have a resilient infrastructure.
+
+If you remember, we set the experiment to restart the instance after 2 minutes. After this time, refresh the EC2 instances page and check the public IP assigned to the instance (it may have changed after the stop and start). If you access the URL via that IP, you should see that the instance is running again.
+
+If we open the `experiments` section in the AWS console, we can see our experiment listed with a state of `Completed`. If we click on the experiment id, we can re-open the experiment and review the details.
+
+Because we've created an experiment template, we can now run this experiment whenever we want to test the resilience of our infrastructure.
+
+## Cleaning up
+Once you've completed the experiment, you can clean up the resources by running the command `terraform destroy --auto-approve --var-file ../common/common.tfvars`. This will remove the resources we've created in this step.
+
+<br />
+<br />
+
+---
+Now, please proceed to the [step 2 README](../step02/README.md), or
+go back to the main [README](../../README.md) file.