Skip to content

Commit

Permalink
feat: Add cluster-autoscaler example, update Inferentia example
Browse files Browse the repository at this point in the history
  • Loading branch information
bryantbiggs committed Mar 8, 2024
1 parent 9041584 commit 77f3a10
Show file tree
Hide file tree
Showing 15 changed files with 678 additions and 227 deletions.
14 changes: 7 additions & 7 deletions .github/workflows/pre-commit.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ on:

env:
TERRAFORM_DOCS_VERSION: v0.16.0
TFLINT_VERSION: v0.44.1
TFLINT_VERSION: v0.50.3

jobs:
collectInputs:
Expand All @@ -22,7 +22,7 @@ jobs:

- name: Get root directories
id: dirs
uses: clowdhaus/terraform-composite-actions/directories@v1.8.3
uses: clowdhaus/terraform-composite-actions/directories@v1.9.0

preCommitMinVersions:
name: Min TF pre-commit
Expand All @@ -37,14 +37,14 @@ jobs:

- name: Terraform min/max versions
id: minMax
uses: clowdhaus/terraform-min-max@v1.2.7
uses: clowdhaus/terraform-min-max@v1.3.0
with:
directory: ${{ matrix.directory }}

- name: Pre-commit Terraform ${{ steps.minMax.outputs.minVersion }}
# Run only validate pre-commit check on min version supported
if: ${{ matrix.directory != '.' }}
uses: clowdhaus/terraform-composite-actions/pre-commit@v1.8.3
uses: clowdhaus/terraform-composite-actions/pre-commit@v1.9.0
with:
terraform-version: ${{ steps.minMax.outputs.minVersion }}
tflint-version: ${{ env.TFLINT_VERSION }}
Expand All @@ -53,7 +53,7 @@ jobs:
- name: Pre-commit Terraform ${{ steps.minMax.outputs.minVersion }}
# Run only validate pre-commit check on min version supported
if: ${{ matrix.directory == '.' }}
uses: clowdhaus/terraform-composite-actions/pre-commit@v1.8.3
uses: clowdhaus/terraform-composite-actions/pre-commit@v1.9.0
with:
terraform-version: ${{ steps.minMax.outputs.minVersion }}
tflint-version: ${{ env.TFLINT_VERSION }}
Expand All @@ -72,10 +72,10 @@ jobs:

- name: Terraform min/max versions
id: minMax
uses: clowdhaus/terraform-min-max@v1.2.7
uses: clowdhaus/terraform-min-max@v1.3.0

- name: Pre-commit Terraform ${{ steps.minMax.outputs.maxVersion }}
uses: clowdhaus/terraform-composite-actions/pre-commit@v1.8.3
uses: clowdhaus/terraform-composite-actions/pre-commit@v1.9.0
with:
terraform-version: ${{ steps.minMax.outputs.maxVersion }}
tflint-version: ${{ env.TFLINT_VERSION }}
Expand Down
49 changes: 49 additions & 0 deletions cluster-autoscaler/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
# EKS Cluster w/ Cluster Autoscaler

### Prerequisites:

Ensure that you have the following tools installed locally:

1. [aws cli](https://docs.aws.amazon.com/cli/latest/userguide/install-cliv2.html)
2. [kubectl](https://kubernetes.io/docs/tasks/tools/)
3. [terraform](https://learn.hashicorp.com/tutorials/terraform/install-cli)

### Deployment

1. From the `cluster-autoscaler` directory, provision the resources defined there using:

```sh
terraform init -upgrade=true
terraform apply
```

2. Once the cluster is up and running and the node group is provisioned, update your Terraform state to align with changes made by the AWS API. This doesn't modify any resources; it simply aligns your state file with the current state of the infrastructure. You can read more about this at the following links if interested:

- https://github.com/hashicorp/terraform/pull/28634
- https://github.com/hashicorp/terraform/issues/28803

```sh
terraform apply -refresh-only
terraform plan # should show `No changes. Your infrastructure matches the configuration.`
```

3. Update your kubeconfig to access the cluster:

```sh
aws eks --region us-east-1 update-kubeconfig --name cluster-autoscaler
```

4. Deploy the sample `inflate` deployment. This will cause cluster-autoscaler to scale up the node group to satisfy the pending pod requests:

```sh
kubectl apply -f inflate.yaml
```

### Tear Down & Clean-Up

1. Remove the resources created by Terraform, starting with the add-ons:

```sh
terraform destroy -target=module.eks_blueprints_addons
terraform destroy
```
22 changes: 22 additions & 0 deletions cluster-autoscaler/add-ons.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
################################################################################
# Addons
################################################################################

module "eks_blueprints_addons" {
  source  = "aws-ia/eks-blueprints-addons/aws"
  version = "~> 1.16"

  # The only add-on enabled in this example is cluster-autoscaler
  enable_cluster_autoscaler = true

  # Details of the cluster the add-ons are installed into
  cluster_name      = module.eks.cluster_name
  cluster_endpoint  = module.eks.cluster_endpoint
  cluster_version   = module.eks.cluster_version
  oidc_provider_arn = module.eks.oidc_provider_arn

  # Delay add-on creation until compute is available: collect the ARN of
  # every managed node group, skipping any group without an ARN
  create_delay_dependencies = [
    for node_group in module.eks.eks_managed_node_groups :
    node_group.node_group_arn
    if node_group.node_group_arn != null
  ]

  tags = module.tags.tags
}
51 changes: 51 additions & 0 deletions cluster-autoscaler/eks.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
################################################################################
# EKS Cluster
################################################################################

module "eks" {
  source  = "terraform-aws-modules/eks/aws"
  version = "~> 20.4"

  cluster_name    = local.name
  cluster_version = "1.29"

  # To facilitate easier interaction for demonstration purposes
  cluster_endpoint_public_access = true

  # Gives the Terraform identity admin access to the cluster, which allows
  # deploying resources (the cluster-autoscaler add-on) into the cluster
  enable_cluster_creator_admin_permissions = true

  cluster_addons = {
    coredns    = {}
    kube-proxy = {}
    vpc-cni    = {}
  }

  vpc_id     = module.vpc.vpc_id
  subnet_ids = module.vpc.private_subnets

  eks_managed_node_group_defaults = {
    iam_role_additional_policies = {
      # Not required, but used in the example to allow accessing the nodes through SSM
      AmazonSSMManagedInstanceCore = "arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore"
    }
  }

  eks_managed_node_groups = {
    default = {
      instance_types = ["m5.large"]

      # Starts with a single node; `min_size`/`max_size` are the bounds of the node group
      min_size     = 1
      max_size     = 20
      desired_size = 1

      # Tag keys follow the cluster-autoscaler auto-discovery convention
      tags = {
        "k8s.io/cluster-autoscaler/enabled"       = "true"
        "k8s.io/cluster-autoscaler/${local.name}" = "owned"
      }
    }
  }

  tags = module.tags.tags
}
27 changes: 27 additions & 0 deletions cluster-autoscaler/inflate.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# Sample workload: 8 replicas of the `pause` image with large resource requests
apiVersion: apps/v1
kind: Deployment
metadata:
  name: inflate
spec:
  replicas: 8
  selector:
    matchLabels:
      app: inflate
  template:
    metadata:
      labels:
        app: inflate
    spec:
      terminationGracePeriodSeconds: 0
      containers:
        - name: inflate
          image: public.ecr.aws/eks-distro/kubernetes/pause:3.7
          resources:
            # Each pod asks for (nearly) everything an m5.large offers,
            # so pods end up mapped one per node
            limits:
              cpu: 1500m
              memory: 6Gi
            requests:
              cpu: 1500m
              memory: 6Gi
69 changes: 69 additions & 0 deletions cluster-autoscaler/main.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
terraform {
  required_version = "~> 1.3"

  required_providers {
    aws = {
      source  = "hashicorp/aws"
      version = ">= 5.38"
    }
    helm = {
      source = "hashicorp/helm"
      # Capped below 3.0: this configuration declares the Helm provider's
      # Kubernetes connection as a nested `kubernetes { ... }` block, which
      # the 3.x provider replaced with an attribute. Without the cap,
      # `terraform init -upgrade` can select 3.x and fail to parse the config.
      version = ">= 2.7, < 3.0"
    }
  }
}

provider "aws" {
  # Optional: uncomment and fill in `role_arn` to have the provider assume a role
  # assume_role {
  #   role_arn     = "<TODO>"
  #   session_name = local.name
  # }

  region = local.region
}

provider "helm" {
  kubernetes {
    host                   = module.eks.cluster_endpoint
    cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data)

    # Fetch a short-lived token through the AWS CLI on every run instead of
    # storing credentials in the configuration
    exec {
      api_version = "client.authentication.k8s.io/v1beta1"
      command     = "aws"
      # This requires the awscli to be installed locally where Terraform is executed.
      # The region is passed explicitly so the token request does not depend on
      # the default region configured for the local AWS CLI.
      args = ["eks", "get-token", "--cluster-name", module.eks.cluster_name, "--region", local.region]
    }
  }
}

################################################################################
# Common Locals
################################################################################

locals {
  # Identity of the example; `name` is reused for the cluster, VPC and tags
  name        = "cluster-autoscaler"
  environment = "nonprod"
  region      = "us-east-1"

  # Networking: the VPC range and the first three zones returned by the data source
  vpc_cidr = "10.0.0.0/16"
  azs      = slice(data.aws_availability_zones.available.names, 0, 3)
}

################################################################################
# Common Data
################################################################################

data "aws_availability_zones" "available" {
  # Only consider regular Availability Zones. Zones that require opt-in
  # (e.g. Local Zones) are excluded so they are never picked up when the
  # first entries of `names` are selected.
  filter {
    name   = "opt-in-status"
    values = ["opt-in-not-required"]
  }
}

################################################################################
# Common Modules
################################################################################

module "tags" {
  source  = "clowdhaus/tags/aws"
  version = "~> 1.0"

  # Inputs used to build the common tag set exposed as `module.tags.tags`
  repository  = "https://github.com/clowdhaus/eks-reference-architecture"
  environment = local.environment
  application = local.name
}
4 changes: 4 additions & 0 deletions cluster-autoscaler/outputs.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Ready-to-run AWS CLI command, built from the configured region and the cluster name
output "update_kubeconfig" {
  value       = "aws eks --region ${local.region} update-kubeconfig --name ${module.eks.cluster_name}"
  description = "Update kubeconfig"
}
33 changes: 33 additions & 0 deletions cluster-autoscaler/vpc.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
################################################################################
# VPC
################################################################################

module "vpc" {
  # https://registry.terraform.io/modules/terraform-aws-modules/vpc/aws/latest
  source  = "terraform-aws-modules/vpc/aws"
  version = "~> 5.0"

  name = local.name
  cidr = local.vpc_cidr

  azs = local.azs
  # One private subnet per AZ: 4 extra prefix bits, network numbers 0, 1, 2, ...
  # (with a /16 VPC CIDR these are /20 ranges)
  private_subnets = [for k, v in local.azs : cidrsubnet(local.vpc_cidr, 4, k)]
  # One public subnet per AZ: 8 extra prefix bits, network numbers starting at 48
  # (with a /16 VPC CIDR these are /24 ranges placed after three private /20 ranges)
  public_subnets = [for k, v in local.azs : cidrsubnet(local.vpc_cidr, 8, k + 48)]

  # A single NAT gateway is shared by all private subnets
  enable_nat_gateway = true
  single_nat_gateway = true

  public_subnet_tags = {
    "kubernetes.io/role/elb"              = 1
    "kubernetes.io/cluster/${local.name}" = "shared"
  }

  private_subnet_tags = {
    "kubernetes.io/role/internal-elb"     = 1
    "kubernetes.io/cluster/${local.name}" = "shared"
  }

  tags = module.tags.tags
}
12 changes: 12 additions & 0 deletions eks-mng-gpu/eks.tf
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,18 @@ module "eks" {
max_size = 1
desired_size = 1

# Default AMI has only 8GB of storage
block_device_mappings = {
xvda = {
device_name = "/dev/xvda"
ebs = {
volume_size = 256
volume_type = "gp3"
delete_on_termination = true
}
}
}

taints = {
# Ensure only GPU workloads are scheduled on this node group
gpu = {
Expand Down
Loading

0 comments on commit 77f3a10

Please sign in to comment.