diff --git a/hack/ebs-scale-test/README.md b/hack/ebs-scale-test/README.md
new file mode 100644
index 0000000000..225ee87561
--- /dev/null
+++ b/hack/ebs-scale-test/README.md
@@ -0,0 +1,78 @@
+# EBS CSI Driver Scalability Tests
+
+EBS uses EBS CSI Driver scalability tests to validate that each release of our driver can manage the EBS volume lifecycle for large-scale clusters.
+
+Set up and run an EBS CSI Driver scalability test with our `scale-test` tool:
+
+```shell
+# Set scalability parameters
+export CLUSTER_TYPE="pre-allocated"
+export TEST_TYPE="scale-sts"
+export REPLICAS="1000"
+
+# Set up an EKS scalability cluster and install EBS CSI Driver.
+./scale-test setup
+
+# Run a scalability test and export results to S3.
+./scale-test run
+
+# Clean up all AWS resources related to scalability cluster.
+./scale-test cleanup
+```
+
+## Pre-requisites
+
+REVIEWER NOTE: I'm open to relying on `make tools` /bin dependencies. But that might be confusing to those just wanting to run scale tests.
+
+Install the following command-line tools:
+- [gomplate](https://github.com/hairyhenderson/gomplate)
+- [aws cli v2](https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html)
+- [eksctl](https://eksctl.io/installation/)
+- [kubectl](https://kubernetes.io/docs/tasks/tools/#kubectl)
+- [helm](https://helm.sh/docs/intro/install/)
+- [curl](https://curl.se/)
+
+## Overridable parameters
+
+You can change the type of scalability cluster and test that is run, or the names of script artifacts, through environment variables.
+
+Note: The environment variables set when you run `scale-test setup` must remain the same for future `scale-test run`/`scale-test cleanup` commands on that scalability cluster.
+
+```sh
+# Affect test
+CLUSTER_TYPE              # Type of scalability cluster to create.
+TEST_TYPE                 # Type of scale test to run.
+REPLICAS                  # Number of StatefulSet replicas to create.
+DRIVER_VALUES_FILEPATH    # Custom values file passed to EBS CSI Driver Helm chart.
+
+# Names
+CLUSTER_NAME              # Base name used by `eksctl` to create AWS resources.
+EXPORT_DIR                # Where to export scale test metrics/logs locally.
+S3_BUCKET                 # Name of S3 bucket used for holding scalability run results.
+SCALABILITY_TEST_RUN_NAME # Name of test run. Used as name of directory for adding run results in $S3_BUCKET.
+
+# Find default values at top of `scale-test` script.
+```
+
+## Types of scalability tests
+
+Set the `CLUSTER_TYPE` and `TEST_TYPE` environment variables to set up and run different scalability tests.
+
+- `CLUSTER_TYPE` dictates what type of scalability cluster `scale-test` creates and which nodes are used during a scalability test run. Options include:
+  - 'pre-allocated': Additional worker nodes are created during cluster setup. By default, we pre-allocate 1 `m7a.48xlarge` EC2 instance for every 100 StatefulSet replicas, plus one extra.
+
+
+- `TEST_TYPE` dictates what type of scalability test we want to run. Options include:
+  - 'scale-sts': Scales a StatefulSet to `$REPLICAS`. Waits for all pods to be ready. Deletes the StatefulSet. Waits for all PVs to be deleted. Exercises dynamic volume provisioning, attachment, and deletion.
+
+You can mix and match `CLUSTER_TYPE` and `TEST_TYPE`.
+
+## Contributing scalability tests
+
+`scale-test` parses arguments and wraps scripts and configuration files in the `helpers` directory. These helper scripts manage the scalability cluster and test runs.
+
+We rely on [gomplate](https://github.com/hairyhenderson/gomplate) to render configuration files based on environment variables.
+
+The `helpers` directory includes:
+- `/helpers/cluster-setup`: Holds scripts and configuration for cluster and add-on setup/cleanup.
+- `/helpers/scale-test`: Holds a directory for each scale test. Also holds utility scripts used by every test (like exporting logs/metrics to S3).
diff --git a/hack/ebs-scale-test/helpers/cluster-setup/manage-cluster.sh b/hack/ebs-scale-test/helpers/cluster-setup/manage-cluster.sh
new file mode 100755
index 0000000000..0739f9aea3
--- /dev/null
+++ b/hack/ebs-scale-test/helpers/cluster-setup/manage-cluster.sh
@@ -0,0 +1,69 @@
+#!/bin/bash
+# Copyright 2025 The Kubernetes Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+### Helper script to create/delete eks ebs-scale-test clusters and install add-ons.
+###
+### Sourced by hack/ebs-scale-test/scale-test, which is expected to provide:
+###   BASE_DIR, CLUSTER_NAME, AWS_REGION, EXPORT_DIR, DRIVER_VALUES_FILEPATH
+
+# Refuse direct execution up front: `return` only succeeds in a sourced file.
+# Braces (not a subshell) are required so that `exit` ends the script itself.
+(return 0 2>/dev/null) || {
+  echo "This script is not meant to be run directly, only sourced as a helper!" >&2
+  exit 1
+}
+
+set -euo pipefail
+
+# We expect this helper script is sourced from hack/ebs-scale-test
+path_to_cluster_setup_dir="${BASE_DIR}/helpers/cluster-setup"
+
+## Cluster
+
+# Creates the scalability cluster unless it already exists; an existing
+# cluster only gets its kubeconfig entry refreshed.
+# The rendered eksctl config is kept in $EXPORT_DIR/cluster-config.yaml.
+create_cluster() {
+  if eksctl get cluster --name "$CLUSTER_NAME" --region "$AWS_REGION" >/dev/null 2>&1; then
+    echo "EKS cluster '$CLUSTER_NAME' already up in $AWS_REGION."
+    aws eks update-kubeconfig --name "$CLUSTER_NAME" --region "$AWS_REGION"
+  else
+    echo "Deploying EKS cluster. See configuration in $EXPORT_DIR/cluster-config.yaml"
+    gomplate -f "$path_to_cluster_setup_dir/scale-cluster-config.yaml" -o "$EXPORT_DIR/cluster-config.yaml"
+    eksctl create cluster -f "$EXPORT_DIR/cluster-config.yaml"
+  fi
+}
+
+# Deletes the scalability cluster. Name and region are passed explicitly so
+# the same cluster that create_cluster() looked up is the one removed.
+cleanup_cluster() {
+  eksctl delete cluster --name "$CLUSTER_NAME" --region "$AWS_REGION"
+}
+
+## EBS CSI Driver
+
+# Installs (or upgrades) the driver from this repository's Helm chart into
+# kube-system using $DRIVER_VALUES_FILEPATH, waiting up to 15m for readiness.
+deploy_ebs_csi_driver() {
+  local path_to_chart="${BASE_DIR}/../../charts/aws-ebs-csi-driver"
+  echo "Deploying EBS CSI driver from chart $path_to_chart"
+
+  helm upgrade --install aws-ebs-csi-driver \
+    --namespace kube-system \
+    --values "$DRIVER_VALUES_FILEPATH" \
+    --wait \
+    --timeout 15m \
+    "$path_to_chart"
+}
diff --git a/hack/ebs-scale-test/helpers/cluster-setup/scale-cluster-config.yaml b/hack/ebs-scale-test/helpers/cluster-setup/scale-cluster-config.yaml
new file mode 100644
index 0000000000..60f2800807
--- /dev/null
+++ b/hack/ebs-scale-test/helpers/cluster-setup/scale-cluster-config.yaml
@@ -0,0 +1,42 @@
+# Copyright 2025 The Kubernetes Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+apiVersion: eksctl.io/v1alpha5
+kind: ClusterConfig
+metadata:
+  name: {{ .Env.CLUSTER_NAME }}
+  version: {{ .Env.K8S_VERSION }} # exported by scale-test; presumably arrives JSON-quoted so it stays a YAML string (confirm)
+  region: {{ .Env.AWS_REGION }}
+  tags:
+    karpenter.sh/discovery: {{ .Env.CLUSTER_NAME }}
+
+iam:
+  withOIDC: true
+  podIdentityAssociations:
+    - namespace: kube-system
+      serviceAccountName: ebs-csi-controller-sa
+      wellKnownPolicies:
+        ebsCSIController: true
+
+managedNodeGroups:
+{{- if eq ( getenv "CLUSTER_TYPE" ) "pre-allocated" }}
+  - instanceType: m7a.48xlarge
+    amiFamily: AmazonLinux2023 # EKS publishes no AmazonLinux2 AMIs for Kubernetes versions after 1.32
+    name: pre-allocated-ng
+    desiredCapacity: {{ .Env.PRE_ALLOCATED_NODES }} # defaults to (REPLICAS / 100) + 1, see scale-test
+{{- end }}
+
+addons:
+  - name: eks-pod-identity-agent
+  - name: snapshot-controller
diff --git a/hack/ebs-scale-test/helpers/cluster-setup/scale-driver-values.yaml b/hack/ebs-scale-test/helpers/cluster-setup/scale-driver-values.yaml
new file mode 100644
index 0000000000..e15d4e2c59
--- /dev/null
+++ b/hack/ebs-scale-test/helpers/cluster-setup/scale-driver-values.yaml
@@ -0,0 +1,28 @@
+# Copyright 2025 The Kubernetes Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Default values.yaml for ebs-scale-test installation of aws-ebs-csi-driver
+image:
+  pullPolicy: Always
+controller:
+  logLevel: 7
+  replicaCount: 1 # export-to-s3.sh scrapes only the first controller pod it finds
+  enableMetrics: true # export-to-s3.sh scrapes controller port 3301; presumably the endpoint this enables (confirm against the chart)
+sidecars:
+  provisioner:
+    additionalArgs: ["--http-endpoint=:8081"] # scraped at /metrics by export-to-s3.sh
+  resizer:
+    additionalArgs: ["--http-endpoint=:8082"] # scraped at /metrics by export-to-s3.sh
+  attacher:
+    additionalArgs: ["--http-endpoint=:8084"] # scraped at /metrics by export-to-s3.sh
diff --git a/hack/ebs-scale-test/helpers/scale-test/export-to-s3.sh b/hack/ebs-scale-test/helpers/scale-test/export-to-s3.sh
new file mode 100755
index 0000000000..0e4414ac0f
--- /dev/null
+++ b/hack/ebs-scale-test/helpers/scale-test/export-to-s3.sh
@@ -0,0 +1,61 @@
+#!/bin/bash
+# Copyright 2025 The Kubernetes Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+### Helper script for exporting EBS CSI Driver metrics to S3 bucket
+
+set -euo pipefail
+
+# Scrapes controller metrics over temporary port-forwards, collects logs and
+# driver manifests into $EXPORT_DIR, then syncs that directory to S3.
+# Expects EXPORT_DIR, S3_BUCKET, AWS_REGION and SCALABILITY_TEST_RUN_NAME.
+export_to_s3() {
+  local controller_pod_name port
+  local -a forward_pids=()
+
+  echo "Port-forwarding"
+  controller_pod_name=$(kubectl get pod -n kube-system -l app=ebs-csi-controller -o jsonpath='{.items[0].metadata.name}')
+  for port in 3301 8081 8082 8084; do
+    kubectl port-forward "$controller_pod_name" "${port}:${port}" -n kube-system &
+    forward_pids+=("$!")
+  done
+
+  echo "Collecting metrics"
+  for port in 3301 8081 8082 8084; do
+    # --fail: an HTTP error page must trigger a retry, not land in metrics.txt.
+    until curl --silent --show-error --fail "http://localhost:${port}/metrics" >>"$EXPORT_DIR/metrics.txt"; do
+      echo "Failed to collect metrics from port ${port}, retrying..."
+      sleep 5
+    done
+  done
+  # The forwards exist only for scraping; do not leave them running.
+  kill "${forward_pids[@]}" 2>/dev/null || true
+
+  echo "Collecting ebs-plugin logs"
+  kubectl logs "$controller_pod_name" -n kube-system >"$EXPORT_DIR/ebs-plugin-logs.txt"
+
+  echo "Collecting ebs-csi-controller Deployment and ebs-csi-node Daemonset yaml"
+  kubectl get deployment ebs-csi-controller -n kube-system -o yaml >"$EXPORT_DIR/ebs-csi-controller.yaml"
+  kubectl get daemonset ebs-csi-node -n kube-system -o yaml >"$EXPORT_DIR/ebs-csi-node.yaml"
+
+  echo "Exporting everything in $EXPORT_DIR to S3"
+  if ! aws s3 ls "s3://$S3_BUCKET" >/dev/null; then
+    aws s3 mb "s3://$S3_BUCKET" --region "${AWS_REGION}"
+  fi
+
+  aws s3 sync "$EXPORT_DIR" "s3://$S3_BUCKET/$SCALABILITY_TEST_RUN_NAME"
+  echo "Metrics exported to s3://$S3_BUCKET/$SCALABILITY_TEST_RUN_NAME/"
+}
+
+(return 0 2>/dev/null) || { echo "This script is not meant to be run directly, only sourced as a helper!" >&2; exit 1; }
diff --git a/hack/ebs-scale-test/helpers/scale-test/scale-sts-test/scale-sts.sh b/hack/ebs-scale-test/helpers/scale-test/scale-sts-test/scale-sts.sh
new file mode 100755
index 0000000000..9983ec1feb
--- /dev/null
+++ b/hack/ebs-scale-test/helpers/scale-test/scale-sts-test/scale-sts.sh
@@ -0,0 +1,68 @@
+#!/bin/bash
+# Copyright 2025 The Kubernetes Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+### Helper script for running EBS-backed StatefulSet scaling test
+
+# Refuse direct execution up front: `return` only succeeds in a sourced file.
+# Braces (not a subshell) are required so that `exit` ends the script itself.
+(return 0 2>/dev/null) || {
+  echo "This script is not meant to be run directly, only sourced as a helper!" >&2
+  exit 1
+}
+
+# Same strict mode as the sibling helpers, so a failing kubectl aborts the run
+# instead of being read as "0 PVs left".
+set -euo pipefail
+
+# We expect this helper script is sourced from hack/ebs-scale-test
+path_to_scale_test_dir="${BASE_DIR}/helpers/scale-test/scale-sts-test"
+
+# Renders and applies the StatefulSet manifest, scales it to $REPLICAS, waits
+# for the rollout, then deletes it and waits until every PV is gone.
+sts_scale_test() {
+  local manifest_path="$path_to_scale_test_dir/scale-sts.yaml"
+  local export_manifest_path="$EXPORT_DIR/scale-manifest.yaml"
+
+  echo "Applying $manifest_path. Exported to $export_manifest_path"
+  gomplate -f "$manifest_path" -o "$export_manifest_path"
+  kubectl apply -f "$export_manifest_path"
+
+  echo "Scaling StatefulSet $REPLICAS replicas"
+  kubectl scale sts --replicas "$REPLICAS" ebs-scale-test
+  kubectl rollout status statefulset ebs-scale-test
+
+  echo "Deleting StatefulSet"
+  kubectl delete -f "$export_manifest_path"
+
+  echo "Waiting for all PVs to be deleted"
+  wait_for_pvs_to_delete
+}
+
+# Polls every 5 seconds until the cluster reports no PersistentVolumes.
+# Counts every PV in the cluster, not only those created by the test.
+wait_for_pvs_to_delete() {
+  local pv_count
+  while true; do
+    # Declared separately so a kubectl failure is not masked by `local`.
+    pv_count=$(kubectl get pv --no-headers | wc -l)
+    if [ "$pv_count" -eq 0 ]; then
+      echo "No PVs exist in the cluster, proceeding..."
+      break
+    else
+      echo "$pv_count PVs still exist, waiting..."
+      sleep 5
+    fi
+  done
+}
diff --git a/hack/ebs-scale-test/helpers/scale-test/scale-sts-test/scale-sts.yaml b/hack/ebs-scale-test/helpers/scale-test/scale-sts-test/scale-sts.yaml
new file mode 100644
index 0000000000..053445469a
--- /dev/null
+++ b/hack/ebs-scale-test/helpers/scale-test/scale-sts-test/scale-sts.yaml
@@ -0,0 +1,68 @@
+# Copyright 2025 The Kubernetes Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+apiVersion: apps/v1
+kind: StatefulSet
+metadata:
+  name: ebs-scale-test
+spec:
+  serviceName: "nginx"
+  podManagementPolicy: "Parallel"
+  replicas: 0
+  selector:
+    matchLabels:
+      app: ebs-scale-test
+  template:
+    metadata:
+      labels:
+        app: ebs-scale-test
+    spec:
+      containers:
+        - name: nginx
+          image: nginx:latest
+          ports:
+            - containerPort: 80
+              name: web
+          volumeMounts:
+            - name: vol
+              mountPath: /usr/share/nginx/html
+          resources:
+            requests:
+              memory: "256Mi"
+              cpu: "250m"
+            limits:
+              memory: "256Mi"
+{{- if eq ( getenv "CLUSTER_TYPE" ) "karpenter" }}
+      nodeSelector:
+        karpenter.sh/nodepool: ebs-scale-test
+{{- end }}
+  volumeClaimTemplates:
+    - metadata:
+        name: vol
+      spec:
+        accessModes: [ "ReadWriteOnce" ]
+        storageClassName: "ebs-scale-test"
+        resources:
+          requests:
+            storage: 1Gi
+  persistentVolumeClaimRetentionPolicy:
+    whenDeleted: Delete
+---
+apiVersion: storage.k8s.io/v1
+kind: StorageClass
+metadata:
+  name: ebs-scale-test
+provisioner: ebs.csi.aws.com
+reclaimPolicy: Delete
+volumeBindingMode: WaitForFirstConsumer
diff --git a/hack/ebs-scale-test/scale-test b/hack/ebs-scale-test/scale-test
new file mode 100755
index 0000000000..1a2c8601af
--- /dev/null
+++ b/hack/ebs-scale-test/scale-test
@@ -0,0 +1,143 @@
+#!/bin/bash
+# Copyright 2025 The Kubernetes Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+### This script helps setup, run, and cleanup an ebs-scale-test. See README.md
+set -euo pipefail
+BASE_DIR="$(dirname "$(realpath "${BASH_SOURCE[0]}")")"
+
+## Overridable environment variables. See README.md
+export CLUSTER_TYPE
+export TEST_TYPE
+export REPLICAS
+export DRIVER_VALUES_FILEPATH
+
+export CLUSTER_NAME
+export EXPORT_DIR
+export S3_BUCKET
+export SCALABILITY_TEST_RUN_NAME
+
+CLUSTER_TYPE=${CLUSTER_TYPE:-pre-allocated}
+TEST_TYPE=${TEST_TYPE:-scale-sts}
+REPLICAS=${REPLICAS:-1000}
+DRIVER_VALUES_FILEPATH=${DRIVER_VALUES_FILEPATH:-$BASE_DIR/helpers/cluster-setup/scale-driver-values.yaml}
+# TODO Q: Does anyone need an $OVERRIDE_KUBECONFIG? Let's discuss.
+
+CLUSTER_NAME=${CLUSTER_NAME:-ebs-scale-$CLUSTER_TYPE}
+EXPORT_DIR=${EXPORT_DIR:-$(mktemp -d)}
+S3_BUCKET=${S3_BUCKET:-ebs-scale-tests}
+SCALABILITY_TEST_RUN_NAME=${SCALABILITY_TEST_RUN_NAME:-$CLUSTER_NAME-$TEST_TYPE-$REPLICAS-$(date -u +%Y-%m-%dT%H:%M%Z)}
+
+## Script helpers and internal environment variables
+# Exported for the gomplate templates; values are filled in by
+# load_internal_environment once the command line has been validated.
+export PRE_ALLOCATED_NODES K8S_VERSION AWS_ACCOUNT_ID AWS_REGION BASE_DIR TEMPOUT
+
+source "${BASE_DIR}/helpers/cluster-setup/manage-cluster.sh"
+source "${BASE_DIR}/helpers/scale-test/export-to-s3.sh"
+source "${BASE_DIR}/helpers/scale-test/scale-sts-test/scale-sts.sh"
+
+# Prints the accepted sub-commands to stderr and exits with status 1.
+usage() {
+  echo "Usage: $0 [base-cmd]" >&2
+  echo "Possible base-cmds: 'setup', 'run', 'cleanup'" >&2
+  exit 1
+}
+
+# TODO Q: I'm open to relying on `make tools` /bin dependencies. But that might be confusing to those just wanting to run scale tests.
+# Exits with status 1 when a command-line tool used by this script or its
+# helpers is not on PATH.
+check_dependencies_helper() {
+  # `local -r`, not `local readonly`: the latter declares a variable that is
+  # literally named "readonly" and leaves the array writable.
+  local -r dependencies=("kubectl" "aws" "eksctl" "gomplate" "helm" "curl")
+  local cmd
+
+  for cmd in "${dependencies[@]}"; do
+    if ! command -v "${cmd}" &>/dev/null; then
+      echo "${cmd} could not be found, please install it." >&2
+      exit 1
+    fi
+  done
+}
+
+# Resolves the internal variables that need the AWS CLI. Kept out of the top
+# level so a bad command line or a missing tool is reported before any AWS
+# call is attempted.
+load_internal_environment() {
+  if ! [[ "$REPLICAS" =~ ^[0-9]+$ ]]; then
+    echo "REPLICAS must be a non-negative integer, got '$REPLICAS'." >&2
+    exit 1
+  fi
+  # 10# forces base 10 so a value such as 0100 is not read as octal.
+  PRE_ALLOCATED_NODES=${PRE_ALLOCATED_NODES:-$(((10#$REPLICAS / 100) + 1))}
+  # JSON output keeps the quotes around the version, so the rendered
+  # cluster config carries it as a YAML string regardless of the caller's
+  # default CLI output format.
+  K8S_VERSION=$(aws eks describe-cluster-versions --output json --query "clusterVersions[0].clusterVersion")
+
+  AWS_ACCOUNT_ID=$(aws sts get-caller-identity --query Account --output text)
+  # Honor a region already present in the environment.
+  AWS_REGION=${AWS_REGION:-$(aws configure get region)}
+  TEMPOUT=$(mktemp)
+  trap 'rm -f -- "$TEMPOUT"' EXIT
+
+  # A caller-supplied EXPORT_DIR may not exist yet.
+  mkdir -p -- "$EXPORT_DIR"
+}
+
+## Script start
+
+# Functions sourced from helpers/cluster-setup/manage-cluster.sh
+setup-scale() {
+  create_cluster
+  deploy_ebs_csi_driver
+}
+
+# Functions sourced from helpers/scale-test/...
+run-scale() {
+  # TODO Q: Is it worth restarting EBS CSI Controller pod to ensure clean metrics/logs? Or move EBS CSI Driver install/cleanup to 'run' instead of 'setup'? Let's discuss.
+  aws eks update-kubeconfig --name "$CLUSTER_NAME" --region "$AWS_REGION"
+  sts_scale_test
+  export_to_s3
+}
+
+# Functions sourced from helpers/cluster-setup/manage-cluster.sh
+clean-scale() {
+  cleanup_cluster
+}
+
+# Validates the single sub-command, then checks tooling and resolves the AWS
+# environment before dispatching to it.
+main() {
+  local action
+
+  # Check params
+  if [[ $# -ne 1 ]]; then
+    usage
+  fi
+
+  case "$1" in
+    *setup* | *create*) action=setup-scale ;;
+    *run*) action=run-scale ;;
+    *clean*) action=clean-scale ;;
+    *) usage ;;
+  esac
+
+  check_dependencies_helper
+  load_internal_environment
+  "$action"
+}
+
+main "$@"