Skip to content

Commit

Permalink
chore: cleaning up experimental perf test scripts (#34)
Browse files Browse the repository at this point in the history
* chore(provisioner-refs): removing references to legacy gpu provisioners

* fix: fixing perf test scale up and removing old Azure dashboard references in favor of the `make az-mon-*` dashboards

* chore: bumping delete timeout for larger runs
  • Loading branch information
Bryce-Soghigian authored Nov 15, 2023
1 parent e983049 commit 887970a
Show file tree
Hide file tree
Showing 3 changed files with 63 additions and 20 deletions.
6 changes: 0 additions & 6 deletions Makefile-az.mk
Original file line number Diff line number Diff line change
Expand Up @@ -263,12 +263,6 @@ az-klogs: ## Karpenter logs
az-kevents: ## Karpenter events
kubectl get events -A --field-selector source=karpenter

az-provision-gpus:
kubectl apply -f examples/workloads/device-plugin.yaml
kubectl apply -f examples/provisioner/gpu-provisioner.yaml
kubectl apply -f examples/workloads/samples-mnist.yaml


az-node-viewer: ## Watch nodes using eks-node-viewer
eks-node-viewer --disable-pricing --node-selector "karpenter.sh/nodepool" # --resources cpu,memory

Expand Down
41 changes: 41 additions & 0 deletions hack/azure/general-purpose-small-nodes.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@

---
# NodePool used by the perf test: every requirement below must be satisfied
# by a node, which restricts provisioning to small, on-demand, linux/amd64
# D-family SKUs.
apiVersion: karpenter.sh/v1beta1
kind: NodePool
metadata:
  name: sm-general-purpose
  annotations:
    kubernetes.io/description: "General purpose NodePool for generic workloads with small nodes"
spec:
  disruption:
    # Nodes from this pool are never expired based on age.
    expireAfter: Never
  template:
    spec:
      requirements:
        - key: kubernetes.io/arch
          operator: In
          values: ["amd64"]
        - key: kubernetes.io/os
          operator: In
          values: ["linux"]
        - key: karpenter.sh/capacity-type
          operator: In
          values: ["on-demand"]
        # only allow small SKUs (fewer than 3 vCPUs)
        - key: karpenter.azure.com/sku-cpu
          operator: Lt
          values: ["3"]
        - key: karpenter.azure.com/sku-family
          operator: In
          values: ["D"]
      # Refers to the node class named "default".
      nodeClassRef:
        name: default
---
# Node class named "default"; selects the Ubuntu2204 image family.
apiVersion: karpenter.azure.com/v1alpha2
kind: AKSNodeClass
metadata:
  name: default
  annotations:
    kubernetes.io/description: "General purpose AKSNodeClass for running Ubuntu2204 nodes"
spec:
  imageFamily: Ubuntu2204
36 changes: 22 additions & 14 deletions hack/azure/perftest.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,8 @@

# This deploys a NodePool requiring small instances (2 vCPU) and an 'inflate' deployment with a 1 CPU request, so each replica requires its own VM.
# It then scales the deployment up to the requested number of replicas (allocating the same number of VMs) and then scales it down.
# make az-mon-deploy and az-mon-access will configure some monitoring dashboards that can be used to observe the scale up.

# TODO: obtain cluster ID programmatically
CLUSTER_ID=63559813aff5f40001dfadb5
DASHBOARD=3052d470-b928-4e5e-bdbc-cc01e18ff318

set -euxo pipefail

Expand All @@ -14,15 +12,23 @@ replicas="$1"

FMT='+%Y-%m-%dT%H-%M-%SZ'
START=$(date ${FMT})
STARTKUBECTL=$(date --iso-8601=seconds)

# Check if the operating system is macOS or Linux
if [[ "$OSTYPE" == "darwin"* ]]; then
# macOS, use BSD date syntax
STARTKUBECTL=$(date -u +"%Y-%m-%dT%H:%M:%SZ")
else
# Linux, use GNU date syntax
STARTKUBECTL=$(date --iso-8601=seconds)
fi

mkdir -p logs
exec > >(tee -i "logs/az-perftest-${START}-${replicas}.log")
exec 2>&1
logk="logs/az-perftest-${START}-${replicas}-karpenter.log"

# prep
kubectl apply -f examples/provisioner/general-purpose-azure-smallnodes.yaml
kubectl apply -f hack/azure/general-purpose-small-nodes.yaml
kubectl apply -f examples/workloads/inflate.yaml

# scale up
Expand All @@ -31,24 +37,26 @@ kubectl scale --replicas="${replicas}" deployment/inflate
time kubectl rollout status deployment/inflate --watch --timeout=2h
date
ENDUP=$(date ${FMT})
echo Scale up: "https://dataexplorer.azure.com/dashboards/${DASHBOARD}?p-_startTime=${START}&p-_endTime=${ENDUP}&p-_cluster_id=${CLUSTER_ID}&p-_bin_size=v-20s"
ENDUPKUBECTL=$(date --iso-8601=seconds)
echo Scale up: ${START} ${ENDUP} ${replicas}
if [[ "$OSTYPE" == "darwin"* ]]; then
# macOS, use BSD date syntax
ENDUPKUBECTL=$(date -u +"%Y-%m-%dT%H:%M:%SZ")
else
# Linux, use GNU date syntax
ENDUPKUBECTL=$(date --iso-8601=seconds)
fi
kubectl logs deployment/karpenter -n karpenter --since-time="${STARTKUBECTL}" > "${logk}"

# scale down
sleep 30
kubectl scale --replicas=0 deployment/inflate
date
kubectl delete --wait=false nodes -l karpenter.sh/provisioner-name
time kubectl wait --for=delete nodes -l karpenter.sh/provisioner-name --timeout=30m
kubectl delete --wait=false nodes -l karpenter.sh/nodepool
time kubectl wait --for=delete nodes -l karpenter.sh/nodepool --timeout=2h
ENDDOWN=$(date ${FMT})
date

# review
kubectl logs deployment/karpenter -n karpenter --since-time="${ENDUPKUBECTL}" >> "${logk}"
az resource list -o table --tag=karpenter.sh_provisioner-name=default
# az resource wait --deleted --timeout 300 --tag=karpenter.sh_provisioner-name=default - can't wait on tags :(
az resource list -o table --tag=karpenter.sh_nodepool=sm-general-purpose

# Cluster Autoscaler dashboard links - handy for some metrics
echo Scale up: "https://dataexplorer.azure.com/dashboards/${DASHBOARD}?p-_startTime=${START}&p-_endTime=${ENDUP}&p-_cluster_id=${CLUSTER_ID}&p-_bin_size=v-20s"
echo Scale down: "https://dataexplorer.azure.com/dashboards/${DASHBOARD}?p-_startTime=${ENDUP}&p-_endTime=${ENDDOWN}&p-_cluster_id=${CLUSTER_ID}&p-_bin_size=v-20s"

0 comments on commit 887970a

Please sign in to comment.