diff --git a/Makefile-az.mk b/Makefile-az.mk
index 8ba0d0972..ab6f60464 100755
--- a/Makefile-az.mk
+++ b/Makefile-az.mk
@@ -263,12 +263,6 @@ az-klogs: ## Karpenter logs
 az-kevents: ## Karpenter events
 	kubectl get events -A --field-selector source=karpenter
 
-az-provision-gpus:
-	kubectl apply -f examples/workloads/device-plugin.yaml
-	kubectl apply -f examples/provisioner/gpu-provisioner.yaml
-	kubectl apply -f examples/workloads/samples-mnist.yaml
-
-
 az-node-viewer: ## Watch nodes using eks-node-viewer
 	eks-node-viewer --disable-pricing --node-selector "karpenter.sh/nodepool" # --resources cpu,memory
 
diff --git a/hack/azure/general-purpose-small-nodes.yaml b/hack/azure/general-purpose-small-nodes.yaml
new file mode 100644
index 000000000..ceb0e49ff
--- /dev/null
+++ b/hack/azure/general-purpose-small-nodes.yaml
@@ -0,0 +1,41 @@
+
+---
+apiVersion: karpenter.sh/v1beta1
+kind: NodePool
+metadata:
+  name: sm-general-purpose
+  annotations:
+    kubernetes.io/description: "General purpose NodePool for generic workloads with small nodes"
+spec:
+  disruption:
+    expireAfter: Never
+  template:
+    spec:
+      # allow only small D-family SKUs (fewer than 3 vCPUs), on-demand, linux/amd64
+      requirements:
+        - key: kubernetes.io/arch
+          operator: In
+          values: ["amd64"]
+        - key: kubernetes.io/os
+          operator: In
+          values: ["linux"]
+        - key: karpenter.sh/capacity-type
+          operator: In
+          values: ["on-demand"]
+        - key: karpenter.azure.com/sku-cpu
+          operator: Lt
+          values: ["3"]
+        - key: karpenter.azure.com/sku-family
+          operator: In
+          values: ["D"]
+      nodeClassRef:
+        name: default
+---
+apiVersion: karpenter.azure.com/v1alpha2
+kind: AKSNodeClass
+metadata:
+  name: default
+  annotations:
+    kubernetes.io/description: "General purpose AKSNodeClass for running Ubuntu2204 nodes"
+spec:
+  imageFamily: Ubuntu2204
diff --git a/hack/azure/perftest.sh b/hack/azure/perftest.sh
index 6909ae558..35d506e34 100755
--- a/hack/azure/perftest.sh
+++ b/hack/azure/perftest.sh
@@ -2,10 +2,8 @@
 # This deploys Provisioner requiring small instances (2 vCPU) and
 'inflate' deployment with 1 cpu request, requiring VM per replica.
 # It then scales the deployment up to the requested number of replicas (allocating the same number of VMs) and then scales it down.
+# Running `make az-mon-deploy` and `make az-mon-access` configures monitoring dashboards that can be used to observe the scale-up.
 
-# TODO: obtain cluster ID programmatically
-CLUSTER_ID=63559813aff5f40001dfadb5
-DASHBOARD=3052d470-b928-4e5e-bdbc-cc01e18ff318
 
 set -euxo pipefail
 
@@ -14,7 +12,9 @@ replicas="$1"
 
 FMT='+%Y-%m-%dT%H-%M-%SZ'
 START=$(date ${FMT})
-STARTKUBECTL=$(date --iso-8601=seconds)
+# UTC RFC 3339 timestamp for `kubectl logs --since-time`. This explicit format
+# string works with both GNU (Linux) and BSD (macOS) date, so no OS check is needed.
+STARTKUBECTL=$(date -u +"%Y-%m-%dT%H:%M:%SZ")
 
 mkdir -p logs
 exec > >(tee -i "logs/az-perftest-${START}-${replicas}.log")
@@ -22,7 +22,7 @@ exec 2>&1
 logk="logs/az-perftest-${START}-${replicas}-karpenter.log"
 
 # prep
-kubectl apply -f examples/provisioner/general-purpose-azure-smallnodes.yaml
+kubectl apply -f hack/azure/general-purpose-small-nodes.yaml
 kubectl apply -f examples/workloads/inflate.yaml
 
 # scale up
@@ -31,24 +31,22 @@ kubectl scale --replicas="${replicas}" deployment/inflate
 time kubectl rollout status deployment/inflate --watch --timeout=2h
 date
 ENDUP=$(date ${FMT})
-echo Scale up: "https://dataexplorer.azure.com/dashboards/${DASHBOARD}?p-_startTime=${START}&p-_endTime=${ENDUP}&p-_cluster_id=${CLUSTER_ID}&p-_bin_size=v-20s"
-ENDUPKUBECTL=$(date --iso-8601=seconds)
+echo "Scale up: ${START} ${ENDUP} ${replicas}"
+# Same portable UTC RFC 3339 format as STARTKUBECTL (valid for GNU and BSD date)
+ENDUPKUBECTL=$(date -u +"%Y-%m-%dT%H:%M:%SZ")
 kubectl logs deployment/karpenter -n karpenter --since-time="${STARTKUBECTL}" > "${logk}"
 
 # scale down
 sleep 30
 kubectl scale --replicas=0 deployment/inflate
 date
-kubectl delete --wait=false nodes -l karpenter.sh/provisioner-name
-time kubectl wait --for=delete nodes -l karpenter.sh/provisioner-name --timeout=30m
+kubectl delete --wait=false nodes -l karpenter.sh/nodepool
+time kubectl wait --for=delete nodes -l karpenter.sh/nodepool --timeout=2h
 ENDDOWN=$(date ${FMT})
 date
 
 # review
 kubectl logs deployment/karpenter -n karpenter --since-time="${ENDUPKUBECTL}" >> "${logk}"
-az resource list -o table --tag=karpenter.sh_provisioner-name=default
-# az resource wait --deleted --timeout 300 --tag=karpenter.sh_provisioner-name=default - can't wait on tags :(
+az resource list -o table --tag=karpenter.sh_nodepool=sm-general-purpose
 
-# Cluster Autoscaler dashboard links - handy for some metrics
-echo Scale up: "https://dataexplorer.azure.com/dashboards/${DASHBOARD}?p-_startTime=${START}&p-_endTime=${ENDUP}&p-_cluster_id=${CLUSTER_ID}&p-_bin_size=v-20s"
-echo Scale down: "https://dataexplorer.azure.com/dashboards/${DASHBOARD}?p-_startTime=${ENDUP}&p-_endTime=${ENDDOWN}&p-_cluster_id=${CLUSTER_ID}&p-_bin_size=v-20s"
+echo "Scale down: ${ENDUP} ${ENDDOWN} ${replicas}"