Skip to content

Commit

Permalink
Merge branch 'main' into tallaxes/karpenter-bump
Browse files Browse the repository at this point in the history
  • Loading branch information
tallaxes authored Dec 1, 2024
2 parents bceaecc + b4ae5e1 commit f426473
Show file tree
Hide file tree
Showing 15 changed files with 228 additions and 142 deletions.
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ repos:
hooks:
- id: shellcheck
- repo: https://github.com/crate-ci/typos
rev: v1.26.0
rev: v1.28.1
hooks:
- id: typos
args: [--write-changes, --force-exclude, --exclude, go.mod]
Expand Down
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ require (
go.uber.org/multierr v1.11.0
go.uber.org/zap v1.27.0
golang.org/x/sync v0.8.0
gopkg.in/yaml.v2 v2.4.0
k8s.io/api v0.30.3
k8s.io/apiextensions-apiserver v0.30.3
k8s.io/apimachinery v0.30.3
Expand Down Expand Up @@ -158,7 +159,6 @@ require (
google.golang.org/protobuf v1.34.2 // indirect
gopkg.in/dnaeon/go-vcr.v3 v3.2.0 // indirect
gopkg.in/inf.v0 v0.9.1 // indirect
gopkg.in/yaml.v2 v2.4.0 // indirect
gopkg.in/yaml.v3 v3.0.1 // indirect
k8s.io/cloud-provider v0.30.3 // indirect
k8s.io/component-base v0.30.3 // indirect
Expand Down
2 changes: 1 addition & 1 deletion pkg/controllers/nodeclass/hash/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,7 @@ func (c *Controller) updateNodeClaimHash(ctx context.Context, nodeClass *v1alpha
v1alpha2.AnnotationAKSNodeClassHashVersion: v1alpha2.AKSNodeClassHashVersion,
})

// Any NodeClaim that is already drifted will remain drifted if the karpenter.k8s.aws/nodepool-hash-version doesn't match
// Any NodeClaim that is already drifted will remain drifted if the karpenter.azure.com/nodepool-hash-version doesn't match
// Since the hashing mechanism has changed we will not be able to determine if the drifted status of the NodeClaim has changed
if nc.StatusConditions().Get(karpv1.ConditionTypeDrifted) == nil {
nc.Annotations = lo.Assign(nc.Annotations, map[string]string{
Expand Down
6 changes: 3 additions & 3 deletions pkg/providers/imagefamily/azlinux.go
Original file line number Diff line number Diff line change
Expand Up @@ -88,9 +88,9 @@ func (u AzureLinux) ScriptlessCustomData(kubeletConfig *bootstrap.KubeletConfigu
CABundle: caBundle,
GPUNode: u.Options.GPUNode,
GPUDriverVersion: u.Options.GPUDriverVersion,
// GPUImageSHA: u.Options.GPUImageSHA - GPU image SHA only applies to Ubuntu
// See: https://github.com/Azure/AgentBaker/blob/f393d6e4d689d9204d6000c85623ad9b764e2a29/vhdbuilder/packer/install-dependencies.sh#L201
SubnetID: u.Options.SubnetID,
GPUDriverType: u.Options.GPUDriverType,
GPUImageSHA: u.Options.GPUImageSHA,
SubnetID: u.Options.SubnetID,
},
Arch: u.Options.Arch,
TenantID: u.Options.TenantID,
Expand Down
2 changes: 2 additions & 0 deletions pkg/providers/imagefamily/bootstrap/aksbootstrap.go
Original file line number Diff line number Diff line change
Expand Up @@ -204,6 +204,7 @@ type NodeBootstrapVariables struct {
SwapFileSizeMB int // t user input
GPUImageSHA string // s static sha rarely updated
GPUDriverVersion string // k determine by OS + GPU hardware requirements; can be determined automatically, but hard. suggest using GPU operator.
GPUDriverType string // k
GPUInstanceProfile string // t user-specified
CustomSearchDomainName string // c user-specified [presumably cluster-level]
CustomSearchRealmUser string // c user-specified [presumably cluster-level]
Expand Down Expand Up @@ -467,6 +468,7 @@ func (a AKS) applyOptions(nbv *NodeBootstrapVariables) {
nbv.GPUNode = true
nbv.ConfigGPUDriverIfNeeded = true
nbv.GPUDriverVersion = a.GPUDriverVersion
nbv.GPUDriverType = a.GPUDriverType
nbv.GPUImageSHA = a.GPUImageSHA
}

Expand Down
1 change: 1 addition & 0 deletions pkg/providers/imagefamily/bootstrap/bootstrap.go
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ type Options struct {
CABundle *string
GPUNode bool
GPUDriverVersion string
GPUDriverType string
GPUImageSHA string
SubnetID string
}
Expand Down
1 change: 1 addition & 0 deletions pkg/providers/imagefamily/bootstrap/cse_cmd.sh.gtpl
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,7 @@ KUBELET_CONFIG_FILE_CONTENT="{{.KubeletConfigFileContent}}"
SWAP_FILE_SIZE_MB="{{.SwapFileSizeMB}}"
GPU_IMAGE_SHA="{{.GPUImageSHA}}"
GPU_DRIVER_VERSION="{{.GPUDriverVersion}}"
GPU_DRIVER_TYPE="{{.GPUDriverType}}"
GPU_INSTANCE_PROFILE="{{.GPUInstanceProfile}}"
CUSTOM_SEARCH_DOMAIN_NAME="{{.CustomSearchDomainName}}"
CUSTOM_SEARCH_REALM_USER="{{.CustomSearchRealmUser}}"
Expand Down
1 change: 1 addition & 0 deletions pkg/providers/imagefamily/ubuntu_2204.go
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,7 @@ func (u Ubuntu2204) ScriptlessCustomData(kubeletConfig *bootstrap.KubeletConfigu
GPUNode: u.Options.GPUNode,
GPUDriverVersion: u.Options.GPUDriverVersion,
GPUImageSHA: u.Options.GPUImageSHA,
GPUDriverType: u.Options.GPUDriverType,
SubnetID: u.Options.SubnetID,
},
Arch: u.Options.Arch,
Expand Down
51 changes: 26 additions & 25 deletions pkg/providers/instancetype/suite_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -604,12 +604,10 @@ var _ = Describe("InstanceType Provider", func() {
nodes := &v1.NodeList{}
Expect(env.Client.List(ctx, nodes)).To(Succeed())
for _, node := range nodes.Items {
Expect(node.Labels["karpenter.k8s.azure/zone"]).ToNot(Equal(fmt.Sprintf("%s-1", fake.Region)))
Expect(node.Labels["karpenter.kubernetes.azure/zone"]).ToNot(Equal(fmt.Sprintf("%s-1", fake.Region)))
Expect(node.Labels["node.kubernetes.io/instance-type"]).To(Equal("Standard_D2_v2"))

}
}

})

DescribeTable("Should not return unavailable offerings", func(azEnv *test.Environment) {
Expand Down Expand Up @@ -655,7 +653,7 @@ var _ = Describe("InstanceType Provider", func() {
}}}
ExpectProvisioned(ctx, env.Client, cluster, cloudProvider, coreProvisioner, pod)
node := ExpectScheduled(ctx, env.Client, pod)
Expect(node.Labels["karpenter.k8s.azure/zone"]).ToNot(Equal(fmt.Sprintf("%s-1", fake.Region)))
Expect(node.Labels["karpenter.kubernetes.azure/zone"]).ToNot(Equal(fmt.Sprintf("%s-1", fake.Region)))
Expect(node.Labels["node.kubernetes.io/instance-type"]).To(Equal("Standard_D2_v2"))
})
It("should launch smaller instances than optimal if larger instance launch results in Insufficient Capacity Error", func() {
Expand Down Expand Up @@ -1011,20 +1009,15 @@ var _ = Describe("InstanceType Provider", func() {
ExpectApplied(ctx, env.Client, nodePool, nodeClass)
pod := coretest.UnschedulablePod(coretest.PodOptions{})
ExpectProvisioned(ctx, env.Client, cluster, cloudProvider, coreProvisioner, pod)
ExpectScheduled(ctx, env.Client, pod)
node := ExpectScheduled(ctx, env.Client, pod)

Expect(azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1))
vm := azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.CalledWithInput.Pop().VM
Expect(vm.Properties).ToNot(BeNil())
Expect(vm.Properties.HardwareProfile).ToNot(BeNil())
Expect(utils.IsNvidiaEnabledSKU(string(*vm.Properties.HardwareProfile.VMSize))).To(BeFalse())

clusterNodes := cluster.Nodes()
node := clusterNodes[0]
if node.Name() == pod.Spec.NodeName {
nodeLabels := node.Labels()
Expect(nodeLabels).To(HaveKeyWithValue("karpenter.k8s.azure/sku-gpu-count", "0"))
}
Expect(node.Labels).To(HaveKeyWithValue("karpenter.azure.com/sku-gpu-count", "0"))
})

It("should schedule GPU pod on GPU capable node", func() {
Expand Down Expand Up @@ -1054,23 +1047,31 @@ var _ = Describe("InstanceType Provider", func() {
})

ExpectProvisioned(ctx, env.Client, cluster, cloudProvider, coreProvisioner, pod)
ExpectScheduled(ctx, env.Client, pod)

// Verify that the node has the GPU label set that the pod was scheduled on
clusterNodes := cluster.Nodes()
Expect(clusterNodes).ToNot(BeEmpty())
Expect(len(clusterNodes)).To(Equal(1))
node := clusterNodes[0]
Expect(node.Node.Status.Allocatable).To(HaveKeyWithValue(v1.ResourceName("nvidia.com/gpu"), resource.MustParse("1")))
node := ExpectScheduled(ctx, env.Client, pod)

if node.Name() == pod.Spec.NodeName {
nodeLabels := node.Labels()
// the following checks assume Standard_NC16as_T4_v3 (surprisingly the cheapest GPU in the test set), so test the assumption
Expect(node.Labels).To(HaveKeyWithValue("node.kubernetes.io/instance-type", "Standard_NC16as_T4_v3"))

Expect(nodeLabels).To(HaveKeyWithValue("karpenter.k8s.azure/sku-gpu-name", "A100"))
Expect(nodeLabels).To(HaveKeyWithValue("karpenter.k8s.azure/sku-gpu-manufacturer", v1alpha2.ManufacturerNvidia))
Expect(nodeLabels).To(HaveKeyWithValue("karpenter.k8s.azure/sku-gpu-count", "1"))
// Verify GPU related settings in bootstrap (assuming one Standard_NC16as_T4_v3)
customData := ExpectDecodedCustomData(azureEnv)
Expect(customData).To(SatisfyAll(
ContainSubstring("GPU_NODE=true"),
ContainSubstring("SGX_NODE=false"),
ContainSubstring("MIG_NODE=false"),
ContainSubstring("CONFIG_GPU_DRIVER_IF_NEEDED=true"),
ContainSubstring("ENABLE_GPU_DEVICE_PLUGIN_IF_NEEDED=false"),
ContainSubstring("GPU_DRIVER_TYPE=\"cuda\""),
ContainSubstring(fmt.Sprintf("GPU_DRIVER_VERSION=\"%s\"", utils.NvidiaCudaDriverVersion)),
ContainSubstring(fmt.Sprintf("GPU_IMAGE_SHA=\"%s\"", utils.AKSGPUCudaVersionSuffix)),
ContainSubstring("GPU_NEEDS_FABRIC_MANAGER=\"false\""),
ContainSubstring("GPU_INSTANCE_PROFILE=\"\""),
))

}
// Verify that the node the pod was scheduled on has GPU resource and labels set
Expect(node.Status.Allocatable).To(HaveKeyWithValue(v1.ResourceName("nvidia.com/gpu"), resource.MustParse("1")))
Expect(node.Labels).To(HaveKeyWithValue("karpenter.azure.com/sku-gpu-name", "T4"))
Expect(node.Labels).To(HaveKeyWithValue("karpenter.azure.com/sku-gpu-manufacturer", v1alpha2.ManufacturerNvidia))
Expect(node.Labels).To(HaveKeyWithValue("karpenter.azure.com/sku-gpu-count", "1"))
})
})

Expand Down
1 change: 1 addition & 0 deletions pkg/providers/launchtemplate/launchtemplate.go
Original file line number Diff line number Diff line change
Expand Up @@ -152,6 +152,7 @@ func (p *Provider) getStaticParameters(ctx context.Context, instanceType *cloudp
Arch: arch,
GPUNode: utils.IsNvidiaEnabledSKU(instanceType.Name),
GPUDriverVersion: utils.GetGPUDriverVersion(instanceType.Name),
GPUDriverType: utils.GetGPUDriverType(instanceType.Name),
GPUImageSHA: utils.GetAKSGPUImageSHA(instanceType.Name),
TenantID: p.tenantID,
SubscriptionID: p.subscriptionID,
Expand Down
1 change: 1 addition & 0 deletions pkg/providers/launchtemplate/parameters/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ type StaticParameters struct {
Arch string
GPUNode bool
GPUDriverVersion string
GPUDriverType string
GPUImageSHA string
TenantID string
SubscriptionID string
Expand Down
153 changes: 59 additions & 94 deletions pkg/utils/gpu.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,124 +17,81 @@ limitations under the License.
package utils

import (
_ "embed"
"strings"

"gopkg.in/yaml.v2"
)

// TODO: Get these from agentbaker
const (
Nvidia470CudaDriverVersion = "cuda-470.82.01"
Nvidia550CudaDriverVersion = "cuda-550.54.15"
Nvidia535GridDriverVersion = "grid-535.161.08"

// These SHAs will change once we update aks-gpu images in aks-gpu repository. We do that fairly rarely at this time.
// So for now these will be kept here like this and periodically bump them
AKSGPUGridSHA = "sha-d1f0ca"
AKSGPUCudaSHA = "sha-2d4c96"
Nvidia470CudaDriverVersion = "470.82.01"

// https://github.com/Azure/AgentBaker/blob/ddf36a24eafd02ce0589657ff2dc799125f4ad37/parts/linux/cloud-init/artifacts/components.json#L562
NvidiaCudaDriverVersion = "550.90.12"
AKSGPUCudaVersionSuffix = "20241021235610"

NvidiaGridDriverVersion = "535.161.08"
AKSGPUGridVersionSuffix = "20241021235607"
)

func GetAKSGPUImageSHA(size string) string {
if UseGridDrivers(size) {
return AKSGPUGridSHA
}
return AKSGPUCudaSHA
type NvidiaSKUConfig struct {
NvidiaEnabledSKUFamilies map[string][]string `yaml:"nvidiaEnabledSKUs"`
MarinerNvidiaEnabledSKUFamilies map[string][]string `yaml:"marinerNvidiaEnabledSKUs"`
}

var (
/* If a new GPU sku becomes available, add a key to this map, but only if you have a confirmation
that we have an agreement with NVIDIA for this specific gpu.
*/
NvidiaEnabledSKUs = map[string]bool{
// M60
"standard_nv6": true,
"standard_nv12": true,
"standard_nv12s_v3": true,
"standard_nv24": true,
"standard_nv24s_v3": true,
"standard_nv24r": true,
"standard_nv48s_v3": true,
// P40
"standard_nd6s": true,
"standard_nd12s": true,
"standard_nd24s": true,
"standard_nd24rs": true,
// P100
"standard_nc6s_v2": true,
"standard_nc12s_v2": true,
"standard_nc24s_v2": true,
"standard_nc24rs_v2": true,
// V100
"standard_nc6s_v3": true,
"standard_nc12s_v3": true,
"standard_nc24s_v3": true,
"standard_nc24rs_v3": true,
"standard_nd40s_v3": true,
"standard_nd40rs_v2": true,
// T4
"standard_nc4as_t4_v3": true,
"standard_nc8as_t4_v3": true,
"standard_nc16as_t4_v3": true,
"standard_nc64as_t4_v3": true,
// A100 40GB
"standard_nd96asr_v4": true,
"standard_nd112asr_a100_v4": true,
"standard_nd120asr_a100_v4": true,
// A100 80GB
"standard_nd96amsr_a100_v4": true,
"standard_nd112amsr_a100_v4": true,
"standard_nd120amsr_a100_v4": true,
// A100 PCIE 80GB
"standard_nc24ads_a100_v4": true,
"standard_nc48ads_a100_v4": true,
"standard_nc96ads_a100_v4": true,
"standard_ncads_a100_v4": true,
// A10
"standard_nc8ads_a10_v4": true,
"standard_nc16ads_a10_v4": true,
"standard_nc32ads_a10_v4": true,
// A10, GRID only
"standard_nv6ads_a10_v5": true,
"standard_nv12ads_a10_v5": true,
"standard_nv18ads_a10_v5": true,
"standard_nv36ads_a10_v5": true,
"standard_nv36adms_a10_v5": true,
"standard_nv72ads_a10_v5": true,
// A100
"standard_nd96ams_v4": true,
"standard_nd96ams_a100_v4": true,
nvidiaEnabledSKUs = make(map[string]bool)
marinerNvidiaEnabledSKUs = make(map[string]bool)
)

//go:embed supported-gpus.yaml
var configFile []byte

// init fills the package-level NVIDIA SKU lookup maps at package
// initialization by delegating to readNvidiaSKUConfig.
func init() {
readNvidiaSKUConfig()
}

// readNvidiaSKUConfig decodes the embedded YAML configuration (configFile)
// into an NvidiaSKUConfig and records every listed SKU in the package-level
// membership sets nvidiaEnabledSKUs and marinerNvidiaEnabledSKUs.
//
// It panics if the embedded document cannot be unmarshalled.
func readNvidiaSKUConfig() {
	var parsed NvidiaSKUConfig
	if err := yaml.Unmarshal(configFile, &parsed); err != nil {
		panic(err)
	}

	// Flatten a "family -> list of SKUs" mapping into a membership set;
	// the family key itself is not retained.
	register := func(set map[string]bool, families map[string][]string) {
		for _, members := range families {
			for _, name := range members {
				set[name] = true
			}
		}
	}

	register(nvidiaEnabledSKUs, parsed.NvidiaEnabledSKUFamilies)
	register(marinerNvidiaEnabledSKUs, parsed.MarinerNvidiaEnabledSKUFamilies)
}

// List of GPU SKUs currently enabled and validated for Mariner. Will expand the support
// to cover other SKUs available in Azure
MarinerNvidiaEnabledSKUs = map[string]bool{
// V100
"standard_nc6s_v3": true,
"standard_nc12s_v3": true,
"standard_nc24s_v3": true,
"standard_nc24rs_v3": true,
"standard_nd40s_v3": true,
"standard_nd40rs_v2": true,
// T4
"standard_nc4as_t4_v3": true,
"standard_nc8as_t4_v3": true,
"standard_nc16as_t4_v3": true,
"standard_nc64as_t4_v3": true,
func GetAKSGPUImageSHA(size string) string {
if UseGridDrivers(size) {
return AKSGPUGridVersionSuffix
}
)
return AKSGPUCudaVersionSuffix
}

// IsNvidiaEnabledSKU determines if a VM SKU has nvidia driver support
func IsNvidiaEnabledSKU(vmSize string) bool {
// Trim the optional _Promo suffix.
vmSize = strings.ToLower(vmSize)
vmSize = strings.TrimSuffix(vmSize, "_promo")
return NvidiaEnabledSKUs[vmSize]
return nvidiaEnabledSKUs[vmSize]
}

// IsMarinerEnabledGPUSKU determines if a VM SKU is in the set of GPU SKUs enabled for Mariner
func IsMarinerEnabledGPUSKU(vmSize string) bool {
// Trim the optional _Promo suffix.
vmSize = strings.ToLower(vmSize)
vmSize = strings.TrimSuffix(vmSize, "_promo")
return MarinerNvidiaEnabledSKUs[vmSize]
return marinerNvidiaEnabledSKUs[vmSize]
}

// NV series GPUs target graphics workloads vs NC which targets compute.
Expand All @@ -143,12 +100,20 @@ func IsMarinerEnabledGPUSKU(vmSize string) bool {
// NVv3 is untested on AKS, NVv4 is AMD so n/a, and NVv2 no longer seems to exist (?).
func GetGPUDriverVersion(size string) string {
if UseGridDrivers(size) {
return Nvidia535GridDriverVersion
return NvidiaGridDriverVersion
}
if isStandardNCv1(size) {
return Nvidia470CudaDriverVersion
}
return Nvidia550CudaDriverVersion
return NvidiaCudaDriverVersion
}

// GetGPUDriverType reports the GPU driver flavor for the given VM SKU:
// "grid" when UseGridDrivers(size) is true, and "cuda" otherwise.
func GetGPUDriverType(size string) string {
	driverType := "cuda"
	if UseGridDrivers(size) {
		driverType = "grid"
	}
	return driverType
}

func isStandardNCv1(size string) bool {
Expand Down
Loading

0 comments on commit f426473

Please sign in to comment.