Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: GPU bootstrap, refresh driver versions and list of supported GPU VM SKUs #587

Merged
merged 8 commits into from
Dec 1, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ repos:
hooks:
- id: shellcheck
- repo: https://github.com/crate-ci/typos
rev: v1.26.0
rev: v1.28.1
hooks:
- id: typos
args: [--write-changes, --force-exclude, --exclude, go.mod]
Expand Down
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ require (
go.uber.org/multierr v1.11.0
go.uber.org/zap v1.27.0
golang.org/x/sync v0.8.0
gopkg.in/yaml.v2 v2.4.0
k8s.io/api v0.30.3
k8s.io/apiextensions-apiserver v0.30.3
k8s.io/apimachinery v0.30.3
Expand Down Expand Up @@ -158,7 +159,6 @@ require (
google.golang.org/protobuf v1.34.2 // indirect
gopkg.in/dnaeon/go-vcr.v3 v3.2.0 // indirect
gopkg.in/inf.v0 v0.9.1 // indirect
gopkg.in/yaml.v2 v2.4.0 // indirect
gopkg.in/yaml.v3 v3.0.1 // indirect
k8s.io/cloud-provider v0.30.3 // indirect
k8s.io/component-base v0.30.3 // indirect
Expand Down
2 changes: 1 addition & 1 deletion pkg/controllers/nodeclass/hash/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,7 @@ func (c *Controller) updateNodeClaimHash(ctx context.Context, nodeClass *v1alpha
v1alpha2.AnnotationAKSNodeClassHashVersion: v1alpha2.AKSNodeClassHashVersion,
})

// Any NodeClaim that is already drifted will remain drifted if the karpenter.k8s.aws/nodepool-hash-version doesn't match
// Any NodeClaim that is already drifted will remain drifted if the karpenter.azure.com/nodepool-hash-version doesn't match
// Since the hashing mechanism has changed we will not be able to determine if the drifted status of the NodeClaim has changed
if nc.StatusConditions().Get(karpv1.ConditionTypeDrifted) == nil {
nc.Annotations = lo.Assign(nc.Annotations, map[string]string{
Expand Down
6 changes: 3 additions & 3 deletions pkg/providers/imagefamily/azlinux.go
Original file line number Diff line number Diff line change
Expand Up @@ -88,9 +88,9 @@ func (u AzureLinux) ScriptlessCustomData(kubeletConfig *bootstrap.KubeletConfigu
CABundle: caBundle,
GPUNode: u.Options.GPUNode,
GPUDriverVersion: u.Options.GPUDriverVersion,
// GPUImageSHA: u.Options.GPUImageSHA - GPU image SHA only applies to Ubuntu
tallaxes marked this conversation as resolved.
Show resolved Hide resolved
// See: https://github.com/Azure/AgentBaker/blob/f393d6e4d689d9204d6000c85623ad9b764e2a29/vhdbuilder/packer/install-dependencies.sh#L201
SubnetID: u.Options.SubnetID,
GPUDriverType: u.Options.GPUDriverType,
GPUImageSHA: u.Options.GPUImageSHA,
tallaxes marked this conversation as resolved.
Show resolved Hide resolved
SubnetID: u.Options.SubnetID,
},
Arch: u.Options.Arch,
TenantID: u.Options.TenantID,
Expand Down
2 changes: 2 additions & 0 deletions pkg/providers/imagefamily/bootstrap/aksbootstrap.go
Original file line number Diff line number Diff line change
Expand Up @@ -204,6 +204,7 @@ type NodeBootstrapVariables struct {
SwapFileSizeMB int // t user input
GPUImageSHA string // s static sha rarely updated
GPUDriverVersion string // k determine by OS + GPU hardware requirements; can be determined automatically, but hard. suggest using GPU operator.
GPUDriverType string // k
GPUInstanceProfile string // t user-specified
CustomSearchDomainName string // c user-specified [presumably cluster-level]
CustomSearchRealmUser string // c user-specified [presumably cluster-level]
Expand Down Expand Up @@ -467,6 +468,7 @@ func (a AKS) applyOptions(nbv *NodeBootstrapVariables) {
nbv.GPUNode = true
nbv.ConfigGPUDriverIfNeeded = true
nbv.GPUDriverVersion = a.GPUDriverVersion
nbv.GPUDriverType = a.GPUDriverType
nbv.GPUImageSHA = a.GPUImageSHA
}

Expand Down
1 change: 1 addition & 0 deletions pkg/providers/imagefamily/bootstrap/bootstrap.go
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ type Options struct {
CABundle *string
GPUNode bool
GPUDriverVersion string
GPUDriverType string
GPUImageSHA string
SubnetID string
}
Expand Down
1 change: 1 addition & 0 deletions pkg/providers/imagefamily/bootstrap/cse_cmd.sh.gtpl
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,7 @@ KUBELET_CONFIG_FILE_CONTENT="{{.KubeletConfigFileContent}}"
SWAP_FILE_SIZE_MB="{{.SwapFileSizeMB}}"
GPU_IMAGE_SHA="{{.GPUImageSHA}}"
GPU_DRIVER_VERSION="{{.GPUDriverVersion}}"
GPU_DRIVER_TYPE="{{.GPUDriverType}}"
GPU_INSTANCE_PROFILE="{{.GPUInstanceProfile}}"
CUSTOM_SEARCH_DOMAIN_NAME="{{.CustomSearchDomainName}}"
CUSTOM_SEARCH_REALM_USER="{{.CustomSearchRealmUser}}"
Expand Down
1 change: 1 addition & 0 deletions pkg/providers/imagefamily/ubuntu_2204.go
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,7 @@ func (u Ubuntu2204) ScriptlessCustomData(kubeletConfig *bootstrap.KubeletConfigu
GPUNode: u.Options.GPUNode,
GPUDriverVersion: u.Options.GPUDriverVersion,
GPUImageSHA: u.Options.GPUImageSHA,
GPUDriverType: u.Options.GPUDriverType,
SubnetID: u.Options.SubnetID,
},
Arch: u.Options.Arch,
Expand Down
51 changes: 26 additions & 25 deletions pkg/providers/instancetype/suite_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -604,12 +604,10 @@ var _ = Describe("InstanceType Provider", func() {
nodes := &v1.NodeList{}
Expect(env.Client.List(ctx, nodes)).To(Succeed())
for _, node := range nodes.Items {
Expect(node.Labels["karpenter.k8s.azure/zone"]).ToNot(Equal(fmt.Sprintf("%s-1", fake.Region)))
Expect(node.Labels["karpenter.kubernetes.azure/zone"]).ToNot(Equal(fmt.Sprintf("%s-1", fake.Region)))
Expect(node.Labels["node.kubernetes.io/instance-type"]).To(Equal("Standard_D2_v2"))

}
}

})

DescribeTable("Should not return unavailable offerings", func(azEnv *test.Environment) {
Expand Down Expand Up @@ -655,7 +653,7 @@ var _ = Describe("InstanceType Provider", func() {
}}}
ExpectProvisioned(ctx, env.Client, cluster, cloudProvider, coreProvisioner, pod)
node := ExpectScheduled(ctx, env.Client, pod)
Expect(node.Labels["karpenter.k8s.azure/zone"]).ToNot(Equal(fmt.Sprintf("%s-1", fake.Region)))
Expect(node.Labels["karpenter.kubernetes.azure/zone"]).ToNot(Equal(fmt.Sprintf("%s-1", fake.Region)))
Expect(node.Labels["node.kubernetes.io/instance-type"]).To(Equal("Standard_D2_v2"))
})
It("should launch smaller instances than optimal if larger instance launch results in Insufficient Capacity Error", func() {
Expand Down Expand Up @@ -1011,20 +1009,15 @@ var _ = Describe("InstanceType Provider", func() {
ExpectApplied(ctx, env.Client, nodePool, nodeClass)
pod := coretest.UnschedulablePod(coretest.PodOptions{})
ExpectProvisioned(ctx, env.Client, cluster, cloudProvider, coreProvisioner, pod)
ExpectScheduled(ctx, env.Client, pod)
node := ExpectScheduled(ctx, env.Client, pod)

Expect(azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1))
vm := azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.CalledWithInput.Pop().VM
Expect(vm.Properties).ToNot(BeNil())
Expect(vm.Properties.HardwareProfile).ToNot(BeNil())
Expect(utils.IsNvidiaEnabledSKU(string(*vm.Properties.HardwareProfile.VMSize))).To(BeFalse())

clusterNodes := cluster.Nodes()
node := clusterNodes[0]
if node.Name() == pod.Spec.NodeName {
nodeLabels := node.Labels()
Expect(nodeLabels).To(HaveKeyWithValue("karpenter.k8s.azure/sku-gpu-count", "0"))
}
Expect(node.Labels).To(HaveKeyWithValue("karpenter.azure.com/sku-gpu-count", "0"))
})

It("should schedule GPU pod on GPU capable node", func() {
Expand Down Expand Up @@ -1054,23 +1047,31 @@ var _ = Describe("InstanceType Provider", func() {
})

ExpectProvisioned(ctx, env.Client, cluster, cloudProvider, coreProvisioner, pod)
ExpectScheduled(ctx, env.Client, pod)

// Verify that the node has the GPU label set that the pod was scheduled on
clusterNodes := cluster.Nodes()
Expect(clusterNodes).ToNot(BeEmpty())
Expect(len(clusterNodes)).To(Equal(1))
node := clusterNodes[0]
Expect(node.Node.Status.Allocatable).To(HaveKeyWithValue(v1.ResourceName("nvidia.com/gpu"), resource.MustParse("1")))
node := ExpectScheduled(ctx, env.Client, pod)

if node.Name() == pod.Spec.NodeName {
nodeLabels := node.Labels()
// the following checks assume Standard_NC16as_T4_v3 (surprisingly the cheapest GPU in the test set), so test the assumption
Expect(node.Labels).To(HaveKeyWithValue("node.kubernetes.io/instance-type", "Standard_NC16as_T4_v3"))

Expect(nodeLabels).To(HaveKeyWithValue("karpenter.k8s.azure/sku-gpu-name", "A100"))
Expect(nodeLabels).To(HaveKeyWithValue("karpenter.k8s.azure/sku-gpu-manufacturer", v1alpha2.ManufacturerNvidia))
Expect(nodeLabels).To(HaveKeyWithValue("karpenter.k8s.azure/sku-gpu-count", "1"))
// Verify GPU related settings in bootstrap (assuming one Standard_NC16as_T4_v3)
customData := ExpectDecodedCustomData(azureEnv)
Expect(customData).To(SatisfyAll(
ContainSubstring("GPU_NODE=true"),
ContainSubstring("SGX_NODE=false"),
ContainSubstring("MIG_NODE=false"),
ContainSubstring("CONFIG_GPU_DRIVER_IF_NEEDED=true"),
ContainSubstring("ENABLE_GPU_DEVICE_PLUGIN_IF_NEEDED=false"),
ContainSubstring("GPU_DRIVER_TYPE=\"cuda\""),
ContainSubstring(fmt.Sprintf("GPU_DRIVER_VERSION=\"%s\"", utils.NvidiaCudaDriverVersion)),
ContainSubstring(fmt.Sprintf("GPU_IMAGE_SHA=\"%s\"", utils.AKSGPUCudaVersionSuffix)),
ContainSubstring("GPU_NEEDS_FABRIC_MANAGER=\"false\""),
ContainSubstring("GPU_INSTANCE_PROFILE=\"\""),
))

}
// Verify that the node the pod was scheduled on has GPU resource and labels set
Expect(node.Status.Allocatable).To(HaveKeyWithValue(v1.ResourceName("nvidia.com/gpu"), resource.MustParse("1")))
Expect(node.Labels).To(HaveKeyWithValue("karpenter.azure.com/sku-gpu-name", "T4"))
Expect(node.Labels).To(HaveKeyWithValue("karpenter.azure.com/sku-gpu-manufacturer", v1alpha2.ManufacturerNvidia))
Expect(node.Labels).To(HaveKeyWithValue("karpenter.azure.com/sku-gpu-count", "1"))
})
})

Expand Down
1 change: 1 addition & 0 deletions pkg/providers/launchtemplate/launchtemplate.go
Original file line number Diff line number Diff line change
Expand Up @@ -152,6 +152,7 @@ func (p *Provider) getStaticParameters(ctx context.Context, instanceType *cloudp
Arch: arch,
GPUNode: utils.IsNvidiaEnabledSKU(instanceType.Name),
GPUDriverVersion: utils.GetGPUDriverVersion(instanceType.Name),
GPUDriverType: utils.GetGPUDriverType(instanceType.Name),
GPUImageSHA: utils.GetAKSGPUImageSHA(instanceType.Name),
TenantID: p.tenantID,
SubscriptionID: p.subscriptionID,
Expand Down
1 change: 1 addition & 0 deletions pkg/providers/launchtemplate/parameters/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ type StaticParameters struct {
Arch string
GPUNode bool
GPUDriverVersion string
GPUDriverType string
GPUImageSHA string
TenantID string
SubscriptionID string
Expand Down
153 changes: 59 additions & 94 deletions pkg/utils/gpu.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,124 +17,81 @@ limitations under the License.
package utils

import (
_ "embed"
"strings"

"gopkg.in/yaml.v2"
)

// TODO: Get these from agentbaker
const (
Nvidia470CudaDriverVersion = "cuda-470.82.01"
Nvidia550CudaDriverVersion = "cuda-550.54.15"
Nvidia535GridDriverVersion = "grid-535.161.08"

// These SHAs will change once we update aks-gpu images in aks-gpu repository. We do that fairly rarely at this time.
// So for now these will be kept here like this and periodically bump them
AKSGPUGridSHA = "sha-d1f0ca"
AKSGPUCudaSHA = "sha-2d4c96"
Nvidia470CudaDriverVersion = "470.82.01"

// https://github.com/Azure/AgentBaker/blob/ddf36a24eafd02ce0589657ff2dc799125f4ad37/parts/linux/cloud-init/artifacts/components.json#L562
NvidiaCudaDriverVersion = "550.90.12"
AKSGPUCudaVersionSuffix = "20241021235610"

NvidiaGridDriverVersion = "535.161.08"
AKSGPUGridVersionSuffix = "20241021235607"
)

func GetAKSGPUImageSHA(size string) string {
if UseGridDrivers(size) {
return AKSGPUGridSHA
}
return AKSGPUCudaSHA
type NvidiaSKUConfig struct {
NvidiaEnabledSKUFamilies map[string][]string `yaml:"nvidiaEnabledSKUs"`
tallaxes marked this conversation as resolved.
Show resolved Hide resolved
MarinerNvidiaEnabledSKUFamilies map[string][]string `yaml:"marinerNvidiaEnabledSKUs"`
}

var (
/* If a new GPU sku becomes available, add a key to this map, but only if you have a confirmation
that we have an agreement with NVIDIA for this specific gpu.
*/
NvidiaEnabledSKUs = map[string]bool{
// M60
"standard_nv6": true,
"standard_nv12": true,
"standard_nv12s_v3": true,
"standard_nv24": true,
"standard_nv24s_v3": true,
"standard_nv24r": true,
"standard_nv48s_v3": true,
// P40
"standard_nd6s": true,
"standard_nd12s": true,
"standard_nd24s": true,
"standard_nd24rs": true,
// P100
"standard_nc6s_v2": true,
"standard_nc12s_v2": true,
"standard_nc24s_v2": true,
"standard_nc24rs_v2": true,
// V100
"standard_nc6s_v3": true,
"standard_nc12s_v3": true,
"standard_nc24s_v3": true,
"standard_nc24rs_v3": true,
"standard_nd40s_v3": true,
"standard_nd40rs_v2": true,
// T4
"standard_nc4as_t4_v3": true,
"standard_nc8as_t4_v3": true,
"standard_nc16as_t4_v3": true,
"standard_nc64as_t4_v3": true,
// A100 40GB
"standard_nd96asr_v4": true,
"standard_nd112asr_a100_v4": true,
"standard_nd120asr_a100_v4": true,
// A100 80GB
"standard_nd96amsr_a100_v4": true,
"standard_nd112amsr_a100_v4": true,
"standard_nd120amsr_a100_v4": true,
// A100 PCIE 80GB
"standard_nc24ads_a100_v4": true,
"standard_nc48ads_a100_v4": true,
"standard_nc96ads_a100_v4": true,
"standard_ncads_a100_v4": true,
// A10
"standard_nc8ads_a10_v4": true,
"standard_nc16ads_a10_v4": true,
"standard_nc32ads_a10_v4": true,
// A10, GRID only
"standard_nv6ads_a10_v5": true,
"standard_nv12ads_a10_v5": true,
"standard_nv18ads_a10_v5": true,
"standard_nv36ads_a10_v5": true,
"standard_nv36adms_a10_v5": true,
"standard_nv72ads_a10_v5": true,
// A100
"standard_nd96ams_v4": true,
"standard_nd96ams_a100_v4": true,
nvidiaEnabledSKUs = make(map[string]bool)
marinerNvidiaEnabledSKUs = make(map[string]bool)
)

//go:embed supported-gpus.yaml
var configFile []byte

// init populates the package-level GPU SKU lookup maps from the embedded
// supported-gpus.yaml at package initialization.
func init() {
readNvidiaSKUConfig()
}

// readNvidiaSKUConfig parses the embedded supported-gpus.yaml (configFile) and
// fills the nvidiaEnabledSKUs and marinerNvidiaEnabledSKUs lookup sets.
//
// SKU names are lowercased on insertion because the lookup helpers
// (IsNvidiaEnabledSKU, IsMarinerEnabledGPUSKU) lowercase the queried VM size
// before indexing the maps; without normalization a mixed-case entry in the
// YAML file would silently never match.
//
// It panics if the embedded YAML cannot be parsed: the file is compiled into
// the binary, so a parse failure is a build-time/programmer error rather than
// a recoverable runtime condition.
func readNvidiaSKUConfig() {
	var cfg NvidiaSKUConfig
	if err := yaml.Unmarshal(configFile, &cfg); err != nil {
		panic(err)
	}

	// register flattens a family -> SKUs mapping into a set keyed by the
	// lowercased SKU name.
	register := func(dst map[string]bool, families map[string][]string) {
		for _, skus := range families {
			for _, sku := range skus {
				dst[strings.ToLower(sku)] = true
			}
		}
	}

	register(nvidiaEnabledSKUs, cfg.NvidiaEnabledSKUFamilies)
	register(marinerNvidiaEnabledSKUs, cfg.MarinerNvidiaEnabledSKUFamilies)
}
tallaxes marked this conversation as resolved.
Show resolved Hide resolved

// List of GPU SKUs currently enabled and validated for Mariner. Will expand the support
// to cover other SKUs available in Azure
MarinerNvidiaEnabledSKUs = map[string]bool{
// V100
"standard_nc6s_v3": true,
"standard_nc12s_v3": true,
"standard_nc24s_v3": true,
"standard_nc24rs_v3": true,
"standard_nd40s_v3": true,
"standard_nd40rs_v2": true,
// T4
"standard_nc4as_t4_v3": true,
"standard_nc8as_t4_v3": true,
"standard_nc16as_t4_v3": true,
"standard_nc64as_t4_v3": true,
func GetAKSGPUImageSHA(size string) string {
if UseGridDrivers(size) {
return AKSGPUGridVersionSuffix
}
)
return AKSGPUCudaVersionSuffix
}

// IsNvidiaEnabledSKU determines if a VM SKU has nvidia driver support
func IsNvidiaEnabledSKU(vmSize string) bool {
// Trim the optional _Promo suffix.
vmSize = strings.ToLower(vmSize)
vmSize = strings.TrimSuffix(vmSize, "_promo")
return NvidiaEnabledSKUs[vmSize]
return nvidiaEnabledSKUs[vmSize]
}

// IsMarinerEnabledGPUSKU determines if a VM SKU has nvidia driver support on Mariner
func IsMarinerEnabledGPUSKU(vmSize string) bool {
// Trim the optional _Promo suffix.
vmSize = strings.ToLower(vmSize)
vmSize = strings.TrimSuffix(vmSize, "_promo")
return MarinerNvidiaEnabledSKUs[vmSize]
return marinerNvidiaEnabledSKUs[vmSize]
}

// NV series GPUs target graphics workloads vs NC which targets compute.
Expand All @@ -143,12 +100,20 @@ func IsMarinerEnabledGPUSKU(vmSize string) bool {
// NVv3 is untested on AKS, NVv4 is AMD so n/a, and NVv2 no longer seems to exist (?).
func GetGPUDriverVersion(size string) string {
if UseGridDrivers(size) {
return Nvidia535GridDriverVersion
return NvidiaGridDriverVersion
}
if isStandardNCv1(size) {
return Nvidia470CudaDriverVersion
tallaxes marked this conversation as resolved.
Show resolved Hide resolved
}
return Nvidia550CudaDriverVersion
return NvidiaCudaDriverVersion
}

// GetGPUDriverType reports which NVIDIA driver flavor applies to the given VM
// SKU: "grid" when UseGridDrivers(size) is true, "cuda" otherwise.
func GetGPUDriverType(size string) string {
	driverType := "cuda"
	if UseGridDrivers(size) {
		driverType = "grid"
	}
	return driverType
}

func isStandardNCv1(size string) bool {
Expand Down
Loading
Loading