Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: GPU bootstrap, refresh driver versions and list of supported GPU VM SKUs #587

Merged
merged 8 commits into from
Dec 1, 2024
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ repos:
hooks:
- id: shellcheck
- repo: https://github.com/crate-ci/typos
rev: v1.26.0
rev: v1.28.1
hooks:
- id: typos
args: [--write-changes, --force-exclude, --exclude, go.mod]
Expand Down
2 changes: 1 addition & 1 deletion pkg/controllers/nodeclass/hash/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,7 @@ func (c *Controller) updateNodeClaimHash(ctx context.Context, nodeClass *v1alpha
v1alpha2.AnnotationAKSNodeClassHashVersion: v1alpha2.AKSNodeClassHashVersion,
})

// Any NodeClaim that is already drifted will remain drifted if the karpenter.k8s.aws/nodepool-hash-version doesn't match
// Any NodeClaim that is already drifted will remain drifted if the karpenter.azure.com/nodepool-hash-version doesn't match
// Since the hashing mechanism has changed we will not be able to determine if the drifted status of the NodeClaim has changed
if nc.StatusConditions().Get(karpv1.ConditionTypeDrifted) == nil {
nc.Annotations = lo.Assign(nc.Annotations, map[string]string{
Expand Down
23 changes: 12 additions & 11 deletions pkg/providers/imagefamily/azlinux.go
Original file line number Diff line number Diff line change
Expand Up @@ -80,17 +80,18 @@ func (u AzureLinux) DefaultImages() []DefaultImageOutput {
func (u AzureLinux) ScriptlessCustomData(kubeletConfig *bootstrap.KubeletConfiguration, taints []v1.Taint, labels map[string]string, caBundle *string, _ *cloudprovider.InstanceType) bootstrap.Bootstrapper {
return bootstrap.AKS{
Options: bootstrap.Options{
ClusterName: u.Options.ClusterName,
ClusterEndpoint: u.Options.ClusterEndpoint,
KubeletConfig: kubeletConfig,
Taints: taints,
Labels: labels,
CABundle: caBundle,
GPUNode: u.Options.GPUNode,
GPUDriverVersion: u.Options.GPUDriverVersion,
// GPUImageSHA: u.Options.GPUImageSHA - GPU image SHA only applies to Ubuntu
tallaxes marked this conversation as resolved.
Show resolved Hide resolved
// See: https://github.com/Azure/AgentBaker/blob/f393d6e4d689d9204d6000c85623ad9b764e2a29/vhdbuilder/packer/install-dependencies.sh#L201
SubnetID: u.Options.SubnetID,
ClusterName: u.Options.ClusterName,
ClusterEndpoint: u.Options.ClusterEndpoint,
KubeletConfig: kubeletConfig,
Taints: taints,
Labels: labels,
CABundle: caBundle,
GPUNode: u.Options.GPUNode,
GPUDriverVersion: u.Options.GPUDriverVersion,
GPUDriverType: u.Options.GPUDriverType,
GPUNeedsFabricManager: u.Options.GPUNeedsFabricManager,
GPUImageSHA: u.Options.GPUImageSHA,
SubnetID: u.Options.SubnetID,
},
Arch: u.Options.Arch,
TenantID: u.Options.TenantID,
Expand Down
3 changes: 3 additions & 0 deletions pkg/providers/imagefamily/bootstrap/aksbootstrap.go
Original file line number Diff line number Diff line change
Expand Up @@ -204,6 +204,7 @@ type NodeBootstrapVariables struct {
SwapFileSizeMB int // t user input
GPUImageSHA string // s static sha rarely updated
GPUDriverVersion string // k determine by OS + GPU hardware requirements; can be determined automatically, but hard. suggest using GPU operator.
GPUDriverType string // k
GPUInstanceProfile string // t user-specified
CustomSearchDomainName string // c user-specified [presumably cluster-level]
CustomSearchRealmUser string // c user-specified [presumably cluster-level]
Expand Down Expand Up @@ -467,7 +468,9 @@ func (a AKS) applyOptions(nbv *NodeBootstrapVariables) {
nbv.GPUNode = true
nbv.ConfigGPUDriverIfNeeded = true
nbv.GPUDriverVersion = a.GPUDriverVersion
nbv.GPUDriverType = a.GPUDriverType
nbv.GPUImageSHA = a.GPUImageSHA
nbv.GPUNeedsFabricManager = a.GPUNeedsFabricManager
}

// merge and stringify labels
Expand Down
22 changes: 12 additions & 10 deletions pkg/providers/imagefamily/bootstrap/bootstrap.go
Original file line number Diff line number Diff line change
Expand Up @@ -44,16 +44,18 @@ type KubeletConfiguration struct {

// Options is the node bootstrapping parameters passed from Karpenter to the provisioning node
type Options struct {
ClusterName string
ClusterEndpoint string
KubeletConfig *KubeletConfiguration
Taints []core.Taint `hash:"set"`
Labels map[string]string `hash:"set"`
CABundle *string
GPUNode bool
GPUDriverVersion string
GPUImageSHA string
SubnetID string
ClusterName string
ClusterEndpoint string
KubeletConfig *KubeletConfiguration
Taints []core.Taint `hash:"set"`
Labels map[string]string `hash:"set"`
CABundle *string
GPUNode bool
GPUDriverVersion string
GPUDriverType string
GPUImageSHA string
GPUNeedsFabricManager bool
SubnetID string
}

// Bootstrapper can be implemented to generate a bootstrap script
Expand Down
1 change: 1 addition & 0 deletions pkg/providers/imagefamily/bootstrap/cse_cmd.sh.gtpl
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,7 @@ KUBELET_CONFIG_FILE_CONTENT="{{.KubeletConfigFileContent}}"
SWAP_FILE_SIZE_MB="{{.SwapFileSizeMB}}"
GPU_IMAGE_SHA="{{.GPUImageSHA}}"
GPU_DRIVER_VERSION="{{.GPUDriverVersion}}"
GPU_DRIVER_TYPE="{{.GPUDriverType}}"
GPU_INSTANCE_PROFILE="{{.GPUInstanceProfile}}"
CUSTOM_SEARCH_DOMAIN_NAME="{{.CustomSearchDomainName}}"
CUSTOM_SEARCH_REALM_USER="{{.CustomSearchRealmUser}}"
Expand Down
22 changes: 12 additions & 10 deletions pkg/providers/imagefamily/ubuntu_2204.go
Original file line number Diff line number Diff line change
Expand Up @@ -80,16 +80,18 @@ func (u Ubuntu2204) DefaultImages() []DefaultImageOutput {
func (u Ubuntu2204) ScriptlessCustomData(kubeletConfig *bootstrap.KubeletConfiguration, taints []v1.Taint, labels map[string]string, caBundle *string, _ *cloudprovider.InstanceType) bootstrap.Bootstrapper {
return bootstrap.AKS{
Options: bootstrap.Options{
ClusterName: u.Options.ClusterName,
ClusterEndpoint: u.Options.ClusterEndpoint,
KubeletConfig: kubeletConfig,
Taints: taints,
Labels: labels,
CABundle: caBundle,
GPUNode: u.Options.GPUNode,
GPUDriverVersion: u.Options.GPUDriverVersion,
GPUImageSHA: u.Options.GPUImageSHA,
SubnetID: u.Options.SubnetID,
ClusterName: u.Options.ClusterName,
ClusterEndpoint: u.Options.ClusterEndpoint,
KubeletConfig: kubeletConfig,
Taints: taints,
Labels: labels,
CABundle: caBundle,
GPUNode: u.Options.GPUNode,
GPUDriverVersion: u.Options.GPUDriverVersion,
GPUImageSHA: u.Options.GPUImageSHA,
GPUDriverType: u.Options.GPUDriverType,
GPUNeedsFabricManager: u.Options.GPUNeedsFabricManager,
SubnetID: u.Options.SubnetID,
},
Arch: u.Options.Arch,
TenantID: u.Options.TenantID,
Expand Down
51 changes: 26 additions & 25 deletions pkg/providers/instancetype/suite_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -604,12 +604,10 @@ var _ = Describe("InstanceType Provider", func() {
nodes := &v1.NodeList{}
Expect(env.Client.List(ctx, nodes)).To(Succeed())
for _, node := range nodes.Items {
Expect(node.Labels["karpenter.k8s.azure/zone"]).ToNot(Equal(fmt.Sprintf("%s-1", fake.Region)))
Expect(node.Labels["karpenter.kubernetes.azure/zone"]).ToNot(Equal(fmt.Sprintf("%s-1", fake.Region)))
Expect(node.Labels["node.kubernetes.io/instance-type"]).To(Equal("Standard_D2_v2"))

}
}

})

DescribeTable("Should not return unavailable offerings", func(azEnv *test.Environment) {
Expand Down Expand Up @@ -655,7 +653,7 @@ var _ = Describe("InstanceType Provider", func() {
}}}
ExpectProvisioned(ctx, env.Client, cluster, cloudProvider, coreProvisioner, pod)
node := ExpectScheduled(ctx, env.Client, pod)
Expect(node.Labels["karpenter.k8s.azure/zone"]).ToNot(Equal(fmt.Sprintf("%s-1", fake.Region)))
Expect(node.Labels["karpenter.kubernetes.azure/zone"]).ToNot(Equal(fmt.Sprintf("%s-1", fake.Region)))
Expect(node.Labels["node.kubernetes.io/instance-type"]).To(Equal("Standard_D2_v2"))
})
It("should launch smaller instances than optimal if larger instance launch results in Insufficient Capacity Error", func() {
Expand Down Expand Up @@ -1011,20 +1009,15 @@ var _ = Describe("InstanceType Provider", func() {
ExpectApplied(ctx, env.Client, nodePool, nodeClass)
pod := coretest.UnschedulablePod(coretest.PodOptions{})
ExpectProvisioned(ctx, env.Client, cluster, cloudProvider, coreProvisioner, pod)
ExpectScheduled(ctx, env.Client, pod)
node := ExpectScheduled(ctx, env.Client, pod)

Expect(azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.CalledWithInput.Len()).To(Equal(1))
vm := azureEnv.VirtualMachinesAPI.VirtualMachineCreateOrUpdateBehavior.CalledWithInput.Pop().VM
Expect(vm.Properties).ToNot(BeNil())
Expect(vm.Properties.HardwareProfile).ToNot(BeNil())
Expect(utils.IsNvidiaEnabledSKU(string(*vm.Properties.HardwareProfile.VMSize))).To(BeFalse())

clusterNodes := cluster.Nodes()
node := clusterNodes[0]
if node.Name() == pod.Spec.NodeName {
nodeLabels := node.Labels()
Expect(nodeLabels).To(HaveKeyWithValue("karpenter.k8s.azure/sku-gpu-count", "0"))
}
Expect(node.Labels).To(HaveKeyWithValue("karpenter.azure.com/sku-gpu-count", "0"))
})

It("should schedule GPU pod on GPU capable node", func() {
Expand Down Expand Up @@ -1054,23 +1047,31 @@ var _ = Describe("InstanceType Provider", func() {
})

ExpectProvisioned(ctx, env.Client, cluster, cloudProvider, coreProvisioner, pod)
ExpectScheduled(ctx, env.Client, pod)

// Verify that the node has the GPU label set that the pod was scheduled on
clusterNodes := cluster.Nodes()
Expect(clusterNodes).ToNot(BeEmpty())
Expect(len(clusterNodes)).To(Equal(1))
node := clusterNodes[0]
Expect(node.Node.Status.Allocatable).To(HaveKeyWithValue(v1.ResourceName("nvidia.com/gpu"), resource.MustParse("1")))
node := ExpectScheduled(ctx, env.Client, pod)

if node.Name() == pod.Spec.NodeName {
nodeLabels := node.Labels()
// the following checks assume Standard_NC16as_T4_v3 (surprisingly the cheapest GPU in the test set), so test the assumption
Expect(node.Labels).To(HaveKeyWithValue("node.kubernetes.io/instance-type", "Standard_NC16as_T4_v3"))

Expect(nodeLabels).To(HaveKeyWithValue("karpenter.k8s.azure/sku-gpu-name", "A100"))
Expect(nodeLabels).To(HaveKeyWithValue("karpenter.k8s.azure/sku-gpu-manufacturer", v1alpha2.ManufacturerNvidia))
Expect(nodeLabels).To(HaveKeyWithValue("karpenter.k8s.azure/sku-gpu-count", "1"))
// Verify GPU related settings in bootstrap (assuming one Standard_NC16as_T4_v3)
customData := ExpectDecodedCustomData(azureEnv)
Expect(customData).To(SatisfyAll(
ContainSubstring("GPU_NODE=true"),
ContainSubstring("SGX_NODE=false"),
ContainSubstring("MIG_NODE=false"),
ContainSubstring("CONFIG_GPU_DRIVER_IF_NEEDED=true"),
ContainSubstring("ENABLE_GPU_DEVICE_PLUGIN_IF_NEEDED=false"),
ContainSubstring("GPU_DRIVER_TYPE=\"cuda\""),
ContainSubstring(fmt.Sprintf("GPU_DRIVER_VERSION=\"%s\"", utils.NvidiaCudaDriverVersion)),
ContainSubstring(fmt.Sprintf("GPU_IMAGE_SHA=\"%s\"", utils.AKSGPUCudaVersionSuffix)),
ContainSubstring("GPU_NEEDS_FABRIC_MANAGER=\"false\""),
ContainSubstring("GPU_INSTANCE_PROFILE=\"\""),
))

}
// Verify that the node the pod was scheduled on has GPU resource and labels set
Expect(node.Status.Allocatable).To(HaveKeyWithValue(v1.ResourceName("nvidia.com/gpu"), resource.MustParse("1")))
Expect(node.Labels).To(HaveKeyWithValue("karpenter.azure.com/sku-gpu-name", "T4"))
Expect(node.Labels).To(HaveKeyWithValue("karpenter.azure.com/sku-gpu-manufacturer", v1alpha2.ManufacturerNvidia))
Expect(node.Labels).To(HaveKeyWithValue("karpenter.azure.com/sku-gpu-count", "1"))
})
})

Expand Down
2 changes: 2 additions & 0 deletions pkg/providers/launchtemplate/launchtemplate.go
Original file line number Diff line number Diff line change
Expand Up @@ -152,7 +152,9 @@ func (p *Provider) getStaticParameters(ctx context.Context, instanceType *cloudp
Arch: arch,
GPUNode: utils.IsNvidiaEnabledSKU(instanceType.Name),
GPUDriverVersion: utils.GetGPUDriverVersion(instanceType.Name),
GPUDriverType: utils.GetGPUDriverType(instanceType.Name),
GPUImageSHA: utils.GetAKSGPUImageSHA(instanceType.Name),
GPUNeedsFabricManager: utils.GPUNeedsFabricManager(instanceType.Name),
TenantID: p.tenantID,
SubscriptionID: p.subscriptionID,
KubeletIdentityClientID: p.kubeletIdentityClientID,
Expand Down
2 changes: 2 additions & 0 deletions pkg/providers/launchtemplate/parameters/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,9 @@ type StaticParameters struct {
Arch string
GPUNode bool
GPUDriverVersion string
GPUDriverType string
GPUImageSHA string
GPUNeedsFabricManager bool
TenantID string
SubscriptionID string
KubeletIdentityClientID string
Expand Down
74 changes: 62 additions & 12 deletions pkg/utils/gpu.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,21 +22,21 @@ import (

// TODO: Get these from agentbaker
const (
Nvidia470CudaDriverVersion = "cuda-470.82.01"
Nvidia550CudaDriverVersion = "cuda-550.54.15"
Nvidia535GridDriverVersion = "grid-535.161.08"

// These SHAs will change once we update aks-gpu images in aks-gpu repository. We do that fairly rarely at this time.
// So for now these will be kept here like this and periodically bump them
AKSGPUGridSHA = "sha-d1f0ca"
AKSGPUCudaSHA = "sha-2d4c96"
Nvidia470CudaDriverVersion = "470.82.01"

// https://github.com/Azure/AgentBaker/blob/ddf36a24eafd02ce0589657ff2dc799125f4ad37/parts/linux/cloud-init/artifacts/components.json#L562
NvidiaCudaDriverVersion = "550.90.12"
AKSGPUCudaVersionSuffix = "20241021235610"

NvidiaGridDriverVersion = "535.161.08"
AKSGPUGridVersionSuffix = "20241021235607"
)

func GetAKSGPUImageSHA(size string) string {
if UseGridDrivers(size) {
return AKSGPUGridSHA
return AKSGPUGridVersionSuffix
}
return AKSGPUCudaSHA
return AKSGPUCudaVersionSuffix
}

var (
Expand Down Expand Up @@ -143,12 +143,20 @@ func IsMarinerEnabledGPUSKU(vmSize string) bool {
// NVv3 is untested on AKS, NVv4 is AMD so n/a, and NVv2 no longer seems to exist (?).
func GetGPUDriverVersion(size string) string {
if UseGridDrivers(size) {
return Nvidia535GridDriverVersion
return NvidiaGridDriverVersion
}
if isStandardNCv1(size) {
return Nvidia470CudaDriverVersion
tallaxes marked this conversation as resolved.
Show resolved Hide resolved
}
return Nvidia550CudaDriverVersion
return NvidiaCudaDriverVersion
}

// GetGPUDriverType reports the GPU driver flavor for the given VM SKU:
// "grid" when UseGridDrivers selects grid drivers for the size, and "cuda"
// for every other size.
func GetGPUDriverType(size string) string {
	driverType := "cuda"
	if UseGridDrivers(size) {
		driverType = "grid"
	}
	return driverType
}

func isStandardNCv1(size string) bool {
Expand All @@ -160,6 +168,11 @@ func UseGridDrivers(size string) bool {
return ConvergedGPUDriverSizes[strings.ToLower(size)]
}

// GPUNeedsFabricManager reports whether the given VM SKU is marked true in
// FabricManagerGPUSizes. The size is lower-cased before the lookup, so the
// match is case-insensitive; sizes missing from the table yield false.
func GPUNeedsFabricManager(size string) bool {
	needsFabricManager, listed := FabricManagerGPUSizes[strings.ToLower(size)]
	return listed && needsFabricManager
}

/* ConvergedGPUDriverSizes: these sizes use a "converged" driver to support both CUDA and GRID workloads.
How do you figure this out? Ask HPC or find out by trial and error.
Installing vanilla CUDA drivers will fail with opaque errors.
Expand All @@ -177,3 +190,40 @@ var ConvergedGPUDriverSizes = map[string]bool{
"standard_nc16ads_a10_v4": true,
"standard_nc32ads_a10_v4": true,
}

// FabricManagerGPUSizes maps lower-case VM size names to whether the size
// needs NVIDIA fabric manager.
//
// Fabric manager trains NVLink connections between multi-instance GPUs, and
// it appears to be necessary only on systems with *multiple cards*.
// For example, an A100 can be partitioned a maximum of 7 ways:
//   - an NC24ads_A100_v4 has one A100;
//   - an ND96asr_v4 has eight A100s, for a maximum of 56 partitions.
//
// ND96 seems to require fabric manager *even when not using MIG partitions*,
// while it fails to install on NC24.
//
//nolint:gochecknoglobals
var FabricManagerGPUSizes = map[string]bool{
	// A100 sizes.
	"standard_nd96asr_v4":        true,
	"standard_nd112asr_a100_v4":  true,
	"standard_nd120asr_a100_v4":  true,
	"standard_nd96amsr_a100_v4":  true,
	"standard_nd112amsr_a100_v4": true,
	"standard_nd120amsr_a100_v4": true,
	// TODO(ace): one of the next two entries is probably a duplicate;
	// confirm with HPC/SKU owners.
	"standard_nd96ams_a100_v4": true,
	"standard_nd96ams_v4":      true,
	// H100 sizes.
	"standard_nd46s_h100_v5":    true,
	"standard_nd48s_h100_v5":    true,
	"standard_nd50s_h100_v5":    true,
	"standard_nd92is_h100_v5":   true,
	"standard_nd96is_h100_v5":   true,
	"standard_nd100is_h100_v5":  true,
	"standard_nd92isr_h100_v5":  true,
	"standard_nd96isr_h100_v5":  true,
	"standard_nd100isr_h100_v5": true,
	// A100 oddballs: listed explicitly as false because fabricmanager
	// fails to start on NCads_v4.
	"standard_nc24ads_a100_v4": false,
	"standard_nc48ads_a100_v4": false,
	"standard_nc96ads_a100_v4": false,
}
Loading
Loading