Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: remove confidential VM SKUs (DC,EC) from consideration #23

Merged
merged 7 commits into from
Nov 14, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion hack/codegen.sh
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ skugen() {
SUBJECT="SKUGEN"
NO_UPDATE=$' pkg/fake/zz_generated.sku.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)'

go run hack/code/instancetype_testdata_gen.go -- "${GENERATED_FILE}" "eastus" "Standard_B1s,Standard_A0,Standard_D2_v2,Standard_D2_v3,Standard_DS2_v2,Standard_D2s_v3,Standard_D2_v5,Standard_F16s_v2,Standard_NC24ads_A100_v4,Standard_M8-2ms,Standard_D4s_v3,Standard_D64s_v3"
go run hack/code/instancetype_testdata_gen.go -- "${GENERATED_FILE}" "eastus" "Standard_B1s,Standard_A0,Standard_D2_v2,Standard_D2_v3,Standard_DS2_v2,Standard_D2s_v3,Standard_D2_v5,Standard_F16s_v2,Standard_NC24ads_A100_v4,Standard_M8-2ms,Standard_D4s_v3,Standard_D64s_v3,Standard_DC8s_v3"

GIT_DIFF=$(git diff --stat "${GENERATED_FILE}")
checkForUpdates "${GIT_DIFF}" "${NO_UPDATE}" "${SUBJECT} beside timestamps since last update" "${GENERATED_FILE}"
Expand Down
45 changes: 44 additions & 1 deletion pkg/fake/zz_generated.sku.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ import (
"github.com/Azure/azure-sdk-for-go/services/compute/mgmt/2022-08-01/compute"
)

// generated at 2023-11-04T06:56:34Z
// generated at 2023-11-10T01:03:33Z

// ResourceSkus is a list of all available skus from azure
var ResourceSkus = []compute.ResourceSku{
Expand Down Expand Up @@ -386,6 +386,49 @@ var ResourceSkus = []compute.ResourceSku{
},
},
},
{
Name: lo.ToPtr("Standard_DC8s_v3"),
Tier: lo.ToPtr("Standard"),
Kind: lo.ToPtr(""),
Size: lo.ToPtr("DC8s_v3"),
Family: lo.ToPtr("standardDCSv3Family"),
ResourceType: lo.ToPtr("virtualMachines"),
APIVersions: &[]string{},
Costs: &[]compute.ResourceSkuCosts{},
Restrictions: &[]compute.ResourceSkuRestrictions{},
Capabilities: &[]compute.ResourceSkuCapabilities{
{Name: lo.ToPtr("MaxResourceVolumeMB"), Value: lo.ToPtr("0")},
{Name: lo.ToPtr("OSVhdSizeMB"), Value: lo.ToPtr("1047552")},
{Name: lo.ToPtr("vCPUs"), Value: lo.ToPtr("8")},
{Name: lo.ToPtr("MemoryPreservingMaintenanceSupported"), Value: lo.ToPtr("True")},
{Name: lo.ToPtr("HyperVGenerations"), Value: lo.ToPtr("V2")},
{Name: lo.ToPtr("MemoryGB"), Value: lo.ToPtr("64")},
{Name: lo.ToPtr("MaxDataDiskCount"), Value: lo.ToPtr("32")},
{Name: lo.ToPtr("CpuArchitectureType"), Value: lo.ToPtr("x64")},
{Name: lo.ToPtr("LowPriorityCapable"), Value: lo.ToPtr("True")},
{Name: lo.ToPtr("PremiumIO"), Value: lo.ToPtr("True")},
{Name: lo.ToPtr("VMDeploymentTypes"), Value: lo.ToPtr("IaaS")},
{Name: lo.ToPtr("vCPUsAvailable"), Value: lo.ToPtr("8")},
{Name: lo.ToPtr("vCPUsPerCore"), Value: lo.ToPtr("1")},
{Name: lo.ToPtr("CombinedTempDiskAndCachedIOPS"), Value: lo.ToPtr("77000")},
{Name: lo.ToPtr("CombinedTempDiskAndCachedReadBytesPerSecond"), Value: lo.ToPtr("77000000000")},
{Name: lo.ToPtr("CombinedTempDiskAndCachedWriteBytesPerSecond"), Value: lo.ToPtr("77000000000")},
{Name: lo.ToPtr("UncachedDiskIOPS"), Value: lo.ToPtr("25600")},
{Name: lo.ToPtr("UncachedDiskBytesPerSecond"), Value: lo.ToPtr("384000000")},
{Name: lo.ToPtr("EphemeralOSDiskSupported"), Value: lo.ToPtr("False")},
{Name: lo.ToPtr("EncryptionAtHostSupported"), Value: lo.ToPtr("True")},
{Name: lo.ToPtr("CapacityReservationSupported"), Value: lo.ToPtr("False")},
{Name: lo.ToPtr("AcceleratedNetworkingEnabled"), Value: lo.ToPtr("True")},
{Name: lo.ToPtr("RdmaEnabled"), Value: lo.ToPtr("False")},
{Name: lo.ToPtr("MaxNetworkInterfaces"), Value: lo.ToPtr("8")},
},
Locations: &[]string{"eastus"},
LocationInfo: &[]compute.ResourceSkuLocationInfo{{Location: lo.ToPtr("eastus"), Zones: &[]string{
"1",
},
},
},
},
{
Name: lo.ToPtr("Standard_DS2_v2"),
Tier: lo.ToPtr("Standard"),
Expand Down
70 changes: 43 additions & 27 deletions pkg/providers/instancetype/instancetypes.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ import (
"fmt"
"math"
"net/http"
"strings"
"sync"
"time"

Expand Down Expand Up @@ -88,9 +89,6 @@ func (p *Provider) List(
logging.FromContext(ctx).Errorf("parsing VM size %s, %v", *sku.Size, err)
continue
}
if !p.isSupportedSize(vmsize) {
continue
}
architecture, err := sku.GetCPUArchitectureType()
if err != nil {
logging.FromContext(ctx).Errorf("parsing SKU architecture %s, %v", *sku.Size, err)
Expand Down Expand Up @@ -152,7 +150,13 @@ func (p *Provider) getInstanceTypes(ctx context.Context) (map[string]*skewer.SKU
skus := cache.List(ctx, skewer.ResourceTypeFilter(skewer.VirtualMachines))
logging.FromContext(ctx).Debugf("Discovered %d SKUs", len(skus))
for i := range skus {
if p.isSupported(&skus[i]) {
vmsize, err := skus[i].GetVMSize()
if err != nil {
logging.FromContext(ctx).Errorf("parsing VM size %s, %v", *skus[i].Size, err)
continue
}

if p.isSupported(&skus[i], vmsize) {
instanceTypes[skus[i].GetName()] = &skus[i]
}
}
Expand All @@ -163,36 +167,48 @@ func (p *Provider) getInstanceTypes(ctx context.Context) (map[string]*skewer.SKU
}

// isSupported indicates SKU is supported by AKS, based on SKU properties
func (p *Provider) isSupported(sku *skewer.SKU) bool {
name := lo.FromPtr(sku.Name)
func (p *Provider) isSupported(sku *skewer.SKU, vmsize *skewer.VMSizeType) bool {
return p.hasMinimumCPU(sku) &&
p.hasMinimumMemory(sku) &&
!p.isUnsupportedByAKS(sku) &&
!p.isUnsupportedGPU(sku) &&
!p.hasConstrainedCPUs(vmsize) &&
!p.isConfidential(sku)
}

// less than 2 cpus
if cpu, err := sku.VCPU(); err == nil && cpu < 2 {
return false
}
// at least 2 cpus
func (p *Provider) hasMinimumCPU(sku *skewer.SKU) bool {
cpu, err := sku.VCPU()
return err == nil && cpu >= 2
}

// less then 3.5 GiB of memory
if memGiB, err := sku.Memory(); err == nil && memGiB < 3.5 {
return false
}
// at least 3.5 GiB of memory
func (p *Provider) hasMinimumMemory(sku *skewer.SKU) bool {
memGiB, err := sku.Memory()
return err == nil && memGiB >= 3.5
}

// filter out instances AKS does not support
if RestrictedVMSizes.Has(sku.GetName()) {
return false
}
// instances AKS does not support
func (p *Provider) isUnsupportedByAKS(sku *skewer.SKU) bool {
return RestrictedVMSizes.Has(sku.GetName())
}

// filter out GPU SKUs AKS does not support
if gpu, err := sku.GPU(); err == nil && gpu > 0 && !(utils.IsNvidiaEnabledSKU(name) || utils.IsMarinerEnabledGPUSKU(name)) {
return false
}
// GPU SKUs AKS does not support
func (p *Provider) isUnsupportedGPU(sku *skewer.SKU) bool {
name := lo.FromPtr(sku.Name)
gpu, err := sku.GPU()
return err == nil && gpu > 0 && !(utils.IsNvidiaEnabledSKU(name) || utils.IsMarinerEnabledGPUSKU(name))
}

return true
// SKU with constrained CPUs
func (p *Provider) hasConstrainedCPUs(vmsize *skewer.VMSizeType) bool {
return vmsize.CpusConstrained != nil
}

// isSupportedSize indicates VM size is supported by AKS, based on additional SKU properties
func (p *Provider) isSupportedSize(size *skewer.VMSizeType) bool {
// any SKU with constrained CPUs
return size.CpusConstrained == nil
// confidential VMs (DC, EC) are not yet supported by this Karpenter provider
func (p *Provider) isConfidential(sku *skewer.SKU) bool {
size := sku.GetSize()
return strings.HasPrefix(size, "DC") || strings.HasPrefix(size, "EC")
}

// MaxEphemeralOSDiskSizeGB returns the maximum ephemeral OS disk size for a given SKU.
Expand Down
34 changes: 22 additions & 12 deletions pkg/providers/instancetype/suite_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -131,14 +131,30 @@ var _ = Describe("InstanceType Provider", func() {
})

Context("Filtering in InstanceType Provider List", func() {
It("should filter out skus that are explicitly marked as restricted", func() {
instanceTypes, err := azureEnv.InstanceTypesProvider.List(ctx, &corev1beta1.KubeletConfiguration{}, nodeClass)
Expect(err).NotTo(HaveOccurred())
for _, instanceType := range instanceTypes {
// We should not see any instance types in the restricted list
Expect(instancetype.RestrictedVMSizes.Has(instanceType.Name)).To(BeFalse())
var instanceTypes corecloudprovider.InstanceTypes
var err error
getName := func(instanceType *corecloudprovider.InstanceType) string { return instanceType.Name }

BeforeEach(func() {
instanceTypes, err = azureEnv.InstanceTypesProvider.List(ctx, &corev1beta1.KubeletConfiguration{}, nodeClass)
Expect(err).ToNot(HaveOccurred())
})

It("should not include SKUs marked as restricted", func() {
isRestricted := func(instanceType *corecloudprovider.InstanceType) bool {
return instancetype.RestrictedVMSizes.Has(instanceType.Name)
}
Expect(instanceTypes).ShouldNot(ContainElement(WithTransform(isRestricted, Equal(true))))
Expect(instanceTypes).ShouldNot(ContainElement(WithTransform(isRestricted, Equal(true))))
})
It("should not include SKUs with constrained CPUs, but include unconstrained ones", func() {
Expect(instanceTypes).ShouldNot(ContainElement(WithTransform(getName, Equal("Standard_M8-2ms"))))
Expect(instanceTypes).Should(ContainElement(WithTransform(getName, Equal("Standard_D2_v2"))))
})
It("should not include confidential SKUs", func() {
Expect(instanceTypes).ShouldNot(ContainElement(WithTransform(getName, Equal("Standard_DC8s_v3"))))
})

})

Context("Ephemeral Disk", func() {
Expand Down Expand Up @@ -564,12 +580,6 @@ var _ = Describe("InstanceType Provider", func() {
Expect(ok).To(BeTrue(), "Expected nvidia.com/gpu to be present in capacity, and be zero")
Expect(gpuQuanityNonGPU.Value()).To(Equal(int64(0)))
})

It("should not include SKUs with constrained CPUs, but include unconstrained ones", func() {
getName := func(instanceType *corecloudprovider.InstanceType) string { return instanceType.Name }
Expect(instanceTypes).ShouldNot(ContainElement(WithTransform(getName, Equal("Standard_M8-2ms"))))
Expect(instanceTypes).Should(ContainElement(WithTransform(getName, Equal("Standard_D2_v2"))))
})
})

Context("Instance Types", func() {
Expand Down