Skip to content

Commit

Permalink
fix: remove confidential VM SKUs (DC,EC) from consideration (#23)
Browse files Browse the repository at this point in the history
  • Loading branch information
tallaxes authored Nov 14, 2023
1 parent 057ac97 commit 61d82d3
Show file tree
Hide file tree
Showing 4 changed files with 110 additions and 41 deletions.
2 changes: 1 addition & 1 deletion hack/codegen.sh
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ skugen() {
SUBJECT="SKUGEN"
NO_UPDATE=$' pkg/fake/zz_generated.sku.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)'

go run hack/code/instancetype_testdata_gen.go -- "${GENERATED_FILE}" "eastus" "Standard_B1s,Standard_A0,Standard_D2_v2,Standard_D2_v3,Standard_DS2_v2,Standard_D2s_v3,Standard_D2_v5,Standard_F16s_v2,Standard_NC24ads_A100_v4,Standard_M8-2ms,Standard_D4s_v3,Standard_D64s_v3"
go run hack/code/instancetype_testdata_gen.go -- "${GENERATED_FILE}" "eastus" "Standard_B1s,Standard_A0,Standard_D2_v2,Standard_D2_v3,Standard_DS2_v2,Standard_D2s_v3,Standard_D2_v5,Standard_F16s_v2,Standard_NC24ads_A100_v4,Standard_M8-2ms,Standard_D4s_v3,Standard_D64s_v3,Standard_DC8s_v3"

GIT_DIFF=$(git diff --stat "${GENERATED_FILE}")
checkForUpdates "${GIT_DIFF}" "${NO_UPDATE}" "${SUBJECT} beside timestamps since last update" "${GENERATED_FILE}"
Expand Down
45 changes: 44 additions & 1 deletion pkg/fake/zz_generated.sku.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ import (
"github.com/Azure/azure-sdk-for-go/services/compute/mgmt/2022-08-01/compute"
)

// generated at 2023-11-04T06:56:34Z
// generated at 2023-11-10T01:03:33Z

// ResourceSkus is a list of all available skus from azure
var ResourceSkus = []compute.ResourceSku{
Expand Down Expand Up @@ -386,6 +386,49 @@ var ResourceSkus = []compute.ResourceSku{
},
},
},
{
Name: lo.ToPtr("Standard_DC8s_v3"),
Tier: lo.ToPtr("Standard"),
Kind: lo.ToPtr(""),
Size: lo.ToPtr("DC8s_v3"),
Family: lo.ToPtr("standardDCSv3Family"),
ResourceType: lo.ToPtr("virtualMachines"),
APIVersions: &[]string{},
Costs: &[]compute.ResourceSkuCosts{},
Restrictions: &[]compute.ResourceSkuRestrictions{},
Capabilities: &[]compute.ResourceSkuCapabilities{
{Name: lo.ToPtr("MaxResourceVolumeMB"), Value: lo.ToPtr("0")},
{Name: lo.ToPtr("OSVhdSizeMB"), Value: lo.ToPtr("1047552")},
{Name: lo.ToPtr("vCPUs"), Value: lo.ToPtr("8")},
{Name: lo.ToPtr("MemoryPreservingMaintenanceSupported"), Value: lo.ToPtr("True")},
{Name: lo.ToPtr("HyperVGenerations"), Value: lo.ToPtr("V2")},
{Name: lo.ToPtr("MemoryGB"), Value: lo.ToPtr("64")},
{Name: lo.ToPtr("MaxDataDiskCount"), Value: lo.ToPtr("32")},
{Name: lo.ToPtr("CpuArchitectureType"), Value: lo.ToPtr("x64")},
{Name: lo.ToPtr("LowPriorityCapable"), Value: lo.ToPtr("True")},
{Name: lo.ToPtr("PremiumIO"), Value: lo.ToPtr("True")},
{Name: lo.ToPtr("VMDeploymentTypes"), Value: lo.ToPtr("IaaS")},
{Name: lo.ToPtr("vCPUsAvailable"), Value: lo.ToPtr("8")},
{Name: lo.ToPtr("vCPUsPerCore"), Value: lo.ToPtr("1")},
{Name: lo.ToPtr("CombinedTempDiskAndCachedIOPS"), Value: lo.ToPtr("77000")},
{Name: lo.ToPtr("CombinedTempDiskAndCachedReadBytesPerSecond"), Value: lo.ToPtr("77000000000")},
{Name: lo.ToPtr("CombinedTempDiskAndCachedWriteBytesPerSecond"), Value: lo.ToPtr("77000000000")},
{Name: lo.ToPtr("UncachedDiskIOPS"), Value: lo.ToPtr("25600")},
{Name: lo.ToPtr("UncachedDiskBytesPerSecond"), Value: lo.ToPtr("384000000")},
{Name: lo.ToPtr("EphemeralOSDiskSupported"), Value: lo.ToPtr("False")},
{Name: lo.ToPtr("EncryptionAtHostSupported"), Value: lo.ToPtr("True")},
{Name: lo.ToPtr("CapacityReservationSupported"), Value: lo.ToPtr("False")},
{Name: lo.ToPtr("AcceleratedNetworkingEnabled"), Value: lo.ToPtr("True")},
{Name: lo.ToPtr("RdmaEnabled"), Value: lo.ToPtr("False")},
{Name: lo.ToPtr("MaxNetworkInterfaces"), Value: lo.ToPtr("8")},
},
Locations: &[]string{"eastus"},
LocationInfo: &[]compute.ResourceSkuLocationInfo{{Location: lo.ToPtr("eastus"), Zones: &[]string{
"1",
},
},
},
},
{
Name: lo.ToPtr("Standard_DS2_v2"),
Tier: lo.ToPtr("Standard"),
Expand Down
70 changes: 43 additions & 27 deletions pkg/providers/instancetype/instancetypes.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ import (
"fmt"
"math"
"net/http"
"strings"
"sync"
"time"

Expand Down Expand Up @@ -88,9 +89,6 @@ func (p *Provider) List(
logging.FromContext(ctx).Errorf("parsing VM size %s, %v", *sku.Size, err)
continue
}
if !p.isSupportedSize(vmsize) {
continue
}
architecture, err := sku.GetCPUArchitectureType()
if err != nil {
logging.FromContext(ctx).Errorf("parsing SKU architecture %s, %v", *sku.Size, err)
Expand Down Expand Up @@ -152,7 +150,13 @@ func (p *Provider) getInstanceTypes(ctx context.Context) (map[string]*skewer.SKU
skus := cache.List(ctx, skewer.ResourceTypeFilter(skewer.VirtualMachines))
logging.FromContext(ctx).Debugf("Discovered %d SKUs", len(skus))
for i := range skus {
if p.isSupported(&skus[i]) {
vmsize, err := skus[i].GetVMSize()
if err != nil {
logging.FromContext(ctx).Errorf("parsing VM size %s, %v", *skus[i].Size, err)
continue
}

if p.isSupported(&skus[i], vmsize) {
instanceTypes[skus[i].GetName()] = &skus[i]
}
}
Expand All @@ -163,36 +167,48 @@ func (p *Provider) getInstanceTypes(ctx context.Context) (map[string]*skewer.SKU
}

// isSupported indicates SKU is supported by AKS, based on SKU properties
func (p *Provider) isSupported(sku *skewer.SKU) bool {
name := lo.FromPtr(sku.Name)
func (p *Provider) isSupported(sku *skewer.SKU, vmsize *skewer.VMSizeType) bool {
return p.hasMinimumCPU(sku) &&
p.hasMinimumMemory(sku) &&
!p.isUnsupportedByAKS(sku) &&
!p.isUnsupportedGPU(sku) &&
!p.hasConstrainedCPUs(vmsize) &&
!p.isConfidential(sku)
}

// less than 2 cpus
if cpu, err := sku.VCPU(); err == nil && cpu < 2 {
return false
}
// at least 2 cpus
func (p *Provider) hasMinimumCPU(sku *skewer.SKU) bool {
cpu, err := sku.VCPU()
return err == nil && cpu >= 2
}

// less then 3.5 GiB of memory
if memGiB, err := sku.Memory(); err == nil && memGiB < 3.5 {
return false
}
// at least 3.5 GiB of memory
func (p *Provider) hasMinimumMemory(sku *skewer.SKU) bool {
memGiB, err := sku.Memory()
return err == nil && memGiB >= 3.5
}

// filter out instances AKS does not support
if RestrictedVMSizes.Has(sku.GetName()) {
return false
}
// instances AKS does not support
func (p *Provider) isUnsupportedByAKS(sku *skewer.SKU) bool {
return RestrictedVMSizes.Has(sku.GetName())
}

// filter out GPU SKUs AKS does not support
if gpu, err := sku.GPU(); err == nil && gpu > 0 && !(utils.IsNvidiaEnabledSKU(name) || utils.IsMarinerEnabledGPUSKU(name)) {
return false
}
// GPU SKUs AKS does not support
func (p *Provider) isUnsupportedGPU(sku *skewer.SKU) bool {
name := lo.FromPtr(sku.Name)
gpu, err := sku.GPU()
return err == nil && gpu > 0 && !(utils.IsNvidiaEnabledSKU(name) || utils.IsMarinerEnabledGPUSKU(name))
}

return true
// SKU with constrained CPUs
func (p *Provider) hasConstrainedCPUs(vmsize *skewer.VMSizeType) bool {
return vmsize.CpusConstrained != nil
}

// isSupportedSize indicates VM size is supported by AKS, based on additional SKU properties
func (p *Provider) isSupportedSize(size *skewer.VMSizeType) bool {
// any SKU with constrained CPUs
return size.CpusConstrained == nil
// confidential VMs (DC, EC) are not yet supported by this Karpenter provider
func (p *Provider) isConfidential(sku *skewer.SKU) bool {
size := sku.GetSize()
return strings.HasPrefix(size, "DC") || strings.HasPrefix(size, "EC")
}

// MaxEphemeralOSDiskSizeGB returns the maximum ephemeral OS disk size for a given SKU.
Expand Down
34 changes: 22 additions & 12 deletions pkg/providers/instancetype/suite_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -131,14 +131,30 @@ var _ = Describe("InstanceType Provider", func() {
})

Context("Filtering in InstanceType Provider List", func() {
It("should filter out skus that are explicitly marked as restricted", func() {
instanceTypes, err := azureEnv.InstanceTypesProvider.List(ctx, &corev1beta1.KubeletConfiguration{}, nodeClass)
Expect(err).NotTo(HaveOccurred())
for _, instanceType := range instanceTypes {
// We should not see any instance types in the restricted list
Expect(instancetype.RestrictedVMSizes.Has(instanceType.Name)).To(BeFalse())
var instanceTypes corecloudprovider.InstanceTypes
var err error
getName := func(instanceType *corecloudprovider.InstanceType) string { return instanceType.Name }

BeforeEach(func() {
instanceTypes, err = azureEnv.InstanceTypesProvider.List(ctx, &corev1beta1.KubeletConfiguration{}, nodeClass)
Expect(err).ToNot(HaveOccurred())
})

It("should not include SKUs marked as restricted", func() {
isRestricted := func(instanceType *corecloudprovider.InstanceType) bool {
return instancetype.RestrictedVMSizes.Has(instanceType.Name)
}
Expect(instanceTypes).ShouldNot(ContainElement(WithTransform(isRestricted, Equal(true))))
Expect(instanceTypes).ShouldNot(ContainElement(WithTransform(isRestricted, Equal(true))))
})
It("should not include SKUs with constrained CPUs, but include unconstrained ones", func() {
Expect(instanceTypes).ShouldNot(ContainElement(WithTransform(getName, Equal("Standard_M8-2ms"))))
Expect(instanceTypes).Should(ContainElement(WithTransform(getName, Equal("Standard_D2_v2"))))
})
It("should not include confidential SKUs", func() {
Expect(instanceTypes).ShouldNot(ContainElement(WithTransform(getName, Equal("Standard_DC8s_v3"))))
})

})

Context("Ephemeral Disk", func() {
Expand Down Expand Up @@ -564,12 +580,6 @@ var _ = Describe("InstanceType Provider", func() {
Expect(ok).To(BeTrue(), "Expected nvidia.com/gpu to be present in capacity, and be zero")
Expect(gpuQuanityNonGPU.Value()).To(Equal(int64(0)))
})

It("should not include SKUs with constrained CPUs, but include unconstrained ones", func() {
getName := func(instanceType *corecloudprovider.InstanceType) string { return instanceType.Name }
Expect(instanceTypes).ShouldNot(ContainElement(WithTransform(getName, Equal("Standard_M8-2ms"))))
Expect(instanceTypes).Should(ContainElement(WithTransform(getName, Equal("Standard_D2_v2"))))
})
})

Context("Instance Types", func() {
Expand Down

0 comments on commit 61d82d3

Please sign in to comment.