diff --git a/pkg/cloudprovider/suite_test.go b/pkg/cloudprovider/suite_test.go
index b4cde4dee..8009f0442 100644
--- a/pkg/cloudprovider/suite_test.go
+++ b/pkg/cloudprovider/suite_test.go
@@ -83,7 +83,7 @@ var _ = BeforeSuite(func() {
 	fakeClock = clock.NewFakeClock(time.Now())
 	recorder = events.NewRecorder(&record.FakeRecorder{})
 	cloudProvider = New(azureEnv.InstanceTypesProvider, azureEnv.InstanceProvider, recorder, env.Client, azureEnv.ImageProvider)
-	cluster = state.NewCluster(fakeClock, env.Client)
+	cluster = state.NewCluster(fakeClock, env.Client, cloudProvider)
 	prov = provisioning.NewProvisioner(env.Client, recorder, cloudProvider, cluster, fakeClock)
 })
 
@@ -195,7 +195,9 @@ var _ = Describe("CloudProvider", func() {
 			},
 			Spec: karpv1.NodeClaimSpec{
 				NodeClassRef: &karpv1.NodeClassReference{
-					Name: nodeClass.Name,
+					Group: object.GVK(nodeClass).Group,
+					Kind:  object.GVK(nodeClass).Kind,
+					Name:  nodeClass.Name,
 				},
 			},
 		})
diff --git a/pkg/controllers/nodeclaim/garbagecollection/suite_test.go b/pkg/controllers/nodeclaim/garbagecollection/suite_test.go
index 5e9a7edee..7df69fb96 100644
--- a/pkg/controllers/nodeclaim/garbagecollection/suite_test.go
+++ b/pkg/controllers/nodeclaim/garbagecollection/suite_test.go
@@ -82,7 +82,7 @@ var _ = BeforeSuite(func() {
 	cloudProvider = cloudprovider.New(azureEnv.InstanceTypesProvider, azureEnv.InstanceProvider, events.NewRecorder(&record.FakeRecorder{}), env.Client, azureEnv.ImageProvider)
 	garbageCollectionController = garbagecollection.NewController(env.Client, cloudProvider)
 	fakeClock = &clock.FakeClock{}
-	cluster = state.NewCluster(fakeClock, env.Client)
+	cluster = state.NewCluster(fakeClock, env.Client, cloudProvider)
 	prov = provisioning.NewProvisioner(env.Client, events.NewRecorder(&record.FakeRecorder{}), cloudProvider, cluster, fakeClock)
 })
 
diff --git a/pkg/providers/instance/suite_test.go b/pkg/providers/instance/suite_test.go
index 923f24e17..1142ce0d2 100644
--- a/pkg/providers/instance/suite_test.go
+++ b/pkg/providers/instance/suite_test.go
@@ -79,7 +79,7 @@ func TestAzure(t *testing.T) {
 	cloudProvider = cloudprovider.New(azureEnv.InstanceTypesProvider, azureEnv.InstanceProvider, events.NewRecorder(&record.FakeRecorder{}), env.Client, azureEnv.ImageProvider)
 	cloudProviderNonZonal = cloudprovider.New(azureEnvNonZonal.InstanceTypesProvider, azureEnvNonZonal.InstanceProvider, events.NewRecorder(&record.FakeRecorder{}), env.Client, azureEnvNonZonal.ImageProvider)
 	fakeClock = &clock.FakeClock{}
-	cluster = state.NewCluster(fakeClock, env.Client)
+	cluster = state.NewCluster(fakeClock, env.Client, cloudProvider)
 	coreProvisioner = provisioning.NewProvisioner(env.Client, events.NewRecorder(&record.FakeRecorder{}), cloudProvider, cluster, fakeClock)
 	RunSpecs(t, "Provider/Azure")
 }
@@ -119,7 +119,9 @@ var _ = Describe("InstanceProvider", func() {
 			},
 			Spec: karpv1.NodeClaimSpec{
 				NodeClassRef: &karpv1.NodeClassReference{
-					Name: nodeClass.Name,
+					Group: object.GVK(nodeClass).Group,
+					Kind:  object.GVK(nodeClass).Kind,
+					Name:  nodeClass.Name,
 				},
 			},
 		})
diff --git a/pkg/providers/instancetype/suite_test.go b/pkg/providers/instancetype/suite_test.go
index 6c4a5b85c..a55f1a88f 100644
--- a/pkg/providers/instancetype/suite_test.go
+++ b/pkg/providers/instancetype/suite_test.go
@@ -25,6 +25,7 @@ import (
 	"strings"
 	"testing"
 
+	"github.com/awslabs/operatorpkg/object"
 	"github.com/awslabs/operatorpkg/status"
 	"github.com/blang/semver/v4"
"github.com/onsi/ginkgo/v2" @@ -93,8 +94,8 @@ func TestAzure(t *testing.T) { cloudProvider = cloudprovider.New(azureEnv.InstanceTypesProvider, azureEnv.InstanceProvider, events.NewRecorder(&record.FakeRecorder{}), env.Client, azureEnv.ImageProvider) cloudProviderNonZonal = cloudprovider.New(azureEnvNonZonal.InstanceTypesProvider, azureEnvNonZonal.InstanceProvider, events.NewRecorder(&record.FakeRecorder{}), env.Client, azureEnvNonZonal.ImageProvider) - cluster = state.NewCluster(fakeClock, env.Client) - clusterNonZonal = state.NewCluster(fakeClock, env.Client) + cluster = state.NewCluster(fakeClock, env.Client, cloudProvider) + clusterNonZonal = state.NewCluster(fakeClock, env.Client, cloudProviderNonZonal) coreProvisioner = provisioning.NewProvisioner(env.Client, events.NewRecorder(&record.FakeRecorder{}), cloudProvider, cluster, fakeClock) coreProvisionerNonZonal = provisioning.NewProvisioner(env.Client, events.NewRecorder(&record.FakeRecorder{}), cloudProviderNonZonal, clusterNonZonal, fakeClock) @@ -121,7 +122,9 @@ var _ = Describe("InstanceType Provider", func() { Template: karpv1.NodeClaimTemplate{ Spec: karpv1.NodeClaimTemplateSpec{ NodeClassRef: &karpv1.NodeClassReference{ - Name: nodeClass.Name, + Group: object.GVK(nodeClass).Group, + Kind: object.GVK(nodeClass).Kind, + Name: nodeClass.Name, }, }, }, @@ -313,7 +316,9 @@ var _ = Describe("InstanceType Provider", func() { }, Spec: karpv1.NodeClaimSpec{ NodeClassRef: &karpv1.NodeClassReference{ - Name: nodeClass.Name, + Name: nodeClass.Name, + Group: object.GVK(nodeClass).Group, + Kind: object.GVK(nodeClass).Kind, }, }, }) @@ -382,7 +387,9 @@ var _ = Describe("InstanceType Provider", func() { Values: []string{"Standard_D64s_v3"}, }}) np.Spec.Template.Spec.NodeClassRef = &karpv1.NodeClassReference{ - Name: nodeClass.Name, + Group: object.GVK(nodeClass).Group, + Kind: object.GVK(nodeClass).Kind, + Name: nodeClass.Name, } ExpectApplied(ctx, env.Client, np, nodeClass) @@ -413,7 +420,9 @@ var _ = Describe("InstanceType Provider", func() { Values: []string{"Standard_D64s_v3"}, }}) np.Spec.Template.Spec.NodeClassRef = &karpv1.NodeClassReference{ - Name: nodeClass.Name, + Group: object.GVK(nodeClass).Group, + Kind: object.GVK(nodeClass).Kind, + Name: nodeClass.Name, } ExpectApplied(ctx, env.Client, np, nodeClass) @@ -440,7 +449,9 @@ var _ = Describe("InstanceType Provider", func() { Values: []string{"Standard_D2s_v3"}, }}) np.Spec.Template.Spec.NodeClassRef = &karpv1.NodeClassReference{ - Name: nodeClass.Name, + Group: object.GVK(nodeClass).Group, + Kind: object.GVK(nodeClass).Kind, + Name: nodeClass.Name, } ExpectApplied(ctx, env.Client, np, nodeClass) @@ -892,7 +903,11 @@ var _ = Describe("InstanceType Provider", func() { Operator: v1.NodeSelectorOpIn, Values: []string{instanceType}, }}) - nodePool.Spec.Template.Spec.NodeClassRef = &karpv1.NodeClassReference{Name: nodeClass.Name} + nodePool.Spec.Template.Spec.NodeClassRef = &karpv1.NodeClassReference{ + Group: object.GVK(nodeClass).Group, + Kind: object.GVK(nodeClass).Kind, + Name: nodeClass.Name, + } ExpectApplied(ctx, env.Client, nodePool, nodeClass) pod := coretest.UnschedulablePod(coretest.PodOptions{}) ExpectProvisioned(ctx, env.Client, cluster, cloudProvider, coreProvisioner, pod) diff --git a/pkg/test/environment.go b/pkg/test/environment.go index 30b6777bf..36d5fb102 100644 --- a/pkg/test/environment.go +++ b/pkg/test/environment.go @@ -23,7 +23,6 @@ import ( corev1 "k8s.io/api/core/v1" karpv1 "sigs.k8s.io/karpenter/pkg/apis/v1" - karpv1beta1 
"sigs.k8s.io/karpenter/pkg/apis/v1beta1" azurecache "github.com/Azure/karpenter-provider-azure/pkg/cache" "github.com/Azure/karpenter-provider-azure/pkg/fake" @@ -40,7 +39,6 @@ import ( ) func init() { - karpv1beta1.NormalizedLabels = lo.Assign(karpv1beta1.NormalizedLabels, map[string]string{"topology.disk.csi.azure.com/zone": corev1.LabelTopologyZone}) karpv1.NormalizedLabels = lo.Assign(karpv1.NormalizedLabels, map[string]string{"topology.disk.csi.azure.com/zone": corev1.LabelTopologyZone}) } diff --git a/test/pkg/environment/common/expectations.go b/test/pkg/environment/common/expectations.go index 754d1eb5f..af3876eee 100644 --- a/test/pkg/environment/common/expectations.go +++ b/test/pkg/environment/common/expectations.go @@ -91,6 +91,36 @@ func (env *Environment) ExpectUpdated(objects ...client.Object) { } } +// ExpectStatusUpdated will update objects in the cluster to match the inputs. +// WARNING: This ignores the resource version check, which can result in +// overwriting changes made by other controllers in the cluster. +// This is useful in ensuring that we can clean up resources by patching +// out finalizers. +// Grab the object before making the updates to reduce the chance of this race. +func (env *Environment) ExpectStatusUpdated(objects ...client.Object) { + GinkgoHelper() + for _, o := range objects { + Eventually(func(g Gomega) { + current := o.DeepCopyObject().(client.Object) + g.Expect(env.Client.Get(env.Context, client.ObjectKeyFromObject(current), current)).To(Succeed()) + if current.GetResourceVersion() != o.GetResourceVersion() { + log.FromContext(env).Info(fmt.Sprintf("detected an update to an object (%s) with an outdated resource version, did you get the latest version of the object before patching?", lo.Must(apiutil.GVKForObject(o, env.Client.Scheme())))) + } + o.SetResourceVersion(current.GetResourceVersion()) + g.Expect(env.Client.Status().Update(env.Context, o)).To(Succeed()) + }).WithTimeout(time.Second * 10).Should(Succeed()) + } +} + +func ReplaceNodeConditions(node *corev1.Node, conds ...corev1.NodeCondition) *corev1.Node { + keys := sets.New[string](lo.Map(conds, func(c corev1.NodeCondition, _ int) string { return string(c.Type) })...) + node.Status.Conditions = lo.Reject(node.Status.Conditions, func(c corev1.NodeCondition, _ int) bool { + return keys.Has(string(c.Type)) + }) + node.Status.Conditions = append(node.Status.Conditions, conds...) + return node +} + // ExpectCreatedOrUpdated can update objects in the cluster to match the inputs. // WARNING: ExpectUpdated ignores the resource version check, which can result in // overwriting changes made by other controllers in the cluster. 
@@ -272,6 +302,17 @@ func (env *Environment) EventuallyExpectTerminatingWithTimeout(timeout time.Dura
 	}).WithTimeout(timeout).Should(Succeed())
 }
 
+func (env *Environment) EventuallyExpectNoLeakedKubeNodeLease() {
+	GinkgoHelper()
+	// expect no kube node lease to be leaked
+	leases := &coordinationv1.LeaseList{}
+	Expect(env.Client.List(env.Context, leases, client.InNamespace("kube-node-lease"))).To(Succeed())
+	leakedLeases := lo.Filter(leases.Items, func(l coordinationv1.Lease, _ int) bool {
+		return l.OwnerReferences == nil
+	})
+	Expect(leakedLeases).To(HaveLen(0))
+}
+
 func (env *Environment) EventuallyExpectHealthyWithTimeout(timeout time.Duration, pods ...*corev1.Pod) {
 	GinkgoHelper()
 	Eventually(func(g Gomega) {
@@ -296,6 +337,17 @@ func (env *Environment) ConsistentlyExpectTerminatingPods(duration time.Duration
 	}, duration.String()).Should(Succeed())
 }
 
+func (env *Environment) ConsistentlyExpectActivePods(duration time.Duration, pods ...*corev1.Pod) {
+	GinkgoHelper()
+	By(fmt.Sprintf("expecting %d pods to be live for %s", len(pods), duration))
+	Consistently(func(g Gomega) {
+		for _, pod := range pods {
+			g.Expect(env.Client.Get(env, client.ObjectKeyFromObject(pod), pod)).To(Succeed())
+			g.Expect(pod.DeletionTimestamp.IsZero()).To(BeTrue())
+		}
+	}, duration.String()).Should(Succeed())
+}
+
 func (env *Environment) ConsistentlyExpectHealthyPods(duration time.Duration, pods ...*corev1.Pod) {
 	GinkgoHelper()
 	By(fmt.Sprintf("expecting %d pods to be ready for %s", len(pods), duration))
@@ -462,16 +514,13 @@ func (env *Environment) eventuallyExpectScaleDown() {
 
 func (env *Environment) EventuallyExpectNotFound(objects ...client.Object) {
 	GinkgoHelper()
-	env.EventuallyExpectNotFoundAssertion(objects...).Should(Succeed())
-}
 
-func (env *Environment) EventuallyExpectNotFoundAssertion(objects ...client.Object) AsyncAssertion {
-	return Eventually(func(g Gomega) {
+	Eventually(func(g Gomega) {
 		for _, object := range objects {
 			err := env.Client.Get(env, client.ObjectKeyFromObject(object), object)
 			g.Expect(errors.IsNotFound(err)).To(BeTrue())
 		}
-	})
+	}).Should(Succeed())
 }
 
 func (env *Environment) ExpectCreatedNodeCount(comparator string, count int) []*corev1.Node {
@@ -524,34 +573,77 @@ func (env *Environment) ConsistentlyExpectNodeCount(comparator string, count int
 	return lo.ToSlicePtr(nodeList.Items)
 }
 
-func (env *Environment) ConsistentlyExpectNoDisruptions(nodeCount int, duration time.Duration) (taintedNodes []*corev1.Node) {
+// ConsistentlyExpectNoDisruptions asserts that the number of tainted nodes remains the same.
+// And that the number of nodeclaims remains the same.
+func (env *Environment) ConsistentlyExpectNoDisruptions(nodeCount int, duration time.Duration) {
 	GinkgoHelper()
-	return env.ConsistentlyExpectDisruptionsWithNodeCount(0, nodeCount, duration)
+	Consistently(func(g Gomega) {
+		nodeClaimList := &karpv1.NodeClaimList{}
+		g.Expect(env.Client.List(env, nodeClaimList, client.HasLabels{test.DiscoveryLabel})).To(Succeed())
+		g.Expect(nodeClaimList.Items).To(HaveLen(nodeCount))
+		nodeList := &corev1.NodeList{}
+		g.Expect(env.Client.List(env, nodeList, client.HasLabels{test.DiscoveryLabel})).To(Succeed())
+		g.Expect(nodeList.Items).To(HaveLen(nodeCount))
+		nodeList.Items = lo.Filter(nodeList.Items, func(n corev1.Node, _ int) bool {
+			_, ok := lo.Find(n.Spec.Taints, func(t corev1.Taint) bool {
+				return t.MatchTaint(&karpv1.DisruptedNoScheduleTaint)
+			})
+			return ok
+		})
+		g.Expect(nodeList.Items).To(HaveLen(0))
+	}, duration).Should(Succeed())
 }
 
-// ConsistentlyExpectDisruptionsWithNodeCount will continually ensure that there are exactly disruptingNodes with totalNodes (including replacements and existing nodes)
-func (env *Environment) ConsistentlyExpectDisruptionsWithNodeCount(disruptingNodes, totalNodes int, duration time.Duration) (taintedNodes []*corev1.Node) {
+// ConsistentlyExpectDisruptionsUntilNoneLeft consistently ensures a max on number of concurrently disrupting and non-terminating nodes.
+// This actually uses an Eventually() under the hood so that when we reach 0 tainted nodes we exit early.
+// We use the StopTrying() so that we can exit the Eventually() if we've breached an assertion on total concurrency of disruptions.
+// For example: if we have 5 nodes, with a budget of 2 nodes, we ensure that `disruptingNodes <= maxNodesDisrupting=2`
+// We use nodesAtStart+maxNodesDisrupting to assert that we're not creating too many instances in replacement.
+func (env *Environment) ConsistentlyExpectDisruptionsUntilNoneLeft(nodesAtStart, maxNodesDisrupting int, timeout time.Duration) {
 	GinkgoHelper()
 	nodes := []corev1.Node{}
-	Consistently(func(g Gomega) {
-		// Ensure we don't change our NodeClaims
+	// We use an eventually to exit when we detect the number of tainted/disrupted nodes matches our target.
+	Eventually(func(g Gomega) {
+		// Grab Nodes and NodeClaims
 		nodeClaimList := &karpv1.NodeClaimList{}
-		g.Expect(env.Client.List(env, nodeClaimList, client.HasLabels{test.DiscoveryLabel})).To(Succeed())
-		g.Expect(nodeClaimList.Items).To(HaveLen(totalNodes))
-		nodeList := &corev1.NodeList{}
+		nodeList := &corev1.NodeList{}
+		g.Expect(env.Client.List(env, nodeClaimList, client.HasLabels{test.DiscoveryLabel})).To(Succeed())
 		g.Expect(env.Client.List(env, nodeList, client.HasLabels{test.DiscoveryLabel})).To(Succeed())
-		g.Expect(nodeList.Items).To(HaveLen(totalNodes))
+
+		// Don't include NodeClaims with the `Terminating` status condition, as they're not included in budgets
+		removedProviderIDs := sets.Set[string]{}
+		nodeClaimList.Items = lo.Filter(nodeClaimList.Items, func(nc karpv1.NodeClaim, _ int) bool {
+			if !nc.StatusConditions().IsTrue(karpv1.ConditionTypeInstanceTerminating) {
+				return true
+			}
+			removedProviderIDs.Insert(nc.Status.ProviderID)
+			return false
+		})
+		if len(nodeClaimList.Items) > nodesAtStart+maxNodesDisrupting {
+			StopTrying(fmt.Sprintf("Too many nodeclaims created. Expected no more than %d, got %d", nodesAtStart+maxNodesDisrupting, len(nodeClaimList.Items))).Now()
+		}
+
+		// Don't include Nodes whose NodeClaims have been ignored
+		nodeList.Items = lo.Filter(nodeList.Items, func(n corev1.Node, _ int) bool {
+			return !removedProviderIDs.Has(n.Spec.ProviderID)
+		})
+		if len(nodeList.Items) > nodesAtStart+maxNodesDisrupting {
+			StopTrying(fmt.Sprintf("Too many nodes created. Expected no more than %d, got %d", nodesAtStart+maxNodesDisrupting, len(nodeList.Items))).Now()
+		}
+
+		// Filter further by the number of tainted nodes to get the number of nodes that are disrupting
 		nodes = lo.Filter(nodeList.Items, func(n corev1.Node, _ int) bool {
 			_, ok := lo.Find(n.Spec.Taints, func(t corev1.Taint) bool {
-				return karpv1.IsDisruptingTaint(t)
+				return t.MatchTaint(&karpv1.DisruptedNoScheduleTaint)
 			})
 			return ok
 		})
-		g.Expect(nodes).To(HaveLen(disruptingNodes))
-	}, duration).Should(Succeed())
-	return lo.ToSlicePtr(nodes)
+		if len(nodes) > maxNodesDisrupting {
+			StopTrying(fmt.Sprintf("Too many disruptions detected. Expected no more than %d, got %d", maxNodesDisrupting, len(nodeList.Items))).Now()
+		}
+
+		g.Expect(nodes).To(HaveLen(0))
+	}).WithTimeout(timeout).WithPolling(5 * time.Second).Should(Succeed())
 }
 
 func (env *Environment) EventuallyExpectTaintedNodeCount(comparator string, count int) []*corev1.Node {
diff --git a/test/suites/nodeclaim/nodeclaim_test.go b/test/suites/nodeclaim/nodeclaim_test.go
index 9b9b32d78..2d35a176a 100644
--- a/test/suites/nodeclaim/nodeclaim_test.go
+++ b/test/suites/nodeclaim/nodeclaim_test.go
@@ -18,6 +18,7 @@ package nodeclaim_test
 
 import (
 	"github.com/Azure/karpenter-provider-azure/pkg/apis/v1alpha2"
+	"github.com/awslabs/operatorpkg/object"
 	. "github.com/onsi/ginkgo/v2"
 	. "github.com/onsi/gomega"
 	v1 "k8s.io/api/core/v1"
@@ -46,7 +47,9 @@ var _ = Describe("StandaloneNodeClaim", func() {
 				}},
 			},
 			NodeClassRef: &karpv1.NodeClassReference{
-				Name: nodeClass.Name,
+				Group: object.GVK(nodeClass).Group,
+				Kind:  object.GVK(nodeClass).Kind,
+				Name:  nodeClass.Name,
 			},
 		},
 	})
@@ -66,7 +69,9 @@ var _ = Describe("StandaloneNodeClaim", func() {
 				},
 			},
 			NodeClassRef: &karpv1.NodeClassReference{
-				Name: nodeClass.Name,
+				Group: object.GVK(nodeClass).Group,
+				Kind:  object.GVK(nodeClass).Kind,
+				Name:  nodeClass.Name,
 			},
 		},
 	})
@@ -100,7 +105,9 @@ var _ = Describe("StandaloneNodeClaim", func() {
 				},
 			},
 			NodeClassRef: &karpv1.NodeClassReference{
-				Name: nodeClass.Name,
+				Group: object.GVK(nodeClass).Group,
+				Kind:  object.GVK(nodeClass).Kind,
+				Name:  nodeClass.Name,
 			},
 		},
 	})
@@ -139,7 +146,9 @@ var _ = Describe("StandaloneNodeClaim", func() {
 				}},
 			},
 			NodeClassRef: &karpv1.NodeClassReference{
-				Name: nodeClass.Name,
+				Group: object.GVK(nodeClass).Group,
+				Kind:  object.GVK(nodeClass).Kind,
+				Name:  nodeClass.Name,
 			},
 		},
 	})
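
Usage note (not part of the patch): the recurring change above is that every NodeClassRef literal now carries Group and Kind alongside Name, resolved via object.GVK on the node class. If the same three lines keep spreading through the test suites, a small shared helper could build the reference once. A minimal sketch follows; the nodeClassRefFor name and its placement in a test-only package are assumptions, and it presumes object.GVK accepts any client.Object, as it is used with the AKSNodeClass in the hunks above.

package test

import (
	"github.com/awslabs/operatorpkg/object"
	"sigs.k8s.io/controller-runtime/pkg/client"
	karpv1 "sigs.k8s.io/karpenter/pkg/apis/v1"
)

// nodeClassRefFor mirrors the literals added throughout this diff: it resolves the
// object's GroupVersionKind and copies Group, Kind, and Name into a NodeClassReference.
// Hypothetical helper for illustration only; it is not added by this patch.
func nodeClassRefFor(nodeClass client.Object) *karpv1.NodeClassReference {
	gvk := object.GVK(nodeClass)
	return &karpv1.NodeClassReference{
		Group: gvk.Group,
		Kind:  gvk.Kind,
		Name:  nodeClass.GetName(),
	}
}

With such a helper, each repeated block would collapse to NodeClassRef: nodeClassRefFor(nodeClass).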