From 7e7ddd6a83ed4fc6778a6215e12642c2909573ca Mon Sep 17 00:00:00 2001 From: Andrew Delph Date: Fri, 23 Feb 2024 14:47:03 -0500 Subject: [PATCH 1/2] kpa: simplify computeActiveCondition logic Making changes to the pa active status has becomes difficult. This change breaks down some of the logic so that it is more readable. No changes to test cases were made as it doesn't actually change the logical cases. --- pkg/reconciler/autoscaling/kpa/kpa.go | 63 ++++++++++++++-------- pkg/reconciler/autoscaling/kpa/kpa_test.go | 25 ++++++++- 2 files changed, 64 insertions(+), 24 deletions(-) diff --git a/pkg/reconciler/autoscaling/kpa/kpa.go b/pkg/reconciler/autoscaling/kpa/kpa.go index 6c185949d534..b06d95cebe93 100644 --- a/pkg/reconciler/autoscaling/kpa/kpa.go +++ b/pkg/reconciler/autoscaling/kpa/kpa.go @@ -259,40 +259,57 @@ func reportMetrics(pa *autoscalingv1alpha1.PodAutoscaler, pc podCounts) { // | -1 | >= min | >0 | activating | active | // | -1 | >= min | >0 | active | active | func computeActiveCondition(ctx context.Context, pa *autoscalingv1alpha1.PodAutoscaler, pc podCounts) { + minReady := activeThreshold(ctx, pa) - if pc.ready >= minReady && pa.Status.ServiceName != "" { + isServiceCreated := pa.Status.ServiceName != "" + + // Mark the pa as Initialized if the service is created and reached the initial scale. + if pc.ready >= minReady && isServiceCreated { pa.Status.MarkScaleTargetInitialized() } - switch { - // Need to check for minReady = 0 because in the initialScale 0 case, pc.want will be -1. - case pc.want == 0 || minReady == 0: - if pa.Status.IsActivating() && minReady > 0 { - // We only ever scale to zero while activating if we fail to activate within the progress deadline. - pa.Status.MarkInactive("TimedOut", "The target could not be activated.") - } else { + // If the service is not created yet, it is either queued or inctive if initial scale zero. + if !isServiceCreated { + if minReady == 0 { pa.Status.MarkInactive(noTrafficReason, "The target is not receiving traffic.") - } - - case pc.ready < minReady: - if pc.want > 0 || !pa.Status.IsInactive() { + } else { pa.Status.MarkActivating( "Queued", "Requests to the target are being buffered as resources are provisioned.") + } + return + } + + // If the target is not initialized, it will be queued. + if !pa.Status.IsScaleTargetInitialized() { + pa.Status.MarkActivating( + "Queued", "Requests to the target are being buffered as resources are provisioned.") + return + } + + switch { + case pc.want == 0: + // The kpa is trying to scale to 0. + if pa.Status.IsActivating() { + // The pa is stuck in activating and the progressDeadline is forcing a scale to zero. + pa.Status.MarkInactive("TimedOut", "The target could not be activated.") } else { - // This is for the initialScale 0 case. In the first iteration, minReady is 0, - // but for the following iterations, minReady is 1. pc.want will continue being - // -1 until we start receiving metrics, so we will end up here. - // Even though PA is already been marked as inactive in the first iteration, we - // still need to set it again. Otherwise reconciliation will fail with NewObservedGenFailure - // because we cannot go through one iteration of reconciliation without setting - // some status. + // The pa is no longer receiving traffic. pa.Status.MarkInactive(noTrafficReason, "The target is not receiving traffic.") } - case pc.ready >= minReady: - if pc.want > 0 || !pa.Status.IsInactive() { - pa.Status.MarkActive() - } + case pc.ready >= minReady && pc.ready > 0: + // To be active there must be atleast 1 active pod. + // This takes into consideration when InitialScale is zero. + pa.Status.MarkActive() + + case pc.want < 0: + // The decider is not receiving metrics because the pa scaled to zero. + pa.Status.MarkInactive(noTrafficReason, "The target is not receiving traffic.") + + default: + // The desired scale has not been reached yet. + pa.Status.MarkActivating( + "Queued", "Requests to the target are being buffered as resources are provisioned.") } } diff --git a/pkg/reconciler/autoscaling/kpa/kpa_test.go b/pkg/reconciler/autoscaling/kpa/kpa_test.go index f16450cc44f2..4946b79e657c 100644 --- a/pkg/reconciler/autoscaling/kpa/kpa_test.go +++ b/pkg/reconciler/autoscaling/kpa/kpa_test.go @@ -1171,7 +1171,30 @@ func TestReconcile(t *testing.T) { WantUpdates: []clientgotesting.UpdateActionImpl{{ Object: defaultProxySKS, }}, - }} + }, { + Name: "image pull error - stay queued", // The service is created and not initialized + Key: key, + Ctx: context.WithValue(context.Background(), deciderKey{}, + decider(testNamespace, testRevision, -1, /* desiredScale */ + 0 /* ebc */)), + Objects: []runtime.Object{ + kpa(testNamespace, testRevision, WithPASKSNotReady(""), withScales(0, -1), WithReachabilityReachable, + WithPAMetricsService(privateSvc)), + // SKS won't be ready bc no ready endpoints, but private service name will be populated. + sks(testNamespace, testRevision, WithProxyMode, WithDeployRef(deployName), WithPrivateService, WithPubService), + metric(testNamespace, testRevision), + deploy(testNamespace, testRevision, func(d *appsv1.Deployment) { + d.Spec.Replicas = ptr.Int32(0) + }), + }, + WantStatusUpdates: []clientgotesting.UpdateActionImpl{{ + Object: kpa(testNamespace, testRevision, WithPASKSNotReady(""), + WithBufferedTraffic, + withScales(0, -1), WithReachabilityReachable, + WithPAMetricsService(privateSvc), WithObservedGeneration(1), + WithPAStatusService(testRevision), + ), + }}}} table.Test(t, MakeFactory(func(ctx context.Context, listers *Listers, cmw configmap.Watcher) controller.Reconciler { retryAttempted = false From fa32deec0a04867d64060de3b4620edde5ed4ce4 Mon Sep 17 00:00:00 2001 From: Andrew Delph Date: Thu, 23 Nov 2023 20:37:31 -0500 Subject: [PATCH 2/2] fix: Revisions stuck in restart loop now can scale down once unreachable 1. Unreachable pa becomes inactive PA computeActiveCondition() will MarkInactive in the case of "Queued" when the pa is Unreachable. 2. Adding DeadStart e2e tests TestDeadStartToHealthy: creates a service which is never able to transition to a ready state by existing immediately. The failed revision will scale to zero once the service is updated with a healthy revision. TestDeadStartFromHealthy: updates a healthy service with an image that can never reach a ready state. The healthy revision remains Ready and the DeadStart revision doesn't not scale down until ProgressDeadline is reached. --- pkg/apis/autoscaling/v1alpha1/pa_lifecycle.go | 8 + pkg/reconciler/autoscaling/kpa/kpa.go | 9 +- pkg/reconciler/autoscaling/kpa/kpa_test.go | 124 ++++++++- pkg/reconciler/autoscaling/kpa/scaler.go | 6 +- pkg/reconciler/autoscaling/kpa/scaler_test.go | 22 ++ pkg/testing/v1/configuration.go | 7 + test/conformance.go | 1 + test/e2e/dead_start_test.go | 261 ++++++++++++++++++ test/test_images/deadstart/README.md | 4 + test/test_images/deadstart/deadstart.go | 26 ++ test/test_images/deadstart/service.yaml | 10 + 11 files changed, 461 insertions(+), 17 deletions(-) create mode 100644 test/e2e/dead_start_test.go create mode 100644 test/test_images/deadstart/README.md create mode 100644 test/test_images/deadstart/deadstart.go create mode 100644 test/test_images/deadstart/service.yaml diff --git a/pkg/apis/autoscaling/v1alpha1/pa_lifecycle.go b/pkg/apis/autoscaling/v1alpha1/pa_lifecycle.go index 9cff0ab9f547..7529cdae4e90 100644 --- a/pkg/apis/autoscaling/v1alpha1/pa_lifecycle.go +++ b/pkg/apis/autoscaling/v1alpha1/pa_lifecycle.go @@ -187,6 +187,14 @@ func (pa *PodAutoscaler) IsReady() bool { pas.GetCondition(PodAutoscalerConditionReady).IsTrue() } +func (pa *PodAutoscaler) IsUnreachable() bool { + return pa.Spec.Reachability == ReachabilityUnreachable +} + +func (pa *PodAutoscaler) IsReachable() bool { + return pa.Spec.Reachability == ReachabilityReachable +} + // IsActive returns true if the pod autoscaler has finished scaling. func (pas *PodAutoscalerStatus) IsActive() bool { return pas.GetCondition(PodAutoscalerConditionActive).IsTrue() diff --git a/pkg/reconciler/autoscaling/kpa/kpa.go b/pkg/reconciler/autoscaling/kpa/kpa.go index b06d95cebe93..47f10a7a89a0 100644 --- a/pkg/reconciler/autoscaling/kpa/kpa.go +++ b/pkg/reconciler/autoscaling/kpa/kpa.go @@ -270,6 +270,7 @@ func computeActiveCondition(ctx context.Context, pa *autoscalingv1alpha1.PodAuto // If the service is not created yet, it is either queued or inctive if initial scale zero. if !isServiceCreated { + if minReady == 0 { pa.Status.MarkInactive(noTrafficReason, "The target is not receiving traffic.") } else { @@ -281,8 +282,12 @@ func computeActiveCondition(ctx context.Context, pa *autoscalingv1alpha1.PodAuto // If the target is not initialized, it will be queued. if !pa.Status.IsScaleTargetInitialized() { - pa.Status.MarkActivating( - "Queued", "Requests to the target are being buffered as resources are provisioned.") + if pa.IsUnreachable() { + pa.Status.MarkInactive("Unreachable", "The target does not have an active routing state.") + } else { + pa.Status.MarkActivating( + "Queued", "Requests to the target are being buffered as resources are provisioned.") + } return } diff --git a/pkg/reconciler/autoscaling/kpa/kpa_test.go b/pkg/reconciler/autoscaling/kpa/kpa_test.go index 4946b79e657c..16fec955d87d 100644 --- a/pkg/reconciler/autoscaling/kpa/kpa_test.go +++ b/pkg/reconciler/autoscaling/kpa/kpa_test.go @@ -29,6 +29,7 @@ import ( // These are the fake informers we want setup. fakenetworkingclient "knative.dev/networking/pkg/client/injection/client/fake" fakesksinformer "knative.dev/networking/pkg/client/injection/informers/networking/v1alpha1/serverlessservice/fake" + "knative.dev/pkg/apis" fakekubeclient "knative.dev/pkg/client/injection/kube/client/fake" fakefilteredpodsinformer "knative.dev/pkg/client/injection/kube/informers/core/v1/pod/filtered/fake" _ "knative.dev/pkg/client/injection/kube/informers/core/v1/service/fake" @@ -174,9 +175,9 @@ func newConfigWatcher() configmap.Watcher { }) } -func withScales(g, w int32) PodAutoscalerOption { +func withScales(actualScale, desiredScale int32) PodAutoscalerOption { return func(pa *autoscalingv1alpha1.PodAutoscaler) { - pa.Status.DesiredScale, pa.Status.ActualScale = ptr.Int32(w), ptr.Int32(g) + pa.Status.DesiredScale, pa.Status.ActualScale = ptr.Int32(desiredScale), ptr.Int32(actualScale) } } @@ -211,8 +212,20 @@ func sks(ns, n string, so ...SKSOption) *nv1a1.ServerlessService { return s } -func markOld(pa *autoscalingv1alpha1.PodAutoscaler) { - pa.Status.Conditions[0].LastTransitionTime.Inner.Time = time.Now().Add(-1 * time.Hour) +func sksMarkOld(sks *nv1a1.ServerlessService) { + for i := range sks.Status.Conditions { + sks.Status.Conditions[i].LastTransitionTime = apis.VolatileTime{Inner: metav1.NewTime(time.Now().Add(-time.Hour))} + } +} + +func markActivatorEndpointsPopulated(sks *nv1a1.ServerlessService) { + sks.Status.MarkActivatorEndpointsPopulated() +} + +func kpaMarkOld(pa *autoscalingv1alpha1.PodAutoscaler) { + for i := range pa.Status.Conditions { + pa.Status.Conditions[i].LastTransitionTime = apis.VolatileTime{Inner: metav1.NewTime(time.Now().Add(-time.Hour))} + } } func markScaleTargetInitialized(pa *autoscalingv1alpha1.PodAutoscaler) { @@ -467,7 +480,7 @@ func TestReconcile(t *testing.T) { Key: key, Objects: []runtime.Object{ kpa(testNamespace, testRevision, WithScaleTargetInitialized, WithNoTraffic(noTrafficReason, "The target is not receiving traffic."), - withScales(0, defaultScale), WithPAStatusService(testRevision), WithPAMetricsService(privateSvc)), + withScales(0, defaultScale), WithPAStatusService(testRevision), WithPAMetricsService(privateSvc), WithReachabilityReachable), // SKS is ready here, since its endpoints are populated with Activator endpoints. sks(testNamespace, testRevision, WithProxyMode, WithDeployRef(deployName), WithSKSReady), metric(testNamespace, testRevision), @@ -480,7 +493,7 @@ func TestReconcile(t *testing.T) { WantStatusUpdates: []clientgotesting.UpdateActionImpl{{ Object: kpa(testNamespace, testRevision, WithScaleTargetInitialized, WithBufferedTraffic, withScales(0, defaultScale), WithPASKSReady, WithPAMetricsService(privateSvc), - WithPAStatusService(testRevision), WithObservedGeneration(1)), + WithPAStatusService(testRevision), WithObservedGeneration(1), WithReachabilityReachable), }}, }, { Name: "sks is still not ready", @@ -708,7 +721,7 @@ func TestReconcile(t *testing.T) { Objects: []runtime.Object{ kpa(testNamespace, testRevision, WithScaleTargetInitialized, withScales(0, 0), WithNoTraffic(noTrafficReason, "The target is not receiving traffic."), - WithPASKSReady, markOld, WithPAStatusService(testRevision), + WithPASKSReady, kpaMarkOld, WithPAStatusService(testRevision), WithPAMetricsService(privateSvc), WithObservedGeneration(1)), sks(testNamespace, testRevision, WithDeployRef(deployName), WithProxyMode, WithSKSReady), metric(testNamespace, testRevision), @@ -724,7 +737,7 @@ func TestReconcile(t *testing.T) { Objects: []runtime.Object{ kpa(testNamespace, testRevision, WithScaleTargetInitialized, withScales(0, 0), WithNoTraffic(noTrafficReason, "The target is not receiving traffic."), - WithPASKSReady, markOld, WithPAStatusService(testRevision), + WithPASKSReady, kpaMarkOld, WithPAStatusService(testRevision), WithPAMetricsService(privateSvc), WithObservedGeneration(1)), sks(testNamespace, testRevision, WithDeployRef(deployName), WithProxyMode, WithSKSReady), metric(testNamespace, testRevision), @@ -743,7 +756,7 @@ func TestReconcile(t *testing.T) { Ctx: context.WithValue(context.Background(), deciderKey{}, decider(testNamespace, testRevision, 0 /* desiredScale */, 0 /* ebc */)), Objects: []runtime.Object{ - kpa(testNamespace, testRevision, WithPASKSReady, WithTraffic, markOld, + kpa(testNamespace, testRevision, WithPASKSReady, WithTraffic, kpaMarkOld, withScales(0, 0), WithPAStatusService(testRevision), WithPAMetricsService(privateSvc)), defaultSKS, metric(testNamespace, testRevision), @@ -776,14 +789,14 @@ func TestReconcile(t *testing.T) { Ctx: context.WithValue(context.Background(), deciderKey{}, decider(testNamespace, testRevision, 0 /* desiredScale */, 0 /* ebc */)), Objects: []runtime.Object{ - kpa(testNamespace, testRevision, WithPASKSReady, WithBufferedTraffic, markOld, + kpa(testNamespace, testRevision, WithPASKSReady, WithBufferedTraffic, kpaMarkOld, WithReachabilityUnreachable, WithPAStatusService(testRevision), withScales(0, 0), WithPAMetricsService(privateSvc)), defaultSKS, metric(testNamespace, testRevision), deploy(testNamespace, testRevision), defaultReady}, WantStatusUpdates: []clientgotesting.UpdateActionImpl{{ - Object: kpa(testNamespace, testRevision, markScaleTargetInitialized, WithPASKSReady, WithPAMetricsService(privateSvc), + Object: kpa(testNamespace, testRevision, markScaleTargetInitialized, WithPASKSReady, WithPAMetricsService(privateSvc), WithReachabilityUnreachable, WithNoTraffic("TimedOut", "The target could not be activated."), withScales(1, 0), WithPAStatusService(testRevision), WithPAMetricsService(privateSvc), WithObservedGeneration(1)), @@ -932,13 +945,13 @@ func TestReconcile(t *testing.T) { Object: activeKPAMinScale(overscale, defaultScale), }}, }, { - Name: "scaled-to-0-no-scale-data", + Name: "scaled to 0 no scale data", Key: key, Ctx: context.WithValue(context.Background(), deciderKey{}, decider(testNamespace, testRevision, unknownScale, /* desiredScale */ 0 /* ebc */)), Objects: []runtime.Object{ - kpa(testNamespace, testRevision, WithScaleTargetInitialized, WithPASKSReady, + kpa(testNamespace, testRevision, WithScaleTargetInitialized, WithReachabilityReachable, WithPASKSReady, WithNoTraffic(noTrafficReason, "The target is not receiving traffic."), WithPAMetricsService(privateSvc), withScales(0, -1), WithPAStatusService(testRevision), WithObservedGeneration(1)), sks(testNamespace, testRevision, WithDeployRef(deployName), @@ -1071,6 +1084,28 @@ func TestReconcile(t *testing.T) { WithPASKSNotReady(""), WithPAStatusService(testRevision), ), }}, + }, { + Name: "initial scale zero: with ready pod", + Key: key, + Ctx: context.WithValue(context.WithValue(context.Background(), asConfigKey{}, initialScaleZeroASConfig()), deciderKey{}, + decider(testNamespace, testRevision, 0 /* desiredScale */, -42 /* ebc */)), + Objects: append([]runtime.Object{ + kpa(testNamespace, testRevision, markScaleTargetInitialized, withScales(1, 0), + WithReachabilityReachable, WithPAStatusService(testRevision), WithPAMetricsService(privateSvc), + WithPASKSReady, WithNoTraffic(noTrafficReason, "The target is not receiving traffic."), + ), + sks(testNamespace, testRevision, WithDeployRef(deployName), WithProxyMode, WithSKSReady, WithNumActivators(defaultAct)), + metric(testNamespace, testRevision), + deploy(testNamespace, testRevision, func(d *appsv1.Deployment) { + d.Spec.Replicas = ptr.Int32(1) + }), + }, makeReadyPods(1, testNamespace, testRevision)...), + WantStatusUpdates: []clientgotesting.UpdateActionImpl{{ + Object: kpa(testNamespace, testRevision, WithTraffic, WithPASKSReady, markScaleTargetInitialized, + withScales(1, 0), WithReachabilityReachable, WithPAStatusService(testRevision), + WithPAMetricsService(privateSvc), WithObservedGeneration(1), WithNoTraffic(noTrafficReason, "The target is not receiving traffic."), + ), + }}, }, { Name: "initial scale zero: sks ServiceName empty", Key: key, @@ -1194,7 +1229,68 @@ func TestReconcile(t *testing.T) { WithPAMetricsService(privateSvc), WithObservedGeneration(1), WithPAStatusService(testRevision), ), - }}}} + }}}, + { + Name: "unreachable crashing - pa becomes Failed.", + Key: key, + Ctx: context.WithValue(context.Background(), deciderKey{}, + decider(testNamespace, testRevision, -1 /* desiredScale */, 0 /* ebc */)), + Objects: []runtime.Object{ + kpa(testNamespace, + testRevision, WithReachabilityUnreachable, + WithPAMetricsService(privateSvc), WithPASKSNotReady(noPrivateServiceName)), + sks( + testNamespace, testRevision, WithProxyMode, + WithDeployRef(deployName), WithPrivateService, WithPubService, + ), + metric(testNamespace, testRevision), + deploy(testNamespace, testRevision, func(d *appsv1.Deployment) { + d.Spec.Replicas = ptr.Int32(1) + }), + }, + WantStatusUpdates: []clientgotesting.UpdateActionImpl{{ + Object: kpa(testNamespace, testRevision, + WithNoTraffic("Unreachable", "The target does not have an active routing state."), withScales(0, -1), WithReachabilityUnreachable, + WithPAMetricsService(privateSvc), WithObservedGeneration(1), + WithPAStatusService(testRevision), WithPASKSNotReady(""), + ), + }}, + }, + { + Name: "unreachable crashing - deployment becomes 0", + Key: key, + Ctx: context.WithValue(context.Background(), deciderKey{}, + decider(testNamespace, testRevision, 0 /* desiredScale */, 0 /* ebc */)), + Objects: []runtime.Object{ + kpa(testNamespace, testRevision, + WithNoTraffic("Unreachable", "The target does not have an active routing state."), WithReachabilityUnreachable, + WithObservedGeneration(1), WithPAStatusService(testRevision), WithPASKSNotReady(""), withScales(0, -1), + ), + sks( + testNamespace, testRevision, WithProxyMode, WithDeployRef(deployName), + WithPrivateService, WithPubService, markActivatorEndpointsPopulated, sksMarkOld, + ), + metric(testNamespace, testRevision), + deploy(testNamespace, testRevision, func(d *appsv1.Deployment) { + d.Spec.Replicas = ptr.Int32(1) + }), + }, + WantStatusUpdates: []clientgotesting.UpdateActionImpl{{ + Object: kpa(testNamespace, testRevision, + WithNoTraffic("Unreachable", "The target does not have an active routing state."), WithReachabilityUnreachable, + WithPAStatusService(testRevision), WithPAMetricsService(privateSvc), + WithObservedGeneration(1), withScales(0, 0), WithPASKSNotReady(""), + ), + }}, + WantPatches: []clientgotesting.PatchActionImpl{{ + ActionImpl: clientgotesting.ActionImpl{ + Namespace: testNamespace, + }, + Name: deployName, + Patch: []byte(`[{"op":"replace","path":"/spec/replicas","value":0}]`), + }}, + }, + } table.Test(t, MakeFactory(func(ctx context.Context, listers *Listers, cmw configmap.Watcher) controller.Reconciler { retryAttempted = false diff --git a/pkg/reconciler/autoscaling/kpa/scaler.go b/pkg/reconciler/autoscaling/kpa/scaler.go index 5cb95f2fc719..67ea7ac23f0c 100644 --- a/pkg/reconciler/autoscaling/kpa/scaler.go +++ b/pkg/reconciler/autoscaling/kpa/scaler.go @@ -328,7 +328,11 @@ func (ks *scaler) scale(ctx context.Context, pa *autoscalingv1alpha1.PodAutoscal if desiredScale < 0 && !pa.Status.IsActivating() { logger.Debug("Metrics are not yet being collected.") - return desiredScale, nil + + // if the target is unreachable and has no metrics. we still want to be able to scale it to 0 + if !pa.IsUnreachable() { + return desiredScale, nil + } } min, max := pa.ScaleBounds(asConfig) diff --git a/pkg/reconciler/autoscaling/kpa/scaler_test.go b/pkg/reconciler/autoscaling/kpa/scaler_test.go index 6b9b8e1f917a..d2205f7084f2 100644 --- a/pkg/reconciler/autoscaling/kpa/scaler_test.go +++ b/pkg/reconciler/autoscaling/kpa/scaler_test.go @@ -409,6 +409,27 @@ func TestScaler(t *testing.T) { wantScaling: false, paMutation: func(k *autoscalingv1alpha1.PodAutoscaler) { paMarkInactive(k, time.Now()) + WithReachabilityReachable(k) + }, + }, { + label: "negative scale, to zero if unreachable with no metrics", + startReplicas: 1, + scaleTo: -1, + wantReplicas: 0, + wantScaling: true, + paMutation: func(k *autoscalingv1alpha1.PodAutoscaler) { + paMarkInactive(k, time.Now().Add(-time.Hour)) + WithReachabilityUnreachable(k) + }, + }, { + label: "negative scale does not to zero if reachable with no metrics", + startReplicas: 1, + scaleTo: -1, + wantReplicas: -1, + wantScaling: false, + paMutation: func(k *autoscalingv1alpha1.PodAutoscaler) { + paMarkInactive(k, time.Now().Add(-time.Hour)) + WithReachabilityReachable(k) }, }, { label: "scales up from zero to desired one", @@ -430,6 +451,7 @@ func TestScaler(t *testing.T) { wantScaling: false, paMutation: func(k *autoscalingv1alpha1.PodAutoscaler) { paMarkActive(k, time.Now()) + WithReachabilityReachable(k) }, }, { label: "initial scale attained, but now time to scale down", diff --git a/pkg/testing/v1/configuration.go b/pkg/testing/v1/configuration.go index e84e532d9d6d..d026f5f75d2f 100644 --- a/pkg/testing/v1/configuration.go +++ b/pkg/testing/v1/configuration.go @@ -138,3 +138,10 @@ func WithConfigRevisionIdleTimeoutSeconds(revisionIdleTimeoutSeconds int64) Conf cfg.Spec.Template.Spec.IdleTimeoutSeconds = ptr.Int64(revisionIdleTimeoutSeconds) } } + +// WithConfigRevisionIdleTimeoutSeconds sets revision idle timeout +func WithConfigTEST(key, value string) ConfigOption { + return func(cfg *v1.Configuration) { + cfg.Spec.Template.ObjectMeta.Annotations[key] = value + } +} diff --git a/test/conformance.go b/test/conformance.go index ff31fd4d49f1..3bb848cdbc0a 100644 --- a/test/conformance.go +++ b/test/conformance.go @@ -53,6 +53,7 @@ const ( Timeout = "timeout" Volumes = "volumes" SlowStart = "slowstart" + DeadStart = "deadstart" // Constants for test image output. PizzaPlanetText1 = "What a spaceport!" diff --git a/test/e2e/dead_start_test.go b/test/e2e/dead_start_test.go new file mode 100644 index 000000000000..eb2ab2f65711 --- /dev/null +++ b/test/e2e/dead_start_test.go @@ -0,0 +1,261 @@ +//go:build e2e +// +build e2e + +/* +Copyright 2020 The Knative Authors + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package e2e + +import ( + "context" + "fmt" + "strconv" + "testing" + + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + pkgtest "knative.dev/pkg/test" + "knative.dev/serving/pkg/apis/serving" + v1 "knative.dev/serving/pkg/apis/serving/v1" + rtesting "knative.dev/serving/pkg/testing/v1" + v1options "knative.dev/serving/pkg/testing/v1" + "knative.dev/serving/test" + v1test "knative.dev/serving/test/v1" +) + +// withServiceImage sets the container image to be the provided string. +func withServiceImage(img string) v1options.ServiceOption { + return func(svc *v1.Service) { + svc.Spec.Template.Spec.PodSpec.Containers[0].Image = img + } +} + +func withConfigImage(img string) rtesting.ConfigOption { + return func(cfg *v1.Configuration) { + cfg.Spec.Template.Spec.Containers[0].Image = img + } +} + +func generationLabelSelector(config string, generation int) string { + return fmt.Sprintf("%s=%s,%s=%s", serving.ConfigurationLabelKey, config, "serving.knative.dev/configurationGeneration", strconv.Itoa(generation)) +} + +func revisionLabelSelector(revName string) string { + return fmt.Sprintf("%s=%s", serving.RevisionLabelKey, revName) +} + +func IsRevisionRestarting(clients *test.Clients) func(r *v1.Revision) (bool, error) { + return func(r *v1.Revision) (bool, error) { + // serving.knative.dev / revision = dead - start - to - healthy - tbibclgi - 00001 + pods := clients.KubeClient.CoreV1().Pods(r.GetNamespace()) + podList, err := pods.List(context.Background(), metav1.ListOptions{ + LabelSelector: revisionLabelSelector(r.Name), + FieldSelector: "status.phase!=Pending", + }) + if err != nil { + return false, err + } + + // verify the pods are restarting. + for i := range podList.Items { + conds := podList.Items[i].Status.ContainerStatuses + for j := range conds { + if conds[j].RestartCount > 0 { + return true, nil + } + } + } + return false, nil + } +} + +func IsRevisionScaledZero(clients *test.Clients) func(r *v1.Revision) (bool, error) { + return func(r *v1.Revision) (bool, error) { + // serving.knative.dev / revision = dead - start - to - healthy - tbibclgi - 00001 + pods := clients.KubeClient.CoreV1().Pods(r.GetNamespace()) + podList, err := pods.List(context.Background(), metav1.ListOptions{ + LabelSelector: revisionLabelSelector(r.Name), + FieldSelector: "status.phase!=Pending", + }) + if err != nil { + return false, err + } + gotPods := len(podList.Items) + + return gotPods == 0, nil + } +} + +// func latestRevisionName(t *testing.T, clients *test.Clients, configName, oldRevName string) string { +// // Wait for the Config have a LatestCreatedRevisionName +// if err := v1test.WaitForConfigurationState( +// clients.ServingClient, configName, +// func(c *v1.Configuration) (bool, error) { +// return c.Status.LatestCreatedRevisionName != oldRevName, nil +// }, "ConfigurationHasUpdatedCreatedRevision", +// ); err != nil { +// t.Fatalf("The Configuration %q has not updated LatestCreatedRevisionName from %q: %v", configName, oldRevName, err) +// } + +// config, err := clients.ServingClient.Configs.Get(context.Background(), configName, metav1.GetOptions{}) +// if err != nil { +// t.Fatal("Failed to get Configuration after it was seen to be live:", err) +// } + +// return config.Status.LatestCreatedRevisionName +// } + +// This test case creates a service which can never reach a ready state. +// The service is then udpated with a healthy image and is verified that +// the healthy revision is ready and the unhealhy revision is scaled to zero. +func TestDeadStartToHealthy(t *testing.T) { + t.Parallel() + + clients := Setup(t) + + names := test.ResourceNames{ + Config: test.ObjectNameForTest(t), + Service: test.ObjectNameForTest(t), + Route: test.ObjectNameForTest(t), + Image: test.DeadStart, + } + test.EnsureTearDown(t, clients, &names) + + var err error + const minScale = 3 + + t.Log("Creating route") + if _, err := v1test.CreateRoute(t, clients, names); err != nil { + t.Fatal("Failed to create Route:", err) + } + cfg, err := v1test.CreateConfiguration(t, clients, names, + withMinScale(minScale), + // rtesting.WithConfigTEST(serving.ProgressDeadlineAnnotationKey, "1s"), + rtesting.WithConfigRevisionTimeoutSeconds(1), + ) + if err != nil { + t.Fatal("Failed to create Configuration:", err) + } + + // time.Sleep(time.Hour) + failedRevName := latestRevisionName(t, clients, names.Config, "") + + t.Log("Waiting for revision to restart") + if err := v1test.WaitForRevisionState( + clients.ServingClient, failedRevName, IsRevisionRestarting(clients), "RevisionIsRestarting", + ); err != nil { + t.Fatalf("The Revision %q is not restarting: %v", failedRevName, err) + } + + t.Log("Updating configuration with valid image") + if _, err := v1test.PatchConfig(t, clients, cfg, withConfigImage(pkgtest.ImagePath(test.HelloWorld))); err != nil { + t.Fatal("Failed to update Configuration:", err) + } + + healthyRevName := latestRevisionName(t, clients, names.Config, failedRevName) + + t.Log("Waiting for revision to become ready") + if err := v1test.WaitForRevisionState( + clients.ServingClient, healthyRevName, v1test.IsRevisionReady, "RevisionIsReady", + ); err != nil { + t.Fatalf("The Revision %q did not become ready: %v", healthyRevName, err) + } + + t.Log("Waiting for revision scale zero") + if err := v1test.WaitForRevisionState( + clients.ServingClient, failedRevName, IsRevisionScaledZero(clients), "RevisionIsScaledZero", + ); err != nil { + t.Fatalf("The Revision %q is not restarting: %v", failedRevName, err) + } + + t.Log("Verify revision Active is Unreachable") + if err = v1test.CheckRevisionState(clients.ServingClient, failedRevName, func(r *v1.Revision) (bool, error) { + cond := r.Status.GetCondition(v1.RevisionConditionActive) + t.Logf("Revision %s Active state = %#v", failedRevName, cond) + if cond != nil { + if cond.Reason == "Unreachable" { + return true, nil + } + return true, fmt.Errorf("The Revision %s Active condition has: (Reason=%q, Message=%q)", + failedRevName, cond.Reason, cond.Message) + } + return false, fmt.Errorf("The Revision %s has empty Active condition", failedRevName) + }); err != nil { + t.Fatal("Failed to validate revision state:", err) + } + +} + +// This test case updates a healthy service with an image that can never reach a ready state. +// The healthy revision remains Ready and the DeadStart revision doesnt not scale down until ProgressDeadline is reached. +func TestDeadStartFromHealthy(t *testing.T) { + t.Parallel() + + clients := Setup(t) + + names := test.ResourceNames{ + Config: test.ObjectNameForTest(t), + Service: test.ObjectNameForTest(t), + Route: test.ObjectNameForTest(t), + Image: test.HelloWorld, + } + test.EnsureTearDown(t, clients, &names) + + var err error + const minScale = 3 + + t.Log("Creating route") + if _, err := v1test.CreateRoute(t, clients, names); err != nil { + t.Fatal("Failed to create Route:", err) + } + + cfg, err := v1test.CreateConfiguration(t, clients, names, withMinScale(minScale), + // rtesting.WithConfigAnn(serving.ProgressDeadlineAnnotationKey, "45s"), + rtesting.WithConfigRevisionTimeoutSeconds(1), + ) + if err != nil { + t.Fatal("Failed to create Configuration:", err) + } + + healthyRevName := latestRevisionName(t, clients, names.Config, "") + + t.Log("Waiting for revision to become ready") + if err := v1test.WaitForRevisionState( + clients.ServingClient, healthyRevName, v1test.IsRevisionReady, "RevisionIsReady", + ); err != nil { + t.Fatalf("The Revision %q did not become ready: %v", healthyRevName, err) + } + + t.Log("Updating configuration with valid image") + if _, err := v1test.PatchConfig(t, clients, cfg, withConfigImage(pkgtest.ImagePath(test.DeadStart))); err != nil { + t.Fatal("Failed to update Configuration:", err) + } + + failedRevName := latestRevisionName(t, clients, names.Config, healthyRevName) + + t.Log("Waiting for revision to restart") + if err := v1test.WaitForRevisionState( + clients.ServingClient, failedRevName, IsRevisionRestarting(clients), "RevisionIsRestarting", + ); err != nil { + t.Fatalf("The Revision %q is not restarting: %v", failedRevName, err) + } + + for i := 0; i < 100; i++ { + fmt.Printf("healthyRevName %v failedRevName %v\n", healthyRevName, failedRevName) + } + + // TODO: verify the restart is queued. + // it only shutsdown once progressdeadline reached. +} diff --git a/test/test_images/deadstart/README.md b/test/test_images/deadstart/README.md new file mode 100644 index 000000000000..76e3d29016c8 --- /dev/null +++ b/test/test_images/deadstart/README.md @@ -0,0 +1,4 @@ +# Deadstart test image + +This directory contains the test image used in the deadstart e2e tests. + diff --git a/test/test_images/deadstart/deadstart.go b/test/test_images/deadstart/deadstart.go new file mode 100644 index 000000000000..92d432342bac --- /dev/null +++ b/test/test_images/deadstart/deadstart.go @@ -0,0 +1,26 @@ +/* +Copyright 2018 The Knative Authors + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package main + +import ( + "os" +) + +func main() { + os.Exit(0) + +} diff --git a/test/test_images/deadstart/service.yaml b/test/test_images/deadstart/service.yaml new file mode 100644 index 000000000000..b14d3dea1679 --- /dev/null +++ b/test/test_images/deadstart/service.yaml @@ -0,0 +1,10 @@ +apiVersion: serving.knative.dev/v1 +kind: Service +metadata: + name: deadstart-test-image + namespace: default +spec: + template: + spec: + containers: + - image: ko://knative.dev/serving/test/test_images/deadstart