Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix crash-looping pods take a long time to terminate/clean #14607

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions pkg/apis/autoscaling/v1alpha1/pa_lifecycle.go
Original file line number Diff line number Diff line change
Expand Up @@ -187,6 +187,14 @@ func (pa *PodAutoscaler) IsReady() bool {
pas.GetCondition(PodAutoscalerConditionReady).IsTrue()
}

func (pa *PodAutoscaler) IsUnreachable() bool {
return pa.Spec.Reachability == ReachabilityUnreachable
}

func (pa *PodAutoscaler) IsReachable() bool {
return pa.Spec.Reachability == ReachabilityReachable
}

// IsActive returns true if the pod autoscaler has finished scaling.
func (pas *PodAutoscalerStatus) IsActive() bool {
return pas.GetCondition(PodAutoscalerConditionActive).IsTrue()
Expand Down
64 changes: 43 additions & 21 deletions pkg/reconciler/autoscaling/kpa/kpa.go
Original file line number Diff line number Diff line change
Expand Up @@ -259,40 +259,62 @@ func reportMetrics(pa *autoscalingv1alpha1.PodAutoscaler, pc podCounts) {
// | -1 | >= min | >0 | activating | active |
// | -1 | >= min | >0 | active | active |
func computeActiveCondition(ctx context.Context, pa *autoscalingv1alpha1.PodAutoscaler, pc podCounts) {

minReady := activeThreshold(ctx, pa)
if pc.ready >= minReady && pa.Status.ServiceName != "" {
isServiceCreated := pa.Status.ServiceName != ""

// Mark the pa as Initialized if the service is created and reached the initial scale.
if pc.ready >= minReady && isServiceCreated {
pa.Status.MarkScaleTargetInitialized()
}

switch {
// Need to check for minReady = 0 because in the initialScale 0 case, pc.want will be -1.
case pc.want == 0 || minReady == 0:
if pa.Status.IsActivating() && minReady > 0 {
// We only ever scale to zero while activating if we fail to activate within the progress deadline.
pa.Status.MarkInactive("TimedOut", "The target could not be activated.")
} else {
// If the service is not created yet, it is either queued or inctive if initial scale zero.
if !isServiceCreated {

if minReady == 0 {
pa.Status.MarkInactive(noTrafficReason, "The target is not receiving traffic.")
} else {
pa.Status.MarkActivating(
"Queued", "Requests to the target are being buffered as resources are provisioned.")
}
return
}

case pc.ready < minReady:
if pc.want > 0 || !pa.Status.IsInactive() {
// If the target is not initialized, it will be queued.
if !pa.Status.IsScaleTargetInitialized() {
if pa.IsUnreachable() {
pa.Status.MarkInactive("Unreachable", "The target does not have an active routing state.")
} else {
pa.Status.MarkActivating(
"Queued", "Requests to the target are being buffered as resources are provisioned.")
}
return
}

switch {
case pc.want == 0:
// The kpa is trying to scale to 0.
if pa.Status.IsActivating() {
// The pa is stuck in activating and the progressDeadline is forcing a scale to zero.
pa.Status.MarkInactive("TimedOut", "The target could not be activated.")
} else {
// This is for the initialScale 0 case. In the first iteration, minReady is 0,
// but for the following iterations, minReady is 1. pc.want will continue being
// -1 until we start receiving metrics, so we will end up here.
// Even though PA is already been marked as inactive in the first iteration, we
// still need to set it again. Otherwise reconciliation will fail with NewObservedGenFailure
// because we cannot go through one iteration of reconciliation without setting
// some status.
andrew-delph marked this conversation as resolved.
Show resolved Hide resolved
// The pa is no longer receiving traffic.
pa.Status.MarkInactive(noTrafficReason, "The target is not receiving traffic.")
andrew-delph marked this conversation as resolved.
Show resolved Hide resolved
}

case pc.ready >= minReady:
if pc.want > 0 || !pa.Status.IsInactive() {
pa.Status.MarkActive()
}
case pc.ready >= minReady && pc.ready > 0:
// To be active there must be atleast 1 active pod.
// This takes into consideration when InitialScale is zero.
pa.Status.MarkActive()

case pc.want < 0:
// The decider is not receiving metrics because the pa scaled to zero.
pa.Status.MarkInactive(noTrafficReason, "The target is not receiving traffic.")

default:
// The desired scale has not been reached yet.
pa.Status.MarkActivating(
"Queued", "Requests to the target are being buffered as resources are provisioned.")
}
}

Expand Down
147 changes: 133 additions & 14 deletions pkg/reconciler/autoscaling/kpa/kpa_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ import (
// These are the fake informers we want setup.
fakenetworkingclient "knative.dev/networking/pkg/client/injection/client/fake"
fakesksinformer "knative.dev/networking/pkg/client/injection/informers/networking/v1alpha1/serverlessservice/fake"
"knative.dev/pkg/apis"
fakekubeclient "knative.dev/pkg/client/injection/kube/client/fake"
fakefilteredpodsinformer "knative.dev/pkg/client/injection/kube/informers/core/v1/pod/filtered/fake"
_ "knative.dev/pkg/client/injection/kube/informers/core/v1/service/fake"
Expand Down Expand Up @@ -174,9 +175,9 @@ func newConfigWatcher() configmap.Watcher {
})
}

func withScales(g, w int32) PodAutoscalerOption {
func withScales(actualScale, desiredScale int32) PodAutoscalerOption {
return func(pa *autoscalingv1alpha1.PodAutoscaler) {
pa.Status.DesiredScale, pa.Status.ActualScale = ptr.Int32(w), ptr.Int32(g)
pa.Status.DesiredScale, pa.Status.ActualScale = ptr.Int32(desiredScale), ptr.Int32(actualScale)
}
}

Expand Down Expand Up @@ -211,8 +212,20 @@ func sks(ns, n string, so ...SKSOption) *nv1a1.ServerlessService {
return s
}

func markOld(pa *autoscalingv1alpha1.PodAutoscaler) {
pa.Status.Conditions[0].LastTransitionTime.Inner.Time = time.Now().Add(-1 * time.Hour)
func sksMarkOld(sks *nv1a1.ServerlessService) {
for i := range sks.Status.Conditions {
sks.Status.Conditions[i].LastTransitionTime = apis.VolatileTime{Inner: metav1.NewTime(time.Now().Add(-time.Hour))}
}
}

func markActivatorEndpointsPopulated(sks *nv1a1.ServerlessService) {
sks.Status.MarkActivatorEndpointsPopulated()
}

func kpaMarkOld(pa *autoscalingv1alpha1.PodAutoscaler) {
for i := range pa.Status.Conditions {
pa.Status.Conditions[i].LastTransitionTime = apis.VolatileTime{Inner: metav1.NewTime(time.Now().Add(-time.Hour))}
}
}

func markScaleTargetInitialized(pa *autoscalingv1alpha1.PodAutoscaler) {
Expand Down Expand Up @@ -467,7 +480,7 @@ func TestReconcile(t *testing.T) {
Key: key,
Objects: []runtime.Object{
kpa(testNamespace, testRevision, WithScaleTargetInitialized, WithNoTraffic(noTrafficReason, "The target is not receiving traffic."),
withScales(0, defaultScale), WithPAStatusService(testRevision), WithPAMetricsService(privateSvc)),
withScales(0, defaultScale), WithPAStatusService(testRevision), WithPAMetricsService(privateSvc), WithReachabilityReachable),
// SKS is ready here, since its endpoints are populated with Activator endpoints.
sks(testNamespace, testRevision, WithProxyMode, WithDeployRef(deployName), WithSKSReady),
metric(testNamespace, testRevision),
Expand All @@ -480,7 +493,7 @@ func TestReconcile(t *testing.T) {
WantStatusUpdates: []clientgotesting.UpdateActionImpl{{
Object: kpa(testNamespace, testRevision, WithScaleTargetInitialized, WithBufferedTraffic, withScales(0, defaultScale),
WithPASKSReady, WithPAMetricsService(privateSvc),
WithPAStatusService(testRevision), WithObservedGeneration(1)),
WithPAStatusService(testRevision), WithObservedGeneration(1), WithReachabilityReachable),
}},
}, {
Name: "sks is still not ready",
Expand Down Expand Up @@ -708,7 +721,7 @@ func TestReconcile(t *testing.T) {
Objects: []runtime.Object{
kpa(testNamespace, testRevision, WithScaleTargetInitialized, withScales(0, 0),
WithNoTraffic(noTrafficReason, "The target is not receiving traffic."),
WithPASKSReady, markOld, WithPAStatusService(testRevision),
WithPASKSReady, kpaMarkOld, WithPAStatusService(testRevision),
WithPAMetricsService(privateSvc), WithObservedGeneration(1)),
sks(testNamespace, testRevision, WithDeployRef(deployName), WithProxyMode, WithSKSReady),
metric(testNamespace, testRevision),
Expand All @@ -724,7 +737,7 @@ func TestReconcile(t *testing.T) {
Objects: []runtime.Object{
kpa(testNamespace, testRevision, WithScaleTargetInitialized, withScales(0, 0),
WithNoTraffic(noTrafficReason, "The target is not receiving traffic."),
WithPASKSReady, markOld, WithPAStatusService(testRevision),
WithPASKSReady, kpaMarkOld, WithPAStatusService(testRevision),
WithPAMetricsService(privateSvc), WithObservedGeneration(1)),
sks(testNamespace, testRevision, WithDeployRef(deployName), WithProxyMode, WithSKSReady),
metric(testNamespace, testRevision),
Expand All @@ -743,7 +756,7 @@ func TestReconcile(t *testing.T) {
Ctx: context.WithValue(context.Background(), deciderKey{},
decider(testNamespace, testRevision, 0 /* desiredScale */, 0 /* ebc */)),
Objects: []runtime.Object{
kpa(testNamespace, testRevision, WithPASKSReady, WithTraffic, markOld,
kpa(testNamespace, testRevision, WithPASKSReady, WithTraffic, kpaMarkOld,
withScales(0, 0), WithPAStatusService(testRevision), WithPAMetricsService(privateSvc)),
defaultSKS,
metric(testNamespace, testRevision),
Expand Down Expand Up @@ -776,14 +789,14 @@ func TestReconcile(t *testing.T) {
Ctx: context.WithValue(context.Background(), deciderKey{},
decider(testNamespace, testRevision, 0 /* desiredScale */, 0 /* ebc */)),
Objects: []runtime.Object{
kpa(testNamespace, testRevision, WithPASKSReady, WithBufferedTraffic, markOld,
kpa(testNamespace, testRevision, WithPASKSReady, WithBufferedTraffic, kpaMarkOld, WithReachabilityUnreachable,
WithPAStatusService(testRevision), withScales(0, 0),
WithPAMetricsService(privateSvc)),
defaultSKS,
metric(testNamespace, testRevision),
deploy(testNamespace, testRevision), defaultReady},
WantStatusUpdates: []clientgotesting.UpdateActionImpl{{
Object: kpa(testNamespace, testRevision, markScaleTargetInitialized, WithPASKSReady, WithPAMetricsService(privateSvc),
Object: kpa(testNamespace, testRevision, markScaleTargetInitialized, WithPASKSReady, WithPAMetricsService(privateSvc), WithReachabilityUnreachable,
WithNoTraffic("TimedOut", "The target could not be activated."), withScales(1, 0),
WithPAStatusService(testRevision), WithPAMetricsService(privateSvc),
WithObservedGeneration(1)),
Expand Down Expand Up @@ -932,13 +945,13 @@ func TestReconcile(t *testing.T) {
Object: activeKPAMinScale(overscale, defaultScale),
}},
}, {
Name: "scaled-to-0-no-scale-data",
Name: "scaled to 0 no scale data",
Key: key,
Ctx: context.WithValue(context.Background(), deciderKey{},
decider(testNamespace, testRevision, unknownScale, /* desiredScale */
0 /* ebc */)),
Objects: []runtime.Object{
kpa(testNamespace, testRevision, WithScaleTargetInitialized, WithPASKSReady,
kpa(testNamespace, testRevision, WithScaleTargetInitialized, WithReachabilityReachable, WithPASKSReady,
WithNoTraffic(noTrafficReason, "The target is not receiving traffic."), WithPAMetricsService(privateSvc),
withScales(0, -1), WithPAStatusService(testRevision), WithObservedGeneration(1)),
sks(testNamespace, testRevision, WithDeployRef(deployName),
Expand Down Expand Up @@ -1071,6 +1084,28 @@ func TestReconcile(t *testing.T) {
WithPASKSNotReady(""), WithPAStatusService(testRevision),
),
}},
}, {
Name: "initial scale zero: with ready pod",
Key: key,
Ctx: context.WithValue(context.WithValue(context.Background(), asConfigKey{}, initialScaleZeroASConfig()), deciderKey{},
decider(testNamespace, testRevision, 0 /* desiredScale */, -42 /* ebc */)),
Objects: append([]runtime.Object{
kpa(testNamespace, testRevision, markScaleTargetInitialized, withScales(1, 0),
WithReachabilityReachable, WithPAStatusService(testRevision), WithPAMetricsService(privateSvc),
WithPASKSReady, WithNoTraffic(noTrafficReason, "The target is not receiving traffic."),
),
sks(testNamespace, testRevision, WithDeployRef(deployName), WithProxyMode, WithSKSReady, WithNumActivators(defaultAct)),
metric(testNamespace, testRevision),
deploy(testNamespace, testRevision, func(d *appsv1.Deployment) {
d.Spec.Replicas = ptr.Int32(1)
}),
}, makeReadyPods(1, testNamespace, testRevision)...),
WantStatusUpdates: []clientgotesting.UpdateActionImpl{{
Object: kpa(testNamespace, testRevision, WithTraffic, WithPASKSReady, markScaleTargetInitialized,
withScales(1, 0), WithReachabilityReachable, WithPAStatusService(testRevision),
WithPAMetricsService(privateSvc), WithObservedGeneration(1), WithNoTraffic(noTrafficReason, "The target is not receiving traffic."),
),
}},
}, {
Name: "initial scale zero: sks ServiceName empty",
Key: key,
Expand Down Expand Up @@ -1171,7 +1206,91 @@ func TestReconcile(t *testing.T) {
WantUpdates: []clientgotesting.UpdateActionImpl{{
Object: defaultProxySKS,
}},
}}
}, {
Name: "image pull error - stay queued", // The service is created and not initialized
Key: key,
Ctx: context.WithValue(context.Background(), deciderKey{},
decider(testNamespace, testRevision, -1, /* desiredScale */
0 /* ebc */)),
Objects: []runtime.Object{
kpa(testNamespace, testRevision, WithPASKSNotReady(""), withScales(0, -1), WithReachabilityReachable,
WithPAMetricsService(privateSvc)),
// SKS won't be ready bc no ready endpoints, but private service name will be populated.
sks(testNamespace, testRevision, WithProxyMode, WithDeployRef(deployName), WithPrivateService, WithPubService),
metric(testNamespace, testRevision),
deploy(testNamespace, testRevision, func(d *appsv1.Deployment) {
d.Spec.Replicas = ptr.Int32(0)
}),
},
WantStatusUpdates: []clientgotesting.UpdateActionImpl{{
Object: kpa(testNamespace, testRevision, WithPASKSNotReady(""),
WithBufferedTraffic,
withScales(0, -1), WithReachabilityReachable,
WithPAMetricsService(privateSvc), WithObservedGeneration(1),
WithPAStatusService(testRevision),
),
}}},
{
Name: "unreachable crashing - pa becomes Failed.",
Key: key,
Ctx: context.WithValue(context.Background(), deciderKey{},
decider(testNamespace, testRevision, -1 /* desiredScale */, 0 /* ebc */)),
Objects: []runtime.Object{
kpa(testNamespace,
testRevision, WithReachabilityUnreachable,
WithPAMetricsService(privateSvc), WithPASKSNotReady(noPrivateServiceName)),
sks(
testNamespace, testRevision, WithProxyMode,
WithDeployRef(deployName), WithPrivateService, WithPubService,
),
metric(testNamespace, testRevision),
deploy(testNamespace, testRevision, func(d *appsv1.Deployment) {
d.Spec.Replicas = ptr.Int32(1)
}),
},
WantStatusUpdates: []clientgotesting.UpdateActionImpl{{
Object: kpa(testNamespace, testRevision,
WithNoTraffic("Unreachable", "The target does not have an active routing state."), withScales(0, -1), WithReachabilityUnreachable,
WithPAMetricsService(privateSvc), WithObservedGeneration(1),
WithPAStatusService(testRevision), WithPASKSNotReady(""),
),
}},
},
{
Name: "unreachable crashing - deployment becomes 0",
Key: key,
Ctx: context.WithValue(context.Background(), deciderKey{},
decider(testNamespace, testRevision, 0 /* desiredScale */, 0 /* ebc */)),
Objects: []runtime.Object{
kpa(testNamespace, testRevision,
WithNoTraffic("Unreachable", "The target does not have an active routing state."), WithReachabilityUnreachable,
WithObservedGeneration(1), WithPAStatusService(testRevision), WithPASKSNotReady(""), withScales(0, -1),
),
sks(
testNamespace, testRevision, WithProxyMode, WithDeployRef(deployName),
WithPrivateService, WithPubService, markActivatorEndpointsPopulated, sksMarkOld,
),
metric(testNamespace, testRevision),
deploy(testNamespace, testRevision, func(d *appsv1.Deployment) {
d.Spec.Replicas = ptr.Int32(1)
}),
},
WantStatusUpdates: []clientgotesting.UpdateActionImpl{{
Object: kpa(testNamespace, testRevision,
WithNoTraffic("Unreachable", "The target does not have an active routing state."), WithReachabilityUnreachable,
WithPAStatusService(testRevision), WithPAMetricsService(privateSvc),
WithObservedGeneration(1), withScales(0, 0), WithPASKSNotReady(""),
),
}},
WantPatches: []clientgotesting.PatchActionImpl{{
ActionImpl: clientgotesting.ActionImpl{
Namespace: testNamespace,
},
Name: deployName,
Patch: []byte(`[{"op":"replace","path":"/spec/replicas","value":0}]`),
}},
},
}

table.Test(t, MakeFactory(func(ctx context.Context, listers *Listers, cmw configmap.Watcher) controller.Reconciler {
retryAttempted = false
Expand Down
6 changes: 5 additions & 1 deletion pkg/reconciler/autoscaling/kpa/scaler.go
Original file line number Diff line number Diff line change
Expand Up @@ -328,7 +328,11 @@ func (ks *scaler) scale(ctx context.Context, pa *autoscalingv1alpha1.PodAutoscal

if desiredScale < 0 && !pa.Status.IsActivating() {
logger.Debug("Metrics are not yet being collected.")
return desiredScale, nil

// if the target is unreachable and has no metrics. we still want to be able to scale it to 0
if !pa.IsUnreachable() {
return desiredScale, nil
}
}

min, max := pa.ScaleBounds(asConfig)
Expand Down
22 changes: 22 additions & 0 deletions pkg/reconciler/autoscaling/kpa/scaler_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -409,6 +409,27 @@ func TestScaler(t *testing.T) {
wantScaling: false,
paMutation: func(k *autoscalingv1alpha1.PodAutoscaler) {
paMarkInactive(k, time.Now())
WithReachabilityReachable(k)
},
}, {
label: "negative scale, to zero if unreachable with no metrics",
startReplicas: 1,
scaleTo: -1,
wantReplicas: 0,
wantScaling: true,
paMutation: func(k *autoscalingv1alpha1.PodAutoscaler) {
paMarkInactive(k, time.Now().Add(-time.Hour))
WithReachabilityUnreachable(k)
},
}, {
label: "negative scale does not to zero if reachable with no metrics",
startReplicas: 1,
scaleTo: -1,
wantReplicas: -1,
wantScaling: false,
paMutation: func(k *autoscalingv1alpha1.PodAutoscaler) {
paMarkInactive(k, time.Now().Add(-time.Hour))
WithReachabilityReachable(k)
},
}, {
label: "scales up from zero to desired one",
Expand All @@ -430,6 +451,7 @@ func TestScaler(t *testing.T) {
wantScaling: false,
paMutation: func(k *autoscalingv1alpha1.PodAutoscaler) {
paMarkActive(k, time.Now())
WithReachabilityReachable(k)
},
}, {
label: "initial scale attained, but now time to scale down",
Expand Down
Loading