From 76d7aeffa71fca073997e4b8901df177e4aa5974 Mon Sep 17 00:00:00 2001 From: gabemontero Date: Thu, 18 Jan 2024 16:45:31 -0500 Subject: [PATCH] do not allow negative requeue times Use of the value of 0 for the taskrun/pipeline timeout, which per https://github.com/tektoncd/pipeline/blob/main/docs/pipelineruns.md#configuring-a-failure-timeout for example means timeout is disabled, results in the waitTime passed to the Requeue event to be negative. This had the observed behavior of Requeue'ing immediately, and intense cycles of many reconcilations per second were observed if the TaskRun's/PipelineRun's state did not in fact change. This artificially constrained the peformance of the pipeline controller. This change makes sure the wait time passed to the Requeue is not negative. rh-pre-commit.version: 2.1.0 rh-pre-commit.check-secrets: ENABLED --- docs/pipelineruns.md | 2 + docs/taskruns.md | 2 + pkg/reconciler/pipelinerun/pipelinerun.go | 16 +++- .../pipelinerun/pipelinerun_test.go | 90 +++++++++++++++++++ pkg/reconciler/taskrun/taskrun.go | 7 +- pkg/reconciler/taskrun/taskrun_test.go | 85 ++++++++++++++++++ 6 files changed, 199 insertions(+), 3 deletions(-) diff --git a/docs/pipelineruns.md b/docs/pipelineruns.md index 1fd3a008537..735886af02b 100644 --- a/docs/pipelineruns.md +++ b/docs/pipelineruns.md @@ -1424,6 +1424,8 @@ If you set the timeout to 0, the `PipelineRun` fails immediately upon encounteri > :warning: ** `timeout` is deprecated and will be removed in future versions. Consider using `timeouts` instead. +> :note: An internal detail of the `PipelineRun` and `TaskRun` reconcilers in the Tekton controller is that it will requeue a `PipelineRun` or `TaskRun` for re-evaluation, versus waiting for the next update, under certain conditions. The wait time for that re-queueing is the elapsed time subtracted from the timeout; however, if the timeout is set to '0', that calculation produces a negative number, and the new reconciliation event will fire immediately, which can impact overall performance, which is counter to the intent of wait time calculation. So instead, the reconcilers will use the configured global timeout as the wait time when the associated timeout has been set to '0'. + ## `PipelineRun` status ### The `status` field diff --git a/docs/taskruns.md b/docs/taskruns.md index 6db9c375f1b..ebb9264bee6 100644 --- a/docs/taskruns.md +++ b/docs/taskruns.md @@ -771,6 +771,8 @@ a different global default timeout value using the `default-timeout-minutes` fie all `TaskRuns` that do not have a timeout set will have no timeout and will run until it completes successfully or fails from an error. +> :note: An internal detail of the `PipelineRun` and `TaskRun` reconcilers in the Tekton controller is that it will requeue a `PipelineRun` or `TaskRun` for re-evaluation, versus waiting for the next update, under certain conditions. The wait time for that re-queueing is the elapsed time subtracted from the timeout; however, if the timeout is set to '0', that calculation produces a negative number, and the new reconciliation event will fire immediately, which can impact overall performance, which is counter to the intent of wait time calculation. So instead, the reconcilers will use the configured global timeout as the wait time when the associated timeout has been set to '0'. + ### Specifying `ServiceAccount` credentials You can execute the `Task` in your `TaskRun` with a specific set of credentials by diff --git a/pkg/reconciler/pipelinerun/pipelinerun.go b/pkg/reconciler/pipelinerun/pipelinerun.go index b9a01f558e5..c163438ca81 100644 --- a/pkg/reconciler/pipelinerun/pipelinerun.go +++ b/pkg/reconciler/pipelinerun/pipelinerun.go @@ -25,6 +25,7 @@ import ( "reflect" "regexp" "strings" + "time" "github.com/hashicorp/go-multierror" "github.com/tektoncd/pipeline/pkg/apis/config" @@ -273,9 +274,20 @@ func (c *Reconciler) ReconcileKind(ctx context.Context, pr *v1.PipelineRun) pkgr // Compute the time since the task started. elapsed := c.Clock.Since(pr.Status.StartTime.Time) // Snooze this resource until the appropriate timeout has elapsed. - waitTime := pr.PipelineTimeout(ctx) - elapsed - if pr.Status.FinallyStartTime == nil && pr.TasksTimeout() != nil { + // but if the timeout has been disabled by setting timeout to 0, we + // do not want to subtract from 0, because a negative wait time will + // result in the requeue happening essentially immediately + timeout := pr.PipelineTimeout(ctx) + taskTimeout := pr.TasksTimeout() + waitTime := timeout - elapsed + if timeout == config.NoTimeoutDuration { + waitTime = time.Duration(config.FromContextOrDefaults(ctx).Defaults.DefaultTimeoutMinutes) * time.Minute + } + if pr.Status.FinallyStartTime == nil && taskTimeout != nil { waitTime = pr.TasksTimeout().Duration - elapsed + if taskTimeout.Duration == config.NoTimeoutDuration { + waitTime = time.Duration(config.FromContextOrDefaults(ctx).Defaults.DefaultTimeoutMinutes) * time.Minute + } } else if pr.Status.FinallyStartTime != nil && pr.FinallyTimeout() != nil { finallyWaitTime := pr.FinallyTimeout().Duration - c.Clock.Since(pr.Status.FinallyStartTime.Time) if finallyWaitTime < waitTime { diff --git a/pkg/reconciler/pipelinerun/pipelinerun_test.go b/pkg/reconciler/pipelinerun/pipelinerun_test.go index 1b100cd1b71..6af32515743 100644 --- a/pkg/reconciler/pipelinerun/pipelinerun_test.go +++ b/pkg/reconciler/pipelinerun/pipelinerun_test.go @@ -2486,6 +2486,96 @@ spec: } } +func TestReconcileWithTimeoutDisabled(t *testing.T) { + testCases := []struct { + name string + timeout time.Duration + }{ + { + name: "pipeline timeout is 24h", + timeout: 24 * time.Hour, + }, + { + name: "pipeline timeout is way longer than 24h", + timeout: 360 * time.Hour, + }, + } + + for _, tc := range testCases { + startTime := time.Date(2022, time.January, 1, 0, 0, 0, 0, time.UTC).Add(-3 * tc.timeout) + t.Run(tc.name, func(t *testing.T) { + ps := []*v1.Pipeline{parse.MustParseV1Pipeline(t, ` +metadata: + name: test-pipeline + namespace: foo +spec: + tasks: + - name: hello-world-1 + taskRef: + name: hello-world + - name: hello-world-2 + taskRef: + name: hello-world +`)} + prs := []*v1.PipelineRun{parse.MustParseV1PipelineRun(t, ` +metadata: + name: test-pipeline-run-with-timeout-disabled + namespace: foo +spec: + pipelineRef: + name: test-pipeline + taskRunTemplate: + serviceAccountName: test-sa + timeouts: + pipeline: 0h0m0s +status: + startTime: "2021-12-30T00:00:00Z" +`)} + ts := []*v1.Task{simpleHelloWorldTask} + + trs := []*v1.TaskRun{mustParseTaskRunWithObjectMeta(t, taskRunObjectMeta("test-pipeline-run-with-timeout-hello-world-1", "foo", "test-pipeline-run-with-timeout-disabled", + "test-pipeline", "hello-world-1", false), ` +spec: + serviceAccountName: test-sa + taskRef: + name: hello-world + kind: Task +`)} + start := metav1.NewTime(startTime) + prs[0].Status.StartTime = &start + + d := test.Data{ + PipelineRuns: prs, + Pipelines: ps, + Tasks: ts, + TaskRuns: trs, + } + prt := newPipelineRunTest(t, d) + defer prt.Cancel() + + c := prt.TestAssets.Controller + clients := prt.TestAssets.Clients + reconcileError := c.Reconciler.Reconcile(prt.TestAssets.Ctx, "foo/test-pipeline-run-with-timeout-disabled") + if reconcileError == nil { + t.Errorf("expected error, but got nil") + } + if isRequeueError, requeueDuration := controller.IsRequeueKey(reconcileError); !isRequeueError { + t.Errorf("Expected requeue error, but got: %s", reconcileError.Error()) + } else if requeueDuration < 0 { + t.Errorf("Expected a positive requeue duration but got %s", requeueDuration.String()) + } + prt.Test.Logf("Getting reconciled run") + reconciledRun, err := clients.Pipeline.TektonV1().PipelineRuns("foo").Get(prt.TestAssets.Ctx, "test-pipeline-run-with-timeout-disabled", metav1.GetOptions{}) + if err != nil { + prt.Test.Errorf("Somehow had error getting reconciled run out of fake client: %s", err) + } + if reconciledRun.Status.GetCondition(apis.ConditionSucceeded).Reason == "PipelineRunTimeout" { + t.Errorf("Expected PipelineRun to not be timed out, but it is timed out") + } + }) + } +} + func TestReconcileWithTimeoutForALongTimeAndEtcdLimit_Pipeline(t *testing.T) { timeout := 12 * time.Hour testCases := []struct { diff --git a/pkg/reconciler/taskrun/taskrun.go b/pkg/reconciler/taskrun/taskrun.go index 09ca1b7a151..99213bb5f87 100644 --- a/pkg/reconciler/taskrun/taskrun.go +++ b/pkg/reconciler/taskrun/taskrun.go @@ -211,7 +211,12 @@ func (c *Reconciler) ReconcileKind(ctx context.Context, tr *v1.TaskRun) pkgrecon // Compute the time since the task started. elapsed := c.Clock.Since(tr.Status.StartTime.Time) // Snooze this resource until the timeout has elapsed. - return controller.NewRequeueAfter(tr.GetTimeout(ctx) - elapsed) + timeout := tr.GetTimeout(ctx) + waitTime := timeout - elapsed + if timeout == config.NoTimeoutDuration { + waitTime = time.Duration(config.FromContextOrDefaults(ctx).Defaults.DefaultTimeoutMinutes) * time.Minute + } + return controller.NewRequeueAfter(waitTime) } return nil } diff --git a/pkg/reconciler/taskrun/taskrun_test.go b/pkg/reconciler/taskrun/taskrun_test.go index c681f8ae04b..7eb2e9eb39a 100644 --- a/pkg/reconciler/taskrun/taskrun_test.go +++ b/pkg/reconciler/taskrun/taskrun_test.go @@ -2777,6 +2777,91 @@ status: } } +func TestReconcileWithTimeoutDisabled(t *testing.T) { + type testCase struct { + name string + taskRun *v1.TaskRun + } + + testcases := []testCase{ + { + name: "taskrun with timeout", + taskRun: parse.MustParseV1TaskRun(t, ` +metadata: + name: test-taskrun-timeout + namespace: foo +spec: + taskRef: + name: test-task + timeout: 10m +status: + conditions: + - status: Unknown + type: Succeeded +`), + }, { + name: "taskrun with default timeout", + taskRun: parse.MustParseV1TaskRun(t, ` +metadata: + name: test-taskrun-default-timeout-60-minutes + namespace: foo +spec: + taskRef: + name: test-task +status: + conditions: + - status: Unknown + type: Succeeded +`), + }, { + name: "task run with timeout set to 0 to disable", + taskRun: parse.MustParseV1TaskRun(t, ` +metadata: + name: test-taskrun-timeout-disabled + namespace: foo +spec: + taskRef: + name: test-task + timeout: 0s +status: + conditions: + - status: Unknown + type: Succeeded +`), + }} + + for _, tc := range testcases { + t.Run(tc.name, func(t *testing.T) { + start := metav1.NewTime(time.Now()) + tc.taskRun.Status.StartTime = &start + pod, err := makePod(tc.taskRun, simpleTask) + d := test.Data{ + TaskRuns: []*v1.TaskRun{tc.taskRun}, + Tasks: []*v1.Task{simpleTask}, + Pods: []*corev1.Pod{pod}, + } + testAssets, cancel := getTaskRunController(t, d) + defer cancel() + c := testAssets.Controller + clients := testAssets.Clients + + err = c.Reconciler.Reconcile(testAssets.Ctx, getRunName(tc.taskRun)) + if err == nil { + t.Errorf("expected error when reconciling completed TaskRun : %v", err) + } + if isRequeueError, requeueDuration := controller.IsRequeueKey(err); !isRequeueError { + t.Errorf("Expected requeue error, but got: %s", err.Error()) + } else if requeueDuration < 0 { + t.Errorf("Expected a positive requeue duration but got %s", requeueDuration.String()) + } + _, err = clients.Pipeline.TektonV1().TaskRuns(tc.taskRun.Namespace).Get(testAssets.Ctx, tc.taskRun.Name, metav1.GetOptions{}) + if err != nil { + t.Errorf("Expected completed TaskRun %s to exist but instead got error when getting it: %v", tc.taskRun.Name, err) + } + }) + } +} + func TestReconcileTimeouts(t *testing.T) { type testCase struct { name string