Skip to content

Commit

Permalink
Start autoupdate_agent_rollout controller in auth service (#49101)
Browse files Browse the repository at this point in the history
* run autoupdate_agent_rollout controller

* Recover from panics inside the controller

* Address tim's feedback

Co-authored-by: rosstimothy <[email protected]>

---------

Co-authored-by: rosstimothy <[email protected]>
  • Loading branch information
hugoShaka and rosstimothy authored Nov 27, 2024
1 parent ace61e4 commit b207bb7
Show file tree
Hide file tree
Showing 7 changed files with 216 additions and 26 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/

package rolloutcontroller
package rollout

import (
"context"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/

package rolloutcontroller
package rollout

import (
"context"
Expand All @@ -29,7 +29,7 @@ import (
)

// mockClient is a mock implementation of the Client interface for testing purposes.
// This is used to precisely check which calls are made by the Reconciler during tests.
// This is used to precisely check which calls are made by the reconciler during tests.
// Use newMockClient to create one from stubs. Once the test is over, you must call
// mockClient.checkIfEmpty to validate all expected calls were made.
type mockClient struct {
Expand Down
106 changes: 106 additions & 0 deletions lib/autoupdate/rollout/controller.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
/*
* Teleport
* Copyright (C) 2024 Gravitational, Inc.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/

package rollout

import (
"context"
"log/slog"
"time"

"github.com/gravitational/trace"
"github.com/jonboulle/clockwork"

"github.com/gravitational/teleport/api/utils/retryutils"
"github.com/gravitational/teleport/lib/utils/interval"
)

const (
// reconcilerPeriod is the base interval between two reconciliation attempts.
// Controller.Run uses it both as the delay before the first tick and as the
// period of the following ticks (jitter is applied on top of it).
reconcilerPeriod = time.Minute
)

// Controller wakes up every minute to reconcile the autoupdate_agent_rollout resource.
// See the reconciler godoc for more details about the reconciliation process.
// We currently wake up every minute, in the future we might decide to also watch for events
// (from autoupdate_config and autoupdate_version changefeed) to react faster.
type Controller struct {
// TODO(hugoShaka) add prometheus metrics describing the reconciliation status
// reconciler performs the actual reconciliation; it is invoked on every tick of Run.
reconciler reconciler
// clock is handed to the interval that paces the reconciliation loop in Run.
clock clockwork.Clock
// log receives the controller's shutdown, reconciliation-failure and panic-recovery messages.
log *slog.Logger
}

// NewController builds a Controller for the autoupdate_agent_rollout kind.
//
// All three dependencies are mandatory: a trace.BadParameter error is returned
// if client, log or clock is nil. The same logger is shared between the
// controller and the reconciler it wraps.
func NewController(client Client, log *slog.Logger, clock clockwork.Clock) (*Controller, error) {
	// Validate the dependencies in declaration order so the first missing one is reported.
	switch {
	case client == nil:
		return nil, trace.BadParameter("missing client")
	case log == nil:
		return nil, trace.BadParameter("missing log")
	case clock == nil:
		return nil, trace.BadParameter("missing clock")
	}

	controller := &Controller{
		clock: clock,
		log:   log,
	}
	// Fill the embedded reconciler in place rather than assigning a copy.
	controller.reconciler.clt = client
	controller.reconciler.log = log
	return controller, nil
}

// Run the autoupdate_agent_rollout controller. This function returns only when its context is canceled.
//
// The controller ticks every reconcilerPeriod (with jitter, and with the first tick
// also delayed by reconcilerPeriod) and triggers one reconciliation per tick.
// A failed or panicking reconciliation is logged and does not stop the loop: the next
// tick simply tries again. The returned error is always ctx.Err().
func (c *Controller) Run(ctx context.Context) error {
	config := interval.Config{
		Duration:      reconcilerPeriod,
		FirstDuration: reconcilerPeriod,
		Jitter:        retryutils.SeventhJitter,
		Clock:         c.clock,
	}
	ticker := interval.New(config)
	defer ticker.Stop()

	for {
		select {
		case <-ctx.Done():
			c.log.InfoContext(ctx, "Stopping autoupdate_agent_rollout controller", "reason", ctx.Err())
			return ctx.Err()
		case <-ticker.Next():
			c.log.DebugContext(ctx, "Reconciling autoupdate_agent_rollout")
			// tryAndCatch converts panics into errors, so a reconciler bug cannot escape this loop.
			if err := c.tryAndCatch(ctx); err != nil {
				// Use the exact resource name so this message is found by the same
				// log search as every other message of this controller.
				c.log.ErrorContext(ctx, "Failed to reconcile autoupdate_agent_rollout", "error", err)
			}
		}
	}
}

// tryAndCatch runs a single reconciliation and shields the caller from panics.
// Any panic raised by the reconciler is logged and turned into a regular error,
// so a critical bug in the reconciliation logic cannot bring down the whole
// Teleport cluster.
func (c *Controller) tryAndCatch(ctx context.Context) (err error) {
	// err is a named result so the deferred handler can overwrite it after a panic.
	defer func() {
		recovered := recover()
		if recovered == nil {
			// Nothing went wrong, keep whatever the reconciliation returned.
			return
		}
		c.log.ErrorContext(ctx, "Recovered from panic in the autoupdate_agent_rollout controller", "panic", recovered)
		panicErr := trace.Errorf("Panic recovered during reconciliation: %v", recovered)
		err = trace.NewAggregate(err, panicErr)
	}()

	return trace.Wrap(c.reconciler.reconcile(ctx))
}
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/

package rolloutcontroller
package rollout

import (
"context"
Expand All @@ -37,21 +37,21 @@ const (
maxConflictRetry = 3
)

// Reconciler reconciles the AutoUpdateAgentRollout singleton based on the content of the AutoUpdateVersion and
// reconciler reconciles the AutoUpdateAgentRollout singleton based on the content of the AutoUpdateVersion and
// AutoUpdateConfig singletons. This reconciler is not based on the services.GenericReconciler because:
// - we reconcile 2 resources with one
// - both input and output are singletons, we don't need the multi resource logic nor stream/paginated APIs
type Reconciler struct {
type reconciler struct {
clt Client
log *slog.Logger

// mutex ensures we only run one reconciliation at a time
mutex sync.Mutex
}

// Reconcile the AutoUpdateAgentRollout singleton. The reconciliation can fail because of a conflict (multiple auths
// reconcile the AutoUpdateAgentRollout singleton. The reconciliation can fail because of a conflict (multiple auths
// are racing), in this case we retry the reconciliation immediately.
func (r *Reconciler) Reconcile(ctx context.Context) error {
func (r *reconciler) reconcile(ctx context.Context) error {
r.mutex.Lock()
defer r.mutex.Unlock()

Expand Down Expand Up @@ -88,7 +88,7 @@ func (r *Reconciler) Reconcile(ctx context.Context) error {
// The creation/update/deletion can fail with a trace.CompareFailedError or trace.NotFoundError
// if the resource change while we were computing it.
// The caller must handle those error and retry the reconciliation.
func (r *Reconciler) tryReconcile(ctx context.Context) error {
func (r *reconciler) tryReconcile(ctx context.Context) error {
// get autoupdate_config
var config *autoupdate.AutoUpdateConfig
if c, err := r.clt.GetAutoUpdateConfig(ctx); err == nil {
Expand Down Expand Up @@ -171,7 +171,7 @@ func (r *Reconciler) tryReconcile(ctx context.Context) error {
return trace.Wrap(err, "updating rollout")
}

func (r *Reconciler) buildRolloutSpec(config *autoupdate.AutoUpdateConfigSpecAgents, version *autoupdate.AutoUpdateVersionSpecAgents) (*autoupdate.AutoUpdateAgentRolloutSpec, error) {
func (r *reconciler) buildRolloutSpec(config *autoupdate.AutoUpdateConfigSpecAgents, version *autoupdate.AutoUpdateVersionSpecAgents) (*autoupdate.AutoUpdateAgentRolloutSpec, error) {
// reconcile mode
mode, err := getMode(config.GetMode(), version.GetMode())
if err != nil {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/

package rolloutcontroller
package rollout

import (
"context"
Expand Down Expand Up @@ -307,7 +307,7 @@ func TestTryReconcile(t *testing.T) {

// Test execution: Running the reconciliation

reconciler := &Reconciler{
reconciler := &reconciler{
clt: client,
log: log,
}
Expand Down Expand Up @@ -375,13 +375,13 @@ func TestReconciler_Reconcile(t *testing.T) {
}

client := newMockClient(t, stubs)
reconciler := &Reconciler{
reconciler := &reconciler{
clt: client,
log: log,
}

// Test execution: run the reconciliation loop
require.NoError(t, reconciler.Reconcile(ctx))
require.NoError(t, reconciler.reconcile(ctx))

// Test validation: check that all the expected calls were received
client.checkIfEmpty(t)
Expand All @@ -397,13 +397,13 @@ func TestReconciler_Reconcile(t *testing.T) {
}

client := newMockClient(t, stubs)
reconciler := &Reconciler{
reconciler := &reconciler{
clt: client,
log: log,
}

// Test execution: run the reconciliation loop
require.NoError(t, reconciler.Reconcile(ctx))
require.NoError(t, reconciler.reconcile(ctx))

// Test validation: check that all the expected calls were received
client.checkIfEmpty(t)
Expand All @@ -421,13 +421,13 @@ func TestReconciler_Reconcile(t *testing.T) {
}

client := newMockClient(t, stubs)
reconciler := &Reconciler{
reconciler := &reconciler{
clt: client,
log: log,
}

// Test execution: run the reconciliation loop
require.NoError(t, reconciler.Reconcile(ctx))
require.NoError(t, reconciler.reconcile(ctx))

// Test validation: check that all the expected calls were received
client.checkIfEmpty(t)
Expand Down Expand Up @@ -461,13 +461,13 @@ func TestReconciler_Reconcile(t *testing.T) {
}

client := newMockClient(t, stubs)
reconciler := &Reconciler{
reconciler := &reconciler{
clt: client,
log: log,
}

// Test execution: run the reconciliation loop
require.NoError(t, reconciler.Reconcile(ctx))
require.NoError(t, reconciler.reconcile(ctx))

// Test validation: check that all the expected calls were received
client.checkIfEmpty(t)
Expand Down Expand Up @@ -499,13 +499,13 @@ func TestReconciler_Reconcile(t *testing.T) {
}

client := newMockClient(t, stubs)
reconciler := &Reconciler{
reconciler := &reconciler{
clt: client,
log: log,
}

// Test execution: run the reconciliation loop
require.NoError(t, reconciler.Reconcile(ctx))
require.NoError(t, reconciler.reconcile(ctx))

// Test validation: check that all the expected calls were received
client.checkIfEmpty(t)
Expand All @@ -523,13 +523,13 @@ func TestReconciler_Reconcile(t *testing.T) {
}

client := newMockClient(t, stubs)
reconciler := &Reconciler{
reconciler := &reconciler{
clt: client,
log: log,
}

// Test execution: run the reconciliation loop
require.ErrorContains(t, reconciler.Reconcile(ctx), "the DB fell on the floor")
require.ErrorContains(t, reconciler.reconcile(ctx), "the DB fell on the floor")

// Test validation: check that all the expected calls were received
client.checkIfEmpty(t)
Expand All @@ -553,13 +553,13 @@ func TestReconciler_Reconcile(t *testing.T) {
}

client := newMockClient(t, stubs)
reconciler := &Reconciler{
reconciler := &reconciler{
clt: client,
log: log,
}

// Test execution: run the reconciliation loop
require.ErrorContains(t, reconciler.Reconcile(cancelableCtx), "canceled")
require.ErrorIs(t, reconciler.reconcile(cancelableCtx), context.Canceled)

// Test validation: check that all the expected calls were received
client.checkIfEmpty(t)
Expand Down
9 changes: 9 additions & 0 deletions lib/service/service.go
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,7 @@ import (
"github.com/gravitational/teleport/lib/auth/storage"
"github.com/gravitational/teleport/lib/authz"
"github.com/gravitational/teleport/lib/automaticupgrades"
"github.com/gravitational/teleport/lib/autoupdate/rollout"
"github.com/gravitational/teleport/lib/backend"
"github.com/gravitational/teleport/lib/backend/dynamo"
_ "github.com/gravitational/teleport/lib/backend/etcdbk"
Expand Down Expand Up @@ -2430,6 +2431,14 @@ func (process *TeleportProcess) initAuthService() error {
return trace.Wrap(spiffeFedSyncer.Run(process.GracefulExitContext()), "running SPIFFEFederation Syncer")
})

agentRolloutController, err := rollout.NewController(authServer, logger, process.Clock)
if err != nil {
return trace.Wrap(err, "creating the rollout controller")
}
process.RegisterFunc("auth.autoupdate_agent_rollout_controller", func() error {
return trace.Wrap(agentRolloutController.Run(process.GracefulExitContext()), "running autoupdate_agent_rollout controller")
})

process.RegisterFunc("auth.server_info", func() error {
return trace.Wrap(auth.ReconcileServerInfos(process.GracefulExitContext(), authServer))
})
Expand Down
Loading

0 comments on commit b207bb7

Please sign in to comment.