Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: vastly improve development responsiveness and startup time #1465

Merged
merged 1 commit into from
May 11, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
52 changes: 34 additions & 18 deletions backend/controller/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@ type Config struct {
ConsoleURL *url.URL `help:"The public URL of the console (for CORS)." env:"FTL_CONTROLLER_CONSOLE_URL"`
ContentTime time.Time `help:"Time to use for console resource timestamps." default:"${timestamp=1970-01-01T00:00:00Z}"`
RunnerTimeout time.Duration `help:"Runner heartbeat timeout." default:"10s"`
ControllerTimeout time.Duration `help:"Controller heartbeat timeout." default:"10s"`
DeploymentReservationTimeout time.Duration `help:"Deployment reservation timeout." default:"120s"`
ArtefactChunkSize int `help:"Size of each chunk streamed to the client." default:"1048576"`
CommonConfig
Expand Down Expand Up @@ -177,6 +178,14 @@ func New(ctx context.Context, db *dal.DAL, config Config, runnerScaling scaling.
key = model.NewControllerKey(config.Bind.Hostname(), config.Bind.Port())
}
config.SetDefaults()

// Override some defaults during development mode.
_, devel := runnerScaling.(*localscaling.LocalScaling)
if devel {
config.RunnerTimeout = time.Second * 5
config.ControllerTimeout = time.Second * 5
}

svc := &Service{
tasks: scheduledtask.New(ctx, key, db),
dal: db,
Expand All @@ -194,37 +203,44 @@ func New(ctx context.Context, db *dal.DAL, config Config, runnerScaling scaling.
svc.controllerListListeners = append(svc.controllerListListeners, cronSvc)

// Use min, max backoff if we are running in production, otherwise use
// (1s, 1s) or develBackoff if available.
maybeDevelBackoff := func(min, max time.Duration, develBackoff ...backoff.Backoff) backoff.Backoff {
// (1s, 1s) (or develBackoff). Will also wrap the job such that it its next
// runtime is capped at 1s.
maybeDevelTask := func(job scheduledtask.Job, maxNext, minDelay, maxDelay time.Duration, develBackoff ...backoff.Backoff) (backoff.Backoff, scheduledtask.Job) {
if len(develBackoff) > 1 {
panic("too many devel backoffs")
}
if _, devel := runnerScaling.(*localscaling.LocalScaling); devel {
if devel {
chain := job
job = func(ctx context.Context) (time.Duration, error) {
next, err := chain(ctx)
// Cap at 1s in development mode.
return min(next, maxNext), err
}
if len(develBackoff) == 1 {
return develBackoff[0]
return develBackoff[0], job
}
return backoff.Backoff{Min: time.Second, Max: time.Second}
return backoff.Backoff{Min: time.Second, Max: time.Second}, job
}
return makeBackoff(min, max)
return makeBackoff(minDelay, maxDelay), job
}

// Parallel tasks.
svc.tasks.Parallel(maybeDevelBackoff(time.Second, time.Second*5), svc.syncRoutes)
svc.tasks.Parallel(maybeDevelBackoff(time.Second*3, time.Second*5), svc.heartbeatController)
svc.tasks.Parallel(maybeDevelBackoff(time.Second*5, time.Second*5), svc.updateControllersList)
svc.tasks.Parallel(maybeDevelBackoff(time.Second*5, time.Second*10), svc.executeAsyncCalls)
svc.tasks.Parallel(maybeDevelTask(svc.syncRoutes, time.Second, time.Second, time.Second*5))
svc.tasks.Parallel(maybeDevelTask(svc.heartbeatController, time.Second, time.Second*3, time.Second*5))
svc.tasks.Parallel(maybeDevelTask(svc.updateControllersList, time.Second, time.Second*5, time.Second*5))
svc.tasks.Parallel(maybeDevelTask(svc.executeAsyncCalls, time.Second, time.Second*5, time.Second*10))

// This should be a singleton task, but because this is the task that
// actually expires the leases used to run singleton tasks, it must be
// parallel.
svc.tasks.Parallel(maybeDevelBackoff(time.Second, time.Second*5), svc.expireStaleLeases)
svc.tasks.Parallel(maybeDevelTask(svc.expireStaleLeases, time.Second*2, time.Second, time.Second*5))

// Singleton tasks use leases to only run on a single controller.
svc.tasks.Singleton(maybeDevelBackoff(time.Second*20, time.Second*20), svc.reapStaleControllers)
svc.tasks.Singleton(maybeDevelBackoff(time.Second, time.Second*10), svc.reapStaleRunners)
svc.tasks.Singleton(maybeDevelBackoff(time.Second, time.Second*20), svc.releaseExpiredReservations)
svc.tasks.Singleton(maybeDevelBackoff(time.Second, time.Second*5), svc.reconcileDeployments)
svc.tasks.Singleton(maybeDevelBackoff(time.Second, time.Second*5), svc.reconcileRunners)
svc.tasks.Singleton(maybeDevelTask(svc.reapStaleControllers, time.Second*2, time.Second*20, time.Second*20))
svc.tasks.Singleton(maybeDevelTask(svc.reapStaleRunners, time.Second*2, time.Second, time.Second*10))
svc.tasks.Singleton(maybeDevelTask(svc.releaseExpiredReservations, time.Second*2, time.Second, time.Second*20))
svc.tasks.Singleton(maybeDevelTask(svc.reconcileDeployments, time.Second*2, time.Second, time.Second*5))
svc.tasks.Singleton(maybeDevelTask(svc.reconcileRunners, time.Second*2, time.Second, time.Second*5))
return svc, nil
}

Expand Down Expand Up @@ -1089,7 +1105,7 @@ func (s *Service) executeAsyncCalls(ctx context.Context) (time.Duration, error)
}
switch call.Origin {
case dal.AsyncCallOriginFSM:
return time.Second * 2, s.onAsyncFSMCallCompletion(ctx, call, resp.Msg)
return time.Millisecond * 100, s.onAsyncFSMCallCompletion(ctx, call, resp.Msg)

default:
panic(fmt.Sprintf("unexpected async call origin: %s", call.Origin))
Expand Down Expand Up @@ -1166,7 +1182,7 @@ func (s *Service) reserveRunner(ctx context.Context, reconcile model.Deployment)
// Periodically remove stale (ie. have not heartbeat recently) controllers from the database.
func (s *Service) reapStaleControllers(ctx context.Context) (time.Duration, error) {
logger := log.FromContext(ctx)
count, err := s.dal.KillStaleControllers(context.Background(), s.config.RunnerTimeout)
count, err := s.dal.KillStaleControllers(context.Background(), s.config.ControllerTimeout)
if err != nil {
return 0, fmt.Errorf("failed to delete stale controllers: %w", err)
} else if count > 0 {
Expand Down
24 changes: 0 additions & 24 deletions backend/controller/dal/async_calls.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,30 +13,6 @@ import (
"github.com/TBD54566975/ftl/backend/schema"
)

// SendFSMEvent sends an event to an executing instance of an FSM.
//
// If the instance doesn't exist a new one will be created.
//
// [name] is the name of the state machine to execute, [executionKey] is the
// unique identifier for this execution of the FSM.
//
// Returns ErrConflict if the state machine is already executing.
//
// Note: this does not actually call the FSM, it just enqueues an async call for
// future execution.
//
// Note: no validation of the FSM is performed.
func (d *DAL) SendFSMEvent(ctx context.Context, name, executionKey, destinationState string, verb schema.Ref, request json.RawMessage) error {
_, err := d.db.SendFSMEvent(ctx, sql.SendFSMEventParams{
Key: executionKey,
Name: name,
State: destinationState,
Verb: verb,
Request: request,
})
return translatePGError(err)
}

// AsyncCallOrigin represents the kind of originator of the async call.
type AsyncCallOrigin sql.AsyncCallOrigin

Expand Down
33 changes: 33 additions & 0 deletions backend/controller/dal/fsm.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
package dal

import (
"context"
"encoding/json"

"github.com/TBD54566975/ftl/backend/controller/sql"
"github.com/TBD54566975/ftl/backend/schema"
)

// SendFSMEvent sends an event to an executing instance of an FSM.
//
// If the instance doesn't exist a new one will be created.
//
// [name] is the name of the state machine to execute, [executionKey] is the
// unique identifier for this execution of the FSM.
//
// Returns ErrConflict if the state machine is already executing.
//
// Note: this does not actually call the FSM, it just enqueues an async call for
// future execution.
//
// Note: no validation of the FSM is performed.
func (d *DAL) SendFSMEvent(ctx context.Context, name, executionKey, destinationState string, verb schema.Ref, request json.RawMessage) error {
_, err := d.db.SendFSMEvent(ctx, sql.SendFSMEventParams{
Key: executionKey,
Name: name,
State: destinationState,
Verb: verb,
Request: request,
})
return translatePGError(err)
}
3 changes: 3 additions & 0 deletions backend/controller/scaling/localscaling/local_scaling.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import (
"os"
"path/filepath"
"sync"
"time"

"github.com/alecthomas/kong"

Expand Down Expand Up @@ -95,6 +96,8 @@ func (l *LocalScaling) SetReplicas(ctx context.Context, replicas int, idleRunner
}); err != nil {
return err
}
config.HeartbeatPeriod = time.Second
config.HeartbeatJitter = time.Millisecond * 100

runnerCtx := log.ContextWithLogger(ctx, logger.Scope(simpleName))

Expand Down
34 changes: 31 additions & 3 deletions backend/controller/sql/databasetesting/devel.go
Original file line number Diff line number Diff line change
Expand Up @@ -46,9 +46,9 @@ func CreateForDevel(ctx context.Context, dsn string, recreate bool) (*pgxpool.Po
if recreate {
// Terminate any dangling connections.
_, err = conn.Exec(ctx, `
SELECT pid, pg_terminate_backend(pid)
FROM pg_stat_activity
WHERE datname = $1 AND pid <> pg_backend_pid()`,
SELECT pid, pg_terminate_backend(pid)
FROM pg_stat_activity
WHERE datname = $1 AND pid <> pg_backend_pid()`,
config.Database)
if err != nil {
return nil, err
Expand All @@ -72,5 +72,33 @@ func CreateForDevel(ctx context.Context, dsn string, recreate bool) (*pgxpool.Po
if err != nil {
return nil, err
}
// Reset transient state in the database to a clean state for development purposes.
// This includes things like resetting the state of async calls, leases,
// controller/runner registration, etc. but not anything more.
if !recreate {
_, err = realConn.Exec(ctx, `
WITH deleted AS (
DELETE FROM async_calls
RETURNING 1
), deleted_fsm_executions AS (
DELETE FROM fsm_executions
RETURNING 1
), deleted_leases AS (
DELETE FROM leases
RETURNING 1
), deleted_controllers AS (
DELETE FROM controller
RETURNING 1
), deleted_runners AS (
DELETE FROM runners
RETURNING 1
)
SELECT COUNT(*)
`)
if err != nil {
return nil, err
}
}

return realConn, nil
}
1 change: 1 addition & 0 deletions backend/controller/sql/queries.sql
Original file line number Diff line number Diff line change
Expand Up @@ -497,6 +497,7 @@ WITH async_call AS (
FROM async_calls
WHERE state = 'pending'
LIMIT 1
FOR UPDATE SKIP LOCKED
), lease AS (
INSERT INTO leases (idempotency_key, key, expires_at)
VALUES (gen_random_uuid(), '/system/async_call/' || (SELECT id FROM async_call), (NOW() AT TIME ZONE 'utc') + @ttl::interval)
Expand Down
1 change: 1 addition & 0 deletions backend/controller/sql/queries.sql.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading
Loading