diff --git a/canary-checker.properties b/canary-checker.properties index 7554ec754..e8295057a 100644 --- a/canary-checker.properties +++ b/canary-checker.properties @@ -6,5 +6,6 @@ # topology.runNow=true log.level.db=warn +# check.concurrency=100 # jobs.ComponentRelationshipSync.runNow=true diff --git a/cmd/operator.go b/cmd/operator.go index e9091f9a2..13e4bfe50 100644 --- a/cmd/operator.go +++ b/cmd/operator.go @@ -96,6 +96,12 @@ func run() error { // so we use a goroutine to unblock server start // to prevent health check from failing go jobs.Start() + + // TODO: stop the cron scheduler so that no more checks are scheduled + + shutdown.AddHookWithPriority("check jobs", shutdown.PriorityJobs, func() { + canaryJobs.AcquireAllCheckLocks(ctx) + }) } go serve() diff --git a/cmd/serve.go b/cmd/serve.go index e6d94194f..a0301c143 100644 --- a/cmd/serve.go +++ b/cmd/serve.go @@ -49,6 +49,12 @@ var Serve = &cobra.Command{ canaryJobs.StartScanCanaryConfigs(apicontext.DefaultContext, dataFile, configFiles) if executor { jobs.Start() + + // TODO: stop the cron scheduler so that no more checks are scheduled + + shutdown.AddHookWithPriority("check jobs", shutdown.PriorityJobs, func() { + canaryJobs.AcquireAllCheckLocks(apicontext.DefaultContext) + }) } serve() diff --git a/pkg/jobs/canary/sync.go b/pkg/jobs/canary/sync.go index 77f53ad6d..67fc8dbd3 100644 --- a/pkg/jobs/canary/sync.go +++ b/pkg/jobs/canary/sync.go @@ -19,8 +19,31 @@ import ( "github.com/flanksource/duty/job" "github.com/flanksource/duty/models" "github.com/robfig/cron/v3" + "golang.org/x/sync/semaphore" ) +const propertyCheckConcurrency = "check.concurrency" + +var ( + // The maximum number of checks that can run concurrently + defaultCheckConcurrency = 50 + + // Holds a lock for every running check. + // Its size can be overridden by the 'check.concurrency' property. + globalCheckSemaphore *semaphore.Weighted +) + +// AcquireAllCheckLocks blocks until the global check semaphore is fully acquired. 
+//
+// This helps to ensure that no checks are currently running.
+//
+// It returns without blocking when the semaphore has not been created yet.
+// When the acquisition fails (for example because ctx is done) the error is
+// logged and the function returns without reporting success.
+func AcquireAllCheckLocks(ctx context.Context) {
+	// As far as this change shows, globalCheckSemaphore is first assigned by
+	// the SyncCanaryJobs job. A shutdown that happens before that job has run
+	// would otherwise call Acquire on a nil semaphore.
+	if globalCheckSemaphore == nil {
+		ctx.Logger.V(6).Infof("check semaphore not initialized, no check locks to acquire")
+		return
+	}
+
+	ctx.Logger.V(6).Infof("acquiring all check locks")
+
+	// NOTE(review): the weight is re-read from the property here; if the
+	// property changed after the semaphore was created the requested weight
+	// no longer matches the semaphore's size - confirm whether the size
+	// should be captured once at creation time instead.
+	weight := int64(ctx.Properties().Int(propertyCheckConcurrency, defaultCheckConcurrency))
+	if err := globalCheckSemaphore.Acquire(ctx, weight); err != nil {
+		ctx.Logger.Errorf("failed to acquire check semaphores: %v", err)
+		// Do not fall through to the success message below.
+		return
+	}
+
+	ctx.Logger.V(6).Infof("acquired all check locks")
+}
+
 var canaryJobs sync.Map
 
 const DefaultCanarySchedule = "@every 5m"
@@ -140,6 +163,7 @@ func newCanaryJob(c CanaryJob) {
 		IgnoreSuccessHistory: true,
 		Retention:            job.RetentionBalanced,
 		ResourceID:           c.DBCanary.ID.String(),
+		Semaphores:           []*semaphore.Weighted{globalCheckSemaphore},
 		ResourceType:         "canary",
 		ID:                   fmt.Sprintf("%s/%s", c.Canary.Namespace, c.Canary.Name),
 		Fn:                   c.Run,
@@ -159,6 +183,10 @@ var SyncCanaryJobs = &job.Job{
 	Schedule:  "@every 5m",
 	Retention: job.RetentionFew,
 	Fn: func(ctx job.JobRuntime) error {
+		if globalCheckSemaphore == nil {
+			globalCheckSemaphore = semaphore.NewWeighted(int64(ctx.Properties().Int(propertyCheckConcurrency, defaultCheckConcurrency)))
+		}
+
 		canaries, err := db.GetAllCanariesForSync(ctx.Context, runner.WatchNamespace)
 		if err != nil {
 			return err