Skip to content

Commit

Permalink
No remediate, only detect
Browse files Browse the repository at this point in the history
  • Loading branch information
RebeccaMahany committed Dec 18, 2024
1 parent 26e3846 commit 5ec7393
Show file tree
Hide file tree
Showing 2 changed files with 32 additions and 82 deletions.
46 changes: 17 additions & 29 deletions pkg/osquery/runtime/osqueryinstance.go
Original file line number Diff line number Diff line change
Expand Up @@ -249,10 +249,13 @@ func (i *OsqueryInstance) Launch() error {
return fmt.Errorf("could not calculate osquery file paths: %w", err)
}

// Make sure no lock files exist that could prevent osquery from starting up
if err := i.cleanUpOldDatabaseLock(ctx, paths); err != nil {
traces.SetError(span, fmt.Errorf("cleaning up old database lock: %w", err))
return fmt.Errorf("cleaning up old database lock: %w", err)
// Check to see whether lock files exist that could prevent osquery from starting up
if _, err := i.detectStaleDatabaseLock(ctx, paths); err != nil {
// A detection error shouldn't prevent us from creating the process -- log it and move on
i.slogger.Log(ctx, slog.LevelInfo,
"error detecting stale database lock",
"err", err,
)
}

// Populate augeas lenses, if requested
Expand Down Expand Up @@ -708,43 +711,28 @@ func calculateOsqueryPaths(rootDirectory string, registrationId string, runId st
return osqueryFilePaths, nil
}

// cleanUpOldDatabaseLock checks for the presence of a lock file in the given database and removes it
// if it exists. The lock file can hang around if an osquery process isn't properly terminated, and
// will prevent us from launching a new osquery process.
func (i *OsqueryInstance) cleanUpOldDatabaseLock(ctx context.Context, paths *osqueryFilePaths) error {
// detectStaleDatabaseLock checks for the presence of a lock file in the given database.
// We have noticed the lock file occasionally hanging around -- we think this can happen
// when osquery doesn't get adequate time to shut down gracefully. The presence of the lock
// file will prevent osquery from starting up entirely. For now, we just detect this stale
// lock file, and log information about it. In the future we will attempt to remediate.
// See: https://github.com/kolide/launcher/issues/2004.
func (i *OsqueryInstance) detectStaleDatabaseLock(ctx context.Context, paths *osqueryFilePaths) (bool, error) {
lockFilePath := filepath.Join(paths.databasePath, "LOCK")

lockFileInfo, err := os.Stat(lockFilePath)
if os.IsNotExist(err) {
// No lock file, nothing to do here
return nil
}

// We do a couple retries here -- Windows complains about removing files in use sometimes.
err = backoff.WaitFor(func() error {
if err := os.Remove(lockFilePath); err != nil {
return fmt.Errorf("removing lock file at %s: %w", lockFilePath, err)
}
return nil
}, 5*time.Second, 500*time.Millisecond)

if err != nil {
i.slogger.Log(ctx, slog.LevelInfo,
"detected old lock file but could not remove it",
"lockfile_path", lockFilePath,
"lockfile_modtime", lockFileInfo.ModTime().String(),
"err", err,
)
return fmt.Errorf("removing lock file at %s: %w", lockFilePath, err)
return false, nil
}

i.slogger.Log(ctx, slog.LevelInfo,
"detected and removed old osquery db lock file",
"detected stale osquery db lock file",
"lockfile_path", lockFilePath,
"lockfile_modtime", lockFileInfo.ModTime().String(),
)

return nil
return true, nil
}

// createOsquerydCommand uses osqueryOptions to return an *exec.Cmd
Expand Down
68 changes: 15 additions & 53 deletions pkg/osquery/runtime/osqueryinstance_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -300,75 +300,37 @@ func TestLaunch(t *testing.T) {
k.AssertExpectations(t)
}

func TestLaunch_WithLockFileCleanup(t *testing.T) {
func Test_detectStaleDatabaseLock(t *testing.T) {
t.Parallel()

logBytes, slogger := setUpTestSlogger()
_, slogger := setUpTestSlogger()
rootDirectory := testRootDirectory(t)

k := typesMocks.NewKnapsack(t)
k.On("WatchdogEnabled").Return(true)
k.On("WatchdogMemoryLimitMB").Return(150)
k.On("WatchdogUtilizationLimitPercent").Return(20)
k.On("WatchdogDelaySec").Return(120)
k.On("OsqueryFlags").Return([]string{"verbose=true"})
k.On("OsqueryVerbose").Return(true)
k.On("Slogger").Return(slogger)
k.On("RootDirectory").Return(rootDirectory)
k.On("LoggingInterval").Return(1 * time.Second)
k.On("LogMaxBytesPerBatch").Return(500)
k.On("Transport").Return("jsonrpc")
setUpMockStores(t, k)
k.On("ReadEnrollSecret").Return("", nil)
k.On("LatestOsquerydPath", mock.Anything).Return(testOsqueryBinaryDirectory)
k.On("OsqueryHealthcheckStartupDelay").Return(10 * time.Second)

i := newInstance(types.DefaultRegistrationID, k, mockServiceClient())

// Create a lock file that would ordinarily prevent osquery from starting up
// Calculate paths
paths, err := calculateOsqueryPaths(i.knapsack.RootDirectory(), i.registrationId, i.runId, i.opts)
require.NoError(t, err)

// Check for stale database lock -- there shouldn't be one
detected, err := i.detectStaleDatabaseLock(context.TODO(), paths)
require.NoError(t, err)
require.False(t, detected)

// Create a lock file
lockFilePath := filepath.Join(rootDirectory, "osquery.db", "LOCK")
require.NoError(t, os.MkdirAll(filepath.Dir(lockFilePath), 0700)) // drwx
createLockFile(t, lockFilePath)

// Start the instance
instanceStartTime := time.Now()
go i.Launch()

// Wait for the instance to become healthy
require.NoError(t, backoff.WaitFor(func() error {
// Instance self-reports as healthy
if err := i.Healthy(); err != nil {
return fmt.Errorf("instance not healthy: %w", err)
}

// Confirm instance setup is complete
if i.stats == nil || i.stats.ConnectTime == "" {
return errors.New("no connect time set yet")
}

// Good to go
return nil
}, 30*time.Second, 1*time.Second), fmt.Sprintf("instance not healthy by %s: instance logs:\n\n%s", time.Now().String(), logBytes.String()))

// Confirm that the lock file is new -- it should have a modtime after instanceStartTime
lockFileInfo, err := os.Stat(lockFilePath)
// Check for a stale database lock again -- now, we should find it
detectedRetry, err := i.detectStaleDatabaseLock(context.TODO(), paths)
require.NoError(t, err)
require.True(t, lockFileInfo.ModTime().After(instanceStartTime), logBytes.String())

// Now wait for full shutdown
i.BeginShutdown()
shutdownErr := make(chan error)
go func() {
shutdownErr <- i.WaitShutdown()
}()

select {
case err := <-shutdownErr:
require.True(t, errors.Is(err, context.Canceled), fmt.Sprintf("instance logs:\n\n%s", logBytes.String()))
case <-time.After(1 * time.Minute):
t.Error("instance did not shut down within timeout", fmt.Sprintf("instance logs: %s", logBytes.String()))
t.FailNow()
}
require.True(t, detectedRetry)

k.AssertExpectations(t)
}

0 comments on commit 5ec7393

Please sign in to comment.