From bb27062d712c3c99cf3da1ba75277cf415661703 Mon Sep 17 00:00:00 2001 From: Rebecca Mahany-Horton Date: Mon, 30 Dec 2024 16:54:49 -0500 Subject: [PATCH] Check for and log information about stale osquery database lock files (#2006) --- pkg/osquery/runtime/osqueryinstance.go | 83 ++++++++++++++ .../runtime/osqueryinstance_other_test.go | 20 ++++ pkg/osquery/runtime/osqueryinstance_test.go | 103 ++++++++++++++++++ .../runtime/osqueryinstance_windows_test.go | 40 +++++++ pkg/osquery/runtime/runtime_helpers.go | 45 ++++++++ .../runtime/runtime_helpers_windows.go | 38 +++++++ 6 files changed, 329 insertions(+) create mode 100644 pkg/osquery/runtime/osqueryinstance_other_test.go diff --git a/pkg/osquery/runtime/osqueryinstance.go b/pkg/osquery/runtime/osqueryinstance.go index 0ecb3ad4d..502127bee 100644 --- a/pkg/osquery/runtime/osqueryinstance.go +++ b/pkg/osquery/runtime/osqueryinstance.go @@ -29,6 +29,7 @@ import ( "github.com/osquery/osquery-go/plugin/config" "github.com/osquery/osquery-go/plugin/distributed" osquerylogger "github.com/osquery/osquery-go/plugin/logger" + "github.com/shirou/gopsutil/v3/process" "go.opentelemetry.io/otel/attribute" "go.opentelemetry.io/otel/trace" @@ -249,6 +250,15 @@ func (i *OsqueryInstance) Launch() error { return fmt.Errorf("could not calculate osquery file paths: %w", err) } + // Check to see whether lock files exist that could prevent osquery from starting up + if _, err := i.detectStaleDatabaseLock(ctx, paths); err != nil { + // A detection error shouldn't prevent us from creating the process -- log it and move on + i.slogger.Log(ctx, slog.LevelInfo, + "error detecting stale database lock", + "err", err, + ) + } + // Populate augeas lenses, if requested if i.opts.augeasLensFunc != nil { if err := os.MkdirAll(paths.augeasPath, 0755); err != nil { @@ -702,6 +712,79 @@ func calculateOsqueryPaths(rootDirectory string, registrationId string, runId st return osqueryFilePaths, nil } +// detectStaleDatabaseLock checks for the presence of a lock file in the given database. +// We have noticed the lock file occasionally hanging around -- we think this can happen +// when osquery doesn't get adequate time to shut down gracefully. The presence of the lock +// file will prevent osquery from starting up entirely. For now, we just detect this stale +// lock file, and log information about it. In the future we will attempt to remediate. +// See: https://github.com/kolide/launcher/issues/2004. +func (i *OsqueryInstance) detectStaleDatabaseLock(ctx context.Context, paths *osqueryFilePaths) (bool, error) { + lockFilePath := filepath.Join(paths.databasePath, "LOCK") + + lockFileInfo, err := os.Stat(lockFilePath) + if os.IsNotExist(err) { + // No lock file, nothing to do here + return false, nil + } + if err != nil { + return false, fmt.Errorf("determining whether lock file exists: %w", err) + } + + infoToLog := []any{ + "lockfile_path", lockFilePath, + "lockfile_modtime", lockFileInfo.ModTime().UTC().String(), + } + + defer func() { + i.slogger.Log(ctx, slog.LevelInfo, + "detected stale osquery db lock file", + infoToLog..., + ) + }() + + // Check to see whether the process holding the file still exists + processes, err := getProcessesHoldingFile(ctx, lockFilePath) + if err != nil { + infoToLog = append(infoToLog, "err", err) + return false, fmt.Errorf("getting process holding file: %w", err) + } + + // Grab more info to log from the processes using the lockfile + processStrs := make([]string, len(processes)) + for i, p := range processes { + processStrs[i] = processStr(ctx, p) + } + infoToLog = append(infoToLog, "processes", processStrs) + + return true, nil +} + +func processStr(ctx context.Context, p *process.Process) string { + name := "unknown" + processOwner := "unknown" + runningStatus := "unknown" + cmdline := "unknown" + + if gotName, err := p.NameWithContext(ctx); err == nil { + name = gotName + } + if gotUsername, err := p.UsernameWithContext(ctx); err == nil { + processOwner = gotUsername + } + if gotIsRunning, err := p.IsRunningWithContext(ctx); err == nil { + if gotIsRunning { + runningStatus = "running" + } else { + runningStatus = "not running" + } + } + if gotCmdline, err := p.CmdlineWithContext(ctx); err == nil { + cmdline = gotCmdline + } + + return fmt.Sprintf("process with name `%s` and PID %d belonging to user `%s` has current status `%s` (%s)", name, p.Pid, processOwner, runningStatus, cmdline) +} + // createOsquerydCommand uses osqueryOptions to return an *exec.Cmd // which will launch a properly configured osqueryd process. func (i *OsqueryInstance) createOsquerydCommand(osquerydBinary string, paths *osqueryFilePaths) (*exec.Cmd, error) { diff --git a/pkg/osquery/runtime/osqueryinstance_other_test.go b/pkg/osquery/runtime/osqueryinstance_other_test.go new file mode 100644 index 000000000..7ce670ba9 --- /dev/null +++ b/pkg/osquery/runtime/osqueryinstance_other_test.go @@ -0,0 +1,20 @@ +//go:build !windows +// +build !windows + +package runtime + +import ( + "syscall" + "testing" + + "github.com/stretchr/testify/require" +) + +func createLockFile(t *testing.T, fileToLock string) { + lockFile, err := syscall.Open(fileToLock, syscall.O_CREAT|syscall.O_RDONLY, 0600) + require.NoError(t, err) + require.NoError(t, syscall.Flock(lockFile, syscall.LOCK_EX|syscall.LOCK_NB)) + t.Cleanup(func() { + syscall.Close(lockFile) + }) +} diff --git a/pkg/osquery/runtime/osqueryinstance_test.go b/pkg/osquery/runtime/osqueryinstance_test.go index 0eaf27f7e..9bd8c8b11 100644 --- a/pkg/osquery/runtime/osqueryinstance_test.go +++ b/pkg/osquery/runtime/osqueryinstance_test.go @@ -2,7 +2,9 @@ package runtime import ( "context" + "errors" "fmt" + "os" "path/filepath" "runtime" "strings" @@ -12,8 +14,10 @@ import ( "github.com/kolide/kit/ulid" "github.com/kolide/launcher/ee/agent/types" typesMocks "github.com/kolide/launcher/ee/agent/types/mocks" + "github.com/kolide/launcher/pkg/backoff" "github.com/kolide/launcher/pkg/log/multislogger" "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/mock" "github.com/stretchr/testify/require" ) @@ -235,3 +239,102 @@ func Test_healthcheckWithRetries(t *testing.T) { // No client available, so healthcheck should fail despite retries require.Error(t, i.healthcheckWithRetries(context.TODO(), 5, 100*time.Millisecond)) } + +func TestLaunch(t *testing.T) { + t.Parallel() + + logBytes, slogger := setUpTestSlogger() + rootDirectory := testRootDirectory(t) + + k := typesMocks.NewKnapsack(t) + k.On("WatchdogEnabled").Return(true) + k.On("WatchdogMemoryLimitMB").Return(150) + k.On("WatchdogUtilizationLimitPercent").Return(20) + k.On("WatchdogDelaySec").Return(120) + k.On("OsqueryFlags").Return([]string{"verbose=true"}) + k.On("OsqueryVerbose").Return(true) + k.On("Slogger").Return(slogger) + k.On("RootDirectory").Return(rootDirectory) + k.On("LoggingInterval").Return(1 * time.Second) + k.On("LogMaxBytesPerBatch").Return(500) + k.On("Transport").Return("jsonrpc") + setUpMockStores(t, k) + k.On("ReadEnrollSecret").Return("", nil) + k.On("LatestOsquerydPath", mock.Anything).Return(testOsqueryBinary) + k.On("OsqueryHealthcheckStartupDelay").Return(10 * time.Second) + + i := newInstance(types.DefaultRegistrationID, k, mockServiceClient()) + go i.Launch() + + // Wait for the instance to become healthy + require.NoError(t, backoff.WaitFor(func() error { + // Instance self-reports as healthy + if err := i.Healthy(); err != nil { + return fmt.Errorf("instance not healthy: %w", err) + } + + // Confirm instance setup is complete + if i.stats == nil || i.stats.ConnectTime == "" { + return errors.New("no connect time set yet") + } + + // Good to go + return nil + }, 30*time.Second, 1*time.Second), fmt.Sprintf("instance not healthy by %s: instance logs:\n\n%s", time.Now().String(), logBytes.String())) + + // Now wait for full shutdown + i.BeginShutdown() + shutdownErr := make(chan error) + go func() { + shutdownErr <- i.WaitShutdown() + }() + + select { + case err := <-shutdownErr: + require.True(t, errors.Is(err, context.Canceled), fmt.Sprintf("instance logs:\n\n%s", logBytes.String())) + case <-time.After(1 * time.Minute): + t.Error("instance did not shut down within timeout", fmt.Sprintf("instance logs: %s", logBytes.String())) + t.FailNow() + } + + k.AssertExpectations(t) +} + +func Test_detectStaleDatabaseLock(t *testing.T) { + t.Parallel() + + if runtime.GOOS == "windows" && os.Getenv("GITHUB_ACTIONS") == "true" { + t.Skip("Skipping test on GitHub Actions -- test only works locally on Windows") + } + + _, slogger := setUpTestSlogger() + rootDirectory := testRootDirectory(t) + + k := typesMocks.NewKnapsack(t) + k.On("Slogger").Return(slogger) + k.On("RootDirectory").Return(rootDirectory) + setUpMockStores(t, k) + + i := newInstance(types.DefaultRegistrationID, k, mockServiceClient()) + + // Calculate paths + paths, err := calculateOsqueryPaths(i.knapsack.RootDirectory(), i.registrationId, i.runId, i.opts) + require.NoError(t, err) + + // Check for stale database lock -- there shouldn't be one + detected, err := i.detectStaleDatabaseLock(context.TODO(), paths) + require.NoError(t, err) + require.False(t, detected) + + // Create a lock file + lockFilePath := filepath.Join(rootDirectory, "osquery.db", "LOCK") + require.NoError(t, os.MkdirAll(filepath.Dir(lockFilePath), 0700)) // drwx + createLockFile(t, lockFilePath) + + // Check for a stale database lock again -- now, we should find it + detectedRetry, err := i.detectStaleDatabaseLock(context.TODO(), paths) + require.NoError(t, err) + require.True(t, detectedRetry) + + k.AssertExpectations(t) +} diff --git a/pkg/osquery/runtime/osqueryinstance_windows_test.go b/pkg/osquery/runtime/osqueryinstance_windows_test.go index d59d6582f..1d76e3b8d 100644 --- a/pkg/osquery/runtime/osqueryinstance_windows_test.go +++ b/pkg/osquery/runtime/osqueryinstance_windows_test.go @@ -5,7 +5,9 @@ package runtime import ( "strings" + "syscall" "testing" + "unsafe" "github.com/kolide/launcher/ee/agent/types" typesMocks "github.com/kolide/launcher/ee/agent/types/mocks" @@ -50,3 +52,41 @@ func TestCreateOsqueryCommandEnvVars(t *testing.T) { k.AssertExpectations(t) } + +func createLockFile(t *testing.T, fileToLock string) { + lockedFileName, err := syscall.UTF16PtrFromString(fileToLock) + require.NoError(t, err) + + handle, err := syscall.CreateFile( + lockedFileName, + syscall.GENERIC_READ, + syscall.FILE_SHARE_READ|syscall.FILE_SHARE_WRITE|syscall.FILE_SHARE_DELETE, + nil, + syscall.CREATE_ALWAYS, + syscall.FILE_ATTRIBUTE_NORMAL, + 0, + ) + require.NoError(t, err) + + t.Cleanup(func() { + syscall.Close(handle) + }) + + // See: https://learn.microsoft.com/en-us/windows/win32/api/fileapi/nf-fileapi-lockfileex + modkernel32 := syscall.NewLazyDLL("kernel32.dll") + procLockFileEx := modkernel32.NewProc("LockFileEx") + overlapped := &syscall.Overlapped{ + HEvent: 0, + } + + r1, _, e1 := syscall.SyscallN( + procLockFileEx.Addr(), + uintptr(handle), + uintptr(2), // LOCKFILE_EXCLUSIVE_LOCK + uintptr(0), // Reserved parameter; must be set to zero. + uintptr(1), // nNumberOfBytesToLockLow -- the low-order 32 bits of the length of the byte range to lock. + uintptr(0), // nNumberOfBytesToLockHigh -- the high-order 32 bits of the length of the byte range to lock. + uintptr(unsafe.Pointer(overlapped)), + ) + require.False(t, r1 == 0, error(e1)) +} diff --git a/pkg/osquery/runtime/runtime_helpers.go b/pkg/osquery/runtime/runtime_helpers.go index 49ffb614c..500a4c7e2 100644 --- a/pkg/osquery/runtime/runtime_helpers.go +++ b/pkg/osquery/runtime/runtime_helpers.go @@ -4,10 +4,17 @@ package runtime import ( + "context" + "errors" "fmt" "os/exec" "path/filepath" + "strconv" + "strings" "syscall" + + "github.com/kolide/launcher/ee/allowedcmd" + "github.com/shirou/gopsutil/v3/process" ) func setpgid() *syscall.SysProcAttr { @@ -33,3 +40,41 @@ func platformArgs() []string { func isExitOk(_ error) bool { return false } + +func getProcessesHoldingFile(ctx context.Context, pathToFile string) ([]*process.Process, error) { + cmd, err := allowedcmd.Lsof(ctx, "-t", pathToFile) + if err != nil { + return nil, fmt.Errorf("creating lsof command: %w", err) + } + + out, err := cmd.CombinedOutput() + if err != nil { + return nil, fmt.Errorf("running lsof: %w", err) + } + + pidStr := strings.TrimSpace(string(out)) + if pidStr == "" { + return nil, errors.New("no process found using file via lsof") + } + + pidStrs := strings.Split(strings.TrimSpace(string(out)), "\n") + if len(pidStrs) == 0 { + return nil, errors.New("no processes found using file via lsof") + } + + processes := make([]*process.Process, 0) + for _, pidStr := range pidStrs { + pid, err := strconv.ParseInt(pidStr, 10, 32) + if err != nil { + return nil, fmt.Errorf("invalid pid %s: %w", pidStr, err) + } + + p, err := process.NewProcess(int32(pid)) + if err != nil { + return nil, fmt.Errorf("getting process for %d: %w", pid, err) + } + processes = append(processes, p) + } + + return processes, nil +} diff --git a/pkg/osquery/runtime/runtime_helpers_windows.go b/pkg/osquery/runtime/runtime_helpers_windows.go index 7c5ed26e3..ee3f4eef3 100644 --- a/pkg/osquery/runtime/runtime_helpers_windows.go +++ b/pkg/osquery/runtime/runtime_helpers_windows.go @@ -7,11 +7,13 @@ import ( "context" "fmt" "os/exec" + "strings" "syscall" "time" "github.com/kolide/launcher/ee/allowedcmd" "github.com/pkg/errors" + "github.com/shirou/gopsutil/v3/process" ) func setpgid() *syscall.SysProcAttr { @@ -80,3 +82,39 @@ func isExitOk(err error) bool { } return false } + +func getProcessesHoldingFile(ctx context.Context, pathToFile string) ([]*process.Process, error) { + allProcesses, err := process.ProcessesWithContext(ctx) + if err != nil { + return nil, fmt.Errorf("getting process list: %w", err) + } + if len(allProcesses) == 0 { + return nil, errors.New("could not get any processes") + } + + processes := make([]*process.Process, 0) + for _, p := range allProcesses { + openFiles, err := p.OpenFilesWithContext(ctx) + if err != nil { + continue + } + + // Check the process's open files to see if this process is the one using the lockfile + for _, f := range openFiles { + // We check for strings.Contains rather than equals because the open file's path contains + // a `\\?\` prefix. + if !strings.Contains(f.Path, pathToFile) { + continue + } + + processes = append(processes, p) + break + } + } + + if len(processes) == 0 { + return nil, fmt.Errorf("no processes found using file %s", pathToFile) + } + + return processes, nil +}