Skip to content

Commit

Permalink
Check for and log information about stale osquery database lock files (
Browse files Browse the repository at this point in the history
  • Loading branch information
RebeccaMahany authored Dec 30, 2024
1 parent 3b0aa9a commit bb27062
Show file tree
Hide file tree
Showing 6 changed files with 329 additions and 0 deletions.
83 changes: 83 additions & 0 deletions pkg/osquery/runtime/osqueryinstance.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ import (
"github.com/osquery/osquery-go/plugin/config"
"github.com/osquery/osquery-go/plugin/distributed"
osquerylogger "github.com/osquery/osquery-go/plugin/logger"
"github.com/shirou/gopsutil/v3/process"
"go.opentelemetry.io/otel/attribute"
"go.opentelemetry.io/otel/trace"

Expand Down Expand Up @@ -249,6 +250,15 @@ func (i *OsqueryInstance) Launch() error {
return fmt.Errorf("could not calculate osquery file paths: %w", err)
}

// Check to see whether lock files exist that could prevent osquery from starting up
if _, err := i.detectStaleDatabaseLock(ctx, paths); err != nil {
// A detection error shouldn't prevent us from creating the process -- log it and move on
i.slogger.Log(ctx, slog.LevelInfo,
"error detecting stale database lock",
"err", err,
)
}

// Populate augeas lenses, if requested
if i.opts.augeasLensFunc != nil {
if err := os.MkdirAll(paths.augeasPath, 0755); err != nil {
Expand Down Expand Up @@ -702,6 +712,79 @@ func calculateOsqueryPaths(rootDirectory string, registrationId string, runId st
return osqueryFilePaths, nil
}

// detectStaleDatabaseLock checks for the presence of a lock file in the given database.
// We have noticed the lock file occasionally hanging around -- we think this can happen
// when osquery doesn't get adequate time to shut down gracefully. The presence of the lock
// file will prevent osquery from starting up entirely. For now, we just detect this stale
// lock file, and log information about it. In the future we will attempt to remediate.
// See: https://github.com/kolide/launcher/issues/2004.
func (i *OsqueryInstance) detectStaleDatabaseLock(ctx context.Context, paths *osqueryFilePaths) (bool, error) {
lockFilePath := filepath.Join(paths.databasePath, "LOCK")

lockFileInfo, err := os.Stat(lockFilePath)
if os.IsNotExist(err) {
// No lock file, nothing to do here
return false, nil
}
if err != nil {
return false, fmt.Errorf("determining whether lock file exists: %w", err)
}

infoToLog := []any{
"lockfile_path", lockFilePath,
"lockfile_modtime", lockFileInfo.ModTime().UTC().String(),
}

defer func() {
i.slogger.Log(ctx, slog.LevelInfo,
"detected stale osquery db lock file",
infoToLog...,
)
}()

// Check to see whether the process holding the file still exists
processes, err := getProcessesHoldingFile(ctx, lockFilePath)
if err != nil {
infoToLog = append(infoToLog, "err", err)
return false, fmt.Errorf("getting process holding file: %w", err)
}

// Grab more info to log from the processes using the lockfile
processStrs := make([]string, len(processes))
for i, p := range processes {
processStrs[i] = processStr(ctx, p)
}
infoToLog = append(infoToLog, "processes", processStrs)

return true, nil
}

func processStr(ctx context.Context, p *process.Process) string {
name := "unknown"
processOwner := "unknown"
runningStatus := "unknown"
cmdline := "unknown"

if gotName, err := p.NameWithContext(ctx); err == nil {
name = gotName
}
if gotUsername, err := p.UsernameWithContext(ctx); err == nil {
processOwner = gotUsername
}
if gotIsRunning, err := p.IsRunningWithContext(ctx); err == nil {
if gotIsRunning {
runningStatus = "running"
} else {
runningStatus = "not running"
}
}
if gotCmdline, err := p.CmdlineWithContext(ctx); err == nil {
cmdline = gotCmdline
}

return fmt.Sprintf("process with name `%s` and PID %d belonging to user `%s` has current status `%s` (%s)", name, p.Pid, processOwner, runningStatus, cmdline)
}

// createOsquerydCommand uses osqueryOptions to return an *exec.Cmd
// which will launch a properly configured osqueryd process.
func (i *OsqueryInstance) createOsquerydCommand(osquerydBinary string, paths *osqueryFilePaths) (*exec.Cmd, error) {
Expand Down
20 changes: 20 additions & 0 deletions pkg/osquery/runtime/osqueryinstance_other_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
//go:build !windows
// +build !windows

package runtime

import (
"syscall"
"testing"

"github.com/stretchr/testify/require"
)

func createLockFile(t *testing.T, fileToLock string) {
lockFile, err := syscall.Open(fileToLock, syscall.O_CREAT|syscall.O_RDONLY, 0600)
require.NoError(t, err)
require.NoError(t, syscall.Flock(lockFile, syscall.LOCK_EX|syscall.LOCK_NB))
t.Cleanup(func() {
syscall.Close(lockFile)
})
}
103 changes: 103 additions & 0 deletions pkg/osquery/runtime/osqueryinstance_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,9 @@ package runtime

import (
"context"
"errors"
"fmt"
"os"
"path/filepath"
"runtime"
"strings"
Expand All @@ -12,8 +14,10 @@ import (
"github.com/kolide/kit/ulid"
"github.com/kolide/launcher/ee/agent/types"
typesMocks "github.com/kolide/launcher/ee/agent/types/mocks"
"github.com/kolide/launcher/pkg/backoff"
"github.com/kolide/launcher/pkg/log/multislogger"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/mock"
"github.com/stretchr/testify/require"
)

Expand Down Expand Up @@ -235,3 +239,102 @@ func Test_healthcheckWithRetries(t *testing.T) {
// No client available, so healthcheck should fail despite retries
require.Error(t, i.healthcheckWithRetries(context.TODO(), 5, 100*time.Millisecond))
}

func TestLaunch(t *testing.T) {
t.Parallel()

logBytes, slogger := setUpTestSlogger()
rootDirectory := testRootDirectory(t)

k := typesMocks.NewKnapsack(t)
k.On("WatchdogEnabled").Return(true)
k.On("WatchdogMemoryLimitMB").Return(150)
k.On("WatchdogUtilizationLimitPercent").Return(20)
k.On("WatchdogDelaySec").Return(120)
k.On("OsqueryFlags").Return([]string{"verbose=true"})
k.On("OsqueryVerbose").Return(true)
k.On("Slogger").Return(slogger)
k.On("RootDirectory").Return(rootDirectory)
k.On("LoggingInterval").Return(1 * time.Second)
k.On("LogMaxBytesPerBatch").Return(500)
k.On("Transport").Return("jsonrpc")
setUpMockStores(t, k)
k.On("ReadEnrollSecret").Return("", nil)
k.On("LatestOsquerydPath", mock.Anything).Return(testOsqueryBinary)
k.On("OsqueryHealthcheckStartupDelay").Return(10 * time.Second)

i := newInstance(types.DefaultRegistrationID, k, mockServiceClient())
go i.Launch()

// Wait for the instance to become healthy
require.NoError(t, backoff.WaitFor(func() error {
// Instance self-reports as healthy
if err := i.Healthy(); err != nil {
return fmt.Errorf("instance not healthy: %w", err)
}

// Confirm instance setup is complete
if i.stats == nil || i.stats.ConnectTime == "" {
return errors.New("no connect time set yet")
}

// Good to go
return nil
}, 30*time.Second, 1*time.Second), fmt.Sprintf("instance not healthy by %s: instance logs:\n\n%s", time.Now().String(), logBytes.String()))

// Now wait for full shutdown
i.BeginShutdown()
shutdownErr := make(chan error)
go func() {
shutdownErr <- i.WaitShutdown()
}()

select {
case err := <-shutdownErr:
require.True(t, errors.Is(err, context.Canceled), fmt.Sprintf("instance logs:\n\n%s", logBytes.String()))
case <-time.After(1 * time.Minute):
t.Error("instance did not shut down within timeout", fmt.Sprintf("instance logs: %s", logBytes.String()))
t.FailNow()
}

k.AssertExpectations(t)
}

func Test_detectStaleDatabaseLock(t *testing.T) {
t.Parallel()

if runtime.GOOS == "windows" && os.Getenv("GITHUB_ACTIONS") == "true" {
t.Skip("Skipping test on GitHub Actions -- test only works locally on Windows")
}

_, slogger := setUpTestSlogger()
rootDirectory := testRootDirectory(t)

k := typesMocks.NewKnapsack(t)
k.On("Slogger").Return(slogger)
k.On("RootDirectory").Return(rootDirectory)
setUpMockStores(t, k)

i := newInstance(types.DefaultRegistrationID, k, mockServiceClient())

// Calculate paths
paths, err := calculateOsqueryPaths(i.knapsack.RootDirectory(), i.registrationId, i.runId, i.opts)
require.NoError(t, err)

// Check for stale database lock -- there shouldn't be one
detected, err := i.detectStaleDatabaseLock(context.TODO(), paths)
require.NoError(t, err)
require.False(t, detected)

// Create a lock file
lockFilePath := filepath.Join(rootDirectory, "osquery.db", "LOCK")
require.NoError(t, os.MkdirAll(filepath.Dir(lockFilePath), 0700)) // drwx
createLockFile(t, lockFilePath)

// Check for a stale database lock again -- now, we should find it
detectedRetry, err := i.detectStaleDatabaseLock(context.TODO(), paths)
require.NoError(t, err)
require.True(t, detectedRetry)

k.AssertExpectations(t)
}
40 changes: 40 additions & 0 deletions pkg/osquery/runtime/osqueryinstance_windows_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,9 @@ package runtime

import (
"strings"
"syscall"
"testing"
"unsafe"

"github.com/kolide/launcher/ee/agent/types"
typesMocks "github.com/kolide/launcher/ee/agent/types/mocks"
Expand Down Expand Up @@ -50,3 +52,41 @@ func TestCreateOsqueryCommandEnvVars(t *testing.T) {

k.AssertExpectations(t)
}

func createLockFile(t *testing.T, fileToLock string) {
lockedFileName, err := syscall.UTF16PtrFromString(fileToLock)
require.NoError(t, err)

handle, err := syscall.CreateFile(
lockedFileName,
syscall.GENERIC_READ,
syscall.FILE_SHARE_READ|syscall.FILE_SHARE_WRITE|syscall.FILE_SHARE_DELETE,
nil,
syscall.CREATE_ALWAYS,
syscall.FILE_ATTRIBUTE_NORMAL,
0,
)
require.NoError(t, err)

t.Cleanup(func() {
syscall.Close(handle)
})

// See: https://learn.microsoft.com/en-us/windows/win32/api/fileapi/nf-fileapi-lockfileex
modkernel32 := syscall.NewLazyDLL("kernel32.dll")
procLockFileEx := modkernel32.NewProc("LockFileEx")
overlapped := &syscall.Overlapped{
HEvent: 0,
}

r1, _, e1 := syscall.SyscallN(
procLockFileEx.Addr(),
uintptr(handle),
uintptr(2), // LOCKFILE_EXCLUSIVE_LOCK
uintptr(0), // Reserved parameter; must be set to zero.
uintptr(1), // nNumberOfBytesToLockLow -- the low-order 32 bits of the length of the byte range to lock.
uintptr(0), // nNumberOfBytesToLockHigh -- the high-order 32 bits of the length of the byte range to lock.
uintptr(unsafe.Pointer(overlapped)),
)
require.False(t, r1 == 0, error(e1))
}
45 changes: 45 additions & 0 deletions pkg/osquery/runtime/runtime_helpers.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,17 @@
package runtime

import (
"context"
"errors"
"fmt"
"os/exec"
"path/filepath"
"strconv"
"strings"
"syscall"

"github.com/kolide/launcher/ee/allowedcmd"
"github.com/shirou/gopsutil/v3/process"
)

func setpgid() *syscall.SysProcAttr {
Expand All @@ -33,3 +40,41 @@ func platformArgs() []string {
func isExitOk(_ error) bool {
return false
}

func getProcessesHoldingFile(ctx context.Context, pathToFile string) ([]*process.Process, error) {
cmd, err := allowedcmd.Lsof(ctx, "-t", pathToFile)
if err != nil {
return nil, fmt.Errorf("creating lsof command: %w", err)
}

out, err := cmd.CombinedOutput()
if err != nil {
return nil, fmt.Errorf("running lsof: %w", err)
}

pidStr := strings.TrimSpace(string(out))
if pidStr == "" {
return nil, errors.New("no process found using file via lsof")
}

pidStrs := strings.Split(strings.TrimSpace(string(out)), "\n")
if len(pidStrs) == 0 {
return nil, errors.New("no processes found using file via lsof")
}

processes := make([]*process.Process, 0)
for _, pidStr := range pidStrs {
pid, err := strconv.ParseInt(pidStr, 10, 32)
if err != nil {
return nil, fmt.Errorf("invalid pid %s: %w", pidStr, err)
}

p, err := process.NewProcess(int32(pid))
if err != nil {
return nil, fmt.Errorf("getting process for %d: %w", pid, err)
}
processes = append(processes, p)
}

return processes, nil
}
Loading

0 comments on commit bb27062

Please sign in to comment.