Skip to content

Commit

Permalink
Check for and clean up old osquery database lock files before startin…
Browse files Browse the repository at this point in the history
…g process
  • Loading branch information
RebeccaMahany committed Dec 17, 2024
1 parent 338c676 commit a833edf
Show file tree
Hide file tree
Showing 4 changed files with 234 additions and 0 deletions.
37 changes: 37 additions & 0 deletions pkg/osquery/runtime/osqueryinstance.go
Original file line number Diff line number Diff line change
Expand Up @@ -249,6 +249,12 @@ func (i *OsqueryInstance) Launch() error {
return fmt.Errorf("could not calculate osquery file paths: %w", err)
}

// Make sure no lock files exist that could prevent osquery from starting up
if err := i.cleanUpOldDatabaseLock(ctx, paths); err != nil {
traces.SetError(span, fmt.Errorf("cleaning up old database lock: %w", err))
return fmt.Errorf("cleaning up old database lock: %w", err)
}

// Populate augeas lenses, if requested
if i.opts.augeasLensFunc != nil {
if err := os.MkdirAll(paths.augeasPath, 0755); err != nil {
Expand Down Expand Up @@ -702,6 +708,37 @@ func calculateOsqueryPaths(rootDirectory string, registrationId string, runId st
return osqueryFilePaths, nil
}

// cleanUpOldDatabaseLock checks for the presence of a lock file in the given database and removes it
// if it exists. The lock file can hang around if an osquery process isn't properly terminated, and
// will prevent us from launching a new osquery process.
func (i *OsqueryInstance) cleanUpOldDatabaseLock(ctx context.Context, paths *osqueryFilePaths) error {
lockFilePath := filepath.Join(paths.databasePath, "LOCK")

lockFileInfo, err := os.Stat(lockFilePath)
if os.IsNotExist(err) {
// No lock file, nothing to do here
return nil
}

if err := os.Remove(lockFilePath); err != nil {
i.slogger.Log(ctx, slog.LevelInfo,
"detected old lock file but could not remove it",
"lockfile_path", lockFilePath,
"lockfile_modtime", lockFileInfo.ModTime().String(),
"err", err,
)
return fmt.Errorf("removing lock file at %s: %w", lockFilePath, err)
}

i.slogger.Log(ctx, slog.LevelInfo,
"detected and removed old osquery db lock file",
"lockfile_path", lockFilePath,
"lockfile_modtime", lockFileInfo.ModTime().String(),
)

return nil
}

// createOsquerydCommand uses osqueryOptions to return an *exec.Cmd
// which will launch a properly configured osqueryd process.
func (i *OsqueryInstance) createOsquerydCommand(osquerydBinary string, paths *osqueryFilePaths) (*exec.Cmd, error) {
Expand Down
20 changes: 20 additions & 0 deletions pkg/osquery/runtime/osqueryinstance_other_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
//go:build !windows
// +build !windows

package runtime

import (
"syscall"
"testing"

"github.com/stretchr/testify/require"
)

func createLockFile(t *testing.T, fileToLock string) {
lockFile, err := syscall.Open(fileToLock, syscall.O_CREAT|syscall.O_RDONLY, 0600)
require.NoError(t, err)
require.NoError(t, syscall.Flock(lockFile, syscall.LOCK_EX|syscall.LOCK_NB))
t.Cleanup(func() {
syscall.Close(lockFile)
})
}
137 changes: 137 additions & 0 deletions pkg/osquery/runtime/osqueryinstance_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,9 @@ package runtime

import (
"context"
"errors"
"fmt"
"os"
"path/filepath"
"runtime"
"strings"
Expand All @@ -12,8 +14,10 @@ import (
"github.com/kolide/kit/ulid"
"github.com/kolide/launcher/ee/agent/types"
typesMocks "github.com/kolide/launcher/ee/agent/types/mocks"
"github.com/kolide/launcher/pkg/backoff"
"github.com/kolide/launcher/pkg/log/multislogger"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/mock"
"github.com/stretchr/testify/require"
)

Expand Down Expand Up @@ -235,3 +239,136 @@ func Test_healthcheckWithRetries(t *testing.T) {
// No client available, so healthcheck should fail despite retries
require.Error(t, i.healthcheckWithRetries(context.TODO(), 5, 100*time.Millisecond))
}

func TestLaunch(t *testing.T) {
t.Parallel()

logBytes, slogger := setUpTestSlogger()
rootDirectory := testRootDirectory(t)

k := typesMocks.NewKnapsack(t)
k.On("WatchdogEnabled").Return(true)
k.On("WatchdogMemoryLimitMB").Return(150)
k.On("WatchdogUtilizationLimitPercent").Return(20)
k.On("WatchdogDelaySec").Return(120)
k.On("OsqueryFlags").Return([]string{"verbose=true"})
k.On("OsqueryVerbose").Return(true)
k.On("Slogger").Return(slogger)
k.On("RootDirectory").Return(rootDirectory)
k.On("LoggingInterval").Return(1 * time.Second)
k.On("LogMaxBytesPerBatch").Return(500)
k.On("Transport").Return("jsonrpc")
setUpMockStores(t, k)
k.On("ReadEnrollSecret").Return("", nil)
k.On("LatestOsquerydPath", mock.Anything).Return(testOsqueryBinaryDirectory)
k.On("OsqueryHealthcheckStartupDelay").Return(10 * time.Second)

i := newInstance(types.DefaultRegistrationID, k, mockServiceClient())
go i.Launch()

// Wait for the instance to become healthy
require.NoError(t, backoff.WaitFor(func() error {
// Instance self-reports as healthy
if err := i.Healthy(); err != nil {
return fmt.Errorf("instance not healthy: %w", err)
}

// Confirm instance setup is complete
if i.stats == nil || i.stats.ConnectTime == "" {
return errors.New("no connect time set yet")
}

// Good to go
return nil
}, 30*time.Second, 1*time.Second), fmt.Sprintf("instance not healthy by %s: instance logs:\n\n%s", time.Now().String(), logBytes.String()))

// Now wait for full shutdown
i.BeginShutdown()
shutdownErr := make(chan error)
go func() {
shutdownErr <- i.WaitShutdown()
}()

select {
case err := <-shutdownErr:
require.True(t, errors.Is(err, context.Canceled), fmt.Sprintf("instance logs:\n\n%s", logBytes.String()))
case <-time.After(1 * time.Minute):
t.Error("instance did not shut down within timeout", fmt.Sprintf("instance logs: %s", logBytes.String()))
t.FailNow()
}

k.AssertExpectations(t)
}

func TestLaunch_WithLockFileCleanup(t *testing.T) {
t.Parallel()

logBytes, slogger := setUpTestSlogger()
rootDirectory := testRootDirectory(t)

k := typesMocks.NewKnapsack(t)
k.On("WatchdogEnabled").Return(true)
k.On("WatchdogMemoryLimitMB").Return(150)
k.On("WatchdogUtilizationLimitPercent").Return(20)
k.On("WatchdogDelaySec").Return(120)
k.On("OsqueryFlags").Return([]string{"verbose=true"})
k.On("OsqueryVerbose").Return(true)
k.On("Slogger").Return(slogger)
k.On("RootDirectory").Return(rootDirectory)
k.On("LoggingInterval").Return(1 * time.Second)
k.On("LogMaxBytesPerBatch").Return(500)
k.On("Transport").Return("jsonrpc")
setUpMockStores(t, k)
k.On("ReadEnrollSecret").Return("", nil)
k.On("LatestOsquerydPath", mock.Anything).Return(testOsqueryBinaryDirectory)
k.On("OsqueryHealthcheckStartupDelay").Return(10 * time.Second)

i := newInstance(types.DefaultRegistrationID, k, mockServiceClient())

// Create a lock file that would ordinarily prevent osquery from starting up
lockFilePath := filepath.Join(rootDirectory, "osquery.db", "LOCK")
require.NoError(t, os.MkdirAll(filepath.Dir(lockFilePath), 0700)) // drwx
createLockFile(t, lockFilePath)

// Start the instance
instanceStartTime := time.Now()
go i.Launch()

// Wait for the instance to become healthy
require.NoError(t, backoff.WaitFor(func() error {
// Instance self-reports as healthy
if err := i.Healthy(); err != nil {
return fmt.Errorf("instance not healthy: %w", err)
}

// Confirm instance setup is complete
if i.stats == nil || i.stats.ConnectTime == "" {
return errors.New("no connect time set yet")
}

// Good to go
return nil
}, 30*time.Second, 1*time.Second), fmt.Sprintf("instance not healthy by %s: instance logs:\n\n%s", time.Now().String(), logBytes.String()))

// Confirm that the lock file is new -- it should have a modtime after instanceStartTime
lockFileInfo, err := os.Stat(lockFilePath)
require.NoError(t, err)
require.True(t, lockFileInfo.ModTime().After(instanceStartTime), logBytes.String())

// Now wait for full shutdown
i.BeginShutdown()
shutdownErr := make(chan error)
go func() {
shutdownErr <- i.WaitShutdown()
}()

select {
case err := <-shutdownErr:
require.True(t, errors.Is(err, context.Canceled), fmt.Sprintf("instance logs:\n\n%s", logBytes.String()))
case <-time.After(1 * time.Minute):
t.Error("instance did not shut down within timeout", fmt.Sprintf("instance logs: %s", logBytes.String()))
t.FailNow()
}

k.AssertExpectations(t)
}
40 changes: 40 additions & 0 deletions pkg/osquery/runtime/osqueryinstance_windows_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,9 @@ package runtime

import (
"strings"
"syscall"
"testing"
"unsafe"

"github.com/kolide/launcher/ee/agent/types"
typesMocks "github.com/kolide/launcher/ee/agent/types/mocks"
Expand Down Expand Up @@ -50,3 +52,41 @@ func TestCreateOsqueryCommandEnvVars(t *testing.T) {

k.AssertExpectations(t)
}

func createLockFile(t *testing.T, fileToLock string) {
lockedFileName, err := syscall.UTF16PtrFromString(fileToLock)
require.NoError(t, err)

handle, err := syscall.CreateFile(
lockedFileName,
syscall.GENERIC_READ,
syscall.FILE_SHARE_READ|syscall.FILE_SHARE_WRITE|syscall.FILE_SHARE_DELETE,
nil,
syscall.CREATE_ALWAYS,
syscall.FILE_ATTRIBUTE_NORMAL,
0,
)
require.NoError(t, err)

t.Cleanup(func() {
syscall.Close(handle)
})

// See: https://learn.microsoft.com/en-us/windows/win32/api/fileapi/nf-fileapi-lockfileex
modkernel32 := syscall.NewLazyDLL("kernel32.dll")
procLockFileEx := modkernel32.NewProc("LockFileEx")
overlapped := &syscall.Overlapped{
HEvent: 0,
}

r1, _, e1 := syscall.SyscallN(
procLockFileEx.Addr(),
uintptr(handle),
uintptr(2), // LOCKFILE_EXCLUSIVE_LOCK
uintptr(0), // Reserved parameter; must be set to zero.
uintptr(1), // nNumberOfBytesToLockLow -- the low-order 32 bits of the length of the byte range to lock.
uintptr(0), // nNumberOfBytesToLockHigh -- the high-order 32 bits of the length of the byte range to lock.
uintptr(unsafe.Pointer(overlapped)),
)
require.False(t, r1 == 0, error(e1))
}

0 comments on commit a833edf

Please sign in to comment.