Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Check for and log information about stale osquery database lock files #2006

Open
wants to merge 17 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 45 additions & 0 deletions pkg/osquery/runtime/osqueryinstance.go
Original file line number Diff line number Diff line change
Expand Up @@ -249,6 +249,12 @@ func (i *OsqueryInstance) Launch() error {
return fmt.Errorf("could not calculate osquery file paths: %w", err)
}

// Make sure no lock files exist that could prevent osquery from starting up
if err := i.cleanUpOldDatabaseLock(ctx, paths); err != nil {
RebeccaMahany marked this conversation as resolved.
Show resolved Hide resolved
traces.SetError(span, fmt.Errorf("cleaning up old database lock: %w", err))
return fmt.Errorf("cleaning up old database lock: %w", err)
}

// Populate augeas lenses, if requested
if i.opts.augeasLensFunc != nil {
if err := os.MkdirAll(paths.augeasPath, 0755); err != nil {
Expand Down Expand Up @@ -702,6 +708,45 @@ func calculateOsqueryPaths(rootDirectory string, registrationId string, runId st
return osqueryFilePaths, nil
}

// cleanUpOldDatabaseLock checks for the presence of a lock file in the given database and removes it
// if it exists. The lock file can hang around if an osquery process isn't properly terminated, and
// will prevent us from launching a new osquery process.
func (i *OsqueryInstance) cleanUpOldDatabaseLock(ctx context.Context, paths *osqueryFilePaths) error {
lockFilePath := filepath.Join(paths.databasePath, "LOCK")

lockFileInfo, err := os.Stat(lockFilePath)
if os.IsNotExist(err) {
// No lock file, nothing to do here
return nil
}

// We do a couple retries here -- Windows complains about removing files in use sometimes.
RebeccaMahany marked this conversation as resolved.
Show resolved Hide resolved
err = backoff.WaitFor(func() error {
if err := os.Remove(lockFilePath); err != nil {
return fmt.Errorf("removing lock file at %s: %w", lockFilePath, err)
}
return nil
}, 5*time.Second, 500*time.Millisecond)

if err != nil {
i.slogger.Log(ctx, slog.LevelInfo,
"detected old lock file but could not remove it",
"lockfile_path", lockFilePath,
"lockfile_modtime", lockFileInfo.ModTime().String(),
"err", err,
)
return fmt.Errorf("removing lock file at %s: %w", lockFilePath, err)
}

i.slogger.Log(ctx, slog.LevelInfo,
"detected and removed old osquery db lock file",
"lockfile_path", lockFilePath,
"lockfile_modtime", lockFileInfo.ModTime().String(),
)

return nil
}

// createOsquerydCommand uses osqueryOptions to return an *exec.Cmd
// which will launch a properly configured osqueryd process.
func (i *OsqueryInstance) createOsquerydCommand(osquerydBinary string, paths *osqueryFilePaths) (*exec.Cmd, error) {
Expand Down
20 changes: 20 additions & 0 deletions pkg/osquery/runtime/osqueryinstance_other_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
//go:build !windows
// +build !windows

package runtime

import (
"syscall"
"testing"

"github.com/stretchr/testify/require"
)

func createLockFile(t *testing.T, fileToLock string) {
lockFile, err := syscall.Open(fileToLock, syscall.O_CREAT|syscall.O_RDONLY, 0600)
require.NoError(t, err)
require.NoError(t, syscall.Flock(lockFile, syscall.LOCK_EX|syscall.LOCK_NB))
t.Cleanup(func() {
syscall.Close(lockFile)
})
}
137 changes: 137 additions & 0 deletions pkg/osquery/runtime/osqueryinstance_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,9 @@ package runtime

import (
"context"
"errors"
"fmt"
"os"
"path/filepath"
"runtime"
"strings"
Expand All @@ -12,8 +14,10 @@ import (
"github.com/kolide/kit/ulid"
"github.com/kolide/launcher/ee/agent/types"
typesMocks "github.com/kolide/launcher/ee/agent/types/mocks"
"github.com/kolide/launcher/pkg/backoff"
"github.com/kolide/launcher/pkg/log/multislogger"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/mock"
"github.com/stretchr/testify/require"
)

Expand Down Expand Up @@ -235,3 +239,136 @@ func Test_healthcheckWithRetries(t *testing.T) {
// No client available, so healthcheck should fail despite retries
require.Error(t, i.healthcheckWithRetries(context.TODO(), 5, 100*time.Millisecond))
}

func TestLaunch(t *testing.T) {
t.Parallel()

logBytes, slogger := setUpTestSlogger()
rootDirectory := testRootDirectory(t)

k := typesMocks.NewKnapsack(t)
k.On("WatchdogEnabled").Return(true)
k.On("WatchdogMemoryLimitMB").Return(150)
k.On("WatchdogUtilizationLimitPercent").Return(20)
k.On("WatchdogDelaySec").Return(120)
k.On("OsqueryFlags").Return([]string{"verbose=true"})
k.On("OsqueryVerbose").Return(true)
k.On("Slogger").Return(slogger)
k.On("RootDirectory").Return(rootDirectory)
k.On("LoggingInterval").Return(1 * time.Second)
k.On("LogMaxBytesPerBatch").Return(500)
k.On("Transport").Return("jsonrpc")
setUpMockStores(t, k)
k.On("ReadEnrollSecret").Return("", nil)
k.On("LatestOsquerydPath", mock.Anything).Return(testOsqueryBinaryDirectory)
k.On("OsqueryHealthcheckStartupDelay").Return(10 * time.Second)

i := newInstance(types.DefaultRegistrationID, k, mockServiceClient())
go i.Launch()

// Wait for the instance to become healthy
require.NoError(t, backoff.WaitFor(func() error {
// Instance self-reports as healthy
if err := i.Healthy(); err != nil {
return fmt.Errorf("instance not healthy: %w", err)
}

// Confirm instance setup is complete
if i.stats == nil || i.stats.ConnectTime == "" {
return errors.New("no connect time set yet")
}

// Good to go
return nil
}, 30*time.Second, 1*time.Second), fmt.Sprintf("instance not healthy by %s: instance logs:\n\n%s", time.Now().String(), logBytes.String()))

// Now wait for full shutdown
i.BeginShutdown()
shutdownErr := make(chan error)
go func() {
shutdownErr <- i.WaitShutdown()
}()

select {
case err := <-shutdownErr:
require.True(t, errors.Is(err, context.Canceled), fmt.Sprintf("instance logs:\n\n%s", logBytes.String()))
case <-time.After(1 * time.Minute):
t.Error("instance did not shut down within timeout", fmt.Sprintf("instance logs: %s", logBytes.String()))
t.FailNow()
}

k.AssertExpectations(t)
}

func TestLaunch_WithLockFileCleanup(t *testing.T) {
t.Parallel()

logBytes, slogger := setUpTestSlogger()
rootDirectory := testRootDirectory(t)

k := typesMocks.NewKnapsack(t)
k.On("WatchdogEnabled").Return(true)
k.On("WatchdogMemoryLimitMB").Return(150)
k.On("WatchdogUtilizationLimitPercent").Return(20)
k.On("WatchdogDelaySec").Return(120)
k.On("OsqueryFlags").Return([]string{"verbose=true"})
k.On("OsqueryVerbose").Return(true)
k.On("Slogger").Return(slogger)
k.On("RootDirectory").Return(rootDirectory)
k.On("LoggingInterval").Return(1 * time.Second)
k.On("LogMaxBytesPerBatch").Return(500)
k.On("Transport").Return("jsonrpc")
setUpMockStores(t, k)
k.On("ReadEnrollSecret").Return("", nil)
k.On("LatestOsquerydPath", mock.Anything).Return(testOsqueryBinaryDirectory)
k.On("OsqueryHealthcheckStartupDelay").Return(10 * time.Second)

i := newInstance(types.DefaultRegistrationID, k, mockServiceClient())

// Create a lock file that would ordinarily prevent osquery from starting up
lockFilePath := filepath.Join(rootDirectory, "osquery.db", "LOCK")
require.NoError(t, os.MkdirAll(filepath.Dir(lockFilePath), 0700)) // drwx
createLockFile(t, lockFilePath)

// Start the instance
instanceStartTime := time.Now()
go i.Launch()

// Wait for the instance to become healthy
require.NoError(t, backoff.WaitFor(func() error {
// Instance self-reports as healthy
if err := i.Healthy(); err != nil {
return fmt.Errorf("instance not healthy: %w", err)
}

// Confirm instance setup is complete
if i.stats == nil || i.stats.ConnectTime == "" {
return errors.New("no connect time set yet")
}

// Good to go
return nil
}, 30*time.Second, 1*time.Second), fmt.Sprintf("instance not healthy by %s: instance logs:\n\n%s", time.Now().String(), logBytes.String()))

// Confirm that the lock file is new -- it should have a modtime after instanceStartTime
lockFileInfo, err := os.Stat(lockFilePath)
require.NoError(t, err)
require.True(t, lockFileInfo.ModTime().After(instanceStartTime), logBytes.String())

// Now wait for full shutdown
i.BeginShutdown()
shutdownErr := make(chan error)
go func() {
shutdownErr <- i.WaitShutdown()
}()

select {
case err := <-shutdownErr:
require.True(t, errors.Is(err, context.Canceled), fmt.Sprintf("instance logs:\n\n%s", logBytes.String()))
case <-time.After(1 * time.Minute):
t.Error("instance did not shut down within timeout", fmt.Sprintf("instance logs: %s", logBytes.String()))
t.FailNow()
}

k.AssertExpectations(t)
}
40 changes: 40 additions & 0 deletions pkg/osquery/runtime/osqueryinstance_windows_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,9 @@ package runtime

import (
"strings"
"syscall"
"testing"
"unsafe"

"github.com/kolide/launcher/ee/agent/types"
typesMocks "github.com/kolide/launcher/ee/agent/types/mocks"
Expand Down Expand Up @@ -50,3 +52,41 @@ func TestCreateOsqueryCommandEnvVars(t *testing.T) {

k.AssertExpectations(t)
}

func createLockFile(t *testing.T, fileToLock string) {
lockedFileName, err := syscall.UTF16PtrFromString(fileToLock)
require.NoError(t, err)

handle, err := syscall.CreateFile(
lockedFileName,
syscall.GENERIC_READ,
syscall.FILE_SHARE_READ|syscall.FILE_SHARE_WRITE|syscall.FILE_SHARE_DELETE,
nil,
syscall.CREATE_ALWAYS,
syscall.FILE_ATTRIBUTE_NORMAL,
0,
)
require.NoError(t, err)

t.Cleanup(func() {
syscall.Close(handle)
})

// See: https://learn.microsoft.com/en-us/windows/win32/api/fileapi/nf-fileapi-lockfileex
modkernel32 := syscall.NewLazyDLL("kernel32.dll")
procLockFileEx := modkernel32.NewProc("LockFileEx")
overlapped := &syscall.Overlapped{
HEvent: 0,
}

r1, _, e1 := syscall.SyscallN(
procLockFileEx.Addr(),
uintptr(handle),
uintptr(2), // LOCKFILE_EXCLUSIVE_LOCK
uintptr(0), // Reserved parameter; must be set to zero.
uintptr(1), // nNumberOfBytesToLockLow -- the low-order 32 bits of the length of the byte range to lock.
uintptr(0), // nNumberOfBytesToLockHigh -- the high-order 32 bits of the length of the byte range to lock.
uintptr(unsafe.Pointer(overlapped)),
)
require.False(t, r1 == 0, error(e1))
}
Loading