From 028a78b462e104b73bf929a03aaeaef661730690 Mon Sep 17 00:00:00 2001 From: "David A. Bradley" Date: Wed, 27 Mar 2024 15:01:34 -0400 Subject: [PATCH] Use force unmount and explicitly unmount bad mount points There have been cases where the logic to cleanup a mount point has caused the driver to get into a bad state. This is most obvious when a subdirectory is mounted to a volume and a parent directory of that subdirectory is deleted. The Lustre driver doesn't handle that case in the way that Kubernetes expects and returns invalid data. To avoid this scenario causing our driver to get into a bad state, leak mount points, etc, we must explicitly check that we can read the necessary information about the mount point, and if not, explicitly unmount that mount point before allowing Kubernetes to clean up the directory. To ensure that we don't end up in a bad state, this change enables force unmounting as well. The force unmount will only occur after a timeout has expired, since force unmounts can cause issues with the Lustre driver. However, in this case, it is better if we are in a bad enough situation to be able to eventually return to a good state rather than require manual intervention. --- pkg/azurelustre/azurelustre.go | 8 ++++++ pkg/azurelustre/fake_mount.go | 5 ++++ pkg/azurelustre/nodeserver.go | 41 +++++++++++++++++++++++++++--- pkg/azurelustre/nodeserver_test.go | 9 +++++++ 4 files changed, 60 insertions(+), 3 deletions(-) diff --git a/pkg/azurelustre/azurelustre.go b/pkg/azurelustre/azurelustre.go index 77aa20e70..0da84e72b 100644 --- a/pkg/azurelustre/azurelustre.go +++ b/pkg/azurelustre/azurelustre.go @@ -91,6 +91,7 @@ type Driver struct { // enableAzureLustreMockMount is only for testing, DO NOT set as true in non-testing scenario enableAzureLustreMockMount bool mounter *mount.SafeFormatAndMount // TODO_JUSJIN: check any other alternatives + forceMounter *mount.MounterForceUnmounter volLockMap *util.LockMap // Directory to temporarily mount to for subdirectory creation workingMountDir string @@ -132,6 +133,13 @@ func (d *Driver) Run(endpoint string, testBool bool) { Interface: mount.New(""), Exec: utilexec.New(), } + forceUnmounter, ok := d.mounter.Interface.(mount.MounterForceUnmounter) + if ok { + klog.V(4).Infof("Using force unmounter interface") + d.forceMounter = &forceUnmounter + } else { + klog.Fatalf("Mounter does not support force unmount") + } // TODO_JUSJIN: revisit these caps // Initialize default library driver diff --git a/pkg/azurelustre/fake_mount.go b/pkg/azurelustre/fake_mount.go index 912ed4602..b13321b18 100644 --- a/pkg/azurelustre/fake_mount.go +++ b/pkg/azurelustre/fake_mount.go @@ -19,6 +19,7 @@ package azurelustre import ( "fmt" "strings" + "time" mount "k8s.io/mount-utils" ) @@ -69,3 +70,7 @@ func (f *fakeMounter) IsLikelyNotMountPoint(file string) (bool, error) { } return true, nil } + +func (f *fakeMounter) UnmountWithForce(target string, _ time.Duration) error { + return f.Unmount(target) +} diff --git a/pkg/azurelustre/nodeserver.go b/pkg/azurelustre/nodeserver.go index 1d5519486..493b0d8a5 100644 --- a/pkg/azurelustre/nodeserver.go +++ b/pkg/azurelustre/nodeserver.go @@ -21,6 +21,7 @@ import ( "os" "path/filepath" "strings" + "time" "github.com/container-storage-interface/spec/lib/go/csi" "golang.org/x/net/context" @@ -317,10 +318,44 @@ func (d *Driver) NodeUnpublishVolume( } func unmountVolumeAtPath(d *Driver, targetPath string) error { + shouldUnmountBadPath := false + d.kernelModuleLock.Lock() defer d.kernelModuleLock.Unlock() - err := mount.CleanupMountPoint(targetPath, d.mounter, - true /*extensiveMountPointCheck*/) + + parent := filepath.Dir(targetPath) + klog.V(2).Infof("Listing dir: %s", parent) + entries, err := os.ReadDir(parent) + if err != nil { + klog.Warningf("could not list directory %s, will explicitly unmount path before cleanup %s: %q", parent, targetPath, err) + shouldUnmountBadPath = true + } + + for _, e := range entries { + if e.Name() == filepath.Base(targetPath) { + _, err := e.Info() + if err != nil { + klog.Warningf("could not get info for entry %s, will explicitly unmount path before cleanup %s: %q", e.Name(), targetPath, err) + shouldUnmountBadPath = true + } + } + } + + if shouldUnmountBadPath { + // In these cases, if we only ran mount.CleanupMountWithForce, + // it would have issues trying to stat the directory before + // cleanup, so we need to explicitly unmount the path, with + // force if necessary. Then the directory can be cleaned up + // by the mount.CleanupMountWithForce call. + klog.V(4).Infof("unmounting bad mount: %s)", targetPath) + forceUnmounter := *d.forceMounter + if err := forceUnmounter.UnmountWithForce(targetPath, 30*time.Second); err != nil { + klog.Warningf("couldn't unmount %s: %q", targetPath, err) + } + } + + err = mount.CleanupMountWithForce(targetPath, *d.forceMounter, + true /*extensiveMountPointCheck*/, 10*time.Second) return err } @@ -653,7 +688,7 @@ func (d *Driver) internalUnmount(mountPath string) error { klog.V(4).Infof("internally unmounting %v", target) - err = mount.CleanupMountPoint(target, d.mounter, true) + err = mount.CleanupMountWithForce(target, *d.forceMounter, true, 10*time.Second) if err != nil { err = status.Errorf(codes.Internal, "failed to unmount staging target %q: %v", target, err) } diff --git a/pkg/azurelustre/nodeserver_test.go b/pkg/azurelustre/nodeserver_test.go index 028f4ed85..4511f0228 100644 --- a/pkg/azurelustre/nodeserver_test.go +++ b/pkg/azurelustre/nodeserver_test.go @@ -115,6 +115,9 @@ func TestEnsureMountPoint(t *testing.T) { Interface: fakeMounter, Exec: fakeExec, } + forceMounter, ok := d.mounter.Interface.(mount.MounterForceUnmounter) + require.True(t, ok, "Mounter should implement MounterForceUnmounter") + d.forceMounter = &forceMounter for _, test := range tests { err := makeDir(alreadyExistTarget) @@ -542,6 +545,9 @@ func TestNodePublishVolume(t *testing.T) { Interface: fakeMounter, Exec: fakeExec, } + forceMounter, ok := d.mounter.Interface.(mount.MounterForceUnmounter) + require.True(t, ok, "Mounter should implement MounterForceUnmounter") + d.forceMounter = &forceMounter d.workingMountDir = workingMountDir err := makeDir(targetTest) require.NoError(t, err) @@ -736,6 +742,9 @@ func TestNodeUnpublishVolume(t *testing.T) { Interface: fakeMounter, Exec: fakeExec, } + forceMounter, ok := d.mounter.Interface.(mount.MounterForceUnmounter) + require.True(t, ok, "Mounter should implement MounterForceUnmounter") + d.forceMounter = &forceMounter err := makeDir(targetTest) require.NoError(t, err)