[SS] Support caching old snapshot manifests (#5906)
maggie-lou authored Feb 21, 2024
1 parent 7c8d601 commit 6756b50
Showing 8 changed files with 248 additions and 60 deletions.
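
In short: every VM run now gets its own snapshot ID (assigned alongside the VM ID in `newID`), that ID is recorded in the VM metadata and shown on the invocation action card, and a snapshot key can carry a `SnapshotId` so an older snapshot manifest can be loaded instead of the newest one for that branch. Below is a minimal sketch of the resume-from-an-older-snapshot flow, modeled on the new test case in this commit; the names `ctx`, `env`, `task`, `cfg`, `rootDir`, `busyboxImage`, `minMemSizeMB`, `t`, and a previously paused `baseVM` come from that test's setup and are assumptions here.

```go
// Pin the branch key to the original VM's per-run snapshot ID so that the
// older manifest is loaded instead of the newest one saved for this branch.
key := baseVM.SnapshotKeySet().GetBranchKey()
key.SnapshotId = baseVM.SnapshotID()

workDir := testfs.MakeDirAll(t, rootDir, "work-resume-old-snapshot")
opts := firecracker.ContainerOpts{
	ContainerImage:         busyboxImage,
	ActionWorkingDirectory: workDir,
	VMConfiguration: &fcpb.VMConfiguration{
		NumCpus:           1,
		MemSizeMb:         minMemSizeMB, // small to make snapshotting faster
		ScratchDiskSizeMb: 100,
	},
	ExecutorConfig: cfg,
	// SavedState with a pinned key tells the container to restore from that
	// exact snapshot on Unpause.
	SavedState: &rnpb.FirecrackerState{SnapshotKey: key},
}
vm, err := firecracker.NewContainer(ctx, env, task, opts)
require.NoError(t, err)

// Unpause restores the pinned snapshot; writes made by later forks of the
// same branch key are not visible in this VM.
err = vm.Unpause(ctx)
require.NoError(t, err)
```

Without the pinned `SnapshotId`, `Unpause` would load the most recently saved manifest for the branch key, as the other forked VMs in the test do.
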
12 changes: 12 additions & 0 deletions app/invocation/invocation_action_card.tsx
@@ -573,6 +573,18 @@ export default class InvocationActionCardComponent extends React.Component<Props
}
</TextLink>
</div>
<div className="metadata-title">VM resumed from snapshot ID</div>
<div className="metadata-detail">
{this.state.actionResult.executionMetadata.vmMetadata.lastExecutedTask.snapshotId}
</div>
</>
)}
{this.state.actionResult.executionMetadata.vmMetadata.snapshotId && (
<>
<div className="metadata-title">Saved to snapshot ID</div>
<div className="metadata-detail">
{this.state.actionResult.executionMetadata.vmMetadata.snapshotId}
</div>
</>
)}
</>
@@ -117,6 +117,7 @@ go_test(
"//enterprise/server/util/oci",
"//proto:firecracker_go_proto",
"//proto:remote_execution_go_proto",
"//proto:runner_go_proto",
"//proto:scheduler_go_proto",
"//server/backends/disk_cache",
"//server/interfaces",
@@ -195,6 +196,7 @@ go_test(
"//enterprise/server/util/oci",
"//proto:firecracker_go_proto",
"//proto:remote_execution_go_proto",
"//proto:runner_go_proto",
"//proto:scheduler_go_proto",
"//server/backends/disk_cache",
"//server/interfaces",
@@ -462,9 +462,10 @@ func (p *Provider) New(ctx context.Context, props *platform.Properties, task *re

// FirecrackerContainer executes commands inside of a firecracker VM.
type FirecrackerContainer struct {
id string // a random GUID, unique per-run of firecracker
vmIdx int // the index of this vm on the host machine
loader *snaploader.FileCacheLoader
id string // a random GUID, unique per-run of firecracker
snapshotID string // a random GUID, unique per-run of firecracker
vmIdx int // the index of this vm on the host machine
loader *snaploader.FileCacheLoader

vmConfig *fcpb.VMConfiguration
containerImage string // the OCI container image. ex "alpine:latest"
@@ -621,10 +622,10 @@ func NewContainer(ctx context.Context, env environment.Env, task *repb.Execution
label := ""
if err != nil {
label = metrics.MissStatusLabel
log.CtxInfof(ctx, "Failed to get VM snapshot for keyset %s: %s", snaploader.KeysetDebugString(ctx, c.env, c.SnapshotKeySet()), err)
log.CtxInfof(ctx, "Failed to get VM snapshot for keyset %s: %s", snaploader.KeysetDebugString(ctx, c.env, c.SnapshotKeySet(), c.supportsRemoteSnapshots), err)
} else {
label = metrics.HitStatusLabel
log.CtxInfof(ctx, "Found snapshot for key %s", snaploader.KeyDebugString(ctx, c.env, snap.GetKey()))
log.CtxInfof(ctx, "Found snapshot for key %s", snaploader.KeyDebugString(ctx, c.env, snap.GetKey(), c.supportsRemoteSnapshots))
}
metrics.RecycleRunnerRequests.With(prometheus.Labels{
metrics.RecycleRunnerRequestStatusLabel: label,
@@ -787,6 +788,10 @@ func (c *FirecrackerContainer) SnapshotKeySet() *fcpb.SnapshotKeySet {
return c.snapshotKeySet.CloneVT()
}

func (c *FirecrackerContainer) SnapshotID() string {
return c.snapshotID
}

// State returns the container state to be persisted to disk so that this
// container can be reconstructed from the state on disk after an executor
// restart.
@@ -915,7 +920,10 @@ func (c *FirecrackerContainer) saveSnapshot(ctx context.Context, snapshotDetails

func (c *FirecrackerContainer) getVMMetadata() *repb.VMMetadata {
if c.snapshot == nil || c.snapshot.GetVMMetadata() == nil {
return &repb.VMMetadata{VmId: c.id}
return &repb.VMMetadata{
VmId: c.id,
SnapshotId: c.snapshotID,
}
}
return c.snapshot.GetVMMetadata()
}
@@ -927,6 +935,7 @@ func (c *FirecrackerContainer) getVMTask() *repb.VMMetadata_VMTask {
ExecutionId: c.task.GetExecutionId(),
ActionDigest: c.task.GetExecuteRequest().GetActionDigest(),
ExecuteResponseDigest: d,
SnapshotId: c.snapshotID, // Unique ID pertaining to this execution run
}
}

@@ -1021,6 +1030,16 @@ func (c *FirecrackerContainer) LoadSnapshot(ctx context.Context) error {
if err != nil {
return status.WrapError(err, "failed to get snapshot")
}

// Set unique per-run identifier on the vm metadata so this exact snapshot
// run can be identified
if snap.GetVMMetadata() == nil {
md := &repb.VMMetadata{
VmId: c.id,
}
snap.SetVMMetadata(md)
}
snap.GetVMMetadata().SnapshotId = c.snapshotID
c.snapshot = snap

if err := os.MkdirAll(c.getChroot(), 0777); err != nil {
@@ -1316,6 +1335,7 @@ func (c *FirecrackerContainer) newID(ctx context.Context) error {
vmIdx += 1
log.CtxDebugf(ctx, "Container id changing from %q (%d) to %q (%d)", c.id, c.vmIdx, u.String(), vmIdx)
c.id = u.String()
c.snapshotID = u.String()
c.vmIdx = vmIdx

if vmIdx > maxVMSPerHost {
@@ -2646,7 +2666,7 @@ func (c *FirecrackerContainer) SnapshotDebugString(ctx context.Context) string {
if c.snapshot == nil {
return ""
}
return snaploader.KeyDebugString(ctx, c.env, c.snapshot.GetKey())
return snaploader.KeyDebugString(ctx, c.env, c.snapshot.GetKey(), c.supportsRemoteSnapshots)
}

func (c *FirecrackerContainer) VMConfig() *fcpb.VMConfiguration {
@@ -54,6 +54,7 @@ import (

fcpb "github.com/buildbuddy-io/buildbuddy/proto/firecracker"
repb "github.com/buildbuddy-io/buildbuddy/proto/remote_execution"
rnpb "github.com/buildbuddy-io/buildbuddy/proto/runner"
scpb "github.com/buildbuddy-io/buildbuddy/proto/scheduler"
bspb "google.golang.org/genproto/googleapis/bytestream"
)
@@ -706,18 +707,19 @@ func TestFirecracker_RemoteSnapshotSharing(t *testing.T) {
require.NoError(t, err)
err = baseVM.Create(ctx, opts.ActionWorkingDirectory)
require.NoError(t, err)
baseSnapshotId := baseVM.SnapshotID()

// Create a snapshot. Data written to this snapshot should persist
// when other VMs reuse the snapshot
cmd := appendToLog("Base")
res := baseVM.Exec(ctx, cmd, nil /*=stdio*/)
require.NoError(t, res.Error)
require.Equal(t, "Base\n", string(res.Stdout))
require.NotEmpty(t, res.VMMetadata.GetSnapshotId())
err = baseVM.Pause(ctx)
require.NoError(t, err)

// VMs should be able to start from the snapshot. Artifacts should be stored
// locally in the filecache
// Start a VM from the snapshot. Artifacts should be stored locally in the filecache
workDirForkLocalFetch := testfs.MakeDirAll(t, rootDir, "work-fork-local-fetch")
opts = firecracker.ContainerOpts{
ContainerImage: busyboxImage,
@@ -735,15 +737,15 @@ func TestFirecracker_RemoteSnapshotSharing(t *testing.T) {
containersToCleanup = append(containersToCleanup, forkedVM)
err = forkedVM.Unpause(ctx)
require.NoError(t, err)

// Write VM-specific data to the log
cmd = appendToLog("Fork local fetch")
res = forkedVM.Exec(ctx, cmd, nil /*=stdio*/)
require.NoError(t, res.Error)
// The log should contain data written to the original snapshot
// and the current VM, but not from any of the other VMs sharing
// the same original snapshot
// and the current VM
require.Equal(t, "Base\nFork local fetch\n", string(res.Stdout))
require.NotEmpty(t, res.VMMetadata.GetSnapshotId())
err = forkedVM.Pause(ctx)
require.NoError(t, err)

// Clear the local filecache. VMs should still be able to unpause the snapshot
// by pulling artifacts from the remote cache
@@ -755,6 +757,7 @@ func TestFirecracker_RemoteSnapshotSharing(t *testing.T) {
fc2.WaitForDirectoryScanToComplete()
env.SetFileCache(fc2)

// Start a VM from the snapshot.
workDirForkRemoteFetch := testfs.MakeDirAll(t, rootDir, "work-fork-remote-fetch")
opts = firecracker.ContainerOpts{
ContainerImage: busyboxImage,
@@ -772,15 +775,43 @@ func TestFirecracker_RemoteSnapshotSharing(t *testing.T) {
containersToCleanup = append(containersToCleanup, forkedVM2)
err = forkedVM2.Unpause(ctx)
require.NoError(t, err)

// Write VM-specific data to the log
cmd = appendToLog("Fork remote fetch")
res = forkedVM2.Exec(ctx, cmd, nil /*=stdio*/)
require.NoError(t, res.Error)
// The log should contain data written to the most recent snapshot
require.Equal(t, "Base\nFork local fetch\nFork remote fetch\n", string(res.Stdout))
require.NotEmpty(t, res.VMMetadata.GetSnapshotId())

// Should still be able to start from the original snapshot if we use
// a snapshot key containing the original VM's snapshot ID
workDirForkOriginalSnapshot := testfs.MakeDirAll(t, rootDir, "work-fork-og-snapshot")
originalSnapshotKey := baseVM.SnapshotKeySet().GetBranchKey()
originalSnapshotKey.SnapshotId = baseSnapshotId
opts = firecracker.ContainerOpts{
ContainerImage: busyboxImage,
ActionWorkingDirectory: workDirForkOriginalSnapshot,
VMConfiguration: &fcpb.VMConfiguration{
NumCpus: 1,
MemSizeMb: minMemSizeMB, // small to make snapshotting faster.
EnableNetworking: false,
ScratchDiskSizeMb: 100,
},
ExecutorConfig: cfg,
SavedState: &rnpb.FirecrackerState{SnapshotKey: originalSnapshotKey},
}
ogFork, err := firecracker.NewContainer(ctx, env, task, opts)
require.NoError(t, err)
containersToCleanup = append(containersToCleanup, ogFork)
err = ogFork.Unpause(ctx)
require.NoError(t, err)
cmd = appendToLog("Fork from original vm")
res = ogFork.Exec(ctx, cmd, nil /*=stdio*/)
require.NoError(t, res.Error)
// The log should contain data written to the original snapshot
// and the current VM, but not from any of the other VMs sharing
// the same original snapshot
require.Equal(t, "Base\nFork remote fetch\n", string(res.Stdout))
// and the current VM, but not from any of the other VMs, including the master
// snapshot
require.Equal(t, "Base\nFork from original vm\n", string(res.Stdout))
require.NotEmpty(t, res.VMMetadata.GetSnapshotId())
}

func TestFirecracker_RemoteSnapshotSharing_RemoteInstanceName(t *testing.T) {