From 197816008e2cfec12d6c7f6eca2280270aa1485d Mon Sep 17 00:00:00 2001 From: Paul Khuong Date: Sat, 5 Oct 2024 15:17:59 -0400 Subject: [PATCH 1/3] tracker/snapshot_file_content: avoid useless uploads after desync When the replicating (writer) Verneuil VFS notices that there is spooled metadata, but that it doesn't match the sqlite file, we currently always copy everything from scratch. This makes sense for long-lived processes, but isn't great (still correct, just wasteful!) for short-lived ones that run between regular writes to the sqlite db. This commit adds logic to notice when we have a valid spooled manifest, but it doesn't match the file on disk. We then force a full scan of the sqlite file, but still avoid uploading any chunk that happens to match what was in the manifest. This new feature does not assume any relationship between the file on disk and the manifest we happen to find in the spooling directory. The only assumption is that chunks in that manifest have been uploaded, or are staged for upload. If any chunk in the manifest happens to match the corresponding page in the sqlite file, that's great, if not, we just upload the new data, with content addressing. The old behaviour (upload everything) is preserved when we don't find a manifest, e.g., because the spooling directory is empty, or because the manifest predates the most recent boot. Independently, it might be interesting to treat the previous manifest as a set of chunks that are known to be already uploaded or staged for upload, and not stage them again. In practice, we mostly expect matches at identical page offset, but who knows with large blobs? --- src/tracker/snapshot_file_contents.rs | 89 +++++++++++++++++---------- 1 file changed, 58 insertions(+), 31 deletions(-) diff --git a/src/tracker/snapshot_file_contents.rs b/src/tracker/snapshot_file_contents.rs index 0fa0842..cceaa7f 100644 --- a/src/tracker/snapshot_file_contents.rs +++ b/src/tracker/snapshot_file_contents.rs @@ -241,15 +241,26 @@ impl Tracker { } } +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +enum PreviousManifestState { + // We think the manifest was up-to-date wrt the initial sqlite db + // file's state. This means we can rely on our dirty page tracking. + Reliable, + // We can only assume the previous manifest was valid, but otherwise + // unrelated to the sqlite db file. All pages are potentially dirty. + Arbitrary, +} + /// How do we want to treat the current manifest (that we're about to /// replace)? enum CurrentManifest { // Don't generate a new snapshot at all. UpToDate, - // Snapshot from scratch. - Desync, - // Use the previous manifest as an initial snapshot. - Initial(Manifest, Option>), + // We don't have any initial manifest + FromScratch, + // Use the previous manifest as an initial snapshot, but be mindful + // of how much we trust that manifest. + Initial(PreviousManifestState, Manifest, Option>), } // This impl block has all the snapshot update logic. @@ -262,12 +273,14 @@ enum CurrentManifest { impl Tracker { /// Loads the current manifest and figures out what to do with it. fn judge_current_manifest(&self, version_id: &[u8]) -> CurrentManifest { - let mut current_manifest: Option<(Manifest, Option>)> = self + let current_manifest: Option<(Manifest, Option>)> = self .read_current_manifest() .map_err(|e| chain_info!(e, "failed to read staged manifest file")) .ok() .flatten(); + let prev_manifest_state: PreviousManifestState; + // If we're snapshotting after a write, we always want to go // through the whole process. 
We made some changes, let's // guarantee we try and publish them. That's important because, @@ -326,10 +339,12 @@ impl Tracker { up_to_date = false; } - // If the manifest isn't up to date, we can't use it. - if !up_to_date { - current_manifest = None; - } + // If the manifest isn't up to date, remember that. + prev_manifest_state = if up_to_date { + PreviousManifestState::Reliable + } else { + PreviousManifestState::Arbitrary + }; } else { // If we're doing this opportunistically (not after a write) // and the staging manifest seems up to date, there's nothing @@ -344,14 +359,14 @@ impl Tracker { } // If we think there's work to do after a read - // transaction, assume the worst, and rebuild - // the snapshot from scratch. - current_manifest = None; + // transaction, assume the worst, and rescan + // the whole file + prev_manifest_state = PreviousManifestState::Arbitrary; } match current_manifest { - None => CurrentManifest::Desync, - Some((manifest, base)) => CurrentManifest::Initial(manifest, base), + None => CurrentManifest::FromScratch, + Some((manifest, base)) => CurrentManifest::Initial(prev_manifest_state, manifest, base), } } @@ -363,6 +378,7 @@ impl Tracker { fn snapshot_chunks( &self, base: Option>, + prev_manifest_state: PreviousManifestState, ) -> Result<(u64, Vec, Vec, usize)> { use rand::Rng; let mut rng = rand::thread_rng(); @@ -402,12 +418,18 @@ impl Tracker { .is_some(); let delta = (grown || wrote_past_end) as u64; - // We definitely don't know anything about what's at or - // after chunk index `fprints.len()`. We also don't - // want to go out of bounds if the new file shrunk. - backfill_begin = (fprints.len() as u64) - .clamp(0, num_chunks) - .saturating_sub(delta); + if prev_manifest_state == PreviousManifestState::Arbitrary { + // Assume all the chunks in the manifests exist, but confirm + // that they match what we want. + backfill_begin = 0; + } else { + // We definitely don't know anything about what's at or + // after chunk index `fprints.len()`. We also don't + // want to go out of bounds if the new file shrunk. + backfill_begin = (fprints.len() as u64) + .clamp(0, num_chunks) + .saturating_sub(delta); + } fprints } else { backfill_begin = 0; @@ -550,7 +572,7 @@ impl Tracker { &mut self, header_fprint: Fingerprint, version_id: Vec, - current_manifest: Option<(Manifest, Option>)>, + current_manifest: Option<(PreviousManifestState, Manifest, Option>)>, ) -> Result<(usize, Vec, Option)> { use std::os::unix::fs::MetadataExt; @@ -568,7 +590,7 @@ impl Tracker { // mark them as dirty: it doesn't matter that we didn't change // them, we can't refer to them without making sure they're // available for readers. - if let Some(v1) = current_manifest.as_ref().and_then(|x| x.0.v1.as_ref()) { + if let Some(v1) = current_manifest.as_ref().and_then(|x| x.1.v1.as_ref()) { for bundled in &v1.bundled_chunks { self.dirty_chunks.insert(bundled.chunk_offset, None); } @@ -583,9 +605,15 @@ impl Tracker { } // Try to get an initial list of chunks to work off. 
- let base_fprints = Self::base_chunk_fprints(current_manifest.as_ref().map(|x| &x.0)); + let base_fprints = Self::base_chunk_fprints(current_manifest.as_ref().map(|x| &x.1)); + + let prev_manifest_state = match current_manifest.as_ref() { + None => PreviousManifestState::Arbitrary, + Some((state, _, _)) => *state, + }; - let (len, mut chunks, bundled_chunks, mut copied) = self.snapshot_chunks(base_fprints)?; + let (len, mut chunks, bundled_chunks, mut copied) = + self.snapshot_chunks(base_fprints, prev_manifest_state)?; let (ctime, ctime_ns) = match self.file.metadata() { Ok(meta) => (meta.ctime(), meta.ctime_nsec() as i32), @@ -598,7 +626,7 @@ impl Tracker { let flattened = flatten_chunk_fprints(&chunks); let manifest_fprint = fingerprint_v1_chunk_list(&flattened); let (compressible, base_chunk) = - reencode_flattened_chunks(&self.buffer, current_manifest.and_then(|x| x.1), flattened)?; + reencode_flattened_chunks(&self.buffer, current_manifest.and_then(|x| x.2), flattened)?; let base_fprint = base_chunk.as_ref().map(|x| x.fprint()); @@ -724,12 +752,11 @@ impl Tracker { e => chain_warn!(e, "failed to force populate version xattr", path=?self.path)); } - let current_manifest: Option<(Manifest, Option>)> = - match self.judge_current_manifest(&version_id) { - CurrentManifest::UpToDate => return Ok(()), - CurrentManifest::Desync => None, - CurrentManifest::Initial(manifest, base) => Some((manifest, base)), - }; + let current_manifest = match self.judge_current_manifest(&version_id) { + CurrentManifest::UpToDate => return Ok(()), + CurrentManifest::FromScratch => None, + CurrentManifest::Initial(state, manifest, base) => Some((state, manifest, base)), + }; // We don't *have* to overwrite the .metadata file, but we // should create it if it's missing: without that file, the From b5f782ca47965a8eddc039c2f57fe8a25b8f146c Mon Sep 17 00:00:00 2001 From: Paul Khuong Date: Sun, 6 Oct 2024 18:25:05 -0400 Subject: [PATCH 2/3] NFC tracker/snapshot_file_contents: box CurrentManifest::Initial payload Clippy is right to point out that it's pretty big, and we just pass it around for a while. --- src/tracker/snapshot_file_contents.rs | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/src/tracker/snapshot_file_contents.rs b/src/tracker/snapshot_file_contents.rs index cceaa7f..f8cc615 100644 --- a/src/tracker/snapshot_file_contents.rs +++ b/src/tracker/snapshot_file_contents.rs @@ -251,6 +251,8 @@ enum PreviousManifestState { Arbitrary, } +type ValidManifestInfo = Box<(PreviousManifestState, Manifest, Option>)>; + /// How do we want to treat the current manifest (that we're about to /// replace)? enum CurrentManifest { @@ -260,7 +262,7 @@ enum CurrentManifest { FromScratch, // Use the previous manifest as an initial snapshot, but be mindful // of how much we trust that manifest. - Initial(PreviousManifestState, Manifest, Option>), + Initial(ValidManifestInfo), } // This impl block has all the snapshot update logic. 
@@ -366,7 +368,9 @@ impl Tracker { match current_manifest { None => CurrentManifest::FromScratch, - Some((manifest, base)) => CurrentManifest::Initial(prev_manifest_state, manifest, base), + Some((manifest, base)) => { + CurrentManifest::Initial(Box::new((prev_manifest_state, manifest, base))) + } } } @@ -572,7 +576,7 @@ impl Tracker { &mut self, header_fprint: Fingerprint, version_id: Vec, - current_manifest: Option<(PreviousManifestState, Manifest, Option>)>, + current_manifest: Option, ) -> Result<(usize, Vec, Option)> { use std::os::unix::fs::MetadataExt; @@ -609,7 +613,7 @@ impl Tracker { let prev_manifest_state = match current_manifest.as_ref() { None => PreviousManifestState::Arbitrary, - Some((state, _, _)) => *state, + Some(state) => state.0, }; let (len, mut chunks, bundled_chunks, mut copied) = @@ -755,7 +759,7 @@ impl Tracker { let current_manifest = match self.judge_current_manifest(&version_id) { CurrentManifest::UpToDate => return Ok(()), CurrentManifest::FromScratch => None, - CurrentManifest::Initial(state, manifest, base) => Some((state, manifest, base)), + CurrentManifest::Initial(manifest) => Some(manifest), }; // We don't *have* to overwrite the .metadata file, but we From 963a5fcce2eccee1a0abdd15254b2cf39cc8baed Mon Sep 17 00:00:00 2001 From: Paul Khuong Date: Sun, 6 Oct 2024 18:28:43 -0400 Subject: [PATCH 3/3] doc: describe how to use verneuilctl for incremental backup/restore --- doc/INCREMENTAL_BACKUP_RESTORE.md | 338 ++++++++++++++++++++++++++++++ 1 file changed, 338 insertions(+) create mode 100644 doc/INCREMENTAL_BACKUP_RESTORE.md diff --git a/doc/INCREMENTAL_BACKUP_RESTORE.md b/doc/INCREMENTAL_BACKUP_RESTORE.md new file mode 100644 index 0000000..0033552 --- /dev/null +++ b/doc/INCREMENTAL_BACKUP_RESTORE.md @@ -0,0 +1,338 @@ +Incremental backup/restore with verneuilctl +=========================================== + +Verneuil is primarily used as a pair of loadable VFSes (one for the writer +and another for read replicas), but it also comes with `verneuilctl`, a +command-line utility that wraps the combination of sqlite and the VFSes. + +The `verneuilctl` utility was always suitable for one-off backups from +scratch, and the VFS works for live replication. With +https://github.com/backtrace-labs/verneuil/pull/30, `verneuilctl` is +now reasonable for incremental backups. + +Building +-------- + +The `verneuilctl` utility is bundled as an example that depends on the +`vendor_sqlite` feature; build the utility with: + +``` +$ cargo build --release --examples --features vendor_sqlite +``` + +Setup +----- + +Verneuil backs up and restores sqlite databases to S3 (or any +compatible service). On an EC2 machine, Verneuil implicitly picks up +credentials from the instance's metadata. Otherwise, you can setup the +usual `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY` environment +variables: + +``` +$ export AWS_ACCESS_KEY_ID=... +$ export AWS_SECRET_ACCESS_KEY=... +``` + +The credentials must give us access (read for restore, read and write +for backup) to *two* S3 buckets in the same region: one bucket will be +used for content-addressed "chunks", and the other for "manifests" with +names generated deterministically based on the source machine's hostname +and the database's local path. + +At scale, the content-addressed "chunks" bucket usually has an +expiration policy; long-lived Verneuil worker threads attempt to touch +every chunk at least once a week, so expiring after a month or two of +inactivity is safe. 
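+
+As an illustration (not something Verneuil configures for you), such a
+policy can be expressed as an S3 lifecycle rule, assuming the weekly
+touch refreshes each live chunk's last-modified time so that age-based
+expiration approximates "time since last touch"; the rule id here is
+arbitrary and the bucket name is the placeholder used below:
+
+```
+$ aws s3api put-bucket-lifecycle-configuration \
+    --bucket my-verneuil-chunks \
+    --lifecycle-configuration '{
+      "Rules": [{
+        "ID": "expire-untouched-chunks",
+        "Status": "Enabled",
+        "Filter": {"Prefix": ""},
+        "Expiration": {"Days": 60}
+      }]
+    }'
+```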
+
+The "manifests" bucket should have versioning enabled, and the bucket
+is usually configured to delete old versions (e.g., more than 1000
+versions behind) automatically.
+
+Assume the buckets are pre-created and accessible to the account with
+the access key. We need a configuration JSON file to tell Verneuil
+where to find the chunks and manifests:
+
+```
+$ cat /tmp/verneuil.json
+{
+  "make_default": true,
+  "replication_spooling_dir": "/tmp/verneuil/",
+  "replication_targets": [
+    {
+      "s3": {
+        "region": "SOMETHING LIKE us-east-2 SET IT TO THE CORRECT VALUE",
+        "chunk_bucket": "A BUCKET NAME LIKE my-verneuil-chunks",
+        "manifest_bucket": "A BUCKET NAME LIKE my-verneuil-manifests",
+        "domain_addressing": true
+      }
+    }
+  ]
+}
+```
+
+The configuration file also specifies the `replication_spooling_dir`; that's
+a local storage directory where replication data and metadata will be stored.
+This information doesn't have to survive machine crashes or reboots. In fact,
+it's assumed that its contents are invalid after a reboot.
+
+Prepare the data
+----------------
+
+We're finally ready to back up and restore SQLite databases.
+
+For our demo, let's fill a database file with random data (https://antonz.org/random-table/):
+
+```
+$ sqlite3 test.db
+SQLite version 3.45.1 2024-01-30 16:01:20
+Enter ".help" for usage hints.
+sqlite> create table random_data as
+with recursive tmp(x) as (
+    select random()
+    union all
+    select random() from tmp
+    limit 1000000
+)
+select * from tmp;
+sqlite> ^D
+
+```
+
+Initial backup
+--------------
+
+We'll use the `verneuilctl sync` subcommand (run `verneuilctl --help` and `verneuilctl sync --help` for details).
+
+```
+$ time target/release/examples/verneuilctl -c @/tmp/verneuil.json sync test.db --optimize
+5deb-verneuil%3Apk-m2-mba%3A8d12%2F%2Fhome%2Fpkhuong%2Fopen-sauce%2Fverneuil%2Ftest.db
+
+real 0m13.981s
+user 0m1.810s
+sys 0m0.412s
+```
+
+The test database spans 15.4 MB (245 pages at 64 KB each, plus the
+header page). The `sync` command wrote 490 blobs to the chunks
+bucket, double what we expect: that's because the sync always starts
+by backing up the current state of the database *before* running the
+optimization command, `"PRAGMA page_size = 65536; VACUUM;"`.
+
+It's possible to avoid that overhead by running that optimization
+command manually before the initial `sync`. That leaves us with
+exactly the number of chunks we expect (one per 64 KB page in the DB
+file, except for the header page). Alternatively, keep the regular
+4 KB page size; the only impact is that we might end up writing more
+chunks to S3.
+
+Restore
+-------
+
+In order to restore a database, we must find a manifest, download the
+corresponding chunks, and assemble them. The `verneuilctl restore`
+subcommand can reconstruct the manifest's blob name for a given
+hostname and source path, and fetch the most recent version from S3:
+
+```
+$ time target/release/examples/verneuilctl -c @/tmp/verneuil.json restore --hostname pk-m2-mba --source-path /home/pkhuong/open-sauce/verneuil/test.db --out restore.db
+
+real 0m4.693s
+user 0m0.621s
+sys 0m0.244s
+$ sha256sum test.db restore.db
+a895465a62e1afcdc95703b739c23699d8e9a56b7ee2d2b0e51dfa938b5e64e8  test.db
+a895465a62e1afcdc95703b739c23699d8e9a56b7ee2d2b0e51dfa938b5e64e8  restore.db
+```
+
+Subsequent backups
+------------------
+
+Immediately running a sync for the same database no-ops quickly:
+
+```
+$ time target/release/examples/verneuilctl -c @/tmp/verneuil.json sync test.db
+5deb-verneuil%3Apk-m2-mba%3A8d12%2F%2Fhome%2Fpkhuong%2Fopen-sauce%2Fverneuil%2Ftest.db
+
+real 0m2.847s
+user 0m0.008s
+sys 0m0.015s
+```
+
+That's because Verneuil compares the database with the metadata in the
+spooling directory to avoid useless work. If the spooling directory
+is missing or was populated before the most recent boot, we'll always
+sync from scratch. However, the chunks bucket is content-addressed,
+so this only wastes API calls and bandwidth, never persistent storage.
+
+Let's perform a small update to the `test.db` file and sync it again.
+
+```
+$ sqlite3 test.db
+SQLite version 3.45.1 2024-01-30 16:01:20
+Enter ".help" for usage hints.
+sqlite> .schema
+CREATE TABLE random_data(x);
+sqlite> select * from random_data limit 1;
+-511094929343328393
+sqlite> update random_data set x = 0 where x = -511094929343328393;
+sqlite> ^D
+
+```
+
+```
+$ time target/release/examples/verneuilctl -c @/tmp/verneuil.json sync test.db
+5deb-verneuil%3Apk-m2-mba%3A8d12%2F%2Fhome%2Fpkhuong%2Fopen-sauce%2Fverneuil%2Ftest.db
+
+real 0m1.995s
+user 0m0.019s
+sys 0m0.015s
+```
+
+This is a lot faster: the `sync` process still scans the whole
+database file when the database has changed, but it only uploads the
+chunks that actually differ from the manifest on disk.
+
+Incremental restore
+-------------------
+
+The `verneuilctl` utility always writes out the whole database when
+restoring. We can, however, use a local cache directory to avoid
+redundant fetches from S3.
+
+We simply add this block to the `replication_targets` array in `verneuil.json`:
+
+```
+    {
+      "local": {
+        "directory": "/tmp/verneuil-cache/",
+        "num_shards": 128,
+        "capacity": 10000
+      }
+    }
+```
+
+This lets the *read-side* logic know to also look for chunks there
+before hitting the S3 bucket, and to populate the local cache on
+misses. For now, writes only consider S3 replication targets.
+
+```
+$ cat /tmp/verneuil.json
+{
+  "make_default": true,
+  "replication_spooling_dir": "/tmp/verneuil/",
+  "replication_targets": [
+    {
+      "s3": {
+        "region": "us-east-2",
+        "chunk_bucket": "pkhuong-verneuil-chunks",
+        "manifest_bucket": "pkhuong-verneuil-manifests",
+        "domain_addressing": true
+      }
+    },
+    {
+      "local": {
+        "directory": "/tmp/verneuil-cache/",
+        "num_shards": 128,
+        "capacity": 10000
+      }
+    }
+  ]
+}
+```
+
+The first time we restore with the cache enabled, the runtime is about
+the same: the cache is empty. The second time, however, is much faster
+because the chunks are cached locally.
+
+```
+$ time target/release/examples/verneuilctl -c @/tmp/verneuil.json restore --hostname pk-m2-mba --source-path /home/pkhuong/open-sauce/verneuil/test.db --out restore2.db
+
+real 0m5.007s
+user 0m0.646s
+sys 0m0.297s
+$ time target/release/examples/verneuilctl -c @/tmp/verneuil.json restore --hostname pk-m2-mba --source-path /home/pkhuong/open-sauce/verneuil/test.db --out restore3.db
+
+real 0m0.179s
+user 0m0.008s
+sys 0m0.025s
+$ sha256sum restore2.db restore3.db
+5c40f7ea75ea6c02e0f3f1f6965e293a14966b6d42123fd4a8adfbb4a0c2f72a  restore2.db
+5c40f7ea75ea6c02e0f3f1f6965e293a14966b6d42123fd4a8adfbb4a0c2f72a  restore3.db
+```
+
+Using the Verneuil (write) VFS
+------------------------------
+
+We can perform a similar update through the Verneuil VFS, which will
+both update the local file and replicate the changes on the fly.
+
+But first, we have to build the VFS:
+
+```
+$ cargo build --examples --release --features dynamic_vfs
+```
+
+```
+$ VERNEUIL_CONFIG=@/tmp/verneuil.json sqlite3
+SQLite version 3.45.1 2024-01-30 16:01:20
+Enter ".help" for usage hints.
+Connected to a transient in-memory database.
+Use ".open FILENAME" to reopen on a persistent database.
+sqlite> .load target/release/examples/libverneuil_vfs.so
+sqlite> .open test.db
+sqlite> update random_data set x = 1 where rowid == 1;
+sqlite> ^D
+
+$ target/release/examples/verneuilctl flush /tmp/verneuil # just in case we exited too quickly
+$ target/release/examples/verneuilctl -c @/tmp/verneuil.json restore --hostname pk-m2-mba --source-path /home/pkhuong/open-sauce/verneuil/test.db --out restore2.db
+$ sqlite3 restore2.db
+SQLite version 3.45.1 2024-01-30 16:01:20
+Enter ".help" for usage hints.
+sqlite> select * from random_data limit 1;
+2
+sqlite> ^D
+
+```
+
+Using the Verneuil replica VFS
+------------------------------
+
+Once the loadable VFS is built, we can also use it for read replicas.
+The `verneuilctl shell` subcommand simply runs the sqlite3 shell with
+two pre-defined commands to load the VFS and open a remote read replica.
+
+```
+$ target/release/examples/verneuilctl -c @/tmp/verneuil.json shell --hostname pk-m2-mba --source-path /home/pkhuong/open-sauce/verneuil/test.db
+sqlite> select * from random_data limit 2;
+2
+7377110096418126384
+sqlite> update random_data set x = 2 where rowid == 1;
+Runtime error: attempt to write a readonly database (8)
+```
+
+However, if another process (on the source machine!) opens the same
+source database with the write VFS, we'll be able to update the
+replica in-place and observe the new writes.
+
+```
+$ VERNEUIL_CONFIG=@/tmp/verneuil.json sqlite3
+SQLite version 3.45.1 2024-01-30 16:01:20
+Enter ".help" for usage hints.
+Connected to a transient in-memory database.
+Use ".open FILENAME" to reopen on a persistent database.
+sqlite> .load target/release/examples/libverneuil_vfs.so
+sqlite> .open test.db
+sqlite> update random_data set x = -2 where rowid == 2;
+sqlite>
+
+```
+
+Back in the `verneuilctl shell`:
+
+```
+sqlite> pragma verneuil_snapshot_refresh = 2;
+1728153173.150809900
+sqlite> select * from random_data limit 2;
+2
+-2
+sqlite>
+```
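+
+Putting it together
+-------------------
+
+For unattended incremental backups, the `sync` subcommand is all we
+really need: as long as the spooling directory survives between runs
+(and the machine doesn't reboot), each invocation only uploads chunks
+that changed since the previous one. Here's a rough sketch of a
+periodic backup loop; the paths are the ones used in this walkthrough
+and the interval is arbitrary, so adjust both to taste:
+
+```
+#!/bin/sh
+# Periodically re-sync the database; unchanged chunks are not re-uploaded.
+set -e
+
+VERNEUILCTL=target/release/examples/verneuilctl
+CONFIG=@/tmp/verneuil.json
+DB=/home/pkhuong/open-sauce/verneuil/test.db
+
+while true; do
+    "$VERNEUILCTL" -c "$CONFIG" sync "$DB"
+    sleep 300  # every five minutes
+done
+```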