From ecef14bdbb7d2df6310de0cf5f6d5821cb87f86e Mon Sep 17 00:00:00 2001 From: Rommel Layco Date: Mon, 21 Oct 2024 11:14:34 +0100 Subject: [PATCH] restore: Make max stale_seconds retry with a longer timeout When a backup download times out, retry with a longer timeout on the next attempt. --- pghoard/restore.py | 3 +++ test/test_restore.py | 10 ++++++++-- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/pghoard/restore.py b/pghoard/restore.py index 930b167c..32e047f5 100644 --- a/pghoard/restore.py +++ b/pghoard/restore.py @@ -716,6 +716,9 @@ def fetch_all(self): except TimeoutError: self.pending_jobs.clear() self.last_progress_ts = time.monotonic() + + # Increase the timeout and retry + self.max_stale_seconds = max(self.max_stale_seconds * 2, 480) if self.errors: break diff --git a/test/test_restore.py b/test/test_restore.py index 391e6aea..4da0bafa 100644 --- a/test/test_restore.py +++ b/test/test_restore.py @@ -349,6 +349,7 @@ def test_real_processing_with_threading_retries_on_timeout_fails_after_3(self): def real_processing_with_threading_retries_on_timeout(self, fetcher, restore_dir, max_fails): fail_counter = [0] + base_max_stale_seconds = 2 class FailingChunkFetcher(ChunkFetcher): def _fetch_and_extract_one_backup(self, metadata, file_size, fetch_fn): @@ -358,9 +359,14 @@ def _fetch_and_extract_one_backup(self, metadata, file_size, fetch_fn): # Corrupt the file to test that retrying failed basebackup chunk yields sensible results with open(os.path.join(restore_dir, "pg_notify", "0000"), "w") as f: f.write("foo") - time.sleep(4) - fetcher.max_stale_seconds = 2 + # ensure we sleep long enough to timeout based on the number of retries + sleep_seconds = base_max_stale_seconds * ( + 2 ** max_fails + ) if max_fails < STALL_MIN_RETRIES else base_max_stale_seconds + time.sleep(sleep_seconds) + + fetcher.max_stale_seconds = base_max_stale_seconds with patch("pghoard.restore.ChunkFetcher", new=FailingChunkFetcher): if max_fails < STALL_MIN_RETRIES: 
fetcher.fetch_all()