restore: Retry with a longer max_stale_seconds timeout
When a backup download times out, retry with a longer timeout on the next attempt.
RommelLayco committed Oct 28, 2024
1 parent 69d1115 commit 5775163
Showing 2 changed files with 22 additions and 3 deletions.
pghoard/restore.py: 14 additions & 1 deletion
@@ -657,7 +657,17 @@ def run(self, args=None):
 
 
 class BasebackupFetcher:
-    def __init__(self, *, app_config, debug, site, pgdata, tablespaces, data_files: List[FileInfo], status_output_file=None):
+    def __init__(
+        self,
+        *,
+        app_config,
+        debug,
+        site,
+        pgdata,
+        tablespaces,
+        data_files: List[FileInfo],
+        status_output_file=None,
+    ):
         self.log = logging.getLogger(self.__class__.__name__)
         self.completed_jobs: Set[str] = set()
         self.config = app_config
@@ -698,6 +708,9 @@ def fetch_all(self):
             except TimeoutError:
                 self.pending_jobs.clear()
                 self.last_progress_ts = time.monotonic()
+
+                # Increase the timeout and retry
+                self.max_stale_seconds *= 2
                 if self.errors:
                     break
 
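For context on what the two added lines do at runtime, here is a minimal standalone sketch of the retry pattern, with fetch_with_growing_timeout, slow_download and the 120-second starting value as illustrative stand-ins rather than pghoard's actual API or defaults:

# A minimal sketch (not pghoard code): after every TimeoutError the allowed stale
# time is doubled before the next attempt, mirroring the max_stale_seconds *= 2 change above.
def fetch_with_growing_timeout(download, max_stale_seconds=120.0, attempts=3):
    for _ in range(attempts):
        try:
            return download(timeout=max_stale_seconds)
        except TimeoutError:
            # Same idea as BasebackupFetcher.fetch_all(): give the next retry twice as long.
            max_stale_seconds *= 2
    raise TimeoutError("download did not finish within the allowed attempts")

# Hypothetical downloader that only succeeds once the timeout is generous enough.
def slow_download(timeout):
    if timeout < 300:
        raise TimeoutError
    return "backup bytes"

print(fetch_with_growing_timeout(slow_download))  # third attempt succeeds (120 -> 240 -> 480)

Doubling instead of retrying with the same limit means a download that is merely slow, rather than stuck, eventually gets enough headroom to finish.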
test/test_restore.py: 8 additions & 2 deletions
@@ -348,6 +348,7 @@ def test_real_processing_with_threading_retries_on_timeout_fails_after_3(self):
 
     def real_processing_with_threading_retries_on_timeout(self, fetcher, restore_dir, max_fails):
         fail_counter = [0]
+        base_max_stale_seconds = 2
 
         class FailingChunkFetcher(ChunkFetcher):
             def _fetch_and_extract_one_backup(self, metadata, file_size, fetch_fn):
@@ -357,9 +358,14 @@ def _fetch_and_extract_one_backup(self, metadata, file_size, fetch_fn):
                     # Corrupt the file to test that retrying failed basebackup chunk yields sensible results
                     with open(os.path.join(restore_dir, "pg_notify", "0000"), "w") as f:
                         f.write("foo")
-                    time.sleep(4)
+
+                    # ensure we sleep long enough to timeout based on the number of retries
+                    sleep_seconds = base_max_stale_seconds * (
+                        2 ** max_fails
+                    ) if max_fails < MAX_RETRIES else base_max_stale_seconds
+                    time.sleep(sleep_seconds)
 
-        fetcher.max_stale_seconds = 2
+        fetcher.max_stale_seconds = base_max_stale_seconds
         with patch("pghoard.restore.ChunkFetcher", new=FailingChunkFetcher):
            if max_fails < MAX_RETRIES:
                fetcher.fetch_all()
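To make the new sleep computation concrete, the snippet below reproduces its arithmetic with the values these tests use; MAX_RETRIES comes from elsewhere in the test module and is assumed to be 3 here, so treat the exact numbers as illustrative:

# Illustrative arithmetic only, not part of the commit. Assumes MAX_RETRIES == 3,
# which the surrounding test names ("...fails_after_3") suggest.
MAX_RETRIES = 3
base_max_stale_seconds = 2

for max_fails in (1, 2, 3):
    sleep_seconds = base_max_stale_seconds * (
        2 ** max_fails
    ) if max_fails < MAX_RETRIES else base_max_stale_seconds
    print(f"max_fails={max_fails}: sleep {sleep_seconds}s")
# Prints: max_fails=1: sleep 4s, max_fails=2: sleep 8s, max_fails=3: sleep 2s.
# The longer sleeps keep the simulated chunk fetch slower than max_stale_seconds
# even after it has been doubled on earlier retries.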
