From ecef14bdbb7d2df6310de0cf5f6d5821cb87f86e Mon Sep 17 00:00:00 2001 From: Rommel Layco Date: Mon, 21 Oct 2024 11:14:34 +0100 Subject: [PATCH] restore: Make max stale_seconds retry with a longer timeout When a backup download times out, retry with a longer timeout on the next attempt. --- pghoard/restore.py | 3 +++ test/test_restore.py | 10 ++++++++-- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/pghoard/restore.py b/pghoard/restore.py index 930b167c..32e047f5 100644 --- a/pghoard/restore.py +++ b/pghoard/restore.py @@ -716,6 +716,9 @@ def fetch_all(self): except TimeoutError: self.pending_jobs.clear() self.last_progress_ts = time.monotonic() + + # Increase the timeout and retry + self.max_stale_seconds = max(self.max_stale_seconds * 2, 480) if self.errors: break diff --git a/test/test_restore.py b/test/test_restore.py index 391e6aea..4da0bafa 100644 --- a/test/test_restore.py +++ b/test/test_restore.py @@ -349,6 +349,7 @@ def test_real_processing_with_threading_retries_on_timeout_fails_after_3(self): def real_processing_with_threading_retries_on_timeout(self, fetcher, restore_dir, max_fails): fail_counter = [0] + base_max_stale_seconds = 2 class FailingChunkFetcher(ChunkFetcher): def _fetch_and_extract_one_backup(self, metadata, file_size, fetch_fn): @@ -358,9 +359,14 @@ def _fetch_and_extract_one_backup(self, metadata, file_size, fetch_fn): # Corrupt the file to test that retrying failed basebackup chunk yields sensible results with open(os.path.join(restore_dir, "pg_notify", "0000"), "w") as f: f.write("foo") - time.sleep(4) - fetcher.max_stale_seconds = 2 + # ensure we sleep long enough to timeout based on the number of retries + sleep_seconds = base_max_stale_seconds * ( + 2 ** max_fails + ) if max_fails < STALL_MIN_RETRIES else base_max_stale_seconds + time.sleep(sleep_seconds) + + fetcher.max_stale_seconds = base_max_stale_seconds with patch("pghoard.restore.ChunkFetcher", new=FailingChunkFetcher): if max_fails < STALL_MIN_RETRIES: 
fetcher.fetch_all()