restore: Make max stale_seconds retry with a longer timeout
When a backup download times out, retry with a longer timeout on the next attempt.
RommelLayco committed Oct 28, 2024
1 parent 69d1115 commit 670d31a
Showing 2 changed files with 20 additions and 3 deletions.
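The gist of the change: when a restore attempt times out because the download made no progress within max_stale_seconds, the next attempt now runs with a doubled timeout instead of reusing the one that just expired. Below is a minimal standalone sketch of that backoff pattern; the fetch_with_backoff/fetch_once names are illustrative stand-ins, not pghoard's actual API.

def fetch_with_backoff(fetch_once, max_stale_seconds=2.0, max_retries=3):
    """Retry fetch_once, doubling the stall timeout after every TimeoutError."""
    for attempt in range(max_retries):
        try:
            return fetch_once(max_stale_seconds)
        except TimeoutError:
            # Same idea as the commit: the next attempt gets a longer
            # stall timeout instead of retrying with the expired one.
            max_stale_seconds *= 2
    raise TimeoutError("no progress after retries")

Successive attempts thus wait for the initial timeout, then twice that, then four times, and so on, which is what the updated test below relies on.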
15 changes: 14 additions & 1 deletion pghoard/restore.py
@@ -657,7 +657,17 @@ def run(self, args=None):
 
 
 class BasebackupFetcher:
-    def __init__(self, *, app_config, debug, site, pgdata, tablespaces, data_files: List[FileInfo], status_output_file=None):
+    def __init__(
+        self,
+        *,
+        app_config,
+        debug,
+        site,
+        pgdata,
+        tablespaces,
+        data_files: List[FileInfo],
+        status_output_file=None,
+    ):
         self.log = logging.getLogger(self.__class__.__name__)
         self.completed_jobs: Set[str] = set()
         self.config = app_config
@@ -698,6 +708,9 @@ def fetch_all(self):
             except TimeoutError:
                 self.pending_jobs.clear()
                 self.last_progress_ts = time.monotonic()
+
+                # Increase the timeout and retry
+                self.max_stale_seconds *= 2
                 if self.errors:
                     break
 
8 changes: 6 additions & 2 deletions test/test_restore.py
@@ -348,6 +348,7 @@ def test_real_processing_with_threading_retries_on_timeout_fails_after_3(self):
 
     def real_processing_with_threading_retries_on_timeout(self, fetcher, restore_dir, max_fails):
         fail_counter = [0]
+        base_max_stale_seconds = 2
 
         class FailingChunkFetcher(ChunkFetcher):
             def _fetch_and_extract_one_backup(self, metadata, file_size, fetch_fn):
@@ -357,9 +358,12 @@ def _fetch_and_extract_one_backup(self, metadata, file_size, fetch_fn):
                     # Corrupt the file to test that retrying failed basebackup chunk yields sensible results
                     with open(os.path.join(restore_dir, "pg_notify", "0000"), "w") as f:
                         f.write("foo")
-                    time.sleep(4)
 
-        fetcher.max_stale_seconds = 2
+                # ensure we sleep long enough to timeout based on the number of retries
+                sleep_seconds = 2 * (2 ** max_fails) if max_fails < MAX_RETRIES else base_max_stale_seconds
+                time.sleep(sleep_seconds)
+
+        fetcher.max_stale_seconds = base_max_stale_seconds
         with patch("pghoard.restore.ChunkFetcher", new=FailingChunkFetcher):
             if max_fails < MAX_RETRIES:
                 fetcher.fetch_all()
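A note on the arithmetic in the updated test: with base_max_stale_seconds = 2 and the timeout doubling after every timed-out attempt, attempt n (counting from 0) waits 2 * 2**n seconds for progress, so a chunk fetcher that sleeps 2 * (2 ** max_fails) seconds is slower than the timeout on every attempt before the max_fails-th. A quick illustration with example values (max_fails = 2 here is only an example, not the test's fixed value):

base_max_stale_seconds = 2
max_fails = 2
timeouts = [base_max_stale_seconds * 2 ** attempt for attempt in range(max_fails)]
sleep_seconds = 2 * (2 ** max_fails)
print(timeouts, sleep_seconds)  # [2, 4] 8 -> each failing attempt sleeps past its timeout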
