Better retry for base backup restore #631

Merged

33 changes: 27 additions & 6 deletions pghoard/restore.py
@@ -42,7 +42,8 @@
 from . import common, config, logutil, version
 from .postgres_command import PGHOARD_HOST, PGHOARD_PORT

-MAX_RETRIES = 6
+STALL_MIN_RETRIES = 6  # minimum retries for stalled downloads, over the whole basebackup restore
+SINGLE_FILE_MAX_RETRIES = 6  # maximum retries for a single file


 class RestoreError(Error):
@@ -594,6 +595,10 @@ def _get_basebackup(
             os.makedirs(dirname, exist_ok=True)
             os.chmod(dirname, 0o700)

+        # Based on limited samples, there could be one stalled download per 122 GiB of transfer,
+        # so we tolerate one stall for every 64 GiB of transfer (or STALL_MIN_RETRIES for smaller backups)
+        stall_max_retries = max(STALL_MIN_RETRIES, int(int(metadata.get("total-size-enc", 0)) / (64 * 2 ** 30)))
+
         fetcher = BasebackupFetcher(
             app_config=self.config,
             data_files=basebackup_data_files,
@@ -602,6 +607,7 @@ def _get_basebackup(
             pgdata=pgdata,
             site=site,
             tablespaces=tablespaces,
+            stall_max_retries=stall_max_retries,
         )
         fetcher.fetch_all()

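The budget above grows with the encrypted backup size recorded in the metadata: one tolerated stall per 64 GiB of transfer, never below STALL_MIN_RETRIES. A minimal standalone sketch of the same arithmetic, with illustrative sizes that are not from this PR:

    # Standalone sketch of the stall retry budget; sizes are illustrative.
    STALL_MIN_RETRIES = 6

    def stall_retry_budget(total_size_enc: int) -> int:
        # One tolerated stall per 64 GiB of encrypted transfer, never below the floor.
        return max(STALL_MIN_RETRIES, total_size_enc // (64 * 2 ** 30))

    assert stall_retry_budget(10 * 2 ** 30) == 6      # small backup: the floor applies
    assert stall_retry_budget(1024 * 2 ** 30) == 16   # 1 TiB: 1024 / 64 = 16 stalls tolerated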
@@ -644,7 +650,18 @@ def run(self, args=None):


 class BasebackupFetcher:
-    def __init__(self, *, app_config, debug, site, pgdata, tablespaces, data_files: List[FileInfo], status_output_file=None):
+    def __init__(
+        self,
+        *,
+        app_config,
+        debug,
+        site,
+        pgdata,
+        tablespaces,
+        data_files: List[FileInfo],
+        stall_max_retries: int,
+        status_output_file=None
+    ):
         self.log = logging.getLogger(self.__class__.__name__)
         self.completed_jobs: Set[str] = set()
         self.config = app_config
@@ -668,9 +685,10 @@ def __init__(self, *, app_config, debug, site, pgdata, tablespaces, data_files:
         self.tablespaces = tablespaces
         self.total_download_size = 0
         self.retry_per_file: Dict[str, int] = {}
+        self.stall_max_retries = stall_max_retries

     def fetch_all(self):
-        for retry in range(MAX_RETRIES):
+        for retry in range(self.stall_max_retries):
             try:
                 with self.manager_class() as manager:
                     self._setup_progress_tracking(manager)
@@ -688,8 +706,11 @@ def fetch_all(self):
                 if self.errors:
                     break

-                if retry == MAX_RETRIES - 1:
-                    self.log.error("Download stalled despite retries, aborting")
+                if retry == self.stall_max_retries - 1:
+                    self.log.error(
+                        "Download stalled despite retries, aborting"
+                        " (reached maximum retry %r)", self.stall_max_retries
+                    )
                     self.errors = 1
                     break

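The loop above separates hard errors, which abort immediately, from stalls, which consume the size-scaled retry budget. A simplified sketch of that control flow, where run_one_pass is a hypothetical stand-in for one manager/pool fetch pass:

    def fetch_with_stall_retries(run_one_pass, stall_max_retries: int) -> bool:
        # run_one_pass() returns "done", "error", or "stalled"; a hypothetical
        # stand-in for one fetch pass that may time out internally.
        for retry in range(stall_max_retries):
            outcome = run_one_pass()
            if outcome == "done":
                return True
            if outcome == "error":
                return False  # hard error: retrying will not help
            # "stalled": loop again with a fresh pass, as fetch_all does
            if retry == stall_max_retries - 1:
                return False  # stall budget exhausted; log-and-abort above
        return False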
@@ -768,7 +789,7 @@ def job_failed(self, key, exception):
         retries = self.retry_per_file.get(key, 0) + 1
         self.retry_per_file[key] = retries
         self.pending_jobs.remove(key)
-        if retries < MAX_RETRIES:
+        if retries < SINGLE_FILE_MAX_RETRIES:
             self.jobs_to_retry.add(key)
             return
         self.errors += 1
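Per-file failures draw on SINGLE_FILE_MAX_RETRIES, independent of the whole-restore stall budget. A self-contained sketch of the bookkeeping above, using a hypothetical file key and simulating the scheduler's re-queue step inline:

    from typing import Dict, Set

    SINGLE_FILE_MAX_RETRIES = 6

    def file_failed(key: str, retry_per_file: Dict[str, int], jobs_to_retry: Set[str]) -> int:
        # Count the failure; re-queue while the per-file budget lasts,
        # otherwise report one hard error (mirrors job_failed above).
        retries = retry_per_file.get(key, 0) + 1
        retry_per_file[key] = retries
        if retries < SINGLE_FILE_MAX_RETRIES:
            jobs_to_retry.add(key)
            return 0
        return 1

    counts: Dict[str, int] = {}
    queue: Set[str] = set()
    for _ in range(SINGLE_FILE_MAX_RETRIES):
        queue.discard("base/16384")  # the scheduler picks the file up again
        errors = file_failed("base/16384", counts, queue)
    assert errors == 1 and "base/16384" not in queue  # the sixth failure is terminal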
15 changes: 11 additions & 4 deletions test/test_restore.py
@@ -21,7 +21,7 @@

 from pghoard.common import TAR_METADATA_FILENAME, write_json_file
 from pghoard.restore import (
-    MAX_RETRIES, BasebackupFetcher, ChunkFetcher, FileDataInfo, FileInfoType, FilePathInfo, Restore, RestoreError,
+    STALL_MIN_RETRIES, BasebackupFetcher, ChunkFetcher, FileDataInfo, FileInfoType, FilePathInfo, Restore, RestoreError,
     create_recovery_conf
 )

@@ -195,7 +195,8 @@ def test_progress_tracking_and_error_handling(self):
             status_output_file=status_output_file,
             pgdata=pgdata,
             site=site,
-            tablespaces=tablespaces
+            tablespaces=tablespaces,
+            stall_max_retries=STALL_MIN_RETRIES,
         )
         manager, pool, manager_enter = MagicMock(), MagicMock(), MagicMock()
         fetcher.manager_class = lambda: manager
@@ -361,7 +362,7 @@ def _fetch_and_extract_one_backup(self, metadata, file_size, fetch_fn):

         fetcher.max_stale_seconds = 2
         with patch("pghoard.restore.ChunkFetcher", new=FailingChunkFetcher):
-            if max_fails < MAX_RETRIES:
+            if max_fails < STALL_MIN_RETRIES:
                 fetcher.fetch_all()
                 self.check_sha256(
                     os.path.join(restore_dir, "pg_notify", "0000"),
@@ -484,7 +485,13 @@ def run_restore_test(self, path, tar_executable, logic, tablespaces=None, files=
         config["tar_executable"] = tar_executable
         site = next(iter(config["backup_sites"]))
         fetcher = BasebackupFetcher(
-            app_config=config, data_files=files, debug=True, pgdata=restore_dir, site=site, tablespaces=tablespaces or {}
+            app_config=config,
+            data_files=files,
+            debug=True,
+            pgdata=restore_dir,
+            site=site,
+            tablespaces=tablespaces or {},
+            stall_max_retries=STALL_MIN_RETRIES
         )
         try:
             logic(fetcher, restore_dir)