From d83daec041143d8b22527fcf86346a870866709b Mon Sep 17 00:00:00 2001
From: Orange Kao
Date: Mon, 28 Oct 2024 05:55:41 +0000
Subject: [PATCH] Better retry for base backup restore

The current implementation uses a fixed retry limit (6) for base backup
restore. However, on a large (2~4 TiB) database there is more data to
transfer and the restore takes longer, so it is more likely to hit that
limit. This commit makes the number of retries dynamic, depending on the
size of the base backup.
---
 pghoard/restore.py   | 19 +++++++++++++------
 test/test_restore.py |  4 ++--
 2 files changed, 15 insertions(+), 8 deletions(-)

diff --git a/pghoard/restore.py b/pghoard/restore.py
index 9c9cb91a..124cbaa3 100644
--- a/pghoard/restore.py
+++ b/pghoard/restore.py
@@ -42,7 +42,8 @@ from . import common, config, logutil, version
 from .postgres_command import PGHOARD_HOST, PGHOARD_PORT

-MAX_RETRIES = 6
+STALL_MIN_RETRIES = 6  # minimum retries for stalled downloads, applied to the whole basebackup restore
+SINGLE_FILE_MAX_RETRIES = 6  # maximum retries for a single file


 class RestoreError(Error):
@@ -594,6 +595,10 @@ def _get_basebackup(
             os.makedirs(dirname, exist_ok=True)
             os.chmod(dirname, 0o700)

+        # Based on limited samples, there could be one stalled download per 122 GiB of transfer,
+        # so we tolerate one stall for every 64 GiB of transfer (or STALL_MIN_RETRIES for smaller backups)
+        stall_max_retries = max(STALL_MIN_RETRIES, int(int(metadata.get("total-size-enc", 0)) / (64 * 2**30)))
+
         fetcher = BasebackupFetcher(
             app_config=self.config,
             data_files=basebackup_data_files,
@@ -602,6 +607,7 @@ def _get_basebackup(
             pgdata=pgdata,
             site=site,
             tablespaces=tablespaces,
+            stall_max_retries=stall_max_retries,
         )
         fetcher.fetch_all()

@@ -644,7 +650,7 @@ def run(self, args=None):


 class BasebackupFetcher:
-    def __init__(self, *, app_config, debug, site, pgdata, tablespaces, data_files: List[FileInfo], status_output_file=None):
+    def __init__(self, *, app_config, debug, site, pgdata, tablespaces, data_files: List[FileInfo], stall_max_retries: int, status_output_file=None):
         self.log = logging.getLogger(self.__class__.__name__)
         self.completed_jobs: Set[str] = set()
         self.config = app_config
@@ -668,9 +674,10 @@ def __init__(self, *, app_config, debug, site, pgdata, tablespaces, data_files:
         self.tablespaces = tablespaces
         self.total_download_size = 0
         self.retry_per_file: Dict[str, int] = {}
+        self.stall_max_retries = stall_max_retries

     def fetch_all(self):
-        for retry in range(MAX_RETRIES):
+        for retry in range(self.stall_max_retries):
             try:
                 with self.manager_class() as manager:
                     self._setup_progress_tracking(manager)
@@ -688,8 +695,8 @@ def fetch_all(self):
                 if self.errors:
                     break

-                if retry == MAX_RETRIES - 1:
-                    self.log.error("Download stalled despite retries, aborting")
+                if retry == self.stall_max_retries - 1:
+                    self.log.error("Download stalled despite retries, aborting (reached maximum retries %r)", self.stall_max_retries)
                     self.errors = 1
                     break

@@ -768,7 +775,7 @@ def job_failed(self, key, exception):
         retries = self.retry_per_file.get(key, 0) + 1
         self.retry_per_file[key] = retries
         self.pending_jobs.remove(key)
-        if retries < MAX_RETRIES:
+        if retries < SINGLE_FILE_MAX_RETRIES:
             self.jobs_to_retry.add(key)
             return
         self.errors += 1
diff --git a/test/test_restore.py b/test/test_restore.py
index f5f18648..8a698a88 100644
--- a/test/test_restore.py
+++ b/test/test_restore.py
@@ -21,7 +21,7 @@
 from pghoard.common import TAR_METADATA_FILENAME, write_json_file
 from pghoard.restore import (
-    MAX_RETRIES, BasebackupFetcher, ChunkFetcher, FileDataInfo, FileInfoType, FilePathInfo, Restore, RestoreError,
+    STALL_MIN_RETRIES, BasebackupFetcher, ChunkFetcher, FileDataInfo, FileInfoType, FilePathInfo, Restore, RestoreError,
     create_recovery_conf
 )
@@ -361,7 +361,7 @@ def _fetch_and_extract_one_backup(self, metadata, file_size, fetch_fn):
         fetcher.max_stale_seconds = 2
         with patch("pghoard.restore.ChunkFetcher", new=FailingChunkFetcher):
-            if max_fails < MAX_RETRIES:
+            if max_fails < STALL_MIN_RETRIES:
                 fetcher.fetch_all()
                 self.check_sha256(
                     os.path.join(restore_dir, "pg_notify", "0000"),
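
Note (illustrative sketch, not part of the patch): the standalone script below mirrors the
stall-retry sizing heuristic added in _get_basebackup() so its scaling is easy to check.
STALL_MIN_RETRIES and the 64 GiB divisor come from the patch itself; the helper name
stall_retries_for and the script form are hypothetical and do not exist in pghoard.

    # Minimal sketch of the stall-retry sizing heuristic, assuming the backup
    # size is already available as an integer number of bytes.
    STALL_MIN_RETRIES = 6  # same minimum as in the patch

    def stall_retries_for(total_size_enc_bytes: int) -> int:
        """One whole-restore stall retry per 64 GiB of transfer, never fewer than the minimum."""
        return max(STALL_MIN_RETRIES, int(total_size_enc_bytes / (64 * 2**30)))

    if __name__ == "__main__":
        GiB = 2**30
        for size in (10 * GiB, 500 * GiB, 2048 * GiB, 4096 * GiB):
            # 10 GiB -> 6 (minimum), 500 GiB -> 7, 2 TiB -> 32, 4 TiB -> 64
            print(f"{size // GiB:>5d} GiB -> {stall_retries_for(size)} stall retries")

With this scaling a 2 TiB base backup gets 32 whole-restore stall retries instead of the
previous fixed 6, while small backups keep the old behaviour; the per-file limit stays at
SINGLE_FILE_MAX_RETRIES = 6.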