From 4869d8491cb6df9ced1613af1be5a3fd3dfbc358 Mon Sep 17 00:00:00 2001
From: Orange Kao
Date: Mon, 28 Oct 2024 05:55:41 +0000
Subject: [PATCH] Better retry for base backup restore

The current implementation uses a fixed number of retries (6) for base
backup restore. However, on a large (2~4 TiB) database there is more
data to transfer and the restore takes longer, so it is more likely to
hit the retry limit.

This commit allows a dynamic number of retries, depending on the size
of the base backup.
---
 pghoard/restore.py   | 33 +++++++++++++++++++++++++++------
 test/test_restore.py | 15 +++++++++++----
 2 files changed, 38 insertions(+), 10 deletions(-)

diff --git a/pghoard/restore.py b/pghoard/restore.py
index 9c9cb91a..1a29d520 100644
--- a/pghoard/restore.py
+++ b/pghoard/restore.py
@@ -42,7 +42,8 @@
 from . import common, config, logutil, version
 from .postgres_command import PGHOARD_HOST, PGHOARD_PORT
 
-MAX_RETRIES = 6
+STALL_MIN_RETRIES = 6  # minimum number of retries for stalled downloads across the whole basebackup restore
+SINGLE_FILE_MAX_RETRIES = 6  # maximum number of retries for a single file
 
 
 class RestoreError(Error):
@@ -594,6 +595,10 @@ def _get_basebackup(
             os.makedirs(dirname, exist_ok=True)
             os.chmod(dirname, 0o700)
 
+        # Based on limited samples, there could be one stalled download per 122 GiB of transfer.
+        # So we tolerate one stall for every 64 GiB of transfer (or STALL_MIN_RETRIES for smaller backups).
+        stall_max_retries = max(STALL_MIN_RETRIES, int(int(metadata.get("total-size-enc", 0)) / (64 * 2 ** 30)))
+
         fetcher = BasebackupFetcher(
             app_config=self.config,
             data_files=basebackup_data_files,
@@ -602,6 +607,7 @@ def _get_basebackup(
             pgdata=pgdata,
             site=site,
             tablespaces=tablespaces,
+            stall_max_retries=stall_max_retries,
         )
         fetcher.fetch_all()
 
@@ -644,7 +650,18 @@ def run(self, args=None):
 
 
 class BasebackupFetcher:
-    def __init__(self, *, app_config, debug, site, pgdata, tablespaces, data_files: List[FileInfo], status_output_file=None):
+    def __init__(
+        self,
+        *,
+        app_config,
+        debug,
+        site,
+        pgdata,
+        tablespaces,
+        data_files: List[FileInfo],
+        stall_max_retries: int,
+        status_output_file=None
+    ):
         self.log = logging.getLogger(self.__class__.__name__)
         self.completed_jobs: Set[str] = set()
         self.config = app_config
@@ -668,9 +685,10 @@ def __init__(self, *, app_config, debug, site, pgdata, tablespaces, data_files:
         self.tablespaces = tablespaces
         self.total_download_size = 0
         self.retry_per_file: Dict[str, int] = {}
+        self.stall_max_retries = stall_max_retries
 
     def fetch_all(self):
-        for retry in range(MAX_RETRIES):
+        for retry in range(self.stall_max_retries):
             try:
                 with self.manager_class() as manager:
                     self._setup_progress_tracking(manager)
@@ -688,8 +706,11 @@ def fetch_all(self):
                 if self.errors:
                     break
 
-                if retry == MAX_RETRIES - 1:
-                    self.log.error("Download stalled despite retries, aborting")
+                if retry == self.stall_max_retries - 1:
+                    self.log.error(
+                        "Download stalled despite retries, aborting"
+                        " (reached maximum retries %r)", self.stall_max_retries
+                    )
                     self.errors = 1
                     break
 
@@ -768,7 +789,7 @@ def job_failed(self, key, exception):
         retries = self.retry_per_file.get(key, 0) + 1
         self.retry_per_file[key] = retries
         self.pending_jobs.remove(key)
-        if retries < MAX_RETRIES:
+        if retries < SINGLE_FILE_MAX_RETRIES:
             self.jobs_to_retry.add(key)
             return
         self.errors += 1
diff --git a/test/test_restore.py b/test/test_restore.py
index f5f18648..391e6aea 100644
--- a/test/test_restore.py
+++ b/test/test_restore.py
@@ -21,7 +21,7 @@
 
 from pghoard.common import TAR_METADATA_FILENAME, write_json_file
 from pghoard.restore import (
-    MAX_RETRIES, BasebackupFetcher, ChunkFetcher, FileDataInfo, FileInfoType, FilePathInfo, Restore, RestoreError,
+    STALL_MIN_RETRIES, BasebackupFetcher, ChunkFetcher, FileDataInfo, FileInfoType, FilePathInfo, Restore, RestoreError,
     create_recovery_conf
 )
 
@@ -195,7 +195,8 @@ def test_progress_tracking_and_error_handling(self):
             status_output_file=status_output_file,
             pgdata=pgdata,
             site=site,
-            tablespaces=tablespaces
+            tablespaces=tablespaces,
+            stall_max_retries=STALL_MIN_RETRIES,
         )
         manager, pool, manager_enter = MagicMock(), MagicMock(), MagicMock()
         fetcher.manager_class = lambda: manager
@@ -361,7 +362,7 @@ def _fetch_and_extract_one_backup(self, metadata, file_size, fetch_fn):
         fetcher.max_stale_seconds = 2
 
         with patch("pghoard.restore.ChunkFetcher", new=FailingChunkFetcher):
-            if max_fails < MAX_RETRIES:
+            if max_fails < STALL_MIN_RETRIES:
                 fetcher.fetch_all()
                 self.check_sha256(
                     os.path.join(restore_dir, "pg_notify", "0000"),
@@ -484,7 +485,13 @@ def run_restore_test(self, path, tar_executable, logic, tablespaces=None, files=
         config["tar_executable"] = tar_executable
         site = next(iter(config["backup_sites"]))
         fetcher = BasebackupFetcher(
-            app_config=config, data_files=files, debug=True, pgdata=restore_dir, site=site, tablespaces=tablespaces or {}
+            app_config=config,
+            data_files=files,
+            debug=True,
+            pgdata=restore_dir,
+            site=site,
+            tablespaces=tablespaces or {},
+            stall_max_retries=STALL_MIN_RETRIES
         )
         try:
             logic(fetcher, restore_dir)
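
For illustration, the retry sizing introduced above works out as follows. This is a minimal sketch, not part of the patch: the helper name stall_retries_for is hypothetical, it assumes "total-size-enc" is the encoded backup size in bytes, and it uses integer division where the patch uses int() on a float division (same result for these values).

    STALL_MIN_RETRIES = 6

    def stall_retries_for(total_size_enc: int) -> int:
        # Hypothetical helper: one stall retry per 64 GiB of encoded backup size,
        # never fewer than the STALL_MIN_RETRIES floor.
        return max(STALL_MIN_RETRIES, total_size_enc // (64 * 2 ** 30))

    # A 100 GiB backup keeps the previous floor of 6 stall retries,
    # while a 2 TiB backup is allowed up to 32.
    assert stall_retries_for(100 * 2 ** 30) == 6
    assert stall_retries_for(2 * 2 ** 40) == 32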