diff --git a/pghoard/restore.py b/pghoard/restore.py index 9c9cb91a..124cbaa3 100644 --- a/pghoard/restore.py +++ b/pghoard/restore.py @@ -42,7 +42,8 @@ from . import common, config, logutil, version from .postgres_command import PGHOARD_HOST, PGHOARD_PORT -MAX_RETRIES = 6 +STALL_MIN_RETRIES = 6 # minimum number of retries for stalled downloads, across the whole basebackup restore +SINGLE_FILE_MAX_RETRIES = 6 # maximum number of retries for a single file class RestoreError(Error): @@ -594,6 +595,10 @@ def _get_basebackup( os.makedirs(dirname, exist_ok=True) os.chmod(dirname, 0o700) + # Based on limited samples, there could be one stalled download per 122GiB of transfer + # So we tolerate one stall for every 64GiB of transfer (or STALL_MIN_RETRIES for smaller backups) + stall_max_retries = max(STALL_MIN_RETRIES, int(metadata.get("total-size-enc", 0)) // (64 * 2**30)) + fetcher = BasebackupFetcher( app_config=self.config, data_files=basebackup_data_files, @@ -602,6 +607,7 @@ def _get_basebackup( pgdata=pgdata, site=site, tablespaces=tablespaces, + stall_max_retries=stall_max_retries, ) fetcher.fetch_all() @@ -644,7 +650,7 @@ def run(self, args=None): class BasebackupFetcher: - def __init__(self, *, app_config, debug, site, pgdata, tablespaces, data_files: List[FileInfo], status_output_file=None): + def __init__(self, *, app_config, debug, site, pgdata, tablespaces, data_files: List[FileInfo], stall_max_retries: int = STALL_MIN_RETRIES, status_output_file=None): self.log = logging.getLogger(self.__class__.__name__) self.completed_jobs: Set[str] = set() self.config = app_config @@ -668,9 +674,10 @@ def __init__(self, *, app_config, debug, site, pgdata, tablespaces, data_files: self.tablespaces = tablespaces self.total_download_size = 0 self.retry_per_file: Dict[str, int] = {} + self.stall_max_retries = stall_max_retries def fetch_all(self): - for retry in range(MAX_RETRIES): + for retry in range(self.stall_max_retries): try: with self.manager_class() as manager: self._setup_progress_tracking(manager) @@ -688,8 +695,8 
@@ def fetch_all(self): if self.errors: break - if retry == MAX_RETRIES - 1: - self.log.error("Download stalled despite retries, aborting") + if retry == self.stall_max_retries - 1: + self.log.error("Download stalled despite retries, aborting (reached maximum retry %r)", self.stall_max_retries) self.errors = 1 break @@ -768,7 +775,7 @@ def job_failed(self, key, exception): retries = self.retry_per_file.get(key, 0) + 1 self.retry_per_file[key] = retries self.pending_jobs.remove(key) - if retries < MAX_RETRIES: + if retries < SINGLE_FILE_MAX_RETRIES: self.jobs_to_retry.add(key) return self.errors += 1 diff --git a/test/test_restore.py b/test/test_restore.py index f5f18648..8a698a88 100644 --- a/test/test_restore.py +++ b/test/test_restore.py @@ -21,7 +21,7 @@ from pghoard.common import TAR_METADATA_FILENAME, write_json_file from pghoard.restore import ( - MAX_RETRIES, BasebackupFetcher, ChunkFetcher, FileDataInfo, FileInfoType, FilePathInfo, Restore, RestoreError, + STALL_MIN_RETRIES, BasebackupFetcher, ChunkFetcher, FileDataInfo, FileInfoType, FilePathInfo, Restore, RestoreError, create_recovery_conf ) @@ -361,7 +361,7 @@ def _fetch_and_extract_one_backup(self, metadata, file_size, fetch_fn): fetcher.max_stale_seconds = 2 with patch("pghoard.restore.ChunkFetcher", new=FailingChunkFetcher): - if max_fails < STALL_MIN_RETRIES: + if max_fails < STALL_MIN_RETRIES: fetcher.fetch_all() self.check_sha256( os.path.join(restore_dir, "pg_notify", "0000"),