From 5eca45db7c42f5748b1ff620ec7ded31ad305232 Mon Sep 17 00:00:00 2001
From: Egor Voynov
Date: Tue, 9 Apr 2024 16:52:16 +0200
Subject: [PATCH 1/3] pghoard: convert 'metadata["backup-reason"]' value to
 enum

[BF-2390]
---
 pghoard/common.py                  |  6 ++++++
 pghoard/pghoard.py                 | 16 +++++++++-------
 test/basebackup/test_basebackup.py | 30 +++++++++++++++---------------
 test/conftest.py                   |  3 ++-
 test/test_pghoard.py               |  4 ++--
 5 files changed, 34 insertions(+), 25 deletions(-)
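Note for reviewers: BackupReason subclasses pghoard's own StrEnum helper, so
its members still compare equal to the plain strings they replace and
serialize to the same JSON, which keeps metadata written by older pghoard
versions readable. A minimal sketch of that behaviour, using the stdlib
str/enum.Enum mixin in place of pghoard's StrEnum (an assumption for the
sake of a self-contained example):

    import enum
    import json

    @enum.unique
    class BackupReason(str, enum.Enum):
        requested = "requested"
        scheduled = "scheduled"

    # Members behave like the strings they replace ...
    assert BackupReason.scheduled == "scheduled"
    # ... and serialize identically, so on-disk metadata is unchanged:
    assert json.dumps({"backup-reason": BackupReason.scheduled}) == '{"backup-reason": "scheduled"}'
    # Metadata written by older versions parses back into the enum:
    assert BackupReason("requested") is BackupReason.requested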
diff --git a/pghoard/common.py b/pghoard/common.py
index 6f93b7cb..23f0edaf 100644
--- a/pghoard/common.py
+++ b/pghoard/common.py
@@ -104,6 +104,12 @@ class BaseBackupMode(StrEnum):
     pipe = "pipe"
 
 
+@enum.unique
+class BackupReason(StrEnum):
+    requested = "requested"
+    scheduled = "scheduled"
+
+
 class ProgressData(BaseModel):
     current_progress: float = 0
     last_updated_time: float = 0
diff --git a/pghoard/pghoard.py b/pghoard/pghoard.py
index 396ecda1..17be4ad6 100644
--- a/pghoard/pghoard.py
+++ b/pghoard/pghoard.py
@@ -34,7 +34,7 @@
 from pghoard import config, logutil, metrics, version, wal
 from pghoard.basebackup.base import PGBaseBackup
 from pghoard.common import (
-    BaseBackupFormat, BaseBackupMode, CallbackEvent, FileType, FileTypePrefixes, create_alert_file,
+    BackupReason, BaseBackupFormat, BaseBackupMode, CallbackEvent, FileType, FileTypePrefixes, create_alert_file,
     download_backup_meta_file, extract_pghoard_bb_v2_metadata, extract_pghoard_delta_metadata,
     get_object_storage_config, replication_connection_string_and_slot_using_pgpass, write_json_file
 )
@@ -481,7 +481,7 @@ def patch_basebackup_info(self, *, entry, site_config):
             metadata["backup-decision-time"] = metadata["start-time"]
         # Backups are usually scheduled
         if "backup-reason" not in metadata:
-            metadata["backup-reason"] = "scheduled"
+            metadata["backup-reason"] = BackupReason.scheduled
         # Calculate normalized backup time based on start time if missing
         if "normalized-backup-time" not in metadata:
             metadata["normalized-backup-time"] = self.get_normalized_backup_time(site_config, now=metadata["start-time"])
@@ -815,17 +815,19 @@ def get_new_backup_details(self, *, now=None, site, site_config):
         if site in self.requested_basebackup_sites:
             self.log.info("Creating a new basebackup for %r due to request", site)
             self.requested_basebackup_sites.discard(site)
-            backup_reason = "requested"
+            backup_reason = BackupReason.requested
         elif site_config["basebackup_interval_hours"] is None:
             # Basebackups are disabled for this site (but they can still be requested over the API.)
             pass
         elif not basebackups:
             self.log.info("Creating a new basebackup for %r because there are currently none", site)
-            backup_reason = "scheduled"
+            backup_reason = BackupReason.scheduled
         elif backup_hour is not None and backup_minute is not None:
             most_recent_scheduled = None
             last_normalized_backup_time = basebackups[-1]["metadata"]["normalized-backup-time"]
-            scheduled_backups = [backup for backup in basebackups if backup["metadata"]["backup-reason"] == "scheduled"]
+            scheduled_backups = [
+                backup for backup in basebackups if backup["metadata"]["backup-reason"] == BackupReason.scheduled
+            ]
             if scheduled_backups:
                 most_recent_scheduled = scheduled_backups[-1]["metadata"]["backup-decision-time"]
 
@@ -841,7 +843,7 @@ def get_new_backup_details(self, *, now=None, site, site_config):
                     "Normalized backup time %r differs from previous %r, creating new basebackup", normalized_backup_time,
                     last_normalized_backup_time
                 )
-                backup_reason = "scheduled"
+                backup_reason = BackupReason.scheduled
         elif backup_hour is not None and backup_minute is None:
             self.log.warning("Ignoring basebackup_hour as basebackup_minute is not defined")
         else:
@@ -852,7 +854,7 @@ def get_new_backup_details(self, *, now=None, site, site_config):
                 self.log.info(
                     "Creating a new basebackup for %r by schedule (%s from previous)", site, delta_since_last_backup
                 )
-                backup_reason = "scheduled"
+                backup_reason = BackupReason.scheduled
 
         if not backup_reason:
             return None
diff --git a/test/basebackup/test_basebackup.py b/test/basebackup/test_basebackup.py
index c417de3b..00578185 100644
--- a/test/basebackup/test_basebackup.py
+++ b/test/basebackup/test_basebackup.py
@@ -23,7 +23,7 @@
 
 from pghoard import common, metrics
 from pghoard.basebackup.base import PGBaseBackup
-from pghoard.common import (BaseBackupFormat, BaseBackupMode, CallbackEvent, CallbackQueue)
+from pghoard.common import (BackupReason, BaseBackupFormat, BaseBackupMode, CallbackEvent, CallbackQueue)
 from pghoard.restore import Restore, RestoreError
 
 from ..conftest import PGHoardForTest, PGTester
@@ -240,7 +240,7 @@ def _test_create_basebackup(self, capsys, db, pghoard, mode, replica=False, acti
 
         now = datetime.datetime.now(datetime.timezone.utc)
         metadata = {
-            "backup-reason": "scheduled",
+            "backup-reason": BackupReason.scheduled,
             "backup-decision-time": now.isoformat(),
             "normalized-backup-time": now.isoformat(),
         }
@@ -276,7 +276,7 @@ def _test_create_basebackup(self, capsys, db, pghoard, mode, replica=False, acti
         assert "start-wal-segment" in last_backup["metadata"]
         assert "start-time" in last_backup["metadata"]
         assert dateutil.parser.parse(last_backup["metadata"]["start-time"]).tzinfo  # pylint: disable=no-member
-        assert last_backup["metadata"]["backup-reason"] == "scheduled"
+        assert last_backup["metadata"]["backup-reason"] == BackupReason.scheduled
         assert last_backup["metadata"]["backup-decision-time"] == now.isoformat()
         assert last_backup["metadata"]["normalized-backup-time"] == now.isoformat()
         if mode in {BaseBackupMode.local_tar, BaseBackupMode.delta}:
@@ -672,7 +672,7 @@ def test_handle_site(self, pghoard):
         pghoard.handle_site(pghoard.test_site, site_config)
         assert pghoard.test_site not in pghoard.basebackups
         first_basebackups = pghoard.state["backup_sites"][pghoard.test_site]["basebackups"]
-        assert first_basebackups[0]["metadata"]["backup-reason"] == "scheduled"
+        assert first_basebackups[0]["metadata"]["backup-reason"] == BackupReason.scheduled
         assert first_basebackups[0]["metadata"]["backup-decision-time"]
         assert first_basebackups[0]["metadata"]["normalized-backup-time"] is None
         first_time_of_check = pghoard.time_of_last_backup_check[pghoard.test_site]
@@ -725,7 +725,7 @@ def test_get_new_backup_details(self, pghoard):
         # No backups, one should be created. No backup schedule defined so normalized backup time is None
         metadata = pghoard.get_new_backup_details(now=now, site=pghoard.test_site, site_config=site_config)
         assert metadata
-        assert metadata["backup-reason"] == "scheduled"
+        assert metadata["backup-reason"] == BackupReason.scheduled
         assert metadata["backup-decision-time"] == now.isoformat()
         assert metadata["normalized-backup-time"] is None
 
@@ -734,7 +734,7 @@ def test_get_new_backup_details(self, pghoard):
         site_config["basebackup_minute"] = 10
         metadata = pghoard.get_new_backup_details(now=now, site=pghoard.test_site, site_config=site_config)
         assert metadata
-        assert metadata["backup-reason"] == "scheduled"
+        assert metadata["backup-reason"] == BackupReason.scheduled
         assert metadata["backup-decision-time"] == now.isoformat()
         assert "T13:10:00+00:00" in metadata["normalized-backup-time"]
 
@@ -742,7 +742,7 @@ def test_get_new_backup_details(self, pghoard):
         site_config["basebackup_interval_hours"] = 1.5
         metadata = pghoard.get_new_backup_details(now=now, site=pghoard.test_site, site_config=site_config)
         assert metadata
-        assert metadata["backup-reason"] == "scheduled"
+        assert metadata["backup-reason"] == BackupReason.scheduled
         assert metadata["backup-decision-time"] == now.isoformat()
         assert "T14:40:00+00:00" in metadata["normalized-backup-time"]
 
@@ -750,7 +750,7 @@ def test_get_new_backup_details(self, pghoard):
             "metadata": {
                 "start-time": now - datetime.timedelta(hours=1),
                 "backup-decision-time": now - datetime.timedelta(hours=1),
-                "backup-reason": "scheduled",
+                "backup-reason": BackupReason.scheduled,
                 "normalized-backup-time": metadata["normalized-backup-time"],
             },
             "name": "name01",
@@ -763,7 +763,7 @@ def test_get_new_backup_details(self, pghoard):
         now2 = now + datetime.timedelta(hours=1)
         metadata = pghoard.get_new_backup_details(now=now2, site=pghoard.test_site, site_config=site_config)
         assert metadata
-        assert metadata["backup-reason"] == "scheduled"
+        assert metadata["backup-reason"] == BackupReason.scheduled
         assert metadata["backup-decision-time"] == now2.isoformat()
         assert "T16:10:00+00:00" in metadata["normalized-backup-time"]
 
@@ -779,7 +779,7 @@ def test_get_new_backup_details(self, pghoard):
         now3 = now + datetime.timedelta(hours=7)
         metadata = pghoard.get_new_backup_details(now=now3, site=pghoard.test_site, site_config=site_config)
         assert metadata
-        assert metadata["backup-reason"] == "scheduled"
+        assert metadata["backup-reason"] == BackupReason.scheduled
         assert metadata["backup-decision-time"] == now3.isoformat()
         assert "T14:50:00+00:00" in metadata["normalized-backup-time"]
 
@@ -789,7 +789,7 @@ def test_get_new_backup_details(self, pghoard):
             "metadata": {
                 "start-time": now3 - datetime.timedelta(hours=1),
                 "backup-decision-time": now - datetime.timedelta(hours=1),
-                "backup-reason": "requested",
+                "backup-reason": BackupReason.requested,
                 "normalized-backup-time": metadata["normalized-backup-time"] + "different",
             },
             "name": "name02",
@@ -807,7 +807,7 @@ def test_get_new_backup_details(self, pghoard):
         pghoard.requested_basebackup_sites.add(site)
         metadata2 = pghoard.get_new_backup_details(now=now3, site=pghoard.test_site, site_config=site_config)
         assert metadata2
-        assert metadata2["backup-reason"] == "requested"
+        assert metadata2["backup-reason"] == BackupReason.requested
         assert metadata2["backup-decision-time"] == now3.isoformat()
         assert metadata2["normalized-backup-time"] == metadata["normalized-backup-time"]
 
metadata2["backup-decision-time"] == now3.isoformat() assert metadata2["normalized-backup-time"] == metadata["normalized-backup-time"] @@ -827,7 +827,7 @@ def test_patch_basebackup_info(self, pghoard): pghoard.patch_basebackup_info(entry=entry, site_config=site_config) assert entry["name"] == "bar" assert entry["metadata"]["start-time"] == now - assert entry["metadata"]["backup-reason"] == "scheduled" + assert entry["metadata"]["backup-reason"] == BackupReason.scheduled assert entry["metadata"]["backup-decision-time"] == now assert isinstance(entry["metadata"]["normalized-backup-time"], str) @@ -836,14 +836,14 @@ def test_patch_basebackup_info(self, pghoard): "metadata": { "start-time": now.isoformat(), "backup-decision-time": (now - datetime.timedelta(seconds=30)).isoformat(), - "backup-reason": "requested", + "backup-reason": BackupReason.requested, "normalized-backup-time": None, } } pghoard.patch_basebackup_info(entry=entry, site_config=site_config) assert entry["name"] == "bar" assert entry["metadata"]["start-time"] == now - assert entry["metadata"]["backup-reason"] == "requested" + assert entry["metadata"]["backup-reason"] == BackupReason.requested assert entry["metadata"]["backup-decision-time"] == now - datetime.timedelta(seconds=30) assert entry["metadata"]["normalized-backup-time"] is None diff --git a/test/conftest.py b/test/conftest.py index 4ebab8d3..dffd51a0 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -30,6 +30,7 @@ from pghoard import config as pghconfig from pghoard import logutil, pgutil from pghoard.archive_cleanup import ArchiveCleanup +from pghoard.common import BackupReason from pghoard.pghoard import PGHoard logutil.configure_logging() @@ -491,7 +492,7 @@ def fixture_archive_cleaner(tmp_path): bb_metadata = { "_hash": "abc", "backup-decision-time": "2022-03-23T14:57:55.883514+00:00", - "backup-reason": "scheduled", + "backup-reason": BackupReason.scheduled, "start-time": "2022-03-23T15:57:55+01:00", "start-wal-segment": "000000010000000000000002", "active-backup-mode": "basic", diff --git a/test/test_pghoard.py b/test/test_pghoard.py index 0d364f5c..d6439259 100644 --- a/test/test_pghoard.py +++ b/test/test_pghoard.py @@ -17,7 +17,7 @@ import pghoard.pghoard as pghoard_module from pghoard.common import ( - TAR_METADATA_FILENAME, BaseBackupFormat, FileType, create_alert_file, delete_alert_file, write_json_file + TAR_METADATA_FILENAME, BackupReason, BaseBackupFormat, FileType, create_alert_file, delete_alert_file, write_json_file ) from pghoard.pghoard import PGHoard from pghoard.pgutil import create_connection_string @@ -92,7 +92,7 @@ def test_get_local_basebackups_info(self): assert available_backup["name"] == "2015-07-03_0" start_time = datetime.datetime(2015, 7, 3, 12, tzinfo=datetime.timezone.utc) assert available_backup["metadata"]["start-time"] == start_time - assert available_backup["metadata"]["backup-reason"] == "scheduled" + assert available_backup["metadata"]["backup-reason"] == BackupReason.scheduled assert available_backup["metadata"]["normalized-backup-time"] is None assert available_backup["metadata"]["backup-decision-time"] From 3363fb53bd25dd3a7ce11e48cab1a6b442c82a6f Mon Sep 17 00:00:00 2001 From: Egor Voynov Date: Wed, 10 Apr 2024 12:16:18 +0200 Subject: [PATCH 2/3] pghoard: ignore delta backup failures counter in some cases Cases: 1. Backup was requested by operator: `AVN-PROD service request-backup` 2. 
diff --git a/pghoard/pghoard.py b/pghoard/pghoard.py
index 17be4ad6..852497fa 100644
--- a/pghoard/pghoard.py
+++ b/pghoard/pghoard.py
@@ -779,7 +779,15 @@ def handle_site(self, site, site_config):
         if metadata and not os.path.exists(self.config["maintenance_mode_file"]):
             if site in self.delta_backup_failures:
                 retries = self.delta_backup_failures[site].retries
-                if retries > site_config["basebackup_delta_mode_max_retries"]:
+                bih = site_config.get("basebackup_interval_hours")
+                backup_interval = datetime.timedelta(hours=bih) if bih else None
+                last_failed_time = self.delta_backup_failures[site].last_failed_time
+                since_last_fail_interval = utc_now() - last_failed_time if last_failed_time else None
+                if metadata["backup-reason"] == BackupReason.requested:
+                    self.log.info("Re-trying delta basebackup. Backup was requested")
+                elif backup_interval and since_last_fail_interval and since_last_fail_interval > backup_interval:
+                    self.log.info("Re-trying delta basebackup. \"%s\" has passed since the last failure", since_last_fail_interval)
+                elif retries > site_config["basebackup_delta_mode_max_retries"]:
                     self.log.info("Giving up backup after exceeding max retries: %r", retries)
                     return
                 else:
@@ -787,7 +795,7 @@ def handle_site(self, site, site_config):
                     retry_interval = min(3 ** (retries + 1), 60 * 60)
                     if utc_now(
                     ) >= self.delta_backup_failures[site].last_failed_time + datetime.timedelta(seconds=retry_interval):
-                        self.log.info("Re-trying delta basebackup")
+                        self.log.info("Re-trying delta basebackup. Retry: %r", retries)
                     else:
                         self.log.info("Waiting for backoff time before re-trying new delta backup due to previous failures")
                         return
diff --git a/test/basebackup/test_basebackup.py b/test/basebackup/test_basebackup.py
index 00578185..961e524d 100644
--- a/test/basebackup/test_basebackup.py
+++ b/test/basebackup/test_basebackup.py
@@ -24,6 +24,7 @@
 from pghoard import common, metrics
 from pghoard.basebackup.base import PGBaseBackup
 from pghoard.common import (BackupReason, BaseBackupFormat, BaseBackupMode, CallbackEvent, CallbackQueue)
+from pghoard.pghoard import DeltaBaseBackupFailureInfo
 from pghoard.restore import Restore, RestoreError
 
 from ..conftest import PGHoardForTest, PGTester
@@ -639,6 +640,30 @@ def test_basebackups_tablespaces(self, capsys, db, pghoard, tmpdir, pg_version:
             cursor.execute("DROP TABLESPACE tstest")
         conn.close()
 
+    def test_handle_site_create_backup_ignoring_failures(self, pghoard):
+        site_config = deepcopy(pghoard.config["backup_sites"][pghoard.test_site])
+        assert pghoard.basebackups == {}
+        utc_now_dt = datetime.datetime.now(datetime.timezone.utc)
+        pghoard.delta_backup_failures[pghoard.test_site] = DeltaBaseBackupFailureInfo(
+            retries=site_config["basebackup_delta_mode_max_retries"] + 1, last_failed_time=utc_now_dt
+        )
+
+        # skip - too many errors
+        pghoard.handle_site(pghoard.test_site, site_config)
+        assert pghoard.test_site not in pghoard.basebackups
+
+        # create "requested" backup
+        pghoard.requested_basebackup_sites.add(pghoard.test_site)
+        pghoard.handle_site(pghoard.test_site, site_config)
+        assert pghoard.test_site in pghoard.basebackups
+
+        # last failed attempt was more than "basebackup_interval_hours" ago - create backup
+        pghoard.basebackups = {}
+        last_failed_time = utc_now_dt - datetime.timedelta(hours=site_config["basebackup_interval_hours"], seconds=5)
+        pghoard.delta_backup_failures[pghoard.test_site].last_failed_time = last_failed_time
+        pghoard.handle_site(pghoard.test_site, site_config)
+        assert pghoard.test_site in pghoard.basebackups
+
     def test_handle_site(self, pghoard):
         site_config = deepcopy(pghoard.config["backup_sites"][pghoard.test_site])
         site_config["basebackup_interval_hours"] = 1 / 3600

From c479f652cdee0f60e1c9b653d54ded6751dd1c72 Mon Sep 17 00:00:00 2001
From: Egor Voynov
Date: Wed, 10 Apr 2024 12:33:53 +0200
Subject: [PATCH 3/3] pghoard: fix flaky test test_surviving_pg_receivewal_hickup

In the CI environment, threads don't have enough time to initialize.
---
 test/test_pghoard.py | 3 +++
 1 file changed, 3 insertions(+)
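Note for reviewers: the fixed sleeps below are the minimal fix. If slower CI
runners still race past 0.5 s, polling for the condition instead of sleeping
is a common alternative; an illustrative helper (not part of this patch, the
name is made up):

    import time

    def wait_until(condition, timeout=5.0, interval=0.1):
        # Poll until condition() returns truthy or `timeout` seconds elapse.
        deadline = time.monotonic() + timeout
        while time.monotonic() < deadline:
            if condition():
                return True
            time.sleep(interval)
        return False

    # Usage would replace a fixed sleep after restarting the listener, e.g.:
    # assert wait_until(lambda: pghoard.receivexlogs[pghoard.test_site].is_alive())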
diff --git a/test/test_pghoard.py b/test/test_pghoard.py
index d6439259..37ad3b46 100644
--- a/test/test_pghoard.py
+++ b/test/test_pghoard.py
@@ -900,6 +900,7 @@ def test_surviving_pg_receivewal_hickup(self, db, pghoard):
         os.makedirs(wal_directory, exist_ok=True)
 
         pghoard.receivexlog_listener(pghoard.test_site, db.user, wal_directory)
+        time.sleep(0.5)  # waiting for thread setup
         conn = db.connect()
         conn.autocommit = True
 
@@ -918,6 +919,7 @@ def test_surviving_pg_receivewal_hickup(self, db, pghoard):
         # stopping the thread is not enough, it's possible that killed receiver will leave incomplete partial files
         # around, pghoard is capable of cleaning those up but needs to be restarted, for the test it should be OK
         # just to call startup_walk_for_missed_files, so it takes care of cleaning up
+        time.sleep(0.5)  # waiting for the end of file processing
        pghoard.startup_walk_for_missed_files()
 
         n_xlogs = pghoard.transfer_agent_state[pghoard.test_site]["upload"]["xlog"]["xlogs_since_basebackup"]
@@ -930,6 +932,7 @@ def test_surviving_pg_receivewal_hickup(self, db, pghoard):
         # restart
         pghoard.receivexlog_listener(pghoard.test_site, db.user, wal_directory)
         assert pghoard.receivexlogs[pghoard.test_site].is_alive()
+        time.sleep(0.5)  # waiting for thread setup
 
         # We should now process all created segments, not only the ones which were created after pg_receivewal was restarted
         wait_for_xlog(pghoard, n_xlogs + 10)