Skip to content

Commit

Permalink
Merge pull request #621 from Aiven-Open/egor-voynov-reset-backup-attempt-counter
Browse files Browse the repository at this point in the history

pghoard: ignore delta backup failures counter in some cases
  • Loading branch information
alexole authored May 27, 2024
2 parents cea3d0b + c479f65 commit a1da446
Show file tree
Hide file tree
Showing 5 changed files with 69 additions and 27 deletions.
6 changes: 6 additions & 0 deletions pghoard/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,12 @@ class BaseBackupMode(StrEnum):
pipe = "pipe"


@enum.unique
class BackupReason(StrEnum):
    """Reason a basebackup was created.

    Stored in backup metadata under the "backup-reason" key:
    - requested: explicitly triggered (e.g. over the API)
    - scheduled: created automatically by the regular backup schedule
    """
    requested = "requested"
    scheduled = "scheduled"


class ProgressData(BaseModel):
current_progress: float = 0
last_updated_time: float = 0
Expand Down
28 changes: 19 additions & 9 deletions pghoard/pghoard.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@
from pghoard import config, logutil, metrics, version, wal
from pghoard.basebackup.base import PGBaseBackup
from pghoard.common import (
BaseBackupFormat, BaseBackupMode, CallbackEvent, FileType, FileTypePrefixes, create_alert_file,
BackupReason, BaseBackupFormat, BaseBackupMode, CallbackEvent, FileType, FileTypePrefixes, create_alert_file,
download_backup_meta_file, extract_pghoard_bb_v2_metadata, extract_pghoard_delta_metadata, get_object_storage_config,
replication_connection_string_and_slot_using_pgpass, write_json_file
)
Expand Down Expand Up @@ -485,7 +485,7 @@ def patch_basebackup_info(self, *, entry, site_config):
metadata["backup-decision-time"] = metadata["start-time"]
# Backups are usually scheduled
if "backup-reason" not in metadata:
metadata["backup-reason"] = "scheduled"
metadata["backup-reason"] = BackupReason.scheduled
# Calculate normalized backup time based on start time if missing
if "normalized-backup-time" not in metadata:
metadata["normalized-backup-time"] = self.get_normalized_backup_time(site_config, now=metadata["start-time"])
Expand Down Expand Up @@ -784,15 +784,23 @@ def handle_site(self, site, site_config):
if metadata and not os.path.exists(self.config["maintenance_mode_file"]):
if site in self.delta_backup_failures:
retries = self.delta_backup_failures[site].retries
if retries > site_config["basebackup_delta_mode_max_retries"]:
bih = site_config.get("basebackup_interval_hours")
backup_interval = datetime.timedelta(hours=bih) if bih else None
last_failed_time = self.delta_backup_failures[site].last_failed_time
since_last_fail_interval = utc_now() - last_failed_time if last_failed_time else None
if metadata["backup-reason"] == BackupReason.requested:
self.log.info("Re-trying delta basebackup. Backup was requested")
elif backup_interval and since_last_fail_interval and since_last_fail_interval > backup_interval:
self.log.info("Re-trying delta basebackup. \"%s\" have passed since last fail", since_last_fail_interval)
elif retries > site_config["basebackup_delta_mode_max_retries"]:
self.log.info("Giving up backup after exceeding max retries: %r", retries)
return
else:
# Start from ~3 sec with cap of one hour
retry_interval = min(3 ** (retries + 1), 60 * 60)
if utc_now(
) >= self.delta_backup_failures[site].last_failed_time + datetime.timedelta(seconds=retry_interval):
self.log.info("Re-trying delta basebackup")
self.log.info("Re-trying delta basebackup. Retry: %r", retries)
else:
self.log.info("Waiting for backoff time before re-trying new delta backup due to previous failures")
return
Expand Down Expand Up @@ -820,17 +828,19 @@ def get_new_backup_details(self, *, now=None, site, site_config):
if site in self.requested_basebackup_sites:
self.log.info("Creating a new basebackup for %r due to request", site)
self.requested_basebackup_sites.discard(site)
backup_reason = "requested"
backup_reason = BackupReason.requested
elif site_config["basebackup_interval_hours"] is None:
# Basebackups are disabled for this site (but they can still be requested over the API.)
pass
elif not basebackups:
self.log.info("Creating a new basebackup for %r because there are currently none", site)
backup_reason = "scheduled"
backup_reason = BackupReason.scheduled
elif backup_hour is not None and backup_minute is not None:
most_recent_scheduled = None
last_normalized_backup_time = basebackups[-1]["metadata"]["normalized-backup-time"]
scheduled_backups = [backup for backup in basebackups if backup["metadata"]["backup-reason"] == "scheduled"]
scheduled_backups = [
backup for backup in basebackups if backup["metadata"]["backup-reason"] == BackupReason.scheduled
]
if scheduled_backups:
most_recent_scheduled = scheduled_backups[-1]["metadata"]["backup-decision-time"]

Expand All @@ -846,7 +856,7 @@ def get_new_backup_details(self, *, now=None, site, site_config):
"Normalized backup time %r differs from previous %r, creating new basebackup", normalized_backup_time,
last_normalized_backup_time
)
backup_reason = "scheduled"
backup_reason = BackupReason.scheduled
elif backup_hour is not None and backup_minute is None:
self.log.warning("Ignoring basebackup_hour as basebackup_minute is not defined")
else:
Expand All @@ -857,7 +867,7 @@ def get_new_backup_details(self, *, now=None, site, site_config):
self.log.info(
"Creating a new basebackup for %r by schedule (%s from previous)", site, delta_since_last_backup
)
backup_reason = "scheduled"
backup_reason = BackupReason.scheduled

if not backup_reason:
return None
Expand Down
55 changes: 40 additions & 15 deletions test/basebackup/test_basebackup.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,8 @@

from pghoard import common, metrics
from pghoard.basebackup.base import PGBaseBackup
from pghoard.common import (BaseBackupFormat, BaseBackupMode, CallbackEvent, CallbackQueue)
from pghoard.common import (BackupReason, BaseBackupFormat, BaseBackupMode, CallbackEvent, CallbackQueue)
from pghoard.pghoard import DeltaBaseBackupFailureInfo
from pghoard.restore import Restore, RestoreError

from ..conftest import PGHoardForTest, PGTester
Expand Down Expand Up @@ -240,7 +241,7 @@ def _test_create_basebackup(self, capsys, db, pghoard, mode, replica=False, acti

now = datetime.datetime.now(datetime.timezone.utc)
metadata = {
"backup-reason": "scheduled",
"backup-reason": BackupReason.scheduled,
"backup-decision-time": now.isoformat(),
"normalized-backup-time": now.isoformat(),
}
Expand Down Expand Up @@ -276,7 +277,7 @@ def _test_create_basebackup(self, capsys, db, pghoard, mode, replica=False, acti
assert "start-wal-segment" in last_backup["metadata"]
assert "start-time" in last_backup["metadata"]
assert dateutil.parser.parse(last_backup["metadata"]["start-time"]).tzinfo # pylint: disable=no-member
assert last_backup["metadata"]["backup-reason"] == "scheduled"
assert last_backup["metadata"]["backup-reason"] == BackupReason.scheduled
assert last_backup["metadata"]["backup-decision-time"] == now.isoformat()
assert last_backup["metadata"]["normalized-backup-time"] == now.isoformat()
if mode in {BaseBackupMode.local_tar, BaseBackupMode.delta}:
Expand Down Expand Up @@ -639,6 +640,30 @@ def test_basebackups_tablespaces(self, capsys, db, pghoard, tmpdir, pg_version:
cursor.execute("DROP TABLESPACE tstest")
conn.close()

def test_handle_site_create_backup_ignoring_failures(self, pghoard):
    """The delta-backup failure counter must be bypassed when a backup is
    explicitly requested, or when more than basebackup_interval_hours have
    passed since the last failed attempt."""
    config = deepcopy(pghoard.config["backup_sites"][pghoard.test_site])
    assert pghoard.basebackups == {}

    # Simulate a site that has already exhausted its delta retry budget.
    now = datetime.datetime.now(datetime.timezone.utc)
    pghoard.delta_backup_failures[pghoard.test_site] = DeltaBaseBackupFailureInfo(
        retries=config["basebackup_delta_mode_max_retries"] + 1, last_failed_time=now
    )

    # Too many failures: no new backup may be scheduled.
    pghoard.handle_site(pghoard.test_site, config)
    assert pghoard.test_site not in pghoard.basebackups

    # An explicitly requested backup ignores the failure counter.
    pghoard.requested_basebackup_sites.add(pghoard.test_site)
    pghoard.handle_site(pghoard.test_site, config)
    assert pghoard.test_site in pghoard.basebackups

    # The counter is also ignored once more than basebackup_interval_hours
    # have elapsed since the last failed attempt.
    pghoard.basebackups = {}
    pghoard.delta_backup_failures[pghoard.test_site].last_failed_time = now - datetime.timedelta(
        hours=config["basebackup_interval_hours"], seconds=5
    )
    pghoard.handle_site(pghoard.test_site, config)
    assert pghoard.test_site in pghoard.basebackups

def test_handle_site(self, pghoard):
site_config = deepcopy(pghoard.config["backup_sites"][pghoard.test_site])
site_config["basebackup_interval_hours"] = 1 / 3600
Expand Down Expand Up @@ -672,7 +697,7 @@ def test_handle_site(self, pghoard):
pghoard.handle_site(pghoard.test_site, site_config)
assert pghoard.test_site not in pghoard.basebackups
first_basebackups = pghoard.state["backup_sites"][pghoard.test_site]["basebackups"]
assert first_basebackups[0]["metadata"]["backup-reason"] == "scheduled"
assert first_basebackups[0]["metadata"]["backup-reason"] == BackupReason.scheduled
assert first_basebackups[0]["metadata"]["backup-decision-time"]
assert first_basebackups[0]["metadata"]["normalized-backup-time"] is None
first_time_of_check = pghoard.time_of_last_backup_check[pghoard.test_site]
Expand Down Expand Up @@ -725,7 +750,7 @@ def test_get_new_backup_details(self, pghoard):
# No backups, one should be created. No backup schedule defined so normalized backup time is None
metadata = pghoard.get_new_backup_details(now=now, site=pghoard.test_site, site_config=site_config)
assert metadata
assert metadata["backup-reason"] == "scheduled"
assert metadata["backup-reason"] == BackupReason.scheduled
assert metadata["backup-decision-time"] == now.isoformat()
assert metadata["normalized-backup-time"] is None

Expand All @@ -734,23 +759,23 @@ def test_get_new_backup_details(self, pghoard):
site_config["basebackup_minute"] = 10
metadata = pghoard.get_new_backup_details(now=now, site=pghoard.test_site, site_config=site_config)
assert metadata
assert metadata["backup-reason"] == "scheduled"
assert metadata["backup-reason"] == BackupReason.scheduled
assert metadata["backup-decision-time"] == now.isoformat()
assert "T13:10:00+00:00" in metadata["normalized-backup-time"]

# No backups, one should be created. Backup schedule defined so normalized backup time is set
site_config["basebackup_interval_hours"] = 1.5
metadata = pghoard.get_new_backup_details(now=now, site=pghoard.test_site, site_config=site_config)
assert metadata
assert metadata["backup-reason"] == "scheduled"
assert metadata["backup-reason"] == BackupReason.scheduled
assert metadata["backup-decision-time"] == now.isoformat()
assert "T14:40:00+00:00" in metadata["normalized-backup-time"]

pghoard.state["backup_sites"][site]["basebackups"].append({
"metadata": {
"start-time": now - datetime.timedelta(hours=1),
"backup-decision-time": now - datetime.timedelta(hours=1),
"backup-reason": "scheduled",
"backup-reason": BackupReason.scheduled,
"normalized-backup-time": metadata["normalized-backup-time"],
},
"name": "name01",
Expand All @@ -763,7 +788,7 @@ def test_get_new_backup_details(self, pghoard):
now2 = now + datetime.timedelta(hours=1)
metadata = pghoard.get_new_backup_details(now=now2, site=pghoard.test_site, site_config=site_config)
assert metadata
assert metadata["backup-reason"] == "scheduled"
assert metadata["backup-reason"] == BackupReason.scheduled
assert metadata["backup-decision-time"] == now2.isoformat()
assert "T16:10:00+00:00" in metadata["normalized-backup-time"]

Expand All @@ -779,7 +804,7 @@ def test_get_new_backup_details(self, pghoard):
now3 = now + datetime.timedelta(hours=7)
metadata = pghoard.get_new_backup_details(now=now3, site=pghoard.test_site, site_config=site_config)
assert metadata
assert metadata["backup-reason"] == "scheduled"
assert metadata["backup-reason"] == BackupReason.scheduled
assert metadata["backup-decision-time"] == now3.isoformat()
assert "T14:50:00+00:00" in metadata["normalized-backup-time"]

Expand All @@ -789,7 +814,7 @@ def test_get_new_backup_details(self, pghoard):
"metadata": {
"start-time": now3 - datetime.timedelta(hours=1),
"backup-decision-time": now - datetime.timedelta(hours=1),
"backup-reason": "requested",
"backup-reason": BackupReason.requested,
"normalized-backup-time": metadata["normalized-backup-time"] + "different",
},
"name": "name02",
Expand All @@ -807,7 +832,7 @@ def test_get_new_backup_details(self, pghoard):
pghoard.requested_basebackup_sites.add(site)
metadata2 = pghoard.get_new_backup_details(now=now3, site=pghoard.test_site, site_config=site_config)
assert metadata2
assert metadata2["backup-reason"] == "requested"
assert metadata2["backup-reason"] == BackupReason.requested
assert metadata2["backup-decision-time"] == now3.isoformat()
assert metadata2["normalized-backup-time"] == metadata["normalized-backup-time"]

Expand All @@ -827,7 +852,7 @@ def test_patch_basebackup_info(self, pghoard):
pghoard.patch_basebackup_info(entry=entry, site_config=site_config)
assert entry["name"] == "bar"
assert entry["metadata"]["start-time"] == now
assert entry["metadata"]["backup-reason"] == "scheduled"
assert entry["metadata"]["backup-reason"] == BackupReason.scheduled
assert entry["metadata"]["backup-decision-time"] == now
assert isinstance(entry["metadata"]["normalized-backup-time"], str)

Expand All @@ -836,14 +861,14 @@ def test_patch_basebackup_info(self, pghoard):
"metadata": {
"start-time": now.isoformat(),
"backup-decision-time": (now - datetime.timedelta(seconds=30)).isoformat(),
"backup-reason": "requested",
"backup-reason": BackupReason.requested,
"normalized-backup-time": None,
}
}
pghoard.patch_basebackup_info(entry=entry, site_config=site_config)
assert entry["name"] == "bar"
assert entry["metadata"]["start-time"] == now
assert entry["metadata"]["backup-reason"] == "requested"
assert entry["metadata"]["backup-reason"] == BackupReason.requested
assert entry["metadata"]["backup-decision-time"] == now - datetime.timedelta(seconds=30)
assert entry["metadata"]["normalized-backup-time"] is None

Expand Down
3 changes: 2 additions & 1 deletion test/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
from pghoard import config as pghconfig
from pghoard import logutil, pgutil
from pghoard.archive_cleanup import ArchiveCleanup
from pghoard.common import BackupReason
from pghoard.pghoard import PGHoard

logutil.configure_logging()
Expand Down Expand Up @@ -491,7 +492,7 @@ def fixture_archive_cleaner(tmp_path):
bb_metadata = {
"_hash": "abc",
"backup-decision-time": "2022-03-23T14:57:55.883514+00:00",
"backup-reason": "scheduled",
"backup-reason": BackupReason.scheduled,
"start-time": "2022-03-23T15:57:55+01:00",
"start-wal-segment": "000000010000000000000002",
"active-backup-mode": "basic",
Expand Down
4 changes: 2 additions & 2 deletions test/test_pghoard.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@

import pghoard.pghoard as pghoard_module
from pghoard.common import (
TAR_METADATA_FILENAME, BaseBackupFormat, FileType, create_alert_file, delete_alert_file, write_json_file
TAR_METADATA_FILENAME, BackupReason, BaseBackupFormat, FileType, create_alert_file, delete_alert_file, write_json_file
)
from pghoard.pghoard import PGHoard
from pghoard.pgutil import create_connection_string
Expand Down Expand Up @@ -92,7 +92,7 @@ def test_get_local_basebackups_info(self):
assert available_backup["name"] == "2015-07-03_0"
start_time = datetime.datetime(2015, 7, 3, 12, tzinfo=datetime.timezone.utc)
assert available_backup["metadata"]["start-time"] == start_time
assert available_backup["metadata"]["backup-reason"] == "scheduled"
assert available_backup["metadata"]["backup-reason"] == BackupReason.scheduled
assert available_backup["metadata"]["normalized-backup-time"] is None
assert available_backup["metadata"]["backup-decision-time"]

Expand Down

0 comments on commit a1da446

Please sign in to comment.