pghoard: ignore delta backup failures counter in some cases #621

Merged: 3 commits, May 27, 2024

6 changes: 6 additions & 0 deletions pghoard/common.py
@@ -104,6 +104,12 @@ class BaseBackupMode(StrEnum):
pipe = "pipe"


@enum.unique
class BackupReason(StrEnum):
requested = "requested"
scheduled = "scheduled"


class ProgressData(BaseModel):
current_progress: float = 0
last_updated_time: float = 0
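
Note (not part of the diff): the swap from string literals to the new enum is backward compatible, assuming pghoard's StrEnum is a str-based enum (as Python 3.11's enum.StrEnum is), because the members compare equal to the plain strings already stored in backup metadata. A minimal sketch:

import enum


# Stand-in for pghoard's StrEnum; an assumption, shown only to illustrate
# that members of a str-based enum compare equal to plain strings.
class StrEnum(str, enum.Enum):
    def __str__(self):
        return str(self.value)


@enum.unique
class BackupReason(StrEnum):
    requested = "requested"
    scheduled = "scheduled"


# Metadata loaded from object storage still contains plain strings, so
# comparisons keep working after the refactor.
assert BackupReason.scheduled == "scheduled"
assert {"backup-reason": "requested"}["backup-reason"] == BackupReason.requested
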
28 changes: 19 additions & 9 deletions pghoard/pghoard.py
@@ -34,7 +34,7 @@
from pghoard import config, logutil, metrics, version, wal
from pghoard.basebackup.base import PGBaseBackup
from pghoard.common import (
BaseBackupFormat, BaseBackupMode, CallbackEvent, FileType, FileTypePrefixes, create_alert_file,
BackupReason, BaseBackupFormat, BaseBackupMode, CallbackEvent, FileType, FileTypePrefixes, create_alert_file,
download_backup_meta_file, extract_pghoard_bb_v2_metadata, extract_pghoard_delta_metadata, get_object_storage_config,
replication_connection_string_and_slot_using_pgpass, write_json_file
)
@@ -481,7 +481,7 @@ def patch_basebackup_info(self, *, entry, site_config):
metadata["backup-decision-time"] = metadata["start-time"]
# Backups are usually scheduled
if "backup-reason" not in metadata:
metadata["backup-reason"] = "scheduled"
metadata["backup-reason"] = BackupReason.scheduled
# Calculate normalized backup time based on start time if missing
if "normalized-backup-time" not in metadata:
metadata["normalized-backup-time"] = self.get_normalized_backup_time(site_config, now=metadata["start-time"])
@@ -779,15 +779,23 @@ def handle_site(self, site, site_config):
if metadata and not os.path.exists(self.config["maintenance_mode_file"]):
if site in self.delta_backup_failures:
retries = self.delta_backup_failures[site].retries
if retries > site_config["basebackup_delta_mode_max_retries"]:
bih = site_config.get("basebackup_interval_hours")
backup_interval = datetime.timedelta(hours=bih) if bih else None
last_failed_time = self.delta_backup_failures[site].last_failed_time
since_last_fail_interval = utc_now() - last_failed_time if last_failed_time else None
if metadata["backup-reason"] == BackupReason.requested:
self.log.info("Re-trying delta basebackup. Backup was requested")
elif backup_interval and since_last_fail_interval and since_last_fail_interval > backup_interval:
Contributor: Need to discuss whether we need to consider a backoff retry interval for these two new cases. Just retrying without waiting a bit seems less meaningful to me.

Contributor: I would not do any backoff if the backup was requested manually; it's up to the user to decide whether the backup needs to be retried.
As for the second case, I am not quite sure it makes sense: if the backup kept failing until it maxed out and is still failing, the problem probably has not gone away, so I would not waste resources and money on object storage calls.

Contributor: Agreed. Only let "requested" backups keep retrying.

self.log.info("Re-trying delta basebackup. \"%s\" have passed since last fail", since_last_fail_interval)
elif retries > site_config["basebackup_delta_mode_max_retries"]:
self.log.info("Giving up backup after exceeding max retries: %r", retries)
return
else:
# Start from ~3 sec with cap of one hour
retry_interval = min(3 ** (retries + 1), 60 * 60)
if utc_now(
) >= self.delta_backup_failures[site].last_failed_time + datetime.timedelta(seconds=retry_interval):
self.log.info("Re-trying delta basebackup")
self.log.info("Re-trying delta basebackup. Retry: %r", retries)
else:
self.log.info("Waiting for backoff time before re-trying new delta backup due to previous failures")
return
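
As a side note on the backoff branch above, a small sketch (not pghoard code) of how the retry interval from the diff grows with the failure counter, capped at one hour:

# retry_interval = min(3 ** (retries + 1), 60 * 60), as in the diff above
for retries in range(8):
    print(retries, min(3 ** (retries + 1), 60 * 60))
# 0 3
# 1 9
# 2 27
# 3 81
# 4 243
# 5 729
# 6 2187
# 7 3600  <- capped at one hour from the 7th retry onwards
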
@@ -815,17 +823,19 @@ def get_new_backup_details(self, *, now=None, site, site_config):
if site in self.requested_basebackup_sites:
self.log.info("Creating a new basebackup for %r due to request", site)
self.requested_basebackup_sites.discard(site)
backup_reason = "requested"
backup_reason = BackupReason.requested
elif site_config["basebackup_interval_hours"] is None:
# Basebackups are disabled for this site (but they can still be requested over the API.)
pass
elif not basebackups:
self.log.info("Creating a new basebackup for %r because there are currently none", site)
backup_reason = "scheduled"
backup_reason = BackupReason.scheduled
elif backup_hour is not None and backup_minute is not None:
most_recent_scheduled = None
last_normalized_backup_time = basebackups[-1]["metadata"]["normalized-backup-time"]
scheduled_backups = [backup for backup in basebackups if backup["metadata"]["backup-reason"] == "scheduled"]
scheduled_backups = [
backup for backup in basebackups if backup["metadata"]["backup-reason"] == BackupReason.scheduled
]
if scheduled_backups:
most_recent_scheduled = scheduled_backups[-1]["metadata"]["backup-decision-time"]

@@ -841,7 +851,7 @@ def get_new_backup_details(self, *, now=None, site, site_config):
"Normalized backup time %r differs from previous %r, creating new basebackup", normalized_backup_time,
last_normalized_backup_time
)
backup_reason = "scheduled"
backup_reason = BackupReason.scheduled
elif backup_hour is not None and backup_minute is None:
self.log.warning("Ignoring basebackup_hour as basebackup_minute is not defined")
else:
@@ -852,7 +862,7 @@ def get_new_backup_details(self, *, now=None, site, site_config):
self.log.info(
"Creating a new basebackup for %r by schedule (%s from previous)", site, delta_since_last_backup
)
backup_reason = "scheduled"
backup_reason = BackupReason.scheduled

if not backup_reason:
return None
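
Reading the get_new_backup_details hunks together, the precedence of the checks is roughly the following. This is a reconstruction for orientation only; the collapsed context may carry additional conditions, and the helper flags stand in for the elided logic.

def choose_backup_reason(requested, interval_hours, basebackups,
                         backup_hour, backup_minute,
                         normalized_time_changed, interval_elapsed):
    if requested:
        return BackupReason.requested      # explicit API request always wins
    if interval_hours is None:
        return None                        # scheduled backups disabled for the site
    if not basebackups:
        return BackupReason.scheduled      # nothing exists yet
    if backup_hour is not None and backup_minute is not None:
        # schedule-based: new backup once the normalized backup time has moved on
        return BackupReason.scheduled if normalized_time_changed else None
    # interval-based fallback: new backup once basebackup_interval_hours has elapsed
    return BackupReason.scheduled if interval_elapsed else None
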
55 changes: 40 additions & 15 deletions test/basebackup/test_basebackup.py
@@ -23,7 +23,8 @@

from pghoard import common, metrics
from pghoard.basebackup.base import PGBaseBackup
from pghoard.common import (BaseBackupFormat, BaseBackupMode, CallbackEvent, CallbackQueue)
from pghoard.common import (BackupReason, BaseBackupFormat, BaseBackupMode, CallbackEvent, CallbackQueue)
from pghoard.pghoard import DeltaBaseBackupFailureInfo
from pghoard.restore import Restore, RestoreError

from ..conftest import PGHoardForTest, PGTester
@@ -240,7 +241,7 @@ def _test_create_basebackup(self, capsys, db, pghoard, mode, replica=False, acti

now = datetime.datetime.now(datetime.timezone.utc)
metadata = {
"backup-reason": "scheduled",
"backup-reason": BackupReason.scheduled,
"backup-decision-time": now.isoformat(),
"normalized-backup-time": now.isoformat(),
}
@@ -276,7 +277,7 @@ def _test_create_basebackup(self, capsys, db, pghoard, mode, replica=False, acti
assert "start-wal-segment" in last_backup["metadata"]
assert "start-time" in last_backup["metadata"]
assert dateutil.parser.parse(last_backup["metadata"]["start-time"]).tzinfo # pylint: disable=no-member
assert last_backup["metadata"]["backup-reason"] == "scheduled"
assert last_backup["metadata"]["backup-reason"] == BackupReason.scheduled
assert last_backup["metadata"]["backup-decision-time"] == now.isoformat()
assert last_backup["metadata"]["normalized-backup-time"] == now.isoformat()
if mode in {BaseBackupMode.local_tar, BaseBackupMode.delta}:
@@ -639,6 +640,30 @@ def test_basebackups_tablespaces(self, capsys, db, pghoard, tmpdir, pg_version:
cursor.execute("DROP TABLESPACE tstest")
conn.close()

def test_handle_site_create_backup_ignoring_failures(self, pghoard):
site_config = deepcopy(pghoard.config["backup_sites"][pghoard.test_site])
assert pghoard.basebackups == {}
utc_now_dt = datetime.datetime.now(datetime.timezone.utc)
pghoard.delta_backup_failures[pghoard.test_site] = DeltaBaseBackupFailureInfo(
retries=site_config["basebackup_delta_mode_max_retries"] + 1, last_failed_time=utc_now_dt
)

# skip - too many errors
pghoard.handle_site(pghoard.test_site, site_config)
assert pghoard.test_site not in pghoard.basebackups

# create "requested" backup
pghoard.requested_basebackup_sites.add(pghoard.test_site)
pghoard.handle_site(pghoard.test_site, site_config)
assert pghoard.test_site in pghoard.basebackups

# last failed attempt was more than "basebackup_interval_hours" ago - create backup
pghoard.basebackups = {}
last_failed_time = utc_now_dt - datetime.timedelta(hours=site_config["basebackup_interval_hours"], seconds=5)
pghoard.delta_backup_failures[pghoard.test_site].last_failed_time = last_failed_time
pghoard.handle_site(pghoard.test_site, site_config)
assert pghoard.test_site in pghoard.basebackups
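
The test above only relies on two fields of DeltaBaseBackupFailureInfo. A plausible shape, inferred solely from that usage (an assumption; the real definition lives in pghoard/pghoard.py and may differ), would be:

import dataclasses
import datetime


# Hypothetical reconstruction based only on the fields the test sets above.
@dataclasses.dataclass
class DeltaBaseBackupFailureInfo:
    last_failed_time: datetime.datetime
    retries: int = 0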

def test_handle_site(self, pghoard):
site_config = deepcopy(pghoard.config["backup_sites"][pghoard.test_site])
site_config["basebackup_interval_hours"] = 1 / 3600
Expand Down Expand Up @@ -672,7 +697,7 @@ def test_handle_site(self, pghoard):
pghoard.handle_site(pghoard.test_site, site_config)
assert pghoard.test_site not in pghoard.basebackups
first_basebackups = pghoard.state["backup_sites"][pghoard.test_site]["basebackups"]
assert first_basebackups[0]["metadata"]["backup-reason"] == "scheduled"
assert first_basebackups[0]["metadata"]["backup-reason"] == BackupReason.scheduled
assert first_basebackups[0]["metadata"]["backup-decision-time"]
assert first_basebackups[0]["metadata"]["normalized-backup-time"] is None
first_time_of_check = pghoard.time_of_last_backup_check[pghoard.test_site]
@@ -725,7 +750,7 @@ def test_get_new_backup_details(self, pghoard):
# No backups, one should be created. No backup schedule defined so normalized backup time is None
metadata = pghoard.get_new_backup_details(now=now, site=pghoard.test_site, site_config=site_config)
assert metadata
assert metadata["backup-reason"] == "scheduled"
assert metadata["backup-reason"] == BackupReason.scheduled
assert metadata["backup-decision-time"] == now.isoformat()
assert metadata["normalized-backup-time"] is None

@@ -734,23 +759,23 @@ def test_get_new_backup_details(self, pghoard):
site_config["basebackup_minute"] = 10
metadata = pghoard.get_new_backup_details(now=now, site=pghoard.test_site, site_config=site_config)
assert metadata
assert metadata["backup-reason"] == "scheduled"
assert metadata["backup-reason"] == BackupReason.scheduled
assert metadata["backup-decision-time"] == now.isoformat()
assert "T13:10:00+00:00" in metadata["normalized-backup-time"]

# No backups, one should be created. Backup schedule defined so normalized backup time is set
site_config["basebackup_interval_hours"] = 1.5
metadata = pghoard.get_new_backup_details(now=now, site=pghoard.test_site, site_config=site_config)
assert metadata
assert metadata["backup-reason"] == "scheduled"
assert metadata["backup-reason"] == BackupReason.scheduled
assert metadata["backup-decision-time"] == now.isoformat()
assert "T14:40:00+00:00" in metadata["normalized-backup-time"]

pghoard.state["backup_sites"][site]["basebackups"].append({
"metadata": {
"start-time": now - datetime.timedelta(hours=1),
"backup-decision-time": now - datetime.timedelta(hours=1),
"backup-reason": "scheduled",
"backup-reason": BackupReason.scheduled,
"normalized-backup-time": metadata["normalized-backup-time"],
},
"name": "name01",
@@ -763,7 +788,7 @@ def test_get_new_backup_details(self, pghoard):
now2 = now + datetime.timedelta(hours=1)
metadata = pghoard.get_new_backup_details(now=now2, site=pghoard.test_site, site_config=site_config)
assert metadata
assert metadata["backup-reason"] == "scheduled"
assert metadata["backup-reason"] == BackupReason.scheduled
assert metadata["backup-decision-time"] == now2.isoformat()
assert "T16:10:00+00:00" in metadata["normalized-backup-time"]

@@ -779,7 +804,7 @@ def test_get_new_backup_details(self, pghoard):
now3 = now + datetime.timedelta(hours=7)
metadata = pghoard.get_new_backup_details(now=now3, site=pghoard.test_site, site_config=site_config)
assert metadata
assert metadata["backup-reason"] == "scheduled"
assert metadata["backup-reason"] == BackupReason.scheduled
assert metadata["backup-decision-time"] == now3.isoformat()
assert "T14:50:00+00:00" in metadata["normalized-backup-time"]

@@ -789,7 +814,7 @@ def test_get_new_backup_details(self, pghoard):
"metadata": {
"start-time": now3 - datetime.timedelta(hours=1),
"backup-decision-time": now - datetime.timedelta(hours=1),
"backup-reason": "requested",
"backup-reason": BackupReason.requested,
"normalized-backup-time": metadata["normalized-backup-time"] + "different",
},
"name": "name02",
@@ -807,7 +832,7 @@ def test_get_new_backup_details(self, pghoard):
pghoard.requested_basebackup_sites.add(site)
metadata2 = pghoard.get_new_backup_details(now=now3, site=pghoard.test_site, site_config=site_config)
assert metadata2
assert metadata2["backup-reason"] == "requested"
assert metadata2["backup-reason"] == BackupReason.requested
assert metadata2["backup-decision-time"] == now3.isoformat()
assert metadata2["normalized-backup-time"] == metadata["normalized-backup-time"]

@@ -827,7 +852,7 @@ def test_patch_basebackup_info(self, pghoard):
pghoard.patch_basebackup_info(entry=entry, site_config=site_config)
assert entry["name"] == "bar"
assert entry["metadata"]["start-time"] == now
assert entry["metadata"]["backup-reason"] == "scheduled"
assert entry["metadata"]["backup-reason"] == BackupReason.scheduled
assert entry["metadata"]["backup-decision-time"] == now
assert isinstance(entry["metadata"]["normalized-backup-time"], str)

@@ -836,14 +861,14 @@ def test_patch_basebackup_info(self, pghoard):
"metadata": {
"start-time": now.isoformat(),
"backup-decision-time": (now - datetime.timedelta(seconds=30)).isoformat(),
"backup-reason": "requested",
"backup-reason": BackupReason.requested,
"normalized-backup-time": None,
}
}
pghoard.patch_basebackup_info(entry=entry, site_config=site_config)
assert entry["name"] == "bar"
assert entry["metadata"]["start-time"] == now
assert entry["metadata"]["backup-reason"] == "requested"
assert entry["metadata"]["backup-reason"] == BackupReason.requested
assert entry["metadata"]["backup-decision-time"] == now - datetime.timedelta(seconds=30)
assert entry["metadata"]["normalized-backup-time"] is None

3 changes: 2 additions & 1 deletion test/conftest.py
@@ -30,6 +30,7 @@
from pghoard import config as pghconfig
from pghoard import logutil, pgutil
from pghoard.archive_cleanup import ArchiveCleanup
from pghoard.common import BackupReason
from pghoard.pghoard import PGHoard

logutil.configure_logging()
@@ -491,7 +492,7 @@ def fixture_archive_cleaner(tmp_path):
bb_metadata = {
"_hash": "abc",
"backup-decision-time": "2022-03-23T14:57:55.883514+00:00",
"backup-reason": "scheduled",
"backup-reason": BackupReason.scheduled,
"start-time": "2022-03-23T15:57:55+01:00",
"start-wal-segment": "000000010000000000000002",
"active-backup-mode": "basic",
7 changes: 5 additions & 2 deletions test/test_pghoard.py
@@ -17,7 +17,7 @@

import pghoard.pghoard as pghoard_module
from pghoard.common import (
TAR_METADATA_FILENAME, BaseBackupFormat, FileType, create_alert_file, delete_alert_file, write_json_file
TAR_METADATA_FILENAME, BackupReason, BaseBackupFormat, FileType, create_alert_file, delete_alert_file, write_json_file
)
from pghoard.pghoard import PGHoard
from pghoard.pgutil import create_connection_string
@@ -92,7 +92,7 @@ def test_get_local_basebackups_info(self):
assert available_backup["name"] == "2015-07-03_0"
start_time = datetime.datetime(2015, 7, 3, 12, tzinfo=datetime.timezone.utc)
assert available_backup["metadata"]["start-time"] == start_time
assert available_backup["metadata"]["backup-reason"] == "scheduled"
assert available_backup["metadata"]["backup-reason"] == BackupReason.scheduled
assert available_backup["metadata"]["normalized-backup-time"] is None
assert available_backup["metadata"]["backup-decision-time"]

@@ -900,6 +900,7 @@ def test_surviving_pg_receivewal_hickup(self, db, pghoard):
os.makedirs(wal_directory, exist_ok=True)

pghoard.receivexlog_listener(pghoard.test_site, db.user, wal_directory)
time.sleep(0.5) # waiting for thread setup
Contributor: Are those sleeps really necessary?

conn = db.connect()
conn.autocommit = True

@@ -918,6 +919,7 @@ def test_surviving_pg_receivewal_hickup(self, db, pghoard):
# stopping the thread is not enough, it's possible that killed receiver will leave incomplete partial files
# around, pghoard is capable of cleaning those up but needs to be restarted, for the test it should be OK
# just to call startup_walk_for_missed_files, so it takes care of cleaning up
time.sleep(0.5) # waiting for the end of file processing
pghoard.startup_walk_for_missed_files()

n_xlogs = pghoard.transfer_agent_state[pghoard.test_site]["upload"]["xlog"]["xlogs_since_basebackup"]
@@ -930,6 +932,7 @@ def test_surviving_pg_receivewal_hickup(self, db, pghoard):
# restart
pghoard.receivexlog_listener(pghoard.test_site, db.user, wal_directory)
assert pghoard.receivexlogs[pghoard.test_site].is_alive()
time.sleep(0.5) # waiting for thread setup

# We should now process all created segments, not only the ones which were created after pg_receivewal was restarted
wait_for_xlog(pghoard, n_xlogs + 10)
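
Regarding the reviewer's question about the fixed sleeps, one common alternative is to poll for the condition with a timeout instead of sleeping a fixed amount. A sketch of such a helper (hypothetical, not part of pghoard's test utilities):

import time


def wait_until(condition, timeout=5.0, interval=0.1):
    """Poll `condition` until it returns a truthy value or `timeout` seconds pass."""
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        if condition():
            return True
        time.sleep(interval)
    return False

# e.g. instead of time.sleep(0.5) after restarting the listener:
# assert wait_until(lambda: pghoard.receivexlogs[pghoard.test_site].is_alive())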