Skip to content

Commit

Permalink
When recovering with --no-get-wal and --target-time, copy all WAL…
Browse files Browse the repository at this point in the history
… files

Previous to this commit Barman would attempt to guess the required WAL
files using the filesystem creation timestamp of them.

However, that is not a reliable approach. For example, if there is
streaming replication lag, the WAL files will be created in the Barman
host later when compared to Postgres.

That can be even worse in the case of `archive_command`, because it
waits for the WAL files to be filled up before making them available
for archiving.

In those cases the recovery could end up failing because of missing
`COMMIT` or `ABORT` records in the WAL files that were copied by Barman,
i.e. Postgres would fail to perform recovery because it wouldn't know if
it satistified or not the requested `recovery_target_time`.

From now on, if the user requests a recovery with `--no-get-wal` and
`--target-time`, Barman will simply copy all WAL files up to the
timeline being recovered, guaranteeing that way that Postgres will be
able to find `COMMIT` or `ABORT` records, if they exist in the archived
WAL files, making it possible to complete the recovery.

Note: we evaluated other implementations to avoid possibly copying a lot
of unused WAL files. For example, with `pg_waldump` we would be able to
look up for `COMMIT` and `ABORT` records in a way similar to what
Postgres does. However, that could put a lot of overhead wherever that
would be processed (during WAL archiving or backup recovery), so that
option was discarded.

References: BAR-189 #881.

Signed-off-by: Israel Barth Rubio <[email protected]>
  • Loading branch information
barthisrael committed Jun 26, 2024
1 parent d468322 commit aa14bab
Show file tree
Hide file tree
Showing 4 changed files with 8 additions and 24 deletions.
8 changes: 2 additions & 6 deletions barman/recovery_executor.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,6 @@
import shutil
import socket
import tempfile
import time
from io import BytesIO

import dateutil.parser
Expand Down Expand Up @@ -254,7 +253,8 @@ def recover(
# Retrieve a list of required log files
required_xlog_files = tuple(
self.server.get_required_xlog_files(
backup_info, target_tli, recovery_info["target_epoch"]
backup_info,
target_tli,
)
)

Expand Down Expand Up @@ -448,7 +448,6 @@ def _set_pitr_targets(
is reached
:param str|None target_action: recovery target action for PITR
"""
target_epoch = None
target_datetime = None

# Calculate the integer value of TLI if a keyword is provided
Expand Down Expand Up @@ -507,8 +506,6 @@ def _set_pitr_targets(
% (target_datetime, backup_info.end_time)
)

ms = target_datetime.microsecond / 1000000.0
target_epoch = time.mktime(target_datetime.timetuple()) + ms
targets["time"] = str(target_datetime)
if target_xid:
targets["xid"] = str(target_xid)
Expand Down Expand Up @@ -578,7 +575,6 @@ def _set_pitr_targets(
"Can't enable recovery target action when PITR is not required"
)

recovery_info["target_epoch"] = target_epoch
recovery_info["target_datetime"] = target_datetime

def _retrieve_safe_horizon(self, recovery_info, backup_info, dest):
Expand Down
2 changes: 0 additions & 2 deletions barman/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -1836,8 +1836,6 @@ def get_required_xlog_files(
yield wal_info
if wal_info.name > end:
end = wal_info.name
if target_time and wal_info.time > target_time:
break
# return all the remaining history files
for line in fxlogdb:
wal_info = WalFileInfo.from_xlogdb_line(line)
Expand Down
12 changes: 0 additions & 12 deletions tests/test_recovery_executor.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@
from functools import partial
import os
import shutil
import time
from contextlib import closing

import dateutil
Expand Down Expand Up @@ -390,7 +389,6 @@ def test_set_pitr_targets(self, tmpdir):
recovery_info, backup_info, dest.strpath, "", "", "", "", "", False, None
)
# Test with empty values (no PITR)
assert recovery_info["target_epoch"] is None
assert recovery_info["target_datetime"] is None
assert recovery_info["wal_dest"] == wal_dest.strpath

Expand All @@ -408,12 +406,8 @@ def test_set_pitr_targets(self, tmpdir):
None,
)
target_datetime = dateutil.parser.parse("2015-06-03 16:11:03.710380+02:00")
target_epoch = time.mktime(target_datetime.timetuple()) + (
target_datetime.microsecond / 1000000.0
)

assert recovery_info["target_datetime"] == target_datetime
assert recovery_info["target_epoch"] == target_epoch
assert recovery_info["wal_dest"] == dest.join("barman_wal").strpath

# Test for PITR targets with implicit target time
Expand All @@ -431,12 +425,8 @@ def test_set_pitr_targets(self, tmpdir):
)
target_datetime = dateutil.parser.parse("2015-06-03 16:11:03.710380")
target_datetime = target_datetime.replace(tzinfo=dateutil.tz.tzlocal())
target_epoch = time.mktime(target_datetime.timetuple()) + (
target_datetime.microsecond / 1000000.0
)

assert recovery_info["target_datetime"] == target_datetime
assert recovery_info["target_epoch"] == target_epoch
assert recovery_info["wal_dest"] == dest.join("barman_wal").strpath

# Test for too early PITR target
Expand Down Expand Up @@ -1325,7 +1315,6 @@ def test_recovery(
),
],
},
"target_epoch": None,
"configuration_files": ["postgresql.conf", "postgresql.auto.conf"],
"target_datetime": None,
"safe_horizon": None,
Expand Down Expand Up @@ -1374,7 +1363,6 @@ def test_recovery(
),
],
},
"target_epoch": None,
"configuration_files": ["postgresql.conf", "postgresql.auto.conf"],
"target_datetime": None,
"safe_horizon": None,
Expand Down
10 changes: 6 additions & 4 deletions tests/test_server.py
Original file line number Diff line number Diff line change
Expand Up @@ -339,10 +339,12 @@ def test_get_wal_full_path(self, tmpdir):
# AND a target_time of 44
44,
# WHEN get_required_xlog_files runs for a backup on tli 2
# the first two WALs on tli 2 are returned along with all history
# files. The WAL on tli 2 which starts after the target_time is
# not returned.
[1, 2, 3, 5, 7],
# all WALs on tli 2 are returned along with all history files.
# All WALs on tli 2 are returned because there is no reliable
# way of determining the required WAL files based on target_time
# other than inspecting pg_waldump, which would put a lot of
# overhead
[1, 2, 3, 4, 5, 7],
),
(
# Verify both WALs on timeline 2 are returned plus all history files
Expand Down

0 comments on commit aa14bab

Please sign in to comment.