From 397dcd780720df21ce0e36a5329e233ad90c7e98 Mon Sep 17 00:00:00 2001
From: Kristjan Eimre
Date: Thu, 7 Sep 2023 19:49:57 +0200
Subject: [PATCH] reorganize; separate utility functions to backup_utils

---
 aiida/cmdline/commands/cmd_storage.py       |  47 ++++-
 aiida/orm/implementation/storage_backend.py |  33 +++
 aiida/storage/psql_dos/backend.py           | 210 ++++++++------------
 aiida/storage/psql_dos/backup_utils.py      | 115 +++++++++++
 4 files changed, 266 insertions(+), 139 deletions(-)
 create mode 100644 aiida/storage/psql_dos/backup_utils.py

diff --git a/aiida/cmdline/commands/cmd_storage.py b/aiida/cmdline/commands/cmd_storage.py
index 6e2a523a83..071b2511f5 100644
--- a/aiida/cmdline/commands/cmd_storage.py
+++ b/aiida/cmdline/commands/cmd_storage.py
@@ -169,20 +169,47 @@ def storage_maintain(ctx, full, no_repack, force, dry_run, compress):


 @verdi_storage.command('backup')
-@click.option('--path', type=click.Path(), required=True, help='Specify the backup location.')
+@click.option(
+    '--path',
+    type=click.Path(),
+    required=True,
+    help=(
+        "Path to where the backup will be created. If 'remote' is specified, must be an absolute path, "
+        'otherwise can be relative.'
+    )
+)
 @click.option(
     '--remote',
     type=click.STRING,
     default=None,
-    help='Specify remote host for backup location. If set, path needs to be absolute.'
+    help=(
+        "Remote host of the backup location. 'ssh' executable is called via subprocess and therefore remote "
+        'hosts configured for it are supported (e.g. via .ssh/config file).'
+    )
+)
+@click.option(
+    '--prev_backup',
+    type=click.Path(),
+    default=None,
+    help=(
+        'Path to the previous backup. Rsync calls will be hard-linked to this path, making the backup '
+        'incremental and efficient. If this is specified, the automatic folder management is not used.'
+    )
+)
+@click.option(
+    '--pg_dump_exec', type=click.STRING, default='pg_dump', help="Specify the 'pg_dump' executable, if not in PATH."
+)
 @click.option(
-    '--pg_dump_exec', type=click.STRING, default='pg_dump', help="Specify the 'pg_dump' executable, if needed."
+    '--rsync_exec', type=click.STRING, default='rsync', help="Specify the 'rsync' executable, if not in PATH."
 )
-@click.option('--rsync_exec', type=click.STRING, default='rsync', help="Specify the 'rsync' executable, if needed.")
 @decorators.with_dbenv()
-def storage_backup(path, remote, pg_dump_exec, rsync_exec):
-    """Create a backup of the profile data."""
+def storage_backup(path, remote, prev_backup, pg_dump_exec, rsync_exec):
+    """Create a backup of the profile data.
+
+    By default, automatically manages incremental/delta backup: creates a subfolder in the specified path,
+    and if the subfolder already exists, creates an incremental backup from it. The 'prev_backup' argument
+    disables this automatic management.
+    """
     import pathlib

     from aiida.manage.manager import get_manager
@@ -190,4 +217,10 @@ def storage_backup(path, remote, pg_dump_exec, rsync_exec):

     manager = get_manager()
     storage = manager.get_profile_storage()
-    storage.backup_auto(pathlib.Path(path), remote=remote, pg_dump_exec=pg_dump_exec, rsync_exec=rsync_exec)
+    storage.backup(
+        pathlib.Path(path),
+        remote=remote,
+        prev_backup=pathlib.Path(prev_backup) if prev_backup else None,
+        pg_dump=pg_dump_exec,
+        rsync=rsync_exec
+    )
diff --git a/aiida/orm/implementation/storage_backend.py b/aiida/orm/implementation/storage_backend.py
index bc99001ffb..25ccb8c76d 100644
--- a/aiida/orm/implementation/storage_backend.py
+++ b/aiida/orm/implementation/storage_backend.py
@@ -9,6 +9,7 @@
 ###########################################################################
 """Generic backend related objects"""
 import abc
+import pathlib
 from typing import TYPE_CHECKING, Any, ContextManager, List, Optional, Sequence, TypeVar, Union

 if TYPE_CHECKING:
@@ -288,6 +289,38 @@ def maintain(self, full: bool = False, dry_run: bool = False, **kwargs) -> None:
         :param dry_run: flag to only print the actions that would be taken without actually executing them.
         """

+    @abc.abstractmethod
+    def backup(
+        self,
+        path: pathlib.Path,
+        remote: Optional[str] = None,
+        prev_backup: Optional[pathlib.Path] = None,
+        **kwargs
+    ) -> bool:
+        """Create a backup of the storage contents.
+
+        By default, automatically manages incremental/delta backup: creates a subfolder in the specified path,
+        and if the subfolder already exists, creates an incremental backup from it.
+
+        :param path:
+            Path to where the backup will be created. If 'remote' is specified, must be an absolute path,
+            otherwise can be relative.
+
+        :param remote:
+            Remote host of the backup location. 'ssh' executable is called via subprocess and therefore remote
+            hosts configured for it are supported (e.g. via .ssh/config file).
+
+        :param prev_backup:
+            Path to the previous backup. Rsync calls will be hard-linked to this path, making the backup
+            incremental and efficient. If this is specified, the automatic folder management is not used.
+
+        :param kwargs:
+            * Executable paths, if not default.
+
+        :return:
+            True if successful and False if unsuccessful.
+        """
+
     def get_info(self, detailed: bool = False) -> dict:
         """Return general information on the storage.

diff --git a/aiida/storage/psql_dos/backend.py b/aiida/storage/psql_dos/backend.py
index 6a9541ea5f..69a9a6de05 100644
--- a/aiida/storage/psql_dos/backend.py
+++ b/aiida/storage/psql_dos/backend.py
@@ -23,6 +23,7 @@
 from aiida.orm.entities import EntityTypes
 from aiida.orm.implementation import BackendEntity, StorageBackend
 from aiida.storage.log import STORAGE_LOGGER
+from aiida.storage.psql_dos import backup_utils
 from aiida.storage.psql_dos.migrator import REPOSITORY_UUID_KEY, PsqlDosMigrator
 from aiida.storage.psql_dos.models import base

@@ -425,67 +426,6 @@ def get_info(self, detailed: bool = False) -> dict:
             results['repository'] = self.get_repository().get_info(detailed)
         return results

-    def _call_rsync(
-        self,
-        args: list,
-        src: pathlib.Path,
-        dest: pathlib.Path,
-        link_dest: Optional[pathlib.Path] = None,
-        remote: Optional[str] = None,
-        src_trailing_slash: bool = False,
-        dest_trailing_slash: bool = False
-    ) -> bool:
-        """Call rsync with specified arguments and handle possible errors & stdout/stderr
-
-        :param link_dest:
-            Path to the hardlinked files location (previous backup).
-
-        :param src_trailing_slash:
-            Add a trailing slash to the source path. This makes rsync copy the contents
-            of the folder instead of the folder itself.
-
-        :param dest_trailing_slash:
-            Add a trailing slash to the destination path. This makes rsync interpret the
-            destination as a folder and create it if it doesn't exists.
-
-        :return:
-            True if successful and False if unsuccessful.
-        """
-        import subprocess
-
-        all_args = args[:]
-        if link_dest:
-            if not remote:
-                # for local paths, use resolve() to get absolute path
-                link_dest_str = str(link_dest.resolve())
-            else:
-                # for remote paths, we require absolute paths anyways
-                link_dest_str = str(link_dest)
-            all_args += [f'--link-dest={link_dest_str}']
-
-        if src_trailing_slash:
-            all_args += [str(src) + '/']
-        else:
-            all_args += [str(src)]
-
-        dest_str = str(dest)
-        if dest_trailing_slash:
-            dest_str += '/'
-
-        if not remote:
-            all_args += [dest_str]
-        else:
-            all_args += [f'{remote}:{dest_str}']
-
-        try:
-            res = subprocess.run(all_args, check=True, capture_output=True)
-            STORAGE_LOGGER.debug(f"stdout: {all_args}\n{res.stdout.decode('utf-8')}")
-            STORAGE_LOGGER.debug(f"stderr: {all_args}\n{res.stderr.decode('utf-8')}")
-        except subprocess.CalledProcessError as exc:
-            STORAGE_LOGGER.error(f'rsync: {exc}')
-            return False
-        return True
-
     def _backup_dos(
         self,
         location: pathlib.Path,
@@ -510,7 +450,7 @@ def _backup_dos(

         # step 1: loose files
         loose_path = container_path / 'loose'
-        success = self._call_rsync(
+        success = backup_utils.call_rsync(
             rsync_args, loose_path, location, remote=remote, link_dest=prev_backup / 'loose' if prev_backup else None
         )
         if not success:
@@ -540,20 +480,22 @@ def _backup_dos(
             return False

         # step 3: transfer the SQLITE database file
-        success = self._call_rsync(rsync_args, sqlite_temp_loc, location, remote=remote, link_dest=prev_backup)
+        success = backup_utils.call_rsync(
+            rsync_args, sqlite_temp_loc, location, remote=remote, link_dest=prev_backup
+        )
         if not success:
             return False

         # step 4: transfer the packed files
         packs_path = container_path / 'packs'
-        success = self._call_rsync(
+        success = backup_utils.call_rsync(
             rsync_args, packs_path, location, remote=remote, link_dest=prev_backup / 'packs' if prev_backup else None
         )
         if not success:
             return False

         # step 5: transfer anything else in the container folder
-        success = self._call_rsync(
+        success = backup_utils.call_rsync(
             rsync_args + [
                 '--exclude',
                 'loose',
@@ -573,26 +515,12 @@ def _backup_dos(

         return True

-    def _run_bash_cmd(self, args: list, remote: Optional[str] = None, shell: bool = False, suppress_log: bool = False):
-        import subprocess
-        all_args = args[:]
-        if remote:
-            all_args = ['ssh', remote] + all_args
-        try:
-            subprocess.run(all_args, check=True, shell=shell)
-        except subprocess.CalledProcessError as exc:
-            if not suppress_log:
-                STORAGE_LOGGER.error(f'{all_args}: {exc}')
-            return False
-        return True
-
-    def backup( # pylint: disable=too-many-locals, too-many-return-statements, too-many-branches, too-many-statements
+    def _backup( # pylint: disable=too-many-locals, too-many-return-statements, too-many-branches, too-many-statements
         self,
         path: pathlib.Path,
         remote: Optional[str] = None,
         prev_backup: Optional[pathlib.Path] = None,
-        pg_dump_exec: str = 'pg_dump',
-        rsync_exec: str = 'rsync'
+        **kwargs
     ) -> bool:
         """Create a backup of the postgres database and disk-objectstore to the provided path.

@@ -608,11 +536,8 @@ def backup( # pylint: disable=too-many-locals, too-many-return-statements, too-m
             Path to the previous backup. Rsync calls will be hard-linked to this path, making the backup
             incremental and efficient.

-        :param pg_dump_exec:
-            Path to the `pg_dump` executable.
-
-        :param rsync_exec:
-            Path to the `rsync` executable.
+        :param kwargs:
+            * Executable paths if not default: 'pg_dump', 'rsync'

         :return:
             True if successful and False if unsuccessful.
@@ -631,11 +556,14 @@ def backup( # pylint: disable=too-many-locals, too-many-return-statements, too-m

         if remote:
             # check if accessible
-            success = self._run_bash_cmd(['exit'], remote=remote)
+            success = backup_utils.run_cmd(['exit'], remote=remote)
             if not success:
                 STORAGE_LOGGER.error(f"Remote '{remote}' is not accessible!")
                 return False
-            STORAGE_LOGGER.info(f"Remote '{remote}' is accessible!")
+            STORAGE_LOGGER.report(f"Remote '{remote}' is accessible!")
+
+        pg_dump_exec = kwargs.get('pg_dump', 'pg_dump')
+        rsync_exec = kwargs.get('rsync', 'rsync')

         # check if the specified executables are found
         for exe in [pg_dump_exec, rsync_exec]:
@@ -648,19 +576,17 @@ def backup( # pylint: disable=too-many-locals, too-many-return-statements, too-m

         cfg = self._profile.storage_config

-        # check that 'path' doesn't exist
-        success = self._run_bash_cmd([f'[ ! -e "{str(path)}" ]'],
-                                     remote=remote,
-                                     shell=remote is None,
-                                     suppress_log=True)
-        if not success:
-            # path exists, check if it's an empty folder
-            success = self._run_bash_cmd([f'[ -d "{str(path)}" ] && [ -z "$(ls -A "{str(path)}")" ]'],
-                                         remote=remote,
-                                         shell=remote is None)
+        path_exists = backup_utils.check_path_exists(path, remote)
+
+        if path_exists:
+            if not backup_utils.check_path_is_empty_folder(path, remote):
+                STORAGE_LOGGER.error(f"The path '{str(path)}' exists and is not an empty folder!")
+                return False
+        else:
+            # path doesn't exist, check if it can be created
+            success = backup_utils.run_cmd(['mkdir', str(path)], remote=remote)
             if not success:
-                # it's not an empty folder, so stop the backup
-                STORAGE_LOGGER.error(f"The path '{str(path)}' exists and is not empty!")
+                STORAGE_LOGGER.error(f"Couldn't access/create '{str(path)}'!")
                 return False

         # check that the AiiDA profile is not locked and request access for the duration of this backup process
@@ -698,7 +624,7 @@ def backup( # pylint: disable=too-many-locals, too-many-return-statements, too-m
             return False

         # step 3: transfer the PostgreSQL database file
-        success = self._call_rsync(
+        success = backup_utils.call_rsync(
             rsync_args, psql_temp_loc, path, link_dest=prev_backup, remote=remote, dest_trailing_slash=True
         )
         if not success:
@@ -717,27 +643,22 @@ def backup( # pylint: disable=too-many-locals, too-many-return-statements, too-m

         # step 6: back up aiida config.json file
         try:
             config = get_config()
-            success = self._call_rsync(rsync_args, pathlib.Path(config.filepath), path, remote=remote)
+            success = backup_utils.call_rsync(rsync_args, pathlib.Path(config.filepath), path, remote=remote)
             if not success:
                 return False
         except (exceptions.MissingConfigurationError, exceptions.ConfigurationError):
-            STORAGE_LOGGER.info('aiida config.json not found!')
+            STORAGE_LOGGER.warning('aiida config.json not found!')

         # step 7: write a file including date that signifies the backup completed successfully
-        success = self._run_bash_cmd(['touch', str(path / f'COMPLETED_{datetime.today().isoformat()}')], remote=remote)
+        success = backup_utils.run_cmd(['touch', str(path / f'COMPLETED_{datetime.today().isoformat()}')],
+                                       remote=remote)
         if not success:
             return False

-        STORAGE_LOGGER.info(f"Success! Backup completed to {f'{remote}:' if remote else ''}{str(path)}")
+        STORAGE_LOGGER.report(f"Success! Backup completed to {f'{remote}:' if remote else ''}{str(path)}")

         return True

-    def backup_auto(
-        self,
-        path: pathlib.Path,
-        remote: Optional[str] = None,
-        pg_dump_exec: str = 'pg_dump',
-        rsync_exec: str = 'rsync'
-    ):
+    def _backup_auto_folders(self, path: pathlib.Path, remote: Optional[str] = None, **kwargs):
         """Create a backup of the AiiDA profile data, managing live and previous backup folders automatically

         The running backup is done to `<path>/live-backup`. When it completes, it is moved to
         `<path>/last-backup`, so that `last-backup` always contains a complete, valid backup.

         :param path:
             Path to where the backup will be created. If 'remote' is specified, must be an absolute path,
             otherwise can be relative.

         :param remote:
             Remote host of the backup location. 'ssh' executable is called via subprocess and therefore remote
             hosts configured for it are supported (e.g. via .ssh/config file).

-        :param pg_dump_exec:
-            Path to the `pg_dump` executable.
-
-        :param rsync_exec:
-            Path to the `rsync` executable.
+        :param kwargs:
+            * Executable paths if not default: 'pg_dump', 'rsync'

         :return:
             True if successful and False if unsuccessful.
         """

         live_folder = path / 'live-backup'
         final_folder = path / 'last-backup'

         # does previous backup exist?
-        prev_exists = self._run_bash_cmd([f'[ -d "{str(final_folder)}" ]'],
-                                         remote=remote,
-                                         shell=remote is None,
-                                         suppress_log=True)
+        prev_exists = backup_utils.check_path_exists(final_folder, remote)

-        success = self.backup(
-            live_folder,
-            remote=remote,
-            prev_backup=final_folder if prev_exists else None,
-            pg_dump_exec=pg_dump_exec,
-            rsync_exec=rsync_exec
-        )
+        success = self._backup(live_folder, remote=remote, prev_backup=final_folder if prev_exists else None, **kwargs)
         if not success:
             return False

         # move live-backup -> last-backup in steps, so that if the process stops at any point, no data is lost
         # step 1: last-backup -> last-backup-old
         if prev_exists:
-            success = self._run_bash_cmd(['mv', str(final_folder), str(final_folder) + '-old'], remote=remote)
+            success = backup_utils.run_cmd(['mv', str(final_folder), str(final_folder) + '-old'], remote=remote)
             if not success:
                 return False

         # step 2: live-backup -> last-backup
-        success = self._run_bash_cmd(['mv', str(live_folder), str(final_folder)], remote=remote)
+        success = backup_utils.run_cmd(['mv', str(live_folder), str(final_folder)], remote=remote)
         if not success:
             return False

         # step 3: remove last-backup-old
         if prev_exists:
-            success = self._run_bash_cmd(['rm', '-rf', str(final_folder) + '-old'], remote=remote)
+            success = backup_utils.run_cmd(['rm', '-rf', str(final_folder) + '-old'], remote=remote)
             if not success:
                 return False

-        STORAGE_LOGGER.info(f"Backup moved from '{str(live_folder)}' to '{str(final_folder)}'.")
+        STORAGE_LOGGER.report(f"Backup moved from '{str(live_folder)}' to '{str(final_folder)}'.")
         return True

+    def backup( # pylint: disable=too-many-locals, too-many-return-statements, too-many-branches, too-many-statements
+        self,
+        path: pathlib.Path,
+        remote: Optional[str] = None,
+        prev_backup: Optional[pathlib.Path] = None,
+        **kwargs
+    ) -> bool:
+        """Create a backup of the postgres database and disk-objectstore.
+
+        By default, automatically manages incremental/delta backup: creates a subfolder in the specified path,
+        and if the subfolder already exists, creates an incremental backup from it.
+
+        :param path:
+            Path to where the backup will be created. If 'remote' is specified, must be an absolute path,
+            otherwise can be relative.
+
+        :param remote:
+            Remote host of the backup location. 'ssh' executable is called via subprocess and therefore remote
+            hosts configured for it are supported (e.g. via .ssh/config file).
+
+        :param prev_backup:
+            Path to the previous backup. Rsync calls will be hard-linked to this path, making the backup
+            incremental and efficient. If this is specified, the automatic folder management is not used.
+
+        :param kwargs:
+            * Executable paths if not default: 'pg_dump', 'rsync'
+
+        :return:
+            True if successful and False if unsuccessful.
+        """
+
+        if prev_backup:
+            success = self._backup(path, remote=remote, prev_backup=prev_backup, **kwargs)
+        else:
+            success = self._backup_auto_folders(path, remote=remote, **kwargs)
+        return success
diff --git a/aiida/storage/psql_dos/backup_utils.py b/aiida/storage/psql_dos/backup_utils.py
new file mode 100644
index 0000000000..4f66f9b330
--- /dev/null
+++ b/aiida/storage/psql_dos/backup_utils.py
@@ -0,0 +1,115 @@
+# -*- coding: utf-8 -*-
+###########################################################################
+# Copyright (c), The AiiDA team. All rights reserved.                     #
+# This file is part of the AiiDA code.                                    #
+#                                                                         #
+# The code is hosted on GitHub at https://github.com/aiidateam/aiida-core #
+# For further information on the license, see the LICENSE.txt file        #
+# For further information please visit http://www.aiida.net               #
+###########################################################################
+# pylint: disable=import-error,no-name-in-module
+"""Utility functions for running the psql_dos backend backups."""
+
+import logging
+import pathlib
+import subprocess
+from typing import Optional
+
+from aiida.storage.log import STORAGE_LOGGER
+
+
+def run_cmd(args: list, remote: Optional[str] = None, check: bool = True, logger: logging.Logger = STORAGE_LOGGER):
+    """Run a command either locally, or on a remote host via ssh.
+
+    The arguments are joined into a single command string that is executed by a shell (bash locally,
+    the login shell of the remote user in the ssh case), so that shell constructs like '&&' and
+    '$(...)' behave the same in both cases.
+    """
+    cmd_str = ' '.join(args)
+    if remote:
+        all_args = ['ssh', remote, cmd_str]
+    else:
+        all_args = ['bash', '-c', cmd_str]
+
+    try:
+        res = subprocess.run(all_args, capture_output=True, text=True, check=check)
+    except subprocess.CalledProcessError as exc:
+        logger.error(exc)
+        return False
+
+    logger.info(f'stdout: {cmd_str}\n{res.stdout}')
+    logger.info(f'stderr: {cmd_str}\n{res.stderr}')
+
+    return res.returncode == 0
+
+
+def check_path_exists(path, remote: Optional[str] = None) -> bool:
+    """Check whether a path exists, either locally or on the remote host."""
+    cmd = ['[', '-e', str(path), ']']
+    return run_cmd(cmd, remote=remote, check=False)
+
+
+def check_path_is_empty_folder(path, remote: Optional[str] = None) -> bool:
+    """Check whether a path is an empty folder, either locally or on the remote host."""
+    cmd = ['[', '-d', str(path), ']', '&&', '[', '-z', f'"$(ls -A "{str(path)}")"', ']']
+    return run_cmd(cmd, remote=remote, check=False)
+
+
+def call_rsync(
+    args: list,
+    src: pathlib.Path,
+    dest: pathlib.Path,
+    link_dest: Optional[pathlib.Path] = None,
+    remote: Optional[str] = None,
+    src_trailing_slash: bool = False,
+    dest_trailing_slash: bool = False,
+    logger: logging.Logger = STORAGE_LOGGER
+) -> bool:
+    """Call rsync with the specified arguments and handle possible errors & stdout/stderr.
+
+    :param link_dest:
+        Path to the hard-linked files location (previous backup).
+
+    :param src_trailing_slash:
+        Add a trailing slash to the source path. This makes rsync copy the contents
+        of the folder instead of the folder itself.
+
+    :param dest_trailing_slash:
+        Add a trailing slash to the destination path. This makes rsync interpret the
+        destination as a folder and create it if it doesn't exist.
+
+    :return:
+        True if successful and False if unsuccessful.
+ """ + + all_args = args[:] + if link_dest: + if not remote: + # for local paths, use resolve() to get absolute path + link_dest_str = str(link_dest.resolve()) + else: + # for remote paths, we require absolute paths anyways + link_dest_str = str(link_dest) + all_args += [f'--link-dest={link_dest_str}'] + + if src_trailing_slash: + all_args += [str(src) + '/'] + else: + all_args += [str(src)] + + dest_str = str(dest) + if dest_trailing_slash: + dest_str += '/' + + if not remote: + all_args += [dest_str] + else: + all_args += [f'{remote}:{dest_str}'] + + try: + res = subprocess.run(all_args, capture_output=True, text=True, check=True) + except subprocess.CalledProcessError as exc: + logger.error(exc) + return False + + logger.info(f'stdout: {all_args}\n{res.stdout}') + logger.info(f'stderr: {all_args}\n{res.stderr}') + + success = not bool(res.returncode) + + return success