Skip to content

Commit

Permalink
Apply changes from code review.
Browse files Browse the repository at this point in the history
  • Loading branch information
GeigerJ2 committed Dec 12, 2024
1 parent a0047ed commit f0bb1f6
Show file tree
Hide file tree
Showing 3 changed files with 68 additions and 73 deletions.
8 changes: 4 additions & 4 deletions src/aiida/cmdline/commands/cmd_data/cmd_remote.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@ def remote_show(datum):
'--method',
type=click.STRING,
default='du',
help='The method that should be used to evaluate the size (either ``du`` or ``lstat``.)',
help='The method that should be used to evaluate the size (either ``du`` or ``stat``.)',
)
@click.option(
'-p',
Expand All @@ -116,14 +116,14 @@ def remote_show(datum):
help='Return the size in bytes or human-readable format?',
)
def remote_size(node, method, path, return_bytes):
"""Obtain the total size of a file or directory at a given path that is stored as a ``RemoteData`` object."""
"""Obtain the total size of a file or directory at a given path that is stored via a ``RemoteData`` object."""
try:
# `method` might change, if `du` fails, so the variable is reassigned.
total_size, method = node.get_size_on_disk(relpath=path, method=method, return_bytes=return_bytes)
remote_path = Path(node.get_remote_path())
full_path = remote_path / path if path is not None else remote_path
echo.echo(
f'Estimated total size of file/directory `{full_path}` on the Computer '
echo.echo_success(
f'Estimated total size of path `{full_path}` on the Computer '
f'<{node.computer.label}> obtained via `{method}`: {total_size}'
)
except (OSError, FileNotFoundError, NotImplementedError) as exc:
Expand Down
91 changes: 43 additions & 48 deletions src/aiida/orm/nodes/data/remote/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,7 @@ def listdir(self, relpath='.'):
full_path = os.path.join(self.get_remote_path(), relpath)
if not transport.isdir(full_path):
raise OSError(
f'The required remote folder {full_path} on {self.computer.label} does not exist, is not a '
f'The required remote path {full_path} on {self.computer.label} does not exist, is not a '
'directory or has been deleted.'
)

Expand Down Expand Up @@ -206,20 +206,21 @@ def get_size_on_disk(
:param relpath: File or directory path for which the total size should be returned, relative to
``self.get_remote_path()``.
:param method: Method to be used to evaluate the directory/file size (either ``du`` or ``lstat``).
:param method: Method to be used to evaluate the directory/file size (either ``du`` or ``stat``).
:param return_bytes: Return the total byte size is int, or in human-readable format.
:raises FileNotFoundError: If file or directory does not exist anymore on the remote ``Computer``.
:raises NotImplementedError: If a method other than ``du`` or ``lstat`` is selected.
:raises NotImplementedError: If a method other than ``du`` or ``stat`` is selected.
:return: Total size of given file or directory.
"""

from aiida.common.utils import format_directory_size

total_size = -1

# Initialize it here and only return if set. Avoid repeating the code in the `else` of the try-except-else
# blocks for each method
total_size: int | None = None
if relpath is None:
relpath = Path('.')

Expand All @@ -234,50 +235,47 @@ def get_size_on_disk(
)
raise FileNotFoundError(exc_message)

if method not in ('du', 'lstat'):
if method not in ('du', 'stat'):
raise NotImplementedError(
f'Specified method `{method}` for evaluating the size on disk not implemented.'
)

if method == 'du':
try:
total_size: int = self._get_size_on_disk_du(full_path, transport)
_logger.report('Obtained size on the remote using `du`.')
if return_bytes:
return total_size, method
else:
return format_directory_size(size_in_bytes=total_size), method

except (RuntimeError, NotImplementedError):
# NotImplementedError captures the fact that, e.g., FirecREST does not allow for `exec_command_wait`
lstat_warn = (
'Problem executing `du` command. Will return total file size based on `lstat`. '
'Take the result with a grain of salt, as `lstat` does not consider the file system block '
'size, but instead returns the true size of the file contents in bytes, which is usually '
'smaller than the actual space requirements on disk.'
stat_warn = (
'Problem executing `du` command. Will return total file size based on `stat` as fallback. '
)

_logger.warning(lstat_warn)
method = 'lstat'

else:
_logger.report('Obtained size on the remote using `du`.')
_logger.warning(stat_warn)

# No elif here, but another if, to allow that the method is internally changed to `lstat`, if `du` fails.
if method == 'lstat':
if method == 'stat' or total_size < 0:
try:
total_size: int = self._get_size_on_disk_lstat(full_path, transport)
total_size: int = self._get_size_on_disk_stat(full_path, transport)
_logger.report('Obtained size on the remote using `stat`.')
_logger.warning(
'Take the result with a grain of salt, as `stat` returns the apparent size of files, '
'not their actual disk usage.'
)
if return_bytes:
return total_size, 'stat'
else:
return format_directory_size(size_in_bytes=total_size), 'stat'

# This should typically not even be reached, as the OSError occours if the path is not a directory or
# does not exist. Though, we check for its existence already in the beginning of this method.
except OSError:
_logger.critical('Could not evaluate directory size using either `du` or `lstat`.')
_logger.critical('Could not evaluate directory size using either `du` or `stat`.')
raise

else:
_logger.report('Obtained size on the remote using `lstat`.')

if total_size is not None:
if return_bytes:
return total_size, method
else:
return format_directory_size(size_in_bytes=total_size), method

def _get_size_on_disk_du(self, full_path: Path, transport: Transport) -> int:
"""Returns the total size of a file/directory at the given ``full_path`` on the remote Computer in bytes using
the ``du`` command.
Expand All @@ -293,23 +291,23 @@ def _get_size_on_disk_du(self, full_path: Path, transport: Transport) -> int:

try:
retval, stdout, stderr = transport.exec_command_wait(f'du -s --bytes {full_path}')
if not stderr and retval == 0:
total_size: int = int(stdout.split('\t')[0])
return total_size
else:
raise RuntimeError(f'Error executing `du` command: {stderr}')

except NotImplementedError as exc:
raise NotImplementedError('`exec_command_wait` not implemented for the current transport plugin.') from exc

def _get_size_on_disk_lstat(self, full_path: Path, transport: Transport) -> int:
if stderr or retval != 0:
raise RuntimeError(f'Error executing `du` command: {stderr}')
else:
total_size: int = int(stdout.split('\t')[0])
return total_size

def _get_size_on_disk_stat(self, full_path: Path, transport: Transport) -> int:
"""Returns the total size of a file/directory at the given ``full_path`` on the remote Computer in bytes using
the ``lstat`` command.
the ``stat`` command.
Connects to the remote folder and returns the total size of all files in the directory in bytes using ``lstat``.
Connects to the remote folder and returns the total size of all files in the directory in bytes using ``stat``.
Note that even if a file is only 1 byte, on disk, it still occupies one full disk block size. As such, getting
accurate measures of the total expected size on disk when retrieving a ``RemoteData`` is not straightforward
with ``lstat``, as one would need to consider the occupied block sizes for each file, as well as repository
with ``stat``, as one would need to consider the occupied block sizes for each file, as well as repository
metadata. Thus, this function only serves as a fallback in the absence of the ``du`` command, and the returned
estimate is expected to be smaller than the size on disk that is actually occupied.
Expand All @@ -321,7 +319,7 @@ def _get_size_on_disk_lstat(self, full_path: Path, transport: Transport) -> int:
:return: Total size of the file/directory in bytes (including all its contents).
"""

def _get_size_on_disk_lstat_recursive(full_path: Path, transport: Transport):
def _get_size_on_disk_stat_recursive(full_path: Path, transport: Transport):
"""Helper function for recursive directory traversal."""

total_size = 0
Expand All @@ -334,19 +332,16 @@ def _get_size_on_disk_lstat_recursive(full_path: Path, transport: Transport):

# If it's a directory, recursively get size of contents
if item['isdir']:
total_size += _get_size_on_disk_lstat_recursive(item_path, transport)
total_size += _get_size_on_disk_stat_recursive(item_path, transport)

return total_size

# `RemoteData.listdir_withattributes` only works for directories
# Here, also allow returning the size of a file
if transport.isfile(path=str(full_path)):
return transport.get_attribute(str(full_path))['st_size']

else:
try:
return _get_size_on_disk_lstat_recursive(full_path, transport)
try:
return _get_size_on_disk_stat_recursive(full_path, transport)

except OSError:
# Not a directory or not existing anymore. Exception is captured outside in `get_size_on_disk`.
raise
except OSError:
# Not a directory or not existing anymore. Exception is captured outside in `get_size_on_disk`.
raise
42 changes: 21 additions & 21 deletions tests/orm/nodes/data/test_remote.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,8 +68,8 @@ def test_clean(request, fixture):
(
(('du', False), ('4.01 KB', 'du')),
(('du', True), (4108, 'du')),
(('lstat', False), ('12.00 B', 'lstat')),
(('lstat', True), (12, 'lstat')),
(('stat', False), ('12.00 B', 'stat')),
(('stat', True), (12, 'stat')),
),
)
def test_get_size_on_disk_params(request, fixture, setup, results):
Expand All @@ -84,11 +84,11 @@ def test_get_size_on_disk_params(request, fixture, setup, results):
@pytest.mark.parametrize(
'num_char, sizes',
(
(1, {'du': 4097, 'lstat': 1, 'human': '4.00 KB'}),
(10, {'du': 4106, 'lstat': 10, 'human': '4.01 KB'}),
(100, {'du': 4196, 'lstat': 100, 'human': '4.10 KB'}),
(1000, {'du': 5096, 'lstat': 1000, 'human': '4.98 KB'}),
(int(1e6), {'du': 1004096, 'lstat': int(1e6), 'human': '980.56 KB'}),
(1, {'du': 4097, 'stat': 1, 'human': '4.00 KB'}),
(10, {'du': 4106, 'stat': 10, 'human': '4.01 KB'}),
(100, {'du': 4196, 'stat': 100, 'human': '4.10 KB'}),
(1000, {'du': 5096, 'stat': 1000, 'human': '4.98 KB'}),
(int(1e6), {'du': 1004096, 'stat': int(1e6), 'human': '980.56 KB'}),
),
)
@pytest.mark.parametrize('fixture', ['remote_data_local', 'remote_data_ssh'])
Expand All @@ -106,23 +106,23 @@ def test_get_size_on_disk_sizes(tmp_path, num_char, sizes, request, fixture):

with authinfo.get_transport() as transport:
size_on_disk_du = remote_data._get_size_on_disk_du(transport=transport, full_path=full_path)
size_on_disk_lstat = remote_data._get_size_on_disk_lstat(transport=transport, full_path=full_path)
size_on_disk_stat = remote_data._get_size_on_disk_stat(transport=transport, full_path=full_path)
size_on_disk_human, _ = remote_data.get_size_on_disk()

assert size_on_disk_du == sizes['du']
assert size_on_disk_lstat == sizes['lstat']
assert size_on_disk_stat == sizes['stat']
assert size_on_disk_human == sizes['human']


@pytest.mark.parametrize(
'num_char, relpath, sizes',
(
(1, '.', {'du': 12291, 'lstat': 8195, 'human': '12.00 KB'}),
(100, '.', {'du': 12588, 'lstat': 8492, 'human': '12.29 KB'}),
(int(1e6), '.', {'du': 3012288, 'lstat': 3008192, 'human': '2.87 MB'}),
(1, 'subdir1', {'du': 8194, 'lstat': 4098, 'human': '8.00 KB'}),
(100, 'subdir1', {'du': 8392, 'lstat': 4296, 'human': '8.20 KB'}),
(int(1e6), 'subdir1', {'du': 2008192, 'lstat': 2004096, 'human': '1.92 MB'}),
(1, '.', {'du': 12291, 'stat': 8195, 'human': '12.00 KB'}),
(100, '.', {'du': 12588, 'stat': 8492, 'human': '12.29 KB'}),
(int(1e6), '.', {'du': 3012288, 'stat': 3008192, 'human': '2.87 MB'}),
(1, 'subdir1', {'du': 8194, 'stat': 4098, 'human': '8.00 KB'}),
(100, 'subdir1', {'du': 8392, 'stat': 4296, 'human': '8.20 KB'}),
(int(1e6), 'subdir1', {'du': 2008192, 'stat': 2004096, 'human': '1.92 MB'}),
),
)
def test_get_size_on_disk_nested(aiida_localhost, tmp_path, num_char, relpath, sizes):
Expand All @@ -149,12 +149,12 @@ def test_get_size_on_disk_nested(aiida_localhost, tmp_path, num_char, relpath, s

with authinfo.get_transport() as transport:
size_on_disk_du = remote_data._get_size_on_disk_du(transport=transport, full_path=full_path)
size_on_disk_lstat = remote_data._get_size_on_disk_lstat(transport=transport, full_path=full_path)
size_on_disk_stat = remote_data._get_size_on_disk_stat(transport=transport, full_path=full_path)

size_on_disk_human, _ = remote_data.get_size_on_disk(relpath=relpath)

assert size_on_disk_du == sizes['du']
assert size_on_disk_lstat == sizes['lstat']
assert size_on_disk_stat == sizes['stat']
assert size_on_disk_human == sizes['human']


Expand Down Expand Up @@ -206,8 +206,8 @@ def mock_exec_command_wait(command):


@pytest.mark.parametrize('fixture', ['remote_data_local', 'remote_data_ssh'])
def test_get_size_on_disk_lstat(request, fixture):
"""Test the :meth:`aiida.orm.nodes.data.remote.base.RemoteData._get_size_on_disk_lstat` private method."""
def test_get_size_on_disk_stat(request, fixture):
"""Test the :meth:`aiida.orm.nodes.data.remote.base.RemoteData._get_size_on_disk_stat` private method."""
# No additional parametrization here, as already done in `test_get_size_on_disk_sizes`.

remote_data = request.getfixturevalue(fixture)
Expand All @@ -216,9 +216,9 @@ def test_get_size_on_disk_lstat(request, fixture):
full_path = Path(remote_data.get_remote_path())

with authinfo.get_transport() as transport:
size_on_disk = remote_data._get_size_on_disk_lstat(transport=transport, full_path=full_path)
size_on_disk = remote_data._get_size_on_disk_stat(transport=transport, full_path=full_path)
assert size_on_disk == 12

# Raises OSError for non-existent directory
with pytest.raises(OSError, match='The required remote folder.*'):
remote_data._get_size_on_disk_lstat(transport=transport, full_path=full_path / 'non-existent')
remote_data._get_size_on_disk_stat(transport=transport, full_path=full_path / 'non-existent')

0 comments on commit f0bb1f6

Please sign in to comment.