Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ORM: Add get_size_on_disk method to RemoteData #6584

Open
wants to merge 18 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 41 additions & 0 deletions src/aiida/cmdline/commands/cmd_data/cmd_remote.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
"""`verdi data core.remote` command."""

import stat
from pathlib import Path

import click

Expand Down Expand Up @@ -87,3 +88,43 @@
"""Show information for a RemoteData object."""
echo.echo(f'- Remote computer name: {datum.computer.label}')
echo.echo(f'- Remote folder full path: {datum.get_remote_path()}')


@remote.command('size')
@arguments.NODE()
@click.option(
    '-m',
    '--method',
    type=click.STRING,
    default='du',
    help='The method that should be used to evaluate the size (either ``du`` or ``stat``).',
)
@click.option(
    '-p',
    '--path',
    type=click.Path(),
    default=None,
    help='Relative path of the object of the ``RemoteData`` node for which the size should be evaluated.',
)
@click.option(
    '-b',
    '--bytes',
    'return_bytes',
    type=bool,
    is_flag=True,
    default=False,
    help='Return the size in bytes instead of a human-readable format.',
)
def remote_size(node, method, path, return_bytes):
    """Obtain the total size of a file or directory at a given path that is stored via a ``RemoteData`` object.

    :param node: The ``RemoteData`` node whose remote folder should be inspected.
    :param method: Requested method for evaluating the size (``du`` or ``stat``).
    :param path: Optional path relative to the remote folder of ``node``; the whole folder is used if ``None``.
    :param return_bytes: If set, print the size as a number of bytes instead of a human-readable string.
    """
    try:
        # The method that was actually used may differ from the requested `method` (``get_size_on_disk`` reports it
        # back), so it is bound to a separate variable.
        total_size, used_method = node.get_size_on_disk(relpath=path, method=method, return_bytes=return_bytes)
        remote_path = Path(node.get_remote_path())
        full_path = remote_path / path if path is not None else remote_path
        echo.echo_success(
            f'Estimated total size of path `{full_path}` on the Computer '
            f'<{node.computer.label}> obtained via `{used_method}`: {total_size}'
        )
    except (OSError, NotImplementedError) as exc:
        # `FileNotFoundError` is a subclass of `OSError`, so it is covered here as well.
        echo.echo_critical(str(exc))

Check warning on line 130 in src/aiida/cmdline/commands/cmd_data/cmd_remote.py

View check run for this annotation

Codecov / codecov/patch

src/aiida/cmdline/commands/cmd_data/cmd_remote.py#L129-L130

Added lines #L129 - L130 were not covered by tests
31 changes: 31 additions & 0 deletions src/aiida/common/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -572,3 +572,34 @@

self.dtobj = dtobj
self.precision = precision


def format_directory_size(size_in_bytes: int) -> str:
    """Render a byte count as a human-readable string with a binary (base-1024) unit.

    The value is divided by 1024 until it drops below 1024 or the largest known unit (``PB``) is reached, and is
    then formatted with two decimal places followed by the unit.

    :param size_in_bytes: Size in bytes.
    :raises ValueError: If the size is negative.
    :return: Human-readable size string with a unit (e.g., "1.23 KB", "5.67 MB").

    Example:
        >>> format_directory_size(123456789)
        '117.74 MB'
    """
    if size_in_bytes < 0:
        raise ValueError('Size cannot be negative.')

    units = ('B', 'KB', 'MB', 'GB', 'TB', 'PB')
    base = 1024  # 1 KB = 1024 B

    value = size_in_bytes
    unit = units[0]
    for unit in units:
        # Stop as soon as the value fits the current unit, or when there is no larger unit left.
        if value < base or unit == units[-1]:
            break
        value /= base

    return f'{value:.2f} {unit}'
164 changes: 161 additions & 3 deletions src/aiida/orm/nodes/data/remote/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,13 +8,20 @@
###########################################################################
"""Data plugin that models a folder on a remote computer."""

from __future__ import annotations

import logging
import os
from pathlib import Path

from aiida.orm import AuthInfo
from aiida.orm.fields import add_field
from aiida.transports import Transport

from ..data import Data

_logger = logging.getLogger(__name__)

__all__ = ('RemoteData',)


Expand Down Expand Up @@ -96,14 +103,15 @@
full_path = os.path.join(self.get_remote_path(), relpath)
if not transport.isdir(full_path):
raise OSError(
f'The required remote folder {full_path} on {self.computer.label} does not exist, is not a '
f'The required remote path {full_path} on {self.computer.label} does not exist, is not a '
'directory or has been deleted.'
)

try:
return transport.listdir(full_path)
except OSError as exception:
if exception.errno in (2, 20): # directory not existing or not a directory
if exception.errno in (2, 20):

Check warning on line 113 in src/aiida/orm/nodes/data/remote/base.py

View check run for this annotation

Codecov / codecov/patch

src/aiida/orm/nodes/data/remote/base.py#L113

Added line #L113 was not covered by tests
# directory not existing or not a directory
exc = OSError(
f'The required remote folder {full_path} on {self.computer.label} does not exist, is not a '
'directory or has been deleted.'
Expand Down Expand Up @@ -132,7 +140,8 @@
try:
return transport.listdir_withattributes(full_path)
except OSError as exception:
if exception.errno in (2, 20): # directory not existing or not a directory
if exception.errno in (2, 20):

Check warning on line 143 in src/aiida/orm/nodes/data/remote/base.py

View check run for this annotation

Codecov / codecov/patch

src/aiida/orm/nodes/data/remote/base.py#L143

Added line #L143 was not covered by tests
# directory not existing or not a directory
exc = OSError(
f'The required remote folder {full_path} on {self.computer.label} does not exist, is not a '
'directory or has been deleted.'
Expand Down Expand Up @@ -185,3 +194,152 @@

def get_authinfo(self):
    """Return the ``AuthInfo`` entry matching this node's computer and user.

    The lookup is performed on the ``AuthInfo`` collection of this node's backend.
    """
    collection = AuthInfo.get_collection(self.backend)
    return collection.get(dbcomputer=self.computer, aiidauser=self.user)

def get_size_on_disk(
    self,
    relpath: Path | None = None,
    method: str = 'du',
    return_bytes: bool = False,
) -> tuple[int | str, str]:
    """Connect to the remote Computer of the ``RemoteData`` object and return the total size of a file or
    directory at the given ``relpath``, together with the method that was used to obtain it.

    If ``du`` is requested but cannot be executed successfully, ``stat`` is used as a fallback, which is why the
    method that was actually used is returned alongside the size.

    :param relpath: File or directory path for which the total size should be returned, relative to
        ``self.get_remote_path()``. Defaults to the remote folder itself.
    :param method: Method to be used to evaluate the directory/file size (either ``du`` or ``stat``).
    :param return_bytes: If ``True``, return the size as an integer number of bytes, otherwise as a
        human-readable string.

    :raises FileNotFoundError: If file or directory does not exist anymore on the remote ``Computer``.
    :raises NotImplementedError: If a method other than ``du`` or ``stat`` is selected.
    :raises OSError: If the size could not be evaluated with ``stat`` either.

    :return: Tuple of the total size of the given file or directory and the name of the method used
        (``'du'`` or ``'stat'``).
    """

    from aiida.common.utils import format_directory_size

    if relpath is None:
        relpath = Path('.')

    authinfo = self.get_authinfo()
    full_path = Path(self.get_remote_path()) / relpath
    computer_label = self.computer.label if self.computer is not None else ''

    with authinfo.get_transport() as transport:
        if not transport.path_exists(str(full_path)):
            raise FileNotFoundError(
                f'The required remote path {full_path} on Computer <{computer_label}> does not exist.'
            )

        if method not in ('du', 'stat'):
            raise NotImplementedError(
                f'Specified method `{method}` for evaluating the size on disk not implemented.'
            )

        if method == 'du':
            try:
                total_size = self._get_size_on_disk_du(full_path, transport)
            except (RuntimeError, NotImplementedError):
                # NotImplementedError captures the fact that, e.g., FirecREST does not allow for
                # `exec_command_wait`. In either case, fall through to the `stat` based evaluation below.
                _logger.warning(
                    'Problem executing `du` command. Will return total file size based on `stat` as fallback. '
                )
            else:
                _logger.report('Obtained size on the remote using `du`.')
                if return_bytes:
                    return total_size, method
                return format_directory_size(size_in_bytes=total_size), method

        # Reached either because `stat` was requested explicitly, or because `du` failed.
        try:
            total_size = self._get_size_on_disk_stat(full_path, transport)
        except OSError:
            # This should typically not even be reached, as the OSError occurs if the path is not a directory or
            # does not exist. Though, we check for its existence already in the beginning of this method.
            _logger.critical('Could not evaluate directory size using either `du` or `stat`.')
            raise
        else:
            _logger.report('Obtained size on the remote using `stat`.')
            _logger.warning(
                'Take the result with a grain of salt, as `stat` returns the apparent size of files, '
                'not their actual disk usage.'
            )
            if return_bytes:
                return total_size, 'stat'
            return format_directory_size(size_in_bytes=total_size), 'stat'

def _get_size_on_disk_du(self, full_path: Path, transport: Transport) -> int:
"""Returns the total size of a file/directory at the given ``full_path`` on the remote Computer in bytes using
the ``du`` command.

:param full_path: Full path of file or directory for which the size should be evaluated.
:param transport: Open transport instance.

:raises NotImplementedError: When ``exec_command_wait`` is not implemented, e.g., for the FirecREST plugin.
:raises RuntimeError: When ``du`` command cannot be successfully executed.

:return: Total size of the file/directory in bytes (including all its contents).
"""

try:
retval, stdout, stderr = transport.exec_command_wait(f'du -s --bytes {full_path}')
except NotImplementedError as exc:
raise NotImplementedError('`exec_command_wait` not implemented for the current transport plugin.') from exc

if stderr or retval != 0:
raise RuntimeError(f'Error executing `du` command: {stderr}')
else:
total_size: int = int(stdout.split('\t')[0])
return total_size

def _get_size_on_disk_stat(self, full_path: Path, transport: Transport) -> int:
"""Returns the total size of a file/directory at the given ``full_path`` on the remote Computer in bytes using
the ``stat`` command.

Connects to the remote folder and returns the total size of all files in the directory in bytes using ``stat``.
Note that `stat` returns the apparent file size, not actual disk usage. Thus, even if a file is only 1 byte, on
disk, it still occupies one full disk block size. As such, getting accurate measures of the total expected size
on disk when retrieving a ``RemoteData`` is not straightforward with ``stat``, as one would need to consider the
occupied block sizes for each file, as well as repository metadata. Therefore, this function only serves as a
fallback in the absence of the ``du`` command, and the returned estimate is expected to be smaller than the size
on disk that is actually occupied. Further note that the `Transport.get_attribute` method that is
eventually being called on each file, calls `lstat`, which is equivalent to ``os.stat(follow_symlinks=False)``.

:param full_path: Full path of file or directory of which the size should be evaluated.
:param transport: Open transport instance.

:raises OSError: When object at ``full_path`` doesn't exist.

:return: Total size of the file/directory in bytes (including all its contents).
"""

def _get_size_on_disk_stat_recursive(full_path: Path, transport: Transport):
"""Helper function for recursive directory traversal."""

total_size = 0
contents = self.listdir_withattributes(full_path)
GeigerJ2 marked this conversation as resolved.
Show resolved Hide resolved

for item in contents:
item_path = full_path / item['name']
# Add size of current item (file or directory metadata)
total_size += item['attributes']['st_size']

# If it's a directory, recursively get size of contents
if item['isdir']:
total_size += _get_size_on_disk_stat_recursive(item_path, transport)

return total_size

if transport.isfile(path=str(full_path)):
return transport.get_attribute(str(full_path))['st_size']

Check warning on line 338 in src/aiida/orm/nodes/data/remote/base.py

View check run for this annotation

Codecov / codecov/patch

src/aiida/orm/nodes/data/remote/base.py#L338

Added line #L338 was not covered by tests

try:
GeigerJ2 marked this conversation as resolved.
Show resolved Hide resolved
return _get_size_on_disk_stat_recursive(full_path, transport)

except OSError:
# Not a directory or not existing anymore. Exception is captured outside in `get_size_on_disk`.
raise
Loading
Loading