Skip to content

Commit

Permalink
Quick cache manager (#3550)
Browse files Browse the repository at this point in the history
* Quick cache manager

PBENCH-1249

On large datasets, our direct tarball extraction method can time out the API
call. Unlike on a long intake, there is no persistent artifact so a retry will
always time out as well. This applies to any `get_inventory` call, and
therefore to the `/inventory`, `/visualize`, and `/compare` APIs; and given
the central importance of those APIs for our Server 1.0 story, that's not an
acceptable failure mode.

This PR mitigates that problem with a "compromise" partial cache manager,
leveraging the existing `unpack` method but adding a file lock to manage
shared access. The idea is that any consumer of tarball contents (including
the indexer) will unpack the entire tarball, but leave a "last reference"
timestamp. A periodic timer service will check the cache unpack timestamps,
and delete the unpack directories which aren't currently locked and which
haven't been referenced for longer than a set time period.
  • Loading branch information
dbutenhof authored Oct 10, 2023
1 parent cc22946 commit 8d830a8
Show file tree
Hide file tree
Showing 11 changed files with 927 additions and 279 deletions.
127 changes: 120 additions & 7 deletions lib/pbench/cli/server/tree_manage.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,111 @@
from datetime import datetime, timedelta, timezone
import errno
from logging import Logger

import click

from pbench.cli import pass_cli_context
from pbench.cli.server import config_setup
from pbench.cli.server.options import common_options
from pbench.common.logger import get_pbench_logger
from pbench.server import BadConfig
from pbench.server.cache_manager import CacheManager
from pbench.server import BadConfig, OperationCode
from pbench.server.cache_manager import CacheManager, LockManager
from pbench.server.database.models.audit import Audit, AuditStatus, AuditType

# Length of time in hours to retain unreferenced cached results data.
# TODO: this could become a configurable setting?
CACHE_LIFETIME = 4.0


def reclaim_cache(tree: CacheManager, logger: Logger, lifetime: float = CACHE_LIFETIME):
    """Reclaim unused caches

    Walk every dataset known to the cache manager and delete the unpacked
    cache of each one whose "last reference" timestamp is older than the
    retention window. A cache is only deleted while holding its lock
    exclusively; if the lock can't be acquired immediately the dataset is
    skipped. Each reclaim is audited on a best-effort basis, and a summary
    of the run is logged at the end.

    Args:
        tree: the cache manager instance
        logger: a Logger object
        lifetime: number of hours to retain unused cache data
    """
    # Anything last referenced before this instant is eligible for reclaim.
    window = datetime.now(timezone.utc) - timedelta(hours=lifetime)
    total_count = 0
    has_cache = 0
    reclaimed = 0
    reclaim_failed = 0
    for tarball in tree.datasets.values():
        total_count += 1
        if not tarball.unpacked:
            continue
        has_cache += 1
        date = datetime.fromtimestamp(tarball.last_ref.stat().st_mtime, timezone.utc)
        if date >= window:
            # Referenced recently enough: keep the cache.
            continue
        error = None
        audit = None
        logger.info(
            "RECLAIM {}: last_ref {:%Y-%m-%d %H:%M:%S} is older than {:%Y-%m-%d %H:%M:%S}",
            tarball.name,
            date,
            window,
        )
        try:
            with LockManager(tarball.lock, exclusive=True, wait=False):
                # Auditing is best effort: failing to create the audit
                # record must not prevent the cache from being reclaimed.
                try:
                    audit = Audit.create(
                        name="reclaim",
                        operation=OperationCode.DELETE,
                        status=AuditStatus.BEGIN,
                        user_name=Audit.BACKGROUND_USER,
                        object_type=AuditType.DATASET,
                        object_id=tarball.resource_id,
                        object_name=tarball.name,
                    )
                except Exception as e:
                    # Logger.warn is a deprecated alias; use Logger.warning.
                    logger.warning(
                        "Unable to audit cache reclaim for {}: '{}'",
                        tarball.name,
                        e,
                    )
                tarball.cache_delete()
                reclaimed += 1
        except OSError as e:
            if e.errno in (errno.EAGAIN, errno.EACCES):
                logger.info(
                    "RECLAIM {}: skipping because cache is locked",
                    tarball.name,
                )
                # If the cache is locked, regardless of age, then
                # the last_ref timestamp is about to be updated,
                # and we skip the dataset this time around.
                continue
            error = e
        except Exception as e:
            error = e
        attributes = {"last_ref": f"{date:%Y-%m-%d %H:%M:%S}"}
        if error:
            reclaim_failed += 1
            logger.error("RECLAIM {} failed with '{}'", tarball.name, error)
            attributes["error"] = str(error)
        if audit:
            # Close out the audit sequence opened by the BEGIN record above.
            Audit.create(
                root=audit,
                status=AuditStatus.FAILURE if error else AuditStatus.SUCCESS,
                attributes=attributes,
            )
    logger.info(
        "RECLAIM summary: {} datasets, {} had cache: {} reclaimed and {} errors",
        total_count,
        has_cache,
        reclaimed,
        reclaim_failed,
    )


def print_tree(tree: CacheManager):
"""Print basic information about the cache
Args:
tree: a cache instance
"""
print(f"Tree anchored at {tree.archive_root}\n")

if len(tree.datasets) == 0 and len(tree.controllers) == 0:
Expand All @@ -18,34 +115,48 @@ def print_tree(tree: CacheManager):
print("Tarballs:")
for tarball in tree.datasets.values():
print(f" {tarball.name}")
if tarball.unpacked:
date = datetime.fromtimestamp(
tarball.last_ref.stat().st_mtime, timezone.utc
)
print(f" Inventory is cached, last referenced {date:%Y-%m-%d %H:%M:%S}")

print("\nControllers:")
for controller in tree.controllers.values():
print(f" Controller {controller.name}:")
for tarball in controller.tarballs.values():
print(f" Tarball {tarball.name}")
if tarball.unpacked:
print(f" Unpacked in {tarball.unpacked}")


@click.command(name="pbench-tree-manager")
@pass_cli_context
@click.option(
"--display", default=False, is_flag=True, help="Display the full tree on completion"
)
@click.option(
"--reclaim",
show_default=True,
is_flag=False,
flag_value=CACHE_LIFETIME,
type=click.FLOAT,
help="Reclaim cached data older than <n> hours",
)
@common_options
def tree_manage(context: object, display: bool):
def tree_manage(context: object, display: bool, reclaim: float):
"""
Discover, display, and manipulate the on-disk representation of controllers
and datasets.
This primarily exposes the CacheManager object hierarchy, and provides a simple
hierarchical display of controllers and datasets.
This primarily exposes the CacheManager object hierarchy, and provides a
hierarchical display of controllers and datasets. This also supports
reclaiming cached dataset files that haven't been referenced recently.
\f
Args:
context: Click context (contains shared `--config` value)
display: Print a simplified representation of the hierarchy
lifetime: Number of hours to retain unused cache before reclaim
reclaim: Reclaim stale cached data
"""
try:
config = config_setup(context)
Expand All @@ -54,6 +165,8 @@ def tree_manage(context: object, display: bool):
cache_m.full_discovery()
if display:
print_tree(cache_m)
if reclaim:
reclaim_cache(cache_m, logger, reclaim)
rv = 0
except Exception as exc:
logger.exception("An error occurred discovering the file tree: {}", exc)
Expand Down
9 changes: 7 additions & 2 deletions lib/pbench/server/api/resources/datasets_inventory.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,12 @@
ParamType,
Schema,
)
from pbench.server.cache_manager import CacheManager, CacheType, TarballNotFound
from pbench.server.cache_manager import (
CacheExtractBadPath,
CacheManager,
CacheType,
TarballNotFound,
)


class DatasetsInventory(ApiBase):
Expand Down Expand Up @@ -63,7 +68,7 @@ def _get(
cache_m = CacheManager(self.config, current_app.logger)
try:
file_info = cache_m.get_inventory(dataset.resource_id, target)
except TarballNotFound as e:
except (TarballNotFound, CacheExtractBadPath) as e:
raise APIAbort(HTTPStatus.NOT_FOUND, str(e))

if file_info["type"] != CacheType.FILE:
Expand Down
Loading

0 comments on commit 8d830a8

Please sign in to comment.