Skip to content

Commit

Permalink
Move vacuum to pure Python (#2067)
Browse files Browse the repository at this point in the history
  • Loading branch information
kounelisagis authored Sep 17, 2024
1 parent 49c50a0 commit 9d04430
Show file tree
Hide file tree
Showing 3 changed files with 65 additions and 74 deletions.
2 changes: 1 addition & 1 deletion tiledb/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,14 +83,14 @@
remove,
save,
schema_like,
vacuum,
walk,
)
from .libtiledb import (
Array,
Ctx,
DenseArrayImpl,
SparseArrayImpl,
vacuum,
)
from .multirange_indexing import EmptyRange
from .object import Object
Expand Down
64 changes: 64 additions & 0 deletions tiledb/highlevel.py
Original file line number Diff line number Diff line change
Expand Up @@ -269,6 +269,70 @@ def consolidate(uri, config=None, ctx=None, fragment_uris=None, timestamp=None):
return arr.consolidate(ctx, config)


def vacuum(uri, config=None, ctx=None, timestamp=None):
    """
    Vacuum underlying array fragments after consolidation.

    :param str uri: URI of array to be vacuumed
    :param config: Override the context configuration for vacuuming.
        Defaults to None, in which case a new ``tiledb.Config()`` is created.
        Note that a config object passed in here is updated in place when
        `timestamp` is given.
    :param ctx: Context. Defaults to `tiledb.default_ctx()`.
    :type ctx: tiledb.Ctx, optional
    :param timestamp: (deprecated) pair ``(start, end)`` restricting the
        vacuum to a time range. Either element may be None to leave that
        bound unset. Must be a tuple (or list) of exactly two elements.
    :raises TypeError: `timestamp` is not a two-element tuple
    :raises: :py:exc:`tiledb.TileDBError`

    The operation of this function is controlled by
    the `"sm.vacuum.mode"` parameter, which accepts the values ``fragments``,
    ``fragment_meta``, and ``array_meta``. Rather than passing the timestamp
    into this function, it may be set by using `"sm.vacuum.timestamp_start"` and
    `"sm.vacuum.timestamp_end"` which take in a time in UNIX seconds. If both
    the config parameters and this function's `timestamp` argument are set,
    the `timestamp` argument will be used.

    **Example:**

    >>> import tiledb, numpy as np
    >>> import tempfile
    >>> path = tempfile.mkdtemp()
    >>> with tiledb.from_numpy(path, np.random.rand(4)) as A:
    ...     pass # make sure to close
    >>> with tiledb.open(path, 'w') as A:
    ...     for i in range(4):
    ...         A[:] = np.ones(4, dtype=np.int64) * i
    >>> paths = tiledb.VFS().ls(path)
    >>> # should be 12 (2 base files + 2*5 fragment+ok files)
    >>> (); len(paths); () # doctest:+ELLIPSIS
    (...)
    >>> () ; tiledb.consolidate(path) ; () # doctest:+ELLIPSIS
    (...)
    >>> tiledb.vacuum(path)
    >>> paths = tiledb.VFS().ls(path)
    >>> # should now be 4 ( base files + 2 fragment+ok files)
    >>> (); len(paths); () # doctest:+ELLIPSIS
    (...)

    """
    ctx = _get_ctx(ctx)
    if config is None:
        config = tiledb.Config()

    if timestamp is not None:
        warnings.warn(
            "Partial vacuuming via timestamp will be deprecrated in "
            "a future release and replaced by passing in fragment URIs.",
            DeprecationWarning,
        )

        # Reject anything that is not a 2-element pair. The previous check
        # joined the two conditions with `and`, so a tuple of the wrong
        # length was never rejected. Two-element lists were accepted before
        # and remain accepted for backward compatibility.
        if not isinstance(timestamp, (tuple, list)) or len(timestamp) != 2:
            raise TypeError("'timestamp' argument expects tuple(start: int, end: int)")

        start, end = timestamp
        # The explicit argument takes precedence over any values already
        # present in the config.
        if start is not None:
            config["sm.vacuum.timestamp_start"] = start
        if end is not None:
            config["sm.vacuum.timestamp_end"] = end

    lt.Array.vacuum(ctx, uri, config)


def schema_like(*args, shape=None, dtype=None, ctx=None, **kwargs):
"""
Return an ArraySchema corresponding to a NumPy-like object or
Expand Down
73 changes: 0 additions & 73 deletions tiledb/libtiledb.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -3156,76 +3156,3 @@ cdef class SparseArrayImpl(Array):
dim_values[dim] = tuple(np.unique(query[dim]))

return dim_values


def vacuum(uri, config=None, ctx=None, timestamp=None):
    """
    Vacuum underlying array fragments after consolidation.

    :param str uri: URI of array to be vacuumed
    :param config: Override the context configuration for vacuuming.
        Defaults to None, inheriting the context parameters. Note that a
        config object passed in here is updated in place when `timestamp`
        is given.
    :param ctx: Context. Defaults to `tiledb.default_ctx()`.
    :type ctx: tiledb.Ctx, optional
    :param timestamp: (deprecated) pair ``(start, end)`` restricting the
        vacuum to a time range. Either element may be None to leave that
        bound unset. Must be a tuple (or list) of exactly two elements.
    :raises TypeError: cannot convert `uri` to unicode string, or
        `timestamp` is not a two-element tuple
    :raises: :py:exc:`tiledb.TileDBError`

    The operation of this function is controlled by
    the `"sm.vacuum.mode"` parameter, which accepts the values ``fragments``,
    ``fragment_meta``, and ``array_meta``. Rather than passing the timestamp
    into this function, it may be set by using `"sm.vacuum.timestamp_start"` and
    `"sm.vacuum.timestamp_end"` which take in a time in UNIX seconds. If both
    the config parameters and this function's `timestamp` argument are set,
    the `timestamp` argument will be used.

    **Example:**

    >>> import tiledb, numpy as np
    >>> import tempfile
    >>> path = tempfile.mkdtemp()
    >>> with tiledb.from_numpy(path, np.random.rand(4)) as A:
    ...     pass # make sure to close
    >>> with tiledb.open(path, 'w') as A:
    ...     for i in range(4):
    ...         A[:] = np.ones(4, dtype=np.int64) * i
    >>> paths = tiledb.VFS().ls(path)
    >>> # should be 12 (2 base files + 2*5 fragment+ok files)
    >>> (); len(paths); () # doctest:+ELLIPSIS
    (...)
    >>> () ; tiledb.consolidate(path) ; () # doctest:+ELLIPSIS
    (...)
    >>> tiledb.vacuum(path)
    >>> paths = tiledb.VFS().ls(path)
    >>> # should now be 4 ( base files + 2 fragment+ok files)
    >>> (); len(paths); () # doctest:+ELLIPSIS
    (...)

    """
    cdef tiledb_ctx_t* ctx_ptr = NULL
    cdef tiledb_config_t* config_ptr = NULL

    if not ctx:
        ctx = default_ctx()

    if timestamp is not None:
        warnings.warn("Partial vacuuming via timestamp will be deprecrated in "
                      "a future release and replaced by passing in fragment URIs.",
                      DeprecationWarning)

        # A config object is only needed here to carry the timestamp bounds;
        # without a timestamp a None config is passed through as NULL below.
        if config is None:
            config = Config()

        # Reject anything that is not a 2-element pair. The previous check
        # joined the two conditions with `and`, so a tuple of the wrong
        # length was never rejected. Two-element lists were accepted before
        # and remain accepted for backward compatibility.
        if not isinstance(timestamp, (tuple, list)) or len(timestamp) != 2:
            raise TypeError("'timestamp' argument expects tuple(start: int, end: int)")

        # The explicit argument takes precedence over any values already
        # present in the config.
        if timestamp[0] is not None:
            config["sm.vacuum.timestamp_start"] = timestamp[0]
        if timestamp[1] is not None:
            config["sm.vacuum.timestamp_end"] = timestamp[1]

    ctx_ptr = safe_ctx_ptr(ctx)
    config_ptr = <tiledb_config_t*>PyCapsule_GetPointer(
        config.__capsule__(), "config") if config is not None else NULL
    cdef bytes buri = unicode_path(uri)
    cdef const char* uri_ptr = PyBytes_AS_STRING(buri)

    check_error(ctx, tiledb_array_vacuum(ctx_ptr, uri_ptr, config_ptr))

0 comments on commit 9d04430

Please sign in to comment.