diff --git a/HISTORY.md b/HISTORY.md index 0f987baf83..121ee4cb59 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -4,6 +4,7 @@ ## Improvements +* Add wrapping for ls_recursive by @kounelisagis in https://github.com/TileDB-Inc/TileDB-Py/pull/1933 * Migrate away from deprecated TileDB C++ APIs by @kounelisagis in https://github.com/TileDB-Inc/TileDB-Py/pull/1958 * Pybind11 Config should honor prefix for iter by @Shelnutt2 in https://github.com/TileDB-Inc/TileDB-Py/pull/1962 * Fix test skipping by @kounelisagis in https://github.com/TileDB-Inc/TileDB-Py/pull/1957 diff --git a/tiledb/cc/vfs.cc b/tiledb/cc/vfs.cc index c4028d61d0..debf87df1d 100644 --- a/tiledb/cc/vfs.cc +++ b/tiledb/cc/vfs.cc @@ -1,5 +1,7 @@ #include +#include +#include #include #include #include @@ -41,6 +43,34 @@ void init_vfs(py::module &m) { .def("_copy_file", &VFS::copy_file) .def("_ls", &VFS::ls) + .def( + "_ls_recursive", + // 1. the user provides a callback, the user callback is passed to the + // C++ function. Return an empty vector. + // 2. no callback is passed and a default callback is used. The + // default callback just appends each path gathered. Return the paths. + [](VFS &vfs, std::string uri, py::object callback) { + tiledb::Context ctx; + if (callback.is_none()) { + std::vector paths; + tiledb::VFSExperimental::ls_recursive( + ctx, vfs, uri, + [&](const std::string_view path, + uint64_t object_size) -> bool { + paths.push_back(std::string(path)); + return true; + }); + return paths; + } else { + tiledb::VFSExperimental::ls_recursive( + ctx, vfs, uri, + [&](const std::string_view path, + uint64_t object_size) -> bool { + return callback(path, object_size).cast(); + }); + return std::vector(); + } + }) .def("_touch", &VFS::touch); } diff --git a/tiledb/libtiledb.pxd b/tiledb/libtiledb.pxd index d05f4ae7a5..1a3d0d9aed 100644 --- a/tiledb/libtiledb.pxd +++ b/tiledb/libtiledb.pxd @@ -1053,8 +1053,7 @@ cdef extern from "tiledb/tiledb.h": tiledb_ctx_t * ctx, tiledb_vfs_t * vfs, const char * path, - int (*callback)(const char *, void *) noexcept, - void * data) + void * data) nogil int tiledb_vfs_move_file( tiledb_ctx_t* ctx, diff --git a/tiledb/tests/test_vfs.py b/tiledb/tests/test_vfs.py index a6f1862cbc..5559cb3f8b 100644 --- a/tiledb/tests/test_vfs.py +++ b/tiledb/tests/test_vfs.py @@ -270,7 +270,7 @@ def test_ls(self): basepath = self.path("test_vfs_ls") self.vfs.create_dir(basepath) for id in (1, 2, 3): - dir = os.path.join(basepath, "dir" + str(id)) + dir = os.path.join(basepath, f"dir{id}") self.vfs.create_dir(dir) fname = os.path.join(basepath, "file_" + str(id)) with tiledb.FileIO(self.vfs, fname, "wb") as fio: @@ -291,6 +291,106 @@ def test_ls(self): ), ) + @pytest.mark.skipif( + pytest.tiledb_vfs not in ["s3", "file"], reason="Only test on S3 and local" + ) + def test_ls_recursive(self): + # Create a nested directory structure to test recursive listing + basepath = self.path("test_vfs_ls_recursive") + self.vfs.create_dir(basepath) + + dir = os.path.join(basepath, "dir1") + self.vfs.create_dir(dir) + + fname = os.path.join(dir, "file_1") + with tiledb.FileIO(self.vfs, fname, "wb") as fio: + fio.write(b"") + + fname = os.path.join(dir, "file_2") + with tiledb.FileIO(self.vfs, fname, "wb") as fio: + fio.write(b"") + + dir = os.path.join(basepath, "dir2") + self.vfs.create_dir(dir) + + dir2 = os.path.join(dir, "dir2_1") + self.vfs.create_dir(dir2) + + fname = os.path.join(dir2, "file_1") + with tiledb.FileIO(self.vfs, fname, "wb") as fio: + fio.write(b"") + fname = os.path.join(dir2, "file_2") + with tiledb.FileIO(self.vfs, fname, "wb") as fio: + fio.write(b"") + + dir2 = os.path.join(dir, "dir2_2") + self.vfs.create_dir(dir2) + + fname = os.path.join(dir2, "file_1") + with tiledb.FileIO(self.vfs, fname, "wb") as fio: + fio.write(b"") + + expected = [ + "dir1", + "dir1/file_1", + "dir1/file_2", + "dir2", + "dir2/dir2_1", + "dir2/dir2_1/file_1", + "dir2/dir2_1/file_2", + "dir2/dir2_2", + "dir2/dir2_2/file_1", + ] + + self.assertSetEqual( + set(expected), + set( + map( + # # Keep only the paths after the basepath and normalize them to work on all platforms + lambda x: os.path.normpath( + x.split("test_vfs_ls_recursive/")[1] + ).replace("\\", "/"), + self.vfs.ls_recursive(basepath), + ) + ), + ) + + # Check with user provided callback + callback_results = [] + + def callback(uri, _): # we don't use the second argument 'is_dir' + callback_results.append(uri) + return True + + self.vfs.ls_recursive(basepath, callback) + + self.assertSetEqual( + set(expected), + set( + map( + # Keep only the paths after the basepath and normalize them to work on all platforms + lambda x: os.path.normpath( + x.split("test_vfs_ls_recursive/")[1] + ).replace("\\", "/"), + callback_results, + ) + ), + ) + + # Can also be called by calling ls with recursive=True + self.assertSetEqual( + set(expected), + set( + map( + # Keep only the paths after the basepath and normalize them to work on all platforms + lambda x: os.path.normpath( + x.split("test_vfs_ls_recursive/")[1] + ).replace("\\", "/"), + self.vfs.ls(basepath, recursive=True), + ) + ), + ) + def test_dir_size(self): vfs = tiledb.VFS() diff --git a/tiledb/vfs.py b/tiledb/vfs.py index dc5f0f5ebf..456232c782 100644 --- a/tiledb/vfs.py +++ b/tiledb/vfs.py @@ -1,7 +1,7 @@ import io import os from types import TracebackType -from typing import List, Optional, Type, Union +from typing import Callable, List, Optional, Type, Union import numpy as np @@ -287,17 +287,39 @@ def copy_file(self, old_uri: _AnyPath, new_uri: _AnyPath): """ return self._copy_file(_to_path_str(old_uri), _to_path_str(new_uri)) - def ls(self, uri: _AnyPath) -> List[str]: + def ls(self, uri: _AnyPath, recursive: bool = False) -> List[str]: """Retrieves the children in directory `uri`. This function is non-recursive, i.e., it focuses in one level below `uri`. :param str uri: Input URI of the directory + :param bool recursive: If True, recursively list all children in the directory :rtype: List[str] :return: The children in directory `uri` """ + if recursive: + return VFS._ls_recursive(self, _to_path_str(uri), None) + return self._ls(_to_path_str(uri)) + def ls_recursive( + self, uri: _AnyPath, callback: Optional[Callable[[str, int], bool]] = None + ): + """Recursively lists objects at the input URI, invoking the provided callback + on each entry gathered. The callback is passed the data pointer provided + on each invocation and is responsible for writing the collected results + into this structure. If the callback returns True, the walk will continue. + If False, the walk will stop. If an error is thrown, the walk will stop and + the error will be propagated to the caller using std::throw_with_nested. + + Currently only S3 is supported, and the `path` must be a valid S3 URI. + + :param str uri: Input URI of the directory + :param callback: Callback function to invoke on each entry + + """ + return VFS._ls_recursive(self, _to_path_str(uri), callback) + def touch(self, uri: _AnyPath): """Touches a file with the input URI, i.e., creates a new empty file.