Skip to content

Commit

Permalink
Revert "Revert 1933 agis/sc 43316/add wrapping for ls recursive (#1964)"
Browse files Browse the repository at this point in the history
This reverts commit 75e32f9.
  • Loading branch information
kounelisagis authored May 15, 2024
1 parent 997b328 commit 6e76410
Show file tree
Hide file tree
Showing 5 changed files with 157 additions and 5 deletions.
1 change: 1 addition & 0 deletions HISTORY.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

## Improvements

* Add wrapping for ls_recursive by @kounelisagis in https://github.com/TileDB-Inc/TileDB-Py/pull/1933
* Migrate away from deprecated TileDB C++ APIs by @kounelisagis in https://github.com/TileDB-Inc/TileDB-Py/pull/1958
* Pybind11 Config should honor prefix for iter by @Shelnutt2 in https://github.com/TileDB-Inc/TileDB-Py/pull/1962
* Fix test skipping by @kounelisagis in https://github.com/TileDB-Inc/TileDB-Py/pull/1957
Expand Down
30 changes: 30 additions & 0 deletions tiledb/cc/vfs.cc
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
#include <tiledb/tiledb>
#include <tiledb/tiledb_experimental>

#include <pybind11/functional.h>
#include <pybind11/numpy.h>
#include <pybind11/pybind11.h>
#include <pybind11/pytypes.h>
Expand Down Expand Up @@ -41,6 +43,34 @@ void init_vfs(py::module &m) {
.def("_copy_file", &VFS::copy_file)

.def("_ls", &VFS::ls)
.def(
"_ls_recursive",
// 1. the user provides a callback, the user callback is passed to the
// C++ function. Return an empty vector.
// 2. no callback is passed and a default callback is used. The
// default callback just appends each path gathered. Return the paths.
[](VFS &vfs, std::string uri, py::object callback) {
tiledb::Context ctx;
if (callback.is_none()) {
std::vector<std::string> paths;
tiledb::VFSExperimental::ls_recursive(
ctx, vfs, uri,
[&](const std::string_view path,
uint64_t object_size) -> bool {
paths.push_back(std::string(path));
return true;
});
return paths;
} else {
tiledb::VFSExperimental::ls_recursive(
ctx, vfs, uri,
[&](const std::string_view path,
uint64_t object_size) -> bool {
return callback(path, object_size).cast<bool>();
});
return std::vector<std::string>();
}
})
.def("_touch", &VFS::touch);
}

Expand Down
3 changes: 1 addition & 2 deletions tiledb/libtiledb.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -1053,8 +1053,7 @@ cdef extern from "tiledb/tiledb.h":
tiledb_ctx_t * ctx,
tiledb_vfs_t * vfs,
const char * path,
int (*callback)(const char *, void *) noexcept,
void * data)
void * data) nogil

int tiledb_vfs_move_file(
tiledb_ctx_t* ctx,
Expand Down
102 changes: 101 additions & 1 deletion tiledb/tests/test_vfs.py
Original file line number Diff line number Diff line change
Expand Up @@ -270,7 +270,7 @@ def test_ls(self):
basepath = self.path("test_vfs_ls")
self.vfs.create_dir(basepath)
for id in (1, 2, 3):
dir = os.path.join(basepath, "dir" + str(id))
dir = os.path.join(basepath, f"dir{id}")
self.vfs.create_dir(dir)
fname = os.path.join(basepath, "file_" + str(id))
with tiledb.FileIO(self.vfs, fname, "wb") as fio:
Expand All @@ -291,6 +291,106 @@ def test_ls(self):
),
)

@pytest.mark.skipif(
pytest.tiledb_vfs not in ["s3", "file"], reason="Only test on S3 and local"
)
def test_ls_recursive(self):
# Create a nested directory structure to test recursive listing
basepath = self.path("test_vfs_ls_recursive")
self.vfs.create_dir(basepath)

dir = os.path.join(basepath, "dir1")
self.vfs.create_dir(dir)

fname = os.path.join(dir, "file_1")
with tiledb.FileIO(self.vfs, fname, "wb") as fio:
fio.write(b"")

fname = os.path.join(dir, "file_2")
with tiledb.FileIO(self.vfs, fname, "wb") as fio:
fio.write(b"")

dir = os.path.join(basepath, "dir2")
self.vfs.create_dir(dir)

dir2 = os.path.join(dir, "dir2_1")
self.vfs.create_dir(dir2)

fname = os.path.join(dir2, "file_1")
with tiledb.FileIO(self.vfs, fname, "wb") as fio:
fio.write(b"")
fname = os.path.join(dir2, "file_2")
with tiledb.FileIO(self.vfs, fname, "wb") as fio:
fio.write(b"")

dir2 = os.path.join(dir, "dir2_2")
self.vfs.create_dir(dir2)

fname = os.path.join(dir2, "file_1")
with tiledb.FileIO(self.vfs, fname, "wb") as fio:
fio.write(b"")

expected = [
"dir1",
"dir1/file_1",
"dir1/file_2",
"dir2",
"dir2/dir2_1",
"dir2/dir2_1/file_1",
"dir2/dir2_1/file_2",
"dir2/dir2_2",
"dir2/dir2_2/file_1",
]

self.assertSetEqual(
set(expected),
set(
map(
# # Keep only the paths after the basepath and normalize them to work on all platforms
lambda x: os.path.normpath(
x.split("test_vfs_ls_recursive/")[1]
).replace("\\", "/"),
self.vfs.ls_recursive(basepath),
)
),
)

# Check with user provided callback
callback_results = []

def callback(uri, _): # we don't use the second argument 'is_dir'
callback_results.append(uri)
return True

self.vfs.ls_recursive(basepath, callback)

self.assertSetEqual(
set(expected),
set(
map(
# Keep only the paths after the basepath and normalize them to work on all platforms
lambda x: os.path.normpath(
x.split("test_vfs_ls_recursive/")[1]
).replace("\\", "/"),
callback_results,
)
),
)

# Can also be called by calling ls with recursive=True
self.assertSetEqual(
set(expected),
set(
map(
# Keep only the paths after the basepath and normalize them to work on all platforms
lambda x: os.path.normpath(
x.split("test_vfs_ls_recursive/")[1]
).replace("\\", "/"),
self.vfs.ls(basepath, recursive=True),
)
),
)

def test_dir_size(self):
vfs = tiledb.VFS()

Expand Down
26 changes: 24 additions & 2 deletions tiledb/vfs.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import io
import os
from types import TracebackType
from typing import List, Optional, Type, Union
from typing import Callable, List, Optional, Type, Union

import numpy as np

Expand Down Expand Up @@ -287,17 +287,39 @@ def copy_file(self, old_uri: _AnyPath, new_uri: _AnyPath):
"""
return self._copy_file(_to_path_str(old_uri), _to_path_str(new_uri))

def ls(self, uri: _AnyPath) -> List[str]:
def ls(self, uri: _AnyPath, recursive: bool = False) -> List[str]:
"""Retrieves the children in directory `uri`. This function is
non-recursive, i.e., it focuses in one level below `uri`.
:param str uri: Input URI of the directory
:param bool recursive: If True, recursively list all children in the directory
:rtype: List[str]
:return: The children in directory `uri`
"""
if recursive:
return VFS._ls_recursive(self, _to_path_str(uri), None)

return self._ls(_to_path_str(uri))

def ls_recursive(
self, uri: _AnyPath, callback: Optional[Callable[[str, int], bool]] = None
):
"""Recursively lists objects at the input URI, invoking the provided callback
on each entry gathered. The callback is passed the data pointer provided
on each invocation and is responsible for writing the collected results
into this structure. If the callback returns True, the walk will continue.
If False, the walk will stop. If an error is thrown, the walk will stop and
the error will be propagated to the caller using std::throw_with_nested.
Currently only S3 is supported, and the `path` must be a valid S3 URI.
:param str uri: Input URI of the directory
:param callback: Callback function to invoke on each entry
"""
return VFS._ls_recursive(self, _to_path_str(uri), callback)

def touch(self, uri: _AnyPath):
"""Touches a file with the input URI, i.e., creates a new empty file.
Expand Down

0 comments on commit 6e76410

Please sign in to comment.