Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix GroupMetadata backwards compatibility #2102

Merged
merged 1 commit into from
Oct 29, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion tiledb/cc/group.cc
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,9 @@ void put_metadata_numpy(Group &group, const std::string &key, py::array value) {
throw py::type_error("Only 1D Numpy arrays can be stored as metadata");

py::size_t ncells = get_ncells(value.dtype());
if (ncells != 1)
// we can't store multi-cell arrays as metadata
// e.g. an array of strings containing strings of more than one character
if (ncells != 1 && value.size() > 1)
throw py::type_error("Unsupported dtype '" +
std::string(py::str(value.dtype())) +
"' for metadata");
Expand Down
15 changes: 15 additions & 0 deletions tiledb/group.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,16 @@ def __setitem__(self, key: str, value: GroupMetadataValueType):
# If the value is not a 1D ndarray, store its associated shape.
# The value's shape will be stored as separate metadata with the correct prefix.
self.__setitem__(f"{Group._NP_SHAPE_PREFIX}{key}", value.shape)
elif isinstance(value, np.generic):
tiledb_type = DataType.from_numpy(value.dtype).tiledb_type
if tiledb_type in (lt.DataType.BLOB, lt.DataType.CHAR):
put_metadata(key, tiledb_type, len(value), value)
elif tiledb_type == lt.DataType.STRING_UTF8:
put_metadata(
key, lt.DataType.STRING_UTF8, len(value), value.encode("UTF-8")
)
else:
put_metadata(key, tiledb_type, 1, value)
else:
from .metadata import pack_metadata_val

Expand All @@ -141,11 +151,16 @@ def __getitem__(self, key: str, include_type=False) -> GroupMetadataValueType:

if self._group._has_metadata(key):
data, tdb_type = self._group._get_metadata(key, False)
dtype = DataType.from_tiledb(tdb_type).np_dtype
# we return all int and float values as numpy scalars
if dtype.kind in ("i", "f") and not isinstance(data, tuple):
data = np.dtype(dtype).type(data)
elif self._group._has_metadata(f"{Group._NP_DATA_PREFIX}{key}"):
data, tdb_type = self._group._get_metadata(
f"{Group._NP_DATA_PREFIX}{key}", True
)
# reshape numpy array back to original shape, if needed
# this will not be found in any case for TileDB-Py <= 0.32.3.
shape_key = f"{Group._NP_SHAPE_PREFIX}{key}"
if self._group._has_metadata(shape_key):
shape, tdb_type = self._group._get_metadata(shape_key, False)
Expand Down
132 changes: 132 additions & 0 deletions tiledb/tests/test_group.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
import base64
import io
import os
import pathlib
import tarfile

import numpy as np
import pytest
Expand Down Expand Up @@ -762,3 +765,132 @@ def test_bytes_metadata(self, capfd):
grp.meta.dump()
assert_captured(capfd, "Type: DataType.BLOB")
grp.close()

def test_group_metadata_backwards_compat(self):
    """Check that group metadata written by TileDB-Py 0.32.3 still reads back.

    The test writes a fixed set of metadata values with the current version of
    TileDB-Py, unpacks an embedded group that was written the same way with
    TileDB-Py 0.32.3, and then compares the two metadata sets with
    ``assert_metadata_roundtrip``.
    """
    # === The following code creates a group with metadata using the current version of TileDB-Py ===
    path_new = self.path("new_group")
    tiledb.Group.create(path_new)
    group = tiledb.Group(path_new, "w")
    try:
        # python primitive types
        group.meta["python_int"] = -1234
        group.meta["python_float"] = 3.14
        group.meta["python_str"] = "hello"
        group.meta["python_bytes"] = b"hello"
        group.meta["python_bool"] = False

        # numpy primitive types
        group.meta["numpy_int"] = np.int64(-93)
        group.meta["numpy_uint"] = np.uint64(42)
        group.meta["numpy_float64"] = np.float64(3.14)
        group.meta["numpy_bytes"] = np.bytes_("hello")
        group.meta["numpy_str"] = np.str_("hello")
        # np.bool_ is used instead of np.bool: the np.bool alias does not
        # exist on NumPy 1.24-1.26, while np.bool_ is available everywhere.
        group.meta["numpy_bool"] = np.bool_(False)

        # lists/tuples
        group.meta["list_int"] = [7]
        group.meta["tuple_int"] = (7,)
        group.meta["list_ints"] = [1, -2, 3]
        group.meta["tuple_ints"] = (1, 2, 3)
        group.meta["list_float"] = [1.1]
        group.meta["tuple_float"] = (1.1,)
        group.meta["list_floats"] = [1.1, 2.2, 3.3]
        group.meta["tuple_floats"] = (1.1, 2.2, 3.3)
        group.meta["list_empty"] = []
        group.meta["tuple_empty"] = ()

        # numpy arrays
        # NOTE(review): "numpy_int", "numpy_uint", "numpy_str" and "numpy_bool"
        # reuse key names from the numpy scalar section above. The keys are
        # left untouched because they must match the embedded 0.32.3 fixture;
        # confirm the reuse is intentional.
        group.meta["numpy_int"] = np.array([-11], dtype=np.int64)
        group.meta["numpy_ints"] = np.array([1, -2, 3], dtype=np.int64)
        group.meta["numpy_uint"] = np.array([22], dtype=np.uint64)
        group.meta["numpy_uints"] = np.array([1, 2, 3], dtype=np.uint64)
        group.meta["numpy_float"] = np.array([3.14], dtype=np.float64)
        group.meta["numpy_floats"] = np.array([1.1, 2.2, 3.3], dtype=np.float64)
        group.meta["numpy_byte"] = np.array([b"hello"], dtype="S5")
        group.meta["numpy_str"] = np.array(["hello"], dtype="U5")
        group.meta["numpy_bool"] = np.array([True, False, True])
    finally:
        # Close the group even if one of the writes above raises.
        group.close()
    # === End of the code that creates the group with metadata ===

    # The following recipe was used to generate the base64 encoded string of the
    # group with TileDB-Py 0.32.3, after creating the group with metadata in
    # exactly the same way as above. It is kept for reference only.
    #
    #     # Compress the contents of the group folder to tgz
    #     with tarfile.open("test.tar.gz", "w:gz") as tar:
    #         with os.scandir(path_new) as entries:
    #             for entry in entries:
    #                 tar.add(entry.path, arcname=entry.name)
    #
    #     # Read the .tgz file and encode it to base64
    #     with open("test.tar.gz", 'rb') as f:
    #         s = base64.encodebytes(f.read())
    #
    #     # Print the base64 encoded string
    #     group_tgz = f"""{s.decode():>32}"""
    #     print(group_tgz)

    # The following base64 encoded string is the contents of the group folder
    # compressed to a tgz file using TileDB-Py 0.32.3.
    group_tgz = b"""H4sICO/+G2cC/3Rlc3QudGFyANPT19N3CEis8EhNTEktYqAJMIAAXLSBgbEJgg0SNzQwMjRiUKhg
oAMoLS5JLAJazzAygZGFQm5JZm6qraG5kaWFhbmlhbGekaGphbGlJRfDKBj2ID4+N7UkUZ+mdoAy
tbmpKYQ2g9AGRqh53tDE3MDM3Nzc2NQcmP8NDc3NGRRM6Zn/E9Mzi/GpAypLSxt+8a83KMp/Y8zy
33C0/KdL+W+Otfy3NBot/kdS+R8fj4h/YPSj8UxTktOSjQxMjNPMzS0MDCxTjVLNTUwS01IMzMxM
zJMTicj/ZiYmuMp/QwNjM9Ty38jQAFhdKBjQM/+P0PJfDIhfMULYV1khNAsjTFYITDIygAQYQbKM
YBYDQv0xIEcAymdEEqtgbA1x9DtsIBATrJgRpRfwgC18R8GqqqXxD1gDJwZtnTTb5YbtE0YbprhD
8y0KH7SwVJTnps9d9sorMOX8Met7M8+yMHzas+bz0rgbMet7z3b75kqb3mSdtisqonQnu8GrGvHI
6WGxX/Jm+7UW7V45+8/OVSZ3+O+Ic/0Sloo+8OKG6hqutaun9NgfXjqDz9ftBZNBwLvXt6+fX94/
++EfK0X1S2nBpVv5jQ0cut7nS8T3/wn7rOpq5q9/Jn2XW8OhQ/frZTLrkycxHt1evlKvrtbsXeIX
2dw33D0fd0yt5vqe8T/k3d3wtO4UI5Vm8yMvspXTJE+ozFY+13ZA7e+avDertDwP+b1mcjq0JPar
QLS26mvFLQH6D97dDbyZlx1b8X/ZHYmHWpqMjTP6QiVvrZX/3nsqxv3WwofHjtgmbk+YGnhC/U1D
v5+z0SvXZ5YfmXhYiw4Ynmi727rZteXvpZULJ/jvNikQV1/tuiM73XDytc2ZVu6PRcy4NN3Cuze9
0GJc1KHr+mXOAxexJaUFAv/kVgi/K+FaI+2wZfqOxoYWocQPGzNeG9h9edh+3DfBJMYzOKL2l+em
ezc0Hyq98xaQ8eT40PDoxpYX60KKnogs7Ht2d+cf9lm5m9pGy8fhDvRG+/+j/X+M9p+JqYGJ+WgD
cES0/0oyc1JTkuLTi/JLC/RKUpJok//xtP+w9P+NTUD9v9H232j5P1r+D0j5b2ZoYDZa/o+I8h9c
8NN0AJiM8V8TA9PR8d9RMApGwSgYBaNgFIyCUTAKRsEooCYAAP1+F2wAKAAA"""

    # Create a new group by extracting the contents of the tgz file
    path_original = self.path("original_group")
    with tarfile.open(fileobj=io.BytesIO(base64.b64decode(group_tgz))) as tf:
        try:
            # The archive is an embedded, trusted fixture, so no filtering is
            # requested where the interpreter supports extraction filters.
            tf.extractall(path_original, filter="fully_trusted")
        except TypeError:
            # Interpreters without the ``filter`` keyword reject it with TypeError.
            tf.extractall(path_original)

    # Open both the original and the new group and compare the metadata both in
    # values and types. Nested try/finally closes each group that was opened,
    # even when the comparison fails.
    group_original = tiledb.Group(path_original, "r")
    try:
        group_new = tiledb.Group(path_new, "r")
        try:
            self.assert_metadata_roundtrip(group_new.meta, group_original.meta)
        finally:
            group_new.close()
    finally:
        group_original.close()

def test_group_metadata_new_types(self):
    """Round-trip metadata values of kinds that TileDB-Py <= 0.32.3 did not support.

    Writes a mapping of numpy values to a fresh group's metadata in one
    ``update`` call, reopens the group for reading and compares what is read
    back with the mapping via ``assert_metadata_roundtrip``.
    """
    uri = self.path("new_group")

    # Build the expected values up front; key order matches the write order.
    two_days = np.array(
        [np.datetime64("2021-01-01"), np.datetime64("2021-01-02")]
    )
    expected = {
        "int64": np.array(-1111, dtype=np.int64),
        "uint64": np.array(2, dtype=np.uint64),
        "float64": np.array(3.14, dtype=np.float64),
        "bool": np.array(True, dtype=bool),
        "str": np.array(["a", "b", "c"], dtype="S"),
        "unicode": np.array(["a", "b", "c"], dtype="U"),
        "bytes": np.array([b"a", b"b", b"c"]),
        "datetime": two_days,
    }

    # Write phase.
    tiledb.Group.create(uri)
    writer = tiledb.Group(uri, "w")
    writer.meta.update(expected)
    writer.close()

    # Read phase: compare what comes back against what was written.
    reader = tiledb.Group(uri, "r")
    self.assert_metadata_roundtrip(reader.meta, expected)
    reader.close()
Loading