Skip to content

Commit

Permalink
TILEDB_STRING_ASCII Now Displaying As UTF-8 / str Everywhere
Browse files Browse the repository at this point in the history
* Previously `TILEDB_STRING_ASCII` data was inconsistently displayed
  as `bytes`
* There is a need to coerce to `str` everywhere because (1) previously
  the resulting dataframe displayed ASCII as bytes with Pyarrow disabled
  but as str with Pyarrow enabled, and (2) this fix would remove the
  need to copy large amounts of data to convert back and forth in the
  TileDB-SingleCell Python API
* Warning now emitted to the user to pass `dtype="ascii"` for string dim
  types in lieu of `np.bytes_` or `np.str_` for clarity. All three still
  work and under the hood use `np.str_` and `TILEDB_STRING_ASCII`
* `repr` of string dimensions is now always displayed as `dtype="ascii"`.
  Calling `.dtype()` will return `np.dtype('U')` as the return signature
  of `dtype` requires a Numpy dtype
  • Loading branch information
nguyenv committed Aug 30, 2022
1 parent fe5fb33 commit dead46e
Show file tree
Hide file tree
Showing 12 changed files with 132 additions and 83 deletions.
3 changes: 3 additions & 0 deletions HISTORY.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
# In Progress

## API Changes
* `TILEDB_STRING_ASCII` displaying as `str` instead of `bytes` [#1304](https://github.com/TileDB-Inc/TileDB-Py/pull/1304)

## Misc Updates
* Wheels will no longer be supported for macOS 10.15 Catalina; the minimum supported macOS version is now 11 Big Sur [#1300](https://github.com/TileDB-Inc/TileDB-Py/pull/1300)
* Wheels will no longer supported for Python 3.6 [#1300](https://github.com/TileDB-Inc/TileDB-Py/pull/1300)
Expand Down
2 changes: 1 addition & 1 deletion tiledb/core.cc
Original file line number Diff line number Diff line change
Expand Up @@ -246,10 +246,10 @@ py::dtype tiledb_dtype(tiledb_datatype_t type, uint32_t cell_val_num) {
std::string base_str;
switch (type) {
case TILEDB_CHAR:
case TILEDB_STRING_ASCII:
base_str = "|S";
break;
case TILEDB_STRING_UTF8:
case TILEDB_STRING_ASCII:
base_str = "|U";
break;
default:
Expand Down
5 changes: 2 additions & 3 deletions tiledb/dataframe_.py
Original file line number Diff line number Diff line change
Expand Up @@ -273,9 +273,8 @@ def dim_for_column(name, values, dtype, tile, full_domain=False, dim_filters=Non
return tiledb.Dim(
name=name,
domain=(dim_min, dim_max),
# libtiledb only supports TILEDB_ASCII dimensions, so we must use
# nb.bytes_ which will force encoding on write
dtype=np.bytes_ if dtype == np.str_ else dtype,
# TileDB only supports TILEDB_STRING_ASCII dimensions for strings
dtype="ascii" if dtype in (np.bytes_, np.str_) else dtype,
tile=tile,
filters=dim_filters,
)
Expand Down
4 changes: 2 additions & 2 deletions tiledb/libmetadata.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -82,10 +82,10 @@ cdef object unpack_metadata_val(
):
assert value_num != 0, "internal error: unexpected value_num==0"

if value_type == TILEDB_STRING_UTF8:
if value_type == TILEDB_STRING_UTF8 or value_type == TILEDB_STRING_ASCII:
return value_ptr[:value_num].decode('UTF-8') if value_ptr != NULL else ''

if value_type == TILEDB_CHAR or value_type == TILEDB_STRING_ASCII:
if value_type == TILEDB_CHAR:
return value_ptr[:value_num] if value_ptr != NULL else b''

if value_ptr == NULL:
Expand Down
52 changes: 29 additions & 23 deletions tiledb/libtiledb.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,7 @@ _tiledb_dtype_to_numpy_typeid_convert ={
TILEDB_INT16: np.NPY_INT16,
TILEDB_UINT16: np.NPY_UINT16,
TILEDB_CHAR: np.NPY_STRING,
TILEDB_STRING_ASCII: np.NPY_STRING,
TILEDB_STRING_ASCII: np.NPY_UNICODE,
TILEDB_STRING_UTF8: np.NPY_UNICODE,
}
IF LIBTILEDB_VERSION_MAJOR >= 2:
Expand All @@ -133,7 +133,7 @@ _tiledb_dtype_to_numpy_dtype_convert = {
TILEDB_INT16: np.int16,
TILEDB_UINT16: np.uint16,
TILEDB_CHAR: np.dtype('S1'),
TILEDB_STRING_ASCII: np.dtype('S'),
TILEDB_STRING_ASCII: np.dtype('U'),
TILEDB_STRING_UTF8: np.dtype('U1'),
}
IF LIBTILEDB_VERSION_MAJOR >= 2:
Expand Down Expand Up @@ -1824,10 +1824,8 @@ cdef class Attr(object):
filters_str += repr(f) + ", "
filters_str += "])"

attr_dtype = "ascii" if self.isascii else self.dtype

# filters_str must be last with no spaces
return (f"""Attr(name={repr(self.name)}, dtype='{attr_dtype!s}', """
return (f"""Attr(name={repr(self.name)}, dtype='{self.dtype!s}', """
f"""var={self.isvar!s}, nullable={self.isnullable!s}"""
f"""{filters_str})""")

Expand All @@ -1852,7 +1850,7 @@ cdef class Attr(object):

output.write("<tr>")
output.write(f"<td>{self.name}</td>")
output.write(f"<td>{'ascii' if self.isascii else self.dtype}</td>")
output.write(f"<td>{self.isascii}</td>")
output.write(f"<td>{self.isvar}</td>")
output.write(f"<td>{self.isnullable}</td>")
output.write(f"<td>{self.filters._repr_html_()}</td>")
Expand Down Expand Up @@ -1903,8 +1901,12 @@ cdef class Dim(object):
if not ctx:
ctx = default_ctx()

is_string = (
isinstance(dtype, str) and dtype == "ascii"
) or np.dtype(dtype) in (np.str_, np.bytes_)

if var is not None:
if var and np.dtype(dtype) not in (np.str_, np.bytes_):
if var and not is_string:
raise TypeError("'var=True' specified for non-str/bytes dtype")

if domain is not None and len(domain) != 2:
Expand All @@ -1919,12 +1921,14 @@ cdef class Dim(object):
cdef void* tile_size_ptr = NULL
cdef np.dtype domain_dtype

if ((isinstance(dtype, str) and dtype == "ascii") or
dtype == np.dtype('S')):
if is_string:
# Handle var-len domain type
# (currently only TILEDB_STRING_ASCII)
# The dimension's domain is implicitly formed as
# coordinates are written.
if dtype != "ascii":
warnings.warn("Use 'ascii' for string dimensions.")
dtype = np.dtype("|U0")
dim_datatype = TILEDB_STRING_ASCII
else:
if domain is None or len(domain) != 2:
Expand Down Expand Up @@ -1985,17 +1989,19 @@ cdef class Dim(object):
self.ptr = dim_ptr

def __repr__(self):
filters_str = ""
filters = ""
if self.filters:
filters_str = ", filters=FilterList(["
filters = ", filters=FilterList(["
for f in self.filters:
filters_str += repr(f) + ", "
filters_str += "])"
filters += repr(f) + ", "
filters += "])"

dtype = "ascii" if self._get_type() == TILEDB_STRING_ASCII else self.dtype

# for consistency, print `var=True` for string-like types
varlen = "" if not self.dtype in (np.str_, np.bytes_) else ", var=True"
return "Dim(name={0!r}, domain={1!s}, tile={2!r}, dtype='{3!s}'{4}{5})" \
.format(self.name, self.domain, self.tile, self.dtype, varlen, filters_str)
varlen = "" if dtype != "ascii" else ", var=True"
return f"Dim(name={self.name!r}, domain={self.domain}, tile={self.tile!r}, dtype='{dtype}'{varlen}{filters})"


def _repr_html_(self) -> str:
output = io.StringIO()
Expand All @@ -2022,7 +2028,7 @@ cdef class Dim(object):
output.write(f"<td>{self.domain}</td>")
output.write(f"<td>{self.tile}</td>")
output.write(f"<td>{self.dtype}</td>")
output.write(f"<td>{self.dtype in (np.str_, np.bytes_)}</td>")
output.write(f"<td>{self.dtype == 'ascii'}</td>")
output.write(f"<td>{self.filters._repr_html_()}</td>")
output.write("</tr>")

Expand Down Expand Up @@ -2222,7 +2228,7 @@ cdef class Dim(object):
:rtype: tuple(numpy scalar, numpy scalar)
"""
if self.dtype == np.dtype('S'):
if self.dtype == np.dtype('U'):
return None, None
cdef const void* domain_ptr = NULL
check_error(self.ctx,
Expand Down Expand Up @@ -3864,9 +3870,8 @@ cdef class Array(object):
results.append((None, None))
continue

buf_dtype = 'S'
start_buf = np.empty(start_size, 'S' + str(start_size))
end_buf = np.empty(end_size, 'S' + str(end_size))
start_buf = np.empty(start_size, f"S{start_size}")
end_buf = np.empty(end_size, f"S{end_size}")
start_buf_ptr = np.PyArray_DATA(start_buf)
end_buf_ptr = np.PyArray_DATA(end_buf)
else:
Expand All @@ -3884,7 +3889,8 @@ cdef class Array(object):
return None

if start_size > 0 and end_size > 0:
results.append((start_buf.item(0), end_buf.item(0)))
results.append((start_buf.item(0).decode("UTF-8"),
end_buf.item(0).decode("UTF-8")))
else:
results.append((None, None))
else:
Expand Down Expand Up @@ -4918,7 +4924,7 @@ def index_domain_coords(dom: Domain, idx: tuple, check_ndim: bool):
# ensure strings contain only ASCII characters
domain_coords.append(np.array(sel, dtype=np.bytes_, ndmin=1))
except Exception as exc:
raise TileDBError(f'Dim\' strings may only contain ASCII characters')
raise TileDBError('Dimension strings may only contain ASCII characters')
else:
domain_coords.append(np.array(sel, dtype=dim.dtype, ndmin=1))

Expand Down
2 changes: 1 addition & 1 deletion tiledb/tests/test_filters.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@ def test_dictionary_encoding(self):
schema = tiledb.ArraySchema(domain=dom, attrs=[attr], sparse=True)
tiledb.Array.create(path, schema)

data = [b"x" * i for i in np.random.randint(1, 10, size=10)]
data = ["x" * i for i in np.random.randint(1, 10, size=10)]

with tiledb.open(path, "w") as A:
A[np.arange(10)] = data
Expand Down
14 changes: 7 additions & 7 deletions tiledb/tests/test_fragments.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ def test_array_fragments_var(self):

uri = self.path("test_array_fragments_var")
dom = tiledb.Domain(
tiledb.Dim(name="dim", domain=(None, None), tile=None, dtype=np.bytes_)
tiledb.Dim(name="dim", domain=(None, None), tile=None, dtype="ascii")
)
schema = tiledb.ArraySchema(
domain=dom,
Expand Down Expand Up @@ -285,22 +285,22 @@ def test_nonempty_domain_date(self):
def test_nonempty_domain_strings(self):
uri = self.path("test_nonempty_domain_strings")
dom = tiledb.Domain(
tiledb.Dim(name="x", domain=(None, None), dtype=np.bytes_),
tiledb.Dim(name="y", domain=(None, None), dtype=np.bytes_),
tiledb.Dim(name="x", domain=(None, None), dtype="ascii"),
tiledb.Dim(name="y", domain=(None, None), dtype="ascii"),
)
att = tiledb.Attr()
schema = tiledb.ArraySchema(sparse=True, domain=dom, attrs=(att,))

tiledb.SparseArray.create(uri, schema)

with tiledb.SparseArray(uri, mode="w") as T:
x_dims = [b"a", b"b", b"c", b"d"]
y_dims = [b"e", b"f", b"g", b"h"]
x_dims = ["a", "b", "c", "d"]
y_dims = ["e", "f", "g", "h"]
T[x_dims, y_dims] = np.array([1, 2, 3, 4])

with tiledb.SparseArray(uri, mode="w") as T:
x_dims = [b"a", b"b"]
y_dims = [b"e", b"f"]
x_dims = ["a", "b"]
y_dims = ["e", "f"]
T[x_dims, y_dims] = np.array([1, 2])

fragment_info = PyFragmentInfo(uri, schema, False, tiledb.default_ctx())
Expand Down
Loading

0 comments on commit dead46e

Please sign in to comment.