From 3202aee85a6dbb48fefad3d3acf7841bc80ad03d Mon Sep 17 00:00:00 2001 From: Vivian Nguyen Date: Tue, 30 Aug 2022 09:27:44 -0500 Subject: [PATCH] `TILEDB_STRING_ASCII` Now Displaying As UTF-8 / `str` Everywhere * Previously `TILEDB_STRING_ASCII` data was inconsistently displayed as `bytes` * There is a need to coerce to `str` everywhere because (1) previously the resulting dataframe displayed ASCII as bytes with Pyarrow disabled but as str with Pyarrow enabled, and (2) this fix would remove the need to copy large amounts of data to convert back and forth in the TileDB-SingleCell Python API * Warning now emitted to the user to pass `dtype="ascii"` for string dim types in lieu of `np.bytes_` or `np.str_` for clarity. All three still work and under the hood use `np.str_` and `TILEDB_STRING_ASCII` * `repr` of string dimensions is now always displayed as `dtype="ascii"`. Calling `.dtype()` will return `np.dtype('U')` as the return signature of `dtype` requires a Numpy dtype --- tiledb/core.cc | 2 +- tiledb/dataframe_.py | 5 +- tiledb/libmetadata.pyx | 4 +- tiledb/libtiledb.pyx | 52 +++++++++++-------- tiledb/tests/test_filters.py | 2 +- tiledb/tests/test_fragments.py | 14 ++--- tiledb/tests/test_libtiledb.py | 75 +++++++++++++++++---------- tiledb/tests/test_metadata.py | 2 +- tiledb/tests/test_multi_index.py | 6 +-- tiledb/tests/test_pandas_dataframe.py | 20 ++++--- tiledb/tests/test_query_condition.py | 10 ++-- 11 files changed, 109 insertions(+), 83 deletions(-) diff --git a/tiledb/core.cc b/tiledb/core.cc index 75cb656973..ec13f26e70 100644 --- a/tiledb/core.cc +++ b/tiledb/core.cc @@ -246,10 +246,10 @@ py::dtype tiledb_dtype(tiledb_datatype_t type, uint32_t cell_val_num) { std::string base_str; switch (type) { case TILEDB_CHAR: - case TILEDB_STRING_ASCII: base_str = "|S"; break; case TILEDB_STRING_UTF8: + case TILEDB_STRING_ASCII: base_str = "|U"; break; default: diff --git a/tiledb/dataframe_.py b/tiledb/dataframe_.py index 034299200c..5794a541ec 100644 --- a/tiledb/dataframe_.py +++ b/tiledb/dataframe_.py @@ -273,9 +273,8 @@ def dim_for_column(name, values, dtype, tile, full_domain=False, dim_filters=Non return tiledb.Dim( name=name, domain=(dim_min, dim_max), - # libtiledb only supports TILEDB_ASCII dimensions, so we must use - # nb.bytes_ which will force encoding on write - dtype=np.bytes_ if dtype == np.str_ else dtype, + # TileDB only supports TILEDB_STRING_ASCII dimensions for strings + dtype="ascii" if dtype in (np.bytes_, np.str_) else dtype, tile=tile, filters=dim_filters, ) diff --git a/tiledb/libmetadata.pyx b/tiledb/libmetadata.pyx index fcbe62ab38..d644cd6440 100644 --- a/tiledb/libmetadata.pyx +++ b/tiledb/libmetadata.pyx @@ -82,10 +82,10 @@ cdef object unpack_metadata_val( ): assert value_num != 0, "internal error: unexpected value_num==0" - if value_type == TILEDB_STRING_UTF8: + if value_type == TILEDB_STRING_UTF8 or value_type == TILEDB_STRING_ASCII: return value_ptr[:value_num].decode('UTF-8') if value_ptr != NULL else '' - if value_type == TILEDB_CHAR or value_type == TILEDB_STRING_ASCII: + if value_type == TILEDB_CHAR: return value_ptr[:value_num] if value_ptr != NULL else b'' if value_ptr == NULL: diff --git a/tiledb/libtiledb.pyx b/tiledb/libtiledb.pyx index 34b4cdfbc4..01667543f8 100644 --- a/tiledb/libtiledb.pyx +++ b/tiledb/libtiledb.pyx @@ -110,7 +110,7 @@ _tiledb_dtype_to_numpy_typeid_convert ={ TILEDB_INT16: np.NPY_INT16, TILEDB_UINT16: np.NPY_UINT16, TILEDB_CHAR: np.NPY_STRING, - TILEDB_STRING_ASCII: np.NPY_STRING, + TILEDB_STRING_ASCII: np.NPY_UNICODE, TILEDB_STRING_UTF8: np.NPY_UNICODE, } IF LIBTILEDB_VERSION_MAJOR >= 2: @@ -133,7 +133,7 @@ _tiledb_dtype_to_numpy_dtype_convert = { TILEDB_INT16: np.int16, TILEDB_UINT16: np.uint16, TILEDB_CHAR: np.dtype('S1'), - TILEDB_STRING_ASCII: np.dtype('S'), + TILEDB_STRING_ASCII: np.dtype('U'), TILEDB_STRING_UTF8: np.dtype('U1'), } IF LIBTILEDB_VERSION_MAJOR >= 2: @@ -1824,10 +1824,8 @@ cdef class Attr(object): filters_str += repr(f) + ", " filters_str += "])" - attr_dtype = "ascii" if self.isascii else self.dtype - # filters_str must be last with no spaces - return (f"""Attr(name={repr(self.name)}, dtype='{attr_dtype!s}', """ + return (f"""Attr(name={repr(self.name)}, dtype='{self.dtype!s}', """ f"""var={self.isvar!s}, nullable={self.isnullable!s}""" f"""{filters_str})""") @@ -1852,7 +1850,7 @@ cdef class Attr(object): output.write("") output.write(f"{self.name}") - output.write(f"{'ascii' if self.isascii else self.dtype}") + output.write(f"{self.isascii}") output.write(f"{self.isvar}") output.write(f"{self.isnullable}") output.write(f"{self.filters._repr_html_()}") @@ -1903,8 +1901,12 @@ cdef class Dim(object): if not ctx: ctx = default_ctx() + is_string = ( + isinstance(dtype, str) and dtype == "ascii" + ) or np.dtype(dtype) in (np.str_, np.bytes_) + if var is not None: - if var and np.dtype(dtype) not in (np.str_, np.bytes_): + if var and not is_string: raise TypeError("'var=True' specified for non-str/bytes dtype") if domain is not None and len(domain) != 2: @@ -1919,12 +1921,14 @@ cdef class Dim(object): cdef void* tile_size_ptr = NULL cdef np.dtype domain_dtype - if ((isinstance(dtype, str) and dtype == "ascii") or - dtype == np.dtype('S')): + if is_string: # Handle var-len domain type # (currently only TILEDB_STRING_ASCII) # The dimension's domain is implicitly formed as # coordinates are written. + if dtype != "ascii": + warnings.warn("Use 'ascii' for string dimensions.") + dtype = np.dtype("|U0") dim_datatype = TILEDB_STRING_ASCII else: if domain is None or len(domain) != 2: @@ -1985,17 +1989,19 @@ cdef class Dim(object): self.ptr = dim_ptr def __repr__(self): - filters_str = "" + filters = "" if self.filters: - filters_str = ", filters=FilterList([" + filters = ", filters=FilterList([" for f in self.filters: - filters_str += repr(f) + ", " - filters_str += "])" + filters += repr(f) + ", " + filters += "])" + + dtype = "ascii" if self._get_type() == TILEDB_STRING_ASCII else self.dtype # for consistency, print `var=True` for string-like types - varlen = "" if not self.dtype in (np.str_, np.bytes_) else ", var=True" - return "Dim(name={0!r}, domain={1!s}, tile={2!r}, dtype='{3!s}'{4}{5})" \ - .format(self.name, self.domain, self.tile, self.dtype, varlen, filters_str) + varlen = "" if dtype != "ascii" else ", var=True" + return f"Dim(name={self.name!r}, domain={self.domain}, tile={self.tile!r}, dtype='{dtype}'{varlen}{filters})" + def _repr_html_(self) -> str: output = io.StringIO() @@ -2022,7 +2028,7 @@ cdef class Dim(object): output.write(f"{self.domain}") output.write(f"{self.tile}") output.write(f"{self.dtype}") - output.write(f"{self.dtype in (np.str_, np.bytes_)}") + output.write(f"{self.dtype == 'ascii'}") output.write(f"{self.filters._repr_html_()}") output.write("") @@ -2222,7 +2228,7 @@ cdef class Dim(object): :rtype: tuple(numpy scalar, numpy scalar) """ - if self.dtype == np.dtype('S'): + if self.dtype == np.dtype('U'): return None, None cdef const void* domain_ptr = NULL check_error(self.ctx, @@ -3864,9 +3870,8 @@ cdef class Array(object): results.append((None, None)) continue - buf_dtype = 'S' - start_buf = np.empty(start_size, 'S' + str(start_size)) - end_buf = np.empty(end_size, 'S' + str(end_size)) + start_buf = np.empty(start_size, f"S{start_size}") + end_buf = np.empty(end_size, f"S{end_size}") start_buf_ptr = np.PyArray_DATA(start_buf) end_buf_ptr = np.PyArray_DATA(end_buf) else: @@ -3884,7 +3889,8 @@ cdef class Array(object): return None if start_size > 0 and end_size > 0: - results.append((start_buf.item(0), end_buf.item(0))) + results.append((start_buf.item(0).decode("UTF-8"), + end_buf.item(0).decode("UTF-8"))) else: results.append((None, None)) else: @@ -4918,7 +4924,7 @@ def index_domain_coords(dom: Domain, idx: tuple, check_ndim: bool): # ensure strings contain only ASCII characters domain_coords.append(np.array(sel, dtype=np.bytes_, ndmin=1)) except Exception as exc: - raise TileDBError(f'Dim\' strings may only contain ASCII characters') + raise TileDBError('Dimension strings may only contain ASCII characters') else: domain_coords.append(np.array(sel, dtype=dim.dtype, ndmin=1)) diff --git a/tiledb/tests/test_filters.py b/tiledb/tests/test_filters.py index 18371ba08c..7d2ccdf882 100644 --- a/tiledb/tests/test_filters.py +++ b/tiledb/tests/test_filters.py @@ -97,7 +97,7 @@ def test_dictionary_encoding(self): schema = tiledb.ArraySchema(domain=dom, attrs=[attr], sparse=True) tiledb.Array.create(path, schema) - data = [b"x" * i for i in np.random.randint(1, 10, size=10)] + data = ["x" * i for i in np.random.randint(1, 10, size=10)] with tiledb.open(path, "w") as A: A[np.arange(10)] = data diff --git a/tiledb/tests/test_fragments.py b/tiledb/tests/test_fragments.py index 469a8afe90..4c58f87980 100644 --- a/tiledb/tests/test_fragments.py +++ b/tiledb/tests/test_fragments.py @@ -75,7 +75,7 @@ def test_array_fragments_var(self): uri = self.path("test_array_fragments_var") dom = tiledb.Domain( - tiledb.Dim(name="dim", domain=(None, None), tile=None, dtype=np.bytes_) + tiledb.Dim(name="dim", domain=(None, None), tile=None, dtype="ascii") ) schema = tiledb.ArraySchema( domain=dom, @@ -285,8 +285,8 @@ def test_nonempty_domain_date(self): def test_nonempty_domain_strings(self): uri = self.path("test_nonempty_domain_strings") dom = tiledb.Domain( - tiledb.Dim(name="x", domain=(None, None), dtype=np.bytes_), - tiledb.Dim(name="y", domain=(None, None), dtype=np.bytes_), + tiledb.Dim(name="x", domain=(None, None), dtype="ascii"), + tiledb.Dim(name="y", domain=(None, None), dtype="ascii"), ) att = tiledb.Attr() schema = tiledb.ArraySchema(sparse=True, domain=dom, attrs=(att,)) @@ -294,13 +294,13 @@ def test_nonempty_domain_strings(self): tiledb.SparseArray.create(uri, schema) with tiledb.SparseArray(uri, mode="w") as T: - x_dims = [b"a", b"b", b"c", b"d"] - y_dims = [b"e", b"f", b"g", b"h"] + x_dims = ["a", "b", "c", "d"] + y_dims = ["e", "f", "g", "h"] T[x_dims, y_dims] = np.array([1, 2, 3, 4]) with tiledb.SparseArray(uri, mode="w") as T: - x_dims = [b"a", b"b"] - y_dims = [b"e", b"f"] + x_dims = ["a", "b"] + y_dims = ["e", "f"] T[x_dims, y_dims] = np.array([1, 2]) fragment_info = PyFragmentInfo(uri, schema, False, tiledb.default_ctx()) diff --git a/tiledb/tests/test_libtiledb.py b/tiledb/tests/test_libtiledb.py index ee82339283..253fc50c29 100644 --- a/tiledb/tests/test_libtiledb.py +++ b/tiledb/tests/test_libtiledb.py @@ -306,13 +306,34 @@ def test_datetime_dimension(self): ) def test_shape(self): - dim = tiledb.Dim(name="", dtype="|S0", var=True) + dim = tiledb.Dim(name="", dtype="ascii", var=True) with self.assertRaisesRegex( TypeError, "shape only valid for integer and datetime dimension domains", ): dim.shape + def test_string_dtypes(self): + expected_repr = ( + "Dim(name='__dim_0', domain=(None, None), " + "tile=None, dtype='ascii', var=True)" + ) + expected_warn = "Use 'ascii' for string dimensions." + + dim = tiledb.Dim(dtype="ascii") + assert dim.dtype == np.dtype("U") + assert repr(dim) == expected_repr + + with pytest.warns(UserWarning, match=expected_warn): + dim = tiledb.Dim(dtype=np.bytes_) + assert dim.dtype == np.dtype("U") + assert repr(dim) == expected_repr + + with pytest.warns(UserWarning, match=expected_warn): + dim = tiledb.Dim(dtype=np.str_) + assert dim.dtype == np.dtype("U") + assert repr(dim) == expected_repr + class DomainTest(DiskTestCase): def test_domain(self, capfd): @@ -368,7 +389,7 @@ def test_ascii_domain(self, capfd): path = self.path("test_ascii_domain") dim = tiledb.Dim(name="d", dtype="ascii") - assert dim.dtype == np.bytes_ + assert dim.dtype == np.str_ dom = tiledb.Domain(dim) dom.dump() @@ -545,9 +566,9 @@ def test_ascii_attribute(self, sparse, capfd): assert A.schema.nattr == 1 A.schema.dump() assert_captured(capfd, "Type: STRING_ASCII") - assert A.schema.attr("A").dtype == np.bytes_ + assert A.schema.attr("A").dtype == np.str_ assert A.schema.attr("A").isascii - assert_array_equal(A[:]["A"], np.asarray(ascii_data, dtype=np.bytes_)) + assert_array_equal(A[:]["A"], np.asarray(ascii_data, dtype=np.str_)) class ArraySchemaTest(DiskTestCase): @@ -815,7 +836,7 @@ def test_mixed_string_schema(self): dims = [ tiledb.Dim(name="dpos", domain=(-100.0, 100.0), tile=10, dtype=np.float64), - tiledb.Dim(name="str_index", tile=None, dtype=np.bytes_), + tiledb.Dim(name="str_index", tile=None, dtype="ascii"), ] dom = tiledb.Domain(*dims) attrs = [tiledb.Attr(name="val", dtype=np.float64)] @@ -827,12 +848,12 @@ def test_mixed_string_schema(self): self.assertTrue(schema.domain.dim("str_index").isvar) self.assertFalse(schema.domain.dim("dpos").isvar) self.assertEqual(schema.domain.dim("dpos").dtype, np.double) - self.assertEqual(schema.domain.dim("str_index").dtype, np.bytes_) + self.assertEqual(schema.domain.dim("str_index").dtype, np.str_) self.assertFalse(schema.domain.homogeneous) tiledb.Array.create(path, schema) with tiledb.open(path, "r") as arr: - assert_array_equal(arr[:]["str_index"], np.array([], dtype="|S1")) + assert_array_equal(arr[:]["str_index"], np.array([], dtype="|U1")) class ArrayTest(DiskTestCase): @@ -1035,7 +1056,7 @@ def test_nonempty_domain_empty_string(self): A[""] = None with tiledb.open(uri, "r") as A: - assert_array_equal(A.nonempty_domain(), ((b"", b""),)) + assert_array_equal(A.nonempty_domain(), (("", ""),)) def test_create_array_overwrite(self): uri = self.path("test_create_array_overwrite") @@ -2777,12 +2798,12 @@ def test_sparse_mixed_domain_uint_float64(self, sparse_cell_order): sparse_cell_order != "hilbert", ) a_nonempty = A.nonempty_domain() - self.assertEqual(a_nonempty[0], (0, 49)) - self.assertEqual(a_nonempty[1], (-100.0, 100.0)) + assert a_nonempty[0] == (0, 49) + assert a_nonempty[1] == (-100.0, 100.0) def test_sparse_string_domain(self, sparse_cell_order): path = self.path("sparse_string_domain") - dom = tiledb.Domain(tiledb.Dim(name="d", domain=(None, None), dtype=np.bytes_)) + dom = tiledb.Domain(tiledb.Dim(name="d", domain=(None, None), dtype="ascii")) att = tiledb.Attr(name="a", dtype=np.int64) schema = tiledb.ArraySchema( domain=dom, @@ -2794,7 +2815,7 @@ def test_sparse_string_domain(self, sparse_cell_order): tiledb.SparseArray.create(path, schema) data = [1, 2, 3, 4] - coords = [b"aa", b"bbb", b"c", b"dddd"] + coords = ["aa", "bb", "c", "dddd"] with tiledb.open(path, "w") as A: A[coords] = data @@ -2803,18 +2824,18 @@ def test_sparse_string_domain(self, sparse_cell_order): ned = A.nonempty_domain()[0] res = A[ned[0] : ned[1]] assert_array_equal(res["a"], data) - self.assertEqual(set(res["d"]), set(coords)) - self.assertEqual(A.nonempty_domain(), ((b"aa", b"dddd"),)) + assert set(res["d"]) == set(coords) + assert A.nonempty_domain() == (("aa", "dddd"),) def test_sparse_string_domain2(self, sparse_cell_order): path = self.path("sparse_string_domain2") with self.assertRaises(ValueError): dims = [ tiledb.Dim( - name="str", domain=(None, None, None), tile=None, dtype=np.bytes_ + name="str", domain=(None, None, None), tile=None, dtype="ascii" ) ] - dims = [tiledb.Dim(name="str", domain=(None, None), tile=None, dtype=np.bytes_)] + dims = [tiledb.Dim(name="str", domain=(None, None), tile=None, dtype="ascii")] dom = tiledb.Domain(*dims) attrs = [tiledb.Attr(name="val", dtype=np.float64)] @@ -2824,7 +2845,7 @@ def test_sparse_string_domain2(self, sparse_cell_order): tiledb.SparseArray.create(path, schema) data = np.random.rand(10) - coords = [rand_ascii_bytes(random.randint(5, 50)) for _ in range(10)] + coords = [rand_ascii(random.randint(5, 50)) for _ in range(10)] with tiledb.open(path, "w") as A: A[coords] = data @@ -2832,7 +2853,7 @@ def test_sparse_string_domain2(self, sparse_cell_order): with tiledb.open(path) as A: ned = A.nonempty_domain()[0] res = A[ned[0] : ned[1]] - self.assertTrue(set(res["str"]) == set(coords)) + assert set(res["str"]) == set(coords) # must check data ordered by coords assert_array_equal(res["val"], data[np.argsort(coords, kind="stable")]) @@ -2840,7 +2861,7 @@ def test_sparse_mixed_domain(self, sparse_cell_order): uri = self.path("sparse_mixed_domain") dims = [ tiledb.Dim(name="p", domain=(-100.0, 100.0), tile=10, dtype=np.float64), - tiledb.Dim(name="str", domain=(None, None), tile=None, dtype=np.bytes_), + tiledb.Dim(name="str", domain=(None, None), tile=None, dtype="ascii"), ] dom = tiledb.Domain(*dims) attrs = [tiledb.Attr(name="val", dtype=np.float64)] @@ -2852,7 +2873,7 @@ def test_sparse_mixed_domain(self, sparse_cell_order): nrows = 5 idx_f64 = np.random.rand(nrows) - idx_str = [rand_ascii(5).encode("utf-8") for _ in range(nrows)] + idx_str = [rand_ascii(5) for _ in range(nrows)] data = np.random.rand(nrows) with tiledb.SparseArray(uri, "w") as A: @@ -2868,7 +2889,7 @@ def test_sparse_mixed_domain(self, sparse_cell_order): def test_sparse_get_unique_dim_values(self, sparse_cell_order): uri = self.path("get_non_empty_coords") - dim1 = tiledb.Dim(name="dim1", domain=(None, None), tile=None, dtype=np.bytes_) + dim1 = tiledb.Dim(name="dim1", domain=(None, None), tile=None, dtype="ascii") dim2 = tiledb.Dim(name="dim2", domain=(0, 1), tile=1, dtype=np.float64) attr = tiledb.Attr(name="attr", dtype=np.float32) dom = tiledb.Domain(dim1, dim2) @@ -2886,12 +2907,10 @@ def test_sparse_get_unique_dim_values(self, sparse_cell_order): with tiledb.open(uri, "r") as A: self.assertEqual( A.unique_dim_values(), - OrderedDict( - [("dim1", (b"a1", b"a2", b"a3")), ("dim2", (0.0, 0.25, 0.5))] - ), + OrderedDict([("dim1", ("a1", "a2", "a3")), ("dim2", (0.0, 0.25, 0.5))]), ) - self.assertEqual(A.unique_dim_values("dim1"), (b"a1", b"a2", b"a3")) + self.assertEqual(A.unique_dim_values("dim1"), ("a1", "a2", "a3")) self.assertEqual(A.unique_dim_values("dim2"), (0, 0.25, 0.5)) with self.assertRaises(ValueError): @@ -2919,7 +2938,7 @@ def test_sparse_write_for_zero_attrs(self): def test_sparse_write_nullable_default(self): uri = self.path("test_sparse_write_nullable_default") - dim1 = tiledb.Dim(name="d1", dtype="|S0", var=True) + dim1 = tiledb.Dim(name="d1", dtype="ascii", var=True) att = tiledb.Attr(name="a1", dtype="