diff --git a/tiledb/core.cc b/tiledb/core.cc index 75cb656973..ec13f26e70 100644 --- a/tiledb/core.cc +++ b/tiledb/core.cc @@ -246,10 +246,10 @@ py::dtype tiledb_dtype(tiledb_datatype_t type, uint32_t cell_val_num) { std::string base_str; switch (type) { case TILEDB_CHAR: - case TILEDB_STRING_ASCII: base_str = "|S"; break; case TILEDB_STRING_UTF8: + case TILEDB_STRING_ASCII: base_str = "|U"; break; default: diff --git a/tiledb/dataframe_.py b/tiledb/dataframe_.py index 034299200c..5794a541ec 100644 --- a/tiledb/dataframe_.py +++ b/tiledb/dataframe_.py @@ -273,9 +273,8 @@ def dim_for_column(name, values, dtype, tile, full_domain=False, dim_filters=Non return tiledb.Dim( name=name, domain=(dim_min, dim_max), - # libtiledb only supports TILEDB_ASCII dimensions, so we must use - # nb.bytes_ which will force encoding on write - dtype=np.bytes_ if dtype == np.str_ else dtype, + # TileDB only supports TILEDB_STRING_ASCII dimensions for strings + dtype="ascii" if dtype in (np.bytes_, np.str_) else dtype, tile=tile, filters=dim_filters, ) diff --git a/tiledb/libmetadata.pyx b/tiledb/libmetadata.pyx index fcbe62ab38..d644cd6440 100644 --- a/tiledb/libmetadata.pyx +++ b/tiledb/libmetadata.pyx @@ -82,10 +82,10 @@ cdef object unpack_metadata_val( ): assert value_num != 0, "internal error: unexpected value_num==0" - if value_type == TILEDB_STRING_UTF8: + if value_type == TILEDB_STRING_UTF8 or value_type == TILEDB_STRING_ASCII: return value_ptr[:value_num].decode('UTF-8') if value_ptr != NULL else '' - if value_type == TILEDB_CHAR or value_type == TILEDB_STRING_ASCII: + if value_type == TILEDB_CHAR: return value_ptr[:value_num] if value_ptr != NULL else b'' if value_ptr == NULL: diff --git a/tiledb/libtiledb.pyx b/tiledb/libtiledb.pyx index 34b4cdfbc4..01667543f8 100644 --- a/tiledb/libtiledb.pyx +++ b/tiledb/libtiledb.pyx @@ -110,7 +110,7 @@ _tiledb_dtype_to_numpy_typeid_convert ={ TILEDB_INT16: np.NPY_INT16, TILEDB_UINT16: np.NPY_UINT16, TILEDB_CHAR: np.NPY_STRING, - TILEDB_STRING_ASCII: np.NPY_STRING, + TILEDB_STRING_ASCII: np.NPY_UNICODE, TILEDB_STRING_UTF8: np.NPY_UNICODE, } IF LIBTILEDB_VERSION_MAJOR >= 2: @@ -133,7 +133,7 @@ _tiledb_dtype_to_numpy_dtype_convert = { TILEDB_INT16: np.int16, TILEDB_UINT16: np.uint16, TILEDB_CHAR: np.dtype('S1'), - TILEDB_STRING_ASCII: np.dtype('S'), + TILEDB_STRING_ASCII: np.dtype('U'), TILEDB_STRING_UTF8: np.dtype('U1'), } IF LIBTILEDB_VERSION_MAJOR >= 2: @@ -1824,10 +1824,8 @@ cdef class Attr(object): filters_str += repr(f) + ", " filters_str += "])" - attr_dtype = "ascii" if self.isascii else self.dtype - # filters_str must be last with no spaces - return (f"""Attr(name={repr(self.name)}, dtype='{attr_dtype!s}', """ + return (f"""Attr(name={repr(self.name)}, dtype='{self.dtype!s}', """ f"""var={self.isvar!s}, nullable={self.isnullable!s}""" f"""{filters_str})""") @@ -1852,7 +1850,7 @@ cdef class Attr(object): output.write("") output.write(f"{self.name}") - output.write(f"{'ascii' if self.isascii else self.dtype}") + output.write(f"{self.isascii}") output.write(f"{self.isvar}") output.write(f"{self.isnullable}") output.write(f"{self.filters._repr_html_()}") @@ -1903,8 +1901,12 @@ cdef class Dim(object): if not ctx: ctx = default_ctx() + is_string = ( + isinstance(dtype, str) and dtype == "ascii" + ) or np.dtype(dtype) in (np.str_, np.bytes_) + if var is not None: - if var and np.dtype(dtype) not in (np.str_, np.bytes_): + if var and not is_string: raise TypeError("'var=True' specified for non-str/bytes dtype") if domain is not None and len(domain) != 2: @@ -1919,12 +1921,14 @@ cdef class Dim(object): cdef void* tile_size_ptr = NULL cdef np.dtype domain_dtype - if ((isinstance(dtype, str) and dtype == "ascii") or - dtype == np.dtype('S')): + if is_string: # Handle var-len domain type # (currently only TILEDB_STRING_ASCII) # The dimension's domain is implicitly formed as # coordinates are written. + if dtype != "ascii": + warnings.warn("Use 'ascii' for string dimensions.") + dtype = np.dtype("|U0") dim_datatype = TILEDB_STRING_ASCII else: if domain is None or len(domain) != 2: @@ -1985,17 +1989,19 @@ cdef class Dim(object): self.ptr = dim_ptr def __repr__(self): - filters_str = "" + filters = "" if self.filters: - filters_str = ", filters=FilterList([" + filters = ", filters=FilterList([" for f in self.filters: - filters_str += repr(f) + ", " - filters_str += "])" + filters += repr(f) + ", " + filters += "])" + + dtype = "ascii" if self._get_type() == TILEDB_STRING_ASCII else self.dtype # for consistency, print `var=True` for string-like types - varlen = "" if not self.dtype in (np.str_, np.bytes_) else ", var=True" - return "Dim(name={0!r}, domain={1!s}, tile={2!r}, dtype='{3!s}'{4}{5})" \ - .format(self.name, self.domain, self.tile, self.dtype, varlen, filters_str) + varlen = "" if dtype != "ascii" else ", var=True" + return f"Dim(name={self.name!r}, domain={self.domain}, tile={self.tile!r}, dtype='{dtype}'{varlen}{filters})" + def _repr_html_(self) -> str: output = io.StringIO() @@ -2022,7 +2028,7 @@ cdef class Dim(object): output.write(f"{self.domain}") output.write(f"{self.tile}") output.write(f"{self.dtype}") - output.write(f"{self.dtype in (np.str_, np.bytes_)}") + output.write(f"{self.dtype == 'ascii'}") output.write(f"{self.filters._repr_html_()}") output.write("") @@ -2222,7 +2228,7 @@ cdef class Dim(object): :rtype: tuple(numpy scalar, numpy scalar) """ - if self.dtype == np.dtype('S'): + if self.dtype == np.dtype('U'): return None, None cdef const void* domain_ptr = NULL check_error(self.ctx, @@ -3864,9 +3870,8 @@ cdef class Array(object): results.append((None, None)) continue - buf_dtype = 'S' - start_buf = np.empty(start_size, 'S' + str(start_size)) - end_buf = np.empty(end_size, 'S' + str(end_size)) + start_buf = np.empty(start_size, f"S{start_size}") + end_buf = np.empty(end_size, f"S{end_size}") start_buf_ptr = np.PyArray_DATA(start_buf) end_buf_ptr = np.PyArray_DATA(end_buf) else: @@ -3884,7 +3889,8 @@ cdef class Array(object): return None if start_size > 0 and end_size > 0: - results.append((start_buf.item(0), end_buf.item(0))) + results.append((start_buf.item(0).decode("UTF-8"), + end_buf.item(0).decode("UTF-8"))) else: results.append((None, None)) else: @@ -4918,7 +4924,7 @@ def index_domain_coords(dom: Domain, idx: tuple, check_ndim: bool): # ensure strings contain only ASCII characters domain_coords.append(np.array(sel, dtype=np.bytes_, ndmin=1)) except Exception as exc: - raise TileDBError(f'Dim\' strings may only contain ASCII characters') + raise TileDBError('Dimension strings may only contain ASCII characters') else: domain_coords.append(np.array(sel, dtype=dim.dtype, ndmin=1)) diff --git a/tiledb/tests/test_filters.py b/tiledb/tests/test_filters.py index 18371ba08c..7d2ccdf882 100644 --- a/tiledb/tests/test_filters.py +++ b/tiledb/tests/test_filters.py @@ -97,7 +97,7 @@ def test_dictionary_encoding(self): schema = tiledb.ArraySchema(domain=dom, attrs=[attr], sparse=True) tiledb.Array.create(path, schema) - data = [b"x" * i for i in np.random.randint(1, 10, size=10)] + data = ["x" * i for i in np.random.randint(1, 10, size=10)] with tiledb.open(path, "w") as A: A[np.arange(10)] = data diff --git a/tiledb/tests/test_fragments.py b/tiledb/tests/test_fragments.py index 469a8afe90..4c58f87980 100644 --- a/tiledb/tests/test_fragments.py +++ b/tiledb/tests/test_fragments.py @@ -75,7 +75,7 @@ def test_array_fragments_var(self): uri = self.path("test_array_fragments_var") dom = tiledb.Domain( - tiledb.Dim(name="dim", domain=(None, None), tile=None, dtype=np.bytes_) + tiledb.Dim(name="dim", domain=(None, None), tile=None, dtype="ascii") ) schema = tiledb.ArraySchema( domain=dom, @@ -285,8 +285,8 @@ def test_nonempty_domain_date(self): def test_nonempty_domain_strings(self): uri = self.path("test_nonempty_domain_strings") dom = tiledb.Domain( - tiledb.Dim(name="x", domain=(None, None), dtype=np.bytes_), - tiledb.Dim(name="y", domain=(None, None), dtype=np.bytes_), + tiledb.Dim(name="x", domain=(None, None), dtype="ascii"), + tiledb.Dim(name="y", domain=(None, None), dtype="ascii"), ) att = tiledb.Attr() schema = tiledb.ArraySchema(sparse=True, domain=dom, attrs=(att,)) @@ -294,13 +294,13 @@ def test_nonempty_domain_strings(self): tiledb.SparseArray.create(uri, schema) with tiledb.SparseArray(uri, mode="w") as T: - x_dims = [b"a", b"b", b"c", b"d"] - y_dims = [b"e", b"f", b"g", b"h"] + x_dims = ["a", "b", "c", "d"] + y_dims = ["e", "f", "g", "h"] T[x_dims, y_dims] = np.array([1, 2, 3, 4]) with tiledb.SparseArray(uri, mode="w") as T: - x_dims = [b"a", b"b"] - y_dims = [b"e", b"f"] + x_dims = ["a", "b"] + y_dims = ["e", "f"] T[x_dims, y_dims] = np.array([1, 2]) fragment_info = PyFragmentInfo(uri, schema, False, tiledb.default_ctx()) diff --git a/tiledb/tests/test_libtiledb.py b/tiledb/tests/test_libtiledb.py index ee82339283..253fc50c29 100644 --- a/tiledb/tests/test_libtiledb.py +++ b/tiledb/tests/test_libtiledb.py @@ -306,13 +306,34 @@ def test_datetime_dimension(self): ) def test_shape(self): - dim = tiledb.Dim(name="", dtype="|S0", var=True) + dim = tiledb.Dim(name="", dtype="ascii", var=True) with self.assertRaisesRegex( TypeError, "shape only valid for integer and datetime dimension domains", ): dim.shape + def test_string_dtypes(self): + expected_repr = ( + "Dim(name='__dim_0', domain=(None, None), " + "tile=None, dtype='ascii', var=True)" + ) + expected_warn = "Use 'ascii' for string dimensions." + + dim = tiledb.Dim(dtype="ascii") + assert dim.dtype == np.dtype("U") + assert repr(dim) == expected_repr + + with pytest.warns(UserWarning, match=expected_warn): + dim = tiledb.Dim(dtype=np.bytes_) + assert dim.dtype == np.dtype("U") + assert repr(dim) == expected_repr + + with pytest.warns(UserWarning, match=expected_warn): + dim = tiledb.Dim(dtype=np.str_) + assert dim.dtype == np.dtype("U") + assert repr(dim) == expected_repr + class DomainTest(DiskTestCase): def test_domain(self, capfd): @@ -368,7 +389,7 @@ def test_ascii_domain(self, capfd): path = self.path("test_ascii_domain") dim = tiledb.Dim(name="d", dtype="ascii") - assert dim.dtype == np.bytes_ + assert dim.dtype == np.str_ dom = tiledb.Domain(dim) dom.dump() @@ -545,9 +566,9 @@ def test_ascii_attribute(self, sparse, capfd): assert A.schema.nattr == 1 A.schema.dump() assert_captured(capfd, "Type: STRING_ASCII") - assert A.schema.attr("A").dtype == np.bytes_ + assert A.schema.attr("A").dtype == np.str_ assert A.schema.attr("A").isascii - assert_array_equal(A[:]["A"], np.asarray(ascii_data, dtype=np.bytes_)) + assert_array_equal(A[:]["A"], np.asarray(ascii_data, dtype=np.str_)) class ArraySchemaTest(DiskTestCase): @@ -815,7 +836,7 @@ def test_mixed_string_schema(self): dims = [ tiledb.Dim(name="dpos", domain=(-100.0, 100.0), tile=10, dtype=np.float64), - tiledb.Dim(name="str_index", tile=None, dtype=np.bytes_), + tiledb.Dim(name="str_index", tile=None, dtype="ascii"), ] dom = tiledb.Domain(*dims) attrs = [tiledb.Attr(name="val", dtype=np.float64)] @@ -827,12 +848,12 @@ def test_mixed_string_schema(self): self.assertTrue(schema.domain.dim("str_index").isvar) self.assertFalse(schema.domain.dim("dpos").isvar) self.assertEqual(schema.domain.dim("dpos").dtype, np.double) - self.assertEqual(schema.domain.dim("str_index").dtype, np.bytes_) + self.assertEqual(schema.domain.dim("str_index").dtype, np.str_) self.assertFalse(schema.domain.homogeneous) tiledb.Array.create(path, schema) with tiledb.open(path, "r") as arr: - assert_array_equal(arr[:]["str_index"], np.array([], dtype="|S1")) + assert_array_equal(arr[:]["str_index"], np.array([], dtype="|U1")) class ArrayTest(DiskTestCase): @@ -1035,7 +1056,7 @@ def test_nonempty_domain_empty_string(self): A[""] = None with tiledb.open(uri, "r") as A: - assert_array_equal(A.nonempty_domain(), ((b"", b""),)) + assert_array_equal(A.nonempty_domain(), (("", ""),)) def test_create_array_overwrite(self): uri = self.path("test_create_array_overwrite") @@ -2777,12 +2798,12 @@ def test_sparse_mixed_domain_uint_float64(self, sparse_cell_order): sparse_cell_order != "hilbert", ) a_nonempty = A.nonempty_domain() - self.assertEqual(a_nonempty[0], (0, 49)) - self.assertEqual(a_nonempty[1], (-100.0, 100.0)) + assert a_nonempty[0] == (0, 49) + assert a_nonempty[1] == (-100.0, 100.0) def test_sparse_string_domain(self, sparse_cell_order): path = self.path("sparse_string_domain") - dom = tiledb.Domain(tiledb.Dim(name="d", domain=(None, None), dtype=np.bytes_)) + dom = tiledb.Domain(tiledb.Dim(name="d", domain=(None, None), dtype="ascii")) att = tiledb.Attr(name="a", dtype=np.int64) schema = tiledb.ArraySchema( domain=dom, @@ -2794,7 +2815,7 @@ def test_sparse_string_domain(self, sparse_cell_order): tiledb.SparseArray.create(path, schema) data = [1, 2, 3, 4] - coords = [b"aa", b"bbb", b"c", b"dddd"] + coords = ["aa", "bb", "c", "dddd"] with tiledb.open(path, "w") as A: A[coords] = data @@ -2803,18 +2824,18 @@ def test_sparse_string_domain(self, sparse_cell_order): ned = A.nonempty_domain()[0] res = A[ned[0] : ned[1]] assert_array_equal(res["a"], data) - self.assertEqual(set(res["d"]), set(coords)) - self.assertEqual(A.nonempty_domain(), ((b"aa", b"dddd"),)) + assert set(res["d"]) == set(coords) + assert A.nonempty_domain() == (("aa", "dddd"),) def test_sparse_string_domain2(self, sparse_cell_order): path = self.path("sparse_string_domain2") with self.assertRaises(ValueError): dims = [ tiledb.Dim( - name="str", domain=(None, None, None), tile=None, dtype=np.bytes_ + name="str", domain=(None, None, None), tile=None, dtype="ascii" ) ] - dims = [tiledb.Dim(name="str", domain=(None, None), tile=None, dtype=np.bytes_)] + dims = [tiledb.Dim(name="str", domain=(None, None), tile=None, dtype="ascii")] dom = tiledb.Domain(*dims) attrs = [tiledb.Attr(name="val", dtype=np.float64)] @@ -2824,7 +2845,7 @@ def test_sparse_string_domain2(self, sparse_cell_order): tiledb.SparseArray.create(path, schema) data = np.random.rand(10) - coords = [rand_ascii_bytes(random.randint(5, 50)) for _ in range(10)] + coords = [rand_ascii(random.randint(5, 50)) for _ in range(10)] with tiledb.open(path, "w") as A: A[coords] = data @@ -2832,7 +2853,7 @@ def test_sparse_string_domain2(self, sparse_cell_order): with tiledb.open(path) as A: ned = A.nonempty_domain()[0] res = A[ned[0] : ned[1]] - self.assertTrue(set(res["str"]) == set(coords)) + assert set(res["str"]) == set(coords) # must check data ordered by coords assert_array_equal(res["val"], data[np.argsort(coords, kind="stable")]) @@ -2840,7 +2861,7 @@ def test_sparse_mixed_domain(self, sparse_cell_order): uri = self.path("sparse_mixed_domain") dims = [ tiledb.Dim(name="p", domain=(-100.0, 100.0), tile=10, dtype=np.float64), - tiledb.Dim(name="str", domain=(None, None), tile=None, dtype=np.bytes_), + tiledb.Dim(name="str", domain=(None, None), tile=None, dtype="ascii"), ] dom = tiledb.Domain(*dims) attrs = [tiledb.Attr(name="val", dtype=np.float64)] @@ -2852,7 +2873,7 @@ def test_sparse_mixed_domain(self, sparse_cell_order): nrows = 5 idx_f64 = np.random.rand(nrows) - idx_str = [rand_ascii(5).encode("utf-8") for _ in range(nrows)] + idx_str = [rand_ascii(5) for _ in range(nrows)] data = np.random.rand(nrows) with tiledb.SparseArray(uri, "w") as A: @@ -2868,7 +2889,7 @@ def test_sparse_mixed_domain(self, sparse_cell_order): def test_sparse_get_unique_dim_values(self, sparse_cell_order): uri = self.path("get_non_empty_coords") - dim1 = tiledb.Dim(name="dim1", domain=(None, None), tile=None, dtype=np.bytes_) + dim1 = tiledb.Dim(name="dim1", domain=(None, None), tile=None, dtype="ascii") dim2 = tiledb.Dim(name="dim2", domain=(0, 1), tile=1, dtype=np.float64) attr = tiledb.Attr(name="attr", dtype=np.float32) dom = tiledb.Domain(dim1, dim2) @@ -2886,12 +2907,10 @@ def test_sparse_get_unique_dim_values(self, sparse_cell_order): with tiledb.open(uri, "r") as A: self.assertEqual( A.unique_dim_values(), - OrderedDict( - [("dim1", (b"a1", b"a2", b"a3")), ("dim2", (0.0, 0.25, 0.5))] - ), + OrderedDict([("dim1", ("a1", "a2", "a3")), ("dim2", (0.0, 0.25, 0.5))]), ) - self.assertEqual(A.unique_dim_values("dim1"), (b"a1", b"a2", b"a3")) + self.assertEqual(A.unique_dim_values("dim1"), ("a1", "a2", "a3")) self.assertEqual(A.unique_dim_values("dim2"), (0, 0.25, 0.5)) with self.assertRaises(ValueError): @@ -2919,7 +2938,7 @@ def test_sparse_write_for_zero_attrs(self): def test_sparse_write_nullable_default(self): uri = self.path("test_sparse_write_nullable_default") - dim1 = tiledb.Dim(name="d1", dtype="|S0", var=True) + dim1 = tiledb.Dim(name="d1", dtype="ascii", var=True) att = tiledb.Attr(name="a1", dtype="