diff --git a/hsds/attr_sn.py b/hsds/attr_sn.py index da78f4cb..d7c865f9 100755 --- a/hsds/attr_sn.py +++ b/hsds/attr_sn.py @@ -649,6 +649,7 @@ async def PUT_AttributeValue(request): raise HTTPBadRequest(reason=msg) np_shape = getShapeDims(attr_shape) + log.debug(f"np_shape: {np_shape}") type_json = dn_json["type"] np_dtype = createDataType(type_json) # np datatype @@ -697,6 +698,10 @@ async def PUT_AttributeValue(request): # convert to JSON for transmission to DN data = arr.tolist() value = bytesArrayToList(data) + if attr_shape["class"] == "H5S_SCALAR": + # just send the value, not a list + value = value[0] + else: try: body = await request.json() diff --git a/hsds/chunk_sn.py b/hsds/chunk_sn.py index 650cc0fa..39641b6e 100755 --- a/hsds/chunk_sn.py +++ b/hsds/chunk_sn.py @@ -509,7 +509,6 @@ async def PUT_Value(request): else: arr = jsonToArray(np_shape, dset_dtype, json_data) - log.debug(f"jsonToArray returned: {arr}") if num_elements != np.prod(arr.shape): msg = f"expected {num_elements} elements, but got {np.prod(arr.shape)}" raise HTTPBadRequest(reason=msg) @@ -520,13 +519,13 @@ async def PUT_Value(request): arr_tmp[...] 
= arr arr = arr_tmp except ValueError: - log.warn(msg) + log.warn(f"ValueError: {msg}") raise HTTPBadRequest(reason=msg) except TypeError: - log.warn(msg) + log.warn(f"TypeError: {msg}") raise HTTPBadRequest(reason=msg) except IndexError: - log.warn(msg) + log.warn(f"IndexError: {msg}") raise HTTPBadRequest(reason=msg) log.debug(f"got json arr: {arr.shape}") else: diff --git a/hsds/dset_sn.py b/hsds/dset_sn.py index 26919178..a54829cc 100755 --- a/hsds/dset_sn.py +++ b/hsds/dset_sn.py @@ -801,12 +801,15 @@ async def POST_Dataset(request): shape_json["dims"] = dims rank = 1 elif isinstance(shape, str): - # only valid string value is H5S_NULL - if shape != "H5S_NULL": + # only valid string value is H5S_NULL or H5S_SCALAR + if shape == "H5S_NULL": + shape_json["class"] = "H5S_NULL" + elif shape == "H5S_SCALAR": + shape_json["class"] = "H5S_SCALAR" + else: msg = "POST Datset with invalid shape value" log.warn(msg) raise HTTPBadRequest(reason=msg) - shape_json["class"] = "H5S_NULL" elif isinstance(shape, list): if len(shape) == 0: shape_json["class"] = "H5S_SCALAR" diff --git a/hsds/util/arrayUtil.py b/hsds/util/arrayUtil.py index 5bd7e0ab..050c1f81 100644 --- a/hsds/util/arrayUtil.py +++ b/hsds/util/arrayUtil.py @@ -64,6 +64,8 @@ def toTuple(rank, data): else: return tuple(toTuple(rank - 1, x) for x in data) else: + if isinstance(data, str): + data = data.encode("utf8") return data @@ -93,6 +95,23 @@ def getNumElements(dims): return num_elements +def isVlen(dt): + """ + Return True if the type contains variable length elements + """ + is_vlen = False + if len(dt) > 1: + names = dt.names + for name in names: + if isVlen(dt[name]): + is_vlen = True + break + else: + if dt.metadata and "vlen" in dt.metadata: + is_vlen = True + return is_vlen + + def jsonToArray(data_shape, data_dtype, data_json): """ Return numpy array from the given json array. 
@@ -122,6 +141,8 @@ def fillVlenArray(rank, data, arr, index): converted_data = toTuple(np_shape_rank, data_json) data_json = converted_data else: + if isinstance(data_json, str): + data_json = data_json.encode("utf8") data_json = [data_json,] # listify if not (None in data_json): @@ -149,23 +170,6 @@ def fillVlenArray(rank, data, arr, index): return arr -def isVlen(dt): - """ - Return True if the type contains variable length elements - """ - is_vlen = False - if len(dt) > 1: - names = dt.names - for name in names: - if isVlen(dt[name]): - is_vlen = True - break - else: - if dt.metadata and "vlen" in dt.metadata: - is_vlen = True - return is_vlen - - def getElementSize(e, dt): """ Get number of byte needed to given element as a bytestream diff --git a/hsds/util/hdf5dtype.py b/hsds/util/hdf5dtype.py index 67119491..7a40ec11 100644 --- a/hsds/util/hdf5dtype.py +++ b/hsds/util/hdf5dtype.py @@ -339,12 +339,26 @@ def getTypeItem(dt, metadata=None): else: # Fixed length string type type_info["class"] = "H5T_STRING" - type_info["charSet"] = "H5T_CSET_ASCII" type_info["length"] = dt.itemsize + type_info["charSet"] = "H5T_CSET_ASCII" type_info["strPad"] = "H5T_STR_NULLPAD" elif dt.base.kind == "U": # Fixed length unicode type - raise TypeError("Fixed length unicode type is not supported") + print("fixed UTF, itemsize:", dt.itemsize) + ref_check = check_dtype(ref=dt.base) + if ref_check is not None: + raise TypeError("unexpected reference type") + + # Fixed length string type with unicode support + type_info["class"] = "H5T_STRING" + + # this can be problematic if the encoding of the string is not valid, + # or requires too many bytes. 
Use variable length strings to handle all + # UTF8 strings correctly + type_info["charSet"] = "H5T_CSET_UTF8" + # fixed length is the numpy itemsize (UTF32 byte count) + type_info["length"] = dt.itemsize + type_info["strPad"] = "H5T_STR_NULLPAD" elif dt.kind == "b": # boolean type - h5py stores as enum @@ -614,8 +628,9 @@ def createBaseDataType(typeItem): if typeItem["charSet"] == "H5T_CSET_ASCII": type_code = "S" elif typeItem["charSet"] == "H5T_CSET_UTF8": - msg = "fixed-width unicode strings are not supported" - raise TypeError(msg) + # use the same type_code as ascii strings + # (otherwise, numpy will reserve bytes for UTF32 representation) + type_code = "S" else: raise TypeError("unexpected 'charSet' value") # a fixed size string diff --git a/tests/integ/attr_test.py b/tests/integ/attr_test.py index 072d25f4..b4aa5f83 100644 --- a/tests/integ/attr_test.py +++ b/tests/integ/attr_test.py @@ -517,7 +517,7 @@ def testPutFixedStringNullTerm(self): def testPutVLenUTF8String(self): # Test PUT value for 1d attribute with fixed length UTF-8 string - print("testPutFixedUTF8String", self.base_domain) + print("testPutVLenUTF8String", self.base_domain) headers = helper.getRequestHeaders(domain=self.base_domain) req = self.endpoint + "/" @@ -531,46 +531,149 @@ def testPutVLenUTF8String(self): # create attr text = "I'm an UTF-8 null terminated string" - text_length = len(text) + 1 - fixed_str_type = { + + variable_str_type = { "charSet": "H5T_CSET_UTF8", "class": "H5T_STRING", - "length": text_length, + "length": "H5T_VARIABLE", "strPad": "H5T_STR_NULLTERM", } - variable_str_type = { + scalar_shape = {"class": "H5S_SCALAR"} + + data = {"type": variable_str_type, "shape": scalar_shape, "value": text} + attr_name = "str_attr" + req = self.endpoint + "/groups/" + root_uuid + "/attributes/" + attr_name + rsp = self.session.put(req, data=json.dumps(data), headers=headers) + self.assertEqual(rsp.status_code, 201) + + # read attr + rsp = self.session.get(req, headers=headers) + 
self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + self.assertTrue("hrefs" in rspJson) + self.assertTrue("value" in rspJson) + self.assertEqual(rspJson["value"], text) + self.assertTrue("type" in rspJson) + type_json = rspJson["type"] + self.assertTrue("class" in type_json) + self.assertEqual(type_json["class"], "H5T_STRING") + self.assertTrue("length" in type_json) + self.assertEqual(type_json["length"], "H5T_VARIABLE") + self.assertTrue("strPad" in type_json) + self.assertEqual(type_json["strPad"], "H5T_STR_NULLTERM") + self.assertTrue("charSet" in type_json) + self.assertEqual(type_json["charSet"], "H5T_CSET_UTF8") + + def testPutFixedUTF8String(self): + # Test PUT value for 1d attribute with fixed length UTF-8 string + print("testPutFixedUTF8String", self.base_domain) + + headers = helper.getRequestHeaders(domain=self.base_domain) + req = self.endpoint + "/" + + # Get root uuid + rsp = self.session.get(req, headers=headers) + self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + root_uuid = rspJson["root"] + helper.validateId(root_uuid) + + # create attr + text = "this is the chinese character for the number eight: \u516b" + + text_length = len(text) + 1 + fixed_str_type = { "charSet": "H5T_CSET_UTF8", "class": "H5T_STRING", - "length": "H5T_VARIABLE", + "length": text_length, "strPad": "H5T_STR_NULLTERM", } + scalar_shape = {"class": "H5S_SCALAR"} data = {"type": fixed_str_type, "shape": scalar_shape, "value": text} attr_name = "str_attr" req = self.endpoint + "/groups/" + root_uuid + "/attributes/" + attr_name - # Should fail since UTF8 with fixed width is not supported rsp = self.session.put(req, data=json.dumps(data), headers=headers) - self.assertEqual(rsp.status_code, 400) + self.assertEqual(rsp.status_code, 201) - data = {"type": variable_str_type, "shape": scalar_shape, "value": text} - attr_name = "str_attr" + # read attr + rsp = self.session.get(req, headers=headers) + self.assertEqual(rsp.status_code, 200) + 
rspJson = json.loads(rsp.text) + print(rspJson) + self.assertTrue("hrefs" in rspJson) + self.assertTrue("value" in rspJson) + self.assertEqual(rspJson["value"], text) + self.assertTrue("type" in rspJson) + type_json = rspJson["type"] + self.assertTrue("class" in type_json) + self.assertEqual(type_json["class"], "H5T_STRING") + self.assertTrue("length" in type_json) + self.assertEqual(type_json["length"], text_length) + self.assertTrue("strPad" in type_json) + self.assertEqual(type_json["strPad"], "H5T_STR_NULLTERM") + self.assertTrue("charSet" in type_json) + self.assertEqual(type_json["charSet"], "H5T_CSET_UTF8") + + def testPutFixedUTF8StringBinary(self): + # Test PUT value for 1d attribute with fixed length UTF-8 string in binary + print("testPutFixedUTF8StringBinary", self.base_domain) + + headers = helper.getRequestHeaders(domain=self.base_domain) + req = self.endpoint + "/" + + # Get root uuid + rsp = self.session.get(req, headers=headers) + self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + root_uuid = rspJson["root"] + helper.validateId(root_uuid) + + # create attr with json + character_text = "this is the chinese character for the number eight: \u516b" + + binary_text = bytearray(character_text, "UTF-8") + byte_length = len(binary_text) + + fixed_str_type = { + "charSet": "H5T_CSET_UTF8", + "class": "H5T_STRING", + "length": byte_length, # Null byte explicitly included + "strPad": "H5T_STR_NULLTERM", + } + + scalar_shape = {"class": "H5S_SCALAR"} + data = {"type": fixed_str_type, "shape": scalar_shape} + attr_name = "fixed_unicode_str_attr_binary" req = self.endpoint + "/groups/" + root_uuid + "/attributes/" + attr_name rsp = self.session.put(req, data=json.dumps(data), headers=headers) self.assertEqual(rsp.status_code, 201) + # write to attr in binary + attr_name = "fixed_unicode_str_attr_binary" + req = self.endpoint + "/groups/" + root_uuid + "/attributes/" + attr_name + "/value" + headers["Content-Type"] = 
"application/octet-stream" + rsp = self.session.put(req, data=binary_text, headers=headers) + self.assertEqual(rsp.status_code, 200) + # read attr + headers["Content-Type"] = "application/json" + req = self.endpoint + "/groups/" + root_uuid + "/attributes/" + attr_name + rsp = self.session.get(req, headers=headers) self.assertEqual(rsp.status_code, 200) rspJson = json.loads(rsp.text) + print(rspJson) self.assertTrue("hrefs" in rspJson) self.assertTrue("value" in rspJson) - self.assertEqual(rspJson["value"], text) + print(f"Retrieved UTF8 string: {rspJson['value']}") + self.assertEqual(rspJson["value"], character_text) self.assertTrue("type" in rspJson) type_json = rspJson["type"] self.assertTrue("class" in type_json) self.assertEqual(type_json["class"], "H5T_STRING") self.assertTrue("length" in type_json) - self.assertEqual(type_json["length"], "H5T_VARIABLE") + self.assertEqual(type_json["length"], byte_length) self.assertTrue("strPad" in type_json) self.assertEqual(type_json["strPad"], "H5T_STR_NULLTERM") self.assertTrue("charSet" in type_json) @@ -1302,7 +1405,7 @@ def testPutAttributeBinaryValue(self): rsp = self.session.put(req, data=data, headers=headers_bin_req) self.assertEqual(rsp.status_code, 200) - # try writing to few bytes, should fail + # try writing too few bytes, should fail data = bytearray(extent) for i in range(extent): data[i] = 255 diff --git a/tests/integ/dataset_test.py b/tests/integ/dataset_test.py index c9e1c18d..91b716b6 100755 --- a/tests/integ/dataset_test.py +++ b/tests/integ/dataset_test.py @@ -56,7 +56,7 @@ def testScalarDataset(self): helper.validateId(root_uuid) # create a dataset obj - data = {"type": "H5T_IEEE_F32LE"} + data = {"type": "H5T_IEEE_F32LE", "shape": "H5S_SCALAR"} req = self.endpoint + "/datasets" rsp = self.session.post(req, data=json.dumps(data), headers=headers) self.assertEqual(rsp.status_code, 201) @@ -207,6 +207,8 @@ def testScalarEmptyDimsDataset(self): helper.validateId(root_uuid) # create a dataset obj + # 
using an empty list for shape is equivalent to using + # "H5S_SCALAR" data = {"type": "H5T_IEEE_F32LE", "shape": []} req = self.endpoint + "/datasets" rsp = self.session.post(req, data=json.dumps(data), headers=headers) diff --git a/tests/integ/value_test.py b/tests/integ/value_test.py index 70f62efe..b25bd4c8 100755 --- a/tests/integ/value_test.py +++ b/tests/integ/value_test.py @@ -3407,6 +3407,181 @@ def testShapeReinitialization3D(self): else: self.assertEqual(n, 1) + def testPutFixedUTF8StringDataset(self): + # Test PUT value for scalar dataset with fixed length UTF-8 string + print("testPutFixedUTF8StringDataset", self.base_domain) + headers = helper.getRequestHeaders(domain=self.base_domain) + + # get domain + req = f"{self.endpoint}/" + rsp = self.session.get(req, headers=headers) + rspJson = json.loads(rsp.text) + self.assertTrue("root" in rspJson) + root_uuid = rspJson["root"] + req = helper.getEndpoint() + "/" + + # Get root uuid + rsp = self.session.get(req, headers=headers) + self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + self.assertTrue("root" in rspJson) + root_uuid = rspJson["root"] + helper.validateId(root_uuid) + + # create dataset + req = self.endpoint + "/datasets" + + text = "this is the chinese character for the number eight: \u516b" + + # size of datatype is in bytes + byte_data = bytearray(text, "UTF-8") + byte_length = len(byte_data) + + fixed_str_type = { + "charSet": "H5T_CSET_UTF8", + "class": "H5T_STRING", + "length": byte_length + 1, + "strPad": "H5T_STR_NULLTERM", + } + + data = {"type": fixed_str_type, "shape": "H5S_SCALAR"} + req = self.endpoint + "/datasets" + rsp = self.session.post(req, data=json.dumps(data), headers=headers) + self.assertEqual(rsp.status_code, 201) + rspJson = json.loads(rsp.text) + dset_uuid = rspJson["id"] + self.assertTrue(helper.validateId(dset_uuid)) + self.assertTrue("type" in rspJson) + type_json = rspJson["type"] + self.assertTrue("class" in type_json) + 
self.assertEqual(type_json["class"], "H5T_STRING") + self.assertTrue("length" in type_json) + self.assertEqual(type_json["length"], byte_length + 1) + self.assertTrue("strPad" in type_json) + self.assertEqual(type_json["strPad"], "H5T_STR_NULLTERM") + self.assertTrue("charSet" in type_json) + self.assertEqual(type_json["charSet"], "H5T_CSET_UTF8") + + # link new dataset + name = "fixed_utf8_str_dset" + req = self.endpoint + "/groups/" + root_uuid + "/links/" + name + payload = {"id": dset_uuid} + rsp = self.session.put(req, data=json.dumps(payload), headers=headers) + self.assertEqual(rsp.status_code, 201) + + # write fixed utf8 string to dset + data = {"value": text} + req = self.endpoint + "/datasets/" + dset_uuid + "/value" + rsp = self.session.put(req, data=json.dumps(data), headers=headers) + self.assertEqual(rsp.status_code, 200) + + # read value back from dset + rsp = self.session.get(req, headers=headers) + self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + self.assertTrue("value" in rspJson) + self.assertEqual(rspJson["value"], text) + + # write different utf8 string of same overall byte length + text = "this is the chinese character for the number eight: 888" + new_byte_length = len(bytearray(text, "UTF-8")) + self.assertEqual(byte_length, new_byte_length) + + data = {"value": text} + req = self.endpoint + "/datasets/" + dset_uuid + "/value" + rsp = self.session.put(req, data=json.dumps(data), headers=headers) + self.assertEqual(rsp.status_code, 200) + + # read value back from dset + rsp = self.session.get(req, headers=headers) + self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + self.assertTrue("value" in rspJson) + self.assertEqual(rspJson["value"], text) + + def testPutFixedUTF8StringDatasetBinary(self): + # Test PUT value for 1d attribute with fixed length UTF-8 string in binary + print("testPutFixedUTF8StringDatasetBinary", self.base_domain) + headers = helper.getRequestHeaders(domain=self.base_domain) + 
headers_bin_req = helper.getRequestHeaders(domain=self.base_domain) + headers_bin_req["Content-Type"] = "application/octet-stream" + headers_bin_rsp = helper.getRequestHeaders(domain=self.base_domain) + headers_bin_rsp["accept"] = "application/octet-stream" + + req = helper.getEndpoint() + "/" + + # Get root uuid + rsp = self.session.get(req, headers=headers) + self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + self.assertTrue("root" in rspJson) + root_uuid = rspJson["root"] + helper.validateId(root_uuid) + + # create dataset + req = self.endpoint + "/datasets" + + text = "this is the chinese character for the number eight: \u516b" + # size of datatype is in bytes + binary_text = bytearray(text, "UTF-8") + byte_length = len(binary_text) + + fixed_str_type = { + "charSet": "H5T_CSET_UTF8", + "class": "H5T_STRING", + "length": byte_length, + "strPad": "H5T_STR_NULLTERM", + } + + data = {"type": fixed_str_type, "shape": "H5S_SCALAR"} + req = self.endpoint + "/datasets" + rsp = self.session.post(req, data=json.dumps(data), headers=headers) + self.assertEqual(rsp.status_code, 201) + rspJson = json.loads(rsp.text) + dset_uuid = rspJson["id"] + self.assertTrue(helper.validateId(dset_uuid)) + + # link new dataset + name = "fixed_utf8_str_dset_binary" + req = self.endpoint + "/groups/" + root_uuid + "/links/" + name + payload = {"id": dset_uuid} + rsp = self.session.put(req, data=json.dumps(payload), headers=headers) + self.assertEqual(rsp.status_code, 201) + + # write fixed utf8 binary string to dset + req = self.endpoint + "/datasets/" + dset_uuid + "/value" + rsp = self.session.put(req, data=binary_text, headers=headers_bin_req) + self.assertEqual(rsp.status_code, 200) + + # read value back from dset as json + rsp = self.session.get(req, headers=headers) + self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + self.assertTrue("hrefs" in rspJson) + self.assertTrue("value" in rspJson) + self.assertEqual(rspJson["value"], text) + + 
# read value back as binary + rsp = self.session.get(req, headers=headers_bin_rsp) + self.assertEqual(rsp.status_code, 200) + self.assertEqual(rsp.text, text) + + # write different utf8 binary string of same overall byte length + text = "this is the chinese character for the number eight: 888" + binary_text = bytearray(text, "UTF-8") + new_byte_length = len(binary_text) + self.assertEqual(byte_length, new_byte_length) + + # read as JSON + req = self.endpoint + "/datasets/" + dset_uuid + "/value" + rsp = self.session.put(req, data=binary_text, headers=headers_bin_req) + self.assertEqual(rsp.status_code, 200) + + # read as binary + rsp = self.session.get(req, headers=headers_bin_rsp) + self.assertEqual(rsp.status_code, 200) + self.assertEqual(rsp.text, text) + if __name__ == "__main__": # setup test files diff --git a/tests/unit/array_util_test.py b/tests/unit/array_util_test.py index c734e045..dac0e395 100644 --- a/tests/unit/array_util_test.py +++ b/tests/unit/array_util_test.py @@ -116,9 +116,7 @@ def testJsonToArray(self): # compound type dt = np.dtype([("a", "i4"), ("b", "S5")]) - shape = [ - 2, - ] + shape = [2, ] data = [[4, "four"], [5, "five"]] out = jsonToArray(shape, dt, data) self.assertTrue(isinstance(out, np.ndarray)) @@ -131,9 +129,7 @@ def testJsonToArray(self): e1 = out[1].tolist() self.assertEqual(e1, (5, b"five")) - shape = [ - 1, - ] + shape = [1, ] data = [ [6, "six"], ] @@ -148,10 +144,8 @@ def testJsonToArray(self): # VLEN ascii dt = special_dtype(vlen=bytes) - data = [b"one", b"two", b"three", "four", b"five"] - shape = [ - 5, - ] + data = [b"one", b"two", b"three", b"four", b"five"] + shape = [5, ] out = jsonToArray(shape, dt, data) self.assertTrue("vlen" in out.dtype.metadata) self.assertEqual(out.dtype.metadata["vlen"], bytes) @@ -160,17 +154,15 @@ def testJsonToArray(self): # TBD: code does not actually enforce use of bytes vs. 
str, # probably not worth the effort to fix self.assertEqual(out[2], b"three") - self.assertEqual(out[3], "four") + self.assertEqual(out[3], b"four") # VLEN str dt = special_dtype(vlen=str) data = [ - ["part 1 - section A", "part 1 - section B"], - ["part 2 - section A", "part 2 - section B"], - ] - shape = [ - 2, + [b"part 1 - section A", b"part 1 - section B"], + [b"part 2 - section A", b"part 2 - section B"], ] + shape = [2,] out = jsonToArray(shape, dt, data) self.assertTrue("vlen" in out.dtype.metadata) self.assertEqual(out.dtype.metadata["vlen"], str) @@ -182,29 +174,22 @@ def testJsonToArray(self): # VLEN Scalar str dt = special_dtype(vlen=str) data = "I'm a string!" - shape = [ - 1, - ] + shape = [1, ] out = jsonToArray(shape, dt, data) # VLEN unicode dt = special_dtype(vlen=bytes) data = ["one", "two", "three", "four", "five"] - shape = [ - 5, - ] + shape = [5, ] out = jsonToArray(shape, dt, data) self.assertTrue("vlen" in out.dtype.metadata) self.assertEqual(out.dtype.metadata["vlen"], bytes) self.assertEqual(out.dtype.kind, "O") - # TBD: this should show up as bytes, but may not be worth the effort - self.assertEqual(out[2], "three") + self.assertEqual(out[2], b"three") # VLEN data dt = special_dtype(vlen=np.dtype("int32")) - shape = [ - 4, - ] + shape = [4, ] data = [ [1,], [1, 2], @@ -228,15 +213,11 @@ def testJsonToArray(self): shape = [2, 2] data = [ [ - [ - 0, - ], + [0,], [1, 2], ], [ - [ - 1, - ], + [1,], [2, 3], ], ] @@ -257,20 +238,16 @@ def testJsonToArray(self): vlen_type = {"class": "H5T_VLEN", "base": ref_type} dt = createDataType(vlen_type) # np datatype - id0 = "g-a4f455b2-c8cf-11e7-8b73-0242ac110009" - id1 = "g-a50af844-c8cf-11e7-8b73-0242ac110009" - id2 = "g-a5236276-c8cf-11e7-8b73-0242ac110009" + id0 = b"g-a4f455b2-c8cf-11e7-8b73-0242ac110009" + id1 = b"g-a50af844-c8cf-11e7-8b73-0242ac110009" + id2 = b"g-a5236276-c8cf-11e7-8b73-0242ac110009" data = [ - [ - id0, - ], + [id0, ], [id0, id1], [id0, id1, id2], ] - shape = [ - 3, - ] + shape 
= [3, ] out = jsonToArray(shape, dt, data) self.assertTrue(isinstance(out, np.ndarray)) base_type = check_dtype(vlen=out.dtype) @@ -513,12 +490,34 @@ def testJsonToBytes(self): # # VLEN int # + + def array_equal(a, b): + """ compare two values element by element.""" + if type(a) in (list, tuple, np.void, np.ndarray): + if len(a) != len(b): + print("number of elements doesn't match") + return False + nelements = len(a) + for i in range(nelements): + if not array_equal(a[i], b[i]): + return False + else: + # treat a string and bytes as equal if the utf-8 encoding + # of the string is equal to the byte encoding + if isinstance(a, str): + a = a.encode("utf8") + if isinstance(b, str): + b = b.encode("utf8") + if a != b: + print(f"{a} != {b}") + return False + + return True + dt = special_dtype(vlen=np.dtype("int32")) shape = [4,] data = [ - [ - 1, - ], + [1,], [1, 2], [1, 2, 3], [1, 2, 3, 4], @@ -573,10 +572,7 @@ def testJsonToBytes(self): # np.array_equal doesn't work for object arrays self.assertEqual(arr.dtype, arr_copy.dtype) self.assertEqual(arr.shape, arr_copy.shape) - for i in range(4): - e = arr[i] - e_copy = arr_copy[i] - self.assertTrue(np.array_equal(e, e_copy)) + self.assertTrue(array_equal(arr, arr_copy)) # # VLEN utf with array type @@ -602,10 +598,7 @@ def testJsonToBytes(self): self.assertEqual(arr.dtype, arr_copy.dtype) self.assertEqual(arr.shape, arr_copy.shape) - for i in range(4): - e = arr[i] - e_copy = arr_copy[i] - self.assertTrue(np.array_equal(e, e_copy)) + self.assertTrue(array_equal(e, e_copy)) # # VLEN ascii with array type @@ -631,10 +624,7 @@ def testJsonToBytes(self): self.assertEqual(arr.dtype, arr_copy.dtype) self.assertEqual(arr.shape, arr_copy.shape) - for i in range(4): - e = arr[i] - e_copy = arr_copy[i] - self.assertTrue(np.array_equal(e, e_copy)) + self.assertTrue(array_equal(e, e_copy)) def testIndexIterator(self): i = 0 diff --git a/tests/unit/hdf5_dtype_test.py b/tests/unit/hdf5_dtype_test.py index 63da67dc..5f0d5c86 100755 --- 
a/tests/unit/hdf5_dtype_test.py +++ b/tests/unit/hdf5_dtype_test.py @@ -91,12 +91,12 @@ def testBaseStringTypeItem(self): def testBaseStringUTFTypeItem(self): dt = np.dtype("U3") - try: - typeItem = hdf5dtype.getTypeItem(dt) - self.assertTrue(typeItem is not None) # avoid pyflakes error - self.assertTrue(False) # expected exception - except TypeError: - pass # expected + typeItem = hdf5dtype.getTypeItem(dt) + self.assertEqual(typeItem["class"], "H5T_STRING") + # type item length in bytes (may not actually be enough space for some UTF strings) + self.assertEqual(typeItem["length"], 12) + self.assertEqual(typeItem["strPad"], "H5T_STR_NULLPAD") + self.assertEqual(typeItem["charSet"], "H5T_CSET_UTF8") def testBaseVLenAsciiTypeItem(self): dt = special_dtype(vlen=bytes) @@ -388,13 +388,14 @@ def testCreateBaseStringType(self): self.assertEqual(typeSize, 6) def testCreateBaseUnicodeType(self): - typeItem = {"class": "H5T_STRING", "charSet": "H5T_CSET_UTF8", "length": 32} - try: - dt = hdf5dtype.createDataType(typeItem) - self.assertTrue(dt is not None) - self.assertTrue(False) # expected exception - except TypeError: - pass + typeItem = {"class": "H5T_STRING", "charSet": "H5T_CSET_UTF8", "length": 6} + + dt = hdf5dtype.createDataType(typeItem) + typeSize = hdf5dtype.getItemSize(typeItem) + self.assertTrue(dt is not None) + self.assertEqual(dt.name, "bytes48") + self.assertEqual(dt.kind, "S") + self.assertEqual(typeSize, 6) def testCreateNullTermStringType(self): typeItem = {