Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add support for fixed width UTF8 strings - #270 #278

Merged
merged 3 commits into from
Nov 3, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions hsds/attr_sn.py
Original file line number Diff line number Diff line change
Expand Up @@ -649,6 +649,7 @@ async def PUT_AttributeValue(request):
raise HTTPBadRequest(reason=msg)

np_shape = getShapeDims(attr_shape)
log.debug(f"np_shape: {np_shape}")
type_json = dn_json["type"]
np_dtype = createDataType(type_json) # np datatype

Expand Down Expand Up @@ -697,6 +698,10 @@ async def PUT_AttributeValue(request):
# convert to JSON for transmission to DN
data = arr.tolist()
value = bytesArrayToList(data)
if attr_shape["class"] == "H5S_SCALAR":
# just send the value, not a list
value = value[0]

else:
try:
body = await request.json()
Expand Down
7 changes: 3 additions & 4 deletions hsds/chunk_sn.py
Original file line number Diff line number Diff line change
Expand Up @@ -509,7 +509,6 @@ async def PUT_Value(request):
else:
arr = jsonToArray(np_shape, dset_dtype, json_data)

log.debug(f"jsonToArray returned: {arr}")
if num_elements != np.prod(arr.shape):
msg = f"expected {num_elements} elements, but got {np.prod(arr.shape)}"
raise HTTPBadRequest(reason=msg)
Expand All @@ -520,13 +519,13 @@ async def PUT_Value(request):
arr_tmp[...] = arr
arr = arr_tmp
except ValueError:
log.warn(msg)
log.warn(f"ValueError: {msg}")
raise HTTPBadRequest(reason=msg)
except TypeError:
log.warn(msg)
log.warn(f"TypeError: {msg}")
raise HTTPBadRequest(reason=msg)
except IndexError:
log.warn(msg)
log.warn(f"IndexError: {msg}")
raise HTTPBadRequest(reason=msg)
log.debug(f"got json arr: {arr.shape}")
else:
Expand Down
9 changes: 6 additions & 3 deletions hsds/dset_sn.py
Original file line number Diff line number Diff line change
Expand Up @@ -801,12 +801,15 @@ async def POST_Dataset(request):
shape_json["dims"] = dims
rank = 1
elif isinstance(shape, str):
# only valid string value is H5S_NULL
if shape != "H5S_NULL":
# the only valid string values are H5S_NULL and H5S_SCALAR
if shape == "H5S_NULL":
shape_json["class"] = "H5S_NULL"
elif shape == "H5S_SCALAR":
shape_json["class"] = "H5S_SCALAR"
else:
msg = "POST Datset with invalid shape value"
log.warn(msg)
raise HTTPBadRequest(reason=msg)
shape_json["class"] = "H5S_NULL"
elif isinstance(shape, list):
if len(shape) == 0:
shape_json["class"] = "H5S_SCALAR"
Expand Down
38 changes: 21 additions & 17 deletions hsds/util/arrayUtil.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,8 @@ def toTuple(rank, data):
else:
return tuple(toTuple(rank - 1, x) for x in data)
else:
if isinstance(data, str):
data = data.encode("utf8")
return data


Expand Down Expand Up @@ -93,6 +95,23 @@ def getNumElements(dims):
return num_elements


def isVlen(dt):
    """
    Return True if the type contains variable length elements

    dt is expected to be a numpy dtype.  For a compound (structured) dtype
    each field is examined recursively; for any other dtype the result is
    True when the dtype's metadata carries a "vlen" entry.
    """
    if dt.names:
        # Compound type: test the fields rather than the container.
        # Branching on dt.names (instead of len(dt) > 1) also covers a
        # compound type that has exactly one field.
        return any(isVlen(dt[name]) for name in dt.names)
    # Non-compound type: vlen-ness is signalled through the dtype metadata
    return bool(dt.metadata and "vlen" in dt.metadata)


def jsonToArray(data_shape, data_dtype, data_json):
"""
Return numpy array from the given json array.
Expand Down Expand Up @@ -122,6 +141,8 @@ def fillVlenArray(rank, data, arr, index):
converted_data = toTuple(np_shape_rank, data_json)
data_json = converted_data
else:
if isinstance(data_json, str):
data_json = data_json.encode("utf8")
data_json = [data_json,] # listify

if not (None in data_json):
Expand Down Expand Up @@ -149,23 +170,6 @@ def fillVlenArray(rank, data, arr, index):
return arr


def isVlen(dt):
    """
    Return True if the type contains variable length elements

    dt is expected to be a numpy dtype.  For a compound (structured) dtype
    each field is examined recursively; for any other dtype the result is
    True when the dtype's metadata carries a "vlen" entry.
    """
    if dt.names:
        # Compound type: test the fields rather than the container.
        # Branching on dt.names (instead of len(dt) > 1) also covers a
        # compound type that has exactly one field.
        return any(isVlen(dt[name]) for name in dt.names)
    # Non-compound type: vlen-ness is signalled through the dtype metadata
    return bool(dt.metadata and "vlen" in dt.metadata)


def getElementSize(e, dt):
"""
Get the number of bytes needed to represent the given element as a bytestream
Expand Down
23 changes: 19 additions & 4 deletions hsds/util/hdf5dtype.py
Original file line number Diff line number Diff line change
Expand Up @@ -339,12 +339,26 @@ def getTypeItem(dt, metadata=None):
else:
# Fixed length string type
type_info["class"] = "H5T_STRING"
type_info["charSet"] = "H5T_CSET_ASCII"
type_info["length"] = dt.itemsize
type_info["charSet"] = "H5T_CSET_ASCII"
type_info["strPad"] = "H5T_STR_NULLPAD"
elif dt.base.kind == "U":
# Fixed length unicode type
raise TypeError("Fixed length unicode type is not supported")
print("fixed UTF, itemsize:", dt.itemsize)
ref_check = check_dtype(ref=dt.base)
if ref_check is not None:
raise TypeError("unexpected reference type")

# Fixed length string type with unicode support
type_info["class"] = "H5T_STRING"

# this can be problematic if the encoding of the string is not valid,
# or requires too many bytes. Use variable length strings to handle all
# UTF8 strings correctly
type_info["charSet"] = "H5T_CSET_UTF8"
# convert from UTF32 length to a fixed length
type_info["length"] = dt.itemsize
type_info["strPad"] = "H5T_STR_NULLPAD"

elif dt.kind == "b":
# boolean type - h5py stores as enum
Expand Down Expand Up @@ -614,8 +628,9 @@ def createBaseDataType(typeItem):
if typeItem["charSet"] == "H5T_CSET_ASCII":
type_code = "S"
elif typeItem["charSet"] == "H5T_CSET_UTF8":
msg = "fixed-width unicode strings are not supported"
raise TypeError(msg)
# use the same type_code as ascii strings
# (otherwise, numpy will reserve bytes for UTF32 representation)
type_code = "S"
else:
raise TypeError("unexpected 'charSet' value")
# a fixed size string
Expand Down
129 changes: 116 additions & 13 deletions tests/integ/attr_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -517,7 +517,7 @@ def testPutFixedStringNullTerm(self):

def testPutVLenUTF8String(self):
# Test PUT value for scalar attribute with variable length UTF-8 string
print("testPutFixedUTF8String", self.base_domain)
print("testPutVLenUTF8String", self.base_domain)

headers = helper.getRequestHeaders(domain=self.base_domain)
req = self.endpoint + "/"
Expand All @@ -531,46 +531,149 @@ def testPutVLenUTF8String(self):

# create attr
text = "I'm an UTF-8 null terminated string"
text_length = len(text) + 1
fixed_str_type = {

variable_str_type = {
"charSet": "H5T_CSET_UTF8",
"class": "H5T_STRING",
"length": text_length,
"length": "H5T_VARIABLE",
"strPad": "H5T_STR_NULLTERM",
}
variable_str_type = {
scalar_shape = {"class": "H5S_SCALAR"}

data = {"type": variable_str_type, "shape": scalar_shape, "value": text}
attr_name = "str_attr"
req = self.endpoint + "/groups/" + root_uuid + "/attributes/" + attr_name
rsp = self.session.put(req, data=json.dumps(data), headers=headers)
self.assertEqual(rsp.status_code, 201)

# read attr
rsp = self.session.get(req, headers=headers)
self.assertEqual(rsp.status_code, 200)
rspJson = json.loads(rsp.text)
self.assertTrue("hrefs" in rspJson)
self.assertTrue("value" in rspJson)
self.assertEqual(rspJson["value"], text)
self.assertTrue("type" in rspJson)
type_json = rspJson["type"]
self.assertTrue("class" in type_json)
self.assertEqual(type_json["class"], "H5T_STRING")
self.assertTrue("length" in type_json)
self.assertEqual(type_json["length"], "H5T_VARIABLE")
self.assertTrue("strPad" in type_json)
self.assertEqual(type_json["strPad"], "H5T_STR_NULLTERM")
self.assertTrue("charSet" in type_json)
self.assertEqual(type_json["charSet"], "H5T_CSET_UTF8")

def testPutFixedUTF8String(self):
# Test PUT value for scalar attribute with fixed length UTF-8 string
print("testPutFixedUTF8String", self.base_domain)

headers = helper.getRequestHeaders(domain=self.base_domain)
req = self.endpoint + "/"

# Get root uuid
rsp = self.session.get(req, headers=headers)
self.assertEqual(rsp.status_code, 200)
rspJson = json.loads(rsp.text)
root_uuid = rspJson["root"]
helper.validateId(root_uuid)

# create attr
text = "this is the chinese character for the number eight: \u516b"

text_length = len(text) + 1
fixed_str_type = {
"charSet": "H5T_CSET_UTF8",
"class": "H5T_STRING",
"length": "H5T_VARIABLE",
"length": text_length,
"strPad": "H5T_STR_NULLTERM",
}

scalar_shape = {"class": "H5S_SCALAR"}
data = {"type": fixed_str_type, "shape": scalar_shape, "value": text}
attr_name = "str_attr"
req = self.endpoint + "/groups/" + root_uuid + "/attributes/" + attr_name
# Should fail since UTF8 with fixed width is not supported
rsp = self.session.put(req, data=json.dumps(data), headers=headers)
self.assertEqual(rsp.status_code, 400)
self.assertEqual(rsp.status_code, 201)

data = {"type": variable_str_type, "shape": scalar_shape, "value": text}
attr_name = "str_attr"
# read attr
rsp = self.session.get(req, headers=headers)
self.assertEqual(rsp.status_code, 200)
rspJson = json.loads(rsp.text)
print(rspJson)
self.assertTrue("hrefs" in rspJson)
self.assertTrue("value" in rspJson)
self.assertEqual(rspJson["value"], text)
self.assertTrue("type" in rspJson)
type_json = rspJson["type"]
self.assertTrue("class" in type_json)
self.assertEqual(type_json["class"], "H5T_STRING")
self.assertTrue("length" in type_json)
self.assertEqual(type_json["length"], text_length)
self.assertTrue("strPad" in type_json)
self.assertEqual(type_json["strPad"], "H5T_STR_NULLTERM")
self.assertTrue("charSet" in type_json)
self.assertEqual(type_json["charSet"], "H5T_CSET_UTF8")

def testPutFixedUTF8StringBinary(self):
# Test PUT value for scalar attribute with fixed length UTF-8 string in binary
print("testPutFixedUTF8StringBinary", self.base_domain)

headers = helper.getRequestHeaders(domain=self.base_domain)
req = self.endpoint + "/"

# Get root uuid
rsp = self.session.get(req, headers=headers)
self.assertEqual(rsp.status_code, 200)
rspJson = json.loads(rsp.text)
root_uuid = rspJson["root"]
helper.validateId(root_uuid)

# create attr with json
character_text = "this is the chinese character for the number eight: \u516b"

binary_text = bytearray(character_text, "UTF-8")
byte_length = len(binary_text)

fixed_str_type = {
"charSet": "H5T_CSET_UTF8",
"class": "H5T_STRING",
"length": byte_length, # Null byte explicitly included
"strPad": "H5T_STR_NULLTERM",
}

scalar_shape = {"class": "H5S_SCALAR"}
data = {"type": fixed_str_type, "shape": scalar_shape}
attr_name = "fixed_unicode_str_attr_binary"
req = self.endpoint + "/groups/" + root_uuid + "/attributes/" + attr_name
rsp = self.session.put(req, data=json.dumps(data), headers=headers)
self.assertEqual(rsp.status_code, 201)

# write to attr in binary
attr_name = "fixed_unicode_str_attr_binary"
req = self.endpoint + "/groups/" + root_uuid + "/attributes/" + attr_name + "/value"
headers["Content-Type"] = "application/octet-stream"
rsp = self.session.put(req, data=binary_text, headers=headers)
self.assertEqual(rsp.status_code, 200)

# read attr
headers["Content-Type"] = "application/json"
req = self.endpoint + "/groups/" + root_uuid + "/attributes/" + attr_name

rsp = self.session.get(req, headers=headers)
self.assertEqual(rsp.status_code, 200)
rspJson = json.loads(rsp.text)
print(rspJson)
self.assertTrue("hrefs" in rspJson)
self.assertTrue("value" in rspJson)
self.assertEqual(rspJson["value"], text)
print(f"Retrieved UTF8 string: {rspJson['value']}")
self.assertEqual(rspJson["value"], character_text)
self.assertTrue("type" in rspJson)
type_json = rspJson["type"]
self.assertTrue("class" in type_json)
self.assertEqual(type_json["class"], "H5T_STRING")
self.assertTrue("length" in type_json)
self.assertEqual(type_json["length"], "H5T_VARIABLE")
self.assertEqual(type_json["length"], byte_length)
self.assertTrue("strPad" in type_json)
self.assertEqual(type_json["strPad"], "H5T_STR_NULLTERM")
self.assertTrue("charSet" in type_json)
Expand Down Expand Up @@ -1302,7 +1405,7 @@ def testPutAttributeBinaryValue(self):
rsp = self.session.put(req, data=data, headers=headers_bin_req)
self.assertEqual(rsp.status_code, 200)

# try writing to few bytes, should fail
# try writing too few bytes, should fail
data = bytearray(extent)
for i in range(extent):
data[i] = 255
Expand Down
4 changes: 3 additions & 1 deletion tests/integ/dataset_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ def testScalarDataset(self):
helper.validateId(root_uuid)

# create a dataset obj
data = {"type": "H5T_IEEE_F32LE"}
data = {"type": "H5T_IEEE_F32LE", "shape": "H5S_SCALAR"}
req = self.endpoint + "/datasets"
rsp = self.session.post(req, data=json.dumps(data), headers=headers)
self.assertEqual(rsp.status_code, 201)
Expand Down Expand Up @@ -207,6 +207,8 @@ def testScalarEmptyDimsDataset(self):
helper.validateId(root_uuid)

# create a dataset obj
# using an empty list for shape is equivalent to using
# "H5S_SCALAR"
data = {"type": "H5T_IEEE_F32LE", "shape": []}
req = self.endpoint + "/datasets"
rsp = self.session.post(req, data=json.dumps(data), headers=headers)
Expand Down
Loading
Loading