From 2f33dca59211d6b9348f4bfcc64e517459b509a2 Mon Sep 17 00:00:00 2001 From: jreadey Date: Thu, 5 Oct 2023 18:18:19 -0700 Subject: [PATCH 01/17] fix async errors in getting dset layout --- hsds/async_lib.py | 20 +++++++++---- hsds/chunk_sn.py | 9 +++--- hsds/dset_dn.py | 8 +---- hsds/dset_sn.py | 15 +++++++--- hsds/util/dsetUtil.py | 60 ++++++++++++++++++------------------- tests/integ/dataset_test.py | 22 ++++++++++---- 6 files changed, 76 insertions(+), 58 deletions(-) diff --git a/hsds/async_lib.py b/hsds/async_lib.py index 92e788f5..9ebfa099 100755 --- a/hsds/async_lib.py +++ b/hsds/async_lib.py @@ -22,7 +22,7 @@ from .util.hdf5dtype import getItemSize, createDataType from .util.arrayUtil import getShapeDims, getNumElements, bytesToArray from .util.dsetUtil import getHyperslabSelection, getFilterOps, getChunkDims -from .util.dsetUtil import getDatasetLayoutClass, getDatasetCreationPropertyLayout +from .util.dsetUtil import getDatasetLayoutClass, getDatasetLayout from .util.storUtil import getStorKeys, putStorJSONObj, getStorJSONObj from .util.storUtil import deleteStorObj, getStorBytes, isStorObj @@ -79,9 +79,8 @@ async def updateDatasetInfo(app, dset_id, dataset_info, bucket=None): msg += f"for {dset_id}" log.warn(msg) return - layout = getDatasetCreationPropertyLayout(dset_json) msg = f"updateDatasetInfo - shape: {shape_json} type: {type_json} " - msg += f"item size: {item_size} layout: {layout}" + msg += f"item size: {item_size}" log.info(msg) dims = getShapeDims(shape_json) # returns None for HS_NULL dsets @@ -120,6 +119,7 @@ async def updateDatasetInfo(app, dset_id, dataset_info, bucket=None): linked_bytes = chunk_size * num_chunks num_linked_chunks = num_chunks elif layout_class == "H5D_CHUNKED_REF": + layout = getDatasetLayout(dset_json) if "chunks" not in layout: log.error("Expected to find 'chunks' key in H5D_CHUNKED_REF layout") return @@ -130,7 +130,7 @@ async def updateDatasetInfo(app, dset_id, dataset_info, bucket=None): linked_bytes += chunk_info[1] num_linked_chunks = len(chunks) elif layout_class == "H5D_CHUNKED_REF_INDIRECT": - log.debug("chunk ref indirect") + layout = getDatasetLayout(dset_json) if "chunk_table" not in layout: msg = "Expected to find chunk_table in dataset layout for " msg += f"{dset_id}" @@ -147,7 +147,7 @@ async def updateDatasetInfo(app, dset_id, dataset_info, bucket=None): msg += f"for {dset_id}" log.warn(msg) return - chunktable_layout = getDatasetCreationPropertyLayout(chunktable_json) + chunktable_layout = getDatasetLayout(chunktable_json) log.debug(f"chunktable_layout: {chunktable_layout}") if not isinstance(chunktable_layout, dict): log.warn(f"unexpected chunktable_layout: {chunktable_id}") @@ -234,7 +234,15 @@ async def updateDatasetInfo(app, dset_id, dataset_info, bucket=None): elif layout_class == "H5D_CHUNKED": msg = "updateDatasetInfo - no linked bytes/chunks for " msg += "H5D_CHUNKED layout" - log.debug(msg) + log.info(msg) + elif layout_class == "H5D_CONTIGUOUS": + msg = "updateDatasetInfo - no linked bytes/chunks for " + msg += "H5D_CONTIGUOUS layout" + log.info(msg) + elif layout_class == "H5D_COMPACT": + msg = "updateDatasetInfo - no linked bytes/chunks for " + msg += "H5D_COMPACT layout" + log.info(msg) else: log.error(f"unexpected chunk layout: {layout_class}") diff --git a/hsds/chunk_sn.py b/hsds/chunk_sn.py index 73f0e5ed..df4d7476 100755 --- a/hsds/chunk_sn.py +++ b/hsds/chunk_sn.py @@ -30,10 +30,9 @@ from .util.domainUtil import getDomainFromRequest, isValidDomain from .util.domainUtil import getBucketForDomain from 
.util.hdf5dtype import getItemSize, createDataType -from .util.dsetUtil import getSelectionList, isNullSpace, getDatasetLayoutClass +from .util.dsetUtil import getSelectionList, isNullSpace, getDatasetLayout, getDatasetLayoutClass from .util.dsetUtil import isExtensible, getSelectionPagination from .util.dsetUtil import getSelectionShape, getDsetMaxDims, getChunkLayout -from .util.dsetUtil import getDatasetCreationPropertyLayout from .util.chunkUtil import getNumChunks, getChunkIds, getChunkId from .util.chunkUtil import getChunkIndex, getChunkSuffix from .util.chunkUtil import getChunkCoverage, getDataCoverage @@ -177,7 +176,7 @@ def getChunkItem(chunkid): return chunk_item if layout_class == "H5D_CONTIGUOUS_REF": - layout = getDatasetCreationPropertyLayout(dset_json) + layout = getDatasetLayout(dset_json) log.debug(f"cpl layout: {layout}") s3path = layout["file_uri"] s3size = layout["size"] @@ -229,7 +228,7 @@ def getChunkItem(chunkid): chunk_item["s3offset"] = s3offset chunk_item["s3size"] = chunk_size elif layout_class == "H5D_CHUNKED_REF": - layout = getDatasetCreationPropertyLayout(dset_json) + layout = getDatasetLayout(dset_json) log.debug(f"cpl layout: {layout}") s3path = layout["file_uri"] chunks = layout["chunks"] @@ -248,7 +247,7 @@ def getChunkItem(chunkid): chunk_item["s3size"] = s3size elif layout_class == "H5D_CHUNKED_REF_INDIRECT": - layout = getDatasetCreationPropertyLayout(dset_json) + layout = getDatasetLayout(dset_json) log.debug(f"cpl layout: {layout}") if "chunk_table" not in layout: log.error("Expected to find chunk_table in dataset layout") diff --git a/hsds/dset_dn.py b/hsds/dset_dn.py index beac5a1b..e250bde9 100755 --- a/hsds/dset_dn.py +++ b/hsds/dset_dn.py @@ -273,13 +273,7 @@ async def PUT_DatasetShape(request): # e.g. another client has already extended the shape since the SN # verified it shape_update = body["shape"] - log.debug("shape_update: {}".format(shape_update)) - - for i in range(len(dims)): - if shape_update[i] < dims[i]: - msg = "Dataspace can not be made smaller" - log.warn(msg) - raise HTTPBadRequest(reason=msg) + log.debug(f"shape_update: {shape_update}") # Update the shape! 
for i in range(len(dims)): diff --git a/hsds/dset_sn.py b/hsds/dset_sn.py index e9e8729c..8f69a9fc 100755 --- a/hsds/dset_sn.py +++ b/hsds/dset_sn.py @@ -621,15 +621,22 @@ async def PUT_DatasetShape(request): msg = "Extent of update shape request does not match dataset sahpe" log.warn(msg) raise HTTPBadRequest(reason=msg) + shape_reduction = False for i in range(rank): if shape_update and shape_update[i] < dims[i]: - msg = "Dataspace can not be made smaller" - log.warn(msg) - raise HTTPBadRequest(reason=msg) + shape_reduction = True + if shape_update[i] < 0: + msg = "Extension dimension can not be made less than zero" + log.warn(msg) + raise HTTPBadRequest(reason=msg) if shape_update and maxdims[i] != 0 and shape_update[i] > maxdims[i]: - msg = "Database can not be extended past max extent" + msg = "Extension dimension can not be extended past max extent" log.warn(msg) raise HTTPConflict() + if shape_reduction: + log.info("Shape extent reduced for dataset") + # TBD - ensure any chunks that are outside the new shape region are + # deleted if extend_dim < 0 or extend_dim >= rank: msg = "Extension dimension must be less than rank and non-negative" log.warn(msg) diff --git a/hsds/util/dsetUtil.py b/hsds/util/dsetUtil.py index 9df8c0fe..da8dbdff 100644 --- a/hsds/util/dsetUtil.py +++ b/hsds/util/dsetUtil.py @@ -855,48 +855,46 @@ def isExtensible(dims, maxdims): return False -def getDatasetCreationPropertyLayout(dset_json): - """ return layout json from creation property list """ - cpl = None +def getDatasetLayout(dset_json): + """ Return layout json from creation property list or layout json """ + layout = None + if "creationProperties" in dset_json: cp = dset_json["creationProperties"] if "layout" in cp: - cpl = cp["layout"] - if not cpl and "layout" in dset_json: - # fallback to dset_json layout - cpl = dset_json["layout"] - if cpl is None: - log.warn(f"no layout found for {dset_json}") - return cpl + layout = cp["layout"] + if not layout and "layout" in dset_json: + layout = dset_json["layout"] + if not layout: + log.warn(f"no layout for {dset_json}") + return layout def getDatasetLayoutClass(dset_json): """ return layout class """ - chunk_layout = None - cp_layout = getDatasetCreationPropertyLayout(dset_json) - # check creation properties first - if cp_layout: - if "class" in cp_layout: - chunk_layout = cp_layout["class"] - # otherwise, get class prop from layout - if chunk_layout is None and "layout" in dset_json: - layout = dset_json["layout"] - if "class" in layout: - chunk_layout = layout["class"] - return chunk_layout + layout = getDatasetLayout(dset_json) + if layout and "class" in layout: + layout_class = layout["class"] + else: + layout_class = None + return layout_class def getChunkDims(dset_json): """ get chunk shape for given dset_json """ - cpl = getDatasetCreationPropertyLayout(dset_json) - if cpl and "dims" in cpl: - return cpl["dims"] - # otherwise, check the 'layout' key - if 'layout' in dset_json: - layout = dset_json["layout"] - if "dims" in layout: - return layout["dims"] - return None # not found + + layout = getDatasetLayout(dset_json) + if layout and "dims" in layout: + return layout["dims"] + else: + # H5D_COMPACT and H5D_CONTIGUOUS will not have a dims key + # Check the layout dict in dset_json to see if it's + # defined there + if "layout" in dset_json: + layout = dset_json["layout"] + if "dims" in layout: + return layout["dims"] + return None class ItemIterator: diff --git a/tests/integ/dataset_test.py b/tests/integ/dataset_test.py index 9a729e55..f42dcdb3 100755 
--- a/tests/integ/dataset_test.py +++ b/tests/integ/dataset_test.py @@ -681,11 +681,23 @@ def testResizableDataset(self): self.assertEqual(rsp.status_code, 201) rspJson = json.loads(rsp.text) + # verify updated-shape using the GET shape request + rsp = self.session.get(req, headers=headers) + self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + self.assertTrue("shape" in rspJson) + shape = rspJson["shape"] + self.assertEqual(shape["class"], "H5S_SIMPLE") + self.assertEqual(len(shape["dims"]), 1) + self.assertEqual(shape["dims"][0], 15) # increased to 15 + self.assertTrue("maxdims" in shape) + self.assertEqual(shape["maxdims"][0], 20) + # reduce the size to 5 elements - # payload = {"shape": 5} - # rsp = self.session.put(req, data=json.dumps(payload), headers=headers) - # self.assertEqual(rsp.status_code, 201) - # rspJson = json.loads(rsp.text) + payload = {"shape": 5} + rsp = self.session.put(req, data=json.dumps(payload), headers=headers) + self.assertEqual(rsp.status_code, 201) + rspJson = json.loads(rsp.text) # verify updated-shape using the GET shape request rsp = self.session.get(req, headers=headers) @@ -695,7 +707,7 @@ def testResizableDataset(self): shape = rspJson["shape"] self.assertEqual(shape["class"], "H5S_SIMPLE") self.assertEqual(len(shape["dims"]), 1) - self.assertEqual(shape["dims"][0], 15) # increased to 15 + self.assertEqual(shape["dims"][0], 5) # decreased to 5 self.assertTrue("maxdims" in shape) self.assertEqual(shape["maxdims"][0], 20) From 65609d19027026fc34f129884fdbfd9e820b5299 Mon Sep 17 00:00:00 2001 From: jreadey Date: Fri, 13 Oct 2023 09:25:43 -0700 Subject: [PATCH 02/17] broadcast support for SN PUT value --- hsds/chunk_crawl.py | 11 ++-- hsds/chunk_dn.py | 17 ++++- hsds/chunk_sn.py | 112 +++++++++++++++++++++++-------- hsds/dset_sn.py | 36 ++++++++-- hsds/servicenode_lib.py | 4 +- hsds/util/arrayUtil.py | 36 ++++++---- hsds/util/dsetUtil.py | 1 - tests/integ/value_test.py | 82 ++++++++++++++++++++++- tests/unit/array_util_test.py | 121 +++++++++++++++++++++++++++------- 9 files changed, 335 insertions(+), 85 deletions(-) diff --git a/hsds/chunk_crawl.py b/hsds/chunk_crawl.py index 96497e1c..b7d2ce22 100755 --- a/hsds/chunk_crawl.py +++ b/hsds/chunk_crawl.py @@ -309,16 +309,13 @@ async def read_chunk_hyperslab( # TBD: this needs to be fixed up for variable length dtypes nrows = len(array_data) // query_dtype.itemsize try: - chunk_arr = bytesToArray( - array_data, - query_dtype, - [ - nrows, - ], - ) + chunk_arr = bytesToArray(array_data, query_dtype, (nrows,)) except ValueError as ve: log.warn(f"bytesToArray ValueError: {ve}") raise HTTPBadRequest() + if chunk_arr.shape[0] != nrows: + log.error(f"expected chunk shape to be ({nrows},), but got {chunk_arr.shape[0]}") + raise HTTPInternalServerError() # save result to chunk_info # chunk results will be merged later chunk_info["query_rsp"] = chunk_arr diff --git a/hsds/chunk_dn.py b/hsds/chunk_dn.py index 3bb3fc7f..3fafd940 100644 --- a/hsds/chunk_dn.py +++ b/hsds/chunk_dn.py @@ -20,7 +20,7 @@ from aiohttp.web import json_response, StreamResponse from .util.httpUtil import request_read, getContentType -from .util.arrayUtil import bytesToArray, arrayToBytes +from .util.arrayUtil import bytesToArray, arrayToBytes, getShapeDims from .util.idUtil import getS3Key, validateInPartition, isValidUuid from .util.storUtil import isStorObj, deleteStorObj from .util.hdf5dtype import createDataType @@ -137,7 +137,7 @@ async def PUT_Chunk(request): if getChunkInitializer(dset_json): chunk_init = True elif 
query: - chunk_init = False # don't initalize new chunks on query update + chunk_init = False # don't initialize new chunks on query update else: chunk_init = True @@ -221,6 +221,8 @@ async def PUT_Chunk(request): else: # regular chunk update + broadcast = 0 # broadcast update + # check that the content_length is what we expect if itemsize != "H5T_VARIABLE": log.debug(f"expect content_length: {num_elements*itemsize}") @@ -229,10 +231,14 @@ async def PUT_Chunk(request): actual = request.content_length if itemsize != "H5T_VARIABLE": expected = num_elements * itemsize - if expected != actual: + if expected % actual != 0: msg = f"Expected content_length of: {expected}, but got: {actual}" log.error(msg) raise HTTPBadRequest(reason=msg) + else: + broadcast = expected // actual + if broadcast != 1: + log.info(f"broadcast chunk write: {broadcast}") # create a numpy array for incoming data input_bytes = await request_read(request) @@ -375,6 +381,8 @@ async def GET_Chunk(request): dset_id = getDatasetId(chunk_id) dset_json = await get_metadata_obj(app, dset_id, bucket=bucket) + shape_dims = getShapeDims(dset_json["shape"]) + log.debug(f"shape_dims: {shape_dims}") dims = getChunkLayout(dset_json) log.debug(f"GET_Chunk - got dims: {dims}") @@ -385,6 +393,9 @@ async def GET_Chunk(request): select = None # get slices for entire datashape if select is not None: log.debug(f"GET_Chunk - using select string: {select}") + else: + log.debug("GET_Chunk - no selection string") + try: selection = getSelectionList(select, dims) except ValueError as ve: diff --git a/hsds/chunk_sn.py b/hsds/chunk_sn.py index df4d7476..f5aa03e9 100755 --- a/hsds/chunk_sn.py +++ b/hsds/chunk_sn.py @@ -494,6 +494,7 @@ async def PUT_Value(request): log.warn(msg) raise HTTPBadRequest(reason=msg) log.info(f"append_rows: {append_rows}") + if append_rows: for key in ("start", "stop", "step"): if key in body: @@ -509,6 +510,7 @@ async def PUT_Value(request): log.warn(msg) raise HTTPBadRequest(reason=msg) log.info(f"append_dim: {append_dim}") + # get state for dataset from DN. 
dset_json = await getObjectJson(app, dset_id, bucket=bucket, refresh=False) @@ -624,6 +626,8 @@ async def PUT_Value(request): else: http_streaming = True + http_streaming = False # test + # body could also contain a point selection specifier if body and "points" in body: if append_rows: @@ -709,11 +713,13 @@ async def PUT_Value(request): log.warn(msg) raise # re-throw + """ if len(binary_data) != request.content_length: msg = f"Read {len(binary_data)} bytes, expecting: " msg += f"{request.content_length}" log.error(msg) raise HTTPBadRequest(reason=msg) + """ if append_rows: for i in range(rank): @@ -753,38 +759,87 @@ async def PUT_Value(request): raise HTTPBadRequest(reason=msg) arr = None # np array to hold request data - if binary_data and isinstance(item_size, int): - # binary, fixed item_size - if num_elements * item_size != len(binary_data): - msg = f"Expected: {num_elements*item_size} bytes, " - msg += f"but got: {len(binary_data)}, " - msg += f"num_elements: {num_elements}, item_size: {item_size}" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - if num_elements * item_size > max_request_size: - msg = f"read {num_elements*item_size} bytes, greater than {max_request_size}" - log.warn(msg) - arr = np.fromstring(binary_data, dtype=dset_dtype) - try: - arr = arr.reshape(np_shape) # conform to selection shape - except ValueError: - msg = "Bad Request: binary input data doesn't match selection" - log.warn(msg) - raise HTTPBadRequest(reason=msg) + if binary_data: + if item_size == "H5T_VARIABLE": + + # binary variable length data + try: + arr = bytesToArray(binary_data, dset_dtype, np_shape) + except ValueError as ve: + log.warn(f"bytesToArray value error: {ve}") + raise HTTPBadRequest() + + num_req_elements = getNumElements(arr.shape) + log.debug(f"binary variable data element count: {num_req_elements}") + else: + # fixed item size + if len(binary_data) % item_size != 0: + msg = f"Expected request size to be a multiple of {item_size}, " + msg += f"but {len(binary_data)} bytes received" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + + # check against max request size + if num_elements * item_size > max_request_size: + msg = f"read {num_elements*item_size} bytes, greater than {max_request_size}" + log.warn(msg) + + num_req_elements = len(binary_data) // item_size + + # if the req item count is less than expected, + # check to see if it is a broadcast request + broadcast_shape = None + if num_req_elements != num_elements and not append_rows: + broadcast_shape = [1,] + for ndim in range(rank): + if num_req_elements == np.prod(broadcast_shape): + break + np_shape_extent = np_shape[rank - 1 - ndim] + if ndim == 0: + broadcast_shape = [np_shape_extent,] + else: + broadcast_shape = [np_shape_extent].extend(broadcast_shape) + log.debug(f"trying broadcast_shape: {broadcast_shape}") + if len(broadcast_shape) == rank: + msg = f"Unexpected request size: {len(binary_data)}, " + msg += f"for num_elements: {num_elements} with item_size: {item_size}" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + + # read bytes into a one-dimensional numpy array + if item_size != "H5T_VARIABLE": + """ + # binary variable length data + try: + arr = bytesToArray(binary_data, dset_dtype, (num_elements,)) + except ValueError as ve: + log.warn(f"Unable to parse variable length data: {ve}") + raise HTTPBadRequest() + """ + arr = np.fromstring(binary_data, dtype=dset_dtype) + + if broadcast_shape: + log.info(f"broadcasting from {broadcast_shape} to {np_shape}") + arr = arr.reshape(broadcast_shape) + tmp_arr = 
np.zeros(np_shape, dtype=dset_dtype) + tmp_arr[...] = arr + arr = tmp_arr + else: + try: + arr = arr.reshape(np_shape) # conform to selection shape + except ValueError: + msg = "Bad Request: binary input data doesn't match selection" + log.warn(msg) + raise HTTPBadRequest(reason=msg) msg = f"PUT value - numpy array shape: {arr.shape} dtype: {arr.dtype}" log.debug(msg) - elif binary_data and item_size == "H5T_VARIABLE": - # binary variable length data - try: - arr = bytesToArray(binary_data, dset_dtype, np_shape) - except ValueError as ve: - log.warn(f"bytesToArray value error: {ve}") - raise HTTPBadRequest() + elif request_type == "json": # get array from json input try: msg = "input data doesn't match selection" - arr = jsonToArray(np_shape, dset_dtype, json_data) + # only enable broadcast if not appending + arr = jsonToArray(np_shape, dset_dtype, json_data, broadcast=(False if append_rows else True)) except ValueError: log.warn(msg) raise HTTPBadRequest(reason=msg) @@ -1051,7 +1106,10 @@ async def GET_Value(request): bucket = getBucketForDomain(domain) # get state for dataset from DN. - dset_json = await getObjectJson(app, dset_id, bucket=bucket) + # Note - refreshShape will do a refresh if the dataset is extensible + # i.e. we need to make sure we have the correct shape dimensions + # + dset_json = await getObjectJson(app, dset_id, bucket=bucket, refresh=True) type_json = dset_json["type"] dset_dtype = createDataType(type_json) diff --git a/hsds/dset_sn.py b/hsds/dset_sn.py index 8f69a9fc..9e82a690 100755 --- a/hsds/dset_sn.py +++ b/hsds/dset_sn.py @@ -21,10 +21,10 @@ from .util.httpUtil import http_post, http_put, http_delete, getHref, respJsonAssemble from .util.httpUtil import jsonResponse from .util.idUtil import isValidUuid, getDataNodeUrl, createObjId, isSchema2Id -from .util.dsetUtil import getPreviewQuery, getFilterItem +from .util.dsetUtil import getPreviewQuery, getFilterItem, getChunkLayout from .util.arrayUtil import getNumElements, getShapeDims, getNumpyValue from .util.chunkUtil import getChunkSize, guessChunk, expandChunk, shrinkChunk -from .util.chunkUtil import getContiguousLayout +from .util.chunkUtil import getContiguousLayout, getChunkIds from .util.authUtil import getUserPasswordFromRequest, aclCheck from .util.authUtil import validateUserPassword from .util.domainUtil import getDomainFromRequest, getPathForDomain, isValidDomain @@ -621,6 +621,7 @@ async def PUT_DatasetShape(request): msg = "Extent of update shape request does not match dataset sahpe" log.warn(msg) raise HTTPBadRequest(reason=msg) + shape_reduction = False for i in range(rank): if shape_update and shape_update[i] < dims[i]: @@ -633,14 +634,34 @@ async def PUT_DatasetShape(request): msg = "Extension dimension can not be extended past max extent" log.warn(msg) raise HTTPConflict() - if shape_reduction: - log.info("Shape extent reduced for dataset") - # TBD - ensure any chunks that are outside the new shape region are - # deleted + if extend_dim < 0 or extend_dim >= rank: msg = "Extension dimension must be less than rank and non-negative" log.warn(msg) raise HTTPBadRequest(reason=msg) + + if shape_reduction: + log.info(f"Shape extent reduced for dataset (rank: {rank})") + + # need to re-initialize any values that are now outside the shape + layout = getChunkLayout(dset_json) + log.debug(f"got layout: {layout}") + for n in range(rank): + if dims[n] <= shape_update[i]: + log.debug(f"skip dimension {n}") + continue + log.debug(f"reinitialize for dimension: {n}") + slices = [] + for m in range(rank): + if 
m == n: + s = slice(shape_update[m], dims[m], 1) + else: + # just select the entire extent + s = slice(0, dims[m]) + slices.append(s) + log.debug(f"shape_reinitialize - got slices: {slices} for dimension: {n}") + chunk_ids = getChunkIds(dset_id, slices, layout) + log.debug(f"got chunkIds: {chunk_ids}") # send request onto DN req = getDataNodeUrl(app, dset_id) + "/datasets/" + dset_id + "/shape" @@ -664,6 +685,9 @@ async def PUT_DatasetShape(request): log.warn("got 409 extending dataspace") raise + + + resp = await jsonResponse(request, json_resp, status=201) log.response(request, resp=resp) return resp diff --git a/hsds/servicenode_lib.py b/hsds/servicenode_lib.py index 3e1e2946..2f75284c 100644 --- a/hsds/servicenode_lib.py +++ b/hsds/servicenode_lib.py @@ -203,7 +203,7 @@ async def getObjectJson( if bucket: params["bucket"] = bucket req += "/" + collection + "/" + obj_id - + log.debug(f"getObjectJson - fetching {obj_id} from {req}") # throws 404 if doesn't exist obj_json = await http_get(app, req, params=params) meta_cache[obj_id] = obj_json @@ -211,7 +211,7 @@ async def getObjectJson( msg = f"Object: {obj_id} not found, req: {req}, params: {params}" log.warn(msg) raise HTTPNotFound() - + return obj_json diff --git a/hsds/util/arrayUtil.py b/hsds/util/arrayUtil.py index 0fe8a8f4..0f6b7099 100644 --- a/hsds/util/arrayUtil.py +++ b/hsds/util/arrayUtil.py @@ -93,6 +93,8 @@ def getNumElements(dims): return num_elements + + def getShapeDims(shape): """ Get dims from a given shape json. Return [1,] for Scalar datasets, @@ -131,7 +133,7 @@ def getShapeDims(shape): return dims -def jsonToArray(data_shape, data_dtype, data_json): +def jsonToArray(data_shape, data_dtype, data_json, broadcast=False): """ Return numpy array from the given json array. """ @@ -143,7 +145,7 @@ def fillVlenArray(rank, data, arr, index): arr[index] = data[i] index += 1 return index - + # need some special conversion for compound types -- # each element must be a tuple, but the JSON decoder # gives us a list instead. @@ -160,9 +162,7 @@ def fillVlenArray(rank, data, arr, index): converted_data = toTuple(np_shape_rank, data_json) data_json = converted_data else: - data_json = [ - data_json, - ] # listify + data_json = [data_json,] # listify if not (None in data_json): if isVlen(data_dtype): @@ -178,9 +178,17 @@ def fillVlenArray(rank, data, arr, index): # allow if the array is a scalar and the selection shape is one element, # numpy is ok with this if arr.size != npoints: - msg = "Input data doesn't match selection number of elements" - msg += f" Expected {npoints}, but received: {arr.size}" - raise ValueError(msg) + if broadcast: + # try to broadcast to the target shape + # if it fails, a ValueError exception will be raised + arr_tgt = np.zeros(data_shape, dtype=data_dtype) + arr_tgt[...] = arr + # worked! 
use arr_tgt as arr + arr = arr_tgt + else: + msg = "Input data doesn't match selection number of elements" + msg += f" Expected {npoints}, but received: {arr.size}" + raise ValueError(msg) if arr.shape != data_shape: arr = arr.reshape(data_shape) # reshape to match selection else: @@ -368,10 +376,11 @@ def copyElement(e, dt, buffer, offset): return offset -def getElementCount(buffer, offset): +def getElementCount(buffer, offset=0): """ Get the count value from persisted vlen array """ + n = offset m = offset + 4 count_bytes = bytes(buffer[n:m]) @@ -425,7 +434,7 @@ def readElement(buffer, offset, arr, index, dt): offset = readElement(buffer, offset, e, i, dt) e.reshape(dt.shape) else: - count = getElementCount(buffer, offset) + count = getElementCount(buffer, offset=offset) offset += 4 n = offset m = offset + count @@ -472,17 +481,18 @@ def bytesToArray(data, dt, shape): """ Create numpy array based on byte representation """ - # print(f"bytesToArray({len(data)}, {dt}, {shape}") - nelements = getNumElements(shape) if not isVlen(dt): # regular numpy from string arr = np.frombuffer(data, dtype=dt) else: + nelements = getNumElements(shape) + arr = np.zeros((nelements,), dtype=dt) offset = 0 for index in range(nelements): offset = readElement(data, offset, arr, index, dt) - arr = arr.reshape(shape) + if shape is not None: + arr = arr.reshape(shape) # check that we can update the array if needed # Note: this seems to have been required starting with numpuy v 1.17 # Setting the flag directly is not recommended. diff --git a/hsds/util/dsetUtil.py b/hsds/util/dsetUtil.py index da8dbdff..79bfa5e7 100644 --- a/hsds/util/dsetUtil.py +++ b/hsds/util/dsetUtil.py @@ -843,7 +843,6 @@ def isExtensible(dims, maxdims): """ if maxdims is None or len(dims) == 0: return False - log.debug(f"isExtensible - dims: {dims} maxdims: {maxdims}") rank = len(dims) if len(maxdims) != rank: raise ValueError("rank of maxdims does not match dataset") diff --git a/tests/integ/value_test.py b/tests/integ/value_test.py index 9588a66e..c27600b7 100755 --- a/tests/integ/value_test.py +++ b/tests/integ/value_test.py @@ -3063,9 +3063,7 @@ def testIntelligentRangeGet(self): req = self.endpoint + "/datasets/" + dset_id + "/value" start = 1234567 stop = start + 10 - params = { - "select": f"[{start}:{stop}]" - } # read 10 element, starting at index 1234567 + params = {"select": f"[{start}:{stop}]"} # read 10 element, starting at index 1234567 params["nonstrict"] = 1 # enable SN to invoke lambda func # read the selection @@ -3078,6 +3076,7 @@ def testIntelligentRangeGet(self): # should get one element back self.assertEqual(len(value), 10) self.assertEqual(value, list(range(start, start + 10))) + def testLargeCreationProperties(self): # test Dataset with artifically large creation_properties data @@ -3141,6 +3140,83 @@ def testLargeCreationProperties(self): self.assertEqual(ret_values[i], 24) self.assertEqual(ret_values[i + 5], 42) + def testValueReinitialization(self): + # Test the dataset values get reset after a reduction and resize + + print("testValueReinitialization", self.base_domain) + headers = helper.getRequestHeaders(domain=self.base_domain) + + + # get domain + req = f"{self.endpoint}/" + rsp = self.session.get(req, headers=headers) + rspJson = json.loads(rsp.text) + self.assertTrue("root" in rspJson) + root_uuid = rspJson["root"] + + # create the dataset + req = f"{self.endpoint}/datasets" + payload = {"type": "H5T_STD_I32LE", "shape": 10, "maxdims": 10} + payload["creationProperties"] = {"fillValue": 42} + req = 
self.endpoint + "/datasets" + rsp = self.session.post(req, data=json.dumps(payload), headers=headers) + self.assertEqual(rsp.status_code, 201) # create dataset + rspJson = json.loads(rsp.text) + dset_uuid = rspJson["id"] + self.assertTrue(helper.validateId(dset_uuid)) + + # link new dataset as 'dset' + name = "dset" + req = f"{self.endpoint}/groups/{root_uuid}/links/{name}" + payload = {"id": dset_uuid} + rsp = self.session.put(req, data=json.dumps(payload), headers=headers) + self.assertEqual(rsp.status_code, 201) + + # write to the dset + req = f"{self.endpoint}/datasets/{dset_uuid}/value" + data = list(range(10)) # write 0-9 + payload = {"value": data[0:10]} + params = {"select": "[0:10]"} + + rsp = self.session.put(req, data=json.dumps(payload), params=params, headers=headers) + self.assertEqual(rsp.status_code, 200) + + # read back the data + rsp = self.session.get(req, headers=headers) + self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + self.assertTrue("value" in rspJson) + self.assertEqual(rspJson["value"], data) + + # resize the dataset to 5 elements + req =f"{self.endpoint}/datasets/{dset_uuid}/shape" + payload = {"shape": 5} + rsp = self.session.put(req, data=json.dumps(payload), headers=headers) + self.assertEqual(rsp.status_code, 201) + rspJson = json.loads(rsp.text) + + # read back the remaining elements + req = f"{self.endpoint}/datasets/{dset_uuid}/value" + rsp = self.session.get(req, headers=headers) + self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + self.assertTrue("value" in rspJson) + self.assertEqual(rspJson["value"], data[:5]) + + # resize back to 10 + req =f"{self.endpoint}/datasets/{dset_uuid}/shape" + payload = {"shape": 10} + rsp = self.session.put(req, data=json.dumps(payload), headers=headers) + self.assertEqual(rsp.status_code, 201) + rspJson = json.loads(rsp.text) + + # read all 10 data values + req = f"{self.endpoint}/datasets/{dset_uuid}/value" + rsp = self.session.get(req, headers=headers) + self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + self.assertTrue("value" in rspJson) + ret_value = rspJson["value"] if __name__ == "__main__": # setup test files diff --git a/tests/unit/array_util_test.py b/tests/unit/array_util_test.py index da7b8eef..d11979df 100644 --- a/tests/unit/array_util_test.py +++ b/tests/unit/array_util_test.py @@ -205,9 +205,7 @@ def testJsonToArray(self): 4, ] data = [ - [ - 1, - ], + [1,], [1, 2], [1, 2, 3], [1, 2, 3, 4], @@ -292,6 +290,95 @@ def testJsonToArray(self): self.assertTrue(isinstance(e, tuple)) self.assertEqual(e, (id0, id1, id2)) + + def testJsonToArrayBroadcast(self): + dt = np.dtype("i4") + shape = [10,] + data = [42,] + out = jsonToArray(shape, dt, data, broadcast=True) + + self.assertTrue(isinstance(out, np.ndarray)) + self.assertEqual(out.shape, (10,)) + for i in range(10): + self.assertEqual(out[i], 42) + + # compound type + dt = np.dtype([("a", "i4"), ("b", "S5")]) + shape = [10,] + data = [[6, "six"],] + out = jsonToArray(shape, dt, data, broadcast=True) + + self.assertTrue(isinstance(out, np.ndarray)) + self.assertEqual(out.shape, (10,)) + for i in range(10): + e = out[i] + self.assertEqual(e[0], 6) + self.assertEqual(e[1], b'six') + + # VLEN str + dt = special_dtype(vlen=str) + data = ["hello",] + + shape = [10,] + + out = jsonToArray(shape, dt, data, broadcast=True) + self.assertTrue("vlen" in out.dtype.metadata) + self.assertEqual(out.dtype.metadata["vlen"], str) + self.assertEqual(out.dtype.kind, "O") + self.assertEqual(out.shape, (10,)) + 
for i in range(10): + e = out[i] + self.assertEqual(out[0], data[0]) + + # two dimensional target + dt = np.dtype("i4") + shape = [10,2] + data = [42,] + out = jsonToArray(shape, dt, data, broadcast=True) + + self.assertTrue(isinstance(out, np.ndarray)) + self.assertEqual(out.shape, (10,2)) + for i in range(10): + for j in range(2): + self.assertEqual(out[i,j], 42) + + dt = np.dtype("i4") + shape = [10,2] + data = [69,96] + out = jsonToArray(shape, dt, data, broadcast=True) + + self.assertTrue(isinstance(out, np.ndarray)) + self.assertEqual(out.shape, (10,2)) + for i in range(10): + self.assertEqual(out[i,0], 69) + self.assertEqual(out[i,1], 96) + + # three dimensional target + dt = np.dtype("i4") + shape = [10, 3, 2] + data = [[0,1],[2,3],[4,5]] + out = jsonToArray(shape, dt, data, broadcast=True) + + self.assertTrue(isinstance(out, np.ndarray)) + self.assertEqual(out.shape, (10, 3, 2)) + for i in range(10): + for j in range(3): + for k in range(2): + self.assertEqual(out[i,j,k], j * 2 + k) + + + # verify ValueError returning if broadcast rules don't apply + dt = np.dtype("i4") + shape = [5,] + data = [1, 2] + + try: + jsonToArray(shape, dt, data, broadcast=True) + self.assertTrue(False) + except ValueError: + pass # expected + + def testToBytes(self): # Simple array dt = np.dtype(" Date: Mon, 16 Oct 2023 10:46:02 -0700 Subject: [PATCH 03/17] broadcast support --- hsds/chunk_sn.py | 133 ++++++++++++++++++-------------------- hsds/dset_sn.py | 9 +-- hsds/servicenode_lib.py | 2 +- hsds/util/arrayUtil.py | 6 +- tests/integ/value_test.py | 19 +++--- 5 files changed, 80 insertions(+), 89 deletions(-) diff --git a/hsds/chunk_sn.py b/hsds/chunk_sn.py index f5aa03e9..e58baa17 100755 --- a/hsds/chunk_sn.py +++ b/hsds/chunk_sn.py @@ -422,6 +422,8 @@ async def PUT_Value(request): params = request.rel_url.query append_rows = None # this is a append update or not append_dim = 0 + num_elements = None + element_count = None if "append" in params and params["append"]: try: append_rows = int(params["append"]) @@ -450,6 +452,15 @@ async def PUT_Value(request): raise HTTPBadRequest(reason=msg) query = params["query"] + if "element_count" in params: + try: + element_count = int(params["element_count"]) + except ValueError: + msg = "invalid element_count" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + log.debug(f"element_count param: {element_count}") + dset_id = request.match_info.get("id") if not dset_id: msg = "Missing dataset id" @@ -494,7 +505,7 @@ async def PUT_Value(request): log.warn(msg) raise HTTPBadRequest(reason=msg) log.info(f"append_rows: {append_rows}") - + if append_rows: for key in ("start", "stop", "step"): if key in body: @@ -510,7 +521,6 @@ async def PUT_Value(request): log.warn(msg) raise HTTPBadRequest(reason=msg) log.info(f"append_dim: {append_dim}") - # get state for dataset from DN. 
dset_json = await getObjectJson(app, dset_id, bucket=bucket, refresh=False) @@ -713,14 +723,6 @@ async def PUT_Value(request): log.warn(msg) raise # re-throw - """ - if len(binary_data) != request.content_length: - msg = f"Read {len(binary_data)} bytes, expecting: " - msg += f"{request.content_length}" - log.error(msg) - raise HTTPBadRequest(reason=msg) - """ - if append_rows: for i in range(rank): if i == append_dim: @@ -748,29 +750,44 @@ async def PUT_Value(request): np_shape = getSelectionShape(slices) else: # point update - np_shape = (num_points,) + np_shape = [num_points,] log.debug(f"selection shape: {np_shape}") - num_elements = getNumElements(np_shape) - log.debug(f"selection num elements: {num_elements}") - if num_elements <= 0: + if np.prod(np_shape) == 0: msg = "Selection is empty" log.warn(msg) raise HTTPBadRequest(reason=msg) + if element_count is not None: + # if this is set to something other than the number of + # elements in np_shape, should be a value that can + # be used for broadcasting + for n in range(rank): + msg = f"{element_count} vs np.prod({np_shape[:n+1]}): {np.prod(np_shape[:(n+1)])}" + log.debug(msg) + if element_count == np.prod(np_shape) // np.prod(np_shape[:(n + 1)]): + num_elements = element_count + log.debug(f"broadcast with: {element_count} elements is valid ") + break + if num_elements is None: + # this never got set, so element count must be invalid for this shape + msg = f"element_count {element_count} not compatible with selection shape: {np_shape}" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + else: + # set num_elements based on selection shape + num_elements = getNumElements(np_shape) + log.debug(f"selection num elements: {num_elements}") + arr = None # np array to hold request data if binary_data: if item_size == "H5T_VARIABLE": - # binary variable length data try: - arr = bytesToArray(binary_data, dset_dtype, np_shape) + arr = bytesToArray(binary_data, dset_dtype, [num_elements,]) except ValueError as ve: log.warn(f"bytesToArray value error: {ve}") raise HTTPBadRequest() - - num_req_elements = getNumElements(arr.shape) - log.debug(f"binary variable data element count: {num_req_elements}") else: # fixed item size if len(binary_data) % item_size != 0: @@ -778,68 +795,46 @@ async def PUT_Value(request): msg += f"but {len(binary_data)} bytes received" log.warn(msg) raise HTTPBadRequest(reason=msg) - + + if len(binary_data) // item_size != num_elements: + msg = f"expected {item_size * num_elements} bytes but got {len(binary_data)}" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + # check against max request size if num_elements * item_size > max_request_size: msg = f"read {num_elements*item_size} bytes, greater than {max_request_size}" log.warn(msg) - num_req_elements = len(binary_data) // item_size - - # if the req item count is less than expected, - # check to see if it is a broadcast request - broadcast_shape = None - if num_req_elements != num_elements and not append_rows: - broadcast_shape = [1,] - for ndim in range(rank): - if num_req_elements == np.prod(broadcast_shape): - break - np_shape_extent = np_shape[rank - 1 - ndim] - if ndim == 0: - broadcast_shape = [np_shape_extent,] - else: - broadcast_shape = [np_shape_extent].extend(broadcast_shape) - log.debug(f"trying broadcast_shape: {broadcast_shape}") - if len(broadcast_shape) == rank: - msg = f"Unexpected request size: {len(binary_data)}, " - msg += f"for num_elements: {num_elements} with item_size: {item_size}" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - - # read bytes 
into a one-dimensional numpy array - if item_size != "H5T_VARIABLE": - """ - # binary variable length data - try: - arr = bytesToArray(binary_data, dset_dtype, (num_elements,)) - except ValueError as ve: - log.warn(f"Unable to parse variable length data: {ve}") - raise HTTPBadRequest() - """ arr = np.fromstring(binary_data, dtype=dset_dtype) - - if broadcast_shape: - log.info(f"broadcasting from {broadcast_shape} to {np_shape}") - arr = arr.reshape(broadcast_shape) - tmp_arr = np.zeros(np_shape, dtype=dset_dtype) - tmp_arr[...] = arr - arr = tmp_arr - else: - try: - arr = arr.reshape(np_shape) # conform to selection shape - except ValueError: - msg = "Bad Request: binary input data doesn't match selection" - log.warn(msg) - raise HTTPBadRequest(reason=msg) + log.debug(f"read fixed type array: {arr}") + + if element_count is not None: + # broad cast data into numpy array + arr_tmp = np.zeros(np_shape, dtype=dset_dtype) + arr_tmp[...] = arr + arr = arr_tmp + try: + arr = arr.reshape(np_shape) # conform to selection shape + except ValueError: + msg = "Bad Request: binary input data doesn't match selection" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + msg = f"PUT value - numpy array shape: {arr.shape} dtype: {arr.dtype}" log.debug(msg) - + elif request_type == "json": # get array from json input try: msg = "input data doesn't match selection" # only enable broadcast if not appending - arr = jsonToArray(np_shape, dset_dtype, json_data, broadcast=(False if append_rows else True)) + if num_elements < np.prod(np_shape): + broadcast = True + else: + broadcast = False + log.debug(f"np_shape: {np_shape}, broadcast: {broadcast}") + arr = jsonToArray(np_shape, dset_dtype, json_data, broadcast=broadcast) except ValueError: log.warn(msg) raise HTTPBadRequest(reason=msg) @@ -1108,7 +1103,7 @@ async def GET_Value(request): # get state for dataset from DN. # Note - refreshShape will do a refresh if the dataset is extensible # i.e. 
we need to make sure we have the correct shape dimensions - # + dset_json = await getObjectJson(app, dset_id, bucket=bucket, refresh=True) type_json = dset_json["type"] dset_dtype = createDataType(type_json) diff --git a/hsds/dset_sn.py b/hsds/dset_sn.py index 9e82a690..e09e5fba 100755 --- a/hsds/dset_sn.py +++ b/hsds/dset_sn.py @@ -621,7 +621,7 @@ async def PUT_DatasetShape(request): msg = "Extent of update shape request does not match dataset sahpe" log.warn(msg) raise HTTPBadRequest(reason=msg) - + shape_reduction = False for i in range(rank): if shape_update and shape_update[i] < dims[i]: @@ -634,12 +634,12 @@ async def PUT_DatasetShape(request): msg = "Extension dimension can not be extended past max extent" log.warn(msg) raise HTTPConflict() - + if extend_dim < 0 or extend_dim >= rank: msg = "Extension dimension must be less than rank and non-negative" log.warn(msg) raise HTTPBadRequest(reason=msg) - + if shape_reduction: log.info(f"Shape extent reduced for dataset (rank: {rank})") @@ -685,9 +685,6 @@ async def PUT_DatasetShape(request): log.warn("got 409 extending dataspace") raise - - - resp = await jsonResponse(request, json_resp, status=201) log.response(request, resp=resp) return resp diff --git a/hsds/servicenode_lib.py b/hsds/servicenode_lib.py index 2f75284c..416f1fc5 100644 --- a/hsds/servicenode_lib.py +++ b/hsds/servicenode_lib.py @@ -211,7 +211,7 @@ async def getObjectJson( msg = f"Object: {obj_id} not found, req: {req}, params: {params}" log.warn(msg) raise HTTPNotFound() - + return obj_json diff --git a/hsds/util/arrayUtil.py b/hsds/util/arrayUtil.py index 0f6b7099..ff586fee 100644 --- a/hsds/util/arrayUtil.py +++ b/hsds/util/arrayUtil.py @@ -93,8 +93,6 @@ def getNumElements(dims): return num_elements - - def getShapeDims(shape): """ Get dims from a given shape json. Return [1,] for Scalar datasets, @@ -145,7 +143,7 @@ def fillVlenArray(rank, data, arr, index): arr[index] = data[i] index += 1 return index - + # need some special conversion for compound types -- # each element must be a tuple, but the JSON decoder # gives us a list instead. 
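The broadcast support introduced in this series (the broadcast flag on jsonToArray and the element_count path in chunk_sn.py) ultimately rests on plain NumPy assignment broadcasting. A minimal sketch of that pattern follows, assuming a fixed-size dtype; the helper name is illustrative and not part of the patch:

import numpy as np

def broadcast_fill(data_shape, dtype, values):
    # illustrative only: mirrors the zeros-then-ellipsis-assign pattern used above
    src = np.asarray(values, dtype=dtype)
    tgt = np.zeros(data_shape, dtype=dtype)
    # ellipsis assignment broadcasts src across the leading dimensions of
    # data_shape; NumPy raises ValueError when the trailing dimensions don't match
    tgt[...] = src
    return tgt

# e.g. one row [1, 2, 3, 4, 5] is repeated across every row of a (4, 5) selection
print(broadcast_fill((4, 5), "i4", [1, 2, 3, 4, 5]))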
@@ -486,7 +484,7 @@ def bytesToArray(data, dt, shape): arr = np.frombuffer(data, dtype=dt) else: nelements = getNumElements(shape) - + arr = np.zeros((nelements,), dtype=dt) offset = 0 for index in range(nelements): diff --git a/tests/integ/value_test.py b/tests/integ/value_test.py index c27600b7..788d53f8 100755 --- a/tests/integ/value_test.py +++ b/tests/integ/value_test.py @@ -3076,7 +3076,6 @@ def testIntelligentRangeGet(self): # should get one element back self.assertEqual(len(value), 10) self.assertEqual(value, list(range(start, start + 10))) - def testLargeCreationProperties(self): # test Dataset with artifically large creation_properties data @@ -3142,11 +3141,10 @@ def testLargeCreationProperties(self): def testValueReinitialization(self): # Test the dataset values get reset after a reduction and resize - + print("testValueReinitialization", self.base_domain) headers = helper.getRequestHeaders(domain=self.base_domain) - - + # get domain req = f"{self.endpoint}/" rsp = self.session.get(req, headers=headers) @@ -3187,14 +3185,14 @@ def testValueReinitialization(self): rspJson = json.loads(rsp.text) self.assertTrue("value" in rspJson) self.assertEqual(rspJson["value"], data) - + # resize the dataset to 5 elements - req =f"{self.endpoint}/datasets/{dset_uuid}/shape" + req = f"{self.endpoint}/datasets/{dset_uuid}/shape" payload = {"shape": 5} rsp = self.session.put(req, data=json.dumps(payload), headers=headers) self.assertEqual(rsp.status_code, 201) rspJson = json.loads(rsp.text) - + # read back the remaining elements req = f"{self.endpoint}/datasets/{dset_uuid}/value" rsp = self.session.get(req, headers=headers) @@ -3204,7 +3202,7 @@ def testValueReinitialization(self): self.assertEqual(rspJson["value"], data[:5]) # resize back to 10 - req =f"{self.endpoint}/datasets/{dset_uuid}/shape" + req = f"{self.endpoint}/datasets/{dset_uuid}/shape" payload = {"shape": 10} rsp = self.session.put(req, data=json.dumps(payload), headers=headers) self.assertEqual(rsp.status_code, 201) @@ -3216,7 +3214,10 @@ def testValueReinitialization(self): self.assertEqual(rsp.status_code, 200) rspJson = json.loads(rsp.text) self.assertTrue("value" in rspJson) - ret_value = rspJson["value"] + value = rspJson["value"] + print("value:", value) + # TBD: verify values are getting reinitialized + if __name__ == "__main__": # setup test files From 84c055f46fc1871826badf72f4e811fc37a44e81 Mon Sep 17 00:00:00 2001 From: jreadey Date: Tue, 17 Oct 2023 15:43:32 -0700 Subject: [PATCH 04/17] fix for higher dimensional broadcast --- hsds/chunk_sn.py | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/hsds/chunk_sn.py b/hsds/chunk_sn.py index e58baa17..69415bad 100755 --- a/hsds/chunk_sn.py +++ b/hsds/chunk_sn.py @@ -629,6 +629,7 @@ async def PUT_Value(request): binary_data = None points = None # used for point selection writes np_shape = [] # shape of incoming data + bc_shape = [] # shape of broadcast array (if element_count is set) slices = [] # selection area to write to if item_size == 'H5T_VARIABLE' or not use_http_streaming(request, rank): @@ -762,13 +763,18 @@ async def PUT_Value(request): # if this is set to something other than the number of # elements in np_shape, should be a value that can # be used for broadcasting - for n in range(rank): - msg = f"{element_count} vs np.prod({np_shape[:n+1]}): {np.prod(np_shape[:(n+1)])}" - log.debug(msg) - if element_count == np.prod(np_shape) // np.prod(np_shape[:(n + 1)]): - num_elements = element_count - log.debug(f"broadcast with: 
{element_count} elements is valid ") - break + if element_count == 1: + num_elements = 1 + bc_shape = [1,] + log.debug(f"broadcasting one element to {np_shape}") + else: + + for n in range(rank-1): + bc_shape = np_shape[rank - n - 1] + if element_count == np.prod(bc_shape): + num_elements = element_count + log.debug(f"broadcast with: {element_count} elements is valid with shape: {bc_shape} ") + break if num_elements is None: # this never got set, so element count must be invalid for this shape msg = f"element_count {element_count} not compatible with selection shape: {np_shape}" @@ -809,8 +815,9 @@ async def PUT_Value(request): arr = np.fromstring(binary_data, dtype=dset_dtype) log.debug(f"read fixed type array: {arr}") - if element_count is not None: - # broad cast data into numpy array + if bc_shape: + # broadcast received data into numpy array + arr = arr.reshape(bc_shape) arr_tmp = np.zeros(np_shape, dtype=dset_dtype) arr_tmp[...] = arr arr = arr_tmp From 557b7bcac6ca0f2cacb096f22313b04562f6a30a Mon Sep 17 00:00:00 2001 From: jreadey Date: Tue, 17 Oct 2023 23:00:14 -0700 Subject: [PATCH 05/17] added broadcast test --- hsds/chunk_sn.py | 43 +-- tests/integ/broadcast_test.py | 518 ++++++++++++++++++++++++++++++++++ 2 files changed, 542 insertions(+), 19 deletions(-) create mode 100755 tests/integ/broadcast_test.py diff --git a/hsds/chunk_sn.py b/hsds/chunk_sn.py index 69415bad..903f82fe 100755 --- a/hsds/chunk_sn.py +++ b/hsds/chunk_sn.py @@ -591,7 +591,6 @@ async def PUT_Value(request): method=request.method, ) - log.debug(f"arr shape: {arr_rsp.shape}") response_type = getAcceptType(request) if response_type == "binary": @@ -632,13 +631,11 @@ async def PUT_Value(request): bc_shape = [] # shape of broadcast array (if element_count is set) slices = [] # selection area to write to - if item_size == 'H5T_VARIABLE' or not use_http_streaming(request, rank): + if item_size == 'H5T_VARIABLE' or element_count or not use_http_streaming(request, rank): http_streaming = False else: http_streaming = True - http_streaming = False # test - # body could also contain a point selection specifier if body and "points" in body: if append_rows: @@ -766,14 +763,15 @@ async def PUT_Value(request): if element_count == 1: num_elements = 1 bc_shape = [1,] - log.debug(f"broadcasting one element to {np_shape}") + log.debug(f"broadcasting one element to shape: {np_shape}") else: - - for n in range(rank-1): - bc_shape = np_shape[rank - n - 1] + bc_shape = [] + for n in range(rank - 1): + bc_shape.insert(0, np_shape[rank - n - 1]) if element_count == np.prod(bc_shape): num_elements = element_count - log.debug(f"broadcast with: {element_count} elements is valid with shape: {bc_shape} ") + msg = f"broadcast with: {element_count} elements valid for shape: {bc_shape}" + log.debug(msg) break if num_elements is None: # this never got set, so element count must be invalid for this shape @@ -836,12 +834,22 @@ async def PUT_Value(request): try: msg = "input data doesn't match selection" # only enable broadcast if not appending - if num_elements < np.prod(np_shape): - broadcast = True + + if bc_shape: + arr = jsonToArray(bc_shape, dset_dtype, json_data) else: - broadcast = False - log.debug(f"np_shape: {np_shape}, broadcast: {broadcast}") - arr = jsonToArray(np_shape, dset_dtype, json_data, broadcast=broadcast) + arr = jsonToArray(np_shape, dset_dtype, json_data) + + log.debug(f"jsonToArray returned: {arr}") + if num_elements != np.prod(arr.shape): + msg = f"expected {num_elements} elements, but got {np.prod(arr.shape)}" + 
raise HTTPBadRequest(reason=msg) + + if bc_shape: + # broadcast to target + arr_tmp = np.zeros(np_shape, dtype=dset_dtype) + arr_tmp[...] = arr + arr = arr_tmp except ValueError: log.warn(msg) raise HTTPBadRequest(reason=msg) @@ -953,8 +961,6 @@ async def PUT_Value(request): msg = f"bytesToArray value error for page: {page_number+1}: {ve}" log.warn(msg) raise HTTPBadRequest(reason=msg) - if len(select_shape) == 2: - log.debug(f"arr test value[0,0]: {arr[0,0]}") try: chunk_ids = getChunkIds(dset_id, page, layout) @@ -963,9 +969,8 @@ async def PUT_Value(request): raise HTTPInternalServerError() log.debug(f"chunk_ids: {chunk_ids}") if len(chunk_ids) > max_chunks: - log.warn( - f"got {len(chunk_ids)} for page: {page_number+1}. max_chunks: {max_chunks} " - ) + msg = f"got {len(chunk_ids)} for page: {page_number+1}. max_chunks: {max_chunks}" + log.warn(msg) crawler = ChunkCrawler( app, diff --git a/tests/integ/broadcast_test.py b/tests/integ/broadcast_test.py new file mode 100755 index 00000000..5d0187a5 --- /dev/null +++ b/tests/integ/broadcast_test.py @@ -0,0 +1,518 @@ +############################################################################## +# Copyright by The HDF Group. # +# All rights reserved. # +# # +# This file is part of HSDS (HDF5 Scalable Data Service), Libraries and # +# Utilities. The full HSDS copyright notice, including # +# terms governing use, modification, and redistribution, is contained in # +# the file COPYING, which can be found at the root of the source code # +# distribution tree. If you do not have access to this file, you may # +# request a copy from help@hdfgroup.org. # +############################################################################## +import unittest +import json +import helper + + +class BroadcastTest(unittest.TestCase): + def __init__(self, *args, **kwargs): + super(BroadcastTest, self).__init__(*args, **kwargs) + self.base_domain = helper.getTestDomainName(self.__class__.__name__) + helper.setupDomain(self.base_domain) + self.endpoint = helper.getEndpoint() + + def setUp(self): + self.session = helper.getSession() + + def tearDown(self): + if self.session: + self.session.close() + + def getUUIDByPath(self, domain, h5path): + return helper.getUUIDByPath(domain, h5path, session=self.session) + + def getRootUUID(self, domain, username=None, password=None): + return helper.getRootUUID( + domain, username=username, password=password, session=self.session + ) + + def checkVerbose(self, dset_id, headers=None, expected=None): + # do a flush with rescan, then check the expected return values are correct + req = f"{self.endpoint}/" + params = {"flush": 1, "rescan": 1} + rsp = self.session.put(req, params=params, headers=headers) + # should get a NO_CONTENT code, + self.assertEqual(rsp.status_code, 204) + + # do a get and verify the additional keys are + req = f"{self.endpoint}/datasets/{dset_id}" + params = {"verbose": 1} + + rsp = self.session.get(req, params=params, headers=headers) + self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + + for k in expected: + self.assertTrue(k in rspJson) + self.assertEqual(rspJson[k], expected[k]) + + # main + + def testPut1DDataset(self): + # Test PUT value with broadcast for 1d dataset + print("testPut1DDataset", self.base_domain) + + headers = helper.getRequestHeaders(domain=self.base_domain) + req = self.endpoint + "/" + + # Get root uuid + rsp = self.session.get(req, headers=headers) + self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + root_uuid = rspJson["root"] + 
helper.validateId(root_uuid) + + # create dataset + data = {"type": "H5T_STD_I32LE", "shape": 10} + + req = self.endpoint + "/datasets" + rsp = self.session.post(req, data=json.dumps(data), headers=headers) + self.assertEqual(rsp.status_code, 201) + rspJson = json.loads(rsp.text) + dset_id = rspJson["id"] + self.assertTrue(helper.validateId(dset_id)) + + # link new dataset as 'dset1d' + name = "dset1d" + req = self.endpoint + "/groups/" + root_uuid + "/links/" + name + payload = {"id": dset_id} + rsp = self.session.put(req, data=json.dumps(payload), headers=headers) + self.assertEqual(rsp.status_code, 201) + + # write to the dset + req = self.endpoint + "/datasets/" + dset_id + "/value" + data = [42,] # broadcast to [42, ..., 42] + + payload = {"value": data} + params = {"element_count": 1} + + rsp = self.session.put(req, data=json.dumps(payload), params=params, headers=headers) + self.assertEqual(rsp.status_code, 200) + + # read back the data + rsp = self.session.get(req, headers=headers) + self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + self.assertTrue("value" in rspJson) + self.assertEqual(rspJson["value"], data * 10) + + def testPut1DDatasetBinary(self): + # Test PUT value with broadcast for 1d dataset using binary data + print("testPut1DDatasetBinary", self.base_domain) + NUM_ELEMENTS = 10 # 1000000 - this value is hitting nginx request size limit + + headers = helper.getRequestHeaders(domain=self.base_domain) + headers_bin_req = helper.getRequestHeaders(domain=self.base_domain) + headers_bin_req["Content-Type"] = "application/octet-stream" + headers_bin_rsp = helper.getRequestHeaders(domain=self.base_domain) + headers_bin_rsp["accept"] = "application/octet-stream" + + req = self.endpoint + "/" + + # Get root uuid + rsp = self.session.get(req, headers=headers) + self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + root_uuid = rspJson["root"] + helper.validateId(root_uuid) + + # create dataset + data = {"type": "H5T_STD_I32LE", "shape": NUM_ELEMENTS} + req = self.endpoint + "/datasets" + rsp = self.session.post(req, data=json.dumps(data), headers=headers) + self.assertEqual(rsp.status_code, 201) + rspJson = json.loads(rsp.text) + dset_id = rspJson["id"] + self.assertTrue(helper.validateId(dset_id)) + + # link new dataset as 'dset1d' + name = "dset1d" + req = self.endpoint + "/groups/" + root_uuid + "/links/" + name + payload = {"id": dset_id} + rsp = self.session.put(req, data=json.dumps(payload), headers=headers) + self.assertEqual(rsp.status_code, 201) + + # write 42 as four-byte little endian integer + # broadcast across the entire dataset + req = self.endpoint + "/datasets/" + dset_id + "/value" + data = bytearray(4) + data[0] = 0x2a + params = {"element_count": 1} + rsp = self.session.put(req, data=data, params=params, headers=headers_bin_req) + self.assertEqual(rsp.status_code, 200) + + # read back the data + rsp = self.session.get(req, headers=headers_bin_rsp) + self.assertEqual(rsp.status_code, 200) + data = rsp.content + self.assertEqual(len(data), NUM_ELEMENTS * 4) + for i in range(NUM_ELEMENTS): + offset = i * 4 + self.assertEqual(data[offset + 0], 0x2a) + self.assertEqual(data[offset + 1], 0) + self.assertEqual(data[offset + 2], 0) + self.assertEqual(data[offset + 3], 0) + + # write a selection + params = {"select": "[4:6]"} # 4th and 5th elements + params["element_count"] = 1 # broadcast + data = bytearray(4) + data[0] = 0x40 # 64 + rsp = self.session.put(req, data=data, params=params, headers=headers_bin_req) + 
self.assertEqual(rsp.status_code, 200) + + # read a selection + params = {"select": "[0:6]"} # read first 6 elements + rsp = self.session.get(req, params=params, headers=headers_bin_rsp) + self.assertEqual(rsp.status_code, 200) + data = rsp.content + self.assertEqual(len(data), 24) + for i in range(6): + offset = i * 4 + if i >= 4: + # these were updated by the previous selection + self.assertEqual(data[offset + 0], 0x40) + else: + self.assertEqual(data[offset + 0], 0x2a) + self.assertEqual(data[offset + 1], 0) + self.assertEqual(data[offset + 2], 0) + self.assertEqual(data[offset + 3], 0) + + def testPut2DDataset(self): + """Test PUT value with broadcast for 2d dataset""" + print("testPut2DDataset", self.base_domain) + + headers = helper.getRequestHeaders(domain=self.base_domain) + req = self.endpoint + "/" + + # Get root uuid + rsp = self.session.get(req, headers=headers) + self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + root_uuid = rspJson["root"] + helper.validateId(root_uuid) + + # create dataset + num_col = 5 + num_row = 4 + data = {"type": "H5T_STD_I32LE", "shape": [num_row, num_col]} + + req = self.endpoint + "/datasets" + rsp = self.session.post(req, data=json.dumps(data), headers=headers) + self.assertEqual(rsp.status_code, 201) + rspJson = json.loads(rsp.text) + dset_id = rspJson["id"] + self.assertTrue(helper.validateId(dset_id)) + + # link new dataset as 'dset2d' + name = "dset2d" + req = self.endpoint + "/groups/" + root_uuid + "/links/" + name + payload = {"id": dset_id} + rsp = self.session.put(req, data=json.dumps(payload), headers=headers) + self.assertEqual(rsp.status_code, 201) + + # broadcast one element to the dataset + req = self.endpoint + "/datasets/" + dset_id + "/value" + json_data = [42,] + payload = {"value": json_data} + params = {"element_count": 1} + rsp = self.session.put(req, data=json.dumps(payload), params=params, headers=headers) + self.assertEqual(rsp.status_code, 200) + + # read back the data + rsp = self.session.get(req, headers=headers) + self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + self.assertTrue("value" in rspJson) + json_value = rspJson["value"] + for row in json_value: + for item in row: + self.assertEqual(item, 42) + + # broadcast row to the dataset + req = self.endpoint + "/datasets/" + dset_id + "/value" + json_data = [1, 2, 3, 4, 5] + payload = {"value": json_data} + params = {"element_count": 5} + rsp = self.session.put(req, data=json.dumps(payload), params=params, headers=headers) + self.assertEqual(rsp.status_code, 200) + + # read back the data + rsp = self.session.get(req, headers=headers) + self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + self.assertTrue("value" in rspJson) + json_value = rspJson["value"] + for row in json_value: + self.assertEqual(row, [1, 2, 3, 4, 5]) + + def testPut2DDatasetBinary(self): + # Test PUT value with broadcast for a 2d dataset + print("testPut2DDatasetBinary", self.base_domain) + + headers = helper.getRequestHeaders(domain=self.base_domain) + headers_bin_req = helper.getRequestHeaders(domain=self.base_domain) + headers_bin_req["Content-Type"] = "application/octet-stream" + headers_bin_rsp = helper.getRequestHeaders(domain=self.base_domain) + headers_bin_rsp["accept"] = "application/octet-stream" + + req = self.endpoint + "/" + + # Get root uuid + rsp = self.session.get(req, headers=headers) + self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + root_uuid = rspJson["root"] + helper.validateId(root_uuid) + + 
# create dataset + num_col = 5 + num_row = 4 + data = {"type": "H5T_STD_I32LE", "shape": [num_row, num_col]} + + req = self.endpoint + "/datasets" + rsp = self.session.post(req, data=json.dumps(data), headers=headers) + self.assertEqual(rsp.status_code, 201) + rspJson = json.loads(rsp.text) + dset_id = rspJson["id"] + self.assertTrue(helper.validateId(dset_id)) + + # link new dataset as 'dset2d' + name = "dset2d" + req = self.endpoint + "/groups/" + root_uuid + "/links/" + name + payload = {"id": dset_id} + rsp = self.session.put(req, data=json.dumps(payload), headers=headers) + self.assertEqual(rsp.status_code, 201) + + # broadcast one value to entire datsaet + bin_data = bytearray(4) + bin_data[0] = 0x2a + req = self.endpoint + "/datasets/" + dset_id + "/value" + params = {"element_count": 1} + rsp = self.session.put(req, data=bin_data, params=params, headers=headers_bin_req) + self.assertEqual(rsp.status_code, 200) + + # read back the data as json + rsp = self.session.get(req, headers=headers) + self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + self.assertTrue("value" in rspJson) + json_data = rspJson["value"] + for row in json_data: + self.assertEqual(row, [42, 42, 42, 42, 42]) + + # broadcast a row to the entire dataset + bin_data = bytearray(4 * 5) + for i in range(5): + bin_data[i * 4] = i + + params = {"element_count": 5} + rsp = self.session.put(req, data=bin_data, params=params, headers=headers_bin_req) + self.assertEqual(rsp.status_code, 200) + + # read back the data as json + rsp = self.session.get(req, headers=headers) + self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + self.assertTrue("value" in rspJson) + json_data = rspJson["value"] + for row in json_data: + self.assertEqual(row, [0, 1, 2, 3, 4]) + + def testPut3DDataset(self): + """Test PUT value with broadcast for 3d dataset""" + print("testPut3DDataset", self.base_domain) + + headers = helper.getRequestHeaders(domain=self.base_domain) + + req = self.endpoint + "/" + + # Get root uuid + rsp = self.session.get(req, headers=headers) + self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + root_uuid = rspJson["root"] + helper.validateId(root_uuid) + + # create dataset + data = {"type": "H5T_STD_I32LE", "shape": [2, 3, 5]} + + req = self.endpoint + "/datasets" + rsp = self.session.post(req, data=json.dumps(data), headers=headers) + self.assertEqual(rsp.status_code, 201) + rspJson = json.loads(rsp.text) + dset_id = rspJson["id"] + self.assertTrue(helper.validateId(dset_id)) + + # link new dataset as 'dset3d' + name = "dset3d" + req = self.endpoint + "/groups/" + root_uuid + "/links/" + name + payload = {"id": dset_id} + rsp = self.session.put(req, data=json.dumps(payload), headers=headers) + self.assertEqual(rsp.status_code, 201) + + # broadcast one element to the dataset + req = self.endpoint + "/datasets/" + dset_id + "/value" + json_data = [42,] + payload = {"value": json_data} + params = {"element_count": 1} + rsp = self.session.put(req, data=json.dumps(payload), params=params, headers=headers) + self.assertEqual(rsp.status_code, 200) + + # read back the data + rsp = self.session.get(req, headers=headers) + self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + self.assertTrue("value" in rspJson) + json_value = rspJson["value"] + for level in json_value: + for row in level: + self.assertEqual(row, [42, 42, 42, 42, 42]) + + # broadcast row to the dataset + req = self.endpoint + "/datasets/" + dset_id + "/value" + json_data = [1, 2, 3, 4, 
5] + payload = {"value": json_data} + params = {"element_count": 5} + rsp = self.session.put(req, data=json.dumps(payload), params=params, headers=headers) + self.assertEqual(rsp.status_code, 200) + + # read back the data + rsp = self.session.get(req, headers=headers) + self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + self.assertTrue("value" in rspJson) + json_value = rspJson["value"] + for level in json_value: + for row in level: + self.assertEqual(row, [1, 2, 3, 4, 5]) + + # broadcast level (3x5 block) to the dataset + req = self.endpoint + "/datasets/" + dset_id + "/value" + test_data = [[1, 2, 3, 4, 5], [6, 7, 8, 9, 10], [11, 12, 13, 14, 15]] + payload = {"value": test_data} + params = {"element_count": 15} + rsp = self.session.put(req, data=json.dumps(payload), params=params, headers=headers) + self.assertEqual(rsp.status_code, 200) + + # read back the data + rsp = self.session.get(req, headers=headers) + self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + self.assertTrue("value" in rspJson) + json_value = rspJson["value"] + # test data should be repeated twice + self.assertEqual(json_value[0], test_data) + self.assertEqual(json_value[1], test_data) + + def testPut3DDatasetBinary(self): + """Test PUT value with broadcast for 3d dataset""" + print("testPut3DDatasetBinary", self.base_domain) + + headers = helper.getRequestHeaders(domain=self.base_domain) + headers_bin_req = helper.getRequestHeaders(domain=self.base_domain) + headers_bin_req["Content-Type"] = "application/octet-stream" + headers_bin_rsp = helper.getRequestHeaders(domain=self.base_domain) + headers_bin_rsp["accept"] = "application/octet-stream" + + req = self.endpoint + "/" + + # Get root uuid + rsp = self.session.get(req, headers=headers) + self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + root_uuid = rspJson["root"] + helper.validateId(root_uuid) + + # create dataset + data = {"type": "H5T_STD_I32LE", "shape": [2, 3, 5]} + + req = self.endpoint + "/datasets" + rsp = self.session.post(req, data=json.dumps(data), headers=headers) + self.assertEqual(rsp.status_code, 201) + rspJson = json.loads(rsp.text) + dset_id = rspJson["id"] + self.assertTrue(helper.validateId(dset_id)) + + # link new dataset as 'dset3d' + name = "dset3d" + req = self.endpoint + "/groups/" + root_uuid + "/links/" + name + payload = {"id": dset_id} + rsp = self.session.put(req, data=json.dumps(payload), headers=headers) + self.assertEqual(rsp.status_code, 201) + + # broadcast one value to entire datsaet + bin_data = bytearray(4) + bin_data[0] = 0x2a + req = self.endpoint + "/datasets/" + dset_id + "/value" + params = {"element_count": 1} + rsp = self.session.put(req, data=bin_data, params=params, headers=headers_bin_req) + self.assertEqual(rsp.status_code, 200) + + # read back the data + rsp = self.session.get(req, headers=headers) + self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + self.assertTrue("value" in rspJson) + json_value = rspJson["value"] + for level in json_value: + for row in level: + self.assertEqual(row, [42, 42, 42, 42, 42]) + + # broadcast row to the dataset + req = self.endpoint + "/datasets/" + dset_id + "/value" + bin_data = bytearray(5 * 4) + for i in range(5): + bin_data[i * 4] = i + 1 + + params = {"element_count": 5} + rsp = self.session.put(req, data=bin_data, params=params, headers=headers_bin_req) + self.assertEqual(rsp.status_code, 200) + + # read back the data + rsp = self.session.get(req, headers=headers) + 
self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + self.assertTrue("value" in rspJson) + json_value = rspJson["value"] + for level in json_value: + for row in level: + self.assertEqual(row, [1, 2, 3, 4, 5]) + + # broadcast level (3x5 block) to the dataset + req = self.endpoint + "/datasets/" + dset_id + "/value" + bin_data = bytearray(5 * 3 * 4) + for i in range(5 * 3): + bin_data[i * 4] = i + 1 + params = {"element_count": 15} + rsp = self.session.put(req, data=bin_data, params=params, headers=headers_bin_req) + self.assertEqual(rsp.status_code, 200) + + # read back the data + rsp = self.session.get(req, headers=headers) + self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + self.assertTrue("value" in rspJson) + json_value = rspJson["value"] + + for level in json_value: + expected = 1 + for row in level: + for item in row: + self.assertEqual(item, expected) + expected += 1 + + +if __name__ == "__main__": + # setup test files + unittest.main() From 7c9f7191edea8eec5672f90763fb192d45ba2281 Mon Sep 17 00:00:00 2001 From: jreadey Date: Wed, 18 Oct 2023 11:06:30 -0700 Subject: [PATCH 06/17] remove broadcast from arrayUtil.py --- hsds/util/arrayUtil.py | 16 ++----- tests/unit/array_util_test.py | 90 +---------------------------------- 2 files changed, 5 insertions(+), 101 deletions(-) diff --git a/hsds/util/arrayUtil.py b/hsds/util/arrayUtil.py index ff586fee..a0358fc7 100644 --- a/hsds/util/arrayUtil.py +++ b/hsds/util/arrayUtil.py @@ -131,7 +131,7 @@ def getShapeDims(shape): return dims -def jsonToArray(data_shape, data_dtype, data_json, broadcast=False): +def jsonToArray(data_shape, data_dtype, data_json): """ Return numpy array from the given json array. """ @@ -176,17 +176,9 @@ def fillVlenArray(rank, data, arr, index): # allow if the array is a scalar and the selection shape is one element, # numpy is ok with this if arr.size != npoints: - if broadcast: - # try to broadcast to the target shape - # if it fails, a ValueError exception will be raised - arr_tgt = np.zeros(data_shape, dtype=data_dtype) - arr_tgt[...] = arr - # worked! 
use arr_tgt as arr - arr = arr_tgt - else: - msg = "Input data doesn't match selection number of elements" - msg += f" Expected {npoints}, but received: {arr.size}" - raise ValueError(msg) + msg = "Input data doesn't match selection number of elements" + msg += f" Expected {npoints}, but received: {arr.size}" + raise ValueError(msg) if arr.shape != data_shape: arr = arr.reshape(data_shape) # reshape to match selection else: diff --git a/tests/unit/array_util_test.py b/tests/unit/array_util_test.py index d11979df..6df2e0aa 100644 --- a/tests/unit/array_util_test.py +++ b/tests/unit/array_util_test.py @@ -290,95 +290,7 @@ def testJsonToArray(self): self.assertTrue(isinstance(e, tuple)) self.assertEqual(e, (id0, id1, id2)) - - def testJsonToArrayBroadcast(self): - dt = np.dtype("i4") - shape = [10,] - data = [42,] - out = jsonToArray(shape, dt, data, broadcast=True) - - self.assertTrue(isinstance(out, np.ndarray)) - self.assertEqual(out.shape, (10,)) - for i in range(10): - self.assertEqual(out[i], 42) - - # compound type - dt = np.dtype([("a", "i4"), ("b", "S5")]) - shape = [10,] - data = [[6, "six"],] - out = jsonToArray(shape, dt, data, broadcast=True) - - self.assertTrue(isinstance(out, np.ndarray)) - self.assertEqual(out.shape, (10,)) - for i in range(10): - e = out[i] - self.assertEqual(e[0], 6) - self.assertEqual(e[1], b'six') - - # VLEN str - dt = special_dtype(vlen=str) - data = ["hello",] - - shape = [10,] - - out = jsonToArray(shape, dt, data, broadcast=True) - self.assertTrue("vlen" in out.dtype.metadata) - self.assertEqual(out.dtype.metadata["vlen"], str) - self.assertEqual(out.dtype.kind, "O") - self.assertEqual(out.shape, (10,)) - for i in range(10): - e = out[i] - self.assertEqual(out[0], data[0]) - - # two dimensional target - dt = np.dtype("i4") - shape = [10,2] - data = [42,] - out = jsonToArray(shape, dt, data, broadcast=True) - - self.assertTrue(isinstance(out, np.ndarray)) - self.assertEqual(out.shape, (10,2)) - for i in range(10): - for j in range(2): - self.assertEqual(out[i,j], 42) - - dt = np.dtype("i4") - shape = [10,2] - data = [69,96] - out = jsonToArray(shape, dt, data, broadcast=True) - - self.assertTrue(isinstance(out, np.ndarray)) - self.assertEqual(out.shape, (10,2)) - for i in range(10): - self.assertEqual(out[i,0], 69) - self.assertEqual(out[i,1], 96) - - # three dimensional target - dt = np.dtype("i4") - shape = [10, 3, 2] - data = [[0,1],[2,3],[4,5]] - out = jsonToArray(shape, dt, data, broadcast=True) - - self.assertTrue(isinstance(out, np.ndarray)) - self.assertEqual(out.shape, (10, 3, 2)) - for i in range(10): - for j in range(3): - for k in range(2): - self.assertEqual(out[i,j,k], j * 2 + k) - - - # verify ValueError returning if broadcast rules don't apply - dt = np.dtype("i4") - shape = [5,] - data = [1, 2] - - try: - jsonToArray(shape, dt, data, broadcast=True) - self.assertTrue(False) - except ValueError: - pass # expected - - + def testToBytes(self): # Simple array dt = np.dtype(" Date: Wed, 18 Oct 2023 11:20:06 -0700 Subject: [PATCH 07/17] fix flake8 error --- tests/unit/array_util_test.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/unit/array_util_test.py b/tests/unit/array_util_test.py index 6df2e0aa..dd10c1c0 100644 --- a/tests/unit/array_util_test.py +++ b/tests/unit/array_util_test.py @@ -290,7 +290,6 @@ def testJsonToArray(self): self.assertTrue(isinstance(e, tuple)) self.assertEqual(e, (id0, id1, id2)) - def testToBytes(self): # Simple array dt = np.dtype(" Date: Wed, 18 Oct 2023 13:40:18 -0700 Subject: [PATCH 08/17] broadcast 
PUT Value to DNs when data is one element --- hsds/chunk_crawl.py | 27 ++++++++++++++++--- hsds/chunk_dn.py | 43 +++++++++++++++++++++--------- hsds/chunk_sn.py | 49 +++++++++++++++++------------------ hsds/util/arrayUtil.py | 23 ++++++++++++++++ tests/unit/array_util_test.py | 23 +++++++++++++++- 5 files changed, 124 insertions(+), 41 deletions(-) diff --git a/hsds/chunk_crawl.py b/hsds/chunk_crawl.py index b7d2ce22..dade9fa5 100755 --- a/hsds/chunk_crawl.py +++ b/hsds/chunk_crawl.py @@ -70,20 +70,41 @@ async def write_chunk_hyperslab( log.error(f"No type found in dset_json: {dset_json}") raise HTTPInternalServerError() + params = {} layout = getChunkLayout(dset_json) chunk_sel = getChunkCoverage(chunk_id, slices, layout) log.debug(f"chunk_sel: {chunk_sel}") data_sel = getDataCoverage(chunk_id, slices, layout) log.debug(f"data_sel: {data_sel}") log.debug(f"arr.shape: {arr.shape}") - arr_chunk = arr[data_sel] + + # broadcast data if arr has one element and no stride is set + do_broadcast = True + if np.prod(arr.shape) != 1: + do_broadcast = False + else: + for s in slices: + if s.step is None: + continue + if s.step > 1: + do_broadcast = False + + if do_broadcast: + log.debug(f"broadcasting {arr}") + # just broadcast data value across selection + params["element_count"] = 1 + arr_chunk = arr + else: + arr_chunk = arr[data_sel] + req = getDataNodeUrl(app, chunk_id) req += "/chunks/" + chunk_id - log.debug(f"PUT chunk req: {req}") data = arrayToBytes(arr_chunk) + + log.debug(f"PUT chunk req: {req}, {len(data)} bytes") + # pass itemsize, type, dimensions, and selection as query params - params = {} select = getSliceQueryParam(chunk_sel) params["select"] = select if bucket: diff --git a/hsds/chunk_dn.py b/hsds/chunk_dn.py index 3fafd940..329f772a 100644 --- a/hsds/chunk_dn.py +++ b/hsds/chunk_dn.py @@ -20,7 +20,7 @@ from aiohttp.web import json_response, StreamResponse from .util.httpUtil import request_read, getContentType -from .util.arrayUtil import bytesToArray, arrayToBytes, getShapeDims +from .util.arrayUtil import bytesToArray, arrayToBytes, getShapeDims, getBroadcastShape from .util.idUtil import getS3Key, validateInPartition, isValidUuid from .util.storUtil import isStorObj, deleteStorObj from .util.hdf5dtype import createDataType @@ -48,6 +48,7 @@ async def PUT_Chunk(request): limit = 0 bucket = None input_arr = None + element_count = None if "query" in params: query = params["query"] @@ -77,6 +78,15 @@ async def PUT_Chunk(request): log.warn(msg) raise HTTPInternalServerError(reason=msg) + if "element_count" in params: + try: + element_count = int(params["element_count"]) + except ValueError: + msg = "invalid element_count" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + log.debug(f"element_count param: {element_count}") + try: validateInPartition(app, chunk_id) except KeyError: @@ -130,9 +140,16 @@ async def PUT_Chunk(request): log.debug(f"PUT_Chunk slices: {selection}") mshape = getSelectionShape(selection) - num_elements = 1 - for extent in mshape: - num_elements *= extent + if element_count is not None: + bcshape = getBroadcastShape(mshape, element_count) + log.debug(f"ussing bcshape: {bcshape}") + else: + bcshape = None + + if bcshape: + num_elements = np.prod(bcshape) + else: + num_elements = np.prod(mshape) if getChunkInitializer(dset_json): chunk_init = True @@ -220,9 +237,6 @@ async def PUT_Chunk(request): return else: # regular chunk update - - broadcast = 0 # broadcast update - # check that the content_length is what we expect if itemsize != "H5T_VARIABLE": 
log.debug(f"expect content_length: {num_elements*itemsize}") @@ -235,10 +249,6 @@ async def PUT_Chunk(request): msg = f"Expected content_length of: {expected}, but got: {actual}" log.error(msg) raise HTTPBadRequest(reason=msg) - else: - broadcast = expected // actual - if broadcast != 1: - log.info(f"broadcast chunk write: {broadcast}") # create a numpy array for incoming data input_bytes = await request_read(request) @@ -249,7 +259,16 @@ async def PUT_Chunk(request): log.error(msg) raise HTTPInternalServerError() - input_arr = bytesToArray(input_bytes, dt, mshape) + input_arr = bytesToArray(input_bytes, dt, [num_elements, ]) + if bcshape: + input_arr = input_arr.reshape(bcshape) + log.debug(f"broadcasting {bcshape} to mshape {mshape}") + arr_tmp = np.zeros(mshape, dtype=dt) + arr_tmp[...] = input_arr + input_arr = arr_tmp + else: + input_arr = input_arr.reshape(mshape) + kwargs = {"chunk_arr": chunk_arr, "slices": selection, "data": input_arr} is_dirty = chunkWriteSelection(**kwargs) diff --git a/hsds/chunk_sn.py b/hsds/chunk_sn.py index 903f82fe..0c094314 100755 --- a/hsds/chunk_sn.py +++ b/hsds/chunk_sn.py @@ -39,7 +39,7 @@ from .util.chunkUtil import getQueryDtype, get_chunktable_dims from .util.arrayUtil import bytesArrayToList, jsonToArray, getShapeDims from .util.arrayUtil import getNumElements, arrayToBytes, bytesToArray -from .util.arrayUtil import squeezeArray, getNumpyValue +from .util.arrayUtil import squeezeArray, getNumpyValue, getBroadcastShape from .util.authUtil import getUserPasswordFromRequest, validateUserPassword from .util.boolparser import BooleanParser from .servicenode_lib import getObjectJson, validateAction @@ -760,24 +760,15 @@ async def PUT_Value(request): # if this is set to something other than the number of # elements in np_shape, should be a value that can # be used for broadcasting - if element_count == 1: - num_elements = 1 - bc_shape = [1,] - log.debug(f"broadcasting one element to shape: {np_shape}") - else: - bc_shape = [] - for n in range(rank - 1): - bc_shape.insert(0, np_shape[rank - n - 1]) - if element_count == np.prod(bc_shape): - num_elements = element_count - msg = f"broadcast with: {element_count} elements valid for shape: {bc_shape}" - log.debug(msg) - break - if num_elements is None: + bc_shape = getBroadcastShape(np_shape, element_count) + + if bc_shape is None: # this never got set, so element count must be invalid for this shape msg = f"element_count {element_count} not compatible with selection shape: {np_shape}" log.warn(msg) raise HTTPBadRequest(reason=msg) + # element_count will be what we expected to see + num_elements = element_count else: # set num_elements based on selection shape num_elements = getNumElements(np_shape) @@ -816,15 +807,23 @@ async def PUT_Value(request): if bc_shape: # broadcast received data into numpy array arr = arr.reshape(bc_shape) - arr_tmp = np.zeros(np_shape, dtype=dset_dtype) - arr_tmp[...] = arr - arr = arr_tmp - try: - arr = arr.reshape(np_shape) # conform to selection shape - except ValueError: - msg = "Bad Request: binary input data doesn't match selection" - log.warn(msg) - raise HTTPBadRequest(reason=msg) + if element_count == 1: + log.debug("will send broadcast set to DN nodes") + else: + # need to instantiate the full np_shape since chunk boundries + # will effect how individual chunks get set + arr_tmp = np.zeros(np_shape, dtype=dset_dtype) + arr_tmp[...] 
= arr + arr = arr_tmp + + if element_count != 1: + try: + arr = arr.reshape(np_shape) # conform to selection shape + except ValueError: + msg = "Bad Request: binary input data doesn't match selection " + msg += f"reshaping {arr.shape} to {np_shape}" + log.warn(msg) + raise HTTPBadRequest(reason=msg) msg = f"PUT value - numpy array shape: {arr.shape} dtype: {arr.dtype}" log.debug(msg) @@ -845,7 +844,7 @@ async def PUT_Value(request): msg = f"expected {num_elements} elements, but got {np.prod(arr.shape)}" raise HTTPBadRequest(reason=msg) - if bc_shape: + if bc_shape and element_count != 1: # broadcast to target arr_tmp = np.zeros(np_shape, dtype=dset_dtype) arr_tmp[...] = arr diff --git a/hsds/util/arrayUtil.py b/hsds/util/arrayUtil.py index a0358fc7..31ee3bf1 100644 --- a/hsds/util/arrayUtil.py +++ b/hsds/util/arrayUtil.py @@ -677,3 +677,26 @@ def ndarray_compare(arr1, arr2): else: # can just us np array_compare return np.array_equal(arr1, arr2) + + +def getBroadcastShape(mshape, element_count): + # if element_count is less than the number of elements + # defined by mshape, return a numpy compatible broadcast + # shape that contains element_count elements. + # If non exists return None + + if np.prod(mshape) == element_count: + return None + + if element_count == 1: + # this always works + return [1,] + + bcshape = [] + rank = len(mshape) + for n in range(rank - 1): + bcshape.insert(0, mshape[rank - n - 1]) + if element_count == np.prod(bcshape): + return bcshape # have a match + + return None # no broadcast found diff --git a/tests/unit/array_util_test.py b/tests/unit/array_util_test.py index dd10c1c0..c734e045 100644 --- a/tests/unit/array_util_test.py +++ b/tests/unit/array_util_test.py @@ -27,7 +27,8 @@ getByteArraySize, IndexIterator, ndarray_compare, - getNumpyValue + getNumpyValue, + getBroadcastShape ) from hsds.util.hdf5dtype import special_dtype from hsds.util.hdf5dtype import check_dtype @@ -795,6 +796,26 @@ def testJsonToArrayOnNoneArray(self): self.assertTrue(len(arr) == 0) self.assertTrue(arr.dtype == data_dtype) + def testGetBroadcastShape(self): + bcshape = getBroadcastShape([1, ], 1) + self.assertEqual(bcshape, None) + bcshape = getBroadcastShape([2, 3], 6) + self.assertEqual(bcshape, None) + bcshape = getBroadcastShape([2, 3], 5) + self.assertEqual(bcshape, None) + + bcshape = getBroadcastShape([4, 5], 1) + self.assertEqual(bcshape, [1, ]) + bcshape = getBroadcastShape([4, 5], 5) + self.assertEqual(bcshape, [5, ]) + + bcshape = getBroadcastShape([2, 3, 5], 1) + self.assertEqual(bcshape, [1, ]) + bcshape = getBroadcastShape([2, 3, 5], 5) + self.assertEqual(bcshape, [5, ]) + bcshape = getBroadcastShape([2, 3, 5], 15) + self.assertEqual(bcshape, [3, 5]) + if __name__ == "__main__": # setup test files From 241f4a32f88b5e5c3007f7743d361630811f1e0c Mon Sep 17 00:00:00 2001 From: jreadey Date: Wed, 18 Oct 2023 15:30:42 -0700 Subject: [PATCH 09/17] re-init values when dset shape is reduced then expanded --- hsds/chunk_sn.py | 5 +- hsds/dset_sn.py | 42 +++++++++++++++ hsds/util/chunkUtil.py | 11 ++++ tests/integ/value_test.py | 106 ++++++++++++++++++++++++++++++++++++-- 4 files changed, 157 insertions(+), 7 deletions(-) diff --git a/hsds/chunk_sn.py b/hsds/chunk_sn.py index 0c094314..5aa4fd91 100755 --- a/hsds/chunk_sn.py +++ b/hsds/chunk_sn.py @@ -985,9 +985,8 @@ async def PUT_Value(request): crawler_status = crawler.get_status() if crawler_status not in (200, 201): - log.warn( - f"crawler failed for page: {page_number+1} with status: {crawler_status}" - ) + msg = f"crawler failed 
for page: {page_number+1} with status: {crawler_status}" + log.warn(msg) else: log.info("crawler write_chunk_hyperslab successful") diff --git a/hsds/dset_sn.py b/hsds/dset_sn.py index e09e5fba..94241541 100755 --- a/hsds/dset_sn.py +++ b/hsds/dset_sn.py @@ -15,6 +15,7 @@ # import math +import numpy as np from json import JSONDecodeError from aiohttp.web_exceptions import HTTPBadRequest, HTTPNotFound, HTTPConflict @@ -34,6 +35,7 @@ from .util.hdf5dtype import getItemSize from .servicenode_lib import getDomainJson, getObjectJson, getPathForObjectId from .servicenode_lib import getObjectIdByPath, validateAction, getRootInfo +from .chunk_crawl import ChunkCrawler from . import config from . import hsds_logger as log @@ -644,6 +646,24 @@ async def PUT_DatasetShape(request): log.info(f"Shape extent reduced for dataset (rank: {rank})") # need to re-initialize any values that are now outside the shape + # first get the fill value + fill_value = None + type_json = dset_json["type"] + dt = createDataType(type_json) + + if "creationProperties" in dset_json: + fill_value = None + cprops = dset_json["creationProperties"] + if "fillValue" in cprops: + fill_value_prop = cprops["fillValue"] + encoding = cprops.get("fillValue_encoding") + fill_value = getNumpyValue(fill_value_prop, dt=dt, encoding=encoding) + if fill_value: + arr = np.empty((1,), dtype=dt, order="C") + arr[...] = fill_value + else: + arr = np.zeros([1,], dtype=dt, order="C") + layout = getChunkLayout(dset_json) log.debug(f"got layout: {layout}") for n in range(rank): @@ -663,6 +683,28 @@ async def PUT_DatasetShape(request): chunk_ids = getChunkIds(dset_id, slices, layout) log.debug(f"got chunkIds: {chunk_ids}") + chunk_ids.sort() + + crawler = ChunkCrawler( + app, + chunk_ids, + dset_json=dset_json, + bucket=bucket, + slices=slices, + arr=arr, + action="write_chunk_hyperslab", + ) + await crawler.crawl() + + crawler_status = crawler.get_status() + + if crawler_status not in (200, 201): + msg = f"crawler failed for shape reinitialize with status: {crawler_status}" + log.warn(msg) + else: + msg = f"crawler success for reinitialization with slices: {slices}" + log.info(msg) + # send request onto DN req = getDataNodeUrl(app, dset_id) + "/datasets/" + dset_id + "/shape" diff --git a/hsds/util/chunkUtil.py b/hsds/util/chunkUtil.py index 2031eaa0..87bdb40c 100644 --- a/hsds/util/chunkUtil.py +++ b/hsds/util/chunkUtil.py @@ -311,7 +311,10 @@ def getNumChunks(selection, layout): for i in range(len(selection)): s = selection[i] c = layout[i] # chunk size + if isinstance(s, slice): + if s.step is None: + s = slice(s.start, s.stop, 1) if s.step > 1: num_points = frac((s.stop - s.start), s.step) w = num_points * s.step - (s.step - 1) @@ -475,6 +478,8 @@ def getChunkIds(dset_id, selection, layout, dim=0, prefix=None, chunk_ids=None): s = selection[dim] c = layout[dim] # log.debug(f"getChunkIds - layout: {layout}") + if isinstance(s, slice) and s.step is None: + s = slice(s.start, s.stop, 1) if isinstance(s, slice) and s.step > c: # chunks may not be contiguous, skip along the selection and add @@ -570,6 +575,8 @@ def getChunkSelection(chunk_id, slices, layout): c = layout[dim] n = chunk_index[dim] * c if isinstance(s, slice): + if s.step is None: + s = slice(s.start, s.stop, 1) if s.start >= n + c: return None # null intersection if s.stop < n: @@ -653,6 +660,8 @@ def getDataCoverage(chunk_id, slices, layout): c = chunk_sel[dim] s = slices[dim] if isinstance(s, slice): + if s.step is None: + s = slice(s.start, s.stop, 1) if c.step != s.step: msg = 
"expecting step for chunk selection to be the " msg += "same as data selection" @@ -1163,6 +1172,8 @@ def chunkQuery( # adjust the index to correspond with the dataset s = slices[0] + if s.step is None: + s = slice(s.start, s.stop, 1) start = s.start + chunk_coord[0] if start > 0: # can just increment every value by same amount diff --git a/tests/integ/value_test.py b/tests/integ/value_test.py index 788d53f8..18aac784 100755 --- a/tests/integ/value_test.py +++ b/tests/integ/value_test.py @@ -3139,10 +3139,10 @@ def testLargeCreationProperties(self): self.assertEqual(ret_values[i], 24) self.assertEqual(ret_values[i + 5], 42) - def testValueReinitialization(self): + def testValueReinitialization1D(self): # Test the dataset values get reset after a reduction and resize - print("testValueReinitialization", self.base_domain) + print("testValueReinitialization1D", self.base_domain) headers = helper.getRequestHeaders(domain=self.base_domain) # get domain @@ -3215,8 +3215,106 @@ def testValueReinitialization(self): rspJson = json.loads(rsp.text) self.assertTrue("value" in rspJson) value = rspJson["value"] - print("value:", value) - # TBD: verify values are getting reinitialized + self.assertEqual(value[0:5], data[0:5]) + self.assertEqual(value[5:10], [42,] * 5) + + def testValueReinitialization2D(self): + # Test the dataset values get reset after a reduction and resize + + print("testValueReinitialization1D", self.base_domain) + headers = helper.getRequestHeaders(domain=self.base_domain) + + # get domain + req = f"{self.endpoint}/" + rsp = self.session.get(req, headers=headers) + rspJson = json.loads(rsp.text) + self.assertTrue("root" in rspJson) + root_uuid = rspJson["root"] + + # create the dataset + req = f"{self.endpoint}/datasets" + payload = {"type": "H5T_STD_I32LE", "shape": [12, 15], "maxdims": [12, 15]} + req = self.endpoint + "/datasets" + rsp = self.session.post(req, data=json.dumps(payload), headers=headers) + self.assertEqual(rsp.status_code, 201) # create dataset + rspJson = json.loads(rsp.text) + dset_uuid = rspJson["id"] + self.assertTrue(helper.validateId(dset_uuid)) + + # link new dataset as 'dset' + name = "dset" + req = f"{self.endpoint}/groups/{root_uuid}/links/{name}" + payload = {"id": dset_uuid} + rsp = self.session.put(req, data=json.dumps(payload), headers=headers) + self.assertEqual(rsp.status_code, 201) + + # write to the dset + req = f"{self.endpoint}/datasets/{dset_uuid}/value" + data = [] + for i in range(12): + row = [] + for j in range(15): + row.append(i * j) + data.append(row) + payload = {"value": data} + params = {"select": "[0:12, 0:15]"} + + rsp = self.session.put(req, data=json.dumps(payload), params=params, headers=headers) + self.assertEqual(rsp.status_code, 200) + + # read back the data + rsp = self.session.get(req, headers=headers) + self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + self.assertTrue("value" in rspJson) + self.assertEqual(rspJson["value"], data) + + # resize the dataset to 10 x 10 array + req = f"{self.endpoint}/datasets/{dset_uuid}/shape" + payload = {"shape": [10, 10]} + rsp = self.session.put(req, data=json.dumps(payload), headers=headers) + self.assertEqual(rsp.status_code, 201) + rspJson = json.loads(rsp.text) + + # read back the remaining elements + req = f"{self.endpoint}/datasets/{dset_uuid}/value" + rsp = self.session.get(req, headers=headers) + self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + self.assertTrue("value" in rspJson) + value = rspJson["value"] + 
self.assertEqual(len(value), 10) + for i in range(10): + row = value[i] + self.assertEqual(len(row), 10) + for j in range(10): + self.assertEqual(row[j], i * j) + + # resize back to 12, 15 + req = f"{self.endpoint}/datasets/{dset_uuid}/shape" + payload = {"shape": [12, 15]} + rsp = self.session.put(req, data=json.dumps(payload), headers=headers) + self.assertEqual(rsp.status_code, 201) + rspJson = json.loads(rsp.text) + + # read all the data values + req = f"{self.endpoint}/datasets/{dset_uuid}/value" + rsp = self.session.get(req, headers=headers) + self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + self.assertTrue("value" in rspJson) + value = rspJson["value"] + + # check that the re-extended area is zero's + self.assertEqual(len(value), 12) + for i in range(12): + row = value[i] + self.assertEqual(len(row), 15) + for j in range(15): + if j < 10 and i < 10: + self.assertEqual(row[j], i * j) + else: + self.assertEqual(row[j], 0) if __name__ == "__main__": From 0ee395fbc5a66bcdf6b18b5019ac6bdd6079950c Mon Sep 17 00:00:00 2001 From: jreadey Date: Thu, 19 Oct 2023 15:10:54 -0700 Subject: [PATCH 10/17] test for fetching points after reshape --- tests/integ/pointsel_test.py | 80 ++++++++++++++++++++++++++++++++++++ 1 file changed, 80 insertions(+) diff --git a/tests/integ/pointsel_test.py b/tests/integ/pointsel_test.py index 0fc5516b..be2335fc 100755 --- a/tests/integ/pointsel_test.py +++ b/tests/integ/pointsel_test.py @@ -1583,6 +1583,86 @@ def testSelect2DDataset(self): self.assertEqual(len(data), 3 * 4) self.assertEqual(data, b"\x1e\x00\x00\x00 \x00\x00\x00#\x00\x00\x00") + def testShapeUpdate(self): + + # Test selecting points after shape has been updated + print("testShapeUpdate", self.base_domain) + + points = [75,] + + headers = helper.getRequestHeaders(domain=self.base_domain) + req = self.endpoint + "/" + + # Get root uuid + rsp = self.session.get(req, headers=headers) + self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + root_uuid = rspJson["root"] + helper.validateId(root_uuid) + + # create dataset + data = {"type": "H5T_STD_I32LE", "shape": (100,), "maxdims": (100,)} + + req = self.endpoint + "/datasets" + rsp = self.session.post(req, data=json.dumps(data), headers=headers) + self.assertEqual(rsp.status_code, 201) + rspJson = json.loads(rsp.text) + dset_id = rspJson["id"] + self.assertTrue(helper.validateId(dset_id)) + + # link new dataset as 'dset1d' + name = "dset" + req = self.endpoint + "/groups/" + root_uuid + "/links/" + name + payload = {"id": dset_id} + rsp = self.session.put(req, data=json.dumps(payload), headers=headers) + self.assertEqual(rsp.status_code, 201) + + # write to the dset + data = list(range(100)) + data.reverse() # 99, 98, ..., 0 + + payload = {"value": data} + req = self.endpoint + "/datasets/" + dset_id + "/value" + + rsp = self.session.put(req, data=json.dumps(payload), headers=headers) + self.assertEqual(rsp.status_code, 200) + + body = {"points": points} + # read selected points + rsp = self.session.post(req, data=json.dumps(body), headers=headers) + self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + self.assertTrue("value" in rspJson) + ret_value = rspJson["value"] + self.assertEqual(len(ret_value), len(points)) + expected_result = [24, ] + + self.assertEqual(ret_value, expected_result) + + # resize the dataset to the small shape + req = self.endpoint + "/datasets/" + dset_id + "/shape" + payload = {"shape": 50} + rsp = self.session.put(req, data=json.dumps(payload), headers=headers) + 
self.assertEqual(rsp.status_code, 201) + rspJson = json.loads(rsp.text) + + # should get a 400 now + req = self.endpoint + "/datasets/" + dset_id + "/value" + rsp = self.session.post(req, data=json.dumps(body), headers=headers) + self.assertEqual(rsp.status_code, 400) + + # resize back to large shape + payload = {"shape": 100} + rsp = self.session.put(req, data=json.dumps(payload), headers=headers) + self.assertEqual(rsp.status_code, 201) + rspJson = json.loads(rsp.text) + + # read point again + req = self.endpoint + "/datasets/" + dset_id + "/value" + rsp = self.session.post(req, data=json.dumps(body), headers=headers) + self.assertEqual(rsp.status_code, 200) + + if __name__ == "__main__": # setup test files From 51df3a2773fe163e66ec8a4218e5ecc0edfd69ec Mon Sep 17 00:00:00 2001 From: jreadey Date: Fri, 20 Oct 2023 10:34:21 -0700 Subject: [PATCH 11/17] fix for pt selection fail after shape update - #276 --- hsds/chunk_sn.py | 70 ++++++++------------------ hsds/dset_sn.py | 88 ++++++++++++++++++++------------- hsds/servicenode_lib.py | 25 ++++++++++ tests/integ/dataset_test.py | 18 ++++++- tests/integ/pointsel_test.py | 87 +-------------------------------- tests/integ/value_test.py | 95 +++++++++++++++++++++++++++++++++++- 6 files changed, 211 insertions(+), 172 deletions(-) diff --git a/hsds/chunk_sn.py b/hsds/chunk_sn.py index 5aa4fd91..f24cbfbe 100755 --- a/hsds/chunk_sn.py +++ b/hsds/chunk_sn.py @@ -42,7 +42,7 @@ from .util.arrayUtil import squeezeArray, getNumpyValue, getBroadcastShape from .util.authUtil import getUserPasswordFromRequest, validateUserPassword from .util.boolparser import BooleanParser -from .servicenode_lib import getObjectJson, validateAction +from .servicenode_lib import getDsetJson, validateAction from .chunk_crawl import ChunkCrawler from . import config from . import hsds_logger as log @@ -72,7 +72,7 @@ def get_hrefs(request, dset_json): return hrefs -async def get_slices(app, select, dset_json, bucket=None): +def get_slices(app, select, dset_json): """Get desired slices from selection query param string or json value. If select is none or empty, slices for entire datashape will be returned. 
@@ -87,35 +87,14 @@ async def get_slices(app, select, dset_json, bucket=None): raise HTTPBadRequest(reason=msg) dims = getShapeDims(datashape) # throws 400 for HS_NULL dsets - maxdims = getDsetMaxDims(dset_json) - - # refetch the dims if the dataset is extensible and request or hasn't - # provided an explicit region - if isExtensible(dims, maxdims) and (select is None or not select): - kwargs = {"bucket": bucket, "refresh": True} - dset_json = await getObjectJson(app, dset_id, **kwargs) - dims = getShapeDims(dset_json["shape"]) - slices = None # selection for read - if isExtensible and select: - try: - slices = getSelectionList(select, dims) - except ValueError: - # exception might be due to us having stale version of dims, - # so use refresh - kwargs = {"bucket": bucket, "refresh": True} - dset_json = await getObjectJson(app, dset_id, **kwargs) - dims = getShapeDims(dset_json["shape"]) - slices = None # retry below - - if slices is None: - try: - slices = getSelectionList(select, dims) - except ValueError: - msg = f"Invalid selection: {select} on dims: {dims} " - msg += f"for dataset: {dset_id}" - log.warn(msg) - raise HTTPBadRequest(reason=msg) + try: + slices = getSelectionList(select, dims) + except ValueError: + msg = f"Invalid selection: {select} on dims: {dims} " + msg += f"for dataset: {dset_id}" + log.warn(msg) + raise HTTPBadRequest(reason=msg) return slices @@ -254,8 +233,7 @@ def getChunkItem(chunkid): raise HTTPInternalServerError() chunktable_id = layout["chunk_table"] # get state for dataset from DN. - kwargs = {"bucket": bucket, "refresh": False} - chunktable_json = await getObjectJson(app, chunktable_id, **kwargs) + chunktable_json = await getDsetJson(app, chunktable_id, bucket=bucket) # log.debug(f"chunktable_json: {chunktable_json}") chunktable_dims = getShapeDims(chunktable_json["shape"]) chunktable_layout = chunktable_json["layout"] @@ -523,7 +501,7 @@ async def PUT_Value(request): log.info(f"append_dim: {append_dim}") # get state for dataset from DN. 
- dset_json = await getObjectJson(app, dset_id, bucket=bucket, refresh=False) + dset_json = await getDsetJson(app, dset_id, bucket=bucket) layout = None datashape = dset_json["shape"] @@ -568,7 +546,7 @@ async def PUT_Value(request): raise HTTPBadRequest(reason=msg) select = params.get("select") - slices = await get_slices(app, select, dset_json, bucket=bucket) + slices = get_slices(app, select, dset_json) if "Limit" in params: try: limit = int(params["Limit"]) @@ -676,12 +654,6 @@ async def PUT_Value(request): log.warn("unable to append to dataspace") raise HTTPConflict() - # refetch the dims if the dataset is extensible - if isExtensible(dims, maxdims): - kwargs = {"bucket": bucket, "refresh": True} - dset_json = await getObjectJson(app, dset_id, **kwargs) - dims = getShapeDims(dset_json["shape"]) - if request_type == "json": if "value" in body: json_data = body["value"] @@ -737,10 +709,10 @@ async def PUT_Value(request): elif points is None: if body and "start" in body and "stop" in body: - slices = await get_slices(app, body, dset_json, bucket=bucket) + slices = get_slices(app, body, dset_json) else: select = params.get("select") - slices = await get_slices(app, select, dset_json, bucket=bucket) + slices = get_slices(app, select, dset_json) # The selection parameters will determine expected put value shape log.debug(f"PUT Value selection: {slices}") @@ -992,7 +964,7 @@ async def PUT_Value(request): else: # - # Do point PUT + # Do point post # log.debug(f"num_points: {num_points}") @@ -1111,10 +1083,10 @@ async def GET_Value(request): bucket = getBucketForDomain(domain) # get state for dataset from DN. - # Note - refreshShape will do a refresh if the dataset is extensible + # Note - this will do a refresh if the dataset is extensible # i.e. we need to make sure we have the correct shape dimensions - dset_json = await getObjectJson(app, dset_id, bucket=bucket, refresh=True) + dset_json = await getDsetJson(app, dset_id, bucket=bucket) type_json = dset_json["type"] dset_dtype = createDataType(type_json) @@ -1137,7 +1109,7 @@ async def GET_Value(request): select = params.get("select") if select: log.debug(f"select query param: {select}") - slices = await get_slices(app, select, dset_json, bucket=bucket) + slices = get_slices(app, select, dset_json) log.debug(f"GET Value selection: {slices}") limit = 0 @@ -1569,7 +1541,7 @@ async def getSelectionData( await getChunkLocations(app, dset_id, dset_json, chunkinfo, chunk_ids, bucket=bucket) if slices is None: - slices = await get_slices(app, None, dset_json, bucket=bucket) + slices = get_slices(app, None, dset_json) if points is None: # get chunk selections for hyperslab select @@ -1649,7 +1621,7 @@ async def POST_Value(request): raise HTTPBadRequest(reason=msg) # get state for dataset from DN. 
- dset_json = await getObjectJson(app, dset_id, bucket=bucket) + dset_json = await getDsetJson(app, dset_id, bucket=bucket) datashape = dset_json["shape"] if datashape["class"] == "H5S_NULL": @@ -1691,7 +1663,7 @@ async def POST_Value(request): elif "select" in body: select = body["select"] log.debug(f"select: {select}") - slices = await get_slices(app, select, dset_json, bucket=bucket) + slices = get_slices(app, select, dset_json) log.debug(f"got slices: {slices}") else: msg = "Expected points or select key in request body" diff --git a/hsds/dset_sn.py b/hsds/dset_sn.py index 94241541..d4fb0a96 100755 --- a/hsds/dset_sn.py +++ b/hsds/dset_sn.py @@ -25,7 +25,7 @@ from .util.dsetUtil import getPreviewQuery, getFilterItem, getChunkLayout from .util.arrayUtil import getNumElements, getShapeDims, getNumpyValue from .util.chunkUtil import getChunkSize, guessChunk, expandChunk, shrinkChunk -from .util.chunkUtil import getContiguousLayout, getChunkIds +from .util.chunkUtil import getContiguousLayout, getChunkIds, getChunkSelection from .util.authUtil import getUserPasswordFromRequest, aclCheck from .util.authUtil import validateUserPassword from .util.domainUtil import getDomainFromRequest, getPathForDomain, isValidDomain @@ -33,7 +33,7 @@ from .util.storUtil import getFilters from .util.hdf5dtype import validateTypeItem, createDataType, getBaseTypeJson from .util.hdf5dtype import getItemSize -from .servicenode_lib import getDomainJson, getObjectJson, getPathForObjectId +from .servicenode_lib import getDomainJson, getObjectJson, getDsetJson, getPathForObjectId from .servicenode_lib import getObjectIdByPath, validateAction, getRootInfo from .chunk_crawl import ChunkCrawler from . import config @@ -189,9 +189,7 @@ async def validateChunkLayout(app, shape_json, item_size, layout, bucket=None): raise HTTPBadRequest(reason=msg) # verify the chunk table exists and is of reasonable shape try: - chunktable_json = await getObjectJson( - app, chunktable_id, bucket=bucket, refresh=False - ) + chunktable_json = await getDsetJson(app, chunktable_id, bucket=bucket) except HTTPNotFound: msg = f"chunk table id: {chunktable_id} not found" log.warn(msg) @@ -343,9 +341,8 @@ async def GET_Dataset(request): # get authoritative state for dataset from DN (even if it's # in the meta_cache). - dset_json = await getObjectJson( - app, dset_id, refresh=True, include_attrs=include_attrs, bucket=bucket - ) + kwargs = {"refresh": True, "include_attrs": include_attrs, "bucket": bucket} + dset_json = await getDsetJson(app, dset_id, **kwargs) # check that we have permissions to read the object await validateAction(app, domain, dset_id, username, "read") @@ -444,7 +441,7 @@ async def GET_DatasetType(request): # get authoritative state for group from DN (even if it's in # the meta_cache). - dset_json = await getObjectJson(app, dset_id, refresh=True, bucket=bucket) + dset_json = await getDsetJson(app, dset_id, refresh=True, bucket=bucket) await validateAction(app, domain, dset_id, username, "read") @@ -496,7 +493,7 @@ async def GET_DatasetShape(request): # get authoritative state for dataset from DN (even if it's in # the meta_cache). 
- dset_json = await getObjectJson(app, dset_id, refresh=True, bucket=bucket) + dset_json = await getDsetJson(app, dset_id, refresh=True, bucket=bucket) await validateAction(app, domain, dset_id, username, "read") @@ -601,9 +598,7 @@ async def PUT_DatasetShape(request): # verify the user has permission to update shape await validateAction(app, domain, dset_id, username, "update") - # get authoritative state for dataset from DN (even if it's in the - # meta_cache). - dset_json = await getObjectJson(app, dset_id, refresh=True, bucket=bucket) + dset_json = await getDsetJson(app, dset_id, bucket=bucket) shape_orig = dset_json["shape"] log.debug(f"shape_orig: {shape_orig}") @@ -666,44 +661,69 @@ async def PUT_DatasetShape(request): layout = getChunkLayout(dset_json) log.debug(f"got layout: {layout}") + delete_ids = set() # chunk ids that will need to be deleted for n in range(rank): if dims[n] <= shape_update[i]: log.debug(f"skip dimension {n}") continue log.debug(f"reinitialize for dimension: {n}") slices = [] + update_ids = set() # chunk ids that will need to be updated + for m in range(rank): if m == n: s = slice(shape_update[m], dims[m], 1) else: # just select the entire extent - s = slice(0, dims[m]) + s = slice(0, dims[m], 1) slices.append(s) log.debug(f"shape_reinitialize - got slices: {slices} for dimension: {n}") chunk_ids = getChunkIds(dset_id, slices, layout) log.debug(f"got chunkIds: {chunk_ids}") - chunk_ids.sort() - - crawler = ChunkCrawler( - app, - chunk_ids, - dset_json=dset_json, - bucket=bucket, - slices=slices, - arr=arr, - action="write_chunk_hyperslab", - ) - await crawler.crawl() - - crawler_status = crawler.get_status() - - if crawler_status not in (200, 201): - msg = f"crawler failed for shape reinitialize with status: {crawler_status}" - log.warn(msg) + # separate ids into those that overlap the new shape + # vs. those that follow entirely outside the new shape. 
+ # The former will need to be partiaally reset, the latter + # will need to be deleted + for chunk_id in chunk_ids: + if getChunkSelection(chunk_id, slices, layout) is None: + delete_ids.add(chunk_id) + else: + update_ids.add(chunk_id) + + if update_ids: + update_ids = list(update_ids) + update_ids.sort() + log.debug(f"these ids will need to be updated: {update_ids}") + + crawler = ChunkCrawler( + app, + update_ids, + dset_json=dset_json, + bucket=bucket, + slices=slices, + arr=arr, + action="write_chunk_hyperslab", + ) + await crawler.crawl() + + crawler_status = crawler.get_status() + + if crawler_status not in (200, 201): + msg = f"crawler failed for shape reinitialize with status: {crawler_status}" + log.warn(msg) + else: + msg = f"crawler success for reinitialization with slices: {slices}" + log.info(msg) else: - msg = f"crawler success for reinitialization with slices: {slices}" - log.info(msg) + log.info(f"no chunks need updating for shape reduction over dim {m}") + + if delete_ids: + delete_ids = list(delete_ids) + delete_ids.sort() + log.debug(f"these ids will need to be deleted: {delete_ids}") + else: + log.info("no chunks need deletion for shape reduction") # send request onto DN req = getDataNodeUrl(app, dset_id) + "/datasets/" + dset_id + "/shape" diff --git a/hsds/servicenode_lib.py b/hsds/servicenode_lib.py index 416f1fc5..57711067 100644 --- a/hsds/servicenode_lib.py +++ b/hsds/servicenode_lib.py @@ -215,6 +215,31 @@ async def getObjectJson( return obj_json +async def getDsetJson(app, dset_id, + bucket=None, + refresh=False, + include_links=False, + include_attrs=False): + kwargs = {} + kwargs["bucket"] = bucket + kwargs["refresh"] = refresh + kwargs["include_links"] = include_links + kwargs["include_attrs"] = include_attrs + dset_json = await getObjectJson(app, dset_id, **kwargs) + if refresh: + # can just return the json + return dset_json + + # check to see if the dataspace is mutable + # if so, refresh if necessary + datashape = dset_json["shape"] + if "maxdims" in datashape: + log.debug("getDsetJson - refreshing json for mutable shape") + kwargs["refresh"] = True + dset_json = await getObjectJson(app, dset_id, **kwargs) + return dset_json + + async def getObjectIdByPath(app, obj_id, h5path, bucket=None, refresh=False, domain=None, follow_soft_links=False, follow_external_links=False): """Find the object at the provided h5path location. 
diff --git a/tests/integ/dataset_test.py b/tests/integ/dataset_test.py index ab23eb06..e7f5cf73 100755 --- a/tests/integ/dataset_test.py +++ b/tests/integ/dataset_test.py @@ -830,7 +830,23 @@ def testExtendDataset(self): self.assertTrue("root" in rspJson) root_uuid = rspJson["root"] - # create the dataset + # create non-extendable dataset + req = self.endpoint + "/datasets" + payload = {"type": "H5T_STD_I32LE", "shape": 10} + req = self.endpoint + "/datasets" + rsp = self.session.post(req, data=json.dumps(payload), headers=headers) + self.assertEqual(rsp.status_code, 201) # create dataset + rspJson = json.loads(rsp.text) + dset_uuid = rspJson["id"] + self.assertTrue(helper.validateId(dset_uuid)) + + # try extending it (should fail) + req = self.endpoint + "/datasets/" + dset_uuid + "/shape" + payload = {"extend": 5} + rsp = self.session.put(req, data=json.dumps(payload), headers=headers) + self.assertEqual(rsp.status_code, 400) + + # create extendable dataset req = self.endpoint + "/datasets" payload = {"type": "H5T_STD_I32LE", "shape": 10, "maxdims": 20} req = self.endpoint + "/datasets" diff --git a/tests/integ/pointsel_test.py b/tests/integ/pointsel_test.py index be2335fc..f14ad053 100755 --- a/tests/integ/pointsel_test.py +++ b/tests/integ/pointsel_test.py @@ -1569,12 +1569,7 @@ def testSelect2DDataset(self): rspJson = json.loads(rsp.text) self.assertTrue("hrefs" in rspJson) self.assertTrue("value" in rspJson) - self.assertEqual( - rspJson["value"], - [ - [30, 32, 35], - ], - ) + self.assertEqual(rspJson["value"], [[30, 32, 35],], ) # read a coordinate selection with binary response rsp = self.session.post(req, data=json.dumps(body), headers=headers_bin_rsp) @@ -1583,86 +1578,6 @@ def testSelect2DDataset(self): self.assertEqual(len(data), 3 * 4) self.assertEqual(data, b"\x1e\x00\x00\x00 \x00\x00\x00#\x00\x00\x00") - def testShapeUpdate(self): - - # Test selecting points after shape has been updated - print("testShapeUpdate", self.base_domain) - - points = [75,] - - headers = helper.getRequestHeaders(domain=self.base_domain) - req = self.endpoint + "/" - - # Get root uuid - rsp = self.session.get(req, headers=headers) - self.assertEqual(rsp.status_code, 200) - rspJson = json.loads(rsp.text) - root_uuid = rspJson["root"] - helper.validateId(root_uuid) - - # create dataset - data = {"type": "H5T_STD_I32LE", "shape": (100,), "maxdims": (100,)} - - req = self.endpoint + "/datasets" - rsp = self.session.post(req, data=json.dumps(data), headers=headers) - self.assertEqual(rsp.status_code, 201) - rspJson = json.loads(rsp.text) - dset_id = rspJson["id"] - self.assertTrue(helper.validateId(dset_id)) - - # link new dataset as 'dset1d' - name = "dset" - req = self.endpoint + "/groups/" + root_uuid + "/links/" + name - payload = {"id": dset_id} - rsp = self.session.put(req, data=json.dumps(payload), headers=headers) - self.assertEqual(rsp.status_code, 201) - - # write to the dset - data = list(range(100)) - data.reverse() # 99, 98, ..., 0 - - payload = {"value": data} - req = self.endpoint + "/datasets/" + dset_id + "/value" - - rsp = self.session.put(req, data=json.dumps(payload), headers=headers) - self.assertEqual(rsp.status_code, 200) - - body = {"points": points} - # read selected points - rsp = self.session.post(req, data=json.dumps(body), headers=headers) - self.assertEqual(rsp.status_code, 200) - rspJson = json.loads(rsp.text) - self.assertTrue("value" in rspJson) - ret_value = rspJson["value"] - self.assertEqual(len(ret_value), len(points)) - expected_result = [24, ] - - 
self.assertEqual(ret_value, expected_result) - - # resize the dataset to the small shape - req = self.endpoint + "/datasets/" + dset_id + "/shape" - payload = {"shape": 50} - rsp = self.session.put(req, data=json.dumps(payload), headers=headers) - self.assertEqual(rsp.status_code, 201) - rspJson = json.loads(rsp.text) - - # should get a 400 now - req = self.endpoint + "/datasets/" + dset_id + "/value" - rsp = self.session.post(req, data=json.dumps(body), headers=headers) - self.assertEqual(rsp.status_code, 400) - - # resize back to large shape - payload = {"shape": 100} - rsp = self.session.put(req, data=json.dumps(payload), headers=headers) - self.assertEqual(rsp.status_code, 201) - rspJson = json.loads(rsp.text) - - # read point again - req = self.endpoint + "/datasets/" + dset_id + "/value" - rsp = self.session.post(req, data=json.dumps(body), headers=headers) - self.assertEqual(rsp.status_code, 200) - - if __name__ == "__main__": # setup test files diff --git a/tests/integ/value_test.py b/tests/integ/value_test.py index 18aac784..face21a7 100755 --- a/tests/integ/value_test.py +++ b/tests/integ/value_test.py @@ -3218,10 +3218,10 @@ def testValueReinitialization1D(self): self.assertEqual(value[0:5], data[0:5]) self.assertEqual(value[5:10], [42,] * 5) - def testValueReinitialization2D(self): + def testShapeReinitialization2D(self): # Test the dataset values get reset after a reduction and resize - print("testValueReinitialization1D", self.base_domain) + print("testShapeReinitialization2D", self.base_domain) headers = helper.getRequestHeaders(domain=self.base_domain) # get domain @@ -3316,6 +3316,97 @@ def testValueReinitialization2D(self): else: self.assertEqual(row[j], 0) + def testShapeReinitialization3D(self): + # Test the dataset values get reset after a reduction and resize + + print("testPointReinitialization3D", self.base_domain) + headers = helper.getRequestHeaders(domain=self.base_domain) + + # get domain + req = f"{self.endpoint}/" + rsp = self.session.get(req, headers=headers) + rspJson = json.loads(rsp.text) + self.assertTrue("root" in rspJson) + root_uuid = rspJson["root"] + + # define two different shapes that we'll switch between + # min extent in each dimension is 20 for the point setup to work + large_shape = (110, 120, 130) + small_shape = (55, 60, 70) + + # setup some points on the diagonal + # space some points apart equally + delta = (large_shape[0] // 10, large_shape[1] // 10, large_shape[2] // 10) + offset = (5, 5, 5) + points = [] + for i in range(10): + if i == 0: + pt = offset + else: + last_pt = points[i - 1] + pt = (last_pt[0] + delta[0], last_pt[1] + delta[1], last_pt[2] + delta[2]) + for n in range(3): + if pt[n] >= large_shape[n]: + raise ValueError("pt outside extent") + points.append(pt) + + # create the dataset + req = f"{self.endpoint}/datasets" + payload = {"type": "H5T_STD_I32LE", "shape": large_shape, "maxdims": large_shape} + req = self.endpoint + "/datasets" + rsp = self.session.post(req, data=json.dumps(payload), headers=headers) + self.assertEqual(rsp.status_code, 201) # create dataset + rspJson = json.loads(rsp.text) + dset_uuid = rspJson["id"] + self.assertTrue(helper.validateId(dset_uuid)) + + # link new dataset as 'dset' + name = "dset" + req = f"{self.endpoint}/groups/{root_uuid}/links/{name}" + payload = {"id": dset_uuid} + rsp = self.session.put(req, data=json.dumps(payload), headers=headers) + self.assertEqual(rsp.status_code, 201) + + value = [1, ] * 10 # set value of each pt to one + + # write 1's to all the point locations + payload = 
{"points": points, "value": value} + req = f"{self.endpoint}/datasets/{dset_uuid}/value" + rsp = self.session.put(req, data=json.dumps(payload), headers=headers) + self.assertEqual(rsp.status_code, 200) + + # resize the dataset to the small shape + req = f"{self.endpoint}/datasets/{dset_uuid}/shape" + payload = {"shape": small_shape} + rsp = self.session.put(req, data=json.dumps(payload), headers=headers) + self.assertEqual(rsp.status_code, 201) + rspJson = json.loads(rsp.text) + + # resize back to large shape + req = f"{self.endpoint}/datasets/{dset_uuid}/shape" + payload = {"shape": large_shape} + rsp = self.session.put(req, data=json.dumps(payload), headers=headers) + self.assertEqual(rsp.status_code, 201) + rspJson = json.loads(rsp.text) + + # read all the data values + req = f"{self.endpoint}/datasets/{dset_uuid}/value" + body = {"points": points} + # read selected points + rsp = self.session.post(req, data=json.dumps(body), headers=headers) + self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + self.assertTrue("value" in rspJson) + ret_value = rspJson["value"] + + for i in range(10): + pt = points[i] + n = ret_value[i] + if pt[0] >= small_shape[0] and pt[1] >= small_shape[1] and pt[2] >= small_shape[2]: + self.assertEqual(n, 0) + else: + self.assertEqual(n, 1) + if __name__ == "__main__": # setup test files From 21f08c0f7118c7bb3f9e1535d8b6e0401c4d88ef Mon Sep 17 00:00:00 2001 From: jreadey Date: Fri, 20 Oct 2023 18:15:45 -0700 Subject: [PATCH 12/17] delete chunk outside reduced shape region --- hsds/dset_sn.py | 38 +++++++++++++++++++------- hsds/servicenode_lib.py | 57 +++++++++++++++++++++++++++++++++++++-- tests/integ/value_test.py | 6 +++-- 3 files changed, 87 insertions(+), 14 deletions(-) diff --git a/hsds/dset_sn.py b/hsds/dset_sn.py index d4fb0a96..338a4cbc 100755 --- a/hsds/dset_sn.py +++ b/hsds/dset_sn.py @@ -34,7 +34,7 @@ from .util.hdf5dtype import validateTypeItem, createDataType, getBaseTypeJson from .util.hdf5dtype import getItemSize from .servicenode_lib import getDomainJson, getObjectJson, getDsetJson, getPathForObjectId -from .servicenode_lib import getObjectIdByPath, validateAction, getRootInfo +from .servicenode_lib import getObjectIdByPath, validateAction, getRootInfo, removeChunks from .chunk_crawl import ChunkCrawler from . import config from . 
import hsds_logger as log @@ -524,6 +524,7 @@ async def PUT_DatasetShape(request): shape_update = None extend = 0 extend_dim = 0 + hrefs = [] # TBD - define HATEOAS refs to return dset_id = request.match_info.get("id") if not dset_id: @@ -556,13 +557,16 @@ log.warn(msg) raise HTTPBadRequest(reason=msg) + if "shape" in data and "extend" in data: + msg = "PUT shape must have shape or extend key in body but not both" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + if "shape" in data: shape_update = data["shape"] if isinstance(shape_update, int): # convert to a list - shape_update = [ - shape_update, - ] + shape_update = [shape_update, ] log.debug(f"shape_update: {shape_update}") if "extend" in data: @@ -619,6 +623,23 @@ log.warn(msg) raise HTTPBadRequest(reason=msg) + if extend_dim < 0 or extend_dim >= rank: + msg = "Extension dimension must be less than rank and non-negative" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + + if shape_update is None: + # construct a shape update using original dims and extend dim and value + shape_update = dims.copy() + shape_update[extend_dim] = extend + + if shape_update == dims: + log.info("shape update is same as current dims, no action needed") + json_resp = {"hrefs": hrefs} + resp = await jsonResponse(request, json_resp, status=200) + log.response(request, resp=resp) + return resp + shape_reduction = False for i in range(rank): if shape_update and shape_update[i] < dims[i]: @@ -632,11 +653,6 @@ log.warn(msg) raise HTTPConflict() - if extend_dim < 0 or extend_dim >= rank: - msg = "Extension dimension must be less than rank and non-negative" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - if shape_reduction: log.info(f"Shape extent reduced for dataset (rank: {rank})") @@ -718,17 +734,19 @@ else: log.info(f"no chunks need updating for shape reduction over dim {m}") + log.debug("chunk reinitialization complete") if delete_ids: delete_ids = list(delete_ids) delete_ids.sort() log.debug(f"these ids will need to be deleted: {delete_ids}") + await removeChunks(app, delete_ids, bucket=bucket) else: log.info("no chunks need deletion for shape reduction") # send request onto DN req = getDataNodeUrl(app, dset_id) + "/datasets/" + dset_id + "/shape" - json_resp = {"hrefs": []} + json_resp = {"hrefs": hrefs} params = {} if bucket: params["bucket"] = bucket diff --git a/hsds/servicenode_lib.py b/hsds/servicenode_lib.py index 57711067..5c0047ab 100644 --- a/hsds/servicenode_lib.py +++ b/hsds/servicenode_lib.py @@ -13,9 +13,11 @@ # utility methods for service node handlers # +import asyncio + from aiohttp.web_exceptions import HTTPBadRequest, HTTPForbidden from aiohttp.web_exceptions import HTTPNotFound, HTTPInternalServerError -from aiohttp.client_exceptions import ClientOSError +from aiohttp.client_exceptions import ClientOSError, ClientError from .util.authUtil import getAclKeys from .util.idUtil import getDataNodeUrl, getCollectionForId, isSchema2Id from .util.idUtil import getS3Key from .util.linkUtil import h5Join from .util.storUtil import getStorJSONObj, isStorObj from .util.authUtil import aclCheck -from .util.httpUtil import http_get +from .util.httpUtil import http_get, http_delete from .util.domainUtil import getBucketForDomain, verifyRoot from . 
import hsds_logger as log @@ -485,3 +487,54 @@ async def getRootInfo(app, root_id, bucket=None): return None return info_json + + +async def removeChunks(app, chunk_ids, bucket=None): + """ Remove chunks with the given ids """ + + log.info(f"removeChunks, {len(chunk_ids)} chunks") + log.debug(f"removeChunks for: {chunk_ids}") + + dn_urls = app["dn_urls"] + if not dn_urls: + log.error("removeChunks request, but no dn_urls") + raise HTTPInternalServerError() + + log.debug(f"doFlush - dn_urls: {dn_urls}") + params = {} + if bucket: + params["bucket"] = bucket + failed_count = 0 + + try: + tasks = [] + for chunk_id in chunk_ids: + dn_url = getDataNodeUrl(app, chunk_id) + req = dn_url + "/chunks/" + chunk_id + task = asyncio.ensure_future(http_delete(app, req, params=params)) + tasks.append(task) + done, pending = await asyncio.wait(tasks) + if pending: + # should be empty since we didn't use return_when parameter + log.error("removeChunks - got pending tasks") + raise HTTPInternalServerError() + for task in done: + if task.exception(): + exception_type = type(task.exception()) + msg = f"removeChunks - task had exception: {exception_type}" + log.warn(msg) + failed_count += 1 + + except ClientError as ce: + msg = f"removeChunks - ClientError: {ce}" + log.error(msg) + raise HTTPInternalServerError() + except asyncio.CancelledError as cle: + log.error(f"removeChunks - CancelledError: {cle}") + raise HTTPInternalServerError() + + if failed_count: + msg = f"removeChunks, failed count: {failed_count}" + log.error(msg) + else: + log.info(f"removeChunks complete for {len(chunk_ids)} chunks - no errors") diff --git a/tests/integ/value_test.py b/tests/integ/value_test.py index face21a7..fd9e2e29 100755 --- a/tests/integ/value_test.py +++ b/tests/integ/value_test.py @@ -3331,7 +3331,7 @@ def testShapeReinitialization3D(self): # define two different shapes that we'll switch between # min extent in each dimension is 20 for the point setup to work - large_shape = (110, 120, 130) + large_shape = (220, 120, 130) small_shape = (55, 60, 70) # setup some points on the diagonal @@ -3398,11 +3398,13 @@ def testShapeReinitialization3D(self): rspJson = json.loads(rsp.text) self.assertTrue("value" in rspJson) ret_value = rspJson["value"] + print(ret_value) for i in range(10): pt = points[i] n = ret_value[i] - if pt[0] >= small_shape[0] and pt[1] >= small_shape[1] and pt[2] >= small_shape[2]: + print(f"{pt}: {n}") + if pt[0] >= small_shape[0] or pt[1] >= small_shape[1] or pt[2] >= small_shape[2]: self.assertEqual(n, 0) else: self.assertEqual(n, 1) From dc6571f8d0cf3d85b2bf7a32fa3bbe531ba83ed4 Mon Sep 17 00:00:00 2001 From: jreadey Date: Sat, 21 Oct 2023 13:54:10 -0700 Subject: [PATCH 13/17] added dset_lib.py --- hsds/chunk_crawl.py | 12 ++++-------- hsds/chunk_sn.py | 12 ++++-------- hsds/datanode_lib.py | 12 ++++-------- hsds/dset_lib.py | 41 +++++++++++++++++++++++++++++++++++++++ hsds/dset_sn.py | 19 ++---------------- tests/integ/value_test.py | 2 -- 6 files changed, 55 insertions(+), 43 deletions(-) create mode 100755 hsds/dset_lib.py diff --git a/hsds/chunk_crawl.py b/hsds/chunk_crawl.py index dade9fa5..8660ad56 100755 --- a/hsds/chunk_crawl.py +++ b/hsds/chunk_crawl.py @@ -31,8 +31,10 @@ from .util.dsetUtil import getSelectionShape, getChunkLayout from .util.chunkUtil import getChunkCoverage, getDataCoverage from .util.chunkUtil import getChunkIdForPartition, getQueryDtype -from .util.arrayUtil import jsonToArray, getShapeDims, getNumpyValue +from .util.arrayUtil import jsonToArray, getShapeDims from 
.util.arrayUtil import getNumElements, arrayToBytes, bytesToArray +from .dset_lib import getFillValue + from . import config from . import hsds_logger as log @@ -422,14 +424,8 @@ async def read_point_sel( np_arr_rsp = None dt = np_arr.dtype - fill_value = None # initialize to fill_value if specified - if "creationProperties" in dset_json: - cprops = dset_json["creationProperties"] - if "fillValue" in cprops: - fill_value_prop = cprops["fillValue"] - encoding = cprops.get("fillValue_encoding") - fill_value = getNumpyValue(fill_value_prop, dt=dt, encoding=encoding) + fill_value = getFillValue(dset_json) def defaultArray(): # no data, return zero array diff --git a/hsds/chunk_sn.py b/hsds/chunk_sn.py index f24cbfbe..b0bf8dff 100755 --- a/hsds/chunk_sn.py +++ b/hsds/chunk_sn.py @@ -39,10 +39,11 @@ from .util.chunkUtil import getQueryDtype, get_chunktable_dims from .util.arrayUtil import bytesArrayToList, jsonToArray, getShapeDims from .util.arrayUtil import getNumElements, arrayToBytes, bytesToArray -from .util.arrayUtil import squeezeArray, getNumpyValue, getBroadcastShape +from .util.arrayUtil import squeezeArray, getBroadcastShape from .util.authUtil import getUserPasswordFromRequest, validateUserPassword from .util.boolparser import BooleanParser from .servicenode_lib import getDsetJson, validateAction +from .dset_lib import getFillValue from .chunk_crawl import ChunkCrawler from . import config from . import hsds_logger as log @@ -1407,13 +1408,8 @@ async def doReadSelection( raise HTTPBadRequest(reason=msg) # initialize to fill_value if specified - fill_value = None - if "creationProperties" in dset_json: - cprops = dset_json["creationProperties"] - if "fillValue" in cprops: - fill_value_prop = cprops["fillValue"] - encoding = cprops.get("fillValue_encoding") - fill_value = getNumpyValue(fill_value_prop, dt=dset_dtype, encoding=encoding) + fill_value = getFillValue(dset_json) + if fill_value: arr = np.empty(np_shape, dtype=dset_dtype, order="C") arr[...] = fill_value diff --git a/hsds/datanode_lib.py b/hsds/datanode_lib.py index e87b063e..36d29ae3 100644 --- a/hsds/datanode_lib.py +++ b/hsds/datanode_lib.py @@ -32,12 +32,13 @@ from .util.dsetUtil import getChunkLayout, getFilterOps from .util.dsetUtil import getChunkInitializer, getSliceQueryParam from .util.chunkUtil import getDatasetId, getChunkSelection, getChunkIndex -from .util.arrayUtil import arrayToBytes, bytesToArray, getShapeDims, jsonToArray, getNumpyValue +from .util.arrayUtil import arrayToBytes, bytesToArray, getShapeDims, jsonToArray from .util.hdf5dtype import createDataType, getItemSize from .util.rangegetUtil import ChunkLocation, chunkMunge from . import config from . import hsds_logger as log +from .dset_lib import getFillValue # supported initializer commands INITIALIZER_CMDS = ["chunklocator", "arange"] @@ -1119,13 +1120,8 @@ async def get_chunk( if chunk_arr is None: # normal fill value based init or initializer failed - fill_value = None - if "creationProperties" in dset_json: - cprops = dset_json["creationProperties"] - if "fillValue" in cprops: - fill_value_prop = cprops["fillValue"] - encoding = cprops.get("fillValue_encoding") - fill_value = getNumpyValue(fill_value_prop, dt=dt, encoding=encoding) + fill_value = getFillValue(dset_json) + if fill_value: chunk_arr = np.empty(dims, dtype=dt, order="C") chunk_arr[...] 
= fill_value diff --git a/hsds/dset_lib.py b/hsds/dset_lib.py new file mode 100755 index 00000000..d23be6c0 --- /dev/null +++ b/hsds/dset_lib.py @@ -0,0 +1,41 @@ +############################################################################## +# Copyright by The HDF Group. # +# All rights reserved. # +# # +# This file is part of HSDS (HDF5 Scalable Data Service), Libraries and # +# Utilities. The full HSDS copyright notice, including # +# terms governing use, modification, and redistribution, is contained in # +# the file COPYING, which can be found at the root of the source code # +# distribution tree. If you do not have access to this file, you may # +# request a copy from help@hdfgroup.org. # +############################################################################## + +import numpy as np + +from .util.hdf5dtype import createDataType +from .util.arrayUtil import getNumpyValue +from . import hsds_logger as log + + +def getFillValue(dset_json): + """ Return the fill value of the given dataset as a numpy array. + If no fill value is defined, return an zero array of given type """ + + fill_value = None + type_json = dset_json["type"] + dt = createDataType(type_json) + + if "creationProperties" in dset_json: + cprops = dset_json["creationProperties"] + if "fillValue" in cprops: + fill_value_prop = cprops["fillValue"] + log.debug(f"got fo;;+value_prop: {fill_value_prop}") + encoding = cprops.get("fillValue_encoding") + fill_value = getNumpyValue(fill_value_prop, dt=dt, encoding=encoding) + if fill_value: + arr = np.empty((1,), dtype=dt, order="C") + arr[...] = fill_value + else: + arr = np.zeros([1,], dtype=dt, order="C") + + return arr diff --git a/hsds/dset_sn.py b/hsds/dset_sn.py index 338a4cbc..49bfd514 100755 --- a/hsds/dset_sn.py +++ b/hsds/dset_sn.py @@ -15,7 +15,6 @@ # import math -import numpy as np from json import JSONDecodeError from aiohttp.web_exceptions import HTTPBadRequest, HTTPNotFound, HTTPConflict @@ -35,6 +34,7 @@ from .util.hdf5dtype import getItemSize from .servicenode_lib import getDomainJson, getObjectJson, getDsetJson, getPathForObjectId from .servicenode_lib import getObjectIdByPath, validateAction, getRootInfo, removeChunks +from .dset_lib import getFillValue from .chunk_crawl import ChunkCrawler from . import config from . import hsds_logger as log @@ -658,22 +658,7 @@ async def PUT_DatasetShape(request): # need to re-initialize any values that are now outside the shape # first get the fill value - fill_value = None - type_json = dset_json["type"] - dt = createDataType(type_json) - - if "creationProperties" in dset_json: - fill_value = None - cprops = dset_json["creationProperties"] - if "fillValue" in cprops: - fill_value_prop = cprops["fillValue"] - encoding = cprops.get("fillValue_encoding") - fill_value = getNumpyValue(fill_value_prop, dt=dt, encoding=encoding) - if fill_value: - arr = np.empty((1,), dtype=dt, order="C") - arr[...] 
= fill_value - else: - arr = np.zeros([1,], dtype=dt, order="C") + arr = getFillValue(dset_json) layout = getChunkLayout(dset_json) log.debug(f"got layout: {layout}") diff --git a/tests/integ/value_test.py b/tests/integ/value_test.py index fd9e2e29..b3b15d31 100755 --- a/tests/integ/value_test.py +++ b/tests/integ/value_test.py @@ -3398,12 +3398,10 @@ def testShapeReinitialization3D(self): rspJson = json.loads(rsp.text) self.assertTrue("value" in rspJson) ret_value = rspJson["value"] - print(ret_value) for i in range(10): pt = points[i] n = ret_value[i] - print(f"{pt}: {n}") if pt[0] >= small_shape[0] or pt[1] >= small_shape[1] or pt[2] >= small_shape[2]: self.assertEqual(n, 0) else: From 8b2bdd1d5c3c935f599544402954ac99dcaaedd4 Mon Sep 17 00:00:00 2001 From: jreadey Date: Sat, 21 Oct 2023 14:58:17 -0700 Subject: [PATCH 14/17] refactored reduce shape to dset_lib --- hsds/chunk_crawl.py | 30 +++++++- hsds/dset_lib.py | 150 ++++++++++++++++++++++++++++++++++++++++ hsds/dset_sn.py | 89 +++--------------------- hsds/servicenode_lib.py | 60 +--------------- 4 files changed, 192 insertions(+), 137 deletions(-) diff --git a/hsds/chunk_crawl.py b/hsds/chunk_crawl.py index 8660ad56..3e655fd2 100755 --- a/hsds/chunk_crawl.py +++ b/hsds/chunk_crawl.py @@ -31,9 +31,8 @@ from .util.dsetUtil import getSelectionShape, getChunkLayout from .util.chunkUtil import getChunkCoverage, getDataCoverage from .util.chunkUtil import getChunkIdForPartition, getQueryDtype -from .util.arrayUtil import jsonToArray, getShapeDims +from .util.arrayUtil import jsonToArray, getShapeDims, getNumpyValue from .util.arrayUtil import getNumElements, arrayToBytes, bytesToArray -from .dset_lib import getFillValue from . import config from . import hsds_logger as log @@ -45,6 +44,33 @@ ) +def getFillValue(dset_json): + """ Return the fill value of the given dataset as a numpy array. + If no fill value is defined, return an zero array of given type """ + + # NOTE - this is copy of the function in dset_lib, but needed to put + # here to avoid circular dependency + + fill_value = None + type_json = dset_json["type"] + dt = createDataType(type_json) + + if "creationProperties" in dset_json: + cprops = dset_json["creationProperties"] + if "fillValue" in cprops: + fill_value_prop = cprops["fillValue"] + log.debug(f"got fo;;+value_prop: {fill_value_prop}") + encoding = cprops.get("fillValue_encoding") + fill_value = getNumpyValue(fill_value_prop, dt=dt, encoding=encoding) + if fill_value: + arr = np.empty((1,), dtype=dt, order="C") + arr[...] = fill_value + else: + arr = np.zeros([1,], dtype=dt, order="C") + + return arr + + async def write_chunk_hyperslab( app, chunk_id, dset_json, slices, arr, bucket=None, client=None ): diff --git a/hsds/dset_lib.py b/hsds/dset_lib.py index d23be6c0..36a8e34f 100755 --- a/hsds/dset_lib.py +++ b/hsds/dset_lib.py @@ -10,11 +10,20 @@ # request a copy from help@hdfgroup.org. # ############################################################################## +import asyncio import numpy as np +from aiohttp.client_exceptions import ClientError + from .util.hdf5dtype import createDataType from .util.arrayUtil import getNumpyValue +from .util.dsetUtil import getChunkLayout +from .util.chunkUtil import getChunkIds, getChunkSelection +from .util.idUtil import getDataNodeUrl +from .util.httpUtil import http_delete + from . 
import hsds_logger as log +from .chunk_crawl import ChunkCrawler def getFillValue(dset_json): @@ -39,3 +48,144 @@ def getFillValue(dset_json): arr = np.zeros([1,], dtype=dt, order="C") return arr + + +async def removeChunks(app, chunk_ids, bucket=None): + """ Remove chunks with the given ids """ + + # this should only be called from a SN + + log.info(f"removeChunks, {len(chunk_ids)} chunks") + log.debug(f"removeChunks for: {chunk_ids}") + + dn_urls = app["dn_urls"] + if not dn_urls: + log.error("removeChunks request, but no dn_urls") + raise ValueError() + + log.debug(f"doFlush - dn_urls: {dn_urls}") + params = {} + if bucket: + params["bucket"] = bucket + failed_count = 0 + + try: + tasks = [] + for chunk_id in chunk_ids: + dn_url = getDataNodeUrl(app, chunk_id) + req = dn_url + "/chunks/" + chunk_id + task = asyncio.ensure_future(http_delete(app, req, params=params)) + tasks.append(task) + done, pending = await asyncio.wait(tasks) + if pending: + # should be empty since we didn't use return_when parameter + log.error("removeChunks - got pending tasks") + raise ValueError() + for task in done: + if task.exception(): + exception_type = type(task.exception()) + msg = f"removeChunks - task had exception: {exception_type}" + log.warn(msg) + failed_count += 1 + + except ClientError as ce: + msg = f"removeChunks - ClientError: {ce}" + log.error(msg) + raise ValueError() + except asyncio.CancelledError as cle: + log.error(f"removeChunks - CancelledError: {cle}") + raise ValueError() + + if failed_count: + msg = f"removeChunks, failed count: {failed_count}" + log.error(msg) + else: + log.info(f"removeChunks complete for {len(chunk_ids)} chunks - no errors") + + +async def reduceShape(app, dset_json, shape_update, bucket=None): + """ Given an existing dataset and a new shape, + Reinitialize and edge chunks and delete any chunks + that fall entirely out of the new shape region """ + + dset_id = dset_json["id"] + log.info(f"reduceShape for {dset_id} to {shape_update}") + + # get the current shape dims + shape_orig = dset_json["shape"] + if shape_orig["class"] != "H5S_SIMPLE": + raise ValueError("reduceShape can only be called on simple datasets") + dims = shape_orig["dims"] + rank = len(dims) + + # get the fill value + arr = getFillValue(dset_json) + + # and the chunk layout + layout = getChunkLayout(dset_json) + log.debug(f"got layout: {layout}") + delete_ids = set() # chunk ids that will need to be deleted + for n in range(rank): + if dims[n] <= shape_update[n]: + log.debug(f"skip dimension {n}") + continue + log.debug(f"reinitialize for dimension: {n}") + slices = [] + update_ids = set() # chunk ids that will need to be updated + + for m in range(rank): + if m == n: + s = slice(shape_update[m], dims[m], 1) + else: + # just select the entire extent + s = slice(0, dims[m], 1) + slices.append(s) + log.debug(f"shape_reinitialize - got slices: {slices} for dimension: {n}") + chunk_ids = getChunkIds(dset_id, slices, layout) + log.debug(f"got chunkIds: {chunk_ids}") + + # separate ids into those that overlap the new shape + # vs. those that follow entirely outside the new shape. 
+ # The former will need to be partiaally reset, the latter + # will need to be deleted + for chunk_id in chunk_ids: + if getChunkSelection(chunk_id, slices, layout) is None: + delete_ids.add(chunk_id) + else: + update_ids.add(chunk_id) + + if update_ids: + update_ids = list(update_ids) + update_ids.sort() + log.debug(f"these ids will need to be updated: {update_ids}") + + crawler = ChunkCrawler( + app, + update_ids, + dset_json=dset_json, + bucket=bucket, + slices=slices, + arr=arr, + action="write_chunk_hyperslab", + ) + await crawler.crawl() + + crawler_status = crawler.get_status() + + if crawler_status not in (200, 201): + msg = f"crawler failed for shape reinitialize with status: {crawler_status}" + log.warn(msg) + else: + msg = f"crawler success for reinitialization with slices: {slices}" + log.info(msg) + else: + log.info(f"no chunks need updating for shape reduction over dim {m}") + + log.debug("chunk reinitialization complete") + if delete_ids: + delete_ids = list(delete_ids) + delete_ids.sort() + log.debug(f"these ids will need to be deleted: {delete_ids}") + await removeChunks(app, delete_ids, bucket=bucket) + else: + log.info("no chunks need deletion for shape reduction") diff --git a/hsds/dset_sn.py b/hsds/dset_sn.py index 49bfd514..e3f537d3 100755 --- a/hsds/dset_sn.py +++ b/hsds/dset_sn.py @@ -16,15 +16,16 @@ import math from json import JSONDecodeError -from aiohttp.web_exceptions import HTTPBadRequest, HTTPNotFound, HTTPConflict +from aiohttp.web_exceptions import HTTPBadRequest, HTTPNotFound +from aiohttp.web_exceptions import HTTPConflict, HTTPInternalServerError from .util.httpUtil import http_post, http_put, http_delete, getHref, respJsonAssemble from .util.httpUtil import jsonResponse from .util.idUtil import isValidUuid, getDataNodeUrl, createObjId, isSchema2Id -from .util.dsetUtil import getPreviewQuery, getFilterItem, getChunkLayout +from .util.dsetUtil import getPreviewQuery, getFilterItem from .util.arrayUtil import getNumElements, getShapeDims, getNumpyValue from .util.chunkUtil import getChunkSize, guessChunk, expandChunk, shrinkChunk -from .util.chunkUtil import getContiguousLayout, getChunkIds, getChunkSelection +from .util.chunkUtil import getContiguousLayout from .util.authUtil import getUserPasswordFromRequest, aclCheck from .util.authUtil import validateUserPassword from .util.domainUtil import getDomainFromRequest, getPathForDomain, isValidDomain @@ -33,9 +34,8 @@ from .util.hdf5dtype import validateTypeItem, createDataType, getBaseTypeJson from .util.hdf5dtype import getItemSize from .servicenode_lib import getDomainJson, getObjectJson, getDsetJson, getPathForObjectId -from .servicenode_lib import getObjectIdByPath, validateAction, getRootInfo, removeChunks -from .dset_lib import getFillValue -from .chunk_crawl import ChunkCrawler +from .servicenode_lib import getObjectIdByPath, validateAction, getRootInfo +from .dset_lib import reduceShape from . import config from . 
import hsds_logger as log @@ -655,78 +655,11 @@ async def PUT_DatasetShape(request): if shape_reduction: log.info(f"Shape extent reduced for dataset (rank: {rank})") - - # need to re-initialize any values that are now outside the shape - # first get the fill value - arr = getFillValue(dset_json) - - layout = getChunkLayout(dset_json) - log.debug(f"got layout: {layout}") - delete_ids = set() # chunk ids that will need to be deleted - for n in range(rank): - if dims[n] <= shape_update[i]: - log.debug(f"skip dimension {n}") - continue - log.debug(f"reinitialize for dimension: {n}") - slices = [] - update_ids = set() # chunk ids that will need to be updated - - for m in range(rank): - if m == n: - s = slice(shape_update[m], dims[m], 1) - else: - # just select the entire extent - s = slice(0, dims[m], 1) - slices.append(s) - log.debug(f"shape_reinitialize - got slices: {slices} for dimension: {n}") - chunk_ids = getChunkIds(dset_id, slices, layout) - log.debug(f"got chunkIds: {chunk_ids}") - - # separate ids into those that overlap the new shape - # vs. those that follow entirely outside the new shape. - # The former will need to be partiaally reset, the latter - # will need to be deleted - for chunk_id in chunk_ids: - if getChunkSelection(chunk_id, slices, layout) is None: - delete_ids.add(chunk_id) - else: - update_ids.add(chunk_id) - - if update_ids: - update_ids = list(update_ids) - update_ids.sort() - log.debug(f"these ids will need to be updated: {update_ids}") - - crawler = ChunkCrawler( - app, - update_ids, - dset_json=dset_json, - bucket=bucket, - slices=slices, - arr=arr, - action="write_chunk_hyperslab", - ) - await crawler.crawl() - - crawler_status = crawler.get_status() - - if crawler_status not in (200, 201): - msg = f"crawler failed for shape reinitialize with status: {crawler_status}" - log.warn(msg) - else: - msg = f"crawler success for reinitialization with slices: {slices}" - log.info(msg) - else: - log.info(f"no chunks need updating for shape reduction over dim {m}") - - log.debug("chunk reinitialization complete") - if delete_ids: - delete_ids = list(delete_ids) - delete_ids.sort() - log.debug(f"these ids will need to be deleted: {delete_ids}") - await removeChunks(app, delete_ids, bucket=bucket) - else: - log.info("no chunks need deletion for shape reduction") + try: + await reduceShape(app, dset_json, shape_update, bucket=bucket) + except ValueError as ve: + msg = f"reduceShape for {dset_id} to {shape_update} resulted in exception: {ve}" + raise HTTPInternalServerError() # send request onto DN req = getDataNodeUrl(app, dset_id) + "/datasets/" + dset_id + "/shape" diff --git a/hsds/servicenode_lib.py b/hsds/servicenode_lib.py index 5c0047ab..abb312ae 100644 --- a/hsds/servicenode_lib.py +++ b/hsds/servicenode_lib.py @@ -13,19 +13,16 @@ # utility methods for service node handlers # -import asyncio - from aiohttp.web_exceptions import HTTPBadRequest, HTTPForbidden from aiohttp.web_exceptions import HTTPNotFound, HTTPInternalServerError -from aiohttp.client_exceptions import ClientOSError, ClientError +from aiohttp.client_exceptions import ClientOSError from .util.authUtil import getAclKeys -from .util.idUtil import getDataNodeUrl, getCollectionForId, isSchema2Id -from .util.idUtil import getS3Key +from .util.idUtil import getDataNodeUrl, getCollectionForId, isSchema2Id, getS3Key from .util.linkUtil import h5Join from .util.storUtil import getStorJSONObj, isStorObj from .util.authUtil import aclCheck -from .util.httpUtil import http_get, http_delete +from .util.httpUtil 
import http_get from .util.domainUtil import getBucketForDomain, verifyRoot from . import hsds_logger as log @@ -487,54 +484,3 @@ async def getRootInfo(app, root_id, bucket=None): return None return info_json - - -async def removeChunks(app, chunk_ids, bucket=None): - """ Remove chunks with the given ids """ - - log.info(f"removeChunks, {len(chunk_ids)} chunks") - log.debug(f"removeChunks for: {chunk_ids}") - - dn_urls = app["dn_urls"] - if not dn_urls: - log.error("removeChunks request, but no dn_urls") - raise HTTPInternalServerError() - - log.debug(f"doFlush - dn_urls: {dn_urls}") - params = {} - if bucket: - params["bucket"] = bucket - failed_count = 0 - - try: - tasks = [] - for chunk_id in chunk_ids: - dn_url = getDataNodeUrl(app, chunk_id) - req = dn_url + "/chunks/" + chunk_id - task = asyncio.ensure_future(http_delete(app, req, params=params)) - tasks.append(task) - done, pending = await asyncio.wait(tasks) - if pending: - # should be empty since we didn't use return_when parameter - log.error("removeChunks - got pending tasks") - raise HTTPInternalServerError() - for task in done: - if task.exception(): - exception_type = type(task.exception()) - msg = f"removeChunks - task had exception: {exception_type}" - log.warn(msg) - failed_count += 1 - - except ClientError as ce: - msg = f"removeChunks - ClientError: {ce}" - log.error(msg) - raise HTTPInternalServerError() - except asyncio.CancelledError as cle: - log.error(f"removeChunks - CancelledError: {cle}") - raise HTTPInternalServerError() - - if failed_count: - msg = f"removeChunks, failed count: {failed_count}" - log.error(msg) - else: - log.info(f"removeChunks complete for {len(chunk_ids)} chunks - no errors") From 4dc6a06f8aea58460a898fe4d77eb567a1499608 Mon Sep 17 00:00:00 2001 From: jreadey Date: Mon, 23 Oct 2023 11:04:12 -0700 Subject: [PATCH 15/17] determine allocated chunks for shape reduction --- hsds/attr_sn.py | 4 +- hsds/chunk_crawl.py | 26 ++++--- hsds/ctype_sn.py | 5 +- hsds/domain_sn.py | 57 +------------- hsds/dset_lib.py | 153 +++++++++++++++++++++++++++++--------- hsds/dset_sn.py | 11 ++- hsds/group_sn.py | 5 +- hsds/servicenode_lib.py | 59 ++++++++++++++- hsds/util/chunkUtil.py | 6 +- tests/integ/value_test.py | 2 +- 10 files changed, 214 insertions(+), 114 deletions(-) diff --git a/hsds/attr_sn.py b/hsds/attr_sn.py index 4b9db23a..d3dd648a 100755 --- a/hsds/attr_sn.py +++ b/hsds/attr_sn.py @@ -377,7 +377,7 @@ async def PUT_Attribute(request): msg = "Bad Request: input data doesn't match selection" log.warn(msg) raise HTTPBadRequest(reason=msg) - log.info(f"Got: {arr.size} array elements") + log.debug(f"Got: {arr.size} array elements") else: value = None @@ -717,7 +717,7 @@ async def PUT_AttributeValue(request): msg = "Bad Request: input data doesn't match selection" log.warn(msg) raise HTTPBadRequest(reason=msg) - log.info(f"Got: {arr.size} array elements") + log.debug(f"Got: {arr.size} array elements") # ready to add attribute now attr_json = {} diff --git a/hsds/chunk_crawl.py b/hsds/chunk_crawl.py index 3e655fd2..02930993 100755 --- a/hsds/chunk_crawl.py +++ b/hsds/chunk_crawl.py @@ -16,6 +16,7 @@ import asyncio import time +import traceback import random from asyncio import CancelledError import numpy as np @@ -100,7 +101,11 @@ async def write_chunk_hyperslab( params = {} layout = getChunkLayout(dset_json) + log.debug(f"getChunkCoverage({chunk_id}, {slices}, {layout})") chunk_sel = getChunkCoverage(chunk_id, slices, layout) + if chunk_sel is None: + log.warn(f"getChunkCoverage returned None for: 
{chunk_id}, {slices}, {layout}") + return log.debug(f"chunk_sel: {chunk_sel}") data_sel = getDataCoverage(chunk_id, slices, layout) log.debug(f"data_sel: {data_sel}") @@ -857,26 +862,23 @@ async def do_work(self, chunk_id, client=None): ) except HTTPServiceUnavailable as sue: status_code = 503 - log.warn( - f"HTTPServiceUnavailable for {self._action}({chunk_id}): {sue}" - ) + msg = f"HTTPServiceUnavailable for {self._action}({chunk_id}): {sue}" + log.warn(msg) except Exception as e: status_code = 500 - log.error( - f"Unexpected exception {type(e)} for {self._action}({chunk_id}): {e} " - ) + msg = f"Unexpected exception {type(e)} for {self._action}({chunk_id}): {e} " + log.error(msg) + tb = traceback.format_exc() + print("traceback:", tb) retry += 1 if status_code == 200: break if retry == max_retries: - log.error( - f"ChunkCrawler action: {self._action} failed after: {retry} retries" - ) + msg = f"ChunkCrawler action: {self._action} failed after: {retry} retries" + log.error(msg) else: sleep_time = retry_exp * 2 ** retry + random.uniform(0, 0.1) - log.warn( - f"ChunkCrawler.doWork - retry: {retry}, sleeping for {sleep_time:.2f}" - ) + msg = f"ChunkCrawler.doWork - retry: {retry}, sleeping for {sleep_time:.2f}" await asyncio.sleep(sleep_time) # save status_code diff --git a/hsds/ctype_sn.py b/hsds/ctype_sn.py index 01be4bac..f3d0236e 100755 --- a/hsds/ctype_sn.py +++ b/hsds/ctype_sn.py @@ -72,7 +72,10 @@ async def GET_Datatype(request): msg = "h5paths must be absolute" log.warn(msg) raise HTTPBadRequest(reason=msg) - log.info(f"GET_Datatype, h5path: {h5path}") + msg = f"GET_Datatype, h5path: {h5path}" + if group_id: + msg += f" group_id: {group_id}" + log.info(msg) username, pswd = getUserPasswordFromRequest(request) if username is None and app["allow_noauth"]: diff --git a/hsds/domain_sn.py b/hsds/domain_sn.py index e3724cad..8ca13f72 100755 --- a/hsds/domain_sn.py +++ b/hsds/domain_sn.py @@ -13,7 +13,6 @@ # service node of hsds cluster # -from asyncio import CancelledError import asyncio import json import os.path as op @@ -23,7 +22,6 @@ from aiohttp.web_exceptions import HTTPGone, HTTPInternalServerError from aiohttp.web_exceptions import HTTPConflict, HTTPServiceUnavailable from aiohttp import ClientResponseError -from aiohttp.client_exceptions import ClientError from aiohttp.web import json_response from requests.sessions import merge_setting @@ -41,7 +39,7 @@ from .util.boolparser import BooleanParser from .util.globparser import globmatch from .servicenode_lib import getDomainJson, getObjectJson, getObjectIdByPath -from .servicenode_lib import getRootInfo, checkBucketAccess +from .servicenode_lib import getRootInfo, checkBucketAccess, doFlush from .basenode import getVersion from . import hsds_logger as log from . 
import config @@ -893,59 +891,6 @@ async def GET_Domain(request): return resp -async def doFlush(app, root_id, bucket=None): - """return wnen all DN nodes have wrote any pending changes to S3""" - log.info(f"doFlush {root_id}") - params = {"flush": 1} - if bucket: - params["bucket"] = bucket - dn_urls = app["dn_urls"] - dn_ids = [] - log.debug(f"doFlush - dn_urls: {dn_urls}") - failed_count = 0 - - try: - tasks = [] - for dn_url in dn_urls: - req = dn_url + "/groups/" + root_id - task = asyncio.ensure_future(http_put(app, req, params=params)) - tasks.append(task) - done, pending = await asyncio.wait(tasks) - if pending: - # should be empty since we didn't use return_when parameter - log.error("doFlush - got pending tasks") - raise HTTPInternalServerError() - for task in done: - if task.exception(): - exception_type = type(task.exception()) - msg = f"doFlush - task had exception: {exception_type}" - log.warn(msg) - failed_count += 1 - else: - json_rsp = task.result() - log.debug(f"PUT /groups rsp: {json_rsp}") - if json_rsp and "id" in json_rsp: - dn_ids.append(json_rsp["id"]) - else: - log.error("expected dn_id in flush response from DN") - except ClientError as ce: - msg = f"doFlush - ClientError for http_put('/groups/{root_id}'): {ce}" - log.error(msg) - raise HTTPInternalServerError() - except CancelledError as cle: - log.error(f"doFlush - CancelledError '/groups/{root_id}'): {cle}") - raise HTTPInternalServerError() - msg = f"doFlush for {root_id} complete, failed: {failed_count} " - msg += f"out of {len(dn_urls)}" - log.info(msg) - if failed_count > 0: - log.error(f"doFlush fail count: {failed_count} returning 500") - raise HTTPInternalServerError() - else: - log.info("doFlush no fails, returning dn ids") - return dn_ids - - async def getScanTime(app, root_id, bucket=None): """ Return timestamp for the last scan of the given root id """ root_scan = 0 diff --git a/hsds/dset_lib.py b/hsds/dset_lib.py index 36a8e34f..7666a94a 100755 --- a/hsds/dset_lib.py +++ b/hsds/dset_lib.py @@ -18,11 +18,13 @@ from .util.hdf5dtype import createDataType from .util.arrayUtil import getNumpyValue from .util.dsetUtil import getChunkLayout -from .util.chunkUtil import getChunkIds, getChunkSelection -from .util.idUtil import getDataNodeUrl +from .util.chunkUtil import getChunkCoordinate +from .util.idUtil import getDataNodeUrl, isSchema2Id, getS3Key, getObjId +from .util.storUtil import getStorKeys from .util.httpUtil import http_delete from . import hsds_logger as log +from . import config from .chunk_crawl import ChunkCrawler @@ -103,6 +105,58 @@ async def removeChunks(app, chunk_ids, bucket=None): log.info(f"removeChunks complete for {len(chunk_ids)} chunks - no errors") +async def getAllocatedChunkIds(app, dset_id, bucket=None): + """ Return the set of allocated chunk ids for the give dataset. 
+ If slices is given, just return chunks that interesect with the slice region """ + + log.info(f"getAllocatedChunkIds for {dset_id}") + + if not isSchema2Id(dset_id): + msg = f"no tabulation for schema v1 id: {dset_id} returning " + msg += "null results" + log.warn(msg) + return {} + + if not bucket: + bucket = config.get("bucket_name") + if not bucket: + raise ValueError(f"no bucket defined for getAllocatedChunkIds for {dset_id}") + + root_key = getS3Key(dset_id) + log.debug(f"got root_key: {root_key}") + + if not root_key.endswith("/.dataset.json"): + raise ValueError("unexpected root key") + + root_prefix = root_key[: -(len(".dataset.json"))] + + log.debug(f"scanRoot - using prefix: {root_prefix}") + + kwargs = { + "prefix": root_prefix, + "include_stats": False, + "bucket": bucket, + } + s3keys = await getStorKeys(app, **kwargs) + + # getStoreKeys will pick up the dataset.json as well, + # so go through and discard + chunk_ids = [] + for s3key in s3keys: + if s3key.endswith("json"): + # ignore metadata items + continue + try: + chunk_id = getObjId(s3key) + except ValueError: + log.warn(f"ignoring s3key: {s3key}") + continue + chunk_ids.append(chunk_id) + + log.debug(f"getAllocattedChunkIds - got {len(chunk_ids)} ids") + return chunk_ids + + async def reduceShape(app, dset_json, shape_update, bucket=None): """ Given an existing dataset and a new shape, Reinitialize and edge chunks and delete any chunks @@ -122,42 +176,68 @@ async def reduceShape(app, dset_json, shape_update, bucket=None): arr = getFillValue(dset_json) # and the chunk layout - layout = getChunkLayout(dset_json) + layout = tuple(getChunkLayout(dset_json)) log.debug(f"got layout: {layout}") - delete_ids = set() # chunk ids that will need to be deleted - for n in range(rank): - if dims[n] <= shape_update[n]: - log.debug(f"skip dimension {n}") + + # get all chunk ids for chunks that have been allocated + chunk_ids = await getAllocatedChunkIds(app, dset_id, bucket=bucket) + chunk_ids.sort() + + log.debug(f"got chunkIds: {chunk_ids}") + + # separate ids into three groups: + # A: those are entirely inside the new shape region - no action needed + # B: those that overlap the new shape - will need the edge portion reinitialized + # C: those that are entirely outside the new shape - will need to be deleted + + delete_ids = [] # chunk ids for chunk that that will need to be deleted + update_ids = [] # chunk ids for chunks that will need to be reinitialized + + for chunk_id in chunk_ids: + log.debug(f"chunk_id: {chunk_id}") + chunk_coord = getChunkCoordinate(chunk_id, layout) + log.debug(f"chunk_coord: {chunk_coord}") + skip = True + for i in range(rank): + if chunk_coord[i] + layout[i] > shape_update[i]: + skip = False + break + if skip: + log.debug(f"chunk_id {chunk_id} no action needed") continue - log.debug(f"reinitialize for dimension: {n}") - slices = [] - update_ids = set() # chunk ids that will need to be updated - for m in range(rank): - if m == n: - s = slice(shape_update[m], dims[m], 1) - else: - # just select the entire extent - s = slice(0, dims[m], 1) - slices.append(s) - log.debug(f"shape_reinitialize - got slices: {slices} for dimension: {n}") - chunk_ids = getChunkIds(dset_id, slices, layout) - log.debug(f"got chunkIds: {chunk_ids}") - - # separate ids into those that overlap the new shape - # vs. those that follow entirely outside the new shape. 
- # The former will need to be partiaally reset, the latter - # will need to be deleted - for chunk_id in chunk_ids: - if getChunkSelection(chunk_id, slices, layout) is None: - delete_ids.add(chunk_id) - else: - update_ids.add(chunk_id) + reinit = False + for n in range(rank): + if chunk_coord[n] < shape_update[n]: + reinit = True + break + if reinit: + log.debug("chunk reinit") + update_ids.append(chunk_id) + else: + log.debug("chunk delete") + delete_ids.append(chunk_id) - if update_ids: - update_ids = list(update_ids) - update_ids.sort() - log.debug(f"these ids will need to be updated: {update_ids}") + msg = f"reduceShape - from {len(chunk_ids)} chunks, {len(update_ids)} will need to be " + msg += f"updated and {len(delete_ids)} will need to deleted" + log.info(msg) + + if update_ids: + log.debug(f"these ids will need to be updated: {update_ids}") + + # For multidimensional datasets, may need multiple hyperslab writes + # go through each dimension and calculate region to update + + for n in range(rank): + slices = [] + + for m in range(rank): + if m == n: + s = slice(shape_update[m], dims[m], 1) + else: + # just select the entire extent + s = slice(0, dims[m], 1) + slices.append(s) crawler = ChunkCrawler( app, @@ -178,10 +258,11 @@ async def reduceShape(app, dset_json, shape_update, bucket=None): else: msg = f"crawler success for reinitialization with slices: {slices}" log.info(msg) - else: - log.info(f"no chunks need updating for shape reduction over dim {m}") + else: + log.info("no chunks need updating for shape reduction") log.debug("chunk reinitialization complete") + if delete_ids: delete_ids = list(delete_ids) delete_ids.sort() diff --git a/hsds/dset_sn.py b/hsds/dset_sn.py index e3f537d3..00bfcd7f 100755 --- a/hsds/dset_sn.py +++ b/hsds/dset_sn.py @@ -34,7 +34,7 @@ from .util.hdf5dtype import validateTypeItem, createDataType, getBaseTypeJson from .util.hdf5dtype import getItemSize from .servicenode_lib import getDomainJson, getObjectJson, getDsetJson, getPathForObjectId -from .servicenode_lib import getObjectIdByPath, validateAction, getRootInfo +from .servicenode_lib import getObjectIdByPath, validateAction, getRootInfo, doFlush from .dset_lib import reduceShape from . import config from . 
import hsds_logger as log @@ -306,7 +306,10 @@ async def GET_Dataset(request): msg = "h5paths must be absolute" log.warn(msg) raise HTTPBadRequest(reason=msg) - log.info(f"GET_Dataset, h5path: {h5path}") + msg = f"GET_Dataset, h5path: {h5path}" + if group_id: + msg += f" group_id: {group_id}" + log.info(msg) username, pswd = getUserPasswordFromRequest(request) if username is None and app["allow_noauth"]: @@ -655,10 +658,14 @@ async def PUT_DatasetShape(request): if shape_reduction: log.info(f"Shape extent reduced for dataset (rank: {rank})") + root_id = dset_json["root"] + # need to do a flush to know which chunks to update or delete + await doFlush(app, root_id, bucket=bucket) try: await reduceShape(app, dset_json, shape_update, bucket=bucket) except ValueError as ve: msg = f"reduceShape for {dset_id} to {shape_update} resulted in exception: {ve}" + log.error(msg) raise HTTPInternalServerError() # send request onto DN diff --git a/hsds/group_sn.py b/hsds/group_sn.py index dcdf04d9..98d58ed1 100755 --- a/hsds/group_sn.py +++ b/hsds/group_sn.py @@ -62,7 +62,10 @@ async def GET_Group(request): msg = "h5paths must be absolute if no parent id is provided" log.warn(msg) raise HTTPBadRequest(reason=msg) - log.info(f"GET_Group, h5path: {h5path}") + msg = f"GET_Group, h5path: {h5path}" + if group_id: + msg += f" group_id: {group_id}" + log.info(msg) if "include_links" in params and params["include_links"]: include_links = True if "include_attrs" in params and params["include_attrs"]: diff --git a/hsds/servicenode_lib.py b/hsds/servicenode_lib.py index abb312ae..2a7b45c1 100644 --- a/hsds/servicenode_lib.py +++ b/hsds/servicenode_lib.py @@ -13,16 +13,18 @@ # utility methods for service node handlers # +import asyncio + from aiohttp.web_exceptions import HTTPBadRequest, HTTPForbidden from aiohttp.web_exceptions import HTTPNotFound, HTTPInternalServerError -from aiohttp.client_exceptions import ClientOSError +from aiohttp.client_exceptions import ClientOSError, ClientError from .util.authUtil import getAclKeys from .util.idUtil import getDataNodeUrl, getCollectionForId, isSchema2Id, getS3Key from .util.linkUtil import h5Join from .util.storUtil import getStorJSONObj, isStorObj from .util.authUtil import aclCheck -from .util.httpUtil import http_get +from .util.httpUtil import http_get, http_put from .util.domainUtil import getBucketForDomain, verifyRoot from . 
import hsds_logger as log @@ -484,3 +486,56 @@ async def getRootInfo(app, root_id, bucket=None): return None return info_json + + +async def doFlush(app, root_id, bucket=None): + """return wnen all DN nodes have wrote any pending changes to S3""" + log.info(f"doFlush {root_id}") + params = {"flush": 1} + if bucket: + params["bucket"] = bucket + dn_urls = app["dn_urls"] + dn_ids = [] + log.debug(f"doFlush - dn_urls: {dn_urls}") + failed_count = 0 + + try: + tasks = [] + for dn_url in dn_urls: + req = dn_url + "/groups/" + root_id + task = asyncio.ensure_future(http_put(app, req, params=params)) + tasks.append(task) + done, pending = await asyncio.wait(tasks) + if pending: + # should be empty since we didn't use return_when parameter + log.error("doFlush - got pending tasks") + raise HTTPInternalServerError() + for task in done: + if task.exception(): + exception_type = type(task.exception()) + msg = f"doFlush - task had exception: {exception_type}" + log.warn(msg) + failed_count += 1 + else: + json_rsp = task.result() + log.debug(f"PUT /groups rsp: {json_rsp}") + if json_rsp and "id" in json_rsp: + dn_ids.append(json_rsp["id"]) + else: + log.error("expected dn_id in flush response from DN") + except ClientError as ce: + msg = f"doFlush - ClientError for http_put('/groups/{root_id}'): {ce}" + log.error(msg) + raise HTTPInternalServerError() + except asyncio.CancelledError as cle: + log.error(f"doFlush - CancelledError '/groups/{root_id}'): {cle}") + raise HTTPInternalServerError() + msg = f"doFlush for {root_id} complete, failed: {failed_count} " + msg += f"out of {len(dn_urls)}" + log.info(msg) + if failed_count > 0: + log.error(f"doFlush fail count: {failed_count} returning 500") + raise HTTPInternalServerError() + else: + log.info("doFlush no fails, returning dn ids") + return dn_ids diff --git a/hsds/util/chunkUtil.py b/hsds/util/chunkUtil.py index 87bdb40c..88059a57 100644 --- a/hsds/util/chunkUtil.py +++ b/hsds/util/chunkUtil.py @@ -557,7 +557,6 @@ def getChunkCoordinate(chunk_id, layout): coord = getChunkIndex(chunk_id) for i in range(len(layout)): coord[i] *= layout[i] - return coord @@ -611,7 +610,12 @@ def getChunkCoverage(chunk_id, slices, layout): """ chunk_index = getChunkIndex(chunk_id) chunk_sel = getChunkSelection(chunk_id, slices, layout) + if not chunk_sel: + log.warn(f"slices: {slices} does intersect chunk: {chunk_id}") + return None rank = len(layout) + if len(slices) != rank: + raise ValueError(f"invalid slices value for dataset of rank: {rank}") sel = [] for dim in range(rank): s = chunk_sel[dim] diff --git a/tests/integ/value_test.py b/tests/integ/value_test.py index b3b15d31..70f62efe 100755 --- a/tests/integ/value_test.py +++ b/tests/integ/value_test.py @@ -3331,7 +3331,7 @@ def testShapeReinitialization3D(self): # define two different shapes that we'll switch between # min extent in each dimension is 20 for the point setup to work - large_shape = (220, 120, 130) + large_shape = (2200, 120, 130) small_shape = (55, 60, 70) # setup some points on the diagonal From 681c75e19a4e42098feefcc5311d96ba362fa15b Mon Sep 17 00:00:00 2001 From: jreadey Date: Thu, 26 Oct 2023 08:39:23 -0700 Subject: [PATCH 16/17] updates for review comments --- hsds/async_lib.py | 4 +- hsds/attr_sn.py | 3 +- hsds/chunk_crawl.py | 20 +- hsds/chunk_dn.py | 4 +- hsds/chunk_sn.py | 568 ++------------------------------ hsds/datanode_lib.py | 4 +- hsds/dset_lib.py | 588 ++++++++++++++++++++++++++++++++-- hsds/dset_sn.py | 4 +- hsds/util/arrayUtil.py | 38 --- hsds/util/dsetUtil.py | 66 ++++ 
tests/integ/broadcast_test.py | 3 +- 11 files changed, 673 insertions(+), 629 deletions(-) diff --git a/hsds/async_lib.py b/hsds/async_lib.py index 9ebfa099..2308cedb 100755 --- a/hsds/async_lib.py +++ b/hsds/async_lib.py @@ -20,9 +20,9 @@ from .util.idUtil import getObjId, isValidChunkId, getCollectionForId from .util.chunkUtil import getDatasetId, getNumChunks, ChunkIterator from .util.hdf5dtype import getItemSize, createDataType -from .util.arrayUtil import getShapeDims, getNumElements, bytesToArray +from .util.arrayUtil import getNumElements, bytesToArray from .util.dsetUtil import getHyperslabSelection, getFilterOps, getChunkDims -from .util.dsetUtil import getDatasetLayoutClass, getDatasetLayout +from .util.dsetUtil import getDatasetLayoutClass, getDatasetLayout, getShapeDims from .util.storUtil import getStorKeys, putStorJSONObj, getStorJSONObj from .util.storUtil import deleteStorObj, getStorBytes, isStorObj diff --git a/hsds/attr_sn.py b/hsds/attr_sn.py index d3dd648a..da78f4cb 100755 --- a/hsds/attr_sn.py +++ b/hsds/attr_sn.py @@ -27,8 +27,9 @@ from .util.attrUtil import validateAttributeName, getRequestCollectionName from .util.hdf5dtype import validateTypeItem, getBaseTypeJson from .util.hdf5dtype import createDataType, getItemSize -from .util.arrayUtil import jsonToArray, getShapeDims, getNumElements +from .util.arrayUtil import jsonToArray, getNumElements from .util.arrayUtil import bytesArrayToList +from .util.dsetUtil import getShapeDims from .servicenode_lib import getDomainJson, getObjectJson, validateAction from . import hsds_logger as log from . import config diff --git a/hsds/chunk_crawl.py b/hsds/chunk_crawl.py index 02930993..eccbc4e2 100755 --- a/hsds/chunk_crawl.py +++ b/hsds/chunk_crawl.py @@ -28,11 +28,11 @@ from .util.httpUtil import isUnixDomainUrl from .util.idUtil import getDataNodeUrl, getNodeCount from .util.hdf5dtype import createDataType -from .util.dsetUtil import getSliceQueryParam +from .util.dsetUtil import getSliceQueryParam, getShapeDims from .util.dsetUtil import getSelectionShape, getChunkLayout from .util.chunkUtil import getChunkCoverage, getDataCoverage from .util.chunkUtil import getChunkIdForPartition, getQueryDtype -from .util.arrayUtil import jsonToArray, getShapeDims, getNumpyValue +from .util.arrayUtil import jsonToArray, getNumpyValue from .util.arrayUtil import getNumElements, arrayToBytes, bytesToArray from . 
import config @@ -50,7 +50,7 @@ def getFillValue(dset_json): If no fill value is defined, return an zero array of given type """ # NOTE - this is copy of the function in dset_lib, but needed to put - # here to avoid circular dependency + # here to avoid a circular dependency fill_value = None type_json = dset_json["type"] @@ -60,7 +60,7 @@ def getFillValue(dset_json): cprops = dset_json["creationProperties"] if "fillValue" in cprops: fill_value_prop = cprops["fillValue"] - log.debug(f"got fo;;+value_prop: {fill_value_prop}") + log.debug(f"got fill_value_prop: {fill_value_prop}") encoding = cprops.get("fillValue_encoding") fill_value = getNumpyValue(fill_value_prop, dt=dt, encoding=encoding) if fill_value: @@ -179,18 +179,6 @@ async def read_chunk_hyperslab( return msg = f"read_chunk_hyperslab, chunk_id: {chunk_id}," - """ - msg += " slices: [" - for s in slices: - if isinstance(s, slice): - msg += f"{s}," - else: - if len(s) > 5: - # avoid large output lines - msg += f"[{s[0]}, {s[1]}, ..., {s[-2]}, {s[-1]}]," - else: - msg += f"{s}," - """ msg += f" bucket: {bucket}" if query is not None: msg += f" query: {query} limit: {limit}" diff --git a/hsds/chunk_dn.py b/hsds/chunk_dn.py index 329f772a..4f3da7f7 100644 --- a/hsds/chunk_dn.py +++ b/hsds/chunk_dn.py @@ -20,11 +20,11 @@ from aiohttp.web import json_response, StreamResponse from .util.httpUtil import request_read, getContentType -from .util.arrayUtil import bytesToArray, arrayToBytes, getShapeDims, getBroadcastShape +from .util.arrayUtil import bytesToArray, arrayToBytes, getBroadcastShape from .util.idUtil import getS3Key, validateInPartition, isValidUuid from .util.storUtil import isStorObj, deleteStorObj from .util.hdf5dtype import createDataType -from .util.dsetUtil import getSelectionList, getChunkLayout +from .util.dsetUtil import getSelectionList, getChunkLayout, getShapeDims from .util.dsetUtil import getSelectionShape, getChunkInitializer from .util.chunkUtil import getChunkIndex, getDatasetId, chunkQuery from .util.chunkUtil import chunkWriteSelection, chunkReadSelection diff --git a/hsds/chunk_sn.py b/hsds/chunk_sn.py index b0bf8dff..3a6eb4dd 100755 --- a/hsds/chunk_sn.py +++ b/hsds/chunk_sn.py @@ -30,29 +30,21 @@ from .util.domainUtil import getDomainFromRequest, isValidDomain from .util.domainUtil import getBucketForDomain from .util.hdf5dtype import getItemSize, createDataType -from .util.dsetUtil import getSelectionList, isNullSpace, getDatasetLayout, getDatasetLayoutClass +from .util.dsetUtil import isNullSpace, get_slices, getShapeDims from .util.dsetUtil import isExtensible, getSelectionPagination from .util.dsetUtil import getSelectionShape, getDsetMaxDims, getChunkLayout from .util.chunkUtil import getNumChunks, getChunkIds, getChunkId -from .util.chunkUtil import getChunkIndex, getChunkSuffix -from .util.chunkUtil import getChunkCoverage, getDataCoverage -from .util.chunkUtil import getQueryDtype, get_chunktable_dims -from .util.arrayUtil import bytesArrayToList, jsonToArray, getShapeDims +from .util.arrayUtil import bytesArrayToList, jsonToArray from .util.arrayUtil import getNumElements, arrayToBytes, bytesToArray from .util.arrayUtil import squeezeArray, getBroadcastShape from .util.authUtil import getUserPasswordFromRequest, validateUserPassword from .util.boolparser import BooleanParser from .servicenode_lib import getDsetJson, validateAction -from .dset_lib import getFillValue +from .dset_lib import getSelectionData from .chunk_crawl import ChunkCrawler from . import config from . 
import hsds_logger as log -CHUNK_REF_LAYOUTS = ( - "H5D_CONTIGUOUS_REF", - "H5D_CHUNKED_REF", - "H5D_CHUNKED_REF_INDIRECT", -) VARIABLE_AVG_ITEM_SIZE = 512 # guess at avg variable type length @@ -73,32 +65,6 @@ def get_hrefs(request, dset_json): return hrefs -def get_slices(app, select, dset_json): - """Get desired slices from selection query param string or json value. - If select is none or empty, slices for entire datashape will be - returned. - Refretch dims if the dataset is extensible - """ - - dset_id = dset_json["id"] - datashape = dset_json["shape"] - if datashape["class"] == "H5S_NULL": - msg = "Null space datasets can not be used as target for GET value" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - - dims = getShapeDims(datashape) # throws 400 for HS_NULL dsets - - try: - slices = getSelectionList(select, dims) - except ValueError: - msg = f"Invalid selection: {select} on dims: {dims} " - msg += f"for dataset: {dset_id}" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - return slices - - def use_http_streaming(request, rank): """ return boolean indicating whether http streaming should be used """ if rank == 0: @@ -110,283 +76,6 @@ def use_http_streaming(request, rank): return True -async def getChunkLocations(app, dset_id, dset_json, chunkinfo_map, chunk_ids, bucket=None): - """ - Get info for chunk locations (for reference layouts) - """ - layout_class = getDatasetLayoutClass(dset_json) - - if layout_class not in CHUNK_REF_LAYOUTS: - msg = f"skip getChunkLocations for layout class: {layout_class}" - log.debug(msg) - return - - chunk_dims = None - if "layout" in dset_json: - dset_layout = dset_json["layout"] - log.debug(f"dset_json layout: {dset_layout}") - if "dims" in dset_layout: - chunk_dims = dset_layout["dims"] - if chunk_dims is None: - msg = "no chunk dimensions set in dataset layout" - log.error(msg) - raise HTTPInternalServerError() - - datashape = dset_json["shape"] - datatype = dset_json["type"] - if isNullSpace(dset_json): - log.error("H5S_NULL shape class used with reference chunk layout") - raise HTTPInternalServerError() - dims = getShapeDims(datashape) - rank = len(dims) - # chunk_ids = list(chunkinfo_map.keys()) - # chunk_ids.sort() - num_chunks = len(chunk_ids) - msg = f"getChunkLocations for dset: {dset_id} bucket: {bucket} " - msg += f"rank: {rank} num chunk_ids: {num_chunks}" - log.info(msg) - log.debug(f"getChunkLocations layout: {layout_class}") - - def getChunkItem(chunkid): - if chunk_id in chunkinfo_map: - chunk_item = chunkinfo_map[chunk_id] - else: - chunk_item = {} - chunkinfo_map[chunk_id] = chunk_item - return chunk_item - - if layout_class == "H5D_CONTIGUOUS_REF": - layout = getDatasetLayout(dset_json) - log.debug(f"cpl layout: {layout}") - s3path = layout["file_uri"] - s3size = layout["size"] - if s3size == 0: - msg = "getChunkLocations - H5D_CONTIGUOUS_REF layout size 0, " - msg += "no allocation" - log.info(msg) - return - item_size = getItemSize(datatype) - chunk_size = item_size - for dim in chunk_dims: - chunk_size *= dim - log.debug(f"using chunk_size: {chunk_size} for H5D_CONTIGUOUS_REF") - - for chunk_id in chunk_ids: - log.debug(f"getChunkLocations - getting data for chunk: {chunk_id}") - chunk_item = getChunkItem(chunk_id) - chunk_index = getChunkIndex(chunk_id) - if len(chunk_index) != rank: - log.error("Unexpected chunk_index") - raise HTTPInternalServerError() - extent = item_size - if "offset" not in layout: - msg = "getChunkLocations - expected to find offset in chunk " - msg += "layout for H5D_CONTIGUOUS_REF" - 
log.error(msg) - continue - s3offset = layout["offset"] - if not isinstance(s3offset, int): - msg = "getChunkLocations - expected offset to be an int but " - msg += f"got: {s3offset}" - log.error(msg) - continue - log.debug(f"getChunkLocations s3offset: {s3offset}") - for i in range(rank): - dim = rank - i - 1 - index = chunk_index[dim] - s3offset += index * chunk_dims[dim] * extent - extent *= dims[dim] - msg = f"setting chunk_info_map to s3offset: {s3offset} " - msg == f"s3size: {s3size} for chunk_id: {chunk_id}" - log.debug(msg) - if s3offset > layout["offset"] + layout["size"]: - msg = f"range get of s3offset: {s3offset} s3size: {s3size} " - msg += "extends beyond end of contiguous dataset for " - msg += f"chunk_id: {chunk_id}" - log.warn(msg) - chunk_item["s3path"] = s3path - chunk_item["s3offset"] = s3offset - chunk_item["s3size"] = chunk_size - elif layout_class == "H5D_CHUNKED_REF": - layout = getDatasetLayout(dset_json) - log.debug(f"cpl layout: {layout}") - s3path = layout["file_uri"] - chunks = layout["chunks"] - - for chunk_id in chunk_ids: - chunk_item = getChunkItem(chunk_id) - s3offset = 0 - s3size = 0 - chunk_key = getChunkSuffix(chunk_id) - if chunk_key in chunks: - item = chunks[chunk_key] - s3offset = item[0] - s3size = item[1] - chunk_item["s3path"] = s3path - chunk_item["s3offset"] = s3offset - chunk_item["s3size"] = s3size - - elif layout_class == "H5D_CHUNKED_REF_INDIRECT": - layout = getDatasetLayout(dset_json) - log.debug(f"cpl layout: {layout}") - if "chunk_table" not in layout: - log.error("Expected to find chunk_table in dataset layout") - raise HTTPInternalServerError() - chunktable_id = layout["chunk_table"] - # get state for dataset from DN. - chunktable_json = await getDsetJson(app, chunktable_id, bucket=bucket) - # log.debug(f"chunktable_json: {chunktable_json}") - chunktable_dims = getShapeDims(chunktable_json["shape"]) - chunktable_layout = chunktable_json["layout"] - if chunktable_layout.get("class") == "H5D_CHUNKED_REF_INDIRECT": - # We don't support recursive chunked_ref_indirect classes - msg = "chunktable layout: H5D_CHUNKED_REF_INDIRECT is invalid" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - - if len(chunktable_dims) != rank: - msg = "Rank of chunktable should be same as the dataset" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - - # convert the list of chunk_ids into a set of points to query in - # the chunk table - log.debug(f"datashape: {dims}") - log.debug(f"chunk_dims: {chunk_dims}") - log.debug(f"chunktable_dims: {chunktable_dims}") - default_chunktable_dims = get_chunktable_dims(dims, chunk_dims) - log.debug(f"default_chunktable_dims: {default_chunktable_dims}") - table_factors = [] - if "hyper_dims" in layout: - hyper_dims = layout["hyper_dims"] - else: - # assume 1 to 1 matching - hyper_dims = chunk_dims - ref_num_chunks = num_chunks - for dim in range(rank): - if chunk_dims[dim] % hyper_dims[dim] != 0: - msg = f"expected hyper_dims [{hyper_dims[dim]}] to be a factor" - msg += f" of {chunk_dims[dim]}" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - factor = chunk_dims[dim] // hyper_dims[dim] - table_factors.append(factor) - ref_num_chunks *= factor - log.debug(f"table_factors: {table_factors}") - log.debug(f"ref_num_chunks: {ref_num_chunks}") - log.debug(f"hyper_dims: {hyper_dims}") - - if rank == 1: - arr_points = np.zeros((ref_num_chunks,), dtype=np.dtype("u8")) - table_factor = table_factors[0] - for i in range(num_chunks): - chunk_id = chunk_ids[i] - log.debug(f"chunk_id: {chunk_id}") - chunk_index = 
getChunkIndex(chunk_id) - chunk_index = chunk_index[0] - log.debug(f"chunk_index: {chunk_index}") - for j in range(table_factor): - index = chunk_index * table_factor + j - arr_index = i * table_factor + j - arr_points[arr_index] = index - else: - if ref_num_chunks != num_chunks: - msg = "hyperchunks not supported for multidimensional datasets" - log.warn(msg) - raise HTTPBadRequest(msg=msg) - arr_points = np.zeros((num_chunks, rank), dtype=np.dtype("u8")) - for i in range(num_chunks): - chunk_id = chunk_ids[i] - log.debug(f"chunk_id for chunktable: {chunk_id}") - indx = getChunkIndex(chunk_id) - log.debug(f"get chunk indx: {indx}") - arr_points[i] = indx - - msg = f"got chunktable points: {arr_points}, calling getSelectionData" - log.debug(msg) - # this call won't lead to a circular loop of calls since we've checked - # that the chunktable layout is not H5D_CHUNKED_REF_INDIRECT - kwargs = {"points": arr_points, "bucket": bucket} - point_data = await getSelectionData(app, chunktable_id, chunktable_json, **kwargs) - - log.debug(f"got chunktable data: {point_data}") - if "file_uri" in layout: - s3_layout_path = layout["file_uri"] - log.debug(f"got s3_layout_path: {s3_layout_path}") - else: - s3_layout_path = None - - for i in range(num_chunks): - chunk_id = chunk_ids[i] - chunk_item = getChunkItem(chunk_id) - item = point_data[i] - if s3_layout_path is None: - if len(item) < 3: - msg = "expected chunk table to have three fields" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - e = item[2] - if e: - s3path = e.decode("utf-8") - log.debug(f"got s3path: {s3path}") - else: - s3path = s3_layout_path - chunk_item["s3path"] = s3path - - if ref_num_chunks == num_chunks: - item = point_data[i] - s3offset = int(item[0]) - s3size = int(item[1]) - chunk_item["s3offset"] = s3offset - chunk_item["s3size"] = s3size - else: - factor = ref_num_chunks // num_chunks - s3offsets = [] - s3sizes = [] - for j in range(factor): - item = point_data[i * factor + j] - s3offset = int(item[0]) - s3offsets.append(s3offset) - s3size = int(item[1]) - s3sizes.append(s3size) - chunk_item["s3offset"] = s3offsets - chunk_item["s3size"] = s3sizes - chunk_item["hyper_dims"] = hyper_dims - - else: - log.error(f"Unexpected chunk layout: {layout['class']}") - raise HTTPInternalServerError() - - log.debug(f"returning chunkinfo_map: {chunkinfo_map}") - return chunkinfo_map - - -def get_chunk_selections(chunk_map, chunk_ids, slices, dset_json): - """Update chunk_map with chunk and data selections for the - given set of slices - """ - log.debug(f"get_chunk_selections - chunk_ids: {chunk_ids}") - if not slices: - log.debug("no slices set, returning") - return # nothing to do - log.debug(f"slices: {slices}") - layout = getChunkLayout(dset_json) - for chunk_id in chunk_ids: - if chunk_id in chunk_map: - item = chunk_map[chunk_id] - else: - item = {} - chunk_map[chunk_id] = item - - chunk_sel = getChunkCoverage(chunk_id, slices, layout) - log.debug( - f"get_chunk_selections - chunk_id: {chunk_id}, chunk_sel: {chunk_sel}" - ) - item["chunk_sel"] = chunk_sel - data_sel = getDataCoverage(chunk_id, slices, layout) - log.debug(f"get_chunk_selections - data_sel: {data_sel}") - item["data_sel"] = data_sel - async def PUT_Value(request): """ @@ -547,7 +236,12 @@ async def PUT_Value(request): raise HTTPBadRequest(reason=msg) select = params.get("select") - slices = get_slices(app, select, dset_json) + try: + slices = get_slices(select, dset_json) + except ValueError as ve: + log.warn(f"Invalid selection: {ve}") + raise 
HTTPBadRequest(reason="Invalid selection") + if "Limit" in params: try: limit = int(params["Limit"]) @@ -709,11 +403,15 @@ async def PUT_Value(request): np_shape = tuple(np_shape) elif points is None: - if body and "start" in body and "stop" in body: - slices = get_slices(app, body, dset_json) - else: - select = params.get("select") - slices = get_slices(app, select, dset_json) + try: + if body and "start" in body and "stop" in body: + slices = get_slices(body, dset_json) + else: + select = params.get("select") + slices = get_slices(select, dset_json) + except ValueError as ve: + log.warn(f"Invalid Selection: {ve}") + raise HTTPBadRequest(reason="Invalid Selection") # The selection parameters will determine expected put value shape log.debug(f"PUT Value selection: {slices}") @@ -1110,7 +808,12 @@ async def GET_Value(request): select = params.get("select") if select: log.debug(f"select query param: {select}") - slices = get_slices(app, select, dset_json) + try: + slices = get_slices(select, dset_json) + except ValueError as ve: + log.warn(f"Invalid selection: {ve}") + raise HTTPBadRequest(reason="Invalid selection") + log.debug(f"GET Value selection: {slices}") limit = 0 @@ -1347,223 +1050,6 @@ async def GET_Value(request): return resp -async def doReadSelection( - app, - chunk_ids, - dset_json, - slices=None, - points=None, - query=None, - query_update=None, - chunk_map=None, - bucket=None, - limit=0, -): - """read selection utility function""" - log.info(f"doReadSelection - number of chunk_ids: {len(chunk_ids)}") - log.debug(f"doReadSelection - chunk_ids: {chunk_ids}") - - type_json = dset_json["type"] - item_size = getItemSize(type_json) - log.debug(f"item size: {item_size}") - dset_dtype = createDataType(type_json) # np datatype - if query is None: - query_dtype = None - else: - log.debug(f"query: {query} limit: {limit}") - query_dtype = getQueryDtype(dset_dtype) - - # create array to hold response data - arr = None - - if points is not None: - # point selection - np_shape = [ - len(points), - ] - elif query is not None: - # return shape will be determined by number of matches - np_shape = None - elif slices is not None: - log.debug(f"get np_shape for slices: {slices}") - np_shape = getSelectionShape(slices) - else: - log.error("doReadSelection - expected points or slices to be set") - raise HTTPInternalServerError() - log.debug(f"selection shape: {np_shape}") - - if np_shape is not None: - # check that the array size is reasonable - request_size = math.prod(np_shape) - if item_size == "H5T_VARIABLE": - request_size *= 512 # random guess of avg item_size - else: - request_size *= item_size - log.debug(f"request_size: {request_size}") - max_request_size = int(config.get("max_request_size")) - if request_size >= max_request_size: - msg = f"Attempting to fetch {request_size} bytes (greater than " - msg += f"{max_request_size} limit" - log.error(msg) - raise HTTPBadRequest(reason=msg) - - # initialize to fill_value if specified - fill_value = getFillValue(dset_json) - - if fill_value: - arr = np.empty(np_shape, dtype=dset_dtype, order="C") - arr[...] 
= fill_value - else: - arr = np.zeros(np_shape, dtype=dset_dtype, order="C") - - crawler = ChunkCrawler( - app, - chunk_ids, - dset_json=dset_json, - chunk_map=chunk_map, - bucket=bucket, - slices=slices, - query=query, - query_update=query_update, - limit=limit, - arr=arr, - action="read_chunk_hyperslab", - ) - await crawler.crawl() - - crawler_status = crawler.get_status() - - log.info(f"doReadSelection complete - status: {crawler_status}") - if crawler_status == 400: - log.info(f"doReadSelection raising BadRequest error: {crawler_status}") - raise HTTPBadRequest() - if crawler_status not in (200, 201): - log.info( - f"doReadSelection raising HTTPInternalServerError for status: {crawler_status}" - ) - raise HTTPInternalServerError() - - if query is not None: - # combine chunk responses and return - if limit > 0 and crawler._hits > limit: - nrows = limit - else: - nrows = crawler._hits - arr = np.empty((nrows,), dtype=query_dtype) - start = 0 - for chunkid in chunk_ids: - if chunkid not in chunk_map: - continue - chunk_item = chunk_map[chunkid] - if "query_rsp" not in chunk_item: - continue - query_rsp = chunk_item["query_rsp"] - if len(query_rsp) == 0: - continue - stop = start + len(query_rsp) - if stop > nrows: - rsp_stop = len(query_rsp) - (stop - nrows) - arr[start:] = query_rsp[0:rsp_stop] - else: - arr[start:stop] = query_rsp[:] - start = stop - if start >= nrows: - log.debug(f"got {nrows} rows for query, quitting") - break - return arr - - -async def getSelectionData( - app, - dset_id, - dset_json, - slices=None, - points=None, - query=None, - query_update=None, - bucket=None, - limit=0, - method="GET", -): - """Read selected slices and return numpy array""" - log.debug("getSelectionData") - if slices is None and points is None: - log.error("getSelectionData - expected either slices or points to be set") - raise HTTPInternalServerError() - - layout = getChunkLayout(dset_json) - - chunkinfo = {} - - if slices is not None: - num_chunks = getNumChunks(slices, layout) - log.debug(f"num_chunks: {num_chunks}") - - max_chunks = int(config.get("max_chunks_per_request", default=1000)) - if num_chunks > max_chunks: - msg = f"num_chunks over {max_chunks} limit, but will attempt to fetch with crawler" - log.warn(msg) - - chunk_ids = getChunkIds(dset_id, slices, layout) - else: - # points - already checked it is not None - num_points = len(points) - chunk_ids = [] - for pt_indx in range(num_points): - point = points[pt_indx] - chunk_id = getChunkId(dset_id, point, layout) - if chunk_id in chunkinfo: - chunk_entry = chunkinfo[chunk_id] - else: - chunk_entry = {} - chunkinfo[chunk_id] = chunk_entry - chunk_ids.append(chunk_id) - if "points" in chunk_entry: - point_list = chunk_entry["points"] - else: - point_list = [] - chunk_entry["points"] = point_list - if "indices" in chunk_entry: - point_index = chunk_entry["indices"] - else: - point_index = [] - chunk_entry["indices"] = point_index - - point_list.append(point) - point_index.append(pt_indx) - - # Get information about where chunks are located - # Will be None except for H5D_CHUNKED_REF_INDIRECT type - await getChunkLocations(app, dset_id, dset_json, chunkinfo, chunk_ids, bucket=bucket) - - if slices is None: - slices = get_slices(app, None, dset_json) - - if points is None: - # get chunk selections for hyperslab select - get_chunk_selections(chunkinfo, chunk_ids, slices, dset_json) - - log.debug(f"chunkinfo_map: {chunkinfo}") - - if method == "OPTIONS": - # skip doing any big data load for options request - return None - - arr = await 
doReadSelection( - app, - chunk_ids, - dset_json, - slices=slices, - points=points, - query=query, - query_update=query_update, - limit=limit, - chunk_map=chunkinfo, - bucket=bucket, - ) - - return arr - async def POST_Value(request): """ @@ -1659,7 +1145,11 @@ async def POST_Value(request): elif "select" in body: select = body["select"] log.debug(f"select: {select}") - slices = get_slices(app, select, dset_json) + try: + slices = get_slices(select, dset_json) + except ValueError as ve: + log.warn(f"Invalid selection: {ve}") + raise HTTPBadRequest(reason="Invalid selection") log.debug(f"got slices: {slices}") else: msg = "Expected points or select key in request body" diff --git a/hsds/datanode_lib.py b/hsds/datanode_lib.py index 36d29ae3..d365ce2a 100644 --- a/hsds/datanode_lib.py +++ b/hsds/datanode_lib.py @@ -29,10 +29,10 @@ from .util.domainUtil import isValidDomain, getBucketForDomain from .util.attrUtil import getRequestCollectionName from .util.httpUtil import http_post -from .util.dsetUtil import getChunkLayout, getFilterOps +from .util.dsetUtil import getChunkLayout, getFilterOps, getShapeDims from .util.dsetUtil import getChunkInitializer, getSliceQueryParam from .util.chunkUtil import getDatasetId, getChunkSelection, getChunkIndex -from .util.arrayUtil import arrayToBytes, bytesToArray, getShapeDims, jsonToArray +from .util.arrayUtil import arrayToBytes, bytesToArray, jsonToArray from .util.hdf5dtype import createDataType, getItemSize from .util.rangegetUtil import ChunkLocation, chunkMunge diff --git a/hsds/dset_lib.py b/hsds/dset_lib.py index 7666a94a..36e03989 100755 --- a/hsds/dset_lib.py +++ b/hsds/dset_lib.py @@ -11,23 +11,36 @@ ############################################################################## import asyncio +import math import numpy as np from aiohttp.client_exceptions import ClientError - -from .util.hdf5dtype import createDataType +from aiohttp.web_exceptions import HTTPBadRequest, HTTPInternalServerError +from .util.hdf5dtype import createDataType, getItemSize from .util.arrayUtil import getNumpyValue -from .util.dsetUtil import getChunkLayout -from .util.chunkUtil import getChunkCoordinate +from .util.dsetUtil import isNullSpace, getDatasetLayout, getDatasetLayoutClass +from .util.dsetUtil import getChunkLayout, getSelectionShape, getShapeDims, get_slices +from .util.chunkUtil import getChunkCoordinate, getChunkIndex, getChunkSuffix +from .util.chunkUtil import getNumChunks, getChunkIds, getChunkId +from .util.chunkUtil import getChunkCoverage, getDataCoverage +from .util.chunkUtil import getQueryDtype, get_chunktable_dims + from .util.idUtil import getDataNodeUrl, isSchema2Id, getS3Key, getObjId from .util.storUtil import getStorKeys from .util.httpUtil import http_delete -from . import hsds_logger as log -from . import config +from .servicenode_lib import getDsetJson from .chunk_crawl import ChunkCrawler +from . import config +from . import hsds_logger as log +CHUNK_REF_LAYOUTS = ( + "H5D_CONTIGUOUS_REF", + "H5D_CHUNKED_REF", + "H5D_CHUNKED_REF_INDIRECT", +) + def getFillValue(dset_json): """ Return the fill value of the given dataset as a numpy array. 
If no fill value is defined, return an zero array of given type """ @@ -40,7 +53,7 @@ def getFillValue(dset_json): cprops = dset_json["creationProperties"] if "fillValue" in cprops: fill_value_prop = cprops["fillValue"] - log.debug(f"got fo;;+value_prop: {fill_value_prop}") + log.debug(f"got fill_value_prop: {fill_value_prop}") encoding = cprops.get("fillValue_encoding") fill_value = getNumpyValue(fill_value_prop, dt=dt, encoding=encoding) if fill_value: @@ -52,6 +65,530 @@ def getFillValue(dset_json): return arr +async def getChunkLocations(app, dset_id, dset_json, chunkinfo_map, chunk_ids, bucket=None): + """ + Get info for chunk locations (for reference layouts) + """ + layout_class = getDatasetLayoutClass(dset_json) + + if layout_class not in CHUNK_REF_LAYOUTS: + msg = f"skip getChunkLocations for layout class: {layout_class}" + log.debug(msg) + return + + chunk_dims = None + if "layout" in dset_json: + dset_layout = dset_json["layout"] + log.debug(f"dset_json layout: {dset_layout}") + if "dims" in dset_layout: + chunk_dims = dset_layout["dims"] + if chunk_dims is None: + msg = "no chunk dimensions set in dataset layout" + log.error(msg) + raise HTTPInternalServerError() + + datashape = dset_json["shape"] + datatype = dset_json["type"] + if isNullSpace(dset_json): + log.error("H5S_NULL shape class used with reference chunk layout") + raise HTTPInternalServerError() + dims = getShapeDims(datashape) + rank = len(dims) + # chunk_ids = list(chunkinfo_map.keys()) + # chunk_ids.sort() + num_chunks = len(chunk_ids) + msg = f"getChunkLocations for dset: {dset_id} bucket: {bucket} " + msg += f"rank: {rank} num chunk_ids: {num_chunks}" + log.info(msg) + log.debug(f"getChunkLocations layout: {layout_class}") + + def getChunkItem(chunkid): + if chunk_id in chunkinfo_map: + chunk_item = chunkinfo_map[chunk_id] + else: + chunk_item = {} + chunkinfo_map[chunk_id] = chunk_item + return chunk_item + + if layout_class == "H5D_CONTIGUOUS_REF": + layout = getDatasetLayout(dset_json) + log.debug(f"cpl layout: {layout}") + s3path = layout["file_uri"] + s3size = layout["size"] + if s3size == 0: + msg = "getChunkLocations - H5D_CONTIGUOUS_REF layout size 0, " + msg += "no allocation" + log.info(msg) + return + item_size = getItemSize(datatype) + chunk_size = item_size + for dim in chunk_dims: + chunk_size *= dim + log.debug(f"using chunk_size: {chunk_size} for H5D_CONTIGUOUS_REF") + + for chunk_id in chunk_ids: + log.debug(f"getChunkLocations - getting data for chunk: {chunk_id}") + chunk_item = getChunkItem(chunk_id) + chunk_index = getChunkIndex(chunk_id) + if len(chunk_index) != rank: + log.error("Unexpected chunk_index") + raise HTTPInternalServerError() + extent = item_size + if "offset" not in layout: + msg = "getChunkLocations - expected to find offset in chunk " + msg += "layout for H5D_CONTIGUOUS_REF" + log.error(msg) + continue + s3offset = layout["offset"] + if not isinstance(s3offset, int): + msg = "getChunkLocations - expected offset to be an int but " + msg += f"got: {s3offset}" + log.error(msg) + continue + log.debug(f"getChunkLocations s3offset: {s3offset}") + for i in range(rank): + dim = rank - i - 1 + index = chunk_index[dim] + s3offset += index * chunk_dims[dim] * extent + extent *= dims[dim] + msg = f"setting chunk_info_map to s3offset: {s3offset} " + msg == f"s3size: {s3size} for chunk_id: {chunk_id}" + log.debug(msg) + if s3offset > layout["offset"] + layout["size"]: + msg = f"range get of s3offset: {s3offset} s3size: {s3size} " + msg += "extends beyond end of contiguous dataset for " + 
msg += f"chunk_id: {chunk_id}" + log.warn(msg) + chunk_item["s3path"] = s3path + chunk_item["s3offset"] = s3offset + chunk_item["s3size"] = chunk_size + elif layout_class == "H5D_CHUNKED_REF": + layout = getDatasetLayout(dset_json) + log.debug(f"cpl layout: {layout}") + s3path = layout["file_uri"] + chunks = layout["chunks"] + + for chunk_id in chunk_ids: + chunk_item = getChunkItem(chunk_id) + s3offset = 0 + s3size = 0 + chunk_key = getChunkSuffix(chunk_id) + if chunk_key in chunks: + item = chunks[chunk_key] + s3offset = item[0] + s3size = item[1] + chunk_item["s3path"] = s3path + chunk_item["s3offset"] = s3offset + chunk_item["s3size"] = s3size + + elif layout_class == "H5D_CHUNKED_REF_INDIRECT": + layout = getDatasetLayout(dset_json) + log.debug(f"cpl layout: {layout}") + if "chunk_table" not in layout: + log.error("Expected to find chunk_table in dataset layout") + raise HTTPInternalServerError() + chunktable_id = layout["chunk_table"] + # get state for dataset from DN. + chunktable_json = await getDsetJson(app, chunktable_id, bucket=bucket) + # log.debug(f"chunktable_json: {chunktable_json}") + chunktable_dims = getShapeDims(chunktable_json["shape"]) + chunktable_layout = chunktable_json["layout"] + if chunktable_layout.get("class") == "H5D_CHUNKED_REF_INDIRECT": + # We don't support recursive chunked_ref_indirect classes + msg = "chunktable layout: H5D_CHUNKED_REF_INDIRECT is invalid" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + + if len(chunktable_dims) != rank: + msg = "Rank of chunktable should be same as the dataset" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + + # convert the list of chunk_ids into a set of points to query in + # the chunk table + log.debug(f"datashape: {dims}") + log.debug(f"chunk_dims: {chunk_dims}") + log.debug(f"chunktable_dims: {chunktable_dims}") + default_chunktable_dims = get_chunktable_dims(dims, chunk_dims) + log.debug(f"default_chunktable_dims: {default_chunktable_dims}") + table_factors = [] + if "hyper_dims" in layout: + hyper_dims = layout["hyper_dims"] + else: + # assume 1 to 1 matching + hyper_dims = chunk_dims + ref_num_chunks = num_chunks + for dim in range(rank): + if chunk_dims[dim] % hyper_dims[dim] != 0: + msg = f"expected hyper_dims [{hyper_dims[dim]}] to be a factor" + msg += f" of {chunk_dims[dim]}" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + factor = chunk_dims[dim] // hyper_dims[dim] + table_factors.append(factor) + ref_num_chunks *= factor + log.debug(f"table_factors: {table_factors}") + log.debug(f"ref_num_chunks: {ref_num_chunks}") + log.debug(f"hyper_dims: {hyper_dims}") + + if rank == 1: + arr_points = np.zeros((ref_num_chunks,), dtype=np.dtype("u8")) + table_factor = table_factors[0] + for i in range(num_chunks): + chunk_id = chunk_ids[i] + log.debug(f"chunk_id: {chunk_id}") + chunk_index = getChunkIndex(chunk_id) + chunk_index = chunk_index[0] + log.debug(f"chunk_index: {chunk_index}") + for j in range(table_factor): + index = chunk_index * table_factor + j + arr_index = i * table_factor + j + arr_points[arr_index] = index + else: + if ref_num_chunks != num_chunks: + msg = "hyperchunks not supported for multidimensional datasets" + log.warn(msg) + raise HTTPBadRequest(msg=msg) + arr_points = np.zeros((num_chunks, rank), dtype=np.dtype("u8")) + for i in range(num_chunks): + chunk_id = chunk_ids[i] + log.debug(f"chunk_id for chunktable: {chunk_id}") + indx = getChunkIndex(chunk_id) + log.debug(f"get chunk indx: {indx}") + arr_points[i] = indx + + msg = f"got chunktable points: {arr_points}, calling 
getSelectionData" + log.debug(msg) + # this call won't lead to a circular loop of calls since we've checked + # that the chunktable layout is not H5D_CHUNKED_REF_INDIRECT + kwargs = {"points": arr_points, "bucket": bucket} + point_data = await getSelectionData(app, chunktable_id, chunktable_json, **kwargs) + + log.debug(f"got chunktable data: {point_data}") + if "file_uri" in layout: + s3_layout_path = layout["file_uri"] + log.debug(f"got s3_layout_path: {s3_layout_path}") + else: + s3_layout_path = None + + for i in range(num_chunks): + chunk_id = chunk_ids[i] + chunk_item = getChunkItem(chunk_id) + item = point_data[i] + if s3_layout_path is None: + if len(item) < 3: + msg = "expected chunk table to have three fields" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + e = item[2] + if e: + s3path = e.decode("utf-8") + log.debug(f"got s3path: {s3path}") + else: + s3path = s3_layout_path + chunk_item["s3path"] = s3path + + if ref_num_chunks == num_chunks: + item = point_data[i] + s3offset = int(item[0]) + s3size = int(item[1]) + chunk_item["s3offset"] = s3offset + chunk_item["s3size"] = s3size + else: + factor = ref_num_chunks // num_chunks + s3offsets = [] + s3sizes = [] + for j in range(factor): + item = point_data[i * factor + j] + s3offset = int(item[0]) + s3offsets.append(s3offset) + s3size = int(item[1]) + s3sizes.append(s3size) + chunk_item["s3offset"] = s3offsets + chunk_item["s3size"] = s3sizes + chunk_item["hyper_dims"] = hyper_dims + + else: + log.error(f"Unexpected chunk layout: {layout['class']}") + raise HTTPInternalServerError() + + log.debug(f"returning chunkinfo_map: {chunkinfo_map}") + return chunkinfo_map + + + +def get_chunkmap_selections(chunk_map, chunk_ids, slices, dset_json): + """Update chunk_map with chunk and data selections for the + given set of slices + """ + log.debug(f"get_chunkmap_selections - chunk_ids: {chunk_ids}") + if not slices: + log.debug("no slices set, returning") + return # nothing to do + log.debug(f"slices: {slices}") + layout = getChunkLayout(dset_json) + for chunk_id in chunk_ids: + if chunk_id in chunk_map: + item = chunk_map[chunk_id] + else: + item = {} + chunk_map[chunk_id] = item + + chunk_sel = getChunkCoverage(chunk_id, slices, layout) + log.debug( + f"get_chunk_selections - chunk_id: {chunk_id}, chunk_sel: {chunk_sel}" + ) + item["chunk_sel"] = chunk_sel + data_sel = getDataCoverage(chunk_id, slices, layout) + log.debug(f"get_chunk_selections - data_sel: {data_sel}") + item["data_sel"] = data_sel + + +def get_chunk_selections(chunk_map, chunk_ids, slices, dset_json): + """Update chunk_map with chunk and data selections for the + given set of slices + """ + log.debug(f"get_chunk_selections - chunk_ids: {chunk_ids}") + if not slices: + log.debug("no slices set, returning") + return # nothing to do + log.debug(f"slices: {slices}") + layout = getChunkLayout(dset_json) + for chunk_id in chunk_ids: + if chunk_id in chunk_map: + item = chunk_map[chunk_id] + else: + item = {} + chunk_map[chunk_id] = item + + chunk_sel = getChunkCoverage(chunk_id, slices, layout) + log.debug( + f"get_chunk_selections - chunk_id: {chunk_id}, chunk_sel: {chunk_sel}" + ) + item["chunk_sel"] = chunk_sel + data_sel = getDataCoverage(chunk_id, slices, layout) + log.debug(f"get_chunk_selections - data_sel: {data_sel}") + item["data_sel"] = data_sel + +async def getSelectionData( + app, + dset_id, + dset_json, + slices=None, + points=None, + query=None, + query_update=None, + bucket=None, + limit=0, + method="GET", +): + """Read selected slices and return numpy 
array""" + log.debug("getSelectionData") + if slices is None and points is None: + log.error("getSelectionData - expected either slices or points to be set") + raise HTTPInternalServerError() + + layout = getChunkLayout(dset_json) + + chunkinfo = {} + + if slices is not None: + num_chunks = getNumChunks(slices, layout) + log.debug(f"num_chunks: {num_chunks}") + + max_chunks = int(config.get("max_chunks_per_request", default=1000)) + if num_chunks > max_chunks: + msg = f"num_chunks over {max_chunks} limit, but will attempt to fetch with crawler" + log.warn(msg) + + chunk_ids = getChunkIds(dset_id, slices, layout) + else: + # points - already checked it is not None + num_points = len(points) + chunk_ids = [] + for pt_indx in range(num_points): + point = points[pt_indx] + chunk_id = getChunkId(dset_id, point, layout) + if chunk_id in chunkinfo: + chunk_entry = chunkinfo[chunk_id] + else: + chunk_entry = {} + chunkinfo[chunk_id] = chunk_entry + chunk_ids.append(chunk_id) + if "points" in chunk_entry: + point_list = chunk_entry["points"] + else: + point_list = [] + chunk_entry["points"] = point_list + if "indices" in chunk_entry: + point_index = chunk_entry["indices"] + else: + point_index = [] + chunk_entry["indices"] = point_index + + point_list.append(point) + point_index.append(pt_indx) + + # Get information about where chunks are located + # Will be None except for H5D_CHUNKED_REF_INDIRECT type + await getChunkLocations(app, dset_id, dset_json, chunkinfo, chunk_ids, bucket=bucket) + + if slices is None: + slices = get_slices(None, dset_json) + + if points is None: + # get chunk selections for hyperslab select + get_chunk_selections(chunkinfo, chunk_ids, slices, dset_json) + + log.debug(f"chunkinfo_map: {chunkinfo}") + + if method == "OPTIONS": + # skip doing any big data load for options request + return None + + arr = await doReadSelection( + app, + chunk_ids, + dset_json, + slices=slices, + points=points, + query=query, + query_update=query_update, + limit=limit, + chunk_map=chunkinfo, + bucket=bucket, + ) + + return arr + + +async def doReadSelection( + app, + chunk_ids, + dset_json, + slices=None, + points=None, + query=None, + query_update=None, + chunk_map=None, + bucket=None, + limit=0, +): + """read selection utility function""" + log.info(f"doReadSelection - number of chunk_ids: {len(chunk_ids)}") + log.debug(f"doReadSelection - chunk_ids: {chunk_ids}") + + type_json = dset_json["type"] + item_size = getItemSize(type_json) + log.debug(f"item size: {item_size}") + dset_dtype = createDataType(type_json) # np datatype + if query is None: + query_dtype = None + else: + log.debug(f"query: {query} limit: {limit}") + query_dtype = getQueryDtype(dset_dtype) + + # create array to hold response data + arr = None + + if points is not None: + # point selection + np_shape = [ + len(points), + ] + elif query is not None: + # return shape will be determined by number of matches + np_shape = None + elif slices is not None: + log.debug(f"get np_shape for slices: {slices}") + np_shape = getSelectionShape(slices) + else: + log.error("doReadSelection - expected points or slices to be set") + raise HTTPInternalServerError() + log.debug(f"selection shape: {np_shape}") + + if np_shape is not None: + # check that the array size is reasonable + request_size = math.prod(np_shape) + if item_size == "H5T_VARIABLE": + request_size *= 512 # random guess of avg item_size + else: + request_size *= item_size + log.debug(f"request_size: {request_size}") + max_request_size = int(config.get("max_request_size")) + if 
request_size >= max_request_size: + msg = f"Attempting to fetch {request_size} bytes (greater than " + msg += f"{max_request_size} limit" + log.error(msg) + raise HTTPBadRequest(reason=msg) + + # initialize to fill_value if specified + fill_value = getFillValue(dset_json) + + if fill_value: + arr = np.empty(np_shape, dtype=dset_dtype, order="C") + arr[...] = fill_value + else: + arr = np.zeros(np_shape, dtype=dset_dtype, order="C") + + crawler = ChunkCrawler( + app, + chunk_ids, + dset_json=dset_json, + chunk_map=chunk_map, + bucket=bucket, + slices=slices, + query=query, + query_update=query_update, + limit=limit, + arr=arr, + action="read_chunk_hyperslab", + ) + await crawler.crawl() + + crawler_status = crawler.get_status() + + log.info(f"doReadSelection complete - status: {crawler_status}") + if crawler_status == 400: + log.info(f"doReadSelection raising BadRequest error: {crawler_status}") + raise HTTPBadRequest() + if crawler_status not in (200, 201): + log.info( + f"doReadSelection raising HTTPInternalServerError for status: {crawler_status}" + ) + raise HTTPInternalServerError() + + if query is not None: + # combine chunk responses and return + if limit > 0 and crawler._hits > limit: + nrows = limit + else: + nrows = crawler._hits + arr = np.empty((nrows,), dtype=query_dtype) + start = 0 + for chunkid in chunk_ids: + if chunkid not in chunk_map: + continue + chunk_item = chunk_map[chunkid] + if "query_rsp" not in chunk_item: + continue + query_rsp = chunk_item["query_rsp"] + if len(query_rsp) == 0: + continue + stop = start + len(query_rsp) + if stop > nrows: + rsp_stop = len(query_rsp) - (stop - nrows) + arr[start:] = query_rsp[0:rsp_stop] + else: + arr[start:stop] = query_rsp[:] + start = stop + if start >= nrows: + log.debug(f"got {nrows} rows for query, quitting") + break + return arr + + + async def removeChunks(app, chunk_ids, bucket=None): """ Remove chunks with the given ids """ @@ -65,7 +602,6 @@ async def removeChunks(app, chunk_ids, bucket=None): log.error("removeChunks request, but no dn_urls") raise ValueError() - log.debug(f"doFlush - dn_urls: {dn_urls}") params = {} if bucket: params["bucket"] = bucket @@ -73,6 +609,8 @@ async def removeChunks(app, chunk_ids, bucket=None): try: tasks = [] + # TBD - this may be problematic if the number of chunks to + # be deleted is very large - may need to implement some sort of crawler for chunk_id in chunk_ids: dn_url = getDataNodeUrl(app, chunk_id) req = dn_url + "/chunks/" + chunk_id @@ -159,7 +697,7 @@ async def getAllocatedChunkIds(app, dset_id, bucket=None): async def reduceShape(app, dset_json, shape_update, bucket=None): """ Given an existing dataset and a new shape, - Reinitialize and edge chunks and delete any chunks + Reinitialize any edge chunks and delete any chunks that fall entirely out of the new shape region """ dset_id = dset_json["id"] @@ -197,25 +735,17 @@ async def reduceShape(app, dset_json, shape_update, bucket=None): log.debug(f"chunk_id: {chunk_id}") chunk_coord = getChunkCoordinate(chunk_id, layout) log.debug(f"chunk_coord: {chunk_coord}") - skip = True - for i in range(rank): - if chunk_coord[i] + layout[i] > shape_update[i]: - skip = False - break - if skip: + + + if np.all(np.add(chunk_coord, layout) <= shape_update): log.debug(f"chunk_id {chunk_id} no action needed") continue - - reinit = False - for n in range(rank): - if chunk_coord[n] < shape_update[n]: - reinit = True - break - if reinit: - log.debug("chunk reinit") + + if np.any(chunk_coord < shape_update): + log.debug(f"{chunk_id} reinit") 
update_ids.append(chunk_id) else: - log.debug("chunk delete") + log.debug(f"{chunk_id} delete") delete_ids.append(chunk_id) msg = f"reduceShape - from {len(chunk_ids)} chunks, {len(update_ids)} will need to be " @@ -230,15 +760,23 @@ async def reduceShape(app, dset_json, shape_update, bucket=None): for n in range(rank): slices = [] - + update_element_count = 1 for m in range(rank): if m == n: s = slice(shape_update[m], dims[m], 1) + update_element_count *= dims[m] - shape_update[m] else: # just select the entire extent s = slice(0, dims[m], 1) + update_element_count *= dims[m] slices.append(s) + if update_element_count == 0: + log.debug(f"empty hyperslab update for dim {n}") + continue + + log.debug(f"update {update_element_count} elements for dim {n}") + crawler = ChunkCrawler( app, update_ids, diff --git a/hsds/dset_sn.py b/hsds/dset_sn.py index 00bfcd7f..26919178 100755 --- a/hsds/dset_sn.py +++ b/hsds/dset_sn.py @@ -22,8 +22,8 @@ from .util.httpUtil import http_post, http_put, http_delete, getHref, respJsonAssemble from .util.httpUtil import jsonResponse from .util.idUtil import isValidUuid, getDataNodeUrl, createObjId, isSchema2Id -from .util.dsetUtil import getPreviewQuery, getFilterItem -from .util.arrayUtil import getNumElements, getShapeDims, getNumpyValue +from .util.dsetUtil import getPreviewQuery, getFilterItem, getShapeDims +from .util.arrayUtil import getNumElements, getNumpyValue from .util.chunkUtil import getChunkSize, guessChunk, expandChunk, shrinkChunk from .util.chunkUtil import getContiguousLayout from .util.authUtil import getUserPasswordFromRequest, aclCheck diff --git a/hsds/util/arrayUtil.py b/hsds/util/arrayUtil.py index 31ee3bf1..5bd7e0ab 100644 --- a/hsds/util/arrayUtil.py +++ b/hsds/util/arrayUtil.py @@ -93,44 +93,6 @@ def getNumElements(dims): return num_elements -def getShapeDims(shape): - """ - Get dims from a given shape json. Return [1,] for Scalar datasets, - None for null dataspaces - """ - dims = None - if isinstance(shape, int): - dims = [ - shape, - ] - elif isinstance(shape, list) or isinstance(shape, tuple): - dims = shape # can use as is - elif isinstance(shape, str): - # only valid string value is H5S_NULL - if shape != "H5S_NULL": - raise ValueError("Invalid value for shape") - dims = None - elif isinstance(shape, dict): - if "class" not in shape: - raise ValueError("'class' key not found in shape") - if shape["class"] == "H5S_NULL": - dims = None - elif shape["class"] == "H5S_SCALAR": - dims = [ - 1, - ] - elif shape["class"] == "H5S_SIMPLE": - if "dims" not in shape: - raise ValueError("'dims' key expected for shape") - dims = shape["dims"] - else: - raise ValueError("Unknown shape class: {}".format(shape["class"])) - else: - raise ValueError("Unexpected shape class: {}".format(type(shape))) - - return dims - - def jsonToArray(data_shape, data_dtype, data_json): """ Return numpy array from the given json array. diff --git a/hsds/util/dsetUtil.py b/hsds/util/dsetUtil.py index 79bfa5e7..3e8cd475 100644 --- a/hsds/util/dsetUtil.py +++ b/hsds/util/dsetUtil.py @@ -309,6 +309,45 @@ def getSelectionShape(selection): return shape +def getShapeDims(shape): + """ + Get dims from a given shape json. 
Return [1,] for Scalar datasets, + None for null dataspaces + """ + dims = None + if isinstance(shape, int): + dims = [ + shape, + ] + elif isinstance(shape, list) or isinstance(shape, tuple): + dims = shape # can use as is + elif isinstance(shape, str): + # only valid string value is H5S_NULL + if shape != "H5S_NULL": + raise ValueError("Invalid value for shape") + dims = None + elif isinstance(shape, dict): + if "class" not in shape: + raise ValueError("'class' key not found in shape") + if shape["class"] == "H5S_NULL": + dims = None + elif shape["class"] == "H5S_SCALAR": + dims = [ + 1, + ] + elif shape["class"] == "H5S_SIMPLE": + if "dims" not in shape: + raise ValueError("'dims' key expected for shape") + dims = shape["dims"] + else: + raise ValueError("Unknown shape class: {}".format(shape["class"])) + else: + raise ValueError("Unexpected shape class: {}".format(type(shape))) + + return dims + + + def getQueryParameter(request, query_name, body=None, default=None): """ Herlper function, get query parameter value from request. @@ -560,6 +599,33 @@ def getSelectionList(select, dims): return tuple(select_list) +def get_slices(select, dset_json): + """Get desired slices from selection query param string or json value. + If select is none or empty, slices for entire datashape will be + returned. + Refretch dims if the dataset is extensible + """ + + dset_id = dset_json["id"] + datashape = dset_json["shape"] + if datashape["class"] == "H5S_NULL": + msg = "Null space datasets can not be used as target for GET value" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + + dims = getShapeDims(datashape) # throws 400 for HS_NULL dsets + + try: + slices = getSelectionList(select, dims) + except ValueError: + msg = f"Invalid selection: {select} on dims: {dims} " + msg += f"for dataset: {dset_id}" + log.warn(msg) + raise + return slices + + + def getSelectionPagination(select, dims, itemsize, max_request_size): """ Paginate a select tupe into multiple selects where each diff --git a/tests/integ/broadcast_test.py b/tests/integ/broadcast_test.py index 5d0187a5..1bf3c2ee 100755 --- a/tests/integ/broadcast_test.py +++ b/tests/integ/broadcast_test.py @@ -109,8 +109,7 @@ def testPut1DDataset(self): def testPut1DDatasetBinary(self): # Test PUT value with broadcast for 1d dataset using binary data print("testPut1DDatasetBinary", self.base_domain) - NUM_ELEMENTS = 10 # 1000000 - this value is hitting nginx request size limit - + NUM_ELEMENTS = 10 headers = helper.getRequestHeaders(domain=self.base_domain) headers_bin_req = helper.getRequestHeaders(domain=self.base_domain) headers_bin_req["Content-Type"] = "application/octet-stream" From 8eec4398de225ccc4803b0996772a2b366764203 Mon Sep 17 00:00:00 2001 From: jreadey Date: Thu, 26 Oct 2023 08:47:47 -0700 Subject: [PATCH 17/17] flake8 updates --- hsds/chunk_sn.py | 8 +++----- hsds/dset_lib.py | 9 ++++----- hsds/util/dsetUtil.py | 2 -- tests/integ/broadcast_test.py | 2 +- 4 files changed, 8 insertions(+), 13 deletions(-) diff --git a/hsds/chunk_sn.py b/hsds/chunk_sn.py index 3a6eb4dd..650cc0fa 100755 --- a/hsds/chunk_sn.py +++ b/hsds/chunk_sn.py @@ -76,7 +76,6 @@ def use_http_streaming(request, rank): return True - async def PUT_Value(request): """ Handler for PUT //value request @@ -241,7 +240,7 @@ async def PUT_Value(request): except ValueError as ve: log.warn(f"Invalid selection: {ve}") raise HTTPBadRequest(reason="Invalid selection") - + if "Limit" in params: try: limit = int(params["Limit"]) @@ -663,7 +662,7 @@ async def PUT_Value(request): else: # - # Do 
point post + # Do point put # log.debug(f"num_points: {num_points}") @@ -813,7 +812,7 @@ async def GET_Value(request): except ValueError as ve: log.warn(f"Invalid selection: {ve}") raise HTTPBadRequest(reason="Invalid selection") - + log.debug(f"GET Value selection: {slices}") limit = 0 @@ -1050,7 +1049,6 @@ async def GET_Value(request): return resp - async def POST_Value(request): """ Handler for POST //value request - point selection or hyperslab read diff --git a/hsds/dset_lib.py b/hsds/dset_lib.py index 36e03989..3e2fc56e 100755 --- a/hsds/dset_lib.py +++ b/hsds/dset_lib.py @@ -41,6 +41,7 @@ "H5D_CHUNKED_REF_INDIRECT", ) + def getFillValue(dset_json): """ Return the fill value of the given dataset as a numpy array. If no fill value is defined, return an zero array of given type """ @@ -316,7 +317,6 @@ def getChunkItem(chunkid): return chunkinfo_map - def get_chunkmap_selections(chunk_map, chunk_ids, slices, dset_json): """Update chunk_map with chunk and data selections for the given set of slices @@ -370,6 +370,7 @@ def get_chunk_selections(chunk_map, chunk_ids, slices, dset_json): log.debug(f"get_chunk_selections - data_sel: {data_sel}") item["data_sel"] = data_sel + async def getSelectionData( app, dset_id, @@ -588,7 +589,6 @@ async def doReadSelection( return arr - async def removeChunks(app, chunk_ids, bucket=None): """ Remove chunks with the given ids """ @@ -735,12 +735,11 @@ async def reduceShape(app, dset_json, shape_update, bucket=None): log.debug(f"chunk_id: {chunk_id}") chunk_coord = getChunkCoordinate(chunk_id, layout) log.debug(f"chunk_coord: {chunk_coord}") - - + if np.all(np.add(chunk_coord, layout) <= shape_update): log.debug(f"chunk_id {chunk_id} no action needed") continue - + if np.any(chunk_coord < shape_update): log.debug(f"{chunk_id} reinit") update_ids.append(chunk_id) diff --git a/hsds/util/dsetUtil.py b/hsds/util/dsetUtil.py index 3e8cd475..5d9a2479 100644 --- a/hsds/util/dsetUtil.py +++ b/hsds/util/dsetUtil.py @@ -347,7 +347,6 @@ def getShapeDims(shape): return dims - def getQueryParameter(request, query_name, body=None, default=None): """ Herlper function, get query parameter value from request. @@ -625,7 +624,6 @@ def get_slices(select, dset_json): return slices - def getSelectionPagination(select, dims, itemsize, max_request_size): """ Paginate a select tupe into multiple selects where each diff --git a/tests/integ/broadcast_test.py b/tests/integ/broadcast_test.py index 1bf3c2ee..f480e637 100755 --- a/tests/integ/broadcast_test.py +++ b/tests/integ/broadcast_test.py @@ -109,7 +109,7 @@ def testPut1DDataset(self): def testPut1DDatasetBinary(self): # Test PUT value with broadcast for 1d dataset using binary data print("testPut1DDatasetBinary", self.base_domain) - NUM_ELEMENTS = 10 + NUM_ELEMENTS = 10 headers = helper.getRequestHeaders(domain=self.base_domain) headers_bin_req = helper.getRequestHeaders(domain=self.base_domain) headers_bin_req["Content-Type"] = "application/octet-stream"
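
As a companion to the repeated try/except blocks above: a minimal sketch of how a caller is expected to use the get_slices helper that now lives in hsds/util/dsetUtil.py. The dataset JSON below is a hypothetical stand-in for what the service node normally fetches from a data node, and the snippet assumes an environment where the hsds package and its config import cleanly.

    from aiohttp.web_exceptions import HTTPBadRequest
    from hsds.util.dsetUtil import getShapeDims, get_slices

    # hypothetical dataset JSON - real values come from the DN nodes
    dset_json = {
        "id": "d-00000000-0000-0000-0000-000000000000",
        "shape": {"class": "H5S_SIMPLE", "dims": [10, 4]},
    }

    dims = getShapeDims(dset_json["shape"])  # -> [10, 4]

    try:
        # select=None (or an empty string) selects the full dataspace; a select
        # string or JSON value is validated against dims and raises ValueError
        # if malformed
        slices = get_slices(None, dset_json)
    except ValueError:
        # the SN handlers translate this into a 400 response
        raise HTTPBadRequest(reason="Invalid selection")

    print(dims, slices)  # one slice per dimension covering the full extent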
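
Similarly, a small worked example of the vectorized chunk classification that reduceShape now uses in place of the explicit per-dimension loops. It is sketched with plain numpy arrays for a 1-d dataset being shrunk from 10 to 6 elements with a chunk layout of 4; in the patch, chunk_coord comes from getChunkCoordinate and layout from getChunkLayout.

    import numpy as np

    layout = np.array([4])        # chunk extent per dimension
    shape_update = np.array([6])  # new (reduced) dataset shape

    for chunk_coord in (np.array([0]), np.array([4]), np.array([8])):
        if np.all(np.add(chunk_coord, layout) <= shape_update):
            action = "no action"  # chunk lies entirely inside the new shape
        elif np.any(chunk_coord < shape_update):
            action = "reinit"     # edge chunk straddles the new boundary
        else:
            action = "delete"     # chunk falls entirely outside the new shape
        print(chunk_coord[0], action)  # 0: no action, 4: reinit, 8: delete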