From 634e6b1f1a9480885093d5d937d42674627f739c Mon Sep 17 00:00:00 2001 From: jreadey Date: Fri, 5 Jan 2024 05:12:23 +0000 Subject: [PATCH 01/18] refactor link dn --- hsds/ctype_sn.py | 17 +--- hsds/datanode.py | 8 +- hsds/dset_sn.py | 15 +-- hsds/group_sn.py | 19 +--- hsds/link_dn.py | 200 +++++++++++++++++++++++++++------------ hsds/link_sn.py | 105 ++++---------------- hsds/servicenode_lib.py | 123 ++++++++++++++++++++++-- tests/integ/link_test.py | 2 +- 8 files changed, 293 insertions(+), 196 deletions(-) diff --git a/hsds/ctype_sn.py b/hsds/ctype_sn.py index f3d0236e..3beae207 100755 --- a/hsds/ctype_sn.py +++ b/hsds/ctype_sn.py @@ -16,7 +16,7 @@ from aiohttp.web_exceptions import HTTPBadRequest, HTTPGone from json import JSONDecodeError -from .util.httpUtil import http_post, http_put, http_delete, getHref, respJsonAssemble +from .util.httpUtil import http_post, http_delete, getHref, respJsonAssemble from .util.httpUtil import jsonResponse from .util.idUtil import isValidUuid, getDataNodeUrl, createObjId from .util.authUtil import getUserPasswordFromRequest, aclCheck @@ -25,7 +25,7 @@ from .util.domainUtil import getBucketForDomain, verifyRoot from .util.hdf5dtype import validateTypeItem, getBaseTypeJson from .servicenode_lib import getDomainJson, getObjectJson, validateAction -from .servicenode_lib import getObjectIdByPath, getPathForObjectId +from .servicenode_lib import getObjectIdByPath, getPathForObjectId, putHardLink from . import hsds_logger as log @@ -223,22 +223,13 @@ async def POST_Datatype(request): ctype_json = {"id": ctype_id, "root": root_id, "type": datatype} log.debug(f"create named type, body: {ctype_json}") req = getDataNodeUrl(app, ctype_id) + "/datatypes" - params = {} - if bucket: - params["bucket"] = bucket + params = {"bucket": bucket} type_json = await http_post(app, req, data=ctype_json, params=params) # create link if requested if link_id and link_title: - link_json = {} - link_json["id"] = ctype_id - link_json["class"] = "H5L_TYPE_HARD" - link_req = getDataNodeUrl(app, link_id) - link_req += "/groups/" + link_id + "/links/" + link_title - log.debug("PUT link - : " + link_req) - put_rsp = await http_put(app, link_req, data=link_json, params=params) - log.debug(f"PUT Link resp: {put_rsp}") + await putHardLink(app, link_id, link_title, tgt_id=ctype_id, bucket=bucket) # datatype creation successful resp = await jsonResponse(request, type_json, status=201) diff --git a/hsds/datanode.py b/hsds/datanode.py index 1efd9063..7e8c9a9d 100644 --- a/hsds/datanode.py +++ b/hsds/datanode.py @@ -30,7 +30,7 @@ from .domain_dn import GET_Domain, PUT_Domain, DELETE_Domain, PUT_ACL from .group_dn import GET_Group, POST_Group, DELETE_Group, PUT_Group from .group_dn import POST_Root -from .link_dn import GET_Links, GET_Link, PUT_Link, DELETE_Link +from .link_dn import GET_Links, POST_Links, PUT_Links, DELETE_Links from .attr_dn import GET_Attributes, POST_Attributes from .attr_dn import PUT_Attributes, DELETE_Attributes from .ctype_dn import GET_Datatype, POST_Datatype, DELETE_Datatype @@ -59,9 +59,9 @@ async def init(): app.router.add_route("PUT", "/groups/{id}", PUT_Group) app.router.add_route("POST", "/groups", POST_Group) app.router.add_route("GET", "/groups/{id}/links", GET_Links) - app.router.add_route("GET", "/groups/{id}/links/{title}", GET_Link) - app.router.add_route("DELETE", "/groups/{id}/links/{title}", DELETE_Link) - app.router.add_route("PUT", "/groups/{id}/links/{title}", PUT_Link) + app.router.add_route("POST", "/groups/{id}/links", POST_Links) + 
app.router.add_route("DELETE", "/groups/{id}/links", DELETE_Links) + app.router.add_route("PUT", "/groups/{id}/links", PUT_Links) app.router.add_route("GET", "/groups/{id}/attributes", GET_Attributes) app.router.add_route("POST", "/groups/{id}/attributes", POST_Attributes) app.router.add_route("DELETE", "/groups/{id}/attributes", DELETE_Attributes) diff --git a/hsds/dset_sn.py b/hsds/dset_sn.py index 7df44873..c6a5ae6d 100755 --- a/hsds/dset_sn.py +++ b/hsds/dset_sn.py @@ -34,7 +34,7 @@ from .util.hdf5dtype import validateTypeItem, createDataType, getBaseTypeJson from .util.hdf5dtype import getItemSize from .servicenode_lib import getDomainJson, getObjectJson, getDsetJson, getPathForObjectId -from .servicenode_lib import getObjectIdByPath, validateAction, getRootInfo, doFlush +from .servicenode_lib import getObjectIdByPath, validateAction, getRootInfo, doFlush, putHardLink from .dset_lib import reduceShape from . import config from . import hsds_logger as log @@ -1170,22 +1170,13 @@ async def POST_Dataset(request): log.debug(f"create dataset: {dataset_json}") req = getDataNodeUrl(app, dset_id) + "/datasets" - params = {} - if bucket: - params["bucket"] = bucket + params = {"bucket": bucket} post_json = await http_post(app, req, data=dataset_json, params=params) # create link if requested if link_id and link_title: - link_json = {} - link_json["id"] = dset_id - link_json["class"] = "H5L_TYPE_HARD" - link_req = getDataNodeUrl(app, link_id) - link_req += "/groups/" + link_id + "/links/" + link_title - log.info("PUT link - : " + link_req) - put_rsp = await http_put(app, link_req, data=link_json, params=params) - log.debug(f"PUT Link resp: {put_rsp}") + await putHardLink(app, link_id, link_title, tgt_id=dset_id, bucket=bucket) # dataset creation successful resp = await jsonResponse(request, post_json, status=201) diff --git a/hsds/group_sn.py b/hsds/group_sn.py index 98d58ed1..d09baacc 100755 --- a/hsds/group_sn.py +++ b/hsds/group_sn.py @@ -16,7 +16,7 @@ from aiohttp.web_exceptions import HTTPBadRequest, HTTPForbidden, HTTPNotFound from json import JSONDecodeError -from .util.httpUtil import http_post, http_put, http_delete, getHref +from .util.httpUtil import http_post, http_delete, getHref from .util.httpUtil import jsonResponse from .util.idUtil import isValidUuid, getDataNodeUrl, createObjId from .util.authUtil import getUserPasswordFromRequest, aclCheck @@ -24,7 +24,7 @@ from .util.domainUtil import getDomainFromRequest, isValidDomain from .util.domainUtil import getBucketForDomain, getPathForDomain, verifyRoot from .servicenode_lib import getDomainJson, getObjectJson, validateAction -from .servicenode_lib import getObjectIdByPath, getPathForObjectId +from .servicenode_lib import getObjectIdByPath, getPathForObjectId, putHardLink from . 
import hsds_logger as log @@ -223,23 +223,14 @@ async def POST_Group(request): group_json["creationProperties"] = creation_props log.debug(f"create group, body: {group_json}") req = getDataNodeUrl(app, group_id) + "/groups" - params = {} - if bucket: - params["bucket"] = bucket + params = {"bucket": bucket} group_json = await http_post(app, req, data=group_json, params=params) # create link if requested if link_id and link_title: - link_json = {} - link_json["id"] = group_id - link_json["class"] = "H5L_TYPE_HARD" - link_req = getDataNodeUrl(app, link_id) - link_req += "/groups/" + link_id + "/links/" + link_title - log.debug("PUT link - : " + link_req) - kwargs = {"data": link_json, "params": params} - put_json_rsp = await http_put(app, link_req, **kwargs) - log.debug(f"PUT Link resp: {put_json_rsp}") + await putHardLink(app, link_id, link_title, tgt_id=group_id, bucket=bucket) + log.debug("returning resp") # group creation successful resp = await jsonResponse(request, group_json, status=201) diff --git a/hsds/link_dn.py b/hsds/link_dn.py index 27b55050..f6bf7661 100755 --- a/hsds/link_dn.py +++ b/hsds/link_dn.py @@ -21,6 +21,7 @@ from aiohttp.web_exceptions import HTTPInternalServerError from aiohttp.web import json_response +from .util.arrayUtil import decodeData from .util.idUtil import isValidUuid from .util.linkUtil import validateLinkName from .datanode_lib import get_obj_id, get_metadata_obj, save_metadata_obj @@ -82,7 +83,7 @@ async def GET_Links(request): group_json = await get_metadata_obj(app, group_id, bucket=bucket) - log.info(f"for id: {group_id} got group json: {group_json}") + log.debug(f"for id: {group_id} got group json: {group_json}") if "links" not in group_json: msg.error(f"unexpected group data for id: {group_id}") raise HTTPInternalServerError() @@ -136,25 +137,33 @@ async def GET_Links(request): return resp -async def GET_Link(request): - """HTTP GET method to return JSON for a link""" +async def POST_Links(request): + """HTTP POST method to return JSON for a link a given set of links """ log.request(request) app = request.app params = request.rel_url.query group_id = get_obj_id(request) - log.info(f"GET link: {group_id}") + log.info(f"POST_Links: {group_id}") if not isValidUuid(group_id, obj_class="group"): log.error(f"Unexpected group_id: {group_id}") raise HTTPInternalServerError() - link_title = request.match_info.get("title") + body = await request.json() + if "titles" not in body: + msg = f"POST_Links expected titles in body but got: {body.keys()}" + log.warn(msg) + raise HTTPBadRequest(reason=msg) - validateLinkName(link_title) + titles = body["titles"] # list of link names to fetch + + for title in titles: + validateLinkName(title) bucket = None if "bucket" in params: bucket = params["bucket"] + if not bucket: msg = "GET_Links - no bucket param" log.warn(msg) @@ -162,60 +171,75 @@ async def GET_Link(request): group_json = await get_metadata_obj(app, group_id, bucket=bucket) log.info(f"for id: {group_id} got group json: {group_json}") + if "links" not in group_json: log.error(f"unexpected group data for id: {group_id}") raise HTTPInternalServerError() links = group_json["links"] - if link_title not in links: - log.info(f"Link name {link_title} not found in group: {group_id}") + + link_list = [] # links to be returned + + for title in titles: + if title not in links: + log.info(f"Link name {title} not found in group: {group_id}") + raise HTTPNotFound() + link_json = links[title] + link_list.append(link_json) + + if not link_list: + msg = f"POST link - 
requested {len(titles)} but none were found" + log.warn(msg) raise HTTPNotFound() - link_json = links[link_title] + if len(link_list) != len(titles): + msg = f"POST_links - requested {len(titles)} links but only " + msg += f"{len(link_list)} were found" + log.warn(msg) + raise HTTPNotFound() - resp = json_response(link_json) + rspJson = {"links": link_list} + resp = json_response(rspJson) log.response(request, resp=resp) return resp -async def PUT_Link(request): - """Handler creating a new link""" +async def PUT_Links(request): + """Handler creating new links """ log.request(request) app = request.app params = request.rel_url.query group_id = get_obj_id(request) - log.info(f"PUT link: {group_id}") + log.info(f"PUT links: {group_id}") + if not isValidUuid(group_id, obj_class="group"): log.error(f"Unexpected group_id: {group_id}") raise HTTPInternalServerError() - link_title = request.match_info.get("title") - validateLinkName(link_title) - - log.info(f"link_title: {link_title}") - if not request.has_body: - msg = "PUT Link with no body" + msg = "PUT_Links with no body" log.warn(msg) raise HTTPBadRequest(reason=msg) body = await request.json() - if "class" not in body: - msg = "PUT Link with no class key body" + if "links" not in body: + msg = "PUT_Links with no links key in body" log.warn(msg) raise HTTPBadRequest(reason=msg) - link_class = body["class"] - link_json = {} - link_json["class"] = link_class + items = body["links"] + + # validate input + for title in items: + validateLinkName(title) + item = items[title] - if "id" in body: - link_json["id"] = body["id"] - if "h5path" in body: - link_json["h5path"] = body["h5path"] - if "h5domain" in body: - link_json["h5domain"] = body["h5domain"] + if "id" in item: + if not isValidUuid(item["id"]): + msg = f"invalid uuid for {title}" + log.warn(msg) + raise HTTPBadRequest(reason=msg) if "bucket" in params: bucket = params["bucket"] @@ -225,7 +249,7 @@ async def PUT_Link(request): bucket = None if not bucket: - msg = "GET_Links - no bucket param" + msg = "PUT_Links - no bucket provided" log.warn(msg) raise HTTPBadRequest(reason=msg) @@ -235,68 +259,129 @@ async def PUT_Link(request): raise HTTPInternalServerError() links = group_json["links"] - if link_title in links: - msg = f"Link name {link_title} already found in group: {group_id}" - log.warn(msg) - raise HTTPConflict() + dup_titles = [] + for title in items: + if title in links: + link_json = items[title] + existing_link = links[title] + for prop in ("class", "id", "h5path", "h5domain"): + if prop in link_json: + if prop not in existing_link: + msg = f"PUT Link - prop {prop} not found in existing " + msg += "link, returning 409" + log.warn(msg) + raise HTTPConflict() + + if link_json[prop] != existing_link[prop]: + msg = f"PUT Links - prop {prop} value is different, old: " + msg += f"{existing_link[prop]}, new: {link_json[prop]}, " + msg += "returning 409" + log.warn(msg) + raise HTTPConflict() + msg = f"Link name {title} already found in group: {group_id}" + log.warn(msg) + dup_titles.append(title) - now = time.time() - link_json["created"] = now + for title in dup_titles: + del items[title] - # add the link - links[link_title] = link_json + if items: - # update the group lastModified - group_json["lastModified"] = now + now = time.time() - # write back to S3, save to metadata cache - await save_metadata_obj(app, group_id, group_json, bucket=bucket) + # add the links + for title in items: + item = items[title] + item["created"] = now + links[title] = item - resp_json = {} + # update the 
group lastModified + group_json["lastModified"] = now + + # write back to S3, save to metadata cache + await save_metadata_obj(app, group_id, group_json, bucket=bucket) + + status = 201 + else: + # nothing to update + status = 200 - resp = json_response(resp_json, status=201) + # put the status in the JSON response since the http_put function + # used the the SN won't return it + resp_json = {"status": status} + + resp = json_response(resp_json, status=status) log.response(request, resp=resp) return resp -async def DELETE_Link(request): +async def DELETE_Links(request): """HTTP DELETE method for group links""" log.request(request) app = request.app params = request.rel_url.query group_id = get_obj_id(request) - log.info(f"DELETE link: {group_id}") + log.info(f"DELETE links: {group_id}") if not isValidUuid(group_id, obj_class="group"): msg = f"Unexpected group_id: {group_id}" log.warn(msg) raise HTTPBadRequest(reason=msg) - link_title = request.match_info.get("title") - validateLinkName(link_title) + if "encoding" in params: + encoding = params["encoding"] + if encoding != "base64": + msg = "only base64 encoding is supported" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + else: + encoding = None + + if "separator" in params: + separator = params["separator"] + else: + separator = "/" if "bucket" in params: bucket = params["bucket"] else: bucket = None + if not bucket: - msg = "GET_Links - no bucket param" + msg = "DELETE_Links - no bucket param" log.warn(msg) raise HTTPBadRequest(reason=msg) + if "titles" not in params: + msg = "expected titles for DELETE links" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + + titles_param = params["titles"] + if encoding: + titles_param = decodeData(titles_param).decode("utf-8") + + titles = titles_param.split(separator) + + log.info(f"DELETE links {titles} in {group_id} bucket: {bucket}") + group_json = await get_metadata_obj(app, group_id, bucket=bucket) - # TBD: Possible race condition + if "links" not in group_json: log.error(f"unexpected group data for id: {group_id}") raise HTTPInternalServerError() links = group_json["links"] - if link_title not in links: - msg = f"Link name {link_title} not found in group: {group_id}" - log.warn(msg) - raise HTTPNotFound() - del links[link_title] # remove the link from dictionary + for title in titles: + if title not in links: + msg = f"Link name {title} not found in group: {group_id}" + log.warn(msg) + raise HTTPNotFound() + + # now delete the links + for title in titles: + del links[title] # remove the link from dictionary # update the group lastModified now = time.time() @@ -305,8 +390,7 @@ async def DELETE_Link(request): # write back to S3 await save_metadata_obj(app, group_id, group_json, bucket=bucket) - hrefs = [] # TBD - resp_json = {"href": hrefs} + resp_json = {} resp = json_response(resp_json) log.response(request, resp=resp) diff --git a/hsds/link_sn.py b/hsds/link_sn.py index cb75d9a2..3bad1afc 100755 --- a/hsds/link_sn.py +++ b/hsds/link_sn.py @@ -13,17 +13,17 @@ # service node of hsds cluster # -from aiohttp.web_exceptions import HTTPBadRequest, HTTPConflict +from aiohttp.web_exceptions import HTTPBadRequest from json import JSONDecodeError -from .util.httpUtil import http_get, http_put, http_delete, getHref +from .util.httpUtil import http_get, http_delete, getHref from .util.httpUtil import jsonResponse from .util.idUtil import isValidUuid, getDataNodeUrl, getCollectionForId from .util.authUtil import getUserPasswordFromRequest, validateUserPassword from .util.domainUtil import 
getDomainFromRequest, isValidDomain from .util.domainUtil import getBucketForDomain from .util.linkUtil import validateLinkName -from .servicenode_lib import validateAction, getObjectJson +from .servicenode_lib import validateAction, getLink, putLink from . import config from . import hsds_logger as log @@ -155,13 +155,14 @@ async def GET_Link(request): await validateAction(app, domain, group_id, username, "read") req = getDataNodeUrl(app, group_id) - req += "/groups/" + group_id + "/links/" + link_title + req += "/groups/" + group_id + "/links" log.debug("get LINK: " + req) params = {} if bucket: params["bucket"] = bucket - link_json = await http_get(app, req, params=params) - log.debug("got link_json: " + str(link_json)) + + link_json = await getLink(app, group_id, link_title, bucket=bucket) + resp_link = {} resp_link["title"] = link_title link_class = link_json["class"] @@ -212,13 +213,9 @@ async def PUT_Link(request): msg = "Missing group id" log.warn(msg) raise HTTPBadRequest(reason=msg) - if not isValidUuid(group_id, obj_class="Group"): - msg = f"Invalid group id: {group_id}" - log.warn(msg) - raise HTTPBadRequest(reason=msg) + link_title = request.match_info.get("title") log.info(f"PUT Link_title: [{link_title}]") - validateLinkName(link_title) username, pswd = getUserPasswordFromRequest(request) # write actions need auth @@ -236,29 +233,6 @@ async def PUT_Link(request): log.warn(msg) raise HTTPBadRequest(reason=msg) - link_json = {} - if "id" in body: - if not isValidUuid(body["id"]): - msg = "PUT Link with invalid id in body" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - link_json["id"] = body["id"] - link_json["class"] = "H5L_TYPE_HARD" - - elif "h5path" in body: - link_json["h5path"] = body["h5path"] - # could be hard or soft link - if "h5domain" in body: - link_json["h5domain"] = body["h5domain"] - link_json["class"] = "H5L_TYPE_EXTERNAL" - else: - # soft link - link_json["class"] = "H5L_TYPE_SOFT" - else: - msg = "PUT Link with no id or h5path keys" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - domain = getDomainFromRequest(request) if not isValidDomain(domain): msg = f"Invalid domain: {domain}" @@ -270,59 +244,20 @@ async def PUT_Link(request): await validateAction(app, domain, group_id, username, "create") - # for hard links, verify that the referenced id exists and is in - # this domain - if "id" in body: - ref_id = body["id"] - ref_json = await getObjectJson(app, ref_id, bucket=bucket) - group_json = await getObjectJson(app, group_id, bucket=bucket) - if ref_json["root"] != group_json["root"]: - msg = "Hard link must reference an object in the same domain" - log.warn(msg) - raise HTTPBadRequest(reason=msg) + # putLink will validate these arguments + kwargs = {"bucket": bucket} + kwargs["tgt_id"] = body.get("id") + kwargs["h5path"] = body.get("h5path") + kwargs["h5domain"] = body.get("h5domain") + + status = await putLink(app, group_id, link_title, **kwargs) - # ready to add link now - req = getDataNodeUrl(app, group_id) - req += "/groups/" + group_id + "/links/" + link_title - log.debug("PUT link - getting group: " + req) - params = {} - if bucket: - params["bucket"] = bucket - try: - put_rsp = await http_put(app, req, data=link_json, params=params) - log.debug("PUT Link resp: " + str(put_rsp)) - dn_status = 201 - except HTTPConflict: - # check to see if this is just a duplicate put of an existing link - dn_status = 409 - log.warn(f"PUT Link: got conflict error for link_json: {link_json}") - existing_link = await http_get(app, req, params=params) - 
log.warn(f"PUT Link: fetched existing link: {existing_link}") - for prop in ("class", "id", "h5path", "h5domain"): - if prop in link_json: - if prop not in existing_link: - msg = f"PUT Link - prop {prop} not found in existing " - msg += "link, returning 409" - log.warn(msg) - break - if link_json[prop] != existing_link[prop]: - msg = f"PUT Link - prop {prop} value is different, old: " - msg += f"{existing_link[prop]}, new: {link_json[prop]}, " - msg += "returning 409" - log.warn(msg) - break - else: - log.info("PUT link is identical to existing value returning OK") - # return 200 since we didn't actually create a resource - dn_status = 200 - if dn_status == 409: - raise # return 409 to client hrefs = [] # TBD req_rsp = {"hrefs": hrefs} # link creation successful # returns 201 if new link was created, 200 if this is a duplicate # of an existing link - resp = await jsonResponse(request, req_rsp, status=dn_status) + resp = await jsonResponse(request, req_rsp, status=status) log.response(request, resp=resp) return resp @@ -360,10 +295,10 @@ async def DELETE_Link(request): await validateAction(app, domain, group_id, username, "delete") req = getDataNodeUrl(app, group_id) - req += "/groups/" + group_id + "/links/" + link_title - params = {} - if bucket: - params["bucket"] = bucket + req += "/groups/" + group_id + "/links" + + params = {"bucket": bucket, "titles": link_title} + rsp_json = await http_delete(app, req, params=params) resp = await jsonResponse(request, rsp_json) diff --git a/hsds/servicenode_lib.py b/hsds/servicenode_lib.py index c5f0c561..2c75f5a9 100644 --- a/hsds/servicenode_lib.py +++ b/hsds/servicenode_lib.py @@ -22,8 +22,9 @@ from .util.authUtil import getAclKeys from .util.arrayUtil import encodeData -from .util.idUtil import getDataNodeUrl, getCollectionForId, isSchema2Id, getS3Key -from .util.linkUtil import h5Join +from .util.idUtil import getDataNodeUrl, getCollectionForId +from .util.idUtil import isSchema2Id, getS3Key, isValidUuid +from .util.linkUtil import h5Join, validateLinkName from .util.storUtil import getStorJSONObj, isStorObj from .util.authUtil import aclCheck from .util.httpUtil import http_get, http_put, http_post, http_delete @@ -341,6 +342,116 @@ async def getDsetJson(app, dset_id, return dset_json +async def getLink(app, group_id, title, bucket=None): + """ Get the link json for the given title """ + + req = getDataNodeUrl(app, group_id) + req += "/groups/" + group_id + "/links" + log.debug(f"getLink for {group_id} - title: {title}") + params = {"bucket": bucket} + + data = {"titles": [title, ]} + post_rsp = await http_post(app, req, data=data, params=params) + log.debug(f"got link_json: {post_rsp}") + if "links" not in post_rsp: + log.error("unexpected response from post links") + raise HTTPInternalServerError() + links = post_rsp["links"] + if len(links) != 1: + log.error(f"expected 1 link but got: {len(links)}") + raise HTTPInternalServerError() + link_json = links[0] + return link_json + + +async def putLink(app, group_id, title, tgt_id=None, h5path=None, h5domain=None, bucket=None): + """ create a new link. Return 201 if this is a new link, + or 200 if it's a duplicate of an existing link. 
""" + + validateLinkName(title) + + if h5path and tgt_id: + msg = "putLink - provide tgt_id or h5path, but not both" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + + link_json = {} + + if tgt_id: + if not isValidUuid(tgt_id): + msg = f"putLink with invalid id: {tgt_id}" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + link_json["id"] = tgt_id + link_class = "H5L_TYPE_HARD" + elif h5path: + link_json["h5path"] = h5path + # could be hard or soft link + if h5domain: + link_json["h5domain"] = h5domain + link_class = "H5L_TYPE_EXTERNAL" + else: + # soft link + link_class = "H5L_TYPE_SOFT" + else: + msg = "PUT Link with no id or h5path keys" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + + link_json["class"] = link_class + + # for hard links, verify that the referenced id exists and is in + # this domain + if link_class == "H5L_TYPE_HARD": + tgt_id = link_json["id"] + ref_json = await getObjectJson(app, tgt_id, bucket=bucket) + group_json = await getObjectJson(app, group_id, bucket=bucket) + if ref_json["root"] != group_json["root"]: + msg = "Hard link must reference an object in the same domain" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + + # ready to add link now + req = getDataNodeUrl(app, group_id) + req += "/groups/" + group_id + "/links" + log.debug(f"PUT links - PUT request: {req}") + params = {"bucket": bucket} + + data = {"links": {title: link_json}} + + put_rsp = await http_put(app, req, data=data, params=params) + log.debug(f"PUT Link resp: {put_rsp}") + if "status" in put_rsp: + status = put_rsp["status"] + else: + status = 201 + return status + + +async def putHardLink(app, group_id, title, tgt_id=None, bucket=None): + """ create a new hard link. Return 201 if this is a new link, + or 201 if it's a duplicate of an existing link """ + + status = await putLink(app, group_id, title, tgt_id=tgt_id, bucket=bucket) + return status + + +async def putSoftLink(app, group_id, title, h5path=None, bucket=None): + """ create a new hard link. Return 201 if this is a new link, + or 201 if it's a duplicate of an existing link """ + + status = await putLink(app, group_id, title, h5path=h5path, bucket=bucket) + return status + + +async def putExternalLink(app, group_id, title, h5path=None, h5domain=None, bucket=None): + """ create a new hard link. Return 201 if this is a new link, + or 201 if it's a duplicate of an existing link """ + + status = await putLink(app, group_id, title, h5path=h5path, h5domain=h5domain, bucket=bucket) + return status + + async def getObjectIdByPath(app, obj_id, h5path, bucket=None, refresh=False, domain=None, follow_soft_links=False, follow_external_links=False): """Find the object at the provided h5path location. 
@@ -379,13 +490,7 @@ async def getObjectIdByPath(app, obj_id, h5path, bucket=None, refresh=False, dom if not link: continue # skip empty link - req = getDataNodeUrl(app, obj_id) - req += "/groups/" + obj_id + "/links/" + link - log.debug("get LINK: " + req) - params = {} - if bucket: - params["bucket"] = bucket - link_json = await http_get(app, req, params=params) + link_json = await getLink(app, obj_id, link, bucket=bucket) if link_json["class"] == "H5L_TYPE_EXTERNAL": if not follow_external_links: diff --git a/tests/integ/link_test.py b/tests/integ/link_test.py index 02392cdb..3728561a 100755 --- a/tests/integ/link_test.py +++ b/tests/integ/link_test.py @@ -96,7 +96,7 @@ def testHardLink(self): self.assertEqual(rspLink["id"], grp1_id) self.assertEqual(rspLink["collection"], "groups") - # try creating the link again (be ok = PUT is idempotent) + # try creating the link again (should be ok - PUT is idempotent) rsp = self.session.put(req, data=json.dumps(payload), headers=headers) self.assertEqual(rsp.status_code, 200) # OK From acec4bee2336731760b8a48e35951f6c223617e4 Mon Sep 17 00:00:00 2001 From: jreadey Date: Mon, 8 Jan 2024 15:06:58 +0000 Subject: [PATCH 02/18] post method for links --- hsds/attr_sn.py | 28 +---- hsds/chunk_crawl.py | 11 -- hsds/domain_crawl.py | 112 ++++++++++++++++++- hsds/domain_sn.py | 14 +-- hsds/link_dn.py | 51 ++++++--- hsds/link_sn.py | 226 +++++++++++++++++++++++++++++++++++---- hsds/servicenode.py | 4 +- hsds/servicenode_lib.py | 62 +++++++++-- hsds/util/domainUtil.py | 32 +++--- tests/integ/attr_test.py | 19 +--- tests/integ/helper.py | 19 ++++ tests/integ/link_test.py | 201 ++++++++++++++++++++++++++++++++-- 12 files changed, 643 insertions(+), 136 deletions(-) diff --git a/hsds/attr_sn.py b/hsds/attr_sn.py index 579ae7ff..af42b337 100755 --- a/hsds/attr_sn.py +++ b/hsds/attr_sn.py @@ -20,7 +20,7 @@ from .util.httpUtil import getHref from .util.httpUtil import getAcceptType, jsonResponse -from .util.idUtil import isValidUuid, getCollectionForId, getRootObjId +from .util.idUtil import isValidUuid, getRootObjId from .util.authUtil import getUserPasswordFromRequest, validateUserPassword from .util.domainUtil import getDomainFromRequest, isValidDomain from .util.domainUtil import getBucketForDomain, verifyRoot @@ -1106,7 +1106,7 @@ async def PUT_AttributeValue(request): async def POST_Attributes(request): - """HTTP method to get multiple attribute values""" + """HTTP method to get multiple attributes """ log.request(request) app = request.app log.info("POST_Attributes") @@ -1247,7 +1247,6 @@ async def POST_Attributes(request): elif len(items) == 1: # just make a request the datanode obj_id = list(items.keys())[0] - collection = getCollectionForId(obj_id) attr_names = items[obj_id] kwargs = {"attr_names": attr_names, "bucket": bucket} if not include_data: @@ -1259,12 +1258,6 @@ async def POST_Attributes(request): attributes = await getAttributes(app, obj_id, **kwargs) - # mixin hrefs - for attribute in attributes: - attr_name = attribute["name"] - attr_href = f"/{collection}/{obj_id}/attributes/{attr_name}" - attribute["href"] = getHref(request, attr_href) - resp_json["attributes"] = attributes else: # get multi obj @@ -1288,31 +1281,16 @@ async def POST_Attributes(request): msg = f"DomainCrawler returned: {len(crawler._obj_dict)} objects" log.info(msg) attributes = crawler._obj_dict - # mixin hrefs + # log attributes returned for each obj_id for obj_id in attributes: obj_attributes = attributes[obj_id] msg = f"POST_Attributes, obj_id {obj_id} " msg += 
f"returned {len(obj_attributes)}" log.debug(msg) - collection = getCollectionForId(obj_id) - for attribute in obj_attributes: - log.debug(f"attribute: {attribute}") - attr_name = attribute["name"] - attr_href = f"/{collection}/{obj_id}/attributes/{attr_name}" - attribute["href"] = getHref(request, attr_href) log.debug(f"got {len(attributes)} attributes") resp_json["attributes"] = attributes - hrefs = [] - collection = getCollectionForId(req_id) - obj_uri = "/" + collection + "/" + req_id - href = getHref(request, obj_uri + "/attributes") - hrefs.append({"rel": "self", "href": href}) - hrefs.append({"rel": "home", "href": getHref(request, "/")}) - hrefs.append({"rel": "owner", "href": getHref(request, obj_uri)}) - resp_json["hrefs"] = hrefs - resp = await jsonResponse(request, resp_json, ignore_nan=ignore_nan) log.response(request, resp=resp) return resp diff --git a/hsds/chunk_crawl.py b/hsds/chunk_crawl.py index b131ca9b..426cb169 100755 --- a/hsds/chunk_crawl.py +++ b/hsds/chunk_crawl.py @@ -81,9 +81,6 @@ async def write_chunk_hyperslab( np_arr: numpy array of data to be written """ - if not bucket: - bucket = config.get("bucket_name") - msg = f"write_chunk_hyperslab, chunk_id: {chunk_id}, slices: {slices}, " msg += f"bucket: {bucket}" log.info(msg) @@ -181,8 +178,6 @@ async def read_chunk_hyperslab( entire object) bucket: s3 bucket to read from """ - if not bucket: - bucket = config.get("bucket_name") if chunk_map is None: log.error("expected chunk_map to be set") @@ -444,9 +439,6 @@ async def read_point_sel( arr: numpy array to store read bytes """ - if not bucket: - bucket = config.get("bucket_name") - msg = f"read_point_sel, chunk_id: {chunk_id}, bucket: {bucket}" log.info(msg) @@ -549,9 +541,6 @@ async def write_point_sel( point_data: index of arr element to update for a given point """ - if not bucket: - bucket = config.get("bucket_name") - msg = f"write_point_sel, chunk_id: {chunk_id}, points: {point_list}, " msg += f"data: {point_data}" log.info(msg) diff --git a/hsds/domain_crawl.py b/hsds/domain_crawl.py index 84ee7a87..8697e267 100644 --- a/hsds/domain_crawl.py +++ b/hsds/domain_crawl.py @@ -18,10 +18,8 @@ from aiohttp.web_exceptions import HTTPServiceUnavailable, HTTPConflict, HTTPBadRequest from aiohttp.web_exceptions import HTTPInternalServerError, HTTPNotFound, HTTPGone - from .util.idUtil import getCollectionForId, getDataNodeUrl - -from .servicenode_lib import getObjectJson, getAttributes, putAttributes +from .servicenode_lib import getObjectJson, getAttributes, putAttributes, getLinks from . import hsds_logger as log @@ -210,6 +208,93 @@ async def get_obj_json(self, obj_id): self._obj_dict[link_id] = {} # placeholder for obj id self._q.put_nowait(link_id) + async def get_links(self, grp_id, titles=None): + """ if titles is set, get all the links in grp_id that + have a title in the list. Otherwise, return all links for the object. 
""" + log.debug(f"get_links: {grp_id}, titles; {titles}") + collection = getCollectionForId(grp_id) + if collection != "groups": + log.warn(f"get_links, expected groups id but got: {grp_id}") + return + kwargs = {} + if titles: + kwargs["titles"] = titles + if self._params.get("bucket"): + kwargs["bucket"] = self._params["bucket"] + + if self._params.get("follow_links"): + follow_links = True + else: + follow_links = False + + log.debug(f"follow_links: {follow_links}") + log.debug(f"getLinks kwargs: {kwargs}") + + links = None + status = 200 + try: + links = await getLinks(self._app, grp_id, **kwargs) + except HTTPNotFound: + status = 404 + except HTTPServiceUnavailable: + status = 503 + except HTTPInternalServerError: + status = 500 + except Exception as e: + log.error(f"unexpected exception {e}") + status = 500 + log.debug(f"getObjectJson status: {status}") + + if links is None: + msg = f"DomainCrawler - get_links for {grp_id} " + if status >= 500: + msg += f"failed, status: {status}" + log.error(msg) + else: + msg += f"returned status: {status}" + log.warn(msg) + return + + log.debug(f"DomainCrawler - got links for {grp_id}") + log.debug(f"save to obj_dict: {links}") + + self._obj_dict[grp_id] = links # store the links + + # if follow_links, add any group links to the lookup ids set + if follow_links: + log.debug(f"follow links for {grp_id}") + for link_obj in links: + log.debug(f"follow links for: {link_obj}") + if 'title' not in link_obj: + log.warn(f"expected to find title in link_json: {link_obj}") + continue + title = link_obj["title"] + log.debug(f"DomainCrawler - got link: {title}") + num_objects = len(self._obj_dict) + if self._params.get("max_objects_limit") is not None: + max_objects_limit = self._params["max_objects_limit"] + if num_objects >= max_objects_limit: + msg = "DomainCrawler reached limit of " + msg += f"{max_objects_limit}" + log.info(msg) + break + if link_obj["class"] != "H5L_TYPE_HARD": + # just follow hardlinks + log.debug("not hard link,continue") + continue + link_id = link_obj["id"] + if getCollectionForId(link_id) != "groups": + # only groups can have links + log.debug(f"link id: {link_id} is not for a group, continue") + continue + if link_id not in self._obj_dict: + # haven't seen this object yet, get obj json + log.debug(f"DomainCrawler - adding link_id: {link_id} to queue") + self._obj_dict[link_id] = {} # placeholder for obj id + self._q.put_nowait(link_id) + else: + log.debug(f"link: {link_id} already in object dict") + def get_status(self): """ return the highest status of any of the returned objects """ status = None @@ -304,7 +389,7 @@ async def fetch(self, obj_id): log.warn("expected at least one name in attr_names list") return - log.debug(f"DomainCrawler - got attribute names: {attr_names}") + log.debug(f"DomainCrawler - get attribute names: {attr_names}") await self.get_attributes(obj_id, attr_names) elif self._action == "put_attr": log.debug("DomainCrawler - put attributes") @@ -316,6 +401,25 @@ async def fetch(self, obj_id): log.debug(f"got {len(attr_items)} attr_items") await self.put_attributes(obj_id, attr_items) + elif self._action == "get_link": + log.debug("DomainCrawlwer - get links") + if obj_id not in self._objs: + link_titles = None # fetch all links for this object + else: + link_titles = self._objs[obj_id] + if link_titles is None: + log.debug(f"fetch all links for {obj_id}") + else: + if not isinstance(link_titles, list): + log.error("expected list for link titles") + return + if len(link_titles) == 0: + log.warn("expected at 
least one name in link titles list") + return + + log.debug(f"DomainCrawler - get link titles: {link_titles}") + await self.get_links(obj_id, link_titles) + else: msg = f"DomainCrawler: unexpected action: {self._action}" log.error(msg) diff --git a/hsds/domain_sn.py b/hsds/domain_sn.py index ad6baf70..d7778ecd 100755 --- a/hsds/domain_sn.py +++ b/hsds/domain_sn.py @@ -453,7 +453,7 @@ async def GET_Domain(request): bucket = getBucketForDomain(domain) log.debug(f"GET_Domain domain: {domain} bucket: {bucket}") - if not bucket and not config.get("bucket_name"): + if not bucket: # no bucket defined, raise 400 msg = "Bucket not provided" log.warn(msg) @@ -1354,10 +1354,6 @@ async def GET_Datasets(request): raise HTTPBadRequest(reason=msg) bucket = getBucketForDomain(domain) - if not bucket: - bucket = config.get("bucket_name") - else: - checkBucketAccess(app, bucket) # verify the domain try: @@ -1448,10 +1444,6 @@ async def GET_Groups(request): raise HTTPBadRequest(reason=msg) bucket = getBucketForDomain(domain) - if not bucket: - bucket = config.get("bucket_name") - else: - checkBucketAccess(app, bucket) # use reload to get authoritative domain json try: @@ -1537,10 +1529,6 @@ async def GET_Datatypes(request): raise HTTPBadRequest(reason=msg) bucket = getBucketForDomain(domain) - if not bucket: - bucket = config.get("bucket_name") - else: - checkBucketAccess(app, bucket) # use reload to get authoritative domain json try: diff --git a/hsds/link_dn.py b/hsds/link_dn.py index f6bf7661..3e2dfbbb 100755 --- a/hsds/link_dn.py +++ b/hsds/link_dn.py @@ -21,7 +21,6 @@ from aiohttp.web_exceptions import HTTPInternalServerError from aiohttp.web import json_response -from .util.arrayUtil import decodeData from .util.idUtil import isValidUuid from .util.linkUtil import validateLinkName from .datanode_lib import get_obj_id, get_metadata_obj, save_metadata_obj @@ -185,7 +184,44 @@ async def POST_Links(request): log.info(f"Link name {title} not found in group: {group_id}") raise HTTPNotFound() link_json = links[title] - link_list.append(link_json) + item = {} + if "class" not in link_json: + log.warn(f"expected to find class key for link: {title}") + continue + link_class = link_json["class"] + item["class"] = link_class + if "created" not in link_json: + log.warn(f"expected to find created time for link: {title}") + link_created = 0 + else: + link_created = link_json["created"] + item["created"] = link_created + if link_class == "H5L_TYPE_HARD": + if "id" not in link_json: + log.warn(f"expected to id for hard linK: {title}") + continue + item["id"] = link_json["id"] + elif link_class == "H5L_TYPE_SOFT": + if "h5path" not in link_json: + log.warn(f"expected to find h5path for soft link: {title}") + continue + item["h5path"] = link_json["h5path"] + elif link_class == "H5L_TYPE_EXTERNAL": + if "h5path" not in link_json: + log.warn(f"expected to find h5path for external link: {title}") + continue + item["h5path"] = link_json["h5path"] + if "h5domain" not in link_json: + log.warn(f"expted to find h5domain for external link: {title}") + continue + item["h5domain"] = link_json["h5domain"] + else: + log.warn(f"unexpected to link class {link_class} for link: {title}") + continue + + item["title"] = title + + link_list.append(item) if not link_list: msg = f"POST link - requested {len(titles)} but none were found" @@ -328,15 +364,6 @@ async def DELETE_Links(request): log.warn(msg) raise HTTPBadRequest(reason=msg) - if "encoding" in params: - encoding = params["encoding"] - if encoding != "base64": - msg = "only base64 
encoding is supported" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - else: - encoding = None - if "separator" in params: separator = params["separator"] else: @@ -358,8 +385,6 @@ async def DELETE_Links(request): raise HTTPBadRequest(reason=msg) titles_param = params["titles"] - if encoding: - titles_param = decodeData(titles_param).decode("utf-8") titles = titles_param.split(separator) diff --git a/hsds/link_sn.py b/hsds/link_sn.py index 3bad1afc..bb2284d9 100755 --- a/hsds/link_sn.py +++ b/hsds/link_sn.py @@ -16,15 +16,16 @@ from aiohttp.web_exceptions import HTTPBadRequest from json import JSONDecodeError -from .util.httpUtil import http_get, http_delete, getHref +from .util.httpUtil import http_get, getHref from .util.httpUtil import jsonResponse from .util.idUtil import isValidUuid, getDataNodeUrl, getCollectionForId from .util.authUtil import getUserPasswordFromRequest, validateUserPassword -from .util.domainUtil import getDomainFromRequest, isValidDomain +from .util.domainUtil import getDomainFromRequest, isValidDomain, verifyRoot from .util.domainUtil import getBucketForDomain from .util.linkUtil import validateLinkName -from .servicenode_lib import validateAction, getLink, putLink -from . import config +from .servicenode_lib import getDomainJson, validateAction +from .servicenode_lib import getLink, putLink, getLinks, deleteLinks +from .domain_crawl import DomainCrawler from . import hsds_logger as log @@ -72,8 +73,6 @@ async def GET_Links(request): log.warn(msg) raise HTTPBadRequest(reason=msg) bucket = getBucketForDomain(domain) - if not bucket: - bucket = config.get("bucket_name") await validateAction(app, domain, group_id, username, "read") @@ -149,8 +148,6 @@ async def GET_Link(request): log.warn(msg) raise HTTPBadRequest(reason=msg) bucket = getBucketForDomain(domain) - if not bucket: - bucket = config.get("bucket_name") await validateAction(app, domain, group_id, username, "read") @@ -239,11 +236,8 @@ async def PUT_Link(request): log.warn(msg) raise HTTPBadRequest(reason=msg) bucket = getBucketForDomain(domain) - if not bucket: - bucket = config.get("bucket_name") await validateAction(app, domain, group_id, username, "create") - # putLink will validate these arguments kwargs = {"bucket": bucket} kwargs["tgt_id"] = body.get("id") @@ -262,8 +256,206 @@ async def PUT_Link(request): return resp +async def DELETE_Links(request): + """HTTP method to delete multiple link""" + log.request(request) + app = request.app + params = request.rel_url.query + group_id = request.match_info.get("id") + if not group_id: + msg = "Missing group id" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + if not isValidUuid(group_id, obj_class="Group"): + msg = f"Invalid group id: {group_id}" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + + if "titles" not in params: + msg = "expected titles params" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + + titles_param = params["titles"] + if "separator" in params: + separator = params["separator"] + else: + separator = "/" + titles = titles_param.split(separator) + + for title in titles: + validateLinkName(title) + + username, pswd = getUserPasswordFromRequest(request) + await validateUserPassword(app, username, pswd) + + domain = getDomainFromRequest(request) + if not isValidDomain(domain): + msg = f"domain: {domain}" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + + bucket = getBucketForDomain(domain) + + await validateAction(app, domain, group_id, username, "delete") + + await deleteLinks(app, group_id, titles=titles, 
bucket=bucket) + + rsp_json = {} + resp = await jsonResponse(request, rsp_json) + log.response(request, resp=resp) + return resp + + +async def POST_Links(request): + """HTTP method to get multiple links """ + log.request(request) + app = request.app + params = request.rel_url.query + log.info("POST_Links") + req_id = request.match_info.get("id") + + if params.get("follow_links"): + follow_links = True + else: + follow_links = False + + if not request.has_body: + msg = "POST Links with no body" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + + try: + body = await request.json() + except JSONDecodeError: + msg = "Unable to load JSON body" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + + if "titles" in body: + titles = body["titles"] + if not isinstance(titles, list): + msg = f"expected list for titles but got: {type(titles)}" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + else: + titles = None + + if "group_ids" in body: + group_ids = body["group_ids"] + else: + group_ids = None + + if titles is None and group_ids is None: + msg = "expected body to contain one of titles, group_ids keys" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + + # construct an item list from titles and group_ids + items = {} + if group_ids is None: + if not req_id: + msg = "no object id in request" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + items[req_id] = titles + elif isinstance(group_ids, list): + if titles is None: + msg = "no titles - will return all links for each object" + log.debug(msg) + for group_id in group_ids: + items[group_id] = None + elif isinstance(group_ids, dict): + if titles is not None: + msg = "titles must not be provided if obj_ids is a dict" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + for group_id in group_ids: + names_for_id = group_ids[group_id] + if not isinstance(names_for_id, list): + msg = "expected list of titles" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + items[group_id] = names_for_id + + log.debug(f"POST Links items: {items}") + + # do a check that everything is as it should with the item list + for group_id in items: + if not isValidUuid(group_id, obj_class="Group"): + msg = f"Invalid group id: {group_id}" + log.warn(msg) + + titles = items[group_id] + + if titles is None: + log.debug(f"getting all links for {group_id}") + elif isinstance(titles, list): + for title in titles: + validateLinkName(title) # raises HTTPBadRequest if invalid + else: + msg = f"expected list for titles but got: {type(titles)}" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + + username, pswd = getUserPasswordFromRequest(request) + if username is None and app["allow_noauth"]: + username = "default" + else: + await validateUserPassword(app, username, pswd) + + domain = getDomainFromRequest(request) + if not isValidDomain(domain): + msg = f"Invalid domain value: {domain}" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + bucket = getBucketForDomain(domain) + + # get domain JSON + domain_json = await getDomainJson(app, domain) + verifyRoot(domain_json) + + # TBD - verify that the obj_id belongs to the given domain + await validateAction(app, domain, req_id, username, "read") + + resp_json = {} + + if len(items) == 0: + msg = "no group ids specified for POST Links" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + elif len(items) == 1 and not follow_links: + # just make a request to the datanode + group_id = list(items.keys())[0] + titles = items[group_id] + links = await getLinks(app, group_id, titles=titles, bucket=bucket) + + resp_json["links"] = 
links + else: + # get multi obj + # don't follow links for the groups we visit! + crawler_params = {"follow_links": follow_links, "bucket": bucket} + # mixin params + + kwargs = {"action": "get_link", "raise_error": True, "params": crawler_params} + crawler = DomainCrawler(app, items, **kwargs) + # will raise exception on NotFound, etc. + await crawler.crawl() + + msg = f"DomainCrawler returned: {len(crawler._obj_dict)} objects" + log.info(msg) + links = crawler._obj_dict + + log.debug(f"got {len(links)} links") + resp_json["links"] = links + + resp = await jsonResponse(request, resp_json) + log.response(request, resp=resp) + return resp + + async def DELETE_Link(request): - """HTTP method to delete a link""" + """HTTP method to delete one or more links """ log.request(request) app = request.app @@ -289,18 +481,12 @@ async def DELETE_Link(request): raise HTTPBadRequest(reason=msg) bucket = getBucketForDomain(domain) - if not bucket: - bucket = config.get("bucket_name") await validateAction(app, domain, group_id, username, "delete") - req = getDataNodeUrl(app, group_id) - req += "/groups/" + group_id + "/links" - - params = {"bucket": bucket, "titles": link_title} - - rsp_json = await http_delete(app, req, params=params) + await deleteLinks(app, group_id, titles=[link_title, ], bucket=bucket) + rsp_json = {} resp = await jsonResponse(request, rsp_json) log.response(request, resp=resp) return resp diff --git a/hsds/servicenode.py b/hsds/servicenode.py index 22b9822a..0f99f1b0 100755 --- a/hsds/servicenode.py +++ b/hsds/servicenode.py @@ -29,7 +29,7 @@ from .domain_sn import GET_Datasets, GET_Groups, GET_Datatypes from .domain_sn import GET_ACL, GET_ACLs, PUT_ACL from .group_sn import GET_Group, POST_Group, DELETE_Group -from .link_sn import GET_Links, GET_Link, PUT_Link, DELETE_Link +from .link_sn import GET_Links, POST_Links, GET_Link, PUT_Link, DELETE_Link, DELETE_Links from .attr_sn import GET_Attributes, GET_Attribute, PUT_Attribute, PUT_Attributes, DELETE_Attribute from .attr_sn import DELETE_Attributes, GET_AttributeValue, PUT_AttributeValue, POST_Attributes from .ctype_sn import GET_Datatype, POST_Datatype, DELETE_Datatype @@ -82,6 +82,8 @@ async def init(): path = "/groups/{id}/links" app.router.add_route("GET", path, GET_Links) + app.router.add_route("POST", path, POST_Links) + app.router.add_route("DELETE", path, DELETE_Links) path = "/groups/{id}/links/{title}" app.router.add_route("GET", path, GET_Link) diff --git a/hsds/servicenode_lib.py b/hsds/servicenode_lib.py index 2c75f5a9..875a7e33 100644 --- a/hsds/servicenode_lib.py +++ b/hsds/servicenode_lib.py @@ -342,25 +342,47 @@ async def getDsetJson(app, dset_id, return dset_json -async def getLink(app, group_id, title, bucket=None): - """ Get the link json for the given title """ +async def getLinks(app, group_id, titles=None, bucket=None): + """ Get the link jsons for the given titles """ req = getDataNodeUrl(app, group_id) req += "/groups/" + group_id + "/links" - log.debug(f"getLink for {group_id} - title: {title}") params = {"bucket": bucket} + log.debug(f"getLinks {group_id}") + if titles: + # do a post request with the given title list + log.debug(f"getLinks for {group_id} - {len(titles)} titles") + data = {"titles": titles} + post_rsp = await http_post(app, req, data=data, params=params) + log.debug(f"got link_json: {post_rsp}") + if "links" not in post_rsp: + log.error("unexpected response from post links") + raise HTTPInternalServerError() + links = post_rsp["links"] + else: + # do a get for all links + 
log.debug(f"getLinks, all links for {group_id}") + get_rsp = await http_get(app, req, params=params) + log.debug(f"got link_json: {get_rsp}") + if "links" not in get_rsp: + log.error("unexpected response from get links") + raise HTTPInternalServerError() + links = get_rsp["links"] + + return links + + +async def getLink(app, group_id, title, bucket=None): + """ Get the link json for the given title """ + + titles = [title, ] + links = await getLinks(app, group_id, titles=titles, bucket=bucket) - data = {"titles": [title, ]} - post_rsp = await http_post(app, req, data=data, params=params) - log.debug(f"got link_json: {post_rsp}") - if "links" not in post_rsp: - log.error("unexpected response from post links") - raise HTTPInternalServerError() - links = post_rsp["links"] if len(links) != 1: log.error(f"expected 1 link but got: {len(links)}") raise HTTPInternalServerError() link_json = links[0] + return link_json @@ -452,6 +474,26 @@ async def putExternalLink(app, group_id, title, h5path=None, h5domain=None, buck return status +async def deleteLinks(app, group_id, titles=None, separator="/", bucket=None): + """ delete the requested set of attributes from the given object """ + + if titles is None or len(titles) == 0: + msg = "provide a list of link names for deletion" + log.debug(msg) + raise HTTPBadRequest(reason=msg) + + node_url = getDataNodeUrl(app, group_id) + req = f"{node_url}/groups/{group_id}/links" + log.debug(f"deleteLinks: {req}") + params = {"separator": separator, "bucket": bucket} + + # stringify the list of link_names + titles_param = separator.join(titles) + params["titles"] = titles_param + log.debug(f"using params: {params}") + await http_delete(app, req, params=params) + + async def getObjectIdByPath(app, obj_id, h5path, bucket=None, refresh=False, domain=None, follow_soft_links=False, follow_external_links=False): """Find the object at the provided h5path location. diff --git a/hsds/util/domainUtil.py b/hsds/util/domainUtil.py index 3fa2e7f7..140ec01a 100644 --- a/hsds/util/domainUtil.py +++ b/hsds/util/domainUtil.py @@ -50,6 +50,22 @@ def isIPAddress(s): return True +def getBucketForDomain(domain): + """get the bucket for the domain or None + if no bucket is given + """ + if not domain: + return None + if domain[0] == "/": + # no bucket specified + return None + index = domain.find("/") + if index < 0: + # invalid domain? + return None + return domain[:index] + + def getParentDomain(domain): """Get parent domain of given domain. E.g. getParentDomain("www.hdfgroup.org") returns "hdfgroup.org" @@ -263,22 +279,6 @@ def getPathForDomain(domain): return domain[(index):] -def getBucketForDomain(domain): - """get the bucket for the domain or None - if no bucket is given - """ - if not domain: - return None - if domain[0] == "/": - # no bucket specified - return None - index = domain.find("/") - if index < 0: - # invalid domain? 
- return None - return domain[:index] - - def verifyRoot(domain_json): """Throw bad request if we are expecting a domain, but got a folder instead diff --git a/tests/integ/attr_test.py b/tests/integ/attr_test.py index 6e203981..78c7a099 100644 --- a/tests/integ/attr_test.py +++ b/tests/integ/attr_test.py @@ -1671,7 +1671,6 @@ def testNonURLEncodableAttributeName(self): self.assertTrue("name" in rsp_attr) self.assertEqual(rsp_attr["name"], attr_name) - self.assertTrue("href" in rsp_attr) self.assertTrue("created" in rsp_attr) self.assertTrue("type" in rsp_attr) self.assertEqual(rsp_attr["type"], expected_type) @@ -1684,10 +1683,8 @@ def testNonURLEncodableAttributeName(self): rsp = self.session.delete(bad_req, headers=headers) self.assertEqual(rsp.status_code, 404) # not found - # send attribute name as an encoded query param - attr_names_param = base64.b64encode(attr_name.encode("utf8")).decode("ascii") # specify a separator since our attribute name has the default slash - params = {"attr_names": attr_names_param, "encoding": "base64", "separator": "!"} + params = {"attr_names": attr_names, "separator": "!"} rsp = self.session.delete(req, params=params, headers=headers) self.assertEqual(rsp.status_code, 200) @@ -1702,7 +1699,7 @@ def testNonURLEncodableAttributeName(self): def testPostAttributeSingle(self): domain = helper.getTestDomain("tall.h5") - print("testGetDomain", domain) + print("testPostAttributeSingle", domain) headers = helper.getRequestHeaders(domain=domain) headers["Origin"] = "https://www.hdfgroup.org" # test CORS headers_bin_rsp = helper.getRequestHeaders(domain=domain) @@ -1731,7 +1728,6 @@ def testPostAttributeSingle(self): self.assertEqual(rsp.status_code, 200) rspJson = json.loads(rsp.text) - self.assertTrue("hrefs" in rspJson) self.assertTrue("attributes" in rspJson) attributes = rspJson["attributes"] self.assertTrue(isinstance(attributes, list)) @@ -1749,7 +1745,6 @@ def testPostAttributeSingle(self): shapeJson = attrJson["shape"] self.assertEqual(shapeJson["class"], "H5S_SIMPLE") self.assertTrue("created" in attrJson) - self.assertTrue("href" in attrJson) self.assertTrue("value" not in attrJson) # test with returning all attribute values @@ -1758,7 +1753,6 @@ def testPostAttributeSingle(self): self.assertEqual(rsp.status_code, 200) rspJson = json.loads(rsp.text) - self.assertTrue("hrefs" in rspJson) self.assertTrue("attributes" in rspJson) attributes = rspJson["attributes"] self.assertTrue(isinstance(attributes, list)) @@ -1776,14 +1770,13 @@ def testPostAttributeSingle(self): shapeJson = attrJson["shape"] self.assertEqual(shapeJson["class"], "H5S_SIMPLE") self.assertTrue("created" in attrJson) - self.assertTrue("href" in attrJson) self.assertTrue("value" in attrJson) self.assertEqual(attrJson["value"], expected_values[i]) def testPostAttributeMultiple(self): """ Get attributes for multiple objs """ domain = helper.getTestDomain("tall.h5") - print("testGetDomain", domain) + print("testPostAttributeMultiple", domain) headers = helper.getRequestHeaders(domain=domain) headers["Origin"] = "https://www.hdfgroup.org" # test CORS headers_bin_rsp = helper.getRequestHeaders(domain=domain) @@ -1827,7 +1820,6 @@ def testPostAttributeMultiple(self): self.assertEqual(rsp.status_code, 200) rspJson = json.loads(rsp.text) - self.assertTrue("hrefs" in rspJson) self.assertTrue("attributes" in rspJson) attributes = rspJson["attributes"] self.assertTrue(isinstance(attributes, dict)) @@ -1853,7 +1845,6 @@ def testPostAttributeMultiple(self): shapeJson = attrJson["shape"] 
self.assertEqual(shapeJson["class"], "H5S_SIMPLE") self.assertTrue("created" in attrJson) - self.assertTrue("href" in attrJson) self.assertTrue("value" not in attrJson) # test with returning attribute values @@ -1862,7 +1853,6 @@ def testPostAttributeMultiple(self): self.assertEqual(rsp.status_code, 200) rspJson = json.loads(rsp.text) - self.assertTrue("hrefs" in rspJson) self.assertTrue("attributes" in rspJson) attributes = rspJson["attributes"] self.assertTrue(isinstance(attributes, dict)) @@ -1889,7 +1879,6 @@ def testPostAttributeMultiple(self): shapeJson = attrJson["shape"] self.assertEqual(shapeJson["class"], "H5S_SIMPLE") self.assertTrue("created" in attrJson) - self.assertTrue("href" in attrJson) self.assertTrue("value" in attrJson) self.assertEqual(attrJson["value"], expected_values[i]) @@ -1903,7 +1892,6 @@ def testPostAttributeMultiple(self): self.assertEqual(rsp.status_code, 200) rspJson = json.loads(rsp.text) - self.assertTrue("hrefs" in rspJson) self.assertTrue("attributes" in rspJson) attributes = rspJson["attributes"] self.assertTrue(isinstance(attributes, dict)) @@ -1934,7 +1922,6 @@ def testPostAttributeMultiple(self): rsp = self.session.post(req, data=json.dumps(data), headers=headers) self.assertEqual(rsp.status_code, 200) rspJson = json.loads(rsp.text) - self.assertTrue("hrefs" in rspJson) self.assertTrue("attributes" in rspJson) attributes = rspJson["attributes"] self.assertEqual(len(attributes), 2) diff --git a/tests/integ/helper.py b/tests/integ/helper.py index d5d5412d..dd5d125e 100644 --- a/tests/integ/helper.py +++ b/tests/integ/helper.py @@ -253,3 +253,22 @@ def getHDF5JSON(filename): with open(filename) as f: hdf5_json = json.load(f) return hdf5_json + + +def getLink(domain, grp_id, title): + headers = getRequestHeaders(domain=domain) + session = getSession() + req = getEndpoint() + "/groups/" + grp_id + "/links/" + title + rsp = session.get(req, headers=headers) + if rsp.status_code in (404, 410): + # not found or deleted + return None + elif rsp.status_code != 200: + raise ValueError(f"getLink exception: {rsp.status_code}") + + rspJson = json.loads(rsp.text) + if "link" not in rspJson: + raise KeyError(f"expected link key in {rspJson}") + link_json = rspJson["link"] + + return link_json diff --git a/tests/integ/link_test.py b/tests/integ/link_test.py index 3728561a..9df9a319 100755 --- a/tests/integ/link_test.py +++ b/tests/integ/link_test.py @@ -448,16 +448,14 @@ def testGet(self): req = helper.getEndpoint() + "/" rsp = self.session.get(req, headers=headers) if rsp.status_code != 200: - print( - "WARNING: Failed to get domain: {}. Is test data setup?".format(domain) - ) + print(f"WARNING: Failed to get domain: {domain}. 
Is test data setup?") return # abort rest of test rspJson = json.loads(rsp.text) root_uuid = rspJson["root"] self.assertTrue(root_uuid.startswith("g-")) - # get the "/g1" group + # get the "/g1/g1.2" group g1_2_uuid = helper.getUUIDByPath(domain, "/g1/g1.2", session=self.session) now = time.time() @@ -505,9 +503,7 @@ def testGet(self): self.assertTrue(g1_2_1_uuid is not None) self.assertTrue(extlink_file is not None) - expected_uuid = helper.getUUIDByPath( - domain, "/g1/g1.2/g1.2.1", session=self.session - ) + expected_uuid = helper.getUUIDByPath(domain, "/g1/g1.2/g1.2.1", session=self.session) self.assertEqual(expected_uuid, g1_2_1_uuid) # get link by title @@ -865,6 +861,197 @@ def testRootH5Path(self): self.assertTrue(k in cprops) self.assertEqual(cprops[k], creation_props[k]) + def testPostLinkSingle(self): + domain = helper.getTestDomain("tall.h5") + print("testPostLinkSingle", domain) + headers = helper.getRequestHeaders(domain=domain) + headers["Origin"] = "https://www.hdfgroup.org" # test CORS + + # verify domain exists + req = helper.getEndpoint() + "/" + rsp = self.session.get(req, headers=headers) + if rsp.status_code != 200: + msg = f"WARNING: Failed to get domain: {domain}. Is test data setup?" + print(msg) + return # abort rest of test + + domainJson = json.loads(rsp.text) + root_id = domainJson["root"] + helper.validateId(root_id) + + # get the "/g1/g1.2" group + g1_2_uuid = helper.getUUIDByPath(domain, "/g1/g1.2", session=self.session) + + now = time.time() + + # get link "extlink" and "g1.2.1" for /g1/g1.2: + titles = ["extlink", "g1.2.1"] + payload = {"titles": titles} + req = helper.getEndpoint() + "/groups/" + g1_2_uuid + "/links" + rsp = self.session.post(req, data=json.dumps(payload), headers=headers) + self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + self.assertTrue("links" in rspJson) + links = rspJson["links"] + self.assertEqual(len(links), 2) + g1_2_1_uuid = None + extlink_file = None + for link in links: + self.assertTrue("class" in link) + link_class = link["class"] + if link_class == "H5L_TYPE_HARD": + for name in ( + "created", + "class", + "id", + "title", + ): + self.assertTrue(name in link) + g1_2_1_uuid = link["id"] + self.assertTrue(g1_2_1_uuid.startswith("g-")) + self.assertEqual(link["title"], "g1.2.1") + self.assertTrue(link["created"] < now - 10) + else: + self.assertEqual(link_class, "H5L_TYPE_EXTERNAL") + for name in ("created", "class", "h5domain", "h5path", "title"): + self.assertTrue(name in link) + self.assertEqual(link["title"], "extlink") + extlink_file = link["h5domain"] + self.assertEqual(extlink_file, "somefile") + self.assertEqual(link["h5path"], "somepath") + self.assertTrue(link["created"] < now - 10) + + self.assertTrue(g1_2_1_uuid is not None) + self.assertTrue(extlink_file is not None) + expected_uuid = helper.getUUIDByPath(domain, "/g1/g1.2/g1.2.1", session=self.session) + self.assertEqual(expected_uuid, g1_2_1_uuid) + + def testPostLinkMultiple(self): + domain = helper.getTestDomain("tall.h5") + print("testPostLinkSingle", domain) + headers = helper.getRequestHeaders(domain=domain) + headers["Origin"] = "https://www.hdfgroup.org" # test CORS + + # verify domain exists + req = helper.getEndpoint() + "/" + rsp = self.session.get(req, headers=headers) + if rsp.status_code != 200: + msg = f"WARNING: Failed to get domain: {domain}. Is test data setup?" 
+ print(msg) + return # abort rest of test + + domainJson = json.loads(rsp.text) + root_id = domainJson["root"] + helper.validateId(root_id) + + # get the "/g1/g1.2" group + h5paths = ["/g1", "/g2", "/g1/g1.1", "/g1/g1.2", "/g2", "/g1/g1.2/g1.2.1"] + grp_map = {} + g1_id = None + g2_id = None + for h5path in h5paths: + grp_id = helper.getUUIDByPath(domain, h5path, session=self.session) + grp_map[grp_id] = h5path + if h5path == "/g1": + g1_id = grp_id # save + elif h5path == "/g2": + g2_id = grp_id + + # get all links for the given set of group ids + grp_ids = list(grp_map.keys()) + payload = {"group_ids": grp_ids} + req = helper.getEndpoint() + "/groups/" + root_id + "/links" + rsp = self.session.post(req, data=json.dumps(payload), headers=headers) + self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + self.assertTrue("links" in rspJson) + obj_links = rspJson["links"] + self.assertTrue(len(obj_links), len(grp_ids)) + for grp_id in obj_links: + self.assertTrue(grp_id in grp_map) + h5path = grp_map[grp_id] + if h5path == "/g1/g1.2/g1.2.1": + expected_count = 1 + else: + expected_count = 2 # all the rest have two links + links = obj_links[grp_id] + self.assertEqual(len(links), expected_count) + for link in links: + title = link["title"] + expected = helper.getLink(domain, grp_id, title) + self.assertEqual(link["class"], expected["class"]) + link_class = link["class"] + if link_class == "H5L_TYPE_HARD": + self.assertEqual(link["id"], expected["id"]) + else: + # soft or external link + self.assertEqual(link["h5path"], expected["h5path"]) + if link_class == "H5L_TYPE_EXTERNAL": + self.assertEqual(link["h5domain"], expected["h5domain"]) + + # get just the request links for each group + link_map = {g1_id: ["g1.1", "g1.2"], g2_id: ["dset2.2", ]} + payload = {"group_ids": link_map} + rsp = self.session.post(req, data=json.dumps(payload), headers=headers) + self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + self.assertTrue("links" in rspJson) + obj_links = rspJson["links"] + self.assertEqual(len(obj_links), 2) + self.assertTrue(g1_id in obj_links) + g1_links = obj_links[g1_id] + self.assertEqual(len(g1_links), 2) + for link in g1_links: + self.assertTrue("class" in link) + self.assertEqual(link["class"], "H5L_TYPE_HARD") + self.assertTrue("title" in link) + self.assertTrue(link["title"] in ("g1.1", "g1.2")) + self.assertTrue("id" in link) + g2_links = obj_links[g2_id] + self.assertEqual(len(g2_links), 1) # two links in this group but just asked for dset2.2 + link = g2_links[0] + self.assertEqual(link["class"], "H5L_TYPE_HARD") + + # get all links for the domain by providing the root_id with the follow_links param + params = {"follow_links": 1} + grp_ids = [root_id, ] + payload = {"group_ids": grp_ids} + rsp = self.session.post(req, data=json.dumps(payload), params=params, headers=headers) + self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + self.assertTrue("links" in rspJson) + obj_links = rspJson["links"] + self.assertEqual(len(obj_links), 6) + expected_group_links = ("g1", "g2", "g1.1", "g1.2", "g1.2.1", ) + expected_dset_links = ("dset1.2", "dset2.2", "dset1.1.1", "dset1.1.2", "dset2.1", ) + expected_soft_links = ("slink",) + expected_external_links = ("extlink", ) + + # listify the returned links + links = [] + for obj_id in obj_links: + links.extend(obj_links[obj_id]) + self.assertEqual(len(links), 11) + for link in links: + self.assertTrue("title" in link) + title = link["title"] + self.assertTrue("class" in link) + link_class = 
link["class"] + if link_class == "H5L_TYPE_HARD": + link_id = link["id"] + if link_id.startswith("g-"): + self.assertTrue(title in expected_group_links) + elif link_id.startswith("d-"): + self.assertTrue(title in expected_dset_links) + else: + self.assertTrue(False) # unexpected + elif link_class == "H5L_TYPE_SOFT": + self.assertTrue(title in expected_soft_links) + elif link_class == "H5L_TYPE_EXTERNAL": + self.assertTrue(title in expected_external_links) + else: + self.assertTrue(False) # unexpected + if __name__ == "__main__": # setup test files From db689888cbea072d8a416482f20430cee236eb3d Mon Sep 17 00:00:00 2001 From: jreadey Date: Tue, 9 Jan 2024 02:25:25 +0000 Subject: [PATCH 03/18] updates based on PR feedback --- hsds/link_dn.py | 6 +++--- hsds/link_sn.py | 7 +++---- hsds/servicenode_lib.py | 12 ++++++------ tests/integ/link_test.py | 2 +- 4 files changed, 13 insertions(+), 14 deletions(-) diff --git a/hsds/link_dn.py b/hsds/link_dn.py index 3e2dfbbb..5a8651c8 100755 --- a/hsds/link_dn.py +++ b/hsds/link_dn.py @@ -137,7 +137,7 @@ async def GET_Links(request): async def POST_Links(request): - """HTTP POST method to return JSON for a link a given set of links """ + """HTTP POST method to return JSON for a link or a given set of links """ log.request(request) app = request.app params = request.rel_url.query @@ -164,7 +164,7 @@ async def POST_Links(request): bucket = params["bucket"] if not bucket: - msg = "GET_Links - no bucket param" + msg = "POST_Links - no bucket param" log.warn(msg) raise HTTPBadRequest(reason=msg) @@ -224,7 +224,7 @@ async def POST_Links(request): link_list.append(item) if not link_list: - msg = f"POST link - requested {len(titles)} but none were found" + msg = f"POST_links - requested {len(titles)} but none were found" log.warn(msg) raise HTTPNotFound() diff --git a/hsds/link_sn.py b/hsds/link_sn.py index bb2284d9..3ccad7fa 100755 --- a/hsds/link_sn.py +++ b/hsds/link_sn.py @@ -257,7 +257,7 @@ async def PUT_Link(request): async def DELETE_Links(request): - """HTTP method to delete multiple link""" + """HTTP method to delete multiple links """ log.request(request) app = request.app params = request.rel_url.query @@ -432,10 +432,9 @@ async def POST_Links(request): resp_json["links"] = links else: - # get multi obj - # don't follow links for the groups we visit! + # Use DomainCrawler to fetch links from multiple object. + # set the follow_links and bucket params crawler_params = {"follow_links": follow_links, "bucket": bucket} - # mixin params kwargs = {"action": "get_link", "raise_error": True, "params": crawler_params} crawler = DomainCrawler(app, items, **kwargs) diff --git a/hsds/servicenode_lib.py b/hsds/servicenode_lib.py index 875a7e33..be009a90 100644 --- a/hsds/servicenode_lib.py +++ b/hsds/servicenode_lib.py @@ -452,30 +452,30 @@ async def putLink(app, group_id, title, tgt_id=None, h5path=None, h5domain=None, async def putHardLink(app, group_id, title, tgt_id=None, bucket=None): """ create a new hard link. Return 201 if this is a new link, - or 201 if it's a duplicate of an existing link """ + or 200 if it's a duplicate of an existing link """ status = await putLink(app, group_id, title, tgt_id=tgt_id, bucket=bucket) return status async def putSoftLink(app, group_id, title, h5path=None, bucket=None): - """ create a new hard link. Return 201 if this is a new link, - or 201 if it's a duplicate of an existing link """ + """ create a new soft link. 
Return 201 if this is a new link, + or 200 if it's a duplicate of an existing link """ status = await putLink(app, group_id, title, h5path=h5path, bucket=bucket) return status async def putExternalLink(app, group_id, title, h5path=None, h5domain=None, bucket=None): - """ create a new hard link. Return 201 if this is a new link, - or 201 if it's a duplicate of an existing link """ + """ create a new external link. Return 201 if this is a new link, + or 200 if it's a duplicate of an existing link """ status = await putLink(app, group_id, title, h5path=h5path, h5domain=h5domain, bucket=bucket) return status async def deleteLinks(app, group_id, titles=None, separator="/", bucket=None): - """ delete the requested set of attributes from the given object """ + """ delete the requested set of links from the given object """ if titles is None or len(titles) == 0: msg = "provide a list of link names for deletion" diff --git a/tests/integ/link_test.py b/tests/integ/link_test.py index 9df9a319..1633b95c 100755 --- a/tests/integ/link_test.py +++ b/tests/integ/link_test.py @@ -1024,7 +1024,7 @@ def testPostLinkMultiple(self): self.assertEqual(len(obj_links), 6) expected_group_links = ("g1", "g2", "g1.1", "g1.2", "g1.2.1", ) expected_dset_links = ("dset1.2", "dset2.2", "dset1.1.1", "dset1.1.2", "dset2.1", ) - expected_soft_links = ("slink",) + expected_soft_links = ("slink", ) expected_external_links = ("extlink", ) # listify the returned links From f9df229cfe6dc658361c5e6bd8c90369401ea7d8 Mon Sep 17 00:00:00 2001 From: jreadey Date: Fri, 12 Jan 2024 15:29:14 +0000 Subject: [PATCH 04/18] multiop put for links --- hsds/attr_dn.py | 94 +++++++--- hsds/datanode.py | 2 + hsds/domain_crawl.py | 35 +++- hsds/domain_sn.py | 125 +++++++++++++ hsds/link_dn.py | 131 ++++++++------ hsds/link_sn.py | 204 ++++++++++++++++++++- hsds/servicenode.py | 11 +- hsds/servicenode_lib.py | 81 ++++++--- hsds/util/attrUtil.py | 29 +++ hsds/util/linkUtil.py | 102 ++++++++++- tests/integ/attr_test.py | 37 +++- tests/integ/domain_test.py | 29 +++ tests/integ/link_test.py | 359 +++++++++++++++++++++++++++++++++++++ 13 files changed, 1122 insertions(+), 117 deletions(-) diff --git a/hsds/attr_dn.py b/hsds/attr_dn.py index d80ca322..8dd44da3 100755 --- a/hsds/attr_dn.py +++ b/hsds/attr_dn.py @@ -15,11 +15,11 @@ import time from bisect import bisect_left -from aiohttp.web_exceptions import HTTPBadRequest, HTTPConflict, HTTPNotFound +from aiohttp.web_exceptions import HTTPBadRequest, HTTPConflict, HTTPNotFound, HTTPGone from aiohttp.web_exceptions import HTTPInternalServerError from aiohttp.web import json_response -from .util.attrUtil import validateAttributeName +from .util.attrUtil import validateAttributeName, isEqualAttr from .util.hdf5dtype import getItemSize, createDataType from .util.dsetUtil import getShapeDims from .util.arrayUtil import arrayToBytes, jsonToArray, decodeData @@ -270,21 +270,31 @@ async def POST_Attributes(request): if encoding: kwargs["encoding"] = encoding + missing_names = set() + for attr_name in titles: if attr_name not in attr_dict: + missing_names.add(attr_name) continue des_attr = _getAttribute(attr_name, obj_json, **kwargs) attr_list.append(des_attr) resp_json = {"attributes": attr_list} - if not attr_list: - msg = f"POST attributes - requested {len(titles)} but none were found" - log.warn(msg) - raise HTTPNotFound() - if len(attr_list) != len(titles): + + if missing_names: msg = f"POST attributes - requested {len(titles)} attributes but only " msg += f"{len(attr_list)} were found" log.warn(msg) + # 
one or more attributes not found, check to see if any + # had been previously deleted + deleted_attrs = app["deleted_attrs"] + if obj_id in deleted_attrs: + attr_delete_set = deleted_attrs[obj_id] + for attr_name in missing_names: + if attr_name in attr_delete_set: + log.info(f"attribute: {attr_name} was previously deleted, returning 410") + raise HTTPGone() + log.info("one or mores attributes not found, returning 404") raise HTTPNotFound() log.debug(f"POST attributes returning: {resp_json}") resp = json_response(resp_json) @@ -392,18 +402,28 @@ async def PUT_Attributes(request): attributes = obj_json["attributes"] - # check for conflicts, also set timestamp create_time = time.time() - new_attribute = False # set this if we have any new attributes + # check for conflicts + new_attributes = set() # attribute names that are new or replacements for attr_name in items: attribute = items[attr_name] if attr_name in attributes: log.debug(f"attribute {attr_name} exists") - if replace: + old_item = attributes[attr_name] + try: + is_dup = isEqualAttr(attribute, old_item) + except TypeError: + log.error(f"isEqualAttr TypeError - new: {attribute} old: {old_item}") + raise HTTPInternalServerError() + if is_dup: + log.debug(f"duplicate attribute: {attr_name}") + continue + elif replace: # don't change the create timestamp log.debug(f"attribute {attr_name} exists, but will be updated") old_item = attributes[attr_name] attribute["created"] = old_item["created"] + new_attributes.add(attr_name) else: # Attribute already exists, return a 409 msg = f"Attempt to overwrite attribute: {attr_name} " @@ -414,18 +434,30 @@ async def PUT_Attributes(request): # set the timestamp log.debug(f"new attribute {attr_name}") attribute["created"] = create_time - new_attribute = True + new_attributes.add(attr_name) - # ok - all set, create the attributes - for attr_name in items: + # if any of the attribute names was previously deleted, + # remove from the deleted set + deleted_attrs = app["deleted_attrs"] + if obj_id in deleted_attrs: + attr_delete_set = deleted_attrs[obj_id] + else: + attr_delete_set = set() + + # ok - all set, add the attributes + for attr_name in new_attributes: log.debug(f"adding attribute {attr_name}") attr_json = items[attr_name] attributes[attr_name] = attr_json - - # write back to S3, save to metadata cache - await save_metadata_obj(app, obj_id, obj_json, bucket=bucket) - - if new_attribute: + if attr_name in attr_delete_set: + attr_delete_set.remove(attr_name) + + if new_attributes: + # update the obj lastModified + now = time.time() + obj_json["lastModified"] = now + # write back to S3, save to metadata cache + await save_metadata_obj(app, obj_id, obj_json, bucket=bucket) status = 201 else: status = 200 @@ -490,15 +522,35 @@ async def DELETE_Attributes(request): # return a list of attributes based on sorted dictionary keys attributes = obj_json["attributes"] + # add attribute names to deleted set, so we can return a 410 if they + # are requested in the future + deleted_attrs = app["deleted_attrs"] + if obj_id in deleted_attrs: + attr_delete_set = deleted_attrs[obj_id] + else: + attr_delete_set = set() + deleted_attrs[obj_id] = attr_delete_set + + save_obj = False # set to True if anything is actually modified for attr_name in attr_names: + if attr_name in attr_delete_set: + log.warn(f"attribute {attr_name} already deleted") + continue + if attr_name not in attributes: - msg = f"Attribute {attr_name} not found in objid: {obj_id}" + msg = f"Attribute {attr_name} not found in obj id: {obj_id}" 
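# A minimal sketch, not part of the patch, of the deleted-name bookkeeping this hunk
# introduces: each object id maps to the set of names deleted from it, so a later
# request for a missing name can distinguish "never existed" (404) from "previously
# deleted" (410). The ids and attribute values below are hypothetical.

deleted_names = {}   # obj_id -> set of deleted attribute (or link) names


def delete_name(obj_id, name, current):
    """Remove name from the object's current dict and remember the deletion."""
    delete_set = deleted_names.setdefault(obj_id, set())
    if name in current:
        del current[name]
        delete_set.add(name)


def lookup_status(obj_id, name, current):
    """Return the HTTP status a fetch of this name should produce."""
    if name in current:
        return 200
    if name in deleted_names.get(obj_id, set()):
        return 410   # previously deleted
    return 404       # never existed


attrs = {"a1": {"type": "H5T_STD_I32LE", "value": 42}}
delete_name("g-123", "a1", attrs)
assert lookup_status("g-123", "a1", attrs) == 410
assert lookup_status("g-123", "a2", attrs) == 404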
log.warn(msg) raise HTTPNotFound() del attributes[attr_name] - - await save_metadata_obj(app, obj_id, obj_json, bucket=bucket) + attr_delete_set.add(attr_name) + save_obj = True + + if save_obj: + # update the object lastModified + now = time.time() + obj_json["lastModified"] = now + await save_metadata_obj(app, obj_id, obj_json, bucket=bucket) resp_json = {} resp = json_response(resp_json) diff --git a/hsds/datanode.py b/hsds/datanode.py index 7e8c9a9d..50bb0307 100644 --- a/hsds/datanode.py +++ b/hsds/datanode.py @@ -299,6 +299,8 @@ def create_app(): } app["chunk_cache"] = LruCache(**kwargs) app["deleted_ids"] = set() + app["deleted_attrs"] = {} # map of objectid to set of deleted attribute names + app["deleted_links"] = {} # map of objecctid to set of deleted link names # map of objids to timestamp and bucket of which they were last updated app["dirty_ids"] = {} # map of dataset ids to deflate levels (if compressed) diff --git a/hsds/domain_crawl.py b/hsds/domain_crawl.py index 8697e267..f2d672b5 100644 --- a/hsds/domain_crawl.py +++ b/hsds/domain_crawl.py @@ -19,7 +19,7 @@ from aiohttp.web_exceptions import HTTPInternalServerError, HTTPNotFound, HTTPGone from .util.idUtil import getCollectionForId, getDataNodeUrl -from .servicenode_lib import getObjectJson, getAttributes, putAttributes, getLinks +from .servicenode_lib import getObjectJson, getAttributes, putAttributes, getLinks, putLinks from . import hsds_logger as log @@ -295,6 +295,30 @@ async def get_links(self, grp_id, titles=None): else: log.debug(f"link: {link_id} already in object dict") + async def put_links(self, grp_id, link_items): + # write the given links for the obj_id + log.debug(f"put_links for {grp_id}, {len(link_items)} links") + req = getDataNodeUrl(self._app, grp_id) + req += f"/groups/{grp_id}/links" + kwargs = {} + if "bucket" in self._params: + kwargs["bucket"] = self._params["bucket"] + status = None + try: + status = await putLinks(self._app, grp_id, link_items, **kwargs) + except HTTPConflict: + log.warn("DomainCrawler - got HTTPConflict from http_put") + status = 409 + except HTTPServiceUnavailable: + status = 503 + except HTTPInternalServerError: + status = 500 + except Exception as e: + log.error(f"unexpected exception {e}") + + log.debug(f"DomainCrawler fetch for {grp_id} - returning status: {status}") + self._obj_dict[grp_id] = {"status": status} + def get_status(self): """ return the highest status of any of the returned objects """ status = None @@ -419,7 +443,16 @@ async def fetch(self, obj_id): log.debug(f"DomainCrawler - get link titles: {link_titles}") await self.get_links(obj_id, link_titles) + elif self._action == "put_link": + log.debug("DomainCrawlwer - put links") + # write links + if self._objs and obj_id not in self._objs: + log.error(f"couldn't find {obj_id} in self._objs") + return + link_items = self._objs[obj_id] + log.debug(f"got {len(link_items)} link items for {obj_id}") + await self.put_links(obj_id, link_items) else: msg = f"DomainCrawler: unexpected action: {self._action}" log.error(msg) diff --git a/hsds/domain_sn.py b/hsds/domain_sn.py index d7778ecd..785a9528 100755 --- a/hsds/domain_sn.py +++ b/hsds/domain_sn.py @@ -624,6 +624,131 @@ async def getScanTime(app, root_id, bucket=None): return root_scan +async def POST_Domain(request): + """ return object defined by h5path list """ + + log.request(request) + app = request.app + params = request.rel_url.query + log.debug(f"POST_Domain query params: {params}") + + include_links = False + include_attrs = False + follow_soft_links = 
False + follow_external_links = False + + if "include_links" in params and params["include_links"]: + include_links = True + if "include_attrs" in params and params["include_attrs"]: + include_attrs = True + if "follow_soft_links" in params and params["follow_soft_links"]: + follow_soft_links = True + if "follow_external_links" in params and params["follow_external_links"]: + follow_external_links = True + + if not request.has_body: + msg = "POST Domain with no body" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + + try: + body = await request.json() + except json.JSONDecodeError: + msg = "Unable to load JSON body" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + + if "h5paths" in body: + h5paths = body["h5paths"] + if not isinstance(h5paths, list): + msg = f"expected list for h5paths but got: {type(h5paths)}" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + else: + msg = "expected h5paths key in body" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + + (username, pswd) = getUserPasswordFromRequest(request) + if username is None and app["allow_noauth"]: + username = "default" + else: + await validateUserPassword(app, username, pswd) + + domain = None + try: + domain = getDomainFromRequest(request) + except ValueError: + log.warn(f"Invalid domain: {domain}") + raise HTTPBadRequest(reason="Invalid domain name") + + bucket = getBucketForDomain(domain) + log.debug(f"GET_Domain domain: {domain} bucket: {bucket}") + + if not bucket: + # no bucket defined, raise 400 + msg = "Bucket not provided" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + if bucket: + checkBucketAccess(app, bucket) + + if not domain: + msg = "no domain given" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + + log.info(f"got domain: {domain}") + + domain_json = await getDomainJson(app, domain, reload=True) + + if domain_json is None: + log.warn(f"domain: {domain} not found") + raise HTTPNotFound() + + if "acls" not in domain_json: + log.error("No acls key found in domain") + raise HTTPInternalServerError() + + log.debug(f"got domain_json: {domain_json}") + # validate that the requesting user has permission to read this domain + # aclCheck throws exception if not authorized + aclCheck(app, domain_json, "read", username) + + json_objs = {} + + for h5path in h5paths: + root_id = domain_json["root"] + + # getObjectIdByPath throws 404 if not found + obj_id, domain, _ = await getObjectIdByPath( + app, root_id, h5path, bucket=bucket, domain=domain, + follow_soft_links=follow_soft_links, + follow_external_links=follow_external_links) + log.info(f"get obj_id: {obj_id} from h5path: {h5path}") + # get authoritative state for object from DN (even if + # it's in the meta_cache). 
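# A client-side sketch, not part of the patch, of the request/response shape this
# POST_Domain handler defines: the body carries a list of h5paths, and the response
# returns a JSON object per path under the "h5paths" key. The endpoint and headers
# below are placeholders.
import json
import requests

endpoint = "http://localhost:5101"   # hypothetical HSDS endpoint
headers = {}                         # auth/domain headers omitted for brevity

body = {"h5paths": ["/g1/g1.1", "/g2"]}
rsp = requests.post(f"{endpoint}/", data=json.dumps(body), headers=headers)
rsp.raise_for_status()
for h5path, obj_json in rsp.json()["h5paths"].items():
    print(h5path, obj_json["class"], obj_json["id"])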
+ kwargs = {"refresh": True, "bucket": bucket, + "include_attrs": include_attrs, "include_links": include_links} + log.debug(f"kwargs for getObjectJson: {kwargs}") + + obj_json = await getObjectJson(app, obj_id, **kwargs) + + obj_json = respJsonAssemble(obj_json, params, obj_id) + + obj_json["domain"] = getPathForDomain(domain) + + # client may not know class of object retrieved via path + obj_json["class"] = getObjectClass(obj_id) + + json_objs[h5path] = obj_json + + jsonRsp = {"h5paths": json_objs} + resp = await jsonResponse(request, jsonRsp) + log.response(request, resp=resp) + return resp + + async def PUT_Domain(request): """HTTP method to create a new domain""" log.request(request) diff --git a/hsds/link_dn.py b/hsds/link_dn.py index 5a8651c8..d61d33d0 100755 --- a/hsds/link_dn.py +++ b/hsds/link_dn.py @@ -17,12 +17,12 @@ from copy import copy from bisect import bisect_left -from aiohttp.web_exceptions import HTTPBadRequest, HTTPNotFound, HTTPConflict +from aiohttp.web_exceptions import HTTPBadRequest, HTTPNotFound, HTTPGone, HTTPConflict from aiohttp.web_exceptions import HTTPInternalServerError from aiohttp.web import json_response from .util.idUtil import isValidUuid -from .util.linkUtil import validateLinkName +from .util.linkUtil import validateLinkName, getLinkClass, isEqualLink from .datanode_lib import get_obj_id, get_metadata_obj, save_metadata_obj from . import hsds_logger as log @@ -179,10 +179,12 @@ async def POST_Links(request): link_list = [] # links to be returned + missing_names = set() for title in titles: if title not in links: + missing_names.add(title) log.info(f"Link name {title} not found in group: {group_id}") - raise HTTPNotFound() + continue link_json = links[title] item = {} if "class" not in link_json: @@ -223,15 +225,20 @@ async def POST_Links(request): link_list.append(item) - if not link_list: - msg = f"POST_links - requested {len(titles)} but none were found" - log.warn(msg) - raise HTTPNotFound() - - if len(link_list) != len(titles): + if missing_names: msg = f"POST_links - requested {len(titles)} links but only " msg += f"{len(link_list)} were found" log.warn(msg) + # one or more links not found, check to see if any + # had been previously deleted + deleted_links = app["deleted_links"] + if group_id in deleted_links: + link_delete_set = deleted_links[group_id] + for link_name in missing_names: + if link_name in link_delete_set: + log.info(f"link: {link_name} was previously deleted, returning 410") + raise HTTPGone() + log.info("one or more links not found, returning 404") raise HTTPNotFound() rspJson = {"links": link_list} @@ -270,12 +277,12 @@ async def PUT_Links(request): for title in items: validateLinkName(title) item = items[title] - - if "id" in item: - if not isValidUuid(item["id"]): - msg = f"invalid uuid for {title}" - log.warn(msg) - raise HTTPBadRequest(reason=msg) + try: + link_class = getLinkClass(item) + except ValueError: + raise HTTPBadRequest(reason="invalid link") + if "class" not in item: + item["class"] = link_class if "bucket" in params: bucket = params["bucket"] @@ -295,44 +302,48 @@ async def PUT_Links(request): raise HTTPInternalServerError() links = group_json["links"] - dup_titles = [] + new_links = set() for title in items: if title in links: link_json = items[title] existing_link = links[title] - for prop in ("class", "id", "h5path", "h5domain"): - if prop in link_json: - if prop not in existing_link: - msg = f"PUT Link - prop {prop} not found in existing " - msg += "link, returning 409" - log.warn(msg) - raise 
HTTPConflict() - - if link_json[prop] != existing_link[prop]: - msg = f"PUT Links - prop {prop} value is different, old: " - msg += f"{existing_link[prop]}, new: {link_json[prop]}, " - msg += "returning 409" - log.warn(msg) - raise HTTPConflict() - msg = f"Link name {title} already found in group: {group_id}" - log.warn(msg) - dup_titles.append(title) - - for title in dup_titles: - del items[title] - - if items: + try: + is_dup = isEqualLink(link_json, existing_link) + except TypeError: + log.error(f"isEqualLink TypeError - new: {link_json}, old: {existing_link}") + raise HTTPInternalServerError() + + if is_dup: + # TBD: replace param for links? + continue # dup + else: + msg = f"link {title} already exists, returning 409" + log.warn(msg) + raise HTTPConflict() + else: + new_links.add(title) - now = time.time() + # if any of the attribute names was previously deleted, + # remove from the deleted set + deleted_links = app["deleted_links"] + if group_id in deleted_links: + link_delete_set = deleted_links[group_id] + else: + link_delete_set = set() - # add the links - for title in items: - item = items[title] - item["created"] = now - links[title] = item + create_time = time.time() + for title in new_links: + item = items[title] + item["created"] = create_time + links[title] = item + log.debug(f"added link {title}: {item}") + if title in link_delete_set: + link_delete_set.remove(title) + if new_links: # update the group lastModified - group_json["lastModified"] = now + group_json["lastModified"] = create_time + log.debug(f"tbd: group_json: {group_json}") # write back to S3, save to metadata cache await save_metadata_obj(app, group_id, group_json, bucket=bucket) @@ -343,7 +354,7 @@ async def PUT_Links(request): status = 200 # put the status in the JSON response since the http_put function - # used the the SN won't return it + # used by the the SN won't return it resp_json = {"status": status} resp = json_response(resp_json, status=status) @@ -398,22 +409,36 @@ async def DELETE_Links(request): links = group_json["links"] + # add link titles to deleted set, so we can return a 410 if they + # are requested in the future + deleted_links = app["deleted_links"] + if group_id in deleted_links: + link_delete_set = deleted_links[group_id] + else: + link_delete_set = set() + deleted_links[group_id] = link_delete_set + + save_obj = False # set to True if anything actually updated for title in titles: if title not in links: + if title in link_delete_set: + log.warn(f"Link name {title} has already been deleted") + continue msg = f"Link name {title} not found in group: {group_id}" log.warn(msg) raise HTTPNotFound() - # now delete the links - for title in titles: del links[title] # remove the link from dictionary + link_delete_set.add(title) + save_obj = True - # update the group lastModified - now = time.time() - group_json["lastModified"] = now + if save_obj: + # update the group lastModified + now = time.time() + group_json["lastModified"] = now - # write back to S3 - await save_metadata_obj(app, group_id, group_json, bucket=bucket) + # write back to S3 + await save_metadata_obj(app, group_id, group_json, bucket=bucket) resp_json = {} diff --git a/hsds/link_sn.py b/hsds/link_sn.py index 3ccad7fa..7d29acb0 100755 --- a/hsds/link_sn.py +++ b/hsds/link_sn.py @@ -22,9 +22,9 @@ from .util.authUtil import getUserPasswordFromRequest, validateUserPassword from .util.domainUtil import getDomainFromRequest, isValidDomain, verifyRoot from .util.domainUtil import getBucketForDomain -from .util.linkUtil import 
validateLinkName +from .util.linkUtil import validateLinkName, getLinkClass from .servicenode_lib import getDomainJson, validateAction -from .servicenode_lib import getLink, putLink, getLinks, deleteLinks +from .servicenode_lib import getLink, putLink, putLinks, getLinks, deleteLinks from .domain_crawl import DomainCrawler from . import hsds_logger as log @@ -134,7 +134,10 @@ async def GET_Link(request): log.warn(msg) raise HTTPBadRequest(reason=msg) link_title = request.match_info.get("title") - validateLinkName(link_title) + try: + validateLinkName(link_title) + except ValueError: + raise HTTPBadRequest(reason="invalid link name") username, pswd = getUserPasswordFromRequest(request) if username is None and app["allow_noauth"]: @@ -256,6 +259,191 @@ async def PUT_Link(request): return resp +async def PUT_Links(request): + """HTTP method to create a new links """ + log.request(request) + params = request.rel_url.query + app = request.app + status = None + + log.debug("PUT_Links") + + username, pswd = getUserPasswordFromRequest(request) + # write actions need auth + await validateUserPassword(app, username, pswd) + + if not request.has_body: + msg = "PUT_Links with no body" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + + try: + body = await request.json() + except JSONDecodeError: + msg = "Unable to load JSON body" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + + domain = getDomainFromRequest(request) + if not isValidDomain(domain): + msg = f"Invalid domain: {domain}" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + bucket = getBucketForDomain(domain) + log.debug(f"got bucket: {bucket}") + + # get domain JSON + domain_json = await getDomainJson(app, domain) + verifyRoot(domain_json) + + req_grp_id = request.match_info.get("id") + if not req_grp_id: + req_grp_id = domain_json["root"] + + if "links" in body: + link_items = body["links"] + if not isinstance(link_items, dict): + msg = f"PUT_Links expected dict for for links body, but got: {type(link_items)}" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + # validate the links + for title in link_items: + try: + validateLinkName(title) + link_item = link_items[title] + getLinkClass(link_item) + except ValueError: + raise HTTPBadRequest(reason="invalid link item") + else: + link_items = None + + if link_items: + log.debug(f"PUT Links {len(link_items)} links to add") + else: + log.debug("no links defined yet") + + # next, sort out where these attributes are going to + + grp_ids = {} + if "grp_ids" in body: + body_ids = body["grp_ids"] + if isinstance(body_ids, list): + # multi cast the links - each link in link_items + # will be written to each of the objects identified by obj_id + if not link_items: + msg = "no links provided" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + else: + for grp_id in body_ids: + if not isValidUuid(grp_id): + msg = f"Invalid object id: {grp_id}" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + grp_ids[grp_id] = link_items + + msg = f"{len(link_items)} links will be multicast to " + msg += f"{len(grp_ids)} objects" + log.info(msg) + elif isinstance(body_ids, dict): + # each value is body_ids is a set of links to write to the object + # unlike the above case, different attributes can be written to + # different objects + if link_items: + msg = "links defined outside the obj_ids dict" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + else: + for grp_id in body_ids: + if not isValidUuid(grp_id): + msg = f"Invalid object id: {grp_id}" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + 
id_json = body_ids[grp_id] + + if "links" not in id_json: + msg = f"PUT_links with no links for grp_id: {grp_id}" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + link_items = id_json["links"] + if not isinstance(link_items, dict): + msg = f"PUT_Links expected dict for grp_id {grp_id}, " + msg += f"but got: {type(link_items)}" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + # validate link items + for title in link_items: + try: + validateLinkName(title) + link_item = link_items[title] + getLinkClass(link_item) + except ValueError: + raise HTTPBadRequest(reason="invalid link item") + grp_ids[grp_id] = link_items + + # write different attributes to different objects + msg = f"PUT_Links over {len(grp_ids)} objects" + else: + msg = f"unexpected type for grp_ids: {type(grp_ids)}" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + else: + # use the object id from the request + grp_id = request.match_info.get("id") + if not grp_id: + msg = "Missing object id" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + grp_ids[grp_id] = link_items # make it look like a list for consistency + + log.debug(f"got {len(grp_ids)} grp_ids") + + # TBD - verify that the grp_id belongs to the given domain + await validateAction(app, domain, req_grp_id, username, "create") + + kwargs = {"bucket": bucket} + if params.get("replace"): + kwargs["replace"] = True + + count = len(grp_ids) + if count == 0: + msg = "no grp_ids defined" + log.warn(f"PUT_Attributes: {msg}") + raise HTTPBadRequest(reason=msg) + elif count == 1: + # just send one PUT Attributes request to the dn + grp_id = list(grp_ids.keys())[0] + link_json = grp_ids[grp_id] + log.debug(f"got link_json: {link_json}") + + status = await putLinks(app, grp_id, link_json, **kwargs) + + else: + # put multi obj + + # mixin some additonal kwargs + crawler_params = {"follow_links": False} + if bucket: + crawler_params["bucket"] = bucket + + kwargs = {"action": "put_link", "raise_error": True, "params": crawler_params} + crawler = DomainCrawler(app, grp_ids, **kwargs) + + # will raise exception on not found, server busy, etc. + await crawler.crawl() + + status = crawler.get_status() + + log.info("DomainCrawler done for put_links action") + + # link creation successful + log.debug(f"PUT_Links returning status: {status}") + req_rsp = {} + resp = await jsonResponse(request, req_rsp, status=status) + log.response(request, resp=resp) + return resp + + async def DELETE_Links(request): """HTTP method to delete multiple links """ log.request(request) @@ -284,7 +472,10 @@ async def DELETE_Links(request): titles = titles_param.split(separator) for title in titles: - validateLinkName(title) + try: + validateLinkName(title) + except ValueError: + raise HTTPBadRequest(reason="invalid link name") username, pswd = getUserPasswordFromRequest(request) await validateUserPassword(app, username, pswd) @@ -392,7 +583,10 @@ async def POST_Links(request): log.debug(f"getting all links for {group_id}") elif isinstance(titles, list): for title in titles: - validateLinkName(title) # raises HTTPBadRequest if invalid + try: + validateLinkName(title) + except ValueError: + raise HTTPBadRequest(reason="invalid link name") else: msg = f"expected list for titles but got: {type(titles)}" log.warn(msg) diff --git a/hsds/servicenode.py b/hsds/servicenode.py index 0f99f1b0..2ea85319 100755 --- a/hsds/servicenode.py +++ b/hsds/servicenode.py @@ -25,13 +25,14 @@ from .basenode import healthCheck, baseInit from . 
import hsds_logger as log from .util.authUtil import initUserDB, initGroupDB, setPassword -from .domain_sn import GET_Domain, PUT_Domain, DELETE_Domain, GET_Domains +from .domain_sn import GET_Domain, PUT_Domain, DELETE_Domain, GET_Domains, POST_Domain from .domain_sn import GET_Datasets, GET_Groups, GET_Datatypes from .domain_sn import GET_ACL, GET_ACLs, PUT_ACL from .group_sn import GET_Group, POST_Group, DELETE_Group -from .link_sn import GET_Links, POST_Links, GET_Link, PUT_Link, DELETE_Link, DELETE_Links -from .attr_sn import GET_Attributes, GET_Attribute, PUT_Attribute, PUT_Attributes, DELETE_Attribute -from .attr_sn import DELETE_Attributes, GET_AttributeValue, PUT_AttributeValue, POST_Attributes +from .link_sn import GET_Links, POST_Links, GET_Link, PUT_Link, PUT_Links +from .link_sn import DELETE_Link, DELETE_Links +from .attr_sn import GET_Attributes, GET_Attribute, PUT_Attribute, PUT_Attributes, POST_Attributes +from .attr_sn import DELETE_Attributes, DELETE_Attribute, GET_AttributeValue, PUT_AttributeValue from .ctype_sn import GET_Datatype, POST_Datatype, DELETE_Datatype from .dset_sn import GET_Dataset, POST_Dataset, DELETE_Dataset from .dset_sn import GET_DatasetShape, PUT_DatasetShape, GET_DatasetType @@ -52,6 +53,7 @@ async def init(): app.router.add_route("GET", path, GET_Domain) app.router.add_route("DELETE", path, DELETE_Domain) app.router.add_route("PUT", path, PUT_Domain) + app.router.add_route("POST", path, POST_Domain) path = "/domains" app.router.add_route("GET", path, GET_Domains) @@ -83,6 +85,7 @@ async def init(): path = "/groups/{id}/links" app.router.add_route("GET", path, GET_Links) app.router.add_route("POST", path, POST_Links) + app.router.add_route("PUT", path, PUT_Links) app.router.add_route("DELETE", path, DELETE_Links) path = "/groups/{id}/links/{title}" diff --git a/hsds/servicenode_lib.py b/hsds/servicenode_lib.py index be009a90..fad9d1ea 100644 --- a/hsds/servicenode_lib.py +++ b/hsds/servicenode_lib.py @@ -24,7 +24,7 @@ from .util.arrayUtil import encodeData from .util.idUtil import getDataNodeUrl, getCollectionForId from .util.idUtil import isSchema2Id, getS3Key, isValidUuid -from .util.linkUtil import h5Join, validateLinkName +from .util.linkUtil import h5Join, validateLinkName, getLinkClass from .util.storUtil import getStorJSONObj, isStorObj from .util.authUtil import aclCheck from .util.httpUtil import http_get, http_put, http_post, http_delete @@ -390,7 +390,10 @@ async def putLink(app, group_id, title, tgt_id=None, h5path=None, h5domain=None, """ create a new link. Return 201 if this is a new link, or 200 if it's a duplicate of an existing link. 
""" - validateLinkName(title) + try: + validateLinkName(title) + except ValueError: + raise HTTPBadRequest(reason="invalid link name") if h5path and tgt_id: msg = "putLink - provide tgt_id or h5path, but not both" @@ -398,27 +401,17 @@ async def putLink(app, group_id, title, tgt_id=None, h5path=None, h5domain=None, raise HTTPBadRequest(reason=msg) link_json = {} - if tgt_id: - if not isValidUuid(tgt_id): - msg = f"putLink with invalid id: {tgt_id}" - log.warn(msg) - raise HTTPBadRequest(reason=msg) link_json["id"] = tgt_id - link_class = "H5L_TYPE_HARD" - elif h5path: + if h5path: link_json["h5path"] = h5path - # could be hard or soft link - if h5domain: - link_json["h5domain"] = h5domain - link_class = "H5L_TYPE_EXTERNAL" - else: - # soft link - link_class = "H5L_TYPE_SOFT" - else: - msg = "PUT Link with no id or h5path keys" - log.warn(msg) - raise HTTPBadRequest(reason=msg) + if h5domain: + link_json["h5domain"] = h5domain + + try: + link_class = getLinkClass(link_json) + except ValueError: + raise HTTPBadRequest(reason="invalid link") link_json["class"] = link_class @@ -474,6 +467,54 @@ async def putExternalLink(app, group_id, title, h5path=None, h5domain=None, buck return status +async def putLinks(app, group_id, items, bucket=None): + """ create a new links. Return 201 if any item is a new link, + or 200 if it's a duplicate of an existing link. """ + + isValidUuid(group_id, obj_class="group") + group_json = None + + # validate input + for title in items: + try: + validateLinkName(title) + item = items[title] + link_class = getLinkClass(item) + except ValueError: + # invalid link + raise HTTPBadRequest(reason="invalid link") + + if link_class == "H5L_TYPE_HARD": + tgt_id = item["id"] + isValidUuid(tgt_id) + # for hard links, verify that the referenced id exists and is in + # this domain + ref_json = await getObjectJson(app, tgt_id, bucket=bucket) + if not group_json: + # just need to fetch this once + group_json = await getObjectJson(app, group_id, bucket=bucket) + if ref_json["root"] != group_json["root"]: + msg = "Hard link must reference an object in the same domain" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + + # ready to add links now + req = getDataNodeUrl(app, group_id) + req += "/groups/" + group_id + "/links" + log.debug(f"PUT links - PUT request: {req}") + params = {"bucket": bucket} + + data = {"links": items} + + put_rsp = await http_put(app, req, data=data, params=params) + log.debug(f"PUT Link resp: {put_rsp}") + if "status" in put_rsp: + status = put_rsp["status"] + else: + status = 201 + return status + + async def deleteLinks(app, group_id, titles=None, separator="/", bucket=None): """ delete the requested set of links from the given object """ diff --git a/hsds/util/attrUtil.py b/hsds/util/attrUtil.py index 68ef2cdc..e6afd26e 100755 --- a/hsds/util/attrUtil.py +++ b/hsds/util/attrUtil.py @@ -51,3 +51,32 @@ def validateAttributeName(name): msg = f"attribute name must be a string, but got: {type(name)}" log.warn(msg) raise HTTPBadRequest(reason=msg) + + +def isEqualAttr(attr1, attr2): + """ compare to attributes, return True if the same, False if differnt """ + for obj in (attr1, attr2): + if not isinstance(obj, dict): + raise TypeError(f"unexpected type: {type(obj)}") + if "type" not in obj: + raise TypeError("expected type key for attribute") + if "shape" not in obj: + raise TypeError("expected shape key for attribute") + # value is optional (not set for null space attributes) + if attr1["type"] != attr2["type"]: + return False + if attr1["shape"] != 
attr2["shape"]: + return False + shape_class = attr1["shape"].get("class") + if shape_class == "H5S_NULL": + return True # nothing else to compare + for obj in (attr1, attr2): + if "value" not in obj: + raise TypeError("expected value key for attribute") + return attr1["value"] == attr2["value"] + + if not isinstance(attr1, dict): + raise TypeError(f"unexpected type: {type(attr1)}") + return True + if not attr1 and not attr2: + return True diff --git a/hsds/util/linkUtil.py b/hsds/util/linkUtil.py index b16133d1..3469a8a1 100644 --- a/hsds/util/linkUtil.py +++ b/hsds/util/linkUtil.py @@ -13,20 +13,112 @@ # linkdUtil: # link related functions # -from aiohttp.web_exceptions import HTTPBadRequest from .. import hsds_logger as log def validateLinkName(name): + """ verify the link name is valid """ if not isinstance(name, str): msg = "Unexpected type for link name" - log.error(msg) - raise HTTPBadRequest(reason=msg) + log.warn(msg) + raise ValueError(msg) if name.find("/") >= 0: msg = "link name contains slash" - log.error(msg) - raise HTTPBadRequest(reason=msg) + log.warn(msg) + raise ValueError(msg) + + +def getLinkClass(link_json): + """ verify this is a valid link + returns the link class """ + if "class" in link_json: + link_class = link_json["class"] + else: + link_class = None + if "h5path" in link_json and "id" in link_json: + msg = "link tgt_id and h5path both set" + log.warn(msg) + raise ValueError(msg) + if "id" in link_json: + tgt_id = link_json["id"] + if not isinstance(tgt_id, str) or len(tgt_id) < 38: + msg = f"link with invalid id: {tgt_id}" + log.warn(msg) + raise ValueError(msg) + if tgt_id[:2] not in ("g-", "t-", "d-"): + msg = "link tgt must be group, datatype or dataset uuid" + log.warn(msg) + raise ValueError(msg) + if link_class: + if link_class != "H5L_TYPE_HARD": + msg = f"expected link class to be H5L_TYPE_HARD but got: {link_class}" + log.warn(msg) + raise ValueError(msg) + else: + link_class = "H5L_TYPE_HARD" + elif "h5path" in link_json: + h5path = link_json["h5path"] + log.debug(f"link path: {h5path}") + if "h5domain" in link_json: + if link_class: + if link_class != "H5L_TYPE_EXTERNAL": + msg = f"expected link class to be H5L_TYPE_EXTERNAL but got: {link_class}" + log.warn(msg) + raise ValueError(msg) + else: + link_class = "H5L_TYPE_EXTERNAL" + else: + if link_class: + if link_class != "H5L_TYPE_SOFT": + msg = f"expected link class to be H5L_TYPE_SOFT but got: {link_class}" + log.warn(msg) + raise ValueError(msg) + else: + link_class = "H5L_TYPE_SOFT" + else: + msg = "link with no id or h5path" + log.warn(msg) + raise ValueError(msg) + + return link_class + + +def isEqualLink(link1, link2): + """ Return True if the two links are the same """ + + for obj in (link1, link2): + if not isinstance(obj, dict): + raise TypeError(f"unexpected type: {type(obj)}") + if "class" not in obj: + raise TypeError("expected class key for link") + if link1["class"] != link2["class"]: + return False # different link types + link_class = link1["class"] + if link_class == "H5L_TYPE_HARD": + for obj in (link1, link2): + if "id" not in obj: + raise TypeError(f"expected id key for link: {obj}") + if link1["id"] != link2["id"]: + return False + elif link_class == "H5L_TYPE_SOFT": + for obj in (link1, link2): + if "h5path" not in obj: + raise TypeError(f"expected h5path key for link: {obj}") + if link1["h5path"] != link2["h5path"]: + return False + elif link_class == "H5L_TYPE_EXTERNAL": + for obj in (link1, link2): + for k in ("h5path", "h5domain"): + if k not in obj: + raise 
TypeError(f"expected {k} key for link: {obj}") + if link1["h5path"] != link2["h5path"]: + return False + if link1["h5domain"] != link2["h5domain"]: + return False + else: + raise TypeError(f"unexpected link class: {link_class}") + return True def h5Join(path, paths): diff --git a/tests/integ/attr_test.py b/tests/integ/attr_test.py index 78c7a099..7376f9a8 100644 --- a/tests/integ/attr_test.py +++ b/tests/integ/attr_test.py @@ -245,6 +245,7 @@ def testObjAttr(self): rsp = self.session.get(req, headers=headers) self.assertEqual(rsp.status_code, 404) # not found attr_payload = {"type": "H5T_STD_I32LE", "value": 42} + attr_payload2 = {"type": "H5T_STD_I32LE", "value": 84} # try adding the attribute as a different user user2_name = config.get("user2_name") @@ -266,6 +267,14 @@ def testObjAttr(self): rsp = self.session.put(req, data=json.dumps(attr_payload), headers=headers) self.assertEqual(rsp.status_code, 201) # created + # try resending + rsp = self.session.put(req, data=json.dumps(attr_payload), headers=headers) + self.assertEqual(rsp.status_code, 200) # ok + + # try with a different value + rsp = self.session.put(req, data=json.dumps(attr_payload2), headers=headers) + self.assertEqual(rsp.status_code, 409) # conflict + # read the attribute we just created rsp = self.session.get(req, headers=headers) self.assertEqual(rsp.status_code, 200) # create attribute @@ -287,11 +296,11 @@ def testObjAttr(self): rspJson = json.loads(rsp.text) self.assertEqual(rspJson["attributeCount"], 1) # one attribute - # try creating the attribute again - should return 409 + # try creating the attribute again - should return 200 req = f"{self.endpoint}/{col_name}/{obj1_id}/attributes/{attr_name}" rsp = self.session.put(req, data=json.dumps(attr_payload), headers=headers) - self.assertEqual(rsp.status_code, 409) # conflict + self.assertEqual(rsp.status_code, 200) # OK # set the replace param and we should get a 200 params = {"replace": 1} @@ -327,6 +336,10 @@ def testEmptyShapeAttr(self): rsp = self.session.put(req, headers=headers, data=json.dumps(attr_payload)) self.assertEqual(rsp.status_code, 201) # created + # retry + rsp = self.session.put(req, headers=headers, data=json.dumps(attr_payload)) + self.assertEqual(rsp.status_code, 200) # OK + # read back the attribute rsp = self.session.get(req, headers=headers) self.assertEqual(rsp.status_code, 200) # OK @@ -411,6 +424,10 @@ def testNoShapeAttr(self): rsp = self.session.put(req, headers=headers, data=json.dumps(attr_payload)) self.assertEqual(rsp.status_code, 201) # created + # try re-sending the put. Should return 200 + rsp = self.session.put(req, headers=headers, data=json.dumps(attr_payload)) + self.assertEqual(rsp.status_code, 200) + # read back the attribute rsp = self.session.get(req, headers=headers) self.assertEqual(rsp.status_code, 200) # OK @@ -465,6 +482,10 @@ def testPutFixedString(self): rsp = self.session.put(req, data=json.dumps(data), headers=headers) self.assertEqual(rsp.status_code, 201) + # try re-sending the put. 
Should return 200 + rsp = self.session.put(req, data=json.dumps(data), headers=headers) + self.assertEqual(rsp.status_code, 200) + # read attr rsp = self.session.get(req, headers=headers) self.assertEqual(rsp.status_code, 200) @@ -2027,9 +2048,9 @@ def testPutAttributeMultiple(self): self.assertEqual(len(attr_value), extent) self.assertEqual(attr_value, [i * 10 + j for j in range(extent)]) - # try writing again, should get 409 + # try writing again, should get 200 rsp = self.session.put(req, data=json.dumps(data), headers=headers) - self.assertEqual(rsp.status_code, 409) + self.assertEqual(rsp.status_code, 200) # write attributes to the three group objects data = {"obj_ids": grp_ids, "attributes": attributes} @@ -2090,10 +2111,10 @@ def testPutAttributeMultiple(self): self.assertEqual(len(attr_value), extent) self.assertEqual(attr_value, expected_value) - # try writing again, should get 409 + # try writing again, should get 200 req = self.endpoint + "/groups/" + root_id + "/attributes" rsp = self.session.put(req, data=json.dumps(data), headers=headers) - self.assertEqual(rsp.status_code, 409) + self.assertEqual(rsp.status_code, 200) def testDeleteAttributesMultiple(self): print("testDeleteAttributesMultiple", self.base_domain) @@ -2146,7 +2167,7 @@ def testDeleteAttributesMultiple(self): for i in range(attr_count): req = self.endpoint + "/groups/" + grp_id + "/attributes/" + attr_names[i] rsp = self.session.get(req, headers=headers) - self.assertEqual(rsp.status_code, 404) + self.assertEqual(rsp.status_code, 410) # Create another batch of attributes for i in range(attr_count): @@ -2168,7 +2189,7 @@ def testDeleteAttributesMultiple(self): for i in range(attr_count): req = self.endpoint + "/groups/" + grp_id + "/attributes/" + attr_names[i] rsp = self.session.get(req, headers=headers) - self.assertEqual(rsp.status_code, 404) + self.assertEqual(rsp.status_code, 410) if __name__ == "__main__": diff --git a/tests/integ/domain_test.py b/tests/integ/domain_test.py index e339aa6e..1f69749a 100755 --- a/tests/integ/domain_test.py +++ b/tests/integ/domain_test.py @@ -189,6 +189,35 @@ def testGetDomain(self): rsp = self.session.get(req, params=params, headers=headers) self.assertEqual(rsp.status_code, 400) + def testPostDomain(self): + domain = helper.getTestDomain("tall.h5") + print("testPostDomain", domain) + headers = helper.getRequestHeaders(domain=domain) + + req = helper.getEndpoint() + "/" + rsp = self.session.get(req, headers=headers) + if rsp.status_code != 200: + msg = f"WARNING: Failed to get domain: {domain}. Is test data setup?" 
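# A minimal sketch, not part of the patch, of the PUT-attribute status behavior the
# attribute tests above now expect: a new attribute returns 201, re-sending an
# identical attribute returns 200, and sending a different value for an existing
# name returns 409 (unless the replace query param is used). This is a simplified
# model using plain dict equality in place of isEqualAttr.

def put_status(existing, incoming):
    """Return the HTTP status a PUT of 'incoming' over 'existing' should produce."""
    if existing is None:
        return 201          # created
    if existing == incoming:
        return 200          # duplicate write, nothing to change
    return 409              # conflict - same name, different definition


attr = {"type": "H5T_STD_I32LE", "shape": {"class": "H5S_SCALAR"}, "value": 42}
attr2 = dict(attr, value=84)
assert put_status(None, attr) == 201
assert put_status(attr, attr) == 200
assert put_status(attr, attr2) == 409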
+ print(msg) + return # abort rest of test + domainJson = json.loads(rsp.text) + self.assertTrue("root" in domainJson) + root_id = domainJson["root"] + + # Get group at /g1/g1.1 by using h5path + data = {"h5paths": ["/g1/g1.1", ]} + rsp = self.session.post(req, data=json.dumps(data), headers=headers) + self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + self.assertTrue("h5paths" in rspJson) + rsp_paths = rspJson["h5paths"] + self.assertTrue("/g1/g1.1" in rsp_paths) + obj_json = rsp_paths["/g1/g1.1"] + g11id = helper.getUUIDByPath(domain, "/g1/g1.1", session=self.session) + self.assertEqual(g11id, obj_json["id"]) + self.assertTrue("root" in obj_json) + self.assertEqual(root_id, obj_json["root"]) + def testGetByPath(self): domain = helper.getTestDomain("tall.h5") print("testGetByPath", domain) diff --git a/tests/integ/link_test.py b/tests/integ/link_test.py index 1633b95c..faad65a7 100755 --- a/tests/integ/link_test.py +++ b/tests/integ/link_test.py @@ -23,6 +23,7 @@ def __init__(self, *args, **kwargs): super(LinkTest, self).__init__(*args, **kwargs) self.base_domain = helper.getTestDomainName(self.__class__.__name__) helper.setupDomain(self.base_domain, folder=True) + self.endpoint = helper.getEndpoint() def setUp(self): self.session = helper.getSession() @@ -861,6 +862,97 @@ def testRootH5Path(self): self.assertTrue(k in cprops) self.assertEqual(cprops[k], creation_props[k]) + def testNonURLEncodableLinkName(self): + domain = self.base_domain + "/testNonURLEncodableLinkName.h5" + print("testNonURLEncodableLinkName", domain) + helper.setupDomain(domain) + + headers = helper.getRequestHeaders(domain=domain) + req = self.endpoint + "/" + + # Get root uuid + rsp = self.session.get(req, headers=headers) + self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + root_uuid = rspJson["root"] + helper.validateId(root_uuid) + + # create a subgroup + req = self.endpoint + "/groups" + rsp = self.session.post(req, headers=headers) + self.assertEqual(rsp.status_code, 201) + rspJson = json.loads(rsp.text) + grp_id = rspJson["id"] + self.assertTrue(helper.validateId(grp_id)) + + # link as "grp1" + grp_name = "grp1" + req = self.endpoint + "/groups/" + root_uuid + "/links/" + grp_name + payload = {"id": grp_id} + rsp = self.session.put(req, data=json.dumps(payload), headers=headers) + self.assertEqual(rsp.status_code, 201) # created + + link_name = "#link1#" + data = {"h5path": "somewhere"} + req = self.endpoint + "/groups/" + grp_id + "/links" # request without name + bad_req = f"{req}/{link_name}" # this request will fail because of the hash char + + # create link + rsp = self.session.put(bad_req, data=json.dumps(data), headers=headers) + self.assertEqual(rsp.status_code, 404) # regular put doesn't work + + links = {link_name: data} + body = {"links": links} + + rsp = self.session.put(req, data=json.dumps(body), headers=headers) + self.assertEqual(rsp.status_code, 201) # this is ok + + # get all links and verify the one we created is there + rsp = self.session.get(req, headers=headers) + self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + self.assertTrue("links" in rspJson) + rsp_links = rspJson["links"] + self.assertEqual(len(rsp_links), 1) + rsp_link = rsp_links[0] + self.assertTrue("title" in rsp_link) + self.assertEqual(rsp_link["title"], link_name) + + # try doing a get on this specific link + rsp = self.session.get(bad_req, headers=headers) + self.assertEqual(rsp.status_code, 404) # can't do a get with the link name + + # do a post 
request with the link name + link_names = [link_name, ] + data = {"titles": link_names} + rsp = self.session.post(req, data=json.dumps(data), headers=headers) + self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + self.assertTrue("links" in rspJson) + rsp_links = rspJson["links"] + self.assertEqual(len(rsp_links), 1) + rsp_links = rsp_links[0] + + self.assertTrue("title" in rsp_link) + self.assertEqual(rsp_link["title"], link_name) + + # try deleting the link by name + rsp = self.session.delete(bad_req, headers=headers) + self.assertEqual(rsp.status_code, 404) # not found + + # send link name as a query param + params = {"titles": link_names} + rsp = self.session.delete(req, params=params, headers=headers) + self.assertEqual(rsp.status_code, 200) + + # verify the link is gone + rsp = self.session.get(req, headers=headers, params=params) + self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + self.assertTrue("links" in rspJson) + rsp_links = rspJson["links"] + self.assertEqual(len(rsp_links), 0) + def testPostLinkSingle(self): domain = helper.getTestDomain("tall.h5") print("testPostLinkSingle", domain) @@ -1052,6 +1144,273 @@ def testPostLinkMultiple(self): else: self.assertTrue(False) # unexpected + def testPutLinkMultiple(self): + domain = self.base_domain + "/testPutLinkMultiple.h5" + helper.setupDomain(domain) + print("testPutLinkMultiple", domain) + headers = helper.getRequestHeaders(domain=domain) + req = self.endpoint + "/" + + rsp = self.session.get(req, headers=headers) + self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + root_id = rspJson["root"] + + # create a group + req = self.endpoint + "/groups" + rsp = self.session.post(req, headers=headers) + self.assertEqual(rsp.status_code, 201) + rspJson = json.loads(rsp.text) + grpA_id = rspJson["id"] + self.assertTrue(helper.validateId(grpA_id)) + + # link new obj as '/grpA' + req = self.endpoint + "/groups/" + root_id + "/links/grpA" + payload = {"id": grpA_id} + rsp = self.session.put(req, data=json.dumps(payload), headers=headers) + self.assertEqual(rsp.status_code, 201) # created + + # create some groups under grp1 + grp_count = 3 + + grp_names = [f"grp{(i+1):04d}" for i in range(grp_count)] + grp_ids = [] + + for grp_name in grp_names: + # create sub_groups + req = self.endpoint + "/groups" + rsp = self.session.post(req, headers=headers) + self.assertEqual(rsp.status_code, 201) + rspJson = json.loads(rsp.text) + grp_id = rspJson["id"] + self.assertTrue(helper.validateId(grp_id)) + grp_ids.append(grp_id) + + # create some links + links = {} + for i in range(grp_count): + title = grp_names[i] + links[title] = {"id": grp_ids[i]} + + # add a soft and external link as well + links["softlink"] = {"h5path": "a_path"} + links["extlink"] = {"h5path": "another_path", "h5domain": "/a_domain"} + link_count = len(links) + + # write links to the grpA + data = {"links": links} + req = self.endpoint + "/groups/" + grpA_id + "/links" + rsp = self.session.put(req, data=json.dumps(data), headers=headers) + self.assertEqual(rsp.status_code, 201) + + # do a get on the links + rsp = self.session.get(req, headers=headers) + self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + self.assertTrue("links" in rspJson) + ret_links = rspJson["links"] + self.assertEqual(len(ret_links), link_count) + for link in ret_links: + self.assertTrue("title" in link) + title = link["title"] + self.assertTrue("class" in link) + link_class = link["class"] + if link_class == "H5L_TYPE_HARD": + 
self.assertTrue("id" in link) + self.assertTrue(link["id"] in grp_ids) + self.assertTrue(title in grp_names) + elif link_class == "H5L_TYPE_SOFT": + self.assertTrue("h5path" in link) + h5path = link["h5path"] + self.assertEqual(h5path, "a_path") + elif link_class == "H5L_TYPE_EXTERNAL": + self.assertTrue("h5path" in link) + h5path = link["h5path"] + self.assertEqual(h5path, "another_path") + self.assertTrue("h5domain" in link) + h5domain = link["h5domain"] + self.assertEqual(h5domain, "/a_domain") + else: + self.assertTrue(False) # unexpected + + # try writing again, should get 200 (no new links) + rsp = self.session.put(req, data=json.dumps(data), headers=headers) + self.assertEqual(rsp.status_code, 200) + + # write some links to three group objects + links = {} + links["hardlink_multicast"] = {"id": root_id} + links["softlink_multicast"] = {"h5path": "multi_path"} + links["extlink_multicast"] = {"h5path": "multi_path", "h5domain": "/another_domain"} + link_count = len(links) + data = {"links": links, "grp_ids": grp_ids} + req = self.endpoint + "/groups/" + root_id + "/links" + rsp = self.session.put(req, data=json.dumps(data), headers=headers) + self.assertEqual(rsp.status_code, 201) + + # check that the links got created + for grp_id in grp_ids: + req = self.endpoint + "/groups/" + grp_id + "/links" + rsp = self.session.get(req, headers=headers) + self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + self.assertTrue("links" in rspJson) + ret_links = rspJson["links"] + self.assertEqual(len(ret_links), 3) + for ret_link in ret_links: + self.assertTrue("class" in ret_link) + link_class = ret_link["class"] + if link_class == "H5L_TYPE_HARD": + self.assertTrue("id" in ret_link) + self.assertEqual(ret_link["id"], root_id) + elif link_class == "H5L_TYPE_SOFT": + self.assertTrue("h5path" in ret_link) + self.assertEqual(ret_link["h5path"], "multi_path") + elif link_class == "H5L_TYPE_EXTERNAL": + self.assertTrue("h5path" in ret_link) + self.assertEqual(ret_link["h5path"], "multi_path") + self.assertTrue("h5domain" in ret_link) + self.assertEqual(ret_link["h5domain"], "/another_domain") + else: + self.assertTrue(False) # unexpected + + # write different links to three group objects + link_data = {} + for i in range(grp_count): + grp_id = grp_ids[i] + links = {} + links[f"hardlink_{i}"] = {"id": root_id} + links[f"softlink_{i}"] = {"h5path": f"multi_path_{i}"} + ext_link = {"h5path": f"multi_path_{i}", "h5domain": f"/another_domain/{i}"} + links[f"extlink_{i}"] = ext_link + link_data[grp_id] = {"links": links} + + data = {"grp_ids": link_data} + req = self.endpoint + "/groups/" + root_id + "/links" + rsp = self.session.put(req, data=json.dumps(data), headers=headers) + self.assertEqual(rsp.status_code, 201) + + # check that the new links got created + for i in range(grp_count): + grp_id = grp_ids[i] + titles = [f"hardlink_{i}", f"softlink_{i}", f"extlink_{i}", ] + data = {"titles": titles} + # do a post to just return the links we are interested in + req = self.endpoint + "/groups/" + grp_id + "/links" + rsp = self.session.post(req, data=json.dumps(data), headers=headers) + self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + self.assertTrue("links" in rspJson) + ret_links = rspJson["links"] + self.assertEqual(len(ret_links), len(titles)) + for j in range(len(titles)): + ret_link = ret_links[j] + self.assertTrue("class" in ret_link) + link_class = ret_link["class"] + self.assertTrue("title" in ret_link) + link_title = ret_link["title"] + if link_class == 
"H5L_TYPE_HARD": + self.assertEqual(link_title, f"hardlink_{i}") + self.assertTrue("id" in ret_link) + self.assertEqual(ret_link["id"], root_id) + elif link_class == "H5L_TYPE_SOFT": + self.assertEqual(link_title, f"softlink_{i}") + self.assertTrue("h5path" in ret_link) + self.assertEqual(ret_link["h5path"], f"multi_path_{i}") + elif link_class == "H5L_TYPE_EXTERNAL": + self.assertEqual(link_title, f"extlink_{i}") + self.assertTrue("h5path" in ret_link) + self.assertEqual(ret_link["h5path"], f"multi_path_{i}") + self.assertTrue("h5domain" in ret_link) + self.assertEqual(ret_link["h5domain"], f"/another_domain/{i}") + else: + self.assertTrue(False) # unexpected + + def testDeleteLinkMultiple(self): + domain = self.base_domain + "/testDeleteLinkMultiple.h5" + helper.setupDomain(domain) + + print("testDeleteLinkMultiple", self.base_domain) + + headers = helper.getRequestHeaders(domain=domain) + req = self.endpoint + "/" + + # Get root uuid + rsp = self.session.get(req, headers=headers) + self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + root_uuid = rspJson["root"] + helper.validateId(root_uuid) + + # create a subgroup + req = self.endpoint + "/groups" + rsp = self.session.post(req, headers=headers) + self.assertEqual(rsp.status_code, 201) + rspJson = json.loads(rsp.text) + grp_id = rspJson["id"] + self.assertTrue(helper.validateId(grp_id)) + + # link as "grp1" + grp_name = "grp1" + req = self.endpoint + "/groups/" + root_uuid + "/links/" + grp_name + payload = {"id": grp_id} + rsp = self.session.put(req, data=json.dumps(payload), headers=headers) + self.assertEqual(rsp.status_code, 201) # created + + # create some links + titles = [] + links = {} + title = "root" + links[title] = {"id": root_uuid} + titles.append(title) + + # add a soft and external link as well + title = "softlink" + links[title] = {"h5path": "a_path"} + titles.append(title) + title = "extlink" + links[title] = {"h5path": "another_path", "h5domain": "/a_domain"} + titles.append(title) + link_count = len(links) + + # write links to the grp1 + data = {"links": links} + req = self.endpoint + "/groups/" + grp_id + "/links" + rsp = self.session.put(req, data=json.dumps(data), headers=headers) + self.assertEqual(rsp.status_code, 201) + + # Delete all by parameter + separator = '/' + params = {"titles": separator.join(titles)} + req = self.endpoint + "/groups/" + grp_id + "/links" + rsp = self.session.delete(req, params=params, headers=headers) + self.assertEqual(rsp.status_code, 200) + + # Attempt to read deleted links + for i in range(link_count): + req = self.endpoint + "/groups/" + grp_id + "/links/" + titles[i] + rsp = self.session.get(req, headers=headers) + self.assertEqual(rsp.status_code, 410) + + # re-create links + req = self.endpoint + "/groups/" + grp_id + "/links" + rsp = self.session.put(req, data=json.dumps(data), headers=headers) + self.assertEqual(rsp.status_code, 201) + + # Delete with custom separator + separator = ':' + params = {"titles": separator.join(titles)} + params["separator"] = ":" + req = self.endpoint + "/groups/" + grp_id + "/links" + rsp = self.session.delete(req, params=params, headers=headers) + self.assertEqual(rsp.status_code, 200) + + # Attempt to read + for i in range(link_count): + req = self.endpoint + "/groups/" + grp_id + "/links/" + titles[i] + rsp = self.session.get(req, headers=headers) + self.assertEqual(rsp.status_code, 410) + if __name__ == "__main__": # setup test files From 6314f27347c0917904cbafc6fdaade75c93c939c Mon Sep 17 00:00:00 2001 From: jreadey 
Date: Sat, 13 Jan 2024 08:51:05 +0000 Subject: [PATCH 05/18] remove unreachable code --- hsds/util/attrUtil.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/hsds/util/attrUtil.py b/hsds/util/attrUtil.py index e6afd26e..06468f7e 100755 --- a/hsds/util/attrUtil.py +++ b/hsds/util/attrUtil.py @@ -74,9 +74,3 @@ def isEqualAttr(attr1, attr2): if "value" not in obj: raise TypeError("expected value key for attribute") return attr1["value"] == attr2["value"] - - if not isinstance(attr1, dict): - raise TypeError(f"unexpected type: {type(attr1)}") - return True - if not attr1 and not attr2: - return True From 27ea0073cc2c41d5175c68cb2049fbff85c28a03 Mon Sep 17 00:00:00 2001 From: jreadey Date: Tue, 16 Jan 2024 02:18:14 +0000 Subject: [PATCH 06/18] update version string --- hsds/basenode.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hsds/basenode.py b/hsds/basenode.py index 4ae3dfed..c318a675 100644 --- a/hsds/basenode.py +++ b/hsds/basenode.py @@ -33,7 +33,7 @@ from .util.k8sClient import getDnLabelSelector, getPodIps from . import hsds_logger as log -HSDS_VERSION = "0.8.5" +HSDS_VERSION = "0.9.0.alpha0" def getVersion(): From f2365ced8ae4a2645cdb6fe7f67567fdf22d7272 Mon Sep 17 00:00:00 2001 From: jreadey Date: Tue, 16 Jan 2024 02:19:10 +0000 Subject: [PATCH 07/18] added multi get obj test --- tests/integ/domain_test.py | 37 +++++++++++++++++++++++++++++++++++-- 1 file changed, 35 insertions(+), 2 deletions(-) diff --git a/tests/integ/domain_test.py b/tests/integ/domain_test.py index 1f69749a..6d4dbe7b 100755 --- a/tests/integ/domain_test.py +++ b/tests/integ/domain_test.py @@ -189,9 +189,9 @@ def testGetDomain(self): rsp = self.session.get(req, params=params, headers=headers) self.assertEqual(rsp.status_code, 400) - def testPostDomain(self): + def testPostDomainSingle(self): domain = helper.getTestDomain("tall.h5") - print("testPostDomain", domain) + print("testPostDomainSingle", domain) headers = helper.getRequestHeaders(domain=domain) req = helper.getEndpoint() + "/" @@ -218,6 +218,39 @@ def testPostDomain(self): self.assertTrue("root" in obj_json) self.assertEqual(root_id, obj_json["root"]) + def testPostDomainMultiple(self): + domain = helper.getTestDomain("tall.h5") + print("testPostDomainMultiple", domain) + headers = helper.getRequestHeaders(domain=domain) + + req = helper.getEndpoint() + "/" + rsp = self.session.get(req, headers=headers) + if rsp.status_code != 200: + msg = f"WARNING: Failed to get domain: {domain}. Is test data setup?" 
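For reference, the bulk link operations exercised by the new link tests earlier in this patch reduce to three calls against the /groups/{id}/links endpoint. A hedged sketch with placeholder session, endpoint, and header names:

import json

def bulk_link_roundtrip(session, endpoint, headers, grp_id, tgt_id):
    req = f"{endpoint}/groups/{grp_id}/links"

    # create several links in one request: hard, soft, and external
    links = {
        "hard1": {"id": tgt_id},
        "soft1": {"h5path": "a_path"},
        "ext1": {"h5path": "another_path", "h5domain": "/a_domain"},
    }
    rsp = session.put(req, data=json.dumps({"links": links}), headers=headers)
    assert rsp.status_code == 201   # 200 is returned instead when nothing new was created

    # fetch just the named links via POST (works even for titles that can't go in a URL)
    rsp = session.post(req, data=json.dumps({"titles": list(links)}), headers=headers)
    assert rsp.status_code == 200

    # delete them by passing the titles as a query parameter (default separator is "/")
    rsp = session.delete(req, params={"titles": "/".join(links)}, headers=headers)
    assert rsp.status_code == 200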
+ print(msg) + return # abort rest of test + domainJson = json.loads(rsp.text) + self.assertTrue("root" in domainJson) + root_id = domainJson["root"] + + # h5paths to fetch + h5paths = ["/g1/g1.1", "/g1/g1.2", "/g2/dset2.2"] + data = {"h5paths": h5paths} + rsp = self.session.post(req, data=json.dumps(data), headers=headers) + self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + self.assertTrue("h5paths" in rspJson) + rsp_paths = rspJson["h5paths"] + self.assertEqual(len(h5paths), len(rsp_paths)) + for h5path in h5paths: + self.assertTrue(h5path in rsp_paths) + obj_json = rsp_paths[h5path] + obj_id = helper.getUUIDByPath(domain, h5path, session=self.session) + self.assertEqual(obj_id, obj_json["id"]) + self.assertTrue("root" in obj_json) + self.assertEqual(root_id, obj_json["root"]) + + def testGetByPath(self): domain = helper.getTestDomain("tall.h5") print("testGetByPath", domain) From 7968ea74d25d9497e286e7cdbe827c7c6baf4a8a Mon Sep 17 00:00:00 2001 From: jreadey Date: Tue, 16 Jan 2024 02:32:44 +0000 Subject: [PATCH 08/18] cleanup TBD comments --- hsds/attr_sn.py | 21 ++++----------------- hsds/ctype_sn.py | 1 - hsds/dset_sn.py | 1 - hsds/group_sn.py | 1 - hsds/link_sn.py | 2 -- 5 files changed, 4 insertions(+), 22 deletions(-) diff --git a/hsds/attr_sn.py b/hsds/attr_sn.py index af42b337..c581a1ff 100755 --- a/hsds/attr_sn.py +++ b/hsds/attr_sn.py @@ -104,7 +104,6 @@ async def GET_Attributes(request): log.debug(f"bucket: {bucket}") kwargs["bucket"] = bucket - # TBD - verify that the obj_id belongs to the given domain await validateAction(app, domain, obj_id, username, "read") attributes = await getAttributes(app, obj_id, **kwargs) @@ -166,7 +165,6 @@ async def GET_Attribute(request): raise HTTPBadRequest(reason=msg) bucket = getBucketForDomain(domain) - # TBD - verify that the obj_id belongs to the given domain await validateAction(app, domain, obj_id, username, "read") if "ignore_nan" in params and params["ignore_nan"]: @@ -518,7 +516,6 @@ async def PUT_Attribute(request): domain_json = await getDomainJson(app, domain) verifyRoot(domain_json) - # TBD - verify that the obj_id belongs to the given domain await validateAction(app, domain, req_obj_id, username, "create") # get attribute from request body @@ -539,8 +536,7 @@ async def PUT_Attribute(request): status = await putAttributes(app, req_obj_id, attr_json, **kwargs) log.info(f"PUT Attributes status: {status}") - hrefs = [] # TBD - req_rsp = {"hrefs": hrefs} + req_rsp = {} # attribute creation successful resp = await jsonResponse(request, req_rsp, status=status) log.response(request, resp=resp) @@ -656,7 +652,6 @@ async def PUT_Attributes(request): log.debug(f"got {len(obj_ids)} obj_ids") - # TBD - verify that the obj_id belongs to the given domain await validateAction(app, domain, req_obj_id, username, "create") kwargs = {"bucket": bucket} @@ -694,8 +689,7 @@ async def PUT_Attributes(request): log.info("DomainCrawler done for put_attrs action") - hrefs = [] # TBD - req_rsp = {"hrefs": hrefs} + req_rsp = {} # attribute creation successful log.debug(f"PUT_Attributes returning status: {status}") resp = await jsonResponse(request, req_rsp, status=status) @@ -737,7 +731,6 @@ async def DELETE_Attribute(request): domain_json = await getDomainJson(app, domain) verifyRoot(domain_json) - # TBD - verify that the obj_id belongs to the given domain await validateAction(app, domain, obj_id, username, "delete") attr_names = [attr_name, ] @@ -745,8 +738,7 @@ async def DELETE_Attribute(request): await deleteAttributes(app, 
obj_id, **kwargs) - hrefs = [] # TBD - req_rsp = {"hrefs": hrefs} + req_rsp = {} resp = await jsonResponse(request, req_rsp) log.response(request, resp=resp) return resp @@ -789,7 +781,6 @@ async def GET_AttributeValue(request): domain_json = await getDomainJson(app, domain) verifyRoot(domain_json) - # TBD - verify that the obj_id belongs to the given domain await validateAction(app, domain, obj_id, username, "read") params = request.rel_url.query @@ -967,7 +958,6 @@ async def PUT_AttributeValue(request): domain_json = await getDomainJson(app, domain) verifyRoot(domain_json) - # TBD - verify that the obj_id belongs to the given domain await validateAction(app, domain, obj_id, username, "update") attr_names = [attr_name, ] @@ -1097,8 +1087,7 @@ async def PUT_AttributeValue(request): else: log.info("PUT AttributesValue status: 200") - hrefs = [] # TBD - req_rsp = {"hrefs": hrefs} + req_rsp = {} # attribute creation successful resp = await jsonResponse(request, req_rsp) log.response(request, resp=resp) @@ -1213,7 +1202,6 @@ async def POST_Attributes(request): domain_json = await getDomainJson(app, domain) verifyRoot(domain_json) - # TBD - verify that the obj_id belongs to the given domain await validateAction(app, domain, obj_id, username, "read") params = request.rel_url.query @@ -1361,7 +1349,6 @@ async def DELETE_Attributes(request): domain_json = await getDomainJson(app, domain) verifyRoot(domain_json) - # TBD - verify that the obj_id belongs to the given domain await validateAction(app, domain, obj_id, username, "delete") kwargs = {"attr_names": attr_names, "bucket": bucket, "separator": separator} diff --git a/hsds/ctype_sn.py b/hsds/ctype_sn.py index 3beae207..677f67ce 100755 --- a/hsds/ctype_sn.py +++ b/hsds/ctype_sn.py @@ -271,7 +271,6 @@ async def DELETE_Datatype(request): domain_json = await getDomainJson(app, domain) verifyRoot(domain_json) - # TBD - verify that the obj_id belongs to the given domain await validateAction(app, domain, ctype_id, username, "delete") req = getDataNodeUrl(app, ctype_id) + "/datatypes/" + ctype_id diff --git a/hsds/dset_sn.py b/hsds/dset_sn.py index c6a5ae6d..92d09662 100755 --- a/hsds/dset_sn.py +++ b/hsds/dset_sn.py @@ -1215,7 +1215,6 @@ async def DELETE_Dataset(request): domain_json = await getDomainJson(app, domain) verifyRoot(domain_json) - # TBD - verify that the obj_id belongs to the given domain await validateAction(app, domain, dset_id, username, "delete") req = getDataNodeUrl(app, dset_id) + "/datasets/" + dset_id diff --git a/hsds/group_sn.py b/hsds/group_sn.py index d09baacc..fe98db98 100755 --- a/hsds/group_sn.py +++ b/hsds/group_sn.py @@ -267,7 +267,6 @@ async def DELETE_Group(request): # get domain JSON domain_json = await getDomainJson(app, domain) - # TBD - verify that the obj_id belongs to the given domain await validateAction(app, domain, group_id, username, "delete") verifyRoot(domain_json) diff --git a/hsds/link_sn.py b/hsds/link_sn.py index 7d29acb0..a3cc48b7 100755 --- a/hsds/link_sn.py +++ b/hsds/link_sn.py @@ -398,7 +398,6 @@ async def PUT_Links(request): log.debug(f"got {len(grp_ids)} grp_ids") - # TBD - verify that the grp_id belongs to the given domain await validateAction(app, domain, req_grp_id, username, "create") kwargs = {"bucket": bucket} @@ -609,7 +608,6 @@ async def POST_Links(request): domain_json = await getDomainJson(app, domain) verifyRoot(domain_json) - # TBD - verify that the obj_id belongs to the given domain await validateAction(app, domain, req_id, username, "read") resp_json = {} From 
bf998ef291f8a97ff961e9dc58bd34fd86fe09a8 Mon Sep 17 00:00:00 2001 From: jreadey Date: Tue, 16 Jan 2024 04:58:54 +0000 Subject: [PATCH 09/18] fix flake error --- tests/integ/domain_test.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/integ/domain_test.py b/tests/integ/domain_test.py index 6d4dbe7b..1e03b3df 100755 --- a/tests/integ/domain_test.py +++ b/tests/integ/domain_test.py @@ -250,7 +250,6 @@ def testPostDomainMultiple(self): self.assertTrue("root" in obj_json) self.assertEqual(root_id, obj_json["root"]) - def testGetByPath(self): domain = helper.getTestDomain("tall.h5") print("testGetByPath", domain) From eab4acc34091163daf845660e0344fe02ab78250 Mon Sep 17 00:00:00 2001 From: jreadey Date: Thu, 18 Jan 2024 06:38:53 +0000 Subject: [PATCH 10/18] fix parent_id for POST domain, add pattern for links --- admin/config/config.yml | 2 +- hsds/attr_sn.py | 3 +- hsds/ctype_sn.py | 13 +--- hsds/domain_crawl.py | 17 +++-- hsds/domain_sn.py | 135 +++++++++++++++++---------------- hsds/dset_lib.py | 19 +++++ hsds/dset_sn.py | 24 +++--- hsds/group_sn.py | 17 +---- hsds/link_dn.py | 15 ++++ hsds/link_sn.py | 114 ++++++++++++++++++---------- hsds/servicenode_lib.py | 37 +++++++++- tests/integ/domain_test.py | 45 +++++++++-- tests/integ/link_test.py | 148 ++++++++++++++++++++++++++++++++++++- 13 files changed, 426 insertions(+), 163 deletions(-) diff --git a/admin/config/config.yml b/admin/config/config.yml index d690ba7e..92718250 100755 --- a/admin/config/config.yml +++ b/admin/config/config.yml @@ -76,7 +76,6 @@ http_streaming: true # enable HTTP streaming k8s_dn_label_selector: app=hsds # Selector for getting data node pods from a k8s deployment (https://kubernetes.io/docs/concepts/overview/working-with-objects/labels/#label-selectors) k8s_namespace: null # Specifies if a the client should be limited to a specific namespace. Useful for some RBAC configurations. 
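The "pattern for links" part of this commit lets GET /groups/{id}/links filter link titles with a glob expression, optionally recursing through hard links. A rough client-side sketch, with session, endpoint, and headers standing in for the test helpers:

def find_links(session, endpoint, headers, group_id, pattern, recursive=False):
    # glob-filter link titles; with follow_links the response maps group ids to link lists
    req = f"{endpoint}/groups/{group_id}/links"
    params = {"pattern": pattern}
    if recursive:
        params["follow_links"] = 1
    rsp = session.get(req, params=params, headers=headers)
    rsp.raise_for_status()
    return rsp.json()["links"]

# e.g. find_links(session, endpoint, headers, root_uuid, "dset*", recursive=True)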
restart_policy: on-failure # Docker restart policy -domain_req_max_objects_limit: 500 # maximum number of objects to return in GET domain request with use_cache # the following two values with give backoff times of approx: 0.2, 0.4, 0.8, 1.6, 3.2, 6.4, 12.8 dn_max_retries: 7 # number of time to retry DN requests dn_retry_backoff_exp: 0.1 # backoff factor for retries @@ -99,3 +98,4 @@ data_cache_max_req_size: 128k # max size for rangeget fetches data_cache_expire_time: 3600 # expire cache items after one hour data_cache_page_size: 4m # page size for range get cache, set to zero to disable proxy data_cache_max_concurrent_read: 16 # maximum number of inflight storage read requests +domain_req_max_objects_limit: 500 # maximum number of objects to return in GET domain request with use_cache diff --git a/hsds/attr_sn.py b/hsds/attr_sn.py index c581a1ff..b2b4ec9f 100755 --- a/hsds/attr_sn.py +++ b/hsds/attr_sn.py @@ -18,8 +18,7 @@ from aiohttp.web import StreamResponse from json import JSONDecodeError -from .util.httpUtil import getHref -from .util.httpUtil import getAcceptType, jsonResponse +from .util.httpUtil import getAcceptType, jsonResponse, getHref from .util.idUtil import isValidUuid, getRootObjId from .util.authUtil import getUserPasswordFromRequest, validateUserPassword from .util.domainUtil import getDomainFromRequest, isValidDomain diff --git a/hsds/ctype_sn.py b/hsds/ctype_sn.py index 677f67ce..ad1941e7 100755 --- a/hsds/ctype_sn.py +++ b/hsds/ctype_sn.py @@ -16,7 +16,7 @@ from aiohttp.web_exceptions import HTTPBadRequest, HTTPGone from json import JSONDecodeError -from .util.httpUtil import http_post, http_delete, getHref, respJsonAssemble +from .util.httpUtil import http_post, getHref, respJsonAssemble from .util.httpUtil import jsonResponse from .util.idUtil import isValidUuid, getDataNodeUrl, createObjId from .util.authUtil import getUserPasswordFromRequest, aclCheck @@ -24,7 +24,7 @@ from .util.domainUtil import getDomainFromRequest, getPathForDomain, isValidDomain from .util.domainUtil import getBucketForDomain, verifyRoot from .util.hdf5dtype import validateTypeItem, getBaseTypeJson -from .servicenode_lib import getDomainJson, getObjectJson, validateAction +from .servicenode_lib import getDomainJson, getObjectJson, validateAction, deleteObj from .servicenode_lib import getObjectIdByPath, getPathForObjectId, putHardLink from . 
import hsds_logger as log @@ -242,8 +242,6 @@ async def DELETE_Datatype(request): """HTTP method to delete a committed type resource""" log.request(request) app = request.app - meta_cache = app["meta_cache"] - ctype_id = request.match_info.get("id") if not ctype_id: msg = "Missing committed type id" @@ -273,12 +271,7 @@ async def DELETE_Datatype(request): await validateAction(app, domain, ctype_id, username, "delete") - req = getDataNodeUrl(app, ctype_id) + "/datatypes/" + ctype_id - - await http_delete(app, req, params=params) - - if ctype_id in meta_cache: - del meta_cache[ctype_id] # remove from cache + await deleteObj(app, ctype_id, bucket=bucket) resp = await jsonResponse(request, {}) log.response(request, resp=resp) diff --git a/hsds/domain_crawl.py b/hsds/domain_crawl.py index f2d672b5..b23ea265 100644 --- a/hsds/domain_crawl.py +++ b/hsds/domain_crawl.py @@ -34,7 +34,7 @@ def __init__( max_objects_limit=0, raise_error=False ): - log.info(f"DomainCrawler.__init__ root_id: {len(objs)} objs") + log.info(f"DomainCrawler.__init__ action: {action} root_id: {len(objs)} objs") log.debug(f"params: {params}") self._app = app self._action = action @@ -127,7 +127,7 @@ async def put_attributes(self, obj_id, attr_items): async def get_obj_json(self, obj_id): """ get the given obj_json for the obj_id. - for each group found, search the links if include_links is set """ + for each group found, search the links if follow_links is set """ log.debug(f"get_obj_json: {obj_id}") collection = getCollectionForId(obj_id) kwargs = {} @@ -207,11 +207,16 @@ async def get_obj_json(self, obj_id): log.debug(f"DomainCrawler - adding link_id: {link_id}") self._obj_dict[link_id] = {} # placeholder for obj id self._q.put_nowait(link_id) + if not self._params.get("include_links"): + # don't keep the links + del obj_json["links"] async def get_links(self, grp_id, titles=None): """ if titles is set, get all the links in grp_id that have a title in the list. Otherwise, return all links for the object. 
""" - log.debug(f"get_links: {grp_id}, titles; {titles}") + log.debug(f"get_links: {grp_id}") + if titles: + log.debug(f"titles; {titles}") collection = getCollectionForId(grp_id) if collection != "groups": log.warn(f"get_links, expected groups id but got: {grp_id}") @@ -221,7 +226,6 @@ async def get_links(self, grp_id, titles=None): kwargs["titles"] = titles if self._params.get("bucket"): kwargs["bucket"] = self._params["bucket"] - if self._params.get("follow_links"): follow_links = True else: @@ -388,7 +392,6 @@ async def work(self): async def fetch(self, obj_id): log.debug(f"DomainCrawler fetch for id: {obj_id}") - log.debug(f"action: {self._action}") if self._action == "get_obj": log.debug("DomainCrawler - get obj") # just get the obj json @@ -427,7 +430,9 @@ async def fetch(self, obj_id): await self.put_attributes(obj_id, attr_items) elif self._action == "get_link": log.debug("DomainCrawlwer - get links") - if obj_id not in self._objs: + log.debug(f"self._objs: {self._objs}, type: {type(self._objs)}") + + if self._objs is None or obj_id not in self._objs: link_titles = None # fetch all links for this object else: link_titles = self._objs[obj_id] diff --git a/hsds/domain_sn.py b/hsds/domain_sn.py index 785a9528..bd1c33aa 100755 --- a/hsds/domain_sn.py +++ b/hsds/domain_sn.py @@ -23,9 +23,8 @@ from aiohttp.web_exceptions import HTTPConflict, HTTPServiceUnavailable from aiohttp import ClientResponseError from aiohttp.web import json_response -from requests.sessions import merge_setting -from .util.httpUtil import getObjectClass, http_post, http_put, http_get, http_delete +from .util.httpUtil import getObjectClass, http_post, http_put, http_delete from .util.httpUtil import getHref, respJsonAssemble from .util.httpUtil import jsonResponse from .util.idUtil import getDataNodeUrl, createObjId, getCollectionForId @@ -47,60 +46,54 @@ from . 
import config -async def get_collections(app, root_id, bucket=None): +async def get_collections(app, root_id, bucket=None, max_objects_limit=None): """Return the object ids for given root.""" log.info(f"get_collections for {root_id}") - groups = {} - datasets = {} - datatypes = {} - lookup_ids = set() - lookup_ids.add(root_id) - params = {"bucket": bucket} - - while lookup_ids: - grp_id = lookup_ids.pop() - req = getDataNodeUrl(app, grp_id) - req += "/groups/" + grp_id + "/links" - log.debug("collection get LINKS: " + req) - try: - # throws 404 if doesn't exist - links_json = await http_get(app, req, params=params) - except HTTPNotFound: - log.warn(f"get_collection, group {grp_id} not found") - continue - log.debug(f"got links json from dn for group_id: {grp_id}") - links = links_json["links"] - log.debug(f"get_collection: got links: {links}") - for link in links: - if link["class"] != "H5L_TYPE_HARD": - continue - link_id = link["id"] - obj_type = getCollectionForId(link_id) - if obj_type == "groups": - if link_id in groups: - continue # been here before - groups[link_id] = {} - lookup_ids.add(link_id) - elif obj_type == "datasets": - if link_id in datasets: - continue - datasets[link_id] = {} - elif obj_type == "datatypes": - if link_id in datatypes: - continue - datatypes[link_id] = {} - else: - msg = "get_collection: unexpected link object type: " - msg += f"{obj_type}" - log.error(merge_setting) - HTTPInternalServerError() + crawler_params = { + "include_attrs": False, + "include_links": False, + "bucket": bucket, + "follow_links": True, + } + + if max_objects_limit: + crawler_params["max_objects_limit"] = max_objects_limit + + crawler = DomainCrawler(app, [root_id, ], action="get_obj", params=crawler_params) + await crawler.crawl() + if max_objects_limit and len(crawler._obj_dict) >= max_objects_limit: + msg = "get_collections - too many objects: " + msg += f"{len(crawler._obj_dict)}, returning None" + log.info(msg) + return None + else: + msg = f"DomainCrawler returned: {len(crawler._obj_dict)} object ids" + log.info(msg) + + group_ids = set() + dataset_ids = set() + datatype_ids = set() + + for obj_id in crawler._obj_dict: + obj_type = getCollectionForId(obj_id) + if obj_type == "groups": + group_ids.add(obj_id) + elif obj_type == "datasets": + dataset_ids.add(obj_id) + elif obj_type == "datatypes": + datatype_ids.add(obj_id) + else: + log.warn(f"get_collections - unexpected id type: {obj_id}") + if root_id in group_ids: + group_ids.remove(root_id) # don't include the root id + print(f"get_collections - group_ids: {group_ids}") result = {} - result["groups"] = groups - result["datasets"] = datasets - result["datatypes"] = datatypes + result["groups"] = group_ids + result["datasets"] = dataset_ids + result["datatypes"] = datatype_ids return result @@ -114,9 +107,10 @@ async def getDomainObjects(app, root_id, include_attrs=False, bucket=None): crawler_params = { "include_attrs": include_attrs, - "bucket": bucket, + "include_links": True, "follow_links": True, "max_objects_limit": max_objects_limit, + "bucket": bucket, } crawler = DomainCrawler(app, [root_id, ], action="get_obj", params=crawler_params) @@ -263,15 +257,13 @@ async def get_domains(request): if pattern: # do a pattern match on the basename basename = op.basename(domain) - log.debug( - f"get_domains: checking {basename} against pattern: {pattern}" - ) + msg = f"get_domains: checking {basename} against pattern: {pattern}" + log.debug(msg) try: got_match = globmatch(basename, pattern) except ValueError as ve: - log.warn( - 
f"get_domains, invalid query pattern {pattern}, ValueError: {ve}" - ) + msg = f"get_domains, invalid query pattern {pattern}, ValueError: {ve}" + log.warn(msg) raise HTTPBadRequest(reason="invalid query pattern") if got_match: log.debug("get_domains - got_match") @@ -502,14 +494,14 @@ async def GET_Domain(request): h5path = params["h5path"] # select which object to perform path search under - root_id = parent_id if parent_id else domain_json["root"] + base_id = parent_id if parent_id else domain_json["root"] # getObjectIdByPath throws 404 if not found obj_id, domain, _ = await getObjectIdByPath( - app, root_id, h5path, bucket=bucket, domain=domain, + app, base_id, h5path, bucket=bucket, domain=domain, follow_soft_links=follow_soft_links, follow_external_links=follow_external_links) - log.info(f"get obj_id: {obj_id} from h5path: {h5path}") + log.info(f"got obj_id: {obj_id} from h5path: {h5path}") # get authoritative state for object from DN (even if # it's in the meta_cache). kwargs = {"refresh": True, "bucket": bucket, @@ -632,11 +624,14 @@ async def POST_Domain(request): params = request.rel_url.query log.debug(f"POST_Domain query params: {params}") + parent_id = None include_links = False include_attrs = False follow_soft_links = False follow_external_links = False + if "parent_id" in params and params["parent_id"]: + parent_id = params["parent_id"] if "include_links" in params and params["include_links"]: include_links = True if "include_attrs" in params and params["include_attrs"]: @@ -710,22 +705,34 @@ async def POST_Domain(request): log.error("No acls key found in domain") raise HTTPInternalServerError() - log.debug(f"got domain_json: {domain_json}") + if "root" not in domain_json: + msg = f"{domain} is a folder, not a domain" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + + root_id = domain_json["root"] + + # select which object to perform path search under + base_id = parent_id if parent_id else root_id + + log.debug(f"POST_Domain with h5paths: {h5paths} from: {base_id}") # validate that the requesting user has permission to read this domain # aclCheck throws exception if not authorized aclCheck(app, domain_json, "read", username) json_objs = {} + # TBD: the following could be made more efficient for + # cases where a large number of h5paths are given... for h5path in h5paths: - root_id = domain_json["root"] # getObjectIdByPath throws 404 if not found obj_id, domain, _ = await getObjectIdByPath( - app, root_id, h5path, bucket=bucket, domain=domain, + app, base_id, h5path, bucket=bucket, domain=domain, follow_soft_links=follow_soft_links, follow_external_links=follow_external_links) - log.info(f"get obj_id: {obj_id} from h5path: {h5path}") + + log.info(f"got obj_id: {obj_id} from h5path: {h5path}") # get authoritative state for object from DN (even if # it's in the meta_cache). 
kwargs = {"refresh": True, "bucket": bucket, diff --git a/hsds/dset_lib.py b/hsds/dset_lib.py index 5534d2b6..16e7c8a5 100755 --- a/hsds/dset_lib.py +++ b/hsds/dset_lib.py @@ -904,3 +904,22 @@ async def reduceShape(app, dset_json, shape_update, bucket=None): await removeChunks(app, delete_ids, bucket=bucket) else: log.info("no chunks need deletion for shape reduction") + + +async def deleteAllChunks(app, dset_id, bucket=None): + """ Delete any allocated chunks for the given dataset """ + + log.info(f"deleteAllChunks for {dset_id}") + + # get all chunk ids for chunks that have been allocated + chunk_ids = await getAllocatedChunkIds(app, dset_id, bucket=bucket) + chunk_ids.sort() + + if chunk_ids: + chunk_ids = list(chunk_ids) + chunk_ids.sort() + msg = f"deleteAllChunks for {dset_id} - these chunks will need to be deleted: {chunk_ids}" + log.debug(msg) + await removeChunks(app, chunk_ids, bucket=bucket) + else: + log.info(f"deleteAllChunks for {dset_id} - no chunks need deletion") diff --git a/hsds/dset_sn.py b/hsds/dset_sn.py index 92d09662..51a2c7ba 100755 --- a/hsds/dset_sn.py +++ b/hsds/dset_sn.py @@ -19,7 +19,7 @@ from aiohttp.web_exceptions import HTTPBadRequest, HTTPNotFound from aiohttp.web_exceptions import HTTPConflict, HTTPInternalServerError -from .util.httpUtil import http_post, http_put, http_delete, getHref, respJsonAssemble +from .util.httpUtil import http_post, http_put, getHref, respJsonAssemble from .util.httpUtil import jsonResponse from .util.idUtil import isValidUuid, getDataNodeUrl, createObjId, isSchema2Id from .util.dsetUtil import getPreviewQuery, getFilterItem, getShapeDims @@ -35,7 +35,8 @@ from .util.hdf5dtype import getItemSize from .servicenode_lib import getDomainJson, getObjectJson, getDsetJson, getPathForObjectId from .servicenode_lib import getObjectIdByPath, validateAction, getRootInfo, doFlush, putHardLink -from .dset_lib import reduceShape +from .servicenode_lib import deleteObj +from .dset_lib import reduceShape, deleteAllChunks from . import config from . import hsds_logger as log @@ -107,7 +108,7 @@ async def validateChunkLayout(app, shape_json, item_size, layout, bucket=None): # reference to a dataset in a traditional HDF5 files with # contigious storage if item_size == "H5T_VARIABLE": - # can't be used with variable types.. + # can't be used with variable types... 
msg = "Datsets with variable types cannot be used with " msg += "reference layouts" log.warn(msg) @@ -527,7 +528,6 @@ async def PUT_DatasetShape(request): shape_update = None extend = 0 extend_dim = 0 - hrefs = [] # tBD - definae HATEOS refs to return dset_id = request.match_info.get("id") if not dset_id: @@ -638,7 +638,7 @@ async def PUT_DatasetShape(request): if shape_update == dims: log.info("shape update is same as current dims, no action needed") - json_resp = {"hrefs:", hrefs} + json_resp = {} resp = await jsonResponse(request, json_resp, status=200) log.response(request, resp=resp) return resp @@ -671,7 +671,7 @@ async def PUT_DatasetShape(request): # send request onto DN req = getDataNodeUrl(app, dset_id) + "/datasets/" + dset_id + "/shape" - json_resp = {"hrefs": hrefs} + json_resp = {} params = {} if bucket: params["bucket"] = bucket @@ -1189,7 +1189,6 @@ async def DELETE_Dataset(request): """HTTP method to delete a dataset resource""" log.request(request) app = request.app - meta_cache = app["meta_cache"] dset_id = request.match_info.get("id") if not dset_id: @@ -1217,15 +1216,10 @@ async def DELETE_Dataset(request): await validateAction(app, domain, dset_id, username, "delete") - req = getDataNodeUrl(app, dset_id) + "/datasets/" + dset_id - - params = {} - if bucket: - params["bucket"] = bucket - await http_delete(app, req, params=params) + # free any allocated chunks + await deleteAllChunks(app, dset_id, bucket=bucket) - if dset_id in meta_cache: - del meta_cache[dset_id] # remove from cache + await deleteObj(app, dset_id, bucket=bucket) resp = await jsonResponse(request, {}) log.response(request, resp=resp) diff --git a/hsds/group_sn.py b/hsds/group_sn.py index fe98db98..8a8d54a4 100755 --- a/hsds/group_sn.py +++ b/hsds/group_sn.py @@ -16,14 +16,14 @@ from aiohttp.web_exceptions import HTTPBadRequest, HTTPForbidden, HTTPNotFound from json import JSONDecodeError -from .util.httpUtil import http_post, http_delete, getHref +from .util.httpUtil import http_post, getHref from .util.httpUtil import jsonResponse from .util.idUtil import isValidUuid, getDataNodeUrl, createObjId from .util.authUtil import getUserPasswordFromRequest, aclCheck from .util.authUtil import validateUserPassword from .util.domainUtil import getDomainFromRequest, isValidDomain from .util.domainUtil import getBucketForDomain, getPathForDomain, verifyRoot -from .servicenode_lib import getDomainJson, getObjectJson, validateAction +from .servicenode_lib import getDomainJson, getObjectJson, validateAction, deleteObj from .servicenode_lib import getObjectIdByPath, getPathForObjectId, putHardLink from . 
import hsds_logger as log @@ -242,7 +242,6 @@ async def DELETE_Group(request): """HTTP method to delete a group resource""" log.request(request) app = request.app - meta_cache = app["meta_cache"] group_id = request.match_info.get("id") if not group_id: @@ -277,17 +276,7 @@ async def DELETE_Group(request): log.warn(msg) raise HTTPForbidden() - req = getDataNodeUrl(app, group_id) - req += "/groups/" + group_id - params = {} - if bucket: - params["bucket"] = bucket - log.debug(f"http_delete req: {req} params: {params}") - - await http_delete(app, req, params=params) - - if group_id in meta_cache: - del meta_cache[group_id] # remove from cache + await deleteObj(app, group_id, bucket=bucket) resp = await jsonResponse(request, {}) log.response(request, resp=resp) diff --git a/hsds/link_dn.py b/hsds/link_dn.py index d61d33d0..974a4115 100755 --- a/hsds/link_dn.py +++ b/hsds/link_dn.py @@ -22,6 +22,7 @@ from aiohttp.web import json_response from .util.idUtil import isValidUuid +from .util.globparser import globmatch from .util.linkUtil import validateLinkName, getLinkClass, isEqualLink from .datanode_lib import get_obj_id, get_metadata_obj, save_metadata_obj from . import hsds_logger as log @@ -80,6 +81,10 @@ async def GET_Links(request): log.warn(msg) raise HTTPBadRequest(reason=msg) + pattern = None + if "pattern" in params: + pattern = params["pattern"] + group_json = await get_metadata_obj(app, group_id, bucket=bucket) log.debug(f"for id: {group_id} got group json: {group_json}") @@ -109,6 +114,16 @@ async def GET_Links(request): titles.sort() # sort by key log.debug(f"links by lexographic order: {titles}") + if pattern: + try: + titles = [x for x in titles if globmatch(x, pattern)] + except ValueError: + log.error(f"exception getting links using pattern: {pattern}") + raise HTTPBadRequest(reason=msg) + msg = f"getLinks with pattern: {pattern} returning {len(titles)} " + msg += f"links from {len(link_dict)}" + log.debug(msg) + start_index = 0 if marker is not None: start_index = _index(titles, marker, create_order=create_order) + 1 diff --git a/hsds/link_sn.py b/hsds/link_sn.py index a3cc48b7..9be0016b 100755 --- a/hsds/link_sn.py +++ b/hsds/link_sn.py @@ -16,8 +16,9 @@ from aiohttp.web_exceptions import HTTPBadRequest from json import JSONDecodeError -from .util.httpUtil import http_get, getHref +from .util.httpUtil import getHref from .util.httpUtil import jsonResponse +from .util.globparser import globmatch from .util.idUtil import isValidUuid, getDataNodeUrl, getCollectionForId from .util.authUtil import getUserPasswordFromRequest, validateUserPassword from .util.domainUtil import getDomainFromRequest, isValidDomain, verifyRoot @@ -45,21 +46,6 @@ async def GET_Links(request): msg = f"Invalid group id: {group_id}" log.warn(msg) raise HTTPBadRequest(reason=msg) - limit = None - create_order = False - if "CreateOrder" in params and params["CreateOrder"]: - create_order = True - - if "Limit" in params: - try: - limit = int(params["Limit"]) - except ValueError: - msg = "Bad Request: Expected int type for limit" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - marker = None - if "Marker" in params: - marker = params["Marker"] username, pswd = getUserPasswordFromRequest(request) if username is None and app["allow_noauth"]: @@ -76,31 +62,79 @@ async def GET_Links(request): await validateAction(app, domain, group_id, username, "read") - req = getDataNodeUrl(app, group_id) - req += "/groups/" + group_id + "/links" + kwargs = {"bucket": bucket} - params = {} - if create_order: - 
params["CreateOrder"] = 1 - if limit is not None: - params["Limit"] = str(limit) - if marker is not None: - params["Marker"] = marker - if bucket: - params["bucket"] = bucket - links_json = await http_get(app, req, params=params) - log.debug(f"got links json from dn for group_id: {group_id}") - links = links_json["links"] - - # mix in collection key, target and hrefs - for link in links: - if link["class"] == "H5L_TYPE_HARD": - collection_name = getCollectionForId(link["id"]) - link["collection"] = collection_name - target_uri = "/" + collection_name + "/" + link["id"] - link["target"] = getHref(request, target_uri) - link_uri = "/groups/" + group_id + "/links/" + link["title"] - link["href"] = getHref(request, link_uri) + if "follow_links" in params and params["follow_links"]: + follow_links = True + else: + follow_links = False + + if "pattern" in params and params["pattern"]: + pattern = params["pattern"] + try: + globmatch("abc", pattern) + except ValueError: + msg = f"invlaid pattern: {pattern} for link matching" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + log.debug(f"using pattern: {pattern} for GET_Links") + kwargs["pattern"] = pattern + else: + pattern = None + + if follow_links: + # Use DomainCrawler to fetch links from multiple objects. + # set the follow_links and bucket params + kwargs["follow_links"] = True + crawler_kwargs = {"action": "get_link", "raise_error": True, "params": kwargs} + items = [group_id, ] + crawler = DomainCrawler(app, items, **crawler_kwargs) + + # will raise exception on NotFound, etc. + await crawler.crawl() + + msg = f"DomainCrawler returned: {len(crawler._obj_dict)} objects" + log.info(msg) + links = crawler._obj_dict + if pattern: + for grp_id in links.keys(): + grp_links = links[grp_id] + ret_links = [] + for link in grp_links: + title = link["title"] + if globmatch(title, pattern): + ret_links.append(link) + links[grp_id] = ret_links + msg = f"getLinks for {grp_id}, matched {len((ret_links))} links " + msg += f"from {len(grp_links)} links with pattern {pattern}" + log.debug(msg) + else: + if "CreateOrder" in params and params["CreateOrder"]: + kwargs["create_order"] = True + if "Limit" in params: + try: + limit = int(params["Limit"]) + except ValueError: + msg = "Bad Request: Expected int type for limit" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + kwargs["limit"] = limit + if "Marker" in params: + kwargs["marker"] = params["Marker"] + + links = await getLinks(app, group_id, **kwargs) + + log.debug(f"got {len(links)} links json from dn for group_id: {group_id}") + + # mix in collection key, target and hrefs + for link in links: + if link["class"] == "H5L_TYPE_HARD": + collection_name = getCollectionForId(link["id"]) + link["collection"] = collection_name + target_uri = "/" + collection_name + "/" + link["id"] + link["target"] = getHref(request, target_uri) + link_uri = "/groups/" + group_id + "/links/" + link["title"] + link["href"] = getHref(request, link_uri) resp_json = {} resp_json["links"] = links diff --git a/hsds/servicenode_lib.py b/hsds/servicenode_lib.py index fad9d1ea..2e94fe40 100644 --- a/hsds/servicenode_lib.py +++ b/hsds/servicenode_lib.py @@ -342,13 +342,21 @@ async def getDsetJson(app, dset_id, return dset_json -async def getLinks(app, group_id, titles=None, bucket=None): +async def getLinks(app, group_id, + titles=None, + create_order=False, + limit=None, + marker=None, + pattern=None, + bucket=None): + """ Get the link jsons for the given titles """ req = getDataNodeUrl(app, group_id) req += "/groups/" + group_id 
+ "/links" params = {"bucket": bucket} log.debug(f"getLinks {group_id}") + if titles: # do a post request with the given title list log.debug(f"getLinks for {group_id} - {len(titles)} titles") @@ -362,6 +370,15 @@ async def getLinks(app, group_id, titles=None, bucket=None): else: # do a get for all links log.debug(f"getLinks, all links for {group_id}") + if create_order: + params["CreateOrder"] = 1 + if limit is not None: + params["Limit"] = str(limit) + if marker is not None: + params["Marker"] = marker + if pattern is not None: + params["pattern"] = pattern + get_rsp = await http_get(app, req, params=params) log.debug(f"got link_json: {get_rsp}") if "links" not in get_rsp: @@ -963,3 +980,21 @@ async def deleteAttributes(app, obj_id, attr_names=None, separator="/", bucket=N params["attr_names"] = attr_name_param log.debug(f"using params: {params}") await http_delete(app, req, params=params) + + +async def deleteObj(app, obj_id, bucket=None): + """ send delete request for group, datatype, or dataset obj """ + log.debug(f"deleteObj {obj_id}") + req = getDataNodeUrl(app, obj_id) + collection = getCollectionForId(obj_id) + req += f"/{collection}/{obj_id}" + params = {} + if bucket: + params["bucket"] = bucket + log.debug(f"http_delete req: {req} params: {params}") + + await http_delete(app, req, params=params) + + meta_cache = app["meta_cache"] + if obj_id in meta_cache: + del meta_cache[obj_id] # remove from cache diff --git a/tests/integ/domain_test.py b/tests/integ/domain_test.py index 1e03b3df..dbe21b50 100755 --- a/tests/integ/domain_test.py +++ b/tests/integ/domain_test.py @@ -203,6 +203,9 @@ def testPostDomainSingle(self): domainJson = json.loads(rsp.text) self.assertTrue("root" in domainJson) root_id = domainJson["root"] + g1_id = helper.getUUIDByPath(domain, "/g1", session=self.session) + g11_id = helper.getUUIDByPath(domain, "/g1/g1.1", session=self.session) + d111_id = helper.getUUIDByPath(domain, "/g1/g1.1/dset1.1.1", session=self.session) # Get group at /g1/g1.1 by using h5path data = {"h5paths": ["/g1/g1.1", ]} @@ -213,8 +216,21 @@ def testPostDomainSingle(self): rsp_paths = rspJson["h5paths"] self.assertTrue("/g1/g1.1" in rsp_paths) obj_json = rsp_paths["/g1/g1.1"] - g11id = helper.getUUIDByPath(domain, "/g1/g1.1", session=self.session) - self.assertEqual(g11id, obj_json["id"]) + self.assertEqual(g11_id, obj_json["id"]) + self.assertTrue("root" in obj_json) + self.assertEqual(root_id, obj_json["root"]) + + # Get dataset /g1/g1.1/dset1.1.1 with a relative path and parent_id g1 + params = {"parent_id": g1_id} + data = {"h5paths": ["g1.1/dset1.1.1", ]} + rsp = self.session.post(req, data=json.dumps(data), params=params, headers=headers) + self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + self.assertTrue("h5paths" in rspJson) + rsp_paths = rspJson["h5paths"] + self.assertTrue("g1.1/dset1.1.1" in rsp_paths) + obj_json = rsp_paths["g1.1/dset1.1.1"] + self.assertEqual(d111_id, obj_json["id"]) self.assertTrue("root" in obj_json) self.assertEqual(root_id, obj_json["root"]) @@ -265,14 +281,19 @@ def testGetByPath(self): self.assertTrue("root" in domainJson) root_id = domainJson["root"] + # get ids that we'll need later + g1_id = helper.getUUIDByPath(domain, "/g1", session=self.session) + g11_id = helper.getUUIDByPath(domain, "/g1/g1.1", session=self.session) + d111_id = helper.getUUIDByPath(domain, "/g1/g1.1/dset1.1.1", session=self.session) + # Get group at /g1/g1.1 by using h5path params = {"h5path": "/g1/g1.1"} rsp = self.session.get(req, headers=headers, 
params=params) self.assertEqual(rsp.status_code, 200) rspJson = json.loads(rsp.text) self.assertTrue("id" in rspJson) - g11id = helper.getUUIDByPath(domain, "/g1/g1.1", session=self.session) - self.assertEqual(g11id, rspJson["id"]) + + self.assertEqual(g11_id, rspJson["id"]) self.assertTrue("root" in rspJson) self.assertEqual(root_id, rspJson["root"]) @@ -282,10 +303,18 @@ def testGetByPath(self): self.assertEqual(rsp.status_code, 200) rspJson = json.loads(rsp.text) self.assertTrue("id" in rspJson) - d111id = helper.getUUIDByPath( - domain, "/g1/g1.1/dset1.1.1", session=self.session - ) - self.assertEqual(d111id, rspJson["id"]) + + self.assertEqual(d111_id, rspJson["id"]) + self.assertTrue("root" in rspJson) + self.assertEqual(root_id, rspJson["root"]) + + # get /g1/g1.1/dset1.1.1 using a relative path with parent id g1 + params = {"h5path": "g1.1/dset1.1.1", "parent_id": g1_id} + rsp = self.session.get(req, headers=headers, params=params) + self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + self.assertTrue("id" in rspJson) + self.assertEqual(d111_id, rspJson["id"]) self.assertTrue("root" in rspJson) self.assertEqual(root_id, rspJson["root"]) diff --git a/tests/integ/link_test.py b/tests/integ/link_test.py index faad65a7..045f54c1 100755 --- a/tests/integ/link_test.py +++ b/tests/integ/link_test.py @@ -442,7 +442,7 @@ def testGetLinks(self): def testGet(self): # test getting links from an existing domain domain = helper.getTestDomain("tall.h5") - print("testGetDomain", domain) + print("testGet", domain) headers = helper.getRequestHeaders(domain=domain) # verify domain exists @@ -507,6 +507,23 @@ def testGet(self): expected_uuid = helper.getUUIDByPath(domain, "/g1/g1.2/g1.2.1", session=self.session) self.assertEqual(expected_uuid, g1_2_1_uuid) + # do get with a regex pattern + params = {"pattern": "ext*"} + rsp = self.session.get(req, params=params, headers=headers) + self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + self.assertTrue("links" in rspJson) + links = rspJson["links"] + self.assertEqual(len(links), 1) # only extlink should be returned + link = links[0] + for name in ("created", "class", "h5domain", "h5path", "title", "href"): + self.assertTrue(name in link) + self.assertEqual(link["class"], "H5L_TYPE_EXTERNAL") + self.assertEqual(link["title"], "extlink") + self.assertEqual(link["h5domain"], "somefile") + self.assertEqual(link["h5path"], "somepath") + self.assertTrue(link["created"] < now - 10) + # get link by title req = helper.getEndpoint() + "/groups/" + g1_2_1_uuid + "/links/slink" rsp = self.session.get(req, headers=headers) @@ -529,6 +546,133 @@ def testGet(self): self.assertEqual(link["title"], "slink") self.assertEqual(link["h5path"], "somevalue") + def testGetRecursive(self): + # test getting links from an existing domain, following links + domain = helper.getTestDomain("tall.h5") + print("testGetRecursive", domain) + headers = helper.getRequestHeaders(domain=domain) + + # verify domain exists + req = helper.getEndpoint() + "/" + rsp = self.session.get(req, headers=headers) + if rsp.status_code != 200: + print(f"WARNING: Failed to get domain: {domain}. 
Is test data setup?") + return # abort rest of test + + rspJson = json.loads(rsp.text) + root_uuid = rspJson["root"] + self.assertTrue(root_uuid.startswith("g-")) + + # get links for root group and other groups recursively + req = helper.getEndpoint() + "/groups/" + root_uuid + "/links" + params = {"follow_links": 1} + rsp = self.session.get(req, params=params, headers=headers) + self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + self.assertTrue("hrefs" in rspJson) + hrefs = rspJson["hrefs"] + self.assertEqual(len(hrefs), 3) + self.assertTrue("links" in rspJson) + grp_links = rspJson["links"] + hardlink_count = 0 + softlink_count = 0 + extlink_count = 0 + expected_group_links = ("g1", "g2", "g1.1", "g1.2", "g1.2.1", ) + expected_dset_links = ("dset1.1.1", "dset1.1.2", "dset2.1", "dset2.2") + expected_soft_links = ("slink", ) + expected_external_links = ("extlink", ) + self.assertEqual(len(grp_links), 6) + for grp_id in grp_links: + helper.validateId(grp_id) + links = grp_links[grp_id] + for link in links: + self.assertTrue("title" in link) + link_title = link["title"] + self.assertTrue("class" in link) + link_class = link["class"] + if link_class == "H5L_TYPE_HARD": + hardlink_count += 1 + self.assertTrue("id" in link) + link_id = link["id"] + helper.validateId(link_id) + if link_id.startswith("g-"): + self.assertTrue(link_title in expected_group_links) + elif link_id.startswith("d-"): + self.assertTrue(link_title in expected_dset_links) + else: + self.assertTrue(False) # unexpected + elif link_class == "H5L_TYPE_SOFT": + softlink_count += 1 + self.assertTrue("h5path" in link) + self.assertFalse("h5domain" in link) + self.assertFalse("id" in link) + self.assertTrue(link_title in expected_soft_links) + elif link_class == "H5L_TYPE_EXTERNAL": + extlink_count += 1 + self.assertTrue("h5path" in link) + self.assertTrue("h5domain" in link) + self.assertFalse("id" in link) + self.assertTrue(link_title in expected_external_links) + else: + self.assertTrue(False) # unexpected + + self.assertEqual(hardlink_count, len(expected_dset_links) + len(expected_group_links)) + self.assertEqual(softlink_count, len(expected_soft_links)) + self.assertEqual(extlink_count, len(expected_external_links)) + + def testGetPattern(self): + # test getting links from an existing domain, with a regex filter + domain = helper.getTestDomain("tall.h5") + print("testGetPattern", domain) + headers = helper.getRequestHeaders(domain=domain) + + # verify domain exists + req = helper.getEndpoint() + "/" + rsp = self.session.get(req, headers=headers) + if rsp.status_code != 200: + print(f"WARNING: Failed to get domain: {domain}. 
Is test data setup?") + return # abort rest of test + + rspJson = json.loads(rsp.text) + root_uuid = rspJson["root"] + self.assertTrue(root_uuid.startswith("g-")) + + # get links for root group and other groups recursively + req = helper.getEndpoint() + "/groups/" + root_uuid + "/links" + params = {"follow_links": 1, "pattern": "dset*"} + rsp = self.session.get(req, params=params, headers=headers) + self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + self.assertTrue("hrefs" in rspJson) + hrefs = rspJson["hrefs"] + self.assertEqual(len(hrefs), 3) + self.assertTrue("links" in rspJson) + grp_links = rspJson["links"] + + expected_dset_links = ("dset1.1.1", "dset1.1.2", "dset2.1", "dset2.2") + + self.assertEqual(len(grp_links), 6) + link_count = 0 + + for grp_id in grp_links: + helper.validateId(grp_id) + links = grp_links[grp_id] + for link in links: + self.assertTrue("title" in link) + link_title = link["title"] + self.assertTrue(link_title in expected_dset_links) + self.assertTrue("class" in link) + link_class = link["class"] + # only hardlinks will be a match with this pattern + self.assertEqual(link_class, "H5L_TYPE_HARD") + link_count += 1 + self.assertTrue("id" in link) + link_id = link["id"] + helper.validateId(link_id) + self.assertTrue(link_id.startswith("d-")) # link to a dataset + + self.assertEqual(link_count, len(expected_dset_links)) + def testSoftLinkTraversal(self): # test that an object can be found via path with an external link # relative and absolute path @@ -1115,7 +1259,7 @@ def testPostLinkMultiple(self): obj_links = rspJson["links"] self.assertEqual(len(obj_links), 6) expected_group_links = ("g1", "g2", "g1.1", "g1.2", "g1.2.1", ) - expected_dset_links = ("dset1.2", "dset2.2", "dset1.1.1", "dset1.1.2", "dset2.1", ) + expected_dset_links = ("dset1.1.1", "dset1.1.2", "dset2.1", "dset2.2") expected_soft_links = ("slink", ) expected_external_links = ("extlink", ) From b1e3cf12d87b0939b674ae8203cda1a5af2f267e Mon Sep 17 00:00:00 2001 From: jreadey Date: Fri, 19 Jan 2024 03:56:51 +0000 Subject: [PATCH 11/18] update per PR comments --- hsds/dset_lib.py | 1 - tests/integ/link_test.py | 55 ++++++++++++++++++++++------------------ 2 files changed, 30 insertions(+), 26 deletions(-) diff --git a/hsds/dset_lib.py b/hsds/dset_lib.py index 16e7c8a5..ca7ec957 100755 --- a/hsds/dset_lib.py +++ b/hsds/dset_lib.py @@ -913,7 +913,6 @@ async def deleteAllChunks(app, dset_id, bucket=None): # get all chunk ids for chunks that have been allocated chunk_ids = await getAllocatedChunkIds(app, dset_id, bucket=bucket) - chunk_ids.sort() if chunk_ids: chunk_ids = list(chunk_ids) diff --git a/tests/integ/link_test.py b/tests/integ/link_test.py index 045f54c1..74179005 100755 --- a/tests/integ/link_test.py +++ b/tests/integ/link_test.py @@ -507,23 +507,6 @@ def testGet(self): expected_uuid = helper.getUUIDByPath(domain, "/g1/g1.2/g1.2.1", session=self.session) self.assertEqual(expected_uuid, g1_2_1_uuid) - # do get with a regex pattern - params = {"pattern": "ext*"} - rsp = self.session.get(req, params=params, headers=headers) - self.assertEqual(rsp.status_code, 200) - rspJson = json.loads(rsp.text) - self.assertTrue("links" in rspJson) - links = rspJson["links"] - self.assertEqual(len(links), 1) # only extlink should be returned - link = links[0] - for name in ("created", "class", "h5domain", "h5path", "title", "href"): - self.assertTrue(name in link) - self.assertEqual(link["class"], "H5L_TYPE_EXTERNAL") - self.assertEqual(link["title"], "extlink") - 
self.assertEqual(link["h5domain"], "somefile") - self.assertEqual(link["h5path"], "somepath") - self.assertTrue(link["created"] < now - 10) - # get link by title req = helper.getEndpoint() + "/groups/" + g1_2_1_uuid + "/links/slink" rsp = self.session.get(req, headers=headers) @@ -573,7 +556,7 @@ def testGetRecursive(self): hrefs = rspJson["hrefs"] self.assertEqual(len(hrefs), 3) self.assertTrue("links" in rspJson) - grp_links = rspJson["links"] + obj_map = rspJson["links"] # map of obj_ids to links hardlink_count = 0 softlink_count = 0 extlink_count = 0 @@ -581,10 +564,10 @@ def testGetRecursive(self): expected_dset_links = ("dset1.1.1", "dset1.1.2", "dset2.1", "dset2.2") expected_soft_links = ("slink", ) expected_external_links = ("extlink", ) - self.assertEqual(len(grp_links), 6) - for grp_id in grp_links: + self.assertEqual(len(obj_map), 6) + for grp_id in obj_map: helper.validateId(grp_id) - links = grp_links[grp_id] + links = obj_map[grp_id] for link in links: self.assertTrue("title" in link) link_title = link["title"] @@ -636,6 +619,28 @@ def testGetPattern(self): rspJson = json.loads(rsp.text) root_uuid = rspJson["root"] self.assertTrue(root_uuid.startswith("g-")) + # get the "/g1/g1.2" group id + g1_2_uuid = helper.getUUIDByPath(domain, "/g1/g1.2", session=self.session) + now = time.time() + + # do get with a regex pattern + # get links for /g1/g1.2: + req = helper.getEndpoint() + "/groups/" + g1_2_uuid + "/links" + params = {"pattern": "ext*"} + rsp = self.session.get(req, params=params, headers=headers) + self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + self.assertTrue("links" in rspJson) + links = rspJson["links"] + self.assertEqual(len(links), 1) # only extlink should be returned + link = links[0] + for name in ("created", "class", "h5domain", "h5path", "title", "href"): + self.assertTrue(name in link) + self.assertEqual(link["class"], "H5L_TYPE_EXTERNAL") + self.assertEqual(link["title"], "extlink") + self.assertEqual(link["h5domain"], "somefile") + self.assertEqual(link["h5path"], "somepath") + self.assertTrue(link["created"] < now - 10) # get links for root group and other groups recursively req = helper.getEndpoint() + "/groups/" + root_uuid + "/links" @@ -647,16 +652,16 @@ def testGetPattern(self): hrefs = rspJson["hrefs"] self.assertEqual(len(hrefs), 3) self.assertTrue("links" in rspJson) - grp_links = rspJson["links"] + obj_map = rspJson["links"] # map of grp ids to links expected_dset_links = ("dset1.1.1", "dset1.1.2", "dset2.1", "dset2.2") - self.assertEqual(len(grp_links), 6) + self.assertEqual(len(obj_map), 6) # 6 groups should be returned link_count = 0 - for grp_id in grp_links: + for grp_id in obj_map: helper.validateId(grp_id) - links = grp_links[grp_id] + links = obj_map[grp_id] for link in links: self.assertTrue("title" in link) link_title = link["title"] From df81b422b6f36c42c4d3a37756a5f3d8b9df9253 Mon Sep 17 00:00:00 2001 From: jreadey Date: Fri, 19 Jan 2024 13:42:56 +0000 Subject: [PATCH 12/18] added max_data_size for attr, pattern for link --- hsds/attr_dn.py | 46 ++++++++++++++-- hsds/attr_sn.py | 36 +++++++++++++ hsds/servicenode_lib.py | 7 +++ hsds/util/storUtil.py | 4 ++ tests/integ/attr_test.py | 110 +++++++++++++++++++++++++++++++++++++++ tests/integ/link_test.py | 4 +- 6 files changed, 202 insertions(+), 5 deletions(-) diff --git a/hsds/attr_dn.py b/hsds/attr_dn.py index 8dd44da3..1b5693c8 100755 --- a/hsds/attr_dn.py +++ b/hsds/attr_dn.py @@ -21,9 +21,10 @@ from .util.attrUtil import validateAttributeName, isEqualAttr from 
.util.hdf5dtype import getItemSize, createDataType +from .util.globparser import globmatch from .util.dsetUtil import getShapeDims from .util.arrayUtil import arrayToBytes, jsonToArray, decodeData -from .util.arrayUtil import bytesToArray, bytesArrayToList +from .util.arrayUtil import bytesToArray, bytesArrayToList, getNumElements from .datanode_lib import get_obj_id, get_metadata_obj, save_metadata_obj from . import hsds_logger as log @@ -43,7 +44,7 @@ def _index(items, marker, create_order=False): return -1 -def _getAttribute(attr_name, obj_json, include_data=True, encoding=None): +def _getAttribute(attr_name, obj_json, include_data=True, max_data_size=0, encoding=None): """ copy relevant fields from src to target """ if not isinstance(obj_json, dict): @@ -89,6 +90,26 @@ def _getAttribute(attr_name, obj_json, include_data=True, encoding=None): encoding = None log.debug("base64 encoding requested") + if include_data and max_data_size > 0: + # check that the size of the data is not greater than the limit + item_size = getItemSize(type_json) + if item_size == "H5T_VARIABLE": + # could be anything, just guess as 512 bytes per element + # TBD: determine exact size + item_size = 512 + dims = getShapeDims(shape_json) + num_elements = getNumElements(dims) + attr_size = item_size * num_elements + if attr_size > max_data_size: + msg = f"{attr_name} size of {attr_size} is " + msg += "larger than max_data_size, excluding data" + log.debug(msg) + include_data = False + else: + msg = f"{attr_name} size of {attr_size} is " + msg += "not larger than max_data_size, including data" + log.debug(msg) + if include_data: value_json = src_attr["value"] if "encoding" in src_attr: @@ -143,11 +164,18 @@ async def GET_Attributes(request): if params.get("IncludeData"): include_data = True + max_data_size = 0 + if params.get("max_data_size"): + max_data_size = int(params["max_data_size"]) + pattern = None + if params.get("pattern"): + pattern = params["pattern"] + limit = None if "Limit" in params: try: limit = int(params["Limit"]) - log.info("GET_Links - using Limit: {}".format(limit)) + log.info(f"GET_Attributes - using Limit: {limit}") except ValueError: msg = "Bad Request: Expected int type for limit" log.error(msg) # should be validated by SN @@ -204,7 +232,14 @@ async def GET_Attributes(request): attr_list = [] for i in range(start_index, end_index): attr_name = titles[i] + if pattern: + if not globmatch(attr_name, pattern): + log.debug(f"attr_name: {attr_name} did not match pattern: {pattern}") + continue + kwargs = {"include_data": include_data, "encoding": encoding} + if include_data: + kwargs["max_data_size"] = max_data_size log.debug(f"_getAttribute kwargs: {kwargs}") des_attr = _getAttribute(attr_name, obj_json, **kwargs) attr_list.append(des_attr) @@ -249,6 +284,9 @@ async def POST_Attributes(request): if "IncludeData" in params and params["IncludeData"]: include_data = True log.debug("include attr data") + max_data_size = 0 + if params.get("max_data_size"): + max_data_size = int(params["max_data_size"]) if params.get("encoding"): encoding = params["encoding"] log.debug("POST_Attributes requested base64 encoding") @@ -269,6 +307,8 @@ async def POST_Attributes(request): kwargs = {"include_data": include_data} if encoding: kwargs["encoding"] = encoding + if max_data_size > 0: + kwargs["max_data_size"] = max_data_size missing_names = set() diff --git a/hsds/attr_sn.py b/hsds/attr_sn.py index b2b4ec9f..0f21243c 100755 --- a/hsds/attr_sn.py +++ b/hsds/attr_sn.py @@ -19,6 +19,7 @@ from json import 
JSONDecodeError from .util.httpUtil import getAcceptType, jsonResponse, getHref +from .util.globparser import globmatch from .util.idUtil import isValidUuid, getRootObjId from .util.authUtil import getUserPasswordFromRequest, validateUserPassword from .util.domainUtil import getDomainFromRequest, isValidDomain @@ -60,6 +61,7 @@ async def GET_Attributes(request): ignore_nan = False include_data = True + max_data_size = 0 if "IncludeData" in params: IncludeData = params["IncludeData"] if not IncludeData or IncludeData == "0": @@ -67,6 +69,15 @@ async def GET_Attributes(request): kwargs["include_data"] = False log.debug(f"include_data: {include_data}") + if "max_data_size" in params: + try: + max_data_size = int(params["max_data_size"]) + except ValueError: + msg = "expected int for max_data_size" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + kwargs["max_data_size"] = max_data_size + if "ignore_nan" in params and params["ignore_nan"]: ignore_nan = True kwargs["ignore_nan"] = True @@ -88,6 +99,19 @@ async def GET_Attributes(request): marker = params["Marker"] kwargs["marker"] = marker + if "pattern" in params and params["pattern"]: + pattern = params["pattern"] + try: + globmatch("abc", pattern) + except ValueError: + msg = f"invlaid pattern: {pattern} for attribute matching" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + log.debug(f"using pattern: {pattern} for GET_Attributes") + kwargs["pattern"] = pattern + else: + pattern = None + username, pswd = getUserPasswordFromRequest(request) if username is None and app["allow_noauth"]: username = "default" @@ -1206,10 +1230,18 @@ async def POST_Attributes(request): params = request.rel_url.query log.debug(f"got params: {params}") include_data = True + max_data_size = 0 if "IncludeData" in params: IncludeData = params["IncludeData"] if not IncludeData or IncludeData == "0": include_data = False + if "max_data_size" in params: + try: + max_data_size = int(params["max_data_size"]) + except ValueError: + msg = "expected int for max_data_size" + log.warn(msg) + raise HTTPBadRequest(reason=msg) if params.get("ignore_nan"): ignore_nan = True @@ -1238,6 +1270,8 @@ async def POST_Attributes(request): kwargs = {"attr_names": attr_names, "bucket": bucket} if not include_data: kwargs["include_data"] = False + if max_data_size > 0: + kwargs["max_data_size"] = max_data_size if ignore_nan: kwargs["ignore_nan"] = True if encoding: @@ -1253,6 +1287,8 @@ async def POST_Attributes(request): # mixin params if not include_data: crawler_params["include_data"] = False + if max_data_size > 0: + crawler_params["max_data_size"] = max_data_size if ignore_nan: crawler_params["ignore_nan"] = True diff --git a/hsds/servicenode_lib.py b/hsds/servicenode_lib.py index 2e94fe40..bdb56c63 100644 --- a/hsds/servicenode_lib.py +++ b/hsds/servicenode_lib.py @@ -852,8 +852,10 @@ async def doFlush(app, root_id, bucket=None): async def getAttributes(app, obj_id, attr_names=None, include_data=True, + max_data_size=0, ignore_nan=False, create_order=False, + pattern=None, encoding=None, limit=0, marker=None, @@ -879,6 +881,8 @@ async def getAttributes(app, obj_id, params["CreateOrder"] = 1 if encoding: params["encoding"] = encoding + if max_data_size > 0: + params["max_data_size"] = max_data_size if attr_names: # send names via a POST request @@ -892,6 +896,9 @@ async def getAttributes(app, obj_id, params["Limit"] = limit if marker: params["Marker"] = marker + if pattern: + params["pattern"] = pattern + log.debug(f"using params: {params}") # do a get to fetch all the attributes 
dn_json = await http_get(app, req, params=params) diff --git a/hsds/util/storUtil.py b/hsds/util/storUtil.py index b80b7b3a..00fc6a9b 100644 --- a/hsds/util/storUtil.py +++ b/hsds/util/storUtil.py @@ -20,6 +20,7 @@ import numpy as np import numcodecs as codecs import bitshuffle +from json import JSONDecodeError from aiohttp.web_exceptions import HTTPInternalServerError from .. import hsds_logger as log @@ -389,6 +390,9 @@ async def getStorJSONObj(app, key, bucket=None): except UnicodeDecodeError: log.error(f"Error loading JSON at key: {key}") raise HTTPInternalServerError() + except JSONDecodeError: + log.error(f"unable to load json: {data}") + raise HTTPInternalServerError() msg = f"storage key {key} returned json object " msg += f"with {len(json_dict)} keys" diff --git a/tests/integ/attr_test.py b/tests/integ/attr_test.py index 7376f9a8..1f036720 100644 --- a/tests/integ/attr_test.py +++ b/tests/integ/attr_test.py @@ -2191,6 +2191,116 @@ def testDeleteAttributesMultiple(self): rsp = self.session.get(req, headers=headers) self.assertEqual(rsp.status_code, 410) + def testMaxDataSize(self): + domain = helper.getTestDomain("tall.h5") + print("testMaxDataSize", domain) + headers = helper.getRequestHeaders(domain=domain) + + # verify domain exists + req = helper.getEndpoint() + "/" + rsp = self.session.get(req, headers=headers) + if rsp.status_code != 200: + msg = f"WARNING: Failed to get domain: {domain}. Is test data setup?" + print(msg) + return # abort rest of test + domainJson = json.loads(rsp.text) + root_id = domainJson["root"] + helper.validateId(root_id) + + attr_names = ["attr1", "attr2"] + + req = helper.getEndpoint() + "/groups/" + root_id + "/attributes" + params = {"IncludeData": 1} + + for max_data_size in (0, 10): + params["max_data_size"] = max_data_size + rsp = self.session.get(req, params=params, headers=headers) + self.assertEqual(rsp.status_code, 200) + + rspJson = json.loads(rsp.text) + self.assertTrue("attributes" in rspJson) + attributes = rspJson["attributes"] + self.assertTrue(isinstance(attributes, list)) + + self.assertEqual(len(attributes), len(attr_names)) + + for i in range(len(attr_names)): + attrJson = attributes[i] + self.assertTrue("name" in attrJson) + attr_name = attrJson["name"] + self.assertEqual(attr_name, attr_names[i]) + self.assertTrue("type" in attrJson) + self.assertTrue("shape" in attrJson) + shapeJson = attrJson["shape"] + self.assertEqual(shapeJson["class"], "H5S_SIMPLE") + self.assertTrue("created" in attrJson) + if max_data_size == 0 or attr_name == "attr1": + self.assertTrue("value" in attrJson) + else: + self.assertFalse("value" in attrJson) + + # do the same thing with a post request + data = {"attr_names": ["attr1", "attr2", ]} + for max_data_size in (0, 10): + params["max_data_size"] = max_data_size + rsp = self.session.post(req, data=json.dumps(data), params=params, headers=headers) + self.assertEqual(rsp.status_code, 200) + + rspJson = json.loads(rsp.text) + self.assertTrue("attributes" in rspJson) + attributes = rspJson["attributes"] + self.assertTrue(isinstance(attributes, list)) + + self.assertEqual(len(attributes), len(attr_names)) + + for i in range(len(attr_names)): + attrJson = attributes[i] + self.assertTrue("name" in attrJson) + attr_name = attrJson["name"] + self.assertEqual(attr_name, attr_names[i]) + self.assertTrue("type" in attrJson) + self.assertTrue("shape" in attrJson) + shapeJson = attrJson["shape"] + self.assertEqual(shapeJson["class"], "H5S_SIMPLE") + self.assertTrue("created" in attrJson) + if max_data_size == 0 or 
attr_name == "attr1": + self.assertTrue("value" in attrJson) + else: + self.assertFalse("value" in attrJson) + + def testGetPattern(self): + # test getting attributes from an existing domain, with a glob filter + domain = helper.getTestDomain("tall.h5") + print("testGetPattern", domain) + headers = helper.getRequestHeaders(domain=domain) + + # verify domain exists + req = helper.getEndpoint() + "/" + rsp = self.session.get(req, headers=headers) + if rsp.status_code != 200: + print(f"WARNING: Failed to get domain: {domain}. Is test data setup?") + return # abort rest of test + + rspJson = json.loads(rsp.text) + root_uuid = rspJson["root"] + self.assertTrue(root_uuid.startswith("g-")) + # get the "/g1/g1.1/dset1.1.1" dset id + d111_uuid = helper.getUUIDByPath(domain, "/g1/g1.1/dset1.1.1", session=self.session) + + # do get with a glob pattern + req = helper.getEndpoint() + "/datasets/" + d111_uuid + "/attributes" + params = {"pattern": "*1"} + rsp = self.session.get(req, params=params, headers=headers) + self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + self.assertTrue("attributes" in rspJson) + attributes = rspJson["attributes"] + self.assertEqual(len(attributes), 1) # only attr1 should be returned + attr = attributes[0] + for name in ("created", "type", "shape", "value", "name", "href"): + self.assertTrue(name in attr) + self.assertEqual(attr["name"], "attr1") + if __name__ == "__main__": # setup test files diff --git a/tests/integ/link_test.py b/tests/integ/link_test.py index 74179005..aa23e481 100755 --- a/tests/integ/link_test.py +++ b/tests/integ/link_test.py @@ -564,7 +564,7 @@ def testGetRecursive(self): expected_dset_links = ("dset1.1.1", "dset1.1.2", "dset2.1", "dset2.2") expected_soft_links = ("slink", ) expected_external_links = ("extlink", ) - self.assertEqual(len(obj_map), 6) + self.assertEqual(len(obj_map), 6) for grp_id in obj_map: helper.validateId(grp_id) links = obj_map[grp_id] @@ -604,7 +604,7 @@ def testGetRecursive(self): self.assertEqual(extlink_count, len(expected_external_links)) def testGetPattern(self): - # test getting links from an existing domain, with a regex filter + # test getting links from an existing domain, with a glob filter domain = helper.getTestDomain("tall.h5") print("testGetPattern", domain) headers = helper.getRequestHeaders(domain=domain) From 90b9ec4e6547098eefa83801b39acfe5faa77208 Mon Sep 17 00:00:00 2001 From: jreadey Date: Mon, 22 Jan 2024 03:06:27 +0000 Subject: [PATCH 13/18] added follow_links for get attrs --- hsds/attr_sn.py | 71 ++++++++++++++----- hsds/domain_crawl.py | 146 +++++++++++++++++++++++---------------- tests/integ/attr_test.py | 48 +++++++++++++ 3 files changed, 188 insertions(+), 77 deletions(-) diff --git a/hsds/attr_sn.py b/hsds/attr_sn.py index 0f21243c..c1f7164c 100755 --- a/hsds/attr_sn.py +++ b/hsds/attr_sn.py @@ -57,11 +57,29 @@ async def GET_Attributes(request): log.warn(msg) raise HTTPBadRequest(reason=msg) + domain = getDomainFromRequest(request) + if not isValidDomain(domain): + msg = f"Invalid domain: {domain}" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + kwargs = {} + bucket = getBucketForDomain(domain) + log.debug(f"bucket: {bucket}") + kwargs["bucket"] = bucket ignore_nan = False include_data = True max_data_size = 0 + if "follow_links" in params and params["follow_links"]: + if collection != "groups": + msg = "follow_links can only be used with group ids" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + follow_links = True + else: + follow_links = False + 
log.debug(f"getAttributes follow_links: {follow_links}") if "IncludeData" in params: IncludeData = params["IncludeData"] if not IncludeData or IncludeData == "0": @@ -98,6 +116,15 @@ async def GET_Attributes(request): if "Marker" in params: marker = params["Marker"] kwargs["marker"] = marker + encoding = None + if "encoding" in params: + encoding = params["encoding"] + if params["encoding"] != "base64": + msg = "only base64 encoding is supported" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + encoding = "base64" + kwargs["encoding"] = encoding if "pattern" in params and params["pattern"]: pattern = params["pattern"] @@ -118,26 +145,38 @@ async def GET_Attributes(request): else: await validateUserPassword(app, username, pswd) - domain = getDomainFromRequest(request) - if not isValidDomain(domain): - msg = f"Invalid domain: {domain}" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - bucket = getBucketForDomain(domain) - log.debug(f"bucket: {bucket}") - kwargs["bucket"] = bucket - await validateAction(app, domain, obj_id, username, "read") - attributes = await getAttributes(app, obj_id, **kwargs) + if follow_links: + crawler_params = {"follow_links": True, "bucket": bucket} + # mixin params + if not include_data: + crawler_params["include_data"] = False + if max_data_size > 0: + crawler_params["max_data_size"] = max_data_size + if ignore_nan: + crawler_params["ignore_nan"] = True + if encoding: + crawler_params["encoding"] = encoding - log.debug(f"got attributes json from dn for obj_id: {obj_id}") + kwargs = {"action": "get_attr", "raise_error": True, "params": crawler_params} + items = [obj_id, ] + crawler = DomainCrawler(app, items, **kwargs) + # will raise exception on NotFound, etc. + await crawler.crawl() + attributes = crawler._obj_dict + msg = f"DomainCrawler returned: {len(attributes)} objects" + log.info(msg) + else: + # just get attributes for this objects + attributes = await getAttributes(app, obj_id, **kwargs) + log.debug(f"got attributes json from dn for obj_id: {obj_id}") - # mixin hrefs - for attribute in attributes: - attr_name = attribute["name"] - attr_href = f"/{collection}/{obj_id}/attributes/{attr_name}" - attribute["href"] = getHref(request, attr_href) + # mixin hrefs + for attribute in attributes: + attr_name = attribute["name"] + attr_href = f"/{collection}/{obj_id}/attributes/{attr_name}" + attribute["href"] = getHref(request, attr_href) resp_json = {} resp_json["attributes"] = attributes diff --git a/hsds/domain_crawl.py b/hsds/domain_crawl.py index b23ea265..1f37b151 100644 --- a/hsds/domain_crawl.py +++ b/hsds/domain_crawl.py @@ -34,7 +34,7 @@ def __init__( max_objects_limit=0, raise_error=False ): - log.info(f"DomainCrawler.__init__ action: {action} root_id: {len(objs)} objs") + log.info(f"DomainCrawler.__init__ action: {action} - {len(objs)} objs") log.debug(f"params: {params}") self._app = app self._action = action @@ -57,6 +57,55 @@ def __init__( else: self._objs = None + def follow_links(self, grp_id, links): + # add any linked obj ids to the lookup ids set + log.debug(f"follow links for {grp_id}") + if getCollectionForId(grp_id) != "groups": + log.warn(f"expected group id but got: {grp_id}") + return + link_count = 0 + for link in links: + log.debug(f"DomainCrawler - follow links for: {link}") + if isinstance(link, str): + # we were passed a dict of link titles to link_jsons + title = link + link_obj = links[title] + else: + # were passed a list of link jsons + if "title" not in link: + log.warn(f"expected to find title key in link: {link}") + 
continue + title = link["title"] + link_obj = link + log.debug(f"link {title}: {link_obj}") + if link_obj["class"] != "H5L_TYPE_HARD": + # just follow hardlinks + log.debug("not hard link, continue") + continue + link_id = link_obj["id"] + link_collection = getCollectionForId(link_id) + if self._action in ("get_link", "put_link") and link_collection != "groups": + # only groups can have links + log.debug(f"link id: {link_id} is not for a group, continue") + continue + num_objects = len(self._obj_dict) + if self._params.get("max_objects_limit") is not None: + max_objects_limit = self._params["max_objects_limit"] + if num_objects >= max_objects_limit: + msg = "DomainCrawler reached limit of " + msg += f"{max_objects_limit}" + log.info(msg) + break + if link_id not in self._obj_dict: + # haven't seen this object yet, get obj json + log.debug(f"DomainCrawler - adding link_id: {link_id} to queue") + self._obj_dict[link_id] = {} # placeholder for obj id + self._q.put_nowait(link_id) + link_count += 1 + else: + log.debug(f"link: {link_id} already in object dict") + log.debug(f"follow links done, added {link_count} ids to queue") + async def get_attributes(self, obj_id, attr_names): # get the given attributes for the obj_id msg = f"get_attributes for {obj_id}" @@ -98,6 +147,32 @@ async def get_attributes(self, obj_id, attr_names): log.warn(f"Domain crawler - got {status} status for obj_id {obj_id}") self._obj_dict[obj_id] = {"status": status} + collection = getCollectionForId(obj_id) + follow_links = self._params.get("follow_links") + bucket = self._params.get("bucket") + if collection == "groups" and follow_links: + links = None + status = 200 + try: + links = await getLinks(self._app, obj_id, bucket=bucket) + except HTTPNotFound: + status = 404 + except HTTPServiceUnavailable: + status = 503 + except HTTPInternalServerError: + status = 500 + except Exception as e: + log.error(f"unexpected exception {e}") + status = 500 + + if status >= 500: + log.warn(f"getLinks for {obj_id} returned: {status}") + elif links: + log.debug(f"follow_links for: {links}") + self.follow_links(obj_id, links) + else: + log.debug(f"no links for {obj_id}") + async def put_attributes(self, obj_id, attr_items): # write the given attributes for the obj_id log.debug(f"put_attributes for {obj_id}, {len(attr_items)} attributes") @@ -179,34 +254,16 @@ async def get_obj_json(self, obj_id): # for groups iterate through all the hard links and # add to the lookup ids set - log.debug(f"gotCollection: {collection}") + log.debug(f"gotCollection: {collection}, follow_links: {follow_links}") if collection == "groups" and follow_links: if "links" not in obj_json: log.error("expected links key in obj_json") return links = obj_json["links"] - log.debug(f"DomainCrawler links: {links}") - for title in links: - log.debug(f"DomainCrawler - got link: {title}") - link_obj = links[title] - num_objects = len(self._obj_dict) - if self._params.get("max_objects_limit") is not None: - max_objects_limit = self._params["max_objects_limit"] - if num_objects >= max_objects_limit: - msg = "DomainCrawler reached limit of " - msg += f"{max_objects_limit}" - log.info(msg) - break - if link_obj["class"] != "H5L_TYPE_HARD": - # just follow hardlinks - continue - link_id = link_obj["id"] - if link_id not in self._obj_dict: - # haven't seen this object yet, get obj json - log.debug(f"DomainCrawler - adding link_id: {link_id}") - self._obj_dict[link_id] = {} # placeholder for obj id - self._q.put_nowait(link_id) + log.debug(f"follow_links for: {links}") + 
self.follow_links(obj_id, links) + if not self._params.get("include_links"): # don't keep the links del obj_json["links"] @@ -267,37 +324,7 @@ async def get_links(self, grp_id, titles=None): # if follow_links, add any group links to the lookup ids set if follow_links: log.debug(f"follow links for {grp_id}") - for link_obj in links: - log.debug(f"follow links for: {link_obj}") - if 'title' not in link_obj: - log.warn(f"expected to find title in link_json: {link_obj}") - continue - title = link_obj["title"] - log.debug(f"DomainCrawler - got link: {title}") - num_objects = len(self._obj_dict) - if self._params.get("max_objects_limit") is not None: - max_objects_limit = self._params["max_objects_limit"] - if num_objects >= max_objects_limit: - msg = "DomainCrawler reached limit of " - msg += f"{max_objects_limit}" - log.info(msg) - break - if link_obj["class"] != "H5L_TYPE_HARD": - # just follow hardlinks - log.debug("not hard link,continue") - continue - link_id = link_obj["id"] - if getCollectionForId(link_id) != "groups": - # only groups can have links - log.debug(f"link id: {link_id} is not for a group, continue") - continue - if link_id not in self._obj_dict: - # haven't seen this object yet, get obj json - log.debug(f"DomainCrawler - adding link_id: {link_id} to queue") - self._obj_dict[link_id] = {} # placeholder for obj id - self._q.put_nowait(link_id) - else: - log.debug(f"link: {link_id} already in object dict") + self.follow_links(grp_id, links) async def put_links(self, grp_id, link_items): # write the given links for the obj_id @@ -399,13 +426,10 @@ async def fetch(self, obj_id): elif self._action == "get_attr": log.debug("DomainCrawler - get attributes") # fetch the given attributes - if self._objs is None: - log.error("DomainCrawler - self._objs not set") - return - if obj_id not in self._objs: - log.error(f"couldn't find {obj_id} in self._objs") - return - attr_names = self._objs[obj_id] + if self._objs is None or obj_id not in self._objs: + attr_names = None # fetch all attributes for obj_id + else: + attr_names = self._objs[obj_id] if attr_names is None: log.debug(f"fetch all attributes for {obj_id}") else: diff --git a/tests/integ/attr_test.py b/tests/integ/attr_test.py index 1f036720..1066c52a 100644 --- a/tests/integ/attr_test.py +++ b/tests/integ/attr_test.py @@ -2301,6 +2301,54 @@ def testGetPattern(self): self.assertTrue(name in attr) self.assertEqual(attr["name"], "attr1") + def testGetRecursive(self): + # test getting all attributes from an existing domain + domain = helper.getTestDomain("tall.h5") + print("testGetRecursive", domain) + headers = helper.getRequestHeaders(domain=domain) + + # verify domain exists + req = helper.getEndpoint() + "/" + rsp = self.session.get(req, headers=headers) + if rsp.status_code != 200: + print(f"WARNING: Failed to get domain: {domain}. 
Is test data setup?") + return # abort rest of test + + rspJson = json.loads(rsp.text) + root_uuid = rspJson["root"] + self.assertTrue(root_uuid.startswith("g-")) + # get the "/g1/g1.1/dset1.1.1" dset id + d111_uuid = helper.getUUIDByPath(domain, "/g1/g1.1/dset1.1.1", session=self.session) + + # do get with follow_links + req = helper.getEndpoint() + "/groups/" + root_uuid + "/attributes" + params = {"follow_links": "1"} + rsp = self.session.get(req, params=params, headers=headers) + self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + self.assertTrue("attributes" in rspJson) + obj_map = rspJson["attributes"] + self.assertEqual(len(obj_map), 10) + attr_count = 0 + for obj_id in obj_map: + attr_count += len(obj_map[obj_id]) + self.assertEqual(attr_count, 4) + for obj_id in (root_uuid, d111_uuid): + # these are the only two objects with attributes + self.assertTrue(obj_id in obj_map) + obj_attrs = obj_map[obj_id] + self.assertEqual(len(obj_attrs), 2) + for attrJson in obj_attrs: + self.assertTrue("name" in attrJson) + attr_name = attrJson["name"] + self.assertTrue(attr_name in ("attr1", "attr2")) + self.assertTrue("type" in attrJson) + self.assertTrue("shape" in attrJson) + shapeJson = attrJson["shape"] + self.assertEqual(shapeJson["class"], "H5S_SIMPLE") + self.assertTrue("created" in attrJson) + self.assertTrue("value" in attrJson) + if __name__ == "__main__": # setup test files From f07794d20e112e577ced151913227fde86b05b16 Mon Sep 17 00:00:00 2001 From: jreadey Date: Mon, 22 Jan 2024 08:05:08 +0000 Subject: [PATCH 14/18] DomainCrawler refactor --- hsds/attr_sn.py | 124 +++++++++++++++++++++------------------ hsds/domain_crawl.py | 100 +++++++++++++++---------------- hsds/domain_sn.py | 15 ++--- hsds/link_sn.py | 73 ++++++++++++++--------- hsds/servicenode_lib.py | 9 +-- tests/integ/attr_test.py | 1 + 6 files changed, 173 insertions(+), 149 deletions(-) diff --git a/hsds/attr_sn.py b/hsds/attr_sn.py index c1f7164c..c8259f2a 100755 --- a/hsds/attr_sn.py +++ b/hsds/attr_sn.py @@ -63,14 +63,9 @@ async def GET_Attributes(request): log.warn(msg) raise HTTPBadRequest(reason=msg) - kwargs = {} bucket = getBucketForDomain(domain) log.debug(f"bucket: {bucket}") - kwargs["bucket"] = bucket - ignore_nan = False - include_data = True - max_data_size = 0 if "follow_links" in params and params["follow_links"]: if collection != "groups": msg = "follow_links can only be used with group ids" @@ -80,11 +75,11 @@ async def GET_Attributes(request): else: follow_links = False log.debug(f"getAttributes follow_links: {follow_links}") + include_data = True if "IncludeData" in params: IncludeData = params["IncludeData"] if not IncludeData or IncludeData == "0": include_data = False - kwargs["include_data"] = False log.debug(f"include_data: {include_data}") if "max_data_size" in params: @@ -94,16 +89,19 @@ async def GET_Attributes(request): msg = "expected int for max_data_size" log.warn(msg) raise HTTPBadRequest(reason=msg) - kwargs["max_data_size"] = max_data_size + else: + max_data_size = 0 if "ignore_nan" in params and params["ignore_nan"]: ignore_nan = True - kwargs["ignore_nan"] = True + else: + ignore_nan = False if "CreateOrder" in params and params["CreateOrder"]: - kwargs["create_order"] = True + create_order = True + else: + create_order = False - limit = None if "Limit" in params: try: limit = int(params["Limit"]) @@ -111,12 +109,12 @@ async def GET_Attributes(request): msg = "Bad Request: Expected int type for limit" log.warn(msg) raise HTTPBadRequest(reason=msg) - kwargs["limit"] 
= limit - marker = None + else: + limit = None if "Marker" in params: marker = params["Marker"] - kwargs["marker"] = marker - encoding = None + else: + marker = None if "encoding" in params: encoding = params["encoding"] if params["encoding"] != "base64": @@ -124,7 +122,8 @@ async def GET_Attributes(request): log.warn(msg) raise HTTPBadRequest(reason=msg) encoding = "base64" - kwargs["encoding"] = encoding + else: + encoding = None if "pattern" in params and params["pattern"]: pattern = params["pattern"] @@ -135,7 +134,6 @@ async def GET_Attributes(request): log.warn(msg) raise HTTPBadRequest(reason=msg) log.debug(f"using pattern: {pattern} for GET_Attributes") - kwargs["pattern"] = pattern else: pattern = None @@ -148,18 +146,15 @@ async def GET_Attributes(request): await validateAction(app, domain, obj_id, username, "read") if follow_links: - crawler_params = {"follow_links": True, "bucket": bucket} + # setup kwargs for DomainCrawler + kwargs = {"action": "get_attr", "follow_links": True, "bucket": bucket} # mixin params - if not include_data: - crawler_params["include_data"] = False + if include_data: + kwargs["include_data"] = True if max_data_size > 0: - crawler_params["max_data_size"] = max_data_size + kwargs["max_data_size"] = max_data_size if ignore_nan: - crawler_params["ignore_nan"] = True - if encoding: - crawler_params["encoding"] = encoding - - kwargs = {"action": "get_attr", "raise_error": True, "params": crawler_params} + kwargs["ignore_nan"] = True items = [obj_id, ] crawler = DomainCrawler(app, items, **kwargs) # will raise exception on NotFound, etc. @@ -169,6 +164,23 @@ async def GET_Attributes(request): log.info(msg) else: # just get attributes for this objects + kwargs = {"bucket": bucket} + if include_data: + kwargs["include_data"] = True + if max_data_size > 0: + kwargs["max_data_size"] = max_data_size + if ignore_nan: + kwargs["ignore_nan"] = True + if limit: + kwargs["limit"] = limit + if marker: + kwargs["marker"] = marker + if encoding: + kwargs["encoding"] = encoding + if pattern: + kwargs["pattern"] = pattern + if create_order: + kwargs["create_order"] = True attributes = await getAttributes(app, obj_id, **kwargs) log.debug(f"got attributes json from dn for obj_id: {obj_id}") @@ -622,7 +634,6 @@ async def PUT_Attributes(request): msg = "PUT Attribute with no body" log.warn(msg) raise HTTPBadRequest(reason=msg) - try: body = await request.json() except JSONDecodeError: @@ -637,6 +648,10 @@ async def PUT_Attributes(request): raise HTTPBadRequest(reason=msg) bucket = getBucketForDomain(domain) log.debug(f"got bucket: {bucket}") + if "replace" in params and params["replace"]: + replace = True + else: + replace = False # get domain JSON domain_json = await getDomainJson(app, domain) @@ -716,10 +731,6 @@ async def PUT_Attributes(request): await validateAction(app, domain, req_obj_id, username, "create") - kwargs = {"bucket": bucket} - if params.get("replace"): - kwargs["replace"] = True - count = len(obj_ids) if count == 0: msg = "no obj_ids defined" @@ -730,18 +741,17 @@ async def PUT_Attributes(request): obj_id = list(obj_ids.keys())[0] attr_json = obj_ids[obj_id] log.debug(f"got attr_json: {attr_json}") + kwargs = {"bucket": bucket, "attr_json": attr_json} + if replace: + kwargs["replace"] = True - status = await putAttributes(app, obj_id, attr_json, **kwargs) + status = await putAttributes(app, obj_id, **kwargs) else: # put multi obj - - # mixin some additonal kwargs - crawler_params = {"follow_links": False} - if bucket: - crawler_params["bucket"] = bucket - - 
kwargs = {"action": "put_attr", "raise_error": True, "params": crawler_params} + kwargs = {"action": "put_attr", "bucket": bucket} + if replace: + kwargs["replace"] = True crawler = DomainCrawler(app, obj_ids, **kwargs) # will raise exception on not found, server busy, etc. @@ -860,7 +870,7 @@ async def GET_AttributeValue(request): encoding = None attr_names = [attr_name, ] - kwargs = {"attr_names": attr_names, "bucket": bucket} + kwargs = {"attr_names": attr_names, "bucket": bucket, "include_data": True} if ignore_nan: kwargs["ignore_nan"] = True @@ -1268,12 +1278,14 @@ async def POST_Attributes(request): params = request.rel_url.query log.debug(f"got params: {params}") - include_data = True + include_data = False max_data_size = 0 if "IncludeData" in params: IncludeData = params["IncludeData"] - if not IncludeData or IncludeData == "0": - include_data = False + log.debug(f"got IncludeData: [{IncludeData}], type: {type(IncludeData)}") + if IncludeData and IncludeData != "0": + include_data = True + log.debug(f"include_data: {include_data}") if "max_data_size" in params: try: max_data_size = int(params["max_data_size"]) @@ -1307,35 +1319,35 @@ async def POST_Attributes(request): obj_id = list(items.keys())[0] attr_names = items[obj_id] kwargs = {"attr_names": attr_names, "bucket": bucket} - if not include_data: - kwargs["include_data"] = False + if include_data: + log.debug("setting include_data to True") + kwargs["include_data"] = True if max_data_size > 0: kwargs["max_data_size"] = max_data_size if ignore_nan: kwargs["ignore_nan"] = True if encoding: kwargs["encoding"] = encoding - + log.debug(f"getAttributes kwargs: {kwargs}") attributes = await getAttributes(app, obj_id, **kwargs) resp_json["attributes"] = attributes else: # get multi obj # don't follow links! - crawler_params = {"follow_links": False, "bucket": bucket} - # mixin params - if not include_data: - crawler_params["include_data"] = False + kwargs = {"action": "get_attr", "bucket": bucket, "follow_links": False} + kwargs["include_attrs"] = True + if include_data: + log.debug("setting include_data to True") + kwargs["include_data"] = True if max_data_size > 0: - crawler_params["max_data_size"] = max_data_size - + kwargs["max_data_size"] = max_data_size if ignore_nan: - crawler_params["ignore_nan"] = True - + kwargs["ignore_nan"] = True if encoding: - crawler_params["encoding"] = encoding - - kwargs = {"action": "get_attr", "raise_error": True, "params": crawler_params} + pass + # TBD: crawler_params["encoding"] = encoding + log.debug(f"DomainCrawler kwargs: {kwargs}") crawler = DomainCrawler(app, items, **kwargs) # will raise exception on NotFound, etc. 
await crawler.crawl() diff --git a/hsds/domain_crawl.py b/hsds/domain_crawl.py index 1f37b151..05f59b31 100644 --- a/hsds/domain_crawl.py +++ b/hsds/domain_crawl.py @@ -29,26 +29,39 @@ def __init__( app, objs, action="get_obj", - params=None, + bucket=None, + follow_links=False, + include_links=False, + include_attrs=False, + include_data=False, + ignore_nan=False, + replace=False, + ignore_error=False, max_tasks=40, - max_objects_limit=0, - raise_error=False + max_objects_limit=0 ): log.info(f"DomainCrawler.__init__ action: {action} - {len(objs)} objs") - log.debug(f"params: {params}") self._app = app self._action = action self._max_objects_limit = max_objects_limit - self._params = params + self._follow_links = follow_links + self._include_links = include_links + self._include_attrs = include_attrs + self._include_data = include_data + self._ignore_nan = ignore_nan + self._replace = replace self._max_tasks = max_tasks self._q = asyncio.Queue() self._obj_dict = {} self.seen_ids = set() - self._raise_error = raise_error + self._ignore_error = ignore_error if not objs: log.error("no objs for crawler to crawl!") raise ValueError() - + if not bucket: + log.error("bucket not set for DomainCrawler") + raise ValueError() + self._bucket = bucket for obj_id in objs: log.debug(f"adding {obj_id} to the queue") self._q.put_nowait(obj_id) @@ -59,7 +72,7 @@ def __init__( def follow_links(self, grp_id, links): # add any linked obj ids to the lookup ids set - log.debug(f"follow links for {grp_id}") + log.debug(f"follow links for {grp_id}, links: {links}") if getCollectionForId(grp_id) != "groups": log.warn(f"expected group id but got: {grp_id}") return @@ -89,13 +102,10 @@ def follow_links(self, grp_id, links): log.debug(f"link id: {link_id} is not for a group, continue") continue num_objects = len(self._obj_dict) - if self._params.get("max_objects_limit") is not None: - max_objects_limit = self._params["max_objects_limit"] - if num_objects >= max_objects_limit: - msg = "DomainCrawler reached limit of " - msg += f"{max_objects_limit}" - log.info(msg) - break + if self._max_objects_limit and num_objects >= self._max_objects_limit: + msg = f"DomainCrawler reached limit of {self._max_objects_limit}" + log.info(msg) + break if link_id not in self._obj_dict: # haven't seen this object yet, get obj json log.debug(f"DomainCrawler - adding link_id: {link_id} to queue") @@ -113,10 +123,11 @@ async def get_attributes(self, obj_id, attr_names): msg += f", {len(attr_names)} attributes" log.debug(msg) - kwargs = {} - for key in ("include_data", "ignore_nan", "bucket"): - if key in self._params: - kwargs[key] = self._params[key] + kwargs = {"bucket": self._bucket} + if self._include_data: + kwargs["include_data"] = True + if self._ignore_nan: + kwargs["ignore_nan"] = True if attr_names: kwargs["attr_names"] = attr_names log.debug(f"using kwargs: {kwargs}") @@ -148,13 +159,12 @@ async def get_attributes(self, obj_id, attr_names): self._obj_dict[obj_id] = {"status": status} collection = getCollectionForId(obj_id) - follow_links = self._params.get("follow_links") - bucket = self._params.get("bucket") - if collection == "groups" and follow_links: + + if collection == "groups" and self._follow_links: links = None status = 200 try: - links = await getLinks(self._app, obj_id, bucket=bucket) + links = await getLinks(self._app, obj_id, bucket=self._bucket) except HTTPNotFound: status = 404 except HTTPServiceUnavailable: @@ -168,7 +178,6 @@ async def get_attributes(self, obj_id, attr_names): if status >= 500: 
log.warn(f"getLinks for {obj_id} returned: {status}") elif links: - log.debug(f"follow_links for: {links}") self.follow_links(obj_id, links) else: log.debug(f"no links for {obj_id}") @@ -179,11 +188,9 @@ async def put_attributes(self, obj_id, attr_items): req = getDataNodeUrl(self._app, obj_id) collection = getCollectionForId(obj_id) req += f"/{collection}/{obj_id}/attributes" - kwargs = {} - if "bucket" in self._params: - kwargs["bucket"] = self._params["bucket"] - if "replace" in self._params: - kwargs["replace"] = self._params["replace"] + kwargs = {"bucket": self._bucket} + if self._replace: + kwargs["replace"] = True status = None try: status = await putAttributes(self._app, obj_id, attr_items, **kwargs) @@ -205,17 +212,14 @@ async def get_obj_json(self, obj_id): for each group found, search the links if follow_links is set """ log.debug(f"get_obj_json: {obj_id}") collection = getCollectionForId(obj_id) - kwargs = {} + kwargs = {"bucket": self._bucket, "include_attrs": self._include_attrs} - for k in ("include_links", "include_attrs", "bucket"): - if k in self._params: - kwargs[k] = self._params[k] - if collection == "groups" and self._params.get("follow_links"): + if collection == "groups" and self._follow_links: follow_links = True kwargs["include_links"] = True # get them so we can follow them else: follow_links = False - if follow_links or self._params.get("include_attrs"): + if follow_links or self._include_attrs: kwargs["refresh"] = True # don't want a cached version in this case log.debug(f"follow_links: {follow_links}") @@ -261,10 +265,9 @@ async def get_obj_json(self, obj_id): log.error("expected links key in obj_json") return links = obj_json["links"] - log.debug(f"follow_links for: {links}") self.follow_links(obj_id, links) - if not self._params.get("include_links"): + if not self._include_links: # don't keep the links del obj_json["links"] @@ -278,17 +281,11 @@ async def get_links(self, grp_id, titles=None): if collection != "groups": log.warn(f"get_links, expected groups id but got: {grp_id}") return - kwargs = {} + kwargs = {"bucket": self._bucket} if titles: kwargs["titles"] = titles - if self._params.get("bucket"): - kwargs["bucket"] = self._params["bucket"] - if self._params.get("follow_links"): - follow_links = True - else: - follow_links = False - log.debug(f"follow_links: {follow_links}") + log.debug(f"follow_links: {self._follow_links}") log.debug(f"getLinks kwargs: {kwargs}") links = None @@ -322,8 +319,7 @@ async def get_links(self, grp_id, titles=None): self._obj_dict[grp_id] = links # store the links # if follow_links, add any group links to the lookup ids set - if follow_links: - log.debug(f"follow links for {grp_id}") + if self._follow_links: self.follow_links(grp_id, links) async def put_links(self, grp_id, link_items): @@ -331,9 +327,7 @@ async def put_links(self, grp_id, link_items): log.debug(f"put_links for {grp_id}, {len(link_items)} links") req = getDataNodeUrl(self._app, grp_id) req += f"/groups/{grp_id}/links" - kwargs = {} - if "bucket" in self._params: - kwargs["bucket"] = self._params["bucket"] + kwargs = {"bucket": self._bucket} status = None try: status = await putLinks(self._app, grp_id, link_items, **kwargs) @@ -382,9 +376,9 @@ async def crawl(self): status = self.get_status() if status: log.debug(f"DomainCrawler -- status: {status}") - log.debug(f"raise_error: {self._raise_error}") - if self._raise_error: - # throw the approriate exception if other than 200, 201 + log.debug(f"ignore_error: {self._ignore_error}") + if not self._ignore_error: 
+ # throw the appropriate exception if other than 200, 201 if status == 200: pass # ok elif status == 201: diff --git a/hsds/domain_sn.py b/hsds/domain_sn.py index bd1c33aa..ec6ce1fc 100755 --- a/hsds/domain_sn.py +++ b/hsds/domain_sn.py @@ -51,17 +51,18 @@ async def get_collections(app, root_id, bucket=None, max_objects_limit=None): log.info(f"get_collections for {root_id}") - crawler_params = { + kwargs = { + "action": "get_obj", "include_attrs": False, "include_links": False, - "bucket": bucket, "follow_links": True, + "bucket": bucket, } if max_objects_limit: - crawler_params["max_objects_limit"] = max_objects_limit + kwargs["max_objects_limit"] = max_objects_limit - crawler = DomainCrawler(app, [root_id, ], action="get_obj", params=crawler_params) + crawler = DomainCrawler(app, [root_id, ], **kwargs) await crawler.crawl() if max_objects_limit and len(crawler._obj_dict) >= max_objects_limit: msg = "get_collections - too many objects: " @@ -88,7 +89,6 @@ async def get_collections(app, root_id, bucket=None, max_objects_limit=None): log.warn(f"get_collections - unexpected id type: {obj_id}") if root_id in group_ids: group_ids.remove(root_id) # don't include the root id - print(f"get_collections - group_ids: {group_ids}") result = {} result["groups"] = group_ids @@ -105,7 +105,8 @@ async def getDomainObjects(app, root_id, include_attrs=False, bucket=None): log.info(f"getDomainObjects for root: {root_id}, include_attrs: {include_attrs}") max_objects_limit = int(config.get("domain_req_max_objects_limit", default=500)) - crawler_params = { + kwargs = { + "action": "get_obj", "include_attrs": include_attrs, "include_links": True, "follow_links": True, @@ -113,7 +114,7 @@ async def getDomainObjects(app, root_id, include_attrs=False, bucket=None): "bucket": bucket, } - crawler = DomainCrawler(app, [root_id, ], action="get_obj", params=crawler_params) + crawler = DomainCrawler(app, [root_id, ], **kwargs) await crawler.crawl() if len(crawler._obj_dict) >= max_objects_limit: msg = "getDomainObjects - too many objects: " diff --git a/hsds/link_sn.py b/hsds/link_sn.py index 9be0016b..f7aadd14 100755 --- a/hsds/link_sn.py +++ b/hsds/link_sn.py @@ -62,8 +62,6 @@ async def GET_Links(request): await validateAction(app, domain, group_id, username, "read") - kwargs = {"bucket": bucket} - if "follow_links" in params and params["follow_links"]: follow_links = True else: @@ -78,17 +76,37 @@ async def GET_Links(request): log.warn(msg) raise HTTPBadRequest(reason=msg) log.debug(f"using pattern: {pattern} for GET_Links") - kwargs["pattern"] = pattern else: pattern = None + create_order = False + if "CreateOrder" in params and params["CreateOrder"]: + if params["CreateOrder"] != "0": + create_order = True + + limit = None + if "Limit" in params: + try: + limit = int(params["Limit"]) + except ValueError: + msg = "Bad Request: Expected int type for limit" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + + if "Marker" in params: + marker = params["Marker"] + else: + marker = None + if follow_links: # Use DomainCrawler to fetch links from multiple objects. 
# set the follow_links and bucket params - kwargs["follow_links"] = True - crawler_kwargs = {"action": "get_link", "raise_error": True, "params": kwargs} + log.debug(f"GET_Links - following links starting with {group_id}") + + kwargs = {"action": "get_link", "bucket": bucket, "follow_links": True} + kwargs["include_links"] = True items = [group_id, ] - crawler = DomainCrawler(app, items, **crawler_kwargs) + crawler = DomainCrawler(app, items, **kwargs) # will raise exception on NotFound, etc. await crawler.crawl() @@ -109,18 +127,15 @@ async def GET_Links(request): msg += f"from {len(grp_links)} links with pattern {pattern}" log.debug(msg) else: - if "CreateOrder" in params and params["CreateOrder"]: + kwargs = {"bucket": bucket} + if create_order: kwargs["create_order"] = True - if "Limit" in params: - try: - limit = int(params["Limit"]) - except ValueError: - msg = "Bad Request: Expected int type for limit" - log.warn(msg) - raise HTTPBadRequest(reason=msg) + if limit: kwargs["limit"] = limit - if "Marker" in params: - kwargs["marker"] = params["Marker"] + if marker: + kwargs["marker"] = marker + if pattern: + kwargs["pattern"] = pattern links = await getLinks(app, group_id, **kwargs) @@ -325,6 +340,10 @@ async def PUT_Links(request): raise HTTPBadRequest(reason=msg) bucket = getBucketForDomain(domain) log.debug(f"got bucket: {bucket}") + if "replace" in params and params["replace"]: + replace = True + else: + replace = False # get domain JSON domain_json = await getDomainJson(app, domain) @@ -434,10 +453,6 @@ async def PUT_Links(request): await validateAction(app, domain, req_grp_id, username, "create") - kwargs = {"bucket": bucket} - if params.get("replace"): - kwargs["replace"] = True - count = len(grp_ids) if count == 0: msg = "no grp_ids defined" @@ -445,6 +460,9 @@ async def PUT_Links(request): raise HTTPBadRequest(reason=msg) elif count == 1: # just send one PUT Attributes request to the dn + kwargs = {"bucket": bucket} + if replace: + kwargs["replace"] = True grp_id = list(grp_ids.keys())[0] link_json = grp_ids[grp_id] log.debug(f"got link_json: {link_json}") @@ -453,13 +471,10 @@ async def PUT_Links(request): else: # put multi obj + kwargs = {"action": "put_link", "bucket": bucket} + if replace: + kwargs["replace"] = True - # mixin some additonal kwargs - crawler_params = {"follow_links": False} - if bucket: - crawler_params["bucket"] = bucket - - kwargs = {"action": "put_link", "raise_error": True, "params": crawler_params} crawler = DomainCrawler(app, grp_ids, **kwargs) # will raise exception on not found, server busy, etc. @@ -660,9 +675,9 @@ async def POST_Links(request): else: # Use DomainCrawler to fetch links from multiple object. # set the follow_links and bucket params - crawler_params = {"follow_links": follow_links, "bucket": bucket} - - kwargs = {"action": "get_link", "raise_error": True, "params": crawler_params} + kwargs = {"action": "get_link", "bucket": bucket, "include_links": True} + if follow_links: + kwargs["follow_links"] = True crawler = DomainCrawler(app, items, **kwargs) # will raise exception on NotFound, etc. 
await crawler.crawl() diff --git a/hsds/servicenode_lib.py b/hsds/servicenode_lib.py index bdb56c63..a3d128c6 100644 --- a/hsds/servicenode_lib.py +++ b/hsds/servicenode_lib.py @@ -355,11 +355,14 @@ async def getLinks(app, group_id, req = getDataNodeUrl(app, group_id) req += "/groups/" + group_id + "/links" params = {"bucket": bucket} + if pattern is not None: + params["pattern"] = pattern log.debug(f"getLinks {group_id}") if titles: # do a post request with the given title list log.debug(f"getLinks for {group_id} - {len(titles)} titles") + log.debug(f" params: {params}") data = {"titles": titles} post_rsp = await http_post(app, req, data=data, params=params) log.debug(f"got link_json: {post_rsp}") @@ -369,15 +372,13 @@ async def getLinks(app, group_id, links = post_rsp["links"] else: # do a get for all links - log.debug(f"getLinks, all links for {group_id}") if create_order: params["CreateOrder"] = 1 if limit is not None: params["Limit"] = str(limit) if marker is not None: params["Marker"] = marker - if pattern is not None: - params["pattern"] = pattern + log.debug(f"getLinks, all links for {group_id}, params: {params}") get_rsp = await http_get(app, req, params=params) log.debug(f"got link_json: {get_rsp}") @@ -851,7 +852,7 @@ async def doFlush(app, root_id, bucket=None): async def getAttributes(app, obj_id, attr_names=None, - include_data=True, + include_data=False, max_data_size=0, ignore_nan=False, create_order=False, diff --git a/tests/integ/attr_test.py b/tests/integ/attr_test.py index 1066c52a..c4f8e0ca 100644 --- a/tests/integ/attr_test.py +++ b/tests/integ/attr_test.py @@ -97,6 +97,7 @@ def testListAttr(self): self.assertEqual(rspJson["attributeCount"], attr_count) for creation_order in (False, True): + print("creation_order:", creation_order) expected_names = copy(attr_names) if creation_order: From c0bbc2d204a994ff7d01a3f850a642aca8b8dc0c Mon Sep 17 00:00:00 2001 From: jreadey Date: Mon, 22 Jan 2024 16:03:41 +0000 Subject: [PATCH 15/18] add pattern matching to post links --- hsds/domain_crawl.py | 39 ++++- hsds/link_dn.py | 42 ++--- hsds/link_sn.py | 36 ++++- tests/integ/link_test.py | 326 ++++++++++++++++++++++----------------- 4 files changed, 279 insertions(+), 164 deletions(-) diff --git a/hsds/domain_crawl.py b/hsds/domain_crawl.py index 05f59b31..25a7b33e 100644 --- a/hsds/domain_crawl.py +++ b/hsds/domain_crawl.py @@ -19,6 +19,7 @@ from aiohttp.web_exceptions import HTTPInternalServerError, HTTPNotFound, HTTPGone from .util.idUtil import getCollectionForId, getDataNodeUrl +from .util.globparser import globmatch from .servicenode_lib import getObjectJson, getAttributes, putAttributes, getLinks, putLinks from . import hsds_logger as log @@ -35,6 +36,9 @@ def __init__( include_attrs=False, include_data=False, ignore_nan=False, + create_order=False, + pattern=None, + limit=None, replace=False, ignore_error=False, max_tasks=40, @@ -49,6 +53,9 @@ def __init__( self._include_attrs = include_attrs self._include_data = include_data self._ignore_nan = ignore_nan + self._create_order = create_order + self._pattern = pattern + self._limit = limit self._replace = replace self._max_tasks = max_tasks self._q = asyncio.Queue() @@ -274,7 +281,7 @@ async def get_obj_json(self, obj_id): async def get_links(self, grp_id, titles=None): """ if titles is set, get all the links in grp_id that have a title in the list. Otherwise, return all links for the object. 
""" - log.debug(f"get_links: {grp_id}") + log.debug(f"get_links: {grp_id}m follow_links: {self._follow_links}") if titles: log.debug(f"titles; {titles}") collection = getCollectionForId(grp_id) @@ -284,6 +291,20 @@ async def get_links(self, grp_id, titles=None): kwargs = {"bucket": self._bucket} if titles: kwargs["titles"] = titles + else: + # only use limit if we are attempting to fetch all links + if self._limit: + kwargs["limit"] = self._limit + if self._create_order: + kwargs["create_order"] = True + pattern = None + if self._pattern and not titles: + if self._follow_links: + # apply the pattern after we get the links back + log.debug("will apply pattern on return") + pattern = self._pattern + else: + kwargs["pattern"] = self._pattern log.debug(f"follow_links: {self._follow_links}") log.debug(f"getLinks kwargs: {kwargs}") @@ -314,9 +335,21 @@ async def get_links(self, grp_id, titles=None): return log.debug(f"DomainCrawler - got links for {grp_id}") - log.debug(f"save to obj_dict: {links}") - self._obj_dict[grp_id] = links # store the links + if pattern: + filtered_links = [] + for link in links: + title = link["title"] + if globmatch(title, pattern): + filtered_links.append(link) + msg = f"getLinks with pattern: {pattern} returning " + msg += f"{len(filtered_links)} links from {len(links)}" + log.debug(msg) + log.debug(f"save to obj_dict: {filtered_links}") + self._obj_dict[grp_id] = filtered_links + else: + log.debug(f"save to obj_dict: {links}") + self._obj_dict[grp_id] = links # store the links # if follow_links, add any group links to the lookup ids set if self._follow_links: diff --git a/hsds/link_dn.py b/hsds/link_dn.py index 974a4115..7c71baa0 100755 --- a/hsds/link_dn.py +++ b/hsds/link_dn.py @@ -43,11 +43,34 @@ def _index(items, marker, create_order=False): return -1 +def _getTitles(links, create_order=False): + titles = [] + if create_order: + order_dict = {} + for title in links: + item = links[title] + if "created" not in item: + log.warning(f"expected to find 'created' key in link item {title}") + continue + order_dict[title] = item["created"] + log.debug(f"order_dict: {order_dict}") + # now sort by created + for k in sorted(order_dict.items(), key=lambda item: item[1]): + titles.append(k[0]) + log.debug(f"links by create order: {titles}") + else: + titles = list(links.keys()) + titles.sort() + log.debug(f"links by lexographic order: {titles}") + return titles + + async def GET_Links(request): """HTTP GET method to return JSON for a link collection""" log.request(request) app = request.app params = request.rel_url.query + log.debug(f"GET_Links params: {params}") group_id = get_obj_id(request) log.info(f"GET links: {group_id}") if not isValidUuid(group_id, obj_class="group"): @@ -95,24 +118,7 @@ async def GET_Links(request): # return a list of links based on sorted dictionary keys link_dict = group_json["links"] - titles = [] - if create_order: - order_dict = {} - for title in link_dict: - item = link_dict[title] - if "created" not in item: - log.warning(f"expected to find 'created' key in link item {title}") - continue - order_dict[title] = item["created"] - log.debug(f"order_dict: {order_dict}") - # now sort by created - for k in sorted(order_dict.items(), key=lambda item: item[1]): - titles.append(k[0]) - log.debug(f"links by create order: {titles}") - else: - titles = list(link_dict.keys()) - titles.sort() # sort by key - log.debug(f"links by lexographic order: {titles}") + titles = _getTitles(link_dict, create_order=create_order) if pattern: try: diff --git 
a/hsds/link_sn.py b/hsds/link_sn.py index f7aadd14..1ebc9449 100755 --- a/hsds/link_sn.py +++ b/hsds/link_sn.py @@ -551,6 +551,7 @@ async def POST_Links(request): log.request(request) app = request.app params = request.rel_url.query + log.debug(f"POST_Links params: {params}") log.info("POST_Links") req_id = request.match_info.get("id") @@ -558,6 +559,22 @@ async def POST_Links(request): follow_links = True else: follow_links = False + create_order = False + if "CreateOrder" in params and params["CreateOrder"]: + if params["CreateOrder"] != "0": + create_order = True + limit = None + if "Limit" in params: + try: + limit = int(params["Limit"]) + except ValueError: + msg = "Bad Request: Expected int type for limit" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + if "pattern" in params: + pattern = params["pattern"] + else: + pattern = None if not request.has_body: msg = "POST Links with no body" @@ -668,8 +685,19 @@ async def POST_Links(request): elif len(items) == 1 and not follow_links: # just make a request to the datanode group_id = list(items.keys())[0] + kwargs = {"bucket": bucket} + titles = items[group_id] - links = await getLinks(app, group_id, titles=titles, bucket=bucket) + if titles: + kwargs["titles"] = titles + else: + if limit: + kwargs["limit"] = limit + if create_order: + kwargs["create_order"] = True + if pattern: + kwargs["pattern"] = pattern + links = await getLinks(app, group_id, **kwargs) resp_json["links"] = links else: @@ -678,6 +706,12 @@ async def POST_Links(request): kwargs = {"action": "get_link", "bucket": bucket, "include_links": True} if follow_links: kwargs["follow_links"] = True + if create_order: + kwargs["create_order"] = True + if limit: + kwargs["limit"] = limit + if pattern: + kwargs["pattern"] = pattern crawler = DomainCrawler(app, items, **kwargs) # will raise exception on NotFound, etc. 
await crawler.crawl() diff --git a/tests/integ/link_test.py b/tests/integ/link_test.py index aa23e481..c141db40 100755 --- a/tests/integ/link_test.py +++ b/tests/integ/link_test.py @@ -344,100 +344,122 @@ def testGetLinks(self): req = helper.getEndpoint() + "/groups/" + root_id + "/links" - for creation_order in (False, True): - - # get all the links for the root group - params = {} - if creation_order: - params["CreateOrder"] = 1 - rsp = self.session.get(req, params=params, headers=headers) - self.assertEqual(rsp.status_code, 200) - rspJson = json.loads(rsp.text) - self.assertTrue("links" in rspJson) - self.assertTrue("hrefs" in rspJson) - links = rspJson["links"] - self.assertEqual(len(links), len(link_names)) - ret_names = [] - for link in links: - self.assertTrue("title" in link) - self.assertTrue("class" in link) - self.assertEqual(link["class"], "H5L_TYPE_HARD") - self.assertTrue("collection" in link) - self.assertEqual(link["collection"], "groups") - self.assertTrue("created" in link) - ret_names.append(link["title"]) + for use_post in (False, True): + for creation_order in (False, True): + # get all the links for the root group + params = {} + if creation_order: + params["CreateOrder"] = 1 + + if use_post: + payload = {"group_ids": [root_id, ]} + data = json.dumps(payload) + rsp = self.session.post(req, data=data, params=params, headers=headers) + else: + rsp = self.session.get(req, params=params, headers=headers) + self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + self.assertTrue("links" in rspJson) + if use_post: + pass # hrefs not returned for post + else: + self.assertTrue("hrefs" in rspJson) + links = rspJson["links"] + self.assertEqual(len(links), len(link_names)) + ret_names = [] + for link in links: + self.assertTrue("title" in link) + self.assertTrue("class" in link) + self.assertEqual(link["class"], "H5L_TYPE_HARD") + if use_post: + pass # href, collection not returned for post + else: + self.assertTrue("href" in link) + self.assertTrue("collection" in link) + self.assertEqual(link["collection"], "groups") + self.assertTrue("created" in link) + ret_names.append(link["title"]) - expected_names = copy(link_names) + expected_names = copy(link_names) - if creation_order: - # result should come back in sorted order - pass - else: - expected_names.sort() # lexographic order - # sorted list should be: - # ['eighth', 'eleventh', 'fifth', 'first', 'fourth', 'ninth', - # 'second', 'seventh', 'sixth', 'tenth', 'third', 'twelfth'] - # - - self.assertEqual(ret_names, expected_names) - - # get links with a result limit of 4 - limit = 4 - params = {"Limit": limit} - if creation_order: - params["CreateOrder"] = 1 - rsp = self.session.get(req, params=params, headers=headers) - self.assertEqual(rsp.status_code, 200) - rspJson = json.loads(rsp.text) - self.assertTrue("links" in rspJson) - self.assertTrue("hrefs" in rspJson) - links = rspJson["links"] - self.assertEqual(len(links), limit) - last_link = links[-1] - self.assertEqual(last_link["title"], expected_names[limit - 1]) - - # get links after the one with name: "seventh" - marker = "seventh" - params = {"Marker": marker} - if creation_order: - params["CreateOrder"] = 1 - rsp = self.session.get(req, params=params, headers=headers) - self.assertEqual(rsp.status_code, 200) - rspJson = json.loads(rsp.text) - self.assertTrue("links" in rspJson) - self.assertTrue("hrefs" in rspJson) - links = rspJson["links"] - if creation_order: - self.assertEqual(len(links), 5) - else: - self.assertEqual(len(links), 4) - last_link = 
links[-1] - # "twelfth" is last in either ordering - self.assertEqual(last_link["title"], "twelfth") - - # Use a marker that is not present (should return 404) - params["Marker"] = "foobar" - rsp = self.session.get(req, params=params, headers=headers) - self.assertEqual(rsp.status_code, 404) - - # get links starting with name: "seventh", and limit to 3 results - params["Marker"] = "seventh" - limit = 3 - params["Limit"] = limit - rsp = self.session.get(req, params=params, headers=headers) - self.assertEqual(rsp.status_code, 200) - rspJson = json.loads(rsp.text) - self.assertTrue("links" in rspJson) - self.assertTrue("hrefs" in rspJson) - links = rspJson["links"] - self.assertEqual(len(links), 3) - last_link = links[-1] - if creation_order: - # expecting: "eighth", "ninth", "tenth" - self.assertEqual(last_link["title"], "tenth") - else: - # expecting: "sixth", "tenth", "third" - self.assertEqual(last_link["title"], "third") + if creation_order: + # result should come back in sorted order + pass + else: + expected_names.sort() # lexographic order + # sorted list should be: + # ['eighth', 'eleventh', 'fifth', 'first', 'fourth', 'ninth', + # 'second', 'seventh', 'sixth', 'tenth', 'third', 'twelfth'] + # + + self.assertEqual(ret_names, expected_names) + + # get links with a result limit of 4 + limit = 4 + params = {"Limit": limit} + if creation_order: + params["CreateOrder"] = 1 + if use_post: + payload = {"group_ids": [root_id, ]} + data = json.dumps(payload) + rsp = self.session.post(req, data=data, params=params, headers=headers) + else: + rsp = self.session.get(req, params=params, headers=headers) + self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + self.assertTrue("links" in rspJson) + if use_post: + pass # no hrefs for post + else: + self.assertTrue("hrefs" in rspJson) + links = rspJson["links"] + self.assertEqual(len(links), limit) + last_link = links[-1] + self.assertEqual(last_link["title"], expected_names[limit - 1]) + + # get links after the one with name: "seventh" + marker = "seventh" + params = {"Marker": marker} + if creation_order: + params["CreateOrder"] = 1 + # Marker isn't supported for POST, so just run get twice + rsp = self.session.get(req, params=params, headers=headers) + self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + self.assertTrue("links" in rspJson) + self.assertTrue("hrefs" in rspJson) + links = rspJson["links"] + if creation_order: + self.assertEqual(len(links), 5) + else: + self.assertEqual(len(links), 4) + last_link = links[-1] + # "twelfth" is last in either ordering + self.assertEqual(last_link["title"], "twelfth") + + # Use a marker that is not present (should return 404) + params["Marker"] = "foobar" + rsp = self.session.get(req, params=params, headers=headers) + self.assertEqual(rsp.status_code, 404) + + # get links starting with name: "seventh", and limit to 3 results + params["Marker"] = "seventh" + limit = 3 + params["Limit"] = limit + rsp = self.session.get(req, params=params, headers=headers) + self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + self.assertTrue("links" in rspJson) + self.assertTrue("hrefs" in rspJson) + links = rspJson["links"] + self.assertEqual(len(links), 3) + last_link = links[-1] + if creation_order: + # expecting: "eighth", "ninth", "tenth" + self.assertEqual(last_link["title"], "tenth") + else: + # expecting: "sixth", "tenth", "third" + self.assertEqual(last_link["title"], "third") def testGet(self): # test getting links from an existing domain @@ -623,60 
+645,80 @@ def testGetPattern(self): g1_2_uuid = helper.getUUIDByPath(domain, "/g1/g1.2", session=self.session) now = time.time() - # do get with a regex pattern + # do get with a glob pattern # get links for /g1/g1.2: - req = helper.getEndpoint() + "/groups/" + g1_2_uuid + "/links" - params = {"pattern": "ext*"} - rsp = self.session.get(req, params=params, headers=headers) - self.assertEqual(rsp.status_code, 200) - rspJson = json.loads(rsp.text) - self.assertTrue("links" in rspJson) - links = rspJson["links"] - self.assertEqual(len(links), 1) # only extlink should be returned - link = links[0] - for name in ("created", "class", "h5domain", "h5path", "title", "href"): - self.assertTrue(name in link) - self.assertEqual(link["class"], "H5L_TYPE_EXTERNAL") - self.assertEqual(link["title"], "extlink") - self.assertEqual(link["h5domain"], "somefile") - self.assertEqual(link["h5path"], "somepath") - self.assertTrue(link["created"] < now - 10) - # get links for root group and other groups recursively - req = helper.getEndpoint() + "/groups/" + root_uuid + "/links" - params = {"follow_links": 1, "pattern": "dset*"} - rsp = self.session.get(req, params=params, headers=headers) - self.assertEqual(rsp.status_code, 200) - rspJson = json.loads(rsp.text) - self.assertTrue("hrefs" in rspJson) - hrefs = rspJson["hrefs"] - self.assertEqual(len(hrefs), 3) - self.assertTrue("links" in rspJson) - obj_map = rspJson["links"] # map of grp ids to links - - expected_dset_links = ("dset1.1.1", "dset1.1.2", "dset2.1", "dset2.2") - - self.assertEqual(len(obj_map), 6) # 6 groups should be returned - link_count = 0 + for use_post in (False, True): + req = helper.getEndpoint() + "/groups/" + g1_2_uuid + "/links" + params = {"pattern": "ext*"} + if use_post: + payload = {"group_ids": [g1_2_uuid, ]} + data = json.dumps(payload) + rsp = self.session.post(req, data=data, params=params, headers=headers) + else: + rsp = self.session.get(req, params=params, headers=headers) + self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + self.assertTrue("links" in rspJson) + links = rspJson["links"] - for grp_id in obj_map: - helper.validateId(grp_id) - links = obj_map[grp_id] - for link in links: - self.assertTrue("title" in link) - link_title = link["title"] - self.assertTrue(link_title in expected_dset_links) - self.assertTrue("class" in link) - link_class = link["class"] - # only hardlinks will be a match with this pattern - self.assertEqual(link_class, "H5L_TYPE_HARD") - link_count += 1 - self.assertTrue("id" in link) - link_id = link["id"] - helper.validateId(link_id) - self.assertTrue(link_id.startswith("d-")) # link to a dataset + self.assertEqual(len(links), 1) # only extlink should be returned + link = links[0] + for name in ("created", "class", "h5domain", "h5path", "title"): + self.assertTrue(name in link) + if use_post: + pass # no href with post + else: + self.assertTrue("href" in link) + self.assertEqual(link["class"], "H5L_TYPE_EXTERNAL") + self.assertEqual(link["title"], "extlink") + self.assertEqual(link["h5domain"], "somefile") + self.assertEqual(link["h5path"], "somepath") + self.assertTrue(link["created"] < now - 10) + + # get links for root group and other groups recursively + req = helper.getEndpoint() + "/groups/" + root_uuid + "/links" + params = {"follow_links": 1, "pattern": "dset*"} + if use_post: + payload = {"group_ids": [root_uuid, ]} + data = json.dumps(payload) + rsp = self.session.post(req, data=data, params=params, headers=headers) + else: + rsp = self.session.get(req, 
params=params, headers=headers) + self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + if use_post: + pass # hrefs not returned with post + else: + self.assertTrue("hrefs" in rspJson) + hrefs = rspJson["hrefs"] + self.assertEqual(len(hrefs), 3) + self.assertTrue("links" in rspJson) + obj_map = rspJson["links"] # map of grp ids to links + + expected_dset_links = ("dset1.1.1", "dset1.1.2", "dset2.1", "dset2.2") + + self.assertEqual(len(obj_map), 6) # 6 groups should be returned + link_count = 0 + + for grp_id in obj_map: + helper.validateId(grp_id) + links = obj_map[grp_id] + for link in links: + self.assertTrue("title" in link) + link_title = link["title"] + self.assertTrue(link_title in expected_dset_links) + self.assertTrue("class" in link) + link_class = link["class"] + # only hardlinks will be a match with this pattern + self.assertEqual(link_class, "H5L_TYPE_HARD") + link_count += 1 + self.assertTrue("id" in link) + link_id = link["id"] + helper.validateId(link_id) + self.assertTrue(link_id.startswith("d-")) # link to a dataset - self.assertEqual(link_count, len(expected_dset_links)) + self.assertEqual(link_count, len(expected_dset_links)) def testSoftLinkTraversal(self): # test that an object can be found via path with an external link From 98451f3c1964f30134c26b771ac82e60cf908e11 Mon Sep 17 00:00:00 2001 From: jreadey Date: Tue, 23 Jan 2024 09:02:36 +0000 Subject: [PATCH 16/18] added crawler support for limit, encoding, patterna dn create_order --- README.md | 2 +- hsds/attr_sn.py | 8 +++ hsds/domain_crawl.py | 46 +++++++++---- tests/integ/attr_test.py | 141 +++++++++++++++++++++++++++++++++++++++ tests/integ/link_test.py | 3 +- 5 files changed, 185 insertions(+), 15 deletions(-) diff --git a/README.md b/README.md index c11b2fc9..07b09f87 100755 --- a/README.md +++ b/README.md @@ -20,7 +20,7 @@ Make sure you have Python 3 and Pip installed, then: - Set user_name: `$ export USER_NAME=$USER` - Set user_password: `$ export USER_PASSWORD=$USER` - Set admin name: `$ export ADMIN_USERNAME=$USER` - - Set admin password: `$ $export ADMIN_PASSWORD=$USER` + - Set admin password: `$ export ADMIN_PASSWORD=$USER` - Run test suite: `$ python testall.py --skip_unit` 5. (Optional) Install the h5pyd package for an h5py compatible api and tool suite: https://github.com/HDFGroup/h5pyd 6. (Optional) Post install setup (test data, home folders, cli tools, etc): [docs/post_install.md](docs/post_install.md) diff --git a/hsds/attr_sn.py b/hsds/attr_sn.py index c8259f2a..0ee647ce 100755 --- a/hsds/attr_sn.py +++ b/hsds/attr_sn.py @@ -155,6 +155,14 @@ async def GET_Attributes(request): kwargs["max_data_size"] = max_data_size if ignore_nan: kwargs["ignore_nan"] = True + if limit: + kwargs["limit"] = limit + if encoding: + kwargs["encoding"] = encoding + if pattern: + kwargs["pattern"] = pattern + if create_order: + kwargs["create_order"] = True items = [obj_id, ] crawler = DomainCrawler(app, items, **kwargs) # will raise exception on NotFound, etc. 
diff --git a/hsds/domain_crawl.py b/hsds/domain_crawl.py index 25a7b33e..e2008c92 100644 --- a/hsds/domain_crawl.py +++ b/hsds/domain_crawl.py @@ -35,7 +35,9 @@ def __init__( include_links=False, include_attrs=False, include_data=False, + max_data_size=0, ignore_nan=False, + encoding=None, create_order=False, pattern=None, limit=None, @@ -52,7 +54,9 @@ def __init__( self._include_links = include_links self._include_attrs = include_attrs self._include_data = include_data + self._max_data_size = max_data_size self._ignore_nan = ignore_nan + self._encoding = encoding self._create_order = create_order self._pattern = pattern self._limit = limit @@ -135,8 +139,20 @@ async def get_attributes(self, obj_id, attr_names): kwargs["include_data"] = True if self._ignore_nan: kwargs["ignore_nan"] = True + if self._encoding: + kwargs["encoding"] = self._encoding if attr_names: kwargs["attr_names"] = attr_names + else: + # only apply these parameters if we are attempting to fetch all links + if self._limit: + kwargs["limit"] = self._limit + if self._create_order: + kwargs["create_order"] = True + if self._pattern: + kwargs["pattern"] = self._pattern + if self._max_data_size > 0: + kwargs["max_data_size"] = self._max_data_size log.debug(f"using kwargs: {kwargs}") status = 200 @@ -281,7 +297,8 @@ async def get_obj_json(self, obj_id): async def get_links(self, grp_id, titles=None): """ if titles is set, get all the links in grp_id that have a title in the list. Otherwise, return all links for the object. """ - log.debug(f"get_links: {grp_id}m follow_links: {self._follow_links}") + log.debug(f"get_links: {grp_id} follow_links: {self._follow_links}") + pattern = None if titles: log.debug(f"titles; {titles}") collection = getCollectionForId(grp_id) @@ -292,19 +309,21 @@ async def get_links(self, grp_id, titles=None): if titles: kwargs["titles"] = titles else: - # only use limit if we are attempting to fetch all links + # only apply these parameters if we are attempting to fetch all links if self._limit: kwargs["limit"] = self._limit - if self._create_order: - kwargs["create_order"] = True - pattern = None - if self._pattern and not titles: - if self._follow_links: - # apply the pattern after we get the links back - log.debug("will apply pattern on return") - pattern = self._pattern - else: - kwargs["pattern"] = self._pattern + if self._create_order: + kwargs["create_order"] = True + + if self._pattern: + if self._follow_links: + # apply the pattern after we get the links back, + # otherwise we won't get the groups links that we + # need to follow + log.debug("will apply pattern on return") + pattern = self._pattern + else: + kwargs["pattern"] = self._pattern log.debug(f"follow_links: {self._follow_links}") log.debug(f"getLinks kwargs: {kwargs}") @@ -322,7 +341,7 @@ async def get_links(self, grp_id, titles=None): except Exception as e: log.error(f"unexpected exception {e}") status = 500 - log.debug(f"getObjectJson status: {status}") + log.debug(f"get_links status: {status}") if links is None: msg = f"DomainCrawler - get_links for {grp_id} " @@ -337,6 +356,7 @@ async def get_links(self, grp_id, titles=None): log.debug(f"DomainCrawler - got links for {grp_id}") if pattern: + log.debug(f"applying pattern: {pattern}") filtered_links = [] for link in links: title = link["title"] diff --git a/tests/integ/attr_test.py b/tests/integ/attr_test.py index c4f8e0ca..3034f024 100644 --- a/tests/integ/attr_test.py +++ b/tests/integ/attr_test.py @@ -2302,6 +2302,23 @@ def testGetPattern(self): self.assertTrue(name in attr) 
self.assertEqual(attr["name"], "attr1") + # do recursive get with a pattern + req = helper.getEndpoint() + "/groups/" + root_uuid + "/attributes" + params = {"pattern": "*1", "follow_links": 1} + rsp = self.session.get(req, params=params, headers=headers) + self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + self.assertTrue("attributes" in rspJson) + obj_map = rspJson["attributes"] + self.assertEqual(len(obj_map), 10) # 10 objects in the domain + attr_count = 0 + for obj_id in obj_map: + attrs = obj_map[obj_id] + attr_count += len(attrs) + for attr in attrs: + self.assertEqual(attr["name"], "attr1") + self.assertEqual(attr_count, 2) + def testGetRecursive(self): # test getting all attributes from an existing domain domain = helper.getTestDomain("tall.h5") @@ -2350,6 +2367,130 @@ def testGetRecursive(self): self.assertTrue("created" in attrJson) self.assertTrue("value" in attrJson) + # same thing with Limit + req = helper.getEndpoint() + "/groups/" + root_uuid + "/attributes" + params = {"follow_links": "1", "Limit": 1} + rsp = self.session.get(req, params=params, headers=headers) + self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + self.assertTrue("attributes" in rspJson) + obj_map = rspJson["attributes"] + self.assertEqual(len(obj_map), 10) + attr_count = 0 + for obj_id in obj_map: + self.assertTrue(len(obj_map[obj_id]) <= 1) + attr_count += len(obj_map[obj_id]) + self.assertEqual(attr_count, 2) + for obj_id in (root_uuid, d111_uuid): + # these are the only two objects with attributes + self.assertTrue(obj_id in obj_map) + obj_attrs = obj_map[obj_id] + self.assertEqual(len(obj_attrs), 1) + for attrJson in obj_attrs: + self.assertTrue("name" in attrJson) + attr_name = attrJson["name"] + self.assertTrue(attr_name in ("attr1", "attr2")) + self.assertTrue("type" in attrJson) + self.assertTrue("shape" in attrJson) + shapeJson = attrJson["shape"] + self.assertEqual(shapeJson["class"], "H5S_SIMPLE") + self.assertTrue("created" in attrJson) + self.assertTrue("value" in attrJson) + + # do a get with encoding + req = helper.getEndpoint() + "/groups/" + root_uuid + "/attributes" + params = {"follow_links": "1", "encoding": "base64"} + rsp = self.session.get(req, params=params, headers=headers) + self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + self.assertTrue("attributes" in rspJson) + obj_map = rspJson["attributes"] + self.assertEqual(len(obj_map), 10) + attr_count = 0 + for obj_id in obj_map: + attr_count += len(obj_map[obj_id]) + self.assertEqual(attr_count, 4) + for obj_id in (root_uuid, d111_uuid): + # these are the only two objects with attributes + self.assertTrue(obj_id in obj_map) + obj_attrs = obj_map[obj_id] + self.assertEqual(len(obj_attrs), 2) + for attrJson in obj_attrs: + self.assertTrue("name" in attrJson) + attr_name = attrJson["name"] + self.assertTrue(attr_name in ("attr1", "attr2")) + self.assertTrue("type" in attrJson) + self.assertTrue("shape" in attrJson) + shapeJson = attrJson["shape"] + self.assertEqual(shapeJson["class"], "H5S_SIMPLE") + self.assertTrue("created" in attrJson) + self.assertTrue("encoding" in attrJson) + self.assertEqual(attrJson["encoding"], "base64") + self.assertTrue("value" in attrJson) + self.assertTrue(isinstance(attrJson["value"], str)) + + # do a get with includeData set to false + req = helper.getEndpoint() + "/groups/" + root_uuid + "/attributes" + params = {"follow_links": "1", "IncludeData": "0"} + rsp = self.session.get(req, params=params, headers=headers) + 
self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + self.assertTrue("attributes" in rspJson) + obj_map = rspJson["attributes"] + self.assertEqual(len(obj_map), 10) + attr_count = 0 + for obj_id in obj_map: + attr_count += len(obj_map[obj_id]) + self.assertEqual(attr_count, 4) + for obj_id in (root_uuid, d111_uuid): + # these are the only two objects with attributes + self.assertTrue(obj_id in obj_map) + obj_attrs = obj_map[obj_id] + self.assertEqual(len(obj_attrs), 2) + for attrJson in obj_attrs: + self.assertTrue("name" in attrJson) + attr_name = attrJson["name"] + self.assertTrue(attr_name in ("attr1", "attr2")) + self.assertTrue("type" in attrJson) + self.assertTrue("shape" in attrJson) + shapeJson = attrJson["shape"] + self.assertEqual(shapeJson["class"], "H5S_SIMPLE") + self.assertTrue("created" in attrJson) + self.assertFalse("value" in attrJson) + + # do a get with max_data_size of 10 bytes + req = helper.getEndpoint() + "/groups/" + root_uuid + "/attributes" + params = {"follow_links": "1", "max_data_size": 10} + rsp = self.session.get(req, params=params, headers=headers) + self.assertEqual(rsp.status_code, 200) + rspJson = json.loads(rsp.text) + self.assertTrue("attributes" in rspJson) + obj_map = rspJson["attributes"] + self.assertEqual(len(obj_map), 10) + attr_count = 0 + for obj_id in obj_map: + attr_count += len(obj_map[obj_id]) + self.assertEqual(attr_count, 4) + for obj_id in (root_uuid, d111_uuid): + # these are the only two objects with attributes + self.assertTrue(obj_id in obj_map) + obj_attrs = obj_map[obj_id] + self.assertEqual(len(obj_attrs), 2) + for attrJson in obj_attrs: + self.assertTrue("name" in attrJson) + attr_name = attrJson["name"] + self.assertTrue(attr_name in ("attr1", "attr2")) + self.assertTrue("type" in attrJson) + self.assertTrue("shape" in attrJson) + shapeJson = attrJson["shape"] + self.assertEqual(shapeJson["class"], "H5S_SIMPLE") + self.assertTrue("created" in attrJson) + if obj_id == root_uuid and attr_name == "attr1": + self.assertTrue("value" in attrJson) + else: + # other attributes are larger than 10 bytes + self.assertFalse("value" in attrJson) + if __name__ == "__main__": # setup test files diff --git a/tests/integ/link_test.py b/tests/integ/link_test.py index c141db40..3d12335d 100755 --- a/tests/integ/link_test.py +++ b/tests/integ/link_test.py @@ -1272,7 +1272,8 @@ def testPostLinkMultiple(self): if link_class == "H5L_TYPE_EXTERNAL": self.assertEqual(link["h5domain"], expected["h5domain"]) - # get just the request links for each group + # get just the requested links for each group + req = helper.getEndpoint() + "/groups/" + root_id + "/links" link_map = {g1_id: ["g1.1", "g1.2"], g2_id: ["dset2.2", ]} payload = {"group_ids": link_map} rsp = self.session.post(req, data=json.dumps(payload), headers=headers) From 9092a87c7387564e4344027907c0c50a8bb512a8 Mon Sep 17 00:00:00 2001 From: jreadey Date: Tue, 23 Jan 2024 10:05:34 +0000 Subject: [PATCH 17/18] added getBoolanParam util --- hsds/attr_sn.py | 61 +++++++++++++--------------------------- hsds/domain_crawl.py | 7 ++--- hsds/link_sn.py | 34 ++++++---------------- hsds/util/httpUtil.py | 25 ++++++++++++++++ tests/integ/attr_test.py | 10 +++---- 5 files changed, 62 insertions(+), 75 deletions(-) diff --git a/hsds/attr_sn.py b/hsds/attr_sn.py index 0ee647ce..11f27a94 100755 --- a/hsds/attr_sn.py +++ b/hsds/attr_sn.py @@ -18,7 +18,7 @@ from aiohttp.web import StreamResponse from json import JSONDecodeError -from .util.httpUtil import getAcceptType, jsonResponse, getHref 
+from .util.httpUtil import getAcceptType, jsonResponse, getHref, getBooleanParam from .util.globparser import globmatch from .util.idUtil import isValidUuid, getRootObjId from .util.authUtil import getUserPasswordFromRequest, validateUserPassword @@ -66,20 +66,14 @@ async def GET_Attributes(request): bucket = getBucketForDomain(domain) log.debug(f"bucket: {bucket}") - if "follow_links" in params and params["follow_links"]: - if collection != "groups": - msg = "follow_links can only be used with group ids" - log.warn(msg) - raise HTTPBadRequest(reason=msg) - follow_links = True - else: - follow_links = False + follow_links = getBooleanParam(params, "follow_links") + if follow_links and collection != "groups": + msg = "follow_links can only be used with group ids" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + log.debug(f"getAttributes follow_links: {follow_links}") - include_data = True - if "IncludeData" in params: - IncludeData = params["IncludeData"] - if not IncludeData or IncludeData == "0": - include_data = False + include_data = getBooleanParam(params, "IncludeData") log.debug(f"include_data: {include_data}") if "max_data_size" in params: @@ -92,16 +86,9 @@ async def GET_Attributes(request): else: max_data_size = 0 - if "ignore_nan" in params and params["ignore_nan"]: - ignore_nan = True - else: - ignore_nan = False - - if "CreateOrder" in params and params["CreateOrder"]: - create_order = True - else: - create_order = False - + ignore_nan = getBooleanParam(params, "ignore_nan") + create_order = getBooleanParam(params, "CreateOrder") + if "Limit" in params: try: limit = int(params["Limit"]) @@ -249,15 +236,13 @@ async def GET_Attribute(request): await validateAction(app, domain, obj_id, username, "read") - if "ignore_nan" in params and params["ignore_nan"]: - ignore_nan = True - else: - ignore_nan = False + ignore_nan = getBooleanParam(params, "ignore_nan") - if "IncludeData" in params and not params["IncludeData"]: - include_data = False - else: + if "IncludeData" not in params: + # this boolean param breaks our usual rule of default False include_data = True + else: + include_data = getBooleanParam(params, "IncludeData") if params.get("encoding"): if params["encoding"] != "base64": @@ -864,10 +849,8 @@ async def GET_AttributeValue(request): await validateAction(app, domain, obj_id, username, "read") params = request.rel_url.query - if "ignore_nan" in params and params["ignore_nan"]: - ignore_nan = True - else: - ignore_nan = False + ignore_nan = getBooleanParam(params, "ignore_nan") + if "encoding" in params: encoding = params["encoding"] if encoding and encoding != "base64": @@ -1288,12 +1271,8 @@ async def POST_Attributes(request): log.debug(f"got params: {params}") include_data = False max_data_size = 0 - if "IncludeData" in params: - IncludeData = params["IncludeData"] - log.debug(f"got IncludeData: [{IncludeData}], type: {type(IncludeData)}") - if IncludeData and IncludeData != "0": - include_data = True - log.debug(f"include_data: {include_data}") + include_data = getBooleanParam(params, "IncludeData") + log.debug(f"include_data: {include_data}") if "max_data_size" in params: try: max_data_size = int(params["max_data_size"]) diff --git a/hsds/domain_crawl.py b/hsds/domain_crawl.py index e2008c92..119ac442 100644 --- a/hsds/domain_crawl.py +++ b/hsds/domain_crawl.py @@ -18,6 +18,7 @@ from aiohttp.web_exceptions import HTTPServiceUnavailable, HTTPConflict, HTTPBadRequest from aiohttp.web_exceptions import HTTPInternalServerError, HTTPNotFound, HTTPGone +from 
.util.httpUtil import isOK from .util.idUtil import getCollectionForId, getDataNodeUrl from .util.globparser import globmatch from .servicenode_lib import getObjectJson, getAttributes, putAttributes, getLinks, putLinks @@ -174,7 +175,7 @@ async def get_attributes(self, obj_id, attr_names): log.error(f"unexpected exception from post request: {e}") status = 500 - if status == 200: + if isOK(status): log.debug(f"got attributes: {attributes}") self._obj_dict[obj_id] = attributes else: @@ -432,10 +433,8 @@ async def crawl(self): log.debug(f"ignore_error: {self._ignore_error}") if not self._ignore_error: # throw the appropriate exception if other than 200, 201 - if status == 200: + if isOK(status): pass # ok - elif status == 201: - pass # also ok elif status == 400: log.warn("DomainCrawler - BadRequest") raise HTTPBadRequest(reason="unkown") diff --git a/hsds/link_sn.py b/hsds/link_sn.py index 1ebc9449..950ed989 100755 --- a/hsds/link_sn.py +++ b/hsds/link_sn.py @@ -16,7 +16,7 @@ from aiohttp.web_exceptions import HTTPBadRequest from json import JSONDecodeError -from .util.httpUtil import getHref +from .util.httpUtil import getHref, getBooleanParam from .util.httpUtil import jsonResponse from .util.globparser import globmatch from .util.idUtil import isValidUuid, getDataNodeUrl, getCollectionForId @@ -62,11 +62,8 @@ async def GET_Links(request): await validateAction(app, domain, group_id, username, "read") - if "follow_links" in params and params["follow_links"]: - follow_links = True - else: - follow_links = False - + follow_links = getBooleanParam(params, "follow_links") + if "pattern" in params and params["pattern"]: pattern = params["pattern"] try: @@ -79,10 +76,7 @@ async def GET_Links(request): else: pattern = None - create_order = False - if "CreateOrder" in params and params["CreateOrder"]: - if params["CreateOrder"] != "0": - create_order = True + create_order = getBooleanParam(params, "CreateOrder") limit = None if "Limit" in params: @@ -206,9 +200,6 @@ async def GET_Link(request): req = getDataNodeUrl(app, group_id) req += "/groups/" + group_id + "/links" log.debug("get LINK: " + req) - params = {} - if bucket: - params["bucket"] = bucket link_json = await getLink(app, group_id, link_title, bucket=bucket) @@ -340,10 +331,7 @@ async def PUT_Links(request): raise HTTPBadRequest(reason=msg) bucket = getBucketForDomain(domain) log.debug(f"got bucket: {bucket}") - if "replace" in params and params["replace"]: - replace = True - else: - replace = False + replace = getBooleanParam(params, "replace") # get domain JSON domain_json = await getDomainJson(app, domain) @@ -555,14 +543,10 @@ async def POST_Links(request): log.info("POST_Links") req_id = request.match_info.get("id") - if params.get("follow_links"): - follow_links = True - else: - follow_links = False - create_order = False - if "CreateOrder" in params and params["CreateOrder"]: - if params["CreateOrder"] != "0": - create_order = True + follow_links = getBooleanParam(params, "follow_links") + + create_order = getBooleanParam(params, "CreateOrder") + limit = None if "Limit" in params: try: diff --git a/hsds/util/httpUtil.py b/hsds/util/httpUtil.py index 1cc1e0dd..f17a4ee6 100644 --- a/hsds/util/httpUtil.py +++ b/hsds/util/httpUtil.py @@ -42,6 +42,31 @@ def getUrl(host, port): """return url for host and port""" return f"http://{host}:{port}" +def getBooleanParam(params, key): + """ return False if the given key is not in the + params dict, or is it, but has the value, 0, or "0". 
+ return True otherwise """ + + if not isinstance(key, str): + raise TypeError("expected str value for key") + + if key not in params: + return False + + value = params[key] + if not value: + return False + + try: + int_value = int(value) + except ValueError: + return True + + if int_value: + return True + else: + return False + def getPortFromUrl(url): """Get Port number for given url""" diff --git a/tests/integ/attr_test.py b/tests/integ/attr_test.py index 3034f024..f669746b 100644 --- a/tests/integ/attr_test.py +++ b/tests/integ/attr_test.py @@ -2290,7 +2290,7 @@ def testGetPattern(self): # do get with a glob pattern req = helper.getEndpoint() + "/datasets/" + d111_uuid + "/attributes" - params = {"pattern": "*1"} + params = {"pattern": "*1", "IncludeData": 1} rsp = self.session.get(req, params=params, headers=headers) self.assertEqual(rsp.status_code, 200) rspJson = json.loads(rsp.text) @@ -2340,7 +2340,7 @@ def testGetRecursive(self): # do get with follow_links req = helper.getEndpoint() + "/groups/" + root_uuid + "/attributes" - params = {"follow_links": "1"} + params = {"follow_links": "1", "IncludeData": 1} rsp = self.session.get(req, params=params, headers=headers) self.assertEqual(rsp.status_code, 200) rspJson = json.loads(rsp.text) @@ -2395,11 +2395,11 @@ def testGetRecursive(self): shapeJson = attrJson["shape"] self.assertEqual(shapeJson["class"], "H5S_SIMPLE") self.assertTrue("created" in attrJson) - self.assertTrue("value" in attrJson) + self.assertFalse("value" in attrJson) # do a get with encoding req = helper.getEndpoint() + "/groups/" + root_uuid + "/attributes" - params = {"follow_links": "1", "encoding": "base64"} + params = {"follow_links": "1", "encoding": "base64", "IncludeData": 1} rsp = self.session.get(req, params=params, headers=headers) self.assertEqual(rsp.status_code, 200) rspJson = json.loads(rsp.text) @@ -2460,7 +2460,7 @@ def testGetRecursive(self): # do a get with max_data_size of 10 bytes req = helper.getEndpoint() + "/groups/" + root_uuid + "/attributes" - params = {"follow_links": "1", "max_data_size": 10} + params = {"follow_links": "1", "max_data_size": 10, "IncludeData": 1} rsp = self.session.get(req, params=params, headers=headers) self.assertEqual(rsp.status_code, 200) rspJson = json.loads(rsp.text) From 04c9837d278a6cb32286be7c102f61ca9df277cb Mon Sep 17 00:00:00 2001 From: jreadey Date: Tue, 23 Jan 2024 14:35:00 +0000 Subject: [PATCH 18/18] fix flake8 errors --- hsds/attr_sn.py | 10 +++++----- hsds/link_sn.py | 6 +++--- hsds/util/httpUtil.py | 9 +++++---- 3 files changed, 13 insertions(+), 12 deletions(-) diff --git a/hsds/attr_sn.py b/hsds/attr_sn.py index 11f27a94..6f4ebd8e 100755 --- a/hsds/attr_sn.py +++ b/hsds/attr_sn.py @@ -71,7 +71,7 @@ async def GET_Attributes(request): msg = "follow_links can only be used with group ids" log.warn(msg) raise HTTPBadRequest(reason=msg) - + log.debug(f"getAttributes follow_links: {follow_links}") include_data = getBooleanParam(params, "IncludeData") log.debug(f"include_data: {include_data}") @@ -87,8 +87,8 @@ async def GET_Attributes(request): max_data_size = 0 ignore_nan = getBooleanParam(params, "ignore_nan") - create_order = getBooleanParam(params, "CreateOrder") - + create_order = getBooleanParam(params, "CreateOrder") + if "Limit" in params: try: limit = int(params["Limit"]) @@ -242,7 +242,7 @@ async def GET_Attribute(request): # this boolean param breaks our usual rule of default False include_data = True else: - include_data = getBooleanParam(params, "IncludeData") + include_data = 
getBooleanParam(params, "IncludeData") if params.get("encoding"): if params["encoding"] != "base64": @@ -1271,7 +1271,7 @@ async def POST_Attributes(request): log.debug(f"got params: {params}") include_data = False max_data_size = 0 - include_data = getBooleanParam(params, "IncludeData") + include_data = getBooleanParam(params, "IncludeData") log.debug(f"include_data: {include_data}") if "max_data_size" in params: try: diff --git a/hsds/link_sn.py b/hsds/link_sn.py index 950ed989..609479c5 100755 --- a/hsds/link_sn.py +++ b/hsds/link_sn.py @@ -63,7 +63,7 @@ async def GET_Links(request): await validateAction(app, domain, group_id, username, "read") follow_links = getBooleanParam(params, "follow_links") - + if "pattern" in params and params["pattern"]: pattern = params["pattern"] try: @@ -544,9 +544,9 @@ async def POST_Links(request): req_id = request.match_info.get("id") follow_links = getBooleanParam(params, "follow_links") - + create_order = getBooleanParam(params, "CreateOrder") - + limit = None if "Limit" in params: try: diff --git a/hsds/util/httpUtil.py b/hsds/util/httpUtil.py index f17a4ee6..5df7bfcb 100644 --- a/hsds/util/httpUtil.py +++ b/hsds/util/httpUtil.py @@ -42,17 +42,18 @@ def getUrl(host, port): """return url for host and port""" return f"http://{host}:{port}" + def getBooleanParam(params, key): """ return False if the given key is not in the params dict, or is it, but has the value, 0, or "0". return True otherwise """ - + if not isinstance(key, str): raise TypeError("expected str value for key") if key not in params: - return False - + return False + value = params[key] if not value: return False @@ -66,7 +67,7 @@ def getBooleanParam(params, key): return True else: return False - + def getPortFromUrl(url): """Get Port number for given url"""