From c0bbc2d204a994ff7d01a3f850a642aca8b8dc0c Mon Sep 17 00:00:00 2001
From: jreadey
Date: Mon, 22 Jan 2024 16:03:41 +0000
Subject: [PATCH] add pattern matching to post links

---
 hsds/domain_crawl.py     |  39 ++++-
 hsds/link_dn.py          |  42 ++---
 hsds/link_sn.py          |  36 ++++-
 tests/integ/link_test.py | 326 ++++++++++++++++++++++-----------------
 4 files changed, 279 insertions(+), 164 deletions(-)

diff --git a/hsds/domain_crawl.py b/hsds/domain_crawl.py
index 05f59b31..25a7b33e 100644
--- a/hsds/domain_crawl.py
+++ b/hsds/domain_crawl.py
@@ -19,6 +19,7 @@
 from aiohttp.web_exceptions import HTTPInternalServerError, HTTPNotFound, HTTPGone
 from .util.idUtil import getCollectionForId, getDataNodeUrl
+from .util.globparser import globmatch
 from .servicenode_lib import getObjectJson, getAttributes, putAttributes, getLinks, putLinks
 from . import hsds_logger as log
@@ -35,6 +36,9 @@ def __init__(
         include_attrs=False,
         include_data=False,
         ignore_nan=False,
+        create_order=False,
+        pattern=None,
+        limit=None,
         replace=False,
         ignore_error=False,
         max_tasks=40,
@@ -49,6 +53,9 @@
         self._include_attrs = include_attrs
         self._include_data = include_data
         self._ignore_nan = ignore_nan
+        self._create_order = create_order
+        self._pattern = pattern
+        self._limit = limit
         self._replace = replace
         self._max_tasks = max_tasks
         self._q = asyncio.Queue()
@@ -274,7 +281,7 @@ async def get_obj_json(self, obj_id):
     async def get_links(self, grp_id, titles=None):
         """ if titles is set, get all the links in grp_id that have a title in the list. Otherwise, return all links for the object. """
-        log.debug(f"get_links: {grp_id}")
+        log.debug(f"get_links: {grp_id}, follow_links: {self._follow_links}")
         if titles:
             log.debug(f"titles; {titles}")
         collection = getCollectionForId(grp_id)
@@ -284,6 +291,20 @@
         kwargs = {"bucket": self._bucket}
         if titles:
             kwargs["titles"] = titles
+        else:
+            # only use limit if we are attempting to fetch all links
+            if self._limit:
+                kwargs["limit"] = self._limit
+            if self._create_order:
+                kwargs["create_order"] = True
+        pattern = None
+        if self._pattern and not titles:
+            if self._follow_links:
+                # apply the pattern after we get the links back
+                log.debug("will apply pattern on return")
+                pattern = self._pattern
+            else:
+                kwargs["pattern"] = self._pattern
         log.debug(f"follow_links: {self._follow_links}")
         log.debug(f"getLinks kwargs: {kwargs}")
@@ -314,9 +335,21 @@
             return
 
         log.debug(f"DomainCrawler - got links for {grp_id}")
-        log.debug(f"save to obj_dict: {links}")
-        self._obj_dict[grp_id] = links  # store the links
+        if pattern:
+            filtered_links = []
+            for link in links:
+                title = link["title"]
+                if globmatch(title, pattern):
+                    filtered_links.append(link)
+            msg = f"getLinks with pattern: {pattern} returning "
+            msg += f"{len(filtered_links)} links from {len(links)}"
+            log.debug(msg)
+            log.debug(f"save to obj_dict: {filtered_links}")
+            self._obj_dict[grp_id] = filtered_links
+        else:
+            log.debug(f"save to obj_dict: {links}")
+            self._obj_dict[grp_id] = links  # store the links
 
         # if follow_links, add any group links to the lookup ids set
         if self._follow_links:
diff --git a/hsds/link_dn.py b/hsds/link_dn.py
index 974a4115..7c71baa0 100755
--- a/hsds/link_dn.py
+++ b/hsds/link_dn.py
@@ -43,11 +43,34 @@ def _index(items, marker, create_order=False):
     return -1
 
 
+def _getTitles(links, create_order=False):
+    titles = []
+    if create_order:
+        order_dict = {}
+        for title in links:
+            item = links[title]
+            if "created" not in item:
+ log.warning(f"expected to find 'created' key in link item {title}") + continue + order_dict[title] = item["created"] + log.debug(f"order_dict: {order_dict}") + # now sort by created + for k in sorted(order_dict.items(), key=lambda item: item[1]): + titles.append(k[0]) + log.debug(f"links by create order: {titles}") + else: + titles = list(links.keys()) + titles.sort() + log.debug(f"links by lexographic order: {titles}") + return titles + + async def GET_Links(request): """HTTP GET method to return JSON for a link collection""" log.request(request) app = request.app params = request.rel_url.query + log.debug(f"GET_Links params: {params}") group_id = get_obj_id(request) log.info(f"GET links: {group_id}") if not isValidUuid(group_id, obj_class="group"): @@ -95,24 +118,7 @@ async def GET_Links(request): # return a list of links based on sorted dictionary keys link_dict = group_json["links"] - titles = [] - if create_order: - order_dict = {} - for title in link_dict: - item = link_dict[title] - if "created" not in item: - log.warning(f"expected to find 'created' key in link item {title}") - continue - order_dict[title] = item["created"] - log.debug(f"order_dict: {order_dict}") - # now sort by created - for k in sorted(order_dict.items(), key=lambda item: item[1]): - titles.append(k[0]) - log.debug(f"links by create order: {titles}") - else: - titles = list(link_dict.keys()) - titles.sort() # sort by key - log.debug(f"links by lexographic order: {titles}") + titles = _getTitles(link_dict, create_order=create_order) if pattern: try: diff --git a/hsds/link_sn.py b/hsds/link_sn.py index f7aadd14..1ebc9449 100755 --- a/hsds/link_sn.py +++ b/hsds/link_sn.py @@ -551,6 +551,7 @@ async def POST_Links(request): log.request(request) app = request.app params = request.rel_url.query + log.debug(f"POST_Links params: {params}") log.info("POST_Links") req_id = request.match_info.get("id") @@ -558,6 +559,22 @@ async def POST_Links(request): follow_links = True else: follow_links = False + create_order = False + if "CreateOrder" in params and params["CreateOrder"]: + if params["CreateOrder"] != "0": + create_order = True + limit = None + if "Limit" in params: + try: + limit = int(params["Limit"]) + except ValueError: + msg = "Bad Request: Expected int type for limit" + log.warn(msg) + raise HTTPBadRequest(reason=msg) + if "pattern" in params: + pattern = params["pattern"] + else: + pattern = None if not request.has_body: msg = "POST Links with no body" @@ -668,8 +685,19 @@ async def POST_Links(request): elif len(items) == 1 and not follow_links: # just make a request to the datanode group_id = list(items.keys())[0] + kwargs = {"bucket": bucket} + titles = items[group_id] - links = await getLinks(app, group_id, titles=titles, bucket=bucket) + if titles: + kwargs["titles"] = titles + else: + if limit: + kwargs["limit"] = limit + if create_order: + kwargs["create_order"] = True + if pattern: + kwargs["pattern"] = pattern + links = await getLinks(app, group_id, **kwargs) resp_json["links"] = links else: @@ -678,6 +706,12 @@ async def POST_Links(request): kwargs = {"action": "get_link", "bucket": bucket, "include_links": True} if follow_links: kwargs["follow_links"] = True + if create_order: + kwargs["create_order"] = True + if limit: + kwargs["limit"] = limit + if pattern: + kwargs["pattern"] = pattern crawler = DomainCrawler(app, items, **kwargs) # will raise exception on NotFound, etc. 
         await crawler.crawl()
diff --git a/tests/integ/link_test.py b/tests/integ/link_test.py
index aa23e481..c141db40 100755
--- a/tests/integ/link_test.py
+++ b/tests/integ/link_test.py
@@ -344,100 +344,122 @@ def testGetLinks(self):
         req = helper.getEndpoint() + "/groups/" + root_id + "/links"
 
-        for creation_order in (False, True):
-
-            # get all the links for the root group
-            params = {}
-            if creation_order:
-                params["CreateOrder"] = 1
-            rsp = self.session.get(req, params=params, headers=headers)
-            self.assertEqual(rsp.status_code, 200)
-            rspJson = json.loads(rsp.text)
-            self.assertTrue("links" in rspJson)
-            self.assertTrue("hrefs" in rspJson)
-            links = rspJson["links"]
-            self.assertEqual(len(links), len(link_names))
-            ret_names = []
-            for link in links:
-                self.assertTrue("title" in link)
-                self.assertTrue("class" in link)
-                self.assertEqual(link["class"], "H5L_TYPE_HARD")
-                self.assertTrue("collection" in link)
-                self.assertEqual(link["collection"], "groups")
-                self.assertTrue("created" in link)
-                ret_names.append(link["title"])
+        for use_post in (False, True):
+            for creation_order in (False, True):
+                # get all the links for the root group
+                params = {}
+                if creation_order:
+                    params["CreateOrder"] = 1
+
+                if use_post:
+                    payload = {"group_ids": [root_id, ]}
+                    data = json.dumps(payload)
+                    rsp = self.session.post(req, data=data, params=params, headers=headers)
+                else:
+                    rsp = self.session.get(req, params=params, headers=headers)
+                self.assertEqual(rsp.status_code, 200)
+                rspJson = json.loads(rsp.text)
+                self.assertTrue("links" in rspJson)
+                if use_post:
+                    pass  # hrefs not returned for post
+                else:
+                    self.assertTrue("hrefs" in rspJson)
+                links = rspJson["links"]
+                self.assertEqual(len(links), len(link_names))
+                ret_names = []
+                for link in links:
+                    self.assertTrue("title" in link)
+                    self.assertTrue("class" in link)
+                    self.assertEqual(link["class"], "H5L_TYPE_HARD")
+                    if use_post:
+                        pass  # href, collection not returned for post
+                    else:
+                        self.assertTrue("href" in link)
+                        self.assertTrue("collection" in link)
+                        self.assertEqual(link["collection"], "groups")
+                    self.assertTrue("created" in link)
+                    ret_names.append(link["title"])
 
-            expected_names = copy(link_names)
+                expected_names = copy(link_names)
 
-            if creation_order:
-                # result should come back in sorted order
-                pass
-            else:
-                expected_names.sort()  # lexographic order
-                # sorted list should be:
-                # ['eighth', 'eleventh', 'fifth', 'first', 'fourth', 'ninth',
-                # 'second', 'seventh', 'sixth', 'tenth', 'third', 'twelfth']
-                #
-
-            self.assertEqual(ret_names, expected_names)
-
-            # get links with a result limit of 4
-            limit = 4
-            params = {"Limit": limit}
-            if creation_order:
-                params["CreateOrder"] = 1
-            rsp = self.session.get(req, params=params, headers=headers)
-            self.assertEqual(rsp.status_code, 200)
-            rspJson = json.loads(rsp.text)
-            self.assertTrue("links" in rspJson)
-            self.assertTrue("hrefs" in rspJson)
-            links = rspJson["links"]
-            self.assertEqual(len(links), limit)
-            last_link = links[-1]
-            self.assertEqual(last_link["title"], expected_names[limit - 1])
-
-            # get links after the one with name: "seventh"
-            marker = "seventh"
-            params = {"Marker": marker}
-            if creation_order:
-                params["CreateOrder"] = 1
-            rsp = self.session.get(req, params=params, headers=headers)
-            self.assertEqual(rsp.status_code, 200)
-            rspJson = json.loads(rsp.text)
-            self.assertTrue("links" in rspJson)
-            self.assertTrue("hrefs" in rspJson)
-            links = rspJson["links"]
-            if creation_order:
-                self.assertEqual(len(links), 5)
-            else:
-                self.assertEqual(len(links), 4)
-            last_link = links[-1]
-            # "twelfth" is last in either ordering
-            self.assertEqual(last_link["title"], "twelfth")
-
-            # Use a marker that is not present (should return 404)
-            params["Marker"] = "foobar"
-            rsp = self.session.get(req, params=params, headers=headers)
-            self.assertEqual(rsp.status_code, 404)
-
-            # get links starting with name: "seventh", and limit to 3 results
-            params["Marker"] = "seventh"
-            limit = 3
-            params["Limit"] = limit
-            rsp = self.session.get(req, params=params, headers=headers)
-            self.assertEqual(rsp.status_code, 200)
-            rspJson = json.loads(rsp.text)
-            self.assertTrue("links" in rspJson)
-            self.assertTrue("hrefs" in rspJson)
-            links = rspJson["links"]
-            self.assertEqual(len(links), 3)
-            last_link = links[-1]
-            if creation_order:
-                # expecting: "eighth", "ninth", "tenth"
-                self.assertEqual(last_link["title"], "tenth")
-            else:
-                # expecting: "sixth", "tenth", "third"
-                self.assertEqual(last_link["title"], "third")
+                if creation_order:
+                    # result should come back in sorted order
+                    pass
+                else:
+                    expected_names.sort()  # lexicographic order
+                    # sorted list should be:
+                    # ['eighth', 'eleventh', 'fifth', 'first', 'fourth', 'ninth',
+                    # 'second', 'seventh', 'sixth', 'tenth', 'third', 'twelfth']
+                    #
+
+                self.assertEqual(ret_names, expected_names)
+
+                # get links with a result limit of 4
+                limit = 4
+                params = {"Limit": limit}
+                if creation_order:
+                    params["CreateOrder"] = 1
+                if use_post:
+                    payload = {"group_ids": [root_id, ]}
+                    data = json.dumps(payload)
+                    rsp = self.session.post(req, data=data, params=params, headers=headers)
+                else:
+                    rsp = self.session.get(req, params=params, headers=headers)
+                self.assertEqual(rsp.status_code, 200)
+                rspJson = json.loads(rsp.text)
+                self.assertTrue("links" in rspJson)
+                if use_post:
+                    pass  # no hrefs for post
+                else:
+                    self.assertTrue("hrefs" in rspJson)
+                links = rspJson["links"]
+                self.assertEqual(len(links), limit)
+                last_link = links[-1]
+                self.assertEqual(last_link["title"], expected_names[limit - 1])
+
+                # get links after the one with name: "seventh"
+                marker = "seventh"
+                params = {"Marker": marker}
+                if creation_order:
+                    params["CreateOrder"] = 1
+                # Marker isn't supported for POST, so just run get twice
+                rsp = self.session.get(req, params=params, headers=headers)
+                self.assertEqual(rsp.status_code, 200)
+                rspJson = json.loads(rsp.text)
+                self.assertTrue("links" in rspJson)
+                self.assertTrue("hrefs" in rspJson)
+                links = rspJson["links"]
+                if creation_order:
+                    self.assertEqual(len(links), 5)
+                else:
+                    self.assertEqual(len(links), 4)
+                last_link = links[-1]
+                # "twelfth" is last in either ordering
+                self.assertEqual(last_link["title"], "twelfth")
+
+                # Use a marker that is not present (should return 404)
+                params["Marker"] = "foobar"
+                rsp = self.session.get(req, params=params, headers=headers)
+                self.assertEqual(rsp.status_code, 404)
+
+                # get links starting with name: "seventh", and limit to 3 results
+                params["Marker"] = "seventh"
+                limit = 3
+                params["Limit"] = limit
+                rsp = self.session.get(req, params=params, headers=headers)
+                self.assertEqual(rsp.status_code, 200)
+                rspJson = json.loads(rsp.text)
+                self.assertTrue("links" in rspJson)
+                self.assertTrue("hrefs" in rspJson)
+                links = rspJson["links"]
+                self.assertEqual(len(links), 3)
+                last_link = links[-1]
+                if creation_order:
+                    # expecting: "eighth", "ninth", "tenth"
+                    self.assertEqual(last_link["title"], "tenth")
+                else:
+                    # expecting: "sixth", "tenth", "third"
+                    self.assertEqual(last_link["title"], "third")
 
     def testGet(self):
         # test getting links from an existing domain
@@ -623,60 +645,80 @@ def testGetPattern(self):
         g1_2_uuid = helper.getUUIDByPath(domain, "/g1/g1.2", session=self.session)
 
         now = time.time()
-        # do get with a regex pattern
+        # do get with a glob pattern
         # get links for /g1/g1.2:
-        req = helper.getEndpoint() + "/groups/" + g1_2_uuid + "/links"
-        params = {"pattern": "ext*"}
-        rsp = self.session.get(req, params=params, headers=headers)
-        self.assertEqual(rsp.status_code, 200)
-        rspJson = json.loads(rsp.text)
-        self.assertTrue("links" in rspJson)
-        links = rspJson["links"]
-        self.assertEqual(len(links), 1)  # only extlink should be returned
-        link = links[0]
-        for name in ("created", "class", "h5domain", "h5path", "title", "href"):
-            self.assertTrue(name in link)
-        self.assertEqual(link["class"], "H5L_TYPE_EXTERNAL")
-        self.assertEqual(link["title"], "extlink")
-        self.assertEqual(link["h5domain"], "somefile")
-        self.assertEqual(link["h5path"], "somepath")
-        self.assertTrue(link["created"] < now - 10)
-        # get links for root group and other groups recursively
-        req = helper.getEndpoint() + "/groups/" + root_uuid + "/links"
-        params = {"follow_links": 1, "pattern": "dset*"}
-        rsp = self.session.get(req, params=params, headers=headers)
-        self.assertEqual(rsp.status_code, 200)
-        rspJson = json.loads(rsp.text)
-        self.assertTrue("hrefs" in rspJson)
-        hrefs = rspJson["hrefs"]
-        self.assertEqual(len(hrefs), 3)
-        self.assertTrue("links" in rspJson)
-        obj_map = rspJson["links"]  # map of grp ids to links
-
-        expected_dset_links = ("dset1.1.1", "dset1.1.2", "dset2.1", "dset2.2")
-
-        self.assertEqual(len(obj_map), 6)  # 6 groups should be returned
-        link_count = 0
+        for use_post in (False, True):
+            req = helper.getEndpoint() + "/groups/" + g1_2_uuid + "/links"
+            params = {"pattern": "ext*"}
+            if use_post:
+                payload = {"group_ids": [g1_2_uuid, ]}
+                data = json.dumps(payload)
+                rsp = self.session.post(req, data=data, params=params, headers=headers)
+            else:
+                rsp = self.session.get(req, params=params, headers=headers)
+            self.assertEqual(rsp.status_code, 200)
+            rspJson = json.loads(rsp.text)
+            self.assertTrue("links" in rspJson)
+            links = rspJson["links"]
-        for grp_id in obj_map:
-            helper.validateId(grp_id)
-            links = obj_map[grp_id]
-            for link in links:
-                self.assertTrue("title" in link)
-                link_title = link["title"]
-                self.assertTrue(link_title in expected_dset_links)
-                self.assertTrue("class" in link)
-                link_class = link["class"]
-                # only hardlinks will be a match with this pattern
-                self.assertEqual(link_class, "H5L_TYPE_HARD")
-                link_count += 1
-                self.assertTrue("id" in link)
-                link_id = link["id"]
-                helper.validateId(link_id)
-                self.assertTrue(link_id.startswith("d-"))  # link to a dataset
+            self.assertEqual(len(links), 1)  # only extlink should be returned
+            link = links[0]
+            for name in ("created", "class", "h5domain", "h5path", "title"):
+                self.assertTrue(name in link)
+            if use_post:
+                pass  # no href with post
+            else:
+                self.assertTrue("href" in link)
+            self.assertEqual(link["class"], "H5L_TYPE_EXTERNAL")
+            self.assertEqual(link["title"], "extlink")
+            self.assertEqual(link["h5domain"], "somefile")
+            self.assertEqual(link["h5path"], "somepath")
+            self.assertTrue(link["created"] < now - 10)
+
+            # get links for root group and other groups recursively
+            req = helper.getEndpoint() + "/groups/" + root_uuid + "/links"
+            params = {"follow_links": 1, "pattern": "dset*"}
+            if use_post:
+                payload = {"group_ids": [root_uuid, ]}
+                data = json.dumps(payload)
+                rsp = self.session.post(req, data=data, params=params, headers=headers)
+            else:
+                rsp = self.session.get(req, params=params, headers=headers)
+            self.assertEqual(rsp.status_code, 200)
+            rspJson = json.loads(rsp.text)
+            if use_post:
+                pass  # hrefs not returned with post
+            else:
+                self.assertTrue("hrefs" in rspJson)
+                hrefs = rspJson["hrefs"]
+                self.assertEqual(len(hrefs), 3)
+            self.assertTrue("links" in rspJson)
+            obj_map = rspJson["links"]  # map of grp ids to links
+
+            expected_dset_links = ("dset1.1.1", "dset1.1.2", "dset2.1", "dset2.2")
+
+            self.assertEqual(len(obj_map), 6)  # 6 groups should be returned
+            link_count = 0
+
+            for grp_id in obj_map:
+                helper.validateId(grp_id)
+                links = obj_map[grp_id]
+                for link in links:
+                    self.assertTrue("title" in link)
+                    link_title = link["title"]
+                    self.assertTrue(link_title in expected_dset_links)
+                    self.assertTrue("class" in link)
+                    link_class = link["class"]
+                    # only hardlinks will be a match with this pattern
+                    self.assertEqual(link_class, "H5L_TYPE_HARD")
+                    link_count += 1
+                    self.assertTrue("id" in link)
+                    link_id = link["id"]
+                    helper.validateId(link_id)
+                    self.assertTrue(link_id.startswith("d-"))  # link to a dataset
-        self.assertEqual(link_count, len(expected_dset_links))
+            self.assertEqual(link_count, len(expected_dset_links))
 
     def testSoftLinkTraversal(self):
         # test that an object can be found via path with an external link
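
Note: below is a minimal client-side sketch of the query parameters this patch adds, not part of the patch itself. The endpoint, domain header, and group id values are assumptions for illustration (the integ tests above obtain the same values via the helper module and a test session); only the parameter names (pattern, Limit, CreateOrder, follow_links) and the POST body shape (group_ids) come from the change.

    import json
    import requests

    endpoint = "http://localhost:5101"  # assumed HSDS service endpoint
    # assumed domain header; helper.getRequestHeaders() builds this in the tests
    headers = {"X-Hdf-domain": "/home/test_user1/link_test"}
    group_id = "g-12345678-1234-1234-1234-123456789012"  # hypothetical group id

    req = endpoint + "/groups/" + group_id + "/links"

    # GET with a glob pattern: only links whose titles match "dset*" are
    # returned; Limit and CreateOrder can be combined with pattern
    params = {"pattern": "dset*", "Limit": 10, "CreateOrder": 1}
    rsp = requests.get(req, params=params, headers=headers)
    links = rsp.json()["links"]

    # POST form: same query params, with the group ids to query in the JSON
    # body; per the tests above, hrefs are not returned for POST responses
    payload = {"group_ids": [group_id, ]}
    rsp = requests.post(req, data=json.dumps(payload), params=params, headers=headers)
    links = rsp.json()["links"]

With follow_links=1 in the params, the "links" value is a map of group id to matching links rather than a flat list, as exercised in testGetPattern above; in that case the pattern is applied by the DomainCrawler after each group's links are fetched, instead of being pushed down to the datanode request.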