add pattern matching to post links
jreadey committed Jan 22, 2024
1 parent f07794d commit c0bbc2d
Showing 4 changed files with 279 additions and 164 deletions.
39 changes: 36 additions & 3 deletions hsds/domain_crawl.py
@@ -19,6 +19,7 @@
from aiohttp.web_exceptions import HTTPInternalServerError, HTTPNotFound, HTTPGone

from .util.idUtil import getCollectionForId, getDataNodeUrl
from .util.globparser import globmatch
from .servicenode_lib import getObjectJson, getAttributes, putAttributes, getLinks, putLinks
from . import hsds_logger as log

@@ -35,6 +36,9 @@ def __init__(
include_attrs=False,
include_data=False,
ignore_nan=False,
create_order=False,
pattern=None,
limit=None,
replace=False,
ignore_error=False,
max_tasks=40,
@@ -49,6 +53,9 @@ def __init__(
self._include_attrs = include_attrs
self._include_data = include_data
self._ignore_nan = ignore_nan
self._create_order = create_order
self._pattern = pattern
self._limit = limit
self._replace = replace
self._max_tasks = max_tasks
self._q = asyncio.Queue()
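
The new options surface as constructor keywords. A hypothetical instantiation, assuming app and a list of object ids as the leading positional arguments (the id and bucket values below are invented; only the keyword names come from this commit):

crawler = DomainCrawler(
    app,                   # aiohttp application object
    ["g-1234"],            # object ids to crawl (illustrative)
    bucket="mybucket",     # bucket keyword as passed from link_sn.py; value invented
    create_order=True,     # return links in creation order
    pattern="dset_*",      # glob pattern matched against link titles
    limit=100,             # cap on the number of links fetched per group
)
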
@@ -274,7 +281,7 @@ async def get_obj_json(self, obj_id):
async def get_links(self, grp_id, titles=None):
""" if titles is set, get all the links in grp_id that
have a title in the list. Otherwise, return all links for the object. """
log.debug(f"get_links: {grp_id}")
log.debug(f"get_links: {grp_id}m follow_links: {self._follow_links}")
if titles:
log.debug(f"titles; {titles}")
collection = getCollectionForId(grp_id)
@@ -284,6 +291,20 @@ async def get_links(self, grp_id, titles=None):
kwargs = {"bucket": self._bucket}
if titles:
kwargs["titles"] = titles
else:
# only use limit if we are attempting to fetch all links
if self._limit:
kwargs["limit"] = self._limit
if self._create_order:
kwargs["create_order"] = True
pattern = None
if self._pattern and not titles:
if self._follow_links:
# apply the pattern after we get the links back
log.debug("will apply pattern on return")
pattern = self._pattern
else:
kwargs["pattern"] = self._pattern

log.debug(f"follow_links: {self._follow_links}")
log.debug(f"getLinks kwargs: {kwargs}")
@@ -314,9 +335,21 @@ async def get_links(self, grp_id, titles=None):
return

log.debug(f"DomainCrawler - got links for {grp_id}")
log.debug(f"save to obj_dict: {links}")

self._obj_dict[grp_id] = links # store the links
if pattern:
filtered_links = []
for link in links:
title = link["title"]
if globmatch(title, pattern):
filtered_links.append(link)
msg = f"getLinks with pattern: {pattern} returning "
msg += f"{len(filtered_links)} links from {len(links)}"
log.debug(msg)
log.debug(f"save to obj_dict: {filtered_links}")
self._obj_dict[grp_id] = filtered_links
else:
log.debug(f"save to obj_dict: {links}")
self._obj_dict[grp_id] = links # store the links

# if follow_links, add any group links to the lookup ids set
if self._follow_links:
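When the pattern is held back, the filtering above leans on globparser.globmatch. A self-contained approximation of that step, substituting the standard library's fnmatch for hsds's own glob parser (the link dicts with a "title" key follow the shape used in the hunk):

from fnmatch import fnmatchcase  # stand-in for globparser.globmatch

links = [{"title": "dset_001"}, {"title": "dset_002"}, {"title": "grp_a"}]
pattern = "dset_*"

# same shape as the loop in DomainCrawler.get_links above
filtered_links = [link for link in links if fnmatchcase(link["title"], pattern)]
print(f"{len(filtered_links)} of {len(links)} links match {pattern!r}")
# -> 2 of 3 links match 'dset_*'
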
42 changes: 24 additions & 18 deletions hsds/link_dn.py
@@ -43,11 +43,34 @@ def _index(items, marker, create_order=False):
return -1


def _getTitles(links, create_order=False):
titles = []
if create_order:
order_dict = {}
for title in links:
item = links[title]
if "created" not in item:
log.warning(f"expected to find 'created' key in link item {title}")
continue
order_dict[title] = item["created"]
log.debug(f"order_dict: {order_dict}")
# now sort by created
for k in sorted(order_dict.items(), key=lambda item: item[1]):
titles.append(k[0])
log.debug(f"links by create order: {titles}")
else:
titles = list(links.keys())
titles.sort()
log.debug(f"links by lexographic order: {titles}")
return titles
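
A quick check of the semantics of _getTitles, which this commit factors out of GET_Links below (the sample links dict is invented; the "created" timestamp is the key the function sorts on):

links = {
    "beta": {"created": 1700000300.0},
    "alpha": {"created": 1700000200.0},
    "gamma": {"created": 1700000100.0},
}
print(_getTitles(links))                     # ['alpha', 'beta', 'gamma']
print(_getTitles(links, create_order=True))  # ['gamma', 'alpha', 'beta']
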


async def GET_Links(request):
"""HTTP GET method to return JSON for a link collection"""
log.request(request)
app = request.app
params = request.rel_url.query
log.debug(f"GET_Links params: {params}")
group_id = get_obj_id(request)
log.info(f"GET links: {group_id}")
if not isValidUuid(group_id, obj_class="group"):
@@ -95,24 +118,7 @@ async def GET_Links(request):
# return a list of links based on sorted dictionary keys
link_dict = group_json["links"]

titles = []
if create_order:
order_dict = {}
for title in link_dict:
item = link_dict[title]
if "created" not in item:
log.warning(f"expected to find 'created' key in link item {title}")
continue
order_dict[title] = item["created"]
log.debug(f"order_dict: {order_dict}")
# now sort by created
for k in sorted(order_dict.items(), key=lambda item: item[1]):
titles.append(k[0])
log.debug(f"links by create order: {titles}")
else:
titles = list(link_dict.keys())
titles.sort() # sort by key
log.debug(f"links by lexographic order: {titles}")
titles = _getTitles(link_dict, create_order=create_order)

if pattern:
try:
36 changes: 35 additions & 1 deletion hsds/link_sn.py
@@ -551,13 +551,30 @@ async def POST_Links(request):
log.request(request)
app = request.app
params = request.rel_url.query
log.debug(f"POST_Links params: {params}")
log.info("POST_Links")
req_id = request.match_info.get("id")

if params.get("follow_links"):
follow_links = True
else:
follow_links = False
create_order = False
if "CreateOrder" in params and params["CreateOrder"]:
if params["CreateOrder"] != "0":
create_order = True
limit = None
if "Limit" in params:
try:
limit = int(params["Limit"])
except ValueError:
msg = "Bad Request: Expected int type for limit"
log.warn(msg)
raise HTTPBadRequest(reason=msg)
if "pattern" in params:
pattern = params["pattern"]
else:
pattern = None

if not request.has_body:
msg = "POST Links with no body"
@@ -668,8 +685,19 @@ async def POST_Links(request):
elif len(items) == 1 and not follow_links:
# just make a request to the datanode
group_id = list(items.keys())[0]
kwargs = {"bucket": bucket}

titles = items[group_id]
links = await getLinks(app, group_id, titles=titles, bucket=bucket)
if titles:
kwargs["titles"] = titles
else:
if limit:
kwargs["limit"] = limit
if create_order:
kwargs["create_order"] = True
if pattern:
kwargs["pattern"] = pattern
links = await getLinks(app, group_id, **kwargs)

resp_json["links"] = links
else:
@@ -678,6 +706,12 @@ async def POST_Links(request):
kwargs = {"action": "get_link", "bucket": bucket, "include_links": True}
if follow_links:
kwargs["follow_links"] = True
if create_order:
kwargs["create_order"] = True
if limit:
kwargs["limit"] = limit
if pattern:
kwargs["pattern"] = pattern
crawler = DomainCrawler(app, items, **kwargs)
# will raise exception on NotFound, etc.
await crawler.crawl()
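
Both branches now honor the same option set. Illustrative kwargs for each path, with the calls shown in the hunks above left as comments (bucket name and values invented):

# Single group, no link-following: one direct datanode request
kwargs = {"bucket": "mybucket", "create_order": True, "limit": 10, "pattern": "dset_*"}
# links = await getLinks(app, group_id, **kwargs)

# Multiple groups or follow_links: delegate to the crawler, which
# applies the pattern itself after fetching when follow_links is set
kwargs = {"action": "get_link", "bucket": "mybucket", "include_links": True,
          "follow_links": True, "create_order": True, "limit": 10, "pattern": "dset_*"}
# crawler = DomainCrawler(app, items, **kwargs)
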