Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add datastore_multisearch_counts action #145

Merged
merged 1 commit into from
Nov 29, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 38 additions & 0 deletions ckanext/versioned_datastore/logic/actions/meta/help.py
Original file line number Diff line number Diff line change
Expand Up @@ -775,3 +775,41 @@

:rtype: bool
'''

# Help text for the datastore_multisearch_counts action; it is handed to the @action
# decorator alongside the action's schema.
datastore_multisearch_counts = """
Count the number of records that match the query in each of the provided resources and
return the counts for each resource in a dict.

The query and resource IDs are parsed in the same way as the datastore_multisearch
action.

Params:

:param query: the search JSON
:type query: dict
:param version: version to search at, if not provided the current version of the data is
                searched
:type version: int, number of milliseconds (not seconds!) since UNIX epoch
:param resource_ids_and_versions: a dict of resource ids and the versions to search them
                                  at. If this is present its values are prioritised
                                  over the version and resource_ids parameters.
:type resource_ids_and_versions: dict of strings -> ints (number of milliseconds (not
                                 seconds!) since UNIX epoch)
:param query_version: the query language version (for example v1.0.0)
:type query_version: string
:param resource_ids: a list of resource ids to search. If no resource ids are specified
                     (either because the parameter is missing or because an empty list
                     is passed) then all resources in the datastore that the user can
                     access are searched. Any resources that the user cannot access or
                     that aren't datastore resources are skipped. If this means that no
                     resources are available from the provided list then a
                     ValidationError is raised.
:type resource_ids: a list of strings

**Results:**

The result of this action is a dictionary where the keys are the resource IDs and the
values are the number of records in that resource which matched the query. Resources
with no matching records are included with a count of 0.

:rtype: dict with str keys and int values
"""
10 changes: 10 additions & 0 deletions ckanext/versioned_datastore/logic/actions/meta/schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -234,3 +234,13 @@ def datastore_edit_slug():
'current_slug': [str, not_missing, not_empty],
'new_reserved_slug': [str, not_missing, not_empty, url_safe],
}


def datastore_multisearch_counts():
    """
    Build the validation schema for the datastore_multisearch_counts action.

    Every field's validator chain starts with ignore_missing, followed by the validator
    for that field's value.

    :return: a dict of parameter name -> list of validators
    """
    counts_schema = {}
    # the query and the resource/version mapping share the same JSON validator
    counts_schema['query'] = [ignore_missing, json_validator]
    counts_schema['version'] = [ignore_missing, int_validator]
    counts_schema['query_version'] = [ignore_missing, str]
    counts_schema['resource_ids'] = [ignore_missing, list_of_strings()]
    counts_schema['resource_ids_and_versions'] = [ignore_missing, json_validator]
    return counts_schema
97 changes: 97 additions & 0 deletions ckanext/versioned_datastore/logic/actions/multisearch.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from collections import defaultdict
from datetime import datetime
from typing import Dict

import jsonschema
from ckantools.decorators import action
Expand Down Expand Up @@ -622,3 +623,99 @@ def datastore_edit_slug(context, current_slug, new_reserved_slug):
slug.reserved_pretty_slug = new_reserved_slug.lower()
slug.commit()
return slug.as_dict()


@action(
    schema.datastore_multisearch_counts(),
    help.datastore_multisearch_counts,
    toolkit.side_effect_free,
)
def datastore_multisearch_counts(
    context,
    query=None,
    query_version=None,
    version=None,
    resource_ids=None,
    resource_ids_and_versions=None,
) -> Dict[str, int]:
    """
    Efficiently counts the number of records in each of the given resources matching the
    given query. A dict of resource IDs -> count is returned. If no records in a
    resource match the query then it will appear in the dict with a count value of 0.

    :param context: the context dict from the action call
    :param query: the query dict. If None (default) then an empty query is used
    :param query_version: the version of the query schema the query is using. If None
                          (default) then the latest query schema version is used
    :param version: the version to search the data at. If None (default) the current
                    time is used
    :param resource_ids: the list of resource to search. If None (default) then all the
                         resources the user has access to are queried. If a list of
                         resources are passed then any resources not accessible to the
                         user will be removed before querying
    :param resource_ids_and_versions: a dict of resources and versions to search each of
                                      them at. This allows precise searching of each
                                      resource at a specific version. If None
                                      (default) then the resource_ids parameter is used
                                      together with the version parameter. If this
                                      parameter is provided though, it takes priority
                                      over the resource_ids and version parameters.
    :raises toolkit.ValidationError: if the query fails validation, the query schema
                                     version is invalid, or none of the requested
                                     resources are available to search
    :return: a dict of resource IDs -> count
    """
    # provide some more complex defaults for some parameters if necessary
    if query is None:
        query = {}
    if query_version is None:
        query_version = get_latest_query_version()

    try:
        # validate and translate the query into an elasticsearch-dsl Search object
        validate_query(query, query_version)
        search = translate_query(query, query_version)
    except (jsonschema.ValidationError, InvalidQuerySchemaVersionError) as e:
        # two different exception types are caught here, so don't rely on both of them
        # exposing a .message attribute; fall back to the string form and keep the
        # original exception as the cause for debugging
        raise toolkit.ValidationError(getattr(e, "message", str(e))) from e

    # figure out which resources we're searching, the skipped resources aren't needed
    resource_ids, _ = determine_resources_to_search(
        context, resource_ids, resource_ids_and_versions
    )
    if not resource_ids:
        raise toolkit.ValidationError(
            "The requested resources aren't accessible to this user"
        )

    # add the version filter necessary given the parameters and the resources we're
    # searching
    version_filter = determine_version_filter(
        version, resource_ids, resource_ids_and_versions
    )
    search = search.filter(version_filter)

    # add the resource indexes we're searching on
    search = search.index(
        [prefix_resource(resource_id) for resource_id in resource_ids]
    )
    # no results please, we aren't going to use them
    search = search.extra(size=0)
    # use an aggregation to get the hit count of each resource, set the size to the
    # number of resources we're querying to ensure we get all counts in one go and don't
    # have to paginate with a composite agg
    search.aggs.bucket("counts", "terms", field="_index", size=len(resource_ids))

    # create a multisearch for this one query - this ensures there aren't any issues
    # with the length of the URL as the index list is passed as a part of the body
    multisearch = MultiSearch(using=common.ES_CLIENT).add(search)

    # run the search and get the only result from the search results list
    result = next(iter(multisearch.execute()))

    # build the response JSON, keyed on resource ID rather than index name
    counts = {
        trim_index_name(bucket["key"]): bucket["doc_count"]
        for bucket in result.aggs.to_dict()["counts"]["buckets"]
    }
    # add resources that didn't have any hits into the counts dict too
    for resource_id in resource_ids:
        counts.setdefault(resource_id, 0)
    return counts
6 changes: 6 additions & 0 deletions ckanext/versioned_datastore/logic/auth.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,3 +146,9 @@ def datastore_custom_download_filename(context, data_dict):
# only allow access to admins (they usually skip this check)
user_is_sysadmin = context.get('auth_user_obj').sysadmin
return {'success': user_is_sysadmin}


@auth(anon=True)
def datastore_multisearch_counts(context, data_dict):
    """
    Auth function for the datastore_multisearch_counts action.

    Neither the context nor the data dict is inspected: access is always granted.

    :param context: the context dict from the action call (unused)
    :param data_dict: the data dict passed to the action (unused)
    :return: a dict with "success" set to True
    """
    return dict(success=True)