diff --git a/ckanext/versioned_datastore/logic/actions/meta/help.py b/ckanext/versioned_datastore/logic/actions/meta/help.py index ecba330f..8ab1e89f 100644 --- a/ckanext/versioned_datastore/logic/actions/meta/help.py +++ b/ckanext/versioned_datastore/logic/actions/meta/help.py @@ -775,3 +775,41 @@ :rtype: bool ''' + +datastore_multisearch_counts = """ +Count the number of records that match the query in each of the provided resources and +return the counts for each resource in a dict. + +The query and resource IDs are parsed in the same way as the datastore_multisearch +action. + +Params: + +:param query: the search JSON +:type query: dict +:param version: version to search at, if not provided the current version of the data is + searched +:type version: int, number of milliseconds (not seconds!) since UNIX epoch +:param resource_ids_and_versions: a dict of resource ids and the versions to search them + at. If this is present it's values are prioritised + over the version and resource_ids parameters. +:type resource_ids_and_versions: dict of strings -> ints (number of milliseconds (not + seconds!) since UNIX epoch) +:param query_version: the query language version (for example v1.0.0) +:type query_version: string +:param resource_ids: a list of resource ids to search. If no resources ids are specified + (either because the parameter is missing or because an empty list + is passed) then all resources in the datastore that the user can + access are searched. Any resources that the user cannot access or + that aren't datastore resources are skipped. If this means that no + resources are available from the provided list then a + ValidationError is raised. +:type resource_ids: a list of strings + +**Results:** + +The result of this action is a dictionary where the keys are the resource IDs and the +values are the number of records in that resource which matched the query. + +:rtype: dict with str keys and int values +""" diff --git a/ckanext/versioned_datastore/logic/actions/meta/schema.py b/ckanext/versioned_datastore/logic/actions/meta/schema.py index 40f1d195..a8bfc753 100644 --- a/ckanext/versioned_datastore/logic/actions/meta/schema.py +++ b/ckanext/versioned_datastore/logic/actions/meta/schema.py @@ -234,3 +234,13 @@ def datastore_edit_slug(): 'current_slug': [str, not_missing, not_empty], 'new_reserved_slug': [str, not_missing, not_empty, url_safe], } + + +def datastore_multisearch_counts(): + return { + 'query': [ignore_missing, json_validator], + 'version': [ignore_missing, int_validator], + 'query_version': [ignore_missing, str], + 'resource_ids': [ignore_missing, list_of_strings()], + 'resource_ids_and_versions': [ignore_missing, json_validator], + } diff --git a/ckanext/versioned_datastore/logic/actions/multisearch.py b/ckanext/versioned_datastore/logic/actions/multisearch.py index 1df24882..8c4cb166 100644 --- a/ckanext/versioned_datastore/logic/actions/multisearch.py +++ b/ckanext/versioned_datastore/logic/actions/multisearch.py @@ -1,5 +1,6 @@ from collections import defaultdict from datetime import datetime +from typing import Dict import jsonschema from ckantools.decorators import action @@ -622,3 +623,99 @@ def datastore_edit_slug(context, current_slug, new_reserved_slug): slug.reserved_pretty_slug = new_reserved_slug.lower() slug.commit() return slug.as_dict() + + +@action( + schema.datastore_multisearch_counts(), + help.datastore_multisearch_counts, + toolkit.side_effect_free, +) +def datastore_multisearch_counts( + context, + query=None, + query_version=None, + version=None, + resource_ids=None, + resource_ids_and_versions=None, +) -> Dict[str, int]: + """ + Efficiently counts the number of records in each of the given resources matching the + given query. A dict of resource IDs -> count is returned. If no records in a + resource match the query then it will appear in the dict with a count value of 0. + + :param context: the context dict from the action call + :param query: the query dict. If None (default) then an empty query is used + :param query_version: the version of the query schema the query is using. If None + (default) then the latest query schema version is used + :param version: the version to search the data at. If None (default) the current + time is used + :param resource_ids: the list of resource to search. If None (default) then all the + resources the user has access to are queried. If a list of + resources are passed then any resources not accessible to the + user will be removed before querying + :param resource_ids_and_versions: a dict of resources and versions to search each of + them at. This allows precise searching of each + resource at a specific parameter. If None + (default) then the resource_ids parameter is used + together with the version parameter. If this + parameter is provided though, it takes priority + over the resource_ids and version parameters. + :return: a dict of resource IDs -> count + """ + # provide some more complex defaults for some parameters if necessary + if query is None: + query = {} + if query_version is None: + query_version = get_latest_query_version() + + try: + # validate and translate the query into an elasticsearch-dsl Search object + validate_query(query, query_version) + search = translate_query(query, query_version) + except (jsonschema.ValidationError, InvalidQuerySchemaVersionError) as e: + raise toolkit.ValidationError(e.message) + + # figure out which resources we're searching + resource_ids, skipped_resource_ids = determine_resources_to_search( + context, resource_ids, resource_ids_and_versions + ) + if not resource_ids: + raise toolkit.ValidationError( + "The requested resources aren't accessible to this user" + ) + + # add the version filter necessary given the parameters and the resources we're + # searching + version_filter = determine_version_filter( + version, resource_ids, resource_ids_and_versions + ) + search = search.filter(version_filter) + + # add the resource indexes we're searching on + search = search.index( + [prefix_resource(resource_id) for resource_id in resource_ids] + ) + # no results please, we aren't going to use them + search = search.extra(size=0) + # use an aggregation to get the hit count of each resource, set the size to the + # number of resources we're querying to ensure we get all counts in one go and don't + # have to paginate with a composite agg + search.aggs.bucket("counts", "terms", field="_index", size=len(resource_ids)) + + # create a multisearch for this one query - this ensures there aren't any issues + # with the length of the URL as the index list is passed as a part of the body + multisearch = MultiSearch(using=common.ES_CLIENT).add(search) + + # run the search and get the only result from the search results list + result = next(iter(multisearch.execute())) + + # build the response JSON + counts = { + trim_index_name(bucket["key"]): bucket["doc_count"] + for bucket in result.aggs.to_dict()["counts"]["buckets"] + } + # add resources that didn't have any hits into the counts dict too + counts.update( + {resource_id: 0 for resource_id in resource_ids if resource_id not in counts} + ) + return counts diff --git a/ckanext/versioned_datastore/logic/auth.py b/ckanext/versioned_datastore/logic/auth.py index 5ad12965..2336eba2 100644 --- a/ckanext/versioned_datastore/logic/auth.py +++ b/ckanext/versioned_datastore/logic/auth.py @@ -146,3 +146,9 @@ def datastore_custom_download_filename(context, data_dict): # only allow access to admins (they usually skip this check) user_is_sysadmin = context.get('auth_user_obj').sysadmin return {'success': user_is_sysadmin} + + +@auth(anon=True) +def datastore_multisearch_counts(context, data_dict): + # allow access to everyone + return {"success": True}