NaturalHistoryMuseum · jrdh · Nov 29, 2023 · Nov 29, 2023
diff --git a/ckanext/versioned_datastore/logic/actions/meta/help.py b/ckanext/versioned_datastore/logic/actions/meta/help.py
@@ -775,3 +775,41 @@
 
 :rtype: bool
 '''
+
+datastore_multisearch_counts = """
+Count the number of records that match the query in each of the provided resources and
+return the counts for each resource in a dict.
+
+The query and resource IDs are parsed in the same way as the datastore_multisearch
+action.
+
+Params:
+
+:param query: the search JSON
+:type query: dict
+:param version: version to search at, if not provided the current version of the data is
+               searched
+:type version: int, number of milliseconds (not seconds!) since UNIX epoch
+:param resource_ids_and_versions: a dict of resource ids and the versions to search them
+                                  at. If this is present it's values are prioritised
+                                  over the version and resource_ids parameters.
+:type resource_ids_and_versions: dict of strings -> ints (number of milliseconds (not
+                                 seconds!) since UNIX epoch)
+:param query_version: the query language version (for example v1.0.0)
+:type query_version: string
+:param resource_ids: a list of resource ids to search. If no resources ids are specified
+                     (either because the parameter is missing or because an empty list
+                     is passed) then all resources in the datastore that the user can
+                     access are searched. Any resources that the user cannot access or
+                     that aren't datastore resources are skipped. If this means that no
+                     resources are available from the provided list then a
+                     ValidationError is raised.
+:type resource_ids: a list of strings
+
+**Results:**
+
+The result of this action is a dictionary where the keys are the resource IDs and the
+values are the number of records in that resource which matched the query.
+
+:rtype: dict with str keys and int values
+"""
diff --git a/ckanext/versioned_datastore/logic/actions/meta/schema.py b/ckanext/versioned_datastore/logic/actions/meta/schema.py
@@ -234,3 +234,13 @@ def datastore_edit_slug():
         'current_slug': [str, not_missing, not_empty],
         'new_reserved_slug': [str, not_missing, not_empty, url_safe],
     }
+
+
+def datastore_multisearch_counts():
+    return {
+        'query': [ignore_missing, json_validator],
+        'version': [ignore_missing, int_validator],
+        'query_version': [ignore_missing, str],
+        'resource_ids': [ignore_missing, list_of_strings()],
+        'resource_ids_and_versions': [ignore_missing, json_validator],
+    }
diff --git a/ckanext/versioned_datastore/logic/actions/multisearch.py b/ckanext/versioned_datastore/logic/actions/multisearch.py
@@ -1,5 +1,6 @@
 from collections import defaultdict
 from datetime import datetime
+from typing import Dict
 
 import jsonschema
 from ckantools.decorators import action
@@ -622,3 +623,99 @@ def datastore_edit_slug(context, current_slug, new_reserved_slug):
     slug.reserved_pretty_slug = new_reserved_slug.lower()
     slug.commit()
     return slug.as_dict()
+
+
+@action(
+    schema.datastore_multisearch_counts(),
+    help.datastore_multisearch_counts,
+    toolkit.side_effect_free,
+)
+def datastore_multisearch_counts(
+    context,
+    query=None,
+    query_version=None,
+    version=None,
+    resource_ids=None,
+    resource_ids_and_versions=None,
+) -> Dict[str, int]:
+    """
+    Efficiently counts the number of records in each of the given resources matching the
+    given query. A dict of resource IDs -> count is returned. If no records in a
+    resource match the query then it will appear in the dict with a count value of 0.
+
+    :param context: the context dict from the action call
+    :param query: the query dict. If None (default) then an empty query is used
+    :param query_version: the version of the query schema the query is using. If None
+                          (default) then the latest query schema version is used
+    :param version: the version to search the data at. If None (default) the current
+                    time is used
+    :param resource_ids: the list of resource to search. If None (default) then all the
+                         resources the user has access to are queried. If a list of
+                         resources are passed then any resources not accessible to the
+                         user will be removed before querying
+    :param resource_ids_and_versions: a dict of resources and versions to search each of
+                                      them at. This allows precise searching of each
+                                      resource at a specific parameter. If None
+                                      (default) then the resource_ids parameter is used
+                                      together with the version parameter. If this
+                                      parameter is provided though, it takes priority
+                                      over the resource_ids and version parameters.
+    :return: a dict of resource IDs -> count
+    """
+    # provide some more complex defaults for some parameters if necessary
+    if query is None:
+        query = {}
+    if query_version is None:
+        query_version = get_latest_query_version()
+
+    try:
+        # validate and translate the query into an elasticsearch-dsl Search object
+        validate_query(query, query_version)
+        search = translate_query(query, query_version)
+    except (jsonschema.ValidationError, InvalidQuerySchemaVersionError) as e:
+        raise toolkit.ValidationError(e.message)
+
+    # figure out which resources we're searching
+    resource_ids, skipped_resource_ids = determine_resources_to_search(
+        context, resource_ids, resource_ids_and_versions
+    )
+    if not resource_ids:
+        raise toolkit.ValidationError(
+            "The requested resources aren't accessible to this user"
+        )
+
+    # add the version filter necessary given the parameters and the resources we're
+    # searching
+    version_filter = determine_version_filter(
+        version, resource_ids, resource_ids_and_versions
+    )
+    search = search.filter(version_filter)
+
+    # add the resource indexes we're searching on
+    search = search.index(
+        [prefix_resource(resource_id) for resource_id in resource_ids]
+    )
+    # no results please, we aren't going to use them
+    search = search.extra(size=0)
+    # use an aggregation to get the hit count of each resource, set the size to the
+    # number of resources we're querying to ensure we get all counts in one go and don't
+    # have to paginate with a composite agg
+    search.aggs.bucket("counts", "terms", field="_index", size=len(resource_ids))
+
+    # create a multisearch for this one query - this ensures there aren't any issues
+    # with the length of the URL as the index list is passed as a part of the body
+    multisearch = MultiSearch(using=common.ES_CLIENT).add(search)
+
+    # run the search and get the only result from the search results list
+    result = next(iter(multisearch.execute()))
+
+    # build the response JSON
+    counts = {
+        trim_index_name(bucket["key"]): bucket["doc_count"]
+        for bucket in result.aggs.to_dict()["counts"]["buckets"]
+    }
+    # add resources that didn't have any hits into the counts dict too
+    counts.update(
+        {resource_id: 0 for resource_id in resource_ids if resource_id not in counts}
+    )
+    return counts
diff --git a/ckanext/versioned_datastore/logic/auth.py b/ckanext/versioned_datastore/logic/auth.py
@@ -146,3 +146,9 @@ def datastore_custom_download_filename(context, data_dict):
     # only allow access to admins (they usually skip this check)
     user_is_sysadmin = context.get('auth_user_obj').sysadmin
     return {'success': user_is_sysadmin}
+
+
+@auth(anon=True)
+def datastore_multisearch_counts(context, data_dict):
+    # allow access to everyone
+    return {"success": True}