facets: integrate combined_subjects / fix nested subject faceting

- closes #798
inveniosoftware · Apr 11, 2024 · f2116da · f2116da
1 parent 86b4662
commit f2116da
Show file tree

Hide file tree

Showing 14 changed files with 194 additions and 10 deletions.
diff --git a/invenio_rdm_records/config.py b/invenio_rdm_records/config.py
@@ -174,6 +174,11 @@ def always_valid(identifier):
             "field": "subjects.subject",
         },
     },
+    # subject_nested is deprecated and should be removed.
+    # subject_combined does require a pre-existing change to indexed documents,
+    # so it's unclear if a direct replacement is right.
+    # Keeping it around until v13 might be better. On the flipside it is an incorrect
+    # facet...
     "subject_nested": {
         "facet": facets.subject_nested,
         "ui": {
@@ -183,6 +188,15 @@ def always_valid(identifier):
             },
         },
     },
+    "subject_combined": {
+        "facet": facets.subject_combined,
+        "ui": {
+            "field": "subjects.scheme",
+            "childAgg": {
+                "field": "subjects.subject",
+            },
+        },
+    },
 }
 
 RDM_SEARCH_SORT_BY_VERIFIED = False

diff --git a/invenio_rdm_records/records/api.py b/invenio_rdm_records/records/api.py
@@ -47,6 +47,7 @@
 
 from . import models
 from .dumpers import (
+    CombinedSubjectsDumperExt,
     EDTFDumperExt,
     EDTFListDumperExt,
     GrantTokensDumperExt,
@@ -118,6 +119,7 @@ class CommonFieldsMixin:
             EDTFDumperExt("metadata.publication_date"),
             EDTFListDumperExt("metadata.dates", "date"),
             RelationDumperExt("relations"),
+            CombinedSubjectsDumperExt(),
             CustomFieldsDumperExt(fields_var="RDM_CUSTOM_FIELDS"),
             StatisticsDumperExt("stats"),
         ]

diff --git a/invenio_rdm_records/records/dumpers/__init__.py b/invenio_rdm_records/records/dumpers/__init__.py
@@ -8,12 +8,14 @@
 """Search dumpers, for transforming to and from versions to index."""
 
 from .access import GrantTokensDumperExt
+from .combined_subjects import CombinedSubjectsDumperExt
 from .edtf import EDTFDumperExt, EDTFListDumperExt
 from .locations import LocationsDumper
 from .pids import PIDsDumperExt
 from .statistics import StatisticsDumperExt
 
 __all__ = (
+    "CombinedSubjectsDumperExt",
     "EDTFDumperExt",
     "EDTFListDumperExt",
     "PIDsDumperExt",

diff --git a/invenio_rdm_records/records/dumpers/combined_subjects.py b/invenio_rdm_records/records/dumpers/combined_subjects.py
@@ -0,0 +1,71 @@
+# -*- coding: utf-8 -*-
+#
+# Copyright (C) 2024 Northwestern University.
+#
+# Invenio-RDM-Records is free software; you can redistribute it and/or modify
+# it under the terms of the MIT License; see LICENSE file for more details.
+
+"""Search dumpers for combined subjects."""
+
+from invenio_records.dumpers import SearchDumperExt
+
+SPLITCHAR = "::"  # explict better than implicit
+
+
+class CombinedSubjectsDumperExt(SearchDumperExt):
+    """Search dumper extension for sombined subjects support.
+
+    It parses the values of the `subjects` field
+    in the document and adds entries of the form:
+    `<scheme><splitchar><subject>` or `<subject>` in the `combined_subjects` field.
+
+    The combined_subjects field is required for properly aggregating/faceting subjects.
+
+    This dumper needs to be placed after the RelationDumper for subjects as it relies
+    on dereferenced scheme + subject pairs.
+    """
+
+    def __init__(self, splitchar=SPLITCHAR):
+        """Constructor.
+
+        :param splitchar: string to use to combine scheme + subject.
+                          It must be identical to the splitchar value used in the
+                          CombinedTermsFacet.
+        """
+        super().__init__()
+        self._splitchar = splitchar
+
+    def dump(self, record, data):
+        """Dump the data to secondary storage (OpenSearch-like)."""
+        subjects = data.get("metadata", {}).get("subjects", [])
+
+        def get_scheme_subject(subject_dict):
+            """
+            Return [<scheme>, <subject>] or [<subject>] for the given `subject_dict`.
+
+            Assumes subject_dict has been dereferenced at this point.
+            """
+            result = []
+            if "scheme" in subject_dict:
+                result.append(subject_dict["scheme"])
+            result.append(subject_dict["subject"])
+            return result
+
+        # There is no clarity on what keys can be assumed to be present in data
+        # (e.g., test_records_communities calls dumps() without "metadata"),
+        # so one has to be careful in how the dumped data is inserted back into `data`
+        metadata = data.get("metadata", {})
+        metadata["combined_subjects"] = [
+            self._splitchar.join(get_scheme_subject(subject)) for subject in subjects
+        ]
+        data["metadata"] = metadata
+
+    def load(self, data, record_cls):
+        """Load the data from secondary storage (OpenSearch-like).
+
+        This is run against the parent too (for some reason), so presence of any
+        field cannot be assumed.
+        """
+        if "metadata" in data:
+            data["metadata"].pop("combined_subjects", None)
+        return data
diff --git a/invenio_rdm_records/records/mappings/os-v1/rdmrecords/drafts/draft-v6.0.0.json b/invenio_rdm_records/records/mappings/os-v1/rdmrecords/drafts/draft-v6.0.0.json
@@ -1138,6 +1138,9 @@
               }
             }
           },
+          "combined_subjects": {
+            "type": "keyword"
+          },
           "title": {
             "type": "text",
             "fields": {

diff --git a/invenio_rdm_records/records/mappings/os-v1/rdmrecords/records/record-v6.0.0.json b/invenio_rdm_records/records/mappings/os-v1/rdmrecords/records/record-v6.0.0.json
@@ -1095,6 +1095,9 @@
               }
             }
           },
+          "combined_subjects": {
+            "type": "keyword"
+          },
           "title": {
             "type": "text",
             "fields": {

diff --git a/invenio_rdm_records/records/mappings/os-v2/rdmrecords/drafts/draft-v6.0.0.json b/invenio_rdm_records/records/mappings/os-v2/rdmrecords/drafts/draft-v6.0.0.json
@@ -1138,6 +1138,9 @@
               }
             }
           },
+          "combined_subjects": {
+            "type": "keyword"
+          },
           "title": {
             "type": "text",
             "fields": {

diff --git a/invenio_rdm_records/records/mappings/os-v2/rdmrecords/records/record-v6.0.0.json b/invenio_rdm_records/records/mappings/os-v2/rdmrecords/records/record-v6.0.0.json
@@ -1095,6 +1095,9 @@
               }
             }
           },
+          "combined_subjects": {
+            "type": "keyword"
+          },
           "title": {
             "type": "text",
             "fields": {

diff --git a/invenio_rdm_records/records/mappings/v7/rdmrecords/drafts/draft-v6.0.0.json b/invenio_rdm_records/records/mappings/v7/rdmrecords/drafts/draft-v6.0.0.json
@@ -1138,6 +1138,9 @@
               }
             }
           },
+          "combined_subjects": {
+            "type": "keyword"
+          },
           "title": {
             "type": "text",
             "fields": {

diff --git a/invenio_rdm_records/records/mappings/v7/rdmrecords/records/record-v6.0.0.json b/invenio_rdm_records/records/mappings/v7/rdmrecords/records/record-v6.0.0.json
@@ -1095,6 +1095,9 @@
               }
             }
           },
+          "combined_subjects": {
+            "type": "keyword"
+          },
           "title": {
             "type": "text",
             "fields": {

diff --git a/invenio_rdm_records/services/facets.py b/invenio_rdm_records/services/facets.py
@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-
 #
 # Copyright (C) 2020-2023 CERN.
-# Copyright (C) 2020-2021 Northwestern University.
+# Copyright (C) 2020-2024 Northwestern University.
 # Copyright (C)      2021 TU Wien.
 # Copyright (C) 2023 Graz University of Technology.
 #
@@ -10,14 +10,19 @@
 
 """Facet definitions."""
 
+from warnings import warn
+
 from invenio_i18n import gettext as _
 from invenio_records_resources.services.records.facets import (
+    CombinedTermsFacet,
     NestedTermsFacet,
     TermsFacet,
 )
 from invenio_vocabularies.contrib.subjects import SubjectsLabels
+from invenio_vocabularies.records.models import VocabularyScheme
 from invenio_vocabularies.services.facets import VocabularyLabels
 
+from ..records.dumpers.combined_subjects import SPLITCHAR
 from ..records.systemfields.access.field.record import AccessStatusEnum
 
 access_status = TermsFacet(
@@ -62,14 +67,44 @@
 )
 
 
-subject_nested = NestedTermsFacet(
-    field="metadata.subjects.scheme",
-    subfield="metadata.subjects.subject.keyword",
-    label=_("Subjects"),
-    value_labels=SubjectsLabels(),
-)
+def deprecated_subject_nested():
+    """Deprecated NestedTermsFacet.
+
+    Will warn until this is completely removed.
+    """
+    warn(
+        "subject_nested is deprecated. Use subject_combined instead.",
+        DeprecationWarning,
+    )
+    return NestedTermsFacet(
+        field="metadata.subjects.scheme",
+        subfield="metadata.subjects.subject.keyword",
+        label=_("Subjects"),
+        value_labels=SubjectsLabels(),
+    )
+
+
+subject_nested = deprecated_subject_nested()
+
 
 subject = TermsFacet(
     field="metadata.subjects.subject.keyword",
     label=_("Subjects"),
 )
+
+
+def get_subject_schemes():
+    """Return subject schemes."""
+    return [
+        row.id for row in VocabularyScheme.query.filter_by(parent_id="subjects").all()
+    ]
+
+
+subject_combined = CombinedTermsFacet(
+    field="metadata.subjects.scheme",
+    combined_field="metadata.combined_subjects",
+    parents=get_subject_schemes,
+    splitchar=SPLITCHAR,
+    label=_("Subjects"),
+    value_labels=SubjectsLabels(),
+)
diff --git a/setup.cfg b/setup.cfg
@@ -41,6 +41,7 @@ install_requires =
     invenio-administration>=2.0.0,<3.0.0
     invenio-communities>=13.0.0,<14.0.0
     invenio-drafts-resources>=3.0.0,<4.0.0
+    invenio-records-resources>=5.3.0,<6.0.0
     invenio-github>=1.0.0,<2.0.0
     invenio-i18n>=2.0.0,<3.0.0
     invenio-oaiserver>=2.0.0,<3.0.0

diff --git a/tests/records/dumpers/test_combined_subjects_dumper.py b/tests/records/dumpers/test_combined_subjects_dumper.py
@@ -0,0 +1,40 @@
+# -*- coding: utf-8 -*-
+#
+# Copyright (C) 2023 Northwestern University.
+#
+# Invenio-RDM-Records is free software; you can redistribute it and/or modify
+# it under the terms of the MIT License; see LICENSE file for more details.
+
+"""Test combined subjects dumper."""
+
+from copy import deepcopy
+
+from invenio_rdm_records.records import RDMDraft, RDMRecord
+from invenio_rdm_records.records.api import RDMParent
+
+
+def test_combined_subjects_dumper(running_app, db, minimal_record):
+    input_subjects = [
+        {"id": "http://id.nlm.nih.gov/mesh/A-D000007"},
+        {"subject": "Capybara"},
+    ]
+    minimal_record["metadata"]["subjects"] = input_subjects
+    parent = RDMParent.create({})
+
+    cases = []
+    for api_cls in [RDMDraft, RDMRecord]:
+        data_input = deepcopy(minimal_record)
+        api_instance = api_cls.create(data_input, parent=parent)
+        dump = api_instance.dumps()
+        cases.append((dump, api_cls))
+
+    expected_combined_subjects = [
+        "MeSH::Abdominal Injuries",
+        "Capybara",
+    ]
+    for dump, api_cls in cases:
+        assert expected_combined_subjects == dump["metadata"]["combined_subjects"]
+
+        loaded = api_cls.loads(dump)
+        assert "combined_subjects" not in loaded["metadata"]
+        assert "subjects" in loaded["metadata"]
diff --git a/tests/records/test_records_communities.py b/tests/records/test_records_communities.py
@@ -43,9 +43,10 @@ def test_community_integration(db, community, running_app, minimal_record):
     db.session.commit()
 
     assert record.parent.communities.entries[0] == community
-    assert record.dumps()["parent"]["communities"]["default"] == str(community.id)
-    assert record.dumps()["parent"]["communities"]["ids"] == [str(community.id)]
-    entries = record.dumps()["parent"]["communities"]["entries"]
+    dump = record.dumps()
+    assert dump["parent"]["communities"]["default"] == str(community.id)
+    assert dump["parent"]["communities"]["ids"] == [str(community.id)]
+    entries = dump["parent"]["communities"]["entries"]
     assert len(entries) == 1
     assert entries[0]["id"] == str(community.id)
     assert entries[0]["slug"] == "test-community"