diff --git a/invenio_rdm_records/config.py b/invenio_rdm_records/config.py index e7744d439..3815d211a 100644 --- a/invenio_rdm_records/config.py +++ b/invenio_rdm_records/config.py @@ -174,6 +174,11 @@ def always_valid(identifier): "field": "subjects.subject", }, }, + # subject_nested is deprecated and should be removed. + # subject_combined does require a pre-existing change to indexed documents, + # so it's unclear if a direct replacement is right. + # Keeping it around until v13 might be better. On the flipside it is an incorrect + # facet... "subject_nested": { "facet": facets.subject_nested, "ui": { @@ -183,6 +188,15 @@ def always_valid(identifier): }, }, }, + "subject_combined": { + "facet": facets.subject_combined, + "ui": { + "field": "subjects.scheme", + "childAgg": { + "field": "subjects.subject", + }, + }, + }, } RDM_SEARCH_SORT_BY_VERIFIED = False diff --git a/invenio_rdm_records/records/api.py b/invenio_rdm_records/records/api.py index f6314c51a..0c8706117 100644 --- a/invenio_rdm_records/records/api.py +++ b/invenio_rdm_records/records/api.py @@ -47,6 +47,7 @@ from . import models from .dumpers import ( + CombinedSubjectsDumperExt, EDTFDumperExt, EDTFListDumperExt, GrantTokensDumperExt, @@ -118,6 +119,7 @@ class CommonFieldsMixin: EDTFDumperExt("metadata.publication_date"), EDTFListDumperExt("metadata.dates", "date"), RelationDumperExt("relations"), + CombinedSubjectsDumperExt(), CustomFieldsDumperExt(fields_var="RDM_CUSTOM_FIELDS"), StatisticsDumperExt("stats"), ] diff --git a/invenio_rdm_records/records/dumpers/__init__.py b/invenio_rdm_records/records/dumpers/__init__.py index 0eaa889f6..93bac555d 100644 --- a/invenio_rdm_records/records/dumpers/__init__.py +++ b/invenio_rdm_records/records/dumpers/__init__.py @@ -8,12 +8,14 @@ """Search dumpers, for transforming to and from versions to index.""" from .access import GrantTokensDumperExt +from .combined_subjects import CombinedSubjectsDumperExt from .edtf import EDTFDumperExt, EDTFListDumperExt from .locations import LocationsDumper from .pids import PIDsDumperExt from .statistics import StatisticsDumperExt __all__ = ( + "CombinedSubjectsDumperExt", "EDTFDumperExt", "EDTFListDumperExt", "PIDsDumperExt", diff --git a/invenio_rdm_records/records/dumpers/combined_subjects.py b/invenio_rdm_records/records/dumpers/combined_subjects.py new file mode 100644 index 000000000..567bcbe7d --- /dev/null +++ b/invenio_rdm_records/records/dumpers/combined_subjects.py @@ -0,0 +1,71 @@ +# -*- coding: utf-8 -*- +# +# Copyright (C) 2024 Northwestern University. +# +# Invenio-RDM-Records is free software; you can redistribute it and/or modify +# it under the terms of the MIT License; see LICENSE file for more details. + +"""Search dumpers for combined subjects.""" + +from invenio_records.dumpers import SearchDumperExt + +SPLITCHAR = "::" # explict better than implicit + + +class CombinedSubjectsDumperExt(SearchDumperExt): + """Search dumper extension for sombined subjects support. + + It parses the values of the `subjects` field + in the document and adds entries of the form: + `` or `` in the `combined_subjects` field. + + The combined_subjects field is required for properly aggregating/faceting subjects. + + This dumper needs to be placed after the RelationDumper for subjects as it relies + on dereferenced scheme + subject pairs. + """ + + def __init__(self, splitchar=SPLITCHAR): + """Constructor. + + :param splitchar: string to use to combine scheme + subject. + It must be identical to the splitchar value used in the + CombinedTermsFacet. + """ + super().__init__() + self._splitchar = splitchar + + def dump(self, record, data): + """Dump the data to secondary storage (OpenSearch-like).""" + subjects = data.get("metadata", {}).get("subjects", []) + + def get_scheme_subject(subject_dict): + """ + Return [, ] or [] for the given `subject_dict`. + + Assumes subject_dict has been dereferenced at this point. + """ + result = [] + if "scheme" in subject_dict: + result.append(subject_dict["scheme"]) + result.append(subject_dict["subject"]) + return result + + # There is no clarity on what keys can be assumed to be present in data + # (e.g., test_records_communities calls dumps() without "metadata"), + # so one has to be careful in how the dumped data is inserted back into `data` + metadata = data.get("metadata", {}) + metadata["combined_subjects"] = [ + self._splitchar.join(get_scheme_subject(subject)) for subject in subjects + ] + data["metadata"] = metadata + + def load(self, data, record_cls): + """Load the data from secondary storage (OpenSearch-like). + + This is run against the parent too (for some reason), so presence of any + field cannot be assumed. + """ + if "metadata" in data: + data["metadata"].pop("combined_subjects", None) + return data diff --git a/invenio_rdm_records/records/mappings/os-v1/rdmrecords/drafts/draft-v6.0.0.json b/invenio_rdm_records/records/mappings/os-v1/rdmrecords/drafts/draft-v6.0.0.json index 2665890d1..afbdca240 100644 --- a/invenio_rdm_records/records/mappings/os-v1/rdmrecords/drafts/draft-v6.0.0.json +++ b/invenio_rdm_records/records/mappings/os-v1/rdmrecords/drafts/draft-v6.0.0.json @@ -1138,6 +1138,9 @@ } } }, + "combined_subjects": { + "type": "keyword" + }, "title": { "type": "text", "fields": { diff --git a/invenio_rdm_records/records/mappings/os-v1/rdmrecords/records/record-v6.0.0.json b/invenio_rdm_records/records/mappings/os-v1/rdmrecords/records/record-v6.0.0.json index d72d6aa87..5ba5afff6 100644 --- a/invenio_rdm_records/records/mappings/os-v1/rdmrecords/records/record-v6.0.0.json +++ b/invenio_rdm_records/records/mappings/os-v1/rdmrecords/records/record-v6.0.0.json @@ -1095,6 +1095,9 @@ } } }, + "combined_subjects": { + "type": "keyword" + }, "title": { "type": "text", "fields": { diff --git a/invenio_rdm_records/records/mappings/os-v2/rdmrecords/drafts/draft-v6.0.0.json b/invenio_rdm_records/records/mappings/os-v2/rdmrecords/drafts/draft-v6.0.0.json index 2665890d1..afbdca240 100644 --- a/invenio_rdm_records/records/mappings/os-v2/rdmrecords/drafts/draft-v6.0.0.json +++ b/invenio_rdm_records/records/mappings/os-v2/rdmrecords/drafts/draft-v6.0.0.json @@ -1138,6 +1138,9 @@ } } }, + "combined_subjects": { + "type": "keyword" + }, "title": { "type": "text", "fields": { diff --git a/invenio_rdm_records/records/mappings/os-v2/rdmrecords/records/record-v6.0.0.json b/invenio_rdm_records/records/mappings/os-v2/rdmrecords/records/record-v6.0.0.json index 984e02d72..8a34446b1 100644 --- a/invenio_rdm_records/records/mappings/os-v2/rdmrecords/records/record-v6.0.0.json +++ b/invenio_rdm_records/records/mappings/os-v2/rdmrecords/records/record-v6.0.0.json @@ -1095,6 +1095,9 @@ } } }, + "combined_subjects": { + "type": "keyword" + }, "title": { "type": "text", "fields": { diff --git a/invenio_rdm_records/records/mappings/v7/rdmrecords/drafts/draft-v6.0.0.json b/invenio_rdm_records/records/mappings/v7/rdmrecords/drafts/draft-v6.0.0.json index 0ba2e7ef2..046f8887d 100644 --- a/invenio_rdm_records/records/mappings/v7/rdmrecords/drafts/draft-v6.0.0.json +++ b/invenio_rdm_records/records/mappings/v7/rdmrecords/drafts/draft-v6.0.0.json @@ -1138,6 +1138,9 @@ } } }, + "combined_subjects": { + "type": "keyword" + }, "title": { "type": "text", "fields": { diff --git a/invenio_rdm_records/records/mappings/v7/rdmrecords/records/record-v6.0.0.json b/invenio_rdm_records/records/mappings/v7/rdmrecords/records/record-v6.0.0.json index 7f30d7933..5b8a8e96d 100644 --- a/invenio_rdm_records/records/mappings/v7/rdmrecords/records/record-v6.0.0.json +++ b/invenio_rdm_records/records/mappings/v7/rdmrecords/records/record-v6.0.0.json @@ -1095,6 +1095,9 @@ } } }, + "combined_subjects": { + "type": "keyword" + }, "title": { "type": "text", "fields": { diff --git a/invenio_rdm_records/services/facets.py b/invenio_rdm_records/services/facets.py index e85f472e0..d21c96e66 100644 --- a/invenio_rdm_records/services/facets.py +++ b/invenio_rdm_records/services/facets.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- # # Copyright (C) 2020-2023 CERN. -# Copyright (C) 2020-2021 Northwestern University. +# Copyright (C) 2020-2024 Northwestern University. # Copyright (C) 2021 TU Wien. # Copyright (C) 2023 Graz University of Technology. # @@ -10,14 +10,19 @@ """Facet definitions.""" +from warnings import warn + from invenio_i18n import gettext as _ from invenio_records_resources.services.records.facets import ( + CombinedTermsFacet, NestedTermsFacet, TermsFacet, ) from invenio_vocabularies.contrib.subjects import SubjectsLabels +from invenio_vocabularies.records.models import VocabularyScheme from invenio_vocabularies.services.facets import VocabularyLabels +from ..records.dumpers.combined_subjects import SPLITCHAR from ..records.systemfields.access.field.record import AccessStatusEnum access_status = TermsFacet( @@ -62,14 +67,44 @@ ) -subject_nested = NestedTermsFacet( - field="metadata.subjects.scheme", - subfield="metadata.subjects.subject.keyword", - label=_("Subjects"), - value_labels=SubjectsLabels(), -) +def deprecated_subject_nested(): + """Deprecated NestedTermsFacet. + + Will warn until this is completely removed. + """ + warn( + "subject_nested is deprecated. Use subject_combined instead.", + DeprecationWarning, + ) + return NestedTermsFacet( + field="metadata.subjects.scheme", + subfield="metadata.subjects.subject.keyword", + label=_("Subjects"), + value_labels=SubjectsLabels(), + ) + + +subject_nested = deprecated_subject_nested() + subject = TermsFacet( field="metadata.subjects.subject.keyword", label=_("Subjects"), ) + + +def get_subject_schemes(): + """Return subject schemes.""" + return [ + row.id for row in VocabularyScheme.query.filter_by(parent_id="subjects").all() + ] + + +subject_combined = CombinedTermsFacet( + field="metadata.subjects.scheme", + combined_field="metadata.combined_subjects", + parents=get_subject_schemes, + splitchar=SPLITCHAR, + label=_("Subjects"), + value_labels=SubjectsLabels(), +) diff --git a/setup.cfg b/setup.cfg index e91fe765a..d544af482 100644 --- a/setup.cfg +++ b/setup.cfg @@ -41,6 +41,7 @@ install_requires = invenio-administration>=2.0.0,<3.0.0 invenio-communities>=13.0.0,<14.0.0 invenio-drafts-resources>=3.0.0,<4.0.0 + invenio-records-resources>=5.3.0,<6.0.0 invenio-github>=1.0.0,<2.0.0 invenio-i18n>=2.0.0,<3.0.0 invenio-oaiserver>=2.0.0,<3.0.0 diff --git a/tests/records/dumpers/test_combined_subjects_dumper.py b/tests/records/dumpers/test_combined_subjects_dumper.py new file mode 100644 index 000000000..e8d8740e7 --- /dev/null +++ b/tests/records/dumpers/test_combined_subjects_dumper.py @@ -0,0 +1,40 @@ +# -*- coding: utf-8 -*- +# +# Copyright (C) 2023 Northwestern University. +# +# Invenio-RDM-Records is free software; you can redistribute it and/or modify +# it under the terms of the MIT License; see LICENSE file for more details. + +"""Test combined subjects dumper.""" + +from copy import deepcopy + +from invenio_rdm_records.records import RDMDraft, RDMRecord +from invenio_rdm_records.records.api import RDMParent + + +def test_combined_subjects_dumper(running_app, db, minimal_record): + input_subjects = [ + {"id": "http://id.nlm.nih.gov/mesh/A-D000007"}, + {"subject": "Capybara"}, + ] + minimal_record["metadata"]["subjects"] = input_subjects + parent = RDMParent.create({}) + + cases = [] + for api_cls in [RDMDraft, RDMRecord]: + data_input = deepcopy(minimal_record) + api_instance = api_cls.create(data_input, parent=parent) + dump = api_instance.dumps() + cases.append((dump, api_cls)) + + expected_combined_subjects = [ + "MeSH::Abdominal Injuries", + "Capybara", + ] + for dump, api_cls in cases: + assert expected_combined_subjects == dump["metadata"]["combined_subjects"] + + loaded = api_cls.loads(dump) + assert "combined_subjects" not in loaded["metadata"] + assert "subjects" in loaded["metadata"] diff --git a/tests/records/test_records_communities.py b/tests/records/test_records_communities.py index 07357e070..04de1f0de 100644 --- a/tests/records/test_records_communities.py +++ b/tests/records/test_records_communities.py @@ -43,9 +43,10 @@ def test_community_integration(db, community, running_app, minimal_record): db.session.commit() assert record.parent.communities.entries[0] == community - assert record.dumps()["parent"]["communities"]["default"] == str(community.id) - assert record.dumps()["parent"]["communities"]["ids"] == [str(community.id)] - entries = record.dumps()["parent"]["communities"]["entries"] + dump = record.dumps() + assert dump["parent"]["communities"]["default"] == str(community.id) + assert dump["parent"]["communities"]["ids"] == [str(community.id)] + entries = dump["parent"]["communities"]["entries"] assert len(entries) == 1 assert entries[0]["id"] == str(community.id) assert entries[0]["slug"] == "test-community"