Skip to content

Commit

Permalink
facets: integrate combined_subjects / fix nested subject faceting
Browse files Browse the repository at this point in the history
- closes #798
  • Loading branch information
fenekku committed Apr 11, 2024
1 parent 86b4662 commit f2116da
Show file tree
Hide file tree
Showing 14 changed files with 194 additions and 10 deletions.
14 changes: 14 additions & 0 deletions invenio_rdm_records/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -174,6 +174,11 @@ def always_valid(identifier):
"field": "subjects.subject",
},
},
# subject_nested is deprecated and should be removed.
# subject_combined does require a pre-existing change to indexed documents,
# so it's unclear if a direct replacement is right.
# Keeping it around until v13 might be better. On the flipside it is an incorrect
# facet...
"subject_nested": {
"facet": facets.subject_nested,
"ui": {
Expand All @@ -183,6 +188,15 @@ def always_valid(identifier):
},
},
},
"subject_combined": {
"facet": facets.subject_combined,
"ui": {
"field": "subjects.scheme",
"childAgg": {
"field": "subjects.subject",
},
},
},
}

RDM_SEARCH_SORT_BY_VERIFIED = False
Expand Down
2 changes: 2 additions & 0 deletions invenio_rdm_records/records/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@

from . import models
from .dumpers import (
CombinedSubjectsDumperExt,
EDTFDumperExt,
EDTFListDumperExt,
GrantTokensDumperExt,
Expand Down Expand Up @@ -118,6 +119,7 @@ class CommonFieldsMixin:
EDTFDumperExt("metadata.publication_date"),
EDTFListDumperExt("metadata.dates", "date"),
RelationDumperExt("relations"),
CombinedSubjectsDumperExt(),
CustomFieldsDumperExt(fields_var="RDM_CUSTOM_FIELDS"),
StatisticsDumperExt("stats"),
]
Expand Down
2 changes: 2 additions & 0 deletions invenio_rdm_records/records/dumpers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,14 @@
"""Search dumpers, for transforming to and from versions to index."""

from .access import GrantTokensDumperExt
from .combined_subjects import CombinedSubjectsDumperExt
from .edtf import EDTFDumperExt, EDTFListDumperExt
from .locations import LocationsDumper
from .pids import PIDsDumperExt
from .statistics import StatisticsDumperExt

__all__ = (
"CombinedSubjectsDumperExt",
"EDTFDumperExt",
"EDTFListDumperExt",
"PIDsDumperExt",
Expand Down
71 changes: 71 additions & 0 deletions invenio_rdm_records/records/dumpers/combined_subjects.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2024 Northwestern University.
#
# Invenio-RDM-Records is free software; you can redistribute it and/or modify
# it under the terms of the MIT License; see LICENSE file for more details.

"""Search dumpers for combined subjects."""

from invenio_records.dumpers import SearchDumperExt

SPLITCHAR = "::" # explict better than implicit


class CombinedSubjectsDumperExt(SearchDumperExt):
"""Search dumper extension for sombined subjects support.
It parses the values of the `subjects` field
in the document and adds entries of the form:
`<scheme><splitchar><subject>` or `<subject>` in the `combined_subjects` field.
The combined_subjects field is required for properly aggregating/faceting subjects.
This dumper needs to be placed after the RelationDumper for subjects as it relies
on dereferenced scheme + subject pairs.
"""

def __init__(self, splitchar=SPLITCHAR):
"""Constructor.
:param splitchar: string to use to combine scheme + subject.
It must be identical to the splitchar value used in the
CombinedTermsFacet.
"""
super().__init__()
self._splitchar = splitchar

def dump(self, record, data):
"""Dump the data to secondary storage (OpenSearch-like)."""
subjects = data.get("metadata", {}).get("subjects", [])

def get_scheme_subject(subject_dict):
"""
Return [<scheme>, <subject>] or [<subject>] for the given `subject_dict`.
Assumes subject_dict has been dereferenced at this point.
"""
result = []
if "scheme" in subject_dict:
result.append(subject_dict["scheme"])
result.append(subject_dict["subject"])
return result

# There is no clarity on what keys can be assumed to be present in data
# (e.g., test_records_communities calls dumps() without "metadata"),
# so one has to be careful in how the dumped data is inserted back into `data`
metadata = data.get("metadata", {})
metadata["combined_subjects"] = [
self._splitchar.join(get_scheme_subject(subject)) for subject in subjects
]
data["metadata"] = metadata

def load(self, data, record_cls):
"""Load the data from secondary storage (OpenSearch-like).
This is run against the parent too (for some reason), so presence of any
field cannot be assumed.
"""
if "metadata" in data:
data["metadata"].pop("combined_subjects", None)
return data
Original file line number Diff line number Diff line change
Expand Up @@ -1138,6 +1138,9 @@
}
}
},
"combined_subjects": {
"type": "keyword"
},
"title": {
"type": "text",
"fields": {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1095,6 +1095,9 @@
}
}
},
"combined_subjects": {
"type": "keyword"
},
"title": {
"type": "text",
"fields": {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1138,6 +1138,9 @@
}
}
},
"combined_subjects": {
"type": "keyword"
},
"title": {
"type": "text",
"fields": {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1095,6 +1095,9 @@
}
}
},
"combined_subjects": {
"type": "keyword"
},
"title": {
"type": "text",
"fields": {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1138,6 +1138,9 @@
}
}
},
"combined_subjects": {
"type": "keyword"
},
"title": {
"type": "text",
"fields": {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1095,6 +1095,9 @@
}
}
},
"combined_subjects": {
"type": "keyword"
},
"title": {
"type": "text",
"fields": {
Expand Down
49 changes: 42 additions & 7 deletions invenio_rdm_records/services/facets.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2020-2023 CERN.
# Copyright (C) 2020-2021 Northwestern University.
# Copyright (C) 2020-2024 Northwestern University.
# Copyright (C) 2021 TU Wien.
# Copyright (C) 2023 Graz University of Technology.
#
Expand All @@ -10,14 +10,19 @@

"""Facet definitions."""

from warnings import warn

from invenio_i18n import gettext as _
from invenio_records_resources.services.records.facets import (
CombinedTermsFacet,
NestedTermsFacet,
TermsFacet,
)
from invenio_vocabularies.contrib.subjects import SubjectsLabels
from invenio_vocabularies.records.models import VocabularyScheme
from invenio_vocabularies.services.facets import VocabularyLabels

from ..records.dumpers.combined_subjects import SPLITCHAR
from ..records.systemfields.access.field.record import AccessStatusEnum

access_status = TermsFacet(
Expand Down Expand Up @@ -62,14 +67,44 @@
)


subject_nested = NestedTermsFacet(
field="metadata.subjects.scheme",
subfield="metadata.subjects.subject.keyword",
label=_("Subjects"),
value_labels=SubjectsLabels(),
)
def deprecated_subject_nested():
"""Deprecated NestedTermsFacet.
Will warn until this is completely removed.
"""
warn(
"subject_nested is deprecated. Use subject_combined instead.",
DeprecationWarning,
)
return NestedTermsFacet(
field="metadata.subjects.scheme",
subfield="metadata.subjects.subject.keyword",
label=_("Subjects"),
value_labels=SubjectsLabels(),
)


subject_nested = deprecated_subject_nested()


subject = TermsFacet(
field="metadata.subjects.subject.keyword",
label=_("Subjects"),
)


def get_subject_schemes():
"""Return subject schemes."""
return [
row.id for row in VocabularyScheme.query.filter_by(parent_id="subjects").all()
]


subject_combined = CombinedTermsFacet(
field="metadata.subjects.scheme",
combined_field="metadata.combined_subjects",
parents=get_subject_schemes,
splitchar=SPLITCHAR,
label=_("Subjects"),
value_labels=SubjectsLabels(),
)
1 change: 1 addition & 0 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ install_requires =
invenio-administration>=2.0.0,<3.0.0
invenio-communities>=13.0.0,<14.0.0
invenio-drafts-resources>=3.0.0,<4.0.0
invenio-records-resources>=5.3.0,<6.0.0
invenio-github>=1.0.0,<2.0.0
invenio-i18n>=2.0.0,<3.0.0
invenio-oaiserver>=2.0.0,<3.0.0
Expand Down
40 changes: 40 additions & 0 deletions tests/records/dumpers/test_combined_subjects_dumper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2023 Northwestern University.
#
# Invenio-RDM-Records is free software; you can redistribute it and/or modify
# it under the terms of the MIT License; see LICENSE file for more details.

"""Test combined subjects dumper."""

from copy import deepcopy

from invenio_rdm_records.records import RDMDraft, RDMRecord
from invenio_rdm_records.records.api import RDMParent


def test_combined_subjects_dumper(running_app, db, minimal_record):
input_subjects = [
{"id": "http://id.nlm.nih.gov/mesh/A-D000007"},
{"subject": "Capybara"},
]
minimal_record["metadata"]["subjects"] = input_subjects
parent = RDMParent.create({})

cases = []
for api_cls in [RDMDraft, RDMRecord]:
data_input = deepcopy(minimal_record)
api_instance = api_cls.create(data_input, parent=parent)
dump = api_instance.dumps()
cases.append((dump, api_cls))

expected_combined_subjects = [
"MeSH::Abdominal Injuries",
"Capybara",
]
for dump, api_cls in cases:
assert expected_combined_subjects == dump["metadata"]["combined_subjects"]

loaded = api_cls.loads(dump)
assert "combined_subjects" not in loaded["metadata"]
assert "subjects" in loaded["metadata"]
7 changes: 4 additions & 3 deletions tests/records/test_records_communities.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,9 +43,10 @@ def test_community_integration(db, community, running_app, minimal_record):
db.session.commit()

assert record.parent.communities.entries[0] == community
assert record.dumps()["parent"]["communities"]["default"] == str(community.id)
assert record.dumps()["parent"]["communities"]["ids"] == [str(community.id)]
entries = record.dumps()["parent"]["communities"]["entries"]
dump = record.dumps()
assert dump["parent"]["communities"]["default"] == str(community.id)
assert dump["parent"]["communities"]["ids"] == [str(community.id)]
entries = dump["parent"]["communities"]["entries"]
assert len(entries) == 1
assert entries[0]["id"] == str(community.id)
assert entries[0]["slug"] == "test-community"
Expand Down

0 comments on commit f2116da

Please sign in to comment.