Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

serializer: updated subjects and affiliations links in dcat #1890

Merged
merged 2 commits into from
Dec 9, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion invenio_rdm_records/records/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -184,7 +184,7 @@ class CommonFieldsMixin:
),
subjects=PIDListRelation(
"metadata.subjects",
keys=["subject", "scheme", "props"],
keys=["subject", "scheme", "props", "identifiers"],
pid_field=Subject.pid,
cache_key="subjects",
),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1238,6 +1238,16 @@
"props": {
"type": "object",
"dynamic": "true"
},
"identifiers": {
"properties": {
"identifier": {
"type": "keyword"
},
"scheme": {
"type": "keyword"
}
}
}
}
},
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1256,6 +1256,16 @@
"props": {
"type": "object",
"dynamic": "true"
},
"identifiers": {
"properties": {
"identifier": {
"type": "keyword"
},
"scheme": {
"type": "keyword"
}
}
}
}
},
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1238,6 +1238,16 @@
"props": {
"type": "object",
"dynamic": "true"
},
"identifiers": {
"properties": {
"identifier": {
"type": "keyword"
},
"scheme": {
"type": "keyword"
}
}
}
}
},
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,13 +45,8 @@
"accent_analyzer": {
"tokenizer": "standard",
"type": "custom",
"char_filter": [
"strip_special_chars"
],
"filter": [
"lowercase",
"asciifolding"
]
"char_filter": ["strip_special_chars"],
"filter": ["lowercase", "asciifolding"]
}
}
}
Expand Down Expand Up @@ -1248,6 +1243,16 @@
"props": {
"type": "object",
"dynamic": "true"
},
"identifiers": {
"properties": {
"identifier": {
"type": "keyword"
},
"scheme": {
"type": "keyword"
}
}
}
}
},
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1238,6 +1238,16 @@
"props": {
"type": "object",
"dynamic": "true"
},
"identifiers": {
"properties": {
"identifier": {
"type": "keyword"
},
"scheme": {
"type": "keyword"
}
}
}
}
},
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1195,6 +1195,16 @@
"props": {
"type": "object",
"dynamic": "true"
},
"identifiers": {
"properties": {
"identifier": {
"type": "keyword"
},
"scheme": {
"type": "keyword"
}
}
}
}
},
Expand Down
103 changes: 103 additions & 0 deletions invenio_rdm_records/resources/serializers/dcat/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
from datacite import schema43
from flask_resources import BaseListSchema, MarshmallowSerializer
from flask_resources.serializers import SimpleSerializer
from idutils import detect_identifier_schemes, to_url
from lxml import etree as ET
from pkg_resources import resource_stream
from werkzeug.utils import cached_property
Expand Down Expand Up @@ -93,13 +94,115 @@ def access_url(file):
if isinstance(tag_value, dict):
el.attrib.update(tag_value)

def add_missing_creatibutor_links(self, rdf_tree):
    """Add missing ``rdf:about`` links to creators, contributors and affiliations.

    Looks at every ``<rdf:Description>`` directly inside ``<dct:creator>`` or
    ``<dct:contributor>`` and every ``<foaf:Organization>`` nested inside
    ``<org:memberOf>``. When such an element has no ``rdf:about`` attribute
    but has a ``<dct:identifier>`` child, the identifier text is resolved to
    a URL and stored as ``rdf:about``.

    :param rdf_tree: root element of the DCAT RDF tree; modified in place.
    :returns: the same ``rdf_tree`` object.
    """
    rdf_about = "{http://www.w3.org/1999/02/22-rdf-syntax-ns#}about"
    # XPath 1.0 has no default namespace, so only prefixed entries are usable.
    namespaces = {
        prefix: uri for prefix, uri in rdf_tree.nsmap.items() if prefix
    }

    def add_rdf_about(element, identifier_elem):
        """Set ``rdf:about`` on ``element`` from the identifier text, if any."""
        # An empty <dct:identifier/> has ``text`` set to None.
        identifier = (identifier_elem.text or "").strip()
        if not identifier:
            return
        for scheme in detect_identifier_schemes(identifier):
            url = to_url(identifier, scheme)
            if url:
                # First detected scheme that yields a URL wins.
                element.set(rdf_about, url)
                return

    # Process <dct:creator> and <dct:contributor>
    contributors_and_creators = rdf_tree.xpath(
        "//dct:creator/rdf:Description | //dct:contributor/rdf:Description",
        namespaces=namespaces,
    )
    for description in contributors_and_creators:
        if description.get(rdf_about):
            # Already linked (non-empty rdf:about); leave untouched.
            continue
        identifier_elem = description.find("dct:identifier", namespaces)
        if identifier_elem is not None:
            add_rdf_about(description, identifier_elem)

    # Process <foaf:Organization> within <org:memberOf> at any level
    organizations = rdf_tree.xpath(
        "//org:memberOf//foaf:Organization[not(@rdf:about)]",
        namespaces=namespaces,
    )
    for org in organizations:
        org_identifier_elem = org.find("dct:identifier", namespaces)
        if org_identifier_elem is not None:
            add_rdf_about(org, org_identifier_elem)

    return rdf_tree

def add_subjects_uri(self, rdf_tree, subjects):
    """Add the ``valueURI`` of subjects to matching ``dct:subject`` elements.

    For each subject that has a ``valueURI``, a ``subject`` label and a
    ``subjectScheme``, the first ``<dct:subject>`` whose ``skos:Concept``
    carries that ``skos:prefLabel`` and concept-scheme ``dct:title`` gets
    the URI as its ``rdf:about`` attribute. If the subject's
    ``subjectProps`` contain a ``definition``, it is appended to the
    concept as ``<skos:definition>``.

    Subjects with no matching element in the tree are skipped.

    :param rdf_tree: root element of the DCAT RDF tree; modified in place.
    :param subjects: iterable of subject dicts (keys read: ``valueURI``,
        ``subject``, ``subjectScheme``, ``subjectProps``).
    :returns: the same ``rdf_tree`` object.
    """
    rdf_about = "{http://www.w3.org/1999/02/22-rdf-syntax-ns#}about"
    skos_definition_tag = "{http://www.w3.org/2004/02/skos/core#}definition"
    # XPath 1.0 has no default namespace, so only prefixed entries are usable.
    namespaces = {
        prefix: uri for prefix, uri in rdf_tree.nsmap.items() if prefix
    }
    # Label and scheme are passed as XPath variables, not interpolated into
    # the expression, so values containing quotes cannot break the query.
    subject_query = (
        "//dct:subject["
        "skos:Concept["
        "skos:prefLabel[text()=$label]"
        " and skos:inScheme/skos:ConceptScheme/dct:title[text()=$scheme]"
        "]]"
    )

    for subject in subjects:
        value_uri = subject.get("valueURI")
        subject_label = subject.get("subject")
        subject_scheme = subject.get("subjectScheme")
        if not (value_uri and subject_label and subject_scheme):
            continue

        matches = rdf_tree.xpath(
            subject_query,
            namespaces=namespaces,
            label=subject_label,
            scheme=subject_scheme,
        )
        if not matches:
            # Nothing in the tree corresponds to this subject.
            continue
        subject_element = matches[0]

        # Add the valueURI to the dct:subject element as rdf:about
        subject_element.set(rdf_about, value_uri)

        # Check if the subject has a definition in its props
        definition = (subject.get("subjectProps") or {}).get("definition")
        if not definition:
            continue
        concept_elem = subject_element.find(
            ".//skos:Concept", namespaces=namespaces
        )
        if concept_elem is not None:
            skos_definition = ET.Element(skos_definition_tag)
            skos_definition.text = definition
            concept_elem.append(skos_definition)

    return rdf_tree

def transform_with_xslt(self, dc_record, **kwargs):
"""Transform record with XSLT."""
dc_etree = schema43.dump_etree(dc_record)
dc_namespace = schema43.ns[None]
dc_etree.tag = "{{{0}}}resource".format(dc_namespace)
dcat_etree = self.xslt_transform_func(dc_etree).getroot()

# Add valueURI to subjects
subjects = dc_record.get("subjects", [])
if subjects:
dcat_etree = self.add_subjects_uri(dcat_etree, subjects)

# Add the identifier links for creators & contributors if missing
dcat_etree = self.add_missing_creatibutor_links(dcat_etree)

# Inject files in results (since the XSLT can't do that by default)
files_data = dc_record.get("_files", [])
if files_data:
Expand Down
43 changes: 42 additions & 1 deletion invenio_rdm_records/resources/serializers/dcat/schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@

import idutils
from flask import current_app
from marshmallow import fields, missing
from marshmallow import ValidationError, fields, missing, validate
from marshmallow_utils.html import sanitize_unicode

from invenio_rdm_records.resources.serializers.datacite import DataCite43Schema
Expand Down Expand Up @@ -49,3 +49,44 @@ def get_files(self, obj):
)

return files_list or missing

def get_subjects(self, obj):
    """Get subjects.

    Builds one entry per subject in ``obj["metadata"]["subjects"]``:

    - ``subject``: the subject label.
    - ``subjectScheme``: the subject's scheme, only when the subject has
      an ``id``.
    - ``valueURI``: the ``id`` when it validates as a URL; otherwise the
      first identifier whose scheme is ``"url"``. Omitted when neither is
      available.
    - ``subjectProps``: the subject's ``props`` when non-empty.

    :param obj: record dict with a ``metadata`` key.
    :returns: list of subject entries, or ``missing`` when there are none.
    """
    subjects = obj["metadata"].get("subjects", [])
    if not subjects:
        return missing

    validator = validate.URL()
    serialized_subjects = []

    for subject in subjects:
        entry = {"subject": subject.get("subject")}

        id_ = subject.get("id")
        if id_:
            entry["subjectScheme"] = subject.get("scheme")
            try:
                validator(id_)
            except ValidationError:
                # The id is not a URL; fall back to the identifiers below.
                pass
            else:
                entry["valueURI"] = id_

        # Fall back to the first identifier with scheme 'url'. The key is
        # only set when one is found, so no null valueURI is emitted.
        if "valueURI" not in entry:
            url_identifier = next(
                (
                    identifier.get("identifier")
                    for identifier in subject.get("identifiers") or []
                    if identifier.get("scheme") == "url"
                    and identifier.get("identifier")
                ),
                None,
            )
            if url_identifier:
                entry["valueURI"] = url_identifier

        # Add props if it exists
        props = subject.get("props")
        if props:
            entry["subjectProps"] = props

        serialized_subjects.append(entry)

    return serialized_subjects or missing
2 changes: 1 addition & 1 deletion tests/resources/serializers/test_dcat_serializer.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ def test_dcat_serializer(running_app, full_record_to_dict):
" </dct:publisher>\n"
" <dct:issued "
'rdf:datatype="http://www.w3.org/2001/XMLSchema#gYear">2018</dct:issued>\n'
" <dct:subject>\n"
' <dct:subject rdf:about="http://id.nlm.nih.gov/mesh/A-D000007">\n'
" <skos:Concept>\n"
" <skos:prefLabel>Abdominal Injuries</skos:prefLabel>\n"
" <skos:inScheme>\n"
Expand Down
Loading