Skip to content

Commit

Permalink
document: dublin core improvements
Browse files Browse the repository at this point in the history
* Adds file download links to the `dc:identifier` elements.
* Adds `xml:lang` attributes to the description, subject
  and title fields.
* Closes #661.
* Closes #800.

Co-Authored-by: Bertrand Zuchuat <[email protected]>
  • Loading branch information
Garfield-fr authored and PascalRepond committed May 5, 2022
1 parent de7b33d commit 396bfc7
Show file tree
Hide file tree
Showing 3 changed files with 197 additions and 20 deletions.
70 changes: 66 additions & 4 deletions sonar/modules/documents/serializers/dc.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,8 @@

"""Dublin Core serializer."""

from dcxml import simpledc
from flask_resources.serializers import SerializerMixin
from lxml import etree

from sonar.modules.documents.serializers.schemas.dc import DublinCoreSchema

Expand All @@ -38,10 +38,72 @@ def transform_record(self, obj):
def serialize_object_xml(self, obj):
"""Serialize a single record and persistent identifier to etree.
:param obj: Record instance
:param obj: Record instance.
:returns: an etree element.
"""
json = self.transform_record(obj["_source"])
return simpledc.dump_etree(json)
data = self.transform_record(obj["_source"])
return self.serialize_dict_to_etree(data)

def serialize_dict_to_etree(self, data):
    """Serialize a transformed record to an ``oai_dc:dc`` etree element.

    Every value found under a known plural key (``titles``, ``creators``,
    ...) becomes one ``dc:<singular>`` child of the container. A value is
    either the element text itself or a dict of the form::

        {'value': <text>,
         '@attrs': [{'prefix': 'xml', 'name': 'lang', 'value': 'fr'}]}

    where ``@attrs`` is optional and ``prefix`` defaults to ``xml``.
    Keys of ``data`` that are not known Dublin Core fields are ignored.

    :param data: transformed record to dict.
    :returns: an etree element.
    """
    dc_ns = 'http://purl.org/dc/elements/1.1/'
    oai_dc_ns = 'http://www.openarchives.org/OAI/2.0/oai_dc/'
    xsi_ns = 'http://www.w3.org/2001/XMLSchema-instance'
    # The ``xml`` prefix is reserved: it is always bound to this URI and
    # must neither be redeclared nor bound to another name, so it is kept
    # out of the nsmap handed to lxml and only used to qualify attributes.
    xml_ns = 'http://www.w3.org/XML/1998/namespace'

    # Namespaces declared on the container element.
    nsmap = {
        'dc': dc_ns,
        'oai_dc': oai_dc_ns,
        'xsi': xsi_ns,
    }
    # Prefix -> namespace URI lookup used to qualify attribute names.
    prefix_uris = {**nsmap, 'xml': xml_ns}

    # Default container element.
    container = f'{{{oai_dc_ns}}}dc'
    # Default container element attributes.
    attrib = {
        f'{{{xsi_ns}}}schemaLocation':
            'http://www.openarchives.org/OAI/2.0/oai_dc/ '
            'http://www.openarchives.org/OAI/2.0/oai_dc.xsd'
    }
    # Plural key of the transformed record -> Dublin Core element name.
    elements = {
        'contributors': 'contributor',
        'creators': 'creator',
        'dates': 'date',
        'descriptions': 'description',
        'formats': 'format',
        'identifiers': 'identifier',
        'languages': 'language',
        'publishers': 'publisher',
        'relations': 'relation',
        'rights': 'rights',
        'sources': 'source',
        'subjects': 'subject',
        'titles': 'title',
        'types': 'type'
    }

    root = etree.Element(container, nsmap=nsmap, attrib=attrib)

    for key, values in data.items():
        if key not in elements:
            continue
        tag = f'{{{dc_ns}}}{elements[key]}'
        for value in values:
            attrs = {}
            if isinstance(value, dict):
                text = value['value']
                for attr in value.get('@attrs', []):
                    prefix = attr.get('prefix', 'xml')
                    # An unknown prefix is used verbatim as the namespace,
                    # which is what happened for every prefix before.
                    uri = prefix_uris.get(prefix, prefix)
                    attrs[f'{{{uri}}}{attr["name"]}'] = attr['value']
            else:
                text = value
            field = etree.SubElement(root, tag, attrs)
            field.text = text
    return root


def sonar_dublin_core(pid, record):
Expand Down
80 changes: 72 additions & 8 deletions sonar/modules/documents/serializers/schemas/dc.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@

import re

from flask import request
from flask import current_app, request
from marshmallow import fields

from sonar.modules.documents.api import DocumentRecord
Expand All @@ -46,6 +46,17 @@ class DublinCoreSchema(BaseSchema):
titles = fields.Method('get_titles')
types = fields.Method('get_types')

def translate_language(self, language):
    """Translate a language code through the configured languages map.

    The map is read from the ``SONAR_APP_LANGUAGES_MAP`` application
    setting.

    :param language: language code to translate (e.g. ``fre``).
    :returns: the mapped code when the map holds a non-empty entry for
        ``language``, otherwise ``language`` unchanged.
    """
    mapping = current_app.config['SONAR_APP_LANGUAGES_MAP']
    translated = mapping[language] if language in mapping else None
    # Fall back to the original code for missing or empty entries.
    return translated or language

def get_contributors(self, obj):
"""Get contributors."""
items = []
Expand Down Expand Up @@ -85,7 +96,21 @@ def get_dates(self, obj):

def get_descriptions(self, obj):
"""Get descriptions."""
return [file['value'] for file in obj['metadata'].get('abstracts', [])]
items = []
for abstract in obj['metadata'].get('abstracts', []):
if 'language' in abstract:
items.append({
'@attrs': [{
'prefix':'xml',
'name':'lang',
'value': self.translate_language(abstract['language'])
}],
'value':abstract['value']
})
else:
items.append(abstract['value'])

return items

def get_formats(self, obj):
"""Get formats."""
Expand All @@ -98,10 +123,27 @@ def get_formats(self, obj):

def get_identifiers(self, obj):
"""Get identifiers."""
return [
items = [
DocumentRecord.get_permanent_link(request.host_url,
obj['metadata']['pid'])
]
# If files on the document
if '_files' in obj['metadata']:
# Extraction of files only with a type file
files = filter(
lambda f: ('type' in f and f['type'] == 'file'),
obj['metadata']['_files'])
# Files sorting
files = sorted(files, key=lambda file: file.get('order', 100))
# Remove / at the end of host_url
host = request.host_url[:-1]
# Add the file only if a download link is defined
for file in files:
links = file.get('links', {})
if 'download' in links and links.get('download'):
items.append(host + links.get('download'))

return items

def get_languages(self, obj):
"""Get languages."""
Expand Down Expand Up @@ -222,7 +264,19 @@ def get_subjects(self, obj):

# Subjects
for subjects in obj['metadata'].get('subjects', []):
items = items + subjects['label']['value']
if 'language' in subjects['label']:
for value in subjects['label']['value']:
items.append({
'@attrs': [{
'prefix': 'xml',
'name': 'lang',
'value': self.translate_language(
subjects['label']['language'])
}],
'value': value
})
else:
items = items + subjects['label']['value']

# Classification
for classification in obj['metadata'].get('classification', []):
Expand All @@ -240,12 +294,22 @@ def get_subjects(self, obj):

def get_titles(self, obj):
"""Get titles."""
title = [obj['metadata']['title'][0]['mainTitle'][0]['value']]

title = {
'@attrs': [{
'prefix': 'xml',
'name': 'lang',
'value': self.translate_language(
obj['metadata']['title'][0]['mainTitle'][0]['language'])
}],
'value': obj['metadata']['title'][0]['mainTitle'][0]['value']\
.strip()
}
if obj['metadata']['title'][0].get('subtitle'):
title.append(obj['metadata']['title'][0]['subtitle'][0]['value'])
subtitle = obj['metadata']['title'][0]['subtitle'][0]['value']\
.strip()
title['value'] = f"{title['value']} : {subtitle}"

return [' : '.join(title)]
return [title]

def get_types(self, obj):
"""Get types."""
Expand Down
67 changes: 59 additions & 8 deletions tests/ui/documents/test_dc_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,11 +29,9 @@
def minimal_document(db, bucket_location, organisation):
record = DocumentRecord.create(
{
'pid':
'1000',
'pid': '1000',
'title': [{
'type':
'bf:Title',
'type': 'bf:Title',
'mainTitle': [{
'language': 'eng',
'value': 'Title of the document'
Expand Down Expand Up @@ -181,6 +179,29 @@ def test_descriptions(minimal_document):
assert result['descriptions'] == ['Description 1', 'Description 2']


def test_descriptions_attributes(minimal_document):
    """Test descriptions serialization with language attributes."""

    def with_lang(lang, text):
        """Build the expected description entry for a language code."""
        return {
            '@attrs': [{'prefix': 'xml', 'name': 'lang', 'value': lang}],
            'value': text
        }

    # Without abstracts no description is serialized.
    result = SonarDublinCoreXMLSerializer().transform_record(minimal_document)
    assert result['descriptions'] == []

    minimal_document['abstracts'] = [
        {'language': 'fre', 'value': 'Description 1'},
        {'language': 'ace', 'value': 'Description 2'},
    ]
    result = SonarDublinCoreXMLSerializer().transform_record(minimal_document)
    # 'fre' is expected to be translated to 'fr'; 'ace' comes back as is.
    assert result['descriptions'] == [
        with_lang('fr', 'Description 1'),
        with_lang('ace', 'Description 2'),
    ]

def test_formats(minimal_document):
result = SonarDublinCoreXMLSerializer().transform_record(minimal_document)
assert result['formats'] == []
Expand Down Expand Up @@ -402,7 +423,22 @@ def test_subjects(minimal_document):
}]
result = SonarDublinCoreXMLSerializer().transform_record(minimal_document)
assert result['subjects'] == [
'Subject 1', 'Subject 2', 'Sujet 1', 'Sujet 2'
{
'@attrs': [{'prefix': 'xml', 'name': 'lang', 'value': 'en'}],
'value': 'Subject 1'
},
{
'@attrs': [{'prefix': 'xml', 'name': 'lang', 'value': 'en'}],
'value': 'Subject 2'
},
{
'@attrs': [{'prefix': 'xml', 'name': 'lang', 'value': 'fr'}],
'value': 'Sujet 1'
},
{
'@attrs': [{'prefix': 'xml', 'name': 'lang', 'value': 'fr'}],
'value': 'Sujet 2'
}
]

minimal_document.pop('subjects', None)
Expand All @@ -423,7 +459,12 @@ def test_subjects(minimal_document):
def test_titles(minimal_document):
"""Test titles serialization."""
result = SonarDublinCoreXMLSerializer().transform_record(minimal_document)
assert result['titles'] == ['Title of the document']
assert result['titles'] == [
{
'@attrs': [{'prefix': 'xml', 'name': 'lang', 'value': 'en'}],
'value': 'Title of the document'
}
]

minimal_document['title'] = [{
'mainTitle': [{
Expand All @@ -437,7 +478,12 @@ def test_titles(minimal_document):
}]
}]
result = SonarDublinCoreXMLSerializer().transform_record(minimal_document)
assert result['titles'] == ['Title 1']
assert result['titles'] == [
{
'@attrs': [{'prefix': 'xml', 'name': 'lang', 'value': 'en'}],
'value': 'Title 1'
}
]

minimal_document['title'] = [{
'mainTitle': [{
Expand All @@ -450,7 +496,12 @@ def test_titles(minimal_document):
}]
}]
result = SonarDublinCoreXMLSerializer().transform_record(minimal_document)
assert result['titles'] == ['Title 1 : Subtitle 1']
assert result['titles'] == [
{
'@attrs': [{'prefix': 'xml', 'name': 'lang', 'value': 'en'}],
'value': 'Title 1 : Subtitle 1'
}
]


def test_types(minimal_document):
Expand Down

0 comments on commit 396bfc7

Please sign in to comment.