-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
HYC-1280: Create intermediate ingest object (#738)
* Create object to store XML and parse JATS XML: create an intermediate object for parsing XML and translating it to Hyrax objects - Move check for extra files in package - Do not map UNC affiliation until we can do so more reliably; put it in "other affiliation" until then * Add and configure JavaScript driver for Capybara, upgrade * Use ffaker for tests; do the setup step once per feature, for a faster run
- Loading branch information
Showing
21 changed files
with
2,382 additions
and
53 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,224 @@ | ||
# For information on the JATS metadata standard, see https://jats.nlm.nih.gov/
# Currently used for Sage ingest
#
# Reads one JATS XML file from disk, parses it with Nokogiri, and exposes the
# article, journal, and contributor metadata through reader methods.
# Methods that use +map+ return Arrays of strings (possibly empty);
# methods that use +at+ return a single String or nil when the element is absent.
class JatsIngestWork
  # NOTE(review): this includes the ActiveModel namespace module itself rather
  # than a concrete mixin such as ActiveModel::Model - confirm that is intended.
  include ActiveModel
  attr_reader :xml_path

  # @param xml_path [String] path of the JATS XML file to read
  def initialize(xml_path:)
    @xml_path = xml_path
  end

  # @return [String] raw contents of the file at +xml_path+ (read once, then cached)
  def jats_xml
    @jats_xml ||= File.read(xml_path)
  end

  # @return [Nokogiri::XML::Document] parsed form of +jats_xml+ (cached)
  def document
    @document ||= Nokogiri::XML(jats_xml)
  end

  # @return [Nokogiri::XML::NodeSet] every <article-meta> element in the document
  def article_metadata
    @article_metadata ||= document.xpath('.//article-meta')
  end

  # @return [Nokogiri::XML::NodeSet] every <contrib-group> element in the document
  def creators_metadata
    @creators_metadata ||= document.xpath('.//contrib-group')
  end

  # @return [Nokogiri::XML::NodeSet] every <journal-meta> element in the document
  def journal_metadata
    @journal_metadata ||= document.xpath('.//journal-meta')
  end

  # @return [Nokogiri::XML::NodeSet] <permissions> elements found under the article metadata
  def permissions
    @permissions ||= article_metadata.xpath('.//permissions')
  end

  # @return [Array<String>] text of each <abstract> element
  def abstract
    article_metadata.xpath('.//abstract').map(&:inner_text)
  end

  # @return [String, nil] text of the first <copyright-year>, or nil when there is none
  def copyright_date
    # Safe navigation keeps a missing element from raising, matching the other
    # single-value readers (journal_issue, journal_volume, page_start, page_end).
    permissions.at('copyright-year')&.inner_text
  end

  # @return [Hash{Integer => Hash}] contributor hashes keyed by zero-based position
  def creators
    @creators ||= creators_metadata.xpath('.//contrib').each_with_index.map do |contributor, position|
      [position, contributor_to_hash(contributor, position)]
    end.to_h
  end

  # TODO: Map affiliation to UNC controlled vocabulary
  #
  # @param contributor [Nokogiri::XML::Node] a <contrib> element
  # @param index [Integer] zero-based position of the contributor
  # @return [Hash{String => String, nil}] contributor attributes; 'index' is one-based
  def contributor_to_hash(contributor, index)
    referenced_ids = affiliation_ids(contributor)
    # Only the first referenced affiliation is kept.
    first_affiliation = affiliation_map[referenced_ids.first]
    {
      'name' => "#{surname(contributor)}, #{given_names(contributor)}",
      'orcid' => orcid(contributor),
      'affiliation' => '',
      # 'affiliation' => some_method, # Do not store affiliation until we can map it to the controlled vocabulary
      'other_affiliation' => first_affiliation,
      'index' => (index + 1).to_s
    }
  end

  # @return [Hash{String => String}] affiliation text keyed by the <aff> element's id attribute
  def affiliation_map
    @affiliation_map ||= document.xpath('//aff').each_with_object({}) do |affil, lookup|
      affil_id = affil['id']
      # An <aff> without an id cannot be the target of an xref, so skip it
      # instead of raising on the missing attribute.
      next if affil_id.nil?

      lookup[affil_id] = affiliation_to_s(affil)
    end
  end

  # @param elem [Nokogiri::XML::Node] element whose direct <xref> children are inspected
  # @return [Array<String>] rid values of the xrefs whose ref-type is "aff"
  def affiliation_ids(elem)
    elem.xpath('xref')
        .select { |ref| ref['ref-type'] == 'aff' }
        .map { |ref| ref['rid'] }
        .compact
  end

  # @param affil_elem [Nokogiri::XML::Node] an <aff> element
  # @return [String] concatenated text of the element's children
  def affiliation_to_s(affil_elem)
    affil_elem.children.map do |child|
      # Don't include newlines or the order label
      next if child.inner_text == "\n" || child.name == 'label'

      # Only include the institution name proper from the institution-wrap, don't include the institution-id
      institutions = child.xpath('.//institution')
      institutions.empty? ? child.inner_text : institutions.inner_text
    end.join
  end

  # @return [String, nil] "YYYY-MM-DD", "YYYY-MM" or "YYYY" depending on which
  #   parts are available; nil when no publication year can be found
  def date_of_publication
    year = publication_year
    month = publication_month
    day = publication_day

    return year unless month && year
    return "#{year}-#{month}" unless day

    "#{year}-#{month}-#{day}"
  end

  # @return [Array<String>] institution names found under <funding-source>
  def funder
    article_metadata.xpath('.//funding-source/institution-wrap/institution').map(&:inner_text)
  end

  # The Sage-assigned DOI
  #
  # @return [Array<String>] text of each <article-id> whose pub-id-type is "doi"
  def identifier
    article_metadata.xpath('.//article-id[@pub-id-type="doi"]').map(&:inner_text)
  end

  # @return [Array<String>] text of each <issn> in the journal metadata
  def issn
    journal_metadata.xpath('.//issn').map(&:inner_text)
  end

  # @return [String, nil] text of the first <issue>, or nil when absent
  def journal_issue
    article_metadata.at('issue')&.inner_text
  end

  # @return [String] concatenated text of the matching <journal-title> elements
  def journal_title
    journal_metadata.xpath('.//journal-title-group/journal-title').inner_text
  end

  # @return [String, nil] text of the first <volume>, or nil when absent
  def journal_volume
    article_metadata.at('volume')&.inner_text
  end

  # @return [Array<String>] keyword text; empty when the article has no <kwd-group>
  def keyword
    # A relative path keeps the search inside the article metadata's keyword
    # groups and yields an empty result (not an error) when there are none.
    article_metadata.xpath('.//kwd-group//kwd').map do |kwd|
      # When a keyword contains an <italic> element, only that element's text is used.
      (kwd.at('italic') || kwd).inner_text
    end
  end

  # @return [Array] the :id of the CdrLicenseService authority entry found for each license href
  def license
    permissions.xpath('.//license/@xlink:href').map do |href|
      CdrLicenseService.authority.find(href.inner_text)[:id]
    end
  end

  # @return [Array] result of CdrLicenseService.label for each value returned by +license+
  def license_label
    license.map { |license_id| CdrLicenseService.label(license_id) }
  end

  # @return [String, nil] text of the first <lpage>, or nil when absent
  def page_end
    article_metadata.at('lpage')&.inner_text
  end

  # @return [String, nil] text of the first <fpage>, or nil when absent
  def page_start
    article_metadata.at('fpage')&.inner_text
  end

  # @return [Array<String>] text of each <publisher-name> in the journal metadata
  def publisher
    journal_metadata.xpath('.//publisher/publisher-name').map(&:inner_text)
  end

  # @return [Array<String>] text of each <copyright-holder>
  def rights_holder
    permissions.xpath('.//copyright-holder').map(&:inner_text)
  end

  # @return [Array<String>] text of each <article-title>
  def title
    article_metadata.xpath('.//title-group/article-title').map(&:inner_text)
  end

  private

  # @return [String, nil] four-digit, zero-padded publication year
  def publication_year
    publication_date_part('year', 4)
  end

  # @return [String, nil] two-digit, zero-padded publication month
  def publication_month
    publication_date_part('month', 2)
  end

  # @return [String, nil] two-digit, zero-padded publication day
  def publication_day
    publication_date_part('day', 2)
  end

  # Reads one numeric component of the selected publication date.
  #
  # @param element_name [String] child element to read ('year', 'month' or 'day')
  # @param width [Integer] minimum number of digits, left-padded with zeros
  # @return [String, nil] the padded value, or nil when there is no matching
  #   <pub-date> or it lacks the requested child element
  def publication_date_part(element_name, width)
    # publication_date_node_set is nil when no <pub-date> matched at all.
    text = publication_date_node_set&.at(element_name)&.inner_text
    format("%0#{width}d", text.to_i) if text
  end

  # Picks the publication date to use, in order of preference:
  # "ppub", then "epub-ppub", then "epub".
  #
  # @return [Nokogiri::XML::NodeSet, nil] the first non-empty candidate, or nil
  def publication_date_node_set
    [
      physical_publication_date,
      electronic_and_physical_publication_date,
      electronic_publication_date
    ].find { |nodes| !nodes.empty? }
  end

  # @return [Nokogiri::XML::NodeSet] <pub-date> elements with pub-type "epub"
  def electronic_publication_date
    article_metadata.xpath('.//pub-date[@pub-type="epub"]')
  end

  # @return [Nokogiri::XML::NodeSet] <pub-date> elements with pub-type "epub-ppub"
  def electronic_and_physical_publication_date
    article_metadata.xpath('.//pub-date[@pub-type="epub-ppub"]')
  end

  # @return [Nokogiri::XML::NodeSet] <pub-date> elements with pub-type "ppub"
  def physical_publication_date
    article_metadata.xpath('.//pub-date[@pub-type="ppub"]')
  end

  # @param contributor [Nokogiri::XML::Node] a <contrib> element
  # @return [String] text of name/surname ("" when absent)
  def surname(contributor)
    contributor.xpath('name/surname').inner_text
  end

  # @param contributor [Nokogiri::XML::Node] a <contrib> element
  # @return [String] text of name/given-names ("" when absent)
  def given_names(contributor)
    contributor.xpath('name/given-names').inner_text
  end

  # @param contributor [Nokogiri::XML::Node] a <contrib> element
  # @return [String] concatenated text of the direct <contrib-id> children ("" when absent)
  def orcid(contributor)
    contributor.xpath('contrib-id').inner_text
  end
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.