diff --git a/README.md b/README.md index 4ec73be31..e5165341b 100755 --- a/README.md +++ b/README.md @@ -29,7 +29,7 @@ bundle exec rake sage:ingest[PATH_TO_CONFIG] bundle exec rake sage:ingest[/hyrax/spec/fixtures/sage/sage_config.yml] ``` * See [example config file](spec/fixtures/sage/sage_config.yml). -* The beginning and ending of the rake task will be output to the console, the remaining events will be logged to the rails log and tagged with "Sage ingest". +* The beginning and ending of the rake task will be output to the console, the remaining events will be logged to the sage ingest log (in the same directory as the Rails log). #### Testing ##### RSpec Testing diff --git a/app/models/jats_ingest_work.rb b/app/models/jats_ingest_work.rb index 6f766d1ba..d5a72e640 100644 --- a/app/models/jats_ingest_work.rb +++ b/app/models/jats_ingest_work.rb @@ -140,8 +140,10 @@ def keyword def license permissions.xpath('.//license/@xlink:href').map do |elem| - CdrLicenseService.authority.find(elem&.inner_text)[:id] - end + controlled_vocab_hash = CdrLicenseService.authority.find(elem&.inner_text) + Rails.logger.warn("Could not match license uri: #{elem&.inner_text} to a license in the controlled vocabulary. Work with DOI #{identifier&.first} may not include the required license.") if controlled_vocab_hash.empty? + controlled_vocab_hash[:id] + end.compact end def license_label diff --git a/app/services/tasks/sage_ingest_service.rb b/app/services/tasks/sage_ingest_service.rb index 1f1425732..103b96d09 100644 --- a/app/services/tasks/sage_ingest_service.rb +++ b/app/services/tasks/sage_ingest_service.rb @@ -1,12 +1,27 @@ module Tasks require 'tasks/migrate/services/progress_tracker' class SageIngestService - attr_reader :package_dir, :ingest_progress_log, :admin_set + attr_reader :package_dir, :ingest_progress_log, :admin_set, :depositor + + def logger + @logger ||= begin + log_path = File.join(Rails.configuration.log_directory, 'sage_ingest.log') + Logger.new(log_path, progname: 'Sage ingest') + end + end def initialize(args) config = YAML.load_file(args[:configuration_file]) + # Create temp directory for unzipped contents + @temp = config['unzip_dir'] + FileUtils.mkdir_p @temp unless File.exist?(@temp) + + logger.info('Beginning Sage ingest') + @admin_set = ::AdminSet.where(title: config['admin_set'])&.first + raise(ActiveRecord::RecordNotFound, "Could not find AdminSet with title #{config['admin_set']}") unless @admin_set.present? - @admin_set = ::AdminSet.where(title: config['admin_set']).first + @depositor = User.find_by(uid: config['depositor_onyen']) + raise(ActiveRecord::RecordNotFound, "Could not find User with onyen #{config['depositor_onyen']}") unless @depositor.present? @package_dir = config['package_dir'] @ingest_progress_log = Migrate::Services::ProgressTracker.new(config['ingest_progress_log']) @@ -15,29 +30,47 @@ def initialize(args) def process_packages sage_package_paths = Dir.glob("#{@package_dir}/*.zip").sort count = sage_package_paths.count - Rails.logger.tagged('Sage ingest') { Rails.logger.info("Beginning ingest of #{count} Sage packages") } + logger.info("Beginning ingest of #{count} Sage packages") sage_package_paths.each.with_index(1) do |package_path, index| - Rails.logger.tagged('Sage ingest') { Rails.logger.info("Begin processing #{package_path} (#{index} of #{count})") } + logger.info("Begin processing #{package_path} (#{index} of #{count})") orig_file_name = File.basename(package_path, '.zip') - Dir.mktmpdir do |dir| - file_names = extract_files(package_path, dir).keys - next unless file_names.count == 2 - - _pdf_file_name = file_names.find { |name| name.match(/^(\S*).pdf/) } - xml_file_name = file_names.find { |name| name.match(/^(\S*).xml/) } - # parse xml - ingest_work = JatsIngestWork.new(xml_path: File.join(dir, xml_file_name)) - # Create Article with metadata and save - build_article(ingest_work) - # Add PDF file to Article (including FileSets) - # save object - # set off background jobs for object? - mark_done(orig_file_name) if package_ingest_complete?(dir, file_names) + + file_names = extract_files(package_path, @temp).keys + unless file_names.count.between?(2, 3) + logger.info("Error extracting #{package_path}: skipping zip file") + next end + + pdf_file_name = file_names.find { |name| name.match(/^(\S*).pdf/) } + + jats_xml_path = jats_xml_path(file_names: file_names, dir: @temp) + + # parse xml + ingest_work = JatsIngestWork.new(xml_path: jats_xml_path) + # Create Article with metadata and save + art_with_meta = article_with_metadata(ingest_work) + # Add PDF file to Article (including FileSets) + attach_file_set_to_work(work: art_with_meta, dir: @temp, file_name: pdf_file_name, user: @depositor, visibility: art_with_meta.visibility) + # Add xml metadata file to Article + attach_file_set_to_work(work: art_with_meta, dir: @temp, file_name: jats_xml_file_name(file_names: file_names), user: @depositor, visibility: Hydra::AccessControls::AccessRight::VISIBILITY_TEXT_VALUE_PRIVATE) + mark_done(orig_file_name) if package_ingest_complete?(@temp, file_names) end + logger.info("Completing ingest of #{count} Sage packages.") end - def build_article(ingest_work) + def jats_xml_path(file_names:, dir:) + jats_xml_name = jats_xml_file_name(file_names: file_names) + + File.join(dir, jats_xml_name) + end + + def jats_xml_file_name(file_names:) + file_names -= ['manifest.xml'] + file_names.find { |name| name.match(/^(\S*).xml/) } + end + + def article_with_metadata(ingest_work) + logger.info("Creating article from DOI: #{ingest_work.identifier}") art = Article.new art.admin_set = @admin_set # required fields @@ -63,6 +96,7 @@ def build_article(ingest_work) art.resource_type = ['Article'] art.rights_holder = ingest_work.rights_holder art.rights_statement = 'http://rightsstatements.org/vocab/InC/1.0/' + art.rights_statement_label = 'In Copyright' art.visibility = Hydra::AccessControls::AccessRight::VISIBILITY_TEXT_VALUE_PUBLIC # fields not normally edited via UI art.date_uploaded = DateTime.current @@ -73,8 +107,22 @@ def build_article(ingest_work) art end + def attach_file_set_to_work(work:, dir:, file_name:, user:, visibility:) + file_set_params = { visibility: visibility } + logger.info("Attaching file_set for #{file_name} to DOI: #{work.identifier.first}") + file_set = FileSet.create + actor = Hyrax::Actors::FileSetActor.new(file_set, user) + actor.create_metadata(file_set_params) + file = File.open(File.join(dir, file_name)) + actor.create_content(file) + actor.attach_to_work(work, file_set_params) + file.close + + file_set + end + def mark_done(orig_file_name) - Rails.logger.tagged('Sage ingest') { Rails.logger.info("Marked package ingest complete #{orig_file_name}") } + logger.info("Marked package ingest complete #{orig_file_name}") @ingest_progress_log.add_entry(orig_file_name) end @@ -82,21 +130,22 @@ def mark_done(orig_file_name) def package_ingest_complete?(dir, file_names) return true if File.exist?(File.join(dir, file_names.first)) && File.exist?(File.join(dir, file_names.last)) - Rails.logger.tagged('Sage ingest') { Rails.logger.error("Package ingest not complete for #{file_names.first} and #{file_names.last}") } + logger.error("Package ingest not complete for #{file_names.first} and #{file_names.last}") false end def extract_files(package_path, temp_dir) + logger.info("Extracting files from #{package_path} to #{temp_dir}") extracted_files = Zip::File.open(package_path) do |zip_file| zip_file.each do |file| file_path = File.join(temp_dir, file.name) - zip_file.extract(file, file_path) + zip_file.extract(file, file_path) unless File.exist?(file_path) end end - Rails.logger.tagged('Sage ingest') { Rails.logger.error("Unexpected package contents - more than two files extracted from #{package_path}") } unless extracted_files.count == 2 + logger.error("Unexpected package contents - #{extracted_files.count} files extracted from #{package_path}") unless extracted_files.count.between?(2, 3) extracted_files rescue Zip::DestinationFileExistsError => e - Rails.logger.tagged('Sage ingest') { Rails.logger.info("#{package_path}, zip file error: #{e.message}") } + logger.info("#{package_path}, zip file error: #{e.message}") end end end diff --git a/config/application.rb b/config/application.rb index c2ec3895f..e9b29a82e 100755 --- a/config/application.rb +++ b/config/application.rb @@ -35,6 +35,7 @@ class Application < Rails::Application config.log_formatter = proc do |severity, time, _progname, msg| "#{time} - #{severity}: #{msg}\n" end + config.log_directory = ENV['LOG_DIRECTORY'] || 'log' log_path = ENV['LOGS_PATH'] || "log/#{Rails.env}.log" logger = ActiveSupport::Logger.new(log_path) logger.formatter = config.log_formatter diff --git a/config/authorities/licenses.yml b/config/authorities/licenses.yml index 68ef414dc..dbf571730 100755 --- a/config/authorities/licenses.yml +++ b/config/authorities/licenses.yml @@ -2,6 +2,9 @@ terms: - id: http://creativecommons.org/licenses/by/3.0/us/ term: Attribution 3.0 United States active: data + - id: http://creativecommons.org/licenses/by/4.0/ + term: Attribution 4.0 International + active: all - id: http://creativecommons.org/licenses/by-sa/3.0/us/ term: Attribution-ShareAlike 3.0 United States active: all diff --git a/config/environments/production.rb b/config/environments/production.rb index 633c4f38d..69c99d8cd 100755 --- a/config/environments/production.rb +++ b/config/environments/production.rb @@ -74,6 +74,8 @@ # require 'syslog/logger' # config.logger = ActiveSupport::TaggedLogging.new(Syslog::Logger.new 'app-name') + config.log_directory = ENV['LOG_DIRECTORY'] || '/logs/hyc/' + if ENV['RAILS_LOG_TO_STDOUT'].present? logger = ActiveSupport::Logger.new(STDOUT) logger.formatter = config.log_formatter diff --git a/spec/features/boxc_redirect_spec.rb b/spec/features/boxc_redirect_spec.rb index 45ba7642d..ae7cde5f7 100644 --- a/spec/features/boxc_redirect_spec.rb +++ b/spec/features/boxc_redirect_spec.rb @@ -77,13 +77,14 @@ end describe '#after_sign_in_path_for' do + let(:admin) { FactoryBot.create(:admin, password: 'password') } scenario 'visiting a private work page' do private_article = Article.create(title: ['test article']) visit "#{ENV['HYRAX_HOST']}/concern/articles/#{private_article.id}" expect(current_path).to eq '/users/sign_in' - fill_in 'Onyen', with: 'admin' + fill_in 'Onyen', with: admin.uid fill_in 'Password', with: 'password' click_button 'Log in' diff --git a/spec/features/create_batch_of_works_spec.rb b/spec/features/create_batch_of_works_spec.rb index 50f5a7587..85774f408 100644 --- a/spec/features/create_batch_of_works_spec.rb +++ b/spec/features/create_batch_of_works_spec.rb @@ -4,18 +4,14 @@ # testing overrides RSpec.feature 'Create a batch of works', js: false do context 'a logged in user' do - let(:user) do - User.new(email: 'test@example.com', guest: false, uid: 'test') { |u| u.save!(validate: false) } - end + let(:user) { FactoryBot.create(:user) } - let(:admin_user) do - User.find_by_user_key('admin') - end + let(:admin_user) { FactoryBot.create(:admin) } scenario 'as a non-admin' do login_as user visit root_path - expect(page).to have_content user.uid + expect(page).to have_content user.display_name within("//ul[@id='user_utility_links']") do click_link 'Dashboard' @@ -33,7 +29,7 @@ scenario 'as an admin' do login_as admin_user visit root_path - expect(page).to have_content admin_user.uid + expect(page).to have_content admin_user.display_name within("//ul[@id='user_utility_links']") do click_link 'Dashboard' diff --git a/spec/features/edit_sage_ingested_works_spec.rb b/spec/features/edit_sage_ingested_works_spec.rb index 85c91bdb1..6ba974dc5 100644 --- a/spec/features/edit_sage_ingested_works_spec.rb +++ b/spec/features/edit_sage_ingested_works_spec.rb @@ -3,105 +3,122 @@ include Warden::Test::Helpers -RSpec.feature 'Edit works created through the Sage ingest', js: false do +RSpec.feature 'Edit works created through the Sage ingest', :sage, js: false do let(:ingest_progress_log_path) { File.join(fixture_path, 'sage', 'ingest_progress.log') } + let(:admin) { FactoryBot.create(:admin) } + let(:admin_set) do + AdminSet.create(title: ['sage admin set'], description: ['some description'], edit_users: [admin.user_key]) + end + let(:path_to_config) { File.join(fixture_path, 'sage', 'sage_config.yml') } + let(:path_to_tmp) { FileUtils.mkdir_p(File.join(fixture_path, 'sage', 'tmp')).first } + let(:ingest_service) { Tasks::SageIngestService.new(configuration_file: path_to_config) } + let(:article_count) { Article.count } + let(:articles) { Article.all } + # We're not clearing out the database, Fedora, and Solr before this test, so to find the first work created in this + # test, we need to count backwards from the last work created. + let(:first_work) { articles[-4] } + let(:first_work_id) { first_work.id } + let(:third_work_id) { articles[-2].id } # empty the progress log - around do |example| + around(:all) do |example| File.open(ingest_progress_log_path, 'w') { |file| file.truncate(0) } example.run File.open(ingest_progress_log_path, 'w') { |file| file.truncate(0) } + FileUtils.remove_entry(path_to_tmp) end - before(:all) do - @admin_user = FactoryBot.create(:admin) - @admin_set = AdminSet.create(title: ['sage admin set'], - description: ['some description'], - edit_users: [@admin_user.user_key]) - @path_to_config = File.join(fixture_path, 'sage', 'sage_config.yml') - - @ingest_service = Tasks::SageIngestService.new(configuration_file: @path_to_config) - @ingest_service.process_packages - - @article_count = Article.count - @articles = Article.all - # We're not clearing out the database, Fedora, and Solr before this test, so to find the first work created in this - # test, we need to count backwards from the last work created. - @first_work = @articles[-4] - @first_work_id = @first_work.id - @third_work_id = @articles[-2].id + before(:each) do + # return the FactoryBot admin user when searching for uid: admin from config + allow(User).to receive(:find_by).and_return(admin) + # Stub background jobs that don't do well in CI + # stub virus checking + allow(Hydra::Works::VirusCheckerService).to receive(:file_has_virus?) { false } + # stub longleaf job + allow(RegisterToLongleafJob).to receive(:perform_later).and_return(nil) + # stub FITS characterization + allow(CharacterizeJob).to receive(:perform_later) + # instantiate the sage ingest admin_set + admin_set + ingest_service.process_packages end - it 'can open the edit page' do - login_as @admin_user - visit "concern/articles/#{@first_work_id}" - expect(page).to have_content('Inequalities in Cervical Cancer Screening Uptake Between') - expect(page).to have_content('Smith, Jennifer S.') - expect(page).to have_content('sage admin set') - expect(page).to have_content('Attribution-NonCommercial 4.0 International') - click_link('Edit') - expect(page).to have_link('Work Deposit Form') + context 'as a regular user' do + let(:user) { FactoryBot.create(:user) } + it 'gives an unauthorized message' do + login_as user + visit "concern/articles/#{first_work_id}/edit" + expect(page).to have_content('Unauthorized') + end end - it 'has attached the file_set to the work' do - pending('Adding file sets to the Article object') - expect(@first_work.file_sets.first).to be_instance_of(FileSet) - end + context 'as an admin user' do + it 'can open the edit page' do + login_as admin + visit "concern/articles/#{first_work_id}" + expect(page).to have_content('Inequalities in Cervical Cancer Screening Uptake Between') + expect(page).to have_content('Smith, Jennifer S.') + expect(page).to have_content('sage admin set') + expect(page).to have_content('Attribution-NonCommercial 4.0 International') + expect(page).to have_content('February 1, 2021') + expect(page).to have_content('In Copyright') + click_link('Edit', match: :first) + expect(page).to have_link('Work Deposit Form') + end - it 'can render the pre-populated edit page' do - login_as @admin_user - visit "concern/articles/#{@first_work_id}/edit" - # These values are also tested in the spec/services/tasks/sage_ingest_service_spec.rb - # form order - expect(page).to have_field('Title', with: 'Inequalities in Cervical Cancer Screening Uptake Between Chinese Migrant Women and Local Women: A Cross-Sectional Study') - expect(page).to have_field('Creator #1', with: 'Holt, Hunter K.') - expect(page).to have_field('Additional affiliation (Creator #1)', with: 'Department of Family and Community Medicine, University of California, San Francisco, CA, USA') - expect(page).to have_field('ORCID (Creator #1)', with: 'https://orcid.org/0000-0001-6833-8372') - expect(page).to have_field('Abstract', with: /Efforts to increase education opportunities, provide insurance/) - click_link('Optional fields') - # alpha order below - expect(page).to have_field('Copyright date', with: '2021') - expect(page).to have_field('Date of publication', with: 'February 1, 2021') # aka date_issued - expect(page).to have_select('Dcmi type', with_selected: 'Text') - expect(page).to have_field('Funder', with: 'Fogarty International Center') - expect(page).to have_field('Identifier', with: 'https://doi.org/10.1177/1073274820985792') - expect(page).to have_field('ISSN', with: '1073-2748') - expect(page).to have_field('Journal issue', with: '') - expect(page).to have_field('Journal title', with: 'Cancer Control') - expect(page).to have_field('Journal volume', with: '28') - # keywords - expected_keywords = ['HPV', 'HPV knowledge and awareness', 'cervical cancer screening', 'migrant women', 'China', ''] - keyword_fields = page.all(:xpath, '/html/body/div[2]/div[2]/form/div/div[1]/div/div/div[1]/div[2]/div[2]/div[1]/ul/li/input') - keywords = keyword_fields.map(&:value) - expect(keyword_fields.count).to eq 6 - expect(keywords).to match_array(expected_keywords) - expect(page).to have_select('License', with_selected: 'Attribution-NonCommercial 4.0 International') - expect(page).to have_field('Publisher', with: 'SAGE Publications') - expect(page).to have_select('Resource type', with_selected: 'Article') - expect(page).to have_field('Rights holder', with: /SAGE Publications Inc, unless otherwise noted. Manuscript/) - expect(page).to have_select('Rights statement', with_selected: 'In Copyright') - expect(page).to have_checked_field('Public') - end - - # creators after the first one need JS to render - it 'can render the javascript-drawn fields', js: true do - login_as @admin_user - visit "concern/articles/#{@first_work_id}/edit" - expect(page).to have_field('Creator #2', with: 'Zhang, Xi') - end + # Creator 2 only visible with Javascript on + it 'can render the pre-populated edit page', js: true do + login_as admin + visit "concern/articles/#{first_work_id}/edit" + # These values are also tested in the spec/services/tasks/sage_ingest_service_spec.rb + # form order + expect(page).to have_field('Title', with: 'Inequalities in Cervical Cancer Screening Uptake Between Chinese Migrant Women and Local Women: A Cross-Sectional Study') + expect(page).to have_field('Creator #1', with: 'Holt, Hunter K.') + # Creator 2 only visible with Javascript on + expect(page).to have_field('Creator #2', with: 'Zhang, Xi') + expect(page).to have_field('Additional affiliation (Creator #1)', with: 'Department of Family and Community Medicine, University of California, San Francisco, CA, USA') + expect(page).to have_field('ORCID (Creator #1)', with: 'https://orcid.org/0000-0001-6833-8372') + expect(page).to have_field('Abstract', with: /Efforts to increase education opportunities, provide insurance/) + click_link('Optional fields') + # alpha order below + expect(page).to have_field('Copyright date', with: '2021') + expect(page).to have_field('Date of publication', with: 'February 1, 2021') # aka date_issued + expect(page).to have_select('Dcmi type', with_selected: 'Text') + expect(page).to have_field('Funder', with: 'Fogarty International Center') + expect(page).to have_field('Identifier', with: 'https://doi.org/10.1177/1073274820985792') + expect(page).to have_field('ISSN', with: '1073-2748') + expect(page).to have_field('Journal issue', with: '') + expect(page).to have_field('Journal title', with: 'Cancer Control') + expect(page).to have_field('Journal volume', with: '28') + # keywords + expected_keywords = ['HPV', 'HPV knowledge and awareness', 'cervical cancer screening', 'migrant women', 'China', ''] + keyword_fields = page.all(:xpath, '/html/body/div[2]/div[2]/form/div/div[1]/div/div/div[1]/div[2]/div[2]/div[1]/ul/li/input') + keywords = keyword_fields.map(&:value) + expect(keyword_fields.count).to eq 6 + expect(keywords).to match_array(expected_keywords) + expect(page).to have_select('License', with_selected: 'Attribution-NonCommercial 4.0 International') + expect(page).to have_field('Publisher', with: 'SAGE Publications') + expect(page).to have_select('Resource type', with_selected: 'Article') + expect(page).to have_field('Rights holder', with: /SAGE Publications Inc, unless otherwise noted. Manuscript/) + expect(page).to have_select('Rights statement', with_selected: 'In Copyright') + expect(page).to have_checked_field('Public') + end - it 'can render values only present on the second work' do - login_as @admin_user - visit "concern/articles/#{@third_work_id}/edit" - expect(page).to have_field('Title', with: /The Prevalence of Bacterial Infection in Patients Undergoing/) - expect(page).to have_field('Journal issue', with: '1') - expect(page).to have_field('Journal volume', with: '11') - keyword_fields = page.all(:xpath, '/html/body/div[2]/div[2]/form/div/div[1]/div/div/div[1]/div[2]/div[2]/div[1]/ul/li/input') - keywords = keyword_fields.map(&:value) - expect(keyword_fields.count).to eq 8 - expect(keywords).to include('Propionibacterium acnes') - expect(page).to have_select('License', with_selected: 'Attribution-NonCommercial-NoDerivatives 4.0 International') - expect(page).to have_field('Page end', with: '20') - expect(page).to have_field('Page start', with: '13') + it 'can render values only present on the second work' do + login_as admin + visit "concern/articles/#{third_work_id}/edit" + expect(page).to have_field('Title', with: /The Prevalence of Bacterial Infection in Patients Undergoing/) + # date that includes only month and year on the edit page + expect(page).to have_field('Date of publication', with: 'January 2021') # aka date_issued + expect(page).to have_field('Journal issue', with: '1') + expect(page).to have_field('Journal volume', with: '11') + keyword_fields = page.all(:xpath, '/html/body/div[2]/div[2]/form/div/div[1]/div/div/div[1]/div[2]/div[2]/div[1]/ul/li/input') + keywords = keyword_fields.map(&:value) + expect(keyword_fields.count).to eq 8 + expect(keywords).to include('Propionibacterium acnes') + expect(page).to have_select('License', with_selected: 'Attribution-NonCommercial-NoDerivatives 4.0 International') + expect(page).to have_field('Page end', with: '20') + expect(page).to have_field('Page start', with: '13') + end end end diff --git a/spec/fixtures/sage/AJH_2021_38_4_10.1177_1049909120951088.zip b/spec/fixtures/sage/AJH_2021_38_4_10.1177_1049909120951088.zip new file mode 100644 index 000000000..306b88f80 Binary files /dev/null and b/spec/fixtures/sage/AJH_2021_38_4_10.1177_1049909120951088.zip differ diff --git a/spec/fixtures/sage/quadruple_package.zip b/spec/fixtures/sage/quadruple_package.zip new file mode 100644 index 000000000..b07bc56d9 Binary files /dev/null and b/spec/fixtures/sage/quadruple_package.zip differ diff --git a/spec/fixtures/sage/sage_config.yml b/spec/fixtures/sage/sage_config.yml index 6f1c52e09..d4909927f 100644 --- a/spec/fixtures/sage/sage_config.yml +++ b/spec/fixtures/sage/sage_config.yml @@ -1,8 +1,9 @@ # Example configuration for Sage ingest +unzip_dir: spec/fixtures/sage/tmp package_dir: spec/fixtures/sage ingest_progress_log: spec/fixtures/sage/ingest_progress.log admin_set: sage admin set -# depositor_onyen: admin +depositor_onyen: admin # deposit_title: Deposit by Sage Depositor via CDR Collector 1.0 # deposit_method: CDR Collector 1.0 # deposit_type: Sage diff --git a/spec/fixtures/sage/tmp/.keep b/spec/fixtures/sage/tmp/.keep deleted file mode 100644 index e69de29bb..000000000 diff --git a/spec/fixtures/sage/triple_package.zip b/spec/fixtures/sage/triple_package.zip deleted file mode 100644 index b002cc10c..000000000 Binary files a/spec/fixtures/sage/triple_package.zip and /dev/null differ diff --git a/spec/models/jats_ingest_work_spec.rb b/spec/models/jats_ingest_work_spec.rb index bb0ded1f0..ec63c5084 100644 --- a/spec/models/jats_ingest_work_spec.rb +++ b/spec/models/jats_ingest_work_spec.rb @@ -1,9 +1,25 @@ require 'rails_helper' -RSpec.describe JatsIngestWork, type: :model do +RSpec.describe JatsIngestWork, :sage, type: :model do let(:xml_file_path) { File.join(fixture_path, 'sage', 'CCX_2021_28_10.1177_1073274820985792', '10.1177_1073274820985792.xml') } let(:work) { described_class.new(xml_path: xml_file_path) } + context 'when it can\'t match the license' do + before do + allow(CdrLicenseService.authority).to receive(:find).and_return({}) + end + + it 'can return the license info' do + expect(work.license).to eq([]) + end + + it 'logs a warning to the rails log' do + allow(Rails.logger).to receive(:warn) + work.license + expect(Rails.logger).to have_received(:warn).with('Could not match license uri: https://creativecommons.org/licenses/by-nc/4.0/ to a license in the controlled vocabulary. Work with DOI https://doi.org/10.1177/1073274820985792 may not include the required license.') + end + end + it 'can be initialized' do expect(described_class.new(xml_path: xml_file_path)).to be_instance_of described_class end diff --git a/spec/services/tasks/sage_ingest_service_spec.rb b/spec/services/tasks/sage_ingest_service_spec.rb index 283764f4b..cab04b782 100644 --- a/spec/services/tasks/sage_ingest_service_spec.rb +++ b/spec/services/tasks/sage_ingest_service_spec.rb @@ -1,12 +1,14 @@ require 'rails_helper' include ActiveSupport::Testing::TimeHelpers -RSpec.describe Tasks::SageIngestService do +RSpec.describe Tasks::SageIngestService, :sage do + include ActiveJob::TestHelper + let(:service) { described_class.new(configuration_file: path_to_config) } let(:sage_fixture_path) { File.join(fixture_path, 'sage') } let(:path_to_config) { File.join(sage_fixture_path, 'sage_config.yml') } - let(:path_to_tmp) { File.join(sage_fixture_path, 'tmp') } + let(:path_to_tmp) { FileUtils.mkdir_p(File.join(fixture_path, 'sage', 'tmp')).first } let(:first_package_identifier) { 'CCX_2021_28_10.1177_1073274820985792' } let(:first_zip_path) { "spec/fixtures/sage/#{first_package_identifier}.zip" } let(:first_dir_path) { "spec/fixtures/sage/tmp/#{first_package_identifier}" } @@ -21,137 +23,227 @@ end before do + # return the FactoryBot admin user when searching for uid: admin from config + allow(User).to receive(:find_by).with(uid: 'admin').and_return(admin) # instantiate the sage ingest admin_set admin_set end - # empty the progress log - around do |example| - File.open(ingest_progress_log_path, 'w') { |file| file.truncate(0) } - example.run - File.open(ingest_progress_log_path, 'w') { |file| file.truncate(0) } + after do + FileUtils.remove_entry(path_to_tmp) end - describe '#initialize' do - it 'sets parameters from the configuration file' do - expect(service.package_dir).to eq 'spec/fixtures/sage' - expect(service.admin_set).to be_instance_of(AdminSet) + context 'running the background jobs' do + let(:ingest_work) { JatsIngestWork.new(xml_path: first_xml_path) } + let(:built_article) { service.article_with_metadata(ingest_work) } + let(:user) { FactoryBot.create(:admin) } + + before do + # Stub background jobs that don't do well in CI + # stub virus checking + allow(Hydra::Works::VirusCheckerService).to receive(:file_has_virus?) { false } + # stub longleaf job + allow(RegisterToLongleafJob).to receive(:perform_later).and_return(nil) + # stub FITS characterization + allow(CharacterizeJob).to receive(:perform_later) end - it 'creates a progress log for the ingest' do - expect(service.ingest_progress_log).to be_instance_of(Migrate::Services::ProgressTracker) + around do |example| + File.open(ingest_progress_log_path, 'w') { |file| file.truncate(0) } + perform_enqueued_jobs do + example.run + end + File.open(ingest_progress_log_path, 'w') { |file| file.truncate(0) } end - end - it 'can run a wrapper method' do - expect(File.foreach(ingest_progress_log_path).count).to eq 0 - expect do - service.process_packages - end.to change { Article.count }.by(4) - expect(File.foreach(ingest_progress_log_path).count).to eq 4 + it 'attaches a file to the file_set' do + service.extract_files(first_zip_path, path_to_tmp) + service.attach_file_set_to_work(work: built_article, dir: path_to_tmp, file_name: '10.1177_1073274820985792.pdf', user: user, visibility: 'open') + fs = built_article.file_sets.first + expect(fs.files.first).to be_instance_of(Hydra::PCDM::File) + end end - context 'with an ingest work object' do - let(:ingest_work) { JatsIngestWork.new(xml_path: first_xml_path) } - let(:built_article) { service.build_article(ingest_work) } - let(:temp_dir) { Dir.mktmpdir } - - after do - FileUtils.remove_entry(temp_dir) + context 'without running the background jobs' do + # empty the progress log + around do |example| + File.open(ingest_progress_log_path, 'w') { |file| file.truncate(0) } + example.run + File.open(ingest_progress_log_path, 'w') { |file| file.truncate(0) } end - it 'can create a valid article' do + describe '#initialize' do + it 'sets parameters from the configuration file' do + expect(service.package_dir).to eq 'spec/fixtures/sage' + expect(service.admin_set).to be_instance_of(AdminSet) + expect(service.depositor).to be_instance_of(User) + end + + it 'creates a progress log for the ingest' do + expect(service.ingest_progress_log).to be_instance_of(Migrate::Services::ProgressTracker) + end + end + # rubocop:disable Layout/MultilineMethodCallIndentation + it 'can run a wrapper method' do + expect(File.foreach(ingest_progress_log_path).count).to eq 0 expect do - service.build_article(ingest_work) - end.to change { Article.count }.by(1) + service.process_packages + end.to change { Article.count }.by(5) + .and change { FileSet.count }.by(10) + expect(File.foreach(ingest_progress_log_path).count).to eq 5 end - - it 'returns a valid article' do - expect(built_article).to be_instance_of Article - expect(built_article.persisted?).to be true - expect(built_article.valid?).to be true - # These values are also tested via the edit form in spec/features/edit_sage_ingested_works_spec.rb - expect(built_article.title).to eq(['Inequalities in Cervical Cancer Screening Uptake Between Chinese Migrant Women and Local Women: A Cross-Sectional Study']) - first_creator = built_article.creators.find { |creator| creator[:index] == ['1'] } - expect(first_creator.attributes['name']).to match_array(['Holt, Hunter K.']) - expect(first_creator.attributes['other_affiliation']).to match_array(['Department of Family and Community Medicine, University of California, San Francisco, CA, USA']) - expect(first_creator.attributes['orcid']).to match_array(['https://orcid.org/0000-0001-6833-8372']) - expect(built_article.abstract).to include(/Efforts to increase education opportunities, provide insurance/) - expect(built_article.date_issued).to eq('2021-02-01') - expect(built_article.copyright_date).to eq('2021') - expect(built_article.dcmi_type).to match_array(['http://purl.org/dc/dcmitype/Text']) - expect(built_article.funder).to match_array(['Fogarty International Center']) - expect(built_article.identifier).to match_array(['https://doi.org/10.1177/1073274820985792']) - expect(built_article.issn).to match_array(['1073-2748']) - expect(built_article.journal_issue).to be nil - expect(built_article.journal_title).to eq('Cancer Control') - expect(built_article.journal_volume).to eq('28') - expect(built_article.keyword).to match_array(['HPV', 'HPV knowledge and awareness', 'cervical cancer screening', 'migrant women', 'China']) - expect(built_article.license).to match_array(['http://creativecommons.org/licenses/by-nc/4.0/']) - expect(built_article.license_label).to match_array(['Attribution-NonCommercial 4.0 International']) - expect(built_article.publisher).to match_array(['SAGE Publications']) - expect(built_article.resource_type).to match_array(['Article']) - expect(built_article.rights_holder).to include(/SAGE Publications Inc, unless otherwise noted. Manuscript/) - expect(built_article.rights_statement).to eq('http://rightsstatements.org/vocab/InC/1.0/') - expect(built_article.visibility).to eq('open') + # rubocop:enable Layout/MultilineMethodCallIndentation + + context 'with an ingest work object' do + let(:ingest_work) { JatsIngestWork.new(xml_path: first_xml_path) } + let(:built_article) { service.article_with_metadata(ingest_work) } + let(:user) { FactoryBot.create(:admin) } + + it 'can create a valid article' do + expect do + service.article_with_metadata(ingest_work) + end.to change { Article.count }.by(1) + end + + it 'returns a valid article' do + expect(built_article).to be_instance_of Article + expect(built_article.persisted?).to be true + expect(built_article.valid?).to be true + # These values are also tested via the edit form in spec/features/edit_sage_ingested_works_spec.rb + expect(built_article.title).to eq(['Inequalities in Cervical Cancer Screening Uptake Between Chinese Migrant Women and Local Women: A Cross-Sectional Study']) + first_creator = built_article.creators.find { |creator| creator[:index] == ['1'] } + expect(first_creator.attributes['name']).to match_array(['Holt, Hunter K.']) + expect(first_creator.attributes['other_affiliation']).to match_array(['Department of Family and Community Medicine, University of California, San Francisco, CA, USA']) + expect(first_creator.attributes['orcid']).to match_array(['https://orcid.org/0000-0001-6833-8372']) + expect(built_article.abstract).to include(/Efforts to increase education opportunities, provide insurance/) + expect(built_article.date_issued).to eq('2021-02-01') + expect(built_article.copyright_date).to eq('2021') + expect(built_article.dcmi_type).to match_array(['http://purl.org/dc/dcmitype/Text']) + expect(built_article.funder).to match_array(['Fogarty International Center']) + expect(built_article.identifier).to match_array(['https://doi.org/10.1177/1073274820985792']) + expect(built_article.issn).to match_array(['1073-2748']) + expect(built_article.journal_issue).to be nil + expect(built_article.journal_title).to eq('Cancer Control') + expect(built_article.journal_volume).to eq('28') + expect(built_article.keyword).to match_array(['HPV', 'HPV knowledge and awareness', 'cervical cancer screening', 'migrant women', 'China']) + expect(built_article.license).to match_array(['http://creativecommons.org/licenses/by-nc/4.0/']) + expect(built_article.license_label).to match_array(['Attribution-NonCommercial 4.0 International']) + expect(built_article.publisher).to match_array(['SAGE Publications']) + expect(built_article.resource_type).to match_array(['Article']) + expect(built_article.rights_holder).to include(/SAGE Publications Inc, unless otherwise noted. Manuscript/) + expect(built_article.rights_statement).to eq('http://rightsstatements.org/vocab/InC/1.0/') + expect(built_article.rights_statement_label).to eq('In Copyright') + expect(built_article.visibility).to eq('open') + end + + it 'puts the work in an admin_set' do + expect(built_article.admin_set).to be_instance_of(AdminSet) + expect(built_article.admin_set.title).to eq(admin_set.title) + end + + it 'attaches a pdf file_set to the article' do + service.extract_files(first_zip_path, path_to_tmp) + expect do + service.attach_file_set_to_work(work: built_article, dir: path_to_tmp, file_name: '10.1177_1073274820985792.pdf', user: user, visibility: 'open') + end.to change { FileSet.count }.by(1) + expect(built_article.file_sets).to be_instance_of(Array) + fs = built_article.file_sets.first + expect(fs).to be_instance_of(FileSet) + expect(fs.depositor).to eq(user.uid) + expect(fs.visibility).to eq(built_article.visibility) + expect(fs.parent).to eq(built_article) + end + + it 'attaches an xml file_set to the article' do + service.extract_files(first_zip_path, path_to_tmp) + expect do + service.attach_file_set_to_work(work: built_article, dir: path_to_tmp, file_name: '10.1177_1073274820985792.xml', user: user, visibility: 'restricted') + end.to change { FileSet.count }.by(1) + expect(built_article.file_sets).to be_instance_of(Array) + fs = built_article.file_sets.first + expect(fs).to be_instance_of(FileSet) + expect(fs.depositor).to eq(user.uid) + expect(fs.visibility).to eq('restricted') + expect(fs.parent).to eq(built_article) + end end - it 'puts the work in an admin_set' do - expect(built_article.admin_set).to be_instance_of(AdminSet) - expect(built_article.admin_set.title).to eq(admin_set.title) - end + context 'when it cannot find the depositor' do + before do + # return nil when searching for the depositor + allow(User).to receive(:find_by).with(uid: 'admin').and_return(nil) + end - it 'attaches a file_set to the article' do - pending('Adding file sets to the Article object') - expect(built_article.file_sets).to be_instance_of(Array) - expect(built_article.file_sets.first).to be_instance_of(FileSet) + it 'raises an error' do + expect { described_class.new(configuration_file: path_to_config) }.to raise_error(ActiveRecord::RecordNotFound, 'Could not find User with onyen admin') + end end - end - describe '#extract_files' do - let(:temp_dir) { Dir.mktmpdir } - after do - FileUtils.remove_entry(temp_dir) - end - it 'takes a path to a zip file and a temp directory as arguments' do - service.extract_files(first_zip_path, temp_dir) - expect(Dir.entries(temp_dir)).to match_array(['.', '..', '10.1177_1073274820985792.pdf', '10.1177_1073274820985792.xml']) + context 'when it cannot find the admin_set' do + before do + allow(AdminSet).to receive(:where).with(title: 'sage admin set').and_return(nil) + end + + it 'raises an error' do + expect { described_class.new(configuration_file: path_to_config) }.to raise_error(ActiveRecord::RecordNotFound, 'Could not find AdminSet with title sage admin set') + end end - end - context 'with a package already unzipped' do - it 'can write to the progress log' do - allow(service).to receive(:package_ingest_complete?).and_return(true) - expect(File.size(ingest_progress_log_path)).to eq 0 - service.mark_done(first_package_identifier) - expect(File.read(ingest_progress_log_path).chomp).to eq 'CCX_2021_28_10.1177_1073274820985792' + describe '#extract_files' do + it 'takes a path to a zip file and a temp directory as arguments' do + service.extract_files(first_zip_path, path_to_tmp) + expect(Dir.entries(path_to_tmp)).to match_array(['.', '..', '10.1177_1073274820985792.pdf', '10.1177_1073274820985792.xml']) + end end - end - context 'with an unzipped file already present' do - before do - FileUtils.touch(first_pdf_path) + it 'writes to the log' do + logger = spy('logger') + allow(Logger).to receive(:new).and_return(logger) + described_class.new(configuration_file: path_to_config) + expect(logger).to have_received(:info).with('Beginning Sage ingest') end - after do - FileUtils.rm_rf(Dir["#{path_to_tmp}/*"]) + context 'with a log directory configured' do + it 'can write to the configured log path when it is an absolute path' do + allow(Rails.configuration).to receive(:log_directory).and_return('/some/absolute/path/') + + logger = spy('logger') + allow(Logger).to receive(:new).and_return(logger) + + expect(Logger).to receive(:new).with('/some/absolute/path/sage_ingest.log', { progname: 'Sage ingest' }) + described_class.new(configuration_file: path_to_config) + end + + it 'can write to the configured log path when it is a relative path' do + allow(Rails.configuration).to receive(:log_directory).and_return('some/relative/path/') + + logger = spy('logger') + allow(Logger).to receive(:new).and_return(logger) + + expect(Logger).to receive(:new).with('some/relative/path/sage_ingest.log', { progname: 'Sage ingest' }) + described_class.new(configuration_file: path_to_config) + end end - it 'logs to the rails log' do - allow(Rails.logger).to receive(:info) - service.extract_files(first_zip_path, path_to_tmp) - expect(Rails.logger).to have_received(:info).with("#{first_zip_path}, zip file error: Destination '#{first_pdf_path}' already exists") + context 'with a package including a manifest' do + let(:package_path) { File.join(fixture_path, 'sage', 'AJH_2021_38_4_10.1177_1049909120951088.zip') } + + it 'correctly identifies the manifest and jats xml' do + file_names = service.extract_files(package_path, path_to_tmp).keys + expect(service.jats_xml_path(file_names: file_names, dir: path_to_tmp)).to eq(File.join(path_to_tmp, '10.1177_1049909120951088.xml')) + end end - end - context 'with unexpected contents in the package' do - let(:temp_dir) { Dir.mktmpdir } - let(:package_path) { File.join(fixture_path, 'sage', 'triple_package.zip') } + context 'with unexpected contents in the package' do + let(:path_to_tmp) { Dir.mktmpdir } + let(:package_path) { File.join(fixture_path, 'sage', 'quadruple_package.zip') } - it 'logs an error' do - allow(Rails.logger).to receive(:error) - service.extract_files(package_path, temp_dir) - expect(Rails.logger).to have_received(:error).with("Unexpected package contents - more than two files extracted from #{package_path}") + it 'logs an error' do + logger = spy('logger') + allow(Logger).to receive(:new).and_return(logger) + service.extract_files(package_path, path_to_tmp) + expect(logger).to have_received(:error).with("Unexpected package contents - 4 files extracted from #{package_path}") + end end end end