Skip to content

Commit

Permalink
HYC-1282: Attach PDFs to object ingested through Sage (#746)
Browse files Browse the repository at this point in the history
- If the app can't map the license uri to the controlled vocabulary, log a warning to the Rails log (we still want the object to save / do the rest of the ingest, but we want to know about it so that we can add that license to the controlled vocabulary and add it to the object that's missing it)
- Update the sage logger to log to its own file in the logs directory, which is now configured with sane defaults 
- Raise errors if the admin set or user from the configuration can't be found (the job will fail eventually, might as well fail early) 
- Do not unzip the file within a temp directory block, because then the background jobs won't be able to find it
  - Will need to add these files to the cleanup cron job
- Create the file_set with the PDF and attach it to the work
- Refactor the Sage feature test not to use a before(:all) block - it's less good practice to use :all and it was making mocking things difficult. Found other ways to make test somewhat faster (combined some tests with shared setup)
  • Loading branch information
maxkadel authored Jan 26, 2022
1 parent bda2699 commit 9b4b209
Show file tree
Hide file tree
Showing 16 changed files with 405 additions and 225 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ bundle exec rake sage:ingest[PATH_TO_CONFIG]
bundle exec rake sage:ingest[/hyrax/spec/fixtures/sage/sage_config.yml]
```
* See [example config file](spec/fixtures/sage/sage_config.yml).
* The beginning and ending of the rake task will be output to the console, the remaining events will be logged to the rails log and tagged with "Sage ingest".
* The beginning and ending of the rake task will be output to the console, the remaining events will be logged to the sage ingest log (in the same directory as the Rails log).

#### Testing
##### RSpec Testing
Expand Down
6 changes: 4 additions & 2 deletions app/models/jats_ingest_work.rb
Original file line number Diff line number Diff line change
Expand Up @@ -140,8 +140,10 @@ def keyword

def license
permissions.xpath('.//license/@xlink:href').map do |elem|
CdrLicenseService.authority.find(elem&.inner_text)[:id]
end
controlled_vocab_hash = CdrLicenseService.authority.find(elem&.inner_text)
Rails.logger.warn("Could not match license uri: #{elem&.inner_text} to a license in the controlled vocabulary. Work with DOI #{identifier&.first} may not include the required license.") if controlled_vocab_hash.empty?
controlled_vocab_hash[:id]
end.compact
end

def license_label
Expand Down
97 changes: 73 additions & 24 deletions app/services/tasks/sage_ingest_service.rb
Original file line number Diff line number Diff line change
@@ -1,12 +1,27 @@
module Tasks
require 'tasks/migrate/services/progress_tracker'
class SageIngestService
attr_reader :package_dir, :ingest_progress_log, :admin_set
attr_reader :package_dir, :ingest_progress_log, :admin_set, :depositor

def logger
@logger ||= begin
log_path = File.join(Rails.configuration.log_directory, 'sage_ingest.log')
Logger.new(log_path, progname: 'Sage ingest')
end
end

def initialize(args)
config = YAML.load_file(args[:configuration_file])
# Create temp directory for unzipped contents
@temp = config['unzip_dir']
FileUtils.mkdir_p @temp unless File.exist?(@temp)

logger.info('Beginning Sage ingest')
@admin_set = ::AdminSet.where(title: config['admin_set'])&.first
raise(ActiveRecord::RecordNotFound, "Could not find AdminSet with title #{config['admin_set']}") unless @admin_set.present?

@admin_set = ::AdminSet.where(title: config['admin_set']).first
@depositor = User.find_by(uid: config['depositor_onyen'])
raise(ActiveRecord::RecordNotFound, "Could not find User with onyen #{config['depositor_onyen']}") unless @depositor.present?

@package_dir = config['package_dir']
@ingest_progress_log = Migrate::Services::ProgressTracker.new(config['ingest_progress_log'])
Expand All @@ -15,29 +30,47 @@ def initialize(args)
def process_packages
sage_package_paths = Dir.glob("#{@package_dir}/*.zip").sort
count = sage_package_paths.count
Rails.logger.tagged('Sage ingest') { Rails.logger.info("Beginning ingest of #{count} Sage packages") }
logger.info("Beginning ingest of #{count} Sage packages")
sage_package_paths.each.with_index(1) do |package_path, index|
Rails.logger.tagged('Sage ingest') { Rails.logger.info("Begin processing #{package_path} (#{index} of #{count})") }
logger.info("Begin processing #{package_path} (#{index} of #{count})")
orig_file_name = File.basename(package_path, '.zip')
Dir.mktmpdir do |dir|
file_names = extract_files(package_path, dir).keys
next unless file_names.count == 2

_pdf_file_name = file_names.find { |name| name.match(/^(\S*).pdf/) }
xml_file_name = file_names.find { |name| name.match(/^(\S*).xml/) }
# parse xml
ingest_work = JatsIngestWork.new(xml_path: File.join(dir, xml_file_name))
# Create Article with metadata and save
build_article(ingest_work)
# Add PDF file to Article (including FileSets)
# save object
# set off background jobs for object?
mark_done(orig_file_name) if package_ingest_complete?(dir, file_names)

file_names = extract_files(package_path, @temp).keys
unless file_names.count.between?(2, 3)
logger.info("Error extracting #{package_path}: skipping zip file")
next
end

pdf_file_name = file_names.find { |name| name.match(/^(\S*).pdf/) }

jats_xml_path = jats_xml_path(file_names: file_names, dir: @temp)

# parse xml
ingest_work = JatsIngestWork.new(xml_path: jats_xml_path)
# Create Article with metadata and save
art_with_meta = article_with_metadata(ingest_work)
# Add PDF file to Article (including FileSets)
attach_file_set_to_work(work: art_with_meta, dir: @temp, file_name: pdf_file_name, user: @depositor, visibility: art_with_meta.visibility)
# Add xml metadata file to Article
attach_file_set_to_work(work: art_with_meta, dir: @temp, file_name: jats_xml_file_name(file_names: file_names), user: @depositor, visibility: Hydra::AccessControls::AccessRight::VISIBILITY_TEXT_VALUE_PRIVATE)
mark_done(orig_file_name) if package_ingest_complete?(@temp, file_names)
end
logger.info("Completing ingest of #{count} Sage packages.")
end

def build_article(ingest_work)
def jats_xml_path(file_names:, dir:)
jats_xml_name = jats_xml_file_name(file_names: file_names)

File.join(dir, jats_xml_name)
end

def jats_xml_file_name(file_names:)
file_names -= ['manifest.xml']
file_names.find { |name| name.match(/^(\S*).xml/) }
end

def article_with_metadata(ingest_work)
logger.info("Creating article from DOI: #{ingest_work.identifier}")
art = Article.new
art.admin_set = @admin_set
# required fields
Expand All @@ -63,6 +96,7 @@ def build_article(ingest_work)
art.resource_type = ['Article']
art.rights_holder = ingest_work.rights_holder
art.rights_statement = 'http://rightsstatements.org/vocab/InC/1.0/'
art.rights_statement_label = 'In Copyright'
art.visibility = Hydra::AccessControls::AccessRight::VISIBILITY_TEXT_VALUE_PUBLIC
# fields not normally edited via UI
art.date_uploaded = DateTime.current
Expand All @@ -73,30 +107,45 @@ def build_article(ingest_work)
art
end

def attach_file_set_to_work(work:, dir:, file_name:, user:, visibility:)
file_set_params = { visibility: visibility }
logger.info("Attaching file_set for #{file_name} to DOI: #{work.identifier.first}")
file_set = FileSet.create
actor = Hyrax::Actors::FileSetActor.new(file_set, user)
actor.create_metadata(file_set_params)
file = File.open(File.join(dir, file_name))
actor.create_content(file)
actor.attach_to_work(work, file_set_params)
file.close

file_set
end

def mark_done(orig_file_name)
Rails.logger.tagged('Sage ingest') { Rails.logger.info("Marked package ingest complete #{orig_file_name}") }
logger.info("Marked package ingest complete #{orig_file_name}")
@ingest_progress_log.add_entry(orig_file_name)
end

# TODO: Make more assertions about what a completed ingest looks like and test here
def package_ingest_complete?(dir, file_names)
return true if File.exist?(File.join(dir, file_names.first)) && File.exist?(File.join(dir, file_names.last))

Rails.logger.tagged('Sage ingest') { Rails.logger.error("Package ingest not complete for #{file_names.first} and #{file_names.last}") }
logger.error("Package ingest not complete for #{file_names.first} and #{file_names.last}")
false
end

def extract_files(package_path, temp_dir)
logger.info("Extracting files from #{package_path} to #{temp_dir}")
extracted_files = Zip::File.open(package_path) do |zip_file|
zip_file.each do |file|
file_path = File.join(temp_dir, file.name)
zip_file.extract(file, file_path)
zip_file.extract(file, file_path) unless File.exist?(file_path)
end
end
Rails.logger.tagged('Sage ingest') { Rails.logger.error("Unexpected package contents - more than two files extracted from #{package_path}") } unless extracted_files.count == 2
logger.error("Unexpected package contents - #{extracted_files.count} files extracted from #{package_path}") unless extracted_files.count.between?(2, 3)
extracted_files
rescue Zip::DestinationFileExistsError => e
Rails.logger.tagged('Sage ingest') { Rails.logger.info("#{package_path}, zip file error: #{e.message}") }
logger.info("#{package_path}, zip file error: #{e.message}")
end
end
end
1 change: 1 addition & 0 deletions config/application.rb
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ class Application < Rails::Application
config.log_formatter = proc do |severity, time, _progname, msg|
"#{time} - #{severity}: #{msg}\n"
end
config.log_directory = ENV['LOG_DIRECTORY'] || 'log'
log_path = ENV['LOGS_PATH'] || "log/#{Rails.env}.log"
logger = ActiveSupport::Logger.new(log_path)
logger.formatter = config.log_formatter
Expand Down
3 changes: 3 additions & 0 deletions config/authorities/licenses.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,9 @@ terms:
- id: http://creativecommons.org/licenses/by/3.0/us/
term: Attribution 3.0 United States
active: data
- id: http://creativecommons.org/licenses/by/4.0/
term: Attribution 4.0 International
active: all
- id: http://creativecommons.org/licenses/by-sa/3.0/us/
term: Attribution-ShareAlike 3.0 United States
active: all
Expand Down
2 changes: 2 additions & 0 deletions config/environments/production.rb
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,8 @@
# require 'syslog/logger'
# config.logger = ActiveSupport::TaggedLogging.new(Syslog::Logger.new 'app-name')

config.log_directory = ENV['LOG_DIRECTORY'] || '/logs/hyc/'

if ENV['RAILS_LOG_TO_STDOUT'].present?
logger = ActiveSupport::Logger.new(STDOUT)
logger.formatter = config.log_formatter
Expand Down
3 changes: 2 additions & 1 deletion spec/features/boxc_redirect_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -77,13 +77,14 @@
end

describe '#after_sign_in_path_for' do
let(:admin) { FactoryBot.create(:admin, password: 'password') }
scenario 'visiting a private work page' do
private_article = Article.create(title: ['test article'])

visit "#{ENV['HYRAX_HOST']}/concern/articles/#{private_article.id}"
expect(current_path).to eq '/users/sign_in'

fill_in 'Onyen', with: 'admin'
fill_in 'Onyen', with: admin.uid
fill_in 'Password', with: 'password'
click_button 'Log in'

Expand Down
12 changes: 4 additions & 8 deletions spec/features/create_batch_of_works_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -4,18 +4,14 @@
# testing overrides
RSpec.feature 'Create a batch of works', js: false do
context 'a logged in user' do
let(:user) do
User.new(email: '[email protected]', guest: false, uid: 'test') { |u| u.save!(validate: false) }
end
let(:user) { FactoryBot.create(:user) }

let(:admin_user) do
User.find_by_user_key('admin')
end
let(:admin_user) { FactoryBot.create(:admin) }

scenario 'as a non-admin' do
login_as user
visit root_path
expect(page).to have_content user.uid
expect(page).to have_content user.display_name

within("//ul[@id='user_utility_links']") do
click_link 'Dashboard'
Expand All @@ -33,7 +29,7 @@
scenario 'as an admin' do
login_as admin_user
visit root_path
expect(page).to have_content admin_user.uid
expect(page).to have_content admin_user.display_name

within("//ul[@id='user_utility_links']") do
click_link 'Dashboard'
Expand Down
Loading

0 comments on commit 9b4b209

Please sign in to comment.