This repository has been archived by the owner on Sep 27, 2022. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy patharchivesspace_checker.rb
226 lines (187 loc) · 7 KB
/
archivesspace_checker.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
# Use the directory containing this file as the application root.
# NOTE(review): this `set` appears before the ArchivesspaceChecker class is
# opened, so it is not evaluated in that class's body — confirm which
# application's settings it is meant to configure.
set :root, File.dirname(__FILE__)
# ArchivesspaceChecker is a Sinatra app that checks uploaded EAD files against a Schematron
class ArchivesspaceChecker < Sinatra::Base
# Site-specific configuration.
#
# Loaded from config/config.yml (resolved against the process's working
# directory) when that file exists. Falls back to an empty Hash when the file
# is absent or when it parses to nil/false (e.g. an empty file).
#
# Uses File.exist? — the File.exists? alias is deprecated and no longer
# available on current Rubies.
CONFIG = if File.exist?(File.join('config', 'config.yml'))
           YAML.safe_load(File.read(File.join('config', 'config.yml'))) || {}
         else
           {}
         end
# Asset pipeline settings. These are assigned before Sinatra::AssetPipeline
# is registered below.
set :assets_precompile, %w[application.js application.css *.png *.jpg]
set :assets_css_compressor, :scss
set :assets_js_compressor, :uglifier

# Sinatra extensions used by the app
register Sinatra::AssetPipeline
register Sinatra::Partial

# Haml output format
set :haml, format: :html5
# Schematron phases supported by included schematron.
#
# Each entry has a display :name and a :value; the "Manual" entry also has
# :checked. The :value strings include their own single quotes (check_file
# compares the submitted phase against "'#ALL'").
PHASE_OPTS = [
  {
    name: "Manual",
    value: "'manual'",
    checked: "checked"
  },
  {
    name: "Automatic",
    value: "'automated'"
  },
  {
    name: "Everything",
    value: "'#ALL'"
  }
]
# Output options, keyed by the file type string used in the result route.
#
# Each entry has :name, :value and the :mime type sent in the Content-Type
# header; the 'xml' entry also has :checked.
OUTPUT_OPTS = {
  'xml' => {
    name: 'xml',
    value: 'xml',
    mime: 'application/xml',
    checked: "checked"
  },
  'csv' => {
    name: 'csv',
    value: 'csv',
    mime: 'text/csv'
  }
}
# Processor configuration
# ========================
# Enable line numbering on the default Saxon processor; check_file reads a
# line number from nodes of the parsed document.
Saxon::Processor.default.config[:line_numbering] = true

# Disable a bunch of stuff in parser to prevent XXE vulnerabilities.
# Maps each parser feature URI to the value it is set to.
parser_options = Saxon::Processor.default.to_java.getUnderlyingConfiguration.parseOptions
xxe_parser_features = {
  "http://apache.org/xml/features/disallow-doctype-decl" => true,
  "http://xml.org/sax/features/external-general-entities" => false,
  "http://xml.org/sax/features/external-parameter-entities" => false
}
xxe_parser_features.each do |feature_uri, enabled|
  parser_options.add_parser_feature(feature_uri, enabled)
end
# The schematron used by the application to check XML.
#
# Read from the path in CONFIG['schematron'] when that key is set, otherwise
# from schematron/archivesspace_checker_sch.xml.
#
# File.read is used instead of IO.read: IO.read treats a path beginning with
# "|" as a command to run, which is not wanted for a value that comes from a
# configuration file.
SCHEMATRON = File.read(CONFIG['schematron'] ||
                       File.join('schematron', 'archivesspace_checker_sch.xml'))

# Default Schematronium instance used for checking files
CHECKER = Schematronium.new(SCHEMATRON)
# A String wrapper carrying an extra "manual" flag, used to attach phase
# information to the rule descriptions.
#
# Anything not defined here is delegated to the wrapped String.
class RuleKeyStr < Delegator
  # Flag setter; read back through #manual?
  attr_writer :manual

  # @param [String] obj internal string instance to delegate to
  def initialize(obj)
    super(obj) # Delegator#initialize hands obj to __setobj__
    @manual = nil
  end

  # Requires manual intervention to fix this error?
  #
  # @return whatever was last assigned through #manual= (nil until then)
  def manual?
    @manual
  end

  # @!visibility private
  # Delegator hook: the object calls are forwarded to
  def __getobj__
    @target
  end

  # @!visibility private
  # Delegator hook: replace the object calls are forwarded to
  def __setobj__(obj)
    @target = obj
  end
end
# Schematron parsed with namespaces removed, so rules can be addressed with
# unprefixed XPath.
stron_xml = Nokogiri::XML(SCHEMATRON).remove_namespaces!

# Collect, per rule: key = stripped text of the comments directly under the
# rule (tagged via #manual= with the match of /-manual\Z/ against the id of
# the nearest enclosing pattern), value = stripped text of each of the rule's
# asserts.
rule_help = stron_xml.xpath('//rule').each_with_object({}) do |rule, collected|
  description = RuleKeyStr.new(rule.xpath('./comment()').text.strip)
  description.manual = rule.ancestors('pattern').first['id'].match(/-manual\Z/)
  collected[description] = rule.xpath('./assert').map(&:text).map(&:strip)
end

# Representation of Schematronium structure used for generating help,
# ordered by rule description.
STRON_REP = rule_help.sort_by { |description, _asserts| description }.to_h
# @!group Helper Methods
# Runs schematron over a particular file
#
# The shared CHECKER is used when phase is "'#ALL'"; for any other phase a
# Schematronium restricted to that phase is constructed for this call.
#
# Every failed-assert and successful-report in the result gets a
# "line-number" attribute, looked up in the parsed input through the
# element's "location" attribute.
#
# @param [File] f a file to check
# @param [String] phase schematron phase to be run
# @return the failed-assert and successful-report elements of the result
#   (documented as Nokogiri::XML::NodeSet by the output helpers)
def check_file(f, phase)
  checker = if phase == "'#ALL'"
              CHECKER
            else
              Schematronium.new(SCHEMATRON, phase)
            end

  source_doc = Saxon.XML(f)
  report = checker.check(source_doc.to_s)
  report.remove_namespaces!

  findings = report.xpath("//failed-assert") + report.xpath("//successful-report")
  findings.each do |finding|
    located = source_doc.xpath(finding.attr("location"))
    finding["line-number"] = located.get_line_number
  end
  findings
end
# Stream XML as generated to out
#
# @param [Nokogiri::XML::NodeSet] xml results from schematron processing
# @param [String] orig_name name of EAD as uploaded
# @param [IO] out stream to write output to
# @return [nil]
def xml_output(xml, orig_name, out)
counts = xml.group_by {|el| el.element_children.first.text.strip.gsub(/\s+/, ' ')}.map {|k,v| [k,v.count]}.to_h
out << "<?xml version=\"1.0\" encoding=\"UTF-8\" ?>\n"
out << "<file file_name='#{orig_name}' total_errors='#{xml.count}'>\n"
out << "<error_counts>\n"
counts.each do |k,v|
out << "<message count='#{v}'>#{k}</message>\n"
end
out << "</error_counts>\n"
out << "<errors>\n"
xml.each do |n|
out << n.to_xml
end
out << "</errors>\n"
out << "</file>"
nil # Return value is not for use
end
# Produce CSV output method
#
# Writes, in order: a "filename,total_errors" header row, a row with the
# uploaded name and the number of results, an empty row, a
# "type,location,line-number,message" header row, and then one row per
# result element.
#
# The summary header is written exactly once. CSV options are passed as
# keywords (**opts), which is required by CSV versions whose generate_line
# accepts keyword options only. An element without a descendant `text`
# element gets an empty message cell.
#
# @param [Nokogiri::XML::NodeSet] xml results from schematron processing
# @param [String] orig_name name of EAD as uploaded
# @param [IO] out stream to write output to
# @return [nil]
def csv_output(xml, orig_name, out)
  opts = {encoding: 'utf-8'}

  out << CSV.generate_line(%w[filename total_errors], **opts)
  out << CSV.generate_line([orig_name, xml.count], **opts)
  out << CSV.generate_line([], **opts)
  out << CSV.generate_line(%w[type location line-number message], **opts)

  xml.each do |el|
    text_node = el.xpath('.//text').first
    message = text_node && text_node.content
    out << CSV.generate_line([el.name, el['location'], el['line-number'], message], **opts)
  end
  nil
end
# @!endgroup
# @!group Routes
# Index route, entry point: renders the :index Haml template, which is the
# tool's UI.
get('/') { haml :index }
# Form submissions post to this route, the response is information on errors
# in XML or CSV
#
# Output is streamed, due to issues with using Nokogiri to build large XML response sets.
#
# Requests are rejected up front instead of failing with a server error:
# * 404 when the :filetype in the path is not a key of OUTPUT_OPTS
# * 400 when no uploaded file is present in the 'eadFile' parameter
#
# When Saxon raises while checking the file, an XML <fatal-error> document is
# returned (regardless of the requested file type); the Saxon message is
# XML-escaped before being embedded in it.
#
# @see #xml_output
# @see #csv_output
post "/result.:filetype" do
  output = OUTPUT_OPTS[params[:filetype]]
  halt 404 unless output

  up = params['eadFile']
  halt 400, "No EAD file was uploaded" unless up.is_a?(Hash) && up[:tempfile]

  # If Saxon throws, set headers and just return the response
  begin
    result_of_check = check_file(up[:tempfile], params[:phase])
  rescue Java::NetSfSaxonS9api::SaxonApiException => e
    headers "Content-Type" => "#{OUTPUT_OPTS['xml'][:mime]}; charset=utf-8"
    # Last three ';'-separated parts of the message, escaped for use as XML text
    detail = e.message.to_s.split(/;/).map(&:strip).last(3).join("\n").encode(xml: :text)
    return <<-ERROR.lines.map(&:lstrip).join
      <?xml version="1.0" encoding="UTF-8"?>
      <fatal-error>
      Possible causes include parse error, DOCTYPE declaration, or entity expansion in the EAD file you're checking. DOCTYPE declarations and entity resolution are disallowed for security reasons.
      Original error message:
      #{detail}
      </fatal-error>
    ERROR
  end

  # Stream because otherwise large XML output will blow up the heap
  headers "Content-Type" => "#{output[:mime]}; charset=utf-8"
  stream do |out|
    case params[:filetype]
    when 'xml'
      xml_output(result_of_check, up[:filename], out)
    when 'csv'
      csv_output(result_of_check, up[:filename], out)
    end
  end
end
# Help page which lists errors that the tool can check for; renders the
# :possible_errors Haml template.
get('/possible-errors') { haml :possible_errors }
# The schematron file
#
# Serves the raw SCHEMATRON text as application/xml. The charset parameter
# uses the registered name "utf-8" rather than the unregistered spelling
# "utf8".
get "/schematron.xml" do
  headers "Content-Type" => "application/xml; charset=utf-8"
  SCHEMATRON
end
# @!endgroup
end