-
Notifications
You must be signed in to change notification settings - Fork 0
/
check_structure.rb
201 lines (161 loc) · 3.99 KB
/
check_structure.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
#!/usr/bin/env ruby
# Hand-rolled because the csv-lint gem reported spurious invalid-encoding errors for US-ASCII files
# and failed to report ragged columns in those same files.
require 'bundler/inline'

gemfile do
  source 'https://rubygems.org'
  gem 'pry'
end

require 'csv'
require 'optparse'
require 'pathname'
require 'pry'
# Command-line options, pre-populated with defaults.
# :suffix always carries its leading dot so that suffix matching cannot
# accept names that merely end in the letters (e.g. "notcsv").
options = {
  delimiter: ',',
  suffix: '.csv'
}

# Delimiter names accepted by -d, mapped to the separator character they stand for.
delimiters = {
  'comma' => ',',
  'pipe' => '|',
  'tab' => "\t"
}.freeze

parser = OptionParser.new do |opts|
  opts.banner = 'Usage: check_structure.rb -i path-to-input-dir -s file_suffix -d delimiter_name -o output_file'

  opts.on('-i', '--input PATH', String, 'Path to input directory containing files') do |i|
    options[:input] = File.expand_path(i)
  end

  opts.on('-s', '--suffix STRING', String, 'File suffix, without dot') do |s|
    # Accept the suffix with or without a leading dot; store it with exactly one.
    options[:suffix] = ".#{s.delete_prefix('.')}"
  end

  opts.on('-d', '--delimiter STRING', String, 'Delimiter name: comma (default), tab, pipe') do |d|
    # abort writes to stderr and exits non-zero, so callers can detect the failure.
    unless delimiters.key?(d)
      abort "#{d} is not an allowed delimiter value. Use one of: #{delimiters.keys.join(', ')}"
    end

    options[:delimiter] = delimiters[d]
  end

  opts.on('-o', '--output PATH', String, 'Path to output CSV file') do |o|
    options[:output] = File.expand_path(o)
  end

  opts.on('-h', '--help', 'Prints this help') do
    puts opts
    exit
  end
end

begin
  parser.parse!
rescue OptionParser::ParseError => e
  # Unknown switch or missing argument: show the problem and the usage text
  # instead of an exception backtrace.
  abort "#{e.message}\n#{parser}"
end
# Checks one delimited file: the first row is treated as the header, and the
# column count of every later row is recorded against the header's count.
class Checker
  attr_reader :filename, :report

  # @param path [String] path to the delimited file to check
  # @param delimiter [String] column separator passed to CSV as col_sep
  def initialize(path, delimiter)
    @path = path
    @filename = Pathname.new(path).basename
    # Without a block CSV.foreach returns an enumerator, so the file is read
    # again from the start each time the enumerator is consumed.
    @rows = CSV.foreach(path, col_sep: delimiter)
    header = @rows.first
    # An empty file has no header row; treat it as zero columns rather than
    # raising NoMethodError on nil.
    @report = RowReport.new(header ? header.length : 0)
  end

  # Feeds every non-header row, with its 0-based position in the file, to the report.
  def check
    @rows.each_with_index do |row, ind|
      next if ind.zero? # header row defines the expected width; it is not itself recorded

      report.record(row, ind)
    end
  end

  # @return [Boolean] whatever the underlying report's ok? returns
  def ok?
    report.ok?
  end
end
# Tally of rows sharing one unexpected column count, plus a few sample rows.
class RaggedReport
  # Number of example rows kept; once this many are held, further rows are only counted.
  Sufficient_Examples = 3

  attr_reader :count, :examples

  def initialize
    @examples = {}
    @count = 0
  end

  # Counts the row and, while the example quota is not yet met, stores it
  # keyed by its 1-based row number (ind + 1).
  def add(row, ind)
    @count += 1
    remember(row, ind + 1) unless quota_met?
  end

  private

  def remember(row, row_num)
    @examples[row_num] = row
  end

  def quota_met?
    @examples.size == Sufficient_Examples
  end
end
# Per-file tally: how many rows match the header's column count, and a
# RaggedReport for each distinct non-matching column count.
class RowReport
  attr_reader :header_ct, :correct, :ragged

  # @param header_ct [Integer] column count every row is compared against
  def initialize(header_ct)
    @header_ct = header_ct
    @correct = 0
    @ragged = {}
  end

  # True while no row with a differing column count has been recorded.
  def ok?
    ragged.empty?
  end

  # Classifies one row by its width: bumps the correct counter on a match,
  # otherwise hands the row to the RaggedReport for that width.
  def record(row, ind)
    width = row.length
    if width == header_ct
      @correct += 1
    else
      ragged_report_for(width).add(row, ind)
    end
  end

  private

  # Returns the RaggedReport for this width, creating it on first use.
  def ragged_report_for(width)
    @ragged[width] ||= RaggedReport.new
  end
end
# CSV report accumulated across files: a header line written on creation,
# then one or more lines appended per checked file.
class CumulativeReport
  attr_reader :path

  # Creates (or truncates) the file at path and writes the header line.
  def initialize(path)
    @path = path
    CSV.open(path, 'w') do |csv|
      csv << %w[filename ok? col_ct occurrences example_rows]
    end
  end

  # Appends the lines describing one checker's results to the report file.
  def add_file_info(checker)
    CSV.open(path, 'a') do |csv|
      rows_for(checker).each { |line| csv << line }
    end
  end

  private

  # A clean file gets only its summary line; otherwise the summary line is
  # followed by one line per ragged column count.
  def rows_for(checker)
    return [summary_row(checker)] if checker.ok?

    [summary_row(checker)] + ragged_rows(checker)
  end

  # Line for the header-width rows: expected column count and how many rows matched it.
  def summary_row(checker)
    tally = checker.report
    verdict = checker.ok? ? 'y' : 'n'
    [checker.filename, verdict, tally.header_ct, tally.correct, nil]
  end

  # One line per unexpected column count: occurrences and the example row numbers.
  def ragged_rows(checker)
    checker.report.ragged.map do |col_ct, details|
      [checker.filename, 'n', col_ct, details.count, details.examples.keys.join(', ')]
    end
  end
end
# Both paths are required; fail fast with a readable message instead of a
# TypeError raised from inside CSV.open or Dir.children.
unless options[:input] && options[:output]
  abort 'Both -i (input directory) and -o (output file) are required. Run with -h for usage.'
end
abort "#{options[:input]} is not a directory" unless File.directory?(options[:input])

# File names are downcased before comparison, so the suffix must be too;
# otherwise "-s CSV" would never match anything.
suffix = options[:suffix].downcase

files = Dir.children(options[:input])
           .select { |name| name.downcase.end_with?(suffix) }
           .map { |name| File.join(options[:input], name) }
           .select { |file| File.file?(file) }            # skip directories whose names happen to match
           .reject { |file| file == options[:output] }    # never check the report being written
           .sort                                          # Dir.children order is OS-dependent

report = CumulativeReport.new(options[:output])

files.each do |file|
  checker = Checker.new(file, options[:delimiter])
  checker.check
  report.add_file_info(checker)
end