Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Make row validation optional, parse XLSX files, add worksheet column to Excel parser #1

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 7 additions & 6 deletions lib/etl/engine.rb
Original file line number Diff line number Diff line change
Expand Up @@ -310,12 +310,13 @@ def process_batch(batch)
def process_control(control)
control = ETL::Control.resolve(control)
say_on_own_line "Processing control #{control.file}"

ETL::Engine.job = ETL::Execution::Job.create!(
:control_file => control.file,
:status => 'executing',
:batch_id => ETL::Engine.batch ? ETL::Engine.batch.id : nil
)

ETL::Engine.job = ETL::Execution::Job.new.tap do |job|
job.control_file = control.file
job.status = 'executing'
job.batch_id = ETL::Engine.batch ? ETL::Engine.batch.id : nil
job.save!
end

execute_dependencies(control)

Expand Down
10 changes: 9 additions & 1 deletion lib/etl/parser/csv_parser.rb
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@ def initialize(source, options={})
super
configure
end

attr_reader :validate_rows

def get_fields_names(file)
File.open(file) do |input|
Expand Down Expand Up @@ -43,7 +45,7 @@ def each
end
line += 1
row = {}
validate_row(raw_row, line, file)
validate_row(raw_row, line, file) if self.validate_rows
raw_row.each_with_index do |value, index|
f = fields[index]
row[f.name] = value
Expand All @@ -70,6 +72,12 @@ def validate_row(row, line, file)
end

def configure
@validate_rows = if source.configuration.has_key?(:validate_rows)
source.configuration[:validate_rows]
else
true
end

source.definition.each do |options|
case options
when Symbol
Expand Down
31 changes: 24 additions & 7 deletions lib/etl/parser/excel_parser.rb
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
optional_require 'spreadsheet'
optional_require 'roo'

module ETL
class Parser
class ExcelParser < ETL::Parser

attr_accessor :ignore_blank_line
attr_accessor :ignore_blank_line, :worksheet_column, :validate_rows

# Initialize the parser
# * <tt>source</tt>: The Source object
Expand All @@ -20,19 +20,29 @@ def each
ETL::Engine.logger.debug "parsing #{file}"
line = 0
lines_skipped = 0
book = Spreadsheet.open file
book = Roo::Spreadsheet.open file
loopworksheets = []

if worksheets.empty?
loopworksheets = book.worksheets
loopworksheets = book.sheets
else
worksheets.each do |index|
loopworksheets << book.worksheet(index)
loopworksheets << book.sheet(index)
end
end

sheet_index = -1

loopworksheets.each do |sheet|
book.each_with_pagename do |name, sheet|
sheet_index += 1
# puts "Sheet: #{name}"
# puts worksheets.inspect
if !worksheets.empty? && !worksheets.include?(sheet_index)
# puts "No!!! #{sheet_index.inspect}"
next
end
sheet.each do |raw_row|
# puts raw_row.inspect
if lines_skipped < source.skip_lines
ETL::Engine.logger.debug "skipping line"
lines_skipped += 1
Expand All @@ -44,11 +54,12 @@ def each
lines_skipped += 1
next
end
validate_row(raw_row, line, file)
validate_row(raw_row, line, file) if self.validate_rows
raw_row.each_with_index do |value, index|
f = fields[index]
row[f.name] = value
end
row[worksheet_column] = name if worksheet_column
yield row
end
end
Expand Down Expand Up @@ -87,6 +98,12 @@ def configure
end unless source.definition[:worksheets].nil?

self.ignore_blank_line = source.definition[:ignore_blank_line]
self.worksheet_column = source.definition[:worksheet_column]
self.validate_rows = if source.configuration.has_key?(:validate_rows)
source.configuration[:validate_rows]
else
true
end

source.definition[:fields].each do |options|
case options
Expand Down
Binary file modified spec/fixtures/data/excel2.xls
Binary file not shown.
5 changes: 4 additions & 1 deletion spec/fixtures/excel2.ctl
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,10 @@ source :in, {
:ssn,
:age,
:sex
]
] #,
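# Optionally add a worksheet column, e.g. when all sheets share the same schema
# but each row must stay attributable to the sheet it came from. The parser
# stores the worksheet name in every row under the column named here: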
# :worksheet_column => :name_info
}

transform :ssn, :sha1
Expand Down