From 0949e0c5dfc8640d4fcaaf3b740f61a5608eebb0 Mon Sep 17 00:00:00 2001 From: Kristina Spurgin Date: Thu, 12 Dec 2024 17:34:59 -0500 Subject: [PATCH] Fix row_count always being 1 when \r used as lineTerminator --- lib/csvlint/validate.rb | 61 +++++++++++++++++++++++++++++++++++++---- spec/validator_spec.rb | 58 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 113 insertions(+), 6 deletions(-) diff --git a/lib/csvlint/validate.rb b/lib/csvlint/validate.rb index 549d1f9..11b8060 100644 --- a/lib/csvlint/validate.rb +++ b/lib/csvlint/validate.rb @@ -67,6 +67,7 @@ def initialize(source, dialect = {}, schema = nil, options = {}) @formats = [] @schema = schema @dialect = dialect + @orig_sep = $/ @csv_header = true @headers = {} @lambda = options[:lambda] @@ -107,10 +108,17 @@ def validate def validate_stream @current_line = 1 + # StringIO.each_line splits on the value of $/ (input record separator), + # which defaults to \n. This is why CSVs with \r EOL character don't get + # lines counted properly? + set_sep(@source) + @source.each_line do |line| break if line_limit_reached? parse_line(line) end + + reset_sep validate_line(@leading, @current_line) unless @leading == "" end @@ -131,10 +139,13 @@ def validate_url request.on_body do |chunk| chunk.force_encoding(Encoding::UTF_8) if chunk.encoding == Encoding::ASCII_8BIT io = StringIO.new(chunk) + set_sep(io) + io.each_line do |line| break if line_limit_reached? parse_line(line) end + reset_sep end request.run # Validate the last line too @@ -144,7 +155,7 @@ def validate_url def parse_line(line) line = @leading + line # Check if the last line is a line break - in which case it's a full line - if line[-1, 1].include?("\n") + if ["\n", "\r"].include?(line[-1, 1]) # If the number of quotes is odd, the linebreak is inside some quotes if line.count(@dialect["quoteChar"]).odd? @leading = line @@ -304,7 +315,7 @@ def header? end def report_line_breaks(line_no = nil) - return unless @input[-1, 1].include?("\n") # Return straight away if there's no newline character - i.e. we're on the last line + return unless ["\r", "\n"].include?(@input[-1, 1]) # Return straight away if there's no newline character - i.e. we're on the last line line_break = get_line_break(@input) @line_breaks << line_break unless line_breaks_reported? @@ -518,6 +529,43 @@ def locate_schema private + def set_sep(source) + $/ = determine_sep(source) + end + + def determine_sep(source) + return explicitly_set_sep if explicitly_set_sep + + src_str = case source + when File + File.read(source.path) + when IO + source.read + when StringIO + source.string + when Tempfile + source.read + else + raise "Unhandled source class: #{source.class}" + end + src_str.include?("\n") ? "\n" : "\r" + end + + def explicitly_set_sep + return unless @dialect + return unless @dialect.key?("lineTerminator") + + sep = @dialect["lineTerminator"] + return unless sep.is_a?(String) + return if sep.empty? + + sep + end + + def reset_sep + $/ = @orig_sep + end + def parse_extension(source) case source when File @@ -589,11 +637,12 @@ def line_limit_reached? end def get_line_break(line) - eol = line.chars.last(2) - if eol.first == "\r" - "\r\n" + eol = line.chars.last(2).join + case eol + when "\r\n" + eol else - "\n" + eol[-1] end end diff --git a/spec/validator_spec.rb b/spec/validator_spec.rb index 06c3380..7aab8b2 100644 --- a/spec/validator_spec.rb +++ b/spec/validator_spec.rb @@ -608,6 +608,64 @@ end end + describe "#row_count" do + let(:validator) { Csvlint::Validator.new(StringIO.new(csv), opts) } + let(:opts) { {} } + let(:result) { validator.row_count } + + context 'with \r\n line terminators and final EOL' do + let(:csv) do + %("Foo","Bar","Baz"\r\n"1","2","3"\r\n"3","2","1"\r\n) + end + + it "should count the total number of rows read" do + expect(validator.row_count).to eq(3) + end + end + + context 'with \r\n line terminators and without final EOL' do + let(:csv) do + %("Foo","Bar","Baz"\r\n"1","2","3"\r\n"3","2","1") + end + + it "should count the total number of rows read" do + expect(validator.row_count).to eq(3) + end + end + + context 'with \n line terminators and final EOL' do + let(:csv) do + %("Foo","Bar","Baz"\n"1","2","3"\n"3","2","1"\n) + end + + it "should count the total number of rows read" do + expect(validator.row_count).to eq(3) + end + end + + context 'with \r line terminators and final EOL' do + let(:csv) do + %("Foo","Bar","Baz"\r"1","2","3"\r"3","2","1"\r) + end + + it "should count the total number of rows read" do + expect(validator.row_count).to eq(3) + end + end + + context 'with \r line terminators and final EOL and lineTerminator ' \ + "specified" do + let(:csv) do + %("Foo","Bar","Baz"\r"1","2","3"\r"3","2","1"\r) + end + let(:opts) { {"lineTerminator" => "\r"} } + + it "should count the total number of rows read" do + expect(validator.row_count).to eq(3) + end + end + end + # Commented out because there is currently no way to mock redirects with Typhoeus and WebMock - see https://github.com/bblimke/webmock/issues/237 # it "should follow redirects to SSL" do # stub_request(:get, "http://example.com/redirect").to_return(:status => 301, :headers=>{"Location" => "https://example.com/example.csv"})