From c05b6b3bcd89639206c116e7f6e7c1fbf6f27d9c Mon Sep 17 00:00:00 2001 From: longqm Date: Fri, 13 Sep 2024 11:58:21 +0800 Subject: [PATCH] CSVRow::current_row_start(): track row start position of input stream --- include/internal/basic_csv_parser.cpp | 1 + include/internal/basic_csv_parser.hpp | 1 + include/internal/csv_row.hpp | 6 + single_include/csv.hpp | 1034 +++++++++++++------------ single_include_test/csv.hpp | 1034 +++++++++++++------------ tests/CMakeLists.txt | 1 + tests/test_row_start_position.cpp | 57 ++ 7 files changed, 1108 insertions(+), 1026 deletions(-) create mode 100644 tests/test_row_start_position.cpp diff --git a/include/internal/basic_csv_parser.cpp b/include/internal/basic_csv_parser.cpp index 61b8f358..ff039093 100644 --- a/include/internal/basic_csv_parser.cpp +++ b/include/internal/basic_csv_parser.cpp @@ -235,6 +235,7 @@ namespace csv { this->field_start = UNINITIALIZED_FIELD; this->field_length = 0; this->reset_data_ptr(); + this->data_ptr->_stream_pos = this->mmap_pos; // Create memory map size_t length = std::min(this->source_size - this->mmap_pos, bytes); diff --git a/include/internal/basic_csv_parser.hpp b/include/internal/basic_csv_parser.hpp index d76b2d9e..b245e9a2 100644 --- a/include/internal/basic_csv_parser.hpp +++ b/include/internal/basic_csv_parser.hpp @@ -321,6 +321,7 @@ namespace csv { if (this->eof()) return; this->reset_data_ptr(); + this->data_ptr->_stream_pos = this->stream_pos; this->data_ptr->_data = std::make_shared(); if (source_size == 0) { diff --git a/include/internal/csv_row.hpp b/include/internal/csv_row.hpp index bd92c0ae..f0acd89e 100644 --- a/include/internal/csv_row.hpp +++ b/include/internal/csv_row.hpp @@ -130,6 +130,9 @@ namespace csv { internals::ColNamesPtr col_names = nullptr; internals::ParseFlagMap parse_flags; internals::WhitespaceMap ws_flags; + + /** where in Stream we start */ + uint64_t _stream_pos = {}; }; using RawCSVDataPtr = std::shared_ptr; @@ -318,6 +321,9 @@ namespace csv { /** Return the number of fields in this row */ CONSTEXPR size_t size() const noexcept { return row_length; } + /** Where in the Stream we start */ + size_t current_row_start() const { return data->_stream_pos + data_start; } + /** @name Value Retrieval */ ///@{ CSVField operator[](size_t n) const; diff --git a/single_include/csv.hpp b/single_include/csv.hpp index 83f64ee3..1886c0a7 100644 --- a/single_include/csv.hpp +++ b/single_include/csv.hpp @@ -5525,6 +5525,9 @@ namespace csv { internals::ColNamesPtr col_names = nullptr; internals::ParseFlagMap parse_flags; internals::WhitespaceMap ws_flags; + + /** where in Stream we start */ + uint64_t _stream_pos = {}; }; using RawCSVDataPtr = std::shared_ptr; @@ -5713,6 +5716,9 @@ namespace csv { /** Return the number of fields in this row */ CONSTEXPR size_t size() const noexcept { return row_length; } + /** Where in the Stream we start */ + size_t current_row_start() const { return data->_stream_pos + data_start; } + /** @name Value Retrieval */ ///@{ CSVField operator[](size_t n) const; @@ -6154,6 +6160,7 @@ namespace csv { if (this->eof()) return; this->reset_data_ptr(); + this->data_ptr->_stream_pos = this->stream_pos; this->data_ptr->_data = std::make_shared(); if (source_size == 0) { @@ -6937,591 +6944,325 @@ namespace csv { namespace csv { namespace internals { - CSV_INLINE size_t get_file_size(csv::string_view filename) { - std::ifstream infile(std::string(filename), std::ios::binary); - const auto start = infile.tellg(); - infile.seekg(0, std::ios::end); - const auto end = infile.tellg(); - - return end - start; + CSV_INLINE std::vector ColNames::get_col_names() const { + return this->col_names; } - CSV_INLINE std::string get_csv_head(csv::string_view filename) { - return get_csv_head(filename, get_file_size(filename)); - } + CSV_INLINE void ColNames::set_col_names(const std::vector& cnames) { + this->col_names = cnames; - CSV_INLINE std::string get_csv_head(csv::string_view filename, size_t file_size) { - const size_t bytes = 500000; + for (size_t i = 0; i < cnames.size(); i++) { + this->col_pos[cnames[i]] = i; + } + } - std::error_code error; - size_t length = std::min((size_t)file_size, bytes); - auto mmap = mio::make_mmap_source(std::string(filename), 0, length, error); + CSV_INLINE int ColNames::index_of(csv::string_view col_name) const { + auto pos = this->col_pos.find(col_name.data()); + if (pos != this->col_pos.end()) + return (int)pos->second; - if (error) { - throw std::runtime_error("Cannot open file " + std::string(filename)); - } + return CSV_NOT_FOUND; + } - return std::string(mmap.begin(), mmap.end()); + CSV_INLINE size_t ColNames::size() const noexcept { + return this->col_names.size(); } -#ifdef _MSC_VER -#pragma region IBasicCVParser -#endif - CSV_INLINE IBasicCSVParser::IBasicCSVParser( - const CSVFormat& format, - const ColNamesPtr& col_names - ) : _col_names(col_names) { - if (format.no_quote) { - _parse_flags = internals::make_parse_flags(format.get_delim()); - } - else { - _parse_flags = internals::make_parse_flags(format.get_delim(), format.quote_char); - } + } +} +/** @file + * Defines an object used to store CSV format settings + */ - _ws_flags = internals::make_ws_flags( - format.trim_chars.data(), format.trim_chars.size() - ); - } +#include +#include - CSV_INLINE void IBasicCSVParser::end_feed() { - using internals::ParseFlags; - bool empty_last_field = this->data_ptr - && this->data_ptr->_data - && !this->data_ptr->data.empty() - && (parse_flag(this->data_ptr->data.back()) == ParseFlags::DELIMITER - || parse_flag(this->data_ptr->data.back()) == ParseFlags::QUOTE); +namespace csv { + CSV_INLINE CSVFormat& CSVFormat::delimiter(char delim) { + this->possible_delimiters = { delim }; + this->assert_no_char_overlap(); + return *this; + } - // Push field - if (this->field_length > 0 || empty_last_field) { - this->push_field(); - } + CSV_INLINE CSVFormat& CSVFormat::delimiter(const std::vector & delim) { + this->possible_delimiters = delim; + this->assert_no_char_overlap(); + return *this; + } - // Push row - if (this->current_row.size() > 0) - this->push_row(); - } + CSV_INLINE CSVFormat& CSVFormat::quote(char quote) { + this->no_quote = false; + this->quote_char = quote; + this->assert_no_char_overlap(); + return *this; + } - CSV_INLINE void IBasicCSVParser::parse_field() noexcept { - using internals::ParseFlags; - auto& in = this->data_ptr->data; + CSV_INLINE CSVFormat& CSVFormat::trim(const std::vector & chars) { + this->trim_chars = chars; + this->assert_no_char_overlap(); + return *this; + } - // Trim off leading whitespace - while (data_pos < in.size() && ws_flag(in[data_pos])) - data_pos++; + CSV_INLINE CSVFormat& CSVFormat::column_names(const std::vector& names) { + this->col_names = names; + this->header = -1; + return *this; + } - if (field_start == UNINITIALIZED_FIELD) - field_start = (int)(data_pos - current_row_start()); + CSV_INLINE CSVFormat& CSVFormat::header_row(int row) { + if (row < 0) this->variable_column_policy = VariableColumnPolicy::KEEP; - // Optimization: Since NOT_SPECIAL characters tend to occur in contiguous - // sequences, use the loop below to avoid having to go through the outer - // switch statement as much as possible - while (data_pos < in.size() && compound_parse_flag(in[data_pos]) == ParseFlags::NOT_SPECIAL) - data_pos++; + this->header = row; + this->col_names = {}; + return *this; + } - field_length = data_pos - (field_start + current_row_start()); + CSV_INLINE void CSVFormat::assert_no_char_overlap() + { + auto delims = std::set( + this->possible_delimiters.begin(), this->possible_delimiters.end()), + trims = std::set( + this->trim_chars.begin(), this->trim_chars.end()); - // Trim off trailing whitespace, this->field_length constraint matters - // when field is entirely whitespace - for (size_t j = data_pos - 1; ws_flag(in[j]) && this->field_length > 0; j--) - this->field_length--; + // Stores intersection of possible delimiters and trim characters + std::vector intersection = {}; + + // Find which characters overlap, if any + std::set_intersection( + delims.begin(), delims.end(), + trims.begin(), trims.end(), + std::back_inserter(intersection)); + + // Make sure quote character is not contained in possible delimiters + // or whitespace characters + if (delims.find(this->quote_char) != delims.end() || + trims.find(this->quote_char) != trims.end()) { + intersection.push_back(this->quote_char); } - CSV_INLINE void IBasicCSVParser::push_field() - { - // Update - if (field_has_double_quote) { - fields->emplace_back( - field_start == UNINITIALIZED_FIELD ? 0 : (unsigned int)field_start, - field_length, - true - ); - field_has_double_quote = false; + if (!intersection.empty()) { + std::string err_msg = "There should be no overlap between the quote character, " + "the set of possible delimiters " + "and the set of whitespace characters. Offending characters: "; - } - else { - fields->emplace_back( - field_start == UNINITIALIZED_FIELD ? 0 : (unsigned int)field_start, - field_length - ); - } + // Create a pretty error message with the list of overlapping + // characters + for (size_t i = 0; i < intersection.size(); i++) { + err_msg += "'"; + err_msg += intersection[i]; + err_msg += "'"; - current_row.row_length++; + if (i + 1 < intersection.size()) + err_msg += ", "; + } - // Reset field state - field_start = UNINITIALIZED_FIELD; - field_length = 0; + throw std::runtime_error(err_msg + '.'); } + } +} +/** @file + * @brief Defines functionality needed for basic CSV parsing + */ - /** @return The number of characters parsed that belong to complete rows */ - CSV_INLINE size_t IBasicCSVParser::parse() - { - using internals::ParseFlags; - this->quote_escape = false; - this->data_pos = 0; - this->current_row_start() = 0; - this->trim_utf8_bom(); +namespace csv { + namespace internals { + CSV_INLINE std::string format_row(const std::vector& row, csv::string_view delim) { + /** Print a CSV row */ + std::stringstream ret; + for (size_t i = 0; i < row.size(); i++) { + ret << row[i]; + if (i + 1 < row.size()) ret << delim; + else ret << '\n'; + } + ret.flush(); - auto& in = this->data_ptr->data; - while (this->data_pos < in.size()) { - switch (compound_parse_flag(in[this->data_pos])) { - case ParseFlags::DELIMITER: - this->push_field(); - this->data_pos++; - break; + return ret.str(); + } - case ParseFlags::NEWLINE: - this->data_pos++; + /** Return a CSV's column names + * + * @param[in] filename Path to CSV file + * @param[in] format Format of the CSV file + * + */ + CSV_INLINE std::vector _get_col_names(csv::string_view head, CSVFormat format) { + // Parse the CSV + auto trim_chars = format.get_trim_chars(); + std::stringstream source(head.data()); + RowCollection rows; - // Catches CRLF (or LFLF, CRCRLF, or any other non-sensical combination of newlines) - while (this->data_pos < in.size() && parse_flag(in[this->data_pos]) == ParseFlags::NEWLINE) - this->data_pos++; + StreamParser parser(source, format); + parser.set_output(rows); + parser.next(); - // End of record -> Write record - this->push_field(); - this->push_row(); + return CSVRow(std::move(rows[format.get_header()])); + } - // Reset - this->current_row = CSVRow(data_ptr, this->data_pos, fields->size()); - break; + CSV_INLINE GuessScore calculate_score(csv::string_view head, CSVFormat format) { + // Frequency counter of row length + std::unordered_map row_tally = { { 0, 0 } }; - case ParseFlags::NOT_SPECIAL: - this->parse_field(); - break; + // Map row lengths to row num where they first occurred + std::unordered_map row_when = { { 0, 0 } }; - case ParseFlags::QUOTE_ESCAPE_QUOTE: - if (data_pos + 1 == in.size()) return this->current_row_start(); - else if (data_pos + 1 < in.size()) { - auto next_ch = parse_flag(in[data_pos + 1]); - if (next_ch >= ParseFlags::DELIMITER) { - quote_escape = false; - data_pos++; - break; - } - else if (next_ch == ParseFlags::QUOTE) { - // Case: Escaped quote - data_pos += 2; - this->field_length += 2; - this->field_has_double_quote = true; - break; - } - } - - // Case: Unescaped single quote => not strictly valid but we'll keep it - this->field_length++; - data_pos++; - - break; + // Parse the CSV + std::stringstream source(head.data()); + RowCollection rows; - default: // Quote (currently not quote escaped) - if (this->field_length == 0) { - quote_escape = true; - data_pos++; - if (field_start == UNINITIALIZED_FIELD && data_pos < in.size() && !ws_flag(in[data_pos])) - field_start = (int)(data_pos - current_row_start()); - break; - } + StreamParser parser(source, format); + parser.set_output(rows); + parser.next(); - // Case: Unescaped quote - this->field_length++; - data_pos++; + for (size_t i = 0; i < rows.size(); i++) { + auto& row = rows[i]; - break; + // Ignore zero-length rows + if (row.size() > 0) { + if (row_tally.find(row.size()) != row_tally.end()) { + row_tally[row.size()]++; + } + else { + row_tally[row.size()] = 1; + row_when[row.size()] = i; + } } } - return this->current_row_start(); - } - - CSV_INLINE void IBasicCSVParser::push_row() { - current_row.row_length = fields->size() - current_row.fields_start; - this->_records->push_back(std::move(current_row)); - } - - CSV_INLINE void IBasicCSVParser::reset_data_ptr() { - this->data_ptr = std::make_shared(); - this->data_ptr->parse_flags = this->_parse_flags; - this->data_ptr->col_names = this->_col_names; - this->fields = &(this->data_ptr->fields); - } - - CSV_INLINE void IBasicCSVParser::trim_utf8_bom() { - auto& data = this->data_ptr->data; + double final_score = 0; + size_t header_row = 0; - if (!this->unicode_bom_scan && data.size() >= 3) { - if (data[0] == '\xEF' && data[1] == '\xBB' && data[2] == '\xBF') { - this->data_pos += 3; // Remove BOM from input string - this->_utf8_bom = true; + // Final score is equal to the largest + // row size times rows of that size + for (auto& pair : row_tally) { + auto row_size = pair.first; + auto row_count = pair.second; + double score = (double)(row_size * row_count); + if (score > final_score) { + final_score = score; + header_row = row_when[row_size]; } - - this->unicode_bom_scan = true; } - } -#ifdef _MSC_VER -#pragma endregion -#endif - -#ifdef _MSC_VER -#pragma region Specializations -#endif - CSV_INLINE void MmapParser::next(size_t bytes = ITERATION_CHUNK_SIZE) { - // Reset parser state - this->field_start = UNINITIALIZED_FIELD; - this->field_length = 0; - this->reset_data_ptr(); - // Create memory map - size_t length = std::min(this->source_size - this->mmap_pos, bytes); - std::error_code error; - this->data_ptr->_data = std::make_shared>(mio::make_mmap_source(this->_filename, this->mmap_pos, length, error)); - this->mmap_pos += length; - if (error) throw error; + return { + final_score, + header_row + }; + } - auto mmap_ptr = (mio::basic_mmap_source*)(this->data_ptr->_data.get()); + /** Guess the delimiter used by a delimiter-separated values file */ + CSV_INLINE CSVGuessResult _guess_format(csv::string_view head, const std::vector& delims) { + /** For each delimiter, find out which row length was most common. + * The delimiter with the longest mode row length wins. + * Then, the line number of the header row is the first row with + * the mode row length. + */ - // Create string view - this->data_ptr->data = csv::string_view(mmap_ptr->data(), mmap_ptr->length()); + CSVFormat format; + size_t max_score = 0, + header = 0; + char current_delim = delims[0]; - // Parse - this->current_row = CSVRow(this->data_ptr); - size_t remainder = this->parse(); + for (char cand_delim : delims) { + auto result = calculate_score(head, format.delimiter(cand_delim)); - if (this->mmap_pos == this->source_size || no_chunk()) { - this->_eof = true; - this->end_feed(); + if ((size_t)result.score > max_score) { + max_score = (size_t)result.score; + current_delim = cand_delim; + header = result.header; + } } - this->mmap_pos -= (length - remainder); + return { current_delim, (int)header }; } -#ifdef _MSC_VER -#pragma endregion -#endif } -} + /** Return a CSV's column names + * + * @param[in] filename Path to CSV file + * @param[in] format Format of the CSV file + * + */ + CSV_INLINE std::vector get_col_names(csv::string_view filename, CSVFormat format) { + auto head = internals::get_csv_head(filename); -namespace csv { - namespace internals { - CSV_INLINE std::vector ColNames::get_col_names() const { - return this->col_names; + /** Guess delimiter and header row */ + if (format.guess_delim()) { + auto guess_result = guess_format(filename, format.get_possible_delims()); + format.delimiter(guess_result.delim).header_row(guess_result.header_row); } - CSV_INLINE void ColNames::set_col_names(const std::vector& cnames) { - this->col_names = cnames; + return internals::_get_col_names(head, format); + } - for (size_t i = 0; i < cnames.size(); i++) { - this->col_pos[cnames[i]] = i; - } - } + /** Guess the delimiter used by a delimiter-separated values file */ + CSV_INLINE CSVGuessResult guess_format(csv::string_view filename, const std::vector& delims) { + auto head = internals::get_csv_head(filename); + return internals::_guess_format(head, delims); + } - CSV_INLINE int ColNames::index_of(csv::string_view col_name) const { - auto pos = this->col_pos.find(col_name.data()); - if (pos != this->col_pos.end()) - return (int)pos->second; + /** Reads an arbitrarily large CSV file using memory-mapped IO. + * + * **Details:** Reads the first block of a CSV file synchronously to get information + * such as column names and delimiting character. + * + * @param[in] filename Path to CSV file + * @param[in] format Format of the CSV file + * + * \snippet tests/test_read_csv.cpp CSVField Example + * + */ + CSV_INLINE CSVReader::CSVReader(csv::string_view filename, CSVFormat format) : _format(format) { + auto head = internals::get_csv_head(filename); + using Parser = internals::MmapParser; - return CSV_NOT_FOUND; + /** Guess delimiter and header row */ + if (format.guess_delim()) { + auto guess_result = internals::_guess_format(head, format.possible_delimiters); + format.delimiter(guess_result.delim); + format.header = guess_result.header_row; + this->_format = format; } - CSV_INLINE size_t ColNames::size() const noexcept { - return this->col_names.size(); - } + if (!format.col_names.empty()) + this->set_col_names(format.col_names); + this->parser = std::unique_ptr(new Parser(filename, format, this->col_names)); // For C++11 + this->initial_read(); } -} -/** @file - * Defines an object used to store CSV format settings - */ -#include -#include + /** Return the format of the original raw CSV */ + CSV_INLINE CSVFormat CSVReader::get_format() const { + CSVFormat new_format = this->_format; + // Since users are normally not allowed to set + // column names and header row simulatenously, + // we will set the backing variables directly here + new_format.col_names = this->col_names->get_col_names(); + new_format.header = this->_format.header; -namespace csv { - CSV_INLINE CSVFormat& CSVFormat::delimiter(char delim) { - this->possible_delimiters = { delim }; - this->assert_no_char_overlap(); - return *this; + return new_format; } - CSV_INLINE CSVFormat& CSVFormat::delimiter(const std::vector & delim) { - this->possible_delimiters = delim; - this->assert_no_char_overlap(); - return *this; - } + /** Return the CSV's column names as a vector of strings. */ + CSV_INLINE std::vector CSVReader::get_col_names() const { + if (this->col_names) { + return this->col_names->get_col_names(); + } - CSV_INLINE CSVFormat& CSVFormat::quote(char quote) { - this->no_quote = false; - this->quote_char = quote; - this->assert_no_char_overlap(); - return *this; + return std::vector(); } - CSV_INLINE CSVFormat& CSVFormat::trim(const std::vector & chars) { - this->trim_chars = chars; - this->assert_no_char_overlap(); - return *this; - } + /** Return the index of the column name if found or + * csv::CSV_NOT_FOUND otherwise. + */ + CSV_INLINE int CSVReader::index_of(csv::string_view col_name) const { + auto _col_names = this->get_col_names(); + for (size_t i = 0; i < _col_names.size(); i++) + if (_col_names[i] == col_name) return (int)i; - CSV_INLINE CSVFormat& CSVFormat::column_names(const std::vector& names) { - this->col_names = names; - this->header = -1; - return *this; - } - - CSV_INLINE CSVFormat& CSVFormat::header_row(int row) { - if (row < 0) this->variable_column_policy = VariableColumnPolicy::KEEP; - - this->header = row; - this->col_names = {}; - return *this; - } - - CSV_INLINE void CSVFormat::assert_no_char_overlap() - { - auto delims = std::set( - this->possible_delimiters.begin(), this->possible_delimiters.end()), - trims = std::set( - this->trim_chars.begin(), this->trim_chars.end()); - - // Stores intersection of possible delimiters and trim characters - std::vector intersection = {}; - - // Find which characters overlap, if any - std::set_intersection( - delims.begin(), delims.end(), - trims.begin(), trims.end(), - std::back_inserter(intersection)); - - // Make sure quote character is not contained in possible delimiters - // or whitespace characters - if (delims.find(this->quote_char) != delims.end() || - trims.find(this->quote_char) != trims.end()) { - intersection.push_back(this->quote_char); - } - - if (!intersection.empty()) { - std::string err_msg = "There should be no overlap between the quote character, " - "the set of possible delimiters " - "and the set of whitespace characters. Offending characters: "; - - // Create a pretty error message with the list of overlapping - // characters - for (size_t i = 0; i < intersection.size(); i++) { - err_msg += "'"; - err_msg += intersection[i]; - err_msg += "'"; - - if (i + 1 < intersection.size()) - err_msg += ", "; - } - - throw std::runtime_error(err_msg + '.'); - } - } -} -/** @file - * @brief Defines functionality needed for basic CSV parsing - */ - - -namespace csv { - namespace internals { - CSV_INLINE std::string format_row(const std::vector& row, csv::string_view delim) { - /** Print a CSV row */ - std::stringstream ret; - for (size_t i = 0; i < row.size(); i++) { - ret << row[i]; - if (i + 1 < row.size()) ret << delim; - else ret << '\n'; - } - ret.flush(); - - return ret.str(); - } - - /** Return a CSV's column names - * - * @param[in] filename Path to CSV file - * @param[in] format Format of the CSV file - * - */ - CSV_INLINE std::vector _get_col_names(csv::string_view head, CSVFormat format) { - // Parse the CSV - auto trim_chars = format.get_trim_chars(); - std::stringstream source(head.data()); - RowCollection rows; - - StreamParser parser(source, format); - parser.set_output(rows); - parser.next(); - - return CSVRow(std::move(rows[format.get_header()])); - } - - CSV_INLINE GuessScore calculate_score(csv::string_view head, CSVFormat format) { - // Frequency counter of row length - std::unordered_map row_tally = { { 0, 0 } }; - - // Map row lengths to row num where they first occurred - std::unordered_map row_when = { { 0, 0 } }; - - // Parse the CSV - std::stringstream source(head.data()); - RowCollection rows; - - StreamParser parser(source, format); - parser.set_output(rows); - parser.next(); - - for (size_t i = 0; i < rows.size(); i++) { - auto& row = rows[i]; - - // Ignore zero-length rows - if (row.size() > 0) { - if (row_tally.find(row.size()) != row_tally.end()) { - row_tally[row.size()]++; - } - else { - row_tally[row.size()] = 1; - row_when[row.size()] = i; - } - } - } - - double final_score = 0; - size_t header_row = 0; - - // Final score is equal to the largest - // row size times rows of that size - for (auto& pair : row_tally) { - auto row_size = pair.first; - auto row_count = pair.second; - double score = (double)(row_size * row_count); - if (score > final_score) { - final_score = score; - header_row = row_when[row_size]; - } - } - - return { - final_score, - header_row - }; - } - - /** Guess the delimiter used by a delimiter-separated values file */ - CSV_INLINE CSVGuessResult _guess_format(csv::string_view head, const std::vector& delims) { - /** For each delimiter, find out which row length was most common. - * The delimiter with the longest mode row length wins. - * Then, the line number of the header row is the first row with - * the mode row length. - */ - - CSVFormat format; - size_t max_score = 0, - header = 0; - char current_delim = delims[0]; - - for (char cand_delim : delims) { - auto result = calculate_score(head, format.delimiter(cand_delim)); - - if ((size_t)result.score > max_score) { - max_score = (size_t)result.score; - current_delim = cand_delim; - header = result.header; - } - } - - return { current_delim, (int)header }; - } - } - - /** Return a CSV's column names - * - * @param[in] filename Path to CSV file - * @param[in] format Format of the CSV file - * - */ - CSV_INLINE std::vector get_col_names(csv::string_view filename, CSVFormat format) { - auto head = internals::get_csv_head(filename); - - /** Guess delimiter and header row */ - if (format.guess_delim()) { - auto guess_result = guess_format(filename, format.get_possible_delims()); - format.delimiter(guess_result.delim).header_row(guess_result.header_row); - } - - return internals::_get_col_names(head, format); - } - - /** Guess the delimiter used by a delimiter-separated values file */ - CSV_INLINE CSVGuessResult guess_format(csv::string_view filename, const std::vector& delims) { - auto head = internals::get_csv_head(filename); - return internals::_guess_format(head, delims); - } - - /** Reads an arbitrarily large CSV file using memory-mapped IO. - * - * **Details:** Reads the first block of a CSV file synchronously to get information - * such as column names and delimiting character. - * - * @param[in] filename Path to CSV file - * @param[in] format Format of the CSV file - * - * \snippet tests/test_read_csv.cpp CSVField Example - * - */ - CSV_INLINE CSVReader::CSVReader(csv::string_view filename, CSVFormat format) : _format(format) { - auto head = internals::get_csv_head(filename); - using Parser = internals::MmapParser; - - /** Guess delimiter and header row */ - if (format.guess_delim()) { - auto guess_result = internals::_guess_format(head, format.possible_delimiters); - format.delimiter(guess_result.delim); - format.header = guess_result.header_row; - this->_format = format; - } - - if (!format.col_names.empty()) - this->set_col_names(format.col_names); - - this->parser = std::unique_ptr(new Parser(filename, format, this->col_names)); // For C++11 - this->initial_read(); - } - - /** Return the format of the original raw CSV */ - CSV_INLINE CSVFormat CSVReader::get_format() const { - CSVFormat new_format = this->_format; - - // Since users are normally not allowed to set - // column names and header row simulatenously, - // we will set the backing variables directly here - new_format.col_names = this->col_names->get_col_names(); - new_format.header = this->_format.header; - - return new_format; - } - - /** Return the CSV's column names as a vector of strings. */ - CSV_INLINE std::vector CSVReader::get_col_names() const { - if (this->col_names) { - return this->col_names->get_col_names(); - } - - return std::vector(); - } - - /** Return the index of the column name if found or - * csv::CSV_NOT_FOUND otherwise. - */ - CSV_INLINE int CSVReader::index_of(csv::string_view col_name) const { - auto _col_names = this->get_col_names(); - for (size_t i = 0; i < _col_names.size(); i++) - if (_col_names[i] == col_name) return (int)i; - - return CSV_NOT_FOUND; + return CSV_NOT_FOUND; } CSV_INLINE void CSVReader::trim_header() { @@ -8575,5 +8316,272 @@ namespace csv { } } +namespace csv { + namespace internals { + CSV_INLINE size_t get_file_size(csv::string_view filename) { + std::ifstream infile(std::string(filename), std::ios::binary); + const auto start = infile.tellg(); + infile.seekg(0, std::ios::end); + const auto end = infile.tellg(); + + return end - start; + } + + CSV_INLINE std::string get_csv_head(csv::string_view filename) { + return get_csv_head(filename, get_file_size(filename)); + } + + CSV_INLINE std::string get_csv_head(csv::string_view filename, size_t file_size) { + const size_t bytes = 500000; + + std::error_code error; + size_t length = std::min((size_t)file_size, bytes); + auto mmap = mio::make_mmap_source(std::string(filename), 0, length, error); + + if (error) { + throw std::runtime_error("Cannot open file " + std::string(filename)); + } + + return std::string(mmap.begin(), mmap.end()); + } + +#ifdef _MSC_VER +#pragma region IBasicCVParser +#endif + CSV_INLINE IBasicCSVParser::IBasicCSVParser( + const CSVFormat& format, + const ColNamesPtr& col_names + ) : _col_names(col_names) { + if (format.no_quote) { + _parse_flags = internals::make_parse_flags(format.get_delim()); + } + else { + _parse_flags = internals::make_parse_flags(format.get_delim(), format.quote_char); + } + + _ws_flags = internals::make_ws_flags( + format.trim_chars.data(), format.trim_chars.size() + ); + } + + CSV_INLINE void IBasicCSVParser::end_feed() { + using internals::ParseFlags; + + bool empty_last_field = this->data_ptr + && this->data_ptr->_data + && !this->data_ptr->data.empty() + && (parse_flag(this->data_ptr->data.back()) == ParseFlags::DELIMITER + || parse_flag(this->data_ptr->data.back()) == ParseFlags::QUOTE); + + // Push field + if (this->field_length > 0 || empty_last_field) { + this->push_field(); + } + + // Push row + if (this->current_row.size() > 0) + this->push_row(); + } + + CSV_INLINE void IBasicCSVParser::parse_field() noexcept { + using internals::ParseFlags; + auto& in = this->data_ptr->data; + + // Trim off leading whitespace + while (data_pos < in.size() && ws_flag(in[data_pos])) + data_pos++; + + if (field_start == UNINITIALIZED_FIELD) + field_start = (int)(data_pos - current_row_start()); + + // Optimization: Since NOT_SPECIAL characters tend to occur in contiguous + // sequences, use the loop below to avoid having to go through the outer + // switch statement as much as possible + while (data_pos < in.size() && compound_parse_flag(in[data_pos]) == ParseFlags::NOT_SPECIAL) + data_pos++; + + field_length = data_pos - (field_start + current_row_start()); + + // Trim off trailing whitespace, this->field_length constraint matters + // when field is entirely whitespace + for (size_t j = data_pos - 1; ws_flag(in[j]) && this->field_length > 0; j--) + this->field_length--; + } + + CSV_INLINE void IBasicCSVParser::push_field() + { + // Update + if (field_has_double_quote) { + fields->emplace_back( + field_start == UNINITIALIZED_FIELD ? 0 : (unsigned int)field_start, + field_length, + true + ); + field_has_double_quote = false; + + } + else { + fields->emplace_back( + field_start == UNINITIALIZED_FIELD ? 0 : (unsigned int)field_start, + field_length + ); + } + + current_row.row_length++; + + // Reset field state + field_start = UNINITIALIZED_FIELD; + field_length = 0; + } + + /** @return The number of characters parsed that belong to complete rows */ + CSV_INLINE size_t IBasicCSVParser::parse() + { + using internals::ParseFlags; + + this->quote_escape = false; + this->data_pos = 0; + this->current_row_start() = 0; + this->trim_utf8_bom(); + + auto& in = this->data_ptr->data; + while (this->data_pos < in.size()) { + switch (compound_parse_flag(in[this->data_pos])) { + case ParseFlags::DELIMITER: + this->push_field(); + this->data_pos++; + break; + + case ParseFlags::NEWLINE: + this->data_pos++; + + // Catches CRLF (or LFLF, CRCRLF, or any other non-sensical combination of newlines) + while (this->data_pos < in.size() && parse_flag(in[this->data_pos]) == ParseFlags::NEWLINE) + this->data_pos++; + + // End of record -> Write record + this->push_field(); + this->push_row(); + + // Reset + this->current_row = CSVRow(data_ptr, this->data_pos, fields->size()); + break; + + case ParseFlags::NOT_SPECIAL: + this->parse_field(); + break; + + case ParseFlags::QUOTE_ESCAPE_QUOTE: + if (data_pos + 1 == in.size()) return this->current_row_start(); + else if (data_pos + 1 < in.size()) { + auto next_ch = parse_flag(in[data_pos + 1]); + if (next_ch >= ParseFlags::DELIMITER) { + quote_escape = false; + data_pos++; + break; + } + else if (next_ch == ParseFlags::QUOTE) { + // Case: Escaped quote + data_pos += 2; + this->field_length += 2; + this->field_has_double_quote = true; + break; + } + } + + // Case: Unescaped single quote => not strictly valid but we'll keep it + this->field_length++; + data_pos++; + + break; + + default: // Quote (currently not quote escaped) + if (this->field_length == 0) { + quote_escape = true; + data_pos++; + if (field_start == UNINITIALIZED_FIELD && data_pos < in.size() && !ws_flag(in[data_pos])) + field_start = (int)(data_pos - current_row_start()); + break; + } + + // Case: Unescaped quote + this->field_length++; + data_pos++; + + break; + } + } + + return this->current_row_start(); + } + + CSV_INLINE void IBasicCSVParser::push_row() { + current_row.row_length = fields->size() - current_row.fields_start; + this->_records->push_back(std::move(current_row)); + } + + CSV_INLINE void IBasicCSVParser::reset_data_ptr() { + this->data_ptr = std::make_shared(); + this->data_ptr->parse_flags = this->_parse_flags; + this->data_ptr->col_names = this->_col_names; + this->fields = &(this->data_ptr->fields); + } + + CSV_INLINE void IBasicCSVParser::trim_utf8_bom() { + auto& data = this->data_ptr->data; + + if (!this->unicode_bom_scan && data.size() >= 3) { + if (data[0] == '\xEF' && data[1] == '\xBB' && data[2] == '\xBF') { + this->data_pos += 3; // Remove BOM from input string + this->_utf8_bom = true; + } + + this->unicode_bom_scan = true; + } + } +#ifdef _MSC_VER +#pragma endregion +#endif + +#ifdef _MSC_VER +#pragma region Specializations +#endif + CSV_INLINE void MmapParser::next(size_t bytes = ITERATION_CHUNK_SIZE) { + // Reset parser state + this->field_start = UNINITIALIZED_FIELD; + this->field_length = 0; + this->reset_data_ptr(); + this->data_ptr->_stream_pos = this->mmap_pos; + + // Create memory map + size_t length = std::min(this->source_size - this->mmap_pos, bytes); + std::error_code error; + this->data_ptr->_data = std::make_shared>(mio::make_mmap_source(this->_filename, this->mmap_pos, length, error)); + this->mmap_pos += length; + if (error) throw error; + + auto mmap_ptr = (mio::basic_mmap_source*)(this->data_ptr->_data.get()); + + // Create string view + this->data_ptr->data = csv::string_view(mmap_ptr->data(), mmap_ptr->length()); + + // Parse + this->current_row = CSVRow(this->data_ptr); + size_t remainder = this->parse(); + + if (this->mmap_pos == this->source_size || no_chunk()) { + this->_eof = true; + this->end_feed(); + } + + this->mmap_pos -= (length - remainder); + } +#ifdef _MSC_VER +#pragma endregion +#endif + } +} + + #endif diff --git a/single_include_test/csv.hpp b/single_include_test/csv.hpp index 83f64ee3..1886c0a7 100644 --- a/single_include_test/csv.hpp +++ b/single_include_test/csv.hpp @@ -5525,6 +5525,9 @@ namespace csv { internals::ColNamesPtr col_names = nullptr; internals::ParseFlagMap parse_flags; internals::WhitespaceMap ws_flags; + + /** where in Stream we start */ + uint64_t _stream_pos = {}; }; using RawCSVDataPtr = std::shared_ptr; @@ -5713,6 +5716,9 @@ namespace csv { /** Return the number of fields in this row */ CONSTEXPR size_t size() const noexcept { return row_length; } + /** Where in the Stream we start */ + size_t current_row_start() const { return data->_stream_pos + data_start; } + /** @name Value Retrieval */ ///@{ CSVField operator[](size_t n) const; @@ -6154,6 +6160,7 @@ namespace csv { if (this->eof()) return; this->reset_data_ptr(); + this->data_ptr->_stream_pos = this->stream_pos; this->data_ptr->_data = std::make_shared(); if (source_size == 0) { @@ -6937,591 +6944,325 @@ namespace csv { namespace csv { namespace internals { - CSV_INLINE size_t get_file_size(csv::string_view filename) { - std::ifstream infile(std::string(filename), std::ios::binary); - const auto start = infile.tellg(); - infile.seekg(0, std::ios::end); - const auto end = infile.tellg(); - - return end - start; + CSV_INLINE std::vector ColNames::get_col_names() const { + return this->col_names; } - CSV_INLINE std::string get_csv_head(csv::string_view filename) { - return get_csv_head(filename, get_file_size(filename)); - } + CSV_INLINE void ColNames::set_col_names(const std::vector& cnames) { + this->col_names = cnames; - CSV_INLINE std::string get_csv_head(csv::string_view filename, size_t file_size) { - const size_t bytes = 500000; + for (size_t i = 0; i < cnames.size(); i++) { + this->col_pos[cnames[i]] = i; + } + } - std::error_code error; - size_t length = std::min((size_t)file_size, bytes); - auto mmap = mio::make_mmap_source(std::string(filename), 0, length, error); + CSV_INLINE int ColNames::index_of(csv::string_view col_name) const { + auto pos = this->col_pos.find(col_name.data()); + if (pos != this->col_pos.end()) + return (int)pos->second; - if (error) { - throw std::runtime_error("Cannot open file " + std::string(filename)); - } + return CSV_NOT_FOUND; + } - return std::string(mmap.begin(), mmap.end()); + CSV_INLINE size_t ColNames::size() const noexcept { + return this->col_names.size(); } -#ifdef _MSC_VER -#pragma region IBasicCVParser -#endif - CSV_INLINE IBasicCSVParser::IBasicCSVParser( - const CSVFormat& format, - const ColNamesPtr& col_names - ) : _col_names(col_names) { - if (format.no_quote) { - _parse_flags = internals::make_parse_flags(format.get_delim()); - } - else { - _parse_flags = internals::make_parse_flags(format.get_delim(), format.quote_char); - } + } +} +/** @file + * Defines an object used to store CSV format settings + */ - _ws_flags = internals::make_ws_flags( - format.trim_chars.data(), format.trim_chars.size() - ); - } +#include +#include - CSV_INLINE void IBasicCSVParser::end_feed() { - using internals::ParseFlags; - bool empty_last_field = this->data_ptr - && this->data_ptr->_data - && !this->data_ptr->data.empty() - && (parse_flag(this->data_ptr->data.back()) == ParseFlags::DELIMITER - || parse_flag(this->data_ptr->data.back()) == ParseFlags::QUOTE); +namespace csv { + CSV_INLINE CSVFormat& CSVFormat::delimiter(char delim) { + this->possible_delimiters = { delim }; + this->assert_no_char_overlap(); + return *this; + } - // Push field - if (this->field_length > 0 || empty_last_field) { - this->push_field(); - } + CSV_INLINE CSVFormat& CSVFormat::delimiter(const std::vector & delim) { + this->possible_delimiters = delim; + this->assert_no_char_overlap(); + return *this; + } - // Push row - if (this->current_row.size() > 0) - this->push_row(); - } + CSV_INLINE CSVFormat& CSVFormat::quote(char quote) { + this->no_quote = false; + this->quote_char = quote; + this->assert_no_char_overlap(); + return *this; + } - CSV_INLINE void IBasicCSVParser::parse_field() noexcept { - using internals::ParseFlags; - auto& in = this->data_ptr->data; + CSV_INLINE CSVFormat& CSVFormat::trim(const std::vector & chars) { + this->trim_chars = chars; + this->assert_no_char_overlap(); + return *this; + } - // Trim off leading whitespace - while (data_pos < in.size() && ws_flag(in[data_pos])) - data_pos++; + CSV_INLINE CSVFormat& CSVFormat::column_names(const std::vector& names) { + this->col_names = names; + this->header = -1; + return *this; + } - if (field_start == UNINITIALIZED_FIELD) - field_start = (int)(data_pos - current_row_start()); + CSV_INLINE CSVFormat& CSVFormat::header_row(int row) { + if (row < 0) this->variable_column_policy = VariableColumnPolicy::KEEP; - // Optimization: Since NOT_SPECIAL characters tend to occur in contiguous - // sequences, use the loop below to avoid having to go through the outer - // switch statement as much as possible - while (data_pos < in.size() && compound_parse_flag(in[data_pos]) == ParseFlags::NOT_SPECIAL) - data_pos++; + this->header = row; + this->col_names = {}; + return *this; + } - field_length = data_pos - (field_start + current_row_start()); + CSV_INLINE void CSVFormat::assert_no_char_overlap() + { + auto delims = std::set( + this->possible_delimiters.begin(), this->possible_delimiters.end()), + trims = std::set( + this->trim_chars.begin(), this->trim_chars.end()); - // Trim off trailing whitespace, this->field_length constraint matters - // when field is entirely whitespace - for (size_t j = data_pos - 1; ws_flag(in[j]) && this->field_length > 0; j--) - this->field_length--; + // Stores intersection of possible delimiters and trim characters + std::vector intersection = {}; + + // Find which characters overlap, if any + std::set_intersection( + delims.begin(), delims.end(), + trims.begin(), trims.end(), + std::back_inserter(intersection)); + + // Make sure quote character is not contained in possible delimiters + // or whitespace characters + if (delims.find(this->quote_char) != delims.end() || + trims.find(this->quote_char) != trims.end()) { + intersection.push_back(this->quote_char); } - CSV_INLINE void IBasicCSVParser::push_field() - { - // Update - if (field_has_double_quote) { - fields->emplace_back( - field_start == UNINITIALIZED_FIELD ? 0 : (unsigned int)field_start, - field_length, - true - ); - field_has_double_quote = false; + if (!intersection.empty()) { + std::string err_msg = "There should be no overlap between the quote character, " + "the set of possible delimiters " + "and the set of whitespace characters. Offending characters: "; - } - else { - fields->emplace_back( - field_start == UNINITIALIZED_FIELD ? 0 : (unsigned int)field_start, - field_length - ); - } + // Create a pretty error message with the list of overlapping + // characters + for (size_t i = 0; i < intersection.size(); i++) { + err_msg += "'"; + err_msg += intersection[i]; + err_msg += "'"; - current_row.row_length++; + if (i + 1 < intersection.size()) + err_msg += ", "; + } - // Reset field state - field_start = UNINITIALIZED_FIELD; - field_length = 0; + throw std::runtime_error(err_msg + '.'); } + } +} +/** @file + * @brief Defines functionality needed for basic CSV parsing + */ - /** @return The number of characters parsed that belong to complete rows */ - CSV_INLINE size_t IBasicCSVParser::parse() - { - using internals::ParseFlags; - this->quote_escape = false; - this->data_pos = 0; - this->current_row_start() = 0; - this->trim_utf8_bom(); +namespace csv { + namespace internals { + CSV_INLINE std::string format_row(const std::vector& row, csv::string_view delim) { + /** Print a CSV row */ + std::stringstream ret; + for (size_t i = 0; i < row.size(); i++) { + ret << row[i]; + if (i + 1 < row.size()) ret << delim; + else ret << '\n'; + } + ret.flush(); - auto& in = this->data_ptr->data; - while (this->data_pos < in.size()) { - switch (compound_parse_flag(in[this->data_pos])) { - case ParseFlags::DELIMITER: - this->push_field(); - this->data_pos++; - break; + return ret.str(); + } - case ParseFlags::NEWLINE: - this->data_pos++; + /** Return a CSV's column names + * + * @param[in] filename Path to CSV file + * @param[in] format Format of the CSV file + * + */ + CSV_INLINE std::vector _get_col_names(csv::string_view head, CSVFormat format) { + // Parse the CSV + auto trim_chars = format.get_trim_chars(); + std::stringstream source(head.data()); + RowCollection rows; - // Catches CRLF (or LFLF, CRCRLF, or any other non-sensical combination of newlines) - while (this->data_pos < in.size() && parse_flag(in[this->data_pos]) == ParseFlags::NEWLINE) - this->data_pos++; + StreamParser parser(source, format); + parser.set_output(rows); + parser.next(); - // End of record -> Write record - this->push_field(); - this->push_row(); + return CSVRow(std::move(rows[format.get_header()])); + } - // Reset - this->current_row = CSVRow(data_ptr, this->data_pos, fields->size()); - break; + CSV_INLINE GuessScore calculate_score(csv::string_view head, CSVFormat format) { + // Frequency counter of row length + std::unordered_map row_tally = { { 0, 0 } }; - case ParseFlags::NOT_SPECIAL: - this->parse_field(); - break; + // Map row lengths to row num where they first occurred + std::unordered_map row_when = { { 0, 0 } }; - case ParseFlags::QUOTE_ESCAPE_QUOTE: - if (data_pos + 1 == in.size()) return this->current_row_start(); - else if (data_pos + 1 < in.size()) { - auto next_ch = parse_flag(in[data_pos + 1]); - if (next_ch >= ParseFlags::DELIMITER) { - quote_escape = false; - data_pos++; - break; - } - else if (next_ch == ParseFlags::QUOTE) { - // Case: Escaped quote - data_pos += 2; - this->field_length += 2; - this->field_has_double_quote = true; - break; - } - } - - // Case: Unescaped single quote => not strictly valid but we'll keep it - this->field_length++; - data_pos++; - - break; + // Parse the CSV + std::stringstream source(head.data()); + RowCollection rows; - default: // Quote (currently not quote escaped) - if (this->field_length == 0) { - quote_escape = true; - data_pos++; - if (field_start == UNINITIALIZED_FIELD && data_pos < in.size() && !ws_flag(in[data_pos])) - field_start = (int)(data_pos - current_row_start()); - break; - } + StreamParser parser(source, format); + parser.set_output(rows); + parser.next(); - // Case: Unescaped quote - this->field_length++; - data_pos++; + for (size_t i = 0; i < rows.size(); i++) { + auto& row = rows[i]; - break; + // Ignore zero-length rows + if (row.size() > 0) { + if (row_tally.find(row.size()) != row_tally.end()) { + row_tally[row.size()]++; + } + else { + row_tally[row.size()] = 1; + row_when[row.size()] = i; + } } } - return this->current_row_start(); - } - - CSV_INLINE void IBasicCSVParser::push_row() { - current_row.row_length = fields->size() - current_row.fields_start; - this->_records->push_back(std::move(current_row)); - } - - CSV_INLINE void IBasicCSVParser::reset_data_ptr() { - this->data_ptr = std::make_shared(); - this->data_ptr->parse_flags = this->_parse_flags; - this->data_ptr->col_names = this->_col_names; - this->fields = &(this->data_ptr->fields); - } - - CSV_INLINE void IBasicCSVParser::trim_utf8_bom() { - auto& data = this->data_ptr->data; + double final_score = 0; + size_t header_row = 0; - if (!this->unicode_bom_scan && data.size() >= 3) { - if (data[0] == '\xEF' && data[1] == '\xBB' && data[2] == '\xBF') { - this->data_pos += 3; // Remove BOM from input string - this->_utf8_bom = true; + // Final score is equal to the largest + // row size times rows of that size + for (auto& pair : row_tally) { + auto row_size = pair.first; + auto row_count = pair.second; + double score = (double)(row_size * row_count); + if (score > final_score) { + final_score = score; + header_row = row_when[row_size]; } - - this->unicode_bom_scan = true; } - } -#ifdef _MSC_VER -#pragma endregion -#endif - -#ifdef _MSC_VER -#pragma region Specializations -#endif - CSV_INLINE void MmapParser::next(size_t bytes = ITERATION_CHUNK_SIZE) { - // Reset parser state - this->field_start = UNINITIALIZED_FIELD; - this->field_length = 0; - this->reset_data_ptr(); - // Create memory map - size_t length = std::min(this->source_size - this->mmap_pos, bytes); - std::error_code error; - this->data_ptr->_data = std::make_shared>(mio::make_mmap_source(this->_filename, this->mmap_pos, length, error)); - this->mmap_pos += length; - if (error) throw error; + return { + final_score, + header_row + }; + } - auto mmap_ptr = (mio::basic_mmap_source*)(this->data_ptr->_data.get()); + /** Guess the delimiter used by a delimiter-separated values file */ + CSV_INLINE CSVGuessResult _guess_format(csv::string_view head, const std::vector& delims) { + /** For each delimiter, find out which row length was most common. + * The delimiter with the longest mode row length wins. + * Then, the line number of the header row is the first row with + * the mode row length. + */ - // Create string view - this->data_ptr->data = csv::string_view(mmap_ptr->data(), mmap_ptr->length()); + CSVFormat format; + size_t max_score = 0, + header = 0; + char current_delim = delims[0]; - // Parse - this->current_row = CSVRow(this->data_ptr); - size_t remainder = this->parse(); + for (char cand_delim : delims) { + auto result = calculate_score(head, format.delimiter(cand_delim)); - if (this->mmap_pos == this->source_size || no_chunk()) { - this->_eof = true; - this->end_feed(); + if ((size_t)result.score > max_score) { + max_score = (size_t)result.score; + current_delim = cand_delim; + header = result.header; + } } - this->mmap_pos -= (length - remainder); + return { current_delim, (int)header }; } -#ifdef _MSC_VER -#pragma endregion -#endif } -} + /** Return a CSV's column names + * + * @param[in] filename Path to CSV file + * @param[in] format Format of the CSV file + * + */ + CSV_INLINE std::vector get_col_names(csv::string_view filename, CSVFormat format) { + auto head = internals::get_csv_head(filename); -namespace csv { - namespace internals { - CSV_INLINE std::vector ColNames::get_col_names() const { - return this->col_names; + /** Guess delimiter and header row */ + if (format.guess_delim()) { + auto guess_result = guess_format(filename, format.get_possible_delims()); + format.delimiter(guess_result.delim).header_row(guess_result.header_row); } - CSV_INLINE void ColNames::set_col_names(const std::vector& cnames) { - this->col_names = cnames; + return internals::_get_col_names(head, format); + } - for (size_t i = 0; i < cnames.size(); i++) { - this->col_pos[cnames[i]] = i; - } - } + /** Guess the delimiter used by a delimiter-separated values file */ + CSV_INLINE CSVGuessResult guess_format(csv::string_view filename, const std::vector& delims) { + auto head = internals::get_csv_head(filename); + return internals::_guess_format(head, delims); + } - CSV_INLINE int ColNames::index_of(csv::string_view col_name) const { - auto pos = this->col_pos.find(col_name.data()); - if (pos != this->col_pos.end()) - return (int)pos->second; + /** Reads an arbitrarily large CSV file using memory-mapped IO. + * + * **Details:** Reads the first block of a CSV file synchronously to get information + * such as column names and delimiting character. + * + * @param[in] filename Path to CSV file + * @param[in] format Format of the CSV file + * + * \snippet tests/test_read_csv.cpp CSVField Example + * + */ + CSV_INLINE CSVReader::CSVReader(csv::string_view filename, CSVFormat format) : _format(format) { + auto head = internals::get_csv_head(filename); + using Parser = internals::MmapParser; - return CSV_NOT_FOUND; + /** Guess delimiter and header row */ + if (format.guess_delim()) { + auto guess_result = internals::_guess_format(head, format.possible_delimiters); + format.delimiter(guess_result.delim); + format.header = guess_result.header_row; + this->_format = format; } - CSV_INLINE size_t ColNames::size() const noexcept { - return this->col_names.size(); - } + if (!format.col_names.empty()) + this->set_col_names(format.col_names); + this->parser = std::unique_ptr(new Parser(filename, format, this->col_names)); // For C++11 + this->initial_read(); } -} -/** @file - * Defines an object used to store CSV format settings - */ -#include -#include + /** Return the format of the original raw CSV */ + CSV_INLINE CSVFormat CSVReader::get_format() const { + CSVFormat new_format = this->_format; + // Since users are normally not allowed to set + // column names and header row simulatenously, + // we will set the backing variables directly here + new_format.col_names = this->col_names->get_col_names(); + new_format.header = this->_format.header; -namespace csv { - CSV_INLINE CSVFormat& CSVFormat::delimiter(char delim) { - this->possible_delimiters = { delim }; - this->assert_no_char_overlap(); - return *this; + return new_format; } - CSV_INLINE CSVFormat& CSVFormat::delimiter(const std::vector & delim) { - this->possible_delimiters = delim; - this->assert_no_char_overlap(); - return *this; - } + /** Return the CSV's column names as a vector of strings. */ + CSV_INLINE std::vector CSVReader::get_col_names() const { + if (this->col_names) { + return this->col_names->get_col_names(); + } - CSV_INLINE CSVFormat& CSVFormat::quote(char quote) { - this->no_quote = false; - this->quote_char = quote; - this->assert_no_char_overlap(); - return *this; + return std::vector(); } - CSV_INLINE CSVFormat& CSVFormat::trim(const std::vector & chars) { - this->trim_chars = chars; - this->assert_no_char_overlap(); - return *this; - } + /** Return the index of the column name if found or + * csv::CSV_NOT_FOUND otherwise. + */ + CSV_INLINE int CSVReader::index_of(csv::string_view col_name) const { + auto _col_names = this->get_col_names(); + for (size_t i = 0; i < _col_names.size(); i++) + if (_col_names[i] == col_name) return (int)i; - CSV_INLINE CSVFormat& CSVFormat::column_names(const std::vector& names) { - this->col_names = names; - this->header = -1; - return *this; - } - - CSV_INLINE CSVFormat& CSVFormat::header_row(int row) { - if (row < 0) this->variable_column_policy = VariableColumnPolicy::KEEP; - - this->header = row; - this->col_names = {}; - return *this; - } - - CSV_INLINE void CSVFormat::assert_no_char_overlap() - { - auto delims = std::set( - this->possible_delimiters.begin(), this->possible_delimiters.end()), - trims = std::set( - this->trim_chars.begin(), this->trim_chars.end()); - - // Stores intersection of possible delimiters and trim characters - std::vector intersection = {}; - - // Find which characters overlap, if any - std::set_intersection( - delims.begin(), delims.end(), - trims.begin(), trims.end(), - std::back_inserter(intersection)); - - // Make sure quote character is not contained in possible delimiters - // or whitespace characters - if (delims.find(this->quote_char) != delims.end() || - trims.find(this->quote_char) != trims.end()) { - intersection.push_back(this->quote_char); - } - - if (!intersection.empty()) { - std::string err_msg = "There should be no overlap between the quote character, " - "the set of possible delimiters " - "and the set of whitespace characters. Offending characters: "; - - // Create a pretty error message with the list of overlapping - // characters - for (size_t i = 0; i < intersection.size(); i++) { - err_msg += "'"; - err_msg += intersection[i]; - err_msg += "'"; - - if (i + 1 < intersection.size()) - err_msg += ", "; - } - - throw std::runtime_error(err_msg + '.'); - } - } -} -/** @file - * @brief Defines functionality needed for basic CSV parsing - */ - - -namespace csv { - namespace internals { - CSV_INLINE std::string format_row(const std::vector& row, csv::string_view delim) { - /** Print a CSV row */ - std::stringstream ret; - for (size_t i = 0; i < row.size(); i++) { - ret << row[i]; - if (i + 1 < row.size()) ret << delim; - else ret << '\n'; - } - ret.flush(); - - return ret.str(); - } - - /** Return a CSV's column names - * - * @param[in] filename Path to CSV file - * @param[in] format Format of the CSV file - * - */ - CSV_INLINE std::vector _get_col_names(csv::string_view head, CSVFormat format) { - // Parse the CSV - auto trim_chars = format.get_trim_chars(); - std::stringstream source(head.data()); - RowCollection rows; - - StreamParser parser(source, format); - parser.set_output(rows); - parser.next(); - - return CSVRow(std::move(rows[format.get_header()])); - } - - CSV_INLINE GuessScore calculate_score(csv::string_view head, CSVFormat format) { - // Frequency counter of row length - std::unordered_map row_tally = { { 0, 0 } }; - - // Map row lengths to row num where they first occurred - std::unordered_map row_when = { { 0, 0 } }; - - // Parse the CSV - std::stringstream source(head.data()); - RowCollection rows; - - StreamParser parser(source, format); - parser.set_output(rows); - parser.next(); - - for (size_t i = 0; i < rows.size(); i++) { - auto& row = rows[i]; - - // Ignore zero-length rows - if (row.size() > 0) { - if (row_tally.find(row.size()) != row_tally.end()) { - row_tally[row.size()]++; - } - else { - row_tally[row.size()] = 1; - row_when[row.size()] = i; - } - } - } - - double final_score = 0; - size_t header_row = 0; - - // Final score is equal to the largest - // row size times rows of that size - for (auto& pair : row_tally) { - auto row_size = pair.first; - auto row_count = pair.second; - double score = (double)(row_size * row_count); - if (score > final_score) { - final_score = score; - header_row = row_when[row_size]; - } - } - - return { - final_score, - header_row - }; - } - - /** Guess the delimiter used by a delimiter-separated values file */ - CSV_INLINE CSVGuessResult _guess_format(csv::string_view head, const std::vector& delims) { - /** For each delimiter, find out which row length was most common. - * The delimiter with the longest mode row length wins. - * Then, the line number of the header row is the first row with - * the mode row length. - */ - - CSVFormat format; - size_t max_score = 0, - header = 0; - char current_delim = delims[0]; - - for (char cand_delim : delims) { - auto result = calculate_score(head, format.delimiter(cand_delim)); - - if ((size_t)result.score > max_score) { - max_score = (size_t)result.score; - current_delim = cand_delim; - header = result.header; - } - } - - return { current_delim, (int)header }; - } - } - - /** Return a CSV's column names - * - * @param[in] filename Path to CSV file - * @param[in] format Format of the CSV file - * - */ - CSV_INLINE std::vector get_col_names(csv::string_view filename, CSVFormat format) { - auto head = internals::get_csv_head(filename); - - /** Guess delimiter and header row */ - if (format.guess_delim()) { - auto guess_result = guess_format(filename, format.get_possible_delims()); - format.delimiter(guess_result.delim).header_row(guess_result.header_row); - } - - return internals::_get_col_names(head, format); - } - - /** Guess the delimiter used by a delimiter-separated values file */ - CSV_INLINE CSVGuessResult guess_format(csv::string_view filename, const std::vector& delims) { - auto head = internals::get_csv_head(filename); - return internals::_guess_format(head, delims); - } - - /** Reads an arbitrarily large CSV file using memory-mapped IO. - * - * **Details:** Reads the first block of a CSV file synchronously to get information - * such as column names and delimiting character. - * - * @param[in] filename Path to CSV file - * @param[in] format Format of the CSV file - * - * \snippet tests/test_read_csv.cpp CSVField Example - * - */ - CSV_INLINE CSVReader::CSVReader(csv::string_view filename, CSVFormat format) : _format(format) { - auto head = internals::get_csv_head(filename); - using Parser = internals::MmapParser; - - /** Guess delimiter and header row */ - if (format.guess_delim()) { - auto guess_result = internals::_guess_format(head, format.possible_delimiters); - format.delimiter(guess_result.delim); - format.header = guess_result.header_row; - this->_format = format; - } - - if (!format.col_names.empty()) - this->set_col_names(format.col_names); - - this->parser = std::unique_ptr(new Parser(filename, format, this->col_names)); // For C++11 - this->initial_read(); - } - - /** Return the format of the original raw CSV */ - CSV_INLINE CSVFormat CSVReader::get_format() const { - CSVFormat new_format = this->_format; - - // Since users are normally not allowed to set - // column names and header row simulatenously, - // we will set the backing variables directly here - new_format.col_names = this->col_names->get_col_names(); - new_format.header = this->_format.header; - - return new_format; - } - - /** Return the CSV's column names as a vector of strings. */ - CSV_INLINE std::vector CSVReader::get_col_names() const { - if (this->col_names) { - return this->col_names->get_col_names(); - } - - return std::vector(); - } - - /** Return the index of the column name if found or - * csv::CSV_NOT_FOUND otherwise. - */ - CSV_INLINE int CSVReader::index_of(csv::string_view col_name) const { - auto _col_names = this->get_col_names(); - for (size_t i = 0; i < _col_names.size(); i++) - if (_col_names[i] == col_name) return (int)i; - - return CSV_NOT_FOUND; + return CSV_NOT_FOUND; } CSV_INLINE void CSVReader::trim_header() { @@ -8575,5 +8316,272 @@ namespace csv { } } +namespace csv { + namespace internals { + CSV_INLINE size_t get_file_size(csv::string_view filename) { + std::ifstream infile(std::string(filename), std::ios::binary); + const auto start = infile.tellg(); + infile.seekg(0, std::ios::end); + const auto end = infile.tellg(); + + return end - start; + } + + CSV_INLINE std::string get_csv_head(csv::string_view filename) { + return get_csv_head(filename, get_file_size(filename)); + } + + CSV_INLINE std::string get_csv_head(csv::string_view filename, size_t file_size) { + const size_t bytes = 500000; + + std::error_code error; + size_t length = std::min((size_t)file_size, bytes); + auto mmap = mio::make_mmap_source(std::string(filename), 0, length, error); + + if (error) { + throw std::runtime_error("Cannot open file " + std::string(filename)); + } + + return std::string(mmap.begin(), mmap.end()); + } + +#ifdef _MSC_VER +#pragma region IBasicCVParser +#endif + CSV_INLINE IBasicCSVParser::IBasicCSVParser( + const CSVFormat& format, + const ColNamesPtr& col_names + ) : _col_names(col_names) { + if (format.no_quote) { + _parse_flags = internals::make_parse_flags(format.get_delim()); + } + else { + _parse_flags = internals::make_parse_flags(format.get_delim(), format.quote_char); + } + + _ws_flags = internals::make_ws_flags( + format.trim_chars.data(), format.trim_chars.size() + ); + } + + CSV_INLINE void IBasicCSVParser::end_feed() { + using internals::ParseFlags; + + bool empty_last_field = this->data_ptr + && this->data_ptr->_data + && !this->data_ptr->data.empty() + && (parse_flag(this->data_ptr->data.back()) == ParseFlags::DELIMITER + || parse_flag(this->data_ptr->data.back()) == ParseFlags::QUOTE); + + // Push field + if (this->field_length > 0 || empty_last_field) { + this->push_field(); + } + + // Push row + if (this->current_row.size() > 0) + this->push_row(); + } + + CSV_INLINE void IBasicCSVParser::parse_field() noexcept { + using internals::ParseFlags; + auto& in = this->data_ptr->data; + + // Trim off leading whitespace + while (data_pos < in.size() && ws_flag(in[data_pos])) + data_pos++; + + if (field_start == UNINITIALIZED_FIELD) + field_start = (int)(data_pos - current_row_start()); + + // Optimization: Since NOT_SPECIAL characters tend to occur in contiguous + // sequences, use the loop below to avoid having to go through the outer + // switch statement as much as possible + while (data_pos < in.size() && compound_parse_flag(in[data_pos]) == ParseFlags::NOT_SPECIAL) + data_pos++; + + field_length = data_pos - (field_start + current_row_start()); + + // Trim off trailing whitespace, this->field_length constraint matters + // when field is entirely whitespace + for (size_t j = data_pos - 1; ws_flag(in[j]) && this->field_length > 0; j--) + this->field_length--; + } + + CSV_INLINE void IBasicCSVParser::push_field() + { + // Update + if (field_has_double_quote) { + fields->emplace_back( + field_start == UNINITIALIZED_FIELD ? 0 : (unsigned int)field_start, + field_length, + true + ); + field_has_double_quote = false; + + } + else { + fields->emplace_back( + field_start == UNINITIALIZED_FIELD ? 0 : (unsigned int)field_start, + field_length + ); + } + + current_row.row_length++; + + // Reset field state + field_start = UNINITIALIZED_FIELD; + field_length = 0; + } + + /** @return The number of characters parsed that belong to complete rows */ + CSV_INLINE size_t IBasicCSVParser::parse() + { + using internals::ParseFlags; + + this->quote_escape = false; + this->data_pos = 0; + this->current_row_start() = 0; + this->trim_utf8_bom(); + + auto& in = this->data_ptr->data; + while (this->data_pos < in.size()) { + switch (compound_parse_flag(in[this->data_pos])) { + case ParseFlags::DELIMITER: + this->push_field(); + this->data_pos++; + break; + + case ParseFlags::NEWLINE: + this->data_pos++; + + // Catches CRLF (or LFLF, CRCRLF, or any other non-sensical combination of newlines) + while (this->data_pos < in.size() && parse_flag(in[this->data_pos]) == ParseFlags::NEWLINE) + this->data_pos++; + + // End of record -> Write record + this->push_field(); + this->push_row(); + + // Reset + this->current_row = CSVRow(data_ptr, this->data_pos, fields->size()); + break; + + case ParseFlags::NOT_SPECIAL: + this->parse_field(); + break; + + case ParseFlags::QUOTE_ESCAPE_QUOTE: + if (data_pos + 1 == in.size()) return this->current_row_start(); + else if (data_pos + 1 < in.size()) { + auto next_ch = parse_flag(in[data_pos + 1]); + if (next_ch >= ParseFlags::DELIMITER) { + quote_escape = false; + data_pos++; + break; + } + else if (next_ch == ParseFlags::QUOTE) { + // Case: Escaped quote + data_pos += 2; + this->field_length += 2; + this->field_has_double_quote = true; + break; + } + } + + // Case: Unescaped single quote => not strictly valid but we'll keep it + this->field_length++; + data_pos++; + + break; + + default: // Quote (currently not quote escaped) + if (this->field_length == 0) { + quote_escape = true; + data_pos++; + if (field_start == UNINITIALIZED_FIELD && data_pos < in.size() && !ws_flag(in[data_pos])) + field_start = (int)(data_pos - current_row_start()); + break; + } + + // Case: Unescaped quote + this->field_length++; + data_pos++; + + break; + } + } + + return this->current_row_start(); + } + + CSV_INLINE void IBasicCSVParser::push_row() { + current_row.row_length = fields->size() - current_row.fields_start; + this->_records->push_back(std::move(current_row)); + } + + CSV_INLINE void IBasicCSVParser::reset_data_ptr() { + this->data_ptr = std::make_shared(); + this->data_ptr->parse_flags = this->_parse_flags; + this->data_ptr->col_names = this->_col_names; + this->fields = &(this->data_ptr->fields); + } + + CSV_INLINE void IBasicCSVParser::trim_utf8_bom() { + auto& data = this->data_ptr->data; + + if (!this->unicode_bom_scan && data.size() >= 3) { + if (data[0] == '\xEF' && data[1] == '\xBB' && data[2] == '\xBF') { + this->data_pos += 3; // Remove BOM from input string + this->_utf8_bom = true; + } + + this->unicode_bom_scan = true; + } + } +#ifdef _MSC_VER +#pragma endregion +#endif + +#ifdef _MSC_VER +#pragma region Specializations +#endif + CSV_INLINE void MmapParser::next(size_t bytes = ITERATION_CHUNK_SIZE) { + // Reset parser state + this->field_start = UNINITIALIZED_FIELD; + this->field_length = 0; + this->reset_data_ptr(); + this->data_ptr->_stream_pos = this->mmap_pos; + + // Create memory map + size_t length = std::min(this->source_size - this->mmap_pos, bytes); + std::error_code error; + this->data_ptr->_data = std::make_shared>(mio::make_mmap_source(this->_filename, this->mmap_pos, length, error)); + this->mmap_pos += length; + if (error) throw error; + + auto mmap_ptr = (mio::basic_mmap_source*)(this->data_ptr->_data.get()); + + // Create string view + this->data_ptr->data = csv::string_view(mmap_ptr->data(), mmap_ptr->length()); + + // Parse + this->current_row = CSVRow(this->data_ptr); + size_t remainder = this->parse(); + + if (this->mmap_pos == this->source_size || no_chunk()) { + this->_eof = true; + this->end_feed(); + } + + this->mmap_pos -= (length - remainder); + } +#ifdef _MSC_VER +#pragma endregion +#endif + } +} + + #endif diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index e5b57a87..43358c36 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -27,6 +27,7 @@ target_sources(csv_test test_data_type.cpp test_raw_csv_data.cpp test_round_trip.cpp + test_row_start_position.cpp ) target_link_libraries(csv_test csv) target_link_libraries(csv_test Catch2::Catch2WithMain) diff --git a/tests/test_row_start_position.cpp b/tests/test_row_start_position.cpp new file mode 100644 index 00000000..c5162681 --- /dev/null +++ b/tests/test_row_start_position.cpp @@ -0,0 +1,57 @@ +/** @file + * Tests for CSV parsing + */ + +#include // remove() +#include + +#include +#include +#include +#include + +#include "csv.hpp" + +using namespace csv; +using std::string; +using std::vector; + +// +// CSVRow::current_row_start() +// + +TEST_CASE("CSVRow::current_row_start", "[current_row_start]") { + CSVGuessResult guessed_format = guess_format("./tests/data/real_data/YEAR07_CBSA_NAC3.txt"); + REQUIRE(guessed_format.delim == ';'); + REQUIRE(guessed_format.header_row == 0); + + std::fstream fstream; + auto testfile = std::filesystem::path("./tests/data/real_data/YEAR07_CBSA_NAC3.txt"); + std::ifstream ifs(testfile.c_str()); + std::string content((std::istreambuf_iterator(ifs)), (std::istreambuf_iterator())); + + CSVFormat format; + format.delimiter(guessed_format.delim).header_row(guessed_format.header_row); + + { + // parse from file + CSVReader reader(testfile.c_str(), format); + uint64_t pos = 0; + for (CSVRow& row : reader) { + pos = content.find_first_of('\n', pos) + 1; + REQUIRE(row.current_row_start() == pos); + } + } + + { + // parse from stream + auto stream = std::stringstream(content); + auto reader = CSVReader(stream, format); + + uint64_t pos = 0; + for (CSVRow& row : reader) { + pos = content.find_first_of('\n', pos) + 1; + REQUIRE(row.current_row_start() == pos); + } + } +}