From 21ff5cb3e9db1a3d36f14037069f50be49e36096 Mon Sep 17 00:00:00 2001 From: longqm Date: Fri, 13 Sep 2024 11:58:21 +0800 Subject: [PATCH 1/2] CSVRow::current_row_start(): track row start position of input stream --- include/internal/basic_csv_parser.cpp | 1 + include/internal/basic_csv_parser.hpp | 1 + include/internal/csv_row.hpp | 6 + single_include/csv.hpp | 2634 +++++++++++++------------ single_include_test/csv.hpp | 2634 +++++++++++++------------ tests/CMakeLists.txt | 1 + tests/test_row_start_position.cpp | 57 + 7 files changed, 2708 insertions(+), 2626 deletions(-) create mode 100644 tests/test_row_start_position.cpp diff --git a/include/internal/basic_csv_parser.cpp b/include/internal/basic_csv_parser.cpp index 61b8f358..ff039093 100644 --- a/include/internal/basic_csv_parser.cpp +++ b/include/internal/basic_csv_parser.cpp @@ -235,6 +235,7 @@ namespace csv { this->field_start = UNINITIALIZED_FIELD; this->field_length = 0; this->reset_data_ptr(); + this->data_ptr->_stream_pos = this->mmap_pos; // Create memory map size_t length = std::min(this->source_size - this->mmap_pos, bytes); diff --git a/include/internal/basic_csv_parser.hpp b/include/internal/basic_csv_parser.hpp index d76b2d9e..b245e9a2 100644 --- a/include/internal/basic_csv_parser.hpp +++ b/include/internal/basic_csv_parser.hpp @@ -321,6 +321,7 @@ namespace csv { if (this->eof()) return; this->reset_data_ptr(); + this->data_ptr->_stream_pos = this->stream_pos; this->data_ptr->_data = std::make_shared(); if (source_size == 0) { diff --git a/include/internal/csv_row.hpp b/include/internal/csv_row.hpp index 0ab935d0..f12bb76e 100644 --- a/include/internal/csv_row.hpp +++ b/include/internal/csv_row.hpp @@ -136,6 +136,9 @@ namespace csv { internals::ColNamesPtr col_names = nullptr; internals::ParseFlagMap parse_flags; internals::WhitespaceMap ws_flags; + + /** where in Stream we start */ + uint64_t _stream_pos = {}; }; using RawCSVDataPtr = std::shared_ptr; @@ -324,6 +327,9 @@ namespace csv { /** Return the number of fields in this row */ CONSTEXPR size_t size() const noexcept { return row_length; } + /** Where in the Stream we start */ + size_t current_row_start() const { return data->_stream_pos + data_start; } + /** @name Value Retrieval */ ///@{ CSVField operator[](size_t n) const; diff --git a/single_include/csv.hpp b/single_include/csv.hpp index 811c8e14..74dcabce 100644 --- a/single_include/csv.hpp +++ b/single_include/csv.hpp @@ -5531,6 +5531,9 @@ namespace csv { internals::ColNamesPtr col_names = nullptr; internals::ParseFlagMap parse_flags; internals::WhitespaceMap ws_flags; + + /** where in Stream we start */ + uint64_t _stream_pos = {}; }; using RawCSVDataPtr = std::shared_ptr; @@ -5719,6 +5722,9 @@ namespace csv { /** Return the number of fields in this row */ CONSTEXPR size_t size() const noexcept { return row_length; } + /** Where in the Stream we start */ + size_t current_row_start() const { return data->_stream_pos + data_start; } + /** @name Value Retrieval */ ///@{ CSVField operator[](size_t n) const; @@ -6160,6 +6166,7 @@ namespace csv { if (this->eof()) return; this->reset_data_ptr(); + this->data_ptr->_stream_pos = this->stream_pos; this->data_ptr->_data = std::make_shared(); if (source_size == 0) { @@ -6943,1643 +6950,1644 @@ namespace csv { namespace csv { namespace internals { - CSV_INLINE size_t get_file_size(csv::string_view filename) { - std::ifstream infile(std::string(filename), std::ios::binary); - const auto start = infile.tellg(); - infile.seekg(0, std::ios::end); - const auto end = infile.tellg(); - - return end - start; + CSV_INLINE std::vector ColNames::get_col_names() const { + return this->col_names; } - CSV_INLINE std::string get_csv_head(csv::string_view filename) { - return get_csv_head(filename, get_file_size(filename)); - } + CSV_INLINE void ColNames::set_col_names(const std::vector& cnames) { + this->col_names = cnames; - CSV_INLINE std::string get_csv_head(csv::string_view filename, size_t file_size) { - const size_t bytes = 500000; + for (size_t i = 0; i < cnames.size(); i++) { + this->col_pos[cnames[i]] = i; + } + } - std::error_code error; - size_t length = std::min((size_t)file_size, bytes); - auto mmap = mio::make_mmap_source(std::string(filename), 0, length, error); + CSV_INLINE int ColNames::index_of(csv::string_view col_name) const { + auto pos = this->col_pos.find(col_name.data()); + if (pos != this->col_pos.end()) + return (int)pos->second; - if (error) { - throw std::runtime_error("Cannot open file " + std::string(filename)); - } + return CSV_NOT_FOUND; + } - return std::string(mmap.begin(), mmap.end()); + CSV_INLINE size_t ColNames::size() const noexcept { + return this->col_names.size(); } -#ifdef _MSC_VER -#pragma region IBasicCVParser -#endif - CSV_INLINE IBasicCSVParser::IBasicCSVParser( - const CSVFormat& format, - const ColNamesPtr& col_names - ) : _col_names(col_names) { - if (format.no_quote) { - _parse_flags = internals::make_parse_flags(format.get_delim()); - } - else { - _parse_flags = internals::make_parse_flags(format.get_delim(), format.quote_char); - } + } +} +/** @file + * Defines an object used to store CSV format settings + */ - _ws_flags = internals::make_ws_flags( - format.trim_chars.data(), format.trim_chars.size() - ); - } +#include +#include - CSV_INLINE void IBasicCSVParser::end_feed() { - using internals::ParseFlags; - bool empty_last_field = this->data_ptr - && this->data_ptr->_data - && !this->data_ptr->data.empty() - && (parse_flag(this->data_ptr->data.back()) == ParseFlags::DELIMITER - || parse_flag(this->data_ptr->data.back()) == ParseFlags::QUOTE); +namespace csv { + CSV_INLINE CSVFormat& CSVFormat::delimiter(char delim) { + this->possible_delimiters = { delim }; + this->assert_no_char_overlap(); + return *this; + } - // Push field - if (this->field_length > 0 || empty_last_field) { - this->push_field(); - } + CSV_INLINE CSVFormat& CSVFormat::delimiter(const std::vector & delim) { + this->possible_delimiters = delim; + this->assert_no_char_overlap(); + return *this; + } - // Push row - if (this->current_row.size() > 0) - this->push_row(); - } + CSV_INLINE CSVFormat& CSVFormat::quote(char quote) { + this->no_quote = false; + this->quote_char = quote; + this->assert_no_char_overlap(); + return *this; + } - CSV_INLINE void IBasicCSVParser::parse_field() noexcept { - using internals::ParseFlags; - auto& in = this->data_ptr->data; + CSV_INLINE CSVFormat& CSVFormat::trim(const std::vector & chars) { + this->trim_chars = chars; + this->assert_no_char_overlap(); + return *this; + } - // Trim off leading whitespace - while (data_pos < in.size() && ws_flag(in[data_pos])) - data_pos++; + CSV_INLINE CSVFormat& CSVFormat::column_names(const std::vector& names) { + this->col_names = names; + this->header = -1; + return *this; + } - if (field_start == UNINITIALIZED_FIELD) - field_start = (int)(data_pos - current_row_start()); + CSV_INLINE CSVFormat& CSVFormat::header_row(int row) { + if (row < 0) this->variable_column_policy = VariableColumnPolicy::KEEP; - // Optimization: Since NOT_SPECIAL characters tend to occur in contiguous - // sequences, use the loop below to avoid having to go through the outer - // switch statement as much as possible - while (data_pos < in.size() && compound_parse_flag(in[data_pos]) == ParseFlags::NOT_SPECIAL) - data_pos++; + this->header = row; + this->col_names = {}; + return *this; + } - field_length = data_pos - (field_start + current_row_start()); + CSV_INLINE void CSVFormat::assert_no_char_overlap() + { + auto delims = std::set( + this->possible_delimiters.begin(), this->possible_delimiters.end()), + trims = std::set( + this->trim_chars.begin(), this->trim_chars.end()); - // Trim off trailing whitespace, this->field_length constraint matters - // when field is entirely whitespace - for (size_t j = data_pos - 1; ws_flag(in[j]) && this->field_length > 0; j--) - this->field_length--; + // Stores intersection of possible delimiters and trim characters + std::vector intersection = {}; + + // Find which characters overlap, if any + std::set_intersection( + delims.begin(), delims.end(), + trims.begin(), trims.end(), + std::back_inserter(intersection)); + + // Make sure quote character is not contained in possible delimiters + // or whitespace characters + if (delims.find(this->quote_char) != delims.end() || + trims.find(this->quote_char) != trims.end()) { + intersection.push_back(this->quote_char); } - CSV_INLINE void IBasicCSVParser::push_field() - { - // Update - if (field_has_double_quote) { - fields->emplace_back( - field_start == UNINITIALIZED_FIELD ? 0 : (unsigned int)field_start, - field_length, - true - ); - field_has_double_quote = false; + if (!intersection.empty()) { + std::string err_msg = "There should be no overlap between the quote character, " + "the set of possible delimiters " + "and the set of whitespace characters. Offending characters: "; - } - else { - fields->emplace_back( - field_start == UNINITIALIZED_FIELD ? 0 : (unsigned int)field_start, - field_length - ); - } + // Create a pretty error message with the list of overlapping + // characters + for (size_t i = 0; i < intersection.size(); i++) { + err_msg += "'"; + err_msg += intersection[i]; + err_msg += "'"; - current_row.row_length++; + if (i + 1 < intersection.size()) + err_msg += ", "; + } - // Reset field state - field_start = UNINITIALIZED_FIELD; - field_length = 0; + throw std::runtime_error(err_msg + '.'); } + } +} +/** @file + * Defines an input iterator for csv::CSVReader + */ - /** @return The number of characters parsed that belong to complete rows */ - CSV_INLINE size_t IBasicCSVParser::parse() - { - using internals::ParseFlags; - this->quote_escape = false; - this->data_pos = 0; - this->current_row_start() = 0; - this->trim_utf8_bom(); +namespace csv { + /** Return an iterator to the first row in the reader */ + CSV_INLINE CSVReader::iterator CSVReader::begin() { + if (this->records->empty()) { + this->read_csv_worker = std::thread(&CSVReader::read_csv, this, internals::ITERATION_CHUNK_SIZE); + this->read_csv_worker.join(); - auto& in = this->data_ptr->data; - while (this->data_pos < in.size()) { - switch (compound_parse_flag(in[this->data_pos])) { - case ParseFlags::DELIMITER: - this->push_field(); - this->data_pos++; - break; + // Still empty => return end iterator + if (this->records->empty()) return this->end(); + } - case ParseFlags::NEWLINE: - this->data_pos++; + this->_n_rows++; + CSVReader::iterator ret(this, this->records->pop_front()); + return ret; + } - // Catches CRLF (or LFLF, CRCRLF, or any other non-sensical combination of newlines) - while (this->data_pos < in.size() && parse_flag(in[this->data_pos]) == ParseFlags::NEWLINE) - this->data_pos++; + /** A placeholder for the imaginary past the end row in a CSV. + * Attempting to deference this will lead to bad things. + */ + CSV_INLINE HEDLEY_CONST CSVReader::iterator CSVReader::end() const noexcept { + return CSVReader::iterator(); + } - // End of record -> Write record - this->push_field(); - this->push_row(); + ///////////////////////// + // CSVReader::iterator // + ///////////////////////// - // Reset - this->current_row = CSVRow(data_ptr, this->data_pos, fields->size()); - break; + CSV_INLINE CSVReader::iterator::iterator(CSVReader* _daddy, CSVRow&& _row) : + daddy(_daddy) { + row = std::move(_row); + } - case ParseFlags::NOT_SPECIAL: - this->parse_field(); - break; + /** Advance the iterator by one row. If this CSVReader has an + * associated file, then the iterator will lazily pull more data from + * that file until the end of file is reached. + * + * @note This iterator does **not** block the thread responsible for parsing CSV. + * + */ + CSV_INLINE CSVReader::iterator& CSVReader::iterator::operator++() { + if (!daddy->read_row(this->row)) { + this->daddy = nullptr; // this == end() + } - case ParseFlags::QUOTE_ESCAPE_QUOTE: - if (data_pos + 1 == in.size()) return this->current_row_start(); - else if (data_pos + 1 < in.size()) { - auto next_ch = parse_flag(in[data_pos + 1]); - if (next_ch >= ParseFlags::DELIMITER) { - quote_escape = false; - data_pos++; - break; - } - else if (next_ch == ParseFlags::QUOTE) { - // Case: Escaped quote - data_pos += 2; - this->field_length += 2; - this->field_has_double_quote = true; - break; - } - } - - // Case: Unescaped single quote => not strictly valid but we'll keep it - this->field_length++; - data_pos++; - - break; - - default: // Quote (currently not quote escaped) - if (this->field_length == 0) { - quote_escape = true; - data_pos++; - if (field_start == UNINITIALIZED_FIELD && data_pos < in.size() && !ws_flag(in[data_pos])) - field_start = (int)(data_pos - current_row_start()); - break; - } + return *this; + } - // Case: Unescaped quote - this->field_length++; - data_pos++; + /** Post-increment iterator */ + CSV_INLINE CSVReader::iterator CSVReader::iterator::operator++(int) { + auto temp = *this; + if (!daddy->read_row(this->row)) { + this->daddy = nullptr; // this == end() + } - break; - } - } + return temp; + } +} - return this->current_row_start(); - } +/** @file + * Implements JSON serialization abilities + */ - CSV_INLINE void IBasicCSVParser::push_row() { - current_row.row_length = fields->size() - current_row.fields_start; - this->_records->push_back(std::move(current_row)); - } - CSV_INLINE void IBasicCSVParser::reset_data_ptr() { - this->data_ptr = std::make_shared(); - this->data_ptr->parse_flags = this->_parse_flags; - this->data_ptr->col_names = this->_col_names; - this->fields = &(this->data_ptr->fields); - } +namespace csv { + /* + The implementations for json_extra_space() and json_escape_string() + were modified from source code for JSON for Modern C++. - CSV_INLINE void IBasicCSVParser::trim_utf8_bom() { - auto& data = this->data_ptr->data; + The respective license is below: - if (!this->unicode_bom_scan && data.size() >= 3) { - if (data[0] == '\xEF' && data[1] == '\xBB' && data[2] == '\xBF') { - this->data_pos += 3; // Remove BOM from input string - this->_utf8_bom = true; - } + The code is licensed under the [MIT + License](http://opensource.org/licenses/MIT): + + Copyright © 2013-2015 Niels Lohmann. + + Permission is hereby granted, free of charge, to any person + obtaining a copy of this software and associated documentation files + (the "Software"), to deal in the Software without restriction, + including without limitation the rights to use, copy, modify, merge, + publish, distribute, sublicense, and/or sell copies of the Software, + and to permit persons to whom the Software is furnished to do so, + subject to the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. + */ - this->unicode_bom_scan = true; - } - } -#ifdef _MSC_VER -#pragma endregion -#endif + namespace internals { + /*! + @brief calculates the extra space to escape a JSON string -#ifdef _MSC_VER -#pragma region Specializations -#endif - CSV_INLINE void MmapParser::next(size_t bytes = ITERATION_CHUNK_SIZE) { - // Reset parser state - this->field_start = UNINITIALIZED_FIELD; - this->field_length = 0; - this->reset_data_ptr(); + @param[in] s the string to escape + @return the number of characters required to escape string @a s - // Create memory map - size_t length = std::min(this->source_size - this->mmap_pos, bytes); - std::error_code error; - this->data_ptr->_data = std::make_shared>(mio::make_mmap_source(this->_filename, this->mmap_pos, length, error)); - this->mmap_pos += length; - if (error) throw error; + @complexity Linear in the length of string @a s. + */ + static std::size_t json_extra_space(csv::string_view& s) noexcept + { + std::size_t result = 0; - auto mmap_ptr = (mio::basic_mmap_source*)(this->data_ptr->_data.get()); - // Create string view - this->data_ptr->data = csv::string_view(mmap_ptr->data(), mmap_ptr->length()); + for (const auto& c : s) + { + switch (c) + { + case '"': + case '\\': + case '\b': + case '\f': + case '\n': + case '\r': + case '\t': + { + // from c (1 byte) to \x (2 bytes) + result += 1; + break; + } - // Parse - this->current_row = CSVRow(this->data_ptr); - size_t remainder = this->parse(); - if (this->mmap_pos == this->source_size || no_chunk()) { - this->_eof = true; - this->end_feed(); + default: + { + if (c >= 0x00 && c <= 0x1f) + { + // from c (1 byte) to \uxxxx (6 bytes) + result += 5; + } + break; + } + } } - this->mmap_pos -= (length - remainder); - } -#ifdef _MSC_VER -#pragma endregion -#endif - } -} - -namespace csv { - namespace internals { - CSV_INLINE std::vector ColNames::get_col_names() const { - return this->col_names; + return result; } - CSV_INLINE void ColNames::set_col_names(const std::vector& cnames) { - this->col_names = cnames; - - for (size_t i = 0; i < cnames.size(); i++) { - this->col_pos[cnames[i]] = i; + CSV_INLINE std::string json_escape_string(csv::string_view s) noexcept + { + const auto space = json_extra_space(s); + if (space == 0) + { + return std::string(s); } - } - CSV_INLINE int ColNames::index_of(csv::string_view col_name) const { - auto pos = this->col_pos.find(col_name.data()); - if (pos != this->col_pos.end()) - return (int)pos->second; + // create a result string of necessary size + size_t result_size = s.size() + space; + std::string result(result_size, '\\'); + std::size_t pos = 0; - return CSV_NOT_FOUND; - } + for (const auto& c : s) + { + switch (c) + { + // quotation mark (0x22) + case '"': + { + result[pos + 1] = '"'; + pos += 2; + break; + } - CSV_INLINE size_t ColNames::size() const noexcept { - return this->col_names.size(); - } - } -} -/** @file - * Defines an object used to store CSV format settings - */ + // reverse solidus (0x5c) + case '\\': + { + // nothing to change + pos += 2; + break; + } -#include -#include + // backspace (0x08) + case '\b': + { + result[pos + 1] = 'b'; + pos += 2; + break; + } -namespace csv { - CSV_INLINE CSVFormat& CSVFormat::delimiter(char delim) { - this->possible_delimiters = { delim }; - this->assert_no_char_overlap(); - return *this; - } - CSV_INLINE CSVFormat& CSVFormat::delimiter(const std::vector & delim) { - this->possible_delimiters = delim; - this->assert_no_char_overlap(); - return *this; - } + // formfeed (0x0c) + case '\f': + { + result[pos + 1] = 'f'; + pos += 2; + break; + } - CSV_INLINE CSVFormat& CSVFormat::quote(char quote) { - this->no_quote = false; - this->quote_char = quote; - this->assert_no_char_overlap(); - return *this; - } - CSV_INLINE CSVFormat& CSVFormat::trim(const std::vector & chars) { - this->trim_chars = chars; - this->assert_no_char_overlap(); - return *this; - } + // newline (0x0a) + case '\n': + { + result[pos + 1] = 'n'; + pos += 2; + break; + } - CSV_INLINE CSVFormat& CSVFormat::column_names(const std::vector& names) { - this->col_names = names; - this->header = -1; - return *this; - } - CSV_INLINE CSVFormat& CSVFormat::header_row(int row) { - if (row < 0) this->variable_column_policy = VariableColumnPolicy::KEEP; + // carriage return (0x0d) + case '\r': + { + result[pos + 1] = 'r'; + pos += 2; + break; + } - this->header = row; - this->col_names = {}; - return *this; + + // horizontal tab (0x09) + case '\t': + { + result[pos + 1] = 't'; + pos += 2; + break; + } + + + default: + { + if (c >= 0x00 && c <= 0x1f) + { + // print character c as \uxxxx + snprintf(&result[pos + 1], result_size - pos - 1, "u%04x", int(c)); + pos += 6; + // overwrite trailing null character + result[pos] = '\\'; + } + else + { + // all other characters are added as-is + result[pos++] = c; + } + break; + } + } + } + + return result; + } } - CSV_INLINE void CSVFormat::assert_no_char_overlap() - { - auto delims = std::set( - this->possible_delimiters.begin(), this->possible_delimiters.end()), - trims = std::set( - this->trim_chars.begin(), this->trim_chars.end()); + /** Convert a CSV row to a JSON object, i.e. + * `{"col1":"value1","col2":"value2"}` + * + * @note All strings are properly escaped. Numeric values are not quoted. + * @param[in] subset A subset of columns to contain in the JSON. + * Leave empty for original columns. + */ + CSV_INLINE std::string CSVRow::to_json(const std::vector& subset) const { + std::vector col_names = subset; + if (subset.empty()) { + col_names = this->data ? this->get_col_names() : std::vector({}); + } - // Stores intersection of possible delimiters and trim characters - std::vector intersection = {}; + const size_t _n_cols = col_names.size(); + std::string ret = "{"; + + for (size_t i = 0; i < _n_cols; i++) { + auto& col = col_names[i]; + auto field = this->operator[](col); - // Find which characters overlap, if any - std::set_intersection( - delims.begin(), delims.end(), - trims.begin(), trims.end(), - std::back_inserter(intersection)); + // TODO: Possible performance enhancements by caching escaped column names + ret += '"' + internals::json_escape_string(col) + "\":"; - // Make sure quote character is not contained in possible delimiters - // or whitespace characters - if (delims.find(this->quote_char) != delims.end() || - trims.find(this->quote_char) != trims.end()) { - intersection.push_back(this->quote_char); + // Add quotes around strings but not numbers + if (field.is_num()) + ret += internals::json_escape_string(field.get()); + else + ret += '"' + internals::json_escape_string(field.get()) + '"'; + + // Do not add comma after last string + if (i + 1 < _n_cols) + ret += ','; } - if (!intersection.empty()) { - std::string err_msg = "There should be no overlap between the quote character, " - "the set of possible delimiters " - "and the set of whitespace characters. Offending characters: "; + ret += '}'; + return ret; + } - // Create a pretty error message with the list of overlapping - // characters - for (size_t i = 0; i < intersection.size(); i++) { - err_msg += "'"; - err_msg += intersection[i]; - err_msg += "'"; + /** Convert a CSV row to a JSON array, i.e. + * `["value1","value2",...]` + * + * @note All strings are properly escaped. Numeric values are not quoted. + * @param[in] subset A subset of columns to contain in the JSON. + * Leave empty for all columns. + */ + CSV_INLINE std::string CSVRow::to_json_array(const std::vector& subset) const { + std::vector col_names = subset; + if (subset.empty()) + col_names = this->data ? this->get_col_names() : std::vector({}); - if (i + 1 < intersection.size()) - err_msg += ", "; - } + const size_t _n_cols = col_names.size(); + std::string ret = "["; - throw std::runtime_error(err_msg + '.'); + for (size_t i = 0; i < _n_cols; i++) { + auto field = this->operator[](col_names[i]); + + // Add quotes around strings but not numbers + if (field.is_num()) + ret += internals::json_escape_string(field.get()); + else + ret += '"' + internals::json_escape_string(field.get()) + '"'; + + // Do not add comma after last string + if (i + 1 < _n_cols) + ret += ','; } + + ret += ']'; + return ret; } } + /** @file - * @brief Defines functionality needed for basic CSV parsing + * Calculates statistics from CSV files */ +#include namespace csv { - namespace internals { - CSV_INLINE std::string format_row(const std::vector& row, csv::string_view delim) { - /** Print a CSV row */ - std::stringstream ret; - for (size_t i = 0; i < row.size(); i++) { - ret << row[i]; - if (i + 1 < row.size()) ret << delim; - else ret << '\n'; - } - ret.flush(); + /** Calculate statistics for an arbitrarily large file. When this constructor + * is called, CSVStat will process the entire file iteratively. Once finished, + * methods like get_mean(), get_counts(), etc... can be used to retrieve statistics. + */ + CSV_INLINE CSVStat::CSVStat(csv::string_view filename, CSVFormat format) : + reader(filename, format) { + this->calc(); + } - return ret.str(); - } + /** Calculate statistics for a CSV stored in a std::stringstream */ + CSV_INLINE CSVStat::CSVStat(std::stringstream& stream, CSVFormat format) : + reader(stream, format) { + this->calc(); + } - /** Return a CSV's column names - * - * @param[in] filename Path to CSV file - * @param[in] format Format of the CSV file - * - */ - CSV_INLINE std::vector _get_col_names(csv::string_view head, CSVFormat format) { - // Parse the CSV - auto trim_chars = format.get_trim_chars(); - std::stringstream source(head.data()); - RowCollection rows; + /** Return current means */ + CSV_INLINE std::vector CSVStat::get_mean() const { + std::vector ret; + for (size_t i = 0; i < this->get_col_names().size(); i++) { + ret.push_back(this->rolling_means[i]); + } + return ret; + } - StreamParser parser(source, format); - parser.set_output(rows); - parser.next(); + /** Return current variances */ + CSV_INLINE std::vector CSVStat::get_variance() const { + std::vector ret; + for (size_t i = 0; i < this->get_col_names().size(); i++) { + ret.push_back(this->rolling_vars[i]/(this->n[i] - 1)); + } + return ret; + } - return CSVRow(std::move(rows[format.get_header()])); + /** Return current mins */ + CSV_INLINE std::vector CSVStat::get_mins() const { + std::vector ret; + for (size_t i = 0; i < this->get_col_names().size(); i++) { + ret.push_back(this->mins[i]); } + return ret; + } - CSV_INLINE GuessScore calculate_score(csv::string_view head, const CSVFormat& format) { - // Frequency counter of row length - std::unordered_map row_tally = { { 0, 0 } }; + /** Return current maxes */ + CSV_INLINE std::vector CSVStat::get_maxes() const { + std::vector ret; + for (size_t i = 0; i < this->get_col_names().size(); i++) { + ret.push_back(this->maxes[i]); + } + return ret; + } - // Map row lengths to row num where they first occurred - std::unordered_map row_when = { { 0, 0 } }; + /** Get counts for each column */ + CSV_INLINE std::vector CSVStat::get_counts() const { + std::vector ret; + for (size_t i = 0; i < this->get_col_names().size(); i++) { + ret.push_back(this->counts[i]); + } + return ret; + } - // Parse the CSV - std::stringstream source(head.data()); - RowCollection rows; + /** Get data type counts for each column */ + CSV_INLINE std::vector CSVStat::get_dtypes() const { + std::vector ret; + for (size_t i = 0; i < this->get_col_names().size(); i++) { + ret.push_back(this->dtypes[i]); + } + return ret; + } - StreamParser parser(source, format); - parser.set_output(rows); - parser.next(); - - for (size_t i = 0; i < rows.size(); i++) { - auto& row = rows[i]; - - // Ignore zero-length rows - if (row.size() > 0) { - if (row_tally.find(row.size()) != row_tally.end()) { - row_tally[row.size()]++; - } - else { - row_tally[row.size()] = 1; - row_when[row.size()] = i; - } - } + CSV_INLINE void CSVStat::calc_chunk() { + /** Only create stats counters the first time **/ + if (dtypes.empty()) { + /** Go through all records and calculate specified statistics */ + for (size_t i = 0; i < this->get_col_names().size(); i++) { + dtypes.push_back({}); + counts.push_back({}); + rolling_means.push_back(0); + rolling_vars.push_back(0); + mins.push_back(NAN); + maxes.push_back(NAN); + n.push_back(0); } + } - double final_score = 0; - size_t header_row = 0; - - // Final score is equal to the largest - // row size times rows of that size - for (auto& pair : row_tally) { - auto row_size = pair.first; - auto row_count = pair.second; - double score = (double)(row_size * row_count); - if (score > final_score) { - final_score = score; - header_row = row_when[row_size]; - } - } + // Start threads + std::vector pool; + for (size_t i = 0; i < this->get_col_names().size(); i++) + pool.push_back(std::thread(&CSVStat::calc_worker, this, i)); - return { - final_score, - header_row - }; - } + // Block until done + for (auto& th : pool) + th.join(); - /** Guess the delimiter used by a delimiter-separated values file */ - CSV_INLINE CSVGuessResult _guess_format(csv::string_view head, const std::vector& delims) { - /** For each delimiter, find out which row length was most common. - * The delimiter with the longest mode row length wins. - * Then, the line number of the header row is the first row with - * the mode row length. - */ + this->records.clear(); + } - CSVFormat format; - size_t max_score = 0, - header = 0; - char current_delim = delims[0]; + CSV_INLINE void CSVStat::calc() { + constexpr size_t CALC_CHUNK_SIZE = 5000; - for (char cand_delim : delims) { - auto result = calculate_score(head, format.delimiter(cand_delim)); + for (auto& row : reader) { + this->records.push_back(std::move(row)); - if ((size_t)result.score > max_score) { - max_score = (size_t)result.score; - current_delim = cand_delim; - header = result.header; - } + /** Chunk rows */ + if (this->records.size() == CALC_CHUNK_SIZE) { + calc_chunk(); } + } - return { current_delim, (int)header }; + if (!this->records.empty()) { + calc_chunk(); } } - /** Return a CSV's column names - * - * @param[in] filename Path to CSV file - * @param[in] format Format of the CSV file - * - */ - CSV_INLINE std::vector get_col_names(csv::string_view filename, CSVFormat format) { - auto head = internals::get_csv_head(filename); + CSV_INLINE void CSVStat::calc_worker(const size_t &i) { + /** Worker thread for CSVStat::calc() which calculates statistics for one column. + * + * @param[in] i Column index + */ - /** Guess delimiter and header row */ - if (format.guess_delim()) { - auto guess_result = guess_format(filename, format.get_possible_delims()); - format.delimiter(guess_result.delim).header_row(guess_result.header_row); - } + auto current_record = this->records.begin(); - return internals::_get_col_names(head, format); - } + for (size_t processed = 0; current_record != this->records.end(); processed++) { + if (current_record->size() == this->get_col_names().size()) { + auto current_field = (*current_record)[i]; - /** Guess the delimiter used by a delimiter-separated values file */ - CSV_INLINE CSVGuessResult guess_format(csv::string_view filename, const std::vector& delims) { - auto head = internals::get_csv_head(filename); - return internals::_guess_format(head, delims); - } + // Optimization: Don't count() if there's too many distinct values in the first 1000 rows + if (processed < 1000 || this->counts[i].size() <= 500) + this->count(current_field, i); - /** Reads an arbitrarily large CSV file using memory-mapped IO. - * - * **Details:** Reads the first block of a CSV file synchronously to get information - * such as column names and delimiting character. - * - * @param[in] filename Path to CSV file - * @param[in] format Format of the CSV file - * - * \snippet tests/test_read_csv.cpp CSVField Example - * - */ - CSV_INLINE CSVReader::CSVReader(csv::string_view filename, CSVFormat format) : _format(format) { - auto head = internals::get_csv_head(filename); - using Parser = internals::MmapParser; + this->dtype(current_field, i); - /** Guess delimiter and header row */ - if (format.guess_delim()) { - auto guess_result = internals::_guess_format(head, format.possible_delimiters); - format.delimiter(guess_result.delim); - format.header = guess_result.header_row; - this->_format = format; - } + // Numeric Stuff + if (current_field.is_num()) { + long double x_n = current_field.get(); - if (!format.col_names.empty()) - this->set_col_names(format.col_names); + // This actually calculates mean AND variance + this->variance(x_n, i); + this->min_max(x_n, i); + } + } + else if (this->reader.get_format().get_variable_column_policy() == VariableColumnPolicy::THROW) { + throw std::runtime_error("Line has different length than the others " + internals::format_row(*current_record)); + } - this->parser = std::unique_ptr(new Parser(filename, format, this->col_names)); // For C++11 - this->initial_read(); + ++current_record; + } } - /** Return the format of the original raw CSV */ - CSV_INLINE CSVFormat CSVReader::get_format() const { - CSVFormat new_format = this->_format; + CSV_INLINE void CSVStat::dtype(CSVField& data, const size_t &i) { + /** Given a record update the type counter + * @param[in] record Data observation + * @param[out] i The column index that should be updated + */ + + auto type = data.type(); + if (this->dtypes[i].find(type) != + this->dtypes[i].end()) { + // Increment count + this->dtypes[i][type]++; + } else { + // Initialize count + this->dtypes[i].insert(std::make_pair(type, 1)); + } + } - // Since users are normally not allowed to set - // column names and header row simulatenously, - // we will set the backing variables directly here - new_format.col_names = this->col_names->get_col_names(); - new_format.header = this->_format.header; + CSV_INLINE void CSVStat::count(CSVField& data, const size_t &i) { + /** Given a record update the frequency counter + * @param[in] record Data observation + * @param[out] i The column index that should be updated + */ - return new_format; - } + auto item = data.get(); - /** Return the CSV's column names as a vector of strings. */ - CSV_INLINE std::vector CSVReader::get_col_names() const { - if (this->col_names) { - return this->col_names->get_col_names(); + if (this->counts[i].find(item) != + this->counts[i].end()) { + // Increment count + this->counts[i][item]++; + } else { + // Initialize count + this->counts[i].insert(std::make_pair(item, 1)); } - - return std::vector(); } - /** Return the index of the column name if found or - * csv::CSV_NOT_FOUND otherwise. - */ - CSV_INLINE int CSVReader::index_of(csv::string_view col_name) const { - auto _col_names = this->get_col_names(); - for (size_t i = 0; i < _col_names.size(); i++) - if (_col_names[i] == col_name) return (int)i; - - return CSV_NOT_FOUND; + CSV_INLINE void CSVStat::min_max(const long double &x_n, const size_t &i) { + /** Update current minimum and maximum + * @param[in] x_n Data observation + * @param[out] i The column index that should be updated + */ + if (std::isnan(this->mins[i])) + this->mins[i] = x_n; + if (std::isnan(this->maxes[i])) + this->maxes[i] = x_n; + + if (x_n < this->mins[i]) + this->mins[i] = x_n; + else if (x_n > this->maxes[i]) + this->maxes[i] = x_n; } - CSV_INLINE void CSVReader::trim_header() { - if (!this->header_trimmed) { - for (int i = 0; i <= this->_format.header && !this->records->empty(); i++) { - if (i == this->_format.header && this->col_names->empty()) { - this->set_col_names(this->records->pop_front()); - } - else { - this->records->pop_front(); - } - } + CSV_INLINE void CSVStat::variance(const long double &x_n, const size_t &i) { + /** Given a record update rolling mean and variance for all columns + * using Welford's Algorithm + * @param[in] x_n Data observation + * @param[out] i The column index that should be updated + */ + long double& current_rolling_mean = this->rolling_means[i]; + long double& current_rolling_var = this->rolling_vars[i]; + long double& current_n = this->n[i]; + long double delta; + long double delta2; - this->header_trimmed = true; + current_n++; + + if (current_n == 1) { + current_rolling_mean = x_n; + } else { + delta = x_n - current_rolling_mean; + current_rolling_mean += delta/current_n; + delta2 = x_n - current_rolling_mean; + current_rolling_var += delta*delta2; } } - /** - * @param[in] names Column names - */ - CSV_INLINE void CSVReader::set_col_names(const std::vector& names) - { - this->col_names->set_col_names(names); - this->n_cols = names.size(); - } - - /** - * Read a chunk of CSV data. - * - * @note This method is meant to be run on its own thread. Only one `read_csv()` thread - * should be active at a time. - * - * @param[in] bytes Number of bytes to read. - * - * @see CSVReader::read_csv_worker - * @see CSVReader::read_row() - */ - CSV_INLINE bool CSVReader::read_csv(size_t bytes) { - // Tell read_row() to listen for CSV rows - this->records->notify_all(); - - this->parser->set_output(*this->records); - this->parser->next(bytes); - - if (!this->header_trimmed) { - this->trim_header(); - } - - // Tell read_row() to stop waiting - this->records->kill_all(); - - return true; - } - - /** - * Retrieve rows as CSVRow objects, returning true if more rows are available. - * - * @par Performance Notes - * - Reads chunks of data that are csv::internals::ITERATION_CHUNK_SIZE bytes large at a time - * - For performance details, read the documentation for CSVRow and CSVField. - * - * @param[out] row The variable where the parsed row will be stored - * @see CSVRow, CSVField + /** Useful for uploading CSV files to SQL databases. * - * **Example:** - * \snippet tests/test_read_csv.cpp CSVField Example + * Return a data type for each column such that every value in a column can be + * converted to the corresponding data type without data loss. + * @param[in] filename The CSV file * + * \return A mapping of column names to csv::DataType enums */ - CSV_INLINE bool CSVReader::read_row(CSVRow &row) { - while (true) { - if (this->records->empty()) { - if (this->records->is_waitable()) - // Reading thread is currently active => wait for it to populate records - this->records->wait(); - else if (this->parser->eof()) - // End of file and no more records - return false; - else { - // Reading thread is not active => start another one - if (this->read_csv_worker.joinable()) - this->read_csv_worker.join(); + CSV_INLINE std::unordered_map csv_data_types(const std::string& filename) { + CSVStat stat(filename); + std::unordered_map csv_dtypes; - this->read_csv_worker = std::thread(&CSVReader::read_csv, this, internals::ITERATION_CHUNK_SIZE); - } - } - else if (this->records->front().size() != this->n_cols && - this->_format.variable_column_policy != VariableColumnPolicy::KEEP) { - auto errored_row = this->records->pop_front(); + auto col_names = stat.get_col_names(); + auto temp = stat.get_dtypes(); - if (this->_format.variable_column_policy == VariableColumnPolicy::THROW) { - if (errored_row.size() < this->n_cols) - throw std::runtime_error("Line too short " + internals::format_row(errored_row)); + for (size_t i = 0; i < stat.get_col_names().size(); i++) { + auto& col = temp[i]; + auto& col_name = col_names[i]; - throw std::runtime_error("Line too long " + internals::format_row(errored_row)); - } - } - else { - row = this->records->pop_front(); - this->_n_rows++; - return true; - } + if (col[DataType::CSV_STRING]) + csv_dtypes[col_name] = DataType::CSV_STRING; + else if (col[DataType::CSV_INT64]) + csv_dtypes[col_name] = DataType::CSV_INT64; + else if (col[DataType::CSV_INT32]) + csv_dtypes[col_name] = DataType::CSV_INT32; + else if (col[DataType::CSV_INT16]) + csv_dtypes[col_name] = DataType::CSV_INT16; + else if (col[DataType::CSV_INT8]) + csv_dtypes[col_name] = DataType::CSV_INT8; + else + csv_dtypes[col_name] = DataType::CSV_DOUBLE; } - return false; + return csv_dtypes; } } - -/** @file - * Defines an input iterator for csv::CSVReader - */ +#include +#include namespace csv { - /** Return an iterator to the first row in the reader */ - CSV_INLINE CSVReader::iterator CSVReader::begin() { - if (this->records->empty()) { - this->read_csv_worker = std::thread(&CSVReader::read_csv, this, internals::ITERATION_CHUNK_SIZE); - this->read_csv_worker.join(); + /** Shorthand function for parsing an in-memory CSV string + * + * @return A collection of CSVRow objects + * + * @par Example + * @snippet tests/test_read_csv.cpp Parse Example + */ + CSV_INLINE CSVReader parse(csv::string_view in, CSVFormat format) { + std::stringstream stream(in.data()); + return CSVReader(stream, format); + } - // Still empty => return end iterator - if (this->records->empty()) return this->end(); - } + /** Parses a CSV string with no headers + * + * @return A collection of CSVRow objects + */ + CSV_INLINE CSVReader parse_no_header(csv::string_view in) { + CSVFormat format; + format.header_row(-1); - this->_n_rows++; - CSVReader::iterator ret(this, this->records->pop_front()); - return ret; + return parse(in, format); } - /** A placeholder for the imaginary past the end row in a CSV. - * Attempting to deference this will lead to bad things. + /** Parse a RFC 4180 CSV string, returning a collection + * of CSVRow objects + * + * @par Example + * @snippet tests/test_read_csv.cpp Escaped Comma + * */ - CSV_INLINE HEDLEY_CONST CSVReader::iterator CSVReader::end() const noexcept { - return CSVReader::iterator(); + CSV_INLINE CSVReader operator ""_csv(const char* in, size_t n) { + return parse(csv::string_view(in, n)); } - ///////////////////////// - // CSVReader::iterator // - ///////////////////////// - - CSV_INLINE CSVReader::iterator::iterator(CSVReader* _daddy, CSVRow&& _row) : - daddy(_daddy) { - row = std::move(_row); + /** A shorthand for csv::parse_no_header() */ + CSV_INLINE CSVReader operator ""_csv_no_header(const char* in, size_t n) { + return parse_no_header(csv::string_view(in, n)); } - /** Advance the iterator by one row. If this CSVReader has an - * associated file, then the iterator will lazily pull more data from - * that file until the end of file is reached. - * - * @note This iterator does **not** block the thread responsible for parsing CSV. + /** + * Find the position of a column in a CSV file or CSV_NOT_FOUND otherwise * + * @param[in] filename Path to CSV file + * @param[in] col_name Column whose position we should resolve + * @param[in] format Format of the CSV file */ - CSV_INLINE CSVReader::iterator& CSVReader::iterator::operator++() { - if (!daddy->read_row(this->row)) { - this->daddy = nullptr; // this == end() - } - - return *this; + CSV_INLINE int get_col_pos( + csv::string_view filename, + csv::string_view col_name, + const CSVFormat& format) { + CSVReader reader(filename, format); + return reader.index_of(col_name); } - /** Post-increment iterator */ - CSV_INLINE CSVReader::iterator CSVReader::iterator::operator++(int) { - auto temp = *this; - if (!daddy->read_row(this->row)) { - this->daddy = nullptr; // this == end() - } + /** Get basic information about a CSV file + * @include programs/csv_info.cpp + */ + CSV_INLINE CSVFileInfo get_file_info(const std::string& filename) { + CSVReader reader(filename); + CSVFormat format = reader.get_format(); + for (auto it = reader.begin(); it != reader.end(); ++it); - return temp; + CSVFileInfo info = { + filename, + reader.get_col_names(), + format.get_delim(), + reader.n_rows(), + reader.get_col_names().size() + }; + + return info; } } - /** @file - * Defines the data type used for storing information about a CSV row + * @brief Defines functionality needed for basic CSV parsing */ -#include -#include namespace csv { namespace internals { - CSV_INLINE RawCSVField& CSVFieldList::operator[](size_t n) const { - const size_t page_no = n / _single_buffer_capacity; - const size_t buffer_idx = (page_no < 1) ? n : n % _single_buffer_capacity; - return this->buffers[page_no][buffer_idx]; + CSV_INLINE std::string format_row(const std::vector& row, csv::string_view delim) { + /** Print a CSV row */ + std::stringstream ret; + for (size_t i = 0; i < row.size(); i++) { + ret << row[i]; + if (i + 1 < row.size()) ret << delim; + else ret << '\n'; + } + ret.flush(); + + return ret.str(); } - CSV_INLINE void CSVFieldList::allocate() { - buffers.push_back(std::unique_ptr(new RawCSVField[_single_buffer_capacity])); + /** Return a CSV's column names + * + * @param[in] filename Path to CSV file + * @param[in] format Format of the CSV file + * + */ + CSV_INLINE std::vector _get_col_names(csv::string_view head, CSVFormat format) { + // Parse the CSV + auto trim_chars = format.get_trim_chars(); + std::stringstream source(head.data()); + RowCollection rows; - _current_buffer_size = 0; - _back = buffers.back().get(); + StreamParser parser(source, format); + parser.set_output(rows); + parser.next(); + + return CSVRow(std::move(rows[format.get_header()])); } - } - /** Return a CSVField object corrsponding to the nth value in the row. - * - * @note This method performs bounds checking, and will throw an - * `std::runtime_error` if n is invalid. - * - * @complexity - * Constant, by calling csv::CSVRow::get_csv::string_view() - * - */ - CSV_INLINE CSVField CSVRow::operator[](size_t n) const { - return CSVField(this->get_field(n)); - } - - /** Retrieve a value by its associated column name. If the column - * specified can't be round, a runtime error is thrown. - * - * @complexity - * Constant. This calls the other CSVRow::operator[]() after - * converting column names into indices using a hash table. - * - * @param[in] col_name The column to look for - */ - CSV_INLINE CSVField CSVRow::operator[](const std::string& col_name) const { - auto & col_names = this->data->col_names; - auto col_pos = col_names->index_of(col_name); - if (col_pos > -1) { - return this->operator[](col_pos); - } - - throw std::runtime_error("Can't find a column named " + col_name); - } - - CSV_INLINE CSVRow::operator std::vector() const { - std::vector ret; - for (size_t i = 0; i < size(); i++) - ret.push_back(std::string(this->get_field(i))); + CSV_INLINE GuessScore calculate_score(csv::string_view head, const CSVFormat& format) { + // Frequency counter of row length + std::unordered_map row_tally = { { 0, 0 } }; - return ret; - } + // Map row lengths to row num where they first occurred + std::unordered_map row_when = { { 0, 0 } }; - CSV_INLINE csv::string_view CSVRow::get_field(size_t index) const - { - using internals::ParseFlags; + // Parse the CSV + std::stringstream source(head.data()); + RowCollection rows; - if (index >= this->size()) - throw std::runtime_error("Index out of bounds."); + StreamParser parser(source, format); + parser.set_output(rows); + parser.next(); - const size_t field_index = this->fields_start + index; - auto& field = this->data->fields[field_index]; - auto field_str = csv::string_view(this->data->data).substr(this->data_start + field.start); + for (size_t i = 0; i < rows.size(); i++) { + auto& row = rows[i]; - if (field.has_double_quote) { - auto& value = this->data->double_quote_fields[field_index]; - if (value.empty()) { - bool prev_ch_quote = false; - for (size_t i = 0; i < field.length; i++) { - if (this->data->parse_flags[field_str[i] + 128] == ParseFlags::QUOTE) { - if (prev_ch_quote) { - prev_ch_quote = false; - continue; - } - else { - prev_ch_quote = true; - } + // Ignore zero-length rows + if (row.size() > 0) { + if (row_tally.find(row.size()) != row_tally.end()) { + row_tally[row.size()]++; + } + else { + row_tally[row.size()] = 1; + row_when[row.size()] = i; } - - value += field_str[i]; } } - return csv::string_view(value); - } - - return field_str.substr(0, field.length); - } + double final_score = 0; + size_t header_row = 0; - CSV_INLINE bool CSVField::try_parse_hex(int& parsedValue) { - size_t start = 0, end = 0; + // Final score is equal to the largest + // row size times rows of that size + for (auto& pair : row_tally) { + auto row_size = pair.first; + auto row_count = pair.second; + double score = (double)(row_size * row_count); + if (score > final_score) { + final_score = score; + header_row = row_when[row_size]; + } + } - // Trim out whitespace chars - for (; start < this->sv.size() && this->sv[start] == ' '; start++); - for (end = start; end < this->sv.size() && this->sv[end] != ' '; end++); - - int value_ = 0; + return { + final_score, + header_row + }; + } - size_t digits = (end - start); - size_t base16_exponent = digits - 1; + /** Guess the delimiter used by a delimiter-separated values file */ + CSV_INLINE CSVGuessResult _guess_format(csv::string_view head, const std::vector& delims) { + /** For each delimiter, find out which row length was most common. + * The delimiter with the longest mode row length wins. + * Then, the line number of the header row is the first row with + * the mode row length. + */ - if (digits == 0) return false; + CSVFormat format; + size_t max_score = 0, + header = 0; + char current_delim = delims[0]; - for (const auto& ch : this->sv.substr(start, digits)) { - int digit = 0; + for (char cand_delim : delims) { + auto result = calculate_score(head, format.delimiter(cand_delim)); - switch (ch) { - case '0': - case '1': - case '2': - case '3': - case '4': - case '5': - case '6': - case '7': - case '8': - case '9': - digit = static_cast(ch - '0'); - break; - case 'a': - case 'A': - digit = 10; - break; - case 'b': - case 'B': - digit = 11; - break; - case 'c': - case 'C': - digit = 12; - break; - case 'd': - case 'D': - digit = 13; - break; - case 'e': - case 'E': - digit = 14; - break; - case 'f': - case 'F': - digit = 15; - break; - default: - return false; + if ((size_t)result.score > max_score) { + max_score = (size_t)result.score; + current_delim = cand_delim; + header = result.header; + } } - value_ += digit * (int)pow(16, (double)base16_exponent); - base16_exponent--; + return { current_delim, (int)header }; } - - parsedValue = value_; - return true; } - CSV_INLINE bool CSVField::try_parse_decimal(long double& dVal, const char decimalSymbol) { - // If field has already been parsed to empty, no need to do it aagin: - if (this->_type == DataType::CSV_NULL) - return false; - - // Not yet parsed or possibly parsed with other decimalSymbol - if (this->_type == DataType::UNKNOWN || this->_type == DataType::CSV_STRING || this->_type == DataType::CSV_DOUBLE) - this->_type = internals::data_type(this->sv, &this->value, decimalSymbol); // parse again - - // Integral types are not affected by decimalSymbol and need not be parsed again + /** Return a CSV's column names + * + * @param[in] filename Path to CSV file + * @param[in] format Format of the CSV file + * + */ + CSV_INLINE std::vector get_col_names(csv::string_view filename, CSVFormat format) { + auto head = internals::get_csv_head(filename); - // Either we already had an integral type before, or we we just got any numeric type now. - if (this->_type >= DataType::CSV_INT8 && this->_type <= DataType::CSV_DOUBLE) { - dVal = this->value; - return true; + /** Guess delimiter and header row */ + if (format.guess_delim()) { + auto guess_result = guess_format(filename, format.get_possible_delims()); + format.delimiter(guess_result.delim).header_row(guess_result.header_row); } - // CSV_NULL or CSV_STRING, not numeric - return false; + return internals::_get_col_names(head, format); } -#ifdef _MSC_VER -#pragma region CSVRow Iterator -#endif - /** Return an iterator pointing to the first field. */ - CSV_INLINE CSVRow::iterator CSVRow::begin() const { - return CSVRow::iterator(this, 0); + /** Guess the delimiter used by a delimiter-separated values file */ + CSV_INLINE CSVGuessResult guess_format(csv::string_view filename, const std::vector& delims) { + auto head = internals::get_csv_head(filename); + return internals::_guess_format(head, delims); } - /** Return an iterator pointing to just after the end of the CSVRow. + /** Reads an arbitrarily large CSV file using memory-mapped IO. + * + * **Details:** Reads the first block of a CSV file synchronously to get information + * such as column names and delimiting character. + * + * @param[in] filename Path to CSV file + * @param[in] format Format of the CSV file + * + * \snippet tests/test_read_csv.cpp CSVField Example * - * @warning Attempting to dereference the end iterator results - * in dereferencing a null pointer. */ - CSV_INLINE CSVRow::iterator CSVRow::end() const noexcept { - return CSVRow::iterator(this, (int)this->size()); - } + CSV_INLINE CSVReader::CSVReader(csv::string_view filename, CSVFormat format) : _format(format) { + auto head = internals::get_csv_head(filename); + using Parser = internals::MmapParser; - CSV_INLINE CSVRow::reverse_iterator CSVRow::rbegin() const noexcept { - return std::reverse_iterator(this->end()); - } + /** Guess delimiter and header row */ + if (format.guess_delim()) { + auto guess_result = internals::_guess_format(head, format.possible_delimiters); + format.delimiter(guess_result.delim); + format.header = guess_result.header_row; + this->_format = format; + } - CSV_INLINE CSVRow::reverse_iterator CSVRow::rend() const { - return std::reverse_iterator(this->begin()); - } + if (!format.col_names.empty()) + this->set_col_names(format.col_names); - CSV_INLINE HEDLEY_NON_NULL(2) - CSVRow::iterator::iterator(const CSVRow* _reader, int _i) - : daddy(_reader), i(_i) { - if (_i < (int)this->daddy->size()) - this->field = std::make_shared( - this->daddy->operator[](_i)); - else - this->field = nullptr; + this->parser = std::unique_ptr(new Parser(filename, format, this->col_names)); // For C++11 + this->initial_read(); } - CSV_INLINE CSVRow::iterator::reference CSVRow::iterator::operator*() const { - return *(this->field.get()); - } + /** Return the format of the original raw CSV */ + CSV_INLINE CSVFormat CSVReader::get_format() const { + CSVFormat new_format = this->_format; - CSV_INLINE CSVRow::iterator::pointer CSVRow::iterator::operator->() const { - return this->field; - } + // Since users are normally not allowed to set + // column names and header row simulatenously, + // we will set the backing variables directly here + new_format.col_names = this->col_names->get_col_names(); + new_format.header = this->_format.header; - CSV_INLINE CSVRow::iterator& CSVRow::iterator::operator++() { - // Pre-increment operator - this->i++; - if (this->i < (int)this->daddy->size()) - this->field = std::make_shared( - this->daddy->operator[](i)); - else // Reached the end of row - this->field = nullptr; - return *this; + return new_format; } - CSV_INLINE CSVRow::iterator CSVRow::iterator::operator++(int) { - // Post-increment operator - auto temp = *this; - this->operator++(); - return temp; - } + /** Return the CSV's column names as a vector of strings. */ + CSV_INLINE std::vector CSVReader::get_col_names() const { + if (this->col_names) { + return this->col_names->get_col_names(); + } - CSV_INLINE CSVRow::iterator& CSVRow::iterator::operator--() { - // Pre-decrement operator - this->i--; - this->field = std::make_shared( - this->daddy->operator[](this->i)); - return *this; + return std::vector(); } - CSV_INLINE CSVRow::iterator CSVRow::iterator::operator--(int) { - // Post-decrement operator - auto temp = *this; - this->operator--(); - return temp; - } - - CSV_INLINE CSVRow::iterator CSVRow::iterator::operator+(difference_type n) const { - // Allows for iterator arithmetic - return CSVRow::iterator(this->daddy, i + (int)n); - } + /** Return the index of the column name if found or + * csv::CSV_NOT_FOUND otherwise. + */ + CSV_INLINE int CSVReader::index_of(csv::string_view col_name) const { + auto _col_names = this->get_col_names(); + for (size_t i = 0; i < _col_names.size(); i++) + if (_col_names[i] == col_name) return (int)i; - CSV_INLINE CSVRow::iterator CSVRow::iterator::operator-(difference_type n) const { - // Allows for iterator arithmetic - return CSVRow::iterator::operator+(-n); + return CSV_NOT_FOUND; } -#ifdef _MSC_VER -#pragma endregion CSVRow Iterator -#endif -} -/** @file - * Implements JSON serialization abilities - */ + CSV_INLINE void CSVReader::trim_header() { + if (!this->header_trimmed) { + for (int i = 0; i <= this->_format.header && !this->records->empty(); i++) { + if (i == this->_format.header && this->col_names->empty()) { + this->set_col_names(this->records->pop_front()); + } + else { + this->records->pop_front(); + } + } + this->header_trimmed = true; + } + } -namespace csv { - /* - The implementations for json_extra_space() and json_escape_string() - were modified from source code for JSON for Modern C++. + /** + * @param[in] names Column names + */ + CSV_INLINE void CSVReader::set_col_names(const std::vector& names) + { + this->col_names->set_col_names(names); + this->n_cols = names.size(); + } - The respective license is below: + /** + * Read a chunk of CSV data. + * + * @note This method is meant to be run on its own thread. Only one `read_csv()` thread + * should be active at a time. + * + * @param[in] bytes Number of bytes to read. + * + * @see CSVReader::read_csv_worker + * @see CSVReader::read_row() + */ + CSV_INLINE bool CSVReader::read_csv(size_t bytes) { + // Tell read_row() to listen for CSV rows + this->records->notify_all(); - The code is licensed under the [MIT - License](http://opensource.org/licenses/MIT): - - Copyright © 2013-2015 Niels Lohmann. - - Permission is hereby granted, free of charge, to any person - obtaining a copy of this software and associated documentation files - (the "Software"), to deal in the Software without restriction, - including without limitation the rights to use, copy, modify, merge, - publish, distribute, sublicense, and/or sell copies of the Software, - and to permit persons to whom the Software is furnished to do so, - subject to the following conditions: - - The above copyright notice and this permission notice shall be - included in all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - SOFTWARE. - */ + this->parser->set_output(*this->records); + this->parser->next(bytes); - namespace internals { - /*! - @brief calculates the extra space to escape a JSON string + if (!this->header_trimmed) { + this->trim_header(); + } - @param[in] s the string to escape - @return the number of characters required to escape string @a s + // Tell read_row() to stop waiting + this->records->kill_all(); - @complexity Linear in the length of string @a s. - */ - static std::size_t json_extra_space(csv::string_view& s) noexcept - { - std::size_t result = 0; + return true; + } + /** + * Retrieve rows as CSVRow objects, returning true if more rows are available. + * + * @par Performance Notes + * - Reads chunks of data that are csv::internals::ITERATION_CHUNK_SIZE bytes large at a time + * - For performance details, read the documentation for CSVRow and CSVField. + * + * @param[out] row The variable where the parsed row will be stored + * @see CSVRow, CSVField + * + * **Example:** + * \snippet tests/test_read_csv.cpp CSVField Example + * + */ + CSV_INLINE bool CSVReader::read_row(CSVRow &row) { + while (true) { + if (this->records->empty()) { + if (this->records->is_waitable()) + // Reading thread is currently active => wait for it to populate records + this->records->wait(); + else if (this->parser->eof()) + // End of file and no more records + return false; + else { + // Reading thread is not active => start another one + if (this->read_csv_worker.joinable()) + this->read_csv_worker.join(); - for (const auto& c : s) - { - switch (c) - { - case '"': - case '\\': - case '\b': - case '\f': - case '\n': - case '\r': - case '\t': - { - // from c (1 byte) to \x (2 bytes) - result += 1; - break; + this->read_csv_worker = std::thread(&CSVReader::read_csv, this, internals::ITERATION_CHUNK_SIZE); } + } + else if (this->records->front().size() != this->n_cols && + this->_format.variable_column_policy != VariableColumnPolicy::KEEP) { + auto errored_row = this->records->pop_front(); + if (this->_format.variable_column_policy == VariableColumnPolicy::THROW) { + if (errored_row.size() < this->n_cols) + throw std::runtime_error("Line too short " + internals::format_row(errored_row)); - default: - { - if (c >= 0x00 && c <= 0x1f) - { - // from c (1 byte) to \uxxxx (6 bytes) - result += 5; - } - break; - } + throw std::runtime_error("Line too long " + internals::format_row(errored_row)); } } + else { + row = this->records->pop_front(); + this->_n_rows++; + return true; + } + } + return false; + } +} - return result; +/** @file + * Defines the data type used for storing information about a CSV row + */ + +#include +#include + +namespace csv { + namespace internals { + CSV_INLINE RawCSVField& CSVFieldList::operator[](size_t n) const { + const size_t page_no = n / _single_buffer_capacity; + const size_t buffer_idx = (page_no < 1) ? n : n % _single_buffer_capacity; + return this->buffers[page_no][buffer_idx]; } - CSV_INLINE std::string json_escape_string(csv::string_view s) noexcept - { - const auto space = json_extra_space(s); - if (space == 0) - { - return std::string(s); - } + CSV_INLINE void CSVFieldList::allocate() { + buffers.push_back(std::unique_ptr(new RawCSVField[_single_buffer_capacity])); - // create a result string of necessary size - size_t result_size = s.size() + space; - std::string result(result_size, '\\'); - std::size_t pos = 0; + _current_buffer_size = 0; + _back = buffers.back().get(); + } + } - for (const auto& c : s) - { - switch (c) - { - // quotation mark (0x22) - case '"': - { - result[pos + 1] = '"'; - pos += 2; - break; - } - - - // reverse solidus (0x5c) - case '\\': - { - // nothing to change - pos += 2; - break; - } - - - // backspace (0x08) - case '\b': - { - result[pos + 1] = 'b'; - pos += 2; - break; - } - - - // formfeed (0x0c) - case '\f': - { - result[pos + 1] = 'f'; - pos += 2; - break; - } + /** Return a CSVField object corrsponding to the nth value in the row. + * + * @note This method performs bounds checking, and will throw an + * `std::runtime_error` if n is invalid. + * + * @complexity + * Constant, by calling csv::CSVRow::get_csv::string_view() + * + */ + CSV_INLINE CSVField CSVRow::operator[](size_t n) const { + return CSVField(this->get_field(n)); + } + /** Retrieve a value by its associated column name. If the column + * specified can't be round, a runtime error is thrown. + * + * @complexity + * Constant. This calls the other CSVRow::operator[]() after + * converting column names into indices using a hash table. + * + * @param[in] col_name The column to look for + */ + CSV_INLINE CSVField CSVRow::operator[](const std::string& col_name) const { + auto & col_names = this->data->col_names; + auto col_pos = col_names->index_of(col_name); + if (col_pos > -1) { + return this->operator[](col_pos); + } - // newline (0x0a) - case '\n': - { - result[pos + 1] = 'n'; - pos += 2; - break; - } + throw std::runtime_error("Can't find a column named " + col_name); + } + CSV_INLINE CSVRow::operator std::vector() const { + std::vector ret; + for (size_t i = 0; i < size(); i++) + ret.push_back(std::string(this->get_field(i))); - // carriage return (0x0d) - case '\r': - { - result[pos + 1] = 'r'; - pos += 2; - break; - } + return ret; + } + CSV_INLINE csv::string_view CSVRow::get_field(size_t index) const + { + using internals::ParseFlags; - // horizontal tab (0x09) - case '\t': - { - result[pos + 1] = 't'; - pos += 2; - break; - } + if (index >= this->size()) + throw std::runtime_error("Index out of bounds."); + const size_t field_index = this->fields_start + index; + auto& field = this->data->fields[field_index]; + auto field_str = csv::string_view(this->data->data).substr(this->data_start + field.start); - default: - { - if (c >= 0x00 && c <= 0x1f) - { - // print character c as \uxxxx - snprintf(&result[pos + 1], result_size - pos - 1, "u%04x", int(c)); - pos += 6; - // overwrite trailing null character - result[pos] = '\\'; - } - else - { - // all other characters are added as-is - result[pos++] = c; + if (field.has_double_quote) { + auto& value = this->data->double_quote_fields[field_index]; + if (value.empty()) { + bool prev_ch_quote = false; + for (size_t i = 0; i < field.length; i++) { + if (this->data->parse_flags[field_str[i] + 128] == ParseFlags::QUOTE) { + if (prev_ch_quote) { + prev_ch_quote = false; + continue; + } + else { + prev_ch_quote = true; + } } - break; - } + + value += field_str[i]; } } - return result; + return csv::string_view(value); } + + return field_str.substr(0, field.length); } - /** Convert a CSV row to a JSON object, i.e. - * `{"col1":"value1","col2":"value2"}` - * - * @note All strings are properly escaped. Numeric values are not quoted. - * @param[in] subset A subset of columns to contain in the JSON. - * Leave empty for original columns. - */ - CSV_INLINE std::string CSVRow::to_json(const std::vector& subset) const { - std::vector col_names = subset; - if (subset.empty()) { - col_names = this->data ? this->get_col_names() : std::vector({}); - } + CSV_INLINE bool CSVField::try_parse_hex(int& parsedValue) { + size_t start = 0, end = 0; - const size_t _n_cols = col_names.size(); - std::string ret = "{"; + // Trim out whitespace chars + for (; start < this->sv.size() && this->sv[start] == ' '; start++); + for (end = start; end < this->sv.size() && this->sv[end] != ' '; end++); - for (size_t i = 0; i < _n_cols; i++) { - auto& col = col_names[i]; - auto field = this->operator[](col); + int value_ = 0; - // TODO: Possible performance enhancements by caching escaped column names - ret += '"' + internals::json_escape_string(col) + "\":"; + size_t digits = (end - start); + size_t base16_exponent = digits - 1; - // Add quotes around strings but not numbers - if (field.is_num()) - ret += internals::json_escape_string(field.get()); - else - ret += '"' + internals::json_escape_string(field.get()) + '"'; + if (digits == 0) return false; - // Do not add comma after last string - if (i + 1 < _n_cols) - ret += ','; + for (const auto& ch : this->sv.substr(start, digits)) { + int digit = 0; + + switch (ch) { + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + digit = static_cast(ch - '0'); + break; + case 'a': + case 'A': + digit = 10; + break; + case 'b': + case 'B': + digit = 11; + break; + case 'c': + case 'C': + digit = 12; + break; + case 'd': + case 'D': + digit = 13; + break; + case 'e': + case 'E': + digit = 14; + break; + case 'f': + case 'F': + digit = 15; + break; + default: + return false; + } + + value_ += digit * (int)pow(16, (double)base16_exponent); + base16_exponent--; } - ret += '}'; - return ret; + parsedValue = value_; + return true; } - /** Convert a CSV row to a JSON array, i.e. - * `["value1","value2",...]` - * - * @note All strings are properly escaped. Numeric values are not quoted. - * @param[in] subset A subset of columns to contain in the JSON. - * Leave empty for all columns. - */ - CSV_INLINE std::string CSVRow::to_json_array(const std::vector& subset) const { - std::vector col_names = subset; - if (subset.empty()) - col_names = this->data ? this->get_col_names() : std::vector({}); - - const size_t _n_cols = col_names.size(); - std::string ret = "["; + CSV_INLINE bool CSVField::try_parse_decimal(long double& dVal, const char decimalSymbol) { + // If field has already been parsed to empty, no need to do it aagin: + if (this->_type == DataType::CSV_NULL) + return false; - for (size_t i = 0; i < _n_cols; i++) { - auto field = this->operator[](col_names[i]); + // Not yet parsed or possibly parsed with other decimalSymbol + if (this->_type == DataType::UNKNOWN || this->_type == DataType::CSV_STRING || this->_type == DataType::CSV_DOUBLE) + this->_type = internals::data_type(this->sv, &this->value, decimalSymbol); // parse again - // Add quotes around strings but not numbers - if (field.is_num()) - ret += internals::json_escape_string(field.get()); - else - ret += '"' + internals::json_escape_string(field.get()) + '"'; + // Integral types are not affected by decimalSymbol and need not be parsed again - // Do not add comma after last string - if (i + 1 < _n_cols) - ret += ','; + // Either we already had an integral type before, or we we just got any numeric type now. + if (this->_type >= DataType::CSV_INT8 && this->_type <= DataType::CSV_DOUBLE) { + dVal = this->value; + return true; } - ret += ']'; - return ret; + // CSV_NULL or CSV_STRING, not numeric + return false; } -} - -/** @file - * Calculates statistics from CSV files - */ -#include +#ifdef _MSC_VER +#pragma region CSVRow Iterator +#endif + /** Return an iterator pointing to the first field. */ + CSV_INLINE CSVRow::iterator CSVRow::begin() const { + return CSVRow::iterator(this, 0); + } -namespace csv { - /** Calculate statistics for an arbitrarily large file. When this constructor - * is called, CSVStat will process the entire file iteratively. Once finished, - * methods like get_mean(), get_counts(), etc... can be used to retrieve statistics. + /** Return an iterator pointing to just after the end of the CSVRow. + * + * @warning Attempting to dereference the end iterator results + * in dereferencing a null pointer. */ - CSV_INLINE CSVStat::CSVStat(csv::string_view filename, CSVFormat format) : - reader(filename, format) { - this->calc(); + CSV_INLINE CSVRow::iterator CSVRow::end() const noexcept { + return CSVRow::iterator(this, (int)this->size()); } - /** Calculate statistics for a CSV stored in a std::stringstream */ - CSV_INLINE CSVStat::CSVStat(std::stringstream& stream, CSVFormat format) : - reader(stream, format) { - this->calc(); + CSV_INLINE CSVRow::reverse_iterator CSVRow::rbegin() const noexcept { + return std::reverse_iterator(this->end()); } - /** Return current means */ - CSV_INLINE std::vector CSVStat::get_mean() const { - std::vector ret; - for (size_t i = 0; i < this->get_col_names().size(); i++) { - ret.push_back(this->rolling_means[i]); - } - return ret; + CSV_INLINE CSVRow::reverse_iterator CSVRow::rend() const { + return std::reverse_iterator(this->begin()); } - /** Return current variances */ - CSV_INLINE std::vector CSVStat::get_variance() const { - std::vector ret; - for (size_t i = 0; i < this->get_col_names().size(); i++) { - ret.push_back(this->rolling_vars[i]/(this->n[i] - 1)); - } - return ret; + CSV_INLINE HEDLEY_NON_NULL(2) + CSVRow::iterator::iterator(const CSVRow* _reader, int _i) + : daddy(_reader), i(_i) { + if (_i < (int)this->daddy->size()) + this->field = std::make_shared( + this->daddy->operator[](_i)); + else + this->field = nullptr; } - /** Return current mins */ - CSV_INLINE std::vector CSVStat::get_mins() const { - std::vector ret; - for (size_t i = 0; i < this->get_col_names().size(); i++) { - ret.push_back(this->mins[i]); - } - return ret; + CSV_INLINE CSVRow::iterator::reference CSVRow::iterator::operator*() const { + return *(this->field.get()); } - /** Return current maxes */ - CSV_INLINE std::vector CSVStat::get_maxes() const { - std::vector ret; - for (size_t i = 0; i < this->get_col_names().size(); i++) { - ret.push_back(this->maxes[i]); - } - return ret; + CSV_INLINE CSVRow::iterator::pointer CSVRow::iterator::operator->() const { + return this->field; } - /** Get counts for each column */ - CSV_INLINE std::vector CSVStat::get_counts() const { - std::vector ret; - for (size_t i = 0; i < this->get_col_names().size(); i++) { - ret.push_back(this->counts[i]); - } - return ret; + CSV_INLINE CSVRow::iterator& CSVRow::iterator::operator++() { + // Pre-increment operator + this->i++; + if (this->i < (int)this->daddy->size()) + this->field = std::make_shared( + this->daddy->operator[](i)); + else // Reached the end of row + this->field = nullptr; + return *this; } - /** Get data type counts for each column */ - CSV_INLINE std::vector CSVStat::get_dtypes() const { - std::vector ret; - for (size_t i = 0; i < this->get_col_names().size(); i++) { - ret.push_back(this->dtypes[i]); - } - return ret; + CSV_INLINE CSVRow::iterator CSVRow::iterator::operator++(int) { + // Post-increment operator + auto temp = *this; + this->operator++(); + return temp; } - CSV_INLINE void CSVStat::calc_chunk() { - /** Only create stats counters the first time **/ - if (dtypes.empty()) { - /** Go through all records and calculate specified statistics */ - for (size_t i = 0; i < this->get_col_names().size(); i++) { - dtypes.push_back({}); - counts.push_back({}); - rolling_means.push_back(0); - rolling_vars.push_back(0); - mins.push_back(NAN); - maxes.push_back(NAN); - n.push_back(0); - } - } - - // Start threads - std::vector pool; - for (size_t i = 0; i < this->get_col_names().size(); i++) - pool.push_back(std::thread(&CSVStat::calc_worker, this, i)); + CSV_INLINE CSVRow::iterator& CSVRow::iterator::operator--() { + // Pre-decrement operator + this->i--; + this->field = std::make_shared( + this->daddy->operator[](this->i)); + return *this; + } - // Block until done - for (auto& th : pool) - th.join(); + CSV_INLINE CSVRow::iterator CSVRow::iterator::operator--(int) { + // Post-decrement operator + auto temp = *this; + this->operator--(); + return temp; + } + + CSV_INLINE CSVRow::iterator CSVRow::iterator::operator+(difference_type n) const { + // Allows for iterator arithmetic + return CSVRow::iterator(this->daddy, i + (int)n); + } - this->records.clear(); + CSV_INLINE CSVRow::iterator CSVRow::iterator::operator-(difference_type n) const { + // Allows for iterator arithmetic + return CSVRow::iterator::operator+(-n); } +#ifdef _MSC_VER +#pragma endregion CSVRow Iterator +#endif +} - CSV_INLINE void CSVStat::calc() { - constexpr size_t CALC_CHUNK_SIZE = 5000; - for (auto& row : reader) { - this->records.push_back(std::move(row)); +namespace csv { + namespace internals { + CSV_INLINE size_t get_file_size(csv::string_view filename) { + std::ifstream infile(std::string(filename), std::ios::binary); + const auto start = infile.tellg(); + infile.seekg(0, std::ios::end); + const auto end = infile.tellg(); - /** Chunk rows */ - if (this->records.size() == CALC_CHUNK_SIZE) { - calc_chunk(); - } + return end - start; } - if (!this->records.empty()) { - calc_chunk(); + CSV_INLINE std::string get_csv_head(csv::string_view filename) { + return get_csv_head(filename, get_file_size(filename)); } - } - - CSV_INLINE void CSVStat::calc_worker(const size_t &i) { - /** Worker thread for CSVStat::calc() which calculates statistics for one column. - * - * @param[in] i Column index - */ - auto current_record = this->records.begin(); - - for (size_t processed = 0; current_record != this->records.end(); processed++) { - if (current_record->size() == this->get_col_names().size()) { - auto current_field = (*current_record)[i]; + CSV_INLINE std::string get_csv_head(csv::string_view filename, size_t file_size) { + const size_t bytes = 500000; - // Optimization: Don't count() if there's too many distinct values in the first 1000 rows - if (processed < 1000 || this->counts[i].size() <= 500) - this->count(current_field, i); + std::error_code error; + size_t length = std::min((size_t)file_size, bytes); + auto mmap = mio::make_mmap_source(std::string(filename), 0, length, error); - this->dtype(current_field, i); + if (error) { + throw std::runtime_error("Cannot open file " + std::string(filename)); + } - // Numeric Stuff - if (current_field.is_num()) { - long double x_n = current_field.get(); + return std::string(mmap.begin(), mmap.end()); + } - // This actually calculates mean AND variance - this->variance(x_n, i); - this->min_max(x_n, i); - } +#ifdef _MSC_VER +#pragma region IBasicCVParser +#endif + CSV_INLINE IBasicCSVParser::IBasicCSVParser( + const CSVFormat& format, + const ColNamesPtr& col_names + ) : _col_names(col_names) { + if (format.no_quote) { + _parse_flags = internals::make_parse_flags(format.get_delim()); } - else if (this->reader.get_format().get_variable_column_policy() == VariableColumnPolicy::THROW) { - throw std::runtime_error("Line has different length than the others " + internals::format_row(*current_record)); + else { + _parse_flags = internals::make_parse_flags(format.get_delim(), format.quote_char); } - ++current_record; - } - } - - CSV_INLINE void CSVStat::dtype(CSVField& data, const size_t &i) { - /** Given a record update the type counter - * @param[in] record Data observation - * @param[out] i The column index that should be updated - */ - - auto type = data.type(); - if (this->dtypes[i].find(type) != - this->dtypes[i].end()) { - // Increment count - this->dtypes[i][type]++; - } else { - // Initialize count - this->dtypes[i].insert(std::make_pair(type, 1)); + _ws_flags = internals::make_ws_flags( + format.trim_chars.data(), format.trim_chars.size() + ); } - } - CSV_INLINE void CSVStat::count(CSVField& data, const size_t &i) { - /** Given a record update the frequency counter - * @param[in] record Data observation - * @param[out] i The column index that should be updated - */ + CSV_INLINE void IBasicCSVParser::end_feed() { + using internals::ParseFlags; - auto item = data.get(); + bool empty_last_field = this->data_ptr + && this->data_ptr->_data + && !this->data_ptr->data.empty() + && (parse_flag(this->data_ptr->data.back()) == ParseFlags::DELIMITER + || parse_flag(this->data_ptr->data.back()) == ParseFlags::QUOTE); - if (this->counts[i].find(item) != - this->counts[i].end()) { - // Increment count - this->counts[i][item]++; - } else { - // Initialize count - this->counts[i].insert(std::make_pair(item, 1)); + // Push field + if (this->field_length > 0 || empty_last_field) { + this->push_field(); + } + + // Push row + if (this->current_row.size() > 0) + this->push_row(); } - } - CSV_INLINE void CSVStat::min_max(const long double &x_n, const size_t &i) { - /** Update current minimum and maximum - * @param[in] x_n Data observation - * @param[out] i The column index that should be updated - */ - if (std::isnan(this->mins[i])) - this->mins[i] = x_n; - if (std::isnan(this->maxes[i])) - this->maxes[i] = x_n; - - if (x_n < this->mins[i]) - this->mins[i] = x_n; - else if (x_n > this->maxes[i]) - this->maxes[i] = x_n; - } + CSV_INLINE void IBasicCSVParser::parse_field() noexcept { + using internals::ParseFlags; + auto& in = this->data_ptr->data; - CSV_INLINE void CSVStat::variance(const long double &x_n, const size_t &i) { - /** Given a record update rolling mean and variance for all columns - * using Welford's Algorithm - * @param[in] x_n Data observation - * @param[out] i The column index that should be updated - */ - long double& current_rolling_mean = this->rolling_means[i]; - long double& current_rolling_var = this->rolling_vars[i]; - long double& current_n = this->n[i]; - long double delta; - long double delta2; + // Trim off leading whitespace + while (data_pos < in.size() && ws_flag(in[data_pos])) + data_pos++; - current_n++; - - if (current_n == 1) { - current_rolling_mean = x_n; - } else { - delta = x_n - current_rolling_mean; - current_rolling_mean += delta/current_n; - delta2 = x_n - current_rolling_mean; - current_rolling_var += delta*delta2; + if (field_start == UNINITIALIZED_FIELD) + field_start = (int)(data_pos - current_row_start()); + + // Optimization: Since NOT_SPECIAL characters tend to occur in contiguous + // sequences, use the loop below to avoid having to go through the outer + // switch statement as much as possible + while (data_pos < in.size() && compound_parse_flag(in[data_pos]) == ParseFlags::NOT_SPECIAL) + data_pos++; + + field_length = data_pos - (field_start + current_row_start()); + + // Trim off trailing whitespace, this->field_length constraint matters + // when field is entirely whitespace + for (size_t j = data_pos - 1; ws_flag(in[j]) && this->field_length > 0; j--) + this->field_length--; } - } - /** Useful for uploading CSV files to SQL databases. - * - * Return a data type for each column such that every value in a column can be - * converted to the corresponding data type without data loss. - * @param[in] filename The CSV file - * - * \return A mapping of column names to csv::DataType enums - */ - CSV_INLINE std::unordered_map csv_data_types(const std::string& filename) { - CSVStat stat(filename); - std::unordered_map csv_dtypes; + CSV_INLINE void IBasicCSVParser::push_field() + { + // Update + if (field_has_double_quote) { + fields->emplace_back( + field_start == UNINITIALIZED_FIELD ? 0 : (unsigned int)field_start, + field_length, + true + ); + field_has_double_quote = false; - auto col_names = stat.get_col_names(); - auto temp = stat.get_dtypes(); + } + else { + fields->emplace_back( + field_start == UNINITIALIZED_FIELD ? 0 : (unsigned int)field_start, + field_length + ); + } - for (size_t i = 0; i < stat.get_col_names().size(); i++) { - auto& col = temp[i]; - auto& col_name = col_names[i]; + current_row.row_length++; - if (col[DataType::CSV_STRING]) - csv_dtypes[col_name] = DataType::CSV_STRING; - else if (col[DataType::CSV_INT64]) - csv_dtypes[col_name] = DataType::CSV_INT64; - else if (col[DataType::CSV_INT32]) - csv_dtypes[col_name] = DataType::CSV_INT32; - else if (col[DataType::CSV_INT16]) - csv_dtypes[col_name] = DataType::CSV_INT16; - else if (col[DataType::CSV_INT8]) - csv_dtypes[col_name] = DataType::CSV_INT8; - else - csv_dtypes[col_name] = DataType::CSV_DOUBLE; + // Reset field state + field_start = UNINITIALIZED_FIELD; + field_length = 0; } - return csv_dtypes; - } -} -#include -#include + /** @return The number of characters parsed that belong to complete rows */ + CSV_INLINE size_t IBasicCSVParser::parse() + { + using internals::ParseFlags; + this->quote_escape = false; + this->data_pos = 0; + this->current_row_start() = 0; + this->trim_utf8_bom(); -namespace csv { - /** Shorthand function for parsing an in-memory CSV string - * - * @return A collection of CSVRow objects - * - * @par Example - * @snippet tests/test_read_csv.cpp Parse Example - */ - CSV_INLINE CSVReader parse(csv::string_view in, CSVFormat format) { - std::stringstream stream(in.data()); - return CSVReader(stream, format); - } + auto& in = this->data_ptr->data; + while (this->data_pos < in.size()) { + switch (compound_parse_flag(in[this->data_pos])) { + case ParseFlags::DELIMITER: + this->push_field(); + this->data_pos++; + break; - /** Parses a CSV string with no headers - * - * @return A collection of CSVRow objects - */ - CSV_INLINE CSVReader parse_no_header(csv::string_view in) { - CSVFormat format; - format.header_row(-1); + case ParseFlags::NEWLINE: + this->data_pos++; - return parse(in, format); - } + // Catches CRLF (or LFLF, CRCRLF, or any other non-sensical combination of newlines) + while (this->data_pos < in.size() && parse_flag(in[this->data_pos]) == ParseFlags::NEWLINE) + this->data_pos++; - /** Parse a RFC 4180 CSV string, returning a collection - * of CSVRow objects - * - * @par Example - * @snippet tests/test_read_csv.cpp Escaped Comma - * - */ - CSV_INLINE CSVReader operator ""_csv(const char* in, size_t n) { - return parse(csv::string_view(in, n)); - } + // End of record -> Write record + this->push_field(); + this->push_row(); - /** A shorthand for csv::parse_no_header() */ - CSV_INLINE CSVReader operator ""_csv_no_header(const char* in, size_t n) { - return parse_no_header(csv::string_view(in, n)); - } + // Reset + this->current_row = CSVRow(data_ptr, this->data_pos, fields->size()); + break; - /** - * Find the position of a column in a CSV file or CSV_NOT_FOUND otherwise - * - * @param[in] filename Path to CSV file - * @param[in] col_name Column whose position we should resolve - * @param[in] format Format of the CSV file - */ - CSV_INLINE int get_col_pos( - csv::string_view filename, - csv::string_view col_name, - const CSVFormat& format) { - CSVReader reader(filename, format); - return reader.index_of(col_name); - } + case ParseFlags::NOT_SPECIAL: + this->parse_field(); + break; - /** Get basic information about a CSV file - * @include programs/csv_info.cpp - */ - CSV_INLINE CSVFileInfo get_file_info(const std::string& filename) { - CSVReader reader(filename); - CSVFormat format = reader.get_format(); - for (auto it = reader.begin(); it != reader.end(); ++it); + case ParseFlags::QUOTE_ESCAPE_QUOTE: + if (data_pos + 1 == in.size()) return this->current_row_start(); + else if (data_pos + 1 < in.size()) { + auto next_ch = parse_flag(in[data_pos + 1]); + if (next_ch >= ParseFlags::DELIMITER) { + quote_escape = false; + data_pos++; + break; + } + else if (next_ch == ParseFlags::QUOTE) { + // Case: Escaped quote + data_pos += 2; + this->field_length += 2; + this->field_has_double_quote = true; + break; + } + } + + // Case: Unescaped single quote => not strictly valid but we'll keep it + this->field_length++; + data_pos++; - CSVFileInfo info = { - filename, - reader.get_col_names(), - format.get_delim(), - reader.n_rows(), - reader.get_col_names().size() - }; + break; - return info; + default: // Quote (currently not quote escaped) + if (this->field_length == 0) { + quote_escape = true; + data_pos++; + if (field_start == UNINITIALIZED_FIELD && data_pos < in.size() && !ws_flag(in[data_pos])) + field_start = (int)(data_pos - current_row_start()); + break; + } + + // Case: Unescaped quote + this->field_length++; + data_pos++; + + break; + } + } + + return this->current_row_start(); + } + + CSV_INLINE void IBasicCSVParser::push_row() { + current_row.row_length = fields->size() - current_row.fields_start; + this->_records->push_back(std::move(current_row)); + } + + CSV_INLINE void IBasicCSVParser::reset_data_ptr() { + this->data_ptr = std::make_shared(); + this->data_ptr->parse_flags = this->_parse_flags; + this->data_ptr->col_names = this->_col_names; + this->fields = &(this->data_ptr->fields); + } + + CSV_INLINE void IBasicCSVParser::trim_utf8_bom() { + auto& data = this->data_ptr->data; + + if (!this->unicode_bom_scan && data.size() >= 3) { + if (data[0] == '\xEF' && data[1] == '\xBB' && data[2] == '\xBF') { + this->data_pos += 3; // Remove BOM from input string + this->_utf8_bom = true; + } + + this->unicode_bom_scan = true; + } + } +#ifdef _MSC_VER +#pragma endregion +#endif + +#ifdef _MSC_VER +#pragma region Specializations +#endif + CSV_INLINE void MmapParser::next(size_t bytes = ITERATION_CHUNK_SIZE) { + // Reset parser state + this->field_start = UNINITIALIZED_FIELD; + this->field_length = 0; + this->reset_data_ptr(); + this->data_ptr->_stream_pos = this->mmap_pos; + + // Create memory map + size_t length = std::min(this->source_size - this->mmap_pos, bytes); + std::error_code error; + this->data_ptr->_data = std::make_shared>(mio::make_mmap_source(this->_filename, this->mmap_pos, length, error)); + this->mmap_pos += length; + if (error) throw error; + + auto mmap_ptr = (mio::basic_mmap_source*)(this->data_ptr->_data.get()); + + // Create string view + this->data_ptr->data = csv::string_view(mmap_ptr->data(), mmap_ptr->length()); + + // Parse + this->current_row = CSVRow(this->data_ptr); + size_t remainder = this->parse(); + + if (this->mmap_pos == this->source_size || no_chunk()) { + this->_eof = true; + this->end_feed(); + } + + this->mmap_pos -= (length - remainder); + } +#ifdef _MSC_VER +#pragma endregion +#endif } } + #endif diff --git a/single_include_test/csv.hpp b/single_include_test/csv.hpp index 811c8e14..74dcabce 100644 --- a/single_include_test/csv.hpp +++ b/single_include_test/csv.hpp @@ -5531,6 +5531,9 @@ namespace csv { internals::ColNamesPtr col_names = nullptr; internals::ParseFlagMap parse_flags; internals::WhitespaceMap ws_flags; + + /** where in Stream we start */ + uint64_t _stream_pos = {}; }; using RawCSVDataPtr = std::shared_ptr; @@ -5719,6 +5722,9 @@ namespace csv { /** Return the number of fields in this row */ CONSTEXPR size_t size() const noexcept { return row_length; } + /** Where in the Stream we start */ + size_t current_row_start() const { return data->_stream_pos + data_start; } + /** @name Value Retrieval */ ///@{ CSVField operator[](size_t n) const; @@ -6160,6 +6166,7 @@ namespace csv { if (this->eof()) return; this->reset_data_ptr(); + this->data_ptr->_stream_pos = this->stream_pos; this->data_ptr->_data = std::make_shared(); if (source_size == 0) { @@ -6943,1643 +6950,1644 @@ namespace csv { namespace csv { namespace internals { - CSV_INLINE size_t get_file_size(csv::string_view filename) { - std::ifstream infile(std::string(filename), std::ios::binary); - const auto start = infile.tellg(); - infile.seekg(0, std::ios::end); - const auto end = infile.tellg(); - - return end - start; + CSV_INLINE std::vector ColNames::get_col_names() const { + return this->col_names; } - CSV_INLINE std::string get_csv_head(csv::string_view filename) { - return get_csv_head(filename, get_file_size(filename)); - } + CSV_INLINE void ColNames::set_col_names(const std::vector& cnames) { + this->col_names = cnames; - CSV_INLINE std::string get_csv_head(csv::string_view filename, size_t file_size) { - const size_t bytes = 500000; + for (size_t i = 0; i < cnames.size(); i++) { + this->col_pos[cnames[i]] = i; + } + } - std::error_code error; - size_t length = std::min((size_t)file_size, bytes); - auto mmap = mio::make_mmap_source(std::string(filename), 0, length, error); + CSV_INLINE int ColNames::index_of(csv::string_view col_name) const { + auto pos = this->col_pos.find(col_name.data()); + if (pos != this->col_pos.end()) + return (int)pos->second; - if (error) { - throw std::runtime_error("Cannot open file " + std::string(filename)); - } + return CSV_NOT_FOUND; + } - return std::string(mmap.begin(), mmap.end()); + CSV_INLINE size_t ColNames::size() const noexcept { + return this->col_names.size(); } -#ifdef _MSC_VER -#pragma region IBasicCVParser -#endif - CSV_INLINE IBasicCSVParser::IBasicCSVParser( - const CSVFormat& format, - const ColNamesPtr& col_names - ) : _col_names(col_names) { - if (format.no_quote) { - _parse_flags = internals::make_parse_flags(format.get_delim()); - } - else { - _parse_flags = internals::make_parse_flags(format.get_delim(), format.quote_char); - } + } +} +/** @file + * Defines an object used to store CSV format settings + */ - _ws_flags = internals::make_ws_flags( - format.trim_chars.data(), format.trim_chars.size() - ); - } +#include +#include - CSV_INLINE void IBasicCSVParser::end_feed() { - using internals::ParseFlags; - bool empty_last_field = this->data_ptr - && this->data_ptr->_data - && !this->data_ptr->data.empty() - && (parse_flag(this->data_ptr->data.back()) == ParseFlags::DELIMITER - || parse_flag(this->data_ptr->data.back()) == ParseFlags::QUOTE); +namespace csv { + CSV_INLINE CSVFormat& CSVFormat::delimiter(char delim) { + this->possible_delimiters = { delim }; + this->assert_no_char_overlap(); + return *this; + } - // Push field - if (this->field_length > 0 || empty_last_field) { - this->push_field(); - } + CSV_INLINE CSVFormat& CSVFormat::delimiter(const std::vector & delim) { + this->possible_delimiters = delim; + this->assert_no_char_overlap(); + return *this; + } - // Push row - if (this->current_row.size() > 0) - this->push_row(); - } + CSV_INLINE CSVFormat& CSVFormat::quote(char quote) { + this->no_quote = false; + this->quote_char = quote; + this->assert_no_char_overlap(); + return *this; + } - CSV_INLINE void IBasicCSVParser::parse_field() noexcept { - using internals::ParseFlags; - auto& in = this->data_ptr->data; + CSV_INLINE CSVFormat& CSVFormat::trim(const std::vector & chars) { + this->trim_chars = chars; + this->assert_no_char_overlap(); + return *this; + } - // Trim off leading whitespace - while (data_pos < in.size() && ws_flag(in[data_pos])) - data_pos++; + CSV_INLINE CSVFormat& CSVFormat::column_names(const std::vector& names) { + this->col_names = names; + this->header = -1; + return *this; + } - if (field_start == UNINITIALIZED_FIELD) - field_start = (int)(data_pos - current_row_start()); + CSV_INLINE CSVFormat& CSVFormat::header_row(int row) { + if (row < 0) this->variable_column_policy = VariableColumnPolicy::KEEP; - // Optimization: Since NOT_SPECIAL characters tend to occur in contiguous - // sequences, use the loop below to avoid having to go through the outer - // switch statement as much as possible - while (data_pos < in.size() && compound_parse_flag(in[data_pos]) == ParseFlags::NOT_SPECIAL) - data_pos++; + this->header = row; + this->col_names = {}; + return *this; + } - field_length = data_pos - (field_start + current_row_start()); + CSV_INLINE void CSVFormat::assert_no_char_overlap() + { + auto delims = std::set( + this->possible_delimiters.begin(), this->possible_delimiters.end()), + trims = std::set( + this->trim_chars.begin(), this->trim_chars.end()); - // Trim off trailing whitespace, this->field_length constraint matters - // when field is entirely whitespace - for (size_t j = data_pos - 1; ws_flag(in[j]) && this->field_length > 0; j--) - this->field_length--; + // Stores intersection of possible delimiters and trim characters + std::vector intersection = {}; + + // Find which characters overlap, if any + std::set_intersection( + delims.begin(), delims.end(), + trims.begin(), trims.end(), + std::back_inserter(intersection)); + + // Make sure quote character is not contained in possible delimiters + // or whitespace characters + if (delims.find(this->quote_char) != delims.end() || + trims.find(this->quote_char) != trims.end()) { + intersection.push_back(this->quote_char); } - CSV_INLINE void IBasicCSVParser::push_field() - { - // Update - if (field_has_double_quote) { - fields->emplace_back( - field_start == UNINITIALIZED_FIELD ? 0 : (unsigned int)field_start, - field_length, - true - ); - field_has_double_quote = false; + if (!intersection.empty()) { + std::string err_msg = "There should be no overlap between the quote character, " + "the set of possible delimiters " + "and the set of whitespace characters. Offending characters: "; - } - else { - fields->emplace_back( - field_start == UNINITIALIZED_FIELD ? 0 : (unsigned int)field_start, - field_length - ); - } + // Create a pretty error message with the list of overlapping + // characters + for (size_t i = 0; i < intersection.size(); i++) { + err_msg += "'"; + err_msg += intersection[i]; + err_msg += "'"; - current_row.row_length++; + if (i + 1 < intersection.size()) + err_msg += ", "; + } - // Reset field state - field_start = UNINITIALIZED_FIELD; - field_length = 0; + throw std::runtime_error(err_msg + '.'); } + } +} +/** @file + * Defines an input iterator for csv::CSVReader + */ - /** @return The number of characters parsed that belong to complete rows */ - CSV_INLINE size_t IBasicCSVParser::parse() - { - using internals::ParseFlags; - this->quote_escape = false; - this->data_pos = 0; - this->current_row_start() = 0; - this->trim_utf8_bom(); +namespace csv { + /** Return an iterator to the first row in the reader */ + CSV_INLINE CSVReader::iterator CSVReader::begin() { + if (this->records->empty()) { + this->read_csv_worker = std::thread(&CSVReader::read_csv, this, internals::ITERATION_CHUNK_SIZE); + this->read_csv_worker.join(); - auto& in = this->data_ptr->data; - while (this->data_pos < in.size()) { - switch (compound_parse_flag(in[this->data_pos])) { - case ParseFlags::DELIMITER: - this->push_field(); - this->data_pos++; - break; + // Still empty => return end iterator + if (this->records->empty()) return this->end(); + } - case ParseFlags::NEWLINE: - this->data_pos++; + this->_n_rows++; + CSVReader::iterator ret(this, this->records->pop_front()); + return ret; + } - // Catches CRLF (or LFLF, CRCRLF, or any other non-sensical combination of newlines) - while (this->data_pos < in.size() && parse_flag(in[this->data_pos]) == ParseFlags::NEWLINE) - this->data_pos++; + /** A placeholder for the imaginary past the end row in a CSV. + * Attempting to deference this will lead to bad things. + */ + CSV_INLINE HEDLEY_CONST CSVReader::iterator CSVReader::end() const noexcept { + return CSVReader::iterator(); + } - // End of record -> Write record - this->push_field(); - this->push_row(); + ///////////////////////// + // CSVReader::iterator // + ///////////////////////// - // Reset - this->current_row = CSVRow(data_ptr, this->data_pos, fields->size()); - break; + CSV_INLINE CSVReader::iterator::iterator(CSVReader* _daddy, CSVRow&& _row) : + daddy(_daddy) { + row = std::move(_row); + } - case ParseFlags::NOT_SPECIAL: - this->parse_field(); - break; + /** Advance the iterator by one row. If this CSVReader has an + * associated file, then the iterator will lazily pull more data from + * that file until the end of file is reached. + * + * @note This iterator does **not** block the thread responsible for parsing CSV. + * + */ + CSV_INLINE CSVReader::iterator& CSVReader::iterator::operator++() { + if (!daddy->read_row(this->row)) { + this->daddy = nullptr; // this == end() + } - case ParseFlags::QUOTE_ESCAPE_QUOTE: - if (data_pos + 1 == in.size()) return this->current_row_start(); - else if (data_pos + 1 < in.size()) { - auto next_ch = parse_flag(in[data_pos + 1]); - if (next_ch >= ParseFlags::DELIMITER) { - quote_escape = false; - data_pos++; - break; - } - else if (next_ch == ParseFlags::QUOTE) { - // Case: Escaped quote - data_pos += 2; - this->field_length += 2; - this->field_has_double_quote = true; - break; - } - } - - // Case: Unescaped single quote => not strictly valid but we'll keep it - this->field_length++; - data_pos++; - - break; - - default: // Quote (currently not quote escaped) - if (this->field_length == 0) { - quote_escape = true; - data_pos++; - if (field_start == UNINITIALIZED_FIELD && data_pos < in.size() && !ws_flag(in[data_pos])) - field_start = (int)(data_pos - current_row_start()); - break; - } + return *this; + } - // Case: Unescaped quote - this->field_length++; - data_pos++; + /** Post-increment iterator */ + CSV_INLINE CSVReader::iterator CSVReader::iterator::operator++(int) { + auto temp = *this; + if (!daddy->read_row(this->row)) { + this->daddy = nullptr; // this == end() + } - break; - } - } + return temp; + } +} - return this->current_row_start(); - } +/** @file + * Implements JSON serialization abilities + */ - CSV_INLINE void IBasicCSVParser::push_row() { - current_row.row_length = fields->size() - current_row.fields_start; - this->_records->push_back(std::move(current_row)); - } - CSV_INLINE void IBasicCSVParser::reset_data_ptr() { - this->data_ptr = std::make_shared(); - this->data_ptr->parse_flags = this->_parse_flags; - this->data_ptr->col_names = this->_col_names; - this->fields = &(this->data_ptr->fields); - } +namespace csv { + /* + The implementations for json_extra_space() and json_escape_string() + were modified from source code for JSON for Modern C++. - CSV_INLINE void IBasicCSVParser::trim_utf8_bom() { - auto& data = this->data_ptr->data; + The respective license is below: - if (!this->unicode_bom_scan && data.size() >= 3) { - if (data[0] == '\xEF' && data[1] == '\xBB' && data[2] == '\xBF') { - this->data_pos += 3; // Remove BOM from input string - this->_utf8_bom = true; - } + The code is licensed under the [MIT + License](http://opensource.org/licenses/MIT): + + Copyright © 2013-2015 Niels Lohmann. + + Permission is hereby granted, free of charge, to any person + obtaining a copy of this software and associated documentation files + (the "Software"), to deal in the Software without restriction, + including without limitation the rights to use, copy, modify, merge, + publish, distribute, sublicense, and/or sell copies of the Software, + and to permit persons to whom the Software is furnished to do so, + subject to the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. + */ - this->unicode_bom_scan = true; - } - } -#ifdef _MSC_VER -#pragma endregion -#endif + namespace internals { + /*! + @brief calculates the extra space to escape a JSON string -#ifdef _MSC_VER -#pragma region Specializations -#endif - CSV_INLINE void MmapParser::next(size_t bytes = ITERATION_CHUNK_SIZE) { - // Reset parser state - this->field_start = UNINITIALIZED_FIELD; - this->field_length = 0; - this->reset_data_ptr(); + @param[in] s the string to escape + @return the number of characters required to escape string @a s - // Create memory map - size_t length = std::min(this->source_size - this->mmap_pos, bytes); - std::error_code error; - this->data_ptr->_data = std::make_shared>(mio::make_mmap_source(this->_filename, this->mmap_pos, length, error)); - this->mmap_pos += length; - if (error) throw error; + @complexity Linear in the length of string @a s. + */ + static std::size_t json_extra_space(csv::string_view& s) noexcept + { + std::size_t result = 0; - auto mmap_ptr = (mio::basic_mmap_source*)(this->data_ptr->_data.get()); - // Create string view - this->data_ptr->data = csv::string_view(mmap_ptr->data(), mmap_ptr->length()); + for (const auto& c : s) + { + switch (c) + { + case '"': + case '\\': + case '\b': + case '\f': + case '\n': + case '\r': + case '\t': + { + // from c (1 byte) to \x (2 bytes) + result += 1; + break; + } - // Parse - this->current_row = CSVRow(this->data_ptr); - size_t remainder = this->parse(); - if (this->mmap_pos == this->source_size || no_chunk()) { - this->_eof = true; - this->end_feed(); + default: + { + if (c >= 0x00 && c <= 0x1f) + { + // from c (1 byte) to \uxxxx (6 bytes) + result += 5; + } + break; + } + } } - this->mmap_pos -= (length - remainder); - } -#ifdef _MSC_VER -#pragma endregion -#endif - } -} - -namespace csv { - namespace internals { - CSV_INLINE std::vector ColNames::get_col_names() const { - return this->col_names; + return result; } - CSV_INLINE void ColNames::set_col_names(const std::vector& cnames) { - this->col_names = cnames; - - for (size_t i = 0; i < cnames.size(); i++) { - this->col_pos[cnames[i]] = i; + CSV_INLINE std::string json_escape_string(csv::string_view s) noexcept + { + const auto space = json_extra_space(s); + if (space == 0) + { + return std::string(s); } - } - CSV_INLINE int ColNames::index_of(csv::string_view col_name) const { - auto pos = this->col_pos.find(col_name.data()); - if (pos != this->col_pos.end()) - return (int)pos->second; + // create a result string of necessary size + size_t result_size = s.size() + space; + std::string result(result_size, '\\'); + std::size_t pos = 0; - return CSV_NOT_FOUND; - } + for (const auto& c : s) + { + switch (c) + { + // quotation mark (0x22) + case '"': + { + result[pos + 1] = '"'; + pos += 2; + break; + } - CSV_INLINE size_t ColNames::size() const noexcept { - return this->col_names.size(); - } - } -} -/** @file - * Defines an object used to store CSV format settings - */ + // reverse solidus (0x5c) + case '\\': + { + // nothing to change + pos += 2; + break; + } -#include -#include + // backspace (0x08) + case '\b': + { + result[pos + 1] = 'b'; + pos += 2; + break; + } -namespace csv { - CSV_INLINE CSVFormat& CSVFormat::delimiter(char delim) { - this->possible_delimiters = { delim }; - this->assert_no_char_overlap(); - return *this; - } - CSV_INLINE CSVFormat& CSVFormat::delimiter(const std::vector & delim) { - this->possible_delimiters = delim; - this->assert_no_char_overlap(); - return *this; - } + // formfeed (0x0c) + case '\f': + { + result[pos + 1] = 'f'; + pos += 2; + break; + } - CSV_INLINE CSVFormat& CSVFormat::quote(char quote) { - this->no_quote = false; - this->quote_char = quote; - this->assert_no_char_overlap(); - return *this; - } - CSV_INLINE CSVFormat& CSVFormat::trim(const std::vector & chars) { - this->trim_chars = chars; - this->assert_no_char_overlap(); - return *this; - } + // newline (0x0a) + case '\n': + { + result[pos + 1] = 'n'; + pos += 2; + break; + } - CSV_INLINE CSVFormat& CSVFormat::column_names(const std::vector& names) { - this->col_names = names; - this->header = -1; - return *this; - } - CSV_INLINE CSVFormat& CSVFormat::header_row(int row) { - if (row < 0) this->variable_column_policy = VariableColumnPolicy::KEEP; + // carriage return (0x0d) + case '\r': + { + result[pos + 1] = 'r'; + pos += 2; + break; + } - this->header = row; - this->col_names = {}; - return *this; + + // horizontal tab (0x09) + case '\t': + { + result[pos + 1] = 't'; + pos += 2; + break; + } + + + default: + { + if (c >= 0x00 && c <= 0x1f) + { + // print character c as \uxxxx + snprintf(&result[pos + 1], result_size - pos - 1, "u%04x", int(c)); + pos += 6; + // overwrite trailing null character + result[pos] = '\\'; + } + else + { + // all other characters are added as-is + result[pos++] = c; + } + break; + } + } + } + + return result; + } } - CSV_INLINE void CSVFormat::assert_no_char_overlap() - { - auto delims = std::set( - this->possible_delimiters.begin(), this->possible_delimiters.end()), - trims = std::set( - this->trim_chars.begin(), this->trim_chars.end()); + /** Convert a CSV row to a JSON object, i.e. + * `{"col1":"value1","col2":"value2"}` + * + * @note All strings are properly escaped. Numeric values are not quoted. + * @param[in] subset A subset of columns to contain in the JSON. + * Leave empty for original columns. + */ + CSV_INLINE std::string CSVRow::to_json(const std::vector& subset) const { + std::vector col_names = subset; + if (subset.empty()) { + col_names = this->data ? this->get_col_names() : std::vector({}); + } - // Stores intersection of possible delimiters and trim characters - std::vector intersection = {}; + const size_t _n_cols = col_names.size(); + std::string ret = "{"; + + for (size_t i = 0; i < _n_cols; i++) { + auto& col = col_names[i]; + auto field = this->operator[](col); - // Find which characters overlap, if any - std::set_intersection( - delims.begin(), delims.end(), - trims.begin(), trims.end(), - std::back_inserter(intersection)); + // TODO: Possible performance enhancements by caching escaped column names + ret += '"' + internals::json_escape_string(col) + "\":"; - // Make sure quote character is not contained in possible delimiters - // or whitespace characters - if (delims.find(this->quote_char) != delims.end() || - trims.find(this->quote_char) != trims.end()) { - intersection.push_back(this->quote_char); + // Add quotes around strings but not numbers + if (field.is_num()) + ret += internals::json_escape_string(field.get()); + else + ret += '"' + internals::json_escape_string(field.get()) + '"'; + + // Do not add comma after last string + if (i + 1 < _n_cols) + ret += ','; } - if (!intersection.empty()) { - std::string err_msg = "There should be no overlap between the quote character, " - "the set of possible delimiters " - "and the set of whitespace characters. Offending characters: "; + ret += '}'; + return ret; + } - // Create a pretty error message with the list of overlapping - // characters - for (size_t i = 0; i < intersection.size(); i++) { - err_msg += "'"; - err_msg += intersection[i]; - err_msg += "'"; + /** Convert a CSV row to a JSON array, i.e. + * `["value1","value2",...]` + * + * @note All strings are properly escaped. Numeric values are not quoted. + * @param[in] subset A subset of columns to contain in the JSON. + * Leave empty for all columns. + */ + CSV_INLINE std::string CSVRow::to_json_array(const std::vector& subset) const { + std::vector col_names = subset; + if (subset.empty()) + col_names = this->data ? this->get_col_names() : std::vector({}); - if (i + 1 < intersection.size()) - err_msg += ", "; - } + const size_t _n_cols = col_names.size(); + std::string ret = "["; - throw std::runtime_error(err_msg + '.'); + for (size_t i = 0; i < _n_cols; i++) { + auto field = this->operator[](col_names[i]); + + // Add quotes around strings but not numbers + if (field.is_num()) + ret += internals::json_escape_string(field.get()); + else + ret += '"' + internals::json_escape_string(field.get()) + '"'; + + // Do not add comma after last string + if (i + 1 < _n_cols) + ret += ','; } + + ret += ']'; + return ret; } } + /** @file - * @brief Defines functionality needed for basic CSV parsing + * Calculates statistics from CSV files */ +#include namespace csv { - namespace internals { - CSV_INLINE std::string format_row(const std::vector& row, csv::string_view delim) { - /** Print a CSV row */ - std::stringstream ret; - for (size_t i = 0; i < row.size(); i++) { - ret << row[i]; - if (i + 1 < row.size()) ret << delim; - else ret << '\n'; - } - ret.flush(); + /** Calculate statistics for an arbitrarily large file. When this constructor + * is called, CSVStat will process the entire file iteratively. Once finished, + * methods like get_mean(), get_counts(), etc... can be used to retrieve statistics. + */ + CSV_INLINE CSVStat::CSVStat(csv::string_view filename, CSVFormat format) : + reader(filename, format) { + this->calc(); + } - return ret.str(); - } + /** Calculate statistics for a CSV stored in a std::stringstream */ + CSV_INLINE CSVStat::CSVStat(std::stringstream& stream, CSVFormat format) : + reader(stream, format) { + this->calc(); + } - /** Return a CSV's column names - * - * @param[in] filename Path to CSV file - * @param[in] format Format of the CSV file - * - */ - CSV_INLINE std::vector _get_col_names(csv::string_view head, CSVFormat format) { - // Parse the CSV - auto trim_chars = format.get_trim_chars(); - std::stringstream source(head.data()); - RowCollection rows; + /** Return current means */ + CSV_INLINE std::vector CSVStat::get_mean() const { + std::vector ret; + for (size_t i = 0; i < this->get_col_names().size(); i++) { + ret.push_back(this->rolling_means[i]); + } + return ret; + } - StreamParser parser(source, format); - parser.set_output(rows); - parser.next(); + /** Return current variances */ + CSV_INLINE std::vector CSVStat::get_variance() const { + std::vector ret; + for (size_t i = 0; i < this->get_col_names().size(); i++) { + ret.push_back(this->rolling_vars[i]/(this->n[i] - 1)); + } + return ret; + } - return CSVRow(std::move(rows[format.get_header()])); + /** Return current mins */ + CSV_INLINE std::vector CSVStat::get_mins() const { + std::vector ret; + for (size_t i = 0; i < this->get_col_names().size(); i++) { + ret.push_back(this->mins[i]); } + return ret; + } - CSV_INLINE GuessScore calculate_score(csv::string_view head, const CSVFormat& format) { - // Frequency counter of row length - std::unordered_map row_tally = { { 0, 0 } }; + /** Return current maxes */ + CSV_INLINE std::vector CSVStat::get_maxes() const { + std::vector ret; + for (size_t i = 0; i < this->get_col_names().size(); i++) { + ret.push_back(this->maxes[i]); + } + return ret; + } - // Map row lengths to row num where they first occurred - std::unordered_map row_when = { { 0, 0 } }; + /** Get counts for each column */ + CSV_INLINE std::vector CSVStat::get_counts() const { + std::vector ret; + for (size_t i = 0; i < this->get_col_names().size(); i++) { + ret.push_back(this->counts[i]); + } + return ret; + } - // Parse the CSV - std::stringstream source(head.data()); - RowCollection rows; + /** Get data type counts for each column */ + CSV_INLINE std::vector CSVStat::get_dtypes() const { + std::vector ret; + for (size_t i = 0; i < this->get_col_names().size(); i++) { + ret.push_back(this->dtypes[i]); + } + return ret; + } - StreamParser parser(source, format); - parser.set_output(rows); - parser.next(); - - for (size_t i = 0; i < rows.size(); i++) { - auto& row = rows[i]; - - // Ignore zero-length rows - if (row.size() > 0) { - if (row_tally.find(row.size()) != row_tally.end()) { - row_tally[row.size()]++; - } - else { - row_tally[row.size()] = 1; - row_when[row.size()] = i; - } - } + CSV_INLINE void CSVStat::calc_chunk() { + /** Only create stats counters the first time **/ + if (dtypes.empty()) { + /** Go through all records and calculate specified statistics */ + for (size_t i = 0; i < this->get_col_names().size(); i++) { + dtypes.push_back({}); + counts.push_back({}); + rolling_means.push_back(0); + rolling_vars.push_back(0); + mins.push_back(NAN); + maxes.push_back(NAN); + n.push_back(0); } + } - double final_score = 0; - size_t header_row = 0; - - // Final score is equal to the largest - // row size times rows of that size - for (auto& pair : row_tally) { - auto row_size = pair.first; - auto row_count = pair.second; - double score = (double)(row_size * row_count); - if (score > final_score) { - final_score = score; - header_row = row_when[row_size]; - } - } + // Start threads + std::vector pool; + for (size_t i = 0; i < this->get_col_names().size(); i++) + pool.push_back(std::thread(&CSVStat::calc_worker, this, i)); - return { - final_score, - header_row - }; - } + // Block until done + for (auto& th : pool) + th.join(); - /** Guess the delimiter used by a delimiter-separated values file */ - CSV_INLINE CSVGuessResult _guess_format(csv::string_view head, const std::vector& delims) { - /** For each delimiter, find out which row length was most common. - * The delimiter with the longest mode row length wins. - * Then, the line number of the header row is the first row with - * the mode row length. - */ + this->records.clear(); + } - CSVFormat format; - size_t max_score = 0, - header = 0; - char current_delim = delims[0]; + CSV_INLINE void CSVStat::calc() { + constexpr size_t CALC_CHUNK_SIZE = 5000; - for (char cand_delim : delims) { - auto result = calculate_score(head, format.delimiter(cand_delim)); + for (auto& row : reader) { + this->records.push_back(std::move(row)); - if ((size_t)result.score > max_score) { - max_score = (size_t)result.score; - current_delim = cand_delim; - header = result.header; - } + /** Chunk rows */ + if (this->records.size() == CALC_CHUNK_SIZE) { + calc_chunk(); } + } - return { current_delim, (int)header }; + if (!this->records.empty()) { + calc_chunk(); } } - /** Return a CSV's column names - * - * @param[in] filename Path to CSV file - * @param[in] format Format of the CSV file - * - */ - CSV_INLINE std::vector get_col_names(csv::string_view filename, CSVFormat format) { - auto head = internals::get_csv_head(filename); + CSV_INLINE void CSVStat::calc_worker(const size_t &i) { + /** Worker thread for CSVStat::calc() which calculates statistics for one column. + * + * @param[in] i Column index + */ - /** Guess delimiter and header row */ - if (format.guess_delim()) { - auto guess_result = guess_format(filename, format.get_possible_delims()); - format.delimiter(guess_result.delim).header_row(guess_result.header_row); - } + auto current_record = this->records.begin(); - return internals::_get_col_names(head, format); - } + for (size_t processed = 0; current_record != this->records.end(); processed++) { + if (current_record->size() == this->get_col_names().size()) { + auto current_field = (*current_record)[i]; - /** Guess the delimiter used by a delimiter-separated values file */ - CSV_INLINE CSVGuessResult guess_format(csv::string_view filename, const std::vector& delims) { - auto head = internals::get_csv_head(filename); - return internals::_guess_format(head, delims); - } + // Optimization: Don't count() if there's too many distinct values in the first 1000 rows + if (processed < 1000 || this->counts[i].size() <= 500) + this->count(current_field, i); - /** Reads an arbitrarily large CSV file using memory-mapped IO. - * - * **Details:** Reads the first block of a CSV file synchronously to get information - * such as column names and delimiting character. - * - * @param[in] filename Path to CSV file - * @param[in] format Format of the CSV file - * - * \snippet tests/test_read_csv.cpp CSVField Example - * - */ - CSV_INLINE CSVReader::CSVReader(csv::string_view filename, CSVFormat format) : _format(format) { - auto head = internals::get_csv_head(filename); - using Parser = internals::MmapParser; + this->dtype(current_field, i); - /** Guess delimiter and header row */ - if (format.guess_delim()) { - auto guess_result = internals::_guess_format(head, format.possible_delimiters); - format.delimiter(guess_result.delim); - format.header = guess_result.header_row; - this->_format = format; - } + // Numeric Stuff + if (current_field.is_num()) { + long double x_n = current_field.get(); - if (!format.col_names.empty()) - this->set_col_names(format.col_names); + // This actually calculates mean AND variance + this->variance(x_n, i); + this->min_max(x_n, i); + } + } + else if (this->reader.get_format().get_variable_column_policy() == VariableColumnPolicy::THROW) { + throw std::runtime_error("Line has different length than the others " + internals::format_row(*current_record)); + } - this->parser = std::unique_ptr(new Parser(filename, format, this->col_names)); // For C++11 - this->initial_read(); + ++current_record; + } } - /** Return the format of the original raw CSV */ - CSV_INLINE CSVFormat CSVReader::get_format() const { - CSVFormat new_format = this->_format; + CSV_INLINE void CSVStat::dtype(CSVField& data, const size_t &i) { + /** Given a record update the type counter + * @param[in] record Data observation + * @param[out] i The column index that should be updated + */ + + auto type = data.type(); + if (this->dtypes[i].find(type) != + this->dtypes[i].end()) { + // Increment count + this->dtypes[i][type]++; + } else { + // Initialize count + this->dtypes[i].insert(std::make_pair(type, 1)); + } + } - // Since users are normally not allowed to set - // column names and header row simulatenously, - // we will set the backing variables directly here - new_format.col_names = this->col_names->get_col_names(); - new_format.header = this->_format.header; + CSV_INLINE void CSVStat::count(CSVField& data, const size_t &i) { + /** Given a record update the frequency counter + * @param[in] record Data observation + * @param[out] i The column index that should be updated + */ - return new_format; - } + auto item = data.get(); - /** Return the CSV's column names as a vector of strings. */ - CSV_INLINE std::vector CSVReader::get_col_names() const { - if (this->col_names) { - return this->col_names->get_col_names(); + if (this->counts[i].find(item) != + this->counts[i].end()) { + // Increment count + this->counts[i][item]++; + } else { + // Initialize count + this->counts[i].insert(std::make_pair(item, 1)); } - - return std::vector(); } - /** Return the index of the column name if found or - * csv::CSV_NOT_FOUND otherwise. - */ - CSV_INLINE int CSVReader::index_of(csv::string_view col_name) const { - auto _col_names = this->get_col_names(); - for (size_t i = 0; i < _col_names.size(); i++) - if (_col_names[i] == col_name) return (int)i; - - return CSV_NOT_FOUND; + CSV_INLINE void CSVStat::min_max(const long double &x_n, const size_t &i) { + /** Update current minimum and maximum + * @param[in] x_n Data observation + * @param[out] i The column index that should be updated + */ + if (std::isnan(this->mins[i])) + this->mins[i] = x_n; + if (std::isnan(this->maxes[i])) + this->maxes[i] = x_n; + + if (x_n < this->mins[i]) + this->mins[i] = x_n; + else if (x_n > this->maxes[i]) + this->maxes[i] = x_n; } - CSV_INLINE void CSVReader::trim_header() { - if (!this->header_trimmed) { - for (int i = 0; i <= this->_format.header && !this->records->empty(); i++) { - if (i == this->_format.header && this->col_names->empty()) { - this->set_col_names(this->records->pop_front()); - } - else { - this->records->pop_front(); - } - } + CSV_INLINE void CSVStat::variance(const long double &x_n, const size_t &i) { + /** Given a record update rolling mean and variance for all columns + * using Welford's Algorithm + * @param[in] x_n Data observation + * @param[out] i The column index that should be updated + */ + long double& current_rolling_mean = this->rolling_means[i]; + long double& current_rolling_var = this->rolling_vars[i]; + long double& current_n = this->n[i]; + long double delta; + long double delta2; - this->header_trimmed = true; + current_n++; + + if (current_n == 1) { + current_rolling_mean = x_n; + } else { + delta = x_n - current_rolling_mean; + current_rolling_mean += delta/current_n; + delta2 = x_n - current_rolling_mean; + current_rolling_var += delta*delta2; } } - /** - * @param[in] names Column names - */ - CSV_INLINE void CSVReader::set_col_names(const std::vector& names) - { - this->col_names->set_col_names(names); - this->n_cols = names.size(); - } - - /** - * Read a chunk of CSV data. - * - * @note This method is meant to be run on its own thread. Only one `read_csv()` thread - * should be active at a time. - * - * @param[in] bytes Number of bytes to read. - * - * @see CSVReader::read_csv_worker - * @see CSVReader::read_row() - */ - CSV_INLINE bool CSVReader::read_csv(size_t bytes) { - // Tell read_row() to listen for CSV rows - this->records->notify_all(); - - this->parser->set_output(*this->records); - this->parser->next(bytes); - - if (!this->header_trimmed) { - this->trim_header(); - } - - // Tell read_row() to stop waiting - this->records->kill_all(); - - return true; - } - - /** - * Retrieve rows as CSVRow objects, returning true if more rows are available. - * - * @par Performance Notes - * - Reads chunks of data that are csv::internals::ITERATION_CHUNK_SIZE bytes large at a time - * - For performance details, read the documentation for CSVRow and CSVField. - * - * @param[out] row The variable where the parsed row will be stored - * @see CSVRow, CSVField + /** Useful for uploading CSV files to SQL databases. * - * **Example:** - * \snippet tests/test_read_csv.cpp CSVField Example + * Return a data type for each column such that every value in a column can be + * converted to the corresponding data type without data loss. + * @param[in] filename The CSV file * + * \return A mapping of column names to csv::DataType enums */ - CSV_INLINE bool CSVReader::read_row(CSVRow &row) { - while (true) { - if (this->records->empty()) { - if (this->records->is_waitable()) - // Reading thread is currently active => wait for it to populate records - this->records->wait(); - else if (this->parser->eof()) - // End of file and no more records - return false; - else { - // Reading thread is not active => start another one - if (this->read_csv_worker.joinable()) - this->read_csv_worker.join(); + CSV_INLINE std::unordered_map csv_data_types(const std::string& filename) { + CSVStat stat(filename); + std::unordered_map csv_dtypes; - this->read_csv_worker = std::thread(&CSVReader::read_csv, this, internals::ITERATION_CHUNK_SIZE); - } - } - else if (this->records->front().size() != this->n_cols && - this->_format.variable_column_policy != VariableColumnPolicy::KEEP) { - auto errored_row = this->records->pop_front(); + auto col_names = stat.get_col_names(); + auto temp = stat.get_dtypes(); - if (this->_format.variable_column_policy == VariableColumnPolicy::THROW) { - if (errored_row.size() < this->n_cols) - throw std::runtime_error("Line too short " + internals::format_row(errored_row)); + for (size_t i = 0; i < stat.get_col_names().size(); i++) { + auto& col = temp[i]; + auto& col_name = col_names[i]; - throw std::runtime_error("Line too long " + internals::format_row(errored_row)); - } - } - else { - row = this->records->pop_front(); - this->_n_rows++; - return true; - } + if (col[DataType::CSV_STRING]) + csv_dtypes[col_name] = DataType::CSV_STRING; + else if (col[DataType::CSV_INT64]) + csv_dtypes[col_name] = DataType::CSV_INT64; + else if (col[DataType::CSV_INT32]) + csv_dtypes[col_name] = DataType::CSV_INT32; + else if (col[DataType::CSV_INT16]) + csv_dtypes[col_name] = DataType::CSV_INT16; + else if (col[DataType::CSV_INT8]) + csv_dtypes[col_name] = DataType::CSV_INT8; + else + csv_dtypes[col_name] = DataType::CSV_DOUBLE; } - return false; + return csv_dtypes; } } - -/** @file - * Defines an input iterator for csv::CSVReader - */ +#include +#include namespace csv { - /** Return an iterator to the first row in the reader */ - CSV_INLINE CSVReader::iterator CSVReader::begin() { - if (this->records->empty()) { - this->read_csv_worker = std::thread(&CSVReader::read_csv, this, internals::ITERATION_CHUNK_SIZE); - this->read_csv_worker.join(); + /** Shorthand function for parsing an in-memory CSV string + * + * @return A collection of CSVRow objects + * + * @par Example + * @snippet tests/test_read_csv.cpp Parse Example + */ + CSV_INLINE CSVReader parse(csv::string_view in, CSVFormat format) { + std::stringstream stream(in.data()); + return CSVReader(stream, format); + } - // Still empty => return end iterator - if (this->records->empty()) return this->end(); - } + /** Parses a CSV string with no headers + * + * @return A collection of CSVRow objects + */ + CSV_INLINE CSVReader parse_no_header(csv::string_view in) { + CSVFormat format; + format.header_row(-1); - this->_n_rows++; - CSVReader::iterator ret(this, this->records->pop_front()); - return ret; + return parse(in, format); } - /** A placeholder for the imaginary past the end row in a CSV. - * Attempting to deference this will lead to bad things. + /** Parse a RFC 4180 CSV string, returning a collection + * of CSVRow objects + * + * @par Example + * @snippet tests/test_read_csv.cpp Escaped Comma + * */ - CSV_INLINE HEDLEY_CONST CSVReader::iterator CSVReader::end() const noexcept { - return CSVReader::iterator(); + CSV_INLINE CSVReader operator ""_csv(const char* in, size_t n) { + return parse(csv::string_view(in, n)); } - ///////////////////////// - // CSVReader::iterator // - ///////////////////////// - - CSV_INLINE CSVReader::iterator::iterator(CSVReader* _daddy, CSVRow&& _row) : - daddy(_daddy) { - row = std::move(_row); + /** A shorthand for csv::parse_no_header() */ + CSV_INLINE CSVReader operator ""_csv_no_header(const char* in, size_t n) { + return parse_no_header(csv::string_view(in, n)); } - /** Advance the iterator by one row. If this CSVReader has an - * associated file, then the iterator will lazily pull more data from - * that file until the end of file is reached. - * - * @note This iterator does **not** block the thread responsible for parsing CSV. + /** + * Find the position of a column in a CSV file or CSV_NOT_FOUND otherwise * + * @param[in] filename Path to CSV file + * @param[in] col_name Column whose position we should resolve + * @param[in] format Format of the CSV file */ - CSV_INLINE CSVReader::iterator& CSVReader::iterator::operator++() { - if (!daddy->read_row(this->row)) { - this->daddy = nullptr; // this == end() - } - - return *this; + CSV_INLINE int get_col_pos( + csv::string_view filename, + csv::string_view col_name, + const CSVFormat& format) { + CSVReader reader(filename, format); + return reader.index_of(col_name); } - /** Post-increment iterator */ - CSV_INLINE CSVReader::iterator CSVReader::iterator::operator++(int) { - auto temp = *this; - if (!daddy->read_row(this->row)) { - this->daddy = nullptr; // this == end() - } + /** Get basic information about a CSV file + * @include programs/csv_info.cpp + */ + CSV_INLINE CSVFileInfo get_file_info(const std::string& filename) { + CSVReader reader(filename); + CSVFormat format = reader.get_format(); + for (auto it = reader.begin(); it != reader.end(); ++it); - return temp; + CSVFileInfo info = { + filename, + reader.get_col_names(), + format.get_delim(), + reader.n_rows(), + reader.get_col_names().size() + }; + + return info; } } - /** @file - * Defines the data type used for storing information about a CSV row + * @brief Defines functionality needed for basic CSV parsing */ -#include -#include namespace csv { namespace internals { - CSV_INLINE RawCSVField& CSVFieldList::operator[](size_t n) const { - const size_t page_no = n / _single_buffer_capacity; - const size_t buffer_idx = (page_no < 1) ? n : n % _single_buffer_capacity; - return this->buffers[page_no][buffer_idx]; + CSV_INLINE std::string format_row(const std::vector& row, csv::string_view delim) { + /** Print a CSV row */ + std::stringstream ret; + for (size_t i = 0; i < row.size(); i++) { + ret << row[i]; + if (i + 1 < row.size()) ret << delim; + else ret << '\n'; + } + ret.flush(); + + return ret.str(); } - CSV_INLINE void CSVFieldList::allocate() { - buffers.push_back(std::unique_ptr(new RawCSVField[_single_buffer_capacity])); + /** Return a CSV's column names + * + * @param[in] filename Path to CSV file + * @param[in] format Format of the CSV file + * + */ + CSV_INLINE std::vector _get_col_names(csv::string_view head, CSVFormat format) { + // Parse the CSV + auto trim_chars = format.get_trim_chars(); + std::stringstream source(head.data()); + RowCollection rows; - _current_buffer_size = 0; - _back = buffers.back().get(); + StreamParser parser(source, format); + parser.set_output(rows); + parser.next(); + + return CSVRow(std::move(rows[format.get_header()])); } - } - /** Return a CSVField object corrsponding to the nth value in the row. - * - * @note This method performs bounds checking, and will throw an - * `std::runtime_error` if n is invalid. - * - * @complexity - * Constant, by calling csv::CSVRow::get_csv::string_view() - * - */ - CSV_INLINE CSVField CSVRow::operator[](size_t n) const { - return CSVField(this->get_field(n)); - } - - /** Retrieve a value by its associated column name. If the column - * specified can't be round, a runtime error is thrown. - * - * @complexity - * Constant. This calls the other CSVRow::operator[]() after - * converting column names into indices using a hash table. - * - * @param[in] col_name The column to look for - */ - CSV_INLINE CSVField CSVRow::operator[](const std::string& col_name) const { - auto & col_names = this->data->col_names; - auto col_pos = col_names->index_of(col_name); - if (col_pos > -1) { - return this->operator[](col_pos); - } - - throw std::runtime_error("Can't find a column named " + col_name); - } - - CSV_INLINE CSVRow::operator std::vector() const { - std::vector ret; - for (size_t i = 0; i < size(); i++) - ret.push_back(std::string(this->get_field(i))); + CSV_INLINE GuessScore calculate_score(csv::string_view head, const CSVFormat& format) { + // Frequency counter of row length + std::unordered_map row_tally = { { 0, 0 } }; - return ret; - } + // Map row lengths to row num where they first occurred + std::unordered_map row_when = { { 0, 0 } }; - CSV_INLINE csv::string_view CSVRow::get_field(size_t index) const - { - using internals::ParseFlags; + // Parse the CSV + std::stringstream source(head.data()); + RowCollection rows; - if (index >= this->size()) - throw std::runtime_error("Index out of bounds."); + StreamParser parser(source, format); + parser.set_output(rows); + parser.next(); - const size_t field_index = this->fields_start + index; - auto& field = this->data->fields[field_index]; - auto field_str = csv::string_view(this->data->data).substr(this->data_start + field.start); + for (size_t i = 0; i < rows.size(); i++) { + auto& row = rows[i]; - if (field.has_double_quote) { - auto& value = this->data->double_quote_fields[field_index]; - if (value.empty()) { - bool prev_ch_quote = false; - for (size_t i = 0; i < field.length; i++) { - if (this->data->parse_flags[field_str[i] + 128] == ParseFlags::QUOTE) { - if (prev_ch_quote) { - prev_ch_quote = false; - continue; - } - else { - prev_ch_quote = true; - } + // Ignore zero-length rows + if (row.size() > 0) { + if (row_tally.find(row.size()) != row_tally.end()) { + row_tally[row.size()]++; + } + else { + row_tally[row.size()] = 1; + row_when[row.size()] = i; } - - value += field_str[i]; } } - return csv::string_view(value); - } - - return field_str.substr(0, field.length); - } + double final_score = 0; + size_t header_row = 0; - CSV_INLINE bool CSVField::try_parse_hex(int& parsedValue) { - size_t start = 0, end = 0; + // Final score is equal to the largest + // row size times rows of that size + for (auto& pair : row_tally) { + auto row_size = pair.first; + auto row_count = pair.second; + double score = (double)(row_size * row_count); + if (score > final_score) { + final_score = score; + header_row = row_when[row_size]; + } + } - // Trim out whitespace chars - for (; start < this->sv.size() && this->sv[start] == ' '; start++); - for (end = start; end < this->sv.size() && this->sv[end] != ' '; end++); - - int value_ = 0; + return { + final_score, + header_row + }; + } - size_t digits = (end - start); - size_t base16_exponent = digits - 1; + /** Guess the delimiter used by a delimiter-separated values file */ + CSV_INLINE CSVGuessResult _guess_format(csv::string_view head, const std::vector& delims) { + /** For each delimiter, find out which row length was most common. + * The delimiter with the longest mode row length wins. + * Then, the line number of the header row is the first row with + * the mode row length. + */ - if (digits == 0) return false; + CSVFormat format; + size_t max_score = 0, + header = 0; + char current_delim = delims[0]; - for (const auto& ch : this->sv.substr(start, digits)) { - int digit = 0; + for (char cand_delim : delims) { + auto result = calculate_score(head, format.delimiter(cand_delim)); - switch (ch) { - case '0': - case '1': - case '2': - case '3': - case '4': - case '5': - case '6': - case '7': - case '8': - case '9': - digit = static_cast(ch - '0'); - break; - case 'a': - case 'A': - digit = 10; - break; - case 'b': - case 'B': - digit = 11; - break; - case 'c': - case 'C': - digit = 12; - break; - case 'd': - case 'D': - digit = 13; - break; - case 'e': - case 'E': - digit = 14; - break; - case 'f': - case 'F': - digit = 15; - break; - default: - return false; + if ((size_t)result.score > max_score) { + max_score = (size_t)result.score; + current_delim = cand_delim; + header = result.header; + } } - value_ += digit * (int)pow(16, (double)base16_exponent); - base16_exponent--; + return { current_delim, (int)header }; } - - parsedValue = value_; - return true; } - CSV_INLINE bool CSVField::try_parse_decimal(long double& dVal, const char decimalSymbol) { - // If field has already been parsed to empty, no need to do it aagin: - if (this->_type == DataType::CSV_NULL) - return false; - - // Not yet parsed or possibly parsed with other decimalSymbol - if (this->_type == DataType::UNKNOWN || this->_type == DataType::CSV_STRING || this->_type == DataType::CSV_DOUBLE) - this->_type = internals::data_type(this->sv, &this->value, decimalSymbol); // parse again - - // Integral types are not affected by decimalSymbol and need not be parsed again + /** Return a CSV's column names + * + * @param[in] filename Path to CSV file + * @param[in] format Format of the CSV file + * + */ + CSV_INLINE std::vector get_col_names(csv::string_view filename, CSVFormat format) { + auto head = internals::get_csv_head(filename); - // Either we already had an integral type before, or we we just got any numeric type now. - if (this->_type >= DataType::CSV_INT8 && this->_type <= DataType::CSV_DOUBLE) { - dVal = this->value; - return true; + /** Guess delimiter and header row */ + if (format.guess_delim()) { + auto guess_result = guess_format(filename, format.get_possible_delims()); + format.delimiter(guess_result.delim).header_row(guess_result.header_row); } - // CSV_NULL or CSV_STRING, not numeric - return false; + return internals::_get_col_names(head, format); } -#ifdef _MSC_VER -#pragma region CSVRow Iterator -#endif - /** Return an iterator pointing to the first field. */ - CSV_INLINE CSVRow::iterator CSVRow::begin() const { - return CSVRow::iterator(this, 0); + /** Guess the delimiter used by a delimiter-separated values file */ + CSV_INLINE CSVGuessResult guess_format(csv::string_view filename, const std::vector& delims) { + auto head = internals::get_csv_head(filename); + return internals::_guess_format(head, delims); } - /** Return an iterator pointing to just after the end of the CSVRow. + /** Reads an arbitrarily large CSV file using memory-mapped IO. + * + * **Details:** Reads the first block of a CSV file synchronously to get information + * such as column names and delimiting character. + * + * @param[in] filename Path to CSV file + * @param[in] format Format of the CSV file + * + * \snippet tests/test_read_csv.cpp CSVField Example * - * @warning Attempting to dereference the end iterator results - * in dereferencing a null pointer. */ - CSV_INLINE CSVRow::iterator CSVRow::end() const noexcept { - return CSVRow::iterator(this, (int)this->size()); - } + CSV_INLINE CSVReader::CSVReader(csv::string_view filename, CSVFormat format) : _format(format) { + auto head = internals::get_csv_head(filename); + using Parser = internals::MmapParser; - CSV_INLINE CSVRow::reverse_iterator CSVRow::rbegin() const noexcept { - return std::reverse_iterator(this->end()); - } + /** Guess delimiter and header row */ + if (format.guess_delim()) { + auto guess_result = internals::_guess_format(head, format.possible_delimiters); + format.delimiter(guess_result.delim); + format.header = guess_result.header_row; + this->_format = format; + } - CSV_INLINE CSVRow::reverse_iterator CSVRow::rend() const { - return std::reverse_iterator(this->begin()); - } + if (!format.col_names.empty()) + this->set_col_names(format.col_names); - CSV_INLINE HEDLEY_NON_NULL(2) - CSVRow::iterator::iterator(const CSVRow* _reader, int _i) - : daddy(_reader), i(_i) { - if (_i < (int)this->daddy->size()) - this->field = std::make_shared( - this->daddy->operator[](_i)); - else - this->field = nullptr; + this->parser = std::unique_ptr(new Parser(filename, format, this->col_names)); // For C++11 + this->initial_read(); } - CSV_INLINE CSVRow::iterator::reference CSVRow::iterator::operator*() const { - return *(this->field.get()); - } + /** Return the format of the original raw CSV */ + CSV_INLINE CSVFormat CSVReader::get_format() const { + CSVFormat new_format = this->_format; - CSV_INLINE CSVRow::iterator::pointer CSVRow::iterator::operator->() const { - return this->field; - } + // Since users are normally not allowed to set + // column names and header row simulatenously, + // we will set the backing variables directly here + new_format.col_names = this->col_names->get_col_names(); + new_format.header = this->_format.header; - CSV_INLINE CSVRow::iterator& CSVRow::iterator::operator++() { - // Pre-increment operator - this->i++; - if (this->i < (int)this->daddy->size()) - this->field = std::make_shared( - this->daddy->operator[](i)); - else // Reached the end of row - this->field = nullptr; - return *this; + return new_format; } - CSV_INLINE CSVRow::iterator CSVRow::iterator::operator++(int) { - // Post-increment operator - auto temp = *this; - this->operator++(); - return temp; - } + /** Return the CSV's column names as a vector of strings. */ + CSV_INLINE std::vector CSVReader::get_col_names() const { + if (this->col_names) { + return this->col_names->get_col_names(); + } - CSV_INLINE CSVRow::iterator& CSVRow::iterator::operator--() { - // Pre-decrement operator - this->i--; - this->field = std::make_shared( - this->daddy->operator[](this->i)); - return *this; + return std::vector(); } - CSV_INLINE CSVRow::iterator CSVRow::iterator::operator--(int) { - // Post-decrement operator - auto temp = *this; - this->operator--(); - return temp; - } - - CSV_INLINE CSVRow::iterator CSVRow::iterator::operator+(difference_type n) const { - // Allows for iterator arithmetic - return CSVRow::iterator(this->daddy, i + (int)n); - } + /** Return the index of the column name if found or + * csv::CSV_NOT_FOUND otherwise. + */ + CSV_INLINE int CSVReader::index_of(csv::string_view col_name) const { + auto _col_names = this->get_col_names(); + for (size_t i = 0; i < _col_names.size(); i++) + if (_col_names[i] == col_name) return (int)i; - CSV_INLINE CSVRow::iterator CSVRow::iterator::operator-(difference_type n) const { - // Allows for iterator arithmetic - return CSVRow::iterator::operator+(-n); + return CSV_NOT_FOUND; } -#ifdef _MSC_VER -#pragma endregion CSVRow Iterator -#endif -} -/** @file - * Implements JSON serialization abilities - */ + CSV_INLINE void CSVReader::trim_header() { + if (!this->header_trimmed) { + for (int i = 0; i <= this->_format.header && !this->records->empty(); i++) { + if (i == this->_format.header && this->col_names->empty()) { + this->set_col_names(this->records->pop_front()); + } + else { + this->records->pop_front(); + } + } + this->header_trimmed = true; + } + } -namespace csv { - /* - The implementations for json_extra_space() and json_escape_string() - were modified from source code for JSON for Modern C++. + /** + * @param[in] names Column names + */ + CSV_INLINE void CSVReader::set_col_names(const std::vector& names) + { + this->col_names->set_col_names(names); + this->n_cols = names.size(); + } - The respective license is below: + /** + * Read a chunk of CSV data. + * + * @note This method is meant to be run on its own thread. Only one `read_csv()` thread + * should be active at a time. + * + * @param[in] bytes Number of bytes to read. + * + * @see CSVReader::read_csv_worker + * @see CSVReader::read_row() + */ + CSV_INLINE bool CSVReader::read_csv(size_t bytes) { + // Tell read_row() to listen for CSV rows + this->records->notify_all(); - The code is licensed under the [MIT - License](http://opensource.org/licenses/MIT): - - Copyright © 2013-2015 Niels Lohmann. - - Permission is hereby granted, free of charge, to any person - obtaining a copy of this software and associated documentation files - (the "Software"), to deal in the Software without restriction, - including without limitation the rights to use, copy, modify, merge, - publish, distribute, sublicense, and/or sell copies of the Software, - and to permit persons to whom the Software is furnished to do so, - subject to the following conditions: - - The above copyright notice and this permission notice shall be - included in all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - SOFTWARE. - */ + this->parser->set_output(*this->records); + this->parser->next(bytes); - namespace internals { - /*! - @brief calculates the extra space to escape a JSON string + if (!this->header_trimmed) { + this->trim_header(); + } - @param[in] s the string to escape - @return the number of characters required to escape string @a s + // Tell read_row() to stop waiting + this->records->kill_all(); - @complexity Linear in the length of string @a s. - */ - static std::size_t json_extra_space(csv::string_view& s) noexcept - { - std::size_t result = 0; + return true; + } + /** + * Retrieve rows as CSVRow objects, returning true if more rows are available. + * + * @par Performance Notes + * - Reads chunks of data that are csv::internals::ITERATION_CHUNK_SIZE bytes large at a time + * - For performance details, read the documentation for CSVRow and CSVField. + * + * @param[out] row The variable where the parsed row will be stored + * @see CSVRow, CSVField + * + * **Example:** + * \snippet tests/test_read_csv.cpp CSVField Example + * + */ + CSV_INLINE bool CSVReader::read_row(CSVRow &row) { + while (true) { + if (this->records->empty()) { + if (this->records->is_waitable()) + // Reading thread is currently active => wait for it to populate records + this->records->wait(); + else if (this->parser->eof()) + // End of file and no more records + return false; + else { + // Reading thread is not active => start another one + if (this->read_csv_worker.joinable()) + this->read_csv_worker.join(); - for (const auto& c : s) - { - switch (c) - { - case '"': - case '\\': - case '\b': - case '\f': - case '\n': - case '\r': - case '\t': - { - // from c (1 byte) to \x (2 bytes) - result += 1; - break; + this->read_csv_worker = std::thread(&CSVReader::read_csv, this, internals::ITERATION_CHUNK_SIZE); } + } + else if (this->records->front().size() != this->n_cols && + this->_format.variable_column_policy != VariableColumnPolicy::KEEP) { + auto errored_row = this->records->pop_front(); + if (this->_format.variable_column_policy == VariableColumnPolicy::THROW) { + if (errored_row.size() < this->n_cols) + throw std::runtime_error("Line too short " + internals::format_row(errored_row)); - default: - { - if (c >= 0x00 && c <= 0x1f) - { - // from c (1 byte) to \uxxxx (6 bytes) - result += 5; - } - break; - } + throw std::runtime_error("Line too long " + internals::format_row(errored_row)); } } + else { + row = this->records->pop_front(); + this->_n_rows++; + return true; + } + } + return false; + } +} - return result; +/** @file + * Defines the data type used for storing information about a CSV row + */ + +#include +#include + +namespace csv { + namespace internals { + CSV_INLINE RawCSVField& CSVFieldList::operator[](size_t n) const { + const size_t page_no = n / _single_buffer_capacity; + const size_t buffer_idx = (page_no < 1) ? n : n % _single_buffer_capacity; + return this->buffers[page_no][buffer_idx]; } - CSV_INLINE std::string json_escape_string(csv::string_view s) noexcept - { - const auto space = json_extra_space(s); - if (space == 0) - { - return std::string(s); - } + CSV_INLINE void CSVFieldList::allocate() { + buffers.push_back(std::unique_ptr(new RawCSVField[_single_buffer_capacity])); - // create a result string of necessary size - size_t result_size = s.size() + space; - std::string result(result_size, '\\'); - std::size_t pos = 0; + _current_buffer_size = 0; + _back = buffers.back().get(); + } + } - for (const auto& c : s) - { - switch (c) - { - // quotation mark (0x22) - case '"': - { - result[pos + 1] = '"'; - pos += 2; - break; - } - - - // reverse solidus (0x5c) - case '\\': - { - // nothing to change - pos += 2; - break; - } - - - // backspace (0x08) - case '\b': - { - result[pos + 1] = 'b'; - pos += 2; - break; - } - - - // formfeed (0x0c) - case '\f': - { - result[pos + 1] = 'f'; - pos += 2; - break; - } + /** Return a CSVField object corrsponding to the nth value in the row. + * + * @note This method performs bounds checking, and will throw an + * `std::runtime_error` if n is invalid. + * + * @complexity + * Constant, by calling csv::CSVRow::get_csv::string_view() + * + */ + CSV_INLINE CSVField CSVRow::operator[](size_t n) const { + return CSVField(this->get_field(n)); + } + /** Retrieve a value by its associated column name. If the column + * specified can't be round, a runtime error is thrown. + * + * @complexity + * Constant. This calls the other CSVRow::operator[]() after + * converting column names into indices using a hash table. + * + * @param[in] col_name The column to look for + */ + CSV_INLINE CSVField CSVRow::operator[](const std::string& col_name) const { + auto & col_names = this->data->col_names; + auto col_pos = col_names->index_of(col_name); + if (col_pos > -1) { + return this->operator[](col_pos); + } - // newline (0x0a) - case '\n': - { - result[pos + 1] = 'n'; - pos += 2; - break; - } + throw std::runtime_error("Can't find a column named " + col_name); + } + CSV_INLINE CSVRow::operator std::vector() const { + std::vector ret; + for (size_t i = 0; i < size(); i++) + ret.push_back(std::string(this->get_field(i))); - // carriage return (0x0d) - case '\r': - { - result[pos + 1] = 'r'; - pos += 2; - break; - } + return ret; + } + CSV_INLINE csv::string_view CSVRow::get_field(size_t index) const + { + using internals::ParseFlags; - // horizontal tab (0x09) - case '\t': - { - result[pos + 1] = 't'; - pos += 2; - break; - } + if (index >= this->size()) + throw std::runtime_error("Index out of bounds."); + const size_t field_index = this->fields_start + index; + auto& field = this->data->fields[field_index]; + auto field_str = csv::string_view(this->data->data).substr(this->data_start + field.start); - default: - { - if (c >= 0x00 && c <= 0x1f) - { - // print character c as \uxxxx - snprintf(&result[pos + 1], result_size - pos - 1, "u%04x", int(c)); - pos += 6; - // overwrite trailing null character - result[pos] = '\\'; - } - else - { - // all other characters are added as-is - result[pos++] = c; + if (field.has_double_quote) { + auto& value = this->data->double_quote_fields[field_index]; + if (value.empty()) { + bool prev_ch_quote = false; + for (size_t i = 0; i < field.length; i++) { + if (this->data->parse_flags[field_str[i] + 128] == ParseFlags::QUOTE) { + if (prev_ch_quote) { + prev_ch_quote = false; + continue; + } + else { + prev_ch_quote = true; + } } - break; - } + + value += field_str[i]; } } - return result; + return csv::string_view(value); } + + return field_str.substr(0, field.length); } - /** Convert a CSV row to a JSON object, i.e. - * `{"col1":"value1","col2":"value2"}` - * - * @note All strings are properly escaped. Numeric values are not quoted. - * @param[in] subset A subset of columns to contain in the JSON. - * Leave empty for original columns. - */ - CSV_INLINE std::string CSVRow::to_json(const std::vector& subset) const { - std::vector col_names = subset; - if (subset.empty()) { - col_names = this->data ? this->get_col_names() : std::vector({}); - } + CSV_INLINE bool CSVField::try_parse_hex(int& parsedValue) { + size_t start = 0, end = 0; - const size_t _n_cols = col_names.size(); - std::string ret = "{"; + // Trim out whitespace chars + for (; start < this->sv.size() && this->sv[start] == ' '; start++); + for (end = start; end < this->sv.size() && this->sv[end] != ' '; end++); - for (size_t i = 0; i < _n_cols; i++) { - auto& col = col_names[i]; - auto field = this->operator[](col); + int value_ = 0; - // TODO: Possible performance enhancements by caching escaped column names - ret += '"' + internals::json_escape_string(col) + "\":"; + size_t digits = (end - start); + size_t base16_exponent = digits - 1; - // Add quotes around strings but not numbers - if (field.is_num()) - ret += internals::json_escape_string(field.get()); - else - ret += '"' + internals::json_escape_string(field.get()) + '"'; + if (digits == 0) return false; - // Do not add comma after last string - if (i + 1 < _n_cols) - ret += ','; + for (const auto& ch : this->sv.substr(start, digits)) { + int digit = 0; + + switch (ch) { + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + digit = static_cast(ch - '0'); + break; + case 'a': + case 'A': + digit = 10; + break; + case 'b': + case 'B': + digit = 11; + break; + case 'c': + case 'C': + digit = 12; + break; + case 'd': + case 'D': + digit = 13; + break; + case 'e': + case 'E': + digit = 14; + break; + case 'f': + case 'F': + digit = 15; + break; + default: + return false; + } + + value_ += digit * (int)pow(16, (double)base16_exponent); + base16_exponent--; } - ret += '}'; - return ret; + parsedValue = value_; + return true; } - /** Convert a CSV row to a JSON array, i.e. - * `["value1","value2",...]` - * - * @note All strings are properly escaped. Numeric values are not quoted. - * @param[in] subset A subset of columns to contain in the JSON. - * Leave empty for all columns. - */ - CSV_INLINE std::string CSVRow::to_json_array(const std::vector& subset) const { - std::vector col_names = subset; - if (subset.empty()) - col_names = this->data ? this->get_col_names() : std::vector({}); - - const size_t _n_cols = col_names.size(); - std::string ret = "["; + CSV_INLINE bool CSVField::try_parse_decimal(long double& dVal, const char decimalSymbol) { + // If field has already been parsed to empty, no need to do it aagin: + if (this->_type == DataType::CSV_NULL) + return false; - for (size_t i = 0; i < _n_cols; i++) { - auto field = this->operator[](col_names[i]); + // Not yet parsed or possibly parsed with other decimalSymbol + if (this->_type == DataType::UNKNOWN || this->_type == DataType::CSV_STRING || this->_type == DataType::CSV_DOUBLE) + this->_type = internals::data_type(this->sv, &this->value, decimalSymbol); // parse again - // Add quotes around strings but not numbers - if (field.is_num()) - ret += internals::json_escape_string(field.get()); - else - ret += '"' + internals::json_escape_string(field.get()) + '"'; + // Integral types are not affected by decimalSymbol and need not be parsed again - // Do not add comma after last string - if (i + 1 < _n_cols) - ret += ','; + // Either we already had an integral type before, or we we just got any numeric type now. + if (this->_type >= DataType::CSV_INT8 && this->_type <= DataType::CSV_DOUBLE) { + dVal = this->value; + return true; } - ret += ']'; - return ret; + // CSV_NULL or CSV_STRING, not numeric + return false; } -} - -/** @file - * Calculates statistics from CSV files - */ -#include +#ifdef _MSC_VER +#pragma region CSVRow Iterator +#endif + /** Return an iterator pointing to the first field. */ + CSV_INLINE CSVRow::iterator CSVRow::begin() const { + return CSVRow::iterator(this, 0); + } -namespace csv { - /** Calculate statistics for an arbitrarily large file. When this constructor - * is called, CSVStat will process the entire file iteratively. Once finished, - * methods like get_mean(), get_counts(), etc... can be used to retrieve statistics. + /** Return an iterator pointing to just after the end of the CSVRow. + * + * @warning Attempting to dereference the end iterator results + * in dereferencing a null pointer. */ - CSV_INLINE CSVStat::CSVStat(csv::string_view filename, CSVFormat format) : - reader(filename, format) { - this->calc(); + CSV_INLINE CSVRow::iterator CSVRow::end() const noexcept { + return CSVRow::iterator(this, (int)this->size()); } - /** Calculate statistics for a CSV stored in a std::stringstream */ - CSV_INLINE CSVStat::CSVStat(std::stringstream& stream, CSVFormat format) : - reader(stream, format) { - this->calc(); + CSV_INLINE CSVRow::reverse_iterator CSVRow::rbegin() const noexcept { + return std::reverse_iterator(this->end()); } - /** Return current means */ - CSV_INLINE std::vector CSVStat::get_mean() const { - std::vector ret; - for (size_t i = 0; i < this->get_col_names().size(); i++) { - ret.push_back(this->rolling_means[i]); - } - return ret; + CSV_INLINE CSVRow::reverse_iterator CSVRow::rend() const { + return std::reverse_iterator(this->begin()); } - /** Return current variances */ - CSV_INLINE std::vector CSVStat::get_variance() const { - std::vector ret; - for (size_t i = 0; i < this->get_col_names().size(); i++) { - ret.push_back(this->rolling_vars[i]/(this->n[i] - 1)); - } - return ret; + CSV_INLINE HEDLEY_NON_NULL(2) + CSVRow::iterator::iterator(const CSVRow* _reader, int _i) + : daddy(_reader), i(_i) { + if (_i < (int)this->daddy->size()) + this->field = std::make_shared( + this->daddy->operator[](_i)); + else + this->field = nullptr; } - /** Return current mins */ - CSV_INLINE std::vector CSVStat::get_mins() const { - std::vector ret; - for (size_t i = 0; i < this->get_col_names().size(); i++) { - ret.push_back(this->mins[i]); - } - return ret; + CSV_INLINE CSVRow::iterator::reference CSVRow::iterator::operator*() const { + return *(this->field.get()); } - /** Return current maxes */ - CSV_INLINE std::vector CSVStat::get_maxes() const { - std::vector ret; - for (size_t i = 0; i < this->get_col_names().size(); i++) { - ret.push_back(this->maxes[i]); - } - return ret; + CSV_INLINE CSVRow::iterator::pointer CSVRow::iterator::operator->() const { + return this->field; } - /** Get counts for each column */ - CSV_INLINE std::vector CSVStat::get_counts() const { - std::vector ret; - for (size_t i = 0; i < this->get_col_names().size(); i++) { - ret.push_back(this->counts[i]); - } - return ret; + CSV_INLINE CSVRow::iterator& CSVRow::iterator::operator++() { + // Pre-increment operator + this->i++; + if (this->i < (int)this->daddy->size()) + this->field = std::make_shared( + this->daddy->operator[](i)); + else // Reached the end of row + this->field = nullptr; + return *this; } - /** Get data type counts for each column */ - CSV_INLINE std::vector CSVStat::get_dtypes() const { - std::vector ret; - for (size_t i = 0; i < this->get_col_names().size(); i++) { - ret.push_back(this->dtypes[i]); - } - return ret; + CSV_INLINE CSVRow::iterator CSVRow::iterator::operator++(int) { + // Post-increment operator + auto temp = *this; + this->operator++(); + return temp; } - CSV_INLINE void CSVStat::calc_chunk() { - /** Only create stats counters the first time **/ - if (dtypes.empty()) { - /** Go through all records and calculate specified statistics */ - for (size_t i = 0; i < this->get_col_names().size(); i++) { - dtypes.push_back({}); - counts.push_back({}); - rolling_means.push_back(0); - rolling_vars.push_back(0); - mins.push_back(NAN); - maxes.push_back(NAN); - n.push_back(0); - } - } - - // Start threads - std::vector pool; - for (size_t i = 0; i < this->get_col_names().size(); i++) - pool.push_back(std::thread(&CSVStat::calc_worker, this, i)); + CSV_INLINE CSVRow::iterator& CSVRow::iterator::operator--() { + // Pre-decrement operator + this->i--; + this->field = std::make_shared( + this->daddy->operator[](this->i)); + return *this; + } - // Block until done - for (auto& th : pool) - th.join(); + CSV_INLINE CSVRow::iterator CSVRow::iterator::operator--(int) { + // Post-decrement operator + auto temp = *this; + this->operator--(); + return temp; + } + + CSV_INLINE CSVRow::iterator CSVRow::iterator::operator+(difference_type n) const { + // Allows for iterator arithmetic + return CSVRow::iterator(this->daddy, i + (int)n); + } - this->records.clear(); + CSV_INLINE CSVRow::iterator CSVRow::iterator::operator-(difference_type n) const { + // Allows for iterator arithmetic + return CSVRow::iterator::operator+(-n); } +#ifdef _MSC_VER +#pragma endregion CSVRow Iterator +#endif +} - CSV_INLINE void CSVStat::calc() { - constexpr size_t CALC_CHUNK_SIZE = 5000; - for (auto& row : reader) { - this->records.push_back(std::move(row)); +namespace csv { + namespace internals { + CSV_INLINE size_t get_file_size(csv::string_view filename) { + std::ifstream infile(std::string(filename), std::ios::binary); + const auto start = infile.tellg(); + infile.seekg(0, std::ios::end); + const auto end = infile.tellg(); - /** Chunk rows */ - if (this->records.size() == CALC_CHUNK_SIZE) { - calc_chunk(); - } + return end - start; } - if (!this->records.empty()) { - calc_chunk(); + CSV_INLINE std::string get_csv_head(csv::string_view filename) { + return get_csv_head(filename, get_file_size(filename)); } - } - - CSV_INLINE void CSVStat::calc_worker(const size_t &i) { - /** Worker thread for CSVStat::calc() which calculates statistics for one column. - * - * @param[in] i Column index - */ - auto current_record = this->records.begin(); - - for (size_t processed = 0; current_record != this->records.end(); processed++) { - if (current_record->size() == this->get_col_names().size()) { - auto current_field = (*current_record)[i]; + CSV_INLINE std::string get_csv_head(csv::string_view filename, size_t file_size) { + const size_t bytes = 500000; - // Optimization: Don't count() if there's too many distinct values in the first 1000 rows - if (processed < 1000 || this->counts[i].size() <= 500) - this->count(current_field, i); + std::error_code error; + size_t length = std::min((size_t)file_size, bytes); + auto mmap = mio::make_mmap_source(std::string(filename), 0, length, error); - this->dtype(current_field, i); + if (error) { + throw std::runtime_error("Cannot open file " + std::string(filename)); + } - // Numeric Stuff - if (current_field.is_num()) { - long double x_n = current_field.get(); + return std::string(mmap.begin(), mmap.end()); + } - // This actually calculates mean AND variance - this->variance(x_n, i); - this->min_max(x_n, i); - } +#ifdef _MSC_VER +#pragma region IBasicCVParser +#endif + CSV_INLINE IBasicCSVParser::IBasicCSVParser( + const CSVFormat& format, + const ColNamesPtr& col_names + ) : _col_names(col_names) { + if (format.no_quote) { + _parse_flags = internals::make_parse_flags(format.get_delim()); } - else if (this->reader.get_format().get_variable_column_policy() == VariableColumnPolicy::THROW) { - throw std::runtime_error("Line has different length than the others " + internals::format_row(*current_record)); + else { + _parse_flags = internals::make_parse_flags(format.get_delim(), format.quote_char); } - ++current_record; - } - } - - CSV_INLINE void CSVStat::dtype(CSVField& data, const size_t &i) { - /** Given a record update the type counter - * @param[in] record Data observation - * @param[out] i The column index that should be updated - */ - - auto type = data.type(); - if (this->dtypes[i].find(type) != - this->dtypes[i].end()) { - // Increment count - this->dtypes[i][type]++; - } else { - // Initialize count - this->dtypes[i].insert(std::make_pair(type, 1)); + _ws_flags = internals::make_ws_flags( + format.trim_chars.data(), format.trim_chars.size() + ); } - } - CSV_INLINE void CSVStat::count(CSVField& data, const size_t &i) { - /** Given a record update the frequency counter - * @param[in] record Data observation - * @param[out] i The column index that should be updated - */ + CSV_INLINE void IBasicCSVParser::end_feed() { + using internals::ParseFlags; - auto item = data.get(); + bool empty_last_field = this->data_ptr + && this->data_ptr->_data + && !this->data_ptr->data.empty() + && (parse_flag(this->data_ptr->data.back()) == ParseFlags::DELIMITER + || parse_flag(this->data_ptr->data.back()) == ParseFlags::QUOTE); - if (this->counts[i].find(item) != - this->counts[i].end()) { - // Increment count - this->counts[i][item]++; - } else { - // Initialize count - this->counts[i].insert(std::make_pair(item, 1)); + // Push field + if (this->field_length > 0 || empty_last_field) { + this->push_field(); + } + + // Push row + if (this->current_row.size() > 0) + this->push_row(); } - } - CSV_INLINE void CSVStat::min_max(const long double &x_n, const size_t &i) { - /** Update current minimum and maximum - * @param[in] x_n Data observation - * @param[out] i The column index that should be updated - */ - if (std::isnan(this->mins[i])) - this->mins[i] = x_n; - if (std::isnan(this->maxes[i])) - this->maxes[i] = x_n; - - if (x_n < this->mins[i]) - this->mins[i] = x_n; - else if (x_n > this->maxes[i]) - this->maxes[i] = x_n; - } + CSV_INLINE void IBasicCSVParser::parse_field() noexcept { + using internals::ParseFlags; + auto& in = this->data_ptr->data; - CSV_INLINE void CSVStat::variance(const long double &x_n, const size_t &i) { - /** Given a record update rolling mean and variance for all columns - * using Welford's Algorithm - * @param[in] x_n Data observation - * @param[out] i The column index that should be updated - */ - long double& current_rolling_mean = this->rolling_means[i]; - long double& current_rolling_var = this->rolling_vars[i]; - long double& current_n = this->n[i]; - long double delta; - long double delta2; + // Trim off leading whitespace + while (data_pos < in.size() && ws_flag(in[data_pos])) + data_pos++; - current_n++; - - if (current_n == 1) { - current_rolling_mean = x_n; - } else { - delta = x_n - current_rolling_mean; - current_rolling_mean += delta/current_n; - delta2 = x_n - current_rolling_mean; - current_rolling_var += delta*delta2; + if (field_start == UNINITIALIZED_FIELD) + field_start = (int)(data_pos - current_row_start()); + + // Optimization: Since NOT_SPECIAL characters tend to occur in contiguous + // sequences, use the loop below to avoid having to go through the outer + // switch statement as much as possible + while (data_pos < in.size() && compound_parse_flag(in[data_pos]) == ParseFlags::NOT_SPECIAL) + data_pos++; + + field_length = data_pos - (field_start + current_row_start()); + + // Trim off trailing whitespace, this->field_length constraint matters + // when field is entirely whitespace + for (size_t j = data_pos - 1; ws_flag(in[j]) && this->field_length > 0; j--) + this->field_length--; } - } - /** Useful for uploading CSV files to SQL databases. - * - * Return a data type for each column such that every value in a column can be - * converted to the corresponding data type without data loss. - * @param[in] filename The CSV file - * - * \return A mapping of column names to csv::DataType enums - */ - CSV_INLINE std::unordered_map csv_data_types(const std::string& filename) { - CSVStat stat(filename); - std::unordered_map csv_dtypes; + CSV_INLINE void IBasicCSVParser::push_field() + { + // Update + if (field_has_double_quote) { + fields->emplace_back( + field_start == UNINITIALIZED_FIELD ? 0 : (unsigned int)field_start, + field_length, + true + ); + field_has_double_quote = false; - auto col_names = stat.get_col_names(); - auto temp = stat.get_dtypes(); + } + else { + fields->emplace_back( + field_start == UNINITIALIZED_FIELD ? 0 : (unsigned int)field_start, + field_length + ); + } - for (size_t i = 0; i < stat.get_col_names().size(); i++) { - auto& col = temp[i]; - auto& col_name = col_names[i]; + current_row.row_length++; - if (col[DataType::CSV_STRING]) - csv_dtypes[col_name] = DataType::CSV_STRING; - else if (col[DataType::CSV_INT64]) - csv_dtypes[col_name] = DataType::CSV_INT64; - else if (col[DataType::CSV_INT32]) - csv_dtypes[col_name] = DataType::CSV_INT32; - else if (col[DataType::CSV_INT16]) - csv_dtypes[col_name] = DataType::CSV_INT16; - else if (col[DataType::CSV_INT8]) - csv_dtypes[col_name] = DataType::CSV_INT8; - else - csv_dtypes[col_name] = DataType::CSV_DOUBLE; + // Reset field state + field_start = UNINITIALIZED_FIELD; + field_length = 0; } - return csv_dtypes; - } -} -#include -#include + /** @return The number of characters parsed that belong to complete rows */ + CSV_INLINE size_t IBasicCSVParser::parse() + { + using internals::ParseFlags; + this->quote_escape = false; + this->data_pos = 0; + this->current_row_start() = 0; + this->trim_utf8_bom(); -namespace csv { - /** Shorthand function for parsing an in-memory CSV string - * - * @return A collection of CSVRow objects - * - * @par Example - * @snippet tests/test_read_csv.cpp Parse Example - */ - CSV_INLINE CSVReader parse(csv::string_view in, CSVFormat format) { - std::stringstream stream(in.data()); - return CSVReader(stream, format); - } + auto& in = this->data_ptr->data; + while (this->data_pos < in.size()) { + switch (compound_parse_flag(in[this->data_pos])) { + case ParseFlags::DELIMITER: + this->push_field(); + this->data_pos++; + break; - /** Parses a CSV string with no headers - * - * @return A collection of CSVRow objects - */ - CSV_INLINE CSVReader parse_no_header(csv::string_view in) { - CSVFormat format; - format.header_row(-1); + case ParseFlags::NEWLINE: + this->data_pos++; - return parse(in, format); - } + // Catches CRLF (or LFLF, CRCRLF, or any other non-sensical combination of newlines) + while (this->data_pos < in.size() && parse_flag(in[this->data_pos]) == ParseFlags::NEWLINE) + this->data_pos++; - /** Parse a RFC 4180 CSV string, returning a collection - * of CSVRow objects - * - * @par Example - * @snippet tests/test_read_csv.cpp Escaped Comma - * - */ - CSV_INLINE CSVReader operator ""_csv(const char* in, size_t n) { - return parse(csv::string_view(in, n)); - } + // End of record -> Write record + this->push_field(); + this->push_row(); - /** A shorthand for csv::parse_no_header() */ - CSV_INLINE CSVReader operator ""_csv_no_header(const char* in, size_t n) { - return parse_no_header(csv::string_view(in, n)); - } + // Reset + this->current_row = CSVRow(data_ptr, this->data_pos, fields->size()); + break; - /** - * Find the position of a column in a CSV file or CSV_NOT_FOUND otherwise - * - * @param[in] filename Path to CSV file - * @param[in] col_name Column whose position we should resolve - * @param[in] format Format of the CSV file - */ - CSV_INLINE int get_col_pos( - csv::string_view filename, - csv::string_view col_name, - const CSVFormat& format) { - CSVReader reader(filename, format); - return reader.index_of(col_name); - } + case ParseFlags::NOT_SPECIAL: + this->parse_field(); + break; - /** Get basic information about a CSV file - * @include programs/csv_info.cpp - */ - CSV_INLINE CSVFileInfo get_file_info(const std::string& filename) { - CSVReader reader(filename); - CSVFormat format = reader.get_format(); - for (auto it = reader.begin(); it != reader.end(); ++it); + case ParseFlags::QUOTE_ESCAPE_QUOTE: + if (data_pos + 1 == in.size()) return this->current_row_start(); + else if (data_pos + 1 < in.size()) { + auto next_ch = parse_flag(in[data_pos + 1]); + if (next_ch >= ParseFlags::DELIMITER) { + quote_escape = false; + data_pos++; + break; + } + else if (next_ch == ParseFlags::QUOTE) { + // Case: Escaped quote + data_pos += 2; + this->field_length += 2; + this->field_has_double_quote = true; + break; + } + } + + // Case: Unescaped single quote => not strictly valid but we'll keep it + this->field_length++; + data_pos++; - CSVFileInfo info = { - filename, - reader.get_col_names(), - format.get_delim(), - reader.n_rows(), - reader.get_col_names().size() - }; + break; - return info; + default: // Quote (currently not quote escaped) + if (this->field_length == 0) { + quote_escape = true; + data_pos++; + if (field_start == UNINITIALIZED_FIELD && data_pos < in.size() && !ws_flag(in[data_pos])) + field_start = (int)(data_pos - current_row_start()); + break; + } + + // Case: Unescaped quote + this->field_length++; + data_pos++; + + break; + } + } + + return this->current_row_start(); + } + + CSV_INLINE void IBasicCSVParser::push_row() { + current_row.row_length = fields->size() - current_row.fields_start; + this->_records->push_back(std::move(current_row)); + } + + CSV_INLINE void IBasicCSVParser::reset_data_ptr() { + this->data_ptr = std::make_shared(); + this->data_ptr->parse_flags = this->_parse_flags; + this->data_ptr->col_names = this->_col_names; + this->fields = &(this->data_ptr->fields); + } + + CSV_INLINE void IBasicCSVParser::trim_utf8_bom() { + auto& data = this->data_ptr->data; + + if (!this->unicode_bom_scan && data.size() >= 3) { + if (data[0] == '\xEF' && data[1] == '\xBB' && data[2] == '\xBF') { + this->data_pos += 3; // Remove BOM from input string + this->_utf8_bom = true; + } + + this->unicode_bom_scan = true; + } + } +#ifdef _MSC_VER +#pragma endregion +#endif + +#ifdef _MSC_VER +#pragma region Specializations +#endif + CSV_INLINE void MmapParser::next(size_t bytes = ITERATION_CHUNK_SIZE) { + // Reset parser state + this->field_start = UNINITIALIZED_FIELD; + this->field_length = 0; + this->reset_data_ptr(); + this->data_ptr->_stream_pos = this->mmap_pos; + + // Create memory map + size_t length = std::min(this->source_size - this->mmap_pos, bytes); + std::error_code error; + this->data_ptr->_data = std::make_shared>(mio::make_mmap_source(this->_filename, this->mmap_pos, length, error)); + this->mmap_pos += length; + if (error) throw error; + + auto mmap_ptr = (mio::basic_mmap_source*)(this->data_ptr->_data.get()); + + // Create string view + this->data_ptr->data = csv::string_view(mmap_ptr->data(), mmap_ptr->length()); + + // Parse + this->current_row = CSVRow(this->data_ptr); + size_t remainder = this->parse(); + + if (this->mmap_pos == this->source_size || no_chunk()) { + this->_eof = true; + this->end_feed(); + } + + this->mmap_pos -= (length - remainder); + } +#ifdef _MSC_VER +#pragma endregion +#endif } } + #endif diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index e5b57a87..43358c36 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -27,6 +27,7 @@ target_sources(csv_test test_data_type.cpp test_raw_csv_data.cpp test_round_trip.cpp + test_row_start_position.cpp ) target_link_libraries(csv_test csv) target_link_libraries(csv_test Catch2::Catch2WithMain) diff --git a/tests/test_row_start_position.cpp b/tests/test_row_start_position.cpp new file mode 100644 index 00000000..c5162681 --- /dev/null +++ b/tests/test_row_start_position.cpp @@ -0,0 +1,57 @@ +/** @file + * Tests for CSV parsing + */ + +#include // remove() +#include + +#include +#include +#include +#include + +#include "csv.hpp" + +using namespace csv; +using std::string; +using std::vector; + +// +// CSVRow::current_row_start() +// + +TEST_CASE("CSVRow::current_row_start", "[current_row_start]") { + CSVGuessResult guessed_format = guess_format("./tests/data/real_data/YEAR07_CBSA_NAC3.txt"); + REQUIRE(guessed_format.delim == ';'); + REQUIRE(guessed_format.header_row == 0); + + std::fstream fstream; + auto testfile = std::filesystem::path("./tests/data/real_data/YEAR07_CBSA_NAC3.txt"); + std::ifstream ifs(testfile.c_str()); + std::string content((std::istreambuf_iterator(ifs)), (std::istreambuf_iterator())); + + CSVFormat format; + format.delimiter(guessed_format.delim).header_row(guessed_format.header_row); + + { + // parse from file + CSVReader reader(testfile.c_str(), format); + uint64_t pos = 0; + for (CSVRow& row : reader) { + pos = content.find_first_of('\n', pos) + 1; + REQUIRE(row.current_row_start() == pos); + } + } + + { + // parse from stream + auto stream = std::stringstream(content); + auto reader = CSVReader(stream, format); + + uint64_t pos = 0; + for (CSVRow& row : reader) { + pos = content.find_first_of('\n', pos) + 1; + REQUIRE(row.current_row_start() == pos); + } + } +} From 37bd871356fee902de78884929f241a18f16aa43 Mon Sep 17 00:00:00 2001 From: hirohira Date: Tue, 8 Oct 2024 07:36:44 +0900 Subject: [PATCH 2/2] fix: reset parser state in StreamParser::next StreamParser::next was missing parsing state reset. It caused incorrectly parsing the first element of the second and later chunks. This commit addresses the issue. --- include/internal/basic_csv_parser.hpp | 3 +++ single_include/csv.hpp | 3 +++ single_include_test/csv.hpp | 3 +++ 3 files changed, 9 insertions(+) diff --git a/include/internal/basic_csv_parser.hpp b/include/internal/basic_csv_parser.hpp index b245e9a2..d499b252 100644 --- a/include/internal/basic_csv_parser.hpp +++ b/include/internal/basic_csv_parser.hpp @@ -320,6 +320,9 @@ namespace csv { void next(size_t bytes = ITERATION_CHUNK_SIZE) override { if (this->eof()) return; + // Reset parser state + this->field_start = UNINITIALIZED_FIELD; + this->field_length = 0; this->reset_data_ptr(); this->data_ptr->_stream_pos = this->stream_pos; this->data_ptr->_data = std::make_shared(); diff --git a/single_include/csv.hpp b/single_include/csv.hpp index 74dcabce..5972ce9a 100644 --- a/single_include/csv.hpp +++ b/single_include/csv.hpp @@ -6165,6 +6165,9 @@ namespace csv { void next(size_t bytes = ITERATION_CHUNK_SIZE) override { if (this->eof()) return; + // Reset parser state + this->field_start = UNINITIALIZED_FIELD; + this->field_length = 0; this->reset_data_ptr(); this->data_ptr->_stream_pos = this->stream_pos; this->data_ptr->_data = std::make_shared(); diff --git a/single_include_test/csv.hpp b/single_include_test/csv.hpp index 74dcabce..5972ce9a 100644 --- a/single_include_test/csv.hpp +++ b/single_include_test/csv.hpp @@ -6165,6 +6165,9 @@ namespace csv { void next(size_t bytes = ITERATION_CHUNK_SIZE) override { if (this->eof()) return; + // Reset parser state + this->field_start = UNINITIALIZED_FIELD; + this->field_length = 0; this->reset_data_ptr(); this->data_ptr->_stream_pos = this->stream_pos; this->data_ptr->_data = std::make_shared();