From 4a18e68d3a77f0354bae356ce067d857f62cb580 Mon Sep 17 00:00:00 2001 From: Fabio Rossini Sluzala Date: Wed, 20 Sep 2023 11:25:24 -0300 Subject: [PATCH] Improve Strutils::explode speed --- src/utils/Strutils.hpp | 32 +- src/utils/stringviewstream.hpp | 522 +++++++++++++++++++++++++++++++++ tests/test_strutils.cpp | 13 + 3 files changed, 561 insertions(+), 6 deletions(-) create mode 100644 src/utils/stringviewstream.hpp diff --git a/src/utils/Strutils.hpp b/src/utils/Strutils.hpp index 2d533b5..5f5c3a0 100644 --- a/src/utils/Strutils.hpp +++ b/src/utils/Strutils.hpp @@ -63,15 +63,35 @@ class Strutils { std::string("$1")); } - static auto explode(const std::string &s, char delim) - -> std::vector { - std::vector result; - std::istringstream iss(s); + template + static auto explode(std::string_view strview, std::string_view term) + -> std::vector { + size_t current = 0; + std::vector result; - for (std::string token; std::getline(iss, token, delim);) { - result.push_back(std::move(token)); + if (strview.empty()) { + return result; } + if (term.empty()) { + result.emplace_back(strview); + return result; + } + + do { + auto sep = strview.find(term, current); + + result.emplace_back(strview.substr( + current, + (sep == std::string_view::npos) ? sep : (sep - current))); + + if (sep == std::string_view::npos) { + break; + } + + current = sep + term.size(); + } while (current < strview.size()); + return result; } diff --git a/src/utils/stringviewstream.hpp b/src/utils/stringviewstream.hpp new file mode 100644 index 0000000..b0d5108 --- /dev/null +++ b/src/utils/stringviewstream.hpp @@ -0,0 +1,522 @@ +#pragma once + +#include +#include + +template > +class basic_stringviewbuf : public std::basic_streambuf { + struct xfer_bufptrs; + + public: + // Types: + using char_type = CharT; + using traits_type = Traits; + // _GLIBCXX_RESOLVE_LIB_DEFECTS + // 251. basic_stringviewbuf missing allocator_type + using int_type = typename traits_type::int_type; + using pos_type = typename traits_type::pos_type; + using off_type = typename traits_type::off_type; + + using streambuf_type = std::basic_streambuf; + using stringview_type = std::basic_string_view; + using size_type = typename stringview_type::size_type; + + protected: + /// Place to stash in || out || in | out settings for current stringviewbuf. + std::ios_base::openmode MMode; + + // Data Members: + stringview_type M_string; + + public: + // Constructors: + + /** + * @brief Starts with an empty string buffer. + * + * The default constructor initializes the parent class using its + * own default ctor. + */ + basic_stringviewbuf() + : streambuf_type(), MMode(std::ios_base::in), M_string() {} + + /** + * @brief Starts with an empty string buffer. + * @param mode Whether the buffer can read, or write, or both. + * + * The default constructor initializes the parent class using its + * own default ctor. + */ + explicit basic_stringviewbuf(std::ios_base::openmode mode) + : streambuf_type(), MMode(mode), M_string() {} + + /** + * @brief Starts with an existing string buffer. + * @param in_str A string to copy as a starting buffer. + * @param inMode Whether the buffer can read, or write, or both. + * + * This constructor initializes the parent class using its + * own default ctor. + */ + explicit basic_stringviewbuf( + const stringview_type &in_str, + std::ios_base::openmode inMode = std::ios_base::in) + : streambuf_type(), MMode(), M_string(in_str.data(), in_str.size()) { + M_stringviewbuf_init(inMode); + } + +#if __cplusplus >= 201103L + basic_stringviewbuf(const basic_stringviewbuf &) = delete; + + basic_stringviewbuf(basic_stringviewbuf &&i_rhs) noexcept + : basic_stringviewbuf(std::move(i_rhs), xfer_bufptrs(i_rhs, this)) { + i_rhs.M_sync(const_cast(i_rhs.M_string.data()), 0, 0); + } + +#if __cplusplus > 201703L && _GLIBCXX_USE_CXX11_ABI + explicit basic_stringviewbuf(const allocator_type &i_a) + : basic_stringviewbuf(ios_base::in | std::ios_base::out, i_a) {} + + basic_stringviewbuf(ios_base::openmode inMode, const allocator_type &i_a) + : streambuf_type(), MMode(inMode), M_string(i_a) {} + + explicit basic_stringviewbuf(stringview_type &&i_s, + ios_base::openmode inMode = ios_base::in | + ios_base::out) + : streambuf_type(), MMode(inMode), M_string(std::move(i_s)) { + M_stringviewbuf_init(inMode); + } + + template + basic_stringviewbuf(const basic_string &i_s, + const allocator_type &i_a) + : basic_stringviewbuf(i_s, ios_base::in | std::ios_base::out, i_a) {} + + template + basic_stringviewbuf(const basic_string &i_s, + ios_base::openmode inMode, const allocator_type &i_a) + : streambuf_type(), MMode(inMode), + M_string(i_s.data(), i_s.size(), i_a) { + M_stringviewbuf_init(inMode); + } + + template + explicit basic_stringviewbuf( + const basic_string &i_s, + ios_base::openmode inMode = ios_base::in | ios_base::out) + : basic_stringviewbuf(i_s, inMode, allocator_type{}) {} + + basic_stringviewbuf(basic_stringviewbuf &&i_rhs, const allocator_type &i_a) + : basic_stringviewbuf(std::move(i_rhs), i_a, + xfer_bufptrs(i_rhs, this)) { + i_rhs.M_sync(const_cast(i_rhs.M_string.data()), 0, 0); + } + + allocator_type getAllocator() const noexcept { + return M_string.getAllocator(); + } +#endif // C++20 + + // 27.8.2.2 Assign and swap: + + auto operator=(const basic_stringviewbuf &) + -> basic_stringviewbuf & = delete; + + auto operator=(basic_stringviewbuf &&i_rhs) noexcept + -> basic_stringviewbuf & { + xfer_bufptrs i_st{i_rhs, this}; + const streambuf_type &i_base = i_rhs; + streambuf_type::operator=(i_base); + this->pubimbue(i_rhs.getloc()); + MMode = i_rhs.MMode; + M_string = std::move(i_rhs.M_string); + i_rhs.M_sync(const_cast(i_rhs.M_string.data()), 0, 0); + return *this; + } + + void swap(basic_stringviewbuf &i_rhs) noexcept { + xfer_bufptrs i_l_st{*this, std::addressof(i_rhs)}; + xfer_bufptrs i_r_st{i_rhs, this}; + streambuf_type &i_base = i_rhs; + streambuf_type::swap(i_base); + i_rhs.pubimbue(this->pubimbue(i_rhs.getloc())); + std::swap(MMode, i_rhs.MMode); + std::swap(M_string, i_rhs.M_string); // XXX not exception safe + } +#endif // C++11 + + // Getters and setters: + + /** + * @brief Copying out the string buffer. + * @return A copy of one of the underlying sequences. + * + * If the buffer is only created in input mode, the underlying + * character sequence is equal to the input sequence; otherwise, it + * is equal to the output sequence. [27.7.1.2]/1 + */ + auto str() const -> stringview_type { + stringview_type i_ret(M_string.getAllocator()); + if (char_type *i_hi = M_highMark()) { + i_ret.assign(this->pbase(), i_hi); + } else { + i_ret = M_string; + } + return i_ret; + } + +#if __cplusplus > 201703L && _GLIBCXX_USE_CXX11_ABI +#if i_cpp_concepts + template <_Allocator_like _SAlloc> + basic_string str(const _SAlloc &i_sa) const { + auto i_sv = view(); + return {i_sv.data(), i_sv.size(), i_sa}; + } +#endif + + stringview_type str() && { + if (char_type *i_hi = M_highMark()) { + // Set length to end of character sequence and add null terminator. + M_string.M_set_length(M_highMark() - this->pbase()); + } + auto in_str = std::move(M_string); + M_string.clear(); + M_sync(M_string.data(), 0, 0); + return in_str; + } + + basic_string_view view() const noexcept { + if (char_type *i_hi = M_highMark()) + return {this->pbase(), i_hi}; + else + return M_string; + } +#endif // C++20 + + /** + * @brief Setting a new buffer. + * @param i_s The string to use as a new sequence. + * + * Deallocates any previous stored sequence, then copies @a s to + * use as a new one. + */ + void str(const stringview_type &i_s) { + // Cannot use M_string = i_s, since v3 strings are COW + // (not always true now but assign() always works). + M_string.assign(i_s.data(), i_s.size()); + M_stringviewbuf_init(MMode); + } + +#if __cplusplus > 201703L && _GLIBCXX_USE_CXX11_ABI +#if i_cpp_concepts + template <_Allocator_like _SAlloc> + requires(!is_same_v<_SAlloc, Alloc>) + void str(const basic_string &i_s) { + M_string.assign(i_s.data(), i_s.size()); + M_stringviewbuf_init(MMode); + } +#endif + + void str(stringview_type &&i_s) { + M_string = std::move(i_s); + M_stringviewbuf_init(MMode); + } +#endif + + protected: + // Common initialization code goes here. + void M_stringviewbuf_init(std::ios_base::openmode inMode) { + MMode = inMode; + size_type i_len = 0; + if ((MMode & (std::ios_base::ate | std::ios_base::app)) != 0) { + i_len = M_string.size(); + } + M_sync(const_cast(M_string.data()), 0, i_len); + } + + auto showmanyc() -> std::streamsize override { + std::streamsize i_ret = -1; + if ((MMode & std::ios_base::in) != 0) { + M_update_egptr(); + i_ret = this->egptr() - this->gptr(); + } + return i_ret; + } + + auto underflow() -> int_type override { + int_type ret = traits_type::eof(); + const bool testin = this->MMode & std::ios_base::in; + if (testin) { + // Update egptr() to match the actual string end. + M_update_egptr(); + + if (this->gptr() < this->egptr()) { + ret = traits_type::to_int_type(*this->gptr()); + } + } + return ret; + } + + auto pbackfail(int_type i_c = traits_type::eof()) -> int_type override { + int_type ret = traits_type::eof(); + if (this->eback() < this->gptr()) { + // Try to put back __c into input sequence in one of three ways. + // Order these tests done in is unspecified by the standard. + const bool testeof = traits_type::eq_int_type(i_c, ret); + if (!testeof) { + const bool testeq = traits_type::eq( + traits_type::to_char_type(i_c), this->gptr()[-1]); + const bool testout = this->MMode & std::ios_base::out; + if (testeq || testout) { + this->gbump(-1); + if (!testeq) { + *this->gptr() = traits_type::to_char_type(i_c); + } + ret = i_c; + } + } else { + this->gbump(-1); + ret = traits_type::not_eof(i_c); + } + } + return ret; + } + + auto overflow([[maybe_unused]] int_type i_c = traits_type::eof()) + -> int_type override { + return traits_type::eof(); + } + + /** + * @brief Manipulates the buffer. + * @param i_s Pointer to a buffer area. + * @param i_n Size of @a i_s. + * @return @c this + * + * If no buffer has already been created, and both @a i_s and @a i_n are + * non-zero, then @c i_s is used as a buffer; see + * https://gcc.gnu.org/onlinedocs/libstdc++/manual/streambufs.html#io.streambuf.buffering + * for more. + */ + auto setbuf(char_type *i_s, std::streamsize i_n) + -> streambuf_type * override { + if (i_s && i_n >= 0) { + // This is implementation-defined behavior, and assumes + // that an external char_type array of length i_n exists + // and has been pre-allocated. If this is not the case, + // things will quickly blow up. + + // Step 1: Destroy the current internal array. + M_string = {}; + + // Step 2: Use the external array. + M_sync(i_s, static_cast(i_n), 0); + } + return this; + } + + auto seekoff(off_type off, std::ios_base::seekdir way, + std::ios_base::openmode mode) -> pos_type override { + pos_type ret = pos_type(off_type(-1)); + bool testin = (std::ios_base::in & MMode & mode) != 0; + bool testout = (std::ios_base::out & MMode & mode) != 0; + const bool testboth = testin && testout && way != std::ios_base::cur; + testin &= (mode & std::ios_base::out) == 0; + testout &= (mode & std::ios_base::in) == 0; + + // _GLIBCXX_RESOLVE_LIB_DEFECTS + // 453. basic_stringbuf::seekoff need not always fail for an empty + // stream. + const char_type *beg = testin ? this->eback() : this->pbase(); + if ((beg || !off) && (testin || testout || testboth)) { + M_update_egptr(); + + off_type newoffi = off; + off_type newoffo = newoffi; + if (way == std::ios_base::cur) { + newoffi += this->gptr() - beg; + newoffo += this->pptr() - beg; + } else if (way == std::ios_base::end) { + newoffo = newoffi += this->egptr() - beg; + } + + if ((testin || testboth) && newoffi >= 0 && + this->egptr() - beg >= newoffi) { + this->setg(this->eback(), this->eback() + newoffi, + this->egptr()); + ret = pos_type(newoffi); + } + if ((testout || testboth) && newoffo >= 0 && + this->egptr() - beg >= newoffo) { + M_pbump(this->pbase(), this->epptr(), newoffo); + ret = pos_type(newoffo); + } + } + return ret; + } + + auto seekpos(pos_type sp, + std::ios_base::openmode inMode = std::ios_base::in) + -> pos_type override { + pos_type ret = pos_type(off_type(-1)); + const bool testin = (std::ios_base::in & this->MMode & inMode) != 0; + const bool testout = (std::ios_base::out & this->MMode & inMode) != 0; + + const char_type *beg = testin ? this->eback() : this->pbase(); + if ((beg || !off_type(sp)) && (testin || testout)) { + M_update_egptr(); + + const off_type pos(sp); + const bool testpos = (0 <= pos && pos <= this->egptr() - beg); + if (testpos) { + if (testin) { + this->setg(this->eback(), this->eback() + pos, + this->egptr()); + } + if (testout) { + M_pbump(this->pbase(), this->epptr(), pos); + } + ret = sp; + } + } + return ret; + } + + // Internal function for correctly updating the internal buffer + // for a particular M_string, due to initialization or re-sizing + // of an existing M_string. + void M_sync(char_type *i_base, size_type i_i, size_type i_o) { + (void)i_o; + const bool testin = (MMode & std::ios_base::in) != 0; + char_type *endg = i_base + M_string.size(); + char_type *endp = i_base + M_string.size(); + + if (i_base != M_string.data()) { + // setbuf: __i == size of buffer area (_M_string.size() == 0). + endg += i_i; + i_i = 0; + endp = endg; + } + + if (testin) { + this->setg(i_base, i_base + i_i, endg); + } + } + + // Internal function for correctly updating egptr() to the actual + // string end. + void M_update_egptr() { + if (char_type *i_pptr = this->pptr()) { + char_type *i_egptr = this->egptr(); + if (!i_egptr || i_pptr > i_egptr) { + if ((MMode & std::ios_base::in) != 0) { + this->setg(this->eback(), this->gptr(), i_pptr); + } else { + this->setg(i_pptr, i_pptr, i_pptr); + } + } + } + } + + // Works around the issue with pbump, part of the protected + // interface of basic_streambuf, taking just an int. + void M_pbump(char_type *i_pbeg, char_type *i_pend, off_type i_off) { + this->setp(i_pbeg, i_pend); + while (i_off > __gnu_cxx::__numeric_traits::__max) { + this->pbump(__gnu_cxx::__numeric_traits::__max); + i_off -= __gnu_cxx::__numeric_traits::__max; + } + this->pbump(static_cast(i_off)); + } + + private: + // Return a pointer to the end of the underlying character sequence. + // This might not be the same character as M_string.end() because + // basic_stringviewbuf::overflow might have written to unused capacity + // in M_string without updating its length. + auto M_highMark() const _GLIBCXX_NOEXCEPT->char_type * { + if (char_type *i_pptr = this->pptr()) { + char_type *i_egptr = this->egptr(); + if (!i_egptr || i_pptr > i_egptr) { + return i_pptr; // Underlying sequence is [pbase, pptr). + } + return i_egptr; // Underlying sequence is [pbase, egptr). + } + return 0; // Underlying character sequence is just M_string. + } + +#if __cplusplus >= 201103L +#if _GLIBCXX_USE_CXX11_ABI + // This type captures the state of the gptr / pptr pointers as offsets + // so they can be restored in another object after moving the string. + struct xfer_bufptrs { + xfer_bufptrs(const basic_stringviewbuf &i_from, + basic_stringviewbuf *i_to) + : M_to{i_to}, M_goff{-1, -1, -1}, M_poff{-1, -1, -1} { + const CharT *const in_str = i_from.M_string.data(); + const CharT *i_end = nullptr; + if (i_from.eback()) { + M_goff[0] = i_from.eback() - in_str; + M_goff[1] = i_from.gptr() - in_str; + M_goff[2] = i_from.egptr() - in_str; + i_end = i_from.egptr(); + } + if (i_from.pbase()) { + M_poff[0] = i_from.pbase() - in_str; + M_poff[1] = i_from.pptr() - i_from.pbase(); + M_poff[2] = i_from.epptr() - in_str; + if (!i_end || i_from.pptr() > i_end) { + i_end = i_from.pptr(); + } + } + + // Set M_string length to the greater of the get and put areas. + if (i_end) { + // The const_cast avoids changing this constructor's signature, + // because it is exported from the dynamic library. + auto &Mut_from = const_cast(i_from); + Mut_from.M_string.M_length(i_end - in_str); + } + } + + ~xfer_bufptrs() { + auto *in_str = const_cast(M_to->M_string.data()); + if (M_goff[0] != -1) { + M_to->setg(in_str + M_goff[0], in_str + M_goff[1], + in_str + M_goff[2]); + } + if (M_poff[0] != -1) { + M_to->M_pbump(in_str + M_poff[0], in_str + M_poff[2], + M_poff[1]); + } + } + + basic_stringviewbuf *M_to; + off_type M_goff[3]; + off_type M_poff[3]; + }; +#else + // This type does nothing when using Copy-On-Write strings. + struct xfer_bufptrs { + xfer_bufptrs(const basic_stringviewbuf &, basic_stringviewbuf *) {} + }; +#endif + + // The move constructor initializes an xfer_bufptrs temporary then + // delegates to this constructor to performs moves during its lifetime. + basic_stringviewbuf(basic_stringviewbuf &&i_rhs, xfer_bufptrs && /*unused*/) + : streambuf_type(static_cast(i_rhs)), + MMode(i_rhs.MMode), M_string(std::move(i_rhs.M_string)) {} + +#if __cplusplus > 201703L && _GLIBCXX_USE_CXX11_ABI + // The move constructor initializes an xfer_bufptrs temporary then + // delegates to this constructor to performs moves during its lifetime. + basic_stringviewbuf(basic_stringviewbuf &&i_rhs, const allocator_type &i_a, + xfer_bufptrs &&) + : streambuf_type(static_cast(i_rhs)), + MMode(i_rhs.MMode), M_string(std::move(i_rhs.M_string), i_a) {} +#endif +#endif // C++11 +}; + +using stringstream_view = basic_stringviewbuf; diff --git a/tests/test_strutils.cpp b/tests/test_strutils.cpp index 6e2ebb4..e0e032f 100644 --- a/tests/test_strutils.cpp +++ b/tests/test_strutils.cpp @@ -14,3 +14,16 @@ TEST(TestStrutils, TestMultiConcat) { std::to_string(987654321), "a"), "12345678910123456789abcd123456789987654321a"); } + +// NOLINTNEXTLINE +TEST(TestStrutils, TestExplodeFunction) { + std::vector fullvecabc{"a", "b", "c"}; + std::vector fullvecab{"a", "b"}; + EXPECT_EQ(Strutils::explode("a,b,c", ","), fullvecabc); + EXPECT_EQ(Strutils::explode("a,b", ","), fullvecab); + EXPECT_EQ(Strutils::explode("", ","), std::vector{}); + EXPECT_EQ(Strutils::explode("", ""), std::vector{}); + EXPECT_EQ(Strutils::explode("a,b,c", ""), + std::vector{"a,b,c"}); + EXPECT_EQ(Strutils::explode("a,b,c", ","), fullvecabc); +}