Skip to content

Commit

Permalink
Add regex utils including regex to wildcard translation
Browse files Browse the repository at this point in the history
  • Loading branch information
Bill-hbrhbr committed Jul 14, 2024
1 parent 9ba0451 commit 58657e0
Show file tree
Hide file tree
Showing 10 changed files with 1,278 additions and 0 deletions.
3 changes: 3 additions & 0 deletions components/core/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -210,6 +210,7 @@ include(cmake/Modules/FindLibraryDependencies.cmake)
FindDynamicLibraryDependencies(sqlite "${sqlite_DYNAMIC_LIBS}")

add_subdirectory(src/clp/string_utils)
add_subdirectory(src/clp/regex_utils)

add_subdirectory(src/clp/clg)
add_subdirectory(src/clp/clo)
Expand Down Expand Up @@ -475,6 +476,7 @@ set(SOURCE_FILES_unitTest
tests/test-Stopwatch.cpp
tests/test-StreamingCompression.cpp
tests/test-string_utils.cpp
tests/test-regex_utils.cpp
tests/test-TimestampPattern.cpp
tests/test-utf8_utils.cpp
tests/test-Utils.cpp
Expand All @@ -498,6 +500,7 @@ target_link_libraries(unitTest
${sqlite_LIBRARY_DEPENDENCIES}
${STD_FS_LIBS}
clp::string_utils
clp::regex_utils
yaml-cpp::yaml-cpp
ZStd::ZStd
)
Expand Down
22 changes: 22 additions & 0 deletions components/core/src/clp/regex_utils/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
# Headers that belong to the regex_utils library. They are passed to add_library below alongside
# the sources.
set(
REGEX_UTILS_HEADER_LIST
"ErrorCode.hpp"
"RegexToWildcardTranslatorConfig.hpp"
"constants.hpp"
"regex_utils.hpp"
)
# The library target itself: three translation units plus the header list above.
add_library(
regex_utils
regex_utils_regex_to_wildcard.cpp
regex_utils_anchors.cpp
ErrorCode.cpp
${REGEX_UTILS_HEADER_LIST}
)
# Namespaced alias so that consumers can link against `clp::regex_utils`.
add_library(clp::regex_utils ALIAS regex_utils)
# The parent directory is exported so that headers are included as "regex_utils/<header>".
# NOTE(review): the submodules directory is PRIVATE, yet regex_utils.hpp includes boost-outcome
# headers relative to it -- confirm that consumers get that include path from elsewhere.
target_include_directories(regex_utils
PUBLIC
../
PRIVATE
"${PROJECT_SOURCE_DIR}/submodules"
)
# The library's own sources are compiled as C++20; the requirement is not propagated to consumers.
target_compile_features(regex_utils PRIVATE cxx_std_20)
93 changes: 93 additions & 0 deletions components/core/src/clp/regex_utils/ErrorCode.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
#include "regex_utils/ErrorCode.hpp"

#include <string>
#include <string_view>
#include <system_error>

using std::error_category;
using std::error_code;
using std::string;
using std::string_view;

namespace clp::regex_utils {

/**
 * Error category that gives the regex utility error codes a category name and detailed string
 * descriptions.
 * This class does not need to be seen outside the std error code wrapper implementation.
 */
class ErrorCodeCategory : public error_category {
public:
/**
 * @return The name of this class of errors.
 */
[[nodiscard]] char const* name() const noexcept override;

/**
 * @param ev The error code, encoded as an int.
 * @return The descriptive message for the error.
 */
[[nodiscard]] string message(int ev) const override;
};

/**
 * @return The fixed label that identifies this error category.
 */
auto ErrorCodeCategory::name() const noexcept -> char const* {
    constexpr char const* cCategoryName{"regex utility"};
    return cCategoryName;
}

/**
 * Looks up the human-readable description of an error code.
 *
 * @param ev The error code, encoded as an int.
 * @return The description of the matching ErrorCode enumerator, or a generic placeholder if the
 * value does not match any enumerator.
 */
auto ErrorCodeCategory::message(int ev) const -> string {
    auto const code{static_cast<ErrorCode>(ev)};
    char const* description{nullptr};

    switch (code) {
        case ErrorCode::Success:
            description = "Success.";
            break;
        case ErrorCode::IllegalState:
            description = "Unrecognized state.";
            break;
        case ErrorCode::Star:
            description = "Failed to translate due to metachar `*` (zero or more occurences).";
            break;
        case ErrorCode::Plus:
            description = "Failed to translate due to metachar `+` (one or more occurences).";
            break;
        case ErrorCode::Question:
            description
                    = "Currently does not support returning a list of wildcard translations. The "
                      "metachar `?` (lazy match) may be supported in the future.";
            break;
        case ErrorCode::Pipe:
            description
                    = "Currently does not support returning a list of wildcard translations. The "
                      "regex OR condition feature may be supported in the future.";
            break;
        case ErrorCode::Caret:
            description
                    = "Failed to translate due to start anchor `^` in the middle of the string.";
            break;
        case ErrorCode::Dollar:
            description
                    = "Failed to translate due to end anchor `$` in the middle of the string.";
            break;
        case ErrorCode::DisallowedEscapeSequence:
            description = "Disallowed escape sequence.";
            break;
        case ErrorCode::UnmatchedParenthesis:
            description = "Unmatched opening `(` or closing `)`.";
            break;
        case ErrorCode::UnsupportedCharsets:
            description = "Currently only supports case-insensitive single-char charset (i.e. "
                          "[aA] [bB]).";
            break;
        case ErrorCode::IncompleteCharsetStructure:
            description = "Unmatched closing `]` at the end of the string.";
            break;
        case ErrorCode::UnsupportedQuantifier:
            description = "Currently only supports exact positive number of repetitions in regex "
                          "syntax.";
            break;
        case ErrorCode::TokenUnquantifiable:
            description = "The preceding token is not quantifiable.";
            break;
        default:
            // Any value that is not a known enumerator ends up here.
            description = "(unrecognized error)";
            break;
    }

    return description;
}

// The category instance that make_error_code attaches to the error codes it creates.
ErrorCodeCategory const cTheErrorCodeCategory{};

/**
 * Wraps an ErrorCode enumerator into an std::error_code tagged with the regex utility category.
 *
 * @param e The error code enum value to wrap.
 * @return The corresponding std::error_code.
 */
auto make_error_code(ErrorCode e) -> error_code {
    auto const raw_value{static_cast<int>(e)};
    return error_code{raw_value, cTheErrorCodeCategory};
}

} // namespace clp::regex_utils
46 changes: 46 additions & 0 deletions components/core/src/clp/regex_utils/ErrorCode.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
#ifndef CLP_REGEX_UTILS_ERRORCODE_HPP
#define CLP_REGEX_UTILS_ERRORCODE_HPP

#include <cstdint>
#include <system_error>
#include <type_traits>

namespace clp::regex_utils {

/**
 * Enum class for propagating and handling various regex utility errors.
 * The full message for each enumerator is defined in ErrorCode.cpp; the comments below summarize
 * those messages.
 */
enum class ErrorCode : uint8_t {
// No error.
Success = 0,
// Unrecognized state.
IllegalState,
// Translation failed due to the metachar `*`.
Star,
// Translation failed due to the metachar `+`.
Plus,
// The metachar `?` is not supported (it would require a list of wildcard translations).
Question,
// The regex OR condition is not supported (it would require a list of wildcard translations).
Pipe,
// Start anchor `^` found in the middle of the string.
Caret,
// End anchor `$` found in the middle of the string.
Dollar,
// Disallowed escape sequence.
DisallowedEscapeSequence,
// Unmatched opening `(` or closing `)`.
UnmatchedParenthesis,
// Charset other than a case-insensitive single-char charset (e.g. anything but [aA]).
UnsupportedCharsets,
// Charset that is not properly closed with `]`.
IncompleteCharsetStructure,
// Quantifier other than an exact positive number of repetitions.
UnsupportedQuantifier,
// The token preceding a quantifier is not quantifiable.
TokenUnquantifiable,
};

/**
 * Wrapper function to turn a regular enum class into an std::error_code.
 *
 * @param ec An error code enum.
 * @return The corresponding std::error_code type variable.
 */
[[nodiscard]] auto make_error_code(ErrorCode ec) -> std::error_code;

} // namespace clp::regex_utils

namespace std {
// Registers clp::regex_utils::ErrorCode as an error code enum, which makes std::error_code
// implicitly constructible from it (the standard library calls make_error_code to do so).
template <>
struct is_error_code_enum<clp::regex_utils::ErrorCode> : true_type {};
} // namespace std

#endif // CLP_REGEX_UTILS_ERRORCODE_HPP
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
#ifndef CLP_REGEX_UTILS_REGEXTOWILDCARDTRANSLATORCONFIG_HPP
#define CLP_REGEX_UTILS_REGEXTOWILDCARDTRANSLATORCONFIG_HPP

namespace clp::regex_utils {

/**
 * Bundle of boolean options that can be passed to regex_to_wildcard.
 * NOTE(review): the exact effect of each option is decided by the translator implementation, which
 * is not part of this header -- confirm there before relying on the descriptions below.
 */
class RegexToWildcardTranslatorConfig {
public:
    // Constructors
    RegexToWildcardTranslatorConfig() = default;

    // Setters
    auto set_case_insensitive_wildcard(bool case_insensitive_wildcard) -> void {
        m_case_insensitive_wildcard = case_insensitive_wildcard;
    }

    auto set_allow_anchors(bool allow_anchors) -> void { m_allow_anchors = allow_anchors; }

    auto set_add_prefix_suffix_wildcards(bool add_prefix_suffix_wildcards) -> void {
        m_add_prefix_suffix_wildcards = add_prefix_suffix_wildcards;
    }

    // Getters
    [[nodiscard]] auto case_insensitive_wildcard() const -> bool {
        return m_case_insensitive_wildcard;
    }

    [[nodiscard]] auto allow_anchors() const -> bool { return m_allow_anchors; }

    [[nodiscard]] auto add_prefix_suffix_wildcards() const -> bool {
        return m_add_prefix_suffix_wildcards;
    }

private:
    // Variables
    // Presumably requests a case-insensitive wildcard translation; off by default.
    bool m_case_insensitive_wildcard{false};
    // Presumably permits `^`/`$` anchors in the regex; on by default.
    bool m_allow_anchors{true};
    // Presumably surrounds the translation with wildcards; off by default.
    bool m_add_prefix_suffix_wildcards{false};
};

} // namespace clp::regex_utils

#endif // CLP_REGEX_UTILS_REGEXTOWILDCARDTRANSLATORCONFIG_HPP
48 changes: 48 additions & 0 deletions components/core/src/clp/regex_utils/constants.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
#ifndef CLP_REGEX_UTILS_CONSTANTS_HPP
#define CLP_REGEX_UTILS_CONSTANTS_HPP

#include <array>
#include <cstddef>
#include <string_view>

namespace clp::regex_utils {

// Number of entries in a character lookup table: one per 7-bit ASCII code (0-127).
constexpr size_t cCharBitarraySize = 128;

/**
 * Create an ASCII character lookup table (bit array) at compile time.
 *
 * @param char_str A string that contains the characters to look up. Every character must be an
 * ASCII character (code 0-127).
 * @return The lookup table as a bit array, where the entry at index `c` is true if and only if the
 * character with code `c` appears in `char_str`.
 * @throw std::out_of_range if `char_str` contains a non-ASCII character. When the function is
 * evaluated at compile time, this surfaces as a compilation error instead.
 */
[[nodiscard]] constexpr auto create_char_bit_array(std::string_view char_str
) -> std::array<bool, cCharBitarraySize> {
    // Value-initialization already sets every entry to false, so no explicit fill is needed.
    std::array<bool, cCharBitarraySize> bit_array{};
    for (char const ch : char_str) {
        // Index through `unsigned char` so that the index doesn't depend on whether plain `char`
        // is signed; `at` then rejects any code outside the table.
        bit_array.at(static_cast<unsigned char>(ch)) = true;
    }
    return bit_array;
}

// Wildcard syntax characters
constexpr char cZeroOrMoreCharsWildcard{'*'};
constexpr char cSingleCharWildcard{'?'};

// Regex quantifier characters
constexpr char cRegexZeroOrMore{'*'};
constexpr char cRegexOneOrMore{'+'};
// The zero-or-one quantifier is `?`; it must not share its value with cRegexOneOrMore.
constexpr char cRegexZeroOrOne{'?'};

// Regex anchor characters
constexpr char cRegexStartAnchor{'^'};
constexpr char cRegexEndAnchor{'$'};

// Escape character
constexpr char cEscapeChar{'\\'};

// Charset negation character; same character as the start anchor.
constexpr char cCharsetNegate{'^'};

// Each constant below is a lookup table built by create_char_bit_array: the entry at a character's
// code is true if and only if the character is in the listed set.

// Characters that are accepted after an escape char. This is a more complete set of meta characters
// than necessary, as the user might not be fully knowledgeable on which meta characters to escape,
// and may introduce unnecessary escape sequences.
constexpr auto cRegexEscapeSeqAcceptedMetaChars = create_char_bit_array("^$.*{}[]()+|?<>-_/=!\\");
// This is the set of meta characters that need escaping in the wildcard syntax.
constexpr auto cRegexEscapeSeqWildcardOnlyMetaChars = create_char_bit_array("?*\\");
// This is the set of meta characters that need escaping inside a character set.
constexpr auto cRegexCharsetEscapeSeqMetaChars = create_char_bit_array("^-]\\");

} // namespace clp::regex_utils

#endif // CLP_REGEX_UTILS_CONSTANTS_HPP
49 changes: 49 additions & 0 deletions components/core/src/clp/regex_utils/regex_utils.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
#ifndef CLP_REGEX_UTILS_REGEX_UTILS_HPP
#define CLP_REGEX_UTILS_REGEX_UTILS_HPP

#include <string>
#include <string_view>

#include <boost-outcome/include/boost/outcome/config.hpp>
#include <boost-outcome/include/boost/outcome/std_result.hpp>

#include "regex_utils/RegexToWildcardTranslatorConfig.hpp"

namespace clp::regex_utils {

/**
 * Translates a regex string into a wildcard string.
 * NOTE(review): the implementation is not in this header; presumably this overload behaves like
 * the two-argument overload with a default-constructed config -- confirm.
 *
 * @param regex_str The regex string to translate.
 * @return A result containing the wildcard string on success, or an std::error_code on failure
 * (presumably one of the regex_utils ErrorCode values -- confirm).
 */
[[nodiscard]] auto regex_to_wildcard(std::string_view regex_str
) -> BOOST_OUTCOME_V2_NAMESPACE::std_result<std::string>;

/**
 * Translates a regex string into a wildcard string, using the given translator options.
 *
 * @param regex_str The regex string to translate.
 * @param config The options for the translation.
 * @return A result containing the wildcard string on success, or an std::error_code on failure
 * (presumably one of the regex_utils ErrorCode values -- confirm).
 */
[[nodiscard]] auto regex_to_wildcard(
std::string_view regex_str,
RegexToWildcardTranslatorConfig const& config
) -> BOOST_OUTCOME_V2_NAMESPACE::std_result<std::string>;

/**
 * If a regex string begins with multiple start anchors or ends with multiple end anchors, remove
 * the duplicates. A trailing `$` that is escaped is not treated as an end anchor.
 *
 * @param regex_str
 * @return The trimmed regex string, with at most one leading start anchor and at most one
 * trailing end anchor.
 */
[[nodiscard]] auto regex_trim_line_anchors(std::string_view regex_str) -> std::string;

/**
 * Check if a regex string has a starting anchor character `^` (caret).
 *
 * @param regex_str
 * @return True if the regex string begins with `^`, false otherwise (including when the string is
 * empty).
 */
[[nodiscard]] auto regex_has_start_anchor(std::string_view regex_str) -> bool;

/**
 * Check if a regex string has an ending anchor character `$` (dollar sign).
 * Note that the regex string may end with an escaped `$`, in which case the `$` character retains
 * its literal meaning.
 *
 * @param regex_str
 * @return True if the regex string ends with an unescaped `$`, false otherwise (including when the
 * string is empty).
 */
[[nodiscard]] auto regex_has_end_anchor(std::string_view regex_str) -> bool;
} // namespace clp::regex_utils

#endif // CLP_REGEX_UTILS_REGEX_UTILS_HPP
64 changes: 64 additions & 0 deletions components/core/src/clp/regex_utils/regex_utils_anchors.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
#include <string>
#include <string_view>

#include "regex_utils/constants.hpp"
#include "regex_utils/regex_utils.hpp"

using std::string;
using std::string_view;

namespace clp::regex_utils {

/**
 * Collapses a run of leading start anchors and a run of trailing end anchors so that at most one
 * of each remains.
 *
 * @param regex_str
 * @return The trimmed regex string.
 */
auto regex_trim_line_anchors(string_view regex_str) -> string {
    auto const length{regex_str.size()};

    // Skip the leading run of carets, then step back once so that one caret (if any) is kept.
    string_view::size_type begin_pos{0};
    while (begin_pos < length && cRegexStartAnchor == regex_str[begin_pos]) {
        ++begin_pos;
    }
    if (begin_pos > 0) {
        --begin_pos;
    }

    // Skip the trailing run of dollar signs, never moving past the kept start position.
    string_view::size_type end_pos{length};
    while (end_pos > begin_pos && cRegexEndAnchor == regex_str[end_pos - 1]) {
        --end_pos;
    }
    // If any dollar sign was skipped, step forward once so that one of them is kept.
    if (end_pos > begin_pos && end_pos < length) {
        ++end_pos;
    }

    string trimmed(regex_str.substr(begin_pos, end_pos - begin_pos));

    // When dollar signs were dropped, the one that was kept may be escaped by a preceding
    // backslash, making it a literal rather than an anchor. In that case, append a real end anchor.
    bool const dropped_trailing_chars{end_pos < length};
    if (dropped_trailing_chars && false == regex_has_end_anchor(trimmed)) {
        trimmed.push_back(cRegexEndAnchor);
    }
    return trimmed;
}

/**
 * @param regex_str
 * @return Whether the first character of the regex string is the start anchor `^`.
 */
auto regex_has_start_anchor(string_view regex_str) -> bool {
    if (regex_str.empty()) {
        return false;
    }
    return cRegexStartAnchor == regex_str.front();
}

/**
 * @param regex_str
 * @return Whether the regex string ends with a dollar sign that is not escaped.
 */
auto regex_has_end_anchor(string_view regex_str) -> bool {
    if (regex_str.empty() || cRegexEndAnchor != regex_str.back()) {
        return false;
    }

    // Count the run of escape chars directly in front of the trailing dollar sign. Escape chars
    // can escape each other, so the dollar sign is escaped only when the run has an odd length.
    string_view::size_type num_escape_chars{0};
    for (auto pos{regex_str.size() - 1}; pos > 0 && cEscapeChar == regex_str[pos - 1]; --pos) {
        ++num_escape_chars;
    }
    return 0 == num_escape_chars % 2;
}

} // namespace clp::regex_utils
Loading

0 comments on commit 58657e0

Please sign in to comment.