Add regex utils including regex to wildcard translation

Bill-hbrhbr · Jul 14, 2024 · 58657e0 · 58657e0
1 parent 9ba0451
commit 58657e0
Show file tree

Hide file tree

Showing 10 changed files with 1,278 additions and 0 deletions.
diff --git a/components/core/CMakeLists.txt b/components/core/CMakeLists.txt
@@ -210,6 +210,7 @@ include(cmake/Modules/FindLibraryDependencies.cmake)
 FindDynamicLibraryDependencies(sqlite "${sqlite_DYNAMIC_LIBS}")
 
 add_subdirectory(src/clp/string_utils)
+add_subdirectory(src/clp/regex_utils)
 
 add_subdirectory(src/clp/clg)
 add_subdirectory(src/clp/clo)
@@ -475,6 +476,7 @@ set(SOURCE_FILES_unitTest
         tests/test-Stopwatch.cpp
         tests/test-StreamingCompression.cpp
         tests/test-string_utils.cpp
+        tests/test-regex_utils.cpp
         tests/test-TimestampPattern.cpp
         tests/test-utf8_utils.cpp
         tests/test-Utils.cpp
@@ -498,6 +500,7 @@ target_link_libraries(unitTest
         ${sqlite_LIBRARY_DEPENDENCIES}
         ${STD_FS_LIBS}
         clp::string_utils
+        clp::regex_utils
         yaml-cpp::yaml-cpp
         ZStd::ZStd
         )

diff --git a/components/core/src/clp/regex_utils/CMakeLists.txt b/components/core/src/clp/regex_utils/CMakeLists.txt
@@ -0,0 +1,22 @@
+# Headers of the regex_utils library; they are passed to add_library() below together with the
+# source files.
+set(
+        REGEX_UTILS_HEADER_LIST
+        "ErrorCode.hpp"
+        "RegexToWildcardTranslatorConfig.hpp"
+        "constants.hpp"
+        "regex_utils.hpp"
+)
+add_library(
+        regex_utils
+        regex_utils_regex_to_wildcard.cpp
+        regex_utils_anchors.cpp
+        ErrorCode.cpp
+        ${REGEX_UTILS_HEADER_LIST}
+)
+# Namespaced alias: consumers link against `clp::regex_utils`.
+add_library(clp::regex_utils ALIAS regex_utils)
+# The parent directory is PUBLIC so that sources can include "regex_utils/<header>".
+# NOTE(review): regex_utils.hpp includes <boost-outcome/include/boost/outcome/...>, presumably
+# resolved through the PRIVATE submodules directory below. If so, consumers of the header need that
+# directory on their own include path -- confirm whether it should be PUBLIC instead.
+target_include_directories(regex_utils
+        PUBLIC
+        ../
+        PRIVATE
+        "${PROJECT_SOURCE_DIR}/submodules"
+)
+target_compile_features(regex_utils PRIVATE cxx_std_20)
diff --git a/components/core/src/clp/regex_utils/ErrorCode.cpp b/components/core/src/clp/regex_utils/ErrorCode.cpp
@@ -0,0 +1,93 @@
+#include "regex_utils/ErrorCode.hpp"
+
+#include <string>
+#include <string_view>
+#include <system_error>
+
+using std::error_category;
+using std::error_code;
+using std::string;
+using std::string_view;
+
+namespace clp::regex_utils {
+
+/**
+ * Error category that gives the `ErrorCode` values more detailed string descriptions.
+ * This class does not need to be seen outside the std error code wrapper implementation.
+ */
+class ErrorCodeCategory : public error_category {
+public:
+    /**
+     * @return The name of this error category (the class of errors it describes).
+     */
+    [[nodiscard]] char const* name() const noexcept override;
+
+    /**
+     * @param ev The error code encoded in int (an `ErrorCode` value cast to `int`).
+     * @return The descriptive message for the error, or "(unrecognized error)" if `ev` does not
+     * match any known `ErrorCode`.
+     */
+    [[nodiscard]] string message(int ev) const override;
+};
+
+auto ErrorCodeCategory::name() const noexcept -> char const* {
+    return "regex utility";  // Fixed label identifying this category of errors
+}
+
+/**
+ * Maps an error value of this category to a human-readable description.
+ *
+ * @param ev An `ErrorCode` value encoded as `int`.
+ * @return The description of the matching `ErrorCode`, or "(unrecognized error)" if `ev` doesn't
+ * correspond to any enumerator handled below.
+ */
+auto ErrorCodeCategory::message(int ev) const -> string {
+    switch (static_cast<ErrorCode>(ev)) {
+        case ErrorCode::Success:
+            return "Success.";
+
+        case ErrorCode::IllegalState:
+            return "Unrecognized state.";
+
+        case ErrorCode::Star:
+            return "Failed to translate due to metachar `*` (zero or more occurrences).";
+
+        case ErrorCode::Plus:
+            return "Failed to translate due to metachar `+` (one or more occurrences).";
+
+        case ErrorCode::Question:
+            return "Currently does not support returning a list of wildcard translations. The "
+                   "metachar `?` (lazy match) may be supported in the future.";
+
+        case ErrorCode::Pipe:
+            return "Currently does not support returning a list of wildcard translations. The "
+                   "regex OR condition feature may be supported in the future.";
+
+        case ErrorCode::Caret:
+            return "Failed to translate due to start anchor `^` in the middle of the string.";
+
+        case ErrorCode::Dollar:
+            return "Failed to translate due to end anchor `$` in the middle of the string.";
+
+        case ErrorCode::DisallowedEscapeSequence:
+            return "Disallowed escape sequence.";
+
+        case ErrorCode::UnmatchedParenthesis:
+            return "Unmatched opening `(` or closing `)`.";
+
+        case ErrorCode::UnsupportedCharsets:
+            return "Currently only supports case-insensitive single-char charset (i.e. [aA] [bB]).";
+
+        case ErrorCode::IncompleteCharsetStructure:
+            return "Unmatched closing `]` at the end of the string.";
+
+        case ErrorCode::UnsupportedQuantifier:
+            return "Currently only supports exact positive number of repetitions in regex syntax.";
+
+        case ErrorCode::TokenUnquantifiable:
+            return "The preceding token is not quantifiable.";
+
+        default:
+            // `ev` may be any int, so values outside the enum must still produce a message.
+            return "(unrecognized error)";
+    }
+}
+
+// The category instance that every `std::error_code` built by `make_error_code` refers to.
+ErrorCodeCategory const cTheErrorCodeCategory{};
+
+/**
+ * Wraps an `ErrorCode` into a `std::error_code` tied to the regex utility error category.
+ *
+ * @param e The error code enum value.
+ * @return A `std::error_code` carrying the integer value of `e` and the category above.
+ */
+auto make_error_code(ErrorCode e) -> error_code {
+    auto const raw_value{static_cast<int>(e)};
+    return error_code{raw_value, cTheErrorCodeCategory};
+}
+
+}  // namespace clp::regex_utils
diff --git a/components/core/src/clp/regex_utils/ErrorCode.hpp b/components/core/src/clp/regex_utils/ErrorCode.hpp
@@ -0,0 +1,46 @@
+#ifndef CLP_REGEX_UTILS_ERRORCODE_HPP
+#define CLP_REGEX_UTILS_ERRORCODE_HPP
+
+#include <cstdint>
+#include <system_error>
+#include <type_traits>
+
+namespace clp::regex_utils {
+
+/**
+ * Enum class for propagating and handling various regex utility errors.
+ * The per-value notes below summarize the messages in ErrorCode.cpp, where the full descriptions
+ * can be found.
+ */
+enum class ErrorCode : uint8_t {
+    Success = 0,
+    IllegalState,  // Unrecognized state
+    Star,  // Metachar `*` (zero or more occurrences) could not be translated
+    Plus,  // Metachar `+` (one or more occurrences) could not be translated
+    Question,  // Metachar `?` is not supported yet
+    Pipe,  // The regex OR condition is not supported yet
+    Caret,  // Start anchor `^` in the middle of the string
+    Dollar,  // End anchor `$` in the middle of the string
+    DisallowedEscapeSequence,
+    UnmatchedParenthesis,  // Unmatched opening `(` or closing `)`
+    UnsupportedCharsets,  // Only case-insensitive single-char charsets (e.g. [aA]) are supported
+    IncompleteCharsetStructure,  // Unmatched closing `]` at the end of the string
+    UnsupportedQuantifier,  // Only an exact positive number of repetitions is supported
+    TokenUnquantifiable,  // The preceding token is not quantifiable
+};
+
+/**
+ * Wrapper function to turn a regular enum class into an std::error_code.
+ *
+ * @param ec An error code enum.
+ * @return The corresponding std::error_code type variable.
+ */
+[[nodiscard]] auto make_error_code(ErrorCode ec) -> std::error_code;
+
+}  // namespace clp::regex_utils
+
+namespace std {
+template <>  // Registers `ErrorCode` as an error code enum so it converts to `std::error_code`
+struct is_error_code_enum<clp::regex_utils::ErrorCode> : true_type {};
+}  // namespace std
+
+#endif  // CLP_REGEX_UTILS_ERRORCODE_HPP
diff --git a/components/core/src/clp/regex_utils/RegexToWildcardTranslatorConfig.hpp b/components/core/src/clp/regex_utils/RegexToWildcardTranslatorConfig.hpp
@@ -0,0 +1,42 @@
+#ifndef CLP_REGEX_UTILS_REGEXTOWILDCARDTRANSLATORCONFIG_HPP
+#define CLP_REGEX_UTILS_REGEXTOWILDCARDTRANSLATORCONFIG_HPP
+
+namespace clp::regex_utils {
+
+/**
+ * Holds the boolean options of the regex to wildcard translator.
+ *
+ * A default-constructed config has `case_insensitive_wildcard` off, `allow_anchors` on, and
+ * `add_prefix_suffix_wildcards` off. Each option has one getter and one setter.
+ */
+class RegexToWildcardTranslatorConfig {
+public:
+    // Constructors
+    RegexToWildcardTranslatorConfig() = default;
+
+    // Getters
+    /**
+     * @return The current value of the `case_insensitive_wildcard` option.
+     */
+    [[nodiscard]] auto case_insensitive_wildcard() const -> bool {
+        return m_case_insensitive_wildcard;
+    }
+
+    /**
+     * @return The current value of the `allow_anchors` option.
+     */
+    [[nodiscard]] auto allow_anchors() const -> bool {
+        return m_allow_anchors;
+    }
+
+    /**
+     * @return The current value of the `add_prefix_suffix_wildcards` option.
+     */
+    [[nodiscard]] auto add_prefix_suffix_wildcards() const -> bool {
+        return m_add_prefix_suffix_wildcards;
+    }
+
+    // Setters
+    auto set_case_insensitive_wildcard(bool case_insensitive_wildcard) -> void {
+        m_case_insensitive_wildcard = case_insensitive_wildcard;
+    }
+
+    auto set_allow_anchors(bool allow_anchors) -> void {
+        m_allow_anchors = allow_anchors;
+    }
+
+    auto set_add_prefix_suffix_wildcards(bool add_prefix_suffix_wildcards) -> void {
+        m_add_prefix_suffix_wildcards = add_prefix_suffix_wildcards;
+    }
+
+private:
+    // Variables
+    bool m_case_insensitive_wildcard{false};
+    bool m_allow_anchors{true};
+    bool m_add_prefix_suffix_wildcards{false};
+};
+
+}  // namespace clp::regex_utils
+
+#endif  // CLP_REGEX_UTILS_REGEXTOWILDCARDTRANSLATORCONFIG_HPP
diff --git a/components/core/src/clp/regex_utils/constants.hpp b/components/core/src/clp/regex_utils/constants.hpp
@@ -0,0 +1,48 @@
+#ifndef CLP_REGEX_UTILS_CONSTANTS_HPP
+#define CLP_REGEX_UTILS_CONSTANTS_HPP
+
+#include <array>
+#include <cstddef>
+#include <string_view>
+
+namespace clp::regex_utils {
+
+// Number of entries in each character lookup table: one per 7-bit ASCII code (0-127).
+constexpr size_t cCharBitarraySize = 128;
+
+/**
+ * Create an ASCII character lookup table (bit array) at compile time.
+ *
+ * @param char_str A string that contains the characters to look up.
+ * @return The lookup table as bit array: the entry at index `c` is true if and only if the
+ * character with code `c` appears in `char_str`.
+ * @throw std::out_of_range if `char_str` contains a character whose code is not below
+ * `cCharBitarraySize`. In a constant expression this surfaces as a compilation error instead.
+ */
+[[nodiscard]] constexpr auto create_char_bit_array(std::string_view char_str
+) -> std::array<bool, cCharBitarraySize> {
+    // Value-initialization sets every entry to false, so no explicit fill is needed.
+    std::array<bool, cCharBitarraySize> bit_array{};
+    for (char const ch : char_str) {
+        // Index through `unsigned char` so the conversion is explicit and doesn't depend on
+        // whether plain `char` is signed; non-ASCII bytes map to 128-255 and are rejected by `at`.
+        bit_array.at(static_cast<unsigned char>(ch)) = true;
+    }
+    return bit_array;
+}
+
+constexpr char cZeroOrMoreCharsWildcard{'*'};  // Wildcard syntax
+constexpr char cSingleCharWildcard{'?'};  // Wildcard syntax
+constexpr char cRegexZeroOrMore{'*'};
+constexpr char cRegexOneOrMore{'+'};
+constexpr char cRegexZeroOrOne{'?'};  // Must differ from cRegexOneOrMore: `?` is zero-or-one
+constexpr char cRegexStartAnchor{'^'};
+constexpr char cRegexEndAnchor{'$'};
+constexpr char cEscapeChar{'\\'};
+constexpr char cCharsetNegate{'^'};  // Same character as the start anchor
+
+// Lookup table with a more complete set of meta characters than necessary, as the user might not
+// be fully knowledgeable on which meta characters to escape, and may escape some unnecessarily.
+constexpr auto cRegexEscapeSeqAcceptedMetaChars = create_char_bit_array("^$.*{}[]()+|?<>-_/=!\\");
+// Lookup table of the meta characters that need escaping in the wildcard syntax.
+constexpr auto cRegexEscapeSeqWildcardOnlyMetaChars = create_char_bit_array("?*\\");
+// Lookup table of the meta characters that need escaping inside a character set.
+constexpr auto cRegexCharsetEscapeSeqMetaChars = create_char_bit_array("^-]\\");
+
+}  // namespace clp::regex_utils
+
+#endif  // CLP_REGEX_UTILS_CONSTANTS_HPP
diff --git a/components/core/src/clp/regex_utils/regex_utils.hpp b/components/core/src/clp/regex_utils/regex_utils.hpp
@@ -0,0 +1,49 @@
+#ifndef CLP_REGEX_UTILS_REGEX_UTILS_HPP
+#define CLP_REGEX_UTILS_REGEX_UTILS_HPP
+
+#include <string>
+#include <string_view>
+
+#include <boost-outcome/include/boost/outcome/config.hpp>
+#include <boost-outcome/include/boost/outcome/std_result.hpp>
+
+#include "regex_utils/RegexToWildcardTranslatorConfig.hpp"
+
+namespace clp::regex_utils {
+
+/**
+ * Translates a regex string into a wildcard string.
+ * NOTE(review): the implementation is not visible from this header. Presumably this overload
+ * behaves like the two-argument one with a default-constructed config -- confirm.
+ *
+ * @param regex_str The regex string to translate.
+ * @return A result holding the translated string on success, or an error otherwise (presumably
+ * a `std::error_code` created from `ErrorCode` -- confirm against the implementation).
+ */
+[[nodiscard]] auto regex_to_wildcard(std::string_view regex_str
+) -> BOOST_OUTCOME_V2_NAMESPACE::std_result<std::string>;
+
+/**
+ * Translates a regex string into a wildcard string using the given translator options.
+ *
+ * @param regex_str The regex string to translate.
+ * @param config The translator options; how each option affects the translation is defined by
+ * the implementation, which is not visible from this header.
+ * @return A result holding the translated string on success, or an error otherwise (presumably
+ * a `std::error_code` created from `ErrorCode` -- confirm against the implementation).
+ */
+[[nodiscard]] auto regex_to_wildcard(
+        std::string_view regex_str,
+        RegexToWildcardTranslatorConfig const& config
+) -> BOOST_OUTCOME_V2_NAMESPACE::std_result<std::string>;
+
+/**
+ * If a regex expression contains multiple starting or ending anchors, remove the duplicates.
+ *
+ * @param regex_str
+ * @return The trimmed regex string, with at most one starting anchor and one ending anchor.
+ */
+[[nodiscard]] auto regex_trim_line_anchors(std::string_view regex_str) -> std::string;
+
+/**
+ * Check if a regex string has a starting anchor character `^` (caret).
+ *
+ * @param regex_str
+ * @return True if the regex string begins with `^`, false otherwise.
+ */
+[[nodiscard]] auto regex_has_start_anchor(std::string_view regex_str) -> bool;
+
+/**
+ * Check if a regex string has an ending anchor character `$` (dollar sign).
+ * Note that the regex string may end with an escaped `$`, in which case the `$` character retains
+ * its literal meaning.
+ *
+ * @param regex_str
+ * @return True if the regex string ends with an unescaped `$`, false otherwise.
+ */
+[[nodiscard]] auto regex_has_end_anchor(std::string_view regex_str) -> bool;
+}  // namespace clp::regex_utils
+
+#endif  // CLP_REGEX_UTILS_REGEX_UTILS_HPP
diff --git a/components/core/src/clp/regex_utils/regex_utils_anchors.cpp b/components/core/src/clp/regex_utils/regex_utils_anchors.cpp
@@ -0,0 +1,64 @@
+#include <string>
+#include <string_view>
+
+#include "regex_utils/constants.hpp"
+#include "regex_utils/regex_utils.hpp"
+
+using std::string;
+using std::string_view;
+
+namespace clp::regex_utils {
+
+/**
+ * Collapses runs of leading `^` and trailing `$` so that at most one of each remains.
+ *
+ * @param regex_str The regex string to trim.
+ * @return A copy of `regex_str` without the duplicated start and end anchors.
+ */
+auto regex_trim_line_anchors(string_view regex_str) -> string {
+    auto const length{regex_str.size()};
+
+    // Skip the whole run of leading carets, then step back by one so that a single start anchor
+    // is kept whenever the run was not empty.
+    string_view::size_type begin_pos{0};
+    while (begin_pos != length && cRegexStartAnchor == regex_str[begin_pos]) {
+        ++begin_pos;
+    }
+    if (0 != begin_pos) {
+        --begin_pos;
+    }
+
+    // Skip the whole run of trailing dollar signs without moving past the start position, then
+    // step forward by one so that a single dollar sign is kept whenever something was skipped.
+    auto end_pos{length};
+    while (begin_pos != end_pos && cRegexEndAnchor == regex_str[end_pos - 1]) {
+        --end_pos;
+    }
+    if (begin_pos != end_pos && length != end_pos) {
+        ++end_pos;
+    }
+
+    string trimmed_regex_str(regex_str.substr(begin_pos, end_pos - begin_pos));
+
+    // If dollar signs were dropped but the kept one is escaped (i.e. a literal `$`), or nothing was
+    // kept at all, the result has lost its real end anchor, so put one back.
+    bool const dropped_end_anchors{length != end_pos};
+    if (dropped_end_anchors && false == regex_has_end_anchor(trimmed_regex_str)) {
+        trimmed_regex_str += cRegexEndAnchor;
+    }
+    return trimmed_regex_str;
+}
+
+/**
+ * @param regex_str The regex string to inspect.
+ * @return Whether the first character of `regex_str` is the start anchor `^`; always false for
+ * an empty string.
+ */
+auto regex_has_start_anchor(string_view regex_str) -> bool {
+    if (regex_str.empty()) {
+        return false;
+    }
+    return cRegexStartAnchor == regex_str.front();
+}
+
+/**
+ * @param regex_str The regex string to inspect.
+ * @return Whether `regex_str` ends with a dollar sign that is not escaped, i.e. one preceded by an
+ * even number (possibly zero) of consecutive escape characters.
+ */
+auto regex_has_end_anchor(string_view regex_str) -> bool {
+    if (regex_str.empty() || cEndAnchorMismatch(regex_str)) {
+        return false;
+    }
+    return false;
+}
+
+}  // namespace clp::regex_utils