Skip to content

Commit

Permalink
Add support for translating escaped regex metacharacters.
Browse files Browse the repository at this point in the history
  • Loading branch information
Bill-hbrhbr committed Jul 19, 2024
1 parent 1d71b6f commit 9f6b02f
Show file tree
Hide file tree
Showing 5 changed files with 81 additions and 0 deletions.
4 changes: 4 additions & 0 deletions components/core/src/clp/regex_utils/ErrorCode.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,10 @@ auto ErrorCodeCategory::message(int ev) const -> string {
case ErrorCode::IllegalDollarSign:
return "Failed to translate due to end anchor `$` in the middle of the string.";

case ErrorCode::IllegalEscapeSequence:
return "Currently only supports escape sequences that are used to suppress special "
"meanings of regex metacharacters. Alphanumeric characters are disallowed.";

case ErrorCode::UnmatchedParenthesis:
return "Unmatched opening `(` or closing `)`.";

Expand Down
1 change: 1 addition & 0 deletions components/core/src/clp/regex_utils/ErrorCode.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ enum class ErrorCode : uint8_t {
UnsupportedPipe,
IllegalCaret,
IllegalDollarSign,
IllegalEscapeSequence,
UnmatchedParenthesis,
};

Expand Down
29 changes: 29 additions & 0 deletions components/core/src/clp/regex_utils/constants.hpp
Original file line number Diff line number Diff line change
@@ -1,7 +1,29 @@
#ifndef CLP_REGEX_UTILS_CONSTANTS_HPP
#define CLP_REGEX_UTILS_CONSTANTS_HPP

#include <array>
#include <cstddef>
#include <string_view>

namespace clp::regex_utils {
constexpr size_t cCharBitarraySize = 128;

/**
 * Creates an ASCII character lookup table at compile time.
 *
 * @param char_str A string that contains the characters to look up. Every character must be a
 * 7-bit ASCII character, i.e., its code must be less than cCharBitarraySize.
 * @return The lookup table as a bit array: the entry indexed by a character's code is `true` if
 * and only if that character appears in `char_str`.
 * @throw std::out_of_range if `char_str` contains a character whose code is not less than
 * cCharBitarraySize. When the call is evaluated as a constant expression, this surfaces as a
 * compilation error instead.
 */
[[nodiscard]] constexpr auto create_char_bit_array(std::string_view char_str
) -> std::array<bool, cCharBitarraySize> {
    // Value-initialization already sets every entry to `false`, so no explicit fill is needed
    // (`std::array::fill` is also not usable in constant expressions before C++20).
    std::array<bool, cCharBitarraySize> bit_array{};
    for (char const ch : char_str) {
        // Index by the unsigned byte value so the bounds check performed by `at` doesn't depend on
        // whether `char` is signed on the target platform.
        bit_array.at(static_cast<unsigned char>(ch)) = true;
    }
    return bit_array;
}

// Wildcard meta characters
constexpr char cZeroOrMoreCharsWildcard{'*'};
constexpr char cSingleCharWildcard{'?'};
Expand All @@ -14,6 +36,13 @@ constexpr char cRegexStartAnchor{'^'};
constexpr char cRegexEndAnchor{'$'};
constexpr char cEscapeChar{'\\'};
constexpr char cCharsetNegate{'^'};

// Character bitmaps
// Lookup table of the characters that are accepted after an escape backslash in a regex.
// This is a more complete set of meta characters than strictly necessary: users may not know
// exactly which meta characters need escaping and may therefore write escape sequences that are
// redundant but harmless.
constexpr auto cRegexEscapeSeqMetaChars = create_char_bit_array("*+?|^$.{}[]()<>-_/=!\\");
// Lookup table of the meta characters that must be escaped in the wildcard syntax, i.e., the ones
// that keep their escape backslash when they are written to the wildcard string.
constexpr auto cWildcardMetaChars = create_char_bit_array("?*\\");
} // namespace clp::regex_utils

#endif // CLP_REGEX_UTILS_CONSTANTS_HPP
34 changes: 34 additions & 0 deletions components/core/src/clp/regex_utils/regex_translation_utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ class TranslatorState {
// States of the regex-to-wildcard translator. The translator dispatches each input character to
// the transition function that corresponds to the current state.
enum class RegexPatternState : uint8_t {
// Default state; the escaped state returns here after consuming its character.
NORMAL = 0,
// Entered from the normal state after reading a `.`.
DOT,
// Entered from the normal state after reading the escape character (backslash).
ESCAPED,
// Entered from the normal state after reading the end anchor `$`.
END,
};

Expand Down Expand Up @@ -92,6 +93,14 @@ using StateTransitionFuncSig
*/
[[nodiscard]] StateTransitionFuncSig dot_state_transition;

/**
 * Appends an escaped regex metacharacter literally to the wildcard string, then returns the
 * translator to the normal state.
 *
 * These metacharacters are escaped by backslashes, so they have their special meanings suppressed.
 * For metacharacters shared by the regex and the wildcard syntax (`?`, `*`, and `\`), the escape
 * backslash is kept in the output; all other supported metacharacters are appended without it.
 *
 * @return ErrorCode::IllegalEscapeSequence if the character following the backslash is not one of
 * the supported metacharacters (e.g., it is alphanumeric).
 * @return ErrorCode::Success otherwise.
 */
[[nodiscard]] StateTransitionFuncSig escaped_state_transition;

/**
* Disallows the appearances of other characters after encountering an end anchor in the string.
*/
Expand All @@ -114,6 +123,9 @@ auto normal_state_transition(
case '.':
state.set_next_state(TranslatorState::RegexPatternState::DOT);
break;
case cEscapeChar:
state.set_next_state(TranslatorState::RegexPatternState::ESCAPED);
break;
case cRegexEndAnchor:
state.set_next_state(TranslatorState::RegexPatternState::END);
break;
Expand Down Expand Up @@ -159,6 +171,25 @@ auto dot_state_transition(
return ErrorCode::Success;
}

/**
 * Translates the character that immediately follows an escape backslash.
 *
 * @param state Translator state; set back to the normal state on success.
 * @param it Iterator pointing at the escaped character. It is only read, not advanced.
 * @param wildcard_str Output wildcard string that the translated character is appended to.
 * @param config Unused.
 * @return ErrorCode::IllegalEscapeSequence if the escaped character is not a supported regex
 * metacharacter. This includes alphanumeric characters and any non-ASCII byte.
 * @return ErrorCode::Success otherwise.
 */
auto escaped_state_transition(
        TranslatorState& state,
        string_view::const_iterator& it,
        string& wildcard_str,
        [[maybe_unused]] RegexToWildcardTranslatorConfig const& config
) -> error_code {
    auto const ch{*it};
    // The lookup tables only cover 7-bit ASCII. Index with the unsigned byte value and check the
    // range explicitly so that a non-ASCII byte is reported as an illegal escape sequence rather
    // than making `at` throw `std::out_of_range`.
    auto const table_idx{static_cast<size_t>(static_cast<unsigned char>(ch))};
    if (table_idx >= cRegexEscapeSeqMetaChars.size() || !cRegexEscapeSeqMetaChars.at(table_idx)) {
        return ErrorCode::IllegalEscapeSequence;
    }
    // Characters that are also special in the wildcard syntax must stay escaped in the output.
    if (cWildcardMetaChars.at(table_idx)) {
        wildcard_str += cEscapeChar;
    }
    wildcard_str += ch;
    state.set_next_state(TranslatorState::RegexPatternState::NORMAL);
    return ErrorCode::Success;
}

auto end_state_transition(
[[maybe_unused]] TranslatorState& state,
string_view::const_iterator& it,
Expand Down Expand Up @@ -226,6 +257,9 @@ auto regex_to_wildcard(string_view regex_str, RegexToWildcardTranslatorConfig co
case TranslatorState::RegexPatternState::DOT:
ec = dot_state_transition(state, it, wildcard_str, config);
break;
case TranslatorState::RegexPatternState::ESCAPED:
ec = escaped_state_transition(state, it, wildcard_str, config);
break;
case TranslatorState::RegexPatternState::END:
ec = end_state_transition(state, it, wildcard_str, config);
break;
Expand Down
13 changes: 13 additions & 0 deletions components/core/tests/test-regex_utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,19 @@ TEST_CASE("regex_to_wildcard", "[regex_utils][regex_to_wildcard]") {
REQUIRE((regex_to_wildcard(". xyz .*+ zyx .").error() == ErrorCode::UntranslatablePlus));
REQUIRE((regex_to_wildcard(". xyz |.* zyx .").error() == ErrorCode::UnsupportedPipe));
REQUIRE((regex_to_wildcard(". xyz ^.* zyx .").error() == ErrorCode::IllegalCaret));

// Test escaped meta characters
REQUIRE((regex_to_wildcard("<>-_/=!").value() == "<>-_/=!"));
REQUIRE((regex_to_wildcard("\\<\\>\\-\\_\\/\\=\\!").value() == "<>-_/=!"));
REQUIRE(
(regex_to_wildcard("\\*\\+\\?\\|\\^\\$\\.\\{\\}\\[\\]\\(\\)\\<\\>\\-\\_\\/\\=\\!\\\\")
.value()
== "\\*+\\?|^$.{}[]()<>-_/=!\\\\")
);
REQUIRE(
(regex_to_wildcard("abc\\Qdefghi\\Ejkl").error()
== clp::regex_utils::ErrorCode::IllegalEscapeSequence)
);
}

TEST_CASE("regex_to_wildcard_anchor_config", "[regex_utils][regex_to_wildcard][anchor_config]") {
Expand Down

0 comments on commit 9f6b02f

Please sign in to comment.