Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

regex-utils: Add support for translating regex character sets into wildcards when possible. #493

Merged
merged 7 commits into from
Jul 30, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions components/core/src/clp/regex_utils/ErrorCode.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,13 @@ auto ErrorCodeCategory::message(int ev) const -> string {
case ErrorCode::UnmatchedParenthesis:
return "Unmatched opening `(` or closing `)`.";

case ErrorCode::IncompleteCharsetStructure:
return "Unmatched closing `]` at the end of the string.";

case ErrorCode::UnsupportedCharsetPattern:
return "Currently only supports character set that can be reduced to a single "
"character.";

default:
return "(unrecognized error)";
}
Expand Down
2 changes: 2 additions & 0 deletions components/core/src/clp/regex_utils/ErrorCode.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@ enum class ErrorCode : uint8_t {
IllegalDollarSign,
IllegalEscapeSequence,
UnmatchedParenthesis,
IncompleteCharsetStructure,
UnsupportedCharsetPattern,
};

/**
Expand Down
3 changes: 3 additions & 0 deletions components/core/src/clp/regex_utils/constants.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,9 @@ constexpr auto cRegexEscapeSeqMetaCharsLut = create_char_bit_array("*+?|^$.{}[](
// The set of wildcard metacharacters that must remain escaped in the translated string to be
// treated as a literal.
constexpr auto cWildcardMetaCharsLut = create_char_bit_array("?*\\");
// The set of metacharacters that can be preceded with an escape backslash in the regex character
// set to be treated as a literal.
constexpr auto cRegexCharsetEscapeSeqMetaCharsLut = create_char_bit_array("^-]\\");
} // namespace clp::regex_utils

#endif // CLP_REGEX_UTILS_CONSTANTS_HPP
154 changes: 148 additions & 6 deletions components/core/src/clp/regex_utils/regex_translation_utils.cpp
Original file line number Diff line number Diff line change
@@ -1,18 +1,22 @@
#include "regex_utils/regex_translation_utils.hpp"

#include <cstdint>
#include <optional>
#include <string>
#include <string_view>
#include <system_error>

#include <outcome/single-header/outcome.hpp>
#include <string_utils/string_utils.hpp>

#include "regex_utils/constants.hpp"
#include "regex_utils/ErrorCode.hpp"
#include "regex_utils/RegexToWildcardTranslatorConfig.hpp"

namespace clp::regex_utils {
using clp::string_utils::is_alphabet;
using std::error_code;
using std::optional;
using std::string;
using std::string_view;

Expand All @@ -31,6 +35,8 @@ class TranslatorState {
* literally.</li>
* <li>Dot: Encountered a period `.`. Expecting wildcard expression.</li>
* <li>Escaped: Encountered a backslash `\`. Expecting an escape sequence.</li>
* <li>Charset: Encountered an opening square bracket `[`. Expecting a character set.</li>
* <li>CharsetEscaped: Encountered an escape backslash in the character set.</li>
* <li>End: Encountered a dollar sign `$`, meaning the regex string has reached the end
* anchor.</li>
* </ul>
Expand All @@ -39,6 +45,8 @@ class TranslatorState {
Normal = 0,
Dot,
Escaped,
Charset,
CharsetEscaped,
End,
};

Expand All @@ -48,12 +56,23 @@ class TranslatorState {
// Getters
[[nodiscard]] auto get_state() const -> RegexPatternState { return m_state; }

/**
 * @return The stored charset begin iterator, or an empty optional if none is currently stored.
 */
[[nodiscard]] auto get_charset_begin_it() const -> optional<string_view::const_iterator> {
return m_charset_begin_it;
}

// Setters
auto set_next_state(RegexPatternState const& state) -> void { m_state = state; }

/**
 * @param charset_begin_it The iterator to store as the beginning of the current character set.
 */
auto set_charset_begin_it(string_view::const_iterator charset_begin_it) -> void {
m_charset_begin_it = charset_begin_it;
}

/**
 * Clears the stored charset begin iterator so that the getter returns an empty optional.
 */
auto invalidate_charset_begin_it() -> void { m_charset_begin_it.reset(); }

private:
// Members
RegexPatternState m_state{RegexPatternState::Normal};
optional<string_view::const_iterator> m_charset_begin_it;
};

/**
Expand All @@ -65,7 +84,7 @@ class TranslatorState {
* @param[in, out] it The iterator that represents the current regex string scan position. May be
* updated to advance or backtrack the scan position.
* @param[out] wildcard_str The translated wildcard string. May or may not be updated.
* @param[in] config The translator config.
* @param[in] config The translator config predefined by the user.
* @return clp::regex_utils::ErrorCode
*/
using StateTransitionFuncSig
Expand Down Expand Up @@ -103,6 +122,24 @@ using StateTransitionFuncSig
*/
[[nodiscard]] StateTransitionFuncSig escaped_state_transition;

/**
* Reduces a regex character set into a single character so that the regex string is still
* translatable into a wildcard string.
*
* In most cases, only a trivial character set containing a single character is reducible. However,
* if the output wildcard query will be analyzed in case-insensitive mode, character set patterns
* such as [aA] or [Bb] are also reducible. Does not support empty charsets.
* On error, returns IncompleteCharsetStructure, UnsupportedCharsetPattern, or IllegalState.
*/
[[nodiscard]] StateTransitionFuncSig charset_state_transition;

/**
* A transient state used to defer handling of escape sequences in a charset pattern.
*
* Allows the charset state to accurately capture the appearance of a closing bracket `]`.
*/
[[nodiscard]] StateTransitionFuncSig charset_escaped_state_transition;

/**
* Disallows the appearances of other characters after encountering an end anchor in the string.
*/
Expand All @@ -114,6 +151,23 @@ using StateTransitionFuncSig
*/
[[nodiscard]] StateTransitionFuncSig final_state_cleanup;

/**
* Appends a single character as a literal to the wildcard string.
*
* If the literal is a metacharacter in the wildcard syntax, prepend the literal with an escape
* backslash.
* @param ch The literal to be appended.
* @param wildcard_str The wildcard string to be updated.
*/
auto append_char_to_wildcard(char ch, string& wildcard_str) -> void;

/**
* @param ch0
* @param ch1
* @return Whether the given chars are the same, but with opposing letter cases, e.g. 'A' vs. 'a'.
*/
[[nodiscard]] auto is_same_char_opposite_case(char ch0, char ch1) -> bool;

auto normal_state_transition(
TranslatorState& state,
string_view::const_iterator& it,
Expand All @@ -128,6 +182,10 @@ auto normal_state_transition(
case cEscapeChar:
state.set_next_state(TranslatorState::RegexPatternState::Escaped);
break;
case '[':
state.set_charset_begin_it(it + 1);
state.set_next_state(TranslatorState::RegexPatternState::Charset);
break;
case cRegexEndAnchor:
state.set_next_state(TranslatorState::RegexPatternState::End);
break;
Expand Down Expand Up @@ -183,14 +241,72 @@ auto escaped_state_transition(
if (false == cRegexEscapeSeqMetaCharsLut.at(ch)) {
return ErrorCode::IllegalEscapeSequence;
}
if (cWildcardMetaCharsLut.at(ch)) {
wildcard_str += cEscapeChar;
append_char_to_wildcard(ch, wildcard_str);
state.set_next_state(TranslatorState::RegexPatternState::Normal);
return ErrorCode::Success;
}

/**
 * Handles one character while the scanner is inside a regex character set (`[...]`).
 *
 * Characters other than the escape backslash and `]` are skipped without producing output. An
 * escape backslash hands control to the CharsetEscaped state. When `]` is reached, the span between
 * the stored charset begin iterator and `it` is reduced to a single character, which is appended to
 * the wildcard string, and the state returns to Normal.
 *
 * @param[in, out] state The translator state; must hold a charset begin iterator.
 * @param[in, out] it The current scan position. Not modified by this function.
 * @param[out] wildcard_str The translated wildcard string; only updated when `]` closes a reducible
 * charset.
 * @param[in] config Supplies the `case_insensitive_wildcard` option.
 * @return ErrorCode::Success if the character was consumed.
 * @return ErrorCode::IllegalState if no charset begin iterator is stored in the state.
 * @return ErrorCode::UnsupportedCharsetPattern if the closed charset is empty, longer than two
 * characters, or otherwise cannot be reduced to a single character.
 */
auto charset_state_transition(
        TranslatorState& state,
        string_view::const_iterator& it,
        string& wildcard_str,
        RegexToWildcardTranslatorConfig const& config
) -> error_code {
    auto const charset_begin_it_opt{state.get_charset_begin_it()};
    if (false == charset_begin_it_opt.has_value()) {
        return ErrorCode::IllegalState;
    }
    string_view::const_iterator const charset_begin_it = charset_begin_it_opt.value();

    auto const ch{*it};
    if (cEscapeChar == ch) {
        // Defer the escaped character to the CharsetEscaped state so that an escaped `]` is not
        // mistaken for the end of the charset.
        state.set_next_state(TranslatorState::RegexPatternState::CharsetEscaped);
        return ErrorCode::Success;
    }
    if (']' != ch) {
        // Still inside the charset; nothing is emitted until the closing bracket is seen.
        return ErrorCode::Success;
    }

    // `it` points at the closing bracket, so this is the number of characters inside `[...]`.
    auto const charset_len{it - charset_begin_it};
    if (0 == charset_len || charset_len > 2) {
        return ErrorCode::UnsupportedCharsetPattern;
    }

    auto const ch0{*charset_begin_it};
    char parsed_char{};

    if (1 == charset_len) {
        // A lone negation caret or a lone backslash is not a literal.
        if (cCharsetNegate == ch0 || cEscapeChar == ch0) {
            return ErrorCode::UnsupportedCharsetPattern;
        }
        parsed_char = ch0;
    } else {  // 2 == charset_len
        auto const ch1{*(charset_begin_it + 1)};
        if (cEscapeChar == ch0 && cRegexCharsetEscapeSeqMetaCharsLut.at(ch1)) {
            // Escaped charset metacharacter, e.g. `[\^]`.
            parsed_char = ch1;
        } else if (config.case_insensitive_wildcard() && is_same_char_opposite_case(ch0, ch1)) {
            parsed_char = ch0 > ch1 ? ch0 : ch1;  // choose the lower case character
        } else {
            return ErrorCode::UnsupportedCharsetPattern;
        }
    }

    append_char_to_wildcard(parsed_char, wildcard_str);
    state.invalidate_charset_begin_it();
    state.set_next_state(TranslatorState::RegexPatternState::Normal);
    return ErrorCode::Success;
}

/**
 * Consumes the character that follows an escape backslash inside a character set.
 *
 * The character at `it` is not inspected here; the only effect is to hand control back to the
 * Charset state.
 *
 * @param[in, out] state The translator state whose next state is set to Charset.
 * @param it Unused.
 * @param wildcard_str Unused.
 * @param config Unused.
 * @return ErrorCode::Success always.
 */
auto charset_escaped_state_transition(
        TranslatorState& state,
        [[maybe_unused]] string_view::const_iterator& it,
        [[maybe_unused]] string& wildcard_str,
        [[maybe_unused]] RegexToWildcardTranslatorConfig const& config
) -> error_code {
    constexpr auto cNextState{TranslatorState::RegexPatternState::Charset};
    state.set_next_state(cNextState);
    return ErrorCode::Success;
}

auto end_state_transition(
[[maybe_unused]] TranslatorState& state,
string_view::const_iterator& it,
Expand All @@ -215,6 +331,9 @@ auto final_state_cleanup(
// multichar wildcard
wildcard_str += cSingleCharWildcard;
break;
case TranslatorState::RegexPatternState::Charset:
case TranslatorState::RegexPatternState::CharsetEscaped:
return ErrorCode::IncompleteCharsetStructure;
default:
break;
}
Expand All @@ -226,10 +345,27 @@ auto final_state_cleanup(
}
return ErrorCode::Success;
}

/**
 * Appends `ch` to `wildcard_str` as a literal, preceding it with the escape character when `ch` is
 * flagged in the wildcard metacharacter lookup table.
 *
 * NOTE(review): the lookup table is indexed with a plain `char`; confirm how `.at()` behaves for
 * values outside the table's range (e.g. non-ASCII bytes).
 */
auto append_char_to_wildcard(char ch, string& wildcard_str) -> void {
    if (false == cWildcardMetaCharsLut.at(ch)) {
        // Not a wildcard metacharacter: append it unescaped.
        wildcard_str += ch;
        return;
    }
    wildcard_str += cEscapeChar;
    wildcard_str += ch;
}

/**
 * Checks whether both characters pass `is_alphabet` and their codes differ by exactly
 * `'a' - 'A'` in either direction, e.g. 'A' vs. 'a'.
 */
auto is_same_char_opposite_case(char ch0, char ch1) -> bool {
    if (false == is_alphabet(ch0) || false == is_alphabet(ch1)) {
        return false;
    }
    constexpr int cCaseOffset{'a' - 'A'};
    int const code_diff{ch0 - ch1};
    return cCaseOffset == code_diff || -cCaseOffset == code_diff;
}
} // namespace

/**
 * Translates a regex string using a config in which both options are disabled.
 *
 * Forwards to the two-argument overload.
 *
 * @param regex_str The regex string to translate.
 * @return Whatever the two-argument overload returns for the same input.
 */
auto regex_to_wildcard(string_view regex_str) -> OUTCOME_V2_NAMESPACE::std_result<string> {
    return regex_to_wildcard(
            regex_str,
            {/*case_insensitive_wildcard=*/false, /*add_prefix_suffix_wildcards=*/false}
    );
}

auto regex_to_wildcard(string_view regex_str, RegexToWildcardTranslatorConfig const& config)
Expand All @@ -238,9 +374,9 @@ auto regex_to_wildcard(string_view regex_str, RegexToWildcardTranslatorConfig co
return string{};
}

TranslatorState state;
string_view::const_iterator it{regex_str.cbegin()};
string wildcard_str;
TranslatorState state;

// If there is no starting anchor character, append multichar wildcard prefix
if (cRegexStartAnchor == *it) {
Expand All @@ -261,6 +397,12 @@ auto regex_to_wildcard(string_view regex_str, RegexToWildcardTranslatorConfig co
case TranslatorState::RegexPatternState::Escaped:
ec = escaped_state_transition(state, it, wildcard_str, config);
break;
case TranslatorState::RegexPatternState::Charset:
ec = charset_state_transition(state, it, wildcard_str, config);
break;
case TranslatorState::RegexPatternState::CharsetEscaped:
ec = charset_escaped_state_transition(state, it, wildcard_str, config);
break;
case TranslatorState::RegexPatternState::End:
ec = end_state_transition(state, it, wildcard_str, config);
break;
Expand Down
43 changes: 42 additions & 1 deletion components/core/tests/test-regex_utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -41,9 +41,50 @@ TEST_CASE("regex_to_wildcard_escaped_metachar", "[regex_utils][re2wc][escaped_me
);
}

// Exercises character set translation through the single-argument `regex_to_wildcard` overload
// (no config is passed).
TEST_CASE("regex_to_wildcard_charset", "[regex_utils][re2wc][charset]") {
// A charset holding one literal or one escaped charset metacharacter reduces to that character.
// A backslash remains escaped in the wildcard output.
REQUIRE((regex_to_wildcard("x[y]z").value() == "xyz"));
REQUIRE((regex_to_wildcard("x[\\^]z").value() == "x^z"));
REQUIRE((regex_to_wildcard("x[\\]]z").value() == "x]z"));
REQUIRE((regex_to_wildcard("x[-]z").value() == "x-z"));
REQUIRE((regex_to_wildcard("x[\\-]z").value() == "x-z"));
REQUIRE((regex_to_wildcard("x[\\\\]z").value() == "x\\\\z"));
REQUIRE((regex_to_wildcard("[a][b][\\^][-][\\-][\\]][\\\\][c][d]").value() == "ab^--]\\\\cd"));

// Closed charsets that cannot be reduced to one character: empty sets, ranges, negations, and
// upper/lower case pairs when no case-insensitive config is given.
REQUIRE((regex_to_wildcard("x[]y").error() == ErrorCode::UnsupportedCharsetPattern));
REQUIRE((regex_to_wildcard("x[a-z]y").error() == ErrorCode::UnsupportedCharsetPattern));
REQUIRE((regex_to_wildcard("x[^^]y").error() == ErrorCode::UnsupportedCharsetPattern));
REQUIRE((regex_to_wildcard("x[^0-9]y").error() == ErrorCode::UnsupportedCharsetPattern));
REQUIRE((regex_to_wildcard("[xX][yY]").error() == ErrorCode::UnsupportedCharsetPattern));
REQUIRE((regex_to_wildcard("ch:[a-zA-Z0-9]").error() == ErrorCode::UnsupportedCharsetPattern));

// Charsets that are never closed before the end of the string, including one that ends on an
// escape backslash.
REQUIRE((regex_to_wildcard("[\\").error() == ErrorCode::IncompleteCharsetStructure));
REQUIRE((regex_to_wildcard("[\\\\").error() == ErrorCode::IncompleteCharsetStructure));
REQUIRE((regex_to_wildcard("[xX").error() == ErrorCode::IncompleteCharsetStructure));
REQUIRE((regex_to_wildcard("ch:[a-zA-Z0-9").error() == ErrorCode::IncompleteCharsetStructure));
}

// Exercises character set translation with the `case_insensitive_wildcard` option enabled.
TEST_CASE("regex_to_wildcard_case_insensitive_config", "[regex_utils][re2wc][case_insensitive]") {
RegexToWildcardTranslatorConfig const config{/*case_insensitive_wildcard=*/true, false};
// Upper/lower case pairs of the same letter reduce to the lowercase letter, in either order, and
// may be mixed with single-character charsets.
REQUIRE((regex_to_wildcard("[xX][yY]", config).value() == "xy"));
REQUIRE((regex_to_wildcard("[Yy][Xx]", config).value() == "yx"));
REQUIRE((regex_to_wildcard("[aA][Bb][Cc]", config).value() == "abc"));
REQUIRE((regex_to_wildcard("[aA][Bb][\\^][-][\\]][Cc][dD]", config).value() == "ab^-]cd"));

// Error cases. The first and third calls below do not pass `config`, so they go through the
// single-argument overload.
REQUIRE((regex_to_wildcard("[xX").error() == ErrorCode::IncompleteCharsetStructure));
REQUIRE(
(regex_to_wildcard("[aA][Bb][^[-[\\[Cc[dD", config).error()
== ErrorCode::IncompleteCharsetStructure)
);
REQUIRE((regex_to_wildcard("ch:[a-zA-Z0-9]").error() == ErrorCode::UnsupportedCharsetPattern));
REQUIRE(
(regex_to_wildcard("[aA][Bb][^[-[\\[Cc[dD]", config).error()
== ErrorCode::UnsupportedCharsetPattern)
);
}

TEST_CASE("regex_to_wildcard_anchor_config", "[regex_utils][re2wc][anchor_config]") {
// Test anchors and prefix/suffix wildcards
RegexToWildcardTranslatorConfig const config{false, true};
RegexToWildcardTranslatorConfig const config{false, /*add_prefix_suffix_wildcards=*/true};
REQUIRE(((regex_to_wildcard("^", config).value() == "*")));
REQUIRE((regex_to_wildcard("$", config).value() == "*"));
REQUIRE((regex_to_wildcard("^xyz$", config).value() == "xyz"));
Expand Down
17 changes: 13 additions & 4 deletions docs/src/dev-guide/components-core/regex-utils.md
Original file line number Diff line number Diff line change
Expand Up @@ -77,14 +77,23 @@ For a detailed description on the options order and usage, see the
* Escape sequences with alphanumeric characters are disallowed.
* E.g. Special utility escape sequences `\Q`, `\E`, `\A` etc. and back references `\1` `\2` etc.
cannot be translated.
* Character set
* Reduces a character set into a single character if possible.
* A trivial character set containing a single character or a single escaped metacharacter.
* E.g. `[a]` into `a`, `[\^]` into `^`
* If the `case_insensitive_wildcard` config is turned on, the translator can also reduce the
case-insensitive style character set patterns into a single lowercase character:
* E.g. `[aA]` into `a`, `[Bb]` into `b`, `[xX][Yy][zZ]` into `xyz`

### Custom configuration

The `RegexToWildcardTranslatorConfig` class objects are currently immutable once instantiated. The
constructor takes the following arguments in order:
The `RegexToWildcardTranslatorConfig` class objects are currently immutable once instantiated. By
default, all of the options are set to `false`.

* `case_insensitive_wildcard`: to be added later along with the character set translation
implementation.
The constructor takes the following option arguments in order:

* `case_insensitive_wildcard`: see **Character set** bullet point in the
[Functionalities](#functionalities) section.

* `add_prefix_suffix_wildcards`: in the absence of regex anchors, add prefix or suffix wildcards so
the query becomes a substring query.
Expand Down
Loading