From 35d34c57a9650124f3635ea45859d5b84f0cc103 Mon Sep 17 00:00:00 2001 From: Bingran Hu Date: Tue, 23 Jul 2024 04:09:59 -0400 Subject: [PATCH] Address review concerns on wording --- .../core/src/clp/regex_utils/constants.hpp | 14 +++++++------- .../clp/regex_utils/regex_translation_utils.cpp | 11 ++++++----- components/core/tests/test-regex_utils.cpp | 17 +++++++++++------ .../dev-guide/components-core/regex-utils.md | 10 ++++++---- 4 files changed, 30 insertions(+), 22 deletions(-) diff --git a/components/core/src/clp/regex_utils/constants.hpp b/components/core/src/clp/regex_utils/constants.hpp index aff77126a..83ba3fabd 100644 --- a/components/core/src/clp/regex_utils/constants.hpp +++ b/components/core/src/clp/regex_utils/constants.hpp @@ -9,7 +9,7 @@ namespace clp::regex_utils { constexpr size_t cCharBitarraySize = 128; /** - * Create an ASCII character lookup table at compile time. + * Creates an ASCII character lookup table at compile time. * * @param char_str A string that contains the characters to look up. * @return The lookup table as bit array. @@ -18,7 +18,7 @@ constexpr size_t cCharBitarraySize = 128; ) -> std::array { std::array bit_array{}; bit_array.fill(false); - for (char const ch : char_str) { + for (auto const ch : char_str) { bit_array.at(ch) = true; } return bit_array; @@ -38,11 +38,11 @@ constexpr char cEscapeChar{'\\'}; constexpr char cCharsetNegate{'^'}; // Character bitmaps -// This is a more complete set of meta characters than necessary, as the user might not be fully -// knowledgeable on which meta characters to escape, and may introduce unnecessary escape sequences. -constexpr auto cRegexEscapeSeqMetaChars = create_char_bit_array("*+?|^$.{}[]()<>-_/=!\\"); -// This is the set of meta characters that need to be escaped in the wildcard syntax. -constexpr auto cWildcardMetaChars = create_char_bit_array("?*\\"); +// The set of characters that can be preceded with an escape backslash. When escaped, these +// characters are used as literals. +constexpr auto cRegexEscapeSeqMetaCharsLUT = create_char_bit_array("*+?|^$.{}[]()<>-_/=!\\"); +// The set of metacharacters that need to be escaped in the wildcard syntax. +constexpr auto cWildcardMetaCharsLUT = create_char_bit_array("?*\\"); } // namespace clp::regex_utils #endif // CLP_REGEX_UTILS_CONSTANTS_HPP diff --git a/components/core/src/clp/regex_utils/regex_translation_utils.cpp b/components/core/src/clp/regex_utils/regex_translation_utils.cpp index acccc2bfb..9f153aae0 100644 --- a/components/core/src/clp/regex_utils/regex_translation_utils.cpp +++ b/components/core/src/clp/regex_utils/regex_translation_utils.cpp @@ -94,10 +94,11 @@ using StateTransitionFuncSig [[nodiscard]] StateTransitionFuncSig dot_state_transition; /** - * Appends regex metacharacters literally to the wildcard string. + * Appends an escaped regex metacharacter as a literal character to the wildcard string by + * discarding its preceding backslash. * - * These metacharacters are escaped by backslashes, so they have their special meanings suppressed. - * For metacharacters shared by the regex and the wildcard syntax, keep the escape backslashes. + * The preceding backslash must be kept for characters that also have special meanings in the + * wildcard syntax, e.g. `abc.\*xyz` should be translated into `abc?\*xyz` instead of `abc?*xyz`. */ [[nodiscard]] StateTransitionFuncSig escaped_state_transition; @@ -178,10 +179,10 @@ auto escaped_state_transition( [[maybe_unused]] RegexToWildcardTranslatorConfig const& config ) -> error_code { auto const ch{*it}; - if (!cRegexEscapeSeqMetaChars.at(ch)) { + if (false == cRegexEscapeSeqMetaCharsLUT.at(ch)) { return ErrorCode::IllegalEscapeSequence; } - if (cWildcardMetaChars.at(ch)) { + if (cWildcardMetaCharsLUT.at(ch)) { wildcard_str = wildcard_str + cEscapeChar + ch; } else { wildcard_str += ch; diff --git a/components/core/tests/test-regex_utils.cpp b/components/core/tests/test-regex_utils.cpp index 633eb5c64..51d22473f 100644 --- a/components/core/tests/test-regex_utils.cpp +++ b/components/core/tests/test-regex_utils.cpp @@ -8,37 +8,42 @@ using clp::regex_utils::ErrorCode; using clp::regex_utils::regex_to_wildcard; using clp::regex_utils::RegexToWildcardTranslatorConfig; -TEST_CASE("regex_to_wildcard", "[regex_utils][regex_to_wildcard]") { - // Test empty string +TEST_CASE("regex_to_wildcard_simple_translations", "[regex_utils][re2wc][simple_translations]") { REQUIRE(regex_to_wildcard("").value().empty()); - // Test simple wildcard translations REQUIRE((regex_to_wildcard("xyz").value() == "xyz")); REQUIRE((regex_to_wildcard(". xyz .* zyx .").value() == "? xyz * zyx ?")); REQUIRE((regex_to_wildcard(". xyz .+ zyx .*").value() == "? xyz ?* zyx *")); +} - // Test unescaped meta characters +TEST_CASE("regex_to_wildcard_unescaped_metachar", "[regex_utils][re2wc][unescaped_metachar]") { REQUIRE((regex_to_wildcard(".? xyz .* zyx .").error() == ErrorCode::UnsupportedQuestionMark)); REQUIRE((regex_to_wildcard(". xyz .** zyx .").error() == ErrorCode::UntranslatableStar)); REQUIRE((regex_to_wildcard(". xyz .*+ zyx .").error() == ErrorCode::UntranslatablePlus)); REQUIRE((regex_to_wildcard(". xyz |.* zyx .").error() == ErrorCode::UnsupportedPipe)); REQUIRE((regex_to_wildcard(". xyz ^.* zyx .").error() == ErrorCode::IllegalCaret)); + REQUIRE((regex_to_wildcard(". xyz $.* zyx .").error() == ErrorCode::IllegalDollarSign)); +} + - // Test escaped meta characters +TEST_CASE("regex_to_wildcard_escaped_metachar", "[regex_utils][re2wc][escaped_metachar]") { + // Escape backslash is superfluous for the following set of characters REQUIRE((regex_to_wildcard("<>-_/=!").value() == "<>-_/=!")); REQUIRE((regex_to_wildcard("\\<\\>\\-\\_\\/\\=\\!").value() == "<>-_/=!")); + // Test the full escape sequences set REQUIRE( (regex_to_wildcard("\\*\\+\\?\\|\\^\\$\\.\\{\\}\\[\\]\\(\\)\\<\\>\\-\\_\\/\\=\\!\\\\") .value() == "\\*+\\?|^$.{}[]()<>-_/=!\\\\") ); + // Test unsupported escape sequences REQUIRE( (regex_to_wildcard("abc\\Qdefghi\\Ejkl").error() == clp::regex_utils::ErrorCode::IllegalEscapeSequence) ); } -TEST_CASE("regex_to_wildcard_anchor_config", "[regex_utils][regex_to_wildcard][anchor_config]") { +TEST_CASE("regex_to_wildcard_anchor_config", "[regex_utils][re2wc][anchor_config]") { // Test anchors and prefix/suffix wildcards RegexToWildcardTranslatorConfig const config{false, true}; REQUIRE(((regex_to_wildcard("^", config).value() == "*"))); diff --git a/docs/src/dev-guide/components-core/regex-utils.md b/docs/src/dev-guide/components-core/regex-utils.md index c154a3c8c..f7af037df 100644 --- a/docs/src/dev-guide/components-core/regex-utils.md +++ b/docs/src/dev-guide/components-core/regex-utils.md @@ -63,11 +63,13 @@ For a detailed description on the options order and usage, see the * Turn `.+` into `?*` * E.g. `abc.*def.ghi.+` will get translated to `abc*def?ghi?*` * Metacharacter escape sequences - * Append the escaped metacharacters literally to the wildcard output string. - * The list of characters that require escaping is `[\/^$.|?*+(){}`. - * The list of characters that are optional to escape is `],<>-_=!`. Escaping them will not have - any effect (the escape backslash is ignored). This list may be expanded in the future. + * An escaped regex metacharacter is treated as a literal and appended to the wildcard output. + * The list of characters that require escaping to have their special meanings suppressed is + `[\/^$.|?*+(){}`. + * Superfluous escape characters are ignored for the following characters: `],<>-_=!`. * E.g. `a\[\+b\-\_c-_d` will get translated to `a[+b-_c-_d` + * Note: generally, any non-alphanumeric character can be escaped to use it as a literal. The + list this utils library supports is non-exhaustive and can be expanded when necessary. * For metacharacters shared by both syntaxes, keep the escape backslashes. * The list of characters that fall into this category is `*?\`. All wildcard metacharacters are also regex metacharacters.