forked from y-scope/clp
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add regex utils including regex to wildcard translation
- Loading branch information
1 parent
9ba0451
commit 58657e0
Showing
10 changed files
with
1,278 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
# Headers that belong to the regex_utils library; they are added to the target's source list below.
set(
        REGEX_UTILS_HEADER_LIST
        "ErrorCode.hpp"
        "RegexToWildcardTranslatorConfig.hpp"
        "constants.hpp"
        "regex_utils.hpp"
)
add_library(
        regex_utils
        regex_utils_regex_to_wildcard.cpp
        regex_utils_anchors.cpp
        ErrorCode.cpp
        ${REGEX_UTILS_HEADER_LIST}
)
add_library(clp::regex_utils ALIAS regex_utils)
# The public header `regex_utils.hpp` includes boost-outcome headers using paths relative to the
# submodules directory, so that directory has to be part of the target's PUBLIC include paths;
# otherwise targets that link against `clp::regex_utils` cannot compile the header.
target_include_directories(regex_utils
        PUBLIC
        ../
        "${PROJECT_SOURCE_DIR}/submodules"
)
target_compile_features(regex_utils PRIVATE cxx_std_20)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,93 @@ | ||
#include "regex_utils/ErrorCode.hpp" | ||
|
||
#include <string> | ||
#include <string_view> | ||
#include <system_error> | ||
|
||
using std::error_category; | ||
using std::error_code; | ||
using std::string; | ||
using std::string_view; | ||
|
||
namespace clp::regex_utils { | ||
|
||
/**
 * Error category that supplies a descriptive string for every regex utility error code.
 * It is an implementation detail of the std::error_code wrapper and does not need to be visible
 * outside of it.
 */
class ErrorCodeCategory : public std::error_category {
public:
    /**
     * @return The name of this class of errors.
     */
    [[nodiscard]] auto name() const noexcept -> char const* override;

    /**
     * @param ev The error code, encoded as an int.
     * @return The descriptive message for the error.
     */
    [[nodiscard]] auto message(int ev) const -> std::string override;
};
|
||
/**
 * @return The fixed name identifying this error category.
 */
auto ErrorCodeCategory::name() const noexcept -> char const* {
    constexpr char const* cCategoryName{"regex utility"};
    return cCategoryName;
}
|
||
/**
 * Maps an error code value to its human-readable description.
 *
 * @param ev The error code, encoded as an int.
 * @return The descriptive message for the error, or "(unrecognized error)" if `ev` does not
 * correspond to a known `ErrorCode`.
 */
auto ErrorCodeCategory::message(int ev) const -> string {
    switch (static_cast<ErrorCode>(ev)) {
        case ErrorCode::Success:
            return "Success.";

        case ErrorCode::IllegalState:
            return "Unrecognized state.";

        case ErrorCode::Star:
            return "Failed to translate due to metachar `*` (zero or more occurrences).";

        case ErrorCode::Plus:
            return "Failed to translate due to metachar `+` (one or more occurrences).";

        case ErrorCode::Question:
            return "Currently does not support returning a list of wildcard translations. The "
                   "metachar `?` (lazy match) may be supported in the future.";

        case ErrorCode::Pipe:
            return "Currently does not support returning a list of wildcard translations. The "
                   "regex OR condition feature may be supported in the future.";

        case ErrorCode::Caret:
            return "Failed to translate due to start anchor `^` in the middle of the string.";

        case ErrorCode::Dollar:
            return "Failed to translate due to end anchor `$` in the middle of the string.";

        case ErrorCode::DisallowedEscapeSequence:
            return "Disallowed escape sequence.";

        case ErrorCode::UnmatchedParenthesis:
            return "Unmatched opening `(` or closing `)`.";

        case ErrorCode::UnsupportedCharsets:
            return "Currently only supports case-insensitive single-char charset (i.e. [aA] [bB]).";

        case ErrorCode::IncompleteCharsetStructure:
            return "Unmatched closing `]` at the end of the string.";

        case ErrorCode::UnsupportedQuantifier:
            return "Currently only supports exact positive number of repetitions in regex syntax.";

        case ErrorCode::TokenUnquantifiable:
            return "The preceding token is not quantifiable.";

        default:
            // Values that do not map to any enumerator end up here.
            return "(unrecognized error)";
    }
}
|
||
ErrorCodeCategory const cTheErrorCodeCategory{}; | ||
|
||
auto make_error_code(ErrorCode e) -> error_code { | ||
return {static_cast<int>(e), cTheErrorCodeCategory}; | ||
} | ||
|
||
} // namespace clp::regex_utils |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,46 @@ | ||
#ifndef CLP_REGEX_UTILS_ERRORCODE_HPP | ||
#define CLP_REGEX_UTILS_ERRORCODE_HPP | ||
|
||
#include <cstdint> | ||
#include <system_error> | ||
#include <type_traits> | ||
|
||
namespace clp::regex_utils { | ||
|
||
/**
 * Error codes used to propagate and handle regex utility failures.
 * The full description of each code lives in ErrorCode.cpp.
 */
enum class ErrorCode : uint8_t {
    // No error
    Success = 0,
    // Unrecognized state
    IllegalState,
    // Untranslatable metachars and anchors
    Star,
    Plus,
    Question,
    Pipe,
    Caret,
    Dollar,
    // Escape sequence and grouping errors
    DisallowedEscapeSequence,
    UnmatchedParenthesis,
    // Charset errors
    UnsupportedCharsets,
    IncompleteCharsetStructure,
    // Quantifier errors
    UnsupportedQuantifier,
    TokenUnquantifiable,
};
|
||
/** | ||
* Wrapper function to turn a regular enum class into an std::error_code. | ||
* | ||
* @param ec An error code enum. | ||
* @return The corresponding std::error_code type variable. | ||
*/ | ||
[[nodiscard]] auto make_error_code(ErrorCode ec) -> std::error_code; | ||
|
||
} // namespace clp::regex_utils | ||
|
||
namespace std { | ||
template <> | ||
struct is_error_code_enum<clp::regex_utils::ErrorCode> : true_type {}; | ||
} // namespace std | ||
|
||
#endif // CLP_REGEX_UTILS_ERRORCODE_HPP |
42 changes: 42 additions & 0 deletions
42
components/core/src/clp/regex_utils/RegexToWildcardTranslatorConfig.hpp
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,42 @@ | ||
#ifndef CLP_REGEX_UTILS_REGEXTOWILDCARDTRANSLATORCONFIG_HPP | ||
#define CLP_REGEX_UTILS_REGEXTOWILDCARDTRANSLATORCONFIG_HPP | ||
|
||
namespace clp::regex_utils { | ||
|
||
/**
 * Holds the boolean options consumed by the regex to wildcard translator. Every option has a
 * default value and can be read and changed through the accessors below.
 */
class RegexToWildcardTranslatorConfig {
public:
    // Constructors
    RegexToWildcardTranslatorConfig() = default;

    // Getters
    /**
     * @return The current value of the case-insensitive wildcard option.
     */
    [[nodiscard]] auto case_insensitive_wildcard() const -> bool {
        return m_case_insensitive_wildcard;
    }

    /**
     * @return The current value of the allow-anchors option.
     */
    [[nodiscard]] auto allow_anchors() const -> bool {
        return m_allow_anchors;
    }

    /**
     * @return The current value of the add-prefix-suffix-wildcards option.
     */
    [[nodiscard]] auto add_prefix_suffix_wildcards() const -> bool {
        return m_add_prefix_suffix_wildcards;
    }

    // Setters
    auto set_case_insensitive_wildcard(bool case_insensitive_wildcard) -> void {
        m_case_insensitive_wildcard = case_insensitive_wildcard;
    }

    auto set_allow_anchors(bool allow_anchors) -> void {
        m_allow_anchors = allow_anchors;
    }

    auto set_add_prefix_suffix_wildcards(bool add_prefix_suffix_wildcards) -> void {
        m_add_prefix_suffix_wildcards = add_prefix_suffix_wildcards;
    }

private:
    // Variables (with their default values)
    bool m_case_insensitive_wildcard{false};
    bool m_allow_anchors{true};
    bool m_add_prefix_suffix_wildcards{false};
};
|
||
} // namespace clp::regex_utils | ||
|
||
#endif // CLP_REGEX_UTILS_REGEXTOWILDCARDTRANSLATORCONFIG_HPP |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,48 @@ | ||
#ifndef CLP_REGEX_UTILS_CONSTANTS_HPP | ||
#define CLP_REGEX_UTILS_CONSTANTS_HPP | ||
|
||
#include <array> | ||
#include <cstddef> | ||
#include <string_view> | ||
|
||
namespace clp::regex_utils { | ||
|
||
// Number of entries in a character lookup table: one per 7-bit ASCII code point.
constexpr size_t cCharBitarraySize = 128;

/**
 * Create an ASCII character lookup table (bit array) at compile time.
 *
 * @param char_str A string that contains the characters to look up.
 * @return The lookup table as a bit array, where the entry indexed by a character's code is true
 * iff the character appears in `char_str`.
 * @throw std::out_of_range if `char_str` contains a character whose code does not fit in the
 * table. During constant evaluation this surfaces as a compilation error instead.
 */
[[nodiscard]] constexpr auto create_char_bit_array(std::string_view char_str
) -> std::array<bool, cCharBitarraySize> {
    // Value-initialization already sets every entry to false, so no explicit fill is needed.
    std::array<bool, cCharBitarraySize> bit_array{};
    for (char const ch : char_str) {
        // Convert through `unsigned char` so that the index conversion is explicit and doesn't
        // depend on the signedness of `char`; out-of-range codes are still rejected by `at`.
        bit_array.at(static_cast<unsigned char>(ch)) = true;
    }
    return bit_array;
}
|
||
// Wildcard syntax characters
constexpr char cZeroOrMoreCharsWildcard{'*'};
constexpr char cSingleCharWildcard{'?'};
// Regex quantifier characters
constexpr char cRegexZeroOrMore{'*'};
constexpr char cRegexOneOrMore{'+'};
// The zero-or-one quantifier is `?`; `+` is the one-or-more quantifier defined above.
constexpr char cRegexZeroOrOne{'?'};
// Regex anchor characters
constexpr char cRegexStartAnchor{'^'};
constexpr char cRegexEndAnchor{'$'};
// Escape and charset characters
constexpr char cEscapeChar{'\\'};
constexpr char cCharsetNegate{'^'};
|
||
// Meta characters accepted after an escape char in a regex. The set is deliberately larger than
// strictly necessary, since users may not know exactly which meta characters require escaping and
// may therefore write unnecessary escape sequences.
constexpr auto cRegexEscapeSeqAcceptedMetaChars{create_char_bit_array("^$.*{}[]()+|?<>-_/=!\\")};
// Meta characters that must be escaped in the wildcard syntax.
constexpr auto cRegexEscapeSeqWildcardOnlyMetaChars{create_char_bit_array("?*\\")};
// Meta characters that must be escaped inside a character set.
constexpr auto cRegexCharsetEscapeSeqMetaChars{create_char_bit_array("^-]\\")};
|
||
} // namespace clp::regex_utils | ||
|
||
#endif // CLP_REGEX_UTILS_CONSTANTS_HPP |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,49 @@ | ||
#ifndef CLP_REGEX_UTILS_REGEX_UTILS_HPP | ||
#define CLP_REGEX_UTILS_REGEX_UTILS_HPP | ||
|
||
#include <string> | ||
#include <string_view> | ||
|
||
#include <boost-outcome/include/boost/outcome/config.hpp> | ||
#include <boost-outcome/include/boost/outcome/std_result.hpp> | ||
|
||
#include "regex_utils/RegexToWildcardTranslatorConfig.hpp" | ||
|
||
namespace clp::regex_utils { | ||
|
||
[[nodiscard]] auto regex_to_wildcard(std::string_view regex_str | ||
) -> BOOST_OUTCOME_V2_NAMESPACE::std_result<std::string>; | ||
|
||
[[nodiscard]] auto regex_to_wildcard( | ||
std::string_view regex_str, | ||
RegexToWildcardTranslatorConfig const& config | ||
) -> BOOST_OUTCOME_V2_NAMESPACE::std_result<std::string>; | ||
|
||
/** | ||
* If a regex expression contains multiple starting or ending anchors, remove the duplicates. | ||
* | ||
* @param regex_str | ||
* @return The trimmed regex string, with at most one starting anchor and at most one ending anchor. | ||
*/ | ||
[[nodiscard]] auto regex_trim_line_anchors(std::string_view regex_str) -> std::string; | ||
|
||
/** | ||
* Check if a regex string has a starting anchor character `^` (caret). | ||
* | ||
* @param regex_str | ||
* @return True if the regex string begins with `^`, false otherwise. | ||
*/ | ||
[[nodiscard]] auto regex_has_start_anchor(std::string_view regex_str) -> bool; | ||
|
||
/** | ||
* Check if a regex string has an ending anchor character `$` (dollar sign). | ||
* Note that the regex string may end with an escaped `$`, in which case the `$` character retains | ||
* its literal meaning. | ||
* | ||
* @param regex_str | ||
* @return True if the regex string ends with an unescaped `$`, false otherwise. | ||
*/ | ||
[[nodiscard]] auto regex_has_end_anchor(std::string_view regex_str) -> bool; | ||
} // namespace clp::regex_utils | ||
|
||
#endif // CLP_REGEX_UTILS_REGEX_UTILS_HPP |
64 changes: 64 additions & 0 deletions
64
components/core/src/clp/regex_utils/regex_utils_anchors.cpp
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,64 @@ | ||
#include <string> | ||
#include <string_view> | ||
|
||
#include "regex_utils/constants.hpp" | ||
#include "regex_utils/regex_utils.hpp" | ||
|
||
using std::string; | ||
using std::string_view; | ||
|
||
namespace clp::regex_utils { | ||
|
||
/**
 * Collapses a run of leading `^` characters into a single start anchor and a run of trailing `$`
 * characters into a single end anchor.
 *
 * @param regex_str
 * @return A copy of the regex string with the duplicate anchors removed.
 */
auto regex_trim_line_anchors(string_view regex_str) -> string {
    auto const str_len{regex_str.size()};

    // Locate the first character that isn't a caret. If there was at least one caret, step back
    // by one so that a single start anchor is kept.
    auto begin_pos{regex_str.find_first_not_of(cRegexStartAnchor)};
    if (string_view::npos == begin_pos) {
        begin_pos = str_len;
    }
    if (0 != begin_pos) {
        --begin_pos;
    }

    // Walk backwards over the trailing dollar signs, without crossing the start position.
    auto end_pos{str_len};
    while (begin_pos != end_pos && cRegexEndAnchor == regex_str[end_pos - 1]) {
        --end_pos;
    }
    // If any dollar sign was skipped and the range isn't empty, keep exactly one of them.
    if (begin_pos != end_pos && str_len != end_pos) {
        ++end_pos;
    }

    string trimmed_regex_str(regex_str.substr(begin_pos, end_pos - begin_pos));
    // When dollar signs were dropped, the result may not end with a real end anchor: either the
    // dollar sign that was kept is escaped, or none was kept at all. In both cases a real end
    // anchor has to be appended.
    if (str_len != end_pos && false == regex_has_end_anchor(trimmed_regex_str)) {
        trimmed_regex_str += cRegexEndAnchor;
    }
    return trimmed_regex_str;
}
|
||
/**
 * @param regex_str
 * @return Whether the first character of the regex string is the start anchor `^`.
 */
auto regex_has_start_anchor(string_view regex_str) -> bool {
    if (regex_str.empty()) {
        return false;
    }
    return cRegexStartAnchor == regex_str.front();
}
|
||
/**
 * @param regex_str
 * @return Whether the regex string ends with a `$` that is not escaped.
 */
auto regex_has_end_anchor(string_view regex_str) -> bool {
    if (regex_str.empty() || cEndAnchorMismatch(regex_str)) {
        return false;
    }

    // Drop the trailing dollar sign, then consume the run of escape chars that directly precedes
    // it. Escape chars can escape themselves, so the dollar sign is a real anchor only if that run
    // has an even length.
    regex_str.remove_suffix(1);
    bool is_anchor{true};
    while (false == regex_str.empty() && cEscapeChar == regex_str.back()) {
        is_anchor = !is_anchor;
        regex_str.remove_suffix(1);
    }
    return is_anchor;
}
|
||
} // namespace clp::regex_utils |
Oops, something went wrong.