Skip to content

Commit

Permalink
Add initial implementation of regex_utils library containing a regex …
Browse files Browse the repository at this point in the history
…to wildcard string translator and a corresponding std::error_code enum and category. (y-scope#482)

Co-authored-by: Bingran Hu <[email protected]>
Co-authored-by: davidlion <[email protected]>
Co-authored-by: Lin Zhihao <[email protected]>
Co-authored-by: Kirk Rodrigues <[email protected]>
  • Loading branch information
5 people authored Jul 19, 2024
1 parent 24e4690 commit 44aaff9
Show file tree
Hide file tree
Showing 11 changed files with 611 additions and 2 deletions.
7 changes: 5 additions & 2 deletions components/core/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -209,6 +209,7 @@ set(sqlite_DYNAMIC_LIBS "dl;m;pthread")
include(cmake/Modules/FindLibraryDependencies.cmake)
FindDynamicLibraryDependencies(sqlite "${sqlite_DYNAMIC_LIBS}")

add_subdirectory(src/clp/regex_utils)
add_subdirectory(src/clp/string_utils)

add_subdirectory(src/clp/clg)
Expand Down Expand Up @@ -304,11 +305,11 @@ set(SOURCE_FILES_unitTest
src/clp/ffi/ir_stream/decoding_methods.inc
src/clp/ffi/ir_stream/encoding_methods.cpp
src/clp/ffi/ir_stream/encoding_methods.hpp
src/clp/ffi/ir_stream/protocol_constants.hpp
src/clp/ffi/ir_stream/Serializer.cpp
src/clp/ffi/ir_stream/Serializer.hpp
src/clp/ffi/ir_stream/utils.cpp
src/clp/ffi/ir_stream/utils.hpp
src/clp/ffi/ir_stream/protocol_constants.hpp
src/clp/ffi/SchemaTree.cpp
src/clp/ffi/SchemaTree.hpp
src/clp/ffi/SchemaTreeNode.hpp
Expand Down Expand Up @@ -435,10 +436,10 @@ set(SOURCE_FILES_unitTest
src/clp/StringReader.hpp
src/clp/Thread.cpp
src/clp/Thread.hpp
src/clp/time_types.hpp
src/clp/TimestampPattern.cpp
src/clp/TimestampPattern.hpp
src/clp/TraceableException.hpp
src/clp/time_types.hpp
src/clp/type_utils.hpp
src/clp/utf8_utils.cpp
src/clp/utf8_utils.hpp
Expand Down Expand Up @@ -470,6 +471,7 @@ set(SOURCE_FILES_unitTest
tests/test-NetworkReader.cpp
tests/test-ParserWithUserSchema.cpp
tests/test-query_methods.cpp
tests/test-regex_utils.cpp
tests/test-Segment.cpp
tests/test-SQLiteDB.cpp
tests/test-Stopwatch.cpp
Expand Down Expand Up @@ -497,6 +499,7 @@ target_link_libraries(unitTest
spdlog::spdlog
${sqlite_LIBRARY_DEPENDENCIES}
${STD_FS_LIBS}
clp::regex_utils
clp::string_utils
yaml-cpp::yaml-cpp
ZStd::ZStd
Expand Down
20 changes: 20 additions & 0 deletions components/core/src/clp/regex_utils/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# Headers that belong to the regex_utils library. They are passed to add_library() below together
# with the .cpp sources.
set(
REGEX_UTILS_HEADER_LIST
"constants.hpp"
"ErrorCode.hpp"
"regex_translation_utils.hpp"
"RegexToWildcardTranslatorConfig.hpp"
)
# Library target built from the two translation units plus the header list above.
add_library(
regex_utils
ErrorCode.cpp
regex_translation_utils.cpp
${REGEX_UTILS_HEADER_LIST}
)
# Namespaced alias so that other targets can link against `clp::regex_utils`.
add_library(clp::regex_utils ALIAS regex_utils)
# PRIVATE: these include paths are used when compiling this target only and are not propagated to
# targets that link against it. The parent directory is added so that includes of the form
# "regex_utils/<header>" resolve.
target_include_directories(regex_utils
PRIVATE
../
"${PROJECT_SOURCE_DIR}/submodules"
)
# Compile this target with at least C++20.
target_compile_features(regex_utils PRIVATE cxx_std_20)
82 changes: 82 additions & 0 deletions components/core/src/clp/regex_utils/ErrorCode.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
#include "regex_utils/ErrorCode.hpp"

#include <string>
#include <string_view>
#include <system_error>

namespace clp::regex_utils {
using std::error_code;

namespace {
using std::error_category;
using std::string;
using std::string_view;

/**
 * Error category that gives the regex utility error codes more detailed string descriptions.
 *
 * The class is a leaf type that is local to this translation unit, so it is marked `final`.
 */
class ErrorCodeCategory final : public error_category {
public:
    /**
     * @return The name of this category of errors.
     */
    [[nodiscard]] auto name() const noexcept -> char const* override;

    /**
     * @param ev The error code, encoded as an int.
     * @return The descriptive message for the error.
     */
    [[nodiscard]] auto message(int ev) const -> string override;
};

// Returns the fixed, human-readable name that identifies this error category.
auto ErrorCodeCategory::name() const noexcept -> char const* {
    constexpr char const* cCategoryName{"regex utility"};
    return cCategoryName;
}

// Maps an error value (an `ErrorCode` encoded as an int) to its descriptive message. Values that
// don't correspond to a known `ErrorCode` yield a generic "(unrecognized error)" message.
auto ErrorCodeCategory::message(int ev) const -> string {
    switch (static_cast<ErrorCode>(ev)) {
        case ErrorCode::Success:
            return "Success.";

        case ErrorCode::IllegalState:
            return "Unrecognized state.";

        case ErrorCode::UntranslatableStar:
            return "Unable to express regex quantifier `*` in wildcard, which repeats a token for "
                   "zero or more occurrences, unless it is combined with a wildcard `.`";

        case ErrorCode::UntranslatablePlus:
            return "Unable to express regex quantifier `+` in wildcard, which repeats a token for "
                   "one or more occurrences, unless it is combined with a wildcard `.`";

        case ErrorCode::UnsupportedQuestionMark:
            return "Unable to express regex quantifier `?` in wildcard, which makes the preceding "
                   "token optional, unless the translator supports returning a list of possible "
                   "wildcard translations.";

        case ErrorCode::UnsupportedPipe:
            return "Unable to express regex OR `|` in wildcard, which allows the query string to "
                   "match a single token out of a series of options, unless the translator "
                   "supports returning a list of possible wildcard translations.";

        case ErrorCode::IllegalCaret:
            return "Failed to translate due to start anchor `^` in the middle of the string.";

        case ErrorCode::IllegalDollarSign:
            return "Failed to translate due to end anchor `$` in the middle of the string.";

        case ErrorCode::UnmatchedParenthesis:
            return "Unmatched opening `(` or closing `)`.";

        default:
            return "(unrecognized error)";
    }
}

// The single category instance that every error code created in this file refers to.
ErrorCodeCategory const cErrorCodeCategoryInstance{};
} // namespace

// Builds an std::error_code from the enum's integer value and this file's error category instance.
auto make_error_code(ErrorCode e) -> error_code {
    auto const error_value{static_cast<int>(e)};
    return error_code{error_value, cErrorCodeCategoryInstance};
}
} // namespace clp::regex_utils
39 changes: 39 additions & 0 deletions components/core/src/clp/regex_utils/ErrorCode.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
#ifndef CLP_REGEX_UTILS_ERRORCODE_HPP
#define CLP_REGEX_UTILS_ERRORCODE_HPP

#include <cstdint>
#include <system_error>
#include <type_traits>

namespace clp::regex_utils {
/**
 * Enum class for propagating and handling various regex utility errors.
 * More detailed descriptions can be found in ErrorCode.cpp.
 *
 * NOTE: The underlying type is spelled `std::uint8_t` since `<cstdint>` only guarantees the
 * `std::`-qualified name.
 */
enum class ErrorCode : std::uint8_t {
    // No error.
    Success = 0,
    // Unrecognized state.
    IllegalState,
    // Regex quantifier `*` that can't be expressed in wildcard.
    UntranslatableStar,
    // Regex quantifier `+` that can't be expressed in wildcard.
    UntranslatablePlus,
    // Regex quantifier `?` (optional preceding token) that can't be expressed in wildcard.
    UnsupportedQuestionMark,
    // Regex OR `|` that can't be expressed in wildcard.
    UnsupportedPipe,
    // Start anchor `^` in the middle of the string.
    IllegalCaret,
    // End anchor `$` in the middle of the string.
    IllegalDollarSign,
    // Unmatched opening `(` or closing `)`.
    UnmatchedParenthesis,
};

/**
 * Wrapper function to turn an `ErrorCode` enum value into an std::error_code.
 *
 * @param ec The error code enum value to wrap.
 * @return The corresponding std::error_code.
 */
[[nodiscard]] auto make_error_code(ErrorCode ec) -> std::error_code;
} // namespace clp::regex_utils

namespace std {
template <>
struct is_error_code_enum<clp::regex_utils::ErrorCode> : true_type {};
} // namespace std

#endif // CLP_REGEX_UTILS_ERRORCODE_HPP
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
#ifndef CLP_REGEX_UTILS_REGEXTOWILDCARDTRANSLATORCONFIG_HPP
#define CLP_REGEX_UTILS_REGEXTOWILDCARDTRANSLATORCONFIG_HPP

namespace clp::regex_utils {
/**
 * Allows users to customize and fine-tune how a regex string is translated to a wildcard string.
 *
 * This class won't affect the core logic and state transition mechanics of the regex to wildcard
 * translator, but it can make the translator more versatile. For detailed descriptions of how each
 * option should be used, see the getter function docstrings.
 */
class RegexToWildcardTranslatorConfig {
public:
    /**
     * NOTE: Both parameters are bools, so take care to pass them in the right order.
     *
     * @param case_insensitive_wildcard The value returned by `case_insensitive_wildcard()`.
     * @param add_prefix_suffix_wildcards The value returned by `add_prefix_suffix_wildcards()`.
     */
    RegexToWildcardTranslatorConfig(
            bool case_insensitive_wildcard,
            bool add_prefix_suffix_wildcards
    )
            : m_case_insensitive_wildcard{case_insensitive_wildcard},
              m_add_prefix_suffix_wildcards{add_prefix_suffix_wildcards} {}

    /**
     * @return True if the final translated wildcard string will be fed into a case-insensitive
     * wildcard analyzer. In such cases, we can safely translate charset patterns such as [aA] [Bb]
     * into singular lowercase characters a, b.
     */
    [[nodiscard]] auto case_insensitive_wildcard() const -> bool {
        return m_case_insensitive_wildcard;
    }

    /**
     * @return True if, in the absence of starting or ending anchors in the regex string, we add
     * prefix or suffix zero-or-more-characters wildcards. In other words, this config is true if
     * the search is a substring search, and false if the search is an exact search.
     */
    [[nodiscard]] auto add_prefix_suffix_wildcards() const -> bool {
        return m_add_prefix_suffix_wildcards;
    }

private:
    // Variables
    bool m_case_insensitive_wildcard;
    bool m_add_prefix_suffix_wildcards;
};
} // namespace clp::regex_utils

#endif // CLP_REGEX_UTILS_REGEXTOWILDCARDTRANSLATORCONFIG_HPP
19 changes: 19 additions & 0 deletions components/core/src/clp/regex_utils/constants.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
#ifndef CLP_REGEX_UTILS_CONSTANTS_HPP
#define CLP_REGEX_UTILS_CONSTANTS_HPP

namespace clp::regex_utils {
// NOTE: These constants are defined in a header, so they're declared `inline` to ensure the whole
// program shares a single definition of each rather than one internal-linkage copy per translation
// unit.

// Wildcard meta characters
inline constexpr char cZeroOrMoreCharsWildcard{'*'};
inline constexpr char cSingleCharWildcard{'?'};

// Regex meta characters
inline constexpr char cRegexZeroOrMore{'*'};
inline constexpr char cRegexOneOrMore{'+'};
inline constexpr char cRegexZeroOrOne{'?'};
inline constexpr char cRegexStartAnchor{'^'};
inline constexpr char cRegexEndAnchor{'$'};
inline constexpr char cEscapeChar{'\\'};
inline constexpr char cCharsetNegate{'^'};
}  // namespace clp::regex_utils

#endif // CLP_REGEX_UTILS_CONSTANTS_HPP
Loading

0 comments on commit 44aaff9

Please sign in to comment.