Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add regex to wildcard translation along with other useful regex utilities in a separate library. #471

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 13 additions & 10 deletions components/core/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -209,6 +209,7 @@ set(sqlite_DYNAMIC_LIBS "dl;m;pthread")
include(cmake/Modules/FindLibraryDependencies.cmake)
FindDynamicLibraryDependencies(sqlite "${sqlite_DYNAMIC_LIBS}")

add_subdirectory(src/clp/regex_utils)
add_subdirectory(src/clp/string_utils)

add_subdirectory(src/clp/clg)
Expand Down Expand Up @@ -458,26 +459,27 @@ set(SOURCE_FILES_unitTest
tests/LogSuppressor.hpp
tests/test-BufferedFileReader.cpp
tests/test-EncodedVariableInterpreter.cpp
tests/test-Grep.cpp
tests/test-MemoryMappedFile.cpp
tests/test-NetworkReader.cpp
tests/test-ParserWithUserSchema.cpp
tests/test-SQLiteDB.cpp
tests/test-Segment.cpp
tests/test-Stopwatch.cpp
tests/test-StreamingCompression.cpp
tests/test-TimestampPattern.cpp
tests/test-Utils.cpp
tests/test-encoding_methods.cpp
tests/test-ffi_SchemaTree.cpp
tests/test-Grep.cpp
tests/test-ir_encoding_methods.cpp
tests/test-ir_parsing.cpp
tests/test-kql.cpp
tests/test-main.cpp
tests/test-math_utils.cpp
tests/test-MemoryMappedFile.cpp
tests/test-NetworkReader.cpp
tests/test-ParserWithUserSchema.cpp
tests/test-query_methods.cpp
tests/test-Segment.cpp
tests/test-SQLiteDB.cpp
tests/test-Stopwatch.cpp
tests/test-StreamingCompression.cpp
tests/test-regex_utils.cpp
tests/test-string_utils.cpp
tests/test-TimestampPattern.cpp
tests/test-utf8_utils.cpp
tests/test-Utils.cpp
)
add_executable(unitTest ${SOURCE_FILES_unitTest} ${SOURCE_FILES_clp_s_unitTest})
target_include_directories(unitTest
Expand All @@ -497,6 +499,7 @@ target_link_libraries(unitTest
spdlog::spdlog
${sqlite_LIBRARY_DEPENDENCIES}
${STD_FS_LIBS}
clp::regex_utils
clp::string_utils
yaml-cpp::yaml-cpp
ZStd::ZStd
Expand Down
22 changes: 22 additions & 0 deletions components/core/src/clp/regex_utils/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
# Headers that belong to the regex_utils library. They are passed to `add_library` below together
# with the library's source files.
set(
REGEX_UTILS_HEADER_LIST
"ErrorCode.hpp"
"RegexToWildcardTranslatorConfig.hpp"
"constants.hpp"
"regex_utils.hpp"
)
# Define the `regex_utils` library target from its sources and the headers listed above.
add_library(
regex_utils
regex_utils.cpp
regex_utils_anchors.cpp
ErrorCode.cpp
${REGEX_UTILS_HEADER_LIST}
)
# Namespaced alias so that other targets can link against `clp::regex_utils`.
add_library(clp::regex_utils ALIAS regex_utils)
# The parent directory is a PUBLIC include directory, so both this library and its dependents
# include the headers as "regex_utils/<header>".
# NOTE(review): the PRIVATE submodules directory is presumably needed only by the implementation
# files; confirm which submodule headers are actually used.
target_include_directories(regex_utils
PUBLIC
../
PRIVATE
"${PROJECT_SOURCE_DIR}/submodules"
)
# Compile this library's own sources with at least C++20. The requirement is PRIVATE, so it isn't
# propagated to targets that link against the library.
target_compile_features(regex_utils PRIVATE cxx_std_20)
93 changes: 93 additions & 0 deletions components/core/src/clp/regex_utils/ErrorCode.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
#include "regex_utils/ErrorCode.hpp"

#include <string>
#include <string_view>
#include <system_error>

using std::error_category;
using std::error_code;
using std::string;
using std::string_view;

namespace clp::regex_utils {

/**
 * Error category that gives each regex utility error code a descriptive string message.
 * It's an implementation detail of the std::error_code wrapper and isn't meant to be used outside
 * of this file.
 */
class ErrorCodeCategory : public error_category {
public:
    /**
     * @return The name of this class of errors.
     */
    [[nodiscard]] auto name() const noexcept -> char const* override;

    /**
     * @param ev The error code, encoded as an int.
     * @return The descriptive message for the error.
     */
    [[nodiscard]] auto message(int ev) const -> string override;
};

/**
 * @return The name that identifies the regex utility error category.
 */
auto ErrorCodeCategory::name() const noexcept -> char const* {
    constexpr char const* cCategoryName = "regex utility";
    return cCategoryName;
}

/**
 * Maps an error code to its human-readable description.
 *
 * @param ev The error code, encoded as an int. It's interpreted as an `ErrorCode` value.
 * @return The descriptive message for the error, or "(unrecognized error)" if `ev` doesn't
 * correspond to a known `ErrorCode`.
 */
auto ErrorCodeCategory::message(int ev) const -> string {
    switch (static_cast<ErrorCode>(ev)) {
        case ErrorCode::Success:
            return "Success.";

        case ErrorCode::IllegalState:
            return "Unrecognized state.";

        case ErrorCode::Star:
            return "Failed to translate due to metachar `*` (zero or more occurrences).";

        case ErrorCode::Plus:
            return "Failed to translate due to metachar `+` (one or more occurrences).";

        case ErrorCode::Question:
            return "Currently does not support returning a list of wildcard translations. The "
                   "metachar `?` (lazy match) may be supported in the future.";

        case ErrorCode::Pipe:
            return "Currently does not support returning a list of wildcard translations. The "
                   "regex OR condition feature may be supported in the future.";

        case ErrorCode::Caret:
            return "Failed to translate due to start anchor `^` in the middle of the string.";

        case ErrorCode::Dollar:
            return "Failed to translate due to end anchor `$` in the middle of the string.";

        case ErrorCode::DisallowedEscapeSequence:
            return "Disallowed escape sequence.";

        case ErrorCode::UnmatchedParenthesis:
            return "Unmatched opening `(` or closing `)`.";

        case ErrorCode::UnsupportedCharsets:
            return "Currently only supports case-insensitive single-char charset (i.e. [aA] [bB]).";

        case ErrorCode::IncompleteCharsetStructure:
            return "Unmatched closing `]` at the end of the string.";

        case ErrorCode::UnsupportedQuantifier:
            return "Currently only supports exact positive number of repetitions in regex syntax.";

        case ErrorCode::TokenUnquantifiable:
            return "The preceding token is not quantifiable.";

        // `ev` is an arbitrary int, so it may not correspond to any enumerator.
        default:
            return "(unrecognized error)";
    }
}

// The category instance referenced by every error code created through `make_error_code`.
ErrorCodeCategory const cTheErrorCodeCategory{};

/**
 * Wraps an `ErrorCode` into an std::error_code that belongs to the regex utility category.
 *
 * @param e The error code enum value.
 * @return The corresponding std::error_code.
 */
auto make_error_code(ErrorCode e) -> error_code {
    auto const error_value = static_cast<int>(e);
    return error_code{error_value, cTheErrorCodeCategory};
}

} // namespace clp::regex_utils
46 changes: 46 additions & 0 deletions components/core/src/clp/regex_utils/ErrorCode.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
#ifndef CLP_REGEX_UTILS_ERRORCODE_HPP
#define CLP_REGEX_UTILS_ERRORCODE_HPP

#include <cstdint>
#include <system_error>
#include <type_traits>

namespace clp::regex_utils {

/**
 * Enum class for propagating and handling various regex utility errors.
 * More detailed descriptions can be found in ErrorCode.cpp.
 *
 * NOTE: The underlying type is spelled `std::uint8_t` since `<cstdint>` only guarantees the
 * `std::`-qualified name.
 */
enum class ErrorCode : std::uint8_t {
    // No error
    Success = 0,
    // Unrecognized state
    IllegalState,
    // Translation failed due to the metachar `*`
    Star,
    // Translation failed due to the metachar `+`
    Plus,
    // The metachar `?` isn't supported since it requires a list of wildcard translations
    Question,
    // The regex OR condition isn't supported since it requires a list of wildcard translations
    Pipe,
    // Start anchor `^` in the middle of the string
    Caret,
    // End anchor `$` in the middle of the string
    Dollar,
    // Disallowed escape sequence
    DisallowedEscapeSequence,
    // Unmatched opening `(` or closing `)`
    UnmatchedParenthesis,
    // Charset other than a case-insensitive single-char charset (e.g., [aA])
    UnsupportedCharsets,
    // Charset structure left incomplete at the end of the string
    IncompleteCharsetStructure,
    // Quantifier other than an exact positive number of repetitions
    UnsupportedQuantifier,
    // The token preceding a quantifier isn't quantifiable
    TokenUnquantifiable,
};

/**
* Wrapper function to turn an `ErrorCode` enum value into an std::error_code.
*
* @param ec The error code enum value to wrap.
* @return The corresponding std::error_code.
*/
[[nodiscard]] auto make_error_code(ErrorCode ec) -> std::error_code;

} // namespace clp::regex_utils

namespace std {
// Marks `clp::regex_utils::ErrorCode` as an error code enum, which allows its values to be
// implicitly converted to std::error_code.
template <>
struct is_error_code_enum<clp::regex_utils::ErrorCode> : true_type {};
} // namespace std

#endif // CLP_REGEX_UTILS_ERRORCODE_HPP
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
#ifndef CLP_REGEX_UTILS_REGEXTOWILDCARDTRANSLATORCONFIG_HPP
#define CLP_REGEX_UTILS_REGEXTOWILDCARDTRANSLATORCONFIG_HPP

namespace clp::regex_utils {

/**
 * Holds the boolean options of the regex to wildcard translator. Each option starts with a default
 * value and can be read and changed through its getter and setter.
 */
class RegexToWildcardTranslatorConfig {
public:
    // Constructors
    RegexToWildcardTranslatorConfig() = default;

    // Setters
    auto set_case_insensitive_wildcard(bool case_insensitive_wildcard) -> void {
        m_case_insensitive_wildcard = case_insensitive_wildcard;
    }

    auto set_allow_anchors(bool allow_anchors) -> void { m_allow_anchors = allow_anchors; }

    auto set_add_prefix_suffix_wildcards(bool add_prefix_suffix_wildcards) -> void {
        m_add_prefix_suffix_wildcards = add_prefix_suffix_wildcards;
    }

    // Getters
    [[nodiscard]] auto case_insensitive_wildcard() const -> bool {
        return m_case_insensitive_wildcard;
    }

    [[nodiscard]] auto allow_anchors() const -> bool { return m_allow_anchors; }

    [[nodiscard]] auto add_prefix_suffix_wildcards() const -> bool {
        return m_add_prefix_suffix_wildcards;
    }

private:
    // Variables (anchors are allowed by default; the other options are off by default)
    bool m_case_insensitive_wildcard{false};
    bool m_allow_anchors{true};
    bool m_add_prefix_suffix_wildcards{false};
};

} // namespace clp::regex_utils

#endif // CLP_REGEX_UTILS_REGEXTOWILDCARDTRANSLATORCONFIG_HPP
48 changes: 48 additions & 0 deletions components/core/src/clp/regex_utils/constants.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
#ifndef CLP_REGEX_UTILS_CONSTANTS_HPP
#define CLP_REGEX_UTILS_CONSTANTS_HPP

#include <array>
#include <cstddef>
#include <string_view>

namespace clp::regex_utils {

// Number of entries in a character lookup table: one per ASCII character.
constexpr size_t cCharBitarraySize = 128;

/**
 * Create an ASCII character lookup table (bit array) at compile time.
 *
 * @param char_str A string that contains the characters to look up.
 * @return The lookup table as a bit array, where the entry at a character's code is true iff the
 * character appears in `char_str`.
 * @throw std::out_of_range if `char_str` contains a character whose code is not less than
 * `cCharBitarraySize`. When evaluated at compile time, this results in a compilation error instead.
 */
[[nodiscard]] constexpr auto create_char_bit_array(std::string_view char_str
) -> std::array<bool, cCharBitarraySize> {
    // Value-initialization already sets every entry to false, so no explicit fill is necessary.
    std::array<bool, cCharBitarraySize> bit_array{};
    for (char const ch : char_str) {
        // Go through `unsigned char` so that the index conversion is explicit even when `char` is
        // signed; `at` still rejects any index outside of the table.
        bit_array.at(static_cast<unsigned char>(ch)) = true;
    }
    return bit_array;
}

// Wildcard syntax characters
constexpr char cZeroOrMoreCharsWildcard{'*'};
constexpr char cSingleCharWildcard{'?'};
// Regex quantifier characters
constexpr char cRegexZeroOrMore{'*'};
constexpr char cRegexOneOrMore{'+'};
// NOTE: The zero-or-one quantifier is `?`; `+` is the one-or-more quantifier above.
constexpr char cRegexZeroOrOne{'?'};
// Regex anchor characters
constexpr char cRegexStartAnchor{'^'};
constexpr char cRegexEndAnchor{'$'};
// Other special characters
constexpr char cEscapeChar{'\\'};
constexpr char cCharsetNegate{'^'};

// This is a more complete set of meta characters than necessary, as the user might not be fully
// knowledgeable on which meta characters to escape, and may introduce unnecessary escape sequences.
constexpr auto cRegexEscapeSeqAcceptedMetaChars = create_char_bit_array("^$.*{}[]()+|?<>-_/=!\\");
// This is the set of meta characters that need escaping in the wildcard syntax.
constexpr auto cRegexEscapeSeqWildcardOnlyMetaChars = create_char_bit_array("?*\\");
// This is the set of meta characters that need escaping in the character set.
constexpr auto cRegexCharsetEscapeSeqMetaChars = create_char_bit_array("^-]\\");

} // namespace clp::regex_utils

#endif // CLP_REGEX_UTILS_CONSTANTS_HPP
Loading
Loading