From c91f625fbb65a91de7caa8422477255a6547dd97 Mon Sep 17 00:00:00 2001 From: Kirk Rodrigues <2454684+kirkrodrigues@users.noreply.github.com> Date: Sat, 16 Sep 2023 14:53:33 -0400 Subject: [PATCH 1/8] Move parsing methods from ffi to ir namespace. --- components/core/CMakeLists.txt | 5 + components/core/src/ffi/encoding_methods.cpp | 107 ---------------- components/core/src/ffi/encoding_methods.hpp | 63 +-------- components/core/src/ffi/encoding_methods.inc | 12 +- .../src/ffi/ir_stream/decoding_methods.inc | 3 +- .../src/ffi/ir_stream/encoding_methods.cpp | 5 +- .../src/ffi/search/CompositeWildcardToken.cpp | 4 +- .../core/src/ffi/search/query_methods.cpp | 5 +- components/core/src/ir/parsing.cpp | 117 +++++++++++++++++ components/core/src/ir/parsing.hpp | 77 +++++++++++ .../core/tests/test-encoding_methods.cpp | 121 +----------------- .../core/tests/test-ir_encoding_methods.cpp | 6 +- components/core/tests/test-ir_parsing.cpp | 120 +++++++++++++++++ components/core/tests/test-query_methods.cpp | 3 +- 14 files changed, 348 insertions(+), 300 deletions(-) create mode 100644 components/core/src/ir/parsing.cpp create mode 100644 components/core/src/ir/parsing.hpp create mode 100644 components/core/tests/test-ir_parsing.cpp diff --git a/components/core/CMakeLists.txt b/components/core/CMakeLists.txt index 1fe764b0c..26e7bbb79 100644 --- a/components/core/CMakeLists.txt +++ b/components/core/CMakeLists.txt @@ -251,6 +251,8 @@ set(SOURCE_FILES_clp src/ir/LogEvent.hpp src/ir/LogEventDeserializer.cpp src/ir/LogEventDeserializer.hpp + src/ir/parsing.cpp + src/ir/parsing.hpp src/ir/utils.cpp src/ir/utils.hpp src/LibarchiveFileReader.cpp @@ -789,6 +791,8 @@ set(SOURCE_FILES_unitTest src/ir/LogEvent.hpp src/ir/LogEventDeserializer.cpp src/ir/LogEventDeserializer.hpp + src/ir/parsing.cpp + src/ir/parsing.hpp src/ir/utils.cpp src/ir/utils.hpp src/LibarchiveFileReader.cpp @@ -890,6 +894,7 @@ set(SOURCE_FILES_unitTest tests/test-encoding_methods.cpp tests/test-Grep.cpp tests/test-ir_encoding_methods.cpp + tests/test-ir_parsing.cpp tests/test-main.cpp tests/test-math_utils.cpp tests/test-ParserWithUserSchema.cpp diff --git a/components/core/src/ffi/encoding_methods.cpp b/components/core/src/ffi/encoding_methods.cpp index 9a7b17d36..43eefb3e6 100644 --- a/components/core/src/ffi/encoding_methods.cpp +++ b/components/core/src/ffi/encoding_methods.cpp @@ -6,113 +6,6 @@ using std::string_view; namespace ffi { -/* - * For performance, we rely on the ASCII ordering of characters to compare - * ranges of characters at a time instead of comparing individual characters - */ -bool is_delim(signed char c) { - return !( - '+' == c || ('-' <= c && c <= '.') || ('0' <= c && c <= '9') || ('A' <= c && c <= 'Z') - || '\\' == c || '_' == c || ('a' <= c && c <= 'z') - ); -} - -bool is_variable_placeholder(char c) { - return (enum_to_underlying_type(VariablePlaceholder::Integer) == c) - || (enum_to_underlying_type(VariablePlaceholder::Dictionary) == c) - || (enum_to_underlying_type(VariablePlaceholder::Float) == c); -} - -bool could_be_multi_digit_hex_value(const string_view str) { - if (str.length() < 2) { - return false; - } - - return std::all_of(str.cbegin(), str.cend(), [](char c) { - return ('a' <= c && c <= 'f') || ('A' <= c && c <= 'F') || ('0' <= c && c <= '9'); - }); -} - -bool is_var(std::string_view value) { - size_t begin_pos = 0; - size_t end_pos = 0; - bool contains_var_placeholder; - if (get_bounds_of_next_var(value, begin_pos, end_pos, contains_var_placeholder)) { - // Ensure the entire value is a variable - return (0 == begin_pos && value.length() == end_pos); - } else { - return false; - } -} - -bool get_bounds_of_next_var( - const string_view str, - size_t& begin_pos, - size_t& end_pos, - bool& contains_var_placeholder -) { - auto const msg_length = str.length(); - if (end_pos >= msg_length) { - return false; - } - - while (true) { - begin_pos = end_pos; - - // Find next non-delimiter - bool char_before_var_was_equals_sign = false; - char previous_char = 0; - for (; begin_pos < msg_length; ++begin_pos) { - auto c = str[begin_pos]; - if (false == is_delim(c)) { - char_before_var_was_equals_sign = ('=' == previous_char); - break; - } - previous_char = c; - - if (enum_to_underlying_type(VariablePlaceholder::Integer) == c - || enum_to_underlying_type(VariablePlaceholder::Dictionary) == c - || enum_to_underlying_type(VariablePlaceholder::Float) == c) - { - contains_var_placeholder = true; - } - } - if (msg_length == begin_pos) { - // Early exit for performance - return false; - } - - bool contains_decimal_digit = false; - bool contains_alphabet = false; - - // Find next delimiter - end_pos = begin_pos; - for (; end_pos < msg_length; ++end_pos) { - auto c = str[end_pos]; - if (is_decimal_digit(c)) { - contains_decimal_digit = true; - } else if (is_alphabet(c)) { - contains_alphabet = true; - } else if (is_delim(c)) { - break; - } - } - - string_view variable(str.cbegin() + begin_pos, end_pos - begin_pos); - // Treat token as variable if: - // - it contains a decimal digit, or - // - it's directly preceded by '=' and contains an alphabet char, or - // - it could be a multi-digit hex value - if (contains_decimal_digit || (char_before_var_was_equals_sign && contains_alphabet) - || could_be_multi_digit_hex_value(variable)) - { - break; - } - } - - return (msg_length != begin_pos); -} - eight_byte_encoded_variable_t encode_four_byte_float_as_eight_byte( four_byte_encoded_variable_t four_byte_encoded_var ) { diff --git a/components/core/src/ffi/encoding_methods.hpp b/components/core/src/ffi/encoding_methods.hpp index 8463368b0..fa26ef675 100644 --- a/components/core/src/ffi/encoding_methods.hpp +++ b/components/core/src/ffi/encoding_methods.hpp @@ -4,6 +4,7 @@ #include #include +#include "../ir/parsing.hpp" #include "../TraceableException.hpp" // TODO Some of the methods in this file are mostly duplicated from code that @@ -15,12 +16,6 @@ using epoch_time_ms_t = int64_t; using eight_byte_encoded_variable_t = int64_t; using four_byte_encoded_variable_t = int32_t; -enum class VariablePlaceholder : char { - Integer = 0x11, - Dictionary = 0x12, - Float = 0x13, -}; - class EncodingException : public TraceableException { public: // Constructors @@ -55,8 +50,6 @@ static constexpr char cVariableEncodingMethodsVersion[] = "com.yscope.clp.VariableEncodingMethodsV1"; static constexpr char cVariablesSchemaVersion[] = "com.yscope.clp.VariablesSchemaV2"; -constexpr char cVariablePlaceholderEscapeCharacter = '\\'; - static constexpr char cTooFewDictionaryVarsErrorMessage[] = "There are fewer dictionary variables than dictionary variable delimiters in the " "logtype."; @@ -70,58 +63,6 @@ constexpr size_t cMaxDigitsInRepresentableFourByteFloatVar = 8; constexpr uint64_t cEightByteEncodedFloatDigitsBitMask = (1ULL << 54) - 1; constexpr uint32_t cFourByteEncodedFloatDigitsBitMask = (1UL << 25) - 1; -/** - * Checks if the given character is a delimiter - * We treat everything *except* the following quoted characters as a - * delimiter: "+-.0-9A-Z\_a-z" - * @param c - * @return Whether c is a delimiter - */ -bool is_delim(signed char c); - -/** - * @param c - * @return Whether the character is a variable placeholder - */ -bool is_variable_placeholder(char c); - -/** - * @param str - * @return Whether the given string could be a multi-digit hex value - */ -bool could_be_multi_digit_hex_value(std::string_view str); - -/** - * @param value - * @return Whether the given value is a variable according to the schemas - * specified in ffi::get_bounds_of_next_var - */ -bool is_var(std::string_view value); - -/** - * Gets the bounds of the next variable in the given string - * A variable is a token (word between two delimiters) that matches one of - * these schemas: - * - ".*[0-9].*" - * - "=(.*[a-zA-Z].*)" (the variable is within the capturing group) - * - "[a-fA-F0-9]{2,}" - * @param str String to search within - * @param begin_pos Begin position of last variable, changes to begin - * position of next variable - * @param end_pos End position of last variable, changes to end position of - * next variable - * @param contains_var_placeholder Whether the string already contains a - * variable placeholder (for efficiency, this is only set to true, so the - * caller must reset it to false if necessary) - * @return true if a variable was found, false otherwise - */ -bool get_bounds_of_next_var( - std::string_view str, - size_t& begin_pos, - size_t& end_pos, - bool& contains_var_placeholder -); - /** * Encodes the given string into a representable float variable if possible * @tparam encoded_variable_t Type of the encoded variable @@ -321,7 +262,7 @@ std::string decode_message( * @param encoded_vars_length * @return true if a match was found, false otherwise */ -template +template bool wildcard_query_matches_any_encoded_var( std::string_view wildcard_query, std::string_view logtype, diff --git a/components/core/src/ffi/encoding_methods.inc b/components/core/src/ffi/encoding_methods.inc index 13868585b..38c5a00ef 100644 --- a/components/core/src/ffi/encoding_methods.inc +++ b/components/core/src/ffi/encoding_methods.inc @@ -3,9 +3,12 @@ #include +#include "../ir/parsing.hpp" #include "../string_utils.hpp" #include "../type_utils.hpp" +using ir::VariablePlaceholder; + namespace ffi { template bool encode_float_string(std::string_view str, encoded_variable_t& encoded_var) { @@ -364,7 +367,7 @@ bool encode_message_generically( size_t constant_begin_pos = 0; logtype.clear(); logtype.reserve(message.length()); - while (get_bounds_of_next_var( + while (ir::get_bounds_of_next_var( message, var_begin_pos, var_end_pos, @@ -428,8 +431,11 @@ bool encode_message( auto final_constant_handler = [&constant_handler](std::string_view constant, std::string& logtype) { // Ensure final constant doesn't contain a variable placeholder - bool contains_variable_placeholder - = std::any_of(constant.cbegin(), constant.cend(), is_variable_placeholder); + bool contains_variable_placeholder = std::any_of( + constant.cbegin(), + constant.cend(), + ir::is_variable_placeholder + ); return constant_handler(constant, contains_variable_placeholder, logtype); }; auto encoded_variable_handler = [&encoded_vars](encoded_variable_t encoded_variable) { diff --git a/components/core/src/ffi/ir_stream/decoding_methods.inc b/components/core/src/ffi/ir_stream/decoding_methods.inc index c02a933dc..139688388 100644 --- a/components/core/src/ffi/ir_stream/decoding_methods.inc +++ b/components/core/src/ffi/ir_stream/decoding_methods.inc @@ -4,6 +4,7 @@ #include #include +#include "../../ir/parsing.hpp" #include "../encoding_methods.hpp" #include "decoding_methods.hpp" #include "protocol_constants.hpp" @@ -97,7 +98,7 @@ void generic_decode_message( break; } - case cVariablePlaceholderEscapeCharacter: { + case ir::cVariablePlaceholderEscapeCharacter: { // Ensure the escape character is followed by a // character that's being escaped if (cur_pos == logtype_length - 1) { diff --git a/components/core/src/ffi/ir_stream/encoding_methods.cpp b/components/core/src/ffi/ir_stream/encoding_methods.cpp index 10e9210e9..4e3ea2b2c 100644 --- a/components/core/src/ffi/ir_stream/encoding_methods.cpp +++ b/components/core/src/ffi/ir_stream/encoding_methods.cpp @@ -2,6 +2,7 @@ #include +#include "../../ir/parsing.hpp" #include "byteswap.hpp" #include "protocol_constants.hpp" @@ -182,9 +183,9 @@ static bool append_constant_to_logtype(string_view constant, bool, string& logty auto constant_len = constant.length(); for (size_t i = 0; i < constant_len; ++i) { auto c = constant[i]; - if (cVariablePlaceholderEscapeCharacter == c || is_variable_placeholder(c)) { + if (ir::cVariablePlaceholderEscapeCharacter == c || ir::is_variable_placeholder(c)) { logtype.append(constant, begin_pos, i - begin_pos); - logtype += cVariablePlaceholderEscapeCharacter; + logtype += ir::cVariablePlaceholderEscapeCharacter; // NOTE: We don't need to append the character of interest // immediately since the next constant copy operation will get it begin_pos = i; diff --git a/components/core/src/ffi/search/CompositeWildcardToken.cpp b/components/core/src/ffi/search/CompositeWildcardToken.cpp index b57c37da5..199acc8dd 100644 --- a/components/core/src/ffi/search/CompositeWildcardToken.cpp +++ b/components/core/src/ffi/search/CompositeWildcardToken.cpp @@ -1,5 +1,7 @@ #include "CompositeWildcardToken.hpp" +#include "../../ir/parsing.hpp" + using std::string; using std::string_view; using std::variant; @@ -256,7 +258,7 @@ void CompositeWildcardToken::try_add_wildcard_variable( ); } else { string_view var(m_query.cbegin() + begin_pos, end_pos - begin_pos); - if (is_var(var)) { + if (ir::is_var(var)) { m_variables.emplace_back( std::in_place_type>, m_query, diff --git a/components/core/src/ffi/search/query_methods.cpp b/components/core/src/ffi/search/query_methods.cpp index cacd27ab7..e343e78fc 100644 --- a/components/core/src/ffi/search/query_methods.cpp +++ b/components/core/src/ffi/search/query_methods.cpp @@ -1,8 +1,10 @@ #include "query_methods.hpp" +#include "../../ir/parsing.hpp" #include "CompositeWildcardToken.hpp" #include "QueryMethodFailed.hpp" +using ir::is_delim; using std::pair; using std::string; using std::string_view; @@ -188,7 +190,7 @@ void tokenize_query( // - it could be a multi-digit hex value if (contains_decimal_digit || (begin_pos > 0 && '=' == wildcard_query[begin_pos - 1] && contains_alphabet) - || ffi::could_be_multi_digit_hex_value(variable)) + || ir::could_be_multi_digit_hex_value(variable)) { tokens.emplace_back( std::in_place_type>, @@ -295,5 +297,4 @@ template void tokenize_query( CompositeWildcardToken>>& tokens, vector& composite_wildcard_token_indexes ); - } // namespace ffi::search diff --git a/components/core/src/ir/parsing.cpp b/components/core/src/ir/parsing.cpp new file mode 100644 index 000000000..507e89a1b --- /dev/null +++ b/components/core/src/ir/parsing.cpp @@ -0,0 +1,117 @@ +#include "parsing.hpp" + +#include + +#include "../string_utils.hpp" +#include "../type_utils.hpp" + +using std::string_view; + +namespace ir { +/* + * For performance, we rely on the ASCII ordering of characters to compare + * ranges of characters at a time instead of comparing individual characters + */ +bool is_delim(signed char c) { + return !( + '+' == c || ('-' <= c && c <= '.') || ('0' <= c && c <= '9') || ('A' <= c && c <= 'Z') + || '\\' == c || '_' == c || ('a' <= c && c <= 'z') + ); +} + +bool is_variable_placeholder(char c) { + return (enum_to_underlying_type(VariablePlaceholder::Integer) == c) + || (enum_to_underlying_type(VariablePlaceholder::Dictionary) == c) + || (enum_to_underlying_type(VariablePlaceholder::Float) == c); +} + +bool could_be_multi_digit_hex_value(string_view const str) { + if (str.length() < 2) { + return false; + } + + return std::all_of(str.cbegin(), str.cend(), [](char c) { + return ('a' <= c && c <= 'f') || ('A' <= c && c <= 'F') || ('0' <= c && c <= '9'); + }); +} + +bool is_var(std::string_view value) { + size_t begin_pos = 0; + size_t end_pos = 0; + bool contains_var_placeholder; + if (get_bounds_of_next_var(value, begin_pos, end_pos, contains_var_placeholder)) { + // Ensure the entire value is a variable + return (0 == begin_pos && value.length() == end_pos); + } else { + return false; + } +} + +bool get_bounds_of_next_var( + string_view const str, + size_t& begin_pos, + size_t& end_pos, + bool& contains_var_placeholder +) { + auto const msg_length = str.length(); + if (end_pos >= msg_length) { + return false; + } + + while (true) { + begin_pos = end_pos; + + // Find next non-delimiter + bool char_before_var_was_equals_sign = false; + char previous_char = 0; + for (; begin_pos < msg_length; ++begin_pos) { + auto c = str[begin_pos]; + if (false == is_delim(c)) { + char_before_var_was_equals_sign = ('=' == previous_char); + break; + } + previous_char = c; + + if (enum_to_underlying_type(VariablePlaceholder::Integer) == c + || enum_to_underlying_type(VariablePlaceholder::Dictionary) == c + || enum_to_underlying_type(VariablePlaceholder::Float) == c) + { + contains_var_placeholder = true; + } + } + if (msg_length == begin_pos) { + // Early exit for performance + return false; + } + + bool contains_decimal_digit = false; + bool contains_alphabet = false; + + // Find next delimiter + end_pos = begin_pos; + for (; end_pos < msg_length; ++end_pos) { + auto c = str[end_pos]; + if (is_decimal_digit(c)) { + contains_decimal_digit = true; + } else if (is_alphabet(c)) { + contains_alphabet = true; + } else if (is_delim(c)) { + break; + } + } + + string_view variable(str.cbegin() + begin_pos, end_pos - begin_pos); + // Treat token as variable if: + // - it contains a decimal digit, or + // - it's directly preceded by '=' and contains an alphabet char, or + // - it could be a multi-digit hex value + if (contains_decimal_digit || (char_before_var_was_equals_sign && contains_alphabet) + || could_be_multi_digit_hex_value(variable)) + { + break; + } + } + + return (msg_length != begin_pos); +} +} // namespace ir diff --git a/components/core/src/ir/parsing.hpp b/components/core/src/ir/parsing.hpp new file mode 100644 index 000000000..fa9bdeca0 --- /dev/null +++ b/components/core/src/ir/parsing.hpp @@ -0,0 +1,77 @@ +#ifndef IR_PARSING_HPP +#define IR_PARSING_HPP + +/** + * TODO Technically, the methods in this file are more general than for their + * use in generating CLP's IR. However, introducing a parsing namespace in the + * root source directory would be confusing since we also have the + * compressor_frontend namespace. Once most of compressor_frontend is moved into + * https://github.com/y-scope/log-surgeon, we should reconsider the placement of + * the methods in this file. + */ + +#include + +namespace ir { +enum class VariablePlaceholder : char { + Integer = 0x11, + Dictionary = 0x12, + Float = 0x13, +}; + +constexpr char cVariablePlaceholderEscapeCharacter = '\\'; + +/** + * Checks if the given character is a delimiter + * We treat everything *except* the following quoted characters as a + * delimiter: "+-.0-9A-Z\_a-z" + * @param c + * @return Whether c is a delimiter + */ +bool is_delim(signed char c); + +/** + * @param c + * @return Whether the character is a variable placeholder + */ +bool is_variable_placeholder(char c); + +/** + * @param str + * @return Whether the given string could be a multi-digit hex value + */ +bool could_be_multi_digit_hex_value(std::string_view str); + +/** + * @param value + * @return Whether the given value is a variable according to the schemas + * specified in ffi::get_bounds_of_next_var + */ +bool is_var(std::string_view value); + +/** + * Gets the bounds of the next variable in the given string + * A variable is a token (word between two delimiters) that matches one of + * these schemas: + * - ".*[0-9].*" + * - "=(.*[a-zA-Z].*)" (the variable is within the capturing group) + * - "[a-fA-F0-9]{2,}" + * @param str String to search within + * @param begin_pos Begin position of last variable, changes to begin + * position of next variable + * @param end_pos End position of last variable, changes to end position of + * next variable + * @param contains_var_placeholder Whether the string already contains a + * variable placeholder (for efficiency, this is only set to true, so the + * caller must reset it to false if necessary) + * @return true if a variable was found, false otherwise + */ +bool get_bounds_of_next_var( + std::string_view str, + size_t& begin_pos, + size_t& end_pos, + bool& contains_var_placeholder +); +} // namespace ir + +#endif // IR_PARSING_HPP diff --git a/components/core/tests/test-encoding_methods.cpp b/components/core/tests/test-encoding_methods.cpp index 33b42f29d..815c5890f 100644 --- a/components/core/tests/test-encoding_methods.cpp +++ b/components/core/tests/test-encoding_methods.cpp @@ -12,10 +12,10 @@ using ffi::four_byte_encoded_variable_t; using ffi::encode_float_string; using ffi::encode_integer_string; using ffi::encode_message; -using ffi::get_bounds_of_next_var; -using ffi::VariablePlaceholder; using ffi::wildcard_match_encoded_vars; using ffi::wildcard_query_matches_any_encoded_var; +using ir::get_bounds_of_next_var; +using ir::VariablePlaceholder; using std::string; using std::string_view; using std::vector; @@ -29,123 +29,6 @@ using std::vector; static void string_views_from_strings (const vector& strings, vector& string_views); -TEST_CASE("ffi::get_bounds_of_next_var", "[ffi][get_bounds_of_next_var]") { - string str; - size_t begin_pos; - size_t end_pos; - bool contains_var_placeholder = false; - - // Since the outputs we use to validate depend on the schema/encoding - // methods, we validate the versions - REQUIRE(strcmp("com.yscope.clp.VariableEncodingMethodsV1", - ffi::cVariableEncodingMethodsVersion) == 0); - REQUIRE(strcmp("com.yscope.clp.VariablesSchemaV2", ffi::cVariablesSchemaVersion) == 0); - - // Corner cases - // Empty string - str = ""; - begin_pos = 0; - end_pos = 0; - REQUIRE(get_bounds_of_next_var(str, begin_pos, end_pos, contains_var_placeholder) == false); - REQUIRE(false == contains_var_placeholder); - - // end_pos past the end of the string - str = "abc"; - begin_pos = 0; - end_pos = string::npos; - REQUIRE(get_bounds_of_next_var(str, begin_pos, end_pos, contains_var_placeholder) == false); - REQUIRE(false == contains_var_placeholder); - - // Non-variables - str = "/"; - begin_pos = 0; - end_pos = 0; - REQUIRE(get_bounds_of_next_var(str, begin_pos, end_pos, contains_var_placeholder) == false); - REQUIRE(str.length() == begin_pos); - REQUIRE(false == contains_var_placeholder); - - str = "xyz"; - begin_pos = 0; - end_pos = 0; - REQUIRE(get_bounds_of_next_var(str, begin_pos, end_pos, contains_var_placeholder) == false); - REQUIRE(str.length() == begin_pos); - REQUIRE(false == contains_var_placeholder); - - str = "="; - begin_pos = 0; - end_pos = 0; - REQUIRE(get_bounds_of_next_var(str, begin_pos, end_pos, contains_var_placeholder) == false); - REQUIRE(str.length() == begin_pos); - REQUIRE(false == contains_var_placeholder); - - // Variables - str = "~=x!abc123;1.2%x:+394/-"; - begin_pos = 0; - end_pos = 0; - - REQUIRE(get_bounds_of_next_var(str, begin_pos, end_pos, contains_var_placeholder) == true); - REQUIRE("x" == str.substr(begin_pos, end_pos - begin_pos)); - REQUIRE(false == contains_var_placeholder); - - REQUIRE(get_bounds_of_next_var(str, begin_pos, end_pos, contains_var_placeholder) == true); - REQUIRE("abc123" == str.substr(begin_pos, end_pos - begin_pos)); - REQUIRE(false == contains_var_placeholder); - - REQUIRE(get_bounds_of_next_var(str, begin_pos, end_pos, contains_var_placeholder) == true); - REQUIRE("1.2" == str.substr(begin_pos, end_pos - begin_pos)); - REQUIRE(false == contains_var_placeholder); - - REQUIRE(get_bounds_of_next_var(str, begin_pos, end_pos, contains_var_placeholder) == true); - REQUIRE("+394" == str.substr(begin_pos, end_pos - begin_pos)); - REQUIRE(false == contains_var_placeholder); - - REQUIRE(get_bounds_of_next_var(str, begin_pos, end_pos, contains_var_placeholder) == false); - REQUIRE(false == contains_var_placeholder); - - str = " ad ff 95 24 0d ff "; - begin_pos = 0; - end_pos = 0; - - REQUIRE(get_bounds_of_next_var(str, begin_pos, end_pos, contains_var_placeholder) == true); - REQUIRE("ad" == str.substr(begin_pos, end_pos - begin_pos)); - REQUIRE(false == contains_var_placeholder); - - REQUIRE(get_bounds_of_next_var(str, begin_pos, end_pos, contains_var_placeholder) == true); - REQUIRE("ff" == str.substr(begin_pos, end_pos - begin_pos)); - REQUIRE(false == contains_var_placeholder); - - REQUIRE(get_bounds_of_next_var(str, begin_pos, end_pos, contains_var_placeholder) == true); - REQUIRE("95" == str.substr(begin_pos, end_pos - begin_pos)); - REQUIRE(false == contains_var_placeholder); - - REQUIRE(get_bounds_of_next_var(str, begin_pos, end_pos, contains_var_placeholder) == true); - REQUIRE("24" == str.substr(begin_pos, end_pos - begin_pos)); - REQUIRE(false == contains_var_placeholder); - - REQUIRE(get_bounds_of_next_var(str, begin_pos, end_pos, contains_var_placeholder) == true); - REQUIRE("0d" == str.substr(begin_pos, end_pos - begin_pos)); - REQUIRE(false == contains_var_placeholder); - - REQUIRE(get_bounds_of_next_var(str, begin_pos, end_pos, contains_var_placeholder) == true); - REQUIRE("ff" == str.substr(begin_pos, end_pos - begin_pos)); - REQUIRE(false == contains_var_placeholder); - - REQUIRE(get_bounds_of_next_var(str, begin_pos, end_pos, contains_var_placeholder) == false); - REQUIRE(str.length() == begin_pos); - REQUIRE(false == contains_var_placeholder); - - // String containing variable placeholder - str = " text "; - str += enum_to_underlying_type(VariablePlaceholder::Integer); - str += " var123 "; - begin_pos = 0; - end_pos = 0; - - REQUIRE(get_bounds_of_next_var(str, begin_pos, end_pos, contains_var_placeholder) == true); - REQUIRE(contains_var_placeholder); - REQUIRE("var123" == str.substr(begin_pos, end_pos - begin_pos)); -} - TEMPLATE_TEST_CASE("Encoding integers", "[ffi][encode-integer]", eight_byte_encoded_variable_t, four_byte_encoded_variable_t) { diff --git a/components/core/tests/test-ir_encoding_methods.cpp b/components/core/tests/test-ir_encoding_methods.cpp index 3c896e908..c4d586d3e 100644 --- a/components/core/tests/test-ir_encoding_methods.cpp +++ b/components/core/tests/test-ir_encoding_methods.cpp @@ -10,6 +10,7 @@ #include "../src/ffi/ir_stream/decoding_methods.hpp" #include "../src/ffi/ir_stream/encoding_methods.hpp" #include "../src/ffi/ir_stream/protocol_constants.hpp" +#include "../src/ir/parsing.hpp" using ffi::decode_float_var; using ffi::decode_integer_var; @@ -20,7 +21,6 @@ using ffi::encode_integer_string; using ffi::encode_message; using ffi::epoch_time_ms_t; using ffi::four_byte_encoded_variable_t; -using ffi::get_bounds_of_next_var; using ffi::ir_stream::cProtocol::EightByteEncodingMagicNumber; using ffi::ir_stream::cProtocol::FourByteEncodingMagicNumber; using ffi::ir_stream::cProtocol::MagicNumberLength; @@ -28,8 +28,8 @@ using ffi::ir_stream::decode_preamble; using ffi::ir_stream::encoded_tag_t; using ffi::ir_stream::get_encoding_type; using ffi::ir_stream::IRErrorCode; -using ffi::VariablePlaceholder; using ffi::wildcard_query_matches_any_encoded_var; +using ir::VariablePlaceholder; using std::chrono::duration_cast; using std::chrono::milliseconds; using std::chrono::system_clock; @@ -443,7 +443,7 @@ TEST_CASE("message_decode_error", "[ffi][decode_next_message]") { // Test if a trailing escape triggers a decoder error auto ir_with_extra_escape{ir_buf}; - ir_with_extra_escape.at(logtype_end_pos - 1) = ffi::cVariablePlaceholderEscapeCharacter; + ir_with_extra_escape.at(logtype_end_pos - 1) = ir::cVariablePlaceholderEscapeCharacter; BufferReader ir_with_extra_escape_buffer{ size_checked_pointer_cast(ir_with_extra_escape.data()), ir_with_extra_escape.size()}; diff --git a/components/core/tests/test-ir_parsing.cpp b/components/core/tests/test-ir_parsing.cpp new file mode 100644 index 000000000..10f348c82 --- /dev/null +++ b/components/core/tests/test-ir_parsing.cpp @@ -0,0 +1,120 @@ +#include + +#include "../src/ir/parsing.hpp" +#include "../src/type_utils.hpp" + +using ir::get_bounds_of_next_var; +using std::string; +using std::string_view; +using std::vector; + +TEST_CASE("ir::get_bounds_of_next_var", "[ir][get_bounds_of_next_var]") { + string str; + size_t begin_pos; + size_t end_pos; + bool contains_var_placeholder = false; + + // Corner cases + // Empty string + str = ""; + begin_pos = 0; + end_pos = 0; + REQUIRE(get_bounds_of_next_var(str, begin_pos, end_pos, contains_var_placeholder) == false); + REQUIRE(false == contains_var_placeholder); + + // end_pos past the end of the string + str = "abc"; + begin_pos = 0; + end_pos = string::npos; + REQUIRE(get_bounds_of_next_var(str, begin_pos, end_pos, contains_var_placeholder) == false); + REQUIRE(false == contains_var_placeholder); + + // Non-variables + str = "/"; + begin_pos = 0; + end_pos = 0; + REQUIRE(get_bounds_of_next_var(str, begin_pos, end_pos, contains_var_placeholder) == false); + REQUIRE(str.length() == begin_pos); + REQUIRE(false == contains_var_placeholder); + + str = "xyz"; + begin_pos = 0; + end_pos = 0; + REQUIRE(get_bounds_of_next_var(str, begin_pos, end_pos, contains_var_placeholder) == false); + REQUIRE(str.length() == begin_pos); + REQUIRE(false == contains_var_placeholder); + + str = "="; + begin_pos = 0; + end_pos = 0; + REQUIRE(get_bounds_of_next_var(str, begin_pos, end_pos, contains_var_placeholder) == false); + REQUIRE(str.length() == begin_pos); + REQUIRE(false == contains_var_placeholder); + + // Variables + str = "~=x!abc123;1.2%x:+394/-"; + begin_pos = 0; + end_pos = 0; + + REQUIRE(get_bounds_of_next_var(str, begin_pos, end_pos, contains_var_placeholder) == true); + REQUIRE("x" == str.substr(begin_pos, end_pos - begin_pos)); + REQUIRE(false == contains_var_placeholder); + + REQUIRE(get_bounds_of_next_var(str, begin_pos, end_pos, contains_var_placeholder) == true); + REQUIRE("abc123" == str.substr(begin_pos, end_pos - begin_pos)); + REQUIRE(false == contains_var_placeholder); + + REQUIRE(get_bounds_of_next_var(str, begin_pos, end_pos, contains_var_placeholder) == true); + REQUIRE("1.2" == str.substr(begin_pos, end_pos - begin_pos)); + REQUIRE(false == contains_var_placeholder); + + REQUIRE(get_bounds_of_next_var(str, begin_pos, end_pos, contains_var_placeholder) == true); + REQUIRE("+394" == str.substr(begin_pos, end_pos - begin_pos)); + REQUIRE(false == contains_var_placeholder); + + REQUIRE(get_bounds_of_next_var(str, begin_pos, end_pos, contains_var_placeholder) == false); + REQUIRE(false == contains_var_placeholder); + + str = " ad ff 95 24 0d ff "; + begin_pos = 0; + end_pos = 0; + + REQUIRE(get_bounds_of_next_var(str, begin_pos, end_pos, contains_var_placeholder) == true); + REQUIRE("ad" == str.substr(begin_pos, end_pos - begin_pos)); + REQUIRE(false == contains_var_placeholder); + + REQUIRE(get_bounds_of_next_var(str, begin_pos, end_pos, contains_var_placeholder) == true); + REQUIRE("ff" == str.substr(begin_pos, end_pos - begin_pos)); + REQUIRE(false == contains_var_placeholder); + + REQUIRE(get_bounds_of_next_var(str, begin_pos, end_pos, contains_var_placeholder) == true); + REQUIRE("95" == str.substr(begin_pos, end_pos - begin_pos)); + REQUIRE(false == contains_var_placeholder); + + REQUIRE(get_bounds_of_next_var(str, begin_pos, end_pos, contains_var_placeholder) == true); + REQUIRE("24" == str.substr(begin_pos, end_pos - begin_pos)); + REQUIRE(false == contains_var_placeholder); + + REQUIRE(get_bounds_of_next_var(str, begin_pos, end_pos, contains_var_placeholder) == true); + REQUIRE("0d" == str.substr(begin_pos, end_pos - begin_pos)); + REQUIRE(false == contains_var_placeholder); + + REQUIRE(get_bounds_of_next_var(str, begin_pos, end_pos, contains_var_placeholder) == true); + REQUIRE("ff" == str.substr(begin_pos, end_pos - begin_pos)); + REQUIRE(false == contains_var_placeholder); + + REQUIRE(get_bounds_of_next_var(str, begin_pos, end_pos, contains_var_placeholder) == false); + REQUIRE(str.length() == begin_pos); + REQUIRE(false == contains_var_placeholder); + + // String containing variable placeholder + str = " text "; + str += enum_to_underlying_type(ir::VariablePlaceholder::Integer); + str += " var123 "; + begin_pos = 0; + end_pos = 0; + + REQUIRE(get_bounds_of_next_var(str, begin_pos, end_pos, contains_var_placeholder) == true); + REQUIRE(contains_var_placeholder); + REQUIRE("var123" == str.substr(begin_pos, end_pos - begin_pos)); +} diff --git a/components/core/tests/test-query_methods.cpp b/components/core/tests/test-query_methods.cpp index 0333e3150..ea172ef1c 100644 --- a/components/core/tests/test-query_methods.cpp +++ b/components/core/tests/test-query_methods.cpp @@ -10,6 +10,7 @@ #include "../src/ffi/search/query_methods.hpp" #include "../src/ffi/search/QueryMethodFailed.hpp" #include "../src/ffi/search/WildcardToken.hpp" +#include "../src/ir/parsing.hpp" using ffi::eight_byte_encoded_variable_t; using ffi::four_byte_encoded_variable_t; @@ -18,7 +19,7 @@ using ffi::search::generate_subqueries; using ffi::search::Subquery; using ffi::search::TokenType; using ffi::search::WildcardToken; -using ffi::VariablePlaceholder; +using ir::VariablePlaceholder; using std::string; using std::variant; using std::vector; From d0779ba318316e55c7ceabb34718a2909963154c Mon Sep 17 00:00:00 2001 From: Kirk Rodrigues <2454684+kirkrodrigues@users.noreply.github.com> Date: Sun, 17 Sep 2023 08:00:11 -0400 Subject: [PATCH 2/8] Replace existing parsing methods with those from ir namespace. --- components/core/CMakeLists.txt | 4 + components/core/cmake/utils.cmake | 2 + components/core/src/Grep.cpp | 4 +- .../core/src/LogTypeDictionaryEntry.cpp | 4 +- components/core/src/Utils.cpp | 49 ----------- components/core/src/Utils.hpp | 43 ---------- .../core/src/streaming_archive/Constants.hpp | 2 +- .../tests/test-EncodedVariableInterpreter.cpp | 2 +- components/core/tests/test-Utils.cpp | 82 ------------------- 9 files changed, 14 insertions(+), 178 deletions(-) diff --git a/components/core/CMakeLists.txt b/components/core/CMakeLists.txt index 26e7bbb79..1cc7a43c3 100644 --- a/components/core/CMakeLists.txt +++ b/components/core/CMakeLists.txt @@ -434,6 +434,8 @@ set(SOURCE_FILES_clg src/Grep.cpp src/Grep.hpp src/ir/LogEvent.hpp + src/ir/parsing.cpp + src/ir/parsing.hpp src/LogTypeDictionaryEntry.cpp src/LogTypeDictionaryEntry.hpp src/LogTypeDictionaryReader.cpp @@ -590,6 +592,8 @@ set(SOURCE_FILES_clo src/Grep.cpp src/Grep.hpp src/ir/LogEvent.hpp + src/ir/parsing.cpp + src/ir/parsing.hpp src/LogTypeDictionaryEntry.cpp src/LogTypeDictionaryEntry.hpp src/LogTypeDictionaryReader.cpp diff --git a/components/core/cmake/utils.cmake b/components/core/cmake/utils.cmake index 146029bbf..f2bb940ce 100644 --- a/components/core/cmake/utils.cmake +++ b/components/core/cmake/utils.cmake @@ -9,6 +9,8 @@ set(SOURCE_FILES_make-dictionaries-readable ${CMAKE_CURRENT_SOURCE_DIR}/src/FileReader.hpp ${CMAKE_CURRENT_SOURCE_DIR}/src/FileWriter.cpp ${CMAKE_CURRENT_SOURCE_DIR}/src/FileWriter.hpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/ir/parsing.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/ir/parsing.hpp ${CMAKE_CURRENT_SOURCE_DIR}/src/LogTypeDictionaryEntry.cpp ${CMAKE_CURRENT_SOURCE_DIR}/src/LogTypeDictionaryEntry.hpp ${CMAKE_CURRENT_SOURCE_DIR}/src/LogTypeDictionaryReader.cpp diff --git a/components/core/src/Grep.cpp b/components/core/src/Grep.cpp index 9ad133e81..0fdc72061 100644 --- a/components/core/src/Grep.cpp +++ b/components/core/src/Grep.cpp @@ -9,6 +9,7 @@ #include "StringReader.hpp" #include "Utils.hpp" +using ir::is_delim; using std::string; using std::vector; using streaming_archive::reader::Archive; @@ -556,7 +557,8 @@ bool Grep::get_bounds_of_next_potential_var (const string& value, size_t& begin_ // - it contains a decimal digit, or // - it could be a multi-digit hex value, or // - it's directly preceded by an equals sign and contains an alphabet without a wildcard between the equals sign and the first alphabet of the token - if (contains_decimal_digit || could_be_multi_digit_hex_value(value, begin_pos, end_pos)) { + auto variable = static_cast(value).substr(begin_pos, end_pos - begin_pos); + if (contains_decimal_digit || ir::could_be_multi_digit_hex_value(variable)) { is_var = true; } else if (begin_pos > 0 && '=' == value[begin_pos - 1] && contains_alphabet) { // Find first alphabet or wildcard in token diff --git a/components/core/src/LogTypeDictionaryEntry.cpp b/components/core/src/LogTypeDictionaryEntry.cpp index f84f01e67..8d7557180 100644 --- a/components/core/src/LogTypeDictionaryEntry.cpp +++ b/components/core/src/LogTypeDictionaryEntry.cpp @@ -1,6 +1,7 @@ #include "LogTypeDictionaryEntry.hpp" // Project headers +#include "ir/parsing.hpp" #include "type_utils.hpp" #include "Utils.hpp" @@ -74,7 +75,8 @@ void LogTypeDictionaryEntry::add_float_var () { bool LogTypeDictionaryEntry::parse_next_var (const string& msg, size_t& var_begin_pos, size_t& var_end_pos, string& var) { auto last_var_end_pos = var_end_pos; - if (get_bounds_of_next_var(msg, var_begin_pos, var_end_pos)) { + bool contains_var_placeholder{}; + if (ir::get_bounds_of_next_var(msg, var_begin_pos, var_end_pos, contains_var_placeholder)) { // Append to log type: from end of last variable to start of current variable add_constant(msg, last_var_end_pos, var_begin_pos - last_var_end_pos); diff --git a/components/core/src/Utils.cpp b/components/core/src/Utils.cpp index a41ae8702..a0b226fee 100644 --- a/components/core/src/Utils.cpp +++ b/components/core/src/Utils.cpp @@ -85,55 +85,6 @@ ErrorCode create_directory_structure (const string& path, mode_t mode) { return ErrorCode_Success; } -bool get_bounds_of_next_var (const string& msg, size_t& begin_pos, size_t& end_pos) { - const auto msg_length = msg.length(); - if (end_pos >= msg_length) { - return false; - } - - while (true) { - begin_pos = end_pos; - // Find next non-delimiter - for (; begin_pos < msg_length; ++begin_pos) { - if (false == is_delim(msg[begin_pos])) { - break; - } - } - if (msg_length == begin_pos) { - // Early exit for performance - return false; - } - - bool contains_decimal_digit = false; - bool contains_alphabet = false; - - // Find next delimiter - end_pos = begin_pos; - for (; end_pos < msg_length; ++end_pos) { - char c = msg[end_pos]; - if (is_decimal_digit(c)) { - contains_decimal_digit = true; - } else if (is_alphabet(c)) { - contains_alphabet = true; - } else if (is_delim(c)) { - break; - } - } - - // Treat token as variable if: - // - it contains a decimal digit, or - // - it's directly preceded by an equals sign and contains an alphabet, or - // - it could be a multi-digit hex value - if (contains_decimal_digit || (begin_pos > 0 && '=' == msg[begin_pos - 1] && contains_alphabet) || - could_be_multi_digit_hex_value(msg, begin_pos, end_pos)) - { - break; - } - } - - return (msg_length != begin_pos); -} - string get_parent_directory_path (const string& path) { string dirname = get_unambiguous_path(path); diff --git a/components/core/src/Utils.hpp b/components/core/src/Utils.hpp index 6f8b843f3..e3fa814a0 100644 --- a/components/core/src/Utils.hpp +++ b/components/core/src/Utils.hpp @@ -14,39 +14,6 @@ #include "FileReader.hpp" #include "ParsedMessage.hpp" -/** - * Checks if the given segment of the stringcould be a multi-digit hex value - * @param str - * @param begin_pos - * @param end_pos - * @return true if yes, false otherwise - */ -inline bool could_be_multi_digit_hex_value (const std::string& str, size_t begin_pos, size_t end_pos) { - if (end_pos - begin_pos < 2) { - return false; - } - - for (size_t i = begin_pos; i < end_pos; ++i) { - auto c = str[i]; - if (!( ('a' <= c && c <= 'f') || ('A' <= c && c <= 'F') || ('0' <= c && c <= '9') )) { - return false; - } - } - - return true; -} - -/** - * Checks if character is a delimiter - * We treat everything except the following quoted characters as a delimiter: "+-./0-9A-Z\a-z" - * NOTE: For performance, we rely on the ASCII ordering of characters to compare ranges of characters at a time instead of comparing individual characters - * @param c - * @return true if c is a delimiter, false otherwise - */ -inline bool is_delim (char c) { - return !('+' == c || ('-' <= c && c <= '9') || ('A' <= c && c <= 'Z') || '\\' == c || '_' == c || ('a' <= c && c <= 'z')); -} - /** * Creates a directory with the given path * @param path @@ -82,16 +49,6 @@ ErrorCode create_directory_structure (const std::string& path, mode_t mode); */ std::string get_parent_directory_path (const std::string& path); -/** - * Returns bounds of next variable in given string - * A variable is a token (word between two delimiters) that contains numbers or is directly preceded by an equals sign - * @param msg - * @param begin_pos Begin position of last variable, changes to begin position of next variable - * @param end_pos End position of last variable, changes to end position of next variable - * @return true if a variable was found, false otherwise - */ -bool get_bounds_of_next_var (const std::string& msg, size_t& begin_pos, size_t& end_pos); - /** * Removes ".", "..", and consecutive "/" from a given path and returns the result * @param path The given path diff --git a/components/core/src/streaming_archive/Constants.hpp b/components/core/src/streaming_archive/Constants.hpp index 60334aad2..8d9eb25bf 100644 --- a/components/core/src/streaming_archive/Constants.hpp +++ b/components/core/src/streaming_archive/Constants.hpp @@ -5,7 +5,7 @@ #define STREAMING_ARCHIVE_CONSTANTS_HPP namespace streaming_archive { - constexpr archive_format_version_t cArchiveFormatVersion = cArchiveFormatDevVersionFlag | 7; + constexpr archive_format_version_t cArchiveFormatVersion = cArchiveFormatDevVersionFlag | 8; constexpr char cSegmentsDirname[] = "s"; constexpr char cSegmentListFilename[] = "segment_list.txt"; constexpr char cLogTypeDictFilename[] = "logtype.dict"; diff --git a/components/core/tests/test-EncodedVariableInterpreter.cpp b/components/core/tests/test-EncodedVariableInterpreter.cpp index 04df2e770..9353e949a 100644 --- a/components/core/tests/test-EncodedVariableInterpreter.cpp +++ b/components/core/tests/test-EncodedVariableInterpreter.cpp @@ -230,7 +230,7 @@ TEST_CASE("EncodedVariableInterpreter", "[EncodedVariableInterpreter]") { string large_val_str = to_string(cVariableDictionaryIdMax) + "0"; vector var_strs = {"4938", large_val_str, "-25.5196868642755", - "-00.00", "bin/python2.7.3"}; + "-00.00", "python2.7.3"}; msg = "here is a string with a small int " + var_strs[0] + " and a very large int " + var_strs[1] + " and a double " + var_strs[2] + diff --git a/components/core/tests/test-Utils.cpp b/components/core/tests/test-Utils.cpp index 35edb835f..3f309e5f1 100644 --- a/components/core/tests/test-Utils.cpp +++ b/components/core/tests/test-Utils.cpp @@ -82,85 +82,3 @@ TEST_CASE("get_unambiguous_path", "[get_unambiguous_path]") { REQUIRE(get_unambiguous_path("abc///def/../ghi/./") == "abc/ghi"); REQUIRE(get_unambiguous_path("../abc///def/../ghi/./") == "abc/ghi"); } - -TEST_CASE("get_bounds_of_next_var", "[get_bounds_of_next_var]") { - string str; - size_t begin_pos; - size_t end_pos; - - // Corner cases - // Empty string - str = ""; - begin_pos = 0; - end_pos = 0; - REQUIRE(get_bounds_of_next_var(str, begin_pos, end_pos) == false); - - // end_pos past the end of the string - str = "abc"; - begin_pos = 0; - end_pos = string::npos; - REQUIRE(get_bounds_of_next_var(str, begin_pos, end_pos) == false); - - // Non-variables - str = "/"; - begin_pos = 0; - end_pos = 0; - REQUIRE(get_bounds_of_next_var(str, begin_pos, end_pos) == false); - REQUIRE(str.length() == begin_pos); - - str = "xyz"; - begin_pos = 0; - end_pos = 0; - REQUIRE(get_bounds_of_next_var(str, begin_pos, end_pos) == false); - REQUIRE(str.length() == begin_pos); - - str = "="; - begin_pos = 0; - end_pos = 0; - REQUIRE(get_bounds_of_next_var(str, begin_pos, end_pos) == false); - REQUIRE(str.length() == begin_pos); - - // Variables - str = "~=x!abc123;1.2%x:+394/-"; - begin_pos = 0; - end_pos = 0; - - REQUIRE(get_bounds_of_next_var(str, begin_pos, end_pos) == true); - REQUIRE("x" == str.substr(begin_pos, end_pos - begin_pos)); - - REQUIRE(get_bounds_of_next_var(str, begin_pos, end_pos) == true); - REQUIRE("abc123" == str.substr(begin_pos, end_pos - begin_pos)); - - REQUIRE(get_bounds_of_next_var(str, begin_pos, end_pos) == true); - REQUIRE("1.2" == str.substr(begin_pos, end_pos - begin_pos)); - - REQUIRE(get_bounds_of_next_var(str, begin_pos, end_pos) == true); - REQUIRE("+394/-" == str.substr(begin_pos, end_pos - begin_pos)); - - REQUIRE(get_bounds_of_next_var(str, begin_pos, end_pos) == false); - - str = " ad ff 95 24 0d ff "; - begin_pos = 0; - end_pos = 0; - - REQUIRE(get_bounds_of_next_var(str, begin_pos, end_pos) == true); - REQUIRE("ad" == str.substr(begin_pos, end_pos - begin_pos)); - - REQUIRE(get_bounds_of_next_var(str, begin_pos, end_pos) == true); - REQUIRE("ff" == str.substr(begin_pos, end_pos - begin_pos)); - - REQUIRE(get_bounds_of_next_var(str, begin_pos, end_pos) == true); - REQUIRE("95" == str.substr(begin_pos, end_pos - begin_pos)); - - REQUIRE(get_bounds_of_next_var(str, begin_pos, end_pos) == true); - REQUIRE("24" == str.substr(begin_pos, end_pos - begin_pos)); - - REQUIRE(get_bounds_of_next_var(str, begin_pos, end_pos) == true); - REQUIRE("0d" == str.substr(begin_pos, end_pos - begin_pos)); - - REQUIRE(get_bounds_of_next_var(str, begin_pos, end_pos) == true); - REQUIRE("ff" == str.substr(begin_pos, end_pos - begin_pos)); - - REQUIRE(get_bounds_of_next_var(str, begin_pos, end_pos) == false); - REQUIRE(str.length() == begin_pos); -} From e2474fa3ac5a50ad4f1dfc97316c36c37236cbe6 Mon Sep 17 00:00:00 2001 From: Kirk Rodrigues <2454684+kirkrodrigues@users.noreply.github.com> Date: Sun, 17 Sep 2023 08:25:01 -0400 Subject: [PATCH 3/8] Improve performance of parsing methods from ffi to match those in CLP. --- .../core/src/LogTypeDictionaryEntry.cpp | 3 +- components/core/src/ffi/encoding_methods.hpp | 7 --- components/core/src/ffi/encoding_methods.inc | 38 ++++--------- .../src/ffi/ir_stream/encoding_methods.cpp | 26 +-------- components/core/src/ir/parsing.cpp | 39 ++++++------- components/core/src/ir/parsing.hpp | 6 +- components/core/tests/test-ir_parsing.cpp | 55 ++++++------------- 7 files changed, 48 insertions(+), 126 deletions(-) diff --git a/components/core/src/LogTypeDictionaryEntry.cpp b/components/core/src/LogTypeDictionaryEntry.cpp index 8d7557180..164ededf7 100644 --- a/components/core/src/LogTypeDictionaryEntry.cpp +++ b/components/core/src/LogTypeDictionaryEntry.cpp @@ -75,8 +75,7 @@ void LogTypeDictionaryEntry::add_float_var () { bool LogTypeDictionaryEntry::parse_next_var (const string& msg, size_t& var_begin_pos, size_t& var_end_pos, string& var) { auto last_var_end_pos = var_end_pos; - bool contains_var_placeholder{}; - if (ir::get_bounds_of_next_var(msg, var_begin_pos, var_end_pos, contains_var_placeholder)) { + if (ir::get_bounds_of_next_var(msg, var_begin_pos, var_end_pos)) { // Append to log type: from end of last variable to start of current variable add_constant(msg, last_var_end_pos, var_begin_pos - last_var_end_pos); diff --git a/components/core/src/ffi/encoding_methods.hpp b/components/core/src/ffi/encoding_methods.hpp index fa26ef675..6618db346 100644 --- a/components/core/src/ffi/encoding_methods.hpp +++ b/components/core/src/ffi/encoding_methods.hpp @@ -169,10 +169,6 @@ std::string decode_integer_var(encoded_variable_t encoded_var); * components of the message. * @tparam encoded_variable_t Type of the encoded variable * @tparam ConstantHandler Method to handle constants. Signature: - * (std::string_view constant, bool constant_contains_variable_placeholder, - * std::string& logtype) -> bool - * @tparam FinalConstantHandler Method to handle the constant after the last - * variable. Signature: * (std::string_view constant, std::string& logtype) -> bool * @tparam EncodedVariableHandler Method to handle encoded variables. * Signature: (encoded_variable_t) -> void @@ -182,7 +178,6 @@ std::string decode_integer_var(encoded_variable_t encoded_var); * @param message * @param logtype * @param constant_handler - * @param final_constant_handler * @param encoded_variable_handler * @param dictionary_variable_handler * @return true on success, false otherwise @@ -190,14 +185,12 @@ std::string decode_integer_var(encoded_variable_t encoded_var); template < typename encoded_variable_t, typename ConstantHandler, - typename FinalConstantHandler, typename EncodedVariableHandler, typename DictionaryVariableHandler> bool encode_message_generically( std::string_view message, std::string& logtype, ConstantHandler constant_handler, - FinalConstantHandler final_constant_handler, EncodedVariableHandler encoded_variable_handler, DictionaryVariableHandler dictionary_variable_handler ); diff --git a/components/core/src/ffi/encoding_methods.inc b/components/core/src/ffi/encoding_methods.inc index 38c5a00ef..d85b22312 100644 --- a/components/core/src/ffi/encoding_methods.inc +++ b/components/core/src/ffi/encoding_methods.inc @@ -350,32 +350,23 @@ std::string decode_integer_var(encoded_variable_t encoded_var) { template < typename encoded_variable_t, typename ConstantHandler, - typename FinalConstantHandler, typename EncodedVariableHandler, typename DictionaryVariableHandler> bool encode_message_generically( std::string_view message, std::string& logtype, ConstantHandler constant_handler, - FinalConstantHandler final_constant_handler, EncodedVariableHandler encoded_variable_handler, DictionaryVariableHandler dictionary_variable_handler ) { size_t var_begin_pos = 0; size_t var_end_pos = 0; - bool constant_contains_variable_placeholder = false; size_t constant_begin_pos = 0; logtype.clear(); logtype.reserve(message.length()); - while (ir::get_bounds_of_next_var( - message, - var_begin_pos, - var_end_pos, - constant_contains_variable_placeholder - )) - { + while (ir::get_bounds_of_next_var(message, var_begin_pos, var_end_pos)) { std::string_view constant{&message[constant_begin_pos], var_begin_pos - constant_begin_pos}; - if (false == constant_handler(constant, constant_contains_variable_placeholder, logtype)) { + if (false == constant_handler(constant, logtype)) { return false; } constant_begin_pos = var_end_pos; @@ -401,7 +392,7 @@ bool encode_message_generically( std::string_view constant{ &message[constant_begin_pos], message.length() - constant_begin_pos}; - if (false == final_constant_handler(constant, logtype)) { + if (false == constant_handler(constant, logtype)) { return false; } } @@ -416,11 +407,13 @@ bool encode_message( std::vector& encoded_vars, std::vector& dictionary_var_bounds ) { - auto constant_handler = []( // clang-format off - std::string_view constant, - bool contains_variable_placeholder, - std::string& logtype - ) { // clang-format on + auto constant_handler = [](std::string_view constant, std::string& logtype) { + // Ensure constant doesn't contain a variable placeholder + bool contains_variable_placeholder = std::any_of( + constant.cbegin(), + constant.cend(), + ir::is_variable_placeholder + ); if (contains_variable_placeholder) { return false; } @@ -428,16 +421,6 @@ bool encode_message( logtype.append(constant); return true; }; - auto final_constant_handler = - [&constant_handler](std::string_view constant, std::string& logtype) { - // Ensure final constant doesn't contain a variable placeholder - bool contains_variable_placeholder = std::any_of( - constant.cbegin(), - constant.cend(), - ir::is_variable_placeholder - ); - return constant_handler(constant, contains_variable_placeholder, logtype); - }; auto encoded_variable_handler = [&encoded_vars](encoded_variable_t encoded_variable) { encoded_vars.push_back(encoded_variable); }; @@ -457,7 +440,6 @@ bool encode_message( message, logtype, constant_handler, - final_constant_handler, encoded_variable_handler, dictionary_variable_handler )) diff --git a/components/core/src/ffi/ir_stream/encoding_methods.cpp b/components/core/src/ffi/ir_stream/encoding_methods.cpp index 4e3ea2b2c..2d637dbe3 100644 --- a/components/core/src/ffi/ir_stream/encoding_methods.cpp +++ b/components/core/src/ffi/ir_stream/encoding_methods.cpp @@ -52,29 +52,16 @@ static void add_base_metadata_fields( ); /** - * Appends a constant to the logtype, escaping variable placeholders when - * @p contains_variable_placeholder is true + * Appends a constant to the logtype, escaping any variable placeholders * @param constant - * @param contains_variable_placeholder Whether the constant contains a variable - * placeholder * @param logtype * @return true */ static bool append_constant_to_logtype( string_view constant, - bool contains_variable_placeholder, string& logtype ); -/** - * Appends the final constant to the logtype, escaping variable placeholders as - * necessary - * @param constant - * @param logtype - * @return Same as append_constant_to_logtype - */ -static bool append_final_constant_to_logtype(string_view constant, string& logtype); - /** * A functor for encoding dictionary variables in a message */ @@ -178,7 +165,7 @@ static void add_base_metadata_fields( metadata[cProtocol::Metadata::TimeZoneIdKey] = time_zone_id; } -static bool append_constant_to_logtype(string_view constant, bool, string& logtype) { +static bool append_constant_to_logtype(string_view constant, string& logtype) { size_t begin_pos = 0; auto constant_len = constant.length(); for (size_t i = 0; i < constant_len; ++i) { @@ -195,13 +182,6 @@ static bool append_constant_to_logtype(string_view constant, bool, string& logty return true; } -static bool append_final_constant_to_logtype(string_view constant, string& logtype) { - // Since we don't know if the last constant contains a variable placeholder, - // we'll simply say it does and append_constant_to_logtype will escape one - // if it finds one - return append_constant_to_logtype(constant, true, logtype); -} - namespace eight_byte_encoding { bool encode_preamble( string_view timestamp_pattern, @@ -242,7 +222,6 @@ namespace eight_byte_encoding { message, logtype, append_constant_to_logtype, - append_final_constant_to_logtype, encoded_var_handler, DictionaryVariableHandler(ir_buf) )) @@ -317,7 +296,6 @@ namespace four_byte_encoding { message, logtype, append_constant_to_logtype, - append_final_constant_to_logtype, encoded_var_handler, DictionaryVariableHandler(ir_buf) )) diff --git a/components/core/src/ir/parsing.cpp b/components/core/src/ir/parsing.cpp index 507e89a1b..c6cc9d1d5 100644 --- a/components/core/src/ir/parsing.cpp +++ b/components/core/src/ir/parsing.cpp @@ -1,7 +1,5 @@ #include "parsing.hpp" -#include - #include "../string_utils.hpp" #include "../type_utils.hpp" @@ -25,21 +23,27 @@ bool is_variable_placeholder(char c) { || (enum_to_underlying_type(VariablePlaceholder::Float) == c); } -bool could_be_multi_digit_hex_value(string_view const str) { +// NOTE: `inline` improves performance by 1-2% +inline bool could_be_multi_digit_hex_value(string_view str) { if (str.length() < 2) { return false; } - return std::all_of(str.cbegin(), str.cend(), [](char c) { - return ('a' <= c && c <= 'f') || ('A' <= c && c <= 'F') || ('0' <= c && c <= '9'); - }); + // NOTE: This is 1-2% faster than using std::all_of with the opposite + // condition + for (auto c : str) { + if (!(('a' <= c && c <= 'f') || ('A' <= c && c <= 'F') || ('0' <= c && c <= '9'))) { + return false; + } + } + + return true; } bool is_var(std::string_view value) { size_t begin_pos = 0; size_t end_pos = 0; - bool contains_var_placeholder; - if (get_bounds_of_next_var(value, begin_pos, end_pos, contains_var_placeholder)) { + if (get_bounds_of_next_var(value, begin_pos, end_pos)) { // Ensure the entire value is a variable return (0 == begin_pos && value.length() == end_pos); } else { @@ -50,8 +54,7 @@ bool is_var(std::string_view value) { bool get_bounds_of_next_var( string_view const str, size_t& begin_pos, - size_t& end_pos, - bool& contains_var_placeholder + size_t& end_pos ) { auto const msg_length = str.length(); if (end_pos >= msg_length) { @@ -62,22 +65,11 @@ bool get_bounds_of_next_var( begin_pos = end_pos; // Find next non-delimiter - bool char_before_var_was_equals_sign = false; - char previous_char = 0; for (; begin_pos < msg_length; ++begin_pos) { auto c = str[begin_pos]; if (false == is_delim(c)) { - char_before_var_was_equals_sign = ('=' == previous_char); break; } - previous_char = c; - - if (enum_to_underlying_type(VariablePlaceholder::Integer) == c - || enum_to_underlying_type(VariablePlaceholder::Dictionary) == c - || enum_to_underlying_type(VariablePlaceholder::Float) == c) - { - contains_var_placeholder = true; - } } if (msg_length == begin_pos) { // Early exit for performance @@ -100,12 +92,13 @@ bool get_bounds_of_next_var( } } - string_view variable(str.cbegin() + begin_pos, end_pos - begin_pos); + auto variable = str.substr(begin_pos, end_pos - begin_pos); // Treat token as variable if: // - it contains a decimal digit, or // - it's directly preceded by '=' and contains an alphabet char, or // - it could be a multi-digit hex value - if (contains_decimal_digit || (char_before_var_was_equals_sign && contains_alphabet) + if (contains_decimal_digit + || (begin_pos > 0 && '=' == str[begin_pos - 1] && contains_alphabet) || could_be_multi_digit_hex_value(variable)) { break; diff --git a/components/core/src/ir/parsing.hpp b/components/core/src/ir/parsing.hpp index fa9bdeca0..99405cc15 100644 --- a/components/core/src/ir/parsing.hpp +++ b/components/core/src/ir/parsing.hpp @@ -61,16 +61,12 @@ bool is_var(std::string_view value); * position of next variable * @param end_pos End position of last variable, changes to end position of * next variable - * @param contains_var_placeholder Whether the string already contains a - * variable placeholder (for efficiency, this is only set to true, so the - * caller must reset it to false if necessary) * @return true if a variable was found, false otherwise */ bool get_bounds_of_next_var( std::string_view str, size_t& begin_pos, - size_t& end_pos, - bool& contains_var_placeholder + size_t& end_pos ); } // namespace ir diff --git a/components/core/tests/test-ir_parsing.cpp b/components/core/tests/test-ir_parsing.cpp index 10f348c82..9a8286d22 100644 --- a/components/core/tests/test-ir_parsing.cpp +++ b/components/core/tests/test-ir_parsing.cpp @@ -12,100 +12,82 @@ TEST_CASE("ir::get_bounds_of_next_var", "[ir][get_bounds_of_next_var]") { string str; size_t begin_pos; size_t end_pos; - bool contains_var_placeholder = false; // Corner cases // Empty string str = ""; begin_pos = 0; end_pos = 0; - REQUIRE(get_bounds_of_next_var(str, begin_pos, end_pos, contains_var_placeholder) == false); - REQUIRE(false == contains_var_placeholder); + REQUIRE(get_bounds_of_next_var(str, begin_pos, end_pos) == false); // end_pos past the end of the string str = "abc"; begin_pos = 0; end_pos = string::npos; - REQUIRE(get_bounds_of_next_var(str, begin_pos, end_pos, contains_var_placeholder) == false); - REQUIRE(false == contains_var_placeholder); + REQUIRE(get_bounds_of_next_var(str, begin_pos, end_pos) == false); // Non-variables str = "/"; begin_pos = 0; end_pos = 0; - REQUIRE(get_bounds_of_next_var(str, begin_pos, end_pos, contains_var_placeholder) == false); + REQUIRE(get_bounds_of_next_var(str, begin_pos, end_pos) == false); REQUIRE(str.length() == begin_pos); - REQUIRE(false == contains_var_placeholder); str = "xyz"; begin_pos = 0; end_pos = 0; - REQUIRE(get_bounds_of_next_var(str, begin_pos, end_pos, contains_var_placeholder) == false); + REQUIRE(get_bounds_of_next_var(str, begin_pos, end_pos) == false); REQUIRE(str.length() == begin_pos); - REQUIRE(false == contains_var_placeholder); str = "="; begin_pos = 0; end_pos = 0; - REQUIRE(get_bounds_of_next_var(str, begin_pos, end_pos, contains_var_placeholder) == false); + REQUIRE(get_bounds_of_next_var(str, begin_pos, end_pos) == false); REQUIRE(str.length() == begin_pos); - REQUIRE(false == contains_var_placeholder); // Variables str = "~=x!abc123;1.2%x:+394/-"; begin_pos = 0; end_pos = 0; - REQUIRE(get_bounds_of_next_var(str, begin_pos, end_pos, contains_var_placeholder) == true); + REQUIRE(get_bounds_of_next_var(str, begin_pos, end_pos) == true); REQUIRE("x" == str.substr(begin_pos, end_pos - begin_pos)); - REQUIRE(false == contains_var_placeholder); - REQUIRE(get_bounds_of_next_var(str, begin_pos, end_pos, contains_var_placeholder) == true); + REQUIRE(get_bounds_of_next_var(str, begin_pos, end_pos) == true); REQUIRE("abc123" == str.substr(begin_pos, end_pos - begin_pos)); - REQUIRE(false == contains_var_placeholder); - REQUIRE(get_bounds_of_next_var(str, begin_pos, end_pos, contains_var_placeholder) == true); + REQUIRE(get_bounds_of_next_var(str, begin_pos, end_pos) == true); REQUIRE("1.2" == str.substr(begin_pos, end_pos - begin_pos)); - REQUIRE(false == contains_var_placeholder); - REQUIRE(get_bounds_of_next_var(str, begin_pos, end_pos, contains_var_placeholder) == true); + REQUIRE(get_bounds_of_next_var(str, begin_pos, end_pos) == true); REQUIRE("+394" == str.substr(begin_pos, end_pos - begin_pos)); - REQUIRE(false == contains_var_placeholder); - REQUIRE(get_bounds_of_next_var(str, begin_pos, end_pos, contains_var_placeholder) == false); - REQUIRE(false == contains_var_placeholder); + REQUIRE(get_bounds_of_next_var(str, begin_pos, end_pos) == false); str = " ad ff 95 24 0d ff "; begin_pos = 0; end_pos = 0; - REQUIRE(get_bounds_of_next_var(str, begin_pos, end_pos, contains_var_placeholder) == true); + REQUIRE(get_bounds_of_next_var(str, begin_pos, end_pos) == true); REQUIRE("ad" == str.substr(begin_pos, end_pos - begin_pos)); - REQUIRE(false == contains_var_placeholder); - REQUIRE(get_bounds_of_next_var(str, begin_pos, end_pos, contains_var_placeholder) == true); + REQUIRE(get_bounds_of_next_var(str, begin_pos, end_pos) == true); REQUIRE("ff" == str.substr(begin_pos, end_pos - begin_pos)); - REQUIRE(false == contains_var_placeholder); - REQUIRE(get_bounds_of_next_var(str, begin_pos, end_pos, contains_var_placeholder) == true); + REQUIRE(get_bounds_of_next_var(str, begin_pos, end_pos) == true); REQUIRE("95" == str.substr(begin_pos, end_pos - begin_pos)); - REQUIRE(false == contains_var_placeholder); - REQUIRE(get_bounds_of_next_var(str, begin_pos, end_pos, contains_var_placeholder) == true); + REQUIRE(get_bounds_of_next_var(str, begin_pos, end_pos) == true); REQUIRE("24" == str.substr(begin_pos, end_pos - begin_pos)); - REQUIRE(false == contains_var_placeholder); - REQUIRE(get_bounds_of_next_var(str, begin_pos, end_pos, contains_var_placeholder) == true); + REQUIRE(get_bounds_of_next_var(str, begin_pos, end_pos) == true); REQUIRE("0d" == str.substr(begin_pos, end_pos - begin_pos)); - REQUIRE(false == contains_var_placeholder); - REQUIRE(get_bounds_of_next_var(str, begin_pos, end_pos, contains_var_placeholder) == true); + REQUIRE(get_bounds_of_next_var(str, begin_pos, end_pos) == true); REQUIRE("ff" == str.substr(begin_pos, end_pos - begin_pos)); - REQUIRE(false == contains_var_placeholder); - REQUIRE(get_bounds_of_next_var(str, begin_pos, end_pos, contains_var_placeholder) == false); + REQUIRE(get_bounds_of_next_var(str, begin_pos, end_pos) == false); REQUIRE(str.length() == begin_pos); - REQUIRE(false == contains_var_placeholder); // String containing variable placeholder str = " text "; @@ -114,7 +96,6 @@ TEST_CASE("ir::get_bounds_of_next_var", "[ir][get_bounds_of_next_var]") { begin_pos = 0; end_pos = 0; - REQUIRE(get_bounds_of_next_var(str, begin_pos, end_pos, contains_var_placeholder) == true); - REQUIRE(contains_var_placeholder); + REQUIRE(get_bounds_of_next_var(str, begin_pos, end_pos) == true); REQUIRE("var123" == str.substr(begin_pos, end_pos - begin_pos)); } From 1c920b41265f72cf2e096658b41abfd177d35dc8 Mon Sep 17 00:00:00 2001 From: Kirk Rodrigues <2454684+kirkrodrigues@users.noreply.github.com> Date: Sun, 17 Sep 2023 08:44:21 -0400 Subject: [PATCH 4/8] Fix build error --- components/core/src/Grep.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/components/core/src/Grep.cpp b/components/core/src/Grep.cpp index 0fdc72061..2ab4f7875 100644 --- a/components/core/src/Grep.cpp +++ b/components/core/src/Grep.cpp @@ -6,6 +6,7 @@ // Project headers #include "compressor_frontend/Constants.hpp" #include "EncodedVariableInterpreter.hpp" +#include "ir/parsing.hpp" #include "StringReader.hpp" #include "Utils.hpp" From aa1d327319f0487de18226a6c4a4aa704cedda48 Mon Sep 17 00:00:00 2001 From: Kirk Rodrigues <2454684+kirkrodrigues@users.noreply.github.com> Date: Sun, 17 Sep 2023 08:55:29 -0400 Subject: [PATCH 5/8] C++ 101: Inline methods should be defined in the header. --- components/core/src/ir/parsing.cpp | 17 ----------------- components/core/src/ir/parsing.hpp | 17 ++++++++++++++++- 2 files changed, 16 insertions(+), 18 deletions(-) diff --git a/components/core/src/ir/parsing.cpp b/components/core/src/ir/parsing.cpp index c6cc9d1d5..6b9ebf08b 100644 --- a/components/core/src/ir/parsing.cpp +++ b/components/core/src/ir/parsing.cpp @@ -23,23 +23,6 @@ bool is_variable_placeholder(char c) { || (enum_to_underlying_type(VariablePlaceholder::Float) == c); } -// NOTE: `inline` improves performance by 1-2% -inline bool could_be_multi_digit_hex_value(string_view str) { - if (str.length() < 2) { - return false; - } - - // NOTE: This is 1-2% faster than using std::all_of with the opposite - // condition - for (auto c : str) { - if (!(('a' <= c && c <= 'f') || ('A' <= c && c <= 'F') || ('0' <= c && c <= '9'))) { - return false; - } - } - - return true; -} - bool is_var(std::string_view value) { size_t begin_pos = 0; size_t end_pos = 0; diff --git a/components/core/src/ir/parsing.hpp b/components/core/src/ir/parsing.hpp index 99405cc15..a5d447728 100644 --- a/components/core/src/ir/parsing.hpp +++ b/components/core/src/ir/parsing.hpp @@ -37,10 +37,25 @@ bool is_delim(signed char c); bool is_variable_placeholder(char c); /** + * NOTE: This method is marked inline for a 1-2% performance improvement * @param str * @return Whether the given string could be a multi-digit hex value */ -bool could_be_multi_digit_hex_value(std::string_view str); +inline bool could_be_multi_digit_hex_value(std::string_view str) { + if (str.length() < 2) { + return false; + } + + // NOTE: This is 1-2% faster than using std::all_of with the opposite + // condition + for (auto c : str) { + if (!(('a' <= c && c <= 'f') || ('A' <= c && c <= 'F') || ('0' <= c && c <= '9'))) { + return false; + } + } + + return true; +} /** * @param value From 7fd4656dcf392f815768b47919773140910520a8 Mon Sep 17 00:00:00 2001 From: Kirk Rodrigues <2454684+kirkrodrigues@users.noreply.github.com> Date: Sun, 17 Sep 2023 09:52:12 -0400 Subject: [PATCH 6/8] Replace LogTypeDictionaryEntry::VarDelim with ir::VariablePlaceholder; Deduplicate logtype constant escaping method; Replace all instances of variable delimiter with variable placeholder. --- .../core/src/EncodedVariableInterpreter.cpp | 17 +++--- .../core/src/LogTypeDictionaryEntry.cpp | 61 ++++++------------- .../core/src/LogTypeDictionaryEntry.hpp | 35 +++++------ components/core/src/ffi/encoding_methods.hpp | 4 +- .../src/ffi/ir_stream/encoding_methods.cpp | 14 +---- components/core/src/ir/parsing.cpp | 22 +++++-- components/core/src/ir/parsing.hpp | 13 ++-- .../make_dictionaries_readable/README.md | 6 +- .../make-dictionaries-readable.cpp | 17 +++--- .../tests/test-EncodedVariableInterpreter.cpp | 4 +- 10 files changed, 84 insertions(+), 109 deletions(-) diff --git a/components/core/src/EncodedVariableInterpreter.cpp b/components/core/src/EncodedVariableInterpreter.cpp index 246897ea7..df241b3a4 100644 --- a/components/core/src/EncodedVariableInterpreter.cpp +++ b/components/core/src/EncodedVariableInterpreter.cpp @@ -8,6 +8,7 @@ #include "Defs.h" #include "ffi/encoding_methods.hpp" #include "ffi/ir_stream/decoding_methods.hpp" +#include "ir/parsing.hpp" #include "spdlog_with_specializations.hpp" #include "string_utils.hpp" #include "type_utils.hpp" @@ -296,25 +297,25 @@ bool EncodedVariableInterpreter::decode_variables_into_message (const LogTypeDic return false; } - LogTypeDictionaryEntry::VarDelim var_delim; + ir::VariablePlaceholder var_placeholder; size_t constant_begin_pos = 0; string float_str; variable_dictionary_id_t var_dict_id; for (size_t i = 0; i < num_vars_in_logtype; ++i) { - size_t var_position = logtype_dict_entry.get_var_info(i, var_delim); + size_t var_position = logtype_dict_entry.get_var_info(i, var_placeholder); // Add the constant that's between the last variable and this one decompressed_msg.append(logtype_value, constant_begin_pos, var_position - constant_begin_pos); - switch (var_delim) { - case LogTypeDictionaryEntry::VarDelim::Integer: + switch (var_placeholder) { + case ir::VariablePlaceholder::Integer: decompressed_msg += std::to_string(encoded_vars[i]); break; - case LogTypeDictionaryEntry::VarDelim::Float: + case ir::VariablePlaceholder::Float: convert_encoded_float_to_string(encoded_vars[i], float_str); decompressed_msg += float_str; break; - case LogTypeDictionaryEntry::VarDelim::Dictionary: + case ir::VariablePlaceholder::Dictionary: var_dict_id = decode_var_dict_id(encoded_vars[i]); decompressed_msg += var_dict.get_value(var_dict_id); break; @@ -323,10 +324,10 @@ bool EncodedVariableInterpreter::decode_variables_into_message (const LogTypeDic "EncodedVariableInterpreter: Logtype '{}' contains " "unexpected variable placeholder 0x{:x}", logtype_value, - enum_to_underlying_type(var_delim)); + enum_to_underlying_type(var_placeholder)); return false; } - // Move past the variable delimiter + // Move past the variable placeholder constant_begin_pos = var_position + 1; } // Append remainder of logtype, if any diff --git a/components/core/src/LogTypeDictionaryEntry.cpp b/components/core/src/LogTypeDictionaryEntry.cpp index 164ededf7..e8c89949a 100644 --- a/components/core/src/LogTypeDictionaryEntry.cpp +++ b/components/core/src/LogTypeDictionaryEntry.cpp @@ -7,43 +7,16 @@ using std::string; -// Constants -static constexpr char cEscapeChar = '\\'; - -// Local function prototypes -/** - * Escapes any variable delimiters in the identified portion of the given value - * @param value - * @param begin_ix - * @param end_ix - * @param escaped_value - */ -static void escape_variable_delimiters (const std::string& value, size_t begin_ix, size_t end_ix, std::string& escaped_value); - -static void escape_variable_delimiters (const string& value, size_t begin_ix, size_t end_ix, string& escaped_value) { - for (size_t i = begin_ix; i < end_ix; ++i) { - auto c = value[i]; - - // Add escape character if necessary - if (enum_to_underlying_type(LogTypeDictionaryEntry::VarDelim::Integer) == c || - enum_to_underlying_type(LogTypeDictionaryEntry::VarDelim::Float) == c || - enum_to_underlying_type(LogTypeDictionaryEntry::VarDelim::Dictionary) == c || - cEscapeChar == c) { - escaped_value += cEscapeChar; - } - - // Add character - escaped_value += value[i]; - } -} - -size_t LogTypeDictionaryEntry::get_var_info (size_t var_ix, VarDelim& var_delim) const { +size_t LogTypeDictionaryEntry::get_var_info( + size_t var_ix, + ir::VariablePlaceholder& var_placeholder +) const { if (var_ix >= m_var_positions.size()) { return SIZE_MAX; } auto var_position = m_var_positions[var_ix]; - var_delim = (VarDelim)m_value[var_position]; + var_placeholder = static_cast(m_value[var_position]); return m_var_positions[var_ix]; } @@ -134,18 +107,18 @@ ErrorCode LogTypeDictionaryEntry::try_read_from_file (streaming_compression::Dec if (is_escaped) { constant += c; is_escaped = false; - } else if (cEscapeChar == c) { + } else if (ir::cVariablePlaceholderEscapeCharacter == c) { is_escaped = true; } else { - if (enum_to_underlying_type(LogTypeDictionaryEntry::VarDelim::Integer) == c) { + if (enum_to_underlying_type(ir::VariablePlaceholder::Integer) == c) { add_constant(constant, 0, constant.length()); constant.clear(); add_int_var(); - } else if (enum_to_underlying_type(LogTypeDictionaryEntry::VarDelim::Float) == c) { + } else if (enum_to_underlying_type(ir::VariablePlaceholder::Float) == c) { add_constant(constant, 0, constant.length()); constant.clear(); add_float_var(); - } else if (enum_to_underlying_type(LogTypeDictionaryEntry::VarDelim::Dictionary) == c) + } else if (enum_to_underlying_type(ir::VariablePlaceholder::Dictionary) == c) { add_constant(constant, 0, constant.length()); constant.clear(); @@ -171,21 +144,25 @@ void LogTypeDictionaryEntry::read_from_file (streaming_compression::Decompressor } void LogTypeDictionaryEntry::get_value_with_unfounded_variables_escaped (string& escaped_logtype_value) const { + auto value_view = static_cast(m_value); size_t begin_ix = 0; // Reset escaped value and reserve enough space to at least contain the whole value escaped_logtype_value.clear(); - escaped_logtype_value.reserve(m_value.length()); + escaped_logtype_value.reserve(value_view.length()); for (auto var_position : m_var_positions) { size_t end_ix = var_position; - escape_variable_delimiters(m_value, begin_ix, end_ix, escaped_logtype_value); + ir::escape_and_append_constant_to_logtype( + value_view.substr(begin_ix, end_ix - begin_ix), + escaped_logtype_value + ); - // Add variable delimiter - escaped_logtype_value += m_value[end_ix]; + // Add variable placeholder + escaped_logtype_value += value_view[end_ix]; // Move begin to start of next portion of logtype between variables begin_ix = end_ix + 1; } - // Escape any variable delimiters in remainder of value - escape_variable_delimiters(m_value, begin_ix, m_value.length(), escaped_logtype_value); + // Escape any variable placeholders in remainder of value + ir::escape_and_append_constant_to_logtype(value_view.substr(begin_ix), escaped_logtype_value); } diff --git a/components/core/src/LogTypeDictionaryEntry.hpp b/components/core/src/LogTypeDictionaryEntry.hpp index 85a2aabde..aade2fbad 100644 --- a/components/core/src/LogTypeDictionaryEntry.hpp +++ b/components/core/src/LogTypeDictionaryEntry.hpp @@ -9,6 +9,7 @@ #include "DictionaryEntry.hpp" #include "ErrorCode.hpp" #include "FileReader.hpp" +#include "ir/parsing.hpp" #include "streaming_compression/zstd/Compressor.hpp" #include "streaming_compression/zstd/Decompressor.hpp" #include "TraceableException.hpp" @@ -31,15 +32,6 @@ class LogTypeDictionaryEntry : public DictionaryEntry { } }; - // Constants - enum class VarDelim : char { - // NOTE: These values are used within logtypes to denote variables, so care must be taken when changing them - Integer = 0x11, - Dictionary = 0x12, - Float = 0x13, - Length = 3 - }; - // Constructors LogTypeDictionaryEntry () = default; // Use default copy constructor @@ -51,35 +43,35 @@ class LogTypeDictionaryEntry : public DictionaryEntry { // Methods /** - * Adds a dictionary variable delimiter to the given logtype + * Adds a dictionary variable placeholder to the given logtype * @param logtype */ static void add_dict_var (std::string& logtype) { - logtype += enum_to_underlying_type(VarDelim::Dictionary); + logtype += enum_to_underlying_type(ir::VariablePlaceholder::Dictionary); } /** - * Adds an integer variable delimiter to the given logtype + * Adds an integer variable placeholder to the given logtype * @param logtype */ static void add_int_var (std::string& logtype) { - logtype += enum_to_underlying_type(VarDelim::Integer); + logtype += enum_to_underlying_type(ir::VariablePlaceholder::Integer); } /** - * Adds a float variable delimiter to the given logtype + * Adds a float variable placeholder to the given logtype * @param logtype */ static void add_float_var (std::string& logtype) { - logtype += enum_to_underlying_type(VarDelim::Float); + logtype += enum_to_underlying_type(ir::VariablePlaceholder::Float); } size_t get_num_vars () const { return m_var_positions.size(); } /** * Gets all info about a variable in the logtype * @param var_ix The index of the variable to get the info for - * @param var_delim + * @param var_placeholder * @return The variable's position in the logtype, or SIZE_MAX if var_ix is out of bounds */ - size_t get_var_info (size_t var_ix, VarDelim& var_delim) const; + size_t get_var_info (size_t var_ix, ir::VariablePlaceholder& var_placeholder) const; /** * Gets the size (in-memory) of the data contained in this entry @@ -95,15 +87,15 @@ class LogTypeDictionaryEntry : public DictionaryEntry { */ void add_constant (const std::string& value_containing_constant, size_t begin_pos, size_t length); /** - * Adds an int variable delimiter + * Adds an int variable placeholder */ void add_int_var (); /** - * Adds a float variable delimiter + * Adds a float variable placeholder */ void add_float_var (); /** - * Adds a dictionary variable delimiter + * Adds a dictionary variable placeholder */ void add_dictionary_var (); @@ -147,7 +139,8 @@ class LogTypeDictionaryEntry : public DictionaryEntry { private: // Methods /** - * Escapes any variable delimiters that don't correspond to the positions of variables in the logtype entry's value + * Escapes any variable placeholders that don't correspond to the positions + * of variables in the logtype entry's value * @param escaped_logtype_value */ void get_value_with_unfounded_variables_escaped (std::string& escaped_logtype_value) const; diff --git a/components/core/src/ffi/encoding_methods.hpp b/components/core/src/ffi/encoding_methods.hpp index 6618db346..ec4a8ec37 100644 --- a/components/core/src/ffi/encoding_methods.hpp +++ b/components/core/src/ffi/encoding_methods.hpp @@ -51,10 +51,10 @@ static constexpr char cVariableEncodingMethodsVersion[] static constexpr char cVariablesSchemaVersion[] = "com.yscope.clp.VariablesSchemaV2"; static constexpr char cTooFewDictionaryVarsErrorMessage[] - = "There are fewer dictionary variables than dictionary variable delimiters in the " + = "There are fewer dictionary variables than dictionary variable placeholders in the " "logtype."; static constexpr char cTooFewEncodedVarsErrorMessage[] - = "There are fewer encoded variables than encoded variable delimiters in the logtype."; + = "There are fewer encoded variables than encoded variable placeholders in the logtype."; static constexpr char cUnexpectedEscapeCharacterMessage[] = "Unexpected escape character without escaped value at the end of the logtype."; diff --git a/components/core/src/ffi/ir_stream/encoding_methods.cpp b/components/core/src/ffi/ir_stream/encoding_methods.cpp index 2d637dbe3..ec427ab76 100644 --- a/components/core/src/ffi/ir_stream/encoding_methods.cpp +++ b/components/core/src/ffi/ir_stream/encoding_methods.cpp @@ -166,19 +166,7 @@ static void add_base_metadata_fields( } static bool append_constant_to_logtype(string_view constant, string& logtype) { - size_t begin_pos = 0; - auto constant_len = constant.length(); - for (size_t i = 0; i < constant_len; ++i) { - auto c = constant[i]; - if (ir::cVariablePlaceholderEscapeCharacter == c || ir::is_variable_placeholder(c)) { - logtype.append(constant, begin_pos, i - begin_pos); - logtype += ir::cVariablePlaceholderEscapeCharacter; - // NOTE: We don't need to append the character of interest - // immediately since the next constant copy operation will get it - begin_pos = i; - } - } - logtype.append(constant, begin_pos, constant_len - begin_pos); + ir::escape_and_append_constant_to_logtype(constant, logtype); return true; } diff --git a/components/core/src/ir/parsing.cpp b/components/core/src/ir/parsing.cpp index 6b9ebf08b..1fa59be34 100644 --- a/components/core/src/ir/parsing.cpp +++ b/components/core/src/ir/parsing.cpp @@ -34,11 +34,7 @@ bool is_var(std::string_view value) { } } -bool get_bounds_of_next_var( - string_view const str, - size_t& begin_pos, - size_t& end_pos -) { +bool get_bounds_of_next_var(string_view const str, size_t& begin_pos, size_t& end_pos) { auto const msg_length = str.length(); if (end_pos >= msg_length) { return false; @@ -90,4 +86,20 @@ bool get_bounds_of_next_var( return (msg_length != begin_pos); } + +void escape_and_append_constant_to_logtype(string_view constant, std::string& logtype) { + size_t begin_pos = 0; + auto constant_len = constant.length(); + for (size_t i = 0; i < constant_len; ++i) { + auto c = constant[i]; + if (cVariablePlaceholderEscapeCharacter == c || is_variable_placeholder(c)) { + logtype.append(constant, begin_pos, i - begin_pos); + logtype += ir::cVariablePlaceholderEscapeCharacter; + // NOTE: We don't need to append the character of interest + // immediately since the next constant copy operation will get it + begin_pos = i; + } + } + logtype.append(constant, begin_pos, constant_len - begin_pos); +} } // namespace ir diff --git a/components/core/src/ir/parsing.hpp b/components/core/src/ir/parsing.hpp index a5d447728..5ee93f340 100644 --- a/components/core/src/ir/parsing.hpp +++ b/components/core/src/ir/parsing.hpp @@ -78,11 +78,14 @@ bool is_var(std::string_view value); * next variable * @return true if a variable was found, false otherwise */ -bool get_bounds_of_next_var( - std::string_view str, - size_t& begin_pos, - size_t& end_pos -); +bool get_bounds_of_next_var(std::string_view str, size_t& begin_pos, size_t& end_pos); + +/** + * Appends the given constant to the logtype, escaping any variable placeholders + * @param constant + * @param logtype + */ +void escape_and_append_constant_to_logtype(std::string_view constant, std::string& logtype); } // namespace ir #endif // IR_PARSING_HPP diff --git a/components/core/src/utils/make_dictionaries_readable/README.md b/components/core/src/utils/make_dictionaries_readable/README.md index 4a232a285..c3d574ef6 100644 --- a/components/core/src/utils/make_dictionaries_readable/README.md +++ b/components/core/src/utils/make_dictionaries_readable/README.md @@ -4,6 +4,6 @@ For a dictionary, `make-dictionaries-readable` prints one entry per line. For log type dictionary entries, this requires making some characters printable: * Newlines are replaced with `\n` -* Dictionary variable delimiters are replaced with `\d` -* Non-dictionary integer variable delimiters are replaced with `\i` -* Non-dictionary float variable delimiters are replaced with `\f` +* Dictionary variable placeholders are replaced with `\d` +* Non-dictionary integer variable placeholders are replaced with `\i` +* Non-dictionary float variable placeholders are replaced with `\f` diff --git a/components/core/src/utils/make_dictionaries_readable/make-dictionaries-readable.cpp b/components/core/src/utils/make_dictionaries_readable/make-dictionaries-readable.cpp index a84e8e08a..d1785038f 100644 --- a/components/core/src/utils/make_dictionaries_readable/make-dictionaries-readable.cpp +++ b/components/core/src/utils/make_dictionaries_readable/make-dictionaries-readable.cpp @@ -10,6 +10,7 @@ // Project headers #include "../../FileWriter.hpp" +#include "../../ir/parsing.hpp" #include "../../LogTypeDictionaryReader.hpp" #include "../../spdlog_with_specializations.hpp" #include "../../streaming_archive/Constants.hpp" @@ -66,28 +67,28 @@ int main (int argc, const char* argv[]) { size_t constant_begin_pos = 0; for (size_t var_ix = 0; var_ix < entry.get_num_vars(); ++var_ix) { - LogTypeDictionaryEntry::VarDelim var_delim; - size_t var_pos = entry.get_var_info(var_ix, var_delim); + ir::VariablePlaceholder var_placeholder; + size_t var_pos = entry.get_var_info(var_ix, var_placeholder); // Add the constant that's between the last variable and this one, with newlines escaped human_readable_value.append(value, constant_begin_pos, var_pos - constant_begin_pos); - switch (var_delim) { - case LogTypeDictionaryEntry::VarDelim::Integer: + switch (var_placeholder) { + case ir::VariablePlaceholder::Integer: human_readable_value += "\\i"; break; - case LogTypeDictionaryEntry::VarDelim::Float: + case ir::VariablePlaceholder::Float: human_readable_value += "\\f"; break; - case LogTypeDictionaryEntry::VarDelim::Dictionary: + case ir::VariablePlaceholder::Dictionary: human_readable_value += "\\d"; break; default: SPDLOG_ERROR("Logtype '{}' contains unexpected variable placeholder 0x{:x}", - value, enum_to_underlying_type(var_delim)); + value, enum_to_underlying_type(var_placeholder)); return -1; } - // Move past the variable delimiter + // Move past the variable placeholder constant_begin_pos = var_pos + 1; } // Append remainder of value, if any diff --git a/components/core/tests/test-EncodedVariableInterpreter.cpp b/components/core/tests/test-EncodedVariableInterpreter.cpp index 9353e949a..c95971204 100644 --- a/components/core/tests/test-EncodedVariableInterpreter.cpp +++ b/components/core/tests/test-EncodedVariableInterpreter.cpp @@ -245,10 +245,10 @@ TEST_CASE("EncodedVariableInterpreter", "[EncodedVariableInterpreter]") { // Test var_ids is correctly populated size_t encoded_var_id_ix = 0; - LogTypeDictionaryEntry::VarDelim var_placeholder; + ir::VariablePlaceholder var_placeholder; for (auto var_ix = 0; var_ix < logtype_dict_entry.get_num_vars(); var_ix++) { std::ignore = logtype_dict_entry.get_var_info(var_ix, var_placeholder); - if (LogTypeDictionaryEntry::VarDelim::Dictionary == var_placeholder) { + if (ir::VariablePlaceholder::Dictionary == var_placeholder) { auto var = encoded_vars[var_ix]; REQUIRE(var_ids.size() > encoded_var_id_ix); REQUIRE(EncodedVariableInterpreter::decode_var_dict_id(var) == From 22dc8206063034a87c63a927644323f2f3e49ea1 Mon Sep 17 00:00:00 2001 From: kirkrodrigues <2454684+kirkrodrigues@users.noreply.github.com> Date: Sun, 17 Sep 2023 17:14:58 -0400 Subject: [PATCH 7/8] Apply suggestions from code review Co-authored-by: Lin Zhihao <59785146+LinZhihao-723@users.noreply.github.com> --- components/core/src/LogTypeDictionaryEntry.cpp | 3 +-- components/core/src/ir/parsing.cpp | 4 ++-- components/core/src/ir/parsing.hpp | 2 +- 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/components/core/src/LogTypeDictionaryEntry.cpp b/components/core/src/LogTypeDictionaryEntry.cpp index e8c89949a..1d6ec4a7c 100644 --- a/components/core/src/LogTypeDictionaryEntry.cpp +++ b/components/core/src/LogTypeDictionaryEntry.cpp @@ -118,8 +118,7 @@ ErrorCode LogTypeDictionaryEntry::try_read_from_file (streaming_compression::Dec add_constant(constant, 0, constant.length()); constant.clear(); add_float_var(); - } else if (enum_to_underlying_type(ir::VariablePlaceholder::Dictionary) == c) - { + } else if (enum_to_underlying_type(ir::VariablePlaceholder::Dictionary) == c) { add_constant(constant, 0, constant.length()); constant.clear(); add_dictionary_var(); diff --git a/components/core/src/ir/parsing.cpp b/components/core/src/ir/parsing.cpp index 1fa59be34..27678a418 100644 --- a/components/core/src/ir/parsing.cpp +++ b/components/core/src/ir/parsing.cpp @@ -36,7 +36,7 @@ bool is_var(std::string_view value) { bool get_bounds_of_next_var(string_view const str, size_t& begin_pos, size_t& end_pos) { auto const msg_length = str.length(); - if (end_pos >= msg_length) { + if (msg_length <= end_pos) { return false; } @@ -77,7 +77,7 @@ bool get_bounds_of_next_var(string_view const str, size_t& begin_pos, size_t& en // - it's directly preceded by '=' and contains an alphabet char, or // - it could be a multi-digit hex value if (contains_decimal_digit - || (begin_pos > 0 && '=' == str[begin_pos - 1] && contains_alphabet) + || (0 < begin_pos && '=' == str[begin_pos - 1] && contains_alphabet) || could_be_multi_digit_hex_value(variable)) { break; diff --git a/components/core/src/ir/parsing.hpp b/components/core/src/ir/parsing.hpp index 5ee93f340..d4a738366 100644 --- a/components/core/src/ir/parsing.hpp +++ b/components/core/src/ir/parsing.hpp @@ -49,7 +49,7 @@ inline bool could_be_multi_digit_hex_value(std::string_view str) { // NOTE: This is 1-2% faster than using std::all_of with the opposite // condition for (auto c : str) { - if (!(('a' <= c && c <= 'f') || ('A' <= c && c <= 'F') || ('0' <= c && c <= '9'))) { + if (false == (('a' <= c && c <= 'f') || ('A' <= c && c <= 'F') || ('0' <= c && c <= '9'))) { return false; } } From 10e339a55748d313c72c2fba99b2ec5668d63113 Mon Sep 17 00:00:00 2001 From: Kirk Rodrigues <2454684+kirkrodrigues@users.noreply.github.com> Date: Sun, 17 Sep 2023 17:15:43 -0400 Subject: [PATCH 8/8] Add suggestion from code review with formatting. --- components/core/src/ir/parsing.cpp | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/components/core/src/ir/parsing.cpp b/components/core/src/ir/parsing.cpp index 27678a418..940252671 100644 --- a/components/core/src/ir/parsing.cpp +++ b/components/core/src/ir/parsing.cpp @@ -11,10 +11,9 @@ namespace ir { * ranges of characters at a time instead of comparing individual characters */ bool is_delim(signed char c) { - return !( - '+' == c || ('-' <= c && c <= '.') || ('0' <= c && c <= '9') || ('A' <= c && c <= 'Z') - || '\\' == c || '_' == c || ('a' <= c && c <= 'z') - ); + return false + == ('+' == c || ('-' <= c && c <= '.') || ('0' <= c && c <= '9') + || ('A' <= c && c <= 'Z') || '\\' == c || '_' == c || ('a' <= c && c <= 'z')); } bool is_variable_placeholder(char c) {