diff --git a/components/core/CMakeLists.txt b/components/core/CMakeLists.txt
index a3d67162a..b82d07075 100644
--- a/components/core/CMakeLists.txt
+++ b/components/core/CMakeLists.txt
@@ -240,6 +240,8 @@ set(SOURCE_FILES_clp
         src/Profiler.hpp
         src/Query.cpp
         src/Query.hpp
+        src/QueryToken.cpp
+        src/QueryToken.hpp
         src/ReaderInterface.cpp
         src/ReaderInterface.hpp
         src/SQLiteDB.cpp
@@ -373,6 +375,8 @@ set(SOURCE_FILES_clg
         src/Profiler.hpp
         src/Query.cpp
         src/Query.hpp
+        src/QueryToken.cpp
+        src/QueryToken.hpp
         src/ReaderInterface.cpp
         src/ReaderInterface.hpp
         src/SQLiteDB.cpp
@@ -493,6 +497,8 @@ set(SOURCE_FILES_clo
         src/Profiler.hpp
         src/Query.cpp
         src/Query.hpp
+        src/QueryToken.cpp
+        src/QueryToken.hpp
         src/ReaderInterface.cpp
         src/ReaderInterface.hpp
         src/SQLiteDB.cpp
@@ -671,6 +677,8 @@ set(SOURCE_FILES_unitTest
         src/Profiler.hpp
         src/Query.cpp
         src/Query.hpp
+        src/QueryToken.cpp
+        src/QueryToken.hpp
         src/ReaderInterface.cpp
         src/ReaderInterface.hpp
         src/SQLiteDB.cpp
diff --git a/components/core/src/Grep.cpp b/components/core/src/Grep.cpp
index 9ad133e81..2e4ee98a0 100644
--- a/components/core/src/Grep.cpp
+++ b/components/core/src/Grep.cpp
@@ -3,9 +3,12 @@
 // C++ libraries
 #include

+// Log surgeon
+#include
+
 // Project headers
-#include "compressor_frontend/Constants.hpp"
 #include "EncodedVariableInterpreter.hpp"
+#include "QueryToken.hpp"
 #include "StringReader.hpp"
 #include "Utils.hpp"

@@ -22,215 +25,6 @@
 enum class SubQueryMatchabilityResult {
     SupercedesAllSubQueries // The subquery will cause all messages to be matched
 };

-// Class representing a token in a query. It is used to interpret a token in user's search string.
-class QueryToken {
-public:
-    // Constructors
-    QueryToken (const string& query_string, size_t begin_pos, size_t end_pos, bool is_var);
-
-    // Methods
-    bool cannot_convert_to_non_dict_var () const;
-    bool contains_wildcards () const;
-    bool has_greedy_wildcard_in_middle () const;
-    bool has_prefix_greedy_wildcard () const;
-    bool has_suffix_greedy_wildcard () const;
-    bool is_ambiguous_token () const;
-    bool is_float_var () const;
-    bool is_int_var () const;
-    bool is_var () const;
-    bool is_wildcard () const;
-
-    size_t get_begin_pos () const;
-    size_t get_end_pos () const;
-    const string& get_value () const;
-
-    bool change_to_next_possible_type ();
-
-private:
-    // Types
-    // Type for the purpose of generating different subqueries. E.g., if a token is of type
-    // DictOrIntVar, it would generate a different subquery than if it was of type Logtype.
-    enum class Type {
-        Wildcard,
-        // Ambiguous indicates the token can be more than one of the types listed below
-        Ambiguous,
-        Logtype,
-        DictionaryVar,
-        FloatVar,
-        IntVar
-    };
-
-    // Variables
-    bool m_cannot_convert_to_non_dict_var;
-    bool m_contains_wildcards;
-    bool m_has_greedy_wildcard_in_middle;
-    bool m_has_prefix_greedy_wildcard;
-    bool m_has_suffix_greedy_wildcard;
-
-    size_t m_begin_pos;
-    size_t m_end_pos;
-    string m_value;
-
-    // Type if variable has unambiguous type
-    Type m_type;
-    // Types if variable type is ambiguous
-    vector m_possible_types;
-    // Index of the current possible type selected for generating a subquery
-    size_t m_current_possible_type_ix;
-};
-
-QueryToken::QueryToken (const string& query_string, const size_t begin_pos, const size_t end_pos,
-                        const bool is_var) : m_current_possible_type_ix(0)
-{
-    m_begin_pos = begin_pos;
-    m_end_pos = end_pos;
-    m_value.assign(query_string, m_begin_pos, m_end_pos - m_begin_pos);
-
-    // Set wildcard booleans and determine type
-    if ("*" == m_value) {
-        m_has_prefix_greedy_wildcard = true;
-        m_has_suffix_greedy_wildcard = false;
-        m_has_greedy_wildcard_in_middle = false;
-        m_contains_wildcards = true;
-        m_type = Type::Wildcard;
-    } else {
-        m_has_prefix_greedy_wildcard = ('*' == m_value[0]);
-        m_has_suffix_greedy_wildcard = ('*' == m_value[m_value.length() - 1]);
-
-        m_has_greedy_wildcard_in_middle = false;
-        for (size_t i = 1; i < m_value.length() - 1; ++i) {
-            if ('*' == m_value[i]) {
-                m_has_greedy_wildcard_in_middle = true;
-                break;
-            }
-        }
-
-        m_contains_wildcards = (m_has_prefix_greedy_wildcard || m_has_suffix_greedy_wildcard ||
-                                m_has_greedy_wildcard_in_middle);
-
-        if (!is_var) {
-            if (!m_contains_wildcards) {
-                m_type = Type::Logtype;
-            } else {
-                m_type = Type::Ambiguous;
-                m_possible_types.push_back(Type::Logtype);
-                m_possible_types.push_back(Type::IntVar);
-                m_possible_types.push_back(Type::FloatVar);
-                m_possible_types.push_back(Type::DictionaryVar);
-            }
-        } else {
-            string value_without_wildcards = m_value;
-            if (m_has_prefix_greedy_wildcard) {
-                value_without_wildcards = value_without_wildcards.substr(1);
-            }
-            if (m_has_suffix_greedy_wildcard) {
-                value_without_wildcards.resize(value_without_wildcards.length() - 1);
-            }
-
-            encoded_variable_t encoded_var;
-            bool converts_to_non_dict_var = false;
-            if (EncodedVariableInterpreter::convert_string_to_representable_integer_var(
-                    value_without_wildcards, encoded_var) ||
-                EncodedVariableInterpreter::convert_string_to_representable_float_var(
-                    value_without_wildcards, encoded_var)) {
-                converts_to_non_dict_var = true;
-            }
-
-            if (!converts_to_non_dict_var) {
-                // Dictionary variable
-                m_type = Type::DictionaryVar;
-                m_cannot_convert_to_non_dict_var = true;
-            } else {
-                m_type = Type::Ambiguous;
-                m_possible_types.push_back(Type::IntVar);
-                m_possible_types.push_back(Type::FloatVar);
-                m_possible_types.push_back(Type::DictionaryVar);
-                m_cannot_convert_to_non_dict_var = false;
-            }
-        }
-    }
-}
-
-bool QueryToken::cannot_convert_to_non_dict_var () const {
-    return m_cannot_convert_to_non_dict_var;
-}
-
-bool QueryToken::contains_wildcards () const {
-    return m_contains_wildcards;
-}
-
-bool QueryToken::has_greedy_wildcard_in_middle () const {
-    return m_has_greedy_wildcard_in_middle;
-}
-
-bool QueryToken::has_prefix_greedy_wildcard () const {
-    return m_has_prefix_greedy_wildcard;
-}
-
-bool QueryToken::has_suffix_greedy_wildcard () const {
-    return m_has_suffix_greedy_wildcard;
-}
-
-bool QueryToken::is_ambiguous_token () const {
-    return Type::Ambiguous == m_type;
-}
m_type; -} - -bool QueryToken::is_float_var () const { - Type type; - if (Type::Ambiguous == m_type) { - type = m_possible_types[m_current_possible_type_ix]; - } else { - type = m_type; - } - return Type::FloatVar == type; -} - -bool QueryToken::is_int_var () const { - Type type; - if (Type::Ambiguous == m_type) { - type = m_possible_types[m_current_possible_type_ix]; - } else { - type = m_type; - } - return Type::IntVar == type; -} - -bool QueryToken::is_var () const { - Type type; - if (Type::Ambiguous == m_type) { - type = m_possible_types[m_current_possible_type_ix]; - } else { - type = m_type; - } - return (Type::IntVar == type || Type::FloatVar == type || Type::DictionaryVar == type); -} - -bool QueryToken::is_wildcard () const { - return Type::Wildcard == m_type; -} - -size_t QueryToken::get_begin_pos () const { - return m_begin_pos; -} - -size_t QueryToken::get_end_pos () const { - return m_end_pos; -} - -const string& QueryToken::get_value () const { - return m_value; -} - -bool QueryToken::change_to_next_possible_type () { - if (m_current_possible_type_ix < m_possible_types.size() - 1) { - ++m_current_possible_type_ix; - return true; - } else { - m_current_possible_type_ix = 0; - return false; - } -} - // Local prototypes /** * Process a QueryToken that is definitely a variable @@ -241,7 +35,12 @@ bool QueryToken::change_to_next_possible_type () { * @param logtype * @return true if this token might match a message, false otherwise */ -static bool process_var_token (const QueryToken& query_token, const Archive& archive, bool ignore_case, SubQuery& sub_query, string& logtype); +static bool process_var_token (const QueryToken& query_token, + const Archive& archive, + bool ignore_case, + SubQuery& sub_query, + string& logtype, + bool use_heuristic); /** * Finds a message matching the given query * @param query @@ -266,7 +65,8 @@ static bool find_matching_message (const Query& query, Archive& archive, const S static SubQueryMatchabilityResult generate_logtypes_and_vars_for_subquery (const Archive& archive, string& processed_search_string, vector& query_tokens, bool ignore_case, SubQuery& sub_query); -static bool process_var_token (const QueryToken& query_token, const Archive& archive, bool ignore_case, SubQuery& sub_query, string& logtype) { +static bool process_var_token (const QueryToken& query_token, const Archive& archive, + bool ignore_case, SubQuery& sub_query, string& logtype) { // Even though we may have a precise variable, we still fallback to decompressing to ensure that it is in the right place in the message sub_query.mark_wildcard_match_required(); @@ -331,8 +131,12 @@ static bool find_matching_message (const Query& query, Archive& archive, const S return true; } -SubQueryMatchabilityResult generate_logtypes_and_vars_for_subquery (const Archive& archive, string& processed_search_string, vector& query_tokens, - bool ignore_case, SubQuery& sub_query) +SubQueryMatchabilityResult generate_logtypes_and_vars_for_subquery (const Archive& archive, + string& processed_search_string, + vector& query_tokens, + bool ignore_case, + SubQuery& sub_query, + bool use_heuristic) { size_t last_token_end_pos = 0; string logtype; @@ -389,7 +193,7 @@ SubQueryMatchabilityResult generate_logtypes_and_vars_for_subquery (const Archiv } bool Grep::process_raw_query (const Archive& archive, const string& search_string, epochtime_t search_begin_ts, epochtime_t search_end_ts, bool ignore_case, - Query& query, compressor_frontend::lexers::ByteLexer& forward_lexer, 
+                              Query& query, log_surgeon::lexers::ByteLexer& forward_lexer, log_surgeon::lexers::ByteLexer& reverse_lexer,
                               bool use_heuristic)
 {
     // Set properties which require no processing
@@ -404,12 +208,6 @@ bool Grep::process_raw_query (const Archive& archive, const string& search_strin

     // Clean-up search string
     processed_search_string = clean_up_wildcard_search_string(processed_search_string);
-    query.set_search_string(processed_search_string);
-
-    // Replace non-greedy wildcards with greedy wildcards since we currently have no support for searching compressed files with non-greedy wildcards
-    std::replace(processed_search_string.begin(), processed_search_string.end(), '?', '*');
-    // Clean-up in case any instances of "?*" or "*?" were changed into "**"
-    processed_search_string = clean_up_wildcard_search_string(processed_search_string);

     // Split search_string into tokens with wildcards
     vector query_tokens;
@@ -417,13 +215,26 @@ bool Grep::process_raw_query (const Archive& archive, const string& search_strin
     size_t end_pos = 0;
     bool is_var;
     if (use_heuristic) {
+        query.set_search_string(processed_search_string);
+
+        // Replace non-greedy wildcards with greedy wildcards since we currently have no support for searching compressed files with non-greedy wildcards
+        std::replace(processed_search_string.begin(), processed_search_string.end(), '?', '*');
+        // Clean-up in case any instances of "?*" or "*?" were changed into "**"
+        processed_search_string = clean_up_wildcard_search_string(processed_search_string);
         while (get_bounds_of_next_potential_var(processed_search_string, begin_pos, end_pos, is_var)) {
             query_tokens.emplace_back(processed_search_string, begin_pos, end_pos, is_var);
         }
     } else {
-        while (get_bounds_of_next_potential_var(processed_search_string, begin_pos, end_pos, is_var, forward_lexer, reverse_lexer)) {
-            query_tokens.emplace_back(processed_search_string, begin_pos, end_pos, is_var);
+        std::string post_processed_search_string;
+        post_processed_search_string.reserve(processed_search_string.size());
+        while (get_bounds_of_next_potential_var(processed_search_string, begin_pos, end_pos,
+                                                is_var, forward_lexer, reverse_lexer,
+                                                post_processed_search_string)) {
+            query_tokens.emplace_back(post_processed_search_string, begin_pos,
+                                      end_pos, is_var);
         }
+        processed_search_string = post_processed_search_string;
+        query.set_search_string(processed_search_string);
     }

     // Get pointers to all ambiguous tokens. Exclude tokens with wildcards in the middle since we fall-back to decompression + wildcard matching for those.
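
> Editor's note: as context for the heuristic branch above, here is a minimal standalone sketch of the wildcard normalization it performs. `collapse_star_runs` is a hypothetical stand-in for CLP's `clean_up_wildcard_search_string`, assumed here to collapse adjacent `*` characters; the input string is likewise made up for illustration.

```cpp
#include <algorithm>
#include <iostream>
#include <string>

// Hypothetical stand-in for clean_up_wildcard_search_string; assumed here to
// collapse the "**" runs produced by the '?' -> '*' replacement
static std::string collapse_star_runs (std::string const& s) {
    std::string result;
    for (char c : s) {
        if ('*' == c && false == result.empty() && '*' == result.back()) {
            continue;  // Drop the repeated '*'
        }
        result += c;
    }
    return result;
}

int main () {
    std::string query = "var=?* done";
    // Non-greedy '?' wildcards become greedy '*' wildcards...
    std::replace(query.begin(), query.end(), '?', '*');
    // ...which can create "**" sequences that must be cleaned up again
    std::cout << collapse_star_runs(query) << '\n';  // Prints: var=* done
    return 0;
}
```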
@@ -447,7 +258,12 @@ bool Grep::process_raw_query (const Archive& archive, const string& search_strin
         sub_query.clear();

         // Compute logtypes and variables for query
-        auto matchability = generate_logtypes_and_vars_for_subquery(archive, processed_search_string, query_tokens, query.get_ignore_case(), sub_query);
+        auto matchability = generate_logtypes_and_vars_for_subquery(archive,
+                                                                    processed_search_string,
+                                                                    query_tokens,
+                                                                    query.get_ignore_case(),
+                                                                    sub_query,
+                                                                    use_heuristic);
         switch (matchability) {
             case SubQueryMatchabilityResult::SupercedesAllSubQueries:
                 // Clear all sub-queries since they will be superceded by this sub-query
@@ -477,7 +293,8 @@ bool Grep::process_raw_query (const Archive& archive, const string& search_strin
     return query.contains_sub_queries();
 }

-bool Grep::get_bounds_of_next_potential_var (const string& value, size_t& begin_pos, size_t& end_pos, bool& is_var) {
+bool Grep::get_bounds_of_next_potential_var (const string& value, size_t& begin_pos,
+                                             size_t& end_pos, bool& is_var) {
     const auto value_length = value.length();
     if (end_pos >= value_length) {
         return false;
@@ -589,9 +406,12 @@ bool Grep::get_bounds_of_next_potential_var (const string& value, size_t& begin_
     return (value_length != begin_pos);
 }

-bool
-Grep::get_bounds_of_next_potential_var (const string& value, size_t& begin_pos, size_t& end_pos, bool& is_var,
-                                        compressor_frontend::lexers::ByteLexer& forward_lexer, compressor_frontend::lexers::ByteLexer& reverse_lexer) {
+bool Grep::get_bounds_of_next_potential_var (const string& value, size_t& begin_pos,
+                                             size_t& end_pos, bool& is_var,
+                                             log_surgeon::lexers::ByteLexer& forward_lexer,
+                                             log_surgeon::lexers::ByteLexer& reverse_lexer,
+                                             string& post_processed_value) {
+
     const size_t value_length = value.length();
     if (end_pos >= value_length) {
         return false;
@@ -667,35 +487,51 @@ Grep::get_bounds_of_next_potential_var (const string& value, size_t& begin_pos,
             break;
         }
     }
+    SearchToken search_token;
     if (has_wildcard_in_middle || (has_prefix_wildcard && has_suffix_wildcard)) {
         // DO NOTHING
-    } else if (has_suffix_wildcard) { //asdsas*
-        StringReader stringReader;
-        stringReader.open(value.substr(begin_pos, end_pos - begin_pos - 1));
-        forward_lexer.reset(stringReader);
-        compressor_frontend::Token token = forward_lexer.scan_with_wildcard(value[end_pos - 1]);
-        if (token.m_type_ids->at(0) != (int) compressor_frontend::SymbolID::TokenUncaughtStringID &&
-            token.m_type_ids->at(0) != (int) compressor_frontend::SymbolID::TokenEndID) {
-            is_var = true;
-        }
-    } else if (has_prefix_wildcard) { // *asdas
-        std::string value_reverse = value.substr(begin_pos + 1, end_pos - begin_pos - 1);
-        std::reverse(value_reverse.begin(), value_reverse.end());
+    } else {
         StringReader stringReader;
-        stringReader.open(value_reverse);
-        reverse_lexer.reset(stringReader);
-        compressor_frontend::Token token = reverse_lexer.scan_with_wildcard(value[begin_pos]);
-        if (token.m_type_ids->at(0) != (int) compressor_frontend::SymbolID::TokenUncaughtStringID &&
-            token.m_type_ids->at(0) != (int)compressor_frontend::SymbolID::TokenEndID) {
-            is_var = true;
+        log_surgeon::Reader reader_wrapper{[&](char* buf, size_t count, size_t& read_to) -> log_surgeon::ErrorCode {
+            stringReader.read(buf, count, read_to);
+            if (read_to == 0) {
+                return log_surgeon::ErrorCode::EndOfFile;
+            }
+            return log_surgeon::ErrorCode::Success;
+        }};
+        log_surgeon::ParserInputBuffer parser_input_buffer;
+        if (has_suffix_wildcard) { //text*
+            /// TODO: this is way too convoluted, can't you just set the string as the buffer storage?
+            stringReader.open(value.substr(begin_pos, end_pos - begin_pos - 1));
+            parser_input_buffer.read_if_safe(reader_wrapper);
+            forward_lexer.reset();
+            forward_lexer.scan_with_wildcard(parser_input_buffer,
+                                             value[end_pos - 1],
+                                             search_token);
+        } else if (has_prefix_wildcard) { // *text
+            std::string value_reverse = value.substr(begin_pos + 1, end_pos - begin_pos - 1);
+            std::reverse(value_reverse.begin(), value_reverse.end());
+            stringReader.open(value_reverse);
+            parser_input_buffer.read_if_safe(reader_wrapper);
+            reverse_lexer.reset();
+            reverse_lexer.scan_with_wildcard(parser_input_buffer,
+                                             value[begin_pos],
+                                             search_token);
+        } else { // no wildcards
+            stringReader.open(value.substr(begin_pos, end_pos - begin_pos));
+            parser_input_buffer.read_if_safe(reader_wrapper);
+            forward_lexer.reset();
+            forward_lexer.scan(parser_input_buffer, search_token);
+            search_token.m_type_ids_set.insert(search_token.m_type_ids_ptr->at(0));
         }
-    } else { // no wildcards
-        StringReader stringReader;
-        stringReader.open(value.substr(begin_pos, end_pos - begin_pos));
-        forward_lexer.reset(stringReader);
-        compressor_frontend::Token token = forward_lexer.scan();
-        if (token.m_type_ids->at(0) != (int) compressor_frontend::SymbolID::TokenUncaughtStringID &&
-            token.m_type_ids->at(0) != (int) compressor_frontend::SymbolID::TokenEndID) {
+        if (search_token.m_type_ids_set.find((int)
+            log_surgeon::SymbolID::TokenUncaughtStringID) ==
+            search_token.m_type_ids_set.end() &&
+            search_token.m_type_ids_set.find((int)
+            log_surgeon::SymbolID::TokenEndID) ==
+            search_token.m_type_ids_set.end())
+        {
             is_var = true;
         }
     }
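
> Editor's note: the `log_surgeon::Reader` wrapper lambda above reappears, nearly verbatim, in `Utils.cpp` and `FileCompressor.cpp` below (the diff's own TODO notes "this wrapper is repeated a lot"). A hedged sketch of one way it could be factored out, assuming only what the diff shows — that `log_surgeon::Reader` is constructible from a callable with this signature and is movable; the helper name is hypothetical:

```cpp
#include <cstddef>
// Assumes the log_surgeon headers used elsewhere in this diff are available.

// Hypothetical helper (not part of this PR): builds the log_surgeon::Reader
// wrapper that the diff repeats for StringReader, FileReader, and
// ReaderInterface. Any type with a read(char*, size_t, size_t&) method works.
template <typename ReaderT>
log_surgeon::Reader make_log_surgeon_reader (ReaderT& reader) {
    return log_surgeon::Reader{
            [&reader] (char* buf, size_t count, size_t& read_to) -> log_surgeon::ErrorCode {
                reader.read(buf, count, read_to);
                if (0 == read_to) {
                    return log_surgeon::ErrorCode::EndOfFile;
                }
                return log_surgeon::ErrorCode::Success;
            }};
}
```

Note the wrapper captures the underlying reader by reference, so it must not outlive it.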
diff --git a/components/core/src/Grep.hpp b/components/core/src/Grep.hpp
index 68225eb1b..acb4a52cf 100644
--- a/components/core/src/Grep.hpp
+++ b/components/core/src/Grep.hpp
@@ -4,12 +4,14 @@
 // C++ libraries
 #include

+// Log surgeon
+#include
+
 // Project headers
 #include "Defs.h"
 #include "Query.hpp"
 #include "streaming_archive/reader/Archive.hpp"
 #include "streaming_archive/reader/File.hpp"
-#include "compressor_frontend/Lexer.hpp"

 class Grep {
@@ -37,8 +39,8 @@ class Grep {
      * @return true if query may match messages, false otherwise
      */
     static bool process_raw_query (const streaming_archive::reader::Archive& archive, const std::string& search_string, epochtime_t search_begin_ts,
-                                   epochtime_t search_end_ts, bool ignore_case, Query& query, compressor_frontend::lexers::ByteLexer& forward_lexer,
-                                   compressor_frontend::lexers::ByteLexer& reverse_lexer, bool use_heuristic);
+                                   epochtime_t search_end_ts, bool ignore_case, Query& query, log_surgeon::lexers::ByteLexer& forward_lexer,
+                                   log_surgeon::lexers::ByteLexer& reverse_lexer, bool use_heuristic);

     /**
      * Returns bounds of next potential variable (either a definite variable or a token with wildcards)
@@ -58,11 +60,17 @@ class Grep {
      * @param is_var Whether the token is definitely a variable
      * @param forward_lexer DFA for determining if input is in the schema
      * @param reverse_lexer DFA for determining if reverse of input is in the schema
+     * @param post_processed_string
      * @return true if another potential variable was found, false otherwise
      */
-    static bool get_bounds_of_next_potential_var (const std::string& value, size_t& begin_pos, size_t& end_pos, bool& is_var,
-                                                  compressor_frontend::lexers::ByteLexer& forward_lexer, compressor_frontend::lexers::ByteLexer& reverse_lexer);
-
+    static bool get_bounds_of_next_potential_var (const std::string& value, size_t& begin_pos,
+                                                  size_t& end_pos, bool& is_var,
+                                                  log_surgeon::lexers::ByteLexer& forward_lexer,
+                                                  log_surgeon::lexers::ByteLexer& reverse_lexer,
+                                                  std::string& post_processed_string);
     /**
      * Marks which sub-queries in each query are relevant to the given file
     * @param compressed_file
@@ -99,4 +107,14 @@ class Grep {
     static size_t search (const Query& query, size_t limit, streaming_archive::reader::Archive& archive, streaming_archive::reader::File& compressed_file);
 };
+
+/**
+ * Wraps the tokens normally returned from the log_surgeon lexer, storing the variable IDs of a
+ * search query's tokens in a set. This allows for optimized search performance.
+ */
+class SearchToken : public log_surgeon::Token {
+public:
+    std::set m_type_ids_set;
+};
+
 #endif // GREP_HPP
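
> Editor's note: `SearchToken` simply adds a set of candidate type IDs on top of `log_surgeon::Token`. A small illustrative predicate (hypothetical, not part of this PR) distilling the membership test used in `Grep.cpp` above:

```cpp
// A scanned token counts as a definite variable only if none of its recorded
// type IDs are the uncaught-string or end-of-input sentinels
bool is_definite_variable (SearchToken const& token) {
    auto const& ids = token.m_type_ids_set;
    return 0 == ids.count((int) log_surgeon::SymbolID::TokenUncaughtStringID)
           && 0 == ids.count((int) log_surgeon::SymbolID::TokenEndID);
}
```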
diff --git a/components/core/src/QueryToken.cpp b/components/core/src/QueryToken.cpp
new file mode 100644
index 000000000..6f6fc829b
--- /dev/null
+++ b/components/core/src/QueryToken.cpp
@@ -0,0 +1,158 @@
+#include "QueryToken.hpp"
+
+// Project headers
+#include "EncodedVariableInterpreter.hpp"
+
+using std::string;
+
+QueryToken::QueryToken (const string& query_string, const size_t begin_pos, const size_t end_pos,
+                        const bool is_var) : m_current_possible_type_ix(0)
+{
+    m_begin_pos = begin_pos;
+    m_end_pos = end_pos;
+    m_value.assign(query_string, m_begin_pos, m_end_pos - m_begin_pos);
+
+    // Set wildcard booleans and determine type
+    if ("*" == m_value) {
+        m_has_prefix_greedy_wildcard = true;
+        m_has_suffix_greedy_wildcard = false;
+        m_has_greedy_wildcard_in_middle = false;
+        m_contains_wildcards = true;
+        m_type = Type::Wildcard;
+    } else {
+        m_has_prefix_greedy_wildcard = ('*' == m_value[0]);
+        m_has_suffix_greedy_wildcard = ('*' == m_value[m_value.length() - 1]);
+
+        m_has_greedy_wildcard_in_middle = false;
+        for (size_t i = 1; i < m_value.length() - 1; ++i) {
+            if ('*' == m_value[i]) {
+                m_has_greedy_wildcard_in_middle = true;
+                break;
+            }
+        }
+
+        m_contains_wildcards = (m_has_prefix_greedy_wildcard || m_has_suffix_greedy_wildcard ||
+                                m_has_greedy_wildcard_in_middle);
+
+        if (!is_var) {
+            if (!m_contains_wildcards) {
+                m_type = Type::Logtype;
+            } else {
+                m_type = Type::Ambiguous;
+                m_possible_types.push_back(Type::Logtype);
+                m_possible_types.push_back(Type::IntVar);
+                m_possible_types.push_back(Type::FloatVar);
+                m_possible_types.push_back(Type::DictionaryVar);
+            }
+        } else {
+            string value_without_wildcards = m_value;
+            if (m_has_prefix_greedy_wildcard) {
+                value_without_wildcards = value_without_wildcards.substr(1);
+            }
+            if (m_has_suffix_greedy_wildcard) {
+                value_without_wildcards.resize(value_without_wildcards.length() - 1);
+            }
+
+            encoded_variable_t encoded_var;
+            bool converts_to_non_dict_var = false;
+            if (EncodedVariableInterpreter::convert_string_to_representable_integer_var(
+                    value_without_wildcards, encoded_var) ||
+                EncodedVariableInterpreter::convert_string_to_representable_float_var(
+                    value_without_wildcards, encoded_var)) {
+                converts_to_non_dict_var = true;
+            }
+
+            if (!converts_to_non_dict_var) {
+                // Dictionary variable
+                m_type = Type::DictionaryVar;
+                m_cannot_convert_to_non_dict_var = true;
+            } else {
+                m_type = Type::Ambiguous;
+                m_possible_types.push_back(Type::IntVar);
+                m_possible_types.push_back(Type::FloatVar);
+                m_possible_types.push_back(Type::DictionaryVar);
+                m_cannot_convert_to_non_dict_var = false;
+            }
+        }
+    }
+}
+
+bool QueryToken::cannot_convert_to_non_dict_var () const {
+    return m_cannot_convert_to_non_dict_var;
+}
+
+bool QueryToken::contains_wildcards () const {
+    return m_contains_wildcards;
+}
+
+bool QueryToken::has_greedy_wildcard_in_middle () const {
+    return m_has_greedy_wildcard_in_middle;
+}
+
+bool QueryToken::has_prefix_greedy_wildcard () const {
+    return m_has_prefix_greedy_wildcard;
+}
+
+bool QueryToken::has_suffix_greedy_wildcard () const {
+    return m_has_suffix_greedy_wildcard;
+}
+
+bool QueryToken::is_ambiguous_token () const {
+    return Type::Ambiguous == m_type;
+}
+
+bool QueryToken::is_float_var () const {
+    Type type;
+    if (Type::Ambiguous == m_type) {
+        type = m_possible_types[m_current_possible_type_ix];
+    } else {
+        type = m_type;
+    }
+    return Type::FloatVar == type;
+}
+
+bool QueryToken::is_int_var () const {
+    Type type;
+    if (Type::Ambiguous == m_type) {
+        type = m_possible_types[m_current_possible_type_ix];
+    } else {
+        type = m_type;
+    }
+    return Type::IntVar == type;
+}
+
+bool QueryToken::is_var () const {
+    Type type;
+    if (Type::Ambiguous == m_type) {
+        type = m_possible_types[m_current_possible_type_ix];
+    } else {
+        type = m_type;
+    }
+    return (Type::IntVar == type || Type::FloatVar == type || Type::DictionaryVar == type);
+}
+
+bool QueryToken::is_wildcard () const {
+    return Type::Wildcard == m_type;
+}
+
+size_t QueryToken::get_begin_pos () const {
+    return m_begin_pos;
+}
+
+size_t QueryToken::get_end_pos () const {
+    return m_end_pos;
+}
+
+const string& QueryToken::get_value () const {
+    return m_value;
+}
+
+bool QueryToken::change_to_next_possible_type () {
+    if (m_current_possible_type_ix < m_possible_types.size() - 1) {
+        ++m_current_possible_type_ix;
+        return true;
+    } else {
+        m_current_possible_type_ix = 0;
+        return false;
+    }
+}
diff --git a/components/core/src/QueryToken.hpp b/components/core/src/QueryToken.hpp
new file mode 100644
index 000000000..450413fd0
--- /dev/null
+++ b/components/core/src/QueryToken.hpp
@@ -0,0 +1,72 @@
+#ifndef QUERY_TOKEN_HPP
+#define QUERY_TOKEN_HPP
+
+// C++ standard libraries
+#include
+#include
+
+// Project headers
+#include "Query.hpp"
+#include "TraceableException.hpp"
+#include "VariableDictionaryReader.hpp"
+#include "VariableDictionaryWriter.hpp"
+
+// Class representing a token in a query. It is used to interpret a token in user's search string.
+class QueryToken {
+public:
+    // Constructors
+    QueryToken (const std::string& query_string, size_t begin_pos, size_t end_pos, bool is_var);
+
+    // Methods
+    bool cannot_convert_to_non_dict_var () const;
+    bool contains_wildcards () const;
+    bool has_greedy_wildcard_in_middle () const;
+    bool has_prefix_greedy_wildcard () const;
+    bool has_suffix_greedy_wildcard () const;
+    bool is_ambiguous_token () const;
+    bool is_float_var () const;
+    bool is_int_var () const;
+    bool is_var () const;
+    bool is_wildcard () const;
+
+    size_t get_begin_pos () const;
+    size_t get_end_pos () const;
+    const std::string& get_value () const;
+
+    bool change_to_next_possible_type ();
+
+private:
+    // Types
+    // Type for the purpose of generating different subqueries. E.g., if a token is of type
+    // DictOrIntVar, it would generate a different subquery than if it was of type Logtype.
+    enum class Type {
+        Wildcard,
+        // Ambiguous indicates the token can be more than one of the types listed below
+        Ambiguous,
+        Logtype,
+        DictionaryVar,
+        FloatVar,
+        IntVar
+    };
+
+    // Variables
+    bool m_cannot_convert_to_non_dict_var;
+    bool m_contains_wildcards;
+    bool m_has_greedy_wildcard_in_middle;
+    bool m_has_prefix_greedy_wildcard;
+    bool m_has_suffix_greedy_wildcard;
+
+    size_t m_begin_pos;
+    size_t m_end_pos;
+    std::string m_value;
+
+    // Type if variable has unambiguous type
+    Type m_type;
+    // Types if variable type is ambiguous
+    std::vector m_possible_types;
+    // Index of the current possible type selected for generating a subquery
+    size_t m_current_possible_type_ix;
+};
+
+#endif // QUERY_TOKEN_HPP
\ No newline at end of file
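
> Editor's note: the `Type` comment explains that each candidate type of an ambiguous token generates a different subquery. A hedged sketch (hypothetical driver, not part of this PR) of how the documented `change_to_next_possible_type()` contract supports enumerating every combination of candidate types, odometer-style:

```cpp
#include <vector>

#include "QueryToken.hpp"

// Visits every combination of possible types across the ambiguous tokens.
// Relies only on the documented contract of change_to_next_possible_type():
// it advances to the next candidate type and returns false once it wraps
// back around to the first one.
void for_each_type_combination (std::vector<QueryToken*>& ambiguous_tokens) {
    while (true) {
        // ... generate one subquery for the current combination of types ...
        bool advanced = false;
        for (auto* token : ambiguous_tokens) {
            if (token->change_to_next_possible_type()) {
                advanced = true;  // This token moved to its next candidate type
                break;
            }
            // This token wrapped around; carry over to the next token
        }
        if (false == advanced) {
            break;  // Every token wrapped around: all combinations were visited
        }
    }
}
```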
diff --git a/components/core/src/Utils.cpp b/components/core/src/Utils.cpp
index 328cdfd4c..520a3b64f 100644
--- a/components/core/src/Utils.cpp
+++ b/components/core/src/Utils.cpp
@@ -17,6 +17,9 @@
 // spdlog
 #include

+// Log surgeon
+#include
+
 // Project headers
 #include "string_utils.hpp"

@@ -215,3 +218,124 @@ ErrorCode read_list_of_paths (const string& list_path, vector& paths) {

     return ErrorCode_Success;
 }
+
+void load_lexer_from_file (std::string schema_file_path,
+                           bool reverse,
+                           log_surgeon::lexers::ByteLexer& lexer) {
+    FileReader schema_reader;
+    schema_reader.try_open(schema_file_path);
+    /// TODO: this wrapper is repeated a lot
+    log_surgeon::Reader reader_wrapper{[&](char* buf, size_t count, size_t& read_to) -> log_surgeon::ErrorCode {
+        schema_reader.read(buf, count, read_to);
+        if (read_to == 0) {
+            return log_surgeon::ErrorCode::EndOfFile;
+        }
+        return log_surgeon::ErrorCode::Success;
+    }};
+    log_surgeon::SchemaParser sp;
+    std::unique_ptr schema_ast = sp.generate_schema_ast(reader_wrapper);
+    auto* delimiters_ptr = dynamic_cast(
+            schema_ast->m_delimiters.get());
+    if (!lexer.m_symbol_id.empty()) {
+        throw std::runtime_error("Error: symbol_ids initialized before setting enum symbol_ids");
+    }
+    /// TODO: this is a copy of other code
+    lexer.m_symbol_id[log_surgeon::cTokenEnd] = (int) log_surgeon::SymbolID::TokenEndID;
+    lexer.m_symbol_id[log_surgeon::cTokenUncaughtString] =
+            (int) log_surgeon::SymbolID::TokenUncaughtStringID;
+    lexer.m_symbol_id[log_surgeon::cTokenInt] = (int) log_surgeon::SymbolID::TokenIntId;
+    lexer.m_symbol_id[log_surgeon::cTokenFloat] = (int) log_surgeon::SymbolID::TokenFloatId;
+    lexer.m_symbol_id[log_surgeon::cTokenFirstTimestamp] = (int) log_surgeon::SymbolID::TokenFirstTimestampId;
+    lexer.m_symbol_id[log_surgeon::cTokenNewlineTimestamp] = (int) log_surgeon::SymbolID::TokenNewlineTimestampId;
+    lexer.m_symbol_id[log_surgeon::cTokenNewline] = (int) log_surgeon::SymbolID::TokenNewlineId;
+
+    lexer.m_id_symbol[(int) log_surgeon::SymbolID::TokenEndID] = log_surgeon::cTokenEnd;
+    lexer.m_id_symbol[(int) log_surgeon::SymbolID::TokenUncaughtStringID] =
+            log_surgeon::cTokenUncaughtString;
+    lexer.m_id_symbol[(int) log_surgeon::SymbolID::TokenIntId] = log_surgeon::cTokenInt;
+    lexer.m_id_symbol[(int) log_surgeon::SymbolID::TokenFloatId] = log_surgeon::cTokenFloat;
+    lexer.m_id_symbol[(int) log_surgeon::SymbolID::TokenFirstTimestampId] =
+            log_surgeon::cTokenFirstTimestamp;
+    lexer.m_id_symbol[(int) log_surgeon::SymbolID::TokenNewlineTimestampId] =
+            log_surgeon::cTokenNewlineTimestamp;
+    lexer.m_id_symbol[(int) log_surgeon::SymbolID::TokenNewlineId] = log_surgeon::cTokenNewline;
+
+    /// TODO: figure out why this needs to be specially added
+    lexer.add_rule(lexer.m_symbol_id["newLine"],
+                   std::move(std::make_unique>(
+                           log_surgeon::finite_automata::RegexASTLiteral<
+                                   log_surgeon::finite_automata::RegexNFAByteState>('\n'))));
+
+    if (delimiters_ptr != nullptr) {
+        lexer.add_delimiters(delimiters_ptr->m_delimiters);
+    }
+    for (std::unique_ptr const& parser_ast: schema_ast->m_schema_vars) {
+        auto* rule = dynamic_cast(parser_ast.get());
+
+        if ("timestamp" == rule->m_name) {
+            continue;
+        }
+
+        if (lexer.m_symbol_id.find(rule->m_name) == lexer.m_symbol_id.end()) {
+            lexer.m_symbol_id[rule->m_name] = lexer.m_symbol_id.size();
+            lexer.m_id_symbol[lexer.m_symbol_id[rule->m_name]] = rule->m_name;
+        }
+
+        // transform '.' from any-character into any non-delimiter character
+        rule->m_regex_ptr->remove_delimiters_from_wildcard(delimiters_ptr->m_delimiters);
+
+        /// TODO: this error function is a copy
+        // currently, error out if non-timestamp pattern contains a delimiter
+        // check if regex contains a delimiter
+        bool is_possible_input[log_surgeon::cUnicodeMax] = {false};
+        rule->m_regex_ptr->set_possible_inputs_to_true(is_possible_input);
+        bool contains_delimiter = false;
+        uint32_t delimiter_name;
+        for (uint32_t delimiter: delimiters_ptr->m_delimiters) {
+            if (is_possible_input[delimiter]) {
+                contains_delimiter = true;
+                delimiter_name = delimiter;
+                break;
+            }
+        }
+        if (contains_delimiter) {
+            FileReader schema_reader;
+            ErrorCode error_code = schema_reader.try_open(schema_ast->m_file_path);
+            if (ErrorCode_Success != error_code) {
+                throw std::runtime_error(schema_file_path + ":" + std::to_string(rule->m_line_num + 1) + ": error: '" + rule->m_name
+                                         + "' has regex pattern which contains delimiter '" + char(delimiter_name) + "'.\n");
+            } else {
+                // more detailed debugging based on looking at the file
+                string line;
+                for (uint32_t i = 0; i <= rule->m_line_num; i++) {
+                    schema_reader.read_to_delimiter('\n', false, false, line);
+                }
+                int colon_pos = 0;
+                for (char i : line) {
+                    colon_pos++;
+                    if (i == ':') {
+                        break;
+                    }
+                }
+                string indent(10, ' ');
+                string spaces(colon_pos, ' ');
+                string arrows(line.size() - colon_pos, '^');
+
+                throw std::runtime_error(schema_file_path + ":" + std::to_string(rule->m_line_num + 1) + ": error: '" + rule->m_name
+                                         + "' has regex pattern which contains delimiter '" + char(delimiter_name) + "'.\n"
+                                         + indent + line + "\n" + indent + spaces + arrows + "\n");
+
+            }
+        }
+
+        lexer.add_rule(lexer.m_symbol_id[rule->m_name], std::move(rule->m_regex_ptr));
+    }
+    if (reverse) {
+        lexer.generate_reverse();
+    } else {
+        lexer.generate();
+    }
+
+    schema_reader.close();
+}
diff --git a/components/core/src/Utils.hpp b/components/core/src/Utils.hpp
index 6f8b843f3..8f3aa903d 100644
--- a/components/core/src/Utils.hpp
+++ b/components/core/src/Utils.hpp
@@ -8,6 +8,9 @@
 #include
 #include

+// Log surgeon
+#include
+
 // Project headers
 #include "Defs.h"
 #include "ErrorCode.hpp"
@@ -108,4 +111,14 @@ std::string get_unambiguous_path (const std::string& path);
  */
 ErrorCode read_list_of_paths (const std::string& list_path, std::vector& paths);

+/**
+ * Loads a lexer from a file
+ * @param schema_file_path
+ * @param reverse Whether to generate the reverse-direction DFA
+ * @param lexer
+ */
+void load_lexer_from_file (std::string schema_file_path,
+                           bool reverse,
+                           log_surgeon::lexers::ByteLexer& lexer);
+
 #endif // UTILS_HPP
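
> Editor's note: a usage sketch for the declaration above, mirroring the calling convention visible in `clo.cpp` later in this diff — the same schema file is loaded twice, once to build the forward DFA and once (with `reverse == true`) to build the DFA that scans tokens from their last character backwards. The wrapper function name is hypothetical:

```cpp
#include <memory>
#include <string>

#include "Utils.hpp"

// Builds both search lexers from one schema file
void load_search_lexers (std::string const& schema_file_path,
                         std::unique_ptr<log_surgeon::lexers::ByteLexer>& forward_lexer,
                         std::unique_ptr<log_surgeon::lexers::ByteLexer>& reverse_lexer) {
    forward_lexer = std::make_unique<log_surgeon::lexers::ByteLexer>();
    load_lexer_from_file(schema_file_path, false, *forward_lexer);
    reverse_lexer = std::make_unique<log_surgeon::lexers::ByteLexer>();
    load_lexer_from_file(schema_file_path, true, *reverse_lexer);
}
```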
#include "../Defs.h" -#include "../compressor_frontend/utils.hpp" #include "../Grep.hpp" #include "../GlobalMySQLMetadataDB.hpp" #include "../GlobalSQLiteMetadataDB.hpp" #include "../Profiler.hpp" #include "../streaming_archive/Constants.hpp" +#include "../Utils.hpp" #include "CommandLineArguments.hpp" using clg::CommandLineArguments; -using compressor_frontend::load_lexer_from_file; using std::cout; using std::cerr; using std::endl; @@ -132,7 +134,7 @@ static bool open_archive (const string& archive_path, Archive& archive_reader) { } static bool search (const vector& search_strings, CommandLineArguments& command_line_args, Archive& archive, - compressor_frontend::lexers::ByteLexer& forward_lexer, compressor_frontend::lexers::ByteLexer& reverse_lexer, bool use_heuristic) { + log_surgeon::lexers::ByteLexer& forward_lexer, log_surgeon::lexers::ByteLexer& reverse_lexer, bool use_heuristic) { ErrorCode error_code; auto search_begin_ts = command_line_args.get_search_begin_ts(); auto search_end_ts = command_line_args.get_search_end_ts(); @@ -388,12 +390,12 @@ int main (int argc, const char* argv[]) { /// TODO: if performance is too slow, can make this more efficient by only diffing files with the same checksum const uint32_t max_map_schema_length = 100000; - std::map forward_lexer_map; - std::map reverse_lexer_map; - compressor_frontend::lexers::ByteLexer one_time_use_forward_lexer; - compressor_frontend::lexers::ByteLexer one_time_use_reverse_lexer; - compressor_frontend::lexers::ByteLexer* forward_lexer_ptr; - compressor_frontend::lexers::ByteLexer* reverse_lexer_ptr; + std::map forward_lexer_map; + std::map reverse_lexer_map; + log_surgeon::lexers::ByteLexer one_time_use_forward_lexer; + log_surgeon::lexers::ByteLexer one_time_use_reverse_lexer; + log_surgeon::lexers::ByteLexer* forward_lexer_ptr; + log_surgeon::lexers::ByteLexer* reverse_lexer_ptr; string archive_id; Archive archive_reader; @@ -431,12 +433,12 @@ int main (int argc, const char* argv[]) { // if there is a chance there might be a difference make a new lexer as it's pretty fast to create if (forward_lexer_map_it == forward_lexer_map.end()) { // Create forward lexer - auto insert_result = forward_lexer_map.emplace(buf, compressor_frontend::lexers::ByteLexer()); + auto insert_result = forward_lexer_map.emplace(buf, log_surgeon::lexers::ByteLexer()); forward_lexer_ptr = &insert_result.first->second; load_lexer_from_file(schema_file_path, false, *forward_lexer_ptr); // Create reverse lexer - insert_result = reverse_lexer_map.emplace(buf, compressor_frontend::lexers::ByteLexer()); + insert_result = reverse_lexer_map.emplace(buf, log_surgeon::lexers::ByteLexer()); reverse_lexer_ptr = &insert_result.first->second; load_lexer_from_file(schema_file_path, true, *reverse_lexer_ptr); } else { diff --git a/components/core/src/clo/clo.cpp b/components/core/src/clo/clo.cpp index 6f1a2d135..ff76737d0 100644 --- a/components/core/src/clo/clo.cpp +++ b/components/core/src/clo/clo.cpp @@ -17,7 +17,6 @@ // Project headers #include "../Defs.h" -#include "../compressor_frontend/utils.hpp" #include "../Grep.hpp" #include "../Profiler.hpp" #include "../networking/socket_utils.hpp" @@ -27,7 +26,6 @@ #include "ControllerMonitoringThread.hpp" using clo::CommandLineArguments; -using compressor_frontend::load_lexer_from_file; using std::cout; using std::cerr; using std::endl; @@ -204,16 +202,16 @@ static bool search_archive (const CommandLineArguments& command_line_args, const // Load lexers from schema file if it exists auto schema_file_path = archive_path 
-    unique_ptr forward_lexer, reverse_lexer;
+    unique_ptr forward_lexer, reverse_lexer;
     bool use_heuristic = true;
     if (boost::filesystem::exists(schema_file_path)) {
         use_heuristic = false;
         // Create forward lexer
-        forward_lexer.reset(new compressor_frontend::lexers::ByteLexer());
+        forward_lexer.reset(new log_surgeon::lexers::ByteLexer());
         load_lexer_from_file(schema_file_path.string(), false, *forward_lexer);

         // Create reverse lexer
-        reverse_lexer.reset(new compressor_frontend::lexers::ByteLexer());
+        reverse_lexer.reset(new log_surgeon::lexers::ByteLexer());
         load_lexer_from_file(schema_file_path.string(), true, *reverse_lexer);
     }
diff --git a/components/core/src/clp/FileCompressor.cpp b/components/core/src/clp/FileCompressor.cpp
index e75382d2b..45204fbed 100644
--- a/components/core/src/clp/FileCompressor.cpp
+++ b/components/core/src/clp/FileCompressor.cpp
@@ -11,10 +11,18 @@
 // libarchive
 #include

+// Log surgeon
+#include
+#include
+
 // Project headers
 #include "../Profiler.hpp"
 #include "utils.hpp"

+using log_surgeon::LogEventView;
+using log_surgeon::Reader;
+using log_surgeon::ReaderParser;
 using std::cout;
 using std::endl;
 using std::set;
@@ -104,9 +112,11 @@ namespace clp {
                                               file_to_compress.get_path_for_compression(), file_to_compress.get_group_id(), archive_writer,
                                               m_file_reader);
             } else {
-                parse_and_encode(target_data_size_of_dicts, archive_user_config, target_encoded_file_size,
-                                 file_to_compress.get_path_for_compression(),
-                                 file_to_compress.get_group_id(), archive_writer, m_file_reader);
+                parse_and_encode_with_library(target_data_size_of_dicts, archive_user_config,
+                                              target_encoded_file_size,
+                                              file_to_compress.get_path_for_compression(),
+                                              file_to_compress.get_group_id(), archive_writer,
+                                              m_file_reader);
             }
         } else {
             if (false == try_compressing_as_archive(target_data_size_of_dicts, archive_user_config, target_encoded_file_size, file_to_compress,
@@ -125,9 +135,11 @@ namespace clp {
         return succeeded;
     }

-    void FileCompressor::parse_and_encode (size_t target_data_size_of_dicts, streaming_archive::writer::Archive::UserConfig& archive_user_config,
-                                           size_t target_encoded_file_size, const string& path_for_compression, group_id_t group_id,
-                                           streaming_archive::writer::Archive& archive_writer, ReaderInterface& reader)
+    void FileCompressor::parse_and_encode_with_library (size_t target_data_size_of_dicts,
+                                                        streaming_archive::writer::Archive::UserConfig& archive_user_config,
+                                                        size_t target_encoded_file_size, const string& path_for_compression,
+                                                        group_id_t group_id, streaming_archive::writer::Archive& archive_writer,
+                                                        ReaderInterface& reader)
     {
         archive_writer.m_target_data_size_of_dicts = target_data_size_of_dicts;
         archive_writer.m_archive_user_config = archive_user_config;
@@ -136,30 +148,30 @@ namespace clp {
         archive_writer.m_target_encoded_file_size = target_encoded_file_size;
         // Open compressed file
         archive_writer.create_and_open_file(path_for_compression, group_id, m_uuid_generator(), 0);
-        // TODO: decide what to actually do about this
-        // for now reset reader rather than try reading m_utf8_validation_buf as it would be
-        // very awkward to combine sources to/in the parser
+        /// TODO: Add the m_utf8_validation_buf into the start of the input buffer
         reader.seek_from_begin(0);
-        m_log_parser->set_archive_writer_ptr(&archive_writer);
-        m_log_parser->get_archive_writer_ptr()->old_ts_pattern.clear();
-        try {
-            m_log_parser->parse(reader);
-        } catch (std::string const err) {
-            if (err.find("Lexer failed to find a match after checking entire buffer") != std::string::npos) {
-                close_file_and_append_to_segment(archive_writer);
-                SPDLOG_ERROR(err);
-            } else {
-                throw (err);
+        archive_writer.m_old_ts_pattern.clear();
+        archive_writer.m_timestamp_set = false;
+        Reader reader_wrapper{[&](char* buf, size_t count, size_t& read_to) -> log_surgeon::ErrorCode {
+            reader.read(buf, count, read_to);
+            if (read_to == 0) {
+                return log_surgeon::ErrorCode::EndOfFile;
+            }
+            return log_surgeon::ErrorCode::Success;
+        }};
+        m_reader_parser->reset_and_set_reader(reader_wrapper);
+        static LogEventView log_view{&m_reader_parser->get_log_parser()};
+        while (false == m_reader_parser->done()) {
+            if (log_surgeon::ErrorCode err{m_reader_parser->get_next_event_view(log_view)};
+                log_surgeon::ErrorCode::Success != err) {
+                SPDLOG_ERROR("Parsing Failed");
+                throw (std::runtime_error("Parsing Failed"));
             }
+            archive_writer.write_msg_using_schema(log_view);
         }
-        // TODO: separate variables from static text
-        //Stopwatch close_file_watch("close_file_watch");
-        //close_file_watch.start();
         close_file_and_append_to_segment(archive_writer);
         // archive_writer_config needs to persist between files
         archive_user_config = archive_writer.m_archive_user_config;
-        //close_file_watch.stop();
-        //close_file_watch.print();
     }

     void FileCompressor::parse_and_encode_with_heuristic (size_t target_data_size_of_dicts, streaming_archive::writer::Archive::UserConfig& archive_user_config,
@@ -279,8 +291,11 @@ namespace clp {
                                               boost_path_for_compression.string(), file_to_compress.get_group_id(), archive_writer,
                                               m_libarchive_file_reader);
             } else {
-                parse_and_encode(target_data_size_of_dicts, archive_user_config, target_encoded_file_size, boost_path_for_compression.string(),
-                                 file_to_compress.get_group_id(), archive_writer, m_libarchive_file_reader);
+                parse_and_encode_with_library(target_data_size_of_dicts, archive_user_config,
+                                              target_encoded_file_size,
+                                              boost_path_for_compression.string(),
+                                              file_to_compress.get_group_id(), archive_writer,
+                                              m_libarchive_file_reader);
             }
         } else {
             SPDLOG_ERROR("Cannot compress {} - not UTF-8 encoded.", m_libarchive_reader.get_path());
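
> Editor's note: a condensed, standalone sketch of the parse loop introduced above, assuming only the log_surgeon calls visible in this diff (`reset_and_set_reader`, `done`, `get_next_event_view`, `get_log_parser`). The function is hypothetical; note the event view here is a plain local, whereas the diff uses a `static LogEventView` shared across all files, which is worth revisiting:

```cpp
#include <stdexcept>

// Pulls parsed log events out of the ReaderParser one view at a time and
// hands each one to the archive writer
void parse_all_events (log_surgeon::ReaderParser& reader_parser,
                       log_surgeon::Reader& reader_wrapper,
                       streaming_archive::writer::Archive& archive_writer) {
    reader_parser.reset_and_set_reader(reader_wrapper);
    log_surgeon::LogEventView log_view{&reader_parser.get_log_parser()};
    while (false == reader_parser.done()) {
        if (log_surgeon::ErrorCode err{reader_parser.get_next_event_view(log_view)};
            log_surgeon::ErrorCode::Success != err) {
            throw std::runtime_error("Parsing failed");
        }
        archive_writer.write_msg_using_schema(log_view);
    }
}
```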
diff --git a/components/core/src/clp/FileCompressor.hpp b/components/core/src/clp/FileCompressor.hpp
index faa6d0a07..197b0b59b 100644
--- a/components/core/src/clp/FileCompressor.hpp
+++ b/components/core/src/clp/FileCompressor.hpp
@@ -4,6 +4,10 @@
 // Boost libraries
 #include

+// Log surgeon
+#include
+#include
+
 // Project headers
 #include "../FileReader.hpp"
 #include "../LibarchiveFileReader.hpp"
@@ -12,7 +16,6 @@
 #include "../ParsedMessage.hpp"
 #include "../streaming_archive/writer/Archive.hpp"
 #include "FileToCompress.hpp"
-#include "../compressor_frontend/LogParser.hpp"

 namespace clp {
     constexpr size_t cUtf8ValidationBufCapacity = 4096;
@@ -23,8 +26,10 @@ namespace clp {
     class FileCompressor {
     public:
         // Constructors
-        FileCompressor (boost::uuids::random_generator& uuid_generator, std::unique_ptr log_parser) : m_uuid_generator(
-                uuid_generator), m_log_parser(std::move(log_parser)) {}
+        FileCompressor (boost::uuids::random_generator& uuid_generator,
+                        std::unique_ptr reader_parser) :
+                m_uuid_generator(uuid_generator),
+                m_reader_parser(std::move(reader_parser)) {}

         // Methods
         /**
@@ -53,7 +58,7 @@ namespace clp {
          * @param archive_writer
          * @param reader
          */
-        void parse_and_encode (size_t target_data_size_of_dicts, streaming_archive::writer::Archive::UserConfig& archive_user_config,
+        void parse_and_encode_with_library (size_t target_data_size_of_dicts, streaming_archive::writer::Archive::UserConfig& archive_user_config,
                                size_t target_encoded_file_size, const std::string& path_for_compression, group_id_t group_id,
                                streaming_archive::writer::Archive& archive_writer, ReaderInterface& reader);
@@ -84,7 +89,7 @@ namespace clp {
         size_t m_utf8_validation_buf_length;
         MessageParser m_message_parser;
         ParsedMessage m_parsed_message;
-        std::unique_ptr m_log_parser;
+        std::unique_ptr m_reader_parser;
     };
 }
diff --git a/components/core/src/clp/compression.cpp b/components/core/src/clp/compression.cpp
index dcb7d8b94..0ab0159d0 100644
--- a/components/core/src/clp/compression.cpp
+++ b/components/core/src/clp/compression.cpp
@@ -55,7 +55,7 @@ namespace clp {
     bool compress (CommandLineArguments& command_line_args, vector& files_to_compress,
                    const vector& empty_directory_paths, vector& grouped_files_to_compress, size_t target_encoded_file_size,
-                   std::unique_ptr log_parser, bool use_heuristic) {
+                   std::unique_ptr reader_parser, bool use_heuristic) {
         auto output_dir = boost::filesystem::path(command_line_args.get_output_dir());

         // Create output directory in case it doesn't exist
@@ -108,7 +108,7 @@ namespace clp {
         archive_writer.add_empty_directories(empty_directory_paths);

         bool all_files_compressed_successfully = true;
-        FileCompressor file_compressor(uuid_generator, std::move(log_parser));
+        FileCompressor file_compressor(uuid_generator, std::move(reader_parser));
         auto target_data_size_of_dictionaries = command_line_args.get_target_data_size_of_dictionaries();

         // Compress all files
diff --git a/components/core/src/clp/compression.hpp b/components/core/src/clp/compression.hpp
index 8291acb0b..ab6b49e06 100644
--- a/components/core/src/clp/compression.hpp
+++ b/components/core/src/clp/compression.hpp
@@ -8,11 +8,14 @@
 // Boost libraries
 #include

+// Log surgeon
+#include
+#include
+
 // Project headers
 #include "CommandLineArguments.hpp"
 #include "FileToCompress.hpp"
 #include "StructuredFileToCompress.hpp"
-#include "../compressor_frontend/LogParser.hpp"

 namespace clp {
     /**
@@ -26,9 +29,12 @@ namespace clp {
      * @param use_heuristic
      * @return true if compression was successful, false otherwise
      */
-    bool compress (CommandLineArguments& command_line_args, std::vector& files_to_compress,
-                   const std::vector& empty_directory_paths, std::vector& grouped_files_to_compress,
-                   size_t target_encoded_file_size, std::unique_ptr log_parser, bool use_heuristic);
+    bool compress (CommandLineArguments& command_line_args,
+                   std::vector& files_to_compress,
+                   const std::vector& empty_directory_paths,
+                   std::vector& grouped_files_to_compress,
+                   size_t target_encoded_file_size,
+                   std::unique_ptr reader_parser, bool use_heuristic);

     /**
      * Reads a list of grouped files and a list of their IDs
diff --git a/components/core/src/clp/run.cpp b/components/core/src/clp/run.cpp
index 1b2eacbdc..f5912ec3d 100644
--- a/components/core/src/clp/run.cpp
+++ b/components/core/src/clp/run.cpp
@@ -7,8 +7,10 @@
 #include
 #include

+// Log Surgeon
+#include
+
 // Project headers
-#include "../compressor_frontend/LogParser.hpp"
 #include "../Profiler.hpp"
 #include "../Utils.hpp"
 #include "CommandLineArguments.hpp"
@@ -60,10 +62,10 @@ namespace clp {
         if (CommandLineArguments::Command::Compress == command_line_args.get_command()) {
             /// TODO: make this not a unique_ptr and test performance difference
-            std::unique_ptr log_parser;
+            std::unique_ptr reader_parser;
             if (!command_line_args.get_use_heuristic()) {
                 const std::string& schema_file_path = command_line_args.get_schema_file_path();
-                log_parser = std::make_unique(schema_file_path);
+                reader_parser = std::make_unique(schema_file_path);
             }

             boost::filesystem::path path_prefix_to_remove(command_line_args.get_path_prefix_to_remove());
@@ -91,8 +93,10 @@ namespace clp {

             bool compression_successful;
             try {
-                compression_successful = compress(command_line_args, files_to_compress, empty_directory_paths, grouped_files_to_compress,
-                                                  command_line_args.get_target_encoded_file_size(), std::move(log_parser),
+                compression_successful = compress(command_line_args, files_to_compress,
+                                                  empty_directory_paths, grouped_files_to_compress,
+                                                  command_line_args.get_target_encoded_file_size(),
+                                                  std::move(reader_parser),
                                                   command_line_args.get_use_heuristic());
             } catch (TraceableException& e) {
                 ErrorCode error_code = e.get_error_code();
diff --git a/components/core/src/streaming_archive/writer/Archive.cpp b/components/core/src/streaming_archive/writer/Archive.cpp
index 0eceefdf9..955975852 100644
--- a/components/core/src/streaming_archive/writer/Archive.cpp
+++ b/components/core/src/streaming_archive/writer/Archive.cpp
@@ -21,12 +21,17 @@
 // spdlog
 #include

+// Log surgeon
+#include
+#include
+
 // Project headers
+#include "../../clp/utils.hpp"
 #include "../../EncodedVariableInterpreter.hpp"
 #include "../../Utils.hpp"
 #include "../Constants.hpp"
-#include "../../compressor_frontend/LogParser.hpp"

+using log_surgeon::LogEventView;
 using std::list;
 using std::make_unique;
 using std::string;
@@ -280,66 +285,76 @@ namespace streaming_archive::writer {
         }
     }

-    void Archive::write_msg_using_schema (compressor_frontend::Token*& uncompressed_msg, uint32_t uncompressed_msg_pos, const bool has_delimiter,
-                                          const bool has_timestamp) {
+    void Archive::write_msg_using_schema (LogEventView& log_view) {
         epochtime_t timestamp = 0;
         TimestampPattern* timestamp_pattern = nullptr;
-        if (has_timestamp) {
+        if (log_view.get_log_output_buffer()->has_timestamp()) {
             size_t start;
             size_t end;
             timestamp_pattern = (TimestampPattern*) TimestampPattern::search_known_ts_patterns(
-                    uncompressed_msg[0].get_string(), timestamp, start, end);
-            if (old_ts_pattern != *timestamp_pattern) {
+                    log_view.get_log_output_buffer()->get_mutable_token(0).to_string(), timestamp,
+                    start, end);
+            if (m_old_ts_pattern != *timestamp_pattern) {
                 change_ts_pattern(timestamp_pattern);
-                old_ts_pattern = *timestamp_pattern;
+                m_old_ts_pattern = *timestamp_pattern;
+                m_timestamp_set = true;
             }
             assert(nullptr != timestamp_pattern);
+        } else {
+            if (false == m_timestamp_set || false == m_old_ts_pattern.get_format().empty()) {
+                change_ts_pattern(nullptr);
+                m_old_ts_pattern.clear();
+                m_timestamp_set = true;
+            }
         }
         if (get_data_size_of_dictionaries() >= m_target_data_size_of_dicts) {
-            clp::split_file_and_archive(m_archive_user_config, m_path_for_compression, m_group_id, timestamp_pattern, *this);
+            clp::split_file_and_archive(m_archive_user_config, m_path_for_compression, m_group_id,
+                                        timestamp_pattern, *this);
         } else if (m_file->get_encoded_size_in_bytes() >= m_target_encoded_file_size) {
             clp::split_file(m_path_for_compression, m_group_id, timestamp_pattern, *this);
         }
-
         m_encoded_vars.clear();
         m_var_ids.clear();
         m_logtype_dict_entry.clear();
-
         size_t num_uncompressed_bytes = 0;
         // Timestamp is included in the uncompressed message size
-        uint32_t start_pos = uncompressed_msg[0].m_start_pos;
+        uint32_t start_pos = log_view.get_log_output_buffer()->get_token(0).m_start_pos;
         if (timestamp_pattern == nullptr) {
-            start_pos = uncompressed_msg[1].m_start_pos;
+            start_pos = log_view.get_log_output_buffer()->get_token(1).m_start_pos;
         }
-        uint32_t end_pos = uncompressed_msg[uncompressed_msg_pos - 1].m_end_pos;
+        uint32_t end_pos = log_view.get_log_output_buffer()->get_token(
+                log_view.get_log_output_buffer()->pos() - 1).m_end_pos;
         if (start_pos <= end_pos) {
             num_uncompressed_bytes = end_pos - start_pos;
         } else {
-            num_uncompressed_bytes = *uncompressed_msg[0].m_buffer_size_ptr - start_pos + end_pos;
-        }
-        for (uint32_t i = 1; i < uncompressed_msg_pos; i++) {
-            compressor_frontend::Token& token = uncompressed_msg[i];
-            int token_type = token.m_type_ids->at(0);
-            if (has_delimiter && token_type != (int) compressor_frontend::SymbolID::TokenUncaughtStringID &&
-                token_type != (int) compressor_frontend::SymbolID::TokenNewlineId) {
+            num_uncompressed_bytes = log_view.get_log_output_buffer()->get_token(0).m_buffer_size - start_pos + end_pos;
+        }
+        for (uint32_t i = 1; i < log_view.get_log_output_buffer()->pos(); i++) {
+            log_surgeon::Token& token = log_view.get_log_output_buffer()->get_mutable_token(i);
+            int token_type = token.m_type_ids_ptr->at(0);
+            if (log_view.get_log_output_buffer()->has_delimiters() &&
+                token_type != (int) log_surgeon::SymbolID::TokenUncaughtStringID &&
+                token_type != (int) log_surgeon::SymbolID::TokenNewlineId)
+            {
                 m_logtype_dict_entry.add_constant(token.get_delimiter(), 0, 1);
-                if (token.m_start_pos == *token.m_buffer_size_ptr - 1) {
+                if (token.m_start_pos == token.m_buffer_size - 1) {
                     token.m_start_pos = 0;
                 } else {
                     token.m_start_pos++;
                 }
             }
             switch (token_type) {
-                case (int) compressor_frontend::SymbolID::TokenNewlineId:
-                case (int) compressor_frontend::SymbolID::TokenUncaughtStringID: {
-                    m_logtype_dict_entry.add_constant(token.get_string(), 0, token.get_length());
+                case (int) log_surgeon::SymbolID::TokenNewlineId:
+                case (int) log_surgeon::SymbolID::TokenUncaughtStringID: {
+                    m_logtype_dict_entry.add_constant(token.to_string(), 0, token.get_length());
                     break;
                 }
-                case (int) compressor_frontend::SymbolID::TokenIntId: {
+                case (int) log_surgeon::SymbolID::TokenIntId: {
                     encoded_variable_t encoded_var;
-                    if (!EncodedVariableInterpreter::convert_string_to_representable_integer_var(token.get_string(), encoded_var)) {
+                    if (!EncodedVariableInterpreter::convert_string_to_representable_integer_var(
+                            token.to_string(), encoded_var)) {
                         variable_dictionary_id_t id;
-                        m_var_dict.add_entry(token.get_string(), id);
+                        m_var_dict.add_entry(token.to_string(), id);
                         encoded_var = EncodedVariableInterpreter::encode_var_dict_id(id);
                         m_logtype_dict_entry.add_dictionary_var();
                     } else {
@@ -348,12 +363,12 @@ namespace streaming_archive::writer {
                     m_encoded_vars.push_back(encoded_var);
                     break;
                 }
-                case (int) compressor_frontend::SymbolID::TokenFloatId: {
+                case (int) log_surgeon::SymbolID::TokenFloatId: {
                     encoded_variable_t encoded_var;
                     if (!EncodedVariableInterpreter::convert_string_to_representable_float_var(
-                            token.get_string(), encoded_var)) {
+                            token.to_string(), encoded_var)) {
                         variable_dictionary_id_t id;
-                        m_var_dict.add_entry(token.get_string(), id);
+                        m_var_dict.add_entry(token.to_string(), id);
                         encoded_var = EncodedVariableInterpreter::encode_var_dict_id(id);
                         m_logtype_dict_entry.add_dictionary_var();
                     } else {
@@ -366,7 +381,7 @@ namespace streaming_archive::writer {
                     // Variable string looks like a dictionary variable, so encode it as so
                     encoded_variable_t encoded_var;
                     variable_dictionary_id_t id;
-                    m_var_dict.add_entry(token.get_string(), id);
+                    m_var_dict.add_entry(token.to_string(), id);
                     encoded_var = EncodedVariableInterpreter::encode_var_dict_id(id);
                     m_var_ids.push_back(id);
diff --git a/components/core/src/streaming_archive/writer/Archive.hpp b/components/core/src/streaming_archive/writer/Archive.hpp
index d16b86eb6..7d5576db3 100644
--- a/components/core/src/streaming_archive/writer/Archive.hpp
+++ b/components/core/src/streaming_archive/writer/Archive.hpp
@@ -13,13 +13,16 @@
 #include
 #include

+// Log Surgeon
+#include
+#include
+
 // Project headers
 #include "../../ArrayBackedPosIntSet.hpp"
 #include "../../ErrorCode.hpp"
 #include "../../GlobalMetadataDB.hpp"
 #include "../../LogTypeDictionaryWriter.hpp"
 #include "../../VariableDictionaryWriter.hpp"
-#include "../../compressor_frontend/Token.hpp"
 #include "../MetadataDB.hpp"

 namespace streaming_archive { namespace writer {
@@ -59,8 +62,8 @@ namespace streaming_archive { namespace writer {
             }
         };

-        TimestampPattern old_ts_pattern;
-
+        TimestampPattern m_old_ts_pattern;
+        bool m_timestamp_set;
         size_t m_target_data_size_of_dicts;
         UserConfig m_archive_user_config;
         std::string m_path_for_compression;
@@ -70,7 +73,7 @@ namespace streaming_archive { namespace writer {

         // Constructors
         Archive () : m_segments_dir_fd(-1), m_compression_level(0), m_global_metadata_db(nullptr),
-                     old_ts_pattern(), m_schema_file_path() {}
+                     m_old_ts_pattern(), m_timestamp_set(false), m_schema_file_path() {}

         // Destructor
         ~Archive ();
@@ -136,7 +139,7 @@ namespace streaming_archive { namespace writer {
          * @param has_timestamp
          * @throw FileWriter::OperationFailed if any write fails
          */
-        void write_msg_using_schema (compressor_frontend::Token*& uncompressed_msg, uint32_t uncompressed_msg_pos, bool has_delimiter, bool has_timestamp);
+        void write_msg_using_schema (log_surgeon::LogEventView& log_event_view);

        /**
         * Writes snapshot of archive to disk including metadata of all files and new dictionary entries
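
> Editor's note: `m_old_ts_pattern` and `m_timestamp_set` together implement change detection for timestamp patterns in `write_msg_using_schema`. A distilled sketch of that logic as a standalone struct (illustration only; `TimestampPattern` methods used are the ones visible in this diff):

```cpp
// A pattern-change record is emitted only when the effective pattern differs
// from the last one written, including the transition to "no timestamp"
struct TimestampPatternTracker {
    TimestampPattern m_old_ts_pattern;
    bool m_timestamp_set{false};

    // Returns true if the caller should write a pattern-change record
    bool observe (TimestampPattern const* pattern) {
        if (nullptr != pattern) {
            if (m_old_ts_pattern != *pattern) {
                m_old_ts_pattern = *pattern;
                m_timestamp_set = true;
                return true;
            }
        } else if (false == m_timestamp_set || false == m_old_ts_pattern.get_format().empty()) {
            m_old_ts_pattern.clear();
            m_timestamp_set = true;
            return true;
        }
        return false;  // Same pattern as before; nothing to record
    }
};
```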
diff --git a/components/core/tests/test-Grep.cpp b/components/core/tests/test-Grep.cpp
index 5591e1817..67745e82d 100644
--- a/components/core/tests/test-Grep.cpp
+++ b/components/core/tests/test-Grep.cpp
@@ -4,79 +4,82 @@
 // Catch2
 #include "../submodules/Catch2/single_include/catch2/catch.hpp"

+// Log Surgeon
+#include
+#include
+
 // Project headers
-#include "../src/compressor_frontend/Lexer.hpp"
-#include "../src/compressor_frontend/SchemaParser.hpp"
-#include "../src/compressor_frontend/utils.hpp"
 #include "../src/Grep.hpp"

-using compressor_frontend::DelimiterStringAST;
-using compressor_frontend::lexers::ByteLexer;
-using compressor_frontend::ParserAST;
-using compressor_frontend::SchemaFileAST;
-using compressor_frontend::SchemaParser;
-using compressor_frontend::SchemaVarAST;
+using log_surgeon::DelimiterStringAST;
+using log_surgeon::lexers::ByteLexer;
+using log_surgeon::ParserAST;
+using log_surgeon::SchemaAST;
+using log_surgeon::SchemaParser;
+using log_surgeon::SchemaVarAST;
 using std::string;

 TEST_CASE("get_bounds_of_next_potential_var", "[get_bounds_of_next_potential_var]") {
     ByteLexer forward_lexer;
-    compressor_frontend::load_lexer_from_file("../tests/test_schema_files/search_schema.txt", false, forward_lexer);
+    load_lexer_from_file("../tests/test_schema_files/search_schema.txt", false, forward_lexer);
     ByteLexer reverse_lexer;
-    compressor_frontend::load_lexer_from_file("../tests/test_schema_files/search_schema.txt", true, reverse_lexer);
+    load_lexer_from_file("../tests/test_schema_files/search_schema.txt", true, reverse_lexer);

     string str;
     size_t begin_pos;
     size_t end_pos;
     bool is_var;
+    std::string post_string;

     // m_end_pos past the end of the string
     str = "";
     begin_pos = string::npos;
     end_pos = string::npos;
-    REQUIRE(Grep::get_bounds_of_next_potential_var(str, begin_pos, end_pos, is_var, forward_lexer, reverse_lexer) == false);
+    REQUIRE(Grep::get_bounds_of_next_potential_var(str, begin_pos, end_pos, is_var, forward_lexer,
+                                                   reverse_lexer, post_string) == false);

     // Empty string
     str = "";
     begin_pos = 0;
     end_pos = 0;
-    REQUIRE(Grep::get_bounds_of_next_potential_var(str, begin_pos, end_pos, is_var, forward_lexer, reverse_lexer) == false);
+    REQUIRE(Grep::get_bounds_of_next_potential_var(str, begin_pos, end_pos, is_var, forward_lexer, reverse_lexer, post_string) == false);

     // No tokens
     str = "=";
     begin_pos = 0;
     end_pos = 0;
-    REQUIRE(Grep::get_bounds_of_next_potential_var(str, begin_pos, end_pos, is_var, forward_lexer, reverse_lexer) == false);
+    REQUIRE(Grep::get_bounds_of_next_potential_var(str, begin_pos, end_pos, is_var, forward_lexer, reverse_lexer, post_string) == false);

     // No wildcards
     str = " MAC address 95: ad ff 95 24 0d ff =-abc- ";
     begin_pos = 0;
     end_pos = 0;

-    REQUIRE(Grep::get_bounds_of_next_potential_var(str, begin_pos, end_pos, is_var, forward_lexer, reverse_lexer) == true);
+    REQUIRE(Grep::get_bounds_of_next_potential_var(str, begin_pos, end_pos, is_var, forward_lexer, reverse_lexer, post_string) == true);
     REQUIRE("95" == str.substr(begin_pos, end_pos - begin_pos));
     REQUIRE(true == is_var);

-    REQUIRE(Grep::get_bounds_of_next_potential_var(str, begin_pos, end_pos, is_var, forward_lexer, reverse_lexer) == true);
+    REQUIRE(Grep::get_bounds_of_next_potential_var(str, begin_pos, end_pos, is_var, forward_lexer, reverse_lexer, post_string) == true);
     REQUIRE("ad" == str.substr(begin_pos, end_pos - begin_pos));
     REQUIRE(true == is_var);

-    REQUIRE(Grep::get_bounds_of_next_potential_var(str, begin_pos, end_pos, is_var, forward_lexer, reverse_lexer) == true);
+    REQUIRE(Grep::get_bounds_of_next_potential_var(str, begin_pos, end_pos, is_var, forward_lexer, reverse_lexer, post_string) == true);
     REQUIRE("ff" == str.substr(begin_pos, end_pos - begin_pos));
     REQUIRE(true == is_var);

-    REQUIRE(Grep::get_bounds_of_next_potential_var(str, begin_pos, end_pos, is_var, forward_lexer, reverse_lexer) == true);
+    REQUIRE(Grep::get_bounds_of_next_potential_var(str, begin_pos, end_pos, is_var, forward_lexer, reverse_lexer, post_string) == true);
     REQUIRE("95" == str.substr(begin_pos, end_pos - begin_pos));
     REQUIRE(true == is_var);

-    REQUIRE(Grep::get_bounds_of_next_potential_var(str, begin_pos, end_pos, is_var, forward_lexer, reverse_lexer) == true);
+    REQUIRE(Grep::get_bounds_of_next_potential_var(str, begin_pos, end_pos, is_var, forward_lexer, reverse_lexer, post_string) == true);
     REQUIRE("24" == str.substr(begin_pos, end_pos - begin_pos));
     REQUIRE(true == is_var);

-    REQUIRE(Grep::get_bounds_of_next_potential_var(str, begin_pos, end_pos, is_var, forward_lexer, reverse_lexer) == true);
+    REQUIRE(Grep::get_bounds_of_next_potential_var(str, begin_pos, end_pos, is_var, forward_lexer, reverse_lexer, post_string) == true);
     REQUIRE("0d" == str.substr(begin_pos, end_pos - begin_pos));
     REQUIRE(true == is_var);

-    REQUIRE(Grep::get_bounds_of_next_potential_var(str, begin_pos, end_pos, is_var, forward_lexer, reverse_lexer) == true);
+    REQUIRE(Grep::get_bounds_of_next_potential_var(str, begin_pos, end_pos, is_var, forward_lexer, reverse_lexer, post_string) == true);
     REQUIRE("ff" == str.substr(begin_pos, end_pos - begin_pos));
     REQUIRE(true == is_var);

@@ -84,7 +87,7 @@ TEST_CASE("get_bounds_of_next_potential_var", "[get_bounds_of_next_potential_var
REQUIRE("-abc-" == str.substr(begin_pos, end_pos - begin_pos)); REQUIRE(true == is_var); - REQUIRE(Grep::get_bounds_of_next_potential_var(str, begin_pos, end_pos, is_var, forward_lexer, reverse_lexer) == false); + REQUIRE(Grep::get_bounds_of_next_potential_var(str, begin_pos, end_pos, is_var, forward_lexer, reverse_lexer, post_string) == false); REQUIRE(str.length() == begin_pos); // With wildcards @@ -92,27 +95,27 @@ TEST_CASE("get_bounds_of_next_potential_var", "[get_bounds_of_next_potential_var begin_pos = 0; end_pos = 0; - REQUIRE(Grep::get_bounds_of_next_potential_var(str, begin_pos, end_pos, is_var, forward_lexer, reverse_lexer) == true); + REQUIRE(Grep::get_bounds_of_next_potential_var(str, begin_pos, end_pos, is_var, forward_lexer, reverse_lexer, post_string) == true); REQUIRE(str.substr(begin_pos, end_pos - begin_pos) == "1\\*x"); REQUIRE(is_var == true); //REQUIRE(is_var == true); - REQUIRE(Grep::get_bounds_of_next_potential_var(str, begin_pos, end_pos, is_var, forward_lexer, reverse_lexer) == true); + REQUIRE(Grep::get_bounds_of_next_potential_var(str, begin_pos, end_pos, is_var, forward_lexer, reverse_lexer, post_string) == true); REQUIRE(str.substr(begin_pos, end_pos - begin_pos) == "abc*123"); REQUIRE(is_var == false); //REQUIRE(is_var == true); - REQUIRE(Grep::get_bounds_of_next_potential_var(str, begin_pos, end_pos, is_var, forward_lexer, reverse_lexer) == true); + REQUIRE(Grep::get_bounds_of_next_potential_var(str, begin_pos, end_pos, is_var, forward_lexer, reverse_lexer, post_string) == true); REQUIRE(str.substr(begin_pos, end_pos - begin_pos) == "1.2"); REQUIRE(is_var == true); - REQUIRE(Grep::get_bounds_of_next_potential_var(str, begin_pos, end_pos, is_var, forward_lexer, reverse_lexer) == true); + REQUIRE(Grep::get_bounds_of_next_potential_var(str, begin_pos, end_pos, is_var, forward_lexer, reverse_lexer, post_string) == true); REQUIRE(str.substr(begin_pos, end_pos - begin_pos) == "+394/-"); REQUIRE(is_var == true); - REQUIRE(Grep::get_bounds_of_next_potential_var(str, begin_pos, end_pos, is_var, forward_lexer, reverse_lexer) == true); + REQUIRE(Grep::get_bounds_of_next_potential_var(str, begin_pos, end_pos, is_var, forward_lexer, reverse_lexer, post_string) == true); REQUIRE(str.substr(begin_pos, end_pos - begin_pos) == "-*abc-"); REQUIRE(is_var == false); - REQUIRE(Grep::get_bounds_of_next_potential_var(str, begin_pos, end_pos, is_var, forward_lexer, reverse_lexer) == false); + REQUIRE(Grep::get_bounds_of_next_potential_var(str, begin_pos, end_pos, is_var, forward_lexer, reverse_lexer, post_string) == false); } diff --git a/components/core/tests/test-ParserWithUserSchema.cpp b/components/core/tests/test-ParserWithUserSchema.cpp index ae0ee6a2d..432d368b0 100644 --- a/components/core/tests/test-ParserWithUserSchema.cpp +++ b/components/core/tests/test-ParserWithUserSchema.cpp @@ -1,3 +1,6 @@ +/// TODO: move this test to log_surgeon +/// TODO: move load_lexer_from_file into SearchParser in log_surgeon + // C libraries #include @@ -8,34 +11,44 @@ // Catch2 #include "../submodules/Catch2/single_include/catch2/catch.hpp" +// Log Surgeon +#include + // Project headers #include "../src/clp/run.hpp" -#include "../src/compressor_frontend/utils.hpp" -#include "../src/compressor_frontend/LogParser.hpp" +#include "../src/Utils.hpp" #include "../src/GlobalMySQLMetadataDB.hpp" -using compressor_frontend::DelimiterStringAST; -using compressor_frontend::LALR1Parser; -using compressor_frontend::lexers::ByteLexer; -using compressor_frontend::LogParser; -using 
diff --git a/components/core/tests/test-ParserWithUserSchema.cpp b/components/core/tests/test-ParserWithUserSchema.cpp
index ae0ee6a2d..432d368b0 100644
--- a/components/core/tests/test-ParserWithUserSchema.cpp
+++ b/components/core/tests/test-ParserWithUserSchema.cpp
@@ -1,3 +1,6 @@
+/// TODO: move this test to log_surgeon
+/// TODO: move load_lexer_from_file into SearchParser in log_surgeon
+
 // C libraries
 #include
@@ -8,34 +11,44 @@
 // Catch2
 #include "../submodules/Catch2/single_include/catch2/catch.hpp"

+// Log Surgeon
+#include
+
 // Project headers
 #include "../src/clp/run.hpp"
-#include "../src/compressor_frontend/utils.hpp"
-#include "../src/compressor_frontend/LogParser.hpp"
+#include "../src/Utils.hpp"
 #include "../src/GlobalMySQLMetadataDB.hpp"

-using compressor_frontend::DelimiterStringAST;
-using compressor_frontend::LALR1Parser;
-using compressor_frontend::lexers::ByteLexer;
-using compressor_frontend::LogParser;
-using compressor_frontend::ParserAST;
-using compressor_frontend::SchemaFileAST;
-using compressor_frontend::SchemaParser;
-using compressor_frontend::SchemaVarAST;
-using compressor_frontend::Token;
-
-std::unique_ptr<SchemaFileAST> generate_schema_ast(const std::string& schema_file) {
+using log_surgeon::DelimiterStringAST;
+using log_surgeon::LALR1Parser;
+using log_surgeon::lexers::ByteLexer;
+using log_surgeon::LogParser;
+using log_surgeon::ParserAST;
+using log_surgeon::SchemaAST;
+using log_surgeon::SchemaParser;
+using log_surgeon::SchemaVarAST;
+using log_surgeon::Token;
+
+std::unique_ptr<SchemaAST> generate_schema_ast(const std::string& schema_file) {
     SchemaParser schema_parser;
-    FileReader schema_file_reader;
-    schema_file_reader.open(schema_file);
-    REQUIRE(schema_file_reader.is_open());
-    std::unique_ptr<SchemaFileAST> schema_ast = schema_parser.generate_schema_ast(schema_file_reader);
+    FileReader schema_reader;
+    /// TODO: this wrapper is repeated a lot
+    log_surgeon::Reader reader_wrapper{[&](char* buf, size_t count, size_t& read_to) -> log_surgeon::ErrorCode {
+        schema_reader.read(buf, count, read_to);
+        if (read_to == 0) {
+            return log_surgeon::ErrorCode::EndOfFile;
+        }
+        return log_surgeon::ErrorCode::Success;
+    }};
+    schema_reader.open(schema_file);
+    REQUIRE(schema_reader.is_open());
+    std::unique_ptr<SchemaAST> schema_ast = schema_parser.generate_schema_ast(reader_wrapper);
     REQUIRE(schema_ast.get() != nullptr);
     return schema_ast;
 }

 std::unique_ptr<LogParser> generate_log_parser(const std::string& schema_file) {
-    std::unique_ptr<SchemaFileAST> schema_ast = generate_schema_ast(schema_file);
+    std::unique_ptr<SchemaAST> schema_ast = generate_schema_ast(schema_file);
     std::unique_ptr<LogParser> log_parser = std::make_unique<LogParser>(schema_file);
     REQUIRE(log_parser.get() != nullptr);
     return log_parser;
 }
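Since the `/// TODO` notes this reader wrapper recurs in several places, one way to de-duplicate it is a small factory like the sketch below. The helper name is hypothetical and this is not part of the patch; note the returned Reader captures the FileReader by reference and must not outlive it:

    static log_surgeon::Reader make_reader_wrapper(FileReader& file_reader) {
        return log_surgeon::Reader{
                [&file_reader](char* buf, size_t count, size_t& read_to) -> log_surgeon::ErrorCode {
                    file_reader.read(buf, count, read_to);
                    if (read_to == 0) {
                        return log_surgeon::ErrorCode::EndOfFile;
                    }
                    return log_surgeon::ErrorCode::Success;
                }};
    }

    // Usage mirrors the inlined wrappers in this patch:
    //   FileReader schema_reader;
    //   auto reader_wrapper = make_reader_wrapper(schema_reader);
    //   schema_reader.open(schema_file);
    //   auto schema_ast = schema_parser.generate_schema_ast(reader_wrapper);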
delimiters", "[LALR1Parser][LogParse "When using --schema-path, \"delimiters:\" line must be used."); } -TEST_CASE("Test error for creating log file with delimiter in regex pattern", "[LALR1Parser][SchemaParser]") { - std::string file_path = "../tests/test_schema_files/schema_with_delimiter_in_regex_error.txt"; - std::string file_name = boost::filesystem::canonical(file_path).string(); - REQUIRE_THROWS_WITH(generate_log_parser(file_path), file_name + ":2: error: 'equals' has regex pattern which contains delimiter '='.\n" - + " equals:.*=.*\n" - + " ^^^^^\n"); -} +/// TODO: This test doesn't currently work because delimiters are allowed in schema files, and there is no option to disable this yet +//TEST_CASE("Test error for creating log file with delimiter in regex pattern", "[LALR1Parser]SchemaParser]") { +// std::string file_path = "../tests/test_schema_files/schema_with_delimiter_in_regex_error.txt"; +// std::string file_name = boost::filesystem::canonical(file_path).string(); +// REQUIRE_THROWS_WITH(generate_log_parser(file_path), file_name + ":2: error: 'equals' has regex pattern which contains delimiter '='.\n" +// + " equals:.*=.*\n" +// + " ^^^^^\n"); +//} /// TODO: This error check is performed correctly by CLP, but it is handled by something different now so this test will fail as is //TEST_CASE("Test error for missing log file", "[LALR1Parser][LogParser]") { @@ -129,15 +140,28 @@ TEST_CASE("Test forward lexer", "[Search]") { ByteLexer forward_lexer; std::string schema_file_name = "../tests/test_schema_files/search_schema.txt"; std::string schema_file_path = boost::filesystem::weakly_canonical(schema_file_name).string(); - compressor_frontend::load_lexer_from_file(schema_file_path, false, forward_lexer); + load_lexer_from_file(schema_file_path, false, forward_lexer); FileReader reader; + /// TODO: this wrapper is repeated a lot + log_surgeon::Reader reader_wrapper{[&](char* buf, size_t count, size_t& read_to) -> log_surgeon::ErrorCode { + reader.read(buf, count, read_to); + if (read_to == 0) { + return log_surgeon::ErrorCode::EndOfFile; + } + return log_surgeon::ErrorCode::Success; + }}; reader.open("../tests/test_search_queries/easy.txt"); - forward_lexer.reset(reader); - Token token = forward_lexer.scan(); - while (token.m_type_ids->at(0) != (int)compressor_frontend::SymbolID::TokenEndID) { - SPDLOG_INFO("token:" + token.get_string() + "\n"); - SPDLOG_INFO("token.m_type_ids->back():" + forward_lexer.m_id_symbol[token.m_type_ids->back()] + "\n"); - token = forward_lexer.scan(); + log_surgeon::ParserInputBuffer parser_input_buffer; + parser_input_buffer.read_if_safe(reader_wrapper); + forward_lexer.reset(); + Token token; + log_surgeon::ErrorCode error_code = forward_lexer.scan(parser_input_buffer, token); + REQUIRE(error_code == log_surgeon::ErrorCode::Success); + while (token.m_type_ids_ptr->at(0) != (int)log_surgeon::SymbolID::TokenEndID) { + SPDLOG_INFO("token:" + token.to_string() + "\n"); + SPDLOG_INFO("token.m_type_ids->back():" + forward_lexer.m_id_symbol[token.m_type_ids_ptr->back()] + "\n"); + log_surgeon::ErrorCode error_code = forward_lexer.scan(parser_input_buffer, token); + REQUIRE(error_code == log_surgeon::ErrorCode::Success); } } @@ -145,14 +169,27 @@ TEST_CASE("Test reverse lexer", "[Search]") { ByteLexer reverse_lexer; std::string schema_file_name = "../tests/test_schema_files/search_schema.txt"; std::string schema_file_path = boost::filesystem::weakly_canonical(schema_file_name).string(); - compressor_frontend::load_lexer_from_file(schema_file_path, true, 
@@ -145,14 +169,27 @@ TEST_CASE("Test reverse lexer", "[Search]") {
     ByteLexer reverse_lexer;
     std::string schema_file_name = "../tests/test_schema_files/search_schema.txt";
     std::string schema_file_path = boost::filesystem::weakly_canonical(schema_file_name).string();
-    compressor_frontend::load_lexer_from_file(schema_file_path, true, reverse_lexer);
+    load_lexer_from_file(schema_file_path, true, reverse_lexer);
     FileReader reader;
+    /// TODO: this wrapper is repeated a lot
+    log_surgeon::Reader reader_wrapper{[&](char* buf, size_t count, size_t& read_to) -> log_surgeon::ErrorCode {
+        reader.read(buf, count, read_to);
+        if (read_to == 0) {
+            return log_surgeon::ErrorCode::EndOfFile;
+        }
+        return log_surgeon::ErrorCode::Success;
+    }};
     reader.open("../tests/test_search_queries/easy.txt");
-    reverse_lexer.reset(reader);
-    Token token = reverse_lexer.scan();
-    while (token.m_type_ids->at(0) != (int)compressor_frontend::SymbolID::TokenEndID) {
-        SPDLOG_INFO("token:" + token.get_string() + "\n");
-        SPDLOG_INFO("token.m_type_ids->back():" + reverse_lexer.m_id_symbol[token.m_type_ids->back()] + "\n");
-        token = reverse_lexer.scan();
+    log_surgeon::ParserInputBuffer parser_input_buffer;
+    parser_input_buffer.read_if_safe(reader_wrapper);
+    reverse_lexer.reset();
+    Token token;
+    log_surgeon::ErrorCode error_code = reverse_lexer.scan(parser_input_buffer, token);
+    REQUIRE(error_code == log_surgeon::ErrorCode::Success);
+    while (token.m_type_ids_ptr->at(0) != (int)log_surgeon::SymbolID::TokenEndID) {
+        SPDLOG_INFO("token:" + token.to_string() + "\n");
+        SPDLOG_INFO("token.m_type_ids->back():" + reverse_lexer.m_id_symbol[token.m_type_ids_ptr->back()] + "\n");
+        error_code = reverse_lexer.scan(parser_input_buffer, token);
+        REQUIRE(error_code == log_surgeon::ErrorCode::Success);
     }
 }
diff --git a/components/core/tests/test-Stopwatch.cpp b/components/core/tests/test-Stopwatch.cpp
index 17a8c7c0b..2fb1b1a8a 100644
--- a/components/core/tests/test-Stopwatch.cpp
+++ b/components/core/tests/test-Stopwatch.cpp
@@ -38,6 +38,7 @@ TEST_CASE("Stopwatch", "[Stopwatch]") {
         REQUIRE(time_taken < 1.1);
     }

+    /// TODO: this test fails all the time
     SECTION("Test multiple measurements") {
         // Measure some work
         stopwatch.start();