From d78922d9777ada1973140577e3f03b78eb09bb02 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Mon, 26 Jun 2023 16:24:50 -0400 Subject: [PATCH] - Initial timestamp format parser implementation - Current bug with multiple newlines before timestamps --- components/core/CMakeLists.txt | 16 +- components/core/src/TimeFormatFileParser.cpp | 1 - components/core/src/TimeFormatFileParser.hpp | 110 ------ components/core/src/TimestampPattern.cpp | 93 ++--- components/core/src/TimestampPattern.hpp | 15 +- .../core/src/TimestampPatternsFileParser.cpp | 353 ++++++++++++++++++ .../core/src/TimestampPatternsFileParser.hpp | 228 +++++++++++ components/core/src/Utils.cpp | 8 +- .../core/src/clp/CommandLineArguments.cpp | 19 +- .../core/src/clp/CommandLineArguments.hpp | 2 + components/core/src/clp/FileCompressor.cpp | 3 +- components/core/src/clp/run.cpp | 19 +- .../src/streaming_archive/reader/File.cpp | 2 +- .../src/streaming_archive/writer/Archive.cpp | 3 +- components/core/tests/test_log_files/log.txt | 10 + .../default_formats.txt | 10 +- 16 files changed, 696 insertions(+), 196 deletions(-) delete mode 100644 components/core/src/TimeFormatFileParser.cpp delete mode 100644 components/core/src/TimeFormatFileParser.hpp create mode 100644 components/core/src/TimestampPatternsFileParser.cpp create mode 100644 components/core/src/TimestampPatternsFileParser.hpp diff --git a/components/core/CMakeLists.txt b/components/core/CMakeLists.txt index 072a1a06d..20550c483 100644 --- a/components/core/CMakeLists.txt +++ b/components/core/CMakeLists.txt @@ -287,8 +287,8 @@ set(SOURCE_FILES_clp src/string_utils.hpp src/StringReader.cpp src/StringReader.hpp - src/TimeFormatFileParser.cpp - src/TimeFormatFileParser.hpp + src/TimestampPatternsFileParser.cpp + src/TimestampPatternsFileParser.hpp src/TimestampPattern.cpp src/TimestampPattern.hpp src/TraceableException.cpp @@ -420,8 +420,8 @@ set(SOURCE_FILES_clg src/string_utils.hpp src/StringReader.cpp src/StringReader.hpp - 
src/TimeFormatFileParser.cpp - src/TimeFormatFileParser.hpp + src/TimestampPatternsFileParser.cpp + src/TimestampPatternsFileParser.hpp src/TimestampPattern.cpp src/TimestampPattern.hpp src/TraceableException.cpp @@ -546,8 +546,8 @@ set(SOURCE_FILES_clo src/StringReader.hpp src/Thread.cpp src/Thread.hpp - src/TimeFormatFileParser.cpp - src/TimeFormatFileParser.hpp + src/TimestampPatternsFileParser.cpp + src/TimestampPatternsFileParser.hpp src/TimestampPattern.cpp src/TimestampPattern.hpp src/TraceableException.cpp @@ -731,8 +731,8 @@ set(SOURCE_FILES_unitTest src/string_utils.tpp src/StringReader.cpp src/StringReader.hpp - src/TimeFormatFileParser.cpp - src/TimeFormatFileParser.hpp + src/TimestampPatternsFileParser.cpp + src/TimestampPatternsFileParser.hpp src/TimestampPattern.cpp src/TimestampPattern.hpp src/TraceableException.cpp diff --git a/components/core/src/TimeFormatFileParser.cpp b/components/core/src/TimeFormatFileParser.cpp deleted file mode 100644 index d886b7021..000000000 --- a/components/core/src/TimeFormatFileParser.cpp +++ /dev/null @@ -1 +0,0 @@ -#include "TimeFormatFileParser.hpp \ No newline at end of file diff --git a/components/core/src/TimeFormatFileParser.hpp b/components/core/src/TimeFormatFileParser.hpp deleted file mode 100644 index d4f09daa4..000000000 --- a/components/core/src/TimeFormatFileParser.hpp +++ /dev/null @@ -1,110 +0,0 @@ -#ifndef TIME_FORMAT_FILE_PARSER_HPP -#define TIME_FORMAT_FILE_PARSER_HPP - -#include - -// ASTs used in SchemaParser AST -class SchemaAST : public log_surgeon::ParserAST { -public: - // Constructor - SchemaAST() = default; - - /// TODO: shouldn't this add delimiters instead of setting it? 
- auto set_delimiters(std::unique_ptr delimiters_in) -> void { - m_delimiters = std::move(delimiters_in); - } - - auto add_schema_var(std::unique_ptr schema_var) -> void { - m_schema_vars.push_back(std::move(schema_var)); - } - - std::vector> m_schema_vars; - std::unique_ptr m_delimiters; - std::string m_file_path; -}; - -class IdentifierAST : public ParserAST { -public: - // Constructor - explicit IdentifierAST(char character) { m_name.push_back(character); } - - auto add_character(char character) -> void { m_name.push_back(character); } - - std::string m_name; -}; - -class SchemaVarAST : public ParserAST { -public: - // Constructor - SchemaVarAST(std::string name, - std::unique_ptr> - regex_ptr, - uint32_t line_num) - : m_line_num(line_num), - m_name(std::move(name)), - m_regex_ptr(std::move(regex_ptr)) {} - - uint32_t m_line_num; - std::string m_name; - std::unique_ptr> m_regex_ptr; -}; - -class DelimiterStringAST : public ParserAST { -public: - // Constructor - explicit DelimiterStringAST(uint32_t delimiter) { m_delimiters.push_back(delimiter); } - - auto add_delimiter(uint32_t delimiter) -> void { m_delimiters.push_back(delimiter); } - - std::vector m_delimiters; -}; - -class TimeFormatFileParser : - public log_surgeon::LALR1Parser { -public: - // Constructor - TimeFormatFileParser(); - - /** - * A semantic rule that needs access to soft_reset() - * @param m - * @return std::unique_ptr - */ - auto existing_schema_rule(NonTerminal* m) -> std::unique_ptr; - - /** - * Parse a user defined schema to generate a schema AST used for generating the log lexer - * @param reader - * @return std::unique_ptr - */ - auto generate_schema_ast(Reader& reader) -> std::unique_ptr; - - /** - * Wrapper around generate_schema_ast() - * @param schema_file_path - * @return std::unique_ptr - */ - static auto try_schema_file(std::string const& schema_file_path) - -> std::unique_ptr; - -private: - /** - * After lexing half of the buffer, reads into that half of the buffer and changes 
variables - * accordingly - * @param next_children_start - */ - auto soft_reset(uint32_t& next_children_start) -> void; - - /** - * Add all lexical rules needed for schema lexing - */ - auto add_lexical_rules() -> void; - - /** - * Add all productions needed for schema parsing - */ - auto add_productions() -> void; -}; - -#endif //TIME_FORMAT_FILE_PARSER_HPP diff --git a/components/core/src/TimestampPattern.cpp b/components/core/src/TimestampPattern.cpp index c07811660..f081e84f6 100644 --- a/components/core/src/TimestampPattern.cpp +++ b/components/core/src/TimestampPattern.cpp @@ -81,63 +81,7 @@ static bool convert_string_to_number (const string& str, const size_t begin_ix, * To initialize m_known_ts_patterns, we first create a vector of patterns then copy it to a dynamic array. This eases * maintenance of the list and the cost doesn't matter since it is only done once when the program starts. */ -void TimestampPattern::init () { - // First create vector of observed patterns so that it's easy to maintain - vector patterns; - // E.g. 2015-01-31T15:50:45.392 - patterns.emplace_back(0, "%Y-%m-%dT%H:%M:%S.%3"); - // E.g. 2015-01-31T15:50:45,392 - patterns.emplace_back(0, "%Y-%m-%dT%H:%M:%S,%3"); - // E.g. [2015-01-31T15:50:45 - patterns.emplace_back(0, "[%Y-%m-%dT%H:%M:%S"); - // E.g. [20170106-16:56:41] - patterns.emplace_back(0, "[%Y%m%d-%H:%M:%S]"); - // E.g. 2015-01-31 15:50:45,392 - patterns.emplace_back(0, "%Y-%m-%d %H:%M:%S,%3"); - // E.g. 2015-01-31 15:50:45.392 - patterns.emplace_back(0, "%Y-%m-%d %H:%M:%S.%3"); - // E.g. [2015-01-31 15:50:45,085] - patterns.emplace_back(0, "[%Y-%m-%d %H:%M:%S,%3]"); - // E.g. 2015-01-31 15:50:45 - patterns.emplace_back(0, "%Y-%m-%d %H:%M:%S"); - // E.g. Start-Date: 2015-01-31 15:50:45 - patterns.emplace_back(1, "%Y-%m-%d %H:%M:%S"); - // E.g. 2015/01/31 15:50:45 - patterns.emplace_back(0, "%Y/%m/%d %H:%M:%S"); - // E.g. 15/01/31 15:50:45 - patterns.emplace_back(0, "%y/%m/%d %H:%M:%S"); - // E.g. 
150131 9:50:45 - patterns.emplace_back(0, "%y%m%d %k:%M:%S"); - // E.g. 01 Jan 2016 15:50:17,085 - patterns.emplace_back(0, "%d %b %Y %H:%M:%S,%3"); - // E.g. Jan 01, 2016 3:50:17 PM - patterns.emplace_back(0, "%b %d, %Y %l:%M:%S %p"); - // E.g. January 31, 2015 15:50 - patterns.emplace_back(0, "%B %d, %Y %H:%M"); - // E.g. E [31/Jan/2015:15:50:45 - patterns.emplace_back(1, "[%d/%b/%Y:%H:%M:%S"); - // E.g. localhost - - [01/Jan/2016:15:50:17 - // E.g. 192.168.4.5 - - [01/Jan/2016:15:50:17 - patterns.emplace_back(3, "[%d/%b/%Y:%H:%M:%S"); - // E.g. 192.168.4.5 - - [01/01/2016:15:50:17 - patterns.emplace_back(3, "[%d/%m/%Y:%H:%M:%S"); - // E.g. INFO [main] 2015-01-31 15:50:45,085 - patterns.emplace_back(2, "%Y-%m-%d %H:%M:%S,%3"); - // E.g. Started POST "/api/v3/internal/allowed" for 127.0.0.1 at 2017-06-18 00:20:44 - patterns.emplace_back(6, "%Y-%m-%d %H:%M:%S"); - // E.g. update-alternatives 2015-01-31 15:50:45 - patterns.emplace_back(1, "%Y-%m-%d %H:%M:%S"); - // E.g. ERROR: apport (pid 4557) Sun Jan 1 15:50:45 2015 - patterns.emplace_back(4, "%a %b %e %H:%M:%S %Y"); - // E.g. <<<2016-11-10 03:02:29:936 - patterns.emplace_back(0, "<<<%Y-%m-%d %H:%M:%S:%3"); - - // TODO These patterns are imprecise and will prevent searching by timestamp; but for now, it's no worse than not parsing a timestamp - // E.g. Jan 21 11:56:42 - patterns.emplace_back(0, "%b %d %H:%M:%S"); - // E.g. 
01-21 11:56:42.392 - patterns.emplace_back(0, "%m-%d %H:%M:%S.%3"); - +void TimestampPattern::init (vector& patterns) { // Initialize m_known_ts_patterns with vector's contents m_known_ts_patterns_len = patterns.size(); m_known_ts_patterns = std::make_unique(m_known_ts_patterns_len); @@ -160,6 +104,10 @@ const TimestampPattern* TimestampPattern::search_known_ts_patterns (const string return nullptr; } +const string& TimestampPattern::get_regex () const { + return m_regex; +} + const string& TimestampPattern::get_format () const { return m_format; } @@ -175,6 +123,7 @@ bool TimestampPattern::is_empty () const { void TimestampPattern::clear () { m_num_spaces_before_ts = 0; m_format.clear(); + m_regex.clear(); } bool TimestampPattern::parse_timestamp (const string& line, epochtime_t& timestamp, size_t& timestamp_begin_pos, size_t& timestamp_end_pos) const { @@ -503,6 +452,32 @@ bool TimestampPattern::parse_timestamp (const string& line, epochtime_t& timesta break; } + case 'r': { // Relative timestamp in millisecond + int cFieldLength = 0; + while(line_ix + cFieldLength < line_length) { + if('0' <= line[line_ix + cFieldLength] && line[line_ix + cFieldLength] <= + '9') + { + cFieldLength++; + } else { + break; + } + } + if(cFieldLength == 0) { + return false; + } + int value; + if (!convert_string_to_number(line, line_ix, line_ix + cFieldLength, '0', + value) || value < 0) + { + return false; + } + millisecond = value; + line_ix += cFieldLength; + + break; + } + default: return false; } @@ -698,6 +673,10 @@ void TimestampPattern::insert_formatted_timestamp (const epochtime_t timestamp, append_padded_value(millisecond, '0', 3, new_msg); break; + case 'r': // Relative timestamp + new_msg += std::to_string(timestamp); + break; + default: { throw OperationFailed(ErrorCode_Unsupported, __FILENAME__, __LINE__); } diff --git a/components/core/src/TimestampPattern.hpp b/components/core/src/TimestampPattern.hpp index f7653c3f6..b6899eba1 100644 --- 
a/components/core/src/TimestampPattern.hpp +++ b/components/core/src/TimestampPattern.hpp @@ -5,6 +5,7 @@ #include #include #include +#include // Project headers #include "Defs.h" @@ -54,13 +55,16 @@ class TimestampPattern { // Constructors TimestampPattern () : m_num_spaces_before_ts(0) {} - TimestampPattern (uint8_t num_spaces_before_ts, const std::string& format) : m_num_spaces_before_ts(num_spaces_before_ts), m_format(format) {} + TimestampPattern (uint8_t num_spaces_before_ts, const std::string& format, + const std::string& regex) : + m_num_spaces_before_ts(num_spaces_before_ts), m_format(format), m_regex(regex) {} // Methods /** * Static initializer for class. This must be called before using the class. + * @param patterns */ - static void init (); + static void init (std::vector& patterns); /** * Searches for a known timestamp pattern which can parse the timestamp from the given line, and if found, parses the timestamp @@ -73,6 +77,12 @@ class TimestampPattern { static const TimestampPattern* search_known_ts_patterns (const std::string& line, epochtime_t& timestamp, size_t& timestamp_begin_pos, size_t& timestamp_end_pos); + /** + * Gets the timestamp pattern's regex string + * @return See description + */ + const std::string& get_regex () const; + /** * Gets the timestamp pattern's format string * @return See description @@ -136,6 +146,7 @@ class TimestampPattern { // ^ ^ ^ uint8_t m_num_spaces_before_ts; std::string m_format; + std::string m_regex; }; #endif // TIMESTAMPPATTERN_HPP diff --git a/components/core/src/TimestampPatternsFileParser.cpp b/components/core/src/TimestampPatternsFileParser.cpp new file mode 100644 index 000000000..8a645ed5e --- /dev/null +++ b/components/core/src/TimestampPatternsFileParser.cpp @@ -0,0 +1,353 @@ +#include "TimestampPatternsFileParser.hpp" + +// C++ libraries +#include +#include +#include + +// Log Surgeon +#include +#include +#include +#include +#include +#include + +using FileReader = log_surgeon::FileReader; +using 
NonTerminal = log_surgeon::NonTerminal; +using ParserAST = log_surgeon::ParserAST; +template using ParserValue = log_surgeon::ParserValue; +using Reader = log_surgeon::Reader; +using RegexASTByte = + log_surgeon::finite_automata::RegexAST; +using RegexASTGroupByte = log_surgeon::finite_automata::RegexASTGroup< + log_surgeon::finite_automata::RegexNFAByteState>; +using RegexASTCatByte = + log_surgeon::finite_automata::RegexASTCat; +using Token = log_surgeon::Token; + +using std::make_unique; +using std::string; +using std::unique_ptr; + +TimestampPatternsFileParser::TimestampPatternsFileParser() : m_timestamp_patterns(), + m_current_timestamp_num_spaces(""), + m_current_timestamp_format(""), + m_current_timestamp_regex("") +{ + add_lexical_rules(); + add_productions(); + generate(); +} + +auto TimestampPatternsFileParser::generate_timestamp_patterns(Reader& reader) -> void { + parse(reader); +} + +auto TimestampPatternsFileParser::try_timestamp_patterns_file(string const& schema_file_path) +-> std::vector { + FileReader file_reader; + log_surgeon::ErrorCode error_code = file_reader.try_open(schema_file_path); + if (log_surgeon::ErrorCode::Success != error_code) { + if (log_surgeon::ErrorCode::Errno == error_code) { + throw std::runtime_error( + strfmt("Failed to read '%s', errno=%d", schema_file_path.c_str(), errno)); + } + int code{static_cast>(error_code)}; + throw std::runtime_error( + strfmt("Failed to read '%s', error_code=%d", schema_file_path.c_str(), code)); + } + TimestampPatternsFileParser parser; + Reader reader{[&](char* buf, size_t count, size_t& read_to) -> log_surgeon::ErrorCode { + file_reader.read(buf, count, read_to); + if (read_to == 0) { + return log_surgeon::ErrorCode::EndOfFile; + } + return log_surgeon::ErrorCode::Success; + }}; + parser.generate_timestamp_patterns(reader); + file_reader.close(); + return parser.m_timestamp_patterns; +} + +auto TimestampPatternsFileParser::timestamp_pattern_rule(NonTerminal* m) -> unique_ptr { + ///TODO: how 
should this fail if m_current_timestamp_num_spaces is too big for uint8_t? + for(uint8_t i = 0; i < stoi(m_current_timestamp_num_spaces); i++) { + m_current_timestamp_regex.insert(0, "[^ ]+ "); + } + m_timestamp_patterns.emplace_back(stoi(m_current_timestamp_num_spaces), + m_current_timestamp_format, + m_current_timestamp_regex); + m_current_timestamp_num_spaces.clear(); + m_current_timestamp_format.clear(); + m_current_timestamp_regex.clear(); + return nullptr; +} + +auto TimestampPatternsFileParser::existing_num_spaces_rule(NonTerminal* m) -> unique_ptr { + m_current_timestamp_num_spaces += m->token_cast(1)->to_string(); + return nullptr; +} + + +auto TimestampPatternsFileParser::new_num_spaces_rule(NonTerminal* m) -> unique_ptr { + m_current_timestamp_num_spaces += m->token_cast(0)->to_string(); + return nullptr; +} + +auto TimestampPatternsFileParser::percent_r_rule(NonTerminal* /* m */) -> unique_ptr { + m_current_timestamp_format += "%r"; + m_current_timestamp_regex += "\\d+"; + return nullptr; +} + +auto TimestampPatternsFileParser::percent_Y_rule(NonTerminal* /* m */) -> unique_ptr { + m_current_timestamp_format += "%Y"; + m_current_timestamp_regex += "\\d{4}"; + return nullptr; +} + +auto TimestampPatternsFileParser::percent_y_rule(NonTerminal* /* m */) -> unique_ptr { + m_current_timestamp_format += "%y"; + m_current_timestamp_regex += "\\d{2}"; + return nullptr; +} + +auto TimestampPatternsFileParser::percent_m_rule(NonTerminal* /* m */) -> unique_ptr { + m_current_timestamp_format += "%m"; + m_current_timestamp_regex += "\\d{2}"; + return nullptr; +} + +auto TimestampPatternsFileParser::percent_b_rule(NonTerminal* /* m */) -> unique_ptr { +m_current_timestamp_format += "%b"; +m_current_timestamp_regex += "[A-Za-z]{3}"; +return nullptr; +} + +auto TimestampPatternsFileParser::percent_B_rule(NonTerminal* /* m */) -> unique_ptr { + m_current_timestamp_format += "%B"; + m_current_timestamp_regex += "[A-Za-z]{3,9}"; + return nullptr; +} + +auto 
TimestampPatternsFileParser::percent_d_rule(NonTerminal* /* m */) -> unique_ptr { + m_current_timestamp_format += "%d"; + m_current_timestamp_regex += "\\d{2}"; + return nullptr; +} + +auto TimestampPatternsFileParser::percent_e_rule(NonTerminal* /* m */) -> unique_ptr { + m_current_timestamp_format += "%e"; + m_current_timestamp_regex += "\\d{1,2}"; + return nullptr; +} + +auto TimestampPatternsFileParser::percent_a_rule(NonTerminal* /* m */) -> unique_ptr { + m_current_timestamp_format += "%a"; + m_current_timestamp_regex += "[A-Za-z]{3}"; + return nullptr; +} + +auto TimestampPatternsFileParser::percent_H_rule(NonTerminal* /* m */) -> unique_ptr { + m_current_timestamp_format += "%H"; + m_current_timestamp_regex += "\\d{2}"; + return nullptr; +} + +auto TimestampPatternsFileParser::percent_k_rule(NonTerminal* /* m */) -> unique_ptr { + m_current_timestamp_format += "%k"; + m_current_timestamp_regex += "\\d{1,2}"; + return nullptr; +} + +auto TimestampPatternsFileParser::percent_l_rule(NonTerminal* /* m */) -> unique_ptr { + m_current_timestamp_format += "%l"; + m_current_timestamp_regex += "\\d{1,2}"; + return nullptr; +} + +auto TimestampPatternsFileParser::percent_p_rule(NonTerminal* /* m */) -> unique_ptr { + m_current_timestamp_format += "%p"; + m_current_timestamp_regex += "[A-Za-z]{2}"; + return nullptr; +} + +auto TimestampPatternsFileParser::percent_M_rule(NonTerminal* /* m */) -> unique_ptr { + m_current_timestamp_format += "%M"; + m_current_timestamp_regex += "\\d{2}"; + return nullptr; +} + +auto TimestampPatternsFileParser::percent_S_rule(NonTerminal* /* m */) -> unique_ptr { + m_current_timestamp_format += "%S"; + m_current_timestamp_regex += "\\d{2}"; + return nullptr; +} + +auto TimestampPatternsFileParser::percent_3_rule(NonTerminal* /* m */) -> unique_ptr { + m_current_timestamp_format += "%3"; + m_current_timestamp_regex += "\\d{3}"; + return nullptr; +} + +auto TimestampPatternsFileParser::cancel_literal_rule(NonTerminal* /* m */) -> 
unique_ptr { + m_current_timestamp_format += "%"; + m_current_timestamp_regex += "%"; + return nullptr; +} + +auto TimestampPatternsFileParser::literal_rule(NonTerminal* m) -> unique_ptr { + m_current_timestamp_format += m->token_cast(0)->to_string(); + m_current_timestamp_regex += m->token_cast(0)->to_string(); + return nullptr; +} + +auto TimestampPatternsFileParser::special_literal_rule(NonTerminal* m) -> unique_ptr { + m_current_timestamp_format += m->token_cast(0)->to_string(); + m_current_timestamp_regex += "\\" + m->token_cast(0)->to_string(); + return nullptr; +} + +void TimestampPatternsFileParser::add_lexical_rules() { + add_token_group("Digit", make_unique('0', '9')); + add_token("Colon", ':'); + add_token("Percent", '%'); + add_token("Y", 'Y'); + add_token("y", 'y'); + add_token("m", 'm'); + add_token("b", 'b'); + add_token("B", 'B'); + add_token("d", 'd'); + add_token("e", 'e'); + add_token("a", 'a'); + add_token("H", 'H'); + add_token("k", 'k'); + add_token("l", 'l'); + add_token("p", 'p'); + add_token("M", 'M'); + add_token("S", 'S'); + add_token("3", '3'); + add_token("r", 'r'); + add_token("NewLine", '\n'); + add_token("CarriageReturn", '\r'); + // special characters that must be led by a '\' in regex to be literals + // (refer to productions in SchemaParser using regex_cancel_literal_rule) + std::vector special_characters; + special_characters.push_back('('); + special_characters.push_back(')'); + special_characters.push_back('*'); + special_characters.push_back('+'); + special_characters.push_back('-'); + special_characters.push_back('.'); + special_characters.push_back('['); + special_characters.push_back('\\'); + special_characters.push_back(']'); + special_characters.push_back('^'); + special_characters.push_back('{'); + special_characters.push_back('|'); + special_characters.push_back('}'); + unique_ptr special_characters_group + = make_unique(special_characters); + add_token_group("SpecialCharacters", std::move(special_characters_group)); + 
// default constructs to an m_negate group + unique_ptr literal_characters = make_unique(); + literal_characters->add_literal('\r'); + literal_characters->add_literal('\n'); + literal_characters->add_literal('%'); + for(uint32_t i : special_characters) { + literal_characters->add_literal(i); + } + add_token_group("LiteralCharacter", std::move(literal_characters)); + // everything below is for comments + add_token("Hash", '#'); + // default constructs to an m_negate group + unique_ptr comment_characters = make_unique(); + comment_characters->add_literal('\r'); + comment_characters->add_literal('\n'); + add_token_group("CommentCharacter", std::move(comment_characters)); +} + +void TimestampPatternsFileParser::add_productions() { + add_production("TimestampPatterns", {"Comment"}, nullptr); + add_production("TimestampPatterns", {"TimestampPattern"}, nullptr); + add_production("TimestampPatterns", {"TimestampPatterns", "PortableNewLine"},nullptr); + add_production("TimestampPatterns", {"TimestampPatterns", "PortableNewLine", "Comment"}, + nullptr); + add_production("TimestampPatterns", + {"TimestampPatterns", "PortableNewLine", "TimestampPattern"}, nullptr); + add_production("PortableNewLine", {"CarriageReturn", "NewLine"}, nullptr); + add_production("PortableNewLine", {"NewLine"}, nullptr); + add_production("Comment", {"Hash", "CommentString"}, nullptr); + add_production("CommentString", {"CommentString", "CommentCharacter"}, nullptr); + add_production("CommentString", {"CommentCharacter"}, nullptr); + add_production("TimestampPattern", {"NumSpaces", "Colon", "TimeFormat"}, + std::bind(&TimestampPatternsFileParser::timestamp_pattern_rule, this, + std::placeholders::_1)); + add_production("TimeFormat", {"TimeFormat", "Literal"}, nullptr); + add_production("TimeFormat", {"Literal"}, nullptr); + add_production("NumSpaces", {"NumSpaces", "Digit"}, + std::bind(&TimestampPatternsFileParser::existing_num_spaces_rule, this, + std::placeholders::_1)); + 
add_production("NumSpaces", {"Digit"}, + std::bind(&TimestampPatternsFileParser::new_num_spaces_rule, this, + std::placeholders::_1)); + /// TODO: add relative restrictions into lexer + add_production("Literal", {"Percent", "r"}, + std::bind(&TimestampPatternsFileParser::percent_r_rule, this, + std::placeholders::_1)); + add_production("Literal", {"Percent", "Y"}, + std::bind(&TimestampPatternsFileParser::percent_Y_rule, this, + std::placeholders::_1)); + add_production("Literal", {"Percent", "y"}, + std::bind(&TimestampPatternsFileParser::percent_y_rule, this, + std::placeholders::_1)); + add_production("Literal", {"Percent", "m"}, + std::bind(&TimestampPatternsFileParser::percent_m_rule, this, + std::placeholders::_1)); + add_production("Literal", {"Percent", "b"}, + std::bind(&TimestampPatternsFileParser::percent_b_rule, this, + std::placeholders::_1)); + add_production("Literal", {"Percent", "B"}, + std::bind(&TimestampPatternsFileParser::percent_B_rule, this, + std::placeholders::_1)); + add_production("Literal", {"Percent", "d"}, + std::bind(&TimestampPatternsFileParser::percent_d_rule, this, + std::placeholders::_1)); + add_production("Literal", {"Percent", "e"}, + std::bind(&TimestampPatternsFileParser::percent_e_rule, this, + std::placeholders::_1)); + add_production("Literal", {"Percent", "a"}, + std::bind(&TimestampPatternsFileParser::percent_a_rule, this, + std::placeholders::_1)); + add_production("Literal", {"Percent", "H"}, + std::bind(&TimestampPatternsFileParser::percent_H_rule, this, + std::placeholders::_1)); + add_production("Literal", {"Percent", "k"}, + std::bind(&TimestampPatternsFileParser::percent_k_rule, this, + std::placeholders::_1)); + add_production("Literal", {"Percent", "l"}, + std::bind(&TimestampPatternsFileParser::percent_l_rule, this, + std::placeholders::_1)); + add_production("Literal", {"Percent", "p"}, + std::bind(&TimestampPatternsFileParser::percent_p_rule, this, + std::placeholders::_1)); + add_production("Literal", 
{"Percent", "M"}, + std::bind(&TimestampPatternsFileParser::percent_M_rule, this, + std::placeholders::_1)); + add_production("Literal", {"Percent", "S"}, + std::bind(&TimestampPatternsFileParser::percent_S_rule, this, + std::placeholders::_1)); + add_production("Literal", {"Percent", "3"}, + std::bind(&TimestampPatternsFileParser::percent_3_rule, this, + std::placeholders::_1)); + add_production("Literal", {"Percent", "Percent"}, + std::bind(&TimestampPatternsFileParser::cancel_literal_rule, this, + std::placeholders::_1)); + add_production("Literal", {"LiteralCharacter"}, + std::bind(&TimestampPatternsFileParser::literal_rule, this, + std::placeholders::_1)); + add_production("Literal", {"SpecialCharacters"}, + std::bind(&TimestampPatternsFileParser::special_literal_rule, this, + std::placeholders::_1)); +} diff --git a/components/core/src/TimestampPatternsFileParser.hpp b/components/core/src/TimestampPatternsFileParser.hpp new file mode 100644 index 000000000..dfb72a421 --- /dev/null +++ b/components/core/src/TimestampPatternsFileParser.hpp @@ -0,0 +1,228 @@ +#ifndef TIMESTAMP_PATTERNS_FILE_PARSER_HPP +#define TIMESTAMP_PATTERNS_FILE_PARSER_HPP + +// Log Surgeon +#include +#include + +// Project headers +#include "TimestampPattern.hpp" + +class TimestampPatternsFileParser : + public log_surgeon::LALR1Parser { +public: + // Constructor + TimestampPatternsFileParser(); + + /** + * Adds current timestamp pattern to m_timestamp_patterns and resets + * @param m unused + * @return nullptr + */ + auto timestamp_pattern_rule(log_surgeon::NonTerminal* /* m */) + -> std::unique_ptr; + + /** + * Begins building the digit string for number of spaces in the timestamp + * @param m + * @return nullptr + */ + auto new_num_spaces_rule(log_surgeon::NonTerminal* m) -> + std::unique_ptr; + + /** + * Extends existing digit string for number of spaces in the timestamp + * @param m + * @return nullptr + */ + auto existing_num_spaces_rule(log_surgeon::NonTerminal* m) -> + 
std::unique_ptr; + + /** + * If "%r" is lexed adds "%r" to time format string and 1 or more digits to regex string + * @param m unused + * @return nullptr + */ + auto percent_r_rule(log_surgeon::NonTerminal* /* m */) -> + std::unique_ptr; + + /** + * If "%Y" is lexed adds "%Y" to time format string and 4 digits to regex string + * @param m unused + * @return nullptr + */ + auto percent_Y_rule(log_surgeon::NonTerminal* /* m */) -> + std::unique_ptr; + + /** + * If "%y" is lexed adds "%y" to time format string and 2 digits to regex string + * @param m unused + * @return nullptr + */ + auto percent_y_rule(log_surgeon::NonTerminal* /* m */) -> + std::unique_ptr; + + /** + * If "%m" is lexed adds "%m" to time format string and 2 digits to regex string + * @param m unused + * @return nullptr + */ + auto percent_m_rule(log_surgeon::NonTerminal* /* m */) -> + std::unique_ptr; + + /** + * If "%b" is lexed adds "%b" to time format string and 3 characters to regex string + * @param m unused + * @return nullptr + */ + auto percent_b_rule(log_surgeon::NonTerminal* /* m */) -> + std::unique_ptr; + + /** + * If "%B" is lexed adds "%B" to time format string and 3-9 characters to regex string + * @param m unused + * @return nullptr + */ + auto percent_B_rule(log_surgeon::NonTerminal* /* m */) -> + std::unique_ptr; + + /** + * If "%d" is lexed adds "%d" to time format string and 2 digits to regex string + * @param m unused + * @return nullptr + */ + auto percent_d_rule(log_surgeon::NonTerminal* /* m */) -> + std::unique_ptr; + + /** + * If "%e" is lexed adds "%e" to time format string and 1-2 digits to regex string + * @param m unused + * @return nullptr + */ + auto percent_e_rule(log_surgeon::NonTerminal* /* m */) -> + std::unique_ptr; + + /** + * If "%a" is lexed adds "%a" to time format string and 3 characters to regex string + * @param m unused + * @return nullptr + */ + auto percent_a_rule(log_surgeon::NonTerminal* /* m */) -> + std::unique_ptr; + + /** + * If "%H" is lexed
adds "%H" to time format string and 2 digits to regex string + * @param m unused + * @return nullptr + */ + auto percent_H_rule(log_surgeon::NonTerminal* /* m */) -> + std::unique_ptr; + + /** + * If "%k" is lexed adds "%k" to time format string and 1-2 digits to regex string + * @param m unused + * @return nullptr + */ + auto percent_k_rule(log_surgeon::NonTerminal* /* m */) -> + std::unique_ptr; + + /** + * If "%l" is lexed adds "%l" to time format string and 1-2 digits to regex string + * @param m unused + * @return nullptr + */ + auto percent_l_rule(log_surgeon::NonTerminal* /* m */) -> + std::unique_ptr; + + /** + * If "%p" is lexed adds "%p" to time format string and AM/PM to regex string + * @param m unused + * @return nullptr + */ + auto percent_p_rule(log_surgeon::NonTerminal* /* m */) -> + std::unique_ptr; + + /** + * If "%M" is lexed adds "%M" to time format string and 2 digits to regex string + * @param m unused + * @return nullptr + */ + auto percent_M_rule(log_surgeon::NonTerminal* /* m */) -> + std::unique_ptr; + + /** + * If "%S" is lexed adds "%S" to time format string and 2 digits to regex string + * @param m unused + * @return nullptr + */ + auto percent_S_rule(log_surgeon::NonTerminal* /* m */) -> + std::unique_ptr; + + /** + * If "%3" is lexed adds "%3" to time format string and 3 digits to regex string + * @param m unused + * @return nullptr + */ + auto percent_3_rule(log_surgeon::NonTerminal* /* m */) -> + std::unique_ptr; + + /** + * If "%%" is lexed, adds "%" to the time format and regex strings + * @param m unused + * @return nullptr + */ + auto cancel_literal_rule(log_surgeon::NonTerminal* /* m */) + -> std::unique_ptr; + + /** + * Adds a lexed literal to the time format and regex strings + * @param m contains lexed character + * @return nullptr + */ + auto literal_rule(log_surgeon::NonTerminal* m) -> std::unique_ptr; + + /** + * Adds a lexed special literal to the time format and regex strings + * (e.g.
'-' in regex is "\-") + * @param m contains lexed character + * @return nullptr + */ + auto special_literal_rule(log_surgeon::NonTerminal* m) -> std::unique_ptr; + + /** + * Parse user defined timestamp patterns file in reader and store them in m_timestamp_patterns + * @param reader + */ + auto generate_timestamp_patterns(log_surgeon::Reader& reader) -> void; + + /** + * Wrapper around generate_timestamp_patterns() + * @param file_path + * @return a vector containing the parsed timestamp patterns + */ + static auto try_timestamp_patterns_file(std::string const& file_path) + -> std::vector; + +private: + /** + * Add all lexical rules needed for timestamp patterns lexing + */ + auto add_lexical_rules() -> void; + + /** + * Add all productions needed for timestamp patterns parsing + */ + auto add_productions() -> void; + + // contains all timestamp patterns parsed + std::vector m_timestamp_patterns; + // contains num_spaces of timestamp pattern currently being parsed + std::string m_current_timestamp_num_spaces; + // contains time format of timestamp pattern currently being parsed + std::string m_current_timestamp_format; + // contains regex of timestamp pattern currently being parsed + std::string m_current_timestamp_regex; +}; + +#endif //TIMESTAMP_PATTERNS_FILE_PARSER_HPP diff --git a/components/core/src/Utils.cpp b/components/core/src/Utils.cpp index 520a3b64f..26bff70d2 100644 --- a/components/core/src/Utils.cpp +++ b/components/core/src/Utils.cpp @@ -243,21 +243,21 @@ void load_lexer_from_file (std::string schema_file_path, lexer.m_symbol_id[log_surgeon::cTokenEnd] = (int) log_surgeon::SymbolID::TokenEndID; lexer.m_symbol_id[log_surgeon::cTokenUncaughtString] = (int) log_surgeon::SymbolID::TokenUncaughtStringID; - lexer.m_symbol_id[log_surgeon::cTokenInt] = (int) log_surgeon::SymbolID::TokenIntId; - lexer.m_symbol_id[log_surgeon::cTokenFloat] = (int) log_surgeon::SymbolID::TokenFloatId; lexer.m_symbol_id[log_surgeon::cTokenFirstTimestamp] = (int)
log_surgeon::SymbolID::TokenFirstTimestampId; lexer.m_symbol_id[log_surgeon::cTokenNewlineTimestamp] = (int) log_surgeon::SymbolID::TokenNewlineTimestampId; + lexer.m_symbol_id[log_surgeon::cTokenInt] = (int) log_surgeon::SymbolID::TokenIntId; + lexer.m_symbol_id[log_surgeon::cTokenFloat] = (int) log_surgeon::SymbolID::TokenFloatId; lexer.m_symbol_id[log_surgeon::cTokenNewline] = (int) log_surgeon::SymbolID::TokenNewlineId; lexer.m_id_symbol[(int) log_surgeon::SymbolID::TokenEndID] = log_surgeon::cTokenEnd; lexer.m_id_symbol[(int) log_surgeon::SymbolID::TokenUncaughtStringID] = log_surgeon::cTokenUncaughtString; - lexer.m_id_symbol[(int) log_surgeon::SymbolID::TokenIntId] = log_surgeon::cTokenInt; - lexer.m_id_symbol[(int) log_surgeon::SymbolID::TokenFloatId] = log_surgeon::cTokenFloat; lexer.m_id_symbol[(int) log_surgeon::SymbolID::TokenFirstTimestampId] = log_surgeon::cTokenFirstTimestamp; lexer.m_id_symbol[(int) log_surgeon::SymbolID::TokenNewlineTimestampId] = log_surgeon::cTokenNewlineTimestamp; + lexer.m_id_symbol[(int) log_surgeon::SymbolID::TokenIntId] = log_surgeon::cTokenInt; + lexer.m_id_symbol[(int) log_surgeon::SymbolID::TokenFloatId] = log_surgeon::cTokenFloat; lexer.m_id_symbol[(int) log_surgeon::SymbolID::TokenNewlineId] = log_surgeon::cTokenNewline; /// TODO: figure out why this needs to be specially added diff --git a/components/core/src/clp/CommandLineArguments.cpp b/components/core/src/clp/CommandLineArguments.cpp index e5c7c415b..5d12b4257 100644 --- a/components/core/src/clp/CommandLineArguments.cpp +++ b/components/core/src/clp/CommandLineArguments.cpp @@ -161,6 +161,7 @@ namespace clp { po::options_description extraction_positional_options; extraction_positional_options.add_options() ("archives-dir", po::value(&m_archives_dir)) + ("ts-file-path", po::value(&m_ts_patterns_file_path)) ("output-dir", po::value(&m_output_dir)) ("paths", po::value< vector >(&m_input_paths)->composing()) ; @@ -208,10 +209,12 @@ namespace clp { // Define 
compression hidden positional options po::options_description compression_positional_options; compression_positional_options.add_options() + ("ts-file-path", po::value(&m_ts_patterns_file_path)) ("output-dir", po::value(&m_output_dir)) ("input-paths", po::value< vector >(&m_input_paths)->composing()) ; po::positional_options_description compression_positional_options_description; + compression_positional_options_description.add("ts-file-path", 1); compression_positional_options_description.add("output-dir", 1); compression_positional_options_description.add("input-paths", -1); @@ -257,7 +260,7 @@ namespace clp { cerr << "Examples:" << endl; cerr << " # Compress file1.txt and dir1 into the output dir" << endl; - cerr << " " << get_program_name() << " c output-dir file1.txt dir1" << endl; + cerr << " " << get_program_name() << " c ts-file-path output-dir file1.txt dir1" << endl; cerr << endl; po::options_description visible_options; @@ -302,7 +305,19 @@ namespace clp { } } } - + // Validate timestamp patterns file + if (m_ts_patterns_file_path.empty()) { + throw invalid_argument("Timestamp file (ts-file-path) not specified or empty."); + } + if (false == boost::filesystem::exists(m_ts_patterns_file_path)) { + throw invalid_argument("Specified timestamp file (ts-file-path) '" + + m_ts_patterns_file_path + "' does not exist."); + } + if (false == boost::filesystem::is_regular_file(m_ts_patterns_file_path)) { + throw invalid_argument("Specified timestamp file (ts-file-path) '" + + m_ts_patterns_file_path + "' is not a regular file."); + } + // Validate an output directory was specified if (m_output_dir.empty()) { throw invalid_argument("output-dir not specified or empty."); diff --git a/components/core/src/clp/CommandLineArguments.hpp b/components/core/src/clp/CommandLineArguments.hpp index 6d61024ee..09ee3b8d5 100644 --- a/components/core/src/clp/CommandLineArguments.hpp +++ b/components/core/src/clp/CommandLineArguments.hpp @@ -33,6 +33,7 @@ namespace clp { const 
std::string& get_path_prefix_to_remove () const { return m_path_prefix_to_remove; } const std::string& get_output_dir () const { return m_output_dir; } const std::string& get_schema_file_path () const { return m_schema_file_path; } + const std::string& get_ts_patterns_file_path () const { return m_ts_patterns_file_path; } bool get_use_heuristic () const { return (m_schema_file_path.empty()); } bool show_progress () const { return m_show_progress; } bool print_archive_stats_progress () const { return m_print_archive_stats_progress; } @@ -56,6 +57,7 @@ namespace clp { std::string m_path_prefix_to_remove; std::string m_output_dir; std::string m_schema_file_path; + std::string m_ts_patterns_file_path; bool m_show_progress; bool m_print_archive_stats_progress; size_t m_target_encoded_file_size; diff --git a/components/core/src/clp/FileCompressor.cpp b/components/core/src/clp/FileCompressor.cpp index 45204fbed..170696d45 100644 --- a/components/core/src/clp/FileCompressor.cpp +++ b/components/core/src/clp/FileCompressor.cpp @@ -91,7 +91,7 @@ namespace clp { size_t target_encoded_file_size, const FileToCompress& file_to_compress, streaming_archive::writer::Archive& archive_writer, bool use_heuristic) { std::string file_name = std::filesystem::canonical(file_to_compress.get_path()).string(); - + SPDLOG_INFO("Start parsing {}", file_name); PROFILER_SPDLOG_INFO("Start parsing {}", file_name) Profiler::start_continuous_measurement(); @@ -131,6 +131,7 @@ namespace clp { Profiler::stop_continuous_measurement(); LOG_CONTINUOUS_MEASUREMENT(Profiler::ContinuousMeasurementIndex::ParseLogFile) PROFILER_SPDLOG_INFO("Done parsing {}", file_name) + SPDLOG_INFO("Done parsing {}", file_name); return succeeded; } diff --git a/components/core/src/clp/run.cpp b/components/core/src/clp/run.cpp index f5912ec3d..bc785f4b3 100644 --- a/components/core/src/clp/run.cpp +++ b/components/core/src/clp/run.cpp @@ -12,6 +12,7 @@ // Project headers #include "../Profiler.hpp" +#include 
"../TimestampPatternsFileParser.hpp" #include "../Utils.hpp" #include "CommandLineArguments.hpp" #include "compression.hpp" @@ -35,8 +36,7 @@ namespace clp { return -1; } Profiler::init(); - TimestampPattern::init(); - + clp::CommandLineArguments command_line_args("clp"); auto parsing_result = command_line_args.parse_arguments(argc, argv); switch (parsing_result) { @@ -48,7 +48,13 @@ namespace clp { // Continue processing break; } - + + const std::string& ts_patterns_file_path = + command_line_args.get_ts_patterns_file_path(); + std::vector timestamp_patterns = + TimestampPatternsFileParser::try_timestamp_patterns_file(ts_patterns_file_path); + TimestampPattern::init(timestamp_patterns); + vector input_paths = command_line_args.get_input_paths(); Profiler::start_continuous_measurement(); @@ -65,7 +71,12 @@ namespace clp { std::unique_ptr reader_parser; if (!command_line_args.get_use_heuristic()) { const std::string& schema_file_path = command_line_args.get_schema_file_path(); - reader_parser = std::make_unique(schema_file_path); + log_surgeon::Schema schema(schema_file_path); + // TODO: give an error if timestamp is specified in schema file + for(TimestampPattern timestamp_pattern : timestamp_patterns) { + schema.add_variable("timestamp", timestamp_pattern.get_regex(), 0); + } + reader_parser = std::make_unique(schema); } boost::filesystem::path path_prefix_to_remove(command_line_args.get_path_prefix_to_remove()); diff --git a/components/core/src/streaming_archive/reader/File.cpp b/components/core/src/streaming_archive/reader/File.cpp index 8b3ac80ca..1a844585b 100644 --- a/components/core/src/streaming_archive/reader/File.cpp +++ b/components/core/src/streaming_archive/reader/File.cpp @@ -69,7 +69,7 @@ namespace streaming_archive::reader { m_timestamp_patterns.emplace_back( std::piecewise_construct, std::forward_as_tuple(msg_num), - forward_as_tuple(num_spaces_before_ts, timestamp_format)); + forward_as_tuple(num_spaces_before_ts, timestamp_format, "")); } 
m_num_messages = file_metadata_ix.get_num_messages(); diff --git a/components/core/src/streaming_archive/writer/Archive.cpp b/components/core/src/streaming_archive/writer/Archive.cpp index 955975852..ac71d978e 100644 --- a/components/core/src/streaming_archive/writer/Archive.cpp +++ b/components/core/src/streaming_archive/writer/Archive.cpp @@ -292,8 +292,7 @@ namespace streaming_archive::writer { size_t start; size_t end; timestamp_pattern = (TimestampPattern*) TimestampPattern::search_known_ts_patterns( - log_view.get_log_output_buffer()->get_mutable_token(0).to_string(), timestamp, - start, end); + log_view.get_timestamp()->to_string(), timestamp, start, end); if (m_old_ts_pattern != *timestamp_pattern) { change_ts_pattern(timestamp_pattern); m_old_ts_pattern = *timestamp_pattern; diff --git a/components/core/tests/test_log_files/log.txt b/components/core/tests/test_log_files/log.txt index f514ec82d..42c4240f4 100644 --- a/components/core/tests/test_log_files/log.txt +++ b/components/core/tests/test_log_files/log.txt @@ -1,3 +1,13 @@ + ... 52 more +2016-05-08 07:34:05.255 Some Static Text Then MyDog123 APet4123\test.txt Then 123 then 123.123 + +75 + +test test 90061999ms one day one hour one minute one second rest milliseconds +999 rest milliseconds +1999 one second rest milliseconds +61999 one minute one second rest milliseconds +3661999 one hour one minute one second rest milliseconds 2016-05-08 07:34:05.251 MyDog123 APet4123\test.txt 2016-05-08 07:34:05.252 statictext123 2016-05-08 07:34:05.253 123 diff --git a/components/core/tests/test_time_format_files/default_formats.txt b/components/core/tests/test_time_format_files/default_formats.txt index 09245ab8b..05afab9fe 100644 --- a/components/core/tests/test_time_format_files/default_formats.txt +++ b/components/core/tests/test_time_format_files/default_formats.txt @@ -51,9 +51,11 @@ 0:%b %d %H:%M:%S # E.g. 01-21 11:56:42.392 0:%m-%d %H:%M:%S.%3 -# E.g. 925123679 -0:%rn -# E.g. 925123679ns -0:%rnns +# E.g. 
3661999 +0:%r +# E.g. 3661999ms +0:%rms +# E.g. INFO [main] 3661999ms +2:%rms # This is not allowed: 0:%rs:%3 E.g. 925:913 # This is not allowed: 0:%rs913 E.g. 925913