diff --git a/docs/schema.md b/docs/schema.md index 4fd60f9d..df5ae19a 100644 --- a/docs/schema.md +++ b/docs/schema.md @@ -96,7 +96,7 @@ delimiters: \t\r\n:,!;% timestamp:\d{4}\-\d{2}\-\d{2} \d{2}:\d{2}:\d{2}(\.\d{3}){0,1} timestamp:\[\d{8}\-\d{2}:\d{2}:\d{2}\] int:\-{0,1}[0-9]+ -double:\-{0,1}[0-9]+\.[0-9]+ +float:\-{0,1}[0-9]+\.[0-9]+ // Custom variables hex:[a-fA-F]+ @@ -109,7 +109,7 @@ equals:.*=.*[a-zA-Z0-9].* * `timestamp` matches two different patterns: * 2023-04-19 12:32:08.064 * [20230419-12:32:08] -* `int`, `double`, `hex`, `hasNumber`, and `equals` all match different user +* `int`, `float`, `hex`, `hasNumber`, and `equals` all match different user defined variables. ## Regular Expression Syntax diff --git a/examples/schema.txt b/examples/schema.txt index b2a340ef..171da848 100644 --- a/examples/schema.txt +++ b/examples/schema.txt @@ -54,7 +54,7 @@ delimiters: \t\r\n:,!;% // First set of variables int:\-{0,1}[0-9]+ -double:\-{0,1}[0-9]+\.[0-9]+ +float:\-{0,1}[0-9]+\.[0-9]+ // Second set of variables hex:[a-fA-F]+ diff --git a/src/log_surgeon/Constants.hpp b/src/log_surgeon/Constants.hpp index 5752749a..471efcca 100644 --- a/src/log_surgeon/Constants.hpp +++ b/src/log_surgeon/Constants.hpp @@ -48,6 +48,7 @@ namespace utf8 { // 0xFF are invalid UTF-8 code units static unsigned char const cCharEOF = 0xFF; static unsigned char const cCharErr = 0xFE; + static unsigned char const cCharStartOfFile = 0xFD; } // namespace utf8 } // namespace log_surgeon diff --git a/src/log_surgeon/Lexer.hpp b/src/log_surgeon/Lexer.hpp index 7a362210..d5a12bab 100644 --- a/src/log_surgeon/Lexer.hpp +++ b/src/log_surgeon/Lexer.hpp @@ -83,6 +83,13 @@ class Lexer { */ auto reset() -> void; + /** + * Set the lexer state as if it had already read a delimiter (used for + * treating start of file as a delimiter) + * @param input_buffer containing the data to be lexed + */ + auto prepend_start_of_file_char(ParserInputBuffer& input_buffer) -> void; + /** * Flip lexer states to match static buffer flipping. * @param old_storage_size The previous buffer size used to calculate the diff --git a/src/log_surgeon/Lexer.tpp b/src/log_surgeon/Lexer.tpp index 874911d9..bf73d43e 100644 --- a/src/log_surgeon/Lexer.tpp +++ b/src/log_surgeon/Lexer.tpp @@ -329,6 +329,17 @@ void Lexer::reset() { m_prev_state = nullptr; } +template +void Lexer::prepend_start_of_file_char(ParserInputBuffer& input_buffer +) { + m_prev_state = m_dfa->get_root()->next(utf8::cCharStartOfFile); + m_asked_for_more_data = true; + m_start_pos = input_buffer.storage().pos(); + m_match_pos = input_buffer.storage().pos(); + m_match_line = m_line; + m_type_ids = nullptr; +} + template void Lexer::add_delimiters(std::vector const& delimiters) { assert(!delimiters.empty()); @@ -339,6 +350,7 @@ void Lexer::add_delimiters(std::vector con for (uint32_t delimiter : delimiters) { m_is_delimiter[delimiter] = true; } + m_is_delimiter[utf8::cCharStartOfFile] = true; } template diff --git a/src/log_surgeon/LogParser.cpp b/src/log_surgeon/LogParser.cpp index 9f677d0c..2d26fb98 100644 --- a/src/log_surgeon/LogParser.cpp +++ b/src/log_surgeon/LogParser.cpp @@ -63,7 +63,15 @@ void LogParser::add_rules(SchemaAST const* schema_ast) { unique_ptr> first_timestamp_regex_ast( rule->m_regex_ptr->clone() ); - add_rule("firstTimestamp", std::move(first_timestamp_regex_ast)); + unique_ptr> r1 + = make_unique>(utf8::cCharStartOfFile); + add_rule( + "firstTimestamp", + make_unique>( + std::move(r1), + std::move(first_timestamp_regex_ast) + ) + ); unique_ptr> newline_timestamp_regex_ast( rule->m_regex_ptr->clone() ); @@ -143,6 +151,7 @@ void LogParser::add_rules(SchemaAST const* schema_ast) { auto LogParser::reset() -> void { m_input_buffer.reset(); m_lexer.reset(); + m_lexer.prepend_start_of_file_char(m_input_buffer); } // TODO: if the first text is a variable in the no timestamp case you lose the