Allow non-timestamped log messages to start with a variable. (#9)

y-scope · Sep 22, 2023 · e2f94cf
1 parent 20ef752
commit e2f94cf
Show file tree

Hide file tree

Showing 6 changed files with 33 additions and 4 deletions.
diff --git a/docs/schema.md b/docs/schema.md
@@ -96,7 +96,7 @@ delimiters: \t\r\n:,!;%
 timestamp:\d{4}\-\d{2}\-\d{2} \d{2}:\d{2}:\d{2}(\.\d{3}){0,1}
 timestamp:\[\d{8}\-\d{2}:\d{2}:\d{2}\]
 int:\-{0,1}[0-9]+
-double:\-{0,1}[0-9]+\.[0-9]+
+float:\-{0,1}[0-9]+\.[0-9]+
 
 // Custom variables
 hex:[a-fA-F]+
@@ -109,7 +109,7 @@ equals:.*=.*[a-zA-Z0-9].*
 * `timestamp` matches two different patterns:
     * 2023-04-19 12:32:08.064
     * [20230419-12:32:08]
-* `int`, `double`, `hex`, `hasNumber`, and `equals` all match different user
+* `int`, `float`, `hex`, `hasNumber`, and `equals` all match different user
   defined variables.
 
 ## Regular Expression Syntax

diff --git a/examples/schema.txt b/examples/schema.txt
@@ -54,7 +54,7 @@ delimiters: \t\r\n:,!;%
 
 // First set of variables
 int:\-{0,1}[0-9]+
-double:\-{0,1}[0-9]+\.[0-9]+
+float:\-{0,1}[0-9]+\.[0-9]+
 
 // Second set of variables
 hex:[a-fA-F]+

diff --git a/src/log_surgeon/Constants.hpp b/src/log_surgeon/Constants.hpp
@@ -48,6 +48,7 @@ namespace utf8 {
     // 0xFF are invalid UTF-8 code units
     static unsigned char const cCharEOF = 0xFF;
     static unsigned char const cCharErr = 0xFE;
+    static unsigned char const cCharStartOfFile = 0xFD;  // start-of-input sentinel; the lexer registers it as a delimiter
 }  // namespace utf8
 }  // namespace log_surgeon
 

diff --git a/src/log_surgeon/Lexer.hpp b/src/log_surgeon/Lexer.hpp
@@ -83,6 +83,13 @@ class Lexer {
      */
     auto reset() -> void;
 
+    /**
+     * Sets the lexer state as if a start-of-file delimiter had just been read,
+     * anchoring the start and match positions at the buffer's current position.
+     * @param input_buffer Buffer containing the data to be lexed
+     */
+    auto prepend_start_of_file_char(ParserInputBuffer& input_buffer) -> void;
+
     /**
      * Flip lexer states to match static buffer flipping.
      * @param old_storage_size The previous buffer size used to calculate the

diff --git a/src/log_surgeon/Lexer.tpp b/src/log_surgeon/Lexer.tpp
@@ -329,6 +329,17 @@ void Lexer<NFAStateType, DFAStateType>::reset() {
     m_prev_state = nullptr;
 }
 
+template <typename NFAStateType, typename DFAStateType>
+void Lexer<NFAStateType, DFAStateType>::prepend_start_of_file_char(ParserInputBuffer& input_buffer
+) {
+    m_prev_state = m_dfa->get_root()->next(utf8::cCharStartOfFile);  // state reached from the DFA root on the start-of-file sentinel
+    m_asked_for_more_data = true;  // NOTE(review): presumably makes the next scan resume from m_prev_state instead of the root - confirm
+    m_start_pos = input_buffer.storage().pos();  // start and match positions both begin at the buffer's current position
+    m_match_pos = input_buffer.storage().pos();
+    m_match_line = m_line;
+    m_type_ids = nullptr;
+}
+
 template <typename NFAStateType, typename DFAStateType>
 void Lexer<NFAStateType, DFAStateType>::add_delimiters(std::vector<uint32_t> const& delimiters) {
     assert(!delimiters.empty());
@@ -339,6 +350,7 @@ void Lexer<NFAStateType, DFAStateType>::add_delimiters(std::vector<uint32_t> con
     for (uint32_t delimiter : delimiters) {
         m_is_delimiter[delimiter] = true;
     }
+    m_is_delimiter[utf8::cCharStartOfFile] = true;
 }
 
 template <typename NFAStateType, typename DFAStateType>

diff --git a/src/log_surgeon/LogParser.cpp b/src/log_surgeon/LogParser.cpp
@@ -63,7 +63,15 @@ void LogParser::add_rules(SchemaAST const* schema_ast) {
             unique_ptr<RegexAST<RegexNFAByteState>> first_timestamp_regex_ast(
                     rule->m_regex_ptr->clone()
             );
-            add_rule("firstTimestamp", std::move(first_timestamp_regex_ast));
+            unique_ptr<RegexASTLiteral<RegexNFAByteState>> r1
+                    = make_unique<RegexASTLiteral<RegexNFAByteState>>(utf8::cCharStartOfFile);
+            add_rule(
+                    "firstTimestamp",
+                    make_unique<RegexASTCat<RegexNFAByteState>>(
+                            std::move(r1),
+                            std::move(first_timestamp_regex_ast)
+                    )
+            );
             unique_ptr<RegexAST<RegexNFAByteState>> newline_timestamp_regex_ast(
                     rule->m_regex_ptr->clone()
             );
@@ -143,6 +151,7 @@ void LogParser::add_rules(SchemaAST const* schema_ast) {
 auto LogParser::reset() -> void {
     m_input_buffer.reset();
     m_lexer.reset();
+    m_lexer.prepend_start_of_file_char(m_input_buffer);  // set lexer state as if a start-of-file delimiter was already read
 }
 
 // TODO: if the first text is a variable in the no timestamp case you lose the