Add metadata support and better number representation
TommYDeeee committed Feb 26, 2024
1 parent ba83809 commit c203e25
Showing 16 changed files with 339 additions and 61 deletions.
6 changes: 4 additions & 2 deletions example.yar
@@ -5,10 +5,12 @@ rule test
{
//Rule block comment

meta:
author = "Author"
description = -20.3
//String comment
strings:
$a = "foo"
$b = "bar"
condition:
$b and not true or false
$b and not true or true
}
36 changes: 14 additions & 22 deletions src/lexer/mod.rs
@@ -10,37 +10,22 @@ use crate::{
};
use logos::Logos;
use std::fmt;
use std::num::ParseIntError;
use text_size::{TextRange, TextSize};

#[derive(Default, Debug, Clone, PartialEq)]
pub(crate) enum LexingError {
InvalidInteger(String),
#[default]
InvalidCharacter,
}

// Implement Display trait for LexingError
impl fmt::Display for LexingError {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
match self {
LexingError::InvalidInteger(msg) => write!(f, "Invalid integer: {}", msg),
LexingError::InvalidCharacter => write!(f, "Invalid character"),
}
}
}

/// Error type returned by calling `lex.slice().parse()` to u8.
impl From<ParseIntError> for LexingError {
fn from(err: ParseIntError) -> Self {
use std::num::IntErrorKind::*;
match err.kind() {
PosOverflow | NegOverflow => LexingError::InvalidInteger("overflow error".to_owned()),
_ => LexingError::InvalidInteger("other error".to_owned()),
}
}
}

#[derive(Logos, Debug, PartialEq)]
#[logos(error = LexingError)]
pub(crate) enum LogosToken {
@@ -58,6 +43,8 @@ pub(crate) enum LogosToken {
// Keywords
#[token("rule")]
Rule,
#[token("meta")]
Meta,
#[token("strings")]
Strings,
#[token("condition")]
@@ -72,7 +59,7 @@ pub(crate) enum LogosToken {
#[regex("[a-zA-Z][a-zA-Z0-9_]*", |lex| lex.slice().to_string())]
Identifier(String),
// Variables
#[regex(r"\$[a-zA-Z][a-zA-Z0-9_]*", |lex| lex.slice().to_string())]
#[regex(r"\$_?[a-zA-Z][a-zA-Z0-9_]*", |lex| lex.slice().to_string())]
Variable(String),
// Strings
#[regex(r#""[^"]*""#, |lex| lex.slice().to_string())]
@@ -92,9 +79,12 @@ pub(crate) enum LogosToken {
RParen,
#[token(",")]
Comma,
// Numbers
#[regex(r"[0-9]+", |lex| lex.slice().parse())]
Number(i64),
// Integer
#[regex(r"-?0x[a-fA-F0-9]+|-?0o[0-7]+|-?[0-9]+(KB|MB)?", |lex| lex.slice().to_string())]
Integer(String),
// Float
#[regex(r"-?[0-9]+\.[0-9]+", |lex| lex.slice().to_string())]
Float(String),
// Booleans
#[token("true")]
True,
@@ -103,7 +93,7 @@ pub(crate) enum LogosToken {

// Whitespace - I want to preserve whitespace tokens to implement full fidelity
// and error resilience
#[regex(r"[ \t\n\f]+")]
#[regex(r"[ \t\n\r]+")]
Whitespace,

// Comments
@@ -159,6 +149,7 @@ pub fn tokenize(text: &str) -> (Vec<Token>, Vec<SyntaxError>) {
fn logos_tokenkind_to_syntaxkind(token: LogosToken) -> SyntaxKind {
match token {
LogosToken::Rule => SyntaxKind::RULE_KW,
LogosToken::Meta => SyntaxKind::META_KW,
LogosToken::Strings => SyntaxKind::STRINGS_KW,
LogosToken::Condition => SyntaxKind::CONDITION_KW,
LogosToken::And => SyntaxKind::AND_KW,
@@ -174,7 +165,8 @@ fn logos_tokenkind_to_syntaxkind(token: LogosToken) -> SyntaxKind {
LogosToken::LParen => T!['('],
LogosToken::RParen => T![')'],
LogosToken::Comma => T![,],
LogosToken::Number(_) => SyntaxKind::NUMBER,
LogosToken::Integer(_) => SyntaxKind::INT_LIT,
LogosToken::Float(_) => SyntaxKind::FLOAT_LIT,
LogosToken::True => SyntaxKind::TRUE_KW,
LogosToken::False => SyntaxKind::FALSE_KW,
LogosToken::Whitespace => SyntaxKind::WHITESPACE,
@@ -238,7 +230,7 @@ mod tests {
rule foo {
condition:
$a = "test"
$b = 1234567890123456789012345678901234567890
$b = §
}
"#;
let (tokens, errors) = tokenize(input);
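
The Integer/Float split above keeps lexing infallible: the lexer now captures the raw slice as a String and defers numeric interpretation, sidestepping the i64 overflow that the `From<ParseIntError>` mapping above had to classify. A minimal self-contained sketch of the same idea, assuming a logos 0.13-style API where the lexer iterator yields `Result`:

    use logos::Logos;

    #[derive(Logos, Debug, PartialEq)]
    enum Tok {
        // Same Integer pattern as the commit: hex, octal, or decimal,
        // with an optional KB/MB multiplier suffix; the raw text is kept.
        #[regex(r"-?0x[a-fA-F0-9]+|-?0o[0-7]+|-?[0-9]+(KB|MB)?", |lex| lex.slice().to_string())]
        Integer(String),
        // Floats are matched separately; logos prefers the longest match,
        // so "-20.3" lexes as one Float rather than an Integer "-20".
        #[regex(r"-?[0-9]+\.[0-9]+", |lex| lex.slice().to_string())]
        Float(String),
        #[regex(r"[ \t\n\r]+", logos::skip)]
        Whitespace,
    }

    fn main() {
        let mut lex = Tok::lexer("0xFF 10KB -20.3");
        while let Some(tok) = lex.next() {
            // Ok(Tok::..) for matches, Err(..) for invalid characters like §
            println!("{:?} {:?}", tok, lex.slice());
        }
    }
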
47 changes: 40 additions & 7 deletions src/lib.rs
@@ -4,14 +4,13 @@
use crate::{
parser::SyntaxKind,
syntax::{
syntax_error::SyntaxError,
syntax_node::{SyntaxNode, SyntaxToken},
text_token_source::TextTokenSource,
syntax_error::SyntaxError, syntax_node::SyntaxNode, text_token_source::TextTokenSource,
text_tree_sink::TextTreeSink,
},
};

pub use crate::syntax::ast::*;
pub use crate::syntax::syntax_node::SyntaxToken;
pub use crate::syntax::SourceFile;

// used only for tests
@@ -21,7 +20,6 @@ use rowan_test::{NodeOrToken, WalkEvent};
use std::fs;
#[cfg(test)]
use std::io::Write;
use std::ops::Range;
#[cfg(test)]
use text_size::TextRange;

@@ -37,6 +35,9 @@ fn api_walktrough() {
// without errors
let source_code = "
rule test_rule {
meta:
author = \"author\"
number = -123
// This is a comment
strings:
$a = \"test\"
Expand Down Expand Up @@ -80,6 +81,34 @@ fn api_walktrough() {
assert_eq!(comment.text(), "This is a comment");
}

// We can also obtain the meta part of the rule
// it consists of the `meta` keyword and multiple `META_STMT` nodes
let meta = block.meta().unwrap();

// We can obtain the meta token
assert!(meta.meta_token().is_some());
assert!(meta.meta_token().unwrap().kind() == SyntaxKind::META_KW);

// and also the `COLON` token
assert!(meta.colon_token().is_some());

// Each meta statement consists of an identifier token,
// an assign token and a literal token
for meta_stmt in meta.meta_stmts() {
// each meta statement contains an identifier token
// an assign token and a literal token
let id = meta_stmt.identifier_token().unwrap();

// The identifier text is never empty
assert!(!id.text().is_empty());

// and also the assign token
assert!(meta_stmt.assign_token().is_some());

// assert that the literal token is either a string or an int
assert!(meta_stmt.string_lit_token().is_some() || meta_stmt.int_lit_token().is_some());
}

// This block expression consists (for now) of two parts
// optional strings and required condition part
// Firstly we can obtain the strings part
@@ -99,7 +128,7 @@ fn api_walktrough() {
for variable_stmt in strings.variable_stmts() {
// each variable statement contains a variable token
// an assign token and a literal token
// now I will showm only the pattern token as an example
// now I will show only the pattern token as an example
let pattern = variable_stmt.pattern().unwrap();

// For now pattern can be only a string literal
@@ -184,7 +213,7 @@ fn api_walktrough() {
// Some helpers:
// for example, get the token at a specific offset. This can be useful
// to obtain the token at a given error offset, to get its text, length etc.
let tkn = expression_stmt_syntax.token_at_offset(151.into());
let tkn = expression_stmt_syntax.token_at_offset(232.into());

// We can have an offset that is between two tokens, so we use the `right_biased` method
// to obtain the token on the right side of the offset if it is between two tokens
@@ -284,7 +313,11 @@ fn api_walktrough() {

// But luckily we can obtain the token at the offset
// and from it we can get both its text and length
let tkn = ast.syntax().token_at_offset(173.into()).right_biased().unwrap();
let tkn = ast
.syntax()
.token_at_offset(parse_struct.errors()[1].range().start())
.right_biased()
.unwrap();

assert_eq!(tkn.text(), "nor");
// The Error node also contains the appropriate nested SyntaxKind
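
The walkthrough relies on typed accessors such as `block.meta()` and `meta.meta_stmts()`, whose definitions are not part of this diff. In rowan-based parsers they are conventionally thin typed wrappers over the untyped syntax tree; a hypothetical sketch of that pattern, with stub types standing in for the crate's real ones:

    // Stand-in for rowan's SyntaxNode, just enough to show the shape.
    #[derive(Clone)]
    struct SyntaxNode {
        kind: u16,
        children: Vec<SyntaxNode>,
    }

    const META_STMT: u16 = 42; // assumed kind value, purely illustrative

    struct MetaStmt { syntax: SyntaxNode }

    impl MetaStmt {
        // `cast` accepts a node only when its kind matches META_STMT.
        fn cast(node: SyntaxNode) -> Option<MetaStmt> {
            (node.kind == META_STMT).then(|| MetaStmt { syntax: node })
        }
    }

    struct Meta { syntax: SyntaxNode }

    impl Meta {
        // meta_stmts() is then just "all children that cast to MetaStmt".
        fn meta_stmts(&self) -> impl Iterator<Item = MetaStmt> + '_ {
            self.syntax.children.iter().cloned().filter_map(MetaStmt::cast)
        }
    }

    fn main() {
        let meta = Meta {
            syntax: SyntaxNode {
                kind: 0,
                children: vec![SyntaxNode { kind: META_STMT, children: vec![] }],
            },
        };
        assert_eq!(meta.meta_stmts().count(), 1);
    }
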
70 changes: 57 additions & 13 deletions src/parser/grammar/expressions.rs
@@ -4,7 +4,9 @@ use super::*;

/// Recovery set for `strings` block. This also should be adjusted and tweaked to
/// better represent the recovery set later on
const VARIABLE_RECOVERY_SET: TokenSet = TokenSet::new(&[T![strings]]);
const STRINGS_RECOVERY_SET: TokenSet = TokenSet::new(&[T![strings]]);

const META_RECOVERY_SET: TokenSet = TokenSet::new(&[T![identifier]]);

/// Parse a rule body
/// A rule body consists of `{`, rule_body and `}`
@@ -29,9 +31,19 @@ pub(crate) fn block_expr(p: &mut Parser) {
pub(super) fn rule_body(p: &mut Parser) {
let mut has_strings = false;
let mut has_condition = false;
let mut has_meta = false;
while !p.at(EOF) && !p.at(T!['}']) {
match p.current() {
// add metadata support later
T![meta] => {
if has_meta {
p.error("only one meta block is allowed");
}
if has_condition || has_strings {
p.error("meta block must come before strings and condition blocks");
}
meta(p);
has_meta = true;
}
T![strings] => {
if has_strings {
p.error("only one strings block is allowed");
@@ -59,7 +71,7 @@
p.eat(T![:]);
if p.current() == T![variable] && p.nth(1) == T![=] {
strings_body(p)
} else {
} else if let Some(_) = expression(p, None, 1) {
condition_body(p);
}
}
@@ -68,6 +80,17 @@
}
}

/// Parse a `meta` block
/// It consists of the `meta` keyword, `:` token and meta body
fn meta(p: &mut Parser) {
assert!(p.at(T![meta]));
let m = p.start();
p.bump(T![meta]);
p.expect(T![:]);
meta_body(p);
m.complete(p, META);
}

/// Parse a `strings` block
/// It consists of the `strings` keyword, `:` token and strings body
fn strings(p: &mut Parser) {
@@ -90,32 +113,54 @@ fn condition(p: &mut Parser) {
m.complete(p, CONDITION);
}

/// Parse a `meta` body
/// It consists of a list of `identifier`, `=` token and a literal value
pub(super) fn meta_body(p: &mut Parser) {
while !p.at(EOF) && !p.at(T![strings]) && !p.at(T![condition]) && !p.at(T!['}']) {
let m = p.start();
if p.at(T![identifier]) {
p.bump(T![identifier]);
} else {
p.err_recover("expected an identifier", META_RECOVERY_SET);
}
p.expect(T![=]);
match p.current() {
STRING_LIT | TRUE_KW | FALSE_KW | INT_LIT | FLOAT_LIT => {
p.bump(p.current());
}
_ => {
p.error("expected a valid metadata value");
return;
}
}
m.complete(p, META_STMT);
}
}

/// Parse a `strings` body
/// It consists of a list of `variable`, `=` token and a string
pub(super) fn strings_body(p: &mut Parser) {
// add support for meta also
while !p.at(EOF) && !p.at(T![condition]) && !p.at(T!['}']) {
let m = p.start();
if p.at(T![variable]) {
p.bump(T![variable]);
} else {
p.err_recover("expected a variable", VARIABLE_RECOVERY_SET);
p.err_recover("expected a variable", STRINGS_RECOVERY_SET);
}
p.expect(T![=]);
// so far only strings are supported, later add match for hex strings and regex
pattern(p);
match p.current() {
STRING_LIT => pattern(p),
_ => p.err_and_bump("expected a string"),
}
m.complete(p, VARIABLE_STMT);
}
}

/// Parse a string. For now string can be only basic plaintext string
// add support for hex and regex strings later on
/// Parse a plaintext string pattern
fn pattern(p: &mut Parser) {
let m = p.start();
match p.current() {
STRING_LIT => p.bump(STRING_LIT),
_ => p.err_and_bump("expected a string"),
}
p.bump(STRING_LIT);
// add string modifiers
m.complete(p, PATTERN);
}
@@ -124,7 +169,6 @@ fn pattern(p: &mut Parser) {
/// It consists of a list of expressions
/// Pratt parser is used to parse expressions
pub(super) fn condition_body(p: &mut Parser) {
// add support for meta also
while !p.at(EOF) && !p.at(T!['}']) {
let m = p.start();
if let Some(cm) = expression(p, Some(m), 1) {
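
Both `rule_body` and `condition_body` hand expressions to a Pratt parser with a minimum binding power of 1 (`expression(p, None, 1)`), so every operator is accepted at the top level. A self-contained sketch of the technique over plain strings follows; the binding-power table mirrors YARA's not > and > or precedence, but the values and token handling are illustrative, not the crate's actual implementation:

    fn binding_power(op: &str) -> Option<(u8, u8)> {
        match op {
            "or" => Some((1, 2)),
            "and" => Some((3, 4)),
            _ => None,
        }
    }

    fn expr(tokens: &mut std::iter::Peekable<std::slice::Iter<'_, &str>>, min_bp: u8) -> String {
        // parse a prefix operator or an atom first
        let mut lhs = match tokens.next() {
            Some(&"not") => format!("(not {})", expr(tokens, 5)),
            Some(tok) => tok.to_string(),
            None => return String::new(),
        };
        // then keep folding infix operators while they bind tightly enough
        while let Some(&&op) = tokens.peek() {
            let Some((l_bp, r_bp)) = binding_power(op) else { break };
            if l_bp < min_bp {
                break;
            }
            tokens.next();
            let rhs = expr(tokens, r_bp);
            lhs = format!("({} {} {})", op, lhs, rhs);
        }
        lhs
    }

    fn main() {
        // the condition from example.yar: $b and not true or true
        let toks = ["$b", "and", "not", "true", "or", "true"];
        let mut it = toks.iter().peekable();
        // prints "(or (and $b (not true)) true)"
        println!("{}", expr(&mut it, 1));
    }
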
2 changes: 1 addition & 1 deletion src/parser/grammar/expressions/atom.rs
@@ -6,7 +6,7 @@ const EXPR_RECOVERY_SET: TokenSet = TokenSet::new(&[T![variable], T![true], T![f
// So far the only literals we support are true, false and variables
// numbers will be added later
pub(crate) const LITERAL_FIRST: TokenSet =
TokenSet::new(&[T![true], T![false], T![variable], T![string_lit], NUMBER]);
TokenSet::new(&[T![true], T![false], T![variable], T![string_lit], INT_LIT, FLOAT_LIT]);

/// Parse a literal
/// Literal right now is only: true, false, variable, string_lit or number
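
`LITERAL_FIRST` and the recovery sets above are `TokenSet`s. In rust-analyzer-style parsers a TokenSet is a const bitset over SyntaxKind discriminants, so membership tests in the hot parsing loop cost a single mask. A hedged sketch of that representation (the real type in this repository may differ in detail):

    #[derive(Copy, Clone)]
    struct TokenSet(u128);

    impl TokenSet {
        const fn new(kinds: &[u16]) -> TokenSet {
            let mut bits = 0u128;
            let mut i = 0;
            while i < kinds.len() {
                bits |= 1u128 << kinds[i];
                i += 1;
            }
            TokenSet(bits)
        }

        const fn contains(&self, kind: u16) -> bool {
            self.0 & (1u128 << kind) != 0
        }
    }

    fn main() {
        // assumed discriminant values, purely illustrative
        const TRUE_KW: u16 = 1;
        const FALSE_KW: u16 = 2;
        const INT_LIT: u16 = 3;
        const LITERAL_FIRST: TokenSet = TokenSet::new(&[TRUE_KW, FALSE_KW, INT_LIT]);
        assert!(LITERAL_FIRST.contains(INT_LIT));
        assert!(!LITERAL_FIRST.contains(10));
    }
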
