Add metadata support and better number representation
TommYDeeee committed Feb 26, 2024
1 parent ba83809 commit c203e25
Showing 16 changed files with 339 additions and 61 deletions.
6 changes: 4 additions & 2 deletions example.yar
@@ -5,10 +5,12 @@ rule test
{
//Rule block comment

meta:
author = "Author"
description = -20.3
//String comment
strings:
$a = "foo"
$b = "bar"
condition:
$b and not true or false
$b and not true or true
}
36 changes: 14 additions & 22 deletions src/lexer/mod.rs
@@ -10,37 +10,22 @@ use crate::{
};
use logos::Logos;
use std::fmt;
use std::num::ParseIntError;
use text_size::{TextRange, TextSize};

#[derive(Default, Debug, Clone, PartialEq)]
pub(crate) enum LexingError {
InvalidInteger(String),
#[default]
InvalidCharacter,
}

// Implement Display trait for LexingError
impl fmt::Display for LexingError {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
match self {
LexingError::InvalidInteger(msg) => write!(f, "Invalid integer: {}", msg),
LexingError::InvalidCharacter => write!(f, "Invalid character"),
}
}
}

/// Error type returned by calling `lex.slice().parse()` to u8.
impl From<ParseIntError> for LexingError {
fn from(err: ParseIntError) -> Self {
use std::num::IntErrorKind::*;
match err.kind() {
PosOverflow | NegOverflow => LexingError::InvalidInteger("overflow error".to_owned()),
_ => LexingError::InvalidInteger("other error".to_owned()),
}
}
}

#[derive(Logos, Debug, PartialEq)]
#[logos(error = LexingError)]
pub(crate) enum LogosToken {
@@ -58,6 +43,8 @@ pub(crate) enum LogosToken {
// Keywords
#[token("rule")]
Rule,
#[token("meta")]
Meta,
#[token("strings")]
Strings,
#[token("condition")]
@@ -72,7 +59,7 @@ pub(crate) enum LogosToken {
#[regex("[a-zA-Z][a-zA-Z0-9_]*", |lex| lex.slice().to_string())]
Identifier(String),
// Variables
#[regex(r"\$[a-zA-Z][a-zA-Z0-9_]*", |lex| lex.slice().to_string())]
#[regex(r"\$_?[a-zA-Z][a-zA-Z0-9_]*", |lex| lex.slice().to_string())]
Variable(String),
// Strings
#[regex(r#""[^"]*""#, |lex| lex.slice().to_string())]
@@ -92,9 +79,12 @@ pub(crate) enum LogosToken {
RParen,
#[token(",")]
Comma,
// Numbers
#[regex(r"[0-9]+", |lex| lex.slice().parse())]
Number(i64),
// Integer
#[regex(r"-?0x[a-fA-F0-9]+|-?0o[0-7]+|-?[0-9]+(KB|MB)?", |lex| lex.slice().to_string())]
Integer(String),
// Float
#[regex(r"-?[0-9]+\.[0-9]+", |lex| lex.slice().to_string())]
Float(String),
// Booleans
#[token("true")]
True,
@@ -103,7 +93,7 @@ pub(crate) enum LogosToken {

// Whitespace - I want to preserve whitespace tokens to implement full fidelity
// and error resilience
#[regex(r"[ \t\n\f]+")]
#[regex(r"[ \t\n\r]+")]
Whitespace,

// Comments
@@ -159,6 +149,7 @@ pub fn tokenize(text: &str) -> (Vec<Token>, Vec<SyntaxError>) {
fn logos_tokenkind_to_syntaxkind(token: LogosToken) -> SyntaxKind {
match token {
LogosToken::Rule => SyntaxKind::RULE_KW,
LogosToken::Meta => SyntaxKind::META_KW,
LogosToken::Strings => SyntaxKind::STRINGS_KW,
LogosToken::Condition => SyntaxKind::CONDITION_KW,
LogosToken::And => SyntaxKind::AND_KW,
@@ -174,7 +165,8 @@ fn logos_tokenkind_to_syntaxkind(token: LogosToken) -> SyntaxKind {
LogosToken::LParen => T!['('],
LogosToken::RParen => T![')'],
LogosToken::Comma => T![,],
LogosToken::Number(_) => SyntaxKind::NUMBER,
LogosToken::Integer(_) => SyntaxKind::INT_LIT,
LogosToken::Float(_) => SyntaxKind::FLOAT_LIT,
LogosToken::True => SyntaxKind::TRUE_KW,
LogosToken::False => SyntaxKind::FALSE_KW,
LogosToken::Whitespace => SyntaxKind::WHITESPACE,
@@ -238,7 +230,7 @@ mod tests {
rule foo {
condition:
$a = "test"
$b = 1234567890123456789012345678901234567890
$b = §
}
"#;
let (tokens, errors) = tokenize(input);
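
The Integer/Float split above keeps lexing infallible: the lexer now captures the raw slice as a String and defers numeric interpretation, sidestepping the i64 overflow that the `From<ParseIntError>` mapping above had to classify. A minimal self-contained sketch of the same idea, assuming a logos 0.13-style API where the lexer iterator yields `Result`:

    use logos::Logos;

    #[derive(Logos, Debug, PartialEq)]
    enum Tok {
        // Same Integer pattern as the commit: hex, octal, or decimal,
        // with an optional KB/MB multiplier suffix; the raw text is kept.
        #[regex(r"-?0x[a-fA-F0-9]+|-?0o[0-7]+|-?[0-9]+(KB|MB)?", |lex| lex.slice().to_string())]
        Integer(String),
        // Floats are matched separately; logos prefers the longest match,
        // so "-20.3" lexes as one Float rather than an Integer "-20".
        #[regex(r"-?[0-9]+\.[0-9]+", |lex| lex.slice().to_string())]
        Float(String),
        #[regex(r"[ \t\n\r]+", logos::skip)]
        Whitespace,
    }

    fn main() {
        let mut lex = Tok::lexer("0xFF 10KB -20.3");
        while let Some(tok) = lex.next() {
            // Ok(Tok::..) for matches, Err(..) for invalid characters like §
            println!("{:?} {:?}", tok, lex.slice());
        }
    }
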
47 changes: 40 additions & 7 deletions src/lib.rs
@@ -4,14 +4,13 @@
use crate::{
parser::SyntaxKind,
syntax::{
syntax_error::SyntaxError,
syntax_node::{SyntaxNode, SyntaxToken},
text_token_source::TextTokenSource,
syntax_error::SyntaxError, syntax_node::SyntaxNode, text_token_source::TextTokenSource,
text_tree_sink::TextTreeSink,
},
};

pub use crate::syntax::ast::*;
pub use crate::syntax::syntax_node::SyntaxToken;
pub use crate::syntax::SourceFile;

// used only for tests
@@ -21,7 +20,6 @@ use rowan_test::{NodeOrToken, WalkEvent};
use std::fs;
#[cfg(test)]
use std::io::Write;
use std::ops::Range;
#[cfg(test)]
use text_size::TextRange;

@@ -37,6 +35,9 @@ fn api_walktrough() {
// without errors
let source_code = "
rule test_rule {
meta:
author = \"author\"
number = -123
// This is a comment
strings:
$a = \"test\"
Expand Down Expand Up @@ -80,6 +81,34 @@ fn api_walktrough() {
assert_eq!(comment.text(), "This is a comment");
}

// We can also obtain the meta part of the rule
// it consists of the `meta` keyword and multiple `META_STMT` nodes
let meta = block.meta().unwrap();

// We can obtain the meta token
assert!(meta.meta_token().is_some());
assert!(meta.meta_token().unwrap().kind() == SyntaxKind::META_KW);

// and also the `COLON` token
assert!(meta.colon_token().is_some());

// Each meta statement consists of an identifier token,
// an assign token and a literal token
for meta_stmt in meta.meta_stmts() {
// each meta statement contains an identifier token
// an assign token and a literal token
let id = meta_stmt.identifier_token().unwrap();

// The identifier text is never empty
assert!(!id.text().is_empty());

// and also the assign token
assert!(meta_stmt.assign_token().is_some());

// assert that the literal token is either a string or an int
assert!(meta_stmt.string_lit_token().is_some() || meta_stmt.int_lit_token().is_some());
}

// This block expression consists (for now) of two parts
// optional strings and required condition part
// Firstly we can obtain the strings part
@@ -99,7 +128,7 @@ fn api_walktrough() {
for variable_stmt in strings.variable_stmts() {
// each variable statement contains a variable token
// an assign token and a literal token
// now I will showm only the pattern token as an example
// now I will show only the pattern token as an example
let pattern = variable_stmt.pattern().unwrap();

// For now pattern can be only a string literal
@@ -184,7 +213,7 @@ fn api_walktrough() {
// Some helpers:
// for example, get the token at a specific offset. This can be useful
// to obtain the token at a given error offset, to get its text, length etc.
let tkn = expression_stmt_syntax.token_at_offset(151.into());
let tkn = expression_stmt_syntax.token_at_offset(232.into());

// We can have an offset that is between two tokens, so we use the `right_biased` method
// to obtain the token on the right side of the offset if it is between two tokens
@@ -284,7 +313,11 @@ fn api_walktrough() {

// But luckily we can obtain the token at the offset
// and from it we can get both its text and length
let tkn = ast.syntax().token_at_offset(173.into()).right_biased().unwrap();
let tkn = ast
.syntax()
.token_at_offset(parse_struct.errors()[1].range().start())
.right_biased()
.unwrap();

assert_eq!(tkn.text(), "nor");
// The Error node also contains the appropriate nested SyntaxKind
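
The walkthrough relies on typed accessors such as `block.meta()` and `meta.meta_stmts()`, whose definitions are not part of this diff. In rowan-based parsers they are conventionally thin typed wrappers over the untyped syntax tree; a hypothetical sketch of that pattern, with stub types standing in for the crate's real ones:

    // Stand-in for rowan's SyntaxNode, just enough to show the shape.
    #[derive(Clone)]
    struct SyntaxNode {
        kind: u16,
        children: Vec<SyntaxNode>,
    }

    const META_STMT: u16 = 42; // assumed kind value, purely illustrative

    struct MetaStmt { syntax: SyntaxNode }

    impl MetaStmt {
        // `cast` accepts a node only when its kind matches META_STMT.
        fn cast(node: SyntaxNode) -> Option<MetaStmt> {
            (node.kind == META_STMT).then(|| MetaStmt { syntax: node })
        }
    }

    struct Meta { syntax: SyntaxNode }

    impl Meta {
        // meta_stmts() is then just "all children that cast to MetaStmt".
        fn meta_stmts(&self) -> impl Iterator<Item = MetaStmt> + '_ {
            self.syntax.children.iter().cloned().filter_map(MetaStmt::cast)
        }
    }

    fn main() {
        let meta = Meta {
            syntax: SyntaxNode {
                kind: 0,
                children: vec![SyntaxNode { kind: META_STMT, children: vec![] }],
            },
        };
        assert_eq!(meta.meta_stmts().count(), 1);
    }
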
70 changes: 57 additions & 13 deletions src/parser/grammar/expressions.rs
@@ -4,7 +4,9 @@ use super::*;

/// Recovery set for `strings` block. This also should be adjusted and tweaked to
/// better represent the recovery set later on
const VARIABLE_RECOVERY_SET: TokenSet = TokenSet::new(&[T![strings]]);
const STRINGS_RECOVERY_SET: TokenSet = TokenSet::new(&[T![strings]]);

const META_RECOVERY_SET: TokenSet = TokenSet::new(&[T![identifier]]);

/// Parse a rule body
/// A rule body consists of `{`, rule_body and `}`
@@ -29,9 +31,19 @@ pub(crate) fn block_expr(p: &mut Parser) {
pub(super) fn rule_body(p: &mut Parser) {
let mut has_strings = false;
let mut has_condition = false;
let mut has_meta = false;
while !p.at(EOF) && !p.at(T!['}']) {
match p.current() {
// add metadata support later
T![meta] => {
if has_meta {
p.error("only one meta block is allowed");
}
if has_condition || has_strings {
p.error("meta block must come before strings and condition blocks");
}
meta(p);
has_meta = true;
}
T![strings] => {
if has_strings {
p.error("only one strings block is allowed");
@@ -59,7 +71,7 @@
p.eat(T![:]);
if p.current() == T![variable] && p.nth(1) == T![=] {
strings_body(p)
} else {
} else if let Some(_) = expression(p, None, 1) {
condition_body(p);
}
}
@@ -68,6 +80,17 @@
}
}

/// Parse a `meta` block
/// It consists of the `meta` keyword, `:` token and meta body
fn meta(p: &mut Parser) {
assert!(p.at(T![meta]));
let m = p.start();
p.bump(T![meta]);
p.expect(T![:]);
meta_body(p);
m.complete(p, META);
}

/// Parse a `strings` block
/// It consists of the `strings` keyword, `:` token and strings body
fn strings(p: &mut Parser) {
@@ -90,32 +113,54 @@ fn condition(p: &mut Parser) {
m.complete(p, CONDITION);
}

/// Parse a `meta` body
/// It consists of a list of `identifier`, `=` token and a literal value
pub(super) fn meta_body(p: &mut Parser) {
while !p.at(EOF) && !p.at(T![strings]) && !p.at(T![condition]) && !p.at(T!['}']) {
let m = p.start();
if p.at(T![identifier]) {
p.bump(T![identifier]);
} else {
p.err_recover("expected an identifier", META_RECOVERY_SET);
}
p.expect(T![=]);
match p.current() {
STRING_LIT | TRUE_KW | FALSE_KW | INT_LIT | FLOAT_LIT => {
p.bump(p.current());
}
_ => {
p.error("expected a valid metadata value");
return;
}
}
m.complete(p, META_STMT);
}
}

/// Parse a `strings` body
/// It consists of a list of `variable`, `=` token and a string
pub(super) fn strings_body(p: &mut Parser) {
// add support for meta also
while !p.at(EOF) && !p.at(T![condition]) && !p.at(T!['}']) {
let m = p.start();
if p.at(T![variable]) {
p.bump(T![variable]);
} else {
p.err_recover("expected a variable", VARIABLE_RECOVERY_SET);
p.err_recover("expected a variable", STRINGS_RECOVERY_SET);
}
p.expect(T![=]);
// so far only strings are supported, later add match for hex strings and regex
pattern(p);
match p.current() {
STRING_LIT => pattern(p),
_ => p.err_and_bump("expected a string"),
}
m.complete(p, VARIABLE_STMT);
}
}

/// Parse a string. For now string can be only basic plaintext string
// add support for hex and regex strings later on
/// Parse a plaintext string pattern
fn pattern(p: &mut Parser) {
let m = p.start();
match p.current() {
STRING_LIT => p.bump(STRING_LIT),
_ => p.err_and_bump("expected a string"),
}
p.bump(STRING_LIT);
// add string modifiers
m.complete(p, PATTERN);
}
@@ -124,7 +169,6 @@ fn pattern(p: &mut Parser) {
/// It consists of a list of expressions
/// Pratt parser is used to parse expressions
pub(super) fn condition_body(p: &mut Parser) {
// add support for meta also
while !p.at(EOF) && !p.at(T!['}']) {
let m = p.start();
if let Some(cm) = expression(p, Some(m), 1) {
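
Both `rule_body` and `condition_body` hand expressions to a Pratt parser with a minimum binding power of 1 (`expression(p, None, 1)`), so every operator is accepted at the top level. A self-contained sketch of the technique over plain strings follows; the binding-power table mirrors YARA's not > and > or precedence, but the values and token handling are illustrative, not the crate's actual implementation:

    fn binding_power(op: &str) -> Option<(u8, u8)> {
        match op {
            "or" => Some((1, 2)),
            "and" => Some((3, 4)),
            _ => None,
        }
    }

    fn expr(tokens: &mut std::iter::Peekable<std::slice::Iter<'_, &str>>, min_bp: u8) -> String {
        // parse a prefix operator or an atom first
        let mut lhs = match tokens.next() {
            Some(&"not") => format!("(not {})", expr(tokens, 5)),
            Some(tok) => tok.to_string(),
            None => return String::new(),
        };
        // then keep folding infix operators while they bind tightly enough
        while let Some(&&op) = tokens.peek() {
            let Some((l_bp, r_bp)) = binding_power(op) else { break };
            if l_bp < min_bp {
                break;
            }
            tokens.next();
            let rhs = expr(tokens, r_bp);
            lhs = format!("({} {} {})", op, lhs, rhs);
        }
        lhs
    }

    fn main() {
        // the condition from example.yar: $b and not true or true
        let toks = ["$b", "and", "not", "true", "or", "true"];
        let mut it = toks.iter().peekable();
        // prints "(or (and $b (not true)) true)"
        println!("{}", expr(&mut it, 1));
    }
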
2 changes: 1 addition & 1 deletion src/parser/grammar/expressions/atom.rs
@@ -6,7 +6,7 @@ const EXPR_RECOVERY_SET: TokenSet = TokenSet::new(&[T![variable], T![true], T![f
// So far the only literals we support are true, false and variables
// numbers will be added later
pub(crate) const LITERAL_FIRST: TokenSet =
TokenSet::new(&[T![true], T![false], T![variable], T![string_lit], NUMBER]);
TokenSet::new(&[T![true], T![false], T![variable], T![string_lit], INT_LIT, FLOAT_LIT]);

/// Parse a literal
/// Literal right now is only: true, false, variable, string_lit or number
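
`LITERAL_FIRST` and the recovery sets above are `TokenSet`s. In rust-analyzer-style parsers a TokenSet is a const bitset over SyntaxKind discriminants, so membership tests in the hot parsing loop cost a single mask. A hedged sketch of that representation (the real type in this repository may differ in detail):

    #[derive(Copy, Clone)]
    struct TokenSet(u128);

    impl TokenSet {
        const fn new(kinds: &[u16]) -> TokenSet {
            let mut bits = 0u128;
            let mut i = 0;
            while i < kinds.len() {
                bits |= 1u128 << kinds[i];
                i += 1;
            }
            TokenSet(bits)
        }

        const fn contains(&self, kind: u16) -> bool {
            self.0 & (1u128 << kind) != 0
        }
    }

    fn main() {
        // assumed discriminant values, purely illustrative
        const TRUE_KW: u16 = 1;
        const FALSE_KW: u16 = 2;
        const INT_LIT: u16 = 3;
        const LITERAL_FIRST: TokenSet = TokenSet::new(&[TRUE_KW, FALSE_KW, INT_LIT]);
        assert!(LITERAL_FIRST.contains(INT_LIT));
        assert!(!LITERAL_FIRST.contains(10));
    }
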
