Integer literal AST node

vcfxb · Jul 27, 2024 · 447d69c · 447d69c
1 parent 62d8516
commit 447d69c
Show file tree

Hide file tree

Showing 14 changed files with 179 additions and 39 deletions.
diff --git a/.github/workflows/cargo-check.yml b/.github/workflows/cargo-check.yml
@@ -21,7 +21,7 @@ jobs:
           - source-tracking
           - reporting
           - file_memmap
-          - ast-model
+          - ast-models
           - lexer
           - parser
           - wright_library_defaults

diff --git a/.github/workflows/cargo-test.yml b/.github/workflows/cargo-test.yml
@@ -21,7 +21,7 @@ jobs:
           - source-tracking
           - reporting
           - file_memmap
-          - ast-model
+          - ast-models
           - lexer
           - parser
           - wright_library_defaults

diff --git a/wright/Cargo.toml b/wright/Cargo.toml
@@ -62,13 +62,14 @@ wright_library_defaults = ["file_memmap", "parser"]
 # Wright's parser depends on the ability to report parsing errors and construct AST models.
 parser = [
     "reporting",
-    "ast-model",
+    "ast-models",
     "lexer"
 ]
 
 # Wright's abstract syntax tree model is built on types from the "source_tracking" module.
-ast-model = [
+ast-models = [
     "source-tracking",
+    "dep:num",
     # "derive_more/from"
 ]
 
@@ -115,11 +116,14 @@ none = []
 # SIMPLE DEPENDENCIES: 
 [dependencies]
 
-# Big Integers
-# num = "0.4"
-
 # DEPENDENCIES:
 
+# Num gives us integer types of unbounded size/domain.
+# Used in AST node representations for integer literals.
+[dependencies.num]
+version = "0.4"
+optional = true
+
 # Unicode identifier functions.
 # Used by:
 # - "parser"

diff --git a/wright/build.rs b/wright/build.rs
@@ -4,17 +4,12 @@
 use rustc_version::{version_meta, Channel};
 
 fn main() {
-    // Set cfg flags depending on release channel.
-    // See: https://stackoverflow.com/a/70914430.
-    let channel = match version_meta().unwrap().channel {
-        Channel::Stable => "CHANNEL_STABLE",
-        Channel::Beta => "CHANNEL_BETA",
-        Channel::Nightly => "CHANNEL_NIGHTLY",
-        Channel::Dev => "CHANNEL_DEV",
-    };
+    // Set a cfg flag if we're on the nightly channel.
 
-    println!("cargo:rustc-cfg={}", channel);
-    println!("cargo::rustc-check-cfg=cfg({})", channel);
+    println!("cargo::rustc-check-cfg=cfg(CHANNEL_NIGHTLY)");
+    if version_meta().unwrap().channel == Channel::Nightly {
+        println!("cargo:rustc-cfg=CHANNEL_NIGHTLY");
+    }
 
     // Save build info.
     // See https://docs.rs/built/0.7.4/built/index.html.

diff --git a/wright/src/ast.rs b/wright/src/ast.rs
@@ -4,3 +4,5 @@
 
 pub mod identifier;
 pub mod path;
+// pub mod ty;
+pub mod literal;
diff --git a/wright/src/ast/literal.rs b/wright/src/ast/literal.rs
@@ -0,0 +1,16 @@
+//! AST node models representing literal values in source code.
+
+use num::BigUint;
+
+use crate::source_tracking::fragment::Fragment;
+
+/// An integer literal from source. Only unsigned integers are represented, since writing a negative number is
+/// considered to be a combination of an integer literal with a unary negation.
+#[derive(Debug)]
+pub struct IntegerLiteral {
+    /// The [Fragment] of source code containing this integer literal.
+    pub fragment: Fragment,
+
+    /// The value of the integer parsed from the matching source.
+    pub value: BigUint,
+}
diff --git a/wright/src/ast/ty.rs b/wright/src/ast/ty.rs
@@ -0,0 +1 @@
+
diff --git a/wright/src/lexer/integer_literal.rs b/wright/src/lexer/integer_literal.rs
@@ -34,6 +34,9 @@ pub fn try_consume_integer_literal(lexer: &mut Lexer) -> Option<Token> {
         }
     }
 
+    // The first character after the optional prefix is required to be a digit, not underscore.
+    bytes_consumed += chars.next_if(|c| c.is_digit(radix))?.len_utf8();
+
     // Add the rest of the integer literal.
     bytes_consumed += chars
         .take_while(|c| c.is_digit(radix) || *c == '_')
@@ -45,6 +48,8 @@ pub fn try_consume_integer_literal(lexer: &mut Lexer) -> Option<Token> {
 
 #[cfg(test)]
 mod tests {
+    use crate::lexer::integer_literal::try_consume_integer_literal;
+
     use super::{Lexer, TokenTy};
 
     #[test]
@@ -55,5 +60,15 @@ mod tests {
 
         assert_eq!(token.fragment.as_str(), "123_456_789");
         assert_eq!(token.variant, TokenTy::IntegerLiteral);
+        assert_eq!(lexer.remaining.as_str(), ".");
+    }
+
+    #[test]
+    fn cant_start_with_underscore() {
+        let mut lexer = Lexer::new_test("0x__10");
+
+        assert!(try_consume_integer_literal(&mut lexer).is_none());
+
+        assert_eq!(lexer.remaining.as_str(), "0x__10");
     }
 }
diff --git a/wright/src/lib.rs b/wright/src/lib.rs
@@ -6,6 +6,8 @@
 #![deny(missing_copy_implementations, missing_debug_implementations)]
 #![deny(rustdoc::broken_intra_doc_links)]
 #![warn(missing_docs)]
+
+
 // Compiler directive to get docs.rs (which uses the nightly version of the rust compiler) to show
 // info about featurer required for various modules and functionality.
 //
@@ -45,7 +47,7 @@ pub mod reporting;
 #[cfg(feature = "lexer")]
 pub mod lexer;
 
-#[cfg(feature = "ast-model")]
+#[cfg(feature = "ast-models")]
 pub mod ast;
 
 #[cfg(feature = "parser")]

diff --git a/wright/src/parser.rs b/wright/src/parser.rs
@@ -3,6 +3,8 @@
 //! [AST]: crate::ast
 //! [Token]: crate::lexer::token::Token
 
+use error::{ParserError, ParserErrorKind};
+
 use super::lexer::Lexer;
 use crate::{
     lexer::token::{Token, TokenTy},
@@ -13,6 +15,7 @@ use std::collections::VecDeque;
 pub mod error;
 mod identifier;
 mod path;
+mod literal;
 pub mod whitespace;
 
 /// The [Parser] struct wraps a [Lexer] and adds lookahead and functions that are useful for parsing.
@@ -31,11 +34,23 @@ impl Parser {
         }
     }
 
-    /// Get the next [Token] from this [Parser]. This may be a clone of a token that's already been peeked.
-    pub fn next_token(&mut self) -> Option<Token> {
-        self.lookahead
+    /// Get the next [Token] from this [Parser]. This may be a token that's already been peeked.
+    /// Return an error if a [Token] with [TokenTy::Unknown] is encountered.
+    pub fn next_token(&mut self) -> Result<Option<Token>, ParserError> {
+        let token = self.lookahead
             .pop_front()
-            .or_else(|| self.lexer.next_token())
+            .or_else(|| self.lexer.next_token());
+
+        // Check for unknown tokens, which should always convert to an error.
+        if let Some(Token { variant: TokenTy::Unknown, fragment }) = token {
+            Err(ParserError {
+                kind: ParserErrorKind::EncounteredUnknownToken,
+                location: fragment,
+                help: None,
+            })
+        } else {
+            Ok(token)
+        }
     }
 
     /// Advance this [Parser] by `n` [Token]s. If this [Parser] runs out of [Token]s, panic.
@@ -103,6 +118,7 @@ impl Parser {
     pub fn next_if_is(&mut self, token_ty: TokenTy) -> Option<Token> {
         // Peeking successfully first means that the lookahead vec will never be empty here.
         (self.peek()?.variant == token_ty)
+            // SAFETY: We just peeked a token to check its variant, so this unwrap is always ok.
             .then(|| unsafe { self.lookahead.pop_front().unwrap_unchecked() })
     }
 

diff --git a/wright/src/parser/error.rs b/wright/src/parser/error.rs
@@ -12,11 +12,13 @@ use std::borrow::Cow;
 #[allow(missing_docs)]
 #[derive(Clone, Copy, Debug, PartialEq, Eq)]
 pub enum ParserErrorKind {
-    UnterminatedStringLiteralEncountered,
-    UnterminatedMultilineCommentEncountered,
+    EncounteredUnknownToken,
+    EncounteredUnterminatedComment,
+    EncounteredUnterminatedString,
     ExpectedIdentifier,
     ExpectedPath,
     ExpectedWhitespace,
+    ExpectedIntegerLiteral,
 }
 
 impl ParserErrorKind {
@@ -25,22 +27,15 @@ impl ParserErrorKind {
         use ParserErrorKind::*;
 
         match self {
+            EncounteredUnknownToken => "encountered unknown token",
+            EncounteredUnterminatedComment => "encountered unterminated multiline comment while parsing",
+            EncounteredUnterminatedString => "encountered unterminated string literal while parsing",
             ExpectedIdentifier => "expected identifier",
+            ExpectedIntegerLiteral => "expected integer literal",
             ExpectedPath => "expected path or identifier",
             ExpectedWhitespace => "expected whitespace character(s)",
-            UnterminatedMultilineCommentEncountered => {
-                "encountered unterminated multiline comment while parsing"
-            }
-            UnterminatedStringLiteralEncountered => {
-                "encountered unterminated string literal while parsing"
-            }
         }
     }
-
-    /// Return this [ParserErrorKind] cast to a [u64], adding 1, preceded by the letters "WPE" standing for "Wright Parser Error".
-    pub fn error_code_string(self) -> String {
-        format!("WPE{}", self as u64 + 1)
-    }
 }
 
 /// An error that occurred while parsing.
@@ -64,7 +59,6 @@ impl ParserError {
         let description = self.kind.describe();
 
         let mut diagnostic = Diagnostic::error()
-            .with_code(self.kind.error_code_string())
             .with_message(description)
             .with_highlights([Highlight::primary(self.location, "")]);
 

diff --git a/wright/src/parser/literal.rs b/wright/src/parser/literal.rs
@@ -0,0 +1,95 @@
+//! Literal parsing implementation.
+
+use num::{BigUint, Num};
+
+use crate::{ast::literal::IntegerLiteral, lexer::token::TokenTy};
+
+use super::{error::{ParserError, ParserErrorKind}, Parser};
+
+
+impl IntegerLiteral {
+    /// Parse an integer literal from the given [Parser], erroring with `ExpectedIntegerLiteral` if the next token is not one.
+    pub fn parse(parser: &mut Parser) -> Result<Self, ParserError> {
+        // Get the token containing the integer literal from the parser, or report what was found instead.
+        let Some(int_lit_token) = parser.next_if_is(TokenTy::IntegerLiteral) else {
+            return match parser.peek_fragment() {
+                Some(frag) => Err(ParserError {
+                    kind: ParserErrorKind::ExpectedIntegerLiteral,
+                    location: frag.clone(),
+                    help: None,
+                }),
+
+                None => Err(ParserError {
+                    kind: ParserErrorKind::ExpectedIntegerLiteral,
+                    location: parser.lexer.remaining.clone(),
+                    help: Some("found end of source".into()),
+                })
+            };
+        };
+
+        // Get the string to pass to num for the rest of parsing.
+        let mut parse_str: &str = int_lit_token.fragment.as_str();
+        let mut chars = parse_str.chars();
+
+        // Unwrap: Integer literals must be at minimum 1 character, enforced by the lexer.
+        // Use a null character as a sentinel for a missing second character, since we're only using the prefix
+        // to check for a radix to pass to num.
+        let prefix: [char; 2] = [chars.next().unwrap(), chars.next().unwrap_or('\0')];
+
+        // Determine the radix and remove any prefix in the process.
+        let radix: u32 = match prefix {
+            // Hexadecimal.
+            ['0', 'x' | 'X'] => {
+                parse_str = &parse_str[2..];
+                16
+            },
+
+            // Binary.
+            ['0', 'b' | 'B'] => {
+                parse_str = &parse_str[2..];
+                2
+            }
+
+            // Octal. NOTE(review): unlike hex/binary, only lowercase `o` matches -- confirm the lexer rejects `0O`.
+            ['0', 'o'] => {
+                parse_str = &parse_str[2..];
+                8
+            }
+
+            // All other patterns are not radix-prefixes.
+            _ => 10,
+        };
+
+        // Pass the remainder of parsing off to num.
+        let value = BigUint::from_str_radix(parse_str, radix)
+            // We can use expect here for now since we have validated the format of the string
+            // on our own before passing it off.
+            .expect("num should successfully parse");
+
+        Ok(IntegerLiteral {
+            fragment: int_lit_token.fragment,
+            value,
+        })
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use num::BigUint;
+
+    use crate::{ast::literal::IntegerLiteral, lexer::Lexer, parser::Parser};
+
+    #[test]
+    fn normal() {
+        let mut parser = Parser::new(Lexer::new_test("1000"));
+
+        let int_lit = IntegerLiteral::parse(&mut parser).unwrap();
+
+        assert_eq!(int_lit.value, BigUint::new(vec![1000]));
+        assert_eq!(parser.lexer.remaining.as_str(), "");
+        assert_eq!(int_lit.fragment.as_str(), "1000");
+    }
+
+    // #[test]
+    // fn ignore_underscores
+}
diff --git a/wright/src/parser/path.rs b/wright/src/parser/path.rs
@@ -73,8 +73,8 @@ fn parse_segment(parser: &mut Parser) -> Option<Identifier> {
     for sep_token_sequence in VALID_SEGMENT_SEQUENCES {
         if parser.matches(sep_token_sequence) {
             parser.advance(sep_token_sequence.len() - 1);
-            // We can unwrap here because we just checked/matched that this parser ends with an identifier.
-            return Some(Identifier::parse(parser).unwrap());
+            // SAFETY: We just checked/matched that this parser ends with an identifier.
+            return Some(unsafe { Identifier::parse(parser).unwrap_unchecked() });
         }
     }
 

diff --git a/wright/tests/parser.rs b/wright/tests/parser.rs
@@ -23,7 +23,7 @@ fn test_parse_fail_identifier_to_diagnostic() -> anyhow::Result<()> {
     assert_eq!(
         std::str::from_utf8(buffer.as_slice())?,
         "\
-    error[WPE3]: expected identifier
+    error: expected identifier
   ┌─ <NO_NAME>:1:1
   │
 1 │ 12345