From c6914299fa1ee05fc6e32103b4091b6732985c88 Mon Sep 17 00:00:00 2001 From: overlookmotel Date: Wed, 1 Jan 2025 02:05:59 +0000 Subject: [PATCH] perf(lexer): optimize `Source` --- crates/oxc_parser/src/lexer/comment.rs | 2 +- crates/oxc_parser/src/lexer/identifier.rs | 2 +- crates/oxc_parser/src/lexer/search.rs | 12 +- crates/oxc_parser/src/lexer/source.rs | 202 ++++++++++++++++------ crates/oxc_parser/src/lexer/template.rs | 10 +- 5 files changed, 164 insertions(+), 64 deletions(-) diff --git a/crates/oxc_parser/src/lexer/comment.rs b/crates/oxc_parser/src/lexer/comment.rs index ddd2645a756ef..0b9e061e8ea98 100644 --- a/crates/oxc_parser/src/lexer/comment.rs +++ b/crates/oxc_parser/src/lexer/comment.rs @@ -95,7 +95,7 @@ impl<'a> Lexer<'a> { if next_byte == b'*' { // SAFETY: Next byte is `*` (ASCII) so after it is UTF-8 char boundary let after_star = unsafe { pos.add(1) }; - if after_star.addr() < self.source.end_addr() { + if after_star.is_not_end_of(&self.source) { // If next byte isn't `/`, continue // SAFETY: Have checked there's at least 1 further byte to read if unsafe { after_star.read() } == b'/' { diff --git a/crates/oxc_parser/src/lexer/identifier.rs b/crates/oxc_parser/src/lexer/identifier.rs index 21161230c9698..de005aa1b96c2 100644 --- a/crates/oxc_parser/src/lexer/identifier.rs +++ b/crates/oxc_parser/src/lexer/identifier.rs @@ -217,7 +217,7 @@ impl<'a> Lexer<'a> { pub fn private_identifier(&mut self) -> Kind { // Handle EOF directly after `#` let start_pos = self.source.position(); - if start_pos.addr() == self.source.end_addr() { + if start_pos.is_end_of(&self.source) { return cold_branch(|| { let start = self.offset(); self.error(diagnostics::unexpected_end(Span::new(start, start))); diff --git a/crates/oxc_parser/src/lexer/search.rs b/crates/oxc_parser/src/lexer/search.rs index d94d424c34d80..952f1f2ed1aae 100644 --- a/crates/oxc_parser/src/lexer/search.rs +++ b/crates/oxc_parser/src/lexer/search.rs @@ -434,7 +434,7 @@ macro_rules! byte_search { let mut $pos = $start; #[allow(unused_unsafe, clippy::unnecessary_safety_comment)] // Silence warnings if macro called in unsafe code 'outer: loop { - let $byte = if $pos.addr() <= $lexer.source.end_for_batch_search_addr() { + let $byte = if $lexer.source.has_search_batch_remaining_from($pos) { // Search a batch of `SEARCH_BATCH_SIZE` bytes. // // `'inner: loop {}` is not a real loop - it always exits on first turn. @@ -444,7 +444,7 @@ macro_rules! byte_search { // compiler to unroll it. // // SAFETY: - // `$pos.addr() <= lexer.source.end_for_batch_search_addr()` check above ensures + // `$lexer.source.has_search_batch_remaining_from($pos)` check above ensures // there are at least `SEARCH_BATCH_SIZE` bytes remaining in `lexer.source`. // So calls to `$pos.read()` and `$pos.add(1)` in this loop cannot go out of bounds. 'inner: loop { @@ -466,9 +466,13 @@ macro_rules! byte_search { } else { // Not enough bytes remaining for a batch. Process byte-by-byte. // Same as above, `'inner: loop {}` is not a real loop here - always exits on first turn. - let end_addr = $lexer.source.end_addr(); + let end_pos = $lexer.source.end_position(); 'inner: loop { - while $pos.addr() < end_addr { + // SAFETY: `end_pos` is end of source, so `$pos` must be on or before it. + // Note: `$pos < end_pos` would be semantically equivalent, but use `offset_from` + // because it takes advantage of `Source`'s invariant that `$pos` can never be + // after `end_pos`, which may allow compiler to produce better codegen. + while unsafe { end_pos.offset_from($pos) } > 0 { // SAFETY: `pos` is not at end of source, so safe to read a byte let byte = unsafe { $pos.read() }; if $table.matches(byte) { diff --git a/crates/oxc_parser/src/lexer/source.rs b/crates/oxc_parser/src/lexer/source.rs index 615e77b478754..084b91d75029d 100644 --- a/crates/oxc_parser/src/lexer/source.rs +++ b/crates/oxc_parser/src/lexer/source.rs @@ -1,6 +1,8 @@ #![expect(clippy::unnecessary_safety_comment)] -use std::{marker::PhantomData, slice, str}; +use std::{cmp::Ordering, marker::PhantomData, slice, str}; + +use assert_unchecked::assert_unchecked; use crate::{UniquePromise, MAX_LEN}; @@ -109,12 +111,7 @@ impl<'a> Source<'a> { // SAFETY: // `start` and `end` are created from a `&str` in `Source::new`, so `start` cannot be after `end`. // `start` and `end` are by definition on UTF-8 char boundaries. - unsafe { - self.str_between_positions_unchecked( - SourcePosition::new(self.start), - SourcePosition::new(self.end), - ) - } + unsafe { self.str_between_positions_unchecked(self.start_position(), self.end_position()) } } /// Get remaining source text as `&str`. @@ -122,32 +119,20 @@ impl<'a> Source<'a> { pub(super) fn remaining(&self) -> &'a str { // SAFETY: // Invariant of `Source` is that `ptr` is always <= `end`, and is on a UTF-8 char boundary. - // `end` is pointer to end of original `&str`, so be definition a UTF-8 char boundary. - unsafe { - self.str_between_positions_unchecked( - SourcePosition::new(self.ptr), - SourcePosition::new(self.end), - ) - } + // `end` is pointer to end of original `&str`, so by definition is a UTF-8 char boundary. + unsafe { self.str_between_positions_unchecked(self.position(), self.end_position()) } } /// Return whether at end of source. #[inline] pub(super) fn is_eof(&self) -> bool { - self.ptr == self.end + self.position().is_end_of(self) } - /// Get end address. + /// Returns `true` if at least `Lexer::search::SEARCH_BATCH_SIZE` bytes remaining in source. #[inline] - pub(super) fn end_addr(&self) -> usize { - self.end as usize - } - - /// Get last memory address at which a batch of `Lexer::search::SEARCH_BATCH_SIZE` bytes - /// can be read without going out of bounds. - #[inline] - pub(super) fn end_for_batch_search_addr(&self) -> usize { - self.end_for_batch_search_addr + pub(super) fn has_search_batch_remaining_from(&self, pos: SourcePosition<'a>) -> bool { + pos.ptr as usize <= self.end_for_batch_search_addr } /// Get current position. @@ -164,6 +149,24 @@ impl<'a> Source<'a> { unsafe { SourcePosition::new(self.ptr) } } + /// Get start of this `Source` as a `SourcePosition`. + /// + /// `SourcePosition` lives as long as the source text `&str` that `Source` was created from. + #[inline] + pub(super) fn start_position(&self) -> SourcePosition<'a> { + // SAFETY: Creating a `SourcePosition` from start of `Source` is always valid + unsafe { SourcePosition::new(self.start) } + } + + /// Get end of this `Source` as a `SourcePosition`. + /// + /// `SourcePosition` lives as long as the source text `&str` that `Source` was created from. + #[inline] + pub(super) fn end_position(&self) -> SourcePosition<'a> { + // SAFETY: Creating a `SourcePosition` from end of `Source` is always valid + unsafe { SourcePosition::new(self.end) } + } + /// Move current position. #[inline] pub(super) fn set_position(&mut self, pos: SourcePosition<'a>) { @@ -216,7 +219,7 @@ impl<'a> Source<'a> { /// Get string slice from a `SourcePosition` up to the current position of `Source`. pub(super) fn str_from_pos_to_current(&self, pos: SourcePosition<'a>) -> &'a str { - assert!(pos.ptr <= self.ptr); + assert!(pos <= self.position()); // SAFETY: The above assertion satisfies `str_from_pos_to_current_unchecked`'s requirements unsafe { self.str_from_pos_to_current_unchecked(pos) } } @@ -235,7 +238,7 @@ impl<'a> Source<'a> { ) -> &'a str { // SAFETY: Caller guarantees `pos` is not after current position of `Source`. // `self.ptr` is always a valid `SourcePosition` due to invariants of `Source`. - self.str_between_positions_unchecked(pos, SourcePosition::new(self.ptr)) + self.str_between_positions_unchecked(pos, self.position()) } /// Get string slice from current position of `Source` up to a `SourcePosition`, without checks. @@ -250,18 +253,16 @@ impl<'a> Source<'a> { &self, pos: SourcePosition<'a>, ) -> &'a str { - // SAFETY: Caller guarantees `pos` is not before current position of `Source`. - // `self.ptr` is always a valid `SourcePosition` due to invariants of `Source`. - self.str_between_positions_unchecked(SourcePosition::new(self.ptr), pos) + // SAFETY: Caller guarantees `pos` is not before current position of `Source` + self.str_between_positions_unchecked(self.position(), pos) } /// Get string slice from a `SourcePosition` up to the end of `Source`. #[inline] pub(super) fn str_from_pos_to_end(&self, pos: SourcePosition<'a>) -> &'a str { // SAFETY: Invariants of `SourcePosition` is that it cannot be after end of `Source`, - // and always on a UTF-8 character boundary. - // `self.end` is always a valid `SourcePosition` due to invariants of `Source`. - unsafe { self.str_between_positions_unchecked(pos, SourcePosition::new(self.end)) } + // and always on a UTF-8 character boundary + unsafe { self.str_between_positions_unchecked(pos, self.end_position()) } } /// Get string slice of source between 2 `SourcePosition`s, without checks. @@ -295,8 +296,8 @@ impl<'a> Source<'a> { // Invariants of `Source` and `SourcePosition` types guarantee that both are positioned // on UTF-8 character boundaries. So slicing source text between these 2 points will always // yield a valid UTF-8 string. - let len = end.addr() - start.addr(); - let slice = slice::from_raw_parts(start.ptr, len); + let len = end.offset_from(start); + let slice = slice::from_raw_parts(start.ptr, len as usize); std::str::from_utf8_unchecked(slice) } @@ -307,11 +308,16 @@ impl<'a> Source<'a> { } /// Get offset of `pos`. - #[expect(clippy::cast_possible_truncation)] #[inline] pub(super) fn offset_of(&self, pos: SourcePosition<'a>) -> u32 { - // Cannot overflow `u32` because of `MAX_LEN` check in `Source::new` - (pos.addr() - self.start as usize) as u32 + // SAFETY: `SourcePosition`s can only be created from a `Source`. + // `Source::new` takes a `UniquePromise`, which guarantees that it's the only `Source` + // in existence on this thread. `Source` is not `Sync` or `Send`, so no possibility another + // `Source` originated on another thread can "jump" onto this one. + // This is sufficient to guarantee that any `SourcePosition` that parser/lexer holds must be + // from this `Source`. `SourcePosition`s must always be within bounds of the `Source` they were + // created from, and therefore `pos` is guaranteed to be equal to or after `self.start`. + unsafe { pos.offset_from(self.start_position()) } } /// Move current position back by `n` bytes. @@ -331,7 +337,7 @@ impl<'a> Source<'a> { assert!(n > 0, "Cannot call `Source::back` with 0"); // Ensure not attempting to go back to before start of source - let offset = self.ptr as usize - self.start as usize; + let offset = self.offset() as usize; assert!(n <= offset, "Cannot go back {n} bytes - only {offset} bytes consumed"); // SAFETY: We have checked that `n` is less than distance between `start` and `ptr`, @@ -521,9 +527,9 @@ impl<'a> Source<'a> { /// Peek next two bytes of source without consuming them. #[inline] pub(super) fn peek_2_bytes(&self) -> Option<[u8; 2]> { - // `end` is always >= `ptr` so `end - ptr` cannot wrap around. - // No need to use checked/saturating subtraction here. - if (self.end as usize) - (self.ptr as usize) >= 2 { + // SAFETY: `end` is always >= `ptr` + let remaining_len = unsafe { self.end_position().offset_from(self.position()) }; + if remaining_len >= 2 { // SAFETY: The check above ensures that there are at least 2 bytes to // read from `self.ptr` without reading past `self.end` let bytes = unsafe { self.position().read2() }; @@ -560,7 +566,7 @@ pub struct SourcePosition<'a> { _marker: PhantomData<&'a u8>, } -impl SourcePosition<'_> { +impl<'a> SourcePosition<'a> { /// Create a new `SourcePosition` from a pointer. /// /// # SAFETY @@ -574,12 +580,6 @@ impl SourcePosition<'_> { Self { ptr, _marker: PhantomData } } - /// Get memory address of `SourcePosition` as a `usize`. - #[inline] - pub(super) fn addr(self) -> usize { - self.ptr as usize - } - /// Create new `SourcePosition` which is `n` bytes after this one. /// The provenance of the pointer `SourcePosition` contains is maintained. /// @@ -604,6 +604,80 @@ impl SourcePosition<'_> { Self::new(self.ptr.sub(n)) } + /// Get offset of this `SourcePosition` relative to another `SourcePosition`. + /// + /// # SAFETY + /// `self` must be after or equal to `other`. + /// i.e. `end.offset_from(start)`. + #[inline] + pub(super) unsafe fn offset_from(self, other: Self) -> u32 { + // SAFETY: `SourcePosition`s can only be created from a `Source`. + // `Source::new` takes a `UniquePromise`, which guarantees that it's the only `Source` + // in existence on this thread. `Source` is not `Sync` or `Send`, so no possibility another + // `Source` originated on another thread can "jump" onto this one. + // This is sufficient to guarantee that any `SourcePosition` that parser/lexer holds must be + // from this `Source`, therefore `self` and `other` must both be within the same + // allocation, and derived from the same original pointer. + // Source text length is limited to `MAX_LEN`, which cannot be more than either `u32::MAX` + // or `isize::MAX`, so distance between `self` and `other` cannot exceed either. + let offset = self.distance_from(other); + debug_assert!(u32::try_from(offset).is_ok()); + u32::try_from(offset).unwrap_unchecked() + } + + /// Get distance of this `SourcePosition` relative to another `SourcePosition`. + /// + /// `self` can be before or after `other`. + /// * If `self` is after `other`, return value is positive. + /// * If `self` is before `other`, return value is negative. + #[inline] + fn distance_from(self, other: Self) -> isize { + debug_assert!(if self.ptr < other.ptr { + other.ptr as usize - self.ptr as usize <= MAX_LEN + } else { + self.ptr as usize - other.ptr as usize <= MAX_LEN + }); + + // SAFETY: `SourcePosition`s can only be created from a `Source`. + // `Source::new` takes a `UniquePromise`, which guarantees that it's the only `Source` + // in existence on this thread. `Source` is not `Sync` or `Send`, so no possibility another + // `Source` originated on another thread can "jump" onto this one. + // This is sufficient to guarantee that any `SourcePosition` that parser/lexer holds must be + // from this `Source`, therefore `self` and `other` must both be within the same + // allocation, and derived from the same original pointer. + // Source text length is limited to `MAX_LEN`, which cannot be more than either `u32::MAX` + // or `isize::MAX`, so distance between `self` and `other` cannot exceed either. + // The `isize::MAX` constraint is required to satisfy safety conditions of `ptr::offset_from`. + unsafe { + let offset = self.ptr.offset_from(other.ptr); + + // These assertions aim to give the compiler as much information as possible about + // invariants constraining the possible values of `offset` + assert_unchecked!(offset != isize::MIN); + let offset_abs = offset.checked_abs().unwrap_unchecked(); + assert_unchecked!(usize::try_from(offset_abs).unwrap_unchecked() <= MAX_LEN); + + offset + } + } + + /// Returns `true` if this `SourcePosition` is at end of source. + #[inline] + pub(super) fn is_end_of(self, source: &Source<'a>) -> bool { + // SAFETY: `SourcePosition`s can only be created from a `Source`. + // `Source::new` takes a `UniquePromise`, which guarantees that it's the only `Source` + // in existence on this thread. `Source` is not `Sync` or `Send`, so no possibility another + // `Source` originated on another thread can "jump" onto this one. + // Therefore `self` must be from provided `Source`, and so cannot be after source's end. + unsafe { source.end_position().offset_from(self) == 0 } + } + + /// Returns `true` if this `SourcePosition` is not at end of source. + #[inline] + pub(super) fn is_not_end_of(self, source: &Source<'a>) -> bool { + !self.is_end_of(source) + } + /// Read byte from this `SourcePosition`. /// /// # SAFETY @@ -640,19 +714,41 @@ impl SourcePosition<'_> { #[inline] pub(super) unsafe fn read2(self) -> [u8; 2] { // SAFETY: - // Caller guarantees `self` is not at no later than 2 bytes before end of source text. + // Caller guarantees `self` is at no later than 2 bytes before end of source text. // `Source` is created from a valid `&str`, so points to allocated, initialized memory. // `Source` conceptually holds the source text `&str`, which guarantees no mutable references // to the same memory can exist, as that would violate Rust's aliasing rules. - // Pointer is "dereferenceable" by definition as a `u8` is 1 byte and cannot span multiple objects. + // Pointer is "dereferenceable" since caller guarantees it's within bounds of source text `&str`. // Alignment is not relevant as `u8` is aligned on 1 (i.e. no alignment requirements). debug_assert!(!self.ptr.is_null()); - #[expect(clippy::ptr_as_ptr)] - let p = self.ptr as *const [u8; 2]; + let p = self.ptr.cast::<[u8; 2]>(); *p.as_ref().unwrap_unchecked() } } +impl PartialEq for SourcePosition<'_> { + #[inline] + fn eq(&self, other: &Self) -> bool { + self.distance_from(*other) == 0 + } +} + +impl Eq for SourcePosition<'_> {} + +impl PartialOrd for SourcePosition<'_> { + #[inline] + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + +impl Ord for SourcePosition<'_> { + #[inline] + fn cmp(&self, other: &Self) -> Ordering { + self.distance_from(*other).cmp(&0) + } +} + /// Return if byte is a UTF-8 continuation byte. #[inline] const fn is_utf8_cont_byte(byte: u8) -> bool { diff --git a/crates/oxc_parser/src/lexer/template.rs b/crates/oxc_parser/src/lexer/template.rs index b850233e176d9..bc8dca15d2310 100644 --- a/crates/oxc_parser/src/lexer/template.rs +++ b/crates/oxc_parser/src/lexer/template.rs @@ -34,7 +34,7 @@ impl<'a> Lexer<'a> { b'$' => { // SAFETY: Next byte is `$` which is ASCII, so after it is a UTF-8 char boundary let after_dollar = unsafe { pos.add(1) }; - if after_dollar.addr() < self.source.end_addr() { + if after_dollar.is_not_end_of(&self.source) { // If `${`, exit. // SAFETY: Have checked there's at least 1 further byte to read. if unsafe { after_dollar.read() } == b'{' { @@ -103,7 +103,7 @@ impl<'a> Lexer<'a> { pos = pos.add(1); // If at EOF, exit. This illegal in valid JS, so cold branch. - if pos.addr() == self.source.end_addr() { + if pos.is_end_of(&self.source) { return cold_branch(|| { self.source.advance_to_end(); self.error(diagnostics::unterminated_string(self.unterminated_range())); @@ -201,7 +201,7 @@ impl<'a> Lexer<'a> { if next_byte == b'$' { // SAFETY: Next byte is `$` which is ASCII, so after it is a UTF-8 char boundary let after_dollar = pos.add(1); - if after_dollar.addr() < self.source.end_addr() { + if after_dollar.is_not_end_of(&self.source) { // If `${`, exit. // SAFETY: Have checked there's at least 1 further byte to read. if after_dollar.read() == b'{' { @@ -251,7 +251,7 @@ impl<'a> Lexer<'a> { // brought up to `chunk_start` again. chunk_start = pos.add(1); - if chunk_start.addr() < self.source.end_addr() { + if chunk_start.is_not_end_of(&self.source) { // If next char is `\n`, start next search after it. // NB: `byte_search!` macro already advances `pos` by 1, so only advance // by 1 here, so that in total we skip 2 bytes for `\r\n`. @@ -282,7 +282,7 @@ impl<'a> Lexer<'a> { // Start next chunk after escape sequence chunk_start = self.source.position(); - assert!(chunk_start.addr() >= after_backslash.addr()); + assert!(chunk_start >= after_backslash); // Continue search after escape sequence. // NB: `byte_search!` macro increments `pos` when return `true`,