diff --git a/crates/oxc_regular_expression/src/ast.rs b/crates/oxc_regular_expression/src/ast.rs index 76669ef92d746..edd8405ba3d54 100644 --- a/crates/oxc_regular_expression/src/ast.rs +++ b/crates/oxc_regular_expression/src/ast.rs @@ -98,9 +98,9 @@ pub enum LookAroundAssertionKind { #[derive(Debug)] pub struct Quantifier<'a> { pub span: Span, - pub min: u32, + pub min: u64, /// `None` means no upper bound. - pub max: Option, + pub max: Option, pub greedy: bool, pub body: Term<'a>, } diff --git a/crates/oxc_regular_expression/src/body_parser/mod.rs b/crates/oxc_regular_expression/src/body_parser/mod.rs index e6f3804e1ef66..602eb95d206bd 100644 --- a/crates/oxc_regular_expression/src/body_parser/mod.rs +++ b/crates/oxc_regular_expression/src/body_parser/mod.rs @@ -32,6 +32,8 @@ mod test { ("a{1,", ParserOptions::default()), ("a{1,}", ParserOptions::default()), ("a{1,2}", ParserOptions::default()), + ("x{9007199254740991}", ParserOptions::default()), + ("x{9007199254740991,9007199254740991}", ParserOptions::default()), ("a|b", ParserOptions::default()), ("a|b|c", ParserOptions::default()), ("a|b+?|c", ParserOptions::default()), @@ -137,6 +139,8 @@ mod test { ("a{1", ParserOptions::default().with_unicode_mode()), ("a{1,", ParserOptions::default().with_unicode_mode()), ("a{,", ParserOptions::default().with_unicode_mode()), + ("x{9007199254740992}", ParserOptions::default()), + ("x{9007199254740991,9007199254740992}", ParserOptions::default()), ("(?=a", ParserOptions::default()), ("(? PatternParser<'a> { // ``` /// Returns: ((min, max), greedy) #[allow(clippy::type_complexity)] - fn consume_quantifier(&mut self) -> Result), bool)>> { + fn consume_quantifier(&mut self) -> Result), bool)>> { + const MAX_QUANTIFIER: u64 = 9_007_199_254_740_991; // 2^53 - 1 let is_greedy = |reader: &mut Reader| !reader.eat('?'); if self.reader.eat('*') { @@ -1583,11 +1584,27 @@ impl<'a> PatternParser<'a> { if self.reader.eat('{') { if let Some(min) = self.consume_decimal_digits() { if self.reader.eat('}') { + if MAX_QUANTIFIER < min { + return Err(OxcDiagnostic::error( + "Number is too large in braced quantifier", + ) + .with_label(self.span_factory.create(span_start, self.reader.offset()))); + } + return Ok(Some(((min, Some(min)), is_greedy(&mut self.reader)))); } if self.reader.eat(',') { if self.reader.eat('}') { + if MAX_QUANTIFIER < min { + return Err(OxcDiagnostic::error( + "Number is too large in braced quantifier", + ) + .with_label( + self.span_factory.create(span_start, self.reader.offset()), + )); + } + return Ok(Some(((min, None), is_greedy(&mut self.reader)))); } @@ -1603,6 +1620,14 @@ impl<'a> PatternParser<'a> { self.span_factory.create(span_start, self.reader.offset()), )); } + if MAX_QUANTIFIER < min || MAX_QUANTIFIER < max { + return Err(OxcDiagnostic::error( + "Number is too large in braced quantifier", + ) + .with_label( + self.span_factory.create(span_start, self.reader.offset()), + )); + } return Ok(Some(((min, Some(max)), is_greedy(&mut self.reader)))); } @@ -1626,7 +1651,8 @@ impl<'a> PatternParser<'a> { if let Some(index) = self.consume_decimal_digits() { // \0 is CharacterEscape, not DecimalEscape if index != 0 { - return Some(index); + #[allow(clippy::cast_possible_truncation)] + return Some(index as u32); } self.reader.rewind(checkpoint); @@ -1642,13 +1668,15 @@ impl<'a> PatternParser<'a> { // [+Sep] DecimalDigits[+Sep] NumericLiteralSeparator DecimalDigit // ``` // ([Sep] is disabled for `QuantifierPrefix` and `DecimalEscape`, skip it) - fn consume_decimal_digits(&mut self) -> Option { + fn consume_decimal_digits(&mut self) -> Option { let checkpoint = self.reader.checkpoint(); let mut value = 0; while let Some(cp) = self.reader.peek().filter(|&cp| unicode::is_decimal_digit(cp)) { // `- '0' as u32`: convert code point to digit - value = (10 * value) + (cp - '0' as u32); + #[allow(clippy::cast_lossless)] + let d = (cp - '0' as u32) as u64; + value = (10 * value) + d; self.reader.advance(); } diff --git a/crates/oxc_regular_expression/src/body_parser/reader.rs b/crates/oxc_regular_expression/src/body_parser/reader.rs index f066bdceaa121..0949b36766e0f 100644 --- a/crates/oxc_regular_expression/src/body_parser/reader.rs +++ b/crates/oxc_regular_expression/src/body_parser/reader.rs @@ -1,21 +1,29 @@ +// NOTE: The current implementation switches iteration units depending on the mode. +// +// This is all for surrogate pairs in non-unicode mode, but it is required only in limited cases: +// - Group names for named `CapturingGroup`: `(?.)` +// - Group names for `NamedReference`: `\k` +// Even if we skip that distinction, it seems the current test262 cases pass due to other errors. +// +// Therefore, it is possible to change the implementation to iterate on `char` units always, +// assuming that change some output of AST for `Character[kind=Symbol]` to `Character[kind=SurrogatePairs]`. +// +// However, for the following reasons, we keep the current implementation: +// - We want to keep the behavior closer to the specification +// - and also, to prevent any oversight +// - Changing does not have a significant impact on performance +// +// See also: https://github.com/oxc-project/oxc/pull/5210 pub struct Reader<'a> { source: &'a str, unicode_mode: bool, /// Current index for `u8_units`(unicode mode) or `u16_units`(non-unicode mode). index: usize, - // NOTE: Distinguish these 2 units looks cleaner, but it may not be necessary. - // - // If I understand correctly (and there are no unexpected factors), - // AST `Character[kind=Symbol]` only needs to be aware of this for surrogate pairs. - // - // Therefore, performance might be improved by: - // - using only `u8_units`, and - // - checking if each unit (char) is non-BMP, and if so, converting it into a surrogate pair and emitting 2 units. - // However, I'm not certain this approach is faster than current one using `encode_utf16()` all at once. /// Iteration units for unicode mode. /// Even in non-unicode mode, used for `Span` offset calculation. u8_units: Vec<(usize, char)>, /// Iteration units for non-unicode mode. + /// To iterate on surrogate pairs, this is needed. u16_units: Vec, /// Last offset caches for non-unicode mode. last_offset_indices: (usize, usize), @@ -67,6 +75,7 @@ impl<'a> Reader<'a> { self.index += 1; } + // We need a code point, not a char. fn peek_nth(&self, n: usize) -> Option { let nth = self.index + n;