feat(regular_expression): Validate max quantifier value (#5218)

I've never seen but `/a{9007199254740991}/` is valid and this is the maximum value for quantifier. \+ left comment about #5210 experiment.
oxc-project · Aug 26, 2024 · 46b641b · 46b641b
1 parent b39c0d6
commit 46b641b
Show file tree

Hide file tree

Showing 4 changed files with 56 additions and 15 deletions.
diff --git a/crates/oxc_regular_expression/src/ast.rs b/crates/oxc_regular_expression/src/ast.rs
@@ -98,9 +98,9 @@ pub enum LookAroundAssertionKind {
 #[derive(Debug)]
 pub struct Quantifier<'a> {
     pub span: Span,
-    pub min: u32,
+    pub min: u64,
     /// `None` means no upper bound.
-    pub max: Option<u32>,
+    pub max: Option<u64>,
     pub greedy: bool,
     pub body: Term<'a>,
 }

diff --git a/crates/oxc_regular_expression/src/body_parser/mod.rs b/crates/oxc_regular_expression/src/body_parser/mod.rs
@@ -32,6 +32,8 @@ mod test {
             ("a{1,", ParserOptions::default()),
             ("a{1,}", ParserOptions::default()),
             ("a{1,2}", ParserOptions::default()),
+            ("x{9007199254740991}", ParserOptions::default()),
+            ("x{9007199254740991,9007199254740991}", ParserOptions::default()),
             ("a|b", ParserOptions::default()),
             ("a|b|c", ParserOptions::default()),
             ("a|b+?|c", ParserOptions::default()),
@@ -137,6 +139,8 @@ mod test {
             ("a{1", ParserOptions::default().with_unicode_mode()),
             ("a{1,", ParserOptions::default().with_unicode_mode()),
             ("a{,", ParserOptions::default().with_unicode_mode()),
+            ("x{9007199254740992}", ParserOptions::default()),
+            ("x{9007199254740991,9007199254740992}", ParserOptions::default()),
             ("(?=a", ParserOptions::default()),
             ("(?<!a", ParserOptions::default()),
             (r"\c0", ParserOptions::default().with_unicode_mode()),

diff --git a/crates/oxc_regular_expression/src/body_parser/parser.rs b/crates/oxc_regular_expression/src/body_parser/parser.rs
@@ -1565,7 +1565,8 @@ impl<'a> PatternParser<'a> {
     // ```
     /// Returns: ((min, max), greedy)
     #[allow(clippy::type_complexity)]
-    fn consume_quantifier(&mut self) -> Result<Option<((u32, Option<u32>), bool)>> {
+    fn consume_quantifier(&mut self) -> Result<Option<((u64, Option<u64>), bool)>> {
+        const MAX_QUANTIFIER: u64 = 9_007_199_254_740_991; // 2^53 - 1
         let is_greedy = |reader: &mut Reader| !reader.eat('?');
 
         if self.reader.eat('*') {
@@ -1583,11 +1584,27 @@ impl<'a> PatternParser<'a> {
         if self.reader.eat('{') {
             if let Some(min) = self.consume_decimal_digits() {
                 if self.reader.eat('}') {
+                    if MAX_QUANTIFIER < min {
+                        return Err(OxcDiagnostic::error(
+                            "Number is too large in braced quantifier",
+                        )
+                        .with_label(self.span_factory.create(span_start, self.reader.offset())));
+                    }
+
                     return Ok(Some(((min, Some(min)), is_greedy(&mut self.reader))));
                 }
 
                 if self.reader.eat(',') {
                     if self.reader.eat('}') {
+                        if MAX_QUANTIFIER < min {
+                            return Err(OxcDiagnostic::error(
+                                "Number is too large in braced quantifier",
+                            )
+                            .with_label(
+                                self.span_factory.create(span_start, self.reader.offset()),
+                            ));
+                        }
+
                         return Ok(Some(((min, None), is_greedy(&mut self.reader))));
                     }
 
@@ -1603,6 +1620,14 @@ impl<'a> PatternParser<'a> {
                                     self.span_factory.create(span_start, self.reader.offset()),
                                 ));
                             }
+                            if MAX_QUANTIFIER < min || MAX_QUANTIFIER < max {
+                                return Err(OxcDiagnostic::error(
+                                    "Number is too large in braced quantifier",
+                                )
+                                .with_label(
+                                    self.span_factory.create(span_start, self.reader.offset()),
+                                ));
+                            }
 
                             return Ok(Some(((min, Some(max)), is_greedy(&mut self.reader))));
                         }
@@ -1626,7 +1651,8 @@ impl<'a> PatternParser<'a> {
         if let Some(index) = self.consume_decimal_digits() {
             // \0 is CharacterEscape, not DecimalEscape
             if index != 0 {
-                return Some(index);
+                #[allow(clippy::cast_possible_truncation)]
+                return Some(index as u32);
             }
 
             self.reader.rewind(checkpoint);
@@ -1642,13 +1668,15 @@ impl<'a> PatternParser<'a> {
     //   [+Sep] DecimalDigits[+Sep] NumericLiteralSeparator DecimalDigit
     // ```
     // ([Sep] is disabled for `QuantifierPrefix` and `DecimalEscape`, skip it)
-    fn consume_decimal_digits(&mut self) -> Option<u32> {
+    fn consume_decimal_digits(&mut self) -> Option<u64> {
         let checkpoint = self.reader.checkpoint();
 
         let mut value = 0;
         while let Some(cp) = self.reader.peek().filter(|&cp| unicode::is_decimal_digit(cp)) {
             // `- '0' as u32`: convert code point to digit
-            value = (10 * value) + (cp - '0' as u32);
+            #[allow(clippy::cast_lossless)]
+            let d = (cp - '0' as u32) as u64;
+            value = (10 * value) + d;
             self.reader.advance();
         }
 

diff --git a/crates/oxc_regular_expression/src/body_parser/reader.rs b/crates/oxc_regular_expression/src/body_parser/reader.rs
@@ -1,21 +1,29 @@
+// NOTE: The current implementation switches iteration units depending on the mode.
+//
+// This is all for surrogate pairs in non-unicode mode, but it is required only in limited cases:
+// - Group names for named `CapturingGroup`: `(?<name>.)`
+// - Group names for `NamedReference`: `\k<name>`
+// Even if we skip that distinction, it seems the current test262 cases pass due to other errors.
+//
+// Therefore, it is possible to change the implementation to iterate on `char` units always,
+// assuming that change some output of AST for `Character[kind=Symbol]` to `Character[kind=SurrogatePairs]`.
+//
+// However, for the following reasons, we keep the current implementation:
+// - We want to keep the behavior closer to the specification
+//   - and also, to prevent any oversight
+// - Changing does not have a significant impact on performance
+//
+// See also: https://github.com/oxc-project/oxc/pull/5210
 pub struct Reader<'a> {
     source: &'a str,
     unicode_mode: bool,
     /// Current index for `u8_units`(unicode mode) or `u16_units`(non-unicode mode).
     index: usize,
-    // NOTE: Distinguish these 2 units looks cleaner, but it may not be necessary.
-    //
-    // If I understand correctly (and there are no unexpected factors),
-    // AST `Character[kind=Symbol]` only needs to be aware of this for surrogate pairs.
-    //
-    // Therefore, performance might be improved by:
-    // - using only `u8_units`, and
-    // - checking if each unit (char) is non-BMP, and if so, converting it into a surrogate pair and emitting 2 units.
-    // However, I'm not certain this approach is faster than current one using `encode_utf16()` all at once.
     /// Iteration units for unicode mode.
     /// Even in non-unicode mode, used for `Span` offset calculation.
     u8_units: Vec<(usize, char)>,
     /// Iteration units for non-unicode mode.
+    /// To iterate on surrogate pairs, this is needed.
     u16_units: Vec<u16>,
     /// Last offset caches for non-unicode mode.
     last_offset_indices: (usize, usize),
@@ -67,6 +75,7 @@ impl<'a> Reader<'a> {
         self.index += 1;
     }
 
+    // We need a code point, not a char.
     fn peek_nth(&self, n: usize) -> Option<u32> {
         let nth = self.index + n;