Skip to content

Commit

Permalink
feat(regular_expression): Validate max quantifier value (#5218)
Browse files Browse the repository at this point in the history
I've never seen but `/a{9007199254740991}/` is valid and this is the maximum value for quantifier.

\+ left comment about #5210 experiment.
  • Loading branch information
leaysgur committed Aug 26, 2024
1 parent b39c0d6 commit 46b641b
Show file tree
Hide file tree
Showing 4 changed files with 56 additions and 15 deletions.
4 changes: 2 additions & 2 deletions crates/oxc_regular_expression/src/ast.rs
Original file line number Diff line number Diff line change
Expand Up @@ -98,9 +98,9 @@ pub enum LookAroundAssertionKind {
#[derive(Debug)]
pub struct Quantifier<'a> {
pub span: Span,
pub min: u32,
pub min: u64,
/// `None` means no upper bound.
pub max: Option<u32>,
pub max: Option<u64>,
pub greedy: bool,
pub body: Term<'a>,
}
Expand Down
4 changes: 4 additions & 0 deletions crates/oxc_regular_expression/src/body_parser/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,8 @@ mod test {
("a{1,", ParserOptions::default()),
("a{1,}", ParserOptions::default()),
("a{1,2}", ParserOptions::default()),
("x{9007199254740991}", ParserOptions::default()),
("x{9007199254740991,9007199254740991}", ParserOptions::default()),
("a|b", ParserOptions::default()),
("a|b|c", ParserOptions::default()),
("a|b+?|c", ParserOptions::default()),
Expand Down Expand Up @@ -137,6 +139,8 @@ mod test {
("a{1", ParserOptions::default().with_unicode_mode()),
("a{1,", ParserOptions::default().with_unicode_mode()),
("a{,", ParserOptions::default().with_unicode_mode()),
("x{9007199254740992}", ParserOptions::default()),
("x{9007199254740991,9007199254740992}", ParserOptions::default()),
("(?=a", ParserOptions::default()),
("(?<!a", ParserOptions::default()),
(r"\c0", ParserOptions::default().with_unicode_mode()),
Expand Down
36 changes: 32 additions & 4 deletions crates/oxc_regular_expression/src/body_parser/parser.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1565,7 +1565,8 @@ impl<'a> PatternParser<'a> {
// ```
/// Returns: ((min, max), greedy)
#[allow(clippy::type_complexity)]
fn consume_quantifier(&mut self) -> Result<Option<((u32, Option<u32>), bool)>> {
fn consume_quantifier(&mut self) -> Result<Option<((u64, Option<u64>), bool)>> {
const MAX_QUANTIFIER: u64 = 9_007_199_254_740_991; // 2^53 - 1
let is_greedy = |reader: &mut Reader| !reader.eat('?');

if self.reader.eat('*') {
Expand All @@ -1583,11 +1584,27 @@ impl<'a> PatternParser<'a> {
if self.reader.eat('{') {
if let Some(min) = self.consume_decimal_digits() {
if self.reader.eat('}') {
if MAX_QUANTIFIER < min {
return Err(OxcDiagnostic::error(
"Number is too large in braced quantifier",
)
.with_label(self.span_factory.create(span_start, self.reader.offset())));
}

return Ok(Some(((min, Some(min)), is_greedy(&mut self.reader))));
}

if self.reader.eat(',') {
if self.reader.eat('}') {
if MAX_QUANTIFIER < min {
return Err(OxcDiagnostic::error(
"Number is too large in braced quantifier",
)
.with_label(
self.span_factory.create(span_start, self.reader.offset()),
));
}

return Ok(Some(((min, None), is_greedy(&mut self.reader))));
}

Expand All @@ -1603,6 +1620,14 @@ impl<'a> PatternParser<'a> {
self.span_factory.create(span_start, self.reader.offset()),
));
}
if MAX_QUANTIFIER < min || MAX_QUANTIFIER < max {
return Err(OxcDiagnostic::error(
"Number is too large in braced quantifier",
)
.with_label(
self.span_factory.create(span_start, self.reader.offset()),
));
}

return Ok(Some(((min, Some(max)), is_greedy(&mut self.reader))));
}
Expand All @@ -1626,7 +1651,8 @@ impl<'a> PatternParser<'a> {
if let Some(index) = self.consume_decimal_digits() {
// \0 is CharacterEscape, not DecimalEscape
if index != 0 {
return Some(index);
#[allow(clippy::cast_possible_truncation)]
return Some(index as u32);
}

self.reader.rewind(checkpoint);
Expand All @@ -1642,13 +1668,15 @@ impl<'a> PatternParser<'a> {
// [+Sep] DecimalDigits[+Sep] NumericLiteralSeparator DecimalDigit
// ```
// ([Sep] is disabled for `QuantifierPrefix` and `DecimalEscape`, skip it)
fn consume_decimal_digits(&mut self) -> Option<u32> {
fn consume_decimal_digits(&mut self) -> Option<u64> {
let checkpoint = self.reader.checkpoint();

let mut value = 0;
while let Some(cp) = self.reader.peek().filter(|&cp| unicode::is_decimal_digit(cp)) {
// `- '0' as u32`: convert code point to digit
value = (10 * value) + (cp - '0' as u32);
#[allow(clippy::cast_lossless)]
let d = (cp - '0' as u32) as u64;
value = (10 * value) + d;
self.reader.advance();
}

Expand Down
27 changes: 18 additions & 9 deletions crates/oxc_regular_expression/src/body_parser/reader.rs
Original file line number Diff line number Diff line change
@@ -1,21 +1,29 @@
// NOTE: The current implementation switches iteration units depending on the mode.
//
// This is all for surrogate pairs in non-unicode mode, but it is required only in limited cases:
// - Group names for named `CapturingGroup`: `(?<name>.)`
// - Group names for `NamedReference`: `\k<name>`
// Even if we skip that distinction, it seems the current test262 cases pass due to other errors.
//
// Therefore, it is possible to change the implementation to iterate on `char` units always,
// assuming that change some output of AST for `Character[kind=Symbol]` to `Character[kind=SurrogatePairs]`.
//
// However, for the following reasons, we keep the current implementation:
// - We want to keep the behavior closer to the specification
// - and also, to prevent any oversight
// - Changing does not have a significant impact on performance
//
// See also: https://github.com/oxc-project/oxc/pull/5210
pub struct Reader<'a> {
source: &'a str,
unicode_mode: bool,
/// Current index for `u8_units`(unicode mode) or `u16_units`(non-unicode mode).
index: usize,
// NOTE: Distinguish these 2 units looks cleaner, but it may not be necessary.
//
// If I understand correctly (and there are no unexpected factors),
// AST `Character[kind=Symbol]` only needs to be aware of this for surrogate pairs.
//
// Therefore, performance might be improved by:
// - using only `u8_units`, and
// - checking if each unit (char) is non-BMP, and if so, converting it into a surrogate pair and emitting 2 units.
// However, I'm not certain this approach is faster than current one using `encode_utf16()` all at once.
/// Iteration units for unicode mode.
/// Even in non-unicode mode, used for `Span` offset calculation.
u8_units: Vec<(usize, char)>,
/// Iteration units for non-unicode mode.
/// To iterate on surrogate pairs, this is needed.
u16_units: Vec<u16>,
/// Last offset caches for non-unicode mode.
last_offset_indices: (usize, usize),
Expand Down Expand Up @@ -67,6 +75,7 @@ impl<'a> Reader<'a> {
self.index += 1;
}

// We need a code point, not a char.
fn peek_nth(&self, n: usize) -> Option<u32> {
let nth = self.index + n;

Expand Down

0 comments on commit 46b641b

Please sign in to comment.