diff --git a/README.md b/README.md index 62624a2..dda9a7d 100644 --- a/README.md +++ b/README.md @@ -67,6 +67,10 @@ On the left are rulex expressions (_rulexes_ for short), on the right is the com << 'foo' | 'bar' # (?<=foo|bar) !>> 'foo' | 'bar' # (?!foo|bar) !<< 'foo' | 'bar' # (?test)\k ``` ## Why use this instead of normal regexes? diff --git a/rulex-lib/src/compile.rs b/rulex-lib/src/compile.rs index 44f52f0..adbd41d 100644 --- a/rulex-lib/src/compile.rs +++ b/rulex-lib/src/compile.rs @@ -1,6 +1,10 @@ use std::collections::HashMap; -use crate::{error::CompileError, options::CompileOptions}; +use crate::{ + error::{CompileError, CompileErrorKind}, + options::CompileOptions, + span::Span, +}; pub(crate) type CompileResult = Result<(), CompileError>; @@ -33,6 +37,9 @@ impl Compile for Parens<'_, T> { pub(crate) struct CompileState { pub(crate) next_idx: u32, pub(crate) used_names: HashMap, + + pub(crate) unknown_references: Vec<(String, Span)>, + pub(crate) unknown_groups: Vec<(u32, Span)>, } impl CompileState { @@ -40,6 +47,20 @@ impl CompileState { CompileState { next_idx: 1, used_names: HashMap::new(), + unknown_references: vec![], + unknown_groups: vec![], + } + } + + pub(crate) fn check_validity(self) -> Result<(), CompileError> { + for (group, span) in self.unknown_groups { + if group >= self.next_idx { + return Err(CompileErrorKind::UnknownReferenceNumber(group).at(span)); + } + } + if let Some((reference, span)) = self.unknown_references.into_iter().next() { + return Err(CompileErrorKind::UnknownReferenceName(reference).at(span)); } + Ok(()) } } diff --git a/rulex-lib/src/error/compile_error.rs b/rulex-lib/src/error/compile_error.rs index 5c77857..e9de77e 100644 --- a/rulex-lib/src/error/compile_error.rs +++ b/rulex-lib/src/error/compile_error.rs @@ -28,6 +28,15 @@ pub enum CompileErrorKind { #[error("Compile error: Unsupported feature `{}` in the `{:?}` regex flavor", .0.name(), .1)] Unsupported(Feature, RegexFlavor), + #[error("Group references this large aren't supported")] + HugeReference, + + #[error("Reference to unknown group. There is no group number {}", .0)] + UnknownReferenceNumber(u32), + + #[error("Reference to unknown group. There is no group named `{}`", .0)] + UnknownReferenceName(String), + #[error("Compile error: Group name `{}` used multiple times", .0)] NameUsedMultipleTimes(String), @@ -50,6 +59,10 @@ pub enum Feature { Lookaround, Grapheme, UnicodeLineBreak, + Backreference, + ForwardReference, + RelativeReference, + NonNegativeRelativeReference, } impl Feature { @@ -59,6 +72,10 @@ impl Feature { Feature::Lookaround => "lookahead/behind", Feature::Grapheme => "grapheme cluster matcher (\\X)", Feature::UnicodeLineBreak => "Unicode line break (\\R)", + Feature::Backreference => "Backreference", + Feature::ForwardReference => "Forward reference", + Feature::RelativeReference => "Relative backreference", + Feature::NonNegativeRelativeReference => "Non-negative relative backreference", } } } diff --git a/rulex-lib/src/group.rs b/rulex-lib/src/group.rs index bbb0d1c..dcfb672 100644 --- a/rulex-lib/src/group.rs +++ b/rulex-lib/src/group.rs @@ -34,8 +34,8 @@ impl<'i> Group<'i> { } } - pub fn set_capture(&mut self, capture: Option>) { - self.capture = capture; + pub fn set_capture(&mut self, capture: Capture<'i>) { + self.capture = Some(capture); } pub fn needs_parens_before_repetition(&self) -> bool { @@ -61,6 +61,7 @@ impl Compile for Group<'_> { ); } state.used_names.insert(name.to_string(), state.next_idx); + state.unknown_references.retain(|(s, _)| s != name); state.next_idx += 1; // https://www.regular-expressions.info/named.html @@ -68,10 +69,13 @@ impl Compile for Group<'_> { RegexFlavor::Python | RegexFlavor::Pcre => { buf.push_str("(?P<"); } - RegexFlavor::DotNet | RegexFlavor::Java | RegexFlavor::Ruby => { + RegexFlavor::DotNet + | RegexFlavor::Java + | RegexFlavor::Ruby + | RegexFlavor::JavaScript => { buf.push_str("(?<"); } - RegexFlavor::JavaScript | RegexFlavor::Rust => { + RegexFlavor::Rust => { return Err(CompileErrorKind::Unsupported( Feature::NamedCaptureGroups, options.flavor, diff --git a/rulex-lib/src/lib.rs b/rulex-lib/src/lib.rs index 13a35a7..8423e55 100644 --- a/rulex-lib/src/lib.rs +++ b/rulex-lib/src/lib.rs @@ -8,6 +8,7 @@ pub mod literal; pub mod lookaround; pub mod options; pub mod parse; +pub mod reference; pub mod repetition; pub mod span; diff --git a/rulex-lib/src/parse/parsers.rs b/rulex-lib/src/parse/parsers.rs index 0236e2f..e7eaed9 100644 --- a/rulex-lib/src/parse/parsers.rs +++ b/rulex-lib/src/parse/parsers.rs @@ -2,7 +2,7 @@ use nom::{ branch::alt, combinator::{cut, map, opt, value}, multi::{many0, many1, separated_list0}, - sequence::{pair, separated_pair, tuple}, + sequence::{pair, preceded, separated_pair, tuple}, IResult, Parser, }; @@ -18,6 +18,7 @@ use crate::{ group::{Capture, Group}, literal::Literal, lookaround::{Lookaround, LookaroundKind}, + reference::{Reference, ReferenceTarget}, repetition::{Greedy, Repetition, RepetitionKind}, span::Span, Rulex, @@ -125,10 +126,6 @@ pub(super) fn parse_repetition<'i, 'b>( pub(super) fn parse_braced_repetition<'i, 'b>( input: Input<'i, 'b>, ) -> PResult<'i, 'b, (RepetitionKind, Span)> { - fn str_to_u32(s: &str) -> Result { - str::parse(s).map_err(|_| ParseErrorKind::Number(NumberError::TooLarge)) - } - fn parse_u32<'i, 'b>(input: Input<'i, 'b>) -> PResult<'i, 'b, u32> { try_map(Token::Number, |(s, _)| str_to_u32(s), nom::Err::Failure)(input) } @@ -157,6 +154,7 @@ pub(super) fn parse_atom<'i, 'b>(input: Input<'i, 'b>) -> PResult<'i, 'b, Rulex< parse_char_class, parse_grapheme, parse_boundary, + parse_reference, map(parse_code_point, |(c, span)| { Rulex::CharClass(CharClass::new(CharGroup::from_char(c), span)) }), @@ -166,10 +164,11 @@ pub(super) fn parse_atom<'i, 'b>(input: Input<'i, 'b>) -> PResult<'i, 'b, Rulex< } pub(super) fn parse_group<'i, 'b>(input: Input<'i, 'b>) -> PResult<'i, 'b, Rulex<'i>> { - fn parse_capture<'i, 'b>(input: Input<'i, 'b>) -> PResult<'i, 'b, Capture<'i>> { - map(pair(Token::Colon, opt(Token::Identifier)), |(_, name)| { - Capture::new(name.map(|(s, _)| s)) - })(input) + fn parse_capture<'i, 'b>(input: Input<'i, 'b>) -> PResult<'i, 'b, (Capture<'i>, Span)> { + map( + pair(Token::Colon, opt(Token::Identifier)), + |((_, span1), name)| (Capture::new(name.map(|(s, _)| s)), span1), + )(input) } map( @@ -177,13 +176,18 @@ pub(super) fn parse_group<'i, 'b>(input: Input<'i, 'b>) -> PResult<'i, 'b, Rulex opt(parse_capture), tuple((Token::OpenParen, parse_or, cut(Token::CloseParen))), ), - |(capture, ((_, start), rule, (_, end)))| match (capture, rule) { + |(capture, (_, rule, (_, close_paren)))| match (capture, rule) { (None, rule) => rule, - (capture, Rulex::Group(mut g)) => { + (Some((capture, c_span)), Rulex::Group(mut g)) => { g.set_capture(capture); + g.span.start = c_span.start; Rulex::Group(g) } - (capture, rule) => Rulex::Group(Group::new(vec![rule], capture, start.join(end))), + (Some((capture, c_span)), rule) => Rulex::Group(Group::new( + vec![rule], + Some(capture), + c_span.join(close_paren), + )), }, )(input) } @@ -368,6 +372,48 @@ pub(super) fn parse_boundary<'i, 'b>(input: Input<'i, 'b>) -> PResult<'i, 'b, Ru )(input) } +pub(super) fn parse_reference<'i, 'b>(input: Input<'i, 'b>) -> PResult<'i, 'b, Rulex<'i>> { + preceded( + Token::Backref, + alt(( + try_map( + Token::Number, + |(s, span)| { + let target = ReferenceTarget::Number(str_to_u32(s)?); + Ok(Rulex::Reference(Reference::new(target, span))) + }, + nom::Err::Failure, + ), + map(Token::Identifier, |(s, span)| { + let target = ReferenceTarget::Named(s); + Rulex::Reference(Reference::new(target, span)) + }), + try_map( + pair(alt((Token::Plus, Token::Dash)), Token::Number), + |((sign, span1), (s, span2))| { + let num = if sign == "-" { + str_to_i32(&format!("-{s}")) + } else { + str_to_i32(s) + }?; + let target = ReferenceTarget::Relative(num); + Ok(Rulex::Reference(Reference::new(target, span1.join(span2)))) + }, + nom::Err::Failure, + ), + err(|| ParseErrorKind::Expected("number or group name")), + )), + )(input) +} + +fn str_to_u32(s: &str) -> Result { + str::parse(s).map_err(|_| ParseErrorKind::Number(NumberError::TooLarge)) +} + +fn str_to_i32(s: &str) -> Result { + str::parse(s).map_err(|_| ParseErrorKind::Number(NumberError::TooLarge)) +} + fn strip_first_last(s: &str) -> &str { &s[1..s.len() - 1] } diff --git a/rulex-lib/src/reference.rs b/rulex-lib/src/reference.rs new file mode 100644 index 0000000..b0f2d55 --- /dev/null +++ b/rulex-lib/src/reference.rs @@ -0,0 +1,172 @@ +use crate::{ + compile::{Compile, CompileResult, CompileState}, + error::{CompileErrorKind, Feature}, + options::{CompileOptions, RegexFlavor}, + span::Span, +}; + +#[derive(Clone, Copy, PartialEq, Eq)] +pub struct Reference<'i> { + pub(crate) target: ReferenceTarget<'i>, + pub(crate) span: Span, +} + +impl<'i> Reference<'i> { + pub(crate) fn new(target: ReferenceTarget<'i>, span: Span) -> Self { + Reference { target, span } + } +} + +#[derive(Clone, Copy, PartialEq, Eq)] +pub(crate) enum ReferenceTarget<'i> { + Named(&'i str), + Number(u32), + Relative(i32), +} + +#[derive(Clone, Copy, PartialEq, Eq)] +enum ReferenceDirection { + Backwards, + Forwards, +} + +#[cfg(feature = "dbg")] +impl std::fmt::Debug for Reference<'_> { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self.target { + ReferenceTarget::Named(n) => write!(f, "::{}", n), + ReferenceTarget::Number(i) => write!(f, "::{}", i), + ReferenceTarget::Relative(o) => write!(f, "::{}{}", if o < 0 { '-' } else { '+' }, o), + } + } +} + +impl Compile for Reference<'_> { + fn comp( + &self, + options: CompileOptions, + state: &mut CompileState, + buf: &mut String, + ) -> CompileResult { + use std::fmt::Write; + + //TODO: Warn in JS mode when referencing an optional group + + let direction = match self.target { + ReferenceTarget::Named(name) => { + match options.flavor { + RegexFlavor::Pcre + | RegexFlavor::JavaScript + | RegexFlavor::Java + | RegexFlavor::DotNet + | RegexFlavor::Ruby => { + buf.push_str("\\k<"); + buf.push_str(name); + buf.push('>'); + } + RegexFlavor::Python => { + buf.push_str("(?P="); + buf.push_str(name); + buf.push(')'); + } + + // return error below + RegexFlavor::Rust => {} + } + + if state.used_names.contains_key(name) { + ReferenceDirection::Backwards + } else { + state.unknown_references.push((name.to_string(), self.span)); + ReferenceDirection::Forwards + } + } + ReferenceTarget::Number(idx) => { + if idx > 99 { + return Err(CompileErrorKind::HugeReference.at(self.span)); + } + + match options.flavor { + RegexFlavor::Pcre + | RegexFlavor::JavaScript + | RegexFlavor::Java + | RegexFlavor::DotNet + | RegexFlavor::Ruby + | RegexFlavor::Python => { + write!(buf, "\\{idx}").unwrap(); + } + + // return error below + RegexFlavor::Rust => {} + } + + if idx >= state.next_idx { + state.unknown_groups.push((idx, self.span)); + ReferenceDirection::Forwards + } else { + ReferenceDirection::Backwards + } + } + ReferenceTarget::Relative(offset) => { + //TODO convert relative to absolute references + + if offset >= 0 { + return Err(CompileErrorKind::Unsupported( + Feature::NonNegativeRelativeReference, + options.flavor, + ) + .at(self.span)); + } + + match options.flavor { + RegexFlavor::Ruby => { + write!(buf, "\\k<{offset}>").unwrap(); + } + RegexFlavor::Pcre => { + write!(buf, "\\g{{{offset}}}").unwrap(); + } + + RegexFlavor::DotNet + | RegexFlavor::Java + | RegexFlavor::JavaScript + | RegexFlavor::Python => { + return Err(CompileErrorKind::Unsupported( + Feature::RelativeReference, + options.flavor, + ) + .at(self.span)); + } + + // return error below + RegexFlavor::Rust => {} + } + + if offset >= 0 { + ReferenceDirection::Forwards + } else { + ReferenceDirection::Backwards + } + } + }; + + match options.flavor { + RegexFlavor::Rust => Err(CompileErrorKind::Unsupported( + if direction == ReferenceDirection::Backwards { + Feature::Backreference + } else { + Feature::ForwardReference + }, + options.flavor, + ) + .at(self.span)), + RegexFlavor::JavaScript if direction == ReferenceDirection::Forwards => { + //TODO: Return "unknown group name" if this name isn't found + Err( + CompileErrorKind::Unsupported(Feature::ForwardReference, options.flavor) + .at(self.span), + ) + } + _ => Ok(()), + } + } +} diff --git a/rulex-lib/src/rule.rs b/rulex-lib/src/rule.rs index 9b34eb9..330f663 100644 --- a/rulex-lib/src/rule.rs +++ b/rulex-lib/src/rule.rs @@ -9,6 +9,7 @@ use crate::{ literal::Literal, lookaround::Lookaround, options::{CompileOptions, ParseOptions}, + reference::Reference, repetition::Repetition, span::Span, }; @@ -33,6 +34,8 @@ pub enum Rulex<'i> { Boundary(Boundary), /// A (positive or negative) lookahead or lookbehind. Lookaround(Box>), + /// A backreference or forward reference. + Reference(Reference<'i>), } impl<'i> Rulex<'i> { @@ -42,14 +45,18 @@ impl<'i> Rulex<'i> { pub fn compile(&self, options: CompileOptions) -> Result { let mut buf = String::new(); - self.comp(options, &mut CompileState::new(), &mut buf)?; + let mut state = CompileState::new(); + self.comp(options, &mut state, &mut buf)?; + state.check_validity()?; Ok(buf) } pub fn parse_and_compile(input: &str, options: CompileOptions) -> Result { let parsed = Rulex::parse(input, options.parse_options)?; let mut buf = String::new(); - parsed.comp(options, &mut CompileState::new(), &mut buf)?; + let mut state = CompileState::new(); + parsed.comp(options, &mut state, &mut buf)?; + state.check_validity()?; Ok(buf) } @@ -62,7 +69,8 @@ impl<'i> Rulex<'i> { | Rulex::Grapheme(_) | Rulex::Repetition(_) | Rulex::Boundary(_) - | Rulex::Lookaround(_) => false, + | Rulex::Lookaround(_) + | Rulex::Reference(_) => false, } } @@ -75,7 +83,8 @@ impl<'i> Rulex<'i> { | Rulex::Grapheme(_) | Rulex::Repetition(_) | Rulex::Boundary(_) - | Rulex::Lookaround(_) => false, + | Rulex::Lookaround(_) + | Rulex::Reference(_) => false, } } @@ -89,6 +98,7 @@ impl<'i> Rulex<'i> { Rulex::Repetition(r) => r.span, Rulex::Boundary(b) => b.span, Rulex::Lookaround(l) => l.span, + Rulex::Reference(r) => r.span, } } } @@ -97,14 +107,15 @@ impl<'i> Rulex<'i> { impl core::fmt::Debug for Rulex<'_> { fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { match self { - Self::Literal(arg0) => arg0.fmt(f), - Self::CharClass(arg0) => arg0.fmt(f), + Rulex::Literal(arg0) => arg0.fmt(f), + Rulex::CharClass(arg0) => arg0.fmt(f), Rulex::Grapheme(arg0) => arg0.fmt(f), - Self::Group(arg0) => arg0.fmt(f), - Self::Alternation(arg0) => arg0.fmt(f), - Self::Repetition(arg0) => arg0.fmt(f), - Self::Boundary(arg0) => arg0.fmt(f), - Self::Lookaround(arg0) => arg0.fmt(f), + Rulex::Group(arg0) => arg0.fmt(f), + Rulex::Alternation(arg0) => arg0.fmt(f), + Rulex::Repetition(arg0) => arg0.fmt(f), + Rulex::Boundary(arg0) => arg0.fmt(f), + Rulex::Lookaround(arg0) => arg0.fmt(f), + Rulex::Reference(arg0) => arg0.fmt(f), } } } @@ -125,6 +136,7 @@ impl Compile for Rulex<'_> { Rulex::Repetition(r) => r.comp(options, state, buf), Rulex::Boundary(b) => b.comp(options, state, buf), Rulex::Lookaround(l) => l.comp(options, state, buf), + Rulex::Reference(r) => r.comp(options, state, buf), } } }