Skip to content

Commit

Permalink
Support backreferences
Browse files Browse the repository at this point in the history
  • Loading branch information
Aloso committed Mar 11, 2022
1 parent e1df0b3 commit c0b755e
Show file tree
Hide file tree
Showing 8 changed files with 305 additions and 28 deletions.
4 changes: 4 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,10 @@ On the left are rulex expressions (_rulexes_ for short), on the right is the com
<< 'foo' | 'bar' # (?<=foo|bar)
!>> 'foo' | 'bar' # (?!foo|bar)
!<< 'foo' | 'bar' # (?<!foo|bar)

# Backreferences
:('test') ::1 # (test)\1
:name('test') ::name # (?P<name>test)\k<name>
```

## Why use this instead of normal regexes?
Expand Down
23 changes: 22 additions & 1 deletion rulex-lib/src/compile.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@
use std::collections::HashMap;

use crate::{error::CompileError, options::CompileOptions};
use crate::{
error::{CompileError, CompileErrorKind},
options::CompileOptions,
span::Span,
};

pub(crate) type CompileResult = Result<(), CompileError>;

Expand Down Expand Up @@ -33,13 +37,30 @@ impl<T: Compile> Compile for Parens<'_, T> {
/// Bookkeeping that is threaded through compilation and finally consumed by
/// `CompileState::check_validity`.
pub(crate) struct CompileState {
/// Index that the next capturing group will be recorded under. `new()` starts it at 1.
pub(crate) next_idx: u32,
/// Names of the named capturing groups compiled so far, mapped to the index
/// that was current when the group was compiled.
pub(crate) used_names: HashMap<String, u32>,

/// Named references (name and source span) that have not been matched to a group.
/// Compiling a named group removes the entries carrying its name; anything left over
/// makes `check_validity` fail.
/// NOTE(review): the code that pushes entries is not visible here — presumably the
/// reference compiler; confirm.
pub(crate) unknown_references: Vec<(String, Span)>,
/// Numbered references (group number and source span) that `check_validity` compares
/// against `next_idx` once compilation is finished.
/// NOTE(review): the code that pushes entries is not visible here — confirm.
pub(crate) unknown_groups: Vec<(u32, Span)>,
}

impl CompileState {
    /// Creates an empty state: the first group gets index 1, no names are
    /// known and no references are pending.
    pub(crate) fn new() -> Self {
        Self {
            next_idx: 1,
            used_names: HashMap::new(),
            unknown_references: Vec::new(),
            unknown_groups: Vec::new(),
        }
    }

    /// Consumes the state and checks the references that are still pending.
    ///
    /// # Errors
    ///
    /// * `UnknownReferenceNumber` for the first recorded numbered reference
    ///   whose number is not below `next_idx`.
    /// * otherwise `UnknownReferenceName` for the first named reference that
    ///   is still listed in `unknown_references`.
    pub(crate) fn check_validity(self) -> Result<(), CompileError> {
        let Self {
            next_idx,
            unknown_references,
            unknown_groups,
            ..
        } = self;

        // Numbered references are checked before named ones.
        let out_of_range = unknown_groups
            .into_iter()
            .find(|(number, _)| *number >= next_idx);
        if let Some((number, span)) = out_of_range {
            return Err(CompileErrorKind::UnknownReferenceNumber(number).at(span));
        }

        // Any name still pending at this point is reported; only the first one matters.
        match unknown_references.into_iter().next() {
            Some((name, span)) => Err(CompileErrorKind::UnknownReferenceName(name).at(span)),
            None => Ok(()),
        }
    }
}
17 changes: 17 additions & 0 deletions rulex-lib/src/error/compile_error.rs
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,15 @@ pub enum CompileErrorKind {
#[error("Compile error: Unsupported feature `{}` in the `{:?}` regex flavor", .0.name(), .1)]
Unsupported(Feature, RegexFlavor),

#[error("Group references this large aren't supported")]
HugeReference,

#[error("Reference to unknown group. There is no group number {}", .0)]
UnknownReferenceNumber(u32),

#[error("Reference to unknown group. There is no group named `{}`", .0)]
UnknownReferenceName(String),

#[error("Compile error: Group name `{}` used multiple times", .0)]
NameUsedMultipleTimes(String),

Expand All @@ -50,6 +59,10 @@ pub enum Feature {
Lookaround,
Grapheme,
UnicodeLineBreak,
Backreference,
ForwardReference,
RelativeReference,
NonNegativeRelativeReference,
}

impl Feature {
Expand All @@ -59,6 +72,10 @@ impl Feature {
Feature::Lookaround => "lookahead/behind",
Feature::Grapheme => "grapheme cluster matcher (\\X)",
Feature::UnicodeLineBreak => "Unicode line break (\\R)",
Feature::Backreference => "Backreference",
Feature::ForwardReference => "Forward reference",
Feature::RelativeReference => "Relative backreference",
Feature::NonNegativeRelativeReference => "Non-negative relative backreference",
}
}
}
Expand Down
12 changes: 8 additions & 4 deletions rulex-lib/src/group.rs
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,8 @@ impl<'i> Group<'i> {
}
}

pub fn set_capture(&mut self, capture: Option<Capture<'i>>) {
self.capture = capture;
pub fn set_capture(&mut self, capture: Capture<'i>) {
self.capture = Some(capture);
}

pub fn needs_parens_before_repetition(&self) -> bool {
Expand All @@ -61,17 +61,21 @@ impl Compile for Group<'_> {
);
}
state.used_names.insert(name.to_string(), state.next_idx);
state.unknown_references.retain(|(s, _)| s != name);
state.next_idx += 1;

// https://www.regular-expressions.info/named.html
match options.flavor {
RegexFlavor::Python | RegexFlavor::Pcre => {
buf.push_str("(?P<");
}
RegexFlavor::DotNet | RegexFlavor::Java | RegexFlavor::Ruby => {
RegexFlavor::DotNet
| RegexFlavor::Java
| RegexFlavor::Ruby
| RegexFlavor::JavaScript => {
buf.push_str("(?<");
}
RegexFlavor::JavaScript | RegexFlavor::Rust => {
RegexFlavor::Rust => {
return Err(CompileErrorKind::Unsupported(
Feature::NamedCaptureGroups,
options.flavor,
Expand Down
1 change: 1 addition & 0 deletions rulex-lib/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ pub mod literal;
pub mod lookaround;
pub mod options;
pub mod parse;
pub mod reference;
pub mod repetition;
pub mod span;

Expand Down
70 changes: 58 additions & 12 deletions rulex-lib/src/parse/parsers.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ use nom::{
branch::alt,
combinator::{cut, map, opt, value},
multi::{many0, many1, separated_list0},
sequence::{pair, separated_pair, tuple},
sequence::{pair, preceded, separated_pair, tuple},
IResult, Parser,
};

Expand All @@ -18,6 +18,7 @@ use crate::{
group::{Capture, Group},
literal::Literal,
lookaround::{Lookaround, LookaroundKind},
reference::{Reference, ReferenceTarget},
repetition::{Greedy, Repetition, RepetitionKind},
span::Span,
Rulex,
Expand Down Expand Up @@ -125,10 +126,6 @@ pub(super) fn parse_repetition<'i, 'b>(
pub(super) fn parse_braced_repetition<'i, 'b>(
input: Input<'i, 'b>,
) -> PResult<'i, 'b, (RepetitionKind, Span)> {
fn str_to_u32(s: &str) -> Result<u32, ParseErrorKind> {
str::parse(s).map_err(|_| ParseErrorKind::Number(NumberError::TooLarge))
}

fn parse_u32<'i, 'b>(input: Input<'i, 'b>) -> PResult<'i, 'b, u32> {
try_map(Token::Number, |(s, _)| str_to_u32(s), nom::Err::Failure)(input)
}
Expand Down Expand Up @@ -157,6 +154,7 @@ pub(super) fn parse_atom<'i, 'b>(input: Input<'i, 'b>) -> PResult<'i, 'b, Rulex<
parse_char_class,
parse_grapheme,
parse_boundary,
parse_reference,
map(parse_code_point, |(c, span)| {
Rulex::CharClass(CharClass::new(CharGroup::from_char(c), span))
}),
Expand All @@ -166,24 +164,30 @@ pub(super) fn parse_atom<'i, 'b>(input: Input<'i, 'b>) -> PResult<'i, 'b, Rulex<
}

pub(super) fn parse_group<'i, 'b>(input: Input<'i, 'b>) -> PResult<'i, 'b, Rulex<'i>> {
fn parse_capture<'i, 'b>(input: Input<'i, 'b>) -> PResult<'i, 'b, Capture<'i>> {
map(pair(Token::Colon, opt(Token::Identifier)), |(_, name)| {
Capture::new(name.map(|(s, _)| s))
})(input)
fn parse_capture<'i, 'b>(input: Input<'i, 'b>) -> PResult<'i, 'b, (Capture<'i>, Span)> {
map(
pair(Token::Colon, opt(Token::Identifier)),
|((_, span1), name)| (Capture::new(name.map(|(s, _)| s)), span1),
)(input)
}

map(
pair(
opt(parse_capture),
tuple((Token::OpenParen, parse_or, cut(Token::CloseParen))),
),
|(capture, ((_, start), rule, (_, end)))| match (capture, rule) {
|(capture, (_, rule, (_, close_paren)))| match (capture, rule) {
(None, rule) => rule,
(capture, Rulex::Group(mut g)) => {
(Some((capture, c_span)), Rulex::Group(mut g)) => {
g.set_capture(capture);
g.span.start = c_span.start;
Rulex::Group(g)
}
(capture, rule) => Rulex::Group(Group::new(vec![rule], capture, start.join(end))),
(Some((capture, c_span)), rule) => Rulex::Group(Group::new(
vec![rule],
Some(capture),
c_span.join(close_paren),
)),
},
)(input)
}
Expand Down Expand Up @@ -368,6 +372,48 @@ pub(super) fn parse_boundary<'i, 'b>(input: Input<'i, 'b>) -> PResult<'i, 'b, Ru
)(input)
}

/// Parses a group reference: a `Token::Backref` followed by a group number,
/// a group name, or a signed (relative) number.
///
/// The backref token itself is discarded by `preceded`; the span stored in the
/// resulting `Reference` covers only what follows it. If none of the three
/// forms matches, the `Expected("number or group name")` error is produced.
pub(super) fn parse_reference<'i, 'b>(input: Input<'i, 'b>) -> PResult<'i, 'b, Rulex<'i>> {
    /// Reference by group number; the digits must fit into a `u32`.
    fn parse_numbered_ref<'i, 'b>(input: Input<'i, 'b>) -> PResult<'i, 'b, Rulex<'i>> {
        try_map(
            Token::Number,
            |(digits, span)| {
                let index = str_to_u32(digits)?;
                let target = ReferenceTarget::Number(index);
                Ok(Rulex::Reference(Reference::new(target, span)))
            },
            nom::Err::Failure,
        )(input)
    }

    /// Reference by group name.
    fn parse_named_ref<'i, 'b>(input: Input<'i, 'b>) -> PResult<'i, 'b, Rulex<'i>> {
        map(Token::Identifier, |(name, span)| {
            Rulex::Reference(Reference::new(ReferenceTarget::Named(name), span))
        })(input)
    }

    /// Relative reference: a `+` or `-` sign followed by a number. The span
    /// runs from the sign to the end of the number.
    fn parse_relative_ref<'i, 'b>(input: Input<'i, 'b>) -> PResult<'i, 'b, Rulex<'i>> {
        try_map(
            pair(alt((Token::Plus, Token::Dash)), Token::Number),
            |((sign, sign_span), (digits, digits_span))| {
                // The sign is parsed together with the digits so that the
                // most negative `i32` is still accepted.
                let parsed = if sign == "-" {
                    str_to_i32(&format!("-{digits}"))
                } else {
                    str_to_i32(digits)
                };
                let target = ReferenceTarget::Relative(parsed?);
                Ok(Rulex::Reference(Reference::new(
                    target,
                    sign_span.join(digits_span),
                )))
            },
            nom::Err::Failure,
        )(input)
    }

    preceded(
        Token::Backref,
        alt((
            parse_numbered_ref,
            parse_named_ref,
            parse_relative_ref,
            err(|| ParseErrorKind::Expected("number or group name")),
        )),
    )(input)
}

/// Parses `s` as a `u32`.
///
/// Every parse failure is reported as `NumberError::TooLarge`.
fn str_to_u32(s: &str) -> Result<u32, ParseErrorKind> {
    match s.parse::<u32>() {
        Ok(number) => Ok(number),
        Err(_) => Err(ParseErrorKind::Number(NumberError::TooLarge)),
    }
}

/// Parses `s` (optionally signed) as an `i32`.
///
/// Every parse failure is reported as `NumberError::TooLarge`.
fn str_to_i32(s: &str) -> Result<i32, ParseErrorKind> {
    match s.parse::<i32>() {
        Ok(number) => Ok(number),
        Err(_) => Err(ParseErrorKind::Number(NumberError::TooLarge)),
    }
}

/// Returns `s` without its first and last character.
///
/// Works on whole characters rather than bytes, so it never slices inside a
/// multi-byte character, and it returns `""` for inputs shorter than two
/// characters instead of panicking.
fn strip_first_last(s: &str) -> &str {
    let mut chars = s.chars();
    // Drop one character from each end; both calls are no-ops once the
    // iterator is exhausted.
    chars.next();
    chars.next_back();
    chars.as_str()
}
Expand Down
Loading

0 comments on commit c0b755e

Please sign in to comment.