diff --git a/logos-codegen/src/graph/regex.rs b/logos-codegen/src/graph/regex.rs index efe97d2e..c245c1b6 100644 --- a/logos-codegen/src/graph/regex.rs +++ b/logos-codegen/src/graph/regex.rs @@ -7,7 +7,7 @@ use crate::mir::{Class, ClassUnicode, Literal, Mir}; impl Graph { pub fn regex(&mut self, mir: Mir, then: NodeId) -> NodeId { - self.parse_mir(mir, then, None, None) + self.parse_mir(mir, then, None, None, false) } fn parse_mir( @@ -16,6 +16,7 @@ impl Graph { then: NodeId, miss: Option, reserved: Option, + repeated: bool, ) -> NodeId { match mir { Mir::Empty => then, @@ -29,7 +30,7 @@ impl Graph { None => self.reserve(), }; - self.parse_mir(*mir, this.get(), Some(miss), Some(this)) + self.parse_mir(*mir, this.get(), Some(miss), Some(this), true) } Mir::Maybe(mir) => { let miss = match miss { @@ -37,13 +38,13 @@ impl Graph { None => then, }; - self.parse_mir(*mir, then, Some(miss), reserved) + self.parse_mir(*mir, then, Some(miss), reserved, true) } Mir::Alternation(alternation) => { let mut fork = Fork::new().miss(miss); for mir in alternation { - let id = self.parse_mir(mir, then, None, None); + let id = self.parse_mir(mir, then, None, None, repeated); let alt = self.fork_off(id); fork.merge(alt, self); @@ -73,7 +74,7 @@ impl Graph { } None } - Mir::Class(Class::Unicode(class)) if is_one_ascii(&class) => { + Mir::Class(Class::Unicode(class)) if is_one_ascii(&class, repeated) => { cur -= 1; ropebuf[cur] = class.ranges()[0].into(); None @@ -97,7 +98,7 @@ impl Graph { for mir in concat.drain(1..).rev() { if let Some(mir) = handle_bytes(self, mir, &mut then) { - then = self.parse_mir(mir, then, None, None); + then = self.parse_mir(mir, then, None, None, false); } } @@ -107,10 +108,10 @@ impl Graph { self.insert_or_push(reserved, rope) } - Some(mir) => self.parse_mir(mir, then, miss, reserved), + Some(mir) => self.parse_mir(mir, then, miss, reserved, false), } } - Mir::Class(Class::Unicode(class)) if !is_ascii(&class) => { + Mir::Class(Class::Unicode(class)) if 
!is_ascii(&class, repeated) => { let mut ropes = class .iter() .flat_map(|range| Utf8Sequences::new(range.start(), range.end())) @@ -160,16 +161,26 @@ impl Graph { } } -fn is_ascii(class: &ClassUnicode) -> bool { - class.iter().all(|range| { +/// Return whether the current Unicode class is ASCII. +/// +/// Because unicode ranges are iterated in increasing order, +/// it is only necessary to check the last range. +/// +/// If the check is performed in a repetition, +/// a fast path is used by checking if end of range is 0x0010_FFFF. +fn is_ascii(class: &ClassUnicode, repeated: bool) -> bool { + class.iter().last().map_or(true, |range| { let start = range.start() as u32; let end = range.end() as u32; - - start < 128 && (end < 128 || end == 0x0010_FFFF) + end < 128 || (repeated && start < 128 && end == 0x0010_FFFF) }) } -fn is_one_ascii(class: &ClassUnicode) -> bool { +/// Return whether the current Unicode class is ASCII and only contains +/// one range. +/// +/// See [`is_ascii`] function for more details. 
+fn is_one_ascii(class: &ClassUnicode, repeated: bool) -> bool { if class.ranges().len() != 1 { return false; } @@ -177,8 +188,7 @@ fn is_one_ascii(class: &ClassUnicode) -> bool { let range = &class.ranges()[0]; let start = range.start() as u32; let end = range.end() as u32; - - start < 128 && (end < 128 || end == 0x0010_FFFF) + end < 128 || (repeated && start < 128 && end == 0x0010_FFFF) } #[cfg(test)] diff --git a/tests/tests/unicode_dot.rs b/tests/tests/unicode_dot.rs new file mode 100644 index 00000000..56434b36 --- /dev/null +++ b/tests/tests/unicode_dot.rs @@ -0,0 +1,56 @@ +use logos::Logos as _; +use logos_derive::Logos; + +#[derive(Logos, Debug, PartialEq)] +enum TestUnicodeDot { + #[regex(".")] + Dot, +} + +#[test] +fn test_unicode_dot_str_ascii() { + let mut lexer = TestUnicodeDot::lexer("a"); + assert_eq!(lexer.next(), Some(Ok(TestUnicodeDot::Dot))); + assert_eq!(lexer.remainder(), ""); + assert_eq!(lexer.next(), None); +} + +#[test] +fn test_unicode_dot_str_unicode() { + let mut lexer = TestUnicodeDot::lexer(""); + assert_eq!(lexer.next(), Some(Ok(TestUnicodeDot::Dot))); + assert_eq!(lexer.remainder(), ""); + assert_eq!(lexer.next(), None); +} + +#[derive(Logos, Debug, PartialEq)] +enum TestUnicodeDotBytes { + #[regex(".", priority = 100)] + Dot, + #[regex(b".", priority = 0)] + InvalidUtf8, +} + +#[test] +fn test_unicode_dot_bytes_ascii() { + let mut lexer = TestUnicodeDotBytes::lexer(b"a"); + assert_eq!(lexer.next(), Some(Ok(TestUnicodeDotBytes::Dot))); + assert_eq!(lexer.remainder(), b""); + assert_eq!(lexer.next(), None); +} + +#[test] +fn test_unicode_dot_bytes_unicode() { + let mut lexer = TestUnicodeDotBytes::lexer("".as_bytes()); + assert_eq!(lexer.next(), Some(Ok(TestUnicodeDotBytes::Dot))); + assert_eq!(lexer.remainder(), b""); + assert_eq!(lexer.next(), None); +} + +#[test] +fn test_unicode_dot_bytes_invalid_utf8() { + let mut lexer = TestUnicodeDotBytes::lexer(b"\xff"); + assert_eq!(lexer.next(), 
Some(Ok(TestUnicodeDotBytes::InvalidUtf8))); + assert_eq!(lexer.remainder(), b""); + assert_eq!(lexer.next(), None); +}