From d44d81bc49a6f0983b3f7d60f09597d07b164e8c Mon Sep 17 00:00:00 2001 From: RustyYato Date: Thu, 15 Feb 2024 19:11:29 -0700 Subject: [PATCH 1/9] The `.` regex should not take the ASCII fast path see #375 for an example of undefined behavior because of this fast path. TLDR: the ASCII fast path will stop matching on the first matching byte, however this would split multi-byte codepoints. Combined with `Lexer::remaining` (or even just capturing the string like in the issue), this leads to non-utf8 strings escaping into user code. This is UNSOUND. --- logos-codegen/src/graph/regex.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/logos-codegen/src/graph/regex.rs b/logos-codegen/src/graph/regex.rs index efe97d2e..40aa1c02 100644 --- a/logos-codegen/src/graph/regex.rs +++ b/logos-codegen/src/graph/regex.rs @@ -165,7 +165,7 @@ fn is_ascii(class: &ClassUnicode) -> bool { let start = range.start() as u32; let end = range.end() as u32; - start < 128 && (end < 128 || end == 0x0010_FFFF) + start < 128 && end < 128 }) } @@ -178,7 +178,7 @@ fn is_one_ascii(class: &ClassUnicode) -> bool { let start = range.start() as u32; let end = range.end() as u32; - start < 128 && (end < 128 || end == 0x0010_FFFF) + start < 128 && end < 128 } #[cfg(test)] From 0aa58a0f87546e397facba2ef5d678cb206fbea4 Mon Sep 17 00:00:00 2001 From: RustyYato Date: Thu, 15 Feb 2024 19:25:35 -0700 Subject: [PATCH 2/9] Add tests for unicode dot in both str and bytes --- src/lib.rs | 3 +++ src/tests.rs | 57 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 60 insertions(+) create mode 100644 src/tests.rs diff --git a/src/lib.rs b/src/lib.rs index 7df59837..c4784d97 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -30,6 +30,9 @@ extern crate core as std; pub use logos_derive::Logos; use std::fmt::Debug; +#[cfg(test)] +mod tests; + mod lexer; pub mod source; diff --git a/src/tests.rs b/src/tests.rs new file mode 100644 index 00000000..1545e18b --- /dev/null +++ b/src/tests.rs @@ -0,0 +1,57 @@ +use crate::Logos; + +#[derive(Logos, Debug, PartialEq)] +#[logos(crate = crate)] +enum TestUnicodeDot { + #[regex(".")] + Dot, +} + +#[test] +fn test_unicode_dot_str_ascii() { + let mut lexer = TestUnicodeDot::lexer("a"); + assert_eq!(lexer.next(), Some(Ok(TestUnicodeDot::Dot))); + assert_eq!(lexer.remainder(), ""); + assert_eq!(lexer.next(), None); +} + +#[test] +fn test_unicode_dot_str_unicode() { + let mut lexer = TestUnicodeDot::lexer(""); + assert_eq!(lexer.next(), Some(Ok(TestUnicodeDot::Dot))); + assert_eq!(lexer.remainder(), ""); + assert_eq!(lexer.next(), None); +} + +#[derive(Logos, Debug, PartialEq)] +#[logos(crate = crate)] +enum TestUnicodeDotBytes { + #[regex(".", priority = 100)] + Dot, + #[regex(b".", priority = 0)] + InvalidUtf8, +} + +#[test] +fn test_unicode_dot_bytes_ascii() { + let mut lexer = TestUnicodeDotBytes::lexer(b"a"); + assert_eq!(lexer.next(), Some(Ok(TestUnicodeDotBytes::Dot))); + assert_eq!(lexer.remainder(), b""); + assert_eq!(lexer.next(), None); +} + +#[test] +fn test_unicode_dot_bytes_unicode() { + let mut lexer = TestUnicodeDotBytes::lexer("".as_bytes()); + assert_eq!(lexer.next(), Some(Ok(TestUnicodeDotBytes::Dot))); + assert_eq!(lexer.remainder(), b""); + assert_eq!(lexer.next(), None); +} + +#[test] +fn test_unicode_dot_bytes_invalid_utf8() { + let mut lexer = TestUnicodeDotBytes::lexer(b"\xff"); + assert_eq!(lexer.next(), Some(Ok(TestUnicodeDotBytes::InvalidUtf8))); + assert_eq!(lexer.remainder(), b""); + assert_eq!(lexer.next(), None); +} From 80bd23fd112b7ab0efbb02ca278cb75d02dcd195 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9rome=20Eertmans?= Date: Fri, 16 Feb 2024 11:04:42 +0100 Subject: [PATCH 3/9] chore(lib): rewrite using ClassUnicode methods As suggested by @RustyYato --- logos-codegen/src/graph/regex.rs | 20 +++++--------------- 1 file changed, 5 insertions(+), 15 deletions(-) diff --git a/logos-codegen/src/graph/regex.rs b/logos-codegen/src/graph/regex.rs index 40aa1c02..a1c2ffe7 100644 --- a/logos-codegen/src/graph/regex.rs +++ b/logos-codegen/src/graph/regex.rs @@ -160,25 +160,15 @@ impl Graph { } } +#[inline] fn is_ascii(class: &ClassUnicode) -> bool { - class.iter().all(|range| { - let start = range.start() as u32; - let end = range.end() as u32; - - start < 128 && end < 128 - }) + class.is_ascii() } +#[inline] fn is_one_ascii(class: &ClassUnicode) -> bool { - if class.ranges().len() != 1 { - return false; - } - - let range = &class.ranges()[0]; - let start = range.start() as u32; - let end = range.end() as u32; - - start < 128 && end < 128 + let ranges = class.ranges(); + ranges.len() == 1 && ranges[0].end() < '\x7F' } #[cfg(test)] From c99181782e17e1de602bab0c55d063b2e4bac310 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9rome=20Eertmans?= Date: Fri, 16 Feb 2024 11:04:42 +0100 Subject: [PATCH 4/9] Revert "chore(lib): rewrite using ClassUnicode methods" This reverts commit 80bd23fd112b7ab0efbb02ca278cb75d02dcd195. --- logos-codegen/src/graph/regex.rs | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/logos-codegen/src/graph/regex.rs b/logos-codegen/src/graph/regex.rs index a1c2ffe7..40aa1c02 100644 --- a/logos-codegen/src/graph/regex.rs +++ b/logos-codegen/src/graph/regex.rs @@ -160,15 +160,25 @@ impl Graph { } } -#[inline] fn is_ascii(class: &ClassUnicode) -> bool { - class.is_ascii() + class.iter().all(|range| { + let start = range.start() as u32; + let end = range.end() as u32; + + start < 128 && end < 128 + }) } -#[inline] fn is_one_ascii(class: &ClassUnicode) -> bool { - let ranges = class.ranges(); - ranges.len() == 1 && ranges[0].end() < '\x7F' + if class.ranges().len() != 1 { + return false; + } + + let range = &class.ranges()[0]; + let start = range.start() as u32; + let end = range.end() as u32; + + start < 128 && end < 128 } #[cfg(test)] From 49df2d013cb2d24730af90fc10a0ae6b6b399bf2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9rome=20Eertmans?= Date: Fri, 16 Feb 2024 11:31:38 +0100 Subject: [PATCH 5/9] try: fallback to previous impl Tests are still passing --- logos-codegen/src/graph/regex.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/logos-codegen/src/graph/regex.rs b/logos-codegen/src/graph/regex.rs index 40aa1c02..d351a5f5 100644 --- a/logos-codegen/src/graph/regex.rs +++ b/logos-codegen/src/graph/regex.rs @@ -178,7 +178,7 @@ fn is_one_ascii(class: &ClassUnicode) -> bool { let start = range.start() as u32; let end = range.end() as u32; - start < 128 && end < 128 + start < 128 && (end < 128 || end == 0x0010_FFFF) } #[cfg(test)] From 5aa4021834a115a314bc22fef9b230523ea4f52b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9rome=20Eertmans?= Date: Fri, 16 Feb 2024 11:58:24 +0100 Subject: [PATCH 6/9] try: add repetition check --- logos-codegen/src/graph/regex.rs | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/logos-codegen/src/graph/regex.rs b/logos-codegen/src/graph/regex.rs index d351a5f5..d85247a2 100644 --- a/logos-codegen/src/graph/regex.rs +++ b/logos-codegen/src/graph/regex.rs @@ -7,7 +7,7 @@ use crate::mir::{Class, ClassUnicode, Literal, Mir}; impl Graph { pub fn regex(&mut self, mir: Mir, then: NodeId) -> NodeId { - self.parse_mir(mir, then, None, None) + self.parse_mir(mir, then, None, None, false) } fn parse_mir( @@ -16,6 +16,7 @@ impl Graph { then: NodeId, miss: Option, reserved: Option, + repeated: bool, ) -> NodeId { match mir { Mir::Empty => then, @@ -29,7 +30,7 @@ impl Graph { None => self.reserve(), }; - self.parse_mir(*mir, this.get(), Some(miss), Some(this)) + self.parse_mir(*mir, this.get(), Some(miss), Some(this), true) } Mir::Maybe(mir) => { let miss = match miss { @@ -37,13 +38,13 @@ impl Graph { None => then, }; - self.parse_mir(*mir, then, Some(miss), reserved) + self.parse_mir(*mir, then, Some(miss), reserved, false) } Mir::Alternation(alternation) => { let mut fork = Fork::new().miss(miss); for mir in alternation { - let id = self.parse_mir(mir, then, None, None); + let id = self.parse_mir(mir, then, None, None, repeated); let alt = self.fork_off(id); fork.merge(alt, self); @@ -97,7 +98,7 @@ impl Graph { for mir in concat.drain(1..).rev() { if let Some(mir) = handle_bytes(self, mir, &mut then) { - then = self.parse_mir(mir, then, None, None); + then = self.parse_mir(mir, then, None, None, false); } } @@ -107,10 +108,10 @@ impl Graph { self.insert_or_push(reserved, rope) } - Some(mir) => self.parse_mir(mir, then, miss, reserved), + Some(mir) => self.parse_mir(mir, then, miss, reserved, false), } } - Mir::Class(Class::Unicode(class)) if !is_ascii(&class) => { + Mir::Class(Class::Unicode(class)) if !is_ascii(&class, repeated) => { let mut ropes = class .iter() .flat_map(|range| Utf8Sequences::new(range.start(), range.end())) @@ -160,12 +161,10 @@ impl Graph { } } -fn is_ascii(class: &ClassUnicode) -> bool { - class.iter().all(|range| { - let start = range.start() as u32; +fn is_ascii(class: &ClassUnicode, repeated: bool) -> bool { + class.iter().last().map_or(true, |range| { let end = range.end() as u32; - - start < 128 && end < 128 + end < 128 || (repeated && end == 0x0010_FFFF) }) } @@ -175,10 +174,9 @@ fn is_one_ascii(class: &ClassUnicode) -> bool { } let range = &class.ranges()[0]; - let start = range.start() as u32; let end = range.end() as u32; - start < 128 && (end < 128 || end == 0x0010_FFFF) + end < 128 || end == 0x0010_FFFF } #[cfg(test)] From 7301705ccca80604a0e866755475cc68c27f6822 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9rome=20Eertmans?= Date: Fri, 16 Feb 2024 12:21:30 +0100 Subject: [PATCH 7/9] chore(lib): cleanup code --- logos-codegen/src/graph/regex.rs | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/logos-codegen/src/graph/regex.rs b/logos-codegen/src/graph/regex.rs index d85247a2..adfdbc63 100644 --- a/logos-codegen/src/graph/regex.rs +++ b/logos-codegen/src/graph/regex.rs @@ -74,7 +74,7 @@ impl Graph { } None } - Mir::Class(Class::Unicode(class)) if is_one_ascii(&class) => { + Mir::Class(Class::Unicode(class)) if is_one_ascii(&class, repeated) => { cur -= 1; ropebuf[cur] = class.ranges()[0].into(); None @@ -161,22 +161,34 @@ impl Graph { } } +/// Return wether current class unicode is ascii. +/// +/// Because unicode ranges are iterated in increasing order, +/// it is only necessary to check the last range. +/// +/// If the check is performed in a repetition, +/// a fast path is used by checking if end of range is 0x0010_FFFF. fn is_ascii(class: &ClassUnicode, repeated: bool) -> bool { class.iter().last().map_or(true, |range| { + let start = range.start() as u32; let end = range.end() as u32; - end < 128 || (repeated && end == 0x0010_FFFF) + end < 128 || (repeated && start < 128 && end == 0x0010_FFFF) }) } -fn is_one_ascii(class: &ClassUnicode) -> bool { +/// Return wether current class unicode is ascii and only contains +/// one range. +/// +/// See [`is_ascii`] function for more details. +fn is_one_ascii(class: &ClassUnicode, repeated: bool) -> bool { if class.ranges().len() != 1 { return false; } let range = &class.ranges()[0]; + let start = range.start() as u32; let end = range.end() as u32; - - end < 128 || end == 0x0010_FFFF + end < 128 || (repeated && start < 128 && end == 0x0010_FFFF) } #[cfg(test)] From bbf64da024514a67fcb90a8352e4c47aab884869 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9rome=20Eertmans?= Date: Fri, 16 Feb 2024 12:39:56 +0100 Subject: [PATCH 8/9] fix and move --- logos-codegen/src/graph/regex.rs | 2 +- src/tests.rs => tests/tests/unicode_dot.rs | 5 ++--- 2 files changed, 3 insertions(+), 4 deletions(-) rename src/tests.rs => tests/tests/unicode_dot.rs (95%) diff --git a/logos-codegen/src/graph/regex.rs b/logos-codegen/src/graph/regex.rs index adfdbc63..c245c1b6 100644 --- a/logos-codegen/src/graph/regex.rs +++ b/logos-codegen/src/graph/regex.rs @@ -38,7 +38,7 @@ impl Graph { None => then, }; - self.parse_mir(*mir, then, Some(miss), reserved, false) + self.parse_mir(*mir, then, Some(miss), reserved, true) } Mir::Alternation(alternation) => { let mut fork = Fork::new().miss(miss); diff --git a/src/tests.rs b/tests/tests/unicode_dot.rs similarity index 95% rename from src/tests.rs rename to tests/tests/unicode_dot.rs index 1545e18b..56434b36 100644 --- a/src/tests.rs +++ b/tests/tests/unicode_dot.rs @@ -1,7 +1,7 @@ -use crate::Logos; +use logos::Logos as _; +use logos_derive::Logos; #[derive(Logos, Debug, PartialEq)] -#[logos(crate = crate)] enum TestUnicodeDot { #[regex(".")] Dot, @@ -24,7 +24,6 @@ fn test_unicode_dot_str_unicode() { } #[derive(Logos, Debug, PartialEq)] -#[logos(crate = crate)] enum TestUnicodeDotBytes { #[regex(".", priority = 100)] Dot, From d8a4e2f4cf9a9a02da5ae1994b066030c5a49b30 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9rome=20Eertmans?= Date: Fri, 16 Feb 2024 12:41:04 +0100 Subject: [PATCH 9/9] another fix --- src/lib.rs | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index c4784d97..7df59837 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -30,9 +30,6 @@ extern crate core as std; pub use logos_derive::Logos; use std::fmt::Debug; -#[cfg(test)] -mod tests; - mod lexer; pub mod source;