From c773540fe76dc4b95526b3994b1999a9ded34212 Mon Sep 17 00:00:00 2001
From: Hana
Date: Thu, 12 Dec 2024 18:14:50 +0800
Subject: [PATCH] perf: faster lines iterator for `Rope` (#145)

* refactor: init

* refactor: faster lines

* refactor: try

* chore: clippy

* test: more

* chore: more
---
 src/helpers.rs |  11 +-
 src/rope.rs    | 308 +++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 313 insertions(+), 6 deletions(-)

diff --git a/src/helpers.rs b/src/helpers.rs
index dd5a2c4..48534c3 100644
--- a/src/helpers.rs
+++ b/src/helpers.rs
@@ -5,7 +5,6 @@ use std::{
   ops::Range,
 };
 
-use itertools::Either;
 use rustc_hash::FxHashMap as HashMap;
 
 use crate::{
@@ -201,7 +200,7 @@ const EMPTY_ROPE: Rope = Rope::new();
 /// Split the string with a needle, each string will contain the needle.
 ///
 /// Copied and modified from https://github.com/rust-lang/cargo/blob/30efe860c0e4adc1a6d7057ad223dc6e47d34edf/src/cargo/sources/registry/index.rs#L1048-L1072
-pub fn split<'a>(
+fn split<'a>(
   haystack: &Rope<'a>,
   needle: u8,
 ) -> impl Iterator<Item = Rope<'a>> {
@@ -1324,10 +1323,10 @@ pub trait SourceText<'a>: Default + Clone + ToString {
 
 impl<'a> SourceText<'a> for Rope<'a> {
   fn split_into_lines(&self) -> impl Iterator<Item = Self> {
-    if let Some(s) = self.get_simple() {
-      return Either::Left(split_str(s, b'\n').map(Rope::from));
-    }
-    Either::Right(split(self, b'\n'))
+    // Split the text into lines, including the line ending character.
+    // If the text ends with a newline, no trailing empty line is yielded.
+    // For example: "abc\nefg\n" => ["abc\n", "efg\n"]
+    self.lines_impl(false)
   }
 
   #[inline]
diff --git a/src/rope.rs b/src/rope.rs
index 3b53124..0bc51c9 100644
--- a/src/rope.rs
+++ b/src/rope.rs
@@ -419,6 +419,38 @@ impl<'a> Rope<'a> {
     }
   }
 
+  /// Returns an iterator over the lines of the rope.
+  ///
+  /// Each line keeps its terminating `'\n'`. If the rope is empty or ends
+  /// with `'\n'`, a final empty line is yielded as well.
+  pub fn lines(&self) -> Lines<'_, 'a> {
+    self.lines_impl(true)
+  }
+
+  /// Returns an iterator over the lines of the rope.
+  ///
+  /// If `trailing_line_break_as_newline` is true and the rope is empty or
+  /// ends with `'\n'`, an empty line is yielded as the last item.
+  pub(crate) fn lines_impl(
+    &self,
+    trailing_line_break_as_newline: bool,
+  ) -> Lines<'_, 'a> {
+    Lines {
+      iter: match &self.repr {
+        Repr::Simple(s) => LinesEnum::Simple(s),
+        Repr::Complex(data) => LinesEnum::Complex {
+          iter: data,
+          in_chunk_byte_idx: 0,
+          chunk_idx: 0,
+        },
+      },
+      byte_idx: 0,
+      ended: false,
+      total_bytes: self.len(),
+      trailing_line_break_as_newline,
+    }
+  }
+
   /// Converts the rope to bytes.
   ///
   /// Returns borrowed bytes for simple ropes and owned bytes for complex ropes.
@@ -457,6 +489,195 @@ impl Hash for Rope<'_> {
   }
 }
 
+/// Iteration state of [`Lines`] for each rope representation.
+enum LinesEnum<'a, 'b> {
+  Simple(&'b str),
+  Complex {
+    iter: &'a Vec<(&'b str, usize)>,
+    in_chunk_byte_idx: usize,
+    chunk_idx: usize,
+  },
+}
+
+/// Iterator over the lines of a [`Rope`], created by [`Rope::lines`].
+pub struct Lines<'a, 'b> {
+  iter: LinesEnum<'a, 'b>,
+  /// Number of bytes of the rope that have been yielded so far.
+  byte_idx: usize,
+  ended: bool,
+  total_bytes: usize,
+
+  /// Whether to treat the end of the rope with ('\n') as an empty newline.
+  trailing_line_break_as_newline: bool,
+}
+
+impl<'a> Iterator for Lines<'_, 'a> {
+  type Item = Rope<'a>;
+
+  fn next(&mut self) -> Option<Self::Item> {
+    match *self {
+      Lines {
+        iter: LinesEnum::Simple(s),
+        ref mut byte_idx,
+        ref mut ended,
+        ref total_bytes,
+        trailing_line_break_as_newline,
+        ..
+      } => {
+        if *ended {
+          return None;
+        } else if byte_idx == total_bytes {
+          if trailing_line_break_as_newline {
+            *ended = true;
+            return Some(Rope::from(""));
+          }
+          return None;
+        } else if let Some(idx) =
+          memchr::memchr(b'\n', &s.as_bytes()[*byte_idx..])
+        {
+          let end = *byte_idx + idx + 1;
+          let rope = Rope::from(&s[*byte_idx..end]);
+          *byte_idx = end;
+          return Some(rope);
+        }
+        *ended = true;
+        Some(Rope::from(&s[*byte_idx..]))
+      }
+      Lines {
+        iter:
+          LinesEnum::Complex {
+            iter: chunks,
+            ref mut in_chunk_byte_idx,
+            ref mut chunk_idx,
+          },
+        ref mut byte_idx,
+        ref mut ended,
+        ref total_bytes,
+        trailing_line_break_as_newline,
+      } => {
+        if *ended {
+          return None;
+        } else if byte_idx == total_bytes {
+          if trailing_line_break_as_newline {
+            *ended = true;
+            return Some(Rope::from(""));
+          }
+          return None;
+        }
+
+        debug_assert!(*chunk_idx < chunks.len());
+        let &(chunk, _) = &chunks[*chunk_idx];
+
+        // Always try to find a newline in the current chunk,
+        // if the current chunk contains a newline, return this line.
+        if let Some(idx) =
+          memchr::memchr(b'\n', &chunk.as_bytes()[*in_chunk_byte_idx..])
+        {
+          let end = *in_chunk_byte_idx + idx + 1;
+          let rope = Rope::from(&chunk[*in_chunk_byte_idx..end]);
+          *in_chunk_byte_idx = end;
+          // Advance by the length of this line only. Adding the in-chunk
+          // offset would count the earlier lines of this chunk again.
+          *byte_idx += idx + 1;
+          Some(rope)
+        } else {
+          // Check if the current chunk has left over bytes.
+          // If it is the last chunk, return the remaining bytes.
+          // This is the end of the rope.
+          if *chunk_idx == chunks.len() - 1 {
+            // Rope is not ended with a newline.
+            // Explicitly set the ended flag to true to bail out.
+            *ended = true;
+            *byte_idx += chunk.len() - *in_chunk_byte_idx;
+            return Some(Rope::from(&chunk[*in_chunk_byte_idx..]));
+          }
+
+          // If the current chunk has run out of bytes, move to the next chunk.
+          if *in_chunk_byte_idx == chunk.len() {
+            *chunk_idx += 1;
+            *in_chunk_byte_idx = 0;
+            return self.next();
+          }
+
+          // If it is not the last chunk, the line spans multiple chunks.
+          // As such, we need to find the next newline in the next few chunks.
+          let start_chunk_idx = *chunk_idx;
+          let start_in_chunk_byte_idx = *in_chunk_byte_idx;
+
+          // `byte_idx` is not touched while scanning, it is advanced once by
+          // the length of the assembled line below.
+          let end_info = loop {
+            if *chunk_idx == chunks.len() {
+              break None;
+            }
+            let &(chunk, _) = &chunks[*chunk_idx];
+            if let Some(idx) =
+              memchr::memchr(b'\n', &chunk.as_bytes()[*in_chunk_byte_idx..])
+            {
+              *in_chunk_byte_idx += idx + 1;
+              break Some((*chunk_idx, *in_chunk_byte_idx));
+            } else {
+              *in_chunk_byte_idx = 0;
+              *chunk_idx += 1;
+            }
+          };
+
+          // If we found a newline in the next few chunks, return the line.
+          if let Some((end_chunk_idx, end_in_chunk_byte_idx)) = end_info {
+            let mut raw =
+              Vec::with_capacity(end_chunk_idx - start_chunk_idx + 1);
+            let mut len = 0;
+            (start_chunk_idx..end_chunk_idx + 1).for_each(|i| {
+              let &(chunk, _) = &chunks[i];
+              if start_chunk_idx == i {
+                let start = start_in_chunk_byte_idx;
+                raw.push((&chunk[start..], len));
+                len += chunk.len() - start;
+              } else if end_chunk_idx == i {
+                let end = end_in_chunk_byte_idx;
+                raw.push((&chunk[..end], len));
+                len += end;
+              } else {
+                raw.push((chunk, len));
+                len += chunk.len();
+              }
+            });
+            // Advance the byte index to the end of the line.
+            *byte_idx += len;
+            Some(Rope {
+              repr: Repr::Complex(Rc::new(raw)),
+            })
+          } else {
+            // If we did not find a newline in the next few chunks,
+            // return the remaining bytes. This is the end of the rope, and
+            // it is not ended with a newline, so no empty line may follow.
+            *ended = true;
+
+            let mut raw = Vec::with_capacity(chunks.len() - start_chunk_idx);
+            let mut len = 0;
+            (start_chunk_idx..chunks.len()).for_each(|i| {
+              let &(chunk, _) = &chunks[i];
+              if start_chunk_idx == i {
+                let start = start_in_chunk_byte_idx;
+                raw.push((&chunk[start..], len));
+                len += chunk.len() - start;
+              } else {
+                raw.push((chunk, len));
+                len += chunk.len();
+              }
+            });
+            // Advance the byte index to the end of the rope.
+            *byte_idx += len;
+            Some(Rope {
+              repr: Repr::Complex(Rc::new(raw)),
+            })
+          }
+        }
+      }
+    }
+  }
+}
+
 enum CharIndicesRepr<'a, 'b> {
   Simple {
     iter: std::str::CharIndices<'b>,
@@ -908,4 +1129,91 @@ mod tests {
       "こんにちは世界".char_indices().collect::<Vec<_>>()
     );
   }
+
+  #[test]
+  fn lines1() {
+    let rope = Rope::from("abc");
+    let lines = rope.lines().collect::<Vec<_>>();
+    assert_eq!(lines, ["abc"]);
+
+    // empty line at the end if the line before ends with a newline ('\n')
+    let rope = Rope::from("abc\ndef\n");
+    let lines = rope.lines().collect::<Vec<_>>();
+    assert_eq!(lines, ["abc\n", "def\n", ""]);
+
+    // no empty line at the end if the line before does not end with a newline ('\n')
+    let rope = Rope::from("abc\ndef");
+    let lines = rope.lines().collect::<Vec<_>>();
+    assert_eq!(lines, ["abc\n", "def"]);
+
+    let rope = Rope::from("Test\nTest\nTest\n");
+    let lines = rope.lines().collect::<Vec<_>>();
+    assert_eq!(lines, ["Test\n", "Test\n", "Test\n", ""]);
+
+    let rope = Rope::from("\n");
+    let lines = rope.lines().collect::<Vec<_>>();
+    assert_eq!(lines, ["\n", ""]);
+
+    let rope = Rope::from("\n\n");
+    let lines = rope.lines().collect::<Vec<_>>();
+    assert_eq!(lines, ["\n", "\n", ""]);
+  }
+
+  #[test]
+  fn lines2() {
+    let rope = Rope::from_iter(["abc\n", "def\n", "ghi\n"]);
+    let lines = rope.lines().collect::<Vec<_>>();
+    // empty line at the end if the line before ends with a newline ('\n')
+    assert_eq!(lines, ["abc\n", "def\n", "ghi\n", ""]);
+
+    let rope = Rope::from_iter(["abc\n", "def\n", "ghi"]);
+    let lines = rope.lines().collect::<Vec<_>>();
+    assert_eq!(lines, ["abc\n", "def\n", "ghi"]);
+
+    let rope = Rope::from_iter(["abc\ndef", "ghi\n", "jkl"]);
+    let lines = rope.lines().collect::<Vec<_>>();
+    assert_eq!(lines, ["abc\n", "defghi\n", "jkl"]);
+
+    let rope = Rope::from_iter(["a\nb", "c\n", "d\n"]);
+    let lines = rope.lines().collect::<Vec<_>>();
+    assert_eq!(lines, ["a\n", "bc\n", "d\n", ""]);
+
+    let rope = Rope::from_iter(["\n"]);
+    let lines = rope.lines().collect::<Vec<_>>();
+    assert_eq!(lines, ["\n", ""]);
+  }
+
+  #[test]
+  fn lines3() {
+    // several lines in one chunk followed by an unterminated tail
+    let rope = Rope::from_iter(["a\nb\n", "cc"]);
+    let lines = rope.lines().collect::<Vec<_>>();
+    assert_eq!(lines, ["a\n", "b\n", "cc"]);
+
+    // no newline at all, the only line spans every chunk
+    let rope = Rope::from_iter(["abc", "def"]);
+    let lines = rope.lines().collect::<Vec<_>>();
+    assert_eq!(lines, ["abcdef"]);
+
+    // no trailing empty line when it is not requested
+    let rope = Rope::from_iter(["a\nb", "c\n", "d\n"]);
+    let lines = rope.lines_impl(false).collect::<Vec<_>>();
+    assert_eq!(lines, ["a\n", "bc\n", "d\n"]);
+  }
+
+  #[test]
+  fn lines_with_trailing_line_break_as_newline() {
+    let trailing_line_break_as_newline = false;
+    let rope = Rope::from("abc\n");
+    let lines = rope
+      .lines_impl(trailing_line_break_as_newline)
+      .collect::<Vec<_>>();
+    assert_eq!(lines, ["abc\n"]);
+
+    let rope = Rope::from("\n");
+    let lines = rope
+      .lines_impl(trailing_line_break_as_newline)
+      .collect::<Vec<_>>();
+    assert_eq!(lines, ["\n"]);
+  }
 }