From cd6eed1604e8a6ec5fbf1d5f4cb936f26b1238c0 Mon Sep 17 00:00:00 2001 From: Hana Date: Thu, 12 Dec 2024 12:57:37 +0800 Subject: [PATCH 1/6] refactor: init --- src/helpers.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/helpers.rs b/src/helpers.rs index dd5a2c4..f071ade 100644 --- a/src/helpers.rs +++ b/src/helpers.rs @@ -225,8 +225,7 @@ pub fn split<'a>( // SAFETY: base and end positions are guaranteed to be within the bounds of the rope. // and both of them are on char boundaries. #[allow(unsafe_code)] - let ret = - unsafe { self.rope.byte_slice_unchecked(self.base..end_pos + 1) }; + let ret = unsafe { self.rope.byte_slice(self.base..end_pos + 1) }; self.base = end_pos + 1; if self.base >= self.bytes.len() { From bdf97fb41a0725cae72b70f4c719ca0c0bf56a30 Mon Sep 17 00:00:00 2001 From: Hana Date: Thu, 12 Dec 2024 16:28:43 +0800 Subject: [PATCH 2/6] refactor: faster lines --- src/helpers.rs | 3 +- src/rope.rs | 213 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 215 insertions(+), 1 deletion(-) diff --git a/src/helpers.rs b/src/helpers.rs index f071ade..dd5a2c4 100644 --- a/src/helpers.rs +++ b/src/helpers.rs @@ -225,7 +225,8 @@ pub fn split<'a>( // SAFETY: base and end positions are guaranteed to be within the bounds of the rope. // and both of them are on char boundaries. #[allow(unsafe_code)] - let ret = unsafe { self.rope.byte_slice(self.base..end_pos + 1) }; + let ret = + unsafe { self.rope.byte_slice_unchecked(self.base..end_pos + 1) }; self.base = end_pos + 1; if self.base >= self.bytes.len() { diff --git a/src/rope.rs b/src/rope.rs index 3b53124..04bd65e 100644 --- a/src/rope.rs +++ b/src/rope.rs @@ -419,6 +419,21 @@ impl<'a> Rope<'a> { } } + /// Returns an iterator over the lines of the rope. + pub fn lines(&self) -> Lines<'_, 'a> { + Lines { + iter: match &self.repr { + Repr::Simple(s) => LinesEnum::Simple(s), + Repr::Complex(data) => LinesEnum::Complex(data), + }, + byte_idx: 0, + in_chunk_byte_idx: 0, + chunk_idx: 0, + ended: false, + total_bytes: self.len(), + } + } + /// Converts the rope to bytes. /// /// Returns borrowed bytes for simple ropes and owned bytes for complex ropes. @@ -457,6 +472,173 @@ impl Hash for Rope<'_> { } } +enum LinesEnum<'a, 'b> { + Simple(&'b str), + Complex(&'a Vec<(&'b str, usize)>), +} + +pub struct Lines<'a, 'b> { + iter: LinesEnum<'a, 'b>, + byte_idx: usize, + in_chunk_byte_idx: usize, + chunk_idx: usize, + ended: bool, + total_bytes: usize, +} + +impl<'a, 'b> Iterator for Lines<'a, 'b> { + type Item = Rope<'a>; + + fn next(&mut self) -> Option { + match *self { + Lines { + iter: LinesEnum::Simple(s), + ref mut byte_idx, + ref mut ended, + ref total_bytes, + .. + } => { + if *ended { + return None; + } else if byte_idx == total_bytes { + *ended = true; + return Some(Rope::from("")); + } else if let Some(idx) = + memchr::memchr(b'\n', &s.as_bytes()[*byte_idx..]) + { + let end = *byte_idx + idx + 1; + let rope = Rope::from(&s[*byte_idx..end]); + *byte_idx = end; + return Some(rope); + } + *ended = true; + return Some(Rope::from(&s[*byte_idx..])); + } + Lines { + iter: LinesEnum::Complex(chunks), + ref mut byte_idx, + ref mut in_chunk_byte_idx, + ref mut chunk_idx, + ref mut ended, + ref total_bytes, + } => { + if *ended { + return None; + } else if byte_idx == total_bytes { + *ended = true; + return Some(Rope::from("")); + } + + debug_assert!(*chunk_idx < chunks.len()); + + let &(chunk, _) = &chunks[*chunk_idx]; + // Always try to find a newline in the current chunk, + // if the current chunk contains a newline, return this line. + if let Some(idx) = + memchr::memchr(b'\n', &chunk.as_bytes()[*in_chunk_byte_idx..]) + { + let end = *in_chunk_byte_idx + idx + 1; + let rope = Rope::from(&chunk[*in_chunk_byte_idx..end]); + *in_chunk_byte_idx = end; + *byte_idx += *in_chunk_byte_idx; + return Some(rope); + } else { + // Check if the current chunk has left over bytes. + + // If it is the last chunk, return the remaining bytes. + // This is the end of the rope. + if *chunk_idx == chunks.len() - 1 { + // Rope is not ended with a newline. + // Explicitly set the ended flag to true to bail out. + *ended = true; + *byte_idx += chunk.len() - *in_chunk_byte_idx; + return Some(Rope::from(&chunk[*in_chunk_byte_idx..])); + } + + // If the current chunk has running out of bytes, move to the next chunk. + if *in_chunk_byte_idx == chunk.len() { + *chunk_idx += 1; + *in_chunk_byte_idx = 0; + return self.next(); + } + + // If it is not the last chunk, the line spans multiple chunks. + // As such, we need to find the next newline in the next few chunks. + let start_chunk_idx = *chunk_idx; + let start_in_chunk_byte_idx = *in_chunk_byte_idx; + + let end_info = loop { + if *chunk_idx == chunks.len() { + break None; + } + let &(chunk, _) = &chunks[*chunk_idx]; + if let Some(idx) = + memchr::memchr(b'\n', &chunk.as_bytes()[*in_chunk_byte_idx..]) + { + *in_chunk_byte_idx += idx + 1; + *byte_idx += *in_chunk_byte_idx; + break Some((*chunk_idx, *in_chunk_byte_idx)); + } else { + *in_chunk_byte_idx = 0; + *byte_idx += chunk.len(); + *chunk_idx += 1; + } + }; + + // If we found a newline in the next few chunks, return the line. + if let Some((end_chunk_idx, end_in_chunk_byte_idx)) = end_info { + let mut raw = + Vec::with_capacity(end_chunk_idx - start_chunk_idx + 1); + let mut len = 0; + (start_chunk_idx..end_chunk_idx + 1).for_each(|i| { + let &(chunk, _) = &chunks[i]; + if start_chunk_idx == i { + let start = start_in_chunk_byte_idx; + raw.push((&chunk[start..], len)); + len += chunk.len() - start; + } else if end_chunk_idx == i { + let end = end_in_chunk_byte_idx; + raw.push((&chunk[..end], len)); + len += end; + } else { + raw.push((chunk, len)); + len += chunk.len(); + } + }); + // Advance the byte index to the end of the line. + *byte_idx += len; + return Some(Rope { + repr: Repr::Complex(Rc::new(raw)), + }); + } else { + // If we did not find a newline in the next few chunks, + // return the remaining bytes. + + let mut raw = Vec::with_capacity(chunks.len() - start_chunk_idx); + let mut len = 0; + (start_chunk_idx..chunks.len()).for_each(|i| { + let &(chunk, _) = &chunks[i]; + if start_chunk_idx == i { + let start = start_in_chunk_byte_idx; + raw.push((&chunk[start..], len)); + len += chunk.len() - start; + } else { + raw.push((chunk, len)); + len += chunk.len(); + } + }); + // Advance the byte index to the end of the rope. + *byte_idx += len; + return Some(Rope { + repr: Repr::Complex(Rc::new(raw)), + }); + } + } + } + } + } +} + enum CharIndicesRepr<'a, 'b> { Simple { iter: std::str::CharIndices<'b>, @@ -908,4 +1090,35 @@ mod tests { "こんにちは世界".char_indices().collect::>() ); } + + #[test] + fn lines1() { + let rope = Rope::from("abc\ndef\n"); + let lines = rope.lines().collect::>(); + assert_eq!(lines, ["abc\n", "def\n", ""]); + + let rope = Rope::from("abc\ndef"); + let lines = rope.lines().collect::>(); + assert_eq!(lines, ["abc\n", "def"]); + } + + #[test] + fn lines2() { + let rope = Rope::from_iter(["abc\n", "def\n", "ghi\n"]); + let lines = rope.lines().collect::>(); + // empty line at the end if the line before ends with a newline ('\n') + assert_eq!(lines, ["abc\n", "def\n", "ghi\n", ""]); + + let rope = Rope::from_iter(["abc\n", "def\n", "ghi"]); + let lines = rope.lines().collect::>(); + assert_eq!(lines, ["abc\n", "def\n", "ghi"]); + + let rope = Rope::from_iter(["abc\ndef", "ghi\n", "jkl"]); + let lines = rope.lines().collect::>(); + assert_eq!(lines, ["abc\n", "defghi\n", "jkl"]); + + let rope = Rope::from_iter(["a\nb", "c\n", "d\n"]); + let lines = rope.lines().collect::>(); + assert_eq!(lines, ["a\n", "bc\n", "d\n", ""]); + } } From e60d734503ffd5cd2c312a575e551a9a28e0caac Mon Sep 17 00:00:00 2001 From: Hana Date: Thu, 12 Dec 2024 17:48:15 +0800 Subject: [PATCH 3/6] refactor: try --- src/helpers.rs | 11 +++++------ src/rope.rs | 48 ++++++++++++++++++++++++++++++++++++++++-------- 2 files changed, 45 insertions(+), 14 deletions(-) diff --git a/src/helpers.rs b/src/helpers.rs index dd5a2c4..48534c3 100644 --- a/src/helpers.rs +++ b/src/helpers.rs @@ -5,7 +5,6 @@ use std::{ ops::Range, }; -use itertools::Either; use rustc_hash::FxHashMap as HashMap; use crate::{ @@ -201,7 +200,7 @@ const EMPTY_ROPE: Rope = Rope::new(); /// Split the string with a needle, each string will contain the needle. /// /// Copied and modified from https://github.com/rust-lang/cargo/blob/30efe860c0e4adc1a6d7057ad223dc6e47d34edf/src/cargo/sources/registry/index.rs#L1048-L1072 -pub fn split<'a>( +fn split<'a>( haystack: &Rope<'a>, needle: u8, ) -> impl Iterator> { @@ -1324,10 +1323,10 @@ pub trait SourceText<'a>: Default + Clone + ToString { impl<'a> SourceText<'a> for Rope<'a> { fn split_into_lines(&self) -> impl Iterator { - if let Some(s) = self.get_simple() { - return Either::Left(split_str(s, b'\n').map(Rope::from)); - } - Either::Right(split(self, b'\n')) + // Split the text into lines, including the line ending character. + // If the text ends with a newline, the last line will be ignored + // For example: "abc\nefg\n" => ["abc\n", "efg\n"] + self.lines_impl(false) } #[inline] diff --git a/src/rope.rs b/src/rope.rs index 04bd65e..403f8e3 100644 --- a/src/rope.rs +++ b/src/rope.rs @@ -421,6 +421,16 @@ impl<'a> Rope<'a> { /// Returns an iterator over the lines of the rope. pub fn lines(&self) -> Lines<'_, 'a> { + self.lines_impl(true) + } + + /// Returns an iterator over the lines of the rope. + /// + /// If `end_line_break_as_newline` is true, the end of the rope with ('\n') is treated as an empty newline + pub(crate) fn lines_impl( + &self, + end_line_break_as_newline: bool, + ) -> Lines<'_, 'a> { Lines { iter: match &self.repr { Repr::Simple(s) => LinesEnum::Simple(s), @@ -431,6 +441,7 @@ impl<'a> Rope<'a> { chunk_idx: 0, ended: false, total_bytes: self.len(), + end_line_break_as_newline, } } @@ -484,9 +495,12 @@ pub struct Lines<'a, 'b> { chunk_idx: usize, ended: bool, total_bytes: usize, + + /// Whether to treat the end of the rope with ('\n') as an empty newline. + end_line_break_as_newline: bool, } -impl<'a, 'b> Iterator for Lines<'a, 'b> { +impl<'a> Iterator for Lines<'_, 'a> { type Item = Rope<'a>; fn next(&mut self) -> Option { @@ -496,13 +510,17 @@ impl<'a, 'b> Iterator for Lines<'a, 'b> { ref mut byte_idx, ref mut ended, ref total_bytes, + end_line_break_as_newline, .. } => { if *ended { return None; } else if byte_idx == total_bytes { - *ended = true; - return Some(Rope::from("")); + if end_line_break_as_newline { + *ended = true; + return Some(Rope::from("")); + } + return None; } else if let Some(idx) = memchr::memchr(b'\n', &s.as_bytes()[*byte_idx..]) { @@ -512,7 +530,7 @@ impl<'a, 'b> Iterator for Lines<'a, 'b> { return Some(rope); } *ended = true; - return Some(Rope::from(&s[*byte_idx..])); + Some(Rope::from(&s[*byte_idx..])) } Lines { iter: LinesEnum::Complex(chunks), @@ -521,12 +539,16 @@ impl<'a, 'b> Iterator for Lines<'a, 'b> { ref mut chunk_idx, ref mut ended, ref total_bytes, + end_line_break_as_newline, } => { if *ended { return None; } else if byte_idx == total_bytes { - *ended = true; - return Some(Rope::from("")); + if end_line_break_as_newline { + *ended = true; + return Some(Rope::from("")); + } + return None; } debug_assert!(*chunk_idx < chunks.len()); @@ -629,9 +651,9 @@ impl<'a, 'b> Iterator for Lines<'a, 'b> { }); // Advance the byte index to the end of the rope. *byte_idx += len; - return Some(Rope { + Some(Rope { repr: Repr::Complex(Rc::new(raw)), - }); + }) } } } @@ -1093,13 +1115,23 @@ mod tests { #[test] fn lines1() { + let rope = Rope::from("abc"); + let lines = rope.lines().collect::>(); + assert_eq!(lines, ["abc"]); + + // empty line at the end if the line before ends with a newline ('\n') let rope = Rope::from("abc\ndef\n"); let lines = rope.lines().collect::>(); assert_eq!(lines, ["abc\n", "def\n", ""]); + // no empty line at the end if the line before does not end with a newline ('\n') let rope = Rope::from("abc\ndef"); let lines = rope.lines().collect::>(); assert_eq!(lines, ["abc\n", "def"]); + + let rope = Rope::from("Test\nTest\nTest\n"); + let lines = rope.lines().collect::>(); + assert_eq!(lines, ["Test\n", "Test\n", "Test\n", ""]); } #[test] From 658627235854a87fbb00c84e25125c7548d49101 Mon Sep 17 00:00:00 2001 From: Hana Date: Thu, 12 Dec 2024 17:49:17 +0800 Subject: [PATCH 4/6] chore: clippy --- src/rope.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/rope.rs b/src/rope.rs index 403f8e3..0fd5c57 100644 --- a/src/rope.rs +++ b/src/rope.rs @@ -563,7 +563,7 @@ impl<'a> Iterator for Lines<'_, 'a> { let rope = Rope::from(&chunk[*in_chunk_byte_idx..end]); *in_chunk_byte_idx = end; *byte_idx += *in_chunk_byte_idx; - return Some(rope); + Some(rope) } else { // Check if the current chunk has left over bytes. @@ -629,9 +629,9 @@ impl<'a> Iterator for Lines<'_, 'a> { }); // Advance the byte index to the end of the line. *byte_idx += len; - return Some(Rope { + Some(Rope { repr: Repr::Complex(Rc::new(raw)), - }); + }) } else { // If we did not find a newline in the next few chunks, // return the remaining bytes. From 157f8c7ffb54fd44b7ab8c2577ce631e3207271e Mon Sep 17 00:00:00 2001 From: Hana Date: Thu, 12 Dec 2024 18:04:55 +0800 Subject: [PATCH 5/6] test: more --- src/rope.rs | 47 +++++++++++++++++++++++++++++++++++++---------- 1 file changed, 37 insertions(+), 10 deletions(-) diff --git a/src/rope.rs b/src/rope.rs index 0fd5c57..323c6b0 100644 --- a/src/rope.rs +++ b/src/rope.rs @@ -426,10 +426,10 @@ impl<'a> Rope<'a> { /// Returns an iterator over the lines of the rope. /// - /// If `end_line_break_as_newline` is true, the end of the rope with ('\n') is treated as an empty newline + /// If `trailing_line_break_as_newline` is true, the end of the rope with ('\n') is treated as an empty newline pub(crate) fn lines_impl( &self, - end_line_break_as_newline: bool, + trailing_line_break_as_newline: bool, ) -> Lines<'_, 'a> { Lines { iter: match &self.repr { @@ -441,7 +441,7 @@ impl<'a> Rope<'a> { chunk_idx: 0, ended: false, total_bytes: self.len(), - end_line_break_as_newline, + trailing_line_break_as_newline, } } @@ -497,7 +497,7 @@ pub struct Lines<'a, 'b> { total_bytes: usize, /// Whether to treat the end of the rope with ('\n') as an empty newline. - end_line_break_as_newline: bool, + trailing_line_break_as_newline: bool, } impl<'a> Iterator for Lines<'_, 'a> { @@ -510,13 +510,13 @@ impl<'a> Iterator for Lines<'_, 'a> { ref mut byte_idx, ref mut ended, ref total_bytes, - end_line_break_as_newline, + trailing_line_break_as_newline, .. } => { if *ended { return None; } else if byte_idx == total_bytes { - if end_line_break_as_newline { + if trailing_line_break_as_newline { *ended = true; return Some(Rope::from("")); } @@ -539,12 +539,12 @@ impl<'a> Iterator for Lines<'_, 'a> { ref mut chunk_idx, ref mut ended, ref total_bytes, - end_line_break_as_newline, + trailing_line_break_as_newline, } => { if *ended { return None; } else if byte_idx == total_bytes { - if end_line_break_as_newline { + if trailing_line_break_as_newline { *ended = true; return Some(Rope::from("")); } @@ -552,8 +552,8 @@ impl<'a> Iterator for Lines<'_, 'a> { } debug_assert!(*chunk_idx < chunks.len()); - let &(chunk, _) = &chunks[*chunk_idx]; + // Always try to find a newline in the current chunk, // if the current chunk contains a newline, return this line. if let Some(idx) = @@ -566,7 +566,6 @@ impl<'a> Iterator for Lines<'_, 'a> { Some(rope) } else { // Check if the current chunk has left over bytes. - // If it is the last chunk, return the remaining bytes. // This is the end of the rope. if *chunk_idx == chunks.len() - 1 { @@ -1132,6 +1131,14 @@ mod tests { let rope = Rope::from("Test\nTest\nTest\n"); let lines = rope.lines().collect::>(); assert_eq!(lines, ["Test\n", "Test\n", "Test\n", ""]); + + let rope = Rope::from("\n"); + let lines = rope.lines().collect::>(); + assert_eq!(lines, ["\n", ""]); + + let rope = Rope::from("\n\n"); + let lines = rope.lines().collect::>(); + assert_eq!(lines, ["\n", "\n", ""]); } #[test] @@ -1152,5 +1159,25 @@ mod tests { let rope = Rope::from_iter(["a\nb", "c\n", "d\n"]); let lines = rope.lines().collect::>(); assert_eq!(lines, ["a\n", "bc\n", "d\n", ""]); + + let rope = Rope::from_iter(["\n"]); + let lines = rope.lines().collect::>(); + assert_eq!(lines, ["\n", ""]); + } + + #[test] + fn lines_with_trailing_line_break_as_newline() { + let trailing_line_break_as_newline = false; + let rope = Rope::from("abc\n"); + let lines = rope + .lines_impl(trailing_line_break_as_newline) + .collect::>(); + assert_eq!(lines, ["abc\n"]); + + let rope = Rope::from("\n"); + let lines = rope + .lines_impl(trailing_line_break_as_newline) + .collect::>(); + assert_eq!(lines, ["\n"]); } } From c431b6116c1f0e39005385a5b41808000effe947 Mon Sep 17 00:00:00 2001 From: Hana Date: Thu, 12 Dec 2024 18:14:16 +0800 Subject: [PATCH 6/6] chore: more --- src/rope.rs | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/src/rope.rs b/src/rope.rs index 323c6b0..0bc51c9 100644 --- a/src/rope.rs +++ b/src/rope.rs @@ -434,11 +434,13 @@ impl<'a> Rope<'a> { Lines { iter: match &self.repr { Repr::Simple(s) => LinesEnum::Simple(s), - Repr::Complex(data) => LinesEnum::Complex(data), + Repr::Complex(data) => LinesEnum::Complex { + iter: data, + in_chunk_byte_idx: 0, + chunk_idx: 0, + }, }, byte_idx: 0, - in_chunk_byte_idx: 0, - chunk_idx: 0, ended: false, total_bytes: self.len(), trailing_line_break_as_newline, @@ -485,14 +487,16 @@ impl Hash for Rope<'_> { enum LinesEnum<'a, 'b> { Simple(&'b str), - Complex(&'a Vec<(&'b str, usize)>), + Complex { + iter: &'a Vec<(&'b str, usize)>, + in_chunk_byte_idx: usize, + chunk_idx: usize, + }, } pub struct Lines<'a, 'b> { iter: LinesEnum<'a, 'b>, byte_idx: usize, - in_chunk_byte_idx: usize, - chunk_idx: usize, ended: bool, total_bytes: usize, @@ -533,10 +537,13 @@ impl<'a> Iterator for Lines<'_, 'a> { Some(Rope::from(&s[*byte_idx..])) } Lines { - iter: LinesEnum::Complex(chunks), + iter: + LinesEnum::Complex { + iter: chunks, + ref mut in_chunk_byte_idx, + ref mut chunk_idx, + }, ref mut byte_idx, - ref mut in_chunk_byte_idx, - ref mut chunk_idx, ref mut ended, ref total_bytes, trailing_line_break_as_newline,