From d948bfd822c59362f0a2ec4caac1130832dc4f17 Mon Sep 17 00:00:00 2001 From: Cong-Cong <dacongsama@live.com> Date: Sun, 22 Dec 2024 13:07:47 +0800 Subject: [PATCH 01/14] perf: WithIndices --- src/with_indices.rs | 60 ++++++++++++++++++++++++++++++++++----------- 1 file changed, 46 insertions(+), 14 deletions(-) diff --git a/src/with_indices.rs b/src/with_indices.rs index e491252..fc5d6fd 100644 --- a/src/with_indices.rs +++ b/src/with_indices.rs @@ -1,4 +1,4 @@ -use std::{cell::OnceCell, marker::PhantomData}; +use std::marker::PhantomData; use crate::helpers::SourceText; @@ -9,8 +9,7 @@ where { /// line is a string reference pub line: S, - /// the byte position of each `char` in `line` string slice . - pub indices_indexes: OnceCell<Vec<usize>>, + last_char_index_to_byte_index: (usize, usize), data: PhantomData<&'a S>, } @@ -20,32 +19,65 @@ where { pub fn new(line: S) -> Self { Self { - indices_indexes: OnceCell::new(), line, + last_char_index_to_byte_index: (0, 0), data: PhantomData, } } - /// substring::SubString with cache - pub(crate) fn substring(&self, start_index: usize, end_index: usize) -> S { - if end_index <= start_index { + pub(crate) fn substring( + &self, + start_char_index: usize, + end_char_index: usize, + ) -> S { + if end_char_index <= start_char_index { return S::default(); } - let indices_indexes = self.indices_indexes.get_or_init(|| { - self.line.char_indices().map(|(i, _)| i).collect::<Vec<_>>() - }); + let mut start_byte_index = None; + let mut end_byte_index = None; - let str_len = self.line.len(); - let start = *indices_indexes.get(start_index).unwrap_or(&str_len); - let end = *indices_indexes.get(end_index).unwrap_or(&str_len); + let (last_char_index, mut last_byte_index) = + self.last_char_index_to_byte_index; + let mut char_index = last_char_index; + if last_char_index < start_char_index { + char_index = 0; + last_byte_index = 0; + } + for (byte_index, _) in self + .line + .byte_slice(last_byte_index..self.line.len()) + .char_indices() + { + if char_index == start_char_index { + start_byte_index = Some(byte_index); + } + if char_index == end_char_index { + end_byte_index = Some(byte_index); + break; + } + char_index += 1; + } + + let start_byte_index = if let Some(start_byte_index) = start_byte_index { + start_byte_index + } else { + return S::default(); + }; + + let end_byte_index = end_byte_index.unwrap_or(self.line.len()); + if end_byte_index <= start_byte_index { + return S::default(); + } #[allow(unsafe_code)] unsafe { // SAFETY: Since `indices` iterates over the `CharIndices` of `self`, we can guarantee // that the indices obtained from it will always be within the bounds of `self` and they // will always lie on UTF-8 sequence boundaries. - self.line.byte_slice_unchecked(start..end) + self + .line + .byte_slice_unchecked(start_byte_index..end_byte_index) } } } From 18687ce0ea064af196aaad2e025c779003b264c4 Mon Sep 17 00:00:00 2001 From: Cong-Cong <dacongsama@live.com> Date: Sun, 22 Dec 2024 13:22:53 +0800 Subject: [PATCH 02/14] fix: should recode last_char_index_to_byte_index --- src/with_indices.rs | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/src/with_indices.rs b/src/with_indices.rs index fc5d6fd..af53408 100644 --- a/src/with_indices.rs +++ b/src/with_indices.rs @@ -1,4 +1,4 @@ -use std::marker::PhantomData; +use std::{cell::RefCell, marker::PhantomData}; use crate::helpers::SourceText; @@ -9,7 +9,7 @@ where { /// line is a string reference pub line: S, - last_char_index_to_byte_index: (usize, usize), + last_char_index_to_byte_index: RefCell<(usize, usize)>, data: PhantomData<&'a S>, } @@ -20,7 +20,7 @@ where pub fn new(line: S) -> Self { Self { line, - last_char_index_to_byte_index: (0, 0), + last_char_index_to_byte_index: RefCell::new((0, 0)), data: PhantomData, } } @@ -38,7 +38,7 @@ where let mut end_byte_index = None; let (last_char_index, mut last_byte_index) = - self.last_char_index_to_byte_index; + *self.last_char_index_to_byte_index.borrow(); let mut char_index = last_char_index; if last_char_index < start_char_index { char_index = 0; @@ -51,9 +51,13 @@ where { if char_index == start_char_index { start_byte_index = Some(byte_index); + if end_char_index == usize::MAX { + break; + } } if char_index == end_char_index { end_byte_index = Some(byte_index); + *self.last_char_index_to_byte_index.borrow_mut() = (end_char_index, byte_index); break; } char_index += 1; From 30500f335b24870c8a3cce406b929e70d866a9d9 Mon Sep 17 00:00:00 2001 From: Cong-Cong <dacongsama@live.com> Date: Sun, 22 Dec 2024 14:02:06 +0800 Subject: [PATCH 03/14] fix: test --- src/with_indices.rs | 36 ++++++++++++++++++++++++++---------- 1 file changed, 26 insertions(+), 10 deletions(-) diff --git a/src/with_indices.rs b/src/with_indices.rs index af53408..4813072 100644 --- a/src/with_indices.rs +++ b/src/with_indices.rs @@ -34,30 +34,33 @@ where return S::default(); } + let line_len = self.line.len(); + let mut start_byte_index = None; let mut end_byte_index = None; let (last_char_index, mut last_byte_index) = *self.last_char_index_to_byte_index.borrow(); let mut char_index = last_char_index; - if last_char_index < start_char_index { + if start_char_index < last_char_index { char_index = 0; last_byte_index = 0; } for (byte_index, _) in self .line - .byte_slice(last_byte_index..self.line.len()) + .byte_slice(last_byte_index..line_len) .char_indices() { if char_index == start_char_index { - start_byte_index = Some(byte_index); + start_byte_index = Some(byte_index + last_byte_index); if end_char_index == usize::MAX { break; } } if char_index == end_char_index { - end_byte_index = Some(byte_index); - *self.last_char_index_to_byte_index.borrow_mut() = (end_char_index, byte_index); + end_byte_index = Some(byte_index + last_byte_index); + *self.last_char_index_to_byte_index.borrow_mut() = + (end_char_index, byte_index); break; } char_index += 1; @@ -68,11 +71,7 @@ where } else { return S::default(); }; - - let end_byte_index = end_byte_index.unwrap_or(self.line.len()); - if end_byte_index <= start_byte_index { - return S::default(); - } + let end_byte_index = end_byte_index.unwrap_or(line_len); #[allow(unsafe_code)] unsafe { @@ -126,4 +125,21 @@ mod tests { "øbα" ); } + + #[test] + fn test_last_char_index_to_byte_index() { + let rope_with_indices = WithIndices::new(Rope::from("foobar")); + assert_eq!( + rope_with_indices.substring(0, 3), + "foo" + ); + assert_eq!( + rope_with_indices.substring(3, 6), + "bar" + ); + assert_eq!( + rope_with_indices.substring(0, usize::MAX), + "foobar" + ); + } } From dd81442b42b1533ea12edfb117b00bfbe5bc6e54 Mon Sep 17 00:00:00 2001 From: Cong-Cong <dacongsama@live.com> Date: Sun, 22 Dec 2024 14:10:25 +0800 Subject: [PATCH 04/14] fix --- src/with_indices.rs | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) diff --git a/src/with_indices.rs b/src/with_indices.rs index 4813072..aa8eed9 100644 --- a/src/with_indices.rs +++ b/src/with_indices.rs @@ -60,7 +60,7 @@ where if char_index == end_char_index { end_byte_index = Some(byte_index + last_byte_index); *self.last_char_index_to_byte_index.borrow_mut() = - (end_char_index, byte_index); + (end_char_index, byte_index + last_byte_index); break; } char_index += 1; @@ -129,17 +129,8 @@ mod tests { #[test] fn test_last_char_index_to_byte_index() { let rope_with_indices = WithIndices::new(Rope::from("foobar")); - assert_eq!( - rope_with_indices.substring(0, 3), - "foo" - ); - assert_eq!( - rope_with_indices.substring(3, 6), - "bar" - ); - assert_eq!( - rope_with_indices.substring(0, usize::MAX), - "foobar" - ); + assert_eq!(rope_with_indices.substring(0, 3), "foo"); + assert_eq!(rope_with_indices.substring(3, 6), "bar"); + assert_eq!(rope_with_indices.substring(0, usize::MAX), "foobar"); } } From 48477380051a2346eae396b3115df517888edd4a Mon Sep 17 00:00:00 2001 From: Cong-Cong <dacongsama@live.com> Date: Sun, 22 Dec 2024 14:34:36 +0800 Subject: [PATCH 05/14] feat: reduce last_char_index_to_byte_index --- src/with_indices.rs | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/with_indices.rs b/src/with_indices.rs index aa8eed9..a602902 100644 --- a/src/with_indices.rs +++ b/src/with_indices.rs @@ -9,7 +9,7 @@ where { /// line is a string reference pub line: S, - last_char_index_to_byte_index: RefCell<(usize, usize)>, + last_char_index_to_byte_index: RefCell<(u32, u32)>, data: PhantomData<&'a S>, } @@ -39,10 +39,11 @@ where let mut start_byte_index = None; let mut end_byte_index = None; - let (last_char_index, mut last_byte_index) = + let (last_char_index, last_byte_index) = *self.last_char_index_to_byte_index.borrow(); - let mut char_index = last_char_index; - if start_char_index < last_char_index { + let mut last_byte_index = last_byte_index as usize; + let mut char_index = last_char_index as usize; + if start_char_index < last_char_index as usize { char_index = 0; last_byte_index = 0; } @@ -60,7 +61,7 @@ where if char_index == end_char_index { end_byte_index = Some(byte_index + last_byte_index); *self.last_char_index_to_byte_index.borrow_mut() = - (end_char_index, byte_index + last_byte_index); + (end_char_index as u32, (byte_index + last_byte_index) as u32); break; } char_index += 1; From 7674774bb5f40fe5d2f5b76349fc503a12426046 Mon Sep 17 00:00:00 2001 From: Cong-Cong <dacongsama@live.com> Date: Fri, 27 Dec 2024 23:03:16 +0800 Subject: [PATCH 06/14] feat: rope.chars --- src/rope.rs | 91 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 91 insertions(+) diff --git a/src/rope.rs b/src/rope.rs index b18d90c..bc75d98 100644 --- a/src/rope.rs +++ b/src/rope.rs @@ -6,6 +6,7 @@ use std::{ hash::Hash, ops::{Bound, RangeBounds}, rc::Rc, + str::Chars, }; use crate::Error; @@ -144,6 +145,22 @@ impl<'a> Rope<'a> { } } + /// Returns an iterator over the [`char`]s of a string slice. + pub fn chars(&self) -> RopeChars<'_> { + match &self.repr { + Repr::Light(s) => RopeChars { + iters: vec![s.chars()], + left: 0, + right: 0 + }, + Repr::Full(data) => { + let iters = data.iter().map(|(s, _)| s.chars()).collect::<Vec<_>>(); + let len = iters.len(); + RopeChars { iters, left: 0, right: (len - 1) as u32 } + } + } + } + /// Returns whether the rope starts with the given string. #[inline] pub fn starts_with(&self, value: &str) -> bool { @@ -939,6 +956,46 @@ fn end_bound_to_range_end(end: Bound<&usize>) -> Option<usize> { } } +pub struct RopeChars<'a> { + iters: Vec<Chars<'a>>, + left: u32, + right: u32 +} + +impl<'a> Iterator for RopeChars<'a> { + type Item = char; + + #[inline] + fn next(&mut self) -> Option<char> { + let left = self.left as usize; + if left >= self.iters.len() { + return None; + } + if let Some(char) = self.iters[left].next() { + return Some(char); + } else { + self.left += 1; + self.next() + } + } +} + +impl<'a> DoubleEndedIterator for RopeChars<'a> { + #[inline] + fn next_back(&mut self) -> Option<Self::Item> { + let right = self.right as usize; + if right == 0 { + return self.iters[right].next_back(); + } + if let Some(char) = self.iters[right].next_back() { + return Some(char); + } else { + self.right -= 1; + self.next_back() + } + } +} + #[cfg(test)] mod tests { use std::rc::Rc; @@ -1244,4 +1301,38 @@ mod tests { .collect::<Vec<_>>(); assert_eq!(lines, ["\n"]); } + + #[test] + fn chars() { + let rope = Rope::from("abc"); + let mut chars = rope.chars(); + assert_eq!(chars.next(), Some('a')); + assert_eq!(chars.next(), Some('b')); + assert_eq!(chars.next(), Some('c')); + assert_eq!(chars.next(), None); + + let rope = Rope::from_iter(["a", "b", "c"]); + let mut chars = rope.chars(); + assert_eq!(chars.next(), Some('a')); + assert_eq!(chars.next(), Some('b')); + assert_eq!(chars.next(), Some('c')); + assert_eq!(chars.next(), None); + } + + #[test] + fn reverse_chars() { + let rope = Rope::from("abc"); + let mut chars = rope.chars().rev(); + assert_eq!(chars.next(), Some('c')); + assert_eq!(chars.next(), Some('b')); + assert_eq!(chars.next(), Some('a')); + assert_eq!(chars.next(), None); + + let rope = Rope::from_iter(["a", "b", "c"]); + let mut chars = rope.chars().rev(); + assert_eq!(chars.next(), Some('c')); + assert_eq!(chars.next(), Some('b')); + assert_eq!(chars.next(), Some('a')); + assert_eq!(chars.next(), None); + } } From 7a0856dbd179718d6a90c4a6453722e27488a001 Mon Sep 17 00:00:00 2001 From: Cong-Cong <dacongsama@live.com> Date: Sat, 28 Dec 2024 08:59:01 +0800 Subject: [PATCH 07/14] u --- src/helpers.rs | 11 +++++++ src/rope.rs | 18 +++++++----- src/with_indices.rs | 70 ++++++++++++++++++++++++++++----------------- 3 files changed, 65 insertions(+), 34 deletions(-) diff --git a/src/helpers.rs b/src/helpers.rs index 2243fde..981da56 100644 --- a/src/helpers.rs +++ b/src/helpers.rs @@ -1254,6 +1254,9 @@ pub trait SourceText<'a>: Default + Clone + ToString { /// Returns an iterator over the char indices in the text. fn char_indices(&self) -> impl Iterator<Item = (usize, char)>; + /// Returns an iterator over the [`char`]s of a string slice. + fn chars(&self) -> impl DoubleEndedIterator<Item = char>; + /// Gets the byte at the specified index, if it exists. fn get_byte(&self, byte_index: usize) -> Option<u8>; @@ -1293,6 +1296,10 @@ impl<'a> SourceText<'a> for Rope<'a> { self.char_indices() } + fn chars(&self) -> impl DoubleEndedIterator<Item = char> { + (*self).chars() + } + fn byte_slice(&self, range: Range<usize>) -> Self { self.byte_slice(range) } @@ -1335,6 +1342,10 @@ impl<'a> SourceText<'a> for &'a str { (*self).char_indices() } + fn chars(&self) -> impl DoubleEndedIterator<Item = char> { + (*self).chars() + } + fn byte_slice(&self, range: Range<usize>) -> Self { self.get(range).unwrap_or_default() } diff --git a/src/rope.rs b/src/rope.rs index bc75d98..f6e49f6 100644 --- a/src/rope.rs +++ b/src/rope.rs @@ -151,12 +151,16 @@ impl<'a> Rope<'a> { Repr::Light(s) => RopeChars { iters: vec![s.chars()], left: 0, - right: 0 + right: 0, }, Repr::Full(data) => { let iters = data.iter().map(|(s, _)| s.chars()).collect::<Vec<_>>(); let len = iters.len(); - RopeChars { iters, left: 0, right: (len - 1) as u32 } + RopeChars { + iters, + left: 0, + right: (len - 1) as u32, + } } } } @@ -959,10 +963,10 @@ fn end_bound_to_range_end(end: Bound<&usize>) -> Option<usize> { pub struct RopeChars<'a> { iters: Vec<Chars<'a>>, left: u32, - right: u32 + right: u32, } -impl<'a> Iterator for RopeChars<'a> { +impl Iterator for RopeChars<'_> { type Item = char; #[inline] @@ -972,7 +976,7 @@ impl<'a> Iterator for RopeChars<'a> { return None; } if let Some(char) = self.iters[left].next() { - return Some(char); + Some(char) } else { self.left += 1; self.next() @@ -980,7 +984,7 @@ impl<'a> Iterator for RopeChars<'a> { } } -impl<'a> DoubleEndedIterator for RopeChars<'a> { +impl DoubleEndedIterator for RopeChars<'_> { #[inline] fn next_back(&mut self) -> Option<Self::Item> { let right = self.right as usize; @@ -988,7 +992,7 @@ impl<'a> DoubleEndedIterator for RopeChars<'a> { return self.iters[right].next_back(); } if let Some(char) = self.iters[right].next_back() { - return Some(char); + Some(char) } else { self.right -= 1; self.next_back() diff --git a/src/with_indices.rs b/src/with_indices.rs index a602902..86778e2 100644 --- a/src/with_indices.rs +++ b/src/with_indices.rs @@ -36,42 +36,58 @@ where let line_len = self.line.len(); - let mut start_byte_index = None; - let mut end_byte_index = None; + let mut start_byte_index = + if start_char_index == 0 { Some(0) } else { None }; + let mut end_byte_index = if end_char_index == usize::MAX { + Some(line_len) + } else { + None + }; let (last_char_index, last_byte_index) = *self.last_char_index_to_byte_index.borrow(); - let mut last_byte_index = last_byte_index as usize; + let mut byte_index = last_byte_index as usize; let mut char_index = last_char_index as usize; - if start_char_index < last_char_index as usize { - char_index = 0; - last_byte_index = 0; - } - for (byte_index, _) in self - .line - .byte_slice(last_byte_index..line_len) - .char_indices() + + if start_char_index >= last_char_index as usize + || end_char_index >= last_char_index as usize { - if char_index == start_char_index { - start_byte_index = Some(byte_index + last_byte_index); - if end_char_index == usize::MAX { + for char in self.line.byte_slice(byte_index..line_len).chars() { + if start_byte_index.is_some() && end_byte_index.is_some() { break; } + if char_index == start_char_index { + start_byte_index = Some(byte_index); + *self.last_char_index_to_byte_index.borrow_mut() = + (char_index as u32, byte_index as u32); + } + if char_index == end_char_index { + end_byte_index = Some(byte_index); + *self.last_char_index_to_byte_index.borrow_mut() = + (char_index as u32, byte_index as u32); + } + byte_index += char.len_utf8(); + char_index += 1; } - if char_index == end_char_index { - end_byte_index = Some(byte_index + last_byte_index); - *self.last_char_index_to_byte_index.borrow_mut() = - (end_char_index as u32, (byte_index + last_byte_index) as u32); - break; + } else { + for char in self.line.byte_slice(0..byte_index).chars().rev() { + if start_byte_index.is_some() && end_byte_index.is_some() { + break; + } + byte_index -= char.len_utf8(); + char_index -= 1; + if char_index == end_char_index { + end_byte_index = Some(byte_index); + *self.last_char_index_to_byte_index.borrow_mut() = + (char_index as u32, byte_index as u32); + } + if char_index == start_char_index { + start_byte_index = Some(byte_index); + } } - char_index += 1; } - let start_byte_index = if let Some(start_byte_index) = start_byte_index { - start_byte_index - } else { - return S::default(); - }; + let start_byte_index = start_byte_index.unwrap_or(line_len); let end_byte_index = end_byte_index.unwrap_or(line_len); #[allow(unsafe_code)] @@ -131,7 +147,7 @@ mod tests { fn test_last_char_index_to_byte_index() { let rope_with_indices = WithIndices::new(Rope::from("foobar")); assert_eq!(rope_with_indices.substring(0, 3), "foo"); - assert_eq!(rope_with_indices.substring(3, 6), "bar"); - assert_eq!(rope_with_indices.substring(0, usize::MAX), "foobar"); + assert_eq!(rope_with_indices.substring(3, 5), "ba"); + assert_eq!(rope_with_indices.substring(0, 3), "foo"); } } From 11ea50a742688d1d821cca0d1a79a957f8d8220e Mon Sep 17 00:00:00 2001 From: Cong-Cong <dacongsama@live.com> Date: Sat, 28 Dec 2024 09:20:33 +0800 Subject: [PATCH 08/14] u --- src/rope.rs | 1 + src/with_indices.rs | 9 +++++---- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/src/rope.rs b/src/rope.rs index f6e49f6..2a43ce3 100644 --- a/src/rope.rs +++ b/src/rope.rs @@ -146,6 +146,7 @@ impl<'a> Rope<'a> { } /// Returns an iterator over the [`char`]s of a string slice. + #[inline(always)] pub fn chars(&self) -> RopeChars<'_> { match &self.repr { Repr::Light(s) => RopeChars { diff --git a/src/with_indices.rs b/src/with_indices.rs index 86778e2..549b44a 100644 --- a/src/with_indices.rs +++ b/src/with_indices.rs @@ -145,9 +145,10 @@ mod tests { #[test] fn test_last_char_index_to_byte_index() { - let rope_with_indices = WithIndices::new(Rope::from("foobar")); - assert_eq!(rope_with_indices.substring(0, 3), "foo"); - assert_eq!(rope_with_indices.substring(3, 5), "ba"); - assert_eq!(rope_with_indices.substring(0, 3), "foo"); + let rope_with_indices = + WithIndices::new(Rope::from("hello world 你好世界")); + assert_eq!(rope_with_indices.substring(10, 13), "d 你"); + assert_eq!(rope_with_indices.substring(13, 15), "好世"); + assert_eq!(rope_with_indices.substring(10, 13), "d 你"); } } From cedea6e89a84c4377f08a4e3ea04cba9fb228969 Mon Sep 17 00:00:00 2001 From: Cong-Cong <dacongsama@live.com> Date: Sat, 28 Dec 2024 10:51:39 +0800 Subject: [PATCH 09/14] fix: try perf --- src/with_indices.rs | 42 +++++++++++++++++++++++++----------------- 1 file changed, 25 insertions(+), 17 deletions(-) diff --git a/src/with_indices.rs b/src/with_indices.rs index 549b44a..f0639b3 100644 --- a/src/with_indices.rs +++ b/src/with_indices.rs @@ -1,4 +1,4 @@ -use std::{cell::RefCell, marker::PhantomData}; +use std::{cell::Cell, marker::PhantomData}; use crate::helpers::SourceText; @@ -9,7 +9,7 @@ where { /// line is a string reference pub line: S, - last_char_index_to_byte_index: RefCell<(u32, u32)>, + last_char_index_to_byte_index: Cell<(u32, u32)>, data: PhantomData<&'a S>, } @@ -20,7 +20,7 @@ where pub fn new(line: S) -> Self { Self { line, - last_char_index_to_byte_index: RefCell::new((0, 0)), + last_char_index_to_byte_index: Cell::new((0, 0)), data: PhantomData, } } @@ -35,7 +35,6 @@ where } let line_len = self.line.len(); - let mut start_byte_index = if start_char_index == 0 { Some(0) } else { None }; let mut end_byte_index = if end_char_index == usize::MAX { @@ -45,32 +44,44 @@ where }; let (last_char_index, last_byte_index) = - *self.last_char_index_to_byte_index.borrow(); + self.last_char_index_to_byte_index.get(); let mut byte_index = last_byte_index as usize; let mut char_index = last_char_index as usize; if start_char_index >= last_char_index as usize || end_char_index >= last_char_index as usize { - for char in self.line.byte_slice(byte_index..line_len).chars() { + #[allow(unsafe_code)] + let slice = unsafe { + // SAFETY: Since `indices` iterates over the `CharIndices` of `self`, we can guarantee + // that the indices obtained from it will always be within the bounds of `self` and they + // will always lie on UTF-8 sequence boundaries. + self.line.byte_slice_unchecked(byte_index..line_len) + }; + for char in slice.chars() { if start_byte_index.is_some() && end_byte_index.is_some() { break; } if char_index == start_char_index { start_byte_index = Some(byte_index); - *self.last_char_index_to_byte_index.borrow_mut() = - (char_index as u32, byte_index as u32); - } - if char_index == end_char_index { + } else if char_index == end_char_index { end_byte_index = Some(byte_index); - *self.last_char_index_to_byte_index.borrow_mut() = - (char_index as u32, byte_index as u32); + self + .last_char_index_to_byte_index + .set((char_index as u32, byte_index as u32)); } byte_index += char.len_utf8(); char_index += 1; } } else { - for char in self.line.byte_slice(0..byte_index).chars().rev() { + #[allow(unsafe_code)] + let slice = unsafe { + // SAFETY: Since `indices` iterates over the `CharIndices` of `self`, we can guarantee + // that the indices obtained from it will always be within the bounds of `self` and they + // will always lie on UTF-8 sequence boundaries. + self.line.byte_slice_unchecked(0..byte_index) + }; + for char in slice.chars().rev() { if start_byte_index.is_some() && end_byte_index.is_some() { break; } @@ -78,10 +89,7 @@ where char_index -= 1; if char_index == end_char_index { end_byte_index = Some(byte_index); - *self.last_char_index_to_byte_index.borrow_mut() = - (char_index as u32, byte_index as u32); - } - if char_index == start_char_index { + } else if char_index == start_char_index { start_byte_index = Some(byte_index); } } From 43494d31803c7c65e60a8fb1efacaddfdf8f2125 Mon Sep 17 00:00:00 2001 From: Cong-Cong <dacongsama@live.com> Date: Mon, 30 Dec 2024 12:31:14 +0800 Subject: [PATCH 10/14] perf: reduce condition --- src/with_indices.rs | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/src/with_indices.rs b/src/with_indices.rs index f0639b3..92dc650 100644 --- a/src/with_indices.rs +++ b/src/with_indices.rs @@ -43,6 +43,10 @@ where None }; + if start_byte_index.is_some() && end_byte_index.is_some() { + return self.line.clone(); + } + let (last_char_index, last_byte_index) = self.last_char_index_to_byte_index.get(); let mut byte_index = last_byte_index as usize; @@ -59,16 +63,17 @@ where self.line.byte_slice_unchecked(byte_index..line_len) }; for char in slice.chars() { - if start_byte_index.is_some() && end_byte_index.is_some() { - break; - } if char_index == start_char_index { start_byte_index = Some(byte_index); + if end_byte_index.is_some() { + break; + } } else if char_index == end_char_index { end_byte_index = Some(byte_index); self .last_char_index_to_byte_index .set((char_index as u32, byte_index as u32)); + break; } byte_index += char.len_utf8(); char_index += 1; @@ -82,15 +87,16 @@ where self.line.byte_slice_unchecked(0..byte_index) }; for char in slice.chars().rev() { - if start_byte_index.is_some() && end_byte_index.is_some() { - break; - } byte_index -= char.len_utf8(); char_index -= 1; if char_index == end_char_index { end_byte_index = Some(byte_index); + if start_byte_index.is_some() { + break; + } } else if char_index == start_char_index { start_byte_index = Some(byte_index); + break; } } } From f706b2c00090b3c8ab467058ed4e992cea404ce6 Mon Sep 17 00:00:00 2001 From: Cong-Cong <dacongsama@live.com> Date: Mon, 30 Dec 2024 12:36:23 +0800 Subject: [PATCH 11/14] perf: use char_indices --- src/with_indices.rs | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/src/with_indices.rs b/src/with_indices.rs index 92dc650..68c2be9 100644 --- a/src/with_indices.rs +++ b/src/with_indices.rs @@ -62,20 +62,19 @@ where // will always lie on UTF-8 sequence boundaries. self.line.byte_slice_unchecked(byte_index..line_len) }; - for char in slice.chars() { + for (byte_offset, _) in slice.char_indices() { if char_index == start_char_index { - start_byte_index = Some(byte_index); + start_byte_index = Some(byte_index + byte_offset); if end_byte_index.is_some() { break; } } else if char_index == end_char_index { - end_byte_index = Some(byte_index); + end_byte_index = Some(byte_index + byte_offset); self .last_char_index_to_byte_index - .set((char_index as u32, byte_index as u32)); + .set((char_index as u32, (byte_index + byte_offset) as u32)); break; } - byte_index += char.len_utf8(); char_index += 1; } } else { From 982133116831bdce9b9a5a1aa028db18c6afe0fd Mon Sep 17 00:00:00 2001 From: Cong-Cong <dacongsama@live.com> Date: Mon, 30 Dec 2024 14:58:08 +0800 Subject: [PATCH 12/14] fix: char_indices support rev --- src/helpers.rs | 17 +--- src/rope.rs | 217 +++++++++++++++++++------------------------- src/with_indices.rs | 11 +-- 3 files changed, 102 insertions(+), 143 deletions(-) diff --git a/src/helpers.rs b/src/helpers.rs index 981da56..ff7e9f9 100644 --- a/src/helpers.rs +++ b/src/helpers.rs @@ -1252,10 +1252,7 @@ pub trait SourceText<'a>: Default + Clone + ToString { fn ends_with(&self, value: &str) -> bool; /// Returns an iterator over the char indices in the text. - fn char_indices(&self) -> impl Iterator<Item = (usize, char)>; - - /// Returns an iterator over the [`char`]s of a string slice. - fn chars(&self) -> impl DoubleEndedIterator<Item = char>; + fn char_indices(&self) -> impl DoubleEndedIterator<Item = (usize, char)>; /// Gets the byte at the specified index, if it exists. fn get_byte(&self, byte_index: usize) -> Option<u8>; @@ -1292,14 +1289,10 @@ impl<'a> SourceText<'a> for Rope<'a> { (*self).ends_with(value) } - fn char_indices(&self) -> impl Iterator<Item = (usize, char)> { + fn char_indices(&self) -> impl DoubleEndedIterator<Item = (usize, char)> { self.char_indices() } - fn chars(&self) -> impl DoubleEndedIterator<Item = char> { - (*self).chars() - } - fn byte_slice(&self, range: Range<usize>) -> Self { self.byte_slice(range) } @@ -1338,14 +1331,10 @@ impl<'a> SourceText<'a> for &'a str { (*self).ends_with(value) } - fn char_indices(&self) -> impl Iterator<Item = (usize, char)> { + fn char_indices(&self) -> impl DoubleEndedIterator<Item = (usize, char)> { (*self).char_indices() } - fn chars(&self) -> impl DoubleEndedIterator<Item = char> { - (*self).chars() - } - fn byte_slice(&self, range: Range<usize>) -> Self { self.get(range).unwrap_or_default() } diff --git a/src/rope.rs b/src/rope.rs index 2a43ce3..17fdcdc 100644 --- a/src/rope.rs +++ b/src/rope.rs @@ -2,11 +2,9 @@ use std::{ borrow::Cow, - collections::VecDeque, hash::Hash, ops::{Bound, RangeBounds}, rc::Rc, - str::Chars, }; use crate::Error; @@ -135,32 +133,22 @@ impl<'a> Rope<'a> { iter: s.char_indices(), }, }, - Repr::Full(data) => CharIndices { - iter: CharIndicesEnum::Full { - chunks: data, - char_indices: VecDeque::new(), - chunk_index: 0, - }, - }, - } - } - - /// Returns an iterator over the [`char`]s of a string slice. - #[inline(always)] - pub fn chars(&self) -> RopeChars<'_> { - match &self.repr { - Repr::Light(s) => RopeChars { - iters: vec![s.chars()], - left: 0, - right: 0, - }, - Repr::Full(data) => { - let iters = data.iter().map(|(s, _)| s.chars()).collect::<Vec<_>>(); - let len = iters.len(); - RopeChars { - iters, - left: 0, - right: (len - 1) as u32, + Repr::Full(vec) => { + let right_byte_offset = vec.iter().map(|(s, _)| s.len() as u32).sum(); + + CharIndices { + iter: CharIndicesEnum::Full { + iters: vec + .iter() + .map(|(s, _)| s.char_indices()) + .collect::<Vec<_>>(), + left_chunk_index: 0, + left_byte_offset: 0, + last_left_indice: None, + right_chunk_index: (vec.len() - 1) as u32, + right_byte_offset, + right_byte_offset_for: vec.len() as u32, + }, } } } @@ -680,9 +668,13 @@ enum CharIndicesEnum<'a, 'b> { iter: std::str::CharIndices<'b>, }, Full { - chunks: &'a [(&'b str, usize)], - char_indices: VecDeque<(usize, char)>, - chunk_index: usize, + iters: Vec<std::str::CharIndices<'a>>, + left_chunk_index: u32, + left_byte_offset: u32, + last_left_indice: Option<(usize, char)>, + right_chunk_index: u32, + right_byte_offset: u32, + right_byte_offset_for: u32, }, } @@ -697,29 +689,59 @@ impl Iterator for CharIndices<'_, '_> { match &mut self.iter { CharIndicesEnum::Light { iter } => iter.next(), CharIndicesEnum::Full { - chunks, - char_indices, - chunk_index, + iters, + left_chunk_index, + left_byte_offset, + last_left_indice, + .. } => { - if let Some(item) = char_indices.pop_front() { - return Some(item); - } - - if *chunk_index >= chunks.len() { + if (*left_chunk_index as usize) >= iters.len() { return None; } - - // skip empty chunks - while *chunk_index < chunks.len() && chunks[*chunk_index].0.is_empty() { - *chunk_index += 1; + if let Some((byte_index, char)) = + iters[*left_chunk_index as usize].next() + { + *last_left_indice = Some((byte_index, char)); + Some((byte_index + (*left_byte_offset as usize), char)) + } else { + *left_chunk_index += 1; + if let Some((byte_index, char)) = last_left_indice.take() { + *left_byte_offset = + *left_byte_offset + byte_index as u32 + char.len_utf8() as u32; + } + self.next() } + } + } + } +} - let (chunk, start_pos) = chunks[*chunk_index]; - - char_indices - .extend(chunk.char_indices().map(|(i, c)| (start_pos + i, c))); - *chunk_index += 1; - char_indices.pop_front() +impl DoubleEndedIterator for CharIndices<'_, '_> { + fn next_back(&mut self) -> Option<Self::Item> { + match &mut self.iter { + CharIndicesEnum::Light { iter } => iter.next_back(), + CharIndicesEnum::Full { + iters, + right_chunk_index, + right_byte_offset, + right_byte_offset_for, + .. + } => { + if let Some((byte_index, char)) = + iters[*right_chunk_index as usize].next_back() + { + if *right_byte_offset_for != *right_chunk_index { + *right_byte_offset = + *right_byte_offset - byte_index as u32 - char.len_utf8() as u32; + *right_byte_offset_for = *right_chunk_index; + } + Some((byte_index + (*right_byte_offset as usize), char)) + } else if *right_chunk_index > 0 { + *right_chunk_index -= 1; + self.next_back() + } else { + None + } } } } @@ -961,46 +983,6 @@ fn end_bound_to_range_end(end: Bound<&usize>) -> Option<usize> { } } -pub struct RopeChars<'a> { - iters: Vec<Chars<'a>>, - left: u32, - right: u32, -} - -impl Iterator for RopeChars<'_> { - type Item = char; - - #[inline] - fn next(&mut self) -> Option<char> { - let left = self.left as usize; - if left >= self.iters.len() { - return None; - } - if let Some(char) = self.iters[left].next() { - Some(char) - } else { - self.left += 1; - self.next() - } - } -} - -impl DoubleEndedIterator for RopeChars<'_> { - #[inline] - fn next_back(&mut self) -> Option<Self::Item> { - let right = self.right as usize; - if right == 0 { - return self.iters[right].next_back(); - } - if let Some(char) = self.iters[right].next_back() { - Some(char) - } else { - self.right -= 1; - self.next_back() - } - } -} - #[cfg(test)] mod tests { use std::rc::Rc; @@ -1230,6 +1212,29 @@ mod tests { ); } + #[test] + fn reverse_char_indices() { + let mut a = Rope::new(); + a.add("abc"); + a.add("def"); + assert_eq!( + a.char_indices().rev().collect::<Vec<_>>(), + "abcdef".char_indices().rev().collect::<Vec<_>>() + ); + + let mut a = Rope::new(); + a.add("こんにちは"); + assert_eq!( + a.char_indices().rev().collect::<Vec<_>>(), + "こんにちは".char_indices().rev().collect::<Vec<_>>() + ); + a.add("世界"); + assert_eq!( + a.char_indices().rev().collect::<Vec<_>>(), + "こんにちは世界".char_indices().rev().collect::<Vec<_>>() + ); + } + #[test] fn lines1() { let rope = Rope::from("abc"); @@ -1306,38 +1311,4 @@ mod tests { .collect::<Vec<_>>(); assert_eq!(lines, ["\n"]); } - - #[test] - fn chars() { - let rope = Rope::from("abc"); - let mut chars = rope.chars(); - assert_eq!(chars.next(), Some('a')); - assert_eq!(chars.next(), Some('b')); - assert_eq!(chars.next(), Some('c')); - assert_eq!(chars.next(), None); - - let rope = Rope::from_iter(["a", "b", "c"]); - let mut chars = rope.chars(); - assert_eq!(chars.next(), Some('a')); - assert_eq!(chars.next(), Some('b')); - assert_eq!(chars.next(), Some('c')); - assert_eq!(chars.next(), None); - } - - #[test] - fn reverse_chars() { - let rope = Rope::from("abc"); - let mut chars = rope.chars().rev(); - assert_eq!(chars.next(), Some('c')); - assert_eq!(chars.next(), Some('b')); - assert_eq!(chars.next(), Some('a')); - assert_eq!(chars.next(), None); - - let rope = Rope::from_iter(["a", "b", "c"]); - let mut chars = rope.chars().rev(); - assert_eq!(chars.next(), Some('c')); - assert_eq!(chars.next(), Some('b')); - assert_eq!(chars.next(), Some('a')); - assert_eq!(chars.next(), None); - } } diff --git a/src/with_indices.rs b/src/with_indices.rs index 68c2be9..819d42a 100644 --- a/src/with_indices.rs +++ b/src/with_indices.rs @@ -49,7 +49,7 @@ where let (last_char_index, last_byte_index) = self.last_char_index_to_byte_index.get(); - let mut byte_index = last_byte_index as usize; + let byte_index = last_byte_index as usize; let mut char_index = last_char_index as usize; if start_char_index >= last_char_index as usize @@ -85,18 +85,17 @@ where // will always lie on UTF-8 sequence boundaries. self.line.byte_slice_unchecked(0..byte_index) }; - for char in slice.chars().rev() { - byte_index -= char.len_utf8(); - char_index -= 1; + for (byte_index, char) in slice.char_indices().rev() { if char_index == end_char_index { - end_byte_index = Some(byte_index); + end_byte_index = Some(byte_index + char.len_utf8()); if start_byte_index.is_some() { break; } } else if char_index == start_char_index { - start_byte_index = Some(byte_index); + start_byte_index = Some(byte_index + char.len_utf8()); break; } + char_index -= 1; } } From 212bbc8a6de6dabf9cc00f941116c9a519f5a052 Mon Sep 17 00:00:00 2001 From: Cong-Cong <dacongsama@live.com> Date: Thu, 2 Jan 2025 10:04:30 +0800 Subject: [PATCH 13/14] fix: remove generate generated_column --- src/decoder.rs | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/decoder.rs b/src/decoder.rs index 9cfd624..4622924 100644 --- a/src/decoder.rs +++ b/src/decoder.rs @@ -39,7 +39,6 @@ pub(crate) struct MappingsDecoder<'a> { current_value: i64, current_value_pos: usize, generated_line: u32, - generated_column: i64, } impl<'a> MappingsDecoder<'a> { @@ -52,7 +51,6 @@ impl<'a> MappingsDecoder<'a> { current_value: 0, current_value_pos: 0, generated_line: 1, - generated_column: -1, } } } @@ -95,12 +93,10 @@ impl Iterator for MappingsDecoder<'_> { }), _ => None, }; - self.generated_column = self.current_data[0] as i64; self.current_data_pos = 0; if value == SEM { self.generated_line += 1; self.current_data[0] = 0; - self.generated_column = -1; } if mapping.is_some() { return mapping; From b201deef5389d4cfe8bddb451921944a8135f9ee Mon Sep 17 00:00:00 2001 From: Cong-Cong <dacongsama@live.com> Date: Thu, 2 Jan 2025 12:36:11 +0800 Subject: [PATCH 14/14] perf --- src/decoder.rs | 51 ++++++++++++++++++++++++-------------------------- 1 file changed, 24 insertions(+), 27 deletions(-) diff --git a/src/decoder.rs b/src/decoder.rs index 4622924..38c64eb 100644 --- a/src/decoder.rs +++ b/src/decoder.rs @@ -65,42 +65,39 @@ impl Iterator for MappingsDecoder<'_> { continue; } if (value & COM) != 0 { - let mapping = match self.current_data_pos { - 1 => Some(Mapping { - generated_line: self.generated_line, - generated_column: self.current_data[0], - original: None, - }), - 4 => Some(Mapping { - generated_line: self.generated_line, - generated_column: self.current_data[0], - original: Some(OriginalLocation { + let mut mapping = Mapping { + generated_line: self.generated_line, + generated_column: self.current_data[0], + original: None, + }; + let current_data_pos = self.current_data_pos; + self.current_data_pos = 0; + if value == SEM { + self.generated_line += 1; + self.current_data[0] = 0; + } + match current_data_pos { + 1 => return Some(mapping), + 4 => { + mapping.original = Some(OriginalLocation { source_index: self.current_data[1], original_line: self.current_data[2], original_column: self.current_data[3], name_index: None, - }), - }), - 5 => Some(Mapping { - generated_line: self.generated_line, - generated_column: self.current_data[0], - original: Some(OriginalLocation { + }); + return Some(mapping); + } + 5 => { + mapping.original = Some(OriginalLocation { source_index: self.current_data[1], original_line: self.current_data[2], original_column: self.current_data[3], name_index: Some(self.current_data[4]), - }), - }), - _ => None, + }); + return Some(mapping); + } + _ => (), }; - self.current_data_pos = 0; - if value == SEM { - self.generated_line += 1; - self.current_data[0] = 0; - } - if mapping.is_some() { - return mapping; - } } else if (value & CONTINUATION_BIT) == 0 { // last sextet self.current_value |= (value as i64) << self.current_value_pos;