perf: faster lines iterator for Rope (#145)

* refactor: init * refactor: faster lines * refactor: try * chore: clippy * test: more * chore: more
web-infra-dev · Dec 12, 2024 · c773540 · c773540
1 parent b08c1b7
commit c773540
Show file tree

Hide file tree

Showing 2 changed files with 284 additions and 6 deletions.
diff --git a/src/helpers.rs b/src/helpers.rs
@@ -5,7 +5,6 @@ use std::{
   ops::Range,
 };
 
-use itertools::Either;
 use rustc_hash::FxHashMap as HashMap;
 
 use crate::{
@@ -201,7 +200,7 @@ const EMPTY_ROPE: Rope = Rope::new();
 /// Split the string with a needle, each string will contain the needle.
 ///
 /// Copied and modified from https://github.com/rust-lang/cargo/blob/30efe860c0e4adc1a6d7057ad223dc6e47d34edf/src/cargo/sources/registry/index.rs#L1048-L1072
-pub fn split<'a>(
+fn split<'a>(
   haystack: &Rope<'a>,
   needle: u8,
 ) -> impl Iterator<Item = Rope<'a>> {
@@ -1324,10 +1323,10 @@ pub trait SourceText<'a>: Default + Clone + ToString {
 
 impl<'a> SourceText<'a> for Rope<'a> {
   fn split_into_lines(&self) -> impl Iterator<Item = Self> {
-    if let Some(s) = self.get_simple() {
-      return Either::Left(split_str(s, b'\n').map(Rope::from));
-    }
-    Either::Right(split(self, b'\n'))
+    // Split the text into lines; each line keeps its line-ending character.
+    // If the text ends with a newline, no trailing empty line is produced.
+    // For example: "abc\nefg\n" => ["abc\n", "efg\n"]
+    self.lines_impl(false)
   }
 
   #[inline]

diff --git a/src/rope.rs b/src/rope.rs
@@ -419,6 +419,34 @@ impl<'a> Rope<'a> {
     }
   }
 
+  /// Iterates over the rope line by line.
+  ///
+  /// Every yielded line keeps its terminating `'\n'`. This entry point always
+  /// asks `lines_impl` to treat a trailing line break as the start of one
+  /// more (empty) line, e.g. `"abc\ndef\n"` yields `"abc\n"`, `"def\n"`, `""`.
+  pub fn lines(&self) -> Lines<'_, 'a> {
+    let trailing_line_break_as_newline = true;
+    self.lines_impl(trailing_line_break_as_newline)
+  }
+
+  /// Creates the line iterator for this rope.
+  ///
+  /// When `trailing_line_break_as_newline` is `true` and the rope ends with
+  /// `'\n'` (or is empty), the iterator yields one extra empty line at the
+  /// end; when it is `false`, iteration stops right after the last line.
+  pub(crate) fn lines_impl(
+    &self,
+    trailing_line_break_as_newline: bool,
+  ) -> Lines<'_, 'a> {
+    // Pick the read cursor that matches the rope's internal representation;
+    // both cursors start at the very beginning of the text.
+    let iter = match &self.repr {
+      Repr::Simple(s) => LinesEnum::Simple(s),
+      Repr::Complex(data) => LinesEnum::Complex {
+        iter: data,
+        chunk_idx: 0,
+        in_chunk_byte_idx: 0,
+      },
+    };
+    Lines {
+      iter,
+      trailing_line_break_as_newline,
+      total_bytes: self.len(),
+      byte_idx: 0,
+      ended: false,
+    }
+  }
+
   /// Converts the rope to bytes.
   ///
   /// Returns borrowed bytes for simple ropes and owned bytes for complex ropes.
@@ -457,6 +485,188 @@ impl Hash for Rope<'_> {
   }
 }
 
+/// Read cursor used by `Lines`, with one variant per rope representation.
+///
+/// `'rope` is the borrow of the rope's chunk list, `'text` is the lifetime of
+/// the string data itself.
+enum LinesEnum<'rope, 'text> {
+  /// The rope is backed by a single string slice.
+  Simple(&'text str),
+  /// The rope is backed by a list of string chunks.
+  Complex {
+    /// The chunk list; each entry pairs a chunk with a `usize`
+    /// (the line iterator stores the chunk's starting byte offset there
+    /// when it builds multi-chunk lines).
+    iter: &'rope Vec<(&'text str, usize)>,
+    /// Index of the chunk currently being read.
+    chunk_idx: usize,
+    /// Byte position of the cursor inside the chunk at `chunk_idx`.
+    in_chunk_byte_idx: usize,
+  },
+}
+
+/// Iterator over the lines of a `Rope`; each item is itself a `Rope`.
+///
+/// `'rope` is the borrow of the rope being iterated, `'text` is the lifetime
+/// of the underlying string data (and of the yielded lines).
+pub struct Lines<'rope, 'text> {
+  /// Representation-specific read cursor.
+  iter: LinesEnum<'rope, 'text>,
+  /// Byte counter that is compared against `total_bytes` to detect the end
+  /// of the rope.
+  byte_idx: usize,
+  /// Set once the final line has been handed out; afterwards `next` returns
+  /// `None`.
+  ended: bool,
+  /// Length of the rope in bytes, captured when the iterator is created.
+  total_bytes: usize,
+
+  /// Whether a rope that ends with `'\n'` yields one more, empty, line.
+  trailing_line_break_as_newline: bool,
+}
+
+/// Line-by-line iteration over a rope.
+///
+/// Each item is one line including its terminating `'\n'`; the final line is
+/// yielded without one when the rope does not end with a line break. If
+/// `trailing_line_break_as_newline` is set, a rope that ends with `'\n'` (or
+/// is empty) yields one additional empty line.
+///
+/// Invariant: `byte_idx` is exactly the number of bytes yielded so far, so
+/// `byte_idx == total_bytes` means the text is exhausted.
+impl<'a> Iterator for Lines<'_, 'a> {
+  type Item = Rope<'a>;
+
+  fn next(&mut self) -> Option<Self::Item> {
+    if self.ended {
+      return None;
+    }
+    // All bytes have been handed out. The previous line (if any) ended with
+    // '\n', so optionally report the empty line that follows it.
+    if self.byte_idx == self.total_bytes {
+      if self.trailing_line_break_as_newline {
+        self.ended = true;
+        return Some(Rope::from(""));
+      }
+      return None;
+    }
+
+    match &mut self.iter {
+      LinesEnum::Simple(s) => {
+        let text: &'a str = *s;
+        let start = self.byte_idx;
+        match memchr::memchr(b'\n', &text.as_bytes()[start..]) {
+          Some(idx) => {
+            let end = start + idx + 1;
+            self.byte_idx = end;
+            Some(Rope::from(&text[start..end]))
+          }
+          None => {
+            // Unterminated final line: hand out the rest and stop.
+            self.ended = true;
+            Some(Rope::from(&text[start..]))
+          }
+        }
+      }
+      LinesEnum::Complex {
+        iter,
+        in_chunk_byte_idx,
+        chunk_idx,
+      } => {
+        let chunks = *iter;
+
+        // Step over chunks that are fully consumed (or empty) so the cursor
+        // points at the first unread byte.
+        while *chunk_idx < chunks.len()
+          && *in_chunk_byte_idx == chunks[*chunk_idx].0.len()
+        {
+          *chunk_idx += 1;
+          *in_chunk_byte_idx = 0;
+        }
+        if *chunk_idx == chunks.len() {
+          // Only reachable if `total_bytes` disagrees with the chunk
+          // lengths; stop instead of indexing out of bounds.
+          self.ended = true;
+          return None;
+        }
+
+        let start_chunk_idx = *chunk_idx;
+        let (first_chunk, _) = chunks[start_chunk_idx];
+        let rest = &first_chunk[*in_chunk_byte_idx..];
+
+        // Fast path: the line ends inside the current chunk.
+        if let Some(idx) = memchr::memchr(b'\n', rest.as_bytes()) {
+          let line_len = idx + 1;
+          *in_chunk_byte_idx += line_len;
+          // Advance by the bytes consumed, not by the in-chunk offset.
+          self.byte_idx += line_len;
+          return Some(Rope::from(&rest[..line_len]));
+        }
+
+        // The line continues past this chunk: locate the chunk holding the
+        // next newline, as (chunk index, end offset inside that chunk).
+        let newline_pos = (start_chunk_idx + 1..chunks.len()).find_map(|i| {
+          memchr::memchr(b'\n', chunks[i].0.as_bytes()).map(|idx| (i, idx + 1))
+        });
+        let last_chunk_idx = chunks.len() - 1;
+        let (end_chunk_idx, end_in_chunk_byte_idx) = match newline_pos {
+          Some(end) => end,
+          None => {
+            // No newline is left: this is the unterminated final line and
+            // it runs to the end of the last chunk. Mark the iterator as
+            // finished so the next call neither reads past the chunk list
+            // nor reports a spurious empty line.
+            self.ended = true;
+            (last_chunk_idx, chunks[last_chunk_idx].0.len())
+          }
+        };
+
+        if end_chunk_idx == start_chunk_idx {
+          // Only the tail of the last chunk is left; a plain slice is enough.
+          *in_chunk_byte_idx = end_in_chunk_byte_idx;
+          self.byte_idx += rest.len();
+          return Some(Rope::from(rest));
+        }
+
+        // Assemble the line from: the tail of the first chunk, every chunk
+        // strictly in between, and the head of the end chunk. Each piece is
+        // stored with its byte offset inside the assembled line.
+        let mut raw = Vec::with_capacity(end_chunk_idx - start_chunk_idx + 1);
+        raw.push((rest, 0));
+        let mut len = rest.len();
+        for &(chunk, _) in &chunks[start_chunk_idx + 1..end_chunk_idx] {
+          raw.push((chunk, len));
+          len += chunk.len();
+        }
+        let (end_chunk, _) = chunks[end_chunk_idx];
+        raw.push((&end_chunk[..end_in_chunk_byte_idx], len));
+        len += end_in_chunk_byte_idx;
+
+        // Move the cursor just past the line and account for its bytes
+        // exactly once.
+        *chunk_idx = end_chunk_idx;
+        *in_chunk_byte_idx = end_in_chunk_byte_idx;
+        self.byte_idx += len;
+        Some(Rope {
+          repr: Repr::Complex(Rc::new(raw)),
+        })
+      }
+    }
+  }
+}
+
 enum CharIndicesRepr<'a, 'b> {
   Simple {
     iter: std::str::CharIndices<'b>,
@@ -908,4 +1118,73 @@ mod tests {
       "こんにちは世界".char_indices().collect::<Vec<_>>()
     );
   }
+
+  #[test]
+  fn lines1() {
+    // (input, expected lines) for ropes built from a single string.
+    // A trailing newline ('\n') produces a final empty line; input that does
+    // not end with a newline does not.
+    let cases: [(&str, &[&str]); 6] = [
+      ("abc", &["abc"]),
+      ("abc\ndef\n", &["abc\n", "def\n", ""]),
+      ("abc\ndef", &["abc\n", "def"]),
+      ("Test\nTest\nTest\n", &["Test\n", "Test\n", "Test\n", ""]),
+      ("\n", &["\n", ""]),
+      ("\n\n", &["\n", "\n", ""]),
+    ];
+
+    for (text, expected) in cases {
+      let rope = Rope::from(text);
+      let lines = rope.lines().collect::<Vec<_>>();
+      assert_eq!(lines, expected);
+    }
+  }
+
+  #[test]
+  fn lines2() {
+    // (chunks, expected lines) for ropes assembled from several chunks.
+    // Lines may end inside a chunk or span chunk boundaries; a trailing
+    // newline ('\n') still produces a final empty line.
+    let cases: [(&[&str], &[&str]); 5] = [
+      (&["abc\n", "def\n", "ghi\n"], &["abc\n", "def\n", "ghi\n", ""]),
+      (&["abc\n", "def\n", "ghi"], &["abc\n", "def\n", "ghi"]),
+      (&["abc\ndef", "ghi\n", "jkl"], &["abc\n", "defghi\n", "jkl"]),
+      (&["a\nb", "c\n", "d\n"], &["a\n", "bc\n", "d\n", ""]),
+      (&["\n"], &["\n", ""]),
+    ];
+
+    for (chunks, expected) in cases {
+      let rope = Rope::from_iter(chunks.iter().copied());
+      let lines = rope.lines().collect::<Vec<_>>();
+      assert_eq!(lines, expected);
+    }
+  }
+
+  #[test]
+  fn lines_with_trailing_line_break_as_newline() {
+    // With the flag switched off, a trailing newline must not produce an
+    // extra empty line.
+    let trailing_line_break_as_newline = false;
+
+    for (text, expected) in [("abc\n", ["abc\n"]), ("\n", ["\n"])] {
+      let rope = Rope::from(text);
+      let lines: Vec<_> =
+        rope.lines_impl(trailing_line_break_as_newline).collect();
+      assert_eq!(lines, expected);
+    }
+  }
 }