Skip to content

Commit

Permalink
perf: faster lines iterator for Rope (#145)
Browse files Browse the repository at this point in the history
* refactor: init

* refactor: faster lines

* refactor: try

* chore: clippy

* test: more

* chore: more
  • Loading branch information
h-a-n-a authored Dec 12, 2024
1 parent b08c1b7 commit c773540
Show file tree
Hide file tree
Showing 2 changed files with 284 additions and 6 deletions.
11 changes: 5 additions & 6 deletions src/helpers.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@ use std::{
ops::Range,
};

use itertools::Either;
use rustc_hash::FxHashMap as HashMap;

use crate::{
Expand Down Expand Up @@ -201,7 +200,7 @@ const EMPTY_ROPE: Rope = Rope::new();
/// Split the string with a needle, each string will contain the needle.
///
/// Copied and modified from https://github.com/rust-lang/cargo/blob/30efe860c0e4adc1a6d7057ad223dc6e47d34edf/src/cargo/sources/registry/index.rs#L1048-L1072
pub fn split<'a>(
fn split<'a>(
haystack: &Rope<'a>,
needle: u8,
) -> impl Iterator<Item = Rope<'a>> {
Expand Down Expand Up @@ -1324,10 +1323,10 @@ pub trait SourceText<'a>: Default + Clone + ToString {

impl<'a> SourceText<'a> for Rope<'a> {
fn split_into_lines(&self) -> impl Iterator<Item = Self> {
if let Some(s) = self.get_simple() {
return Either::Left(split_str(s, b'\n').map(Rope::from));
}
Either::Right(split(self, b'\n'))
// Split the text into lines, keeping each line's trailing '\n'.
// If the text ends with a newline, no extra empty line is emitted after it.
// For example: "abc\nefg\n" => ["abc\n", "efg\n"]
self.lines_impl(false)
}

#[inline]
Expand Down
279 changes: 279 additions & 0 deletions src/rope.rs
Original file line number Diff line number Diff line change
Expand Up @@ -419,6 +419,34 @@ impl<'a> Rope<'a> {
}
}

/// Iterates over the lines of this rope.
///
/// Every yielded line keeps its terminating `'\n'`. When the rope ends with
/// `'\n'`, one final empty rope is yielded after it, e.g.
/// `"abc\ndef\n"` produces `["abc\n", "def\n", ""]`.
pub fn lines(&self) -> Lines<'_, 'a> {
    // The public iterator always reports the empty line that follows a
    // trailing line break.
    const TRAILING_LINE_BREAK_AS_NEWLINE: bool = true;
    self.lines_impl(TRAILING_LINE_BREAK_AS_NEWLINE)
}

/// Builds the line iterator for this rope, positioned at its first byte.
///
/// Every yielded line keeps its terminating `'\n'`.
///
/// When `trailing_line_break_as_newline` is `true`, a rope that ends with
/// `'\n'` yields one extra empty rope after the last line; when it is
/// `false`, iteration stops right after the last line.
pub(crate) fn lines_impl(
    &self,
    trailing_line_break_as_newline: bool,
) -> Lines<'_, 'a> {
    // Pick the cursor flavour that matches the underlying representation.
    let iter = match &self.repr {
        Repr::Simple(text) => LinesEnum::Simple(text),
        Repr::Complex(chunks) => LinesEnum::Complex {
            iter: chunks,
            in_chunk_byte_idx: 0,
            chunk_idx: 0,
        },
    };
    let total_bytes = self.len();

    Lines {
        iter,
        byte_idx: 0,
        ended: false,
        total_bytes,
        trailing_line_break_as_newline,
    }
}

/// Converts the rope to bytes.
///
/// Returns borrowed bytes for simple ropes and owned bytes for complex ropes.
Expand Down Expand Up @@ -457,6 +485,188 @@ impl Hash for Rope<'_> {
}
}

/// Representation-specific cursor state used by [`Lines`].
///
/// `'a` is the borrow of the rope being iterated, `'b` is the lifetime of
/// the string slices the rope is made of.
enum LinesEnum<'a, 'b> {
/// The rope is backed by a single string slice; the position inside it is
/// tracked by `Lines::byte_idx`.
Simple(&'b str),
/// The rope is backed by a list of chunks.
Complex {
/// The chunks being walked. Only the string half of each pair is read by
/// the iterator.
/// NOTE(review): the `usize` looks like the chunk's starting byte offset
/// within its rope (that is how `Lines::next` fills it in) — confirm.
iter: &'a Vec<(&'b str, usize)>,
/// Byte offset of the cursor inside the current chunk.
in_chunk_byte_idx: usize,
/// Index of the current chunk within `iter`.
chunk_idx: usize,
},
}

/// Iterator over the lines of a `Rope`; each item is itself a `Rope` that
/// keeps its terminating `'\n'`.
///
/// Created by `Rope::lines` and `Rope::lines_impl`.
pub struct Lines<'a, 'b> {
/// Representation-specific cursor (single string or list of chunks).
iter: LinesEnum<'a, 'b>,
/// Byte position of the cursor. For the simple representation this is the
/// slicing offset into the string; in both representations it is compared
/// with `total_bytes` to detect that the rope has been fully consumed.
byte_idx: usize,
/// Set once the final item has been produced; `next` returns `None` afterwards.
ended: bool,
/// Length of the rope in bytes, captured when the iterator was created.
total_bytes: usize,

/// Whether to treat the end of the rope with ('\n') as an empty newline,
/// i.e. whether one extra empty rope is yielded after a trailing '\n'.
trailing_line_break_as_newline: bool,
}

/// Yields the lines of a rope one by one, each as its own `Rope`.
///
/// A line includes its terminating `'\n'`. The last item is whatever follows
/// the final `'\n'`; if nothing follows it, an empty rope is yielded only when
/// `trailing_line_break_as_newline` is set.
///
/// Invariant: `byte_idx` always equals the total number of bytes already
/// returned to the caller, so `byte_idx == total_bytes` means the whole rope
/// has been consumed.
impl<'a> Iterator for Lines<'_, 'a> {
    type Item = Rope<'a>;

    fn next(&mut self) -> Option<Self::Item> {
        match *self {
            Lines {
                iter: LinesEnum::Simple(s),
                ref mut byte_idx,
                ref mut ended,
                ref total_bytes,
                trailing_line_break_as_newline,
                ..
            } => {
                if *ended {
                    return None;
                } else if *byte_idx == *total_bytes {
                    // Everything was consumed and the text ended with '\n'
                    // (or was empty): optionally report the empty last line.
                    if trailing_line_break_as_newline {
                        *ended = true;
                        return Some(Rope::from(""));
                    }
                    return None;
                } else if let Some(idx) =
                    memchr::memchr(b'\n', &s.as_bytes()[*byte_idx..])
                {
                    let end = *byte_idx + idx + 1;
                    let rope = Rope::from(&s[*byte_idx..end]);
                    *byte_idx = end;
                    return Some(rope);
                }
                // No newline left: the remainder is the last line.
                *ended = true;
                Some(Rope::from(&s[*byte_idx..]))
            }
            Lines {
                iter:
                    LinesEnum::Complex {
                        iter: chunks,
                        ref mut in_chunk_byte_idx,
                        ref mut chunk_idx,
                    },
                ref mut byte_idx,
                ref mut ended,
                ref total_bytes,
                trailing_line_break_as_newline,
            } => {
                if *ended {
                    return None;
                } else if *byte_idx == *total_bytes {
                    // Everything was consumed and the rope ended with '\n'
                    // (or was empty): optionally report the empty last line.
                    if trailing_line_break_as_newline {
                        *ended = true;
                        return Some(Rope::from(""));
                    }
                    return None;
                }

                debug_assert!(*chunk_idx < chunks.len());
                let &(chunk, _) = &chunks[*chunk_idx];

                // Always try to find a newline in the current chunk first;
                // if there is one, the line lives entirely inside this chunk.
                if let Some(idx) =
                    memchr::memchr(b'\n', &chunk.as_bytes()[*in_chunk_byte_idx..])
                {
                    let end = *in_chunk_byte_idx + idx + 1;
                    let rope = Rope::from(&chunk[*in_chunk_byte_idx..end]);
                    // Account only for the bytes of this line. Adding the
                    // absolute in-chunk offset would over-count as soon as a
                    // chunk holds more than one line.
                    *byte_idx += end - *in_chunk_byte_idx;
                    *in_chunk_byte_idx = end;
                    return Some(rope);
                }

                // The last chunk has no newline left: its remainder is the
                // final line of the rope.
                if *chunk_idx == chunks.len() - 1 {
                    *ended = true;
                    *byte_idx += chunk.len() - *in_chunk_byte_idx;
                    return Some(Rope::from(&chunk[*in_chunk_byte_idx..]));
                }

                // The current chunk is exhausted: continue with the next one.
                if *in_chunk_byte_idx == chunk.len() {
                    *chunk_idx += 1;
                    *in_chunk_byte_idx = 0;
                    return self.next();
                }

                // The line starts in this chunk and spans into later chunks.
                let start_chunk_idx = *chunk_idx;
                let start_in_chunk_byte_idx = *in_chunk_byte_idx;

                // Advance the cursor to just past the next newline, if any.
                // `byte_idx` is deliberately left untouched here; it is
                // advanced once below by the length of the assembled line.
                let end_info = loop {
                    if *chunk_idx == chunks.len() {
                        break None;
                    }
                    let &(chunk, _) = &chunks[*chunk_idx];
                    if let Some(idx) =
                        memchr::memchr(b'\n', &chunk.as_bytes()[*in_chunk_byte_idx..])
                    {
                        *in_chunk_byte_idx += idx + 1;
                        break Some((*chunk_idx, *in_chunk_byte_idx));
                    } else {
                        *in_chunk_byte_idx = 0;
                        *chunk_idx += 1;
                    }
                };

                let (stop_chunk_idx, stop_in_chunk_byte_idx) = match end_info {
                    Some(found) => found,
                    None => {
                        // No newline in any remaining chunk: the rest of the
                        // rope is the final line. Mark the iterator as ended,
                        // otherwise the next call would index past the last
                        // chunk.
                        *ended = true;
                        let last_chunk_idx = chunks.len() - 1;
                        (last_chunk_idx, chunks[last_chunk_idx].0.len())
                    }
                };

                // Assemble the line from the tail of the first chunk, all
                // chunks in between, and the head of the last chunk. Each
                // piece is stored together with its byte offset in the line.
                let mut raw =
                    Vec::with_capacity(stop_chunk_idx - start_chunk_idx + 1);
                let mut len = 0;
                for i in start_chunk_idx..=stop_chunk_idx {
                    let &(chunk, _) = &chunks[i];
                    let piece = if i == start_chunk_idx {
                        &chunk[start_in_chunk_byte_idx..]
                    } else if i == stop_chunk_idx {
                        &chunk[..stop_in_chunk_byte_idx]
                    } else {
                        chunk
                    };
                    raw.push((piece, len));
                    len += piece.len();
                }

                // Advance the byte index to the end of the line.
                *byte_idx += len;
                Some(Rope {
                    repr: Repr::Complex(Rc::new(raw)),
                })
            }
        }
    }
}

enum CharIndicesRepr<'a, 'b> {
Simple {
iter: std::str::CharIndices<'b>,
Expand Down Expand Up @@ -908,4 +1118,73 @@ mod tests {
"こんにちは世界".char_indices().collect::<Vec<_>>()
);
}

#[test]
fn lines1() {
    // (input, expected lines) for ropes backed by a single string.
    let cases: [(&str, &[&str]); 6] = [
        ("abc", &["abc"]),
        // empty line at the end if the line before ends with a newline ('\n')
        ("abc\ndef\n", &["abc\n", "def\n", ""]),
        // no empty line at the end if the line before does not end with a newline ('\n')
        ("abc\ndef", &["abc\n", "def"]),
        ("Test\nTest\nTest\n", &["Test\n", "Test\n", "Test\n", ""]),
        ("\n", &["\n", ""]),
        ("\n\n", &["\n", "\n", ""]),
    ];

    for (text, expected) in cases {
        let rope = Rope::from(text);
        let lines: Vec<_> = rope.lines().collect();
        assert_eq!(lines, expected);
    }
}

#[test]
fn lines2() {
    // (chunks, expected lines) for ropes assembled from several chunks.
    let cases: [(&[&str], &[&str]); 5] = [
        // empty line at the end if the line before ends with a newline ('\n')
        (
            &["abc\n", "def\n", "ghi\n"],
            &["abc\n", "def\n", "ghi\n", ""],
        ),
        (&["abc\n", "def\n", "ghi"], &["abc\n", "def\n", "ghi"]),
        // a line may span a chunk boundary
        (&["abc\ndef", "ghi\n", "jkl"], &["abc\n", "defghi\n", "jkl"]),
        (&["a\nb", "c\n", "d\n"], &["a\n", "bc\n", "d\n", ""]),
        (&["\n"], &["\n", ""]),
    ];

    for (chunks, expected) in cases {
        let rope = Rope::from_iter(chunks.iter().copied());
        let lines: Vec<_> = rope.lines().collect();
        assert_eq!(lines, expected);
    }
}

#[test]
fn lines_with_trailing_line_break_as_newline() {
    // With the flag off, a trailing '\n' must not produce an extra empty line.
    let trailing_line_break_as_newline = false;

    for (text, expected) in [("abc\n", ["abc\n"]), ("\n", ["\n"])] {
        let rope = Rope::from(text);
        let lines: Vec<_> = rope
            .lines_impl(trailing_line_break_as_newline)
            .collect();
        assert_eq!(lines, expected);
    }
}
}

0 comments on commit c773540

Please sign in to comment.