Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

perf: faster lines iterator for Rope #145

Merged
merged 6 commits into from
Dec 12, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 5 additions & 6 deletions src/helpers.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@ use std::{
ops::Range,
};

use itertools::Either;
use rustc_hash::FxHashMap as HashMap;

use crate::{
Expand Down Expand Up @@ -201,7 +200,7 @@ const EMPTY_ROPE: Rope = Rope::new();
/// Split the string with a needle, each string will contain the needle.
///
/// Copied and modified from https://github.com/rust-lang/cargo/blob/30efe860c0e4adc1a6d7057ad223dc6e47d34edf/src/cargo/sources/registry/index.rs#L1048-L1072
pub fn split<'a>(
fn split<'a>(
haystack: &Rope<'a>,
needle: u8,
) -> impl Iterator<Item = Rope<'a>> {
Expand Down Expand Up @@ -1324,10 +1323,10 @@ pub trait SourceText<'a>: Default + Clone + ToString {

impl<'a> SourceText<'a> for Rope<'a> {
fn split_into_lines(&self) -> impl Iterator<Item = Self> {
if let Some(s) = self.get_simple() {
return Either::Left(split_str(s, b'\n').map(Rope::from));
}
Either::Right(split(self, b'\n'))
// Split the text into lines; each line keeps its terminating '\n'.
// If the text ends with a newline, no extra empty line is yielded after it.
// For example: "abc\nefg\n" => ["abc\n", "efg\n"]
self.lines_impl(false)
}

#[inline]
Expand Down
279 changes: 279 additions & 0 deletions src/rope.rs
Original file line number Diff line number Diff line change
Expand Up @@ -419,6 +419,34 @@ impl<'a> Rope<'a> {
}
}

/// Iterates over the rope one line at a time.
///
/// Every yielded line keeps its terminating `'\n'`. Once all bytes have
/// been yielded and the last line did end with `'\n'`, one additional
/// empty rope is produced, e.g. `"abc\ndef\n"` => `["abc\n", "def\n", ""]`.
/// An empty rope likewise yields a single empty line.
pub fn lines(&self) -> Lines<'_, 'a> {
  // The public iterator always reports the empty line after a final '\n'.
  let trailing_line_break_as_newline = true;
  self.lines_impl(trailing_line_break_as_newline)
}

/// Builds the [`Lines`] iterator for this rope.
///
/// When `trailing_line_break_as_newline` is `true`, a rope whose content
/// ends with `'\n'` yields one extra empty line after the last real line;
/// when it is `false`, iteration stops right after the last real line.
pub(crate) fn lines_impl(
  &self,
  trailing_line_break_as_newline: bool,
) -> Lines<'_, 'a> {
  // Captured once up front; the iterator compares its progress against it.
  let total_bytes = self.len();

  // Pick the cursor matching the rope's internal representation.
  let iter = match &self.repr {
    Repr::Complex(data) => LinesEnum::Complex {
      iter: data,
      in_chunk_byte_idx: 0,
      chunk_idx: 0,
    },
    Repr::Simple(s) => LinesEnum::Simple(s),
  };

  Lines {
    iter,
    byte_idx: 0,
    ended: false,
    total_bytes,
    trailing_line_break_as_newline,
  }
}

/// Converts the rope to bytes.
///
/// Returns borrowed bytes for simple ropes and owned bytes for complex ropes.
Expand Down Expand Up @@ -457,6 +485,188 @@ impl Hash for Rope<'_> {
}
}

/// Representation-specific cursor used by [`Lines`]: one variant per
/// `Repr` variant of the rope being iterated.
enum LinesEnum<'a, 'b> {
/// The rope is backed by a single string slice.
Simple(&'b str),
/// The rope is backed by a list of string chunks.
Complex {
/// The chunk list being walked. The iterator only reads the string part
/// of each entry; the `usize` is presumably the chunk's starting byte
/// offset within the rope (that is what line ropes built by the iterator
/// store there) - TODO confirm against `Rope`.
iter: &'a Vec<(&'b str, usize)>,
/// Byte offset inside the current chunk at which the next line starts.
in_chunk_byte_idx: usize,
/// Index into `iter` of the chunk currently being scanned.
chunk_idx: usize,
},
}

/// Iterator over the lines of a `Rope`, constructed by `Rope::lines_impl`.
///
/// Each item is a `Rope` holding one line, including its terminating
/// `'\n'` when the line has one.
pub struct Lines<'a, 'b> {
/// Representation-specific cursor (simple string or chunk list).
iter: LinesEnum<'a, 'b>,
/// Progress counter; iteration is considered complete once it equals
/// `total_bytes`. For a simple rope it is also the start offset of the
/// next line within the string.
byte_idx: usize,
/// Set once the final item has been produced; `next` returns `None`
/// on every call after that.
ended: bool,
/// `len()` of the rope, captured when the iterator was created.
total_bytes: usize,

/// Whether to treat the end of the rope with ('\n') as an empty newline.
trailing_line_break_as_newline: bool,
}

/// Yields the rope line by line; every line keeps its terminating `'\n'`.
///
/// Invariant maintained by `next`: `byte_idx` is exactly the number of bytes
/// yielded so far. For a complex rope this equals the summed length of all
/// chunks before `chunk_idx` plus `in_chunk_byte_idx`, so
/// `byte_idx == total_bytes` reliably signals that every byte was consumed.
impl<'a> Iterator for Lines<'_, 'a> {
  type Item = Rope<'a>;

  /// Returns the next line, or `None` when the rope is exhausted.
  ///
  /// * A line fully contained in one chunk (or in a simple rope) is returned
  ///   as a rope built from that single slice.
  /// * A line spanning several chunks is returned as a complex rope that
  ///   references the relevant slice of every chunk it touches.
  /// * After the last byte, one extra empty line is returned if the content
  ///   ended with `'\n'` (or was empty) and
  ///   `trailing_line_break_as_newline` is set.
  fn next(&mut self) -> Option<Self::Item> {
    if self.ended {
      return None;
    }

    // Everything has been yielded and the final line was newline-terminated
    // (otherwise `ended` would already be set): optionally report the empty
    // line that follows the last '\n'.
    if self.byte_idx == self.total_bytes {
      if self.trailing_line_break_as_newline {
        self.ended = true;
        return Some(Rope::from(""));
      }
      return None;
    }

    match self.iter {
      LinesEnum::Simple(s) => {
        let start = self.byte_idx;
        let end = match memchr::memchr(b'\n', &s.as_bytes()[start..]) {
          Some(idx) => start + idx + 1,
          None => {
            // Last line without a terminating newline: nothing follows it.
            self.ended = true;
            s.len()
          }
        };
        self.byte_idx = end;
        Some(Rope::from(&s[start..end]))
      }
      LinesEnum::Complex {
        iter: chunks,
        ref mut in_chunk_byte_idx,
        ref mut chunk_idx,
      } => {
        // Skip chunks that are empty or already fully consumed. Since
        // `byte_idx < total_bytes`, a chunk with unread bytes must exist;
        // the `None` arm only guards against an inconsistent `total_bytes`.
        let chunk = loop {
          match chunks.get(*chunk_idx) {
            Some(&(chunk, _)) if *in_chunk_byte_idx < chunk.len() => {
              break chunk;
            }
            Some(_) => {
              *chunk_idx += 1;
              *in_chunk_byte_idx = 0;
            }
            None => {
              self.ended = true;
              return None;
            }
          }
        };
        let start = *in_chunk_byte_idx;

        // Fast path: the line ends inside the current chunk.
        if let Some(idx) = memchr::memchr(b'\n', &chunk.as_bytes()[start..]) {
          let end = start + idx + 1;
          *in_chunk_byte_idx = end;
          // Advance by the length of the line, not by the in-chunk offset.
          self.byte_idx += idx + 1;
          return Some(Rope::from(&chunk[start..end]));
        }

        // No newline left in the last chunk: the remainder is the final line.
        if *chunk_idx + 1 == chunks.len() {
          self.ended = true;
          self.byte_idx += chunk.len() - start;
          *in_chunk_byte_idx = chunk.len();
          return Some(Rope::from(&chunk[start..]));
        }

        // The line spans multiple chunks: take the tail of the current
        // chunk, then whole chunks until one contains a newline. Each piece
        // is stored together with its byte offset inside the new line rope.
        let mut raw = Vec::with_capacity(2);
        raw.push((&chunk[start..], 0));
        let mut len = chunk.len() - start;
        *chunk_idx += 1;
        *in_chunk_byte_idx = 0;

        let mut found_newline = false;
        while let Some(&(piece, _)) = chunks.get(*chunk_idx) {
          if let Some(idx) = memchr::memchr(b'\n', piece.as_bytes()) {
            let end = idx + 1;
            raw.push((&piece[..end], len));
            len += end;
            *in_chunk_byte_idx = end;
            found_newline = true;
            break;
          }
          raw.push((piece, len));
          len += piece.len();
          *chunk_idx += 1;
        }

        // Account for the yielded bytes exactly once.
        self.byte_idx += len;
        if !found_newline {
          // Ran off the end of the rope: this was the final, unterminated
          // line, so no trailing empty line may follow it.
          self.ended = true;
        }
        Some(Rope {
          repr: Repr::Complex(Rc::new(raw)),
        })
      }
    }
  }
}

enum CharIndicesRepr<'a, 'b> {
Simple {
iter: std::str::CharIndices<'b>,
Expand Down Expand Up @@ -908,4 +1118,73 @@ mod tests {
"こんにちは世界".char_indices().collect::<Vec<_>>()
);
}

/// `Rope::lines` on simple (single-string) ropes.
#[test]
fn lines1() {
  // Each case: (input text, expected lines).
  let cases: [(&str, &[&str]); 6] = [
    ("abc", &["abc"]),
    // empty line at the end if the line before ends with a newline ('\n')
    ("abc\ndef\n", &["abc\n", "def\n", ""]),
    // no empty line at the end if the line before does not end with a newline ('\n')
    ("abc\ndef", &["abc\n", "def"]),
    ("Test\nTest\nTest\n", &["Test\n", "Test\n", "Test\n", ""]),
    ("\n", &["\n", ""]),
    ("\n\n", &["\n", "\n", ""]),
  ];

  for &(text, expected) in cases.iter() {
    let rope = Rope::from(text);
    let actual: Vec<_> = rope.lines().collect();
    assert_eq!(actual, expected);
  }
}

/// `Rope::lines` on ropes assembled from several chunks.
#[test]
fn lines2() {
  // Each case: (chunks the rope is built from, expected lines).
  let cases: [(&[&str], &[&str]); 5] = [
    // empty line at the end if the line before ends with a newline ('\n')
    (
      &["abc\n", "def\n", "ghi\n"],
      &["abc\n", "def\n", "ghi\n", ""],
    ),
    (&["abc\n", "def\n", "ghi"], &["abc\n", "def\n", "ghi"]),
    // a line may start in one chunk and finish in the next one
    (&["abc\ndef", "ghi\n", "jkl"], &["abc\n", "defghi\n", "jkl"]),
    (&["a\nb", "c\n", "d\n"], &["a\n", "bc\n", "d\n", ""]),
    (&["\n"], &["\n", ""]),
  ];

  for &(chunks, expected) in cases.iter() {
    let rope = Rope::from_iter(chunks.iter().copied());
    let actual: Vec<_> = rope.lines().collect();
    assert_eq!(actual, expected);
  }
}

/// With the flag disabled, a final '\n' must not produce an extra empty line.
#[test]
fn lines_with_trailing_line_break_as_newline() {
  let trailing_line_break_as_newline = false;

  // Each case: (input text, the single line expected back).
  for &(text, only_line) in [("abc\n", "abc\n"), ("\n", "\n")].iter() {
    let rope = Rope::from(text);
    let actual: Vec<_> = rope
      .lines_impl(trailing_line_break_as_newline)
      .collect();
    assert_eq!(actual, [only_line]);
  }
}
}
Loading