From 3c41a54dac6b78bf6b1fa343f98090e64a46f2a8 Mon Sep 17 00:00:00 2001 From: MinusGix Date: Thu, 7 Mar 2024 13:25:57 -0600 Subject: [PATCH] Implement line-ending recognition and normalization (#353) --- Cargo.toml | 3 +- editor-core/Cargo.toml | 3 + editor-core/src/buffer/mod.rs | 50 +++- editor-core/src/command.rs | 4 + editor-core/src/editor.rs | 10 + editor-core/src/lib.rs | 1 + editor-core/src/line_ending.rs | 411 +++++++++++++++++++++++++++++++++ 7 files changed, 479 insertions(+), 3 deletions(-) create mode 100644 editor-core/src/line_ending.rs diff --git a/Cargo.toml b/Cargo.toml index 87d5ea57..c19d6772 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -22,6 +22,7 @@ serde = "1.0" lapce-xi-rope = { version = "0.3.2", features = ["serde"] } strum = "0.21.0" strum_macros = "0.21.1" +once_cell = "1.17.1" [dependencies] sha2 = "0.10.6" @@ -39,7 +40,6 @@ kurbo = { version = "0.9.5", features = ["serde"] } unicode-segmentation = "1.10.0" floem-peniko = "0.1.0" crossbeam-channel = "0.5.6" -once_cell = "1.17.1" im = "15.1.0" im-rc = "15.1.0" serde = { workspace = true, optional = true } @@ -57,6 +57,7 @@ floem-winit = { version = "0.29.4", features = ["rwh_05"] } floem-editor-core = { path = "editor-core", version = "0.1.0", optional = true } image = { version = "0.24", features = ["jpeg", "png"] } copypasta = { version = "0.10.0", default-features = false, features = ["wayland", "x11"] } +once_cell.workspace = true [features] default = ["editor", "rfd-tokio"] diff --git a/editor-core/Cargo.toml b/editor-core/Cargo.toml index 8c904e7e..dffe7182 100644 --- a/editor-core/Cargo.toml +++ b/editor-core/Cargo.toml @@ -14,6 +14,9 @@ lapce-xi-rope.workspace = true itertools = "0.10.1" bitflags = "1.3.2" +memchr = "2.7.1" + +once_cell.workspace = true [features] serde = ["dep:serde"] diff --git a/editor-core/src/buffer/mod.rs b/editor-core/src/buffer/mod.rs index de7d57d7..b8234b00 100644 --- a/editor-core/src/buffer/mod.rs +++ b/editor-core/src/buffer/mod.rs @@ -18,6 +18,7 @@ 
use crate::{ cursor::CursorMode, editor::EditType, indent::{auto_detect_indent_style, IndentStyle}, + line_ending::{LineEnding, LineEndingDetermination}, mode::Mode, selection::Selection, word::WordCursor, @@ -88,6 +89,7 @@ pub struct Buffer { last_edit_type: EditType, indent_style: IndentStyle, + line_ending: LineEnding, max_len: usize, max_len_line: usize, @@ -102,6 +104,14 @@ impl ToString for Buffer { impl Buffer { pub fn new(text: impl Into) -> Self { let text = text.into(); + + // Determine the line ending of the text and adjust it if necessary + let line_ending = LineEndingDetermination::determine(&text); + let line_ending = line_ending.unwrap_or(LineEnding::Lf); + + // Get rid of lone Cr's as Rope does not treat them as line endings + let text = line_ending.normalize_limited(&text); + let len = text.len(); Self { text, @@ -131,6 +141,7 @@ impl Buffer { this_edit_type: EditType::Other, last_edit_type: EditType::Other, indent_style: IndentStyle::DEFAULT_INDENT, + line_ending, max_len: 0, max_len_line: 0, @@ -236,6 +247,11 @@ impl Buffer { pub fn init_content(&mut self, content: Rope) { if !content.is_empty() { + let line_ending = LineEndingDetermination::determine(&content); + self.line_ending = line_ending.unwrap_or(self.line_ending); + + let content = self.line_ending.normalize_limited(&content); + let delta = Delta::simple_edit(Interval::new(0, 0), content, 0); let (new_rev, new_text, new_tombstones, new_deletes_from_union) = self.mk_new_rev(0, delta.clone()); @@ -251,6 +267,12 @@ impl Buffer { } pub fn reload(&mut self, content: Rope, set_pristine: bool) -> (Rope, RopeDelta, InvalLines) { + // Determine the line ending of the new text + let line_ending = LineEndingDetermination::determine(&content); + self.line_ending = line_ending.unwrap_or(self.line_ending); + + let content = self.line_ending.normalize_limited(&content); + let len = self.text.len(); let delta = Delta::simple_edit(Interval::new(0, len), content, len); self.this_edit_type = 
EditType::Other; @@ -274,11 +296,19 @@ impl Buffer { self.indent_style.as_str() } + pub fn line_ending(&self) -> LineEnding { + self.line_ending + } + + pub fn set_line_ending(&mut self, line_ending: LineEnding) { + self.line_ending = line_ending; + } + pub fn reset_edit_type(&mut self) { self.last_edit_type = EditType::Other; } - /// Apply edits + /// Apply edits, normalizes line endings before applying. /// Returns `(Text before delta, delta, invalidated lines)` pub fn edit<'a, I, E, S>( &mut self, @@ -309,14 +339,29 @@ impl Buffer { } }); for (start, end, rope) in interval_rope.into_iter() { - builder.replace(start..end, rope); + // TODO(minor): normalizing line endings here technically has an edge-case where it + // could be that we put a `\r` at the end of a replacement, then a `\n` at the start of + // a replacement right after it, and then it becomes a double newline. + // A possible alternative that might be better overall (?) would be to get the range of + // the delta and normalize that area after applying the delta. + builder.replace(start..end, self.line_ending.normalize(&rope)); } let delta = builder.build(); self.this_edit_type = edit_type; self.add_delta(delta) } + pub fn normalize_line_endings(&mut self) -> Option<(Rope, RopeDelta, InvalLines)> { + let Some(delta) = self.line_ending.normalize_delta(&self.text) else { + // There were no changes needed + return None; + }; + self.this_edit_type = EditType::NormalizeLineEndings; + Some(self.add_delta(delta)) + } + // TODO: don't clone the delta and return it, if the caller needs it then they can clone it + /// Note: the delta's line-endings should be normalized. 
fn add_delta(&mut self, delta: RopeDelta) -> (Rope, RopeDelta, InvalLines) { let text = self.text.clone(); @@ -388,6 +433,7 @@ impl Buffer { } } + /// Returns `(Revision, new text, new tombstones, new deletes from union)` fn mk_new_rev(&self, undo_group: usize, delta: RopeDelta) -> (Revision, Rope, Rope, Subset) { let (ins_delta, deletes) = delta.factor(); diff --git a/editor-core/src/command.rs b/editor-core/src/command.rs index d6653042..8c723821 100644 --- a/editor-core/src/command.rs +++ b/editor-core/src/command.rs @@ -125,6 +125,10 @@ pub enum EditCommand { #[strum(message = "Duplicate Line Down")] #[strum(serialize = "duplicate_line_down")] DuplicateLineDown, + + #[strum(message = "Normalize Line Endings")] + #[strum(serialize = "normalize_line_endings")] + NormalizeLineEndings, } impl EditCommand { diff --git a/editor-core/src/editor.rs b/editor-core/src/editor.rs index 5ca6f078..dbe51fcd 100644 --- a/editor-core/src/editor.rs +++ b/editor-core/src/editor.rs @@ -62,6 +62,7 @@ pub enum EditType { DeleteToEndOfLine, DeleteToEndOfLineAndInsert, MotionDelete, + NormalizeLineEndings, Undo, Redo, Other, @@ -1493,6 +1494,15 @@ impl Action { } DuplicateLineUp => Self::duplicate_line(cursor, buffer, DuplicateDirection::Up), DuplicateLineDown => Self::duplicate_line(cursor, buffer, DuplicateDirection::Down), + NormalizeLineEndings => { + let Some((text, delta, inval)) = buffer.normalize_line_endings() else { + return vec![]; + }; + + cursor.apply_delta(&delta); + + vec![(text, delta, inval)] + } } } } diff --git a/editor-core/src/lib.rs b/editor-core/src/lib.rs index dadb5599..ea5e408e 100644 --- a/editor-core/src/lib.rs +++ b/editor-core/src/lib.rs @@ -5,6 +5,7 @@ pub mod command; pub mod cursor; pub mod editor; pub mod indent; +pub mod line_ending; pub mod mode; pub mod movement; pub mod paragraph; diff --git a/editor-core/src/line_ending.rs b/editor-core/src/line_ending.rs new file mode 100644 index 00000000..0cc502f7 --- /dev/null +++ 
b/editor-core/src/line_ending.rs @@ -0,0 +1,411 @@ +use std::{iter::Peekable, ops::Range}; + +use lapce_xi_rope::{DeltaBuilder, Rope, RopeDelta}; +use memchr::{memchr, memchr2}; +use once_cell::sync::Lazy; + +// Cached ropes for the two line endings +static CR_LF: Lazy = Lazy::new(|| Rope::from("\r\n")); +static LF: Lazy = Lazy::new(|| Rope::from("\n")); + +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +pub enum LineEnding { + /// `\r\n` Windows + CrLf, + /// `\n` Unix + Lf, +} +impl LineEnding { + /// Replace the line endings (`\n`, `\r\n`, `\r`) used in `text` with the line ending named by + /// `self`. + pub fn normalize(self, text: &Rope) -> Rope { + self.normalize_delta(text) + .map(|d| d.apply(text)) + .unwrap_or_else(|| text.clone()) + } + + pub fn normalize_delta(self, text: &Rope) -> Option { + let mut builder = DeltaBuilder::new(text.len()); + + let le = if self == LineEnding::Lf { + LF.clone() + } else { + CR_LF.clone() + }; + + let mut had_entries = false; + for (range, kind) in FullLeChunkSearch::new(text.iter_chunks(..)) { + had_entries = true; + match kind { + LeChunkKind::CrLf => { + if self == LineEnding::Lf { + builder.replace(range, LF.clone()); + } + } + LeChunkKind::Lf => { + if self == LineEnding::CrLf { + builder.replace(range, CR_LF.clone()); + } + } + LeChunkKind::Cr => { + builder.replace(range, le.clone()); + } + } + } + + if had_entries { + let delta = builder.build(); + Some(delta) + } else { + None + } + } + + /// Only replace the carriage return line-endings. 
+ pub fn normalize_limited(self, text: &Rope) -> Rope { + let mut builder = DeltaBuilder::new(text.len()); + + let le = if self == LineEnding::Lf { + LF.clone() + } else { + CR_LF.clone() + }; + + let mut had_entries = false; + for offset in LoneCrChunkSearch::new(text.iter_chunks(..)) { + had_entries = true; + builder.replace(offset..offset + 1, le.clone()); + } + + if had_entries { + let delta = builder.build(); + delta.apply(text) + } else { + text.clone() + } + } + + pub fn as_str(&self) -> &'static str { + match self { + LineEnding::CrLf => "CRLF", + LineEnding::Lf => "LF", + } + } +} + +#[derive(Debug, Clone, Copy)] +pub enum LineEndingDetermination { + CrLf, + Lf, + Mixed, + Unknown, +} +impl LineEndingDetermination { + // TODO: should we just do a simpler routine of checking the first few lines? + // Based off of xi-rope's line-ending determination logic + pub fn determine(text: &Rope) -> Self { + let mut crlf = false; + let mut lf = false; + + for chunk in text.iter_chunks(..) { + match LineEndingDetermination::determine_str(chunk) { + LineEndingDetermination::CrLf => crlf = true, + LineEndingDetermination::Lf => lf = true, + LineEndingDetermination::Mixed => { + return LineEndingDetermination::Mixed; + } + LineEndingDetermination::Unknown => {} + } + } + + match (crlf, lf) { + (true, true) => LineEndingDetermination::Mixed, + (true, false) => LineEndingDetermination::CrLf, + (false, true) => LineEndingDetermination::Lf, + (false, false) => LineEndingDetermination::Unknown, + } + } + + fn determine_str(chunk: &str) -> LineEndingDetermination { + let bytes = chunk.as_bytes(); + let newline = memchr2(b'\n', b'\r', bytes); + match newline { + Some(x) if bytes[x] == b'\r' && bytes.len() > x + 1 && bytes[x + 1] == b'\n' => { + LineEndingDetermination::CrLf + } + Some(x) if bytes[x] == b'\n' => LineEndingDetermination::Lf, + Some(_) => LineEndingDetermination::Mixed, + None => LineEndingDetermination::Unknown, + } + } + + pub fn unwrap_or(self, le: LineEnding) 
-> LineEnding { + match self { + LineEndingDetermination::CrLf => LineEnding::CrLf, + LineEndingDetermination::Lf => LineEnding::Lf, + LineEndingDetermination::Mixed | LineEndingDetermination::Unknown => le, + } + } +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +enum LeChunkKind { + CrLf, + Lf, + Cr, +} + +/// Line ending chunk searcher +struct FullLeChunkSearch<'a, I: Iterator> { + offset: usize, + /// Offset within the chunk itself + chunk_pos: usize, + chunks: Peekable, +} +impl<'a, I: Iterator> FullLeChunkSearch<'a, I> { + fn new(chunks: I) -> Self { + Self { + offset: 0, + chunk_pos: 0, + chunks: chunks.peekable(), + } + } + + /// Get the current chunk, updating the current chunk if needed + fn get_chunk(&mut self) -> Option<&'a str> { + let chunk = self.chunks.peek()?; + if self.chunk_pos >= chunk.len() { + self.advance_chunk(); + Some(*self.chunks.peek()?) + } else { + Some(chunk) + } + } + + fn advance_chunk(&mut self) -> Option<()> { + let chunk = self.chunks.next()?; + self.offset += chunk.len(); + self.chunk_pos = 0; + + Some(()) + } +} +impl<'a, I: Iterator> Iterator for FullLeChunkSearch<'a, I> { + type Item = (Range, LeChunkKind); + + fn next(&mut self) -> Option { + let chunk = self.get_chunk()?; + + let bytes = &chunk.as_bytes()[self.chunk_pos..]; + + let newline = memchr2(b'\n', b'\r', bytes); + match newline { + // CrLf + Some(x) if bytes[x] == b'\r' && bytes.len() > x + 1 && bytes[x + 1] == b'\n' => { + let start = self.offset + self.chunk_pos + x; + let end = start + 2; + + self.chunk_pos += x + 2; + Some((start..end, LeChunkKind::CrLf)) + } + // Lf + Some(x) if bytes[x] == b'\n' => { + let start = self.offset + self.chunk_pos + x; + let end = start + 1; + + self.chunk_pos += x + 1; + Some((start..end, LeChunkKind::Lf)) + } + Some(x) => { + // Typically this only occurs for a lone `\r`. + // However, we need to handle the case where the `\r` is the last character in the + // chunk whilst the next chunk starts with a `\n`.
+ assert!(bytes[x] == b'\r'); + + let start = self.offset + self.chunk_pos + x; + self.chunk_pos += x + 1; + + let v = if self.chunk_pos == chunk.len() { + if let Some(next_chunk) = self.get_chunk() { + if next_chunk.starts_with('\n') { + self.chunk_pos = 1; + Some((start..start + 2, LeChunkKind::CrLf)) + } else { + None + } + } else { + None + } + } else { + None + }; + + Some(v.unwrap_or_else(|| { + // There is no \n so it is a lone `\r` + // (Which is used in MacOS, or sometimes due to bugged line endings) + let end = start + 1; + (start..end, LeChunkKind::Cr) + })) + } + None => { + self.advance_chunk(); + self.next() + } + } + } +} + +/// Iterator that searches for lone carriage returns ('\r') in chunks of text. +struct LoneCrChunkSearch<'a, I: Iterator> { + offset: usize, + chunk_pos: usize, + chunks: Peekable, +} + +impl<'a, I: Iterator> LoneCrChunkSearch<'a, I> { + fn new(chunks: I) -> Self { + Self { + offset: 0, + chunk_pos: 0, + chunks: chunks.peekable(), + } + } + + fn get_chunk(&mut self) -> Option<&'a str> { + let chunk = self.chunks.peek()?; + if self.chunk_pos >= chunk.len() { + self.advance_chunk(); + Some(*self.chunks.peek()?) 
+ } else { + Some(chunk) + } + } + + fn advance_chunk(&mut self) -> Option<()> { + let chunk = self.chunks.next()?; + self.offset += chunk.len(); + self.chunk_pos = 0; + + Some(()) + } +} + +impl<'a, I: Iterator> Iterator for LoneCrChunkSearch<'a, I> { + type Item = usize; + + fn next(&mut self) -> Option { + let chunk = self.get_chunk()?; + + let bytes = &chunk.as_bytes()[self.chunk_pos..]; + + let newline = memchr(b'\r', bytes); + match newline { + Some(x) => { + let offset = self.offset + self.chunk_pos + x; + + // Check if the next character is '\n' (indicating \r\n) + self.chunk_pos += x + 1; + if self.chunk_pos < chunk.len() && chunk.as_bytes()[self.chunk_pos] == b'\n' { + // Skip \r\n sequences + self.chunk_pos += 1; + self.next() + } else if let Some(next_chunk) = self.get_chunk() { + if next_chunk.starts_with('\n') { + // Skip \r\n sequences across chunks + self.chunk_pos = 1; + self.next() + } else { + // Lone \r + Some(offset) + } + } else { + // Lone \r at the end + Some(offset) + } + } + None => { + self.advance_chunk(); + self.next() + } + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn normalize() { + let text = Rope::from("hello\r\nworld toast and jam\nthe end\nhi"); + let normalized = LineEnding::CrLf.normalize(&text); + assert_eq!( + normalized.slice_to_cow(..), + "hello\r\nworld toast and jam\r\nthe end\r\nhi" + ); + + let text = Rope::from("\n"); + let normalized = LineEnding::Lf.normalize(&text); + assert_eq!(normalized.slice_to_cow(..), "\n"); + let normalized = LineEnding::CrLf.normalize(&text); + assert_eq!(normalized.slice_to_cow(..), "\r\n"); + + let text = Rope::from("\r\n"); + let normalized = LineEnding::Lf.normalize(&text); + assert_eq!(normalized.slice_to_cow(..), "\n"); + let normalized = LineEnding::CrLf.normalize(&text); + assert_eq!(normalized.slice_to_cow(..), "\r\n"); + + // `\r` is always normalized to the line ending of the file + let text = Rope::from("\r"); + let normalized = 
LineEnding::Lf.normalize(&text); + assert_eq!(normalized.slice_to_cow(..), "\n"); + let normalized = LineEnding::CrLf.normalize(&text); + assert_eq!(normalized.slice_to_cow(..), "\r\n"); + let normalized = LineEnding::Lf.normalize_limited(&text); + assert_eq!(normalized.slice_to_cow(..), "\n"); + + let text = Rope::from("\rtest"); + let normalized = LineEnding::Lf.normalize(&text); + assert_eq!(normalized.slice_to_cow(..), "\ntest"); + let normalized = LineEnding::CrLf.normalize(&text); + assert_eq!(normalized.slice_to_cow(..), "\r\ntest"); + let normalized = LineEnding::Lf.normalize_limited(&text); + assert_eq!(normalized.slice_to_cow(..), "\ntest"); + } + + #[test] + fn chunk_search() { + let text = Rope::from("hello\r\nworld toast and jam\nthe end\nhi"); + let c = FullLeChunkSearch::new(text.iter_chunks(..)); + assert_eq!( + c.collect::>(), + vec![ + (5..7, LeChunkKind::CrLf), + (26..27, LeChunkKind::Lf), + (34..35, LeChunkKind::Lf), + ] + ); + let c = LoneCrChunkSearch::new(text.iter_chunks(..)); + assert_eq!(c.collect::>(), Vec::new()); + + // Test searching across different chunks of text + // (Using a non-Rope iterator to simplify creation, however it should behave the same) + let text = ["a\n", "\n5", "\r\ne\r", "\ntest\r", "\rv"]; + let multi_chunk = FullLeChunkSearch::new(text.into_iter()); + assert_eq!( + multi_chunk.collect::>(), + vec![ + (1..2, LeChunkKind::Lf), + (2..3, LeChunkKind::Lf), + (4..6, LeChunkKind::CrLf), + (7..9, LeChunkKind::CrLf), + (13..14, LeChunkKind::Cr), + (14..15, LeChunkKind::Cr), + ] + ); + + let multi_chunk = LoneCrChunkSearch::new(text.into_iter()); + assert_eq!(multi_chunk.collect::>(), vec![13, 14]); + } +}