From 3c41a54dac6b78bf6b1fa343f98090e64a46f2a8 Mon Sep 17 00:00:00 2001 From: MinusGix Date: Thu, 7 Mar 2024 13:25:57 -0600 Subject: [PATCH] Implement line-ending recognition and normalization (#353) --- Cargo.toml | 3 +- editor-core/Cargo.toml | 3 + editor-core/src/buffer/mod.rs | 50 +++- editor-core/src/command.rs | 4 + editor-core/src/editor.rs | 10 + editor-core/src/lib.rs | 1 + editor-core/src/line_ending.rs | 411 +++++++++++++++++++++++++++++++++ 7 files changed, 479 insertions(+), 3 deletions(-) create mode 100644 editor-core/src/line_ending.rs diff --git a/Cargo.toml b/Cargo.toml index 87d5ea57..c19d6772 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -22,6 +22,7 @@ serde = "1.0" lapce-xi-rope = { version = "0.3.2", features = ["serde"] } strum = "0.21.0" strum_macros = "0.21.1" +once_cell = "1.17.1" [dependencies] sha2 = "0.10.6" @@ -39,7 +40,6 @@ kurbo = { version = "0.9.5", features = ["serde"] } unicode-segmentation = "1.10.0" floem-peniko = "0.1.0" crossbeam-channel = "0.5.6" -once_cell = "1.17.1" im = "15.1.0" im-rc = "15.1.0" serde = { workspace = true, optional = true } @@ -57,6 +57,7 @@ floem-winit = { version = "0.29.4", features = ["rwh_05"] } floem-editor-core = { path = "editor-core", version = "0.1.0", optional = true } image = { version = "0.24", features = ["jpeg", "png"] } copypasta = { version = "0.10.0", default-features = false, features = ["wayland", "x11"] } +once_cell.workspace = true [features] default = ["editor", "rfd-tokio"] diff --git a/editor-core/Cargo.toml b/editor-core/Cargo.toml index 8c904e7e..dffe7182 100644 --- a/editor-core/Cargo.toml +++ b/editor-core/Cargo.toml @@ -14,6 +14,9 @@ lapce-xi-rope.workspace = true itertools = "0.10.1" bitflags = "1.3.2" +memchr = "2.7.1" + +once_cell.workspace = true [features] serde = ["dep:serde"] diff --git a/editor-core/src/buffer/mod.rs b/editor-core/src/buffer/mod.rs index de7d57d7..b8234b00 100644 --- a/editor-core/src/buffer/mod.rs +++ b/editor-core/src/buffer/mod.rs @@ -18,6 +18,7 @@ 
use crate::{ cursor::CursorMode, editor::EditType, indent::{auto_detect_indent_style, IndentStyle}, + line_ending::{LineEnding, LineEndingDetermination}, mode::Mode, selection::Selection, word::WordCursor, @@ -88,6 +89,7 @@ pub struct Buffer { last_edit_type: EditType, indent_style: IndentStyle, + line_ending: LineEnding, max_len: usize, max_len_line: usize, @@ -102,6 +104,14 @@ impl ToString for Buffer { impl Buffer { pub fn new(text: impl Into) -> Self { let text = text.into(); + + // Determine the line ending of the text and adjust it if necessary + let line_ending = LineEndingDetermination::determine(&text); + let line_ending = line_ending.unwrap_or(LineEnding::Lf); + + // Get rid of lone Cr's as Rope does not treat them as line endings + let text = line_ending.normalize_limited(&text); + let len = text.len(); Self { text, @@ -131,6 +141,7 @@ impl Buffer { this_edit_type: EditType::Other, last_edit_type: EditType::Other, indent_style: IndentStyle::DEFAULT_INDENT, + line_ending, max_len: 0, max_len_line: 0, @@ -236,6 +247,11 @@ impl Buffer { pub fn init_content(&mut self, content: Rope) { if !content.is_empty() { + let line_ending = LineEndingDetermination::determine(&content); + self.line_ending = line_ending.unwrap_or(self.line_ending); + + let content = self.line_ending.normalize_limited(&content); + let delta = Delta::simple_edit(Interval::new(0, 0), content, 0); let (new_rev, new_text, new_tombstones, new_deletes_from_union) = self.mk_new_rev(0, delta.clone()); @@ -251,6 +267,12 @@ impl Buffer { } pub fn reload(&mut self, content: Rope, set_pristine: bool) -> (Rope, RopeDelta, InvalLines) { + // Determine the line ending of the new text + let line_ending = LineEndingDetermination::determine(&content); + self.line_ending = line_ending.unwrap_or(self.line_ending); + + let content = self.line_ending.normalize_limited(&content); + let len = self.text.len(); let delta = Delta::simple_edit(Interval::new(0, len), content, len); self.this_edit_type = 
EditType::Other; @@ -274,11 +296,19 @@ impl Buffer { self.indent_style.as_str() } + pub fn line_ending(&self) -> LineEnding { + self.line_ending + } + + pub fn set_line_ending(&mut self, line_ending: LineEnding) { + self.line_ending = line_ending; + } + pub fn reset_edit_type(&mut self) { self.last_edit_type = EditType::Other; } - /// Apply edits + /// Apply edits, normalizes line endings before applying. /// Returns `(Text before delta, delta, invalidated lines)` pub fn edit<'a, I, E, S>( &mut self, @@ -309,14 +339,29 @@ impl Buffer { } }); for (start, end, rope) in interval_rope.into_iter() { - builder.replace(start..end, rope); + // TODO(minor): normalizing line endings here technically has an edge-case where it + // could be that we put a `\r` at the end of a replacement, then a `\n` at the start of + // a replacement right after it, and then it becomes a double newline. + // A possible alternative that might be better overall (?) would be to get the range of + // the delta and normalize that area after applying the delta. + builder.replace(start..end, self.line_ending.normalize(&rope)); } let delta = builder.build(); self.this_edit_type = edit_type; self.add_delta(delta) } + pub fn normalize_line_endings(&mut self) -> Option<(Rope, RopeDelta, InvalLines)> { + let Some(delta) = self.line_ending.normalize_delta(&self.text) else { + // There were no changes needed + return None; + }; + self.this_edit_type = EditType::NormalizeLineEndings; + Some(self.add_delta(delta)) + } + // TODO: don't clone the delta and return it, if the caller needs it then they can clone it + /// Note: the delta's line-endings should be normalized. 
fn add_delta(&mut self, delta: RopeDelta) -> (Rope, RopeDelta, InvalLines) { let text = self.text.clone(); @@ -388,6 +433,7 @@ impl Buffer { } } + /// Returns `(Revision, new text, new tombstones, new deletes from union)` fn mk_new_rev(&self, undo_group: usize, delta: RopeDelta) -> (Revision, Rope, Rope, Subset) { let (ins_delta, deletes) = delta.factor(); diff --git a/editor-core/src/command.rs b/editor-core/src/command.rs index d6653042..8c723821 100644 --- a/editor-core/src/command.rs +++ b/editor-core/src/command.rs @@ -125,6 +125,10 @@ pub enum EditCommand { #[strum(message = "Duplicate Line Down")] #[strum(serialize = "duplicate_line_down")] DuplicateLineDown, + + #[strum(message = "Normalize Line Endings")] + #[strum(serialize = "normalize_line_endings")] + NormalizeLineEndings, } impl EditCommand { diff --git a/editor-core/src/editor.rs b/editor-core/src/editor.rs index 5ca6f078..dbe51fcd 100644 --- a/editor-core/src/editor.rs +++ b/editor-core/src/editor.rs @@ -62,6 +62,7 @@ pub enum EditType { DeleteToEndOfLine, DeleteToEndOfLineAndInsert, MotionDelete, + NormalizeLineEndings, Undo, Redo, Other, @@ -1493,6 +1494,15 @@ impl Action { } DuplicateLineUp => Self::duplicate_line(cursor, buffer, DuplicateDirection::Up), DuplicateLineDown => Self::duplicate_line(cursor, buffer, DuplicateDirection::Down), + NormalizeLineEndings => { + let Some((text, delta, inval)) = buffer.normalize_line_endings() else { + return vec![]; + }; + + cursor.apply_delta(&delta); + + vec![(text, delta, inval)] + } } } } diff --git a/editor-core/src/lib.rs b/editor-core/src/lib.rs index dadb5599..ea5e408e 100644 --- a/editor-core/src/lib.rs +++ b/editor-core/src/lib.rs @@ -5,6 +5,7 @@ pub mod command; pub mod cursor; pub mod editor; pub mod indent; +pub mod line_ending; pub mod mode; pub mod movement; pub mod paragraph; diff --git a/editor-core/src/line_ending.rs b/editor-core/src/line_ending.rs new file mode 100644 index 00000000..0cc502f7 --- /dev/null +++ 
b/editor-core/src/line_ending.rs @@ -0,0 +1,411 @@ +use std::{iter::Peekable, ops::Range}; + +use lapce_xi_rope::{DeltaBuilder, Rope, RopeDelta}; +use memchr::{memchr, memchr2}; +use once_cell::sync::Lazy; + +// Cached ropes for the two line endings +static CR_LF: Lazy = Lazy::new(|| Rope::from("\r\n")); +static LF: Lazy = Lazy::new(|| Rope::from("\n")); + +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +pub enum LineEnding { + /// `\r\n` Windows + CrLf, + /// `\n` Unix + Lf, +} +impl LineEnding { + /// Replace the line endings (`\n`, `\r\n`, `\r`) used in `text` with the line ending named by + /// `self`. + pub fn normalize(self, text: &Rope) -> Rope { + self.normalize_delta(text) + .map(|d| d.apply(text)) + .unwrap_or_else(|| text.clone()) + } + + pub fn normalize_delta(self, text: &Rope) -> Option { + let mut builder = DeltaBuilder::new(text.len()); + + let le = if self == LineEnding::Lf { + LF.clone() + } else { + CR_LF.clone() + }; + + let mut had_entries = false; + for (range, kind) in FullLeChunkSearch::new(text.iter_chunks(..)) { + had_entries = true; + match kind { + LeChunkKind::CrLf => { + if self == LineEnding::Lf { + builder.replace(range, LF.clone()); + } + } + LeChunkKind::Lf => { + if self == LineEnding::CrLf { + builder.replace(range, CR_LF.clone()); + } + } + LeChunkKind::Cr => { + builder.replace(range, le.clone()); + } + } + } + + if had_entries { + let delta = builder.build(); + Some(delta) + } else { + None + } + } + + /// Only replace the carriage return line-endings. 
+ pub fn normalize_limited(self, text: &Rope) -> Rope { + let mut builder = DeltaBuilder::new(text.len()); + + let le = if self == LineEnding::Lf { + LF.clone() + } else { + CR_LF.clone() + }; + + let mut had_entries = false; + for offset in LoneCrChunkSearch::new(text.iter_chunks(..)) { + had_entries = true; + builder.replace(offset..offset + 1, le.clone()); + } + + if had_entries { + let delta = builder.build(); + delta.apply(text) + } else { + text.clone() + } + } + + pub fn as_str(&self) -> &'static str { + match self { + LineEnding::CrLf => "CRLF", + LineEnding::Lf => "LF", + } + } +} + +#[derive(Debug, Clone, Copy)] +pub enum LineEndingDetermination { + CrLf, + Lf, + Mixed, + Unknown, +} +impl LineEndingDetermination { + // TODO: should we just do a simpler routine of checking the first few lines? + // Based off of xi-rope's line-ending determination logic + pub fn determine(text: &Rope) -> Self { + let mut crlf = false; + let mut lf = false; + + for chunk in text.iter_chunks(..) { + match LineEndingDetermination::determine_str(chunk) { + LineEndingDetermination::CrLf => crlf = true, + LineEndingDetermination::Lf => lf = true, + LineEndingDetermination::Mixed => { + return LineEndingDetermination::Mixed; + } + LineEndingDetermination::Unknown => {} + } + } + + match (crlf, lf) { + (true, true) => LineEndingDetermination::Mixed, + (true, false) => LineEndingDetermination::CrLf, + (false, true) => LineEndingDetermination::Lf, + (false, false) => LineEndingDetermination::Unknown, + } + } + + fn determine_str(chunk: &str) -> LineEndingDetermination { + let bytes = chunk.as_bytes(); + let newline = memchr2(b'\n', b'\r', bytes); + match newline { + Some(x) if bytes[x] == b'\r' && bytes.len() > x + 1 && bytes[x + 1] == b'\n' => { + LineEndingDetermination::CrLf + } + Some(x) if bytes[x] == b'\n' => LineEndingDetermination::Lf, + Some(_) => LineEndingDetermination::Mixed, + None => LineEndingDetermination::Unknown, + } + } + + pub fn unwrap_or(self, le: LineEnding) 
-> LineEnding { + match self { + LineEndingDetermination::CrLf => LineEnding::CrLf, + LineEndingDetermination::Lf => LineEnding::Lf, + LineEndingDetermination::Mixed | LineEndingDetermination::Unknown => le, + } + } +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +enum LeChunkKind { + CrLf, + Lf, + Cr, +} + +/// Line ending chunk searcher +struct FullLeChunkSearch<'a, I: Iterator> { + offset: usize, + /// Offset within the chunk itself + chunk_pos: usize, + chunks: Peekable, +} +impl<'a, I: Iterator> FullLeChunkSearch<'a, I> { + fn new(chunks: I) -> Self { + Self { + offset: 0, + chunk_pos: 0, + chunks: chunks.peekable(), + } + } + + /// Get the current chunk, updating the current chunk if needed + fn get_chunk(&mut self) -> Option<&'a str> { + let chunk = self.chunks.peek()?; + if self.chunk_pos >= chunk.len() { + self.advance_chunk(); + Some(*self.chunks.peek()?) + } else { + Some(chunk) + } + } + + fn advance_chunk(&mut self) -> Option<()> { + let chunk = self.chunks.next()?; + self.offset += chunk.len(); + self.chunk_pos = 0; + + Some(()) + } +} +impl<'a, I: Iterator> Iterator for FullLeChunkSearch<'a, I> { + type Item = (Range, LeChunkKind); + + fn next(&mut self) -> Option { + let chunk = self.get_chunk()?; + + let bytes = &chunk.as_bytes()[self.chunk_pos..]; + + let newline = memchr2(b'\n', b'\r', bytes); + match newline { + // CrLf + Some(x) if bytes[x] == b'\r' && bytes.len() > x + 1 && bytes[x + 1] == b'\n' => { + let start = self.offset + self.chunk_pos + x; + let end = start + 2; + + self.chunk_pos += x + 2; + Some((start..end, LeChunkKind::CrLf)) + } + // Lf + Some(x) if bytes[x] == b'\n' => { + let start = self.offset + self.chunk_pos + x; + let end = start + 1; + + self.chunk_pos += x + 1; + Some((start..end, LeChunkKind::Lf)) + } + Some(x) => { + // Typically this only occurs for a lone `\r`. + // However, we need to handle the case where the `\r` is the last character in the + // chunk whilst the next chunk starts with a `\n`.
+ assert!(bytes[x] == b'\r'); + + let start = self.offset + self.chunk_pos + x; + self.chunk_pos += x + 1; + + let v = if self.chunk_pos == chunk.len() { + if let Some(next_chunk) = self.get_chunk() { + if next_chunk.starts_with('\n') { + self.chunk_pos = 1; + Some((start..start + 2, LeChunkKind::CrLf)) + } else { + None + } + } else { + None + } + } else { + None + }; + + Some(v.unwrap_or_else(|| { + // There is no \n so it is a lone `\r` + // (Which is used in MacOS, or sometimes due to bugged line endings) + let end = start + 1; + (start..end, LeChunkKind::Cr) + })) + } + None => { + self.advance_chunk(); + self.next() + } + } + } +} + +/// Iterator that searches for lone carriage returns ('\r') in chunks of text. +struct LoneCrChunkSearch<'a, I: Iterator> { + offset: usize, + chunk_pos: usize, + chunks: Peekable, +} + +impl<'a, I: Iterator> LoneCrChunkSearch<'a, I> { + fn new(chunks: I) -> Self { + Self { + offset: 0, + chunk_pos: 0, + chunks: chunks.peekable(), + } + } + + fn get_chunk(&mut self) -> Option<&'a str> { + let chunk = self.chunks.peek()?; + if self.chunk_pos >= chunk.len() { + self.advance_chunk(); + Some(*self.chunks.peek()?) 
+ } else { + Some(chunk) + } + } + + fn advance_chunk(&mut self) -> Option<()> { + let chunk = self.chunks.next()?; + self.offset += chunk.len(); + self.chunk_pos = 0; + + Some(()) + } +} + +impl<'a, I: Iterator> Iterator for LoneCrChunkSearch<'a, I> { + type Item = usize; + + fn next(&mut self) -> Option { + let chunk = self.get_chunk()?; + + let bytes = &chunk.as_bytes()[self.chunk_pos..]; + + let newline = memchr(b'\r', bytes); + match newline { + Some(x) => { + let offset = self.offset + self.chunk_pos + x; + + // Check if the next character is '\n' (indicating \r\n) + self.chunk_pos += x + 1; + if self.chunk_pos < chunk.len() && chunk.as_bytes()[self.chunk_pos] == b'\n' { + // Skip \r\n sequences + self.chunk_pos += 1; + self.next() + } else if let Some(next_chunk) = self.get_chunk() { + if next_chunk.starts_with('\n') { + // Skip \r\n sequences across chunks + self.chunk_pos = 1; + self.next() + } else { + // Lone \r + Some(offset) + } + } else { + // Lone \r at the end + Some(offset) + } + } + None => { + self.advance_chunk(); + self.next() + } + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn normalize() { + let text = Rope::from("hello\r\nworld toast and jam\nthe end\nhi"); + let normalized = LineEnding::CrLf.normalize(&text); + assert_eq!( + normalized.slice_to_cow(..), + "hello\r\nworld toast and jam\r\nthe end\r\nhi" + ); + + let text = Rope::from("\n"); + let normalized = LineEnding::Lf.normalize(&text); + assert_eq!(normalized.slice_to_cow(..), "\n"); + let normalized = LineEnding::CrLf.normalize(&text); + assert_eq!(normalized.slice_to_cow(..), "\r\n"); + + let text = Rope::from("\r\n"); + let normalized = LineEnding::Lf.normalize(&text); + assert_eq!(normalized.slice_to_cow(..), "\n"); + let normalized = LineEnding::CrLf.normalize(&text); + assert_eq!(normalized.slice_to_cow(..), "\r\n"); + + // `\r` is always normalized to the line ending of the file + let text = Rope::from("\r"); + let normalized = 
LineEnding::Lf.normalize(&text); + assert_eq!(normalized.slice_to_cow(..), "\n"); + let normalized = LineEnding::CrLf.normalize(&text); + assert_eq!(normalized.slice_to_cow(..), "\r\n"); + let normalized = LineEnding::Lf.normalize_limited(&text); + assert_eq!(normalized.slice_to_cow(..), "\n"); + + let text = Rope::from("\rtest"); + let normalized = LineEnding::Lf.normalize(&text); + assert_eq!(normalized.slice_to_cow(..), "\ntest"); + let normalized = LineEnding::CrLf.normalize(&text); + assert_eq!(normalized.slice_to_cow(..), "\r\ntest"); + let normalized = LineEnding::Lf.normalize_limited(&text); + assert_eq!(normalized.slice_to_cow(..), "\ntest"); + } + + #[test] + fn chunk_search() { + let text = Rope::from("hello\r\nworld toast and jam\nthe end\nhi"); + let c = FullLeChunkSearch::new(text.iter_chunks(..)); + assert_eq!( + c.collect::>(), + vec![ + (5..7, LeChunkKind::CrLf), + (26..27, LeChunkKind::Lf), + (34..35, LeChunkKind::Lf), + ] + ); + let c = LoneCrChunkSearch::new(text.iter_chunks(..)); + assert_eq!(c.collect::>(), Vec::new()); + + // Test searching across different chunks of text + // (Using a non-Rope iterator to simplify creation, however it should behave the same) + let text = ["a\n", "\n5", "\r\ne\r", "\ntest\r", "\rv"]; + let multi_chunk = FullLeChunkSearch::new(text.into_iter()); + assert_eq!( + multi_chunk.collect::>(), + vec![ + (1..2, LeChunkKind::Lf), + (2..3, LeChunkKind::Lf), + (4..6, LeChunkKind::CrLf), + (7..9, LeChunkKind::CrLf), + (13..14, LeChunkKind::Cr), + (14..15, LeChunkKind::Cr), + ] + ); + + let multi_chunk = LoneCrChunkSearch::new(text.into_iter()); + assert_eq!(multi_chunk.collect::>(), vec![13, 14]); + } +}