diff --git a/Cargo.lock b/Cargo.lock index 02fa2b6c..165c4d08 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -145,10 +145,13 @@ dependencies = [ "num_cpus", "once_cell", "paste", + "rand", + "rand_regex", "rayon", "regex", "rstest", "serde", + "test-log", "unicode_titlecase", ] @@ -614,6 +617,17 @@ dependencies = [ "slab", ] +[[package]] +name = "getrandom" +version = "0.2.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "be4136b2a15dd319360be1c07d9933517ccf0be8f16bf62a3bee4f0d618df427" +dependencies = [ + "cfg-if", + "libc", + "wasi", +] + [[package]] name = "gimli" version = "0.28.0" @@ -902,6 +916,12 @@ dependencies = [ "plotters-backend", ] +[[package]] +name = "ppv-lite86" +version = "0.2.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de" + [[package]] name = "predicates" version = "3.0.3" @@ -948,6 +968,46 @@ dependencies = [ "proc-macro2", ] +[[package]] +name = "rand" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" +dependencies = [ + "libc", + "rand_chacha", + "rand_core", +] + +[[package]] +name = "rand_chacha" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" +dependencies = [ + "ppv-lite86", + "rand_core", +] + +[[package]] +name = "rand_core" +version = "0.6.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" +dependencies = [ + "getrandom", +] + +[[package]] +name = "rand_regex" +version = "0.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8276e2c4e37f1907c587794d9b8b3334e1d44f36e05f8c13d61c50c19c264ae2" +dependencies = [ + "rand", + "regex-syntax", +] + [[package]] name = "rayon" version = "1.7.0" @@ -1193,6 +1253,17 @@ version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3369f5ac52d5eb6ab48c6b4ffdc8efbcad6b89c765749064ba298f2c68a16a76" +[[package]] +name = "test-log" +version = "0.2.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d9601d162c1d77e62c1ea0bc8116cd1caf143ce3af947536c3c9052a1677fe0c" +dependencies = [ + "proc-macro2", + "quote", + "syn 1.0.109", +] + [[package]] name = "thiserror" version = "1.0.47" @@ -1283,6 +1354,12 @@ dependencies = [ "winapi-util", ] +[[package]] +name = "wasi" +version = "0.11.0+wasi-snapshot-preview1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" + [[package]] name = "wasm-bindgen" version = "0.2.87" diff --git a/Cargo.toml b/Cargo.toml index d7955993..05b4259b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -31,10 +31,11 @@ regex = "1.9.5" [features] default = ["all"] -all = ["german", "symbols", "deletion"] +all = ["german", "symbols", "deletion", "squeeze"] german = [] symbols = [] deletion = [] +squeeze = [] [dev-dependencies] assert_cmd = "2.0.12" @@ -45,6 +46,9 @@ rstest = "0.18.2" serde = { version = "1.0.188", features = ["derive"] } glob = "0.3.1" num_cpus = "1.16.0" +rand = "0.8.5" +rand_regex = "0.16.0" +test-log = "0.2.12" [profile.dev.package.insta] # https://insta.rs/docs/quickstart/#optional-faster-runs diff --git a/src/lib.rs b/src/lib.rs index a23c622f..605cf21f 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -18,6 +18,7 @@ pub use crate::stages::Stage; use log::{debug, info}; +use scoped::Scope; use std::io::{BufRead, Error, Write}; /// Internal macros. Have to live here to be usable in unit, not just integration @@ -26,9 +27,13 @@ use std::io::{BufRead, Error, Write}; #[macro_use] pub mod macros; +/// Items related to scopes, which are used to limit the application of stages. +pub mod scoped; /// Main components around [`Stage`]s and their [processing][Stage::substitute]. pub mod stages; +/// Pattern signalling global scope, aka matching entire inputs. +pub const GLOBAL_SCOPE: &str = r".*"; const EXPECTABLE_AVERAGE_WORD_LENGTH_BYTES: u8 = 16; const EXPECTABLE_AVERAGE_MATCHES_PER_WORD: u8 = 2; @@ -45,7 +50,7 @@ const EXPECTABLE_AVERAGE_MATCHES_PER_WORD: u8 = 2; /// /// /// ``` -/// use betterletters::{apply, stages::GermanStage, Stage}; +/// use betterletters::{apply, scoped::Scope, stages::GermanStage, Stage}; /// use std::io::Cursor; /// /// let stages = vec![Box::new(GermanStage::default())].into_iter().map(|g| g as Box).collect(); @@ -53,7 +58,7 @@ const EXPECTABLE_AVERAGE_MATCHES_PER_WORD: u8 = 2; /// let mut input = Cursor::new("Gruess Gott!\n"); /// let mut output: Vec = Vec::new(); /// -/// apply(&stages, &mut input, &mut output); +/// apply(&stages, &Scope::default(), &mut input, &mut output); /// /// assert_eq!(output, "Grüß Gott!\n".as_bytes()); /// ``` @@ -68,6 +73,7 @@ const EXPECTABLE_AVERAGE_MATCHES_PER_WORD: u8 = 2; /// - when the destination cannot be flushed before exiting pub fn apply( stages: &Vec>, + scope: &Scope, source: &mut impl BufRead, destination: &mut impl Write, ) -> Result<(), Error> { @@ -79,8 +85,7 @@ pub fn apply( debug!("Starting processing line: '{}'", buf.escape_debug()); for stage in stages { - let result = stage.substitute(&buf)?; - buf = result.into(); + buf = stage.apply(&buf, scope); } debug!("Processed line, will write out: '{}'", buf.escape_debug()); diff --git a/src/main.rs b/src/main.rs index 1768da9a..64fd95c2 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,10 +1,10 @@ -use betterletters::apply; #[cfg(feature = "deletion")] use betterletters::stages::DeletionStage; #[cfg(feature = "german")] use betterletters::stages::GermanStage; #[cfg(feature = "symbols")] use betterletters::stages::SymbolsStage; +use betterletters::{apply, stages::SqueezeStage}; use log::{debug, info}; use std::io::{self, BufReader, Error}; @@ -13,71 +13,103 @@ fn main() -> Result<(), Error> { .format_timestamp_micros() // High precision is nice for benchmarks .init(); - info!("Launching app"); + let args = cli::Args::init(); + info!("Launching app with args: {:?}", args); - let mut args = cli::Args::init(); + // args.append_stage_if_missing_but_required(cli::Stage::German, args.german_prefer_original); + // args.append_stage_if_missing_but_required(cli::Stage::Symbols, false /* None yet */); + // args.append_stage_if_missing_but_required( + // cli::Stage::Deletion, + // args.deletion_pattern.is_some(), + // ); - args.append_stage_if_missing_but_required(cli::Stage::German, args.german_prefer_original); - args.append_stage_if_missing_but_required(cli::Stage::Symbols, false /* None yet */); - args.append_stage_if_missing_but_required( - cli::Stage::Deletion, - args.deletion_pattern.is_some(), - ); + let mut stages: Vec> = Vec::new(); - let stages = args - .stages - .iter() - .map(|stage| { - let res: Result, _> = match stage { - #[cfg(feature = "german")] - cli::Stage::German => Ok(Box::new(GermanStage::new(args.german_prefer_original))), + if args.squeeze { + stages.push(Box::::default()); + debug!("Loaded stage: Squeeze"); + } - #[cfg(feature = "symbols")] - cli::Stage::Symbols => Ok(Box::new(SymbolsStage)), + if args.german { + stages.push(Box::new(GermanStage::new(args.german_prefer_original))); + debug!("Loaded stage: German"); + } - #[cfg(feature = "deletion")] - cli::Stage::Deletion => Ok(Box::new(DeletionStage::new( - args.deletion_pattern.clone().ok_or(Error::new( - io::ErrorKind::InvalidInput, // Abuse... - "Deletion requested but no delete option specified.", - ))?, - ))), - }; + if args.symbols { + stages.push(Box::::default()); + debug!("Loaded stage: Symbols"); + } - debug!("Loaded stage: {:?}", stage); + if args.delete { + stages.push(Box::::default()); + debug!("Loaded stage: Deletion"); + } - res - }) - .collect::, Error>>()?; + // let stages = args + // .stages + // .iter() + // .map(|stage| { + // let res: Result, _> = match stage { + // #[cfg(feature = "german")] + // cli::Stage::German => Ok(Box::new(GermanStage::new(args.german_prefer_original))), + + // #[cfg(feature = "symbols")] + // cli::Stage::Symbols => Ok(Box::new(SymbolsStage)), + // #[cfg(feature = "deletion")] + // cli::Stage::Deletion => Ok(Box::new(DeletionStage::new( + // args.scope.clone().ok_or(Error::new( + // io::ErrorKind::InvalidInput, // Abuse... + // "Deletion requested but no delete option specified.", + // ))?, + // ))), + // }; + + // debug!("Loaded stage: {:?}", stage); + + // res + // }) + // .collect::, Error>>()?; let mut source = BufReader::new(io::stdin()); let mut destination = io::stdout(); - apply(&stages, &mut source, &mut destination)?; + apply(&stages, &args.scope.into(), &mut source, &mut destination)?; info!("Done, exiting"); Ok(()) } mod cli { - use clap::{Parser, ValueEnum}; - use log::info; + use betterletters::GLOBAL_SCOPE; + use clap::Parser; #[derive(Parser, Debug)] #[command(author, version, about, long_about = None)] pub(super) struct Args { - /// Stages to use. - /// - /// Stages are applied in the order given. Options to individual stages are - /// those prefixed by the stage's name. All other options apply globally, across - /// stages. + /// Scope to apply to, as a regular expression pattern + #[arg(value_name = "SCOPE", default_value = GLOBAL_SCOPE)] + pub scope: regex::Regex, + /// Replace what was matched with this value + #[arg( + value_name = "REPLACEMENT", + // conflicts_with = "delete", + env = "REPLACE", + )] + pub replace: Option, + /// Perform substitutions on German words, such as 'Gruesse' to 'Grüße' /// - /// If a stage-specific option is given but the corresponding stage is - /// not, the stage is appended automatically. - // Use proper "command chaining" once available: - // https://github.com/clap-rs/clap/issues/2222 - // https://github.com/TeXitoi/structopt/issues/84#issuecomment-1443764459 - #[arg(value_enum, required = false, num_args = 0.., env = "BETTERLETTERS_STAGES")] - pub stages: Vec, + /// Compound words are supported. Words _legally_ containing alternative Umlaut + /// spellings are respected and not modified (e.g., 'Abente_ue_r'). + #[arg(short, long, env = "GERMAN")] + pub german: bool, + /// Perform substitutions on symbols, such as '!=' to '≠', '->' to '→' + #[arg(short = 'S', long, env = "SYMBOLS")] + pub symbols: bool, + /// Delete what was matched + #[arg(short, long, env = "DELETE", requires = "scope")] + pub delete: bool, + /// Squeeze consecutive occurrences of what was matched into one + #[arg(short, long, env = "SQUEEZE", requires = "scope")] + pub squeeze: bool, /// When some original version and its replacement are equally legal, prefer the /// original and do not modify. /// @@ -85,53 +117,41 @@ mod cli { /// words: by default, the tool would prefer the latter. // More fine-grained control is not available. We are not in the business of // natural language processing or LLMs, so that's all we can offer... - #[arg(long, env = "BETTERLETTERS_GERMAN_PREFER_ORIGINAL")] + #[arg(long, env = "GERMAN_PREFER_ORIGINAL")] pub german_prefer_original: bool, - /// Delete all characters matching the given regex. - /// - /// *Required* if deletion is requested. - // Again, this would be nicer with proper command chaining - // (https://github.com/clap-rs/clap/issues/2222). - #[arg( - short, - long, - value_name = "REGEX", - env = "BETTERLETTERS_DELETE", - visible_alias = "delete" - )] - pub deletion_pattern: Option, } impl Args { pub(super) fn init() -> Self { Self::parse() } - - pub(super) fn append_stage_if_missing_but_required( - &mut self, - stage: Stage, - relevant_options_present: bool, - ) { - if relevant_options_present && !self.stages.contains(&stage) { - info!( - "Arguments specific to {:?} stage found, but stage not specified. Adding.", - stage - ); - self.stages.push(stage); - } - } } - #[derive(Debug, Clone, Copy, PartialEq, Eq, ValueEnum)] - pub(super) enum Stage { - /// Substitutions like 'Gruesse!' to 'Grüße!' - #[cfg(feature = "german")] - German, - /// Substitutions like '!=' to '≠', '->' to '→' - #[cfg(feature = "symbols")] - Symbols, - /// Deletions of character classes - #[cfg(feature = "deletion")] - Deletion, - } + // pub(super) fn append_stage_if_missing_but_required( + // &mut self, + // stage: Stage, + // relevant_options_present: bool, + // ) { + // // if relevant_options_present && !self.stages.contains(&stage) { + // // info!( + // // "Arguments specific to {:?} stage found, but stage not specified. Adding.", + // // stage + // // ); + // // self.stages.push(stage); + // // } + // } + // } + + // #[derive(Debug, Clone, Copy, PartialEq, Eq, ValueEnum)] + // pub(super) enum Stage { + // /// Substitutions like 'Gruesse!' to 'Grüße!' + // #[cfg(feature = "german")] + // German, + // /// Substitutions like '!=' to '≠', '->' to '→' + // #[cfg(feature = "symbols")] + // Symbols, + // /// Deletions of character classes + // #[cfg(feature = "deletion")] + // Deletion, + // } } diff --git a/src/scoped.rs b/src/scoped.rs new file mode 100644 index 00000000..d2765e0d --- /dev/null +++ b/src/scoped.rs @@ -0,0 +1,305 @@ +use log::debug; +use regex::Regex; + +use crate::GLOBAL_SCOPE; + +/// A scope to apply a [`Stage`] to. +/// +/// A scope is a newtype around a regular expression pattern, and used to split a given +/// string into [`ScopeStatus`]es. The scope can span any regex, including the entire +/// input (`.*`), or individual characters. +/// +/// Special care should be given to greedy matching, which is the +/// [default](https://docs.rs/regex/latest/regex/#repetitions). It might extend to scope +/// further than intended. +#[derive(Debug, Clone)] +pub struct Scope(Regex); + +impl Scope { + /// Create a new [`Scope`]. + #[must_use] + pub fn new(pattern: Regex) -> Self { + Self(pattern) + } +} + +impl From for Scope { + fn from(r: Regex) -> Self { + Self(r) + } +} + +impl From for Regex { + fn from(s: Scope) -> Self { + s.0 + } +} + +impl From<&Scope> for Regex { + fn from(s: &Scope) -> Self { + s.0.clone() + } +} + +impl From<&Regex> for Scope { + fn from(r: &Regex) -> Self { + Self(r.clone()) + } +} + +impl Default for Scope { + /// Create a new [`Scope`] that matches everything ([`GLOBAL_SCOPE`]). + fn default() -> Self { + Self(Regex::new(GLOBAL_SCOPE).unwrap()) + } +} + +/// A trait for splitting a string into [`ScopeStatus`]es. +/// +/// [`Stage`]s are [`Scoped`], such that their processing can be applied only to parts +/// in some [`Scope`] (these are [`InScope`]), and not to parts outside of it (these +/// are [`OutOfScope`]). +pub trait Scoped { + /// Given some `input` and a corresponding [`Scope`], split the `input` into + /// consecutive [`ScopeStatus`]es according to the `scope`. + /// + /// This is like [`Regex::find_iter`] (matched items are considered [`InScope`]), + /// but also returns [`OutOfScope`] (i.e., unmatched) items, interleaved. As such, + /// reassembling all returned [`str`] parts yields back the original `input`. + /// + /// The returned [`Vec`] does not necessarily contain alternatingly scoped slices. + /// Multiple [`InScope`] items in a row might be returned if corresponding + /// consecutive matches are found. However, [`OutOfScope`] items cannot follow one + /// another directly. Empty [`str`] slices are not returned. + /// + // # Examples + // + // ``` + // use regex::Regex; + // use text_processing_pipeline::scoped::{Scope, Scoped, ScopeStatus}; + // ``` + fn split_by_scope<'a>(&self, input: &'a str, scope: &Scope) -> Vec> { + let mut scopes = Vec::new(); + let mut last_end = 0; + + for m in scope.0.find_iter(input) { + if m.start() > last_end { + scopes.push(ScopeStatus::OutOfScope(&input[last_end..m.start()])); + } + + scopes.push(ScopeStatus::InScope(m.as_str())); + last_end = m.end(); + } + + scopes.push(ScopeStatus::OutOfScope(&input[last_end..])); + + scopes.retain(|s| { + let s: &str = s.into(); + !s.is_empty() + }); + + debug!("Scopes to work on: {:?}", scopes); + scopes + } +} + +/// Indicates whether a given string part is in scope. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum ScopeStatus<'a> { + /// The given string part is in scope for processing. + InScope(&'a str), + /// The given string part is out of scope for processing. + OutOfScope(&'a str), +} + +impl<'a> From<&'a ScopeStatus<'_>> for &'a str { + /// Get the underlying string slice of a [`ScopeStatus`]. + /// + /// All variants contain such a slice, so this is a convenient method. + fn from(s: &'a ScopeStatus) -> Self { + match s { + ScopeStatus::InScope(s) | ScopeStatus::OutOfScope(s) => s, + } + } +} + +#[cfg(test)] +mod tests { + use super::ScopeStatus::{InScope, OutOfScope}; + use super::*; + use rstest::rstest; + + struct Dummy; + impl Scoped for Dummy {} + + /// Run some manual testing for sanity. Random/fuzzing/property testing is much + /// better in this case. See below. + #[rstest] + #[case("a", "", vec![OutOfScope("a")])] + #[case("", "a", vec![])] // Empty results are discarded + // + #[case("a", "a", vec![InScope("a")])] + #[case("a", "b", vec![OutOfScope("a")])] + // + #[case("a", ".*", vec![InScope("a")])] + #[case("a", ".+?", vec![InScope("a")])] + // + #[case("a\na", ".*", vec![InScope("a"), OutOfScope("\n"), InScope("a")])] + #[case("a\na", "(?s).*", vec![InScope("a\na")])] // Dot matches newline + // + #[case("abc", "a", vec![InScope("a"), OutOfScope("bc")])] + // + #[case("abc", r"\w", vec![InScope("a"), InScope("b"), InScope("c")])] + #[case("abc", r"\W", vec![OutOfScope("abc")])] + #[case("abc", r"\w+", vec![InScope("abc")])] + // + #[case("Work 69 on 420 words", r"\w+", vec![InScope("Work"), OutOfScope(" "), InScope("69"), OutOfScope(" "), InScope("on"), OutOfScope(" "), InScope("420"), OutOfScope(" "), InScope("words")])] + #[case("Ignore 69 the 420 digits", r"\p{letter}+", vec![InScope("Ignore"), OutOfScope(" 69 "), InScope("the"), OutOfScope(" 420 "), InScope("digits")])] + fn test_split_by_scope( + #[case] input: &str, + #[case] scope: &str, + #[case] expected: Vec, + ) { + let scope = Scope::from(Regex::new(scope).unwrap()); + let dummy = Dummy {}; + + let scopes = dummy.split_by_scope(input, &scope); + + assert_eq!(scopes, expected); + } + + mod random { + use std::time::{Duration, Instant}; + + use super::*; + + use log::info; + use rand; + use rand::seq::SliceRandom; + use rand::Rng; + use test_log::test; + + fn generate_random_regex(mut rng: &mut rand::rngs::ThreadRng) -> Option { + let atoms: [&str; 7] = [".", "\\d", "\\D", "\\w", "\\W", "\\s", "\\S"]; + let quantifiers: [&str; 5] = ["*", "+", "?", "{2,5}", "{3}"]; + let others: [&str; 3] = ["|", "^", "$"]; + let letters: [&str; 26] = [ + "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", + "q", "r", "s", "t", "u", "v", "w", "x", "y", "z", + ]; + + let mut regex = String::new(); + + for _ in 0..rng.gen_range(1..=2) { + if rng.gen_bool(0.3) { + regex.push_str(atoms.choose(&mut rng).unwrap()); + } + + if rng.gen_bool(0.6) { + let letter = letters.choose(&mut rng).unwrap(); + if rng.gen_bool(0.5) { + let uc = letter.to_uppercase(); + regex.push_str(uc.as_str()); + } else { + regex.push_str(letter); + } + } + + if rng.gen_bool(0.3) { + regex.push_str(quantifiers.choose(&mut rng).unwrap()); + } + + if rng.gen_bool(0.1) { + regex.push_str(others.choose(&mut rng).unwrap()); + } + } + + Regex::new(regex.as_str()).ok() + } + + /// Run fuzz-like testing. + /// + /// This is much like fuzzing, but a bit more manually controlled and part of + /// the core test harness, hence running always. Property testing like + /// `proptest` would be much better ("given some input in this shape, and some + /// regex, test the property that reassembly works"), but setup for that crate + /// is substantial. The below approach is 'good enough' and emulates property + /// testing a fair bit. We just need some random inputs and some short-ish but + /// random regex to split by (generating random, valid regex is... interesting). + /// + /// Run for a duration instead of a fixed number of tries, as we would have to + /// choose that fixed number rather low for CI to not be too slow. That would + /// waste potential when running locally. + #[test] + fn test_scoping_randomly() { + let mut n_tries = 0; + let mut n_matches = 0; + + let duration = if std::env::var("CI").is_ok() { + Duration::from_secs(5) + } else { + // SORRY if this crashed the test on your machine. Flaky one :( + Duration::from_millis(500) + }; + + let mut rng = rand::thread_rng(); + + // "Anything but 'other'", see also: + // https://docs.rs/regex/latest/regex/#matching-one-character + // https://www.unicode.org/reports/tr44/tr44-24.html#General_Category_Values + let pattern = r"\P{other}+"; + let gen = rand_regex::Regex::compile(pattern, 100).unwrap(); + + let now = Instant::now(); + let dummy = Dummy {}; + + loop { + n_tries += 1; + + let Some(regex) = generate_random_regex(&mut rng) else { + continue; + }; + let scope = Scope::from(regex); + let input: String = rng.sample(&gen); + + let scopes = dummy.split_by_scope(&input, &scope); + + if scopes.iter().any(|s| match s { + InScope(_) => true, + OutOfScope(_) => false, + }) { + n_matches += 1; + } + + let mut reassembled = String::new(); + for scope in scopes { + reassembled.push_str((&scope).into()); + } + + assert_eq!(input, reassembled); + + if now.elapsed() > duration { + break; + } + } + + info!( + // To test anything, we actually need matches so splits happen. + "Processed {} inputs, of which {} were matched and successfully reassembled", + n_tries, n_matches + ); + + assert!( + n_matches >= n_tries / 20, + "Too few regex matches; try lowering regex length" + ); + + assert!( + n_tries > 250, + // Might happen in CI, but we should ensure a certain lower bound; + // locally, many more tests can run. + "Too few tries; is the host machine very slow?" + ); + } + } +} diff --git a/src/stages/deletion/mod.rs b/src/stages/deletion/mod.rs index 9f65f14e..a65c04f2 100644 --- a/src/stages/deletion/mod.rs +++ b/src/stages/deletion/mod.rs @@ -1,27 +1,16 @@ -use super::{tooling::StageResult, Stage}; -use regex::Regex; +use crate::scoped::Scoped; + +use super::Stage; /// Deletes all matches of a given regex. -#[derive(Debug, Clone)] +#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)] #[allow(clippy::module_name_repetitions)] -pub struct DeletionStage { - pattern: Regex, -} +pub struct DeletionStage {} -impl Stage for DeletionStage { - fn substitute(&self, input: &str) -> StageResult { - Ok(self.pattern.replace_all(input, "").to_string().into()) - } -} +impl Scoped for DeletionStage {} -impl DeletionStage { - /// Create a new [`DeletionStage`]. - /// - /// # Arguments - /// - /// * `pattern`: The regex to use for deletion. - #[must_use] - pub fn new(pattern: Regex) -> Self { - Self { pattern } +impl Stage for DeletionStage { + fn substitute(&self, _input: &str) -> String { + String::new() } } diff --git a/src/stages/german/driver.rs b/src/stages/german/driver.rs index f713bfaf..647e3547 100644 --- a/src/stages/german/driver.rs +++ b/src/stages/german/driver.rs @@ -1,9 +1,9 @@ +use crate::scoped::Scoped; use crate::stages::{ german::{ machine::{StateMachine, Transition}, words::{Replace, Replacement, WordCasing}, }, - tooling::StageResult, Stage, }; use cached::proc_macro::cached; @@ -38,7 +38,7 @@ use unicode_titlecase::StrTitleCase; /// use betterletters::{Stage, stages::GermanStage}; /// /// let stage = GermanStage::default(); -/// let result: String = stage.substitute("Gruess Gott!").unwrap().into(); +/// let result = stage.substitute("Gruess Gott!"); /// assert_eq!(result, "Grüß Gott!"); /// ``` /// @@ -51,7 +51,7 @@ use unicode_titlecase::StrTitleCase; /// use betterletters::{Stage, stages::GermanStage}; /// /// let stage = GermanStage::default(); -/// let result: String = stage.substitute("Du Suesswassertagtraeumer!").unwrap().into(); +/// let result = stage.substitute("Du Suesswassertagtraeumer!"); /// assert_eq!(result, "Du Süßwassertagträumer!"); /// ``` /// @@ -79,7 +79,7 @@ use unicode_titlecase::StrTitleCase; /// "Steuerung", // should not be "Steürung" /// ] { /// let stage = GermanStage::default(); -/// let result: String = stage.substitute(word).unwrap().into(); +/// let result = stage.substitute(word); /// assert_eq!(result, word.to_string()); /// } /// ``` @@ -119,7 +119,7 @@ use unicode_titlecase::StrTitleCase; /// use betterletters::{Stage, stages::GermanStage}; /// /// let stage = GermanStage::default(); -/// let result: String = stage.substitute("aEpFeL").unwrap().into(); +/// let result = stage.substitute("aEpFeL"); /// /// // Error: MiXeD CaSe noun without leading capital letter /// assert_eq!(result, "aEpFeL"); @@ -136,7 +136,7 @@ use unicode_titlecase::StrTitleCase; /// use betterletters::{Stage, stages::GermanStage}; /// /// let stage = GermanStage::default(); -/// let result: String = stage.substitute("AePfEl").unwrap().into(); +/// let result: String = stage.substitute("AePfEl"); /// /// // OK: MiXeD CaSe words nouns are okay, *if* starting with a capital letter /// assert_eq!(result, "ÄPfEl"); @@ -148,7 +148,7 @@ use unicode_titlecase::StrTitleCase; /// use betterletters::{Stage, stages::GermanStage}; /// /// let stage = GermanStage::default(); -/// let f = |word: &str| -> String {stage.substitute(word).unwrap().into()}; +/// let f = |word: &str| -> String {stage.substitute(word)}; /// /// // OK: The normal case, adjective lowercase /// assert_eq!(f("Voll suess!"), "Voll süß!"); @@ -215,7 +215,7 @@ use unicode_titlecase::StrTitleCase; /// use betterletters::{Stage, stages::GermanStage}; /// /// let stage = GermanStage::default(); -/// let result: String = stage.substitute("\0Schoener 你好 Satz... 👋🏻\r\n\n").unwrap().into(); +/// let result = stage.substitute("\0Schoener 你好 Satz... 👋🏻\r\n\n"); /// assert_eq!(result, "\0Schöner 你好 Satz... 👋🏻\r\n\n"); /// ``` /// @@ -300,12 +300,12 @@ impl GermanStage { /// ] { /// // `false`: prefer replacement /// let stage = GermanStage::new(false); - /// let result: String = stage.substitute(original).unwrap().into(); + /// let result = stage.substitute(original); /// assert_eq!(result, output.to_string()); /// /// // `true`: prefer original /// let stage = GermanStage::new(true); - /// let result: String = stage.substitute(original).unwrap().into(); + /// let result = stage.substitute(original); /// assert_eq!(result, original.to_string()); /// } /// ``` @@ -326,8 +326,10 @@ impl Default for GermanStage { } } +impl Scoped for GermanStage {} + impl Stage for GermanStage { - fn substitute(&self, input: &str) -> StageResult { + fn substitute(&self, input: &str) -> String { const INDICATOR: char = '\0'; debug!("Working on input '{}'", input.escape_debug()); @@ -385,7 +387,8 @@ impl Stage for GermanStage { debug!("Final output string is '{}'", output.escape_debug()); - Ok(output.into()) + // Ok(output.into()) + output } } @@ -626,7 +629,8 @@ mod tests { ) (|data: &TestProcess| { let input = word.clone(); let stage = GermanStage{ prefer_original: false }; - let result: String = stage.substitute(&input).unwrap().into(); + let result: String = stage.substitute(&input); + // .unwrap().into(); insta::assert_yaml_snapshot!(data.to_string(), result); } ) diff --git a/src/stages/mod.rs b/src/stages/mod.rs index caffa428..c671a527 100644 --- a/src/stages/mod.rs +++ b/src/stages/mod.rs @@ -2,14 +2,52 @@ mod deletion; #[cfg(feature = "german")] mod german; +#[cfg(feature = "squeeze")] mod squeeze; #[cfg(feature = "symbols")] mod symbols; -/// Tooling (types, traits, ...) around stages. -pub mod tooling; pub use deletion::DeletionStage; pub use german::GermanStage; pub use squeeze::SqueezeStage; pub use symbols::SymbolsStage; -pub use tooling::Stage; + +use crate::scoped::{ + Scope, + ScopeStatus::{InScope, OutOfScope}, + Scoped, +}; + +/// A stage in the processing pipeline, as initiated by [`crate::apply`]. +/// +/// Stages are the core of the text processing pipeline and can be applied in any order, +/// [any number of times each](https://en.wikipedia.org/wiki/Idempotence) (more than +/// once being wasted work, though). +pub trait Stage: Send + Sync + Scoped { + /// Substitute text in a given `input` string. + /// + /// This is infallible: it cannot fail in the sense of [`Result`]. It can only + /// return incorrect results, which would be bugs (please report). + fn substitute(&self, input: &str) -> String; + + /// Applies this stage to an `input`, working only on [`InScope`] items and + /// forwarding [`OutOfScope`] items unchanged. + /// + /// Always returns an owned version of the `input`, even for stages where that might + /// technically be unnecessary. + /// + /// This is infallible: it cannot fail in the sense of [`Result`]. It can only + /// return incorrect results, which would be bugs (please report). + fn apply(&self, input: &str, scope: &Scope) -> String { + let mut out = String::with_capacity(input.len()); + + for scope in self.split_by_scope(input, scope) { + match scope { + InScope(s) => out.push_str(&self.substitute(s)), + OutOfScope(s) => out.push_str(s), + } + } + + out + } +} diff --git a/src/stages/squeeze/mod.rs b/src/stages/squeeze/mod.rs index 9164f1c4..44813f07 100644 --- a/src/stages/squeeze/mod.rs +++ b/src/stages/squeeze/mod.rs @@ -1,67 +1,91 @@ -use std::ops::Range; - -use super::{tooling::StageResult, Stage}; +use super::Stage; +use crate::scoped::{Scope, ScopeStatus::InScope, Scoped}; use regex::Regex; -/// Deletes all matches of a given regex. -#[derive(Debug, Clone)] +/// Squeezes all consecutive matched scopes into a single occurrence. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)] #[allow(clippy::module_name_repetitions)] -pub struct SqueezeStage { - pattern: Regex, -} +pub struct SqueezeStage {} + +impl Scoped for SqueezeStage {} impl Stage for SqueezeStage { - fn substitute(&self, input: &str) -> StageResult { + fn substitute(&self, _input: &str) -> String { + unimplemented!("Squeezing works without substituting") + // Wouldn't need an owned `String` for this stage, but return signature requires // it anyway. - let mut out = String::with_capacity(input.len()); + // let mut out = String::with_capacity(input.len()); - let mut left = 0; // Left bound of current substring we *might* push - let mut previous: Option = None; + // let mut left = 0; // Left bound of current substring we *might* push + // let mut previous: Option = None; - for m in self.pattern.find_iter(input) { - let flush = previous.map_or(true, |p| !ranges_are_consecutive(&p.range(), &m.range())); + // for m in self.pattern.find_iter(input) { + // let flush = previous.map_or(true, |p| !ranges_are_consecutive(&p.range(), &m.range())); - if flush { - out.push_str(&input[left..m.end()]); - } + // if flush { + // out.push_str(&input[left..m.end()]); + // } - left = m.end(); - previous = Some(m); - } + // left = m.end(); + // previous = Some(m); + // } - out.push_str(&input[left..]); // Remainder; entire string if no matches + // out.push_str(&input[left..]); // Remainder; entire string if no matches - Ok(out.into()) + // // Ok(out.into()) + // out } -} -fn ranges_are_consecutive(left: &Range, right: &Range) -> bool { - left.end == right.start -} + fn apply(&self, input: &str, scope: &Scope) -> String { + let mut out = String::with_capacity(input.len()); -impl SqueezeStage { - /// Create a new instance. - /// - /// # Arguments - /// - /// * `pattern`: The regex to use for squeezing. - /// - /// # Panics - /// - /// Panics if the given pattern cannot be prepended with `(?U)`, which is used to - /// [render greedy quantifiers - /// non-greedy](https://docs.rs/regex/latest/regex/#grouping-and-flags), and vice - /// versa. - #[must_use] - pub fn new(pattern: &Regex) -> Self { - let pattern = Regex::new(&format!(r"(?U){pattern}")) - .expect("should be able to prepend (?U) to pattern"); - - Self { pattern } + let scope: Scope = Regex::new(&format!(r"(?U){}", Regex::from(scope))) + .expect("should be able to prepend (?U) to pattern") + .into(); + + let mut previous = None; + for scope in self.split_by_scope(input, &scope) { + if let InScope(_) = scope { + if let Some(InScope(_)) = previous { + continue; + } + } + + out.push_str((&scope).into()); + previous = Some(scope); + } + + out } } +// fn ranges_are_consecutive(left: &Range, right: &Range) -> bool { +// left.end == right.start +// } + +// impl SqueezeStage { +// /// Create a new instance. +// /// +// /// # Arguments +// /// +// /// * `pattern`: The regex to use for squeezing. +// /// +// /// # Panics +// /// +// /// Panics if the given pattern cannot be prepended with `(?U)`, which is used to +// /// [render greedy quantifiers +// /// non-greedy](https://docs.rs/regex/latest/regex/#grouping-and-flags), and vice +// /// versa. +// #[must_use] +// pub fn new(pattern: &Regex) -> Self { +// let pattern = Regex::new(&format!(r"(?U){pattern}")) +// .expect("should be able to prepend (?U) to pattern"); + +// Self { pattern } +// } +// } + #[cfg(test)] mod tests { use rstest::rstest; @@ -80,6 +104,9 @@ mod tests { #[case("babab", "a", "babab")] #[case("ababa", "a", "ababa")] // + // Squeezes only the pattern, no other repetitions + #[case("aaabbb", "a", "abbb")] + // // Squeezes start #[case("aab", "a", "ab")] // @@ -114,6 +141,15 @@ mod tests { #[case("Hello World", r"\S", "H W")] #[case("Hello\t\tWorld", r"\S", "H\t\tW")] // + // Deals with overlapping matches; behavior of `regex` crate + #[case("abab", r"aba", "abab")] + #[case("ababa", r"aba", "ababa")] + #[case("ababab", r"aba", "ababab")] + #[case("abababa", r"aba", "abababa")] + // + #[case("aba", r"aba", "aba")] + #[case("abaaba", r"aba", "aba")] + // // Turns greedy quantifiers into non-greedy ones automatically #[case("ab", r"\s+", "ab")] #[case("a b", r"\s+", "a b")] @@ -175,9 +211,9 @@ mod tests { " dirty Strings \t with \t\t messed up whitespace\n\n\n" )] fn test_squeeze(#[case] input: &str, #[case] pattern: Regex, #[case] expected: &str) { - let stage = SqueezeStage::new(&pattern); + let stage = SqueezeStage {}; - let result: String = stage.substitute(input).unwrap().into(); + let result = stage.apply(input, &Scope::new(pattern)); assert_eq!(result, expected); } diff --git a/src/stages/symbols/mod.rs b/src/stages/symbols/mod.rs index be4cb70f..86ed3fb7 100644 --- a/src/stages/symbols/mod.rs +++ b/src/stages/symbols/mod.rs @@ -1,6 +1,6 @@ #[cfg(doc)] use super::GermanStage; -use super::{tooling::StageResult, Stage}; +use crate::{scoped::Scoped, Stage}; use std::collections::VecDeque; /// Replace ASCII symbols (`--`, `->`, `!=`, ...) with proper Unicode equivalents (`–`, @@ -8,7 +8,7 @@ use std::collections::VecDeque; /// /// This stage is greedy, i.e. it will try to replace as many symbols as possible, /// replacing left-to-right as greedily as possible. -#[derive(Debug, Clone, Copy)] +#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)] #[allow(clippy::module_name_repetitions)] pub struct SymbolsStage; @@ -27,6 +27,8 @@ macro_rules! fetch_next { }; } +impl Scoped for SymbolsStage {} + impl Stage for SymbolsStage { /// ## Implementation note /// @@ -40,7 +42,7 @@ impl Stage for SymbolsStage { /// coroutine so it can be yielded again. /// /// All in all, ugly and verbose, would not recommend, but a worthwhile experiment. - fn substitute(&self, input: &str) -> StageResult { + fn substitute(&self, input: &str) -> String { let mut deque = input.chars().collect::>(); let mut out = String::new(); @@ -130,7 +132,7 @@ impl Stage for SymbolsStage { out.push_str(&stack.into_iter().collect::()); } - Ok(out.into()) + out } } @@ -215,7 +217,7 @@ mod tests { #[case("!=", "≠")] fn test_symbol_substitution_base_cases(#[case] input: &str, #[case] expected: &str) { let stage = SymbolsStage; - let result: String = stage.substitute(input).unwrap().into(); + let result = stage.substitute(input); assert_eq!(result, expected); } @@ -247,7 +249,7 @@ mod tests { #[case] expected: &str, ) { let stage = SymbolsStage; - let result: String = stage.substitute(input).unwrap().into(); + let result = stage.substitute(input); assert_eq!(result, expected); } @@ -271,7 +273,7 @@ mod tests { #[case("A!=B", "A≠B")] fn test_symbol_substitution_neighboring_letters(#[case] input: &str, #[case] expected: &str) { let stage = SymbolsStage; - let result: String = stage.substitute(input).unwrap().into(); + let result = stage.substitute(input); assert_eq!(result, expected); } @@ -298,7 +300,7 @@ mod tests { #[case] expected: &str, ) { let stage = SymbolsStage; - let result: String = stage.substitute(input).unwrap().into(); + let result = stage.substitute(input); assert_eq!(result, expected); } @@ -319,7 +321,7 @@ mod tests { #[case("<--X-->", "⟵X⟶")] fn test_symbol_substitution_disrupting_symbols(#[case] input: &str, #[case] expected: &str) { let stage = SymbolsStage; - let result: String = stage.substitute(input).unwrap().into(); + let result = stage.substitute(input); assert_eq!(result, expected); } @@ -332,7 +334,7 @@ mod tests { #[case("->In->Out->", "→In→Out→")] fn test_symbol_substitution_sentences(#[case] input: &str, #[case] expected: &str) { let stage = SymbolsStage; - let result: String = stage.substitute(input).unwrap().into(); + let result = stage.substitute(input); assert_eq!(result, expected); } @@ -368,7 +370,7 @@ mod tests { #[case("!=!=!=", "≠≠≠")] fn test_symbol_substitution_ambiguous_sequences(#[case] input: &str, #[case] expected: &str) { let stage = SymbolsStage; - let result: String = stage.substitute(input).unwrap().into(); + let result = stage.substitute(input); assert_eq!(result, expected); } @@ -387,7 +389,7 @@ mod tests { #[case("≥", "≥")] fn test_symbol_substitution_existing_symbol(#[case] input: &str, #[case] expected: &str) { let stage = SymbolsStage; - let result: String = stage.substitute(input).unwrap().into(); + let result = stage.substitute(input); assert_eq!(result, expected); } @@ -410,7 +412,7 @@ mod tests { #[case("https://->", "https://->")] // Pivot point fn test_symbol_substitution_uri(#[case] input: &str, #[case] expected: &str) { let stage = SymbolsStage; - let result: String = stage.substitute(input).unwrap().into(); + let result = stage.substitute(input); assert_eq!(result, expected); } diff --git a/src/stages/tooling.rs b/src/stages/tooling.rs deleted file mode 100644 index 7537fb44..00000000 --- a/src/stages/tooling.rs +++ /dev/null @@ -1,62 +0,0 @@ -use std::error::Error; - -/// An error that occurred during processing in a stage. -#[derive(Debug, Copy, Clone)] -pub struct StageError; - -impl From for std::io::Error { - fn from(e: StageError) -> Self { - std::io::Error::new(std::io::ErrorKind::Other, e.to_string()) - } -} - -impl Error for StageError {} - -impl std::fmt::Display for StageError { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(f, "Error in processing stage") - } -} - -/// A string that has been substituted by a stage. -/// -/// This is a -/// [newtype](https://doc.rust-lang.org/rust-by-example/generics/new_types.html), used -/// for increased clarity. -#[derive(Debug)] -pub struct SubstitutedString(String); - -/// Convert a [`SubstitutedString`] into a [`String`]. -/// -/// Convenience method. -impl From for String { - fn from(s: SubstitutedString) -> Self { - s.0 - } -} - -/// Convert a [`String`] into a [`SubstitutedString`]. -/// -/// Convenience method. -impl From for SubstitutedString { - fn from(s: String) -> Self { - Self(s) - } -} - -/// The [`Result`] of a stage: we either [substituted properly][SubstitutedString], or [failed][StageError]. -pub type StageResult = Result; - -/// A stage in the processing pipeline, as initiated by [`crate::apply`]. -/// -/// Stages are the core of the text processing pipeline and can be applied in any order, -/// [any number of times each](https://en.wikipedia.org/wiki/Idempotence) (more than -/// once being wasted work, though). -pub trait Stage: Send + Sync { - /// Substitute text in a given `input` string. - /// - /// # Errors - /// - /// This method can error out if the stage fails to process the input. - fn substitute(&self, input: &str) -> StageResult; -} diff --git a/tests/cli.rs b/tests/cli.rs index 431fd696..658cb26b 100644 --- a/tests/cli.rs +++ b/tests/cli.rs @@ -1,66 +1,68 @@ -struct Sample { - content: String, - name: String, -} +//! End-to-end tests for the CLI. Main purpose is exercising multiple combinations of +//! inputs/flags/options. #[cfg(test)] mod tests { use assert_cmd::Command; - use glob::glob; - use rstest::{fixture, rstest}; - use std::fs; - - use super::*; - - #[fixture] - fn samples() -> Vec { - let mut samples = Vec::new(); - - for entry in glob("tests/samples/**/*.txt").unwrap() { - let path = entry.unwrap(); - let sample_number = path.file_stem().unwrap().to_str().unwrap(); - let stage_name = path - .parent() - .unwrap() - .file_name() - .unwrap() - .to_str() - .unwrap(); - - let sample = fs::read_to_string(&path).unwrap(); - - samples.push(Sample { - content: sample, - name: format!("{}-{}", stage_name, sample_number), - }); - } - - assert!(!samples.is_empty(), "No samples found, wrong glob?"); - - samples - } + use rstest::rstest; + + // There's a test for asserting panic on non-UTF8 input, so it's okay we're doing + // integration tests only with valid UTF8. + static SAMPLES: &[&str] = &[ + r#"Zwei flinke Boxer jagen die quirlige Eva und ihren Mops durch Sylt. +Franz jagt im komplett verwahrlosten Taxi quer durch Bayern. +Zwoelf Boxkaempfer jagen Viktor quer ueber den grossen Sylter Deich. +Vogel Quax zwickt Johnys Pferd Bim. +Sylvia wagt quick den Jux bei Pforzheim. +Polyfon zwitschernd assen Maexchens Voegel Rueben, Joghurt und Quark. +"Fix, Schwyz!" quaekt Juergen bloed vom Pass. +Victor jagt zwoelf Boxkaempfer quer ueber den grossen Sylter Deich. +Falsches Ueben von Xylophonmusik quaelt jeden groesseren Zwerg. +Heizoelrueckstossabdaempfung. +"#, + r#" + + +Duebel + +😂 + + + +"#, + r#"Duebel -> 1.5mm; Wand != 3m²... UEBELTAETER! 😫"#, + ]; #[rstest] - fn test_cli(samples: Vec, #[values(&["german"], &["symbols"])] args: &[&str]) { - // Should rebuild the binary to `target/debug/`. This works if running as an - // integration test (insides `tests/`), but not if running as a unit test (inside - // `src/main.rs` etc.). + fn test_cli( + // This will generate all permutations of all `values`, which is a lot but + // neatly manageable through `insta`. + #[values(1, 2, 3)] n_sample: usize, + #[values(&["--german"], &["--symbols"], &["--german", "--symbols"])] args: &[&str], + ) { + // Should rebuild the binary to `target/debug/`. This works if running as + // an integration test (insides `tests/`), but not if running as a unit test + // (inside `src/main.rs` etc.). let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME")).unwrap(); - for sample in samples { - let input = sample.content; - cmd.args(args).write_stdin(input.clone()); + let sample = SAMPLES[n_sample - 1]; + cmd.args(args).write_stdin(sample.clone()); + + let output = cmd.output().expect("failed to execute process"); + assert!( + // Don't forget this; not manually checked by the framework! + output.status.success(), + "Command failed: {:?}", + cmd + ); + + let stdout = String::from_utf8(output.stdout).unwrap(); - let raw_output = cmd.output().unwrap().stdout; - let output = String::from_utf8(raw_output).unwrap(); + let padded_sample_number = format!("{:03}", n_sample); - let snapshot_name = sample.name + "_" + &args.join("-"); - insta::with_settings!({ - description => &input, - }, { - insta::assert_snapshot!(snapshot_name, &output); - }) - } + let snapshot_name = + (padded_sample_number.clone() + "+" + &args.join("_")).replace(' ', "_"); + insta::assert_snapshot!(snapshot_name, &stdout); } #[test] diff --git a/tests/samples/german/002.txt b/tests/samples/german/002.txt deleted file mode 100644 index a3a81d4a..00000000 --- a/tests/samples/german/002.txt +++ /dev/null @@ -1 +0,0 @@ -Hallo Welt, Mauerduebel! diff --git a/tests/samples/german/003.txt b/tests/samples/german/003.txt deleted file mode 100644 index 73510a87..00000000 --- a/tests/samples/german/003.txt +++ /dev/null @@ -1 +0,0 @@ -Schufaeintrag diff --git a/tests/samples/german/004.txt b/tests/samples/german/004.txt deleted file mode 100644 index 7238f3d8..00000000 --- a/tests/samples/german/004.txt +++ /dev/null @@ -1,7 +0,0 @@ - - -Duebel - -😂 - - diff --git a/tests/samples/german/005.txt b/tests/samples/german/005.txt deleted file mode 100644 index 8492776a..00000000 --- a/tests/samples/german/005.txt +++ /dev/null @@ -1 +0,0 @@ -Der Massstab diff --git a/tests/samples/german/006.txt b/tests/samples/german/006.txt deleted file mode 100644 index c86605b3..00000000 --- a/tests/samples/german/006.txt +++ /dev/null @@ -1,2 +0,0 @@ -Sehr droege, dieser -Massstab! \ No newline at end of file diff --git a/tests/samples/german/007.txt b/tests/samples/german/007.txt deleted file mode 100644 index 9eb959a0..00000000 --- a/tests/samples/german/007.txt +++ /dev/null @@ -1 +0,0 @@ -Duebel \ No newline at end of file diff --git a/tests/snapshots/cli__tests__001+--german.snap b/tests/snapshots/cli__tests__001+--german.snap new file mode 100644 index 00000000..85eb17fa --- /dev/null +++ b/tests/snapshots/cli__tests__001+--german.snap @@ -0,0 +1,15 @@ +--- +source: tests/cli.rs +expression: "&stdout" +--- +Zwei flinke Boxer jagen die quirlige Eva und ihren Mops durch Sylt. +Franz jagt im komplett verwahrlosten Taxi quer durch Bayern. +Zwölf Boxkämpfer jagen Viktor quer über den großen Sylter Deich. +Vogel Quax zwickt Johnys Pferd Bim. +Sylvia wagt quick den Jux bei Pforzheim. +Polyfon zwitschernd aßen Maexchens Vögel Rüben, Joghurt und Quark. +"Fix, Schwyz!" quäkt Jürgen blöd vom Pass. +Victor jagt zwölf Boxkämpfer quer über den großen Sylter Deich. +Falsches Üben von Xylophonmusik quält jeden größeren Zwerg. +Heizölrückstoßabdämpfung. + diff --git a/tests/snapshots/cli__tests__001+--german_--symbols.snap b/tests/snapshots/cli__tests__001+--german_--symbols.snap new file mode 100644 index 00000000..85eb17fa --- /dev/null +++ b/tests/snapshots/cli__tests__001+--german_--symbols.snap @@ -0,0 +1,15 @@ +--- +source: tests/cli.rs +expression: "&stdout" +--- +Zwei flinke Boxer jagen die quirlige Eva und ihren Mops durch Sylt. +Franz jagt im komplett verwahrlosten Taxi quer durch Bayern. +Zwölf Boxkämpfer jagen Viktor quer über den großen Sylter Deich. +Vogel Quax zwickt Johnys Pferd Bim. +Sylvia wagt quick den Jux bei Pforzheim. +Polyfon zwitschernd aßen Maexchens Vögel Rüben, Joghurt und Quark. +"Fix, Schwyz!" quäkt Jürgen blöd vom Pass. +Victor jagt zwölf Boxkämpfer quer über den großen Sylter Deich. +Falsches Üben von Xylophonmusik quält jeden größeren Zwerg. +Heizölrückstoßabdämpfung. + diff --git a/tests/samples/german/001.txt b/tests/snapshots/cli__tests__001+--symbols.snap similarity index 91% rename from tests/samples/german/001.txt rename to tests/snapshots/cli__tests__001+--symbols.snap index 63faa718..1053745d 100644 --- a/tests/samples/german/001.txt +++ b/tests/snapshots/cli__tests__001+--symbols.snap @@ -1,3 +1,7 @@ +--- +source: tests/cli.rs +expression: "&stdout" +--- Zwei flinke Boxer jagen die quirlige Eva und ihren Mops durch Sylt. Franz jagt im komplett verwahrlosten Taxi quer durch Bayern. Zwoelf Boxkaempfer jagen Viktor quer ueber den grossen Sylter Deich. @@ -8,3 +12,4 @@ Polyfon zwitschernd assen Maexchens Voegel Rueben, Joghurt und Quark. Victor jagt zwoelf Boxkaempfer quer ueber den grossen Sylter Deich. Falsches Ueben von Xylophonmusik quaelt jeden groesseren Zwerg. Heizoelrueckstossabdaempfung. + diff --git a/tests/snapshots/cli__tests__002+--german.snap b/tests/snapshots/cli__tests__002+--german.snap new file mode 100644 index 00000000..895b7fa8 --- /dev/null +++ b/tests/snapshots/cli__tests__002+--german.snap @@ -0,0 +1,14 @@ +--- +source: tests/cli.rs +expression: "&stdout" +--- + + + +Dübel + +😂 + + + + diff --git a/tests/snapshots/cli__tests__002+--german_--symbols.snap b/tests/snapshots/cli__tests__002+--german_--symbols.snap new file mode 100644 index 00000000..895b7fa8 --- /dev/null +++ b/tests/snapshots/cli__tests__002+--german_--symbols.snap @@ -0,0 +1,14 @@ +--- +source: tests/cli.rs +expression: "&stdout" +--- + + + +Dübel + +😂 + + + + diff --git a/tests/snapshots/cli__tests__002+--symbols.snap b/tests/snapshots/cli__tests__002+--symbols.snap new file mode 100644 index 00000000..b2a6388e --- /dev/null +++ b/tests/snapshots/cli__tests__002+--symbols.snap @@ -0,0 +1,14 @@ +--- +source: tests/cli.rs +expression: "&stdout" +--- + + + +Duebel + +😂 + + + + diff --git a/tests/snapshots/cli__tests__003+--german.snap b/tests/snapshots/cli__tests__003+--german.snap new file mode 100644 index 00000000..a36b7d61 --- /dev/null +++ b/tests/snapshots/cli__tests__003+--german.snap @@ -0,0 +1,5 @@ +--- +source: tests/cli.rs +expression: "&stdout" +--- +Dübel -> 1.5mm; Wand != 3m²... ÜBELTÄTER! 😫 diff --git a/tests/snapshots/cli__tests__003+--german_--symbols.snap b/tests/snapshots/cli__tests__003+--german_--symbols.snap new file mode 100644 index 00000000..589660c1 --- /dev/null +++ b/tests/snapshots/cli__tests__003+--german_--symbols.snap @@ -0,0 +1,5 @@ +--- +source: tests/cli.rs +expression: "&stdout" +--- +Dübel → 1.5mm; Wand ≠ 3m²... ÜBELTÄTER! 😫 diff --git a/tests/snapshots/cli__tests__003+--symbols.snap b/tests/snapshots/cli__tests__003+--symbols.snap new file mode 100644 index 00000000..a4a41f6b --- /dev/null +++ b/tests/snapshots/cli__tests__003+--symbols.snap @@ -0,0 +1,5 @@ +--- +source: tests/cli.rs +expression: "&stdout" +--- +Duebel → 1.5mm; Wand ≠ 3m²... UEBELTAETER! 😫 diff --git a/tests/snapshots/cli__tests__german-001_german.snap b/tests/snapshots/cli__tests__german-001_german.snap deleted file mode 100644 index 3bcf1ebb..00000000 --- a/tests/snapshots/cli__tests__german-001_german.snap +++ /dev/null @@ -1,16 +0,0 @@ ---- -source: core/tests/cli.rs -description: "Zwei flinke Boxer jagen die quirlige Eva und ihren Mops durch Sylt.\nFranz jagt im komplett verwahrlosten Taxi quer durch Bayern.\nZwoelf Boxkaempfer jagen Viktor quer ueber den grossen Sylter Deich.\nVogel Quax zwickt Johnys Pferd Bim.\nSylvia wagt quick den Jux bei Pforzheim.\nPolyfon zwitschernd assen Maexchens Voegel Rueben, Joghurt und Quark.\n\"Fix, Schwyz!\" quaekt Juergen bloed vom Pass.\nVictor jagt zwoelf Boxkaempfer quer ueber den grossen Sylter Deich.\nFalsches Ueben von Xylophonmusik quaelt jeden groesseren Zwerg.\nHeizoelrueckstossabdaempfung.\n" -expression: "&output" ---- -Zwei flinke Boxer jagen die quirlige Eva und ihren Mops durch Sylt. -Franz jagt im komplett verwahrlosten Taxi quer durch Bayern. -Zwölf Boxkämpfer jagen Viktor quer über den großen Sylter Deich. -Vogel Quax zwickt Johnys Pferd Bim. -Sylvia wagt quick den Jux bei Pforzheim. -Polyfon zwitschernd aßen Maexchens Vögel Rüben, Joghurt und Quark. -"Fix, Schwyz!" quäkt Jürgen blöd vom Pass. -Victor jagt zwölf Boxkämpfer quer über den großen Sylter Deich. -Falsches Üben von Xylophonmusik quält jeden größeren Zwerg. -Heizölrückstoßabdämpfung. - diff --git a/tests/snapshots/cli__tests__german-001_symbols.snap b/tests/snapshots/cli__tests__german-001_symbols.snap deleted file mode 100644 index 60c40a2e..00000000 --- a/tests/snapshots/cli__tests__german-001_symbols.snap +++ /dev/null @@ -1,16 +0,0 @@ ---- -source: core/tests/cli.rs -description: "Zwei flinke Boxer jagen die quirlige Eva und ihren Mops durch Sylt.\nFranz jagt im komplett verwahrlosten Taxi quer durch Bayern.\nZwoelf Boxkaempfer jagen Viktor quer ueber den grossen Sylter Deich.\nVogel Quax zwickt Johnys Pferd Bim.\nSylvia wagt quick den Jux bei Pforzheim.\nPolyfon zwitschernd assen Maexchens Voegel Rueben, Joghurt und Quark.\n\"Fix, Schwyz!\" quaekt Juergen bloed vom Pass.\nVictor jagt zwoelf Boxkaempfer quer ueber den grossen Sylter Deich.\nFalsches Ueben von Xylophonmusik quaelt jeden groesseren Zwerg.\nHeizoelrueckstossabdaempfung.\n" -expression: "&output" ---- -Zwei flinke Boxer jagen die quirlige Eva und ihren Mops durch Sylt. -Franz jagt im komplett verwahrlosten Taxi quer durch Bayern. -Zwoelf Boxkaempfer jagen Viktor quer ueber den grossen Sylter Deich. -Vogel Quax zwickt Johnys Pferd Bim. -Sylvia wagt quick den Jux bei Pforzheim. -Polyfon zwitschernd assen Maexchens Voegel Rueben, Joghurt und Quark. -"Fix, Schwyz!" quaekt Juergen bloed vom Pass. -Victor jagt zwoelf Boxkaempfer quer ueber den grossen Sylter Deich. -Falsches Ueben von Xylophonmusik quaelt jeden groesseren Zwerg. -Heizoelrueckstossabdaempfung. - diff --git a/tests/snapshots/cli__tests__german-002_german.snap b/tests/snapshots/cli__tests__german-002_german.snap deleted file mode 100644 index 80851ed1..00000000 --- a/tests/snapshots/cli__tests__german-002_german.snap +++ /dev/null @@ -1,7 +0,0 @@ ---- -source: core/tests/cli.rs -description: "Hallo Welt, Mauerduebel!\n" -expression: "&output" ---- -Hallo Welt, Mauerdübel! - diff --git a/tests/snapshots/cli__tests__german-002_symbols.snap b/tests/snapshots/cli__tests__german-002_symbols.snap deleted file mode 100644 index cb67248a..00000000 --- a/tests/snapshots/cli__tests__german-002_symbols.snap +++ /dev/null @@ -1,7 +0,0 @@ ---- -source: core/tests/cli.rs -description: "Hallo Welt, Mauerduebel!\n" -expression: "&output" ---- -Hallo Welt, Mauerduebel! - diff --git a/tests/snapshots/cli__tests__german-003_german.snap b/tests/snapshots/cli__tests__german-003_german.snap deleted file mode 100644 index 627c1778..00000000 --- a/tests/snapshots/cli__tests__german-003_german.snap +++ /dev/null @@ -1,7 +0,0 @@ ---- -source: core/tests/cli.rs -description: "Schufaeintrag\n" -expression: "&output" ---- -Schufaeintrag - diff --git a/tests/snapshots/cli__tests__german-003_symbols.snap b/tests/snapshots/cli__tests__german-003_symbols.snap deleted file mode 100644 index 627c1778..00000000 --- a/tests/snapshots/cli__tests__german-003_symbols.snap +++ /dev/null @@ -1,7 +0,0 @@ ---- -source: core/tests/cli.rs -description: "Schufaeintrag\n" -expression: "&output" ---- -Schufaeintrag - diff --git a/tests/snapshots/cli__tests__german-004_german.snap b/tests/snapshots/cli__tests__german-004_german.snap deleted file mode 100644 index 71f05d8a..00000000 --- a/tests/snapshots/cli__tests__german-004_german.snap +++ /dev/null @@ -1,13 +0,0 @@ ---- -source: core/tests/cli.rs -description: "\n\nDuebel\n\n😂\n\n\u0010\n" -expression: "&output" ---- - - -Dübel - -😂 - - - diff --git a/tests/snapshots/cli__tests__german-004_symbols.snap b/tests/snapshots/cli__tests__german-004_symbols.snap deleted file mode 100644 index bbc10a24..00000000 --- a/tests/snapshots/cli__tests__german-004_symbols.snap +++ /dev/null @@ -1,13 +0,0 @@ ---- -source: core/tests/cli.rs -description: "\n\nDuebel\n\n😂\n\n\u0010\n" -expression: "&output" ---- - - -Duebel - -😂 - - - diff --git a/tests/snapshots/cli__tests__german-005_german.snap b/tests/snapshots/cli__tests__german-005_german.snap deleted file mode 100644 index 7714f65d..00000000 --- a/tests/snapshots/cli__tests__german-005_german.snap +++ /dev/null @@ -1,7 +0,0 @@ ---- -source: core/tests/cli.rs -description: "Der Massstab\n" -expression: "&output" ---- -Der Maßstab - diff --git a/tests/snapshots/cli__tests__german-005_symbols.snap b/tests/snapshots/cli__tests__german-005_symbols.snap deleted file mode 100644 index 66370370..00000000 --- a/tests/snapshots/cli__tests__german-005_symbols.snap +++ /dev/null @@ -1,7 +0,0 @@ ---- -source: core/tests/cli.rs -description: "Der Massstab\n" -expression: "&output" ---- -Der Massstab - diff --git a/tests/snapshots/cli__tests__german-006_german.snap b/tests/snapshots/cli__tests__german-006_german.snap deleted file mode 100644 index 2cc64316..00000000 --- a/tests/snapshots/cli__tests__german-006_german.snap +++ /dev/null @@ -1,7 +0,0 @@ ---- -source: core/tests/cli.rs -description: "Sehr droege, dieser\nMassstab!" -expression: "&output" ---- -Sehr dröge, dieser -Maßstab! diff --git a/tests/snapshots/cli__tests__german-006_symbols.snap b/tests/snapshots/cli__tests__german-006_symbols.snap deleted file mode 100644 index 979e9105..00000000 --- a/tests/snapshots/cli__tests__german-006_symbols.snap +++ /dev/null @@ -1,7 +0,0 @@ ---- -source: core/tests/cli.rs -description: "Sehr droege, dieser\nMassstab!" -expression: "&output" ---- -Sehr droege, dieser -Massstab! diff --git a/tests/snapshots/cli__tests__german-007_german.snap b/tests/snapshots/cli__tests__german-007_german.snap deleted file mode 100644 index 85a2eae4..00000000 --- a/tests/snapshots/cli__tests__german-007_german.snap +++ /dev/null @@ -1,6 +0,0 @@ ---- -source: core/tests/cli.rs -description: Duebel -expression: "&output" ---- -Dübel diff --git a/tests/snapshots/cli__tests__german-007_symbols.snap b/tests/snapshots/cli__tests__german-007_symbols.snap deleted file mode 100644 index 18a630f7..00000000 --- a/tests/snapshots/cli__tests__german-007_symbols.snap +++ /dev/null @@ -1,6 +0,0 @@ ---- -source: core/tests/cli.rs -description: Duebel -expression: "&output" ---- -Duebel