diff --git a/Cargo.lock b/Cargo.lock index 165c4d08..b11e08d7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -136,6 +136,7 @@ dependencies = [ "clap", "criterion", "decompound", + "enum-iterator", "env_logger", "fst", "glob", @@ -145,6 +146,7 @@ dependencies = [ "num_cpus", "once_cell", "paste", + "proptest", "rand", "rand_regex", "rayon", @@ -155,6 +157,21 @@ dependencies = [ "unicode_titlecase", ] +[[package]] +name = "bit-set" +version = "0.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0700ddab506f33b20a03b13996eccd309a48e5ff77d0d95926aa0210fb4e95f1" +dependencies = [ + "bit-vec", +] + +[[package]] +name = "bit-vec" +version = "0.6.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "349f9b6a179ed607305526ca489b34ad0a41aed5f7980fa90eb03160b69598fb" + [[package]] name = "bitflags" version = "1.3.2" @@ -184,6 +201,12 @@ version = "3.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a3e2c3daef883ecc1b5d58c15adae93470a91d425f3532ba1695849656af3fc1" +[[package]] +name = "byteorder" +version = "1.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "14c189c53d098945499cdfa7ecc63567cf3886b3332b312a5b4585d8d3a6a610" + [[package]] name = "cached" version = "0.44.0" @@ -476,6 +499,26 @@ version = "0.3.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a357d28ed41a50f9c765dbfe56cbc04a64e53e5fc58ba79fbc34c10ef3df831f" +[[package]] +name = "enum-iterator" +version = "1.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7add3873b5dd076766ee79c8e406ad1a472c385476b9e38849f8eec24f1be689" +dependencies = [ + "enum-iterator-derive", +] + +[[package]] +name = "enum-iterator-derive" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eecf8589574ce9b895052fa12d69af7a233f99e6107f5cb8dd1044f2a17bfdcb" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.29", +] + [[package]] name = "env_logger" version = "0.10.0" @@ -510,6 +553,12 @@ dependencies = [ "libc", ] +[[package]] +name = "fastrand" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6999dc1837253364c2ebb0704ba97994bd874e8f195d665c50b7548f6ea92764" + [[package]] name = "fnv" version = "1.0.7" @@ -755,6 +804,12 @@ version = "0.2.147" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b4668fb0ea861c1df094127ac5f1da3409a82116a4ba74fca2e58ef927159bb3" +[[package]] +name = "libm" +version = "0.2.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f7012b1bbb0719e1097c47611d3898568c546d597c2e74d66f6087edd5233ff4" + [[package]] name = "linked-hash-map" version = "0.5.6" @@ -814,6 +869,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f30b0abd723be7e2ffca1272140fac1a2f084c77ec3e123c192b66af1ee9e6c2" dependencies = [ "autocfg", + "libm", ] [[package]] @@ -959,6 +1015,32 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "proptest" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4e35c06b98bf36aba164cc17cb25f7e232f5c4aeea73baa14b8a9f0d92dbfa65" +dependencies = [ + "bit-set", + "bitflags 1.3.2", + "byteorder", + "lazy_static", + "num-traits", + "rand", + "rand_chacha", + "rand_xorshift", + "regex-syntax 0.6.29", + "rusty-fork", + "tempfile", + "unarray", +] + +[[package]] +name = "quick-error" +version = "1.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a1d01941d82fa2ab50be1e79e6714289dd7cde78eba4c074bc5a4374f650dfe0" + [[package]] name = "quote" version = "1.0.33" @@ -1005,7 +1087,16 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8276e2c4e37f1907c587794d9b8b3334e1d44f36e05f8c13d61c50c19c264ae2" dependencies = [ "rand", - "regex-syntax", + "regex-syntax 0.7.5", +] + +[[package]] +name = "rand_xorshift" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d25bf25ec5ae4a3f1b92f929810509a2f53d7dca2f50b794ff57e3face536c8f" +dependencies = [ + "rand_core", ] [[package]] @@ -1048,7 +1139,7 @@ dependencies = [ "aho-corasick", "memchr", "regex-automata", - "regex-syntax", + "regex-syntax 0.7.5", ] [[package]] @@ -1059,9 +1150,15 @@ checksum = "c2f401f4955220693b56f8ec66ee9c78abffd8d1c4f23dc41a23839eb88f0795" dependencies = [ "aho-corasick", "memchr", - "regex-syntax", + "regex-syntax 0.7.5", ] +[[package]] +name = "regex-syntax" +version = "0.6.29" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f162c6dd7b008981e4d40210aca20b4bd0f9b60ca9271061b07f78537722f2e1" + [[package]] name = "regex-syntax" version = "0.7.5" @@ -1131,6 +1228,18 @@ dependencies = [ "windows-sys 0.48.0", ] +[[package]] +name = "rusty-fork" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cb3dcc6e454c328bb824492db107ab7c0ae8fcffe4ad210136ef014458c1bc4f" +dependencies = [ + "fnv", + "quick-error", + "tempfile", + "wait-timeout", +] + [[package]] name = "ryu" version = "1.0.15" @@ -1238,6 +1347,19 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "tempfile" +version = "3.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cb94d2f3cc536af71caac6b6fcebf65860b347e7ce0cc9ebe8f70d3e521054ef" +dependencies = [ + "cfg-if", + "fastrand", + "redox_syscall", + "rustix", + "windows-sys 0.48.0", +] + [[package]] name = "termcolor" version = "1.2.0" @@ -1317,6 +1439,12 @@ dependencies = [ "syn 2.0.29", ] +[[package]] +name = "unarray" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eaea85b334db583fe3274d12b4cd1880032beab409c0d774be044d4480ab9a94" + [[package]] name = "unicode-ident" version = "1.0.11" diff --git a/Cargo.toml b/Cargo.toml index af96a5af..ea495108 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -51,6 +51,8 @@ num_cpus = "1.16.0" rand = "0.8.5" rand_regex = "0.16.0" test-log = "0.2.12" +proptest = "1.2.0" +enum-iterator = "1.4.1" [profile.dev.package.insta] # https://insta.rs/docs/quickstart/#optional-faster-runs diff --git a/src/main.rs b/src/main.rs index f56b1226..c8674a28 100644 --- a/src/main.rs +++ b/src/main.rs @@ -7,10 +7,10 @@ use betterletters::stages::GermanStage; use betterletters::stages::LowerStage; #[cfg(feature = "squeeze")] use betterletters::stages::SqueezeStage; -#[cfg(feature = "symbols")] -use betterletters::stages::SymbolsStage; #[cfg(feature = "upper")] use betterletters::stages::UpperStage; +#[cfg(feature = "symbols")] +use betterletters::stages::{SymbolsInversionStage, SymbolsStage}; use log::{debug, info}; use std::io::{self, BufReader, Error}; @@ -39,8 +39,13 @@ fn main() -> Result<(), Error> { } if args.symbols { - stages.push(Box::::default()); - debug!("Loaded stage: Symbols"); + if args.invert { + stages.push(Box::::default()); + debug!("Loaded stage: SymbolsInversion"); + } else { + stages.push(Box::::default()); + debug!("Loaded stage: Symbols"); + } } if args.delete { @@ -100,7 +105,7 @@ mod cli { #[arg(short, long, env = "GERMAN")] pub german: bool, /// Perform substitutions on symbols, such as '!=' to '≠', '->' to '→' - #[arg(short = 'S', long, env = "SYMBOLS")] + #[arg(short = 'S', long, env = "SYMBOLS", group = "invertible")] pub symbols: bool, /// Delete what was matched /// @@ -126,6 +131,24 @@ mod cli { /// Useful for names, which are otherwise not modifiable as they do not occur in /// dictionaries. Called 'naive' as this does not perform legal checks. pub german_naive: bool, + /// Undo the effects of passed stages, where applicable + /// + /// Requires a 1:1 mapping (bijection) between replacements and original, which + /// is currently available for: + /// + /// - symbols: '≠' <-> '!=' etc. + /// + /// Other stages: + /// + /// - german: inverting e.g. 'Ä' is ambiguous (can be 'Ae' or 'AE') + /// + /// - upper, lower, deletion, squeeze: inversion is impossible as information is + /// lost + /// + /// These may still be passed, but will be ignored for inversion and applied + /// normally + #[arg(short, long, env = "INVERT", requires = "invertible")] + pub invert: bool, } impl Args { diff --git a/src/stages/german/driver.rs b/src/stages/german/driver.rs index 77f6ad81..ca442099 100644 --- a/src/stages/german/driver.rs +++ b/src/stages/german/driver.rs @@ -735,4 +735,31 @@ mod tests { let result = stage.substitute(input); assert_eq!(result, expected); } + + #[rstest] + // Single letter. Notice the mapping is irreversible. + #[case("ue", "ü")] + #[case("uE", "ü")] + #[case("Ue", "Ü")] + #[case("UE", "Ü")] + // + // Beginning of word + #[case("uekol", "ükol")] + #[case("uEkol", "ükol")] + #[case("Uekol", "Ükol")] + #[case("UEkol", "Ükol")] + // + // Middle of word + #[case("guessa", "güßa")] + #[case("gUessa", "gÜßa")] + #[case("guEssa", "güßa")] + #[case("gUEssa", "gÜßa")] + #[case("Guessa", "Güßa")] + #[case("GUESSA", "GÜẞA")] + fn test_casing_when_being_naive(#[case] input: &str, #[case] expected: &str) { + let mut stage = GermanStage::default(); + stage.naive(); + let result = stage.substitute(input); + assert_eq!(result, expected); + } } diff --git a/src/stages/mod.rs b/src/stages/mod.rs index 9b8b2581..c2d0fe96 100644 --- a/src/stages/mod.rs +++ b/src/stages/mod.rs @@ -17,6 +17,7 @@ pub use deletion::DeletionStage; pub use german::GermanStage; pub use lower::LowerStage; pub use squeeze::SqueezeStage; +pub use symbols::inversion::SymbolsInversionStage; pub use symbols::SymbolsStage; pub use upper::UpperStage; diff --git a/src/stages/symbols/inversion.rs b/src/stages/symbols/inversion.rs new file mode 100644 index 00000000..4bb3204f --- /dev/null +++ b/src/stages/symbols/inversion.rs @@ -0,0 +1,42 @@ +use super::Symbol; +use crate::{scoped::Scoped, Stage}; + +/// Inverts all symbols inserted by [`SymbolsStage`]. +/// +/// This is guaranteed to be the inverse of [`SymbolsStage`], as the replacements and +/// originals form a [bijection](https://en.wikipedia.org/wiki/Bijection). +#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)] +#[allow(clippy::module_name_repetitions)] +pub struct SymbolsInversionStage {} + +impl Scoped for SymbolsInversionStage {} + +impl Stage for SymbolsInversionStage { + fn substitute(&self, input: &str) -> String { + input + .chars() + .map(|c| match Symbol::try_from(c) { + Ok(s) => match s { + // This is *horrible* as in the current implementation, we cannot + // access these symbols. They are implicitly encoded in the + // `substitute` method of `SymbolsStage`. As such, this inversion + // can get out of sync with the original. There is a property test + // in place to catch this. + Symbol::EmDash => "---", + Symbol::EnDash => "--", + Symbol::ShortRightArrow => "->", + Symbol::ShortLeftArrow => "<-", + Symbol::LongRightArrow => "-->", + Symbol::LongLeftArrow => "<--", + Symbol::LeftRightArrow => "<->", + Symbol::RightDoubleArrow => "=>", + Symbol::NotEqual => "!=", + Symbol::LessThanOrEqual => "<=", + Symbol::GreaterThanOrEqual => ">=", + } + .into(), + Err(_) => c.to_string(), + }) + .collect() + } +} diff --git a/src/stages/symbols/mod.rs b/src/stages/symbols/mod.rs index 86ed3fb7..a63c2f7f 100644 --- a/src/stages/symbols/mod.rs +++ b/src/stages/symbols/mod.rs @@ -1,8 +1,12 @@ #[cfg(doc)] use super::GermanStage; use crate::{scoped::Scoped, Stage}; +#[cfg(test)] +use enum_iterator::{all, Sequence}; use std::collections::VecDeque; +pub mod inversion; + /// Replace ASCII symbols (`--`, `->`, `!=`, ...) with proper Unicode equivalents (`–`, /// `→`, `≠`, ...). /// @@ -10,7 +14,7 @@ use std::collections::VecDeque; /// replacing left-to-right as greedily as possible. #[derive(Debug, Clone, Copy, PartialEq, Eq, Default)] #[allow(clippy::module_name_repetitions)] -pub struct SymbolsStage; +pub struct SymbolsStage {} macro_rules! fetch_next { ($it:expr, $stack:expr, $buf:expr $(, $label:tt)?) => { @@ -136,6 +140,8 @@ impl Stage for SymbolsStage { } } +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +#[cfg_attr(test, derive(Sequence))] enum Symbol { // Typographic symbols EmDash, @@ -173,6 +179,30 @@ impl From for char { } } +impl TryFrom for Symbol { + type Error = (); + + fn try_from(c: char) -> Result { + match c { + // Typographic symbols + '–' => Ok(Symbol::EnDash), + '—' => Ok(Symbol::EmDash), + // Arrows + '→' => Ok(Symbol::ShortRightArrow), + '←' => Ok(Symbol::ShortLeftArrow), + '⟶' => Ok(Symbol::LongRightArrow), + '⟵' => Ok(Symbol::LongLeftArrow), + '↔' => Ok(Symbol::LeftRightArrow), + '⇒' => Ok(Symbol::RightDoubleArrow), + // Math + '≠' => Ok(Symbol::NotEqual), + '≤' => Ok(Symbol::LessThanOrEqual), + '≥' => Ok(Symbol::GreaterThanOrEqual), + _ => Err(()), + } + } +} + /// We might greedily overfetch and then end up with a [`char`] on the `stack` we do not /// know how to handle. However, *subsequent, other states might*. Hence, be a good /// citizen and put it back where it came from. @@ -216,7 +246,7 @@ mod tests { #[case(">=", "≥")] #[case("!=", "≠")] fn test_symbol_substitution_base_cases(#[case] input: &str, #[case] expected: &str) { - let stage = SymbolsStage; + let stage = SymbolsStage::default(); let result = stage.substitute(input); assert_eq!(result, expected); @@ -248,7 +278,7 @@ mod tests { #[case] input: &str, #[case] expected: &str, ) { - let stage = SymbolsStage; + let stage = SymbolsStage::default(); let result = stage.substitute(input); assert_eq!(result, expected); @@ -272,7 +302,7 @@ mod tests { #[case("A>=B", "A≥B")] #[case("A!=B", "A≠B")] fn test_symbol_substitution_neighboring_letters(#[case] input: &str, #[case] expected: &str) { - let stage = SymbolsStage; + let stage = SymbolsStage::default(); let result = stage.substitute(input); assert_eq!(result, expected); @@ -299,7 +329,7 @@ mod tests { #[case] input: &str, #[case] expected: &str, ) { - let stage = SymbolsStage; + let stage = SymbolsStage::default(); let result = stage.substitute(input); assert_eq!(result, expected); @@ -320,7 +350,7 @@ mod tests { // #[case("<--X-->", "⟵X⟶")] fn test_symbol_substitution_disrupting_symbols(#[case] input: &str, #[case] expected: &str) { - let stage = SymbolsStage; + let stage = SymbolsStage::default(); let result = stage.substitute(input); assert_eq!(result, expected); @@ -333,7 +363,7 @@ mod tests { #[case("A <= B => C", "A ≤ B ⇒ C")] #[case("->In->Out->", "→In→Out→")] fn test_symbol_substitution_sentences(#[case] input: &str, #[case] expected: &str) { - let stage = SymbolsStage; + let stage = SymbolsStage::default(); let result = stage.substitute(input); assert_eq!(result, expected); @@ -369,7 +399,7 @@ mod tests { #[case("!=!=", "≠≠")] #[case("!=!=!=", "≠≠≠")] fn test_symbol_substitution_ambiguous_sequences(#[case] input: &str, #[case] expected: &str) { - let stage = SymbolsStage; + let stage = SymbolsStage::default(); let result = stage.substitute(input); assert_eq!(result, expected); @@ -388,7 +418,7 @@ mod tests { #[case("≤", "≤")] #[case("≥", "≥")] fn test_symbol_substitution_existing_symbol(#[case] input: &str, #[case] expected: &str) { - let stage = SymbolsStage; + let stage = SymbolsStage::default(); let result = stage.substitute(input); assert_eq!(result, expected); @@ -411,9 +441,21 @@ mod tests { #[case("https:/->", "https:/→")] #[case("https://->", "https://->")] // Pivot point fn test_symbol_substitution_uri(#[case] input: &str, #[case] expected: &str) { - let stage = SymbolsStage; + let stage = SymbolsStage::default(); let result = stage.substitute(input); assert_eq!(result, expected); } + + #[test] + fn test_symbol_to_char_and_back_is_bijective() { + let symbols: Vec<_> = all::().collect(); + + for symbol in symbols { + let c = char::from(symbol); + let back = Symbol::try_from(c).expect("Should be able to convert back to symbol"); + + assert_eq!(symbol, back); + } + } } diff --git a/tests/mod.rs b/tests/mod.rs new file mode 100644 index 00000000..b90e91c9 --- /dev/null +++ b/tests/mod.rs @@ -0,0 +1,2 @@ +#[cfg(test)] +mod properties; diff --git a/tests/properties/lower.rs b/tests/properties/lower.rs new file mode 100644 index 00000000..8e22b3e2 --- /dev/null +++ b/tests/properties/lower.rs @@ -0,0 +1,16 @@ +use betterletters::stages::LowerStage; +use proptest::prelude::*; + +use crate::properties::{apply_with_default_scope, DEFAULT_NUMBER_OF_TEST_CASES}; + +proptest! { + #![proptest_config(ProptestConfig::with_cases(DEFAULT_NUMBER_OF_TEST_CASES))] + #[test] + fn test_lowercasing_lowercase_has_no_effect( + // https://docs.rs/regex/latest/regex/#matching-one-character + // https://www.unicode.org/reports/tr44/tr44-24.html#General_Category_Values + input in r"\p{Lowercase_Letter}*" + ) { + assert_eq!(apply_with_default_scope(&LowerStage::default(), &input), input); + } +} diff --git a/tests/properties/mod.rs b/tests/properties/mod.rs new file mode 100644 index 00000000..6266aa53 --- /dev/null +++ b/tests/properties/mod.rs @@ -0,0 +1,17 @@ +use betterletters::{scoped::Scope, Stage}; + +mod lower; +mod squeeze; +mod symbols; +mod upper; + +// https://proptest-rs.github.io/proptest/proptest/tutorial/config.html +const DEFAULT_NUMBER_OF_TEST_CASES: u32 = 1_024; + +fn apply(stage: &impl Stage, input: &str, scope: Scope) -> String { + stage.apply(input, &scope) +} + +fn apply_with_default_scope(stage: &impl Stage, input: &str) -> String { + apply(stage, input, Scope::default()) +} diff --git a/tests/properties/squeeze.rs b/tests/properties/squeeze.rs new file mode 100644 index 00000000..7cd54382 --- /dev/null +++ b/tests/properties/squeeze.rs @@ -0,0 +1,18 @@ +use betterletters::{scoped::Scope, stages::SqueezeStage}; +use proptest::prelude::*; +use regex::Regex; + +use crate::properties::{apply, DEFAULT_NUMBER_OF_TEST_CASES}; + +proptest! { + #![proptest_config(ProptestConfig::with_cases(DEFAULT_NUMBER_OF_TEST_CASES))] + #[test] + fn test_squeezing_anything_at_all_makes_the_input_shorter( + // https://docs.rs/regex/latest/regex/#matching-one-character + // https://www.unicode.org/reports/tr44/tr44-24.html#General_Category_Values + input in r"\p{Any}*AA\p{Any}*" + ) { + let scope = Scope::from(Regex::new("A").unwrap()); + assert!(apply(&SqueezeStage::default(), &input, scope).len() < input.len()); + } +} diff --git a/tests/properties/symbols.rs b/tests/properties/symbols.rs new file mode 100644 index 00000000..efa6af14 --- /dev/null +++ b/tests/properties/symbols.rs @@ -0,0 +1,19 @@ +use betterletters::stages::{SymbolsInversionStage, SymbolsStage}; +use proptest::prelude::*; + +use crate::properties::{apply_with_default_scope, DEFAULT_NUMBER_OF_TEST_CASES}; + +proptest! { + #![proptest_config(ProptestConfig::with_cases(DEFAULT_NUMBER_OF_TEST_CASES * 2))] + #[test] + fn test_inverting_symbols_is_idempotent( + // https://docs.rs/regex/latest/regex/#matching-one-character + // https://www.unicode.org/reports/tr44/tr44-24.html#General_Category_Values + input in r"\p{Any}*(-|<|>|=|!){2,3}\p{Any}*" + ) { + let applied = apply_with_default_scope(&SymbolsStage::default(), &input); + let inverted = apply_with_default_scope(&SymbolsInversionStage::default(), &applied); + + assert_eq!(input, inverted); + } +} diff --git a/tests/properties/upper.rs b/tests/properties/upper.rs new file mode 100644 index 00000000..3fdb116c --- /dev/null +++ b/tests/properties/upper.rs @@ -0,0 +1,15 @@ +use betterletters::stages::UpperStage; +use proptest::prelude::*; + +use crate::properties::{apply_with_default_scope, DEFAULT_NUMBER_OF_TEST_CASES}; +proptest! { + #![proptest_config(ProptestConfig::with_cases(DEFAULT_NUMBER_OF_TEST_CASES))] + #[test] + fn test_uppercasing_uppercase_has_no_effect( + // https://docs.rs/regex/latest/regex/#matching-one-character + // https://www.unicode.org/reports/tr44/tr44-24.html#General_Category_Values + input in r"\p{Uppercase_Letter}*" + ) { + assert_eq!(apply_with_default_scope(&UpperStage::default(), &input), input); + } +}