diff --git a/Cargo.lock b/Cargo.lock index ef4d6b03..a4d037ff 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -153,6 +153,7 @@ dependencies = [ "itertools 0.11.0", "log", "nom", + "nom-unicode", "num_cpus", "once_cell", "paste", @@ -164,6 +165,7 @@ dependencies = [ "rstest", "serde", "serial_test", + "shell-words", "test-log", "tree-sitter", "tree-sitter-python", @@ -1038,6 +1040,15 @@ dependencies = [ "minimal-lexical", ] +[[package]] +name = "nom-unicode" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a5c03a4d71ab9740c752c165329e9df13a808093b146dfbef6170ac260771ffe" +dependencies = [ + "nom", +] + [[package]] name = "num-traits" version = "0.2.16" diff --git a/Cargo.toml b/Cargo.toml index ff580419..632c270a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -34,7 +34,7 @@ fancy-regex = "0.11.0" [features] default = ["all"] -all = ["german", "symbols", "deletion", "squeeze", "upper", "lower", "replace"] +all = ["german", "deletion", "squeeze", "upper", "lower", "replace", "symbols"] german = [] symbols = [] deletion = [] @@ -60,6 +60,8 @@ enum-iterator = "1.4.1" serial_test = "2.0.0" comrak = "0.18.0" nom = "7.1.3" +nom-unicode = "0.3.0" +shell-words = "1.1.0" [profile.dev.package.insta] # https://insta.rs/docs/quickstart/#optional-faster-runs diff --git a/README.md b/README.md index 5906d3d2..06da15c8 100644 --- a/README.md +++ b/README.md @@ -4,9 +4,20 @@ ## Usage -### As a drop-in replacement for `tr` +1. Conventional + 1. Actions +2. Unicode tricks +3. Scoping -There is some overlap with plain `tr`, so simple replacements work as expected: +### Conventional + +There is considerable overlap with plain `tr`: the tool is designed to have close to +drop-in compatibility for the most common use cases. As such, the tool can be used +'conventionally'. + +#### Replacements + +For example, simple replacements work as expected: ```console $ echo 'Hello, World!' | betterletters 'H' 'J' @@ -14,27 +25,30 @@ Jello, World! 
``` However, there is no direct concept of character classes. Instead, the first argument is -a regular expression pattern, so *its* classes can be used to similar effect: +a regular expression pattern, so *its* [classes](https://docs.rs/regex/1.9.5/regex/index.html#character-classes) can be used to similar effect: ```console $ echo 'Hello, World!' | betterletters '[a-z]' '_' H____, W____! ``` -The replacement occurs greedily across the entire match by default: +The replacement occurs greedily across the entire match by default (note the [UTS character +class](https://docs.rs/regex/1.9.5/regex/index.html#ascii-character-classes), +reminiscent of [`tr`'s +`[:alnum:]`](https://github.com/coreutils/coreutils/blob/769ace51e8a1129c44ee4e7e209c3b2df2111524/src/tr.c#L322C25-L322C25)): ```console -$ echo 'ghp_oHn0As3cr3T' | betterletters 'ghp_[a-zA-Z0-9]+' '*' # A GitHub token -* +$ echo 'ghp_oHn0As3cr3T!!' | betterletters 'ghp_[[:alnum:]]+' '*' # A GitHub token +*!! ``` -However, in the presence of capture groups, the individual characters comprising a -capture group match are treated *individually* for processing, allowing a replacement to -be repeated: +However, +in the presence of capture groups, the *individual characters comprising a capture group +match* are treated *individually* for processing, allowing a replacement to be repeated: ```console -$ echo 'ghp_oHn0As3cr3T' | betterletters '(ghp_[a-zA-Z0-9]+)' '*' -*************** +$ echo 'Hide ghp_th15 and ghp_m0r3' | betterletters '(ghp_[a-zA-Z0-9]+)' '*' +Hide ******** and ******** ``` Advanced regex features are @@ -50,3 +64,75 @@ Take care in using these safely, as advanced patterns come without certain [safe performance guarantees](https://docs.rs/regex/latest/regex/#untrusted-input). If they aren't used, [performance is not impacted](https://docs.rs/fancy-regex/0.11.0/fancy_regex/index.html#). + +The replacement is not limited to a single character. 
It can be any string, for example +to fix [this quote](http://regex.info/blog/2006-09-15/247): + +```console +$ echo '"Using regex, I now have no issues."' | betterletters 'no issues' '2 problems' +"Using regex, I now have 2 problems." +``` + +#### Other actions + +Seeing how the replacement is merely a static string, its usefulness is limited. This is +where [`tr`'s secret sauce](https://maizure.org/projects/decoded-gnu-coreutils/tr.html) +comes into play using its character classes, which are valid in the second position as +well, neatly translating from members of the first to the second. Here, those classes +are instead regexes, and only valid in first position. A regular expression being a +state machine, it is impossible to match onto a 'list of characters'. That concept is +out the window, and its flexibility lost. + +## Common `tr` use cases + +In theory, `tr` is quite flexible. In practice, it is commonly used mainly across a +couple specific tasks. Next to its two positional arguments ('arrays of characters'), +one finds four flags: + +1. `-c`, `-C`, `--complement`: complement the first array +2. `-d`, `--delete`: delete characters in the first array +3. `-s`, `--squeeze-repeats`: squeeze repeats of characters in the first array +4. `-t`, `--truncate-set1`: truncate the first array to the length of the second + +In this tool, these are implemented as follows: + +1. is not available directly as an option; instead, negation of regular expression classes can be used (e.g., `[^a-z]`), to much more potent, flexible and well-known effect +2. available (via regex) +3. available (via regex) +4. not available: it's inapplicable to regular expressions, not commonly used and, if used, often misused + +To show how uses of `tr` found in the wild can translate to this tool, consider the +following section. + +### Use cases and equivalences in this tool + +The following sections are the approximate categories much of `tr` usage falls into. 
+They were found using [GitHub's code search](https://cs.github.com). The corresponding +queries are given. Results are from the first page of results at the time. + +As the stdin isn't known (usually dynamic), some representative samples are used and the +tool is exercised on those. + +#### Identifier Safety + +Making inputs safe for use as identifiers, for example as variable names. + +[Query](https://github.com/search?type=code&q=%22tr+-c%22) + +1. [`tr -C '[:alnum:]_\n' '_'`](https://github.com/grafana/grafana/blob/9328fda8ea8384e8cfcf1c78d1fe95d92bbad786/docs/make-docs#L234) + + Translates to: + + ```console + $ echo 'some-variable? 🤔' | betterletters '[^[[:alnum:]]_\n]' '_' + some_variable___ + ``` + +2. [`tr -c '[:alnum:]' _`](https://github.com/freebsd/freebsd-src/blob/9dc0c983b0931f359c2ff10d47ad835ef74e929a/libexec/rc/rc.d/jail#L413) + + Translates to: + + ```console + $ echo 'some variablê' | betterletters '[^[[:alnum:]]]' '_' + some__variabl_ + ``` diff --git a/tests/cli.rs b/tests/cli.rs index 035f64a8..4f064a0d 100644 --- a/tests/cli.rs +++ b/tests/cli.rs @@ -2,6 +2,9 @@ //! inputs/flags/options. #[cfg(test)] +/// Only run these tests if the required features are *all* enabled. This will require +/// adjusting and isn't ideal (not fine-grained). 
+#[cfg(all(feature = "german", feature = "symbols", feature = "deletion"))] mod tests { use assert_cmd::Command; use rstest::rstest; diff --git a/tests/readme.rs b/tests/readme.rs index 9582b2d9..c69dadda 100644 --- a/tests/readme.rs +++ b/tests/readme.rs @@ -6,13 +6,15 @@ mod tests { parse_document, Arena, ComrakOptions, }; - use itertools::Itertools; use nom::{ - bytes::complete::{tag, take_till, take_until, take_while1}, - character::complete::{char, line_ending, multispace0}, + branch::alt, + bytes::complete::{escaped, tag, take_till, take_until, take_while1}, + character::complete::{alpha1 as ascii_alpha1, char, line_ending, multispace0, none_of}, multi::many1, + sequence::delimited, Finish, IResult, }; + use shell_words::split; #[derive(Debug, Clone, PartialEq, Eq)] struct CommandUnderTest { @@ -21,26 +23,29 @@ mod tests { stdout: String, } + /// https://stackoverflow.com/a/58907488/11477374 + fn parse_quoted(input: &str) -> IResult<&str, &str> { + let esc = escaped(none_of("\\\'"), '\\', tag("'")); + let esc_or_empty = alt((esc, tag(""))); + let res = delimited(tag("'"), esc_or_empty, tag("'"))(input)?; + + Ok(res) + } + fn parse_command_output_pair(input: &str) -> IResult<&str, CommandUnderTest> { - let (input, _) = char('$')(input)?; + let (input, _terminal_prompt) = char('$')(input)?; - let (input, _) = multispace0(input)?; - let (input, _) = tag("echo")(input)?; - let (input, _) = multispace0(input)?; + let (input, _echo_cmd) = delimited(multispace0, tag("echo"), multispace0)(input)?; - let quote = '\''; - let (input, _) = char(quote)(input)?; - // Doesn't handle escaping - let (input, stdin) = take_until("\'")(input)?; - let (input, _) = char(quote)(input)?; + let (input, stdin) = parse_quoted(input)?; - let (input, _) = multispace0(input)?; - let (input, _) = char('|')(input)?; - let (input, _) = multispace0(input)?; + let (input, _unix_pipe) = delimited(multispace0, char('|'), multispace0)(input)?; - let (input, _program) = take_till(|c| c == ' 
')(input)?; + let (input, _program_name) = ascii_alpha1(input)?; + + let (input, _) = multispace0(input)?; - let (input, args) = take_while1(|c| c != '#' && c != '\n')(input)?; + let (input, raw_args) = take_while1(|c| c != '#' && c != '\n')(input)?; let (input, _) = take_until("\n")(input)?; let (input, _) = line_ending(input)?; @@ -52,12 +57,7 @@ mod tests { input, CommandUnderTest { stdin: stdin.trim().to_string(), - args: args - .split_whitespace() - .map(String::from) - .filter(|s| !s.is_empty()) - .map(|s| s.replace(quote, "")) - .collect_vec(), + args: split(raw_args).expect("Should be able to split args"), stdout: stdout.trim().to_string(), }, )) @@ -83,13 +83,11 @@ mod tests { if let NodeValue::CodeBlock(NodeCodeBlock { info, literal, .. }) = value { if info == console { - let x = parse_code_blocks(&literal).finish(); - println!("{:#?}", x); - let parsed = parse_code_blocks(&literal) + let (_, commands) = parse_code_blocks(&literal) .finish() .expect("Anything in `console` should be parseable as a command"); - println!("{:#?}", parsed); - cuts.extend(parsed.1); + println!("Found command to run: {:#?}", commands); + cuts.extend(commands); } } });