-
Notifications
You must be signed in to change notification settings - Fork 74
/
replace.rs
221 lines (190 loc) · 8 KB
/
replace.rs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
// Help/usage text for the `replace` command. `run` hands this string, together
// with the raw argv, to `util::get_args` to obtain the parsed `Args`.
// The `[default: N]` markers and the option/argument names are part of the
// command-line contract; only free-form description text should be reworded.
static USAGE: &str = r#"
Replace occurrences of a pattern across a CSV file.
You can of course match groups using parentheses and use those in
the replacement string. But don't forget to escape your $ in bash by using a
backslash or by wrapping the replacement string into single quotes:
$ qsv replace 'hel(lo)' 'hal$1' file.csv
$ qsv replace "hel(lo)" "hal\$1" file.csv
Returns exitcode 0 when replacements are done, returning number of replacements to stderr.
Returns exitcode 1 when no replacements are done, unless the '--not-one' flag is used.
For more examples, see https://github.com/dathere/qsv/blob/master/tests/test_replace.rs.
Usage:
qsv replace [options] <pattern> <replacement> [<input>]
qsv replace --help
replace arguments:
<pattern> Regular expression pattern to match. Uses Rust regex syntax.
See https://docs.rs/regex/latest/regex/index.html#syntax
or https://regex101.com with the Rust flavor for more info.
<input> The CSV file to read. If not given, reads from stdin.
<replacement> Replacement string. Set to '<NULL>' if you want to
replace matches with ''.
replace options:
-i, --ignore-case Case insensitive search. This is equivalent to
prefixing the regex with '(?i)'.
--literal Treat the regex pattern as a literal string. This allows
you to search for exact matches that even contain
regex special characters.
-s, --select <arg> Select the columns to search. See 'qsv select -h'
for the full syntax.
-u, --unicode Enable unicode support. When enabled, character classes
will match all unicode word characters instead of only
ASCII word characters. Decreases performance.
--size-limit <mb> Set the approximate size limit (MB) of the compiled
regular expression. If the compiled expression exceeds this
number, then a compilation error is returned.
[default: 50]
--dfa-size-limit <mb> Set the approximate size of the cache (MB) used by the regular
expression engine's Deterministic Finite Automata.
[default: 10]
--not-one Use exit code 0 instead of 1 for no replacement found.
Common options:
-h, --help Display this message
-o, --output <file> Write output to <file> instead of stdout.
-n, --no-headers When set, the first row will not be interpreted
as headers. (i.e., They are not searched, analyzed,
sliced, etc.)
-d, --delimiter <arg> The field delimiter for reading CSV data.
Must be a single character. (default: ,)
-p, --progressbar Show progress bars. Not valid for stdin.
-Q, --quiet Do not print number of replacements to stderr.
"#;
use std::{borrow::Cow, collections::HashSet};
#[cfg(any(feature = "feature_capable", feature = "lite"))]
use indicatif::{HumanCount, ProgressBar, ProgressDrawTarget};
use regex::bytes::RegexBuilder;
use serde::Deserialize;
use crate::{
config::{Config, Delimiter},
select::SelectColumns,
util, CliError, CliResult,
};
// Parsed command-line arguments for the `replace` command, produced in `run`
// by `util::get_args(USAGE, argv)`. Field names mirror the entries in `USAGE`:
// `arg_*` for positional arguments and `flag_*` for options.
//
// `dead_code` is allowed because `flag_progressbar` is only read in code gated
// behind the `feature_capable` / `lite` cargo features.
#[allow(dead_code)]
#[derive(Deserialize)]
struct Args {
// Input CSV path handed to `Config::new`; `None` when no input was given.
arg_input: Option<String>,
// Pattern to search for; regex-escaped first when `flag_literal` is set.
arg_pattern: String,
// Replacement text; compared case-insensitively against `NULL_VALUE` to
// request an empty replacement.
arg_replacement: String,
// Column selection; only fields at the selected indices are searched.
flag_select: SelectColumns,
// Enables unicode mode on the regex (also forced on by QSV_REGEX_UNICODE).
flag_unicode: bool,
// Output path handed to the writer `Config`; `None` when not given.
flag_output: Option<String>,
// When set, the header row is not written out separately.
flag_no_headers: bool,
// Field delimiter applied to the reader configuration.
flag_delimiter: Option<Delimiter>,
// Builds the regex in case-insensitive mode.
flag_ignore_case: bool,
// Treat `arg_pattern` as a literal string instead of a regex.
flag_literal: bool,
// Compiled-regex size limit, in MB (scaled by 2^20 before use).
flag_size_limit: usize,
// Regex DFA cache size limit, in MB (scaled by 2^20 before use).
flag_dfa_size_limit: usize,
// Suppresses the `NoMatch` error when nothing was replaced.
flag_not_one: bool,
// Requests a progress bar (ignored when reading from stdin).
flag_progressbar: bool,
// Suppresses printing the replacement count to stderr.
flag_quiet: bool,
}
// Sentinel for the replacement argument: when the replacement equals this
// value (compared case-insensitively in `run`), matches are replaced with an
// empty byte string.
const NULL_VALUE: &str = "<null>";
/// Runs the `replace` command: reads a CSV, replaces regex matches in the
/// selected columns of every record, and writes the result.
///
/// `argv` is the raw argument list; it is parsed against `USAGE` into `Args`.
///
/// Unless `--quiet` is given, the number of fields in which a replacement
/// happened is printed to stderr. Note that this counter is incremented once
/// per matching field, not once per individual regex match inside a field.
///
/// # Errors
///
/// * Propagates argument-parsing, regex-compilation, CSV read/write and flush
///   errors via `?`.
/// * Returns `CliError::NoMatch()` when no field matched and `--not-one` was
///   not given.
pub fn run(argv: &[&str]) -> CliResult<()> {
    // The size-limit options are expressed in MB on the command line.
    const BYTES_PER_MB: usize = 1 << 20;

    let args: Args = util::get_args(USAGE, argv)?;

    // The QSV_REGEX_UNICODE environment flag forces unicode mode on;
    // otherwise the --unicode option decides.
    let regex_unicode = util::get_envvar_flag("QSV_REGEX_UNICODE") || args.flag_unicode;

    // Only allocate a new pattern string when it actually has to be escaped.
    let arg_pattern: Cow<'_, str> = if args.flag_literal {
        Cow::Owned(regex::escape(&args.arg_pattern))
    } else {
        Cow::Borrowed(args.arg_pattern.as_str())
    };

    // saturating_mul: an absurdly large user-supplied limit must clamp to
    // usize::MAX instead of overflowing (panic in debug builds, silent
    // wrap-around to an arbitrary limit in release builds).
    let pattern = RegexBuilder::new(&arg_pattern)
        .case_insensitive(args.flag_ignore_case)
        .unicode(regex_unicode)
        .size_limit(args.flag_size_limit.saturating_mul(BYTES_PER_MB))
        .dfa_size_limit(args.flag_dfa_size_limit.saturating_mul(BYTES_PER_MB))
        .build()?;

    // NULL_VALUE is pure ASCII, so an ASCII case-insensitive comparison is
    // sufficient and avoids allocating a lowercased copy of the argument.
    let replacement: &[u8] = if args.arg_replacement.eq_ignore_ascii_case(NULL_VALUE) {
        b""
    } else {
        args.arg_replacement.as_bytes()
    };

    let rconfig = Config::new(args.arg_input.as_ref())
        .delimiter(args.flag_delimiter)
        .no_headers(args.flag_no_headers)
        .select(args.flag_select);

    let mut rdr = rconfig.reader()?;
    let mut wtr = Config::new(args.flag_output.as_ref()).writer()?;

    let headers = rdr.byte_headers()?.clone();
    let sel = rconfig.selection(&headers)?;

    // Hash set of the selected column indices, so the per-field membership
    // test in the hot loop below is O(1) instead of a linear scan.
    let sel_indices: HashSet<usize> = sel.iter().copied().collect();

    if !rconfig.no_headers {
        wtr.write_record(&headers)?;
    }

    // prep progress bar
    #[cfg(any(feature = "feature_capable", feature = "lite"))]
    let show_progress =
        (args.flag_progressbar || util::get_envvar_flag("QSV_PROGRESSBAR")) && !rconfig.is_stdin();
    #[cfg(any(feature = "feature_capable", feature = "lite"))]
    let progress = ProgressBar::with_draw_target(None, ProgressDrawTarget::stderr_with_hz(5));
    #[cfg(any(feature = "feature_capable", feature = "lite"))]
    if show_progress {
        util::prep_progress(&progress, util::count_rows(&rconfig)?);
    } else {
        progress.set_draw_target(ProgressDrawTarget::hidden());
    }

    let mut record = csv::ByteRecord::new();
    // Number of fields in which the pattern matched (and was replaced).
    let mut total_match_ctr: u64 = 0;
    #[cfg(any(feature = "feature_capable", feature = "lite"))]
    let mut rows_with_matches_ctr: u64 = 0;
    #[cfg(any(feature = "feature_capable", feature = "lite"))]
    let mut match_found;

    while rdr.read_byte_record(&mut record)? {
        #[cfg(any(feature = "feature_capable", feature = "lite"))]
        if show_progress {
            progress.inc(1);
        }

        #[cfg(any(feature = "feature_capable", feature = "lite"))]
        {
            match_found = false;
        }

        // Rebuild the record field by field: selected fields that match get
        // their matches replaced, everything else is passed through borrowed.
        record = record
            .into_iter()
            .enumerate()
            .map(|(i, v)| {
                if sel_indices.contains(&i) && pattern.is_match(v) {
                    total_match_ctr += 1;
                    #[cfg(any(feature = "feature_capable", feature = "lite"))]
                    {
                        match_found = true;
                    }
                    pattern.replace_all(v, replacement)
                } else {
                    Cow::Borrowed(v)
                }
            })
            .collect();

        #[cfg(any(feature = "feature_capable", feature = "lite"))]
        if match_found {
            rows_with_matches_ctr += 1;
        }

        wtr.write_byte_record(&record)?;
    }

    wtr.flush()?;

    #[cfg(any(feature = "feature_capable", feature = "lite"))]
    if show_progress {
        // NOTE(review): `length()` is presumably set by `util::prep_progress`
        // above; confirm, since the unwrap would panic otherwise.
        progress.set_message(format!(
            r#" - {} total matches replaced with "{}" in {} out of {} records."#,
            HumanCount(total_match_ctr),
            args.arg_replacement,
            HumanCount(rows_with_matches_ctr),
            HumanCount(progress.length().unwrap()),
        ));
        util::finish_progress(&progress);
    }

    if !args.flag_quiet {
        eprintln!("{total_match_ctr}");
    }

    // No replacement at all is reported as an error unless --not-one was given.
    if total_match_ctr == 0 && !args.flag_not_one {
        return Err(CliError::NoMatch());
    }

    Ok(())
}