diff --git a/Cargo.lock b/Cargo.lock index ff1fd15..9445c5e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -8,6 +8,14 @@ dependencies = [ "memchr 2.2.1 (registry+https://github.com/rust-lang/crates.io-index)", ] +[[package]] +name = "ansi_term" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "winapi 0.3.8 (registry+https://github.com/rust-lang/crates.io-index)", +] + [[package]] name = "atty" version = "0.2.13" @@ -37,6 +45,11 @@ dependencies = [ "libc 0.2.65 (registry+https://github.com/rust-lang/crates.io-index)", ] +[[package]] +name = "bitflags" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" + [[package]] name = "bstr" version = "0.2.8" @@ -63,6 +76,20 @@ name = "cfg-if" version = "0.1.10" source = "registry+https://github.com/rust-lang/crates.io-index" +[[package]] +name = "clap" +version = "2.33.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "ansi_term 0.11.0 (registry+https://github.com/rust-lang/crates.io-index)", + "atty 0.2.13 (registry+https://github.com/rust-lang/crates.io-index)", + "bitflags 1.2.1 (registry+https://github.com/rust-lang/crates.io-index)", + "strsim 0.8.0 (registry+https://github.com/rust-lang/crates.io-index)", + "textwrap 0.11.0 (registry+https://github.com/rust-lang/crates.io-index)", + "unicode-width 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)", + "vec_map 0.8.1 (registry+https://github.com/rust-lang/crates.io-index)", +] + [[package]] name = "cli_test_dir" version = "0.1.7" @@ -88,17 +115,6 @@ dependencies = [ "memchr 2.2.1 (registry+https://github.com/rust-lang/crates.io-index)", ] -[[package]] -name = "docopt" -version = "1.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -dependencies = [ - "lazy_static 1.4.0 (registry+https://github.com/rust-lang/crates.io-index)", - "regex 1.3.1 (registry+https://github.com/rust-lang/crates.io-index)", - "serde 1.0.101 
(registry+https://github.com/rust-lang/crates.io-index)", - "strsim 0.9.2 (registry+https://github.com/rust-lang/crates.io-index)", -] - [[package]] name = "env_logger" version = "0.7.1" @@ -120,6 +136,14 @@ dependencies = [ "version_check 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)", ] +[[package]] +name = "heck" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "unicode-segmentation 1.3.0 (registry+https://github.com/rust-lang/crates.io-index)", +] + [[package]] name = "humansize" version = "1.1.0" @@ -164,6 +188,16 @@ dependencies = [ "libc 0.2.65 (registry+https://github.com/rust-lang/crates.io-index)", ] +[[package]] +name = "proc-macro-error" +version = "0.2.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "proc-macro2 1.0.6 (registry+https://github.com/rust-lang/crates.io-index)", + "quote 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)", + "syn 1.0.5 (registry+https://github.com/rust-lang/crates.io-index)", +] + [[package]] name = "proc-macro2" version = "1.0.6" @@ -226,11 +260,11 @@ source = "registry+https://github.com/rust-lang/crates.io-index" [[package]] name = "scrubcsv" -version = "0.1.5" +version = "0.1.6" dependencies = [ + "clap 2.33.0 (registry+https://github.com/rust-lang/crates.io-index)", "cli_test_dir 0.1.7 (registry+https://github.com/rust-lang/crates.io-index)", "csv 1.1.1 (registry+https://github.com/rust-lang/crates.io-index)", - "docopt 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)", "env_logger 0.7.1 (registry+https://github.com/rust-lang/crates.io-index)", "error-chain 0.12.1 (registry+https://github.com/rust-lang/crates.io-index)", "humansize 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)", @@ -239,7 +273,7 @@ dependencies = [ "log 0.4.8 (registry+https://github.com/rust-lang/crates.io-index)", "regex 1.3.1 (registry+https://github.com/rust-lang/crates.io-index)", "serde 1.0.101 
(registry+https://github.com/rust-lang/crates.io-index)", - "serde_derive 1.0.101 (registry+https://github.com/rust-lang/crates.io-index)", + "structopt 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)", "time 0.1.42 (registry+https://github.com/rust-lang/crates.io-index)", ] @@ -247,25 +281,33 @@ dependencies = [ name = "serde" version = "1.0.101" source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "strsim" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "structopt" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" dependencies = [ - "serde_derive 1.0.101 (registry+https://github.com/rust-lang/crates.io-index)", + "clap 2.33.0 (registry+https://github.com/rust-lang/crates.io-index)", + "structopt-derive 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)", ] [[package]] -name = "serde_derive" -version = "1.0.101" +name = "structopt-derive" +version = "0.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" dependencies = [ + "heck 0.3.1 (registry+https://github.com/rust-lang/crates.io-index)", + "proc-macro-error 0.2.6 (registry+https://github.com/rust-lang/crates.io-index)", "proc-macro2 1.0.6 (registry+https://github.com/rust-lang/crates.io-index)", "quote 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)", "syn 1.0.5 (registry+https://github.com/rust-lang/crates.io-index)", ] -[[package]] -name = "strsim" -version = "0.9.2" -source = "registry+https://github.com/rust-lang/crates.io-index" - [[package]] name = "syn" version = "1.0.5" @@ -284,6 +326,14 @@ dependencies = [ "wincolor 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)", ] +[[package]] +name = "textwrap" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "unicode-width 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)", +] + [[package]] name = 
"thread_local" version = "0.3.6" @@ -302,11 +352,26 @@ dependencies = [ "winapi 0.3.8 (registry+https://github.com/rust-lang/crates.io-index)", ] +[[package]] +name = "unicode-segmentation" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "unicode-width" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" + [[package]] name = "unicode-xid" version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" +[[package]] +name = "vec_map" +version = "0.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" + [[package]] name = "version_check" version = "0.1.5" @@ -350,19 +415,22 @@ dependencies = [ [metadata] "checksum aho-corasick 0.7.6 (registry+https://github.com/rust-lang/crates.io-index)" = "58fb5e95d83b38284460a5fda7d6470aa0b8844d283a0b614b8535e880800d2d" +"checksum ansi_term 0.11.0 (registry+https://github.com/rust-lang/crates.io-index)" = "ee49baf6cb617b853aa8d93bf420db2383fab46d314482ca2803b40d5fde979b" "checksum atty 0.2.13 (registry+https://github.com/rust-lang/crates.io-index)" = "1803c647a3ec87095e7ae7acfca019e98de5ec9a7d01343f611cf3152ed71a90" "checksum backtrace 0.3.40 (registry+https://github.com/rust-lang/crates.io-index)" = "924c76597f0d9ca25d762c25a4d369d51267536465dc5064bdf0eb073ed477ea" "checksum backtrace-sys 0.1.32 (registry+https://github.com/rust-lang/crates.io-index)" = "5d6575f128516de27e3ce99689419835fce9643a9b215a14d2b5b685be018491" +"checksum bitflags 1.2.1 (registry+https://github.com/rust-lang/crates.io-index)" = "cf1de2fe8c75bc145a2f577add951f8134889b4795d47466a54a5c846d691693" "checksum bstr 0.2.8 (registry+https://github.com/rust-lang/crates.io-index)" = "8d6c2c5b58ab920a4f5aeaaca34b4488074e8cc7596af94e6f8c6ff247c60245" "checksum byteorder 1.3.2 (registry+https://github.com/rust-lang/crates.io-index)" = "a7c3dd8985a7111efc5c80b44e23ecdd8c007de8ade3b96595387e812b957cf5" "checksum cc 1.0.46 
(registry+https://github.com/rust-lang/crates.io-index)" = "0213d356d3c4ea2c18c40b037c3be23cd639825c18f25ee670ac7813beeef99c" "checksum cfg-if 0.1.10 (registry+https://github.com/rust-lang/crates.io-index)" = "4785bdd1c96b2a846b2bd7cc02e86b6b3dbf14e7e53446c4f54c92a361040822" +"checksum clap 2.33.0 (registry+https://github.com/rust-lang/crates.io-index)" = "5067f5bb2d80ef5d68b4c87db81601f0b75bca627bc2ef76b141d7b846a3c6d9" "checksum cli_test_dir 0.1.7 (registry+https://github.com/rust-lang/crates.io-index)" = "2bc63338a59538d4f4b767dfb6082e4d26736aadb5100894b76039a04d6ad519" "checksum csv 1.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "37519ccdfd73a75821cac9319d4fce15a81b9fcf75f951df5b9988aa3a0af87d" "checksum csv-core 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)" = "9b5cadb6b25c77aeff80ba701712494213f4a8418fcda2ee11b6560c3ad0bf4c" -"checksum docopt 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "7f525a586d310c87df72ebcd98009e57f1cc030c8c268305287a476beb653969" "checksum env_logger 0.7.1 (registry+https://github.com/rust-lang/crates.io-index)" = "44533bbbb3bb3c1fa17d9f2e4e38bbbaf8396ba82193c4cb1b6445d711445d36" "checksum error-chain 0.12.1 (registry+https://github.com/rust-lang/crates.io-index)" = "3ab49e9dcb602294bc42f9a7dfc9bc6e936fca4418ea300dbfb84fe16de0b7d9" +"checksum heck 0.3.1 (registry+https://github.com/rust-lang/crates.io-index)" = "20564e78d53d2bb135c343b3f47714a56af2061f1c928fdb541dc7b9fdd94205" "checksum humansize 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "b6cab2627acfc432780848602f3f558f7e9dd427352224b0d9324025796d2a5e" "checksum humantime 1.3.0 (registry+https://github.com/rust-lang/crates.io-index)" = "df004cfca50ef23c36850aaaa59ad52cc70d0e90243c3c7737a4dd32dc7a3c4f" "checksum itoa 0.4.4 (registry+https://github.com/rust-lang/crates.io-index)" = "501266b7edd0174f8530248f87f99c88fbe60ca4ef3dd486835b8d8d53136f7f" @@ -370,6 +438,7 @@ dependencies = [ "checksum libc 0.2.65 
(registry+https://github.com/rust-lang/crates.io-index)" = "1a31a0627fdf1f6a39ec0dd577e101440b7db22672c0901fe00a9a6fbb5c24e8" "checksum log 0.4.8 (registry+https://github.com/rust-lang/crates.io-index)" = "14b6052be84e6b71ab17edffc2eeabf5c2c3ae1fdb464aae35ac50c67a44e1f7" "checksum memchr 2.2.1 (registry+https://github.com/rust-lang/crates.io-index)" = "88579771288728879b57485cc7d6b07d648c9f0141eb955f8ab7f9d45394468e" +"checksum proc-macro-error 0.2.6 (registry+https://github.com/rust-lang/crates.io-index)" = "aeccfe4d5d8ea175d5f0e4a2ad0637e0f4121d63bd99d356fb1f39ab2e7c6097" "checksum proc-macro2 1.0.6 (registry+https://github.com/rust-lang/crates.io-index)" = "9c9e470a8dc4aeae2dee2f335e8f533e2d4b347e1434e5671afc49b054592f27" "checksum quick-error 1.2.2 (registry+https://github.com/rust-lang/crates.io-index)" = "9274b940887ce9addde99c4eee6b5c44cc494b182b97e73dc8ffdcb3397fd3f0" "checksum quote 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)" = "053a8c8bcc71fcce321828dc897a98ab9760bef03a4fc36693c231e5b3216cfe" @@ -380,13 +449,18 @@ dependencies = [ "checksum rustc-demangle 0.1.16 (registry+https://github.com/rust-lang/crates.io-index)" = "4c691c0e608126e00913e33f0ccf3727d5fc84573623b8d65b2df340b5201783" "checksum ryu 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)" = "bfa8506c1de11c9c4e4c38863ccbe02a305c8188e85a05a784c9e11e1c3910c8" "checksum serde 1.0.101 (registry+https://github.com/rust-lang/crates.io-index)" = "9796c9b7ba2ffe7a9ce53c2287dfc48080f4b2b362fcc245a259b3a7201119dd" -"checksum serde_derive 1.0.101 (registry+https://github.com/rust-lang/crates.io-index)" = "4b133a43a1ecd55d4086bd5b4dc6c1751c68b1bfbeba7a5040442022c7e7c02e" -"checksum strsim 0.9.2 (registry+https://github.com/rust-lang/crates.io-index)" = "032c03039aae92b350aad2e3779c352e104d919cb192ba2fabbd7b831ce4f0f6" +"checksum strsim 0.8.0 (registry+https://github.com/rust-lang/crates.io-index)" = "8ea5119cdb4c55b55d432abb513a0429384878c15dde60cc77b1c99de1a95a6a" +"checksum 
structopt 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)" = "6d4f66a4c0ddf7aee4677995697366de0749b0139057342eccbb609b12d0affc" +"checksum structopt-derive 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)" = "8fe0c13e476b4e21ff7f5c4ace3818b6d7bdc16897c31c73862471bc1663acae" "checksum syn 1.0.5 (registry+https://github.com/rust-lang/crates.io-index)" = "66850e97125af79138385e9b88339cbcd037e3f28ceab8c5ad98e64f0f1f80bf" "checksum termcolor 1.0.5 (registry+https://github.com/rust-lang/crates.io-index)" = "96d6098003bde162e4277c70665bd87c326f5a0c3f3fbfb285787fa482d54e6e" +"checksum textwrap 0.11.0 (registry+https://github.com/rust-lang/crates.io-index)" = "d326610f408c7a4eb6f51c37c330e496b08506c9457c9d34287ecc38809fb060" "checksum thread_local 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)" = "c6b53e329000edc2b34dbe8545fd20e55a333362d0a321909685a19bd28c3f1b" "checksum time 0.1.42 (registry+https://github.com/rust-lang/crates.io-index)" = "db8dcfca086c1143c9270ac42a2bbd8a7ee477b78ac8e45b19abfb0cbede4b6f" +"checksum unicode-segmentation 1.3.0 (registry+https://github.com/rust-lang/crates.io-index)" = "1967f4cdfc355b37fd76d2a954fb2ed3871034eb4f26d60537d88795cfc332a9" +"checksum unicode-width 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)" = "7007dbd421b92cc6e28410fe7362e2e0a2503394908f417b68ec8d1c364c4e20" "checksum unicode-xid 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)" = "826e7639553986605ec5979c7dd957c7895e93eabed50ab2ffa7f6128a75097c" +"checksum vec_map 0.8.1 (registry+https://github.com/rust-lang/crates.io-index)" = "05c78687fb1a80548ae3250346c3db86a80a7cdd77bda190189f2d0a0987c81a" "checksum version_check 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)" = "914b1a6776c4c929a602fafd8bc742e06365d4bcbe48c30f9cca5824f70dc9dd" "checksum winapi 0.3.8 (registry+https://github.com/rust-lang/crates.io-index)" = "8093091eeb260906a183e6ae1abdba2ef5ef2257a21801128899c3fc699229c6" "checksum 
winapi-i686-pc-windows-gnu 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" diff --git a/Cargo.toml b/Cargo.toml index 6d68e31..538ffcd 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "scrubcsv" -version = "0.1.5" +version = "0.1.6" authors = ["Eric Kidd "] edition = "2018" @@ -16,8 +16,8 @@ homepage = "https://github.com/faradayio/scrubcsv" opt-level = 3 [dependencies] +clap = "2.33.0" csv = "1" -docopt = "1" env_logger = "0.7" error-chain = "0.12" humansize = "1.0.1" @@ -26,7 +26,7 @@ libc = "0.2.18" log = "0.4" regex = "1" serde = "1.0" -serde_derive = "1.0" +structopt = "0.3.3" time = "0.1.35" [dev-dependencies] diff --git a/src/main.rs b/src/main.rs index c13353b..d9d8d10 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,22 +1,11 @@ // Declare a list of external crates. -extern crate csv; -extern crate docopt; -extern crate env_logger; #[macro_use] extern crate error_chain; -extern crate humansize; -extern crate libc; -#[macro_use] -extern crate lazy_static; -#[macro_use] -extern crate log; -extern crate regex; -#[macro_use] -extern crate serde_derive; -extern crate time; // Import from other crates. use humansize::{file_size_opts, FileSize}; +use lazy_static::lazy_static; +use log::debug; use regex::bytes::Regex; use std::{ borrow::Cow, @@ -24,6 +13,7 @@ use std::{ io::{self, prelude::*}, process, }; +use structopt::StructOpt; // Modules defined in separate files. mod errors; @@ -33,31 +23,14 @@ mod util; use crate::errors::*; use crate::util::parse_char_specifier; -/// Provide a CLI help message, which doctopt will also use to parse our -/// command-line arguments. -const USAGE: &str = r#" -Usage: scrubcsv [options] [] - scrubcsv --help - scrubcsv --version - -Read a CSV file, normalize the "good" lines, and print them to standard +/// Our command-line arguments. 
+#[derive(Debug, StructOpt)] +#[structopt( + name = "scrubcsv", + about = "Clean and normalize a CSV file.", + after_help = "Read a CSV file, normalize the \"good\" lines, and print them to standard output. Discard any lines with the wrong number of columns. -Options: - --help Show this help message. - --version Print the version of this program. - -q, --quiet Do not print performance information. - -d, --delimiter CHAR Character used to separate fields in a row. - (must be a single ASCII byte). [default: ,] - --quote CHAR Character used to quote entries. May be set to - "none" to ignore all quoting. [default: "] - -n, --null NULLREGEX Convert values matching NULLREGEX to an empty - string. - --replace-newlines Replace LF and CRLF sequences in values with - spaces. This should improve compatibility with - systems like BigQuery that don't expect newlines - inside escaped strings. - Regular expressions use Rust syntax, as described here: https://doc.rust-lang.org/regex/regex/index.html#syntax @@ -67,19 +40,45 @@ attempt to transcode. Exit code: 0 on success 1 on error - 2 if more than 10% of rows were bad -"#; + 2 if more than 10% of rows were bad" +)] +struct Opt { + /// Input file (uses stdin if omitted). + input: Option, -/// Our command-line arguments. -#[derive(Debug, Deserialize)] -struct Args { - arg_input: Option, - flag_delimiter: String, - flag_null: Option, - flag_replace_newlines: bool, - flag_quiet: bool, - flag_quote: String, - flag_version: bool, + /// Character used to separate fields in a row (must be a single ASCII + /// byte, or "tab"). + #[structopt( + value_name = "CHAR", + short = "d", + long = "delimiter", + default_value = "," + )] + delimiter: String, + + /// Convert values matching NULLREGEX to an empty string. + #[structopt(value_name = "NULLREGEX", short = "n", long = "null")] + null: Option, + + /// Replace LF and CRLF sequences in values with spaces.
This should improve + /// compatibility with systems like BigQuery that don't expect newlines + /// inside escaped strings. + #[structopt(long = "replace-newlines")] + replace_newlines: bool, + + /// Drop any rows where the specified column is empty or NULL. Can be passed + /// more than once. + #[structopt(value_name = "COL", long = "drop-row-if-null")] + drop_row_if_null: Vec, + + /// Do not print performance information. + #[structopt(short = "q", long = "quiet")] + quiet: bool, + + /// Character used to quote entries. May be set to "none" to ignore all + /// quoting. + #[structopt(value_name = "CHAR", long = "quote", default_value = "\"")] + quote: String, } lazy_static! { @@ -97,23 +96,15 @@ fn run() -> Result<()> { env_logger::init(); // Parse our command-line arguments using `docopt`. - let args: Args = docopt::Docopt::new(USAGE) - .and_then(|d| d.deserialize()) - .unwrap_or_else(|e| e.exit()); - debug!("Arguments: {:#?}", args); - - // Print our version if asked to do so. - if args.flag_version { - println!("scrubcsv {}", env!("CARGO_PKG_VERSION")); - process::exit(0); - } + let opt = Opt::from_args(); + debug!("Options: {:#?}", opt); // Figure out our field delimiter and quote character. - let delimiter = match parse_char_specifier(&args.flag_delimiter)? { + let delimiter = match parse_char_specifier(&opt.delimiter)? { Some(d) => d, _ => return Err("field delimiter is required".into()), }; - let quote = parse_char_specifier(&args.flag_quote)?; + let quote = parse_char_specifier(&opt.quote)?; // Remember the time we started. let start_time = time::precise_time_s(); @@ -150,7 +141,7 @@ fn run() -> Result<()> { // because we wrap the `BufReader` around the box, we only do that // dispatch once per buffer flush, not on every tiny write. let stdin = io::stdin(); - let unbuffered_input: Box = if let Some(ref path) = args.arg_input { + let unbuffered_input: Box = if let Some(ref path) = opt.input { Box::new(fs::File::open(path)?)
} else { Box::new(stdin.lock()) @@ -158,7 +149,7 @@ fn run() -> Result<()> { let input = io::BufReader::new(unbuffered_input); // Build a set containing all our `--null` values. - let null_re = if let Some(null_re_str) = args.flag_null.as_ref() { + let null_re = if let Some(null_re_str) = opt.null.as_ref() { let s = format!("^{}$", null_re_str); let re = Regex::new(&s) .chain_err(|| -> Error { "can't compile regular expression".into() })?; @@ -169,8 +160,8 @@ fn run() -> Result<()> { // Create our CSV reader. let mut rdr_builder = csv::ReaderBuilder::new(); - // Treat headers (if any) as any other record. - rdr_builder.has_headers(false); + // We need headers so that we can honor --drop-row-if-null. + rdr_builder.has_headers(true); // Allow records with the wrong number of columns. rdr_builder.flexible(true); // Configure our delimiter. @@ -182,20 +173,34 @@ fn run() -> Result<()> { rdr_builder.quoting(false); } let mut rdr = rdr_builder.from_reader(input); + let hdr = rdr.byte_headers()?.to_owned(); + + // Just in case --drop-row-if-null was passed, precompute which columns are + // required. + let required_cols = hdr + .iter() + .map(|name| -> bool { + opt.drop_row_if_null + .iter() + .any(|required_name| required_name.as_bytes() == name) + }) + .collect::>(); // Create our CSV writer. Note that we _don't_ allow variable numbers // of columns, non-standard delimiters, or other nonsense: We want our // output to be highly normalized. let mut wtr = csv::WriterBuilder::new().flexible(true).from_writer(output); + wtr.write_byte_record(&hdr)?; // Keep track of total rows and malformed rows seen. - let mut rows: u64 = 0; + let mut rows: u64 = 1; let mut bad_rows: u64 = 0; // Can we use the fast path and copy the data through unchanged? Or do we // need to clean up emebedded newlines in our data? (These break BigQuery, // for example.)
- let use_fast_path = null_re.is_none() && !args.flag_replace_newlines; + let use_fast_path = + null_re.is_none() && !opt.replace_newlines && opt.drop_row_if_null.is_empty(); // Iterate over all the rows, checking to make sure they look // reasonable. @@ -204,58 +209,64 @@ fn run() -> Result<()> { // about 225 MB/s. But it turns out we can't do that, because we need to // have a copy of all the row's fields before deciding whether or not // to write it out. - let mut columns_expected = None; - for record in rdr.byte_records() { + 'next_row: for record in rdr.byte_records() { let record = record?; - // Keep track of how many columns we expected. - let is_good = match columns_expected { - // This is the first row. - None => { - columns_expected = Some(record.len()); - true - } - // We know how many columns we expect, and it matches. - Some(expected) if record.len() == expected => true, - // The current row is weird. - Some(_) => false, - }; - - // If this is a good row, output it. - if is_good { - if use_fast_path { - // We don't need to do anything fancy, so just pass it through. - // I'm not sure how much this actually buys us in current Rust - // versions, but it seemed like a good idea at the time. - wtr.write_record(record.into_iter())?; - } else { - // We need to apply one or more cleanups, so run the slow path. - wtr.write_record(record.into_iter().map(|mut val| { - // Convert values matching `--null` regex to empty strings. - if let Some(ref null_re) = null_re { - if null_re.is_match(&val) { - val = &[] - } + // Keep track of how many rows we've seen. + rows += 1; + + // Check if we have the right number of columns in this row. + if record.len() != hdr.len() { + bad_rows += 1; + continue 'next_row; + } + + // Decide how to handle this row. + if use_fast_path { + // We don't need to do anything fancy, so just pass it through. + // I'm not sure how much this actually buys us in current Rust + // versions, but it seemed like a good idea at the time. 
+ wtr.write_record(record.into_iter())?; + } else { + // We need to apply one or more cleanups, so run the slow path. + let cleaned = record.into_iter().map(|mut val: &[u8]| -> Cow<[u8]> { + // Convert values matching `--null` regex to empty strings. + if let Some(ref null_re) = null_re { + if null_re.is_match(&val) { + val = &[] } + } - // Fix newlines. - if args.flag_replace_newlines - && (val.contains(&b'\n') || val.contains(&b'\r')) - { - NEWLINE_RE.replace_all(val, &b" "[..]) - } else { - Cow::Borrowed(val) + // Fix newlines. + if opt.replace_newlines + && (val.contains(&b'\n') || val.contains(&b'\r')) + { + NEWLINE_RE.replace_all(val, &b" "[..]) + } else { + Cow::Borrowed(val) + } + }); + if opt.drop_row_if_null.is_empty() { + // Still somewhat fast! + wtr.write_record(cleaned)?; + } else { + // We need to rebuild the record, check for null columns, + // and only output the record if everything's OK. + let row = cleaned.collect::>>(); + for (value, &is_required_col) in row.iter().zip(required_cols.iter()) { + // If the column is NULL but shouldn't be, bail on this row. + if is_required_col && value.is_empty() { + bad_rows += 1; + continue 'next_row; } - }))?; + } + wtr.write_record(row)?; } - } else { - bad_rows += 1; } - rows += 1; } // Print out some information about our run. 
- if !args.flag_quiet { + if !opt.quiet { let ellapsed = time::precise_time_s() - start_time; let bytes_per_second = (rdr.position().byte() as f64 / ellapsed) as i64; writeln!( diff --git a/tests/tests.rs b/tests/tests.rs index a6e7799..6e31d4a 100644 --- a/tests/tests.rs +++ b/tests/tests.rs @@ -8,7 +8,8 @@ use cli_test_dir::*; fn help_flag() { let testdir = TestDir::new("scrubcsv", "flag_help"); let output = testdir.cmd().arg("--help").expect_success(); - assert!(output.stdout_str().contains("scrubcsv --help")); + assert!(output.stdout_str().contains("scrubcsv")); + assert!(output.stdout_str().contains("--help")); } #[test] @@ -135,9 +136,9 @@ fn null_normalization() { let output = testdir .cmd() .args(&["--null", "(?i)null|NIL"]) - .output_with_stdin("null,NIL,nil,,not null\n") + .output_with_stdin("a,b,c,d,e\nnull,NIL,nil,,not null\n") .expect_success(); - assert_eq!(output.stdout_str(), ",,,,not null\n") + assert_eq!(output.stdout_str(), "a,b,c,d,e\n,,,,not null\n") } #[test] @@ -146,7 +147,34 @@ fn replace_newlines() { let output = testdir .cmd() .arg("--replace-newlines") - .output_with_stdin("\"line\r\nbreak\r1\",\"line\nbreak\n2\"\n") + .output_with_stdin("a,b\n\"line\r\nbreak\r1\",\"line\nbreak\n2\"\n") .expect_success(); - assert_eq!(output.stdout_str(), "line break 1,line break 2\n"); + assert_eq!(output.stdout_str(), "a,b\nline break 1,line break 2\n"); +} + +#[test] +fn drop_row_if_null() { + let testdir = TestDir::new("scrubcsv", "drop_row_if_null"); + let output = testdir + .cmd() + .arg("--drop-row-if-null=c1") + .arg("--drop-row-if-null=c2") + .args(&["--null", "NULL"]) + .output_with_stdin( + r#"c1,c2,c3 +1,, +,2, +NULL,3, +a,b,c +"#, + ) + .expect("error running scrubcsv"); + eprintln!("{}", output.stderr_str()); + //assert_eq!(output.status.code(), Some(2)); + assert_eq!( + output.stdout_str(), + r#"c1,c2,c3 +a,b,c +"# + ); }