From 677e9ad88f19e172e30a5f39b7d98fd594ff1a05 Mon Sep 17 00:00:00 2001 From: Eric Kidd Date: Thu, 24 Oct 2019 16:56:32 -0400 Subject: [PATCH] v0.1.6: Add `--drop-row-if-null=COL` flag (fixes #9) We now support a `--drop-row-if-null=COL` flag which can be used to automatically discard certain lines when particular columns are NULL. This argument can be passed more than once to check more than one column, and it is applied _after_ we normalize NULLs as specified by `--null`. In order to implement this, we had to switch from `docopt` (a deprecated argument parser) to `structopt` (the recommended replacement). --- Cargo.lock | 124 +++++++++++++++++++++------ Cargo.toml | 6 +- src/main.rs | 227 ++++++++++++++++++++++++++----------------------- tests/tests.rs | 38 +++++++-- 4 files changed, 254 insertions(+), 141 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index ff1fd15..9445c5e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -8,6 +8,14 @@ dependencies = [ "memchr 2.2.1 (registry+https://github.com/rust-lang/crates.io-index)", ] +[[package]] +name = "ansi_term" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "winapi 0.3.8 (registry+https://github.com/rust-lang/crates.io-index)", +] + [[package]] name = "atty" version = "0.2.13" @@ -37,6 +45,11 @@ dependencies = [ "libc 0.2.65 (registry+https://github.com/rust-lang/crates.io-index)", ] +[[package]] +name = "bitflags" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" + [[package]] name = "bstr" version = "0.2.8" @@ -63,6 +76,20 @@ name = "cfg-if" version = "0.1.10" source = "registry+https://github.com/rust-lang/crates.io-index" +[[package]] +name = "clap" +version = "2.33.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "ansi_term 0.11.0 (registry+https://github.com/rust-lang/crates.io-index)", + "atty 0.2.13 (registry+https://github.com/rust-lang/crates.io-index)", + "bitflags 1.2.1 
(registry+https://github.com/rust-lang/crates.io-index)", + "strsim 0.8.0 (registry+https://github.com/rust-lang/crates.io-index)", + "textwrap 0.11.0 (registry+https://github.com/rust-lang/crates.io-index)", + "unicode-width 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)", + "vec_map 0.8.1 (registry+https://github.com/rust-lang/crates.io-index)", +] + [[package]] name = "cli_test_dir" version = "0.1.7" @@ -88,17 +115,6 @@ dependencies = [ "memchr 2.2.1 (registry+https://github.com/rust-lang/crates.io-index)", ] -[[package]] -name = "docopt" -version = "1.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -dependencies = [ - "lazy_static 1.4.0 (registry+https://github.com/rust-lang/crates.io-index)", - "regex 1.3.1 (registry+https://github.com/rust-lang/crates.io-index)", - "serde 1.0.101 (registry+https://github.com/rust-lang/crates.io-index)", - "strsim 0.9.2 (registry+https://github.com/rust-lang/crates.io-index)", -] - [[package]] name = "env_logger" version = "0.7.1" @@ -120,6 +136,14 @@ dependencies = [ "version_check 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)", ] +[[package]] +name = "heck" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "unicode-segmentation 1.3.0 (registry+https://github.com/rust-lang/crates.io-index)", +] + [[package]] name = "humansize" version = "1.1.0" @@ -164,6 +188,16 @@ dependencies = [ "libc 0.2.65 (registry+https://github.com/rust-lang/crates.io-index)", ] +[[package]] +name = "proc-macro-error" +version = "0.2.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "proc-macro2 1.0.6 (registry+https://github.com/rust-lang/crates.io-index)", + "quote 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)", + "syn 1.0.5 (registry+https://github.com/rust-lang/crates.io-index)", +] + [[package]] name = "proc-macro2" version = "1.0.6" @@ -226,11 +260,11 @@ source = 
"registry+https://github.com/rust-lang/crates.io-index" [[package]] name = "scrubcsv" -version = "0.1.5" +version = "0.1.6" dependencies = [ + "clap 2.33.0 (registry+https://github.com/rust-lang/crates.io-index)", "cli_test_dir 0.1.7 (registry+https://github.com/rust-lang/crates.io-index)", "csv 1.1.1 (registry+https://github.com/rust-lang/crates.io-index)", - "docopt 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)", "env_logger 0.7.1 (registry+https://github.com/rust-lang/crates.io-index)", "error-chain 0.12.1 (registry+https://github.com/rust-lang/crates.io-index)", "humansize 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)", @@ -239,7 +273,7 @@ dependencies = [ "log 0.4.8 (registry+https://github.com/rust-lang/crates.io-index)", "regex 1.3.1 (registry+https://github.com/rust-lang/crates.io-index)", "serde 1.0.101 (registry+https://github.com/rust-lang/crates.io-index)", - "serde_derive 1.0.101 (registry+https://github.com/rust-lang/crates.io-index)", + "structopt 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)", "time 0.1.42 (registry+https://github.com/rust-lang/crates.io-index)", ] @@ -247,25 +281,33 @@ dependencies = [ name = "serde" version = "1.0.101" source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "strsim" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "structopt" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" dependencies = [ - "serde_derive 1.0.101 (registry+https://github.com/rust-lang/crates.io-index)", + "clap 2.33.0 (registry+https://github.com/rust-lang/crates.io-index)", + "structopt-derive 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)", ] [[package]] -name = "serde_derive" -version = "1.0.101" +name = "structopt-derive" +version = "0.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" dependencies = [ + "heck 0.3.1 
(registry+https://github.com/rust-lang/crates.io-index)", + "proc-macro-error 0.2.6 (registry+https://github.com/rust-lang/crates.io-index)", "proc-macro2 1.0.6 (registry+https://github.com/rust-lang/crates.io-index)", "quote 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)", "syn 1.0.5 (registry+https://github.com/rust-lang/crates.io-index)", ] -[[package]] -name = "strsim" -version = "0.9.2" -source = "registry+https://github.com/rust-lang/crates.io-index" - [[package]] name = "syn" version = "1.0.5" @@ -284,6 +326,14 @@ dependencies = [ "wincolor 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)", ] +[[package]] +name = "textwrap" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "unicode-width 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)", +] + [[package]] name = "thread_local" version = "0.3.6" @@ -302,11 +352,26 @@ dependencies = [ "winapi 0.3.8 (registry+https://github.com/rust-lang/crates.io-index)", ] +[[package]] +name = "unicode-segmentation" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "unicode-width" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" + [[package]] name = "unicode-xid" version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" +[[package]] +name = "vec_map" +version = "0.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" + [[package]] name = "version_check" version = "0.1.5" @@ -350,19 +415,22 @@ dependencies = [ [metadata] "checksum aho-corasick 0.7.6 (registry+https://github.com/rust-lang/crates.io-index)" = "58fb5e95d83b38284460a5fda7d6470aa0b8844d283a0b614b8535e880800d2d" +"checksum ansi_term 0.11.0 (registry+https://github.com/rust-lang/crates.io-index)" = "ee49baf6cb617b853aa8d93bf420db2383fab46d314482ca2803b40d5fde979b" "checksum atty 0.2.13 
(registry+https://github.com/rust-lang/crates.io-index)" = "1803c647a3ec87095e7ae7acfca019e98de5ec9a7d01343f611cf3152ed71a90" "checksum backtrace 0.3.40 (registry+https://github.com/rust-lang/crates.io-index)" = "924c76597f0d9ca25d762c25a4d369d51267536465dc5064bdf0eb073ed477ea" "checksum backtrace-sys 0.1.32 (registry+https://github.com/rust-lang/crates.io-index)" = "5d6575f128516de27e3ce99689419835fce9643a9b215a14d2b5b685be018491" +"checksum bitflags 1.2.1 (registry+https://github.com/rust-lang/crates.io-index)" = "cf1de2fe8c75bc145a2f577add951f8134889b4795d47466a54a5c846d691693" "checksum bstr 0.2.8 (registry+https://github.com/rust-lang/crates.io-index)" = "8d6c2c5b58ab920a4f5aeaaca34b4488074e8cc7596af94e6f8c6ff247c60245" "checksum byteorder 1.3.2 (registry+https://github.com/rust-lang/crates.io-index)" = "a7c3dd8985a7111efc5c80b44e23ecdd8c007de8ade3b96595387e812b957cf5" "checksum cc 1.0.46 (registry+https://github.com/rust-lang/crates.io-index)" = "0213d356d3c4ea2c18c40b037c3be23cd639825c18f25ee670ac7813beeef99c" "checksum cfg-if 0.1.10 (registry+https://github.com/rust-lang/crates.io-index)" = "4785bdd1c96b2a846b2bd7cc02e86b6b3dbf14e7e53446c4f54c92a361040822" +"checksum clap 2.33.0 (registry+https://github.com/rust-lang/crates.io-index)" = "5067f5bb2d80ef5d68b4c87db81601f0b75bca627bc2ef76b141d7b846a3c6d9" "checksum cli_test_dir 0.1.7 (registry+https://github.com/rust-lang/crates.io-index)" = "2bc63338a59538d4f4b767dfb6082e4d26736aadb5100894b76039a04d6ad519" "checksum csv 1.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "37519ccdfd73a75821cac9319d4fce15a81b9fcf75f951df5b9988aa3a0af87d" "checksum csv-core 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)" = "9b5cadb6b25c77aeff80ba701712494213f4a8418fcda2ee11b6560c3ad0bf4c" -"checksum docopt 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "7f525a586d310c87df72ebcd98009e57f1cc030c8c268305287a476beb653969" "checksum env_logger 0.7.1 
(registry+https://github.com/rust-lang/crates.io-index)" = "44533bbbb3bb3c1fa17d9f2e4e38bbbaf8396ba82193c4cb1b6445d711445d36" "checksum error-chain 0.12.1 (registry+https://github.com/rust-lang/crates.io-index)" = "3ab49e9dcb602294bc42f9a7dfc9bc6e936fca4418ea300dbfb84fe16de0b7d9" +"checksum heck 0.3.1 (registry+https://github.com/rust-lang/crates.io-index)" = "20564e78d53d2bb135c343b3f47714a56af2061f1c928fdb541dc7b9fdd94205" "checksum humansize 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "b6cab2627acfc432780848602f3f558f7e9dd427352224b0d9324025796d2a5e" "checksum humantime 1.3.0 (registry+https://github.com/rust-lang/crates.io-index)" = "df004cfca50ef23c36850aaaa59ad52cc70d0e90243c3c7737a4dd32dc7a3c4f" "checksum itoa 0.4.4 (registry+https://github.com/rust-lang/crates.io-index)" = "501266b7edd0174f8530248f87f99c88fbe60ca4ef3dd486835b8d8d53136f7f" @@ -370,6 +438,7 @@ dependencies = [ "checksum libc 0.2.65 (registry+https://github.com/rust-lang/crates.io-index)" = "1a31a0627fdf1f6a39ec0dd577e101440b7db22672c0901fe00a9a6fbb5c24e8" "checksum log 0.4.8 (registry+https://github.com/rust-lang/crates.io-index)" = "14b6052be84e6b71ab17edffc2eeabf5c2c3ae1fdb464aae35ac50c67a44e1f7" "checksum memchr 2.2.1 (registry+https://github.com/rust-lang/crates.io-index)" = "88579771288728879b57485cc7d6b07d648c9f0141eb955f8ab7f9d45394468e" +"checksum proc-macro-error 0.2.6 (registry+https://github.com/rust-lang/crates.io-index)" = "aeccfe4d5d8ea175d5f0e4a2ad0637e0f4121d63bd99d356fb1f39ab2e7c6097" "checksum proc-macro2 1.0.6 (registry+https://github.com/rust-lang/crates.io-index)" = "9c9e470a8dc4aeae2dee2f335e8f533e2d4b347e1434e5671afc49b054592f27" "checksum quick-error 1.2.2 (registry+https://github.com/rust-lang/crates.io-index)" = "9274b940887ce9addde99c4eee6b5c44cc494b182b97e73dc8ffdcb3397fd3f0" "checksum quote 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)" = "053a8c8bcc71fcce321828dc897a98ab9760bef03a4fc36693c231e5b3216cfe" @@ -380,13 +449,18 @@ 
dependencies = [ "checksum rustc-demangle 0.1.16 (registry+https://github.com/rust-lang/crates.io-index)" = "4c691c0e608126e00913e33f0ccf3727d5fc84573623b8d65b2df340b5201783" "checksum ryu 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)" = "bfa8506c1de11c9c4e4c38863ccbe02a305c8188e85a05a784c9e11e1c3910c8" "checksum serde 1.0.101 (registry+https://github.com/rust-lang/crates.io-index)" = "9796c9b7ba2ffe7a9ce53c2287dfc48080f4b2b362fcc245a259b3a7201119dd" -"checksum serde_derive 1.0.101 (registry+https://github.com/rust-lang/crates.io-index)" = "4b133a43a1ecd55d4086bd5b4dc6c1751c68b1bfbeba7a5040442022c7e7c02e" -"checksum strsim 0.9.2 (registry+https://github.com/rust-lang/crates.io-index)" = "032c03039aae92b350aad2e3779c352e104d919cb192ba2fabbd7b831ce4f0f6" +"checksum strsim 0.8.0 (registry+https://github.com/rust-lang/crates.io-index)" = "8ea5119cdb4c55b55d432abb513a0429384878c15dde60cc77b1c99de1a95a6a" +"checksum structopt 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)" = "6d4f66a4c0ddf7aee4677995697366de0749b0139057342eccbb609b12d0affc" +"checksum structopt-derive 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)" = "8fe0c13e476b4e21ff7f5c4ace3818b6d7bdc16897c31c73862471bc1663acae" "checksum syn 1.0.5 (registry+https://github.com/rust-lang/crates.io-index)" = "66850e97125af79138385e9b88339cbcd037e3f28ceab8c5ad98e64f0f1f80bf" "checksum termcolor 1.0.5 (registry+https://github.com/rust-lang/crates.io-index)" = "96d6098003bde162e4277c70665bd87c326f5a0c3f3fbfb285787fa482d54e6e" +"checksum textwrap 0.11.0 (registry+https://github.com/rust-lang/crates.io-index)" = "d326610f408c7a4eb6f51c37c330e496b08506c9457c9d34287ecc38809fb060" "checksum thread_local 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)" = "c6b53e329000edc2b34dbe8545fd20e55a333362d0a321909685a19bd28c3f1b" "checksum time 0.1.42 (registry+https://github.com/rust-lang/crates.io-index)" = "db8dcfca086c1143c9270ac42a2bbd8a7ee477b78ac8e45b19abfb0cbede4b6f" 
+"checksum unicode-segmentation 1.3.0 (registry+https://github.com/rust-lang/crates.io-index)" = "1967f4cdfc355b37fd76d2a954fb2ed3871034eb4f26d60537d88795cfc332a9" +"checksum unicode-width 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)" = "7007dbd421b92cc6e28410fe7362e2e0a2503394908f417b68ec8d1c364c4e20" "checksum unicode-xid 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)" = "826e7639553986605ec5979c7dd957c7895e93eabed50ab2ffa7f6128a75097c" +"checksum vec_map 0.8.1 (registry+https://github.com/rust-lang/crates.io-index)" = "05c78687fb1a80548ae3250346c3db86a80a7cdd77bda190189f2d0a0987c81a" "checksum version_check 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)" = "914b1a6776c4c929a602fafd8bc742e06365d4bcbe48c30f9cca5824f70dc9dd" "checksum winapi 0.3.8 (registry+https://github.com/rust-lang/crates.io-index)" = "8093091eeb260906a183e6ae1abdba2ef5ef2257a21801128899c3fc699229c6" "checksum winapi-i686-pc-windows-gnu 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" diff --git a/Cargo.toml b/Cargo.toml index 6d68e31..538ffcd 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "scrubcsv" -version = "0.1.5" +version = "0.1.6" authors = ["Eric Kidd "] edition = "2018" @@ -16,8 +16,8 @@ homepage = "https://github.com/faradayio/scrubcsv" opt-level = 3 [dependencies] +clap = "2.33.0" csv = "1" -docopt = "1" env_logger = "0.7" error-chain = "0.12" humansize = "1.0.1" @@ -26,7 +26,7 @@ libc = "0.2.18" log = "0.4" regex = "1" serde = "1.0" -serde_derive = "1.0" +structopt = "0.3.3" time = "0.1.35" [dev-dependencies] diff --git a/src/main.rs b/src/main.rs index c13353b..d9d8d10 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,22 +1,11 @@ // Declare a list of external crates. 
-extern crate csv; -extern crate docopt; -extern crate env_logger; #[macro_use] extern crate error_chain; -extern crate humansize; -extern crate libc; -#[macro_use] -extern crate lazy_static; -#[macro_use] -extern crate log; -extern crate regex; -#[macro_use] -extern crate serde_derive; -extern crate time; // Import from other crates. use humansize::{file_size_opts, FileSize}; +use lazy_static::lazy_static; +use log::debug; use regex::bytes::Regex; use std::{ borrow::Cow, @@ -24,6 +13,7 @@ use std::{ io::{self, prelude::*}, process, }; +use structopt::StructOpt; // Modules defined in separate files. mod errors; @@ -33,31 +23,14 @@ mod util; use crate::errors::*; use crate::util::parse_char_specifier; -/// Provide a CLI help message, which doctopt will also use to parse our -/// command-line arguments. -const USAGE: &str = r#" -Usage: scrubcsv [options] [] - scrubcsv --help - scrubcsv --version - -Read a CSV file, normalize the "good" lines, and print them to standard +/// Our command-line arguments. +#[derive(Debug, StructOpt)] +#[structopt( + name = "scrubcsv", + about = "Clean and normalize a CSV file.", + after_help = "Read a CSV file, normalize the \"good\" lines, and print them to standard output. Discard any lines with the wrong number of columns. -Options: - --help Show this help message. - --version Print the version of this program. - -q, --quiet Do not print performance information. - -d, --delimiter CHAR Character used to separate fields in a row. - (must be a single ASCII byte). [default: ,] - --quote CHAR Character used to quote entries. May be set to - "none" to ignore all quoting. [default: "] - -n, --null NULLREGEX Convert values matching NULLREGEX to an empty - string. - --replace-newlines Replace LF and CRLF sequences in values with - spaces. This should improve compatibility with - systems like BigQuery that don't expect newlines - inside escaped strings. 
- Regular expressions use Rust syntax, as described here: https://doc.rust-lang.org/regex/regex/index.html#syntax @@ -67,19 +40,45 @@ attempt to transcode. Exit code: 0 on success 1 on error - 2 if more than 10% of rows were bad -"#; + 2 if more than 10% of rows were bad" +)] +struct Opt { + /// Input file (uses stdin if omitted). + input: Option, -/// Our command-line arguments. -#[derive(Debug, Deserialize)] -struct Args { - arg_input: Option, - flag_delimiter: String, - flag_null: Option, - flag_replace_newlines: bool, - flag_quiet: bool, - flag_quote: String, - flag_version: bool, + /// Character used to separate fields in a row (must be a single ASCII + /// byte, or "tab"). + #[structopt( + value_name = "CHAR", + short = "d", + long = "delimiter", + default_value = "," + )] + delimiter: String, + + /// Convert values matching NULLREGEX to an empty string. + #[structopt(value_name = "NULLREGEX", short = "n", long = "null")] + null: Option, + + /// Replace LF and CRLF sequences in values with spaces. This should improve + /// compatibility with systems like BigQuery that don't expect newlines + /// inside escaped strings. + #[structopt(long = "replace-newlines")] + replace_newlines: bool, + + /// Drop any rows where the specified column is empty or NULL. Can be passed + /// more than once. + #[structopt(value_name = "COL", long = "drop-row-if-null")] + drop_row_if_null: Vec, + + /// Do not print performance information. + #[structopt(short = "q", long = "quiet")] + quiet: bool, + + /// Character used to quote entries. May be set to "none" to ignore all + /// quoting. + #[structopt(value_name = "CHAR", long = "quote", default_value = "\"")] + quote: String, } lazy_static! { @@ -97,23 +96,15 @@ fn run() -> Result<()> { env_logger::init(); // Parse our command-line arguments using `docopt`. 
- let args: Args = docopt::Docopt::new(USAGE) - .and_then(|d| d.deserialize()) - .unwrap_or_else(|e| e.exit()); - debug!("Arguments: {:#?}", args); - - // Print our version if asked to do so. - if args.flag_version { - println!("scrubcsv {}", env!("CARGO_PKG_VERSION")); - process::exit(0); - } + let opt = Opt::from_args(); + debug!("Options: {:#?}", opt); // Figure out our field delimiter and quote character. - let delimiter = match parse_char_specifier(&args.flag_delimiter)? { + let delimiter = match parse_char_specifier(&opt.delimiter)? { Some(d) => d, _ => return Err("field delimiter is required".into()), }; - let quote = parse_char_specifier(&args.flag_quote)?; + let quote = parse_char_specifier(&opt.quote)?; // Remember the time we started. let start_time = time::precise_time_s(); @@ -150,7 +141,7 @@ fn run() -> Result<()> { // because we wrap the `BufReader` around the box, we only do that // dispatch once per buffer flush, not on every tiny write. let stdin = io::stdin(); - let unbuffered_input: Box = if let Some(ref path) = args.arg_input { + let unbuffered_input: Box = if let Some(ref path) = opt.input { Box::new(fs::File::open(path)?) } else { Box::new(stdin.lock()) @@ -158,7 +149,7 @@ fn run() -> Result<()> { let input = io::BufReader::new(unbuffered_input); // Build a set containing all our `--null` values. - let null_re = if let Some(null_re_str) = args.flag_null.as_ref() { + let null_re = if let Some(null_re_str) = opt.null.as_ref() { let s = format!("^{}$", null_re_str); let re = Regex::new(&s) .chain_err(|| -> Error { "can't compile regular expression".into() })?; @@ -169,8 +160,8 @@ fn run() -> Result<()> { // Create our CSV reader. let mut rdr_builder = csv::ReaderBuilder::new(); - // Treat headers (if any) as any other record. - rdr_builder.has_headers(false); + // We need headers so that we can honor --drop-row-if-null. + rdr_builder.has_headers(true); // Allow records with the wrong number of columns. 
rdr_builder.flexible(true); // Configure our delimiter. @@ -182,20 +173,34 @@ fn run() -> Result<()> { rdr_builder.quoting(false); } let mut rdr = rdr_builder.from_reader(input); + let hdr = rdr.byte_headers()?.to_owned(); + + // Just in case --drop-row-if-null was passed, precompute which columns are + // required. + let required_cols = hdr + .iter() + .map(|name| -> bool { + opt.drop_row_if_null + .iter() + .any(|requried_name| requried_name.as_bytes() == name) + }) + .collect::>(); // Create our CSV writer. Note that we _don't_ allow variable numbers // of columns, non-standard delimiters, or other nonsense: We want our // output to be highly normalized. let mut wtr = csv::WriterBuilder::new().flexible(true).from_writer(output); + wtr.write_byte_record(&hdr)?; // Keep track of total rows and malformed rows seen. - let mut rows: u64 = 0; + let mut rows: u64 = 1; let mut bad_rows: u64 = 0; // Can we use the fast path and copy the data through unchanged? Or do we // need to clean up emebedded newlines in our data? (These break BigQuery, // for example.) - let use_fast_path = null_re.is_none() && !args.flag_replace_newlines; + let use_fast_path = + null_re.is_none() && !opt.replace_newlines && opt.drop_row_if_null.is_empty(); // Iterate over all the rows, checking to make sure they look // reasonable. @@ -204,58 +209,64 @@ fn run() -> Result<()> { // about 225 MB/s. But it turns out we can't do that, because we need to // have a copy of all the row's fields before deciding whether or not // to write it out. - let mut columns_expected = None; - for record in rdr.byte_records() { + 'next_row: for record in rdr.byte_records() { let record = record?; - // Keep track of how many columns we expected. - let is_good = match columns_expected { - // This is the first row. - None => { - columns_expected = Some(record.len()); - true - } - // We know how many columns we expect, and it matches. - Some(expected) if record.len() == expected => true, - // The current row is weird. 
- Some(_) => false, - }; - - // If this is a good row, output it. - if is_good { - if use_fast_path { - // We don't need to do anything fancy, so just pass it through. - // I'm not sure how much this actually buys us in current Rust - // versions, but it seemed like a good idea at the time. - wtr.write_record(record.into_iter())?; - } else { - // We need to apply one or more cleanups, so run the slow path. - wtr.write_record(record.into_iter().map(|mut val| { - // Convert values matching `--null` regex to empty strings. - if let Some(ref null_re) = null_re { - if null_re.is_match(&val) { - val = &[] - } + // Keep track of how many rows we've seen. + rows += 1; + + // Check if we have the right number of columns in this row. + if record.len() != hdr.len() { + bad_rows += 1; + continue 'next_row; + } + + // Decide how to handle this row. + if use_fast_path { + // We don't need to do anything fancy, so just pass it through. + // I'm not sure how much this actually buys us in current Rust + // versions, but it seemed like a good idea at the time. + wtr.write_record(record.into_iter())?; + } else { + // We need to apply one or more cleanups, so run the slow path. + let cleaned = record.into_iter().map(|mut val: &[u8]| -> Cow<[u8]> { + // Convert values matching `--null` regex to empty strings. + if let Some(ref null_re) = null_re { + if null_re.is_match(&val) { + val = &[] } + } - // Fix newlines. - if args.flag_replace_newlines - && (val.contains(&b'\n') || val.contains(&b'\r')) - { - NEWLINE_RE.replace_all(val, &b" "[..]) - } else { - Cow::Borrowed(val) + // Fix newlines. + if opt.replace_newlines + && (val.contains(&b'\n') || val.contains(&b'\r')) + { + NEWLINE_RE.replace_all(val, &b" "[..]) + } else { + Cow::Borrowed(val) + } + }); + if opt.drop_row_if_null.is_empty() { + // Still somewhat fast! + wtr.write_record(cleaned)?; + } else { + // We need to rebuild the record, check for null columns, + // and only output the record if everything's OK. 
+ let row = cleaned.collect::>>(); + for (value, &is_required_col) in row.iter().zip(required_cols.iter()) { + // If the column is NULL but shouldn't be, bail on this row. + if is_required_col && value.is_empty() { + bad_rows += 1; + continue 'next_row; } - }))?; + } + wtr.write_record(row)?; } - } else { - bad_rows += 1; } - rows += 1; } // Print out some information about our run. - if !args.flag_quiet { + if !opt.quiet { let ellapsed = time::precise_time_s() - start_time; let bytes_per_second = (rdr.position().byte() as f64 / ellapsed) as i64; writeln!( diff --git a/tests/tests.rs b/tests/tests.rs index a6e7799..6e31d4a 100644 --- a/tests/tests.rs +++ b/tests/tests.rs @@ -8,7 +8,8 @@ use cli_test_dir::*; fn help_flag() { let testdir = TestDir::new("scrubcsv", "flag_help"); let output = testdir.cmd().arg("--help").expect_success(); - assert!(output.stdout_str().contains("scrubcsv --help")); + assert!(output.stdout_str().contains("scrubcsv")); + assert!(output.stdout_str().contains("--help")); } #[test] @@ -135,9 +136,9 @@ fn null_normalization() { let output = testdir .cmd() .args(&["--null", "(?i)null|NIL"]) - .output_with_stdin("null,NIL,nil,,not null\n") + .output_with_stdin("a,b,c,d,e\nnull,NIL,nil,,not null\n") .expect_success(); - assert_eq!(output.stdout_str(), ",,,,not null\n") + assert_eq!(output.stdout_str(), "a,b,c,d,e\n,,,,not null\n") } #[test] @@ -146,7 +147,34 @@ fn replace_newlines() { let output = testdir .cmd() .arg("--replace-newlines") - .output_with_stdin("\"line\r\nbreak\r1\",\"line\nbreak\n2\"\n") + .output_with_stdin("a,b\n\"line\r\nbreak\r1\",\"line\nbreak\n2\"\n") .expect_success(); - assert_eq!(output.stdout_str(), "line break 1,line break 2\n"); + assert_eq!(output.stdout_str(), "a,b\nline break 1,line break 2\n"); +} + +#[test] +fn drop_row_if_null() { + let testdir = TestDir::new("scrubcsv", "drop_row_if_null"); + let output = testdir + .cmd() + .arg("--drop-row-if-null=c1") + .arg("--drop-row-if-null=c2") + .args(&["--null", 
"NULL"]) + .output_with_stdin( + r#"c1,c2,c3 +1,, +,2, +NULL,3, +a,b,c +"#, + ) + .expect("error running scrubcsv"); + eprintln!("{}", output.stderr_str()); + //assert_eq!(output.status.code(), Some(2)); + assert_eq!( + output.stdout_str(), + r#"c1,c2,c3 +a,b,c +"# + ); }