From c7475df38e448fdecf8090ec5dbe19dd7dd0bfa4 Mon Sep 17 00:00:00 2001 From: Dmitry Mozzherin Date: Thu, 27 Feb 2020 21:29:39 -0600 Subject: [PATCH 1/2] Close #13 save bad rows to a file --- Cargo.toml | 2 +- src/main.rs | 21 +++++++++++++++++++++ tests/tests.rs | 47 +++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 69 insertions(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index d712485..2ea6e47 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -13,7 +13,7 @@ homepage = "https://github.com/faradayio/scrubcsv" [profile.dev] # We always want to be fast, even in debug mode. Comment this out for much # faster compiles. -opt-level = 3 +# opt-level = 3 [dependencies] clap = { version = "2.33.0", features = ["wrap_help"] } diff --git a/src/main.rs b/src/main.rs index bbdd7f5..2a1c8e6 100644 --- a/src/main.rs +++ b/src/main.rs @@ -99,6 +99,10 @@ struct Opt { /// quoting. #[structopt(value_name = "CHAR", long = "quote", default_value = "\"")] quote: CharSpecifier, + + /// Save badly formed rows to a file. + #[structopt(value_name = "PATH", long = "bad-rows-path")] + bad_rows_path: Option<PathBuf>, } lazy_static! { @@ -182,6 +186,13 @@ fn run() -> Result<()> { .buffer_capacity(BUFFER_SIZE) .from_writer(output); + // Create our CSV writer for bad rows if it is requested. + let mut bad_rows_wtr = if let Some(ref path) = opt.bad_rows_path { + Some(csv::WriterBuilder::new().from_path(path)?) + } else { + None + }; + // Get our header and, if we were asked, make sure all the column names are unique. let mut hdr = rdr .byte_headers() @@ -244,6 +255,11 @@ fn run() -> Result<()> { // Check if we have the right number of columns in this row. if record.len() != expected_cols { bad_rows += 1; + if let Some(ref mut wtr_bad) = bad_rows_wtr { + wtr_bad + .write_record(record.into_iter()) + .context("cannot write record")?; + }; continue 'next_row; } @@ -303,6 +319,11 @@ fn run() -> Result<()> { // If the column is NULL but shouldn't be, bail on this row. 
if is_required_col && value.is_empty() { bad_rows += 1; + if let Some(ref mut wtr_bad) = bad_rows_wtr { + wtr_bad + .write_record(record.into_iter()) + .context("cannot write record")?; + }; continue 'next_row; } } diff --git a/tests/tests.rs b/tests/tests.rs index b47dd37..3fd2957 100644 --- a/tests/tests.rs +++ b/tests/tests.rs @@ -111,6 +111,26 @@ fn bad_rows() { assert!(output.stderr_str().contains("102 rows (1 bad)")); } +#[test] +fn bad_rows_saved() { + let mut good_rows = "a,b,c\n".to_owned(); + for _ in 0..100 { + good_rows.push_str("1,2,3\n"); + } + let mut bad_rows = good_rows.clone(); + bad_rows.push_str("1,2\n"); + + let testdir = TestDir::new("scrubcsv", "bad_rows_saved"); + let output = testdir + .cmd() + .args(&["--bad-rows-path", "bad.csv"]) + .output_with_stdin(&bad_rows) + .expect_success(); + testdir.expect_file_contents("bad.csv", "1,2\n"); + assert_eq!(output.stdout_str(), &good_rows); + assert!(output.stderr_str().contains("102 rows (1 bad)")); +} + #[test] fn too_many_bad_rows() { let testdir = TestDir::new("scrubcsv", "too_many_bad_rows"); @@ -200,3 +220,30 @@ a,b,c "# ); } + +#[test] +fn drop_row_if_null_saved() { + let testdir = TestDir::new("scrubcsv", "drop_row_if_null_saved"); + let output = testdir + .cmd() + .arg("--drop-row-if-null=c1") + .arg("--drop-row-if-null=c2") + .args(&["--bad-rows-path", "bad.csv"]) + .output_with_stdin( + r#"c1,c2,c3 +1,, +a,b,c +1,2,3 +3,2,1 +1,4,5 +2,2,2 +1,1,1 +5,5,5 +2,2,2 +1,1,1 +"#, + ) + .expect("error running scrubcsv"); + eprintln!("{}", output.stderr_str()); + testdir.expect_file_contents("bad.csv", "1,,\n"); +} From 91b74ac2b48555115c8f0898a8ffa6c5d01ec085 Mon Sep 17 00:00:00 2001 From: Dmitry Mozzherin Date: Tue, 3 Mar 2020 12:58:09 -0600 Subject: [PATCH 2/2] restore opt-level = 3 --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 2ea6e47..d712485 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -13,7 +13,7 @@ homepage = 
"https://github.com/faradayio/scrubcsv" [profile.dev] # We always want to be fast, even in debug mode. Comment this out for much # faster compiles. -# opt-level = 3 +opt-level = 3 [dependencies] clap = { version = "2.33.0", features = ["wrap_help"] }