diff --git a/.gitignore b/.gitignore index 1d281f8f0..b96295c72 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,7 @@ /.bundle/ /.yardoc /_yardoc/ +/docs /coverage/ /pkg/ /spec/reports/ diff --git a/.ruby-version b/.ruby-version index b50214693..a4dd9dba4 100644 --- a/.ruby-version +++ b/.ruby-version @@ -1 +1 @@ -3.0.2 +2.7.4 diff --git a/CHANGELOG.adoc b/CHANGELOG.adoc index 91cde676d..e0f743506 100644 --- a/CHANGELOG.adoc +++ b/CHANGELOG.adoc @@ -23,6 +23,11 @@ toc::[] == Unreleased These changes are merged into the `main` branch but have not yet been tagged as a new version/release. +==== Breaking +* Changes to keyword argument names for `Delete::FieldValueIfEqualsOtherField` (in https://github.com/lyrasis/kiba-extend/pull/57[PR#57]) +** `sep` becomes `delim` +** `case_sensitive` becomes `casesensitive` + ==== Added * `multival` parameter added to `Cspace::NormalizeForID` transform (in https://github.com/lyrasis/kiba-extend/pull/49[PR#49]) * new https://lyrasis.github.io/kiba-extend/Kiba/Extend/Transforms/Count/FieldValues.html[`Count::FieldValues`] transform (in https://github.com/lyrasis/kiba-extend/pull/50[PR#50]) @@ -31,14 +36,32 @@ These changes are merged into the `main` branch but have not yet been tagged as ** warns of any supplied files that do not exist (in https://github.com/lyrasis/kiba-extend/pull/54[PR#54]) ** creates any reference directories that do not exist (in https://github.com/lyrasis/kiba-extend/pull/54[PR#54]) * test Clean::RegexpFindReplaceFieldVals to replace `\n` (in https://github.com/lyrasis/kiba-extend/pull/55[PR#55]) +* `Helpers.empty?` method, which returns true/false for a given string value (without treating delimiter values as special) (in https://github.com/lyrasis/kiba-extend/pull/57[PR#57]) +* `fields` keyword argument to `Delete::FieldsExcept`, which should be used going forward instead of `keepfields` (in https://github.com/lyrasis/kiba-extend/pull/57[PR#57]) +* `nullvalue` setting to `Kiba::Extend.config`. 
Default value is '%NULLVALUE%' (in https://github.com/lyrasis/kiba-extend/pull/57[PR#57]) +* `usenull` keyword argument to `Delete::EmptyFieldValues` (in https://github.com/lyrasis/kiba-extend/pull/57[PR#57]) +* `delim` keyword argument to `Delete::EmptyFieldValues`, which should be used going forward instead of `sep` (in https://github.com/lyrasis/kiba-extend/pull/57[PR#57]) +* documentation for `Delete` transforms (in https://github.com/lyrasis/kiba-extend/pull/57[PR#57]) +* `Delete::EmptyFields` transform (in https://github.com/lyrasis/kiba-extend/pull/57[PR#57]) ==== Changed -- move/alias `Merge::CountOfMatchingRows` to `Count::MatchingRowsInLookup`(in https://github.com/lyrasis/kiba-extend/pull/50[PR#50]) +* move/alias `Merge::CountOfMatchingRows` to `Count::MatchingRowsInLookup` (in https://github.com/lyrasis/kiba-extend/pull/50[PR#50]) +* `Delete::FieldsExcept` can accept a single symbol as value for `fields` keyword argument (in https://github.com/lyrasis/kiba-extend/pull/57[PR#57]) +* `Delete::EmptyFieldValues` will default to `Kiba::Extend.delim` as delimiter if none given explicitly (in https://github.com/lyrasis/kiba-extend/pull/57[PR#57]) +* keyword argument names for `Delete::FieldValueIfEqualsOtherField` (in https://github.com/lyrasis/kiba-extend/pull/57[PR#57]) +** `sep` becomes `delim` +** `case_sensitive` becomes `casesensitive` ==== Deleted - Removed JARD as development dependency (in https://github.com/lyrasis/kiba-extend/pull/52[PR#52]) - Removed `-t` alias from `jobs:tagged_and` and `jobs:tagged_or` tasks, as they conflicted with the `-t/--tell` option (in https://github.com/lyrasis/kiba-extend/pull/56[PR#56]) +==== To be deprecated/Will break in a future version +These will now give warnings if used. + +- `Delete::FieldsExcept` `keepfields` keyword parameter. Change to `fields` (in https://github.com/lyrasis/kiba-extend/pull/57[PR#57]) +- `Delete::EmptyFieldValues` `sep` keyword parameter.
Change to `delim` (in https://github.com/lyrasis/kiba-extend/pull/57[PR#57]) + == Releases === version - date diff --git a/Gemfile b/Gemfile index 95db0c43e..784bd1ad8 100644 --- a/Gemfile +++ b/Gemfile @@ -16,7 +16,6 @@ group :development, :test do gem 'rspec', '~> 3.10' gem 'rubocop', '~> 1.18.4' gem 'rubocop-rspec', '~> 2.4.0' -# gem 'ruby_jard' end group :test do diff --git a/Gemfile.lock b/Gemfile.lock index a9ca908eb..cfec56dc2 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -1,7 +1,7 @@ PATH remote: . specs: - kiba-extend (2.5.3) + kiba-extend (2.6.0) activesupport (~> 6) csv (~> 3) dry-configurable (~> 0) @@ -15,7 +15,7 @@ PATH GEM remote: https://rubygems.org/ specs: - activesupport (6.1.4.1) + activesupport (6.1.4.6) concurrent-ruby (~> 1.0, >= 1.0.2) i18n (>= 1.6, < 2) minitest (>= 5.1) @@ -25,10 +25,10 @@ GEM byebug (11.1.3) coderay (1.1.3) concurrent-ruby (1.1.9) - csv (3.2.0) + csv (3.2.2) diff-lcs (1.4.4) docile (1.4.0) - dry-configurable (0.13.0) + dry-configurable (0.14.0) concurrent-ruby (~> 1.0) dry-core (~> 0.6) dry-container (0.9.0) @@ -36,15 +36,15 @@ GEM dry-configurable (~> 0.13, >= 0.13.0) dry-core (0.7.1) concurrent-ruby (~> 1.0) - i18n (1.8.10) + i18n (1.10.0) concurrent-ruby (~> 1.0) kiba (4.0.0) kiba-common (1.5.0) kiba (>= 3.0.0, < 5) - measured (2.7.1) + measured (2.8.2) activesupport (>= 5.2) method_source (1.0.0) - minitest (5.14.4) + minitest (5.15.0) parallel (1.20.1) parser (3.0.2.0) ast (~> 2.4.1) @@ -90,12 +90,12 @@ GEM simplecov_json_formatter (~> 0.1) simplecov-html (0.12.3) simplecov_json_formatter (0.1.3) - thor (1.1.0) + thor (1.2.1) tzinfo (2.0.4) concurrent-ruby (~> 1.0) unicode-display_width (2.0.0) xxhash (0.4.0) - zeitwerk (2.4.2) + zeitwerk (2.5.4) PLATFORMS ruby diff --git a/lib/kiba/extend.rb b/lib/kiba/extend.rb index 0104ef67d..ea62f49dc 100644 --- a/lib/kiba/extend.rb +++ b/lib/kiba/extend.rb @@ -9,7 +9,6 @@ require 'kiba-common/destinations/csv' require 'kiba-common/destinations/lambda' require 'pry' -#require 
'ruby_jard' require 'xxhash' require 'kiba/extend/registry/file_registry' @@ -58,6 +57,9 @@ module Extend # Example: 'a^^y;b^^z' -> [['a', 'y'], ['b', 'z']] setting :sgdelim, default: '^^', reader: true + # Default string to be treated as though it were a null/empty value. + setting :nullvalue, default: '%NULLVALUE%', reader: true + # Default source class for jobs setting :source, default: Kiba::Common::Sources::CSV, reader: true diff --git a/lib/kiba/extend/transforms/deduplicate.rb b/lib/kiba/extend/transforms/deduplicate.rb index 0c1996f08..b250bdc8f 100644 --- a/lib/kiba/extend/transforms/deduplicate.rb +++ b/lib/kiba/extend/transforms/deduplicate.rb @@ -148,8 +148,8 @@ def process(row) # Used in pipeline as: # # ``` - # @deduper = {} - # transform Deduplicate::FieldValues, fields: %i[foo bar], sep: ';' + # @deduper = {} + # transform Deduplicate::FieldValues, fields: %i[foo bar], sep: ';' # ``` # # Results in: @@ -206,8 +206,8 @@ def process(row) # Used in pipeline as: # # ``` - # @deduper = {} - # transform Deduplicate::Flag, on_field: :combined, in_field: :duplicate, using: @deduper + # @deduper = {} + # transform Deduplicate::Flag, on_field: :combined, in_field: :duplicate, using: @deduper # ``` # # Results in: diff --git a/lib/kiba/extend/transforms/delete.rb b/lib/kiba/extend/transforms/delete.rb index 57f40cc30..92b18139c 100644 --- a/lib/kiba/extend/transforms/delete.rb +++ b/lib/kiba/extend/transforms/delete.rb @@ -5,70 +5,7 @@ module Extend module Transforms # Tranformations to delete fields and field values module Delete - ::Delete = Kiba::Extend::Transforms::Delete - class EmptyFieldValues - def initialize(fields:, sep:) - @fields = [fields].flatten - @sep = sep - end - - # @private - def process(row) - @fields.each do |field| - val = row.fetch(field) - row[field] = val.split(@sep).compact.reject(&:empty?).join(@sep) unless val.nil? 
- end - row - end - end - - class Fields - def initialize(fields:) - @fields = [fields].flatten - end - - # @private - def process(row) - @fields.each { |name| row.delete(name) } - row - end - end - - class FieldsExcept - def initialize(keepfields:) - @fields = keepfields - end - - # @private - def process(row) - deletefields = row.keys - @fields - deletefields.each { |f| row.delete(f) } - row - end - end - - class FieldValueContainingString - def initialize(fields:, match:, casesensitive: true) - @fields = [fields].flatten - @match = casesensitive ? match : match.downcase - @casesensitive = casesensitive - end - - # @private - def process(row) - @fields.each do |field| - exval = row.fetch(field) - if exval.nil? - # do nothing - else - exval = @casesensitive ? row.fetch(field) : row.fetch(field).downcase - row[field] = nil if exval[@match] - end - end - row - end - end - + ::Delete = Kiba::Extend::Transforms::Delete class FieldValueIfEqualsOtherField def initialize(delete:, if_equal_to:, multival: false, sep: nil, grouped_fields: [], case_sensitive: true) @delete = delete @@ -109,26 +46,6 @@ def process(row) row end end - - class FieldValueMatchingRegexp - def initialize(fields:, match:, casesensitive: true) - @fields = [fields].flatten - @match = casesensitive ? Regexp.new(match) : Regexp.new(match, Regexp::IGNORECASE) - end - - # @private - def process(row) - @fields.each do |field| - exval = row.fetch(field) - if exval.nil? 
- # do nothing - elsif exval.match?(@match) - row[field] = nil - end - end - row - end - end end end end diff --git a/lib/kiba/extend/transforms/delete/empty_field_values.rb b/lib/kiba/extend/transforms/delete/empty_field_values.rb new file mode 100644 index 000000000..1553e175d --- /dev/null +++ b/lib/kiba/extend/transforms/delete/empty_field_values.rb @@ -0,0 +1,111 @@ +# frozen_string_literal: true + +module Kiba + module Extend + module Transforms + module Delete + + # @note Only useful for multi-valued fields + # + # Deletes any empty values from the field. Supports `usenull` = true to treat the value of + # `Kiba::Extend.nullvalue` as empty + # + # # Examples + # + # Assuming `Kiba::Extend.nullvalue` = `%NULLVALUE%`, and input table: + # + # ``` + # | data | + # |------------------| + # | abc;;;d e f | + # | ;;abc | + # | def;;;; | + # | ;;;;; | + # | ;;;%NULLVALUE%;; | + # | | + # | nil | + # ``` + # + # Used in pipeline as: + # + # ``` + # transform Delete::EmptyFieldValues, fields: [:data], delim: ';' + # ``` + # + # Results in: + # + # ``` + # | data | + # |-------------| + # | abc;d e f | + # | abc | + # | def | + # | | + # | %NULLVALUE% | + # | | + # | nil | + # ``` + # + # Used in pipeline as: + # + # ``` + # transform Delete::EmptyFieldValues, fields: [:data], delim: ';', usenull: true + # ``` + # + # Results in: + # + # ``` + # | data | + # |-----------| + # | abc;d e f | + # | abc | + # | def | + # | | + # | | + # | | + # | nil | + # ``` + # + class EmptyFieldValues + # @note `sep` will be removed in a future version. **DO NOT USE** + # @param fields [Array,Symbol] field(s) to delete from + # @param sep [String] **DEPRECATED; DO NOT USE** + # @param delim [String] on which to split multivalued fields. Defaults to `Kiba::Extend.delim` if not provided.
+ # @param usenull [Boolean] whether to treat `Kiba::Extend.nullvalue` string as an empty value + def initialize(fields:, sep: nil, delim: nil, usenull: false) + @fields = [fields].flatten + @usenull = usenull + + if sep && delim + puts %Q[#{Kiba::Extend.warning_label}: Do not use both `sep` and `delim`. Prefer `delim`] + # Both given: honor `delim`, as the warning advises, so the delimiter is never left unset + @delim = delim + elsif sep + puts %Q[#{Kiba::Extend.warning_label}: The `sep` keyword is being deprecated in a future version. Change it to `delim` in your ETL code.] + @delim = sep + else + @delim = delim ? delim : Kiba::Extend.delim + end + end + + # @private + + def process(row) + fields.each do |field| + val = row.fetch(field) + next if val.nil? + + row[field] = val.split(delim) + .compact + .reject{ |str| Helpers.empty?(str, usenull) } + .join(delim) + end + row + end + + private + + attr_reader :fields, :delim, :usenull + end + end + end + end +end diff --git a/lib/kiba/extend/transforms/delete/empty_fields.rb b/lib/kiba/extend/transforms/delete/empty_fields.rb new file mode 100644 index 000000000..8462a36d9 --- /dev/null +++ b/lib/kiba/extend/transforms/delete/empty_fields.rb @@ -0,0 +1,173 @@ +# frozen_string_literal: true + +module Kiba + module Extend + module Transforms + module Delete + + # Removes fields/columns that contain no values. Supports treating `Kiba::Extend.nullvalue` as an empty value. + # Also supports specifying field-specific values that should be treated as though they are empty. + # + # @note This transform runs in memory, so for very large sources, it may take a long time or fail. + # + # # Examples + # + # The examples demonstrating `usenull` assume `Kiba::Extend.nullvalue` is set to `%NULLVALUE%`.
+ # + # ## Basic usage + # + # Used in pipeline as: + # + # ``` + # transform Delete::EmptyFields + # ``` + # + # Input table: + # + # ``` + # | a | b | c | d | + # |-----+-----+-----+-----| + # | a | | ccc | | + # | | nil | c | nil | + # | nil | | ccc | | + # | a | | | | + # ``` + # + # Results in: + # + # ``` + # | a | c | + # |-----+-----| + # | a | ccc | + # | | c | + # | nil | ccc | + # | a | | + # ``` + # + # ### Notes + # Empty strings and nil values are treated as empty by default. + # + # ## With usenull true + # + # Used in pipeline as: + # + # ``` + # transform Delete::EmptyFields, usenull: true + # ``` + # + # Input table: + # + # ``` + # | a | b | c | d | e | + # |-----+-----+-----+-----+-------------| + # | | nil | c | nil | %NULLVALUE% | + # | a | | ccc | | | + # | nil | | ccc | | %NULLVALUE% | + # | a | | | | | + # ``` + # + # Results in: + # + # ``` + # | a | c | + # |-----+-----| + # | | c | + # | a | ccc | + # | nil | ccc | + # | a | | + # ``` + # + # ## With consider_blank config given + # Used in pipeline as: + # + # ``` + # transform Delete::EmptyFields, consider_blank: {b: 'false', c: 'nope', e: "0#{Kiba::Extend.delim}false"} + # ``` + # + # Input table: + # + # ``` + # | a | b | c | d | e | + # |-----+-------+-------------+-----+-------| + # | | nil | | nil | 0 | + # | a | | %NULLVALUE% | | false | + # | nil | false | nope | | 0 | + # | a | | nil | | | + # ``` + # + # Results in: + # + # ``` + # | a | c | + # |-----+-------------| + # | | | + # | a | %NULLVALUE% | + # | nil | nope | + # | a | nil | + # ``` + # + # ### Notes + # Field `c` is retained because `usenull: true` is not used. If that argument were given, only Field `a` would be returned. + # + class EmptyFields + # @param usenull [Boolean] whether to treat `Kiba::Extend.nullvalue` as empty/blank value + # @param consider_blank [Hash{Symbol=>Array}] specifies field-specific value(s) that should be treated + # as blank/empty. 
**If multiple values should be considered blank for one field, join them using + # `Kiba::Extend.delim`** + def initialize(usenull: false, consider_blank: nil) + @usenull = usenull + @consider_blank = consider_blank ? consider_blank.transform_values{ |val| val.split(Kiba::Extend.delim) } : nil + @pop_fields = {} + @rows = [] + end + + # @private + def process(row) + populate_tracker(row) + nil + end + + # @private + def close + # No rows were accumulated (e.g. empty source): there is nothing to inspect or yield + return if rows.empty? + + to_delete = rows.first.keys - pop_fields.keys + rows.each do |row| + to_delete.each{ |field| row.delete(field) } + yield row + end + end + + private + + attr_reader :pop_fields, :rows, :usenull, :consider_blank + + def populate_tracker(row) + prepare(row).each{ |field, val| pop_fields[field] = nil unless val.blank? } + rows << row + end + + def prepare(row) + return row unless usenull || consider_blank + + strip_consider_blanks(strip_nulls(row.dup)) + end + + def strip_consider_blanks(row) + return row unless consider_blank + + consider_blank.each do |field, blank_vals| + row[field] = '' if blank_vals.any?(row[field]) + end + row + end + + def strip_nulls(row) + return row unless usenull + + row.transform_values{ |val| Helpers.empty?(val, usenull) ? '' : val } + end + end + end + end + end +end + diff --git a/lib/kiba/extend/transforms/delete/field_value_containing_string.rb b/lib/kiba/extend/transforms/delete/field_value_containing_string.rb new file mode 100644 index 000000000..eeb8d3fe4 --- /dev/null +++ b/lib/kiba/extend/transforms/delete/field_value_containing_string.rb @@ -0,0 +1,100 @@ +# frozen_string_literal: true + +module Kiba + module Extend + module Transforms + module Delete + + # Deletes full field value of all given fields that contain the given string. You can control + # whether match is case sensitive or not. + # + # To be clear, **contain = a partial match**. Use {FieldValueMatchingRegexp} with anchors to + # trigger deletion via a full match.
+ # + # # Examples + # + # Input table: + # + # ``` + # | a | b | + # |----------------+------| + # | xxxx a thing | foo | + # | thing xxxx 123 | bar | + # | x thing | xxxx | + # | y thing | xXxX | + # | xxxxxxx thing | baz | + # | | nil | + # ``` + # + # Used in pipeline as: + # + # ``` + # transform Delete::FieldValueContainingString, fields: %i[a b], match: 'xxxx' + # ``` + # + # Results in: + # + # ``` + # | a | b | + # |---------+------| + # | nil | foo | + # | nil | bar | + # | x thing | nil | + # | y thing | xXxX | + # | nil | baz | + # | | nil | + # ``` + # + # Input table: + # + # ``` + # | a | b | + # |---------+---------| + # | y thing | xXxXxXy | + # ``` + # + # Used in pipeline as: + # + # ``` + # transform Delete::FieldValueContainingString, fields: :b, match: 'xxxx', casesensitive: false + # ``` + # + # Results in: + # + # ``` + # | a | b | + # |---------+-----| + # | y thing | nil | + # ``` + # + class FieldValueContainingString + # @param fields [Array,Symbol] field(s) to delete from + # @param match [String] value to match + # @param casesensitive [Boolean] match mode + def initialize(fields:, match:, casesensitive: true) + @fields = [fields].flatten + @match = casesensitive ? match : match.downcase + @casesensitive = casesensitive + end + + # @private + def process(row) + fields.each do |field| + exval = row.fetch(field) + next if exval.blank? + + prepped = casesensitive ? 
exval : exval.downcase + row[field] = nil if prepped[match] + end + + row + end + + private + + attr_reader :fields, :match, :casesensitive + end + end + end + end +end diff --git a/lib/kiba/extend/transforms/delete/field_value_if_equals_other_field.rb b/lib/kiba/extend/transforms/delete/field_value_if_equals_other_field.rb new file mode 100644 index 000000000..c36f7b481 --- /dev/null +++ b/lib/kiba/extend/transforms/delete/field_value_if_equals_other_field.rb @@ -0,0 +1,276 @@ +# frozen_string_literal: true + +module Kiba + module Extend + module Transforms + module Delete + + # Deletes value in `delete` field if that value matches value in `if_equal_to` field. Opinionated treatment + # of multivalued fields described below. Case sensitive or insensitive matching options. Can also delete + # associated field values (by position) in additional grouped fields. This is useful, for example, + # in maintaining the integrity of grouped/subgrouped multivalue fields in CollectionSpace. + # + # **Note that the value of the `if_equal_to` field is never modified by this transform.** + # + # # Examples + # ## Simple example + # + # Input table: + # + # ``` + # | del | compare | + # |-----+---------| + # | a | b | + # | c | c | + # ``` + # + # Used in pipeline as: + # + # ``` + # transform Delete::FieldValueIfEqualsOtherField, delete: :del, if_equal_to: :compare + # ``` + # + # Results in: + # + # ``` + # | del | compare | + # |-----+---------| + # | a | b | + # | nil | c | + # ``` + # + # ### Notes + # + # The first row is left alone because a != b. + # + # In the second row, c is deleted from `del` because the value of `compare` is also c. + # + # ## Complex example + # + # This introduces handling of multivalued grouped fields with case insensitive matching. 
+ # + # Used in pipeline as: + # + # ``` + # transform Delete::FieldValueIfEqualsOtherField, + # delete: :del, + # if_equal_to: :compare, + # multival: true, + # delim: ';', + # grouped_fields: %i[grpa grpb], + # casesensitive: false + # ``` + # + # Input table: + # + # ``` + # | row | del | compare | grpa | grpb | + # |-----+-----------+---------+-----------+-----------| + # | 1 | A;C;d;c;e | c | y;x;w;u;v | e;f;g;h;i | + # | 2 | a;b;c | a;z;c | d;e;f | g;h;i | + # | 3 | a | a;b | d | g | + # | 4 | a | b | z | q | + # | 5 | a | a | z | q | + # ``` + # + # Results in: + # + # ``` + # | row | del | compare | grpa | grpb | + # |-----+-------+---------+-------+-------| + # | 1 | A;d;e | c | y;w;v | e;g;i | + # | 2 | b | a;z;c | e | h | + # | 3 | nil | a;b | nil | nil | + # | 4 | a | b | z | q | + # | 5 | nil | a | nil | nil | + # ``` + # + # ### Notes + # #### Row 1 + # **If `compare` is a single value, all individual values in `del` are compared to the single `compare` value.** + # + # In `del` field, elements 1 (C) and 3 (c) are case-insensitive matches on the value in `compare`. Thus, + # elements 1 and 3 are removed from `del` and both grouped fields. + # + # #### Row 2 + # **If `compare` has multiple values, the values of `del` and `compare` are compared positionally.** + # + # Element 0 is a match (a in both). Element 1 is not (b != z). Element 2 is a match (c in both). + # + # Elements 0 and 2 are removed from `del` and all grouped fields. + # + # #### Row 3 + # `compare` is multivalued, so `del` is compared positionally against `compare`, though `del` (and + # the grouped fields) are single valued. + # + # When all values are removed from a field, `nil` is returned. + # + # #### Row 4 + # a != b, so row is returned unmodified. + # + # #### Row 5 + # a = a, so a (Element 0) is removed from `del`. Element 0 is then removed from the grouped fields.
+ # + # ## Group length mismatch: ragged groups + # + # This introduces handling of multivalued grouped fields if fields grouped together have different numbers of + # values. + # + # Used in pipeline as: + # + # ``` + # transform Delete::FieldValueIfEqualsOtherField, + # delete: :del, + # if_equal_to: :compare, + # multival: true, + # delim: ';', + # grouped_fields: %i[grpa grpb], + # casesensitive: false + # ``` + # + # Input table: + # + # ``` + # | del | compare | grpa | grpb | + # |-----------+---------+---------+-----------| + # | A;C;d;e;c | c | y;x;w;u | e;f;g;h;i | + # ``` + # + # Results in: + # + # ``` + # | del | compare | grpa | grpb | + # |-------+---------+-------+-------| + # | A;d;e | c | y;w;u | e;g;h | + # ``` + # + # And a warning printed to STDOUT, which may trigger you to examine the input data: + # + # ``` + # KIBA WARNING: One or more grouped fields (grpa, grpb) has different number of values than the others + # in {:del=>"A;d;e", :compare=>"c", :grpa=>"y;x;w;u", :grpb=>"e;f;g;h;i"} + # ``` + # + # **If `del` had 4 elements and one or more of the grouped fields had a different number of elements, + # this would be handled similarly, with a slightly different warning.** + # + # ### Notes + # `grpa` has 4 values, while `grpb` has 5. + # + # Elements 1 and 4 from `del` match `compare`, so they are deleted. Those elements are also deleted from + # the grouped fields if present. + # + class FieldValueIfEqualsOtherField + # @param delete [Symbol] field from which values will be deleted + # @param if_equal_to [Symbol] field the `delete` values will be compared to. In other words, the "other field" + # @param multival [Boolean] whether to split field values for comparison + # @param delim [String] on which to split if `multival`. Defaults to `Kiba::Extend.delim` if not provided.
+ # @param grouped_fields [Array] field(s) from which positionally corresponding values should also be removed + # @param casesensitive [Boolean] matching mode + def initialize(delete:, if_equal_to:, multival: false, delim: nil, grouped_fields: [], casesensitive: true) + @delete = delete + @compare = if_equal_to + @multival = multival + # Fall back to the configured default delimiter string (not the Kiba::Extend module itself) + @delim = delim || Kiba::Extend.delim + @group = grouped_fields + @casesensitive = casesensitive + end + + # @private + def process(row) + del_val = prepare_val(delete, row, :compare) + compare_val = prepare_val(compare, row, :compare) + return row if compare_val.nil? || del_val.blank? + + compare_method = get_compare_method(del_val, compare_val) + to_delete = self.method(compare_method).call(del_val, compare_val) + return row if to_delete.empty? + + orig_del_val = prepare_val(delete, row, :final) + row[delete] = do_deletes(to_delete, orig_del_val.dup) + return row unless grouped? + + # prepare_val returns nil for a blank grouped field; treat that as "no values" instead of failing on nil + grouped = group.map{ |field| prepare_val(field, row) || [] } + validation = validate_groups(grouped, orig_del_val) + report_group_issue(validation, row) unless validation == :valid + grouped.map{ |grp| do_deletes(to_delete, grp) } + .each_with_index{ |grp, i| row[group[i]] = grp } + + row + end + + private + + attr_reader :delete, :compare, :multival, :delim, :group, :casesensitive + + def compare_against_multi_value(del_val, compare_val) + to_del = [] + del_val.each_with_index do |val, i| + to_del << i if val == compare_val[i] + end + to_del.sort.reverse + end + + def compare_against_single_value(del_val, compare_val) + cval = compare_val.first + to_del = [] + del_val.each_with_index do |val, i| + to_del << i if val == cval + end + to_del.sort.reverse + end + + def do_deletes(to_delete, vals) + to_delete.each{ |i| vals.delete_at(i) } + return nil if vals.empty? + + vals.join(delim) + end + + def get_compare_method(del_val, compare_val) + return :compare_against_single_value if compare_val.length == 1 + + :compare_against_multi_value + end + + def grouped?
+ !group.empty? + end + + def prepare_val(field, row, type = :final) + val = row.fetch(field) + return nil if val.blank? + + if type == :final + split = multival ? val.split(delim) : [val] + return split + end + + norm_val = casesensitive ? val : val.downcase + multival ? norm_val.split(delim) : [norm_val] + end + + def report_group_issue(validation, row) + grpfields = group.join(', ') + case validation + when :ragged_group_length + msg = "One or more grouped fields (#{grpfields}) has different number of values than the others" + when :orig_vs_group_length_mismatch + msg = "Grouped fields (#{grpfields}) have different number of values than #{delete} field" + end + puts %Q[#{Kiba::Extend.warning_label}: #{msg} in #{row}] + end + + def validate_groups(groups, orig_del_val) + orig_length = orig_del_val.length + lengths = groups.map(&:length).uniq + # Valid only when every group has the same length AND that length matches the delete field + return :valid if lengths.length == 1 && orig_length == lengths.first + return :ragged_group_length if lengths.length > 1 + + :orig_vs_group_length_mismatch + end + end + end + end + end +end diff --git a/lib/kiba/extend/transforms/delete/field_value_matching_regexp.rb b/lib/kiba/extend/transforms/delete/field_value_matching_regexp.rb new file mode 100644 index 000000000..46d445f9a --- /dev/null +++ b/lib/kiba/extend/transforms/delete/field_value_matching_regexp.rb @@ -0,0 +1,95 @@ +# frozen_string_literal: true + +module Kiba + module Extend + module Transforms + module Delete + + # Deletes full field value of all given fields that match the given regular expression pattern.
+ # You can control whether the regexp is case sensitive or not + # + # # Examples + # + # Input table: + # + # ``` + # | a | b | + # |----------------+------| + # | xxxx a thing | foo | + # | thing xxxx 123 | bar | + # | x thing | xxxx | + # | y thing | xXxX | + # | xxxxxxx thing | baz | + # | | nil | + # ``` + # + # Used in pipeline as: + # + # ``` + # transform Delete::FieldValueMatchingRegexp, fields: %i[a b], match: 'xx+' + # ``` + # + # Results in: + # + # ``` + # | a | b | + # |---------+------| + # | nil | foo | + # | nil | bar | + # | x thing | nil | + # | y thing | xXxX | + # | nil | baz | + # | | nil | + # ``` + # + # Input table: + # + # ``` + # | a | b | + # |---------+---------| + # | an xxxx | xXxXxXy | + # ``` + # + # Used in pipeline as: + # + # ``` + # transform Delete::FieldValueMatchingRegexp, fields: %i[a b], match: '^xx+', casesensitive: false + # ``` + # + # Results in: + # + # ``` + # | a | b | + # |---------+-----| + # | an xxxx | nil | + # ``` + # + class FieldValueMatchingRegexp + # @param fields [Array,Symbol] field(s) to delete from + # @param match [String] value to match. Is converted to a regular expression pattern via `Regexp.new(match)` + # @param casesensitive [Boolean] match mode + def initialize(fields:, match:, casesensitive: true) + @fields = [fields].flatten + @match = casesensitive ? Regexp.new(match) : Regexp.new(match, Regexp::IGNORECASE) + end + + # @private + def process(row) + fields.each do |field| + val = row.fetch(field) + next if val.blank? 
+ + row[field] = nil if val.match?(match) + end + + row + end + + private + + attr_reader :fields, :match + end + end + end + end +end diff --git a/lib/kiba/extend/transforms/delete/fields.rb b/lib/kiba/extend/transforms/delete/fields.rb new file mode 100644 index 000000000..6408f5f5e --- /dev/null +++ b/lib/kiba/extend/transforms/delete/fields.rb @@ -0,0 +1,67 @@ +# frozen_string_literal: true + +module Kiba + module Extend + module Transforms + module Delete + + # Deletes field(s) passed in `fields` parameter. + # + # # Examples + # + # Input table: + # + # ``` + # | a | b | c | + # |---+---+---| + # | 1 | 2 | 3 | + # ``` + # + # Used in pipeline as: + # + # ``` + # transform Delete::Fields, fields: %i[a c] + # ``` + # + # Results in: + # + # ``` + # | b | + # |---| + # | 2 | + # ``` + # + # Used in pipeline as: + # + # ``` + # transform Delete::Fields, fields: :b + # ``` + # + # Results in: + # + # ``` + # | a | c | + # |---+---| + # | 1 | 3 | + # ``` + # + class Fields + # @param fields [Array,Symbol] field(s) to delete from + def initialize(fields:) + @fields = [fields].flatten + end + + # @private + def process(row) + fields.each { |name| row.delete(name) } + row + end + + private + + attr_reader :fields + end + end + end + end +end diff --git a/lib/kiba/extend/transforms/delete/fields_except.rb b/lib/kiba/extend/transforms/delete/fields_except.rb new file mode 100644 index 000000000..c2a715a77 --- /dev/null +++ b/lib/kiba/extend/transforms/delete/fields_except.rb @@ -0,0 +1,89 @@ +# frozen_string_literal: true + +module Kiba + module Extend + module Transforms + module Delete + + # Deletes all fields except the one(s) passed in `fields` parameter. 
+ # + # # Examples + # + # Input table: + # + # ``` + # | a | b | c | + # |---+---+---| + # | 1 | 2 | 3 | + # ``` + # + # Used in pipeline as: + # + # ``` + # transform Delete::FieldsExcept, fields: %i[a c] + # ``` + # + # Results in: + # + # ``` + # | a | c | + # |---+---| + # | 1 | 3 | + # ``` + # + # Used in pipeline as: + # + # ``` + # transform Delete::FieldsExcept, fields: :b + # ``` + # + # Results in: + # + # ``` + # | b | + # |---| + # | 2 | + # ``` + # + class FieldsExcept + class MissingKeywordArgumentError < ArgumentError + MSG = 'You must call with `fields` or `keepfields`. `fields` is preferred.' + def initialize(msg = MSG) + super + end + end + + # @param keepfields [Array, Symbol, nil] **DEPRECATED; DO NOT USE** + # @param fields [Array, Symbol, nil] list of fields to keep + # @note The `keepfields` parameter will be deprecated in a future version. Use `fields` in new code. + # @raise {MissingKeywordArgumentError} if neither `fields` nor `keepfields` is provided + def initialize(keepfields: nil, fields: nil) + if keepfields && fields + puts %Q[#{Kiba::Extend.warning_label}: Do not use both `keepfields` and `fields`. Defaulting to process using `fields`] + @fields = [fields].flatten + elsif keepfields + puts %Q[#{Kiba::Extend.warning_label}: The `keepfields` keyword is being deprecated in a future version. Change it to `fields` in your ETL code.] 
+ @fields = [keepfields].flatten + elsif fields + @fields = [fields].flatten + else + raise MissingKeywordArgumentError + end + end + + # @private + def process(row) + deletefields = row.keys - fields + deletefields.each { |f| row.delete(f) } + row + end + + private + + attr_reader :fields + end + end + end + end +end + diff --git a/lib/kiba/extend/transforms/helpers.rb b/lib/kiba/extend/transforms/helpers.rb index 1d3388dbd..141ada51c 100644 --- a/lib/kiba/extend/transforms/helpers.rb +++ b/lib/kiba/extend/transforms/helpers.rb @@ -7,10 +7,10 @@ module Transforms module Helpers module_function # Indicates whether a field value is delimiter-only. If `usenull` is set to true, the - # %NULLVALUE% string is treated as empty in detecting delimiter-only-ness + # config.nullvalue string is treated as empty in detecting delimiter-only-ness # @param val [String] The field value to check # @param delim [String] The multivalue delimiter - # @param usenull [Boolean] If true, replaces '%NULLVALUE%' with '' to make determination + # @param usenull [Boolean] If true, replaces config.nullvalue string with '' to make determination # @return [false] if `value` is nil, empty, or contains characters other than delimiter(s) # and leading/trailing spaces # @return [true] if `value` contains only delimiter(s) and leading/trailing spaces @@ -19,10 +19,23 @@ def delim_only?(val, delim, usenull = false) return false if val.strip.empty? chk = val.gsub(delim, '').strip - chk = chk.gsub('%NULLVALUE%', '').strip if usenull + chk = chk.gsub(Kiba::Extend.nullvalue, '').strip if usenull chk.empty? end + # Indicates whether a given value is empty, ignoring delimiters. If `usenull` is true, + # the config.nullvalue string is treated as empty + # @param val [String] The field value to check + # @param usenull [Boolean] If true, replaces config.nullvalue string with '' to make determination + def empty?(val, usenull = false) + return true if val.nil? + + chkval = usenull ? 
val.gsub(Kiba::Extend.nullvalue, '') : val + + chkval.strip.empty? + end + + # @param row [Hash{Symbol=>String,Nil}l] A row of data # @param fields [Array(Symbol)] Names of fields to process # @param discard [:nil, :empty, :delim] Types of field values to remove from returned hash @@ -45,7 +58,7 @@ def field_values(row:, fields:, discard: %i[nil empty delim], delim: DELIM, usen # @return [Array(Symbol)] of names of fields that should be kept, based on given discard # and usenull param values and the field values private_class_method def keep_fields(field_vals, discard, delim, usenull) - field_vals = field_vals.transform_values { |val| val.gsub('%NULLVALUE%', '') } if usenull + field_vals = field_vals.transform_values { |val| val.gsub(Kiba::Extend.nullvalue, '') } if usenull field_vals = field_vals.reject { |_field, val| val.empty? } if discard.any?(:empty) field_vals = field_vals.reject { |_field, val| delim_only?(val, delim) } if discard.any?(:delim) field_vals.keys diff --git a/lib/kiba/extend/version.rb b/lib/kiba/extend/version.rb index 153c6a0d5..7c3f42d2d 100644 --- a/lib/kiba/extend/version.rb +++ b/lib/kiba/extend/version.rb @@ -2,6 +2,6 @@ module Kiba module Extend - VERSION = '2.5.3' + VERSION = '2.6.0' end end diff --git a/spec/kiba/extend/transforms/delete/empty_field_values_spec.rb b/spec/kiba/extend/transforms/delete/empty_field_values_spec.rb new file mode 100644 index 000000000..5a7a6262a --- /dev/null +++ b/spec/kiba/extend/transforms/delete/empty_field_values_spec.rb @@ -0,0 +1,122 @@ +# frozen_string_literal: true + +require 'spec_helper' + +RSpec.describe Kiba::Extend::Transforms::Delete::EmptyFieldValues do + before{ Kiba::Extend.config.delim = '|' } + + let(:input) do + [ + {data: 'abc;;;d e f'}, + {data: ';;abc'}, + {data: 'def;;;;'}, + {data: ';;;;;'}, + {data: ';;;%NULLVALUE%;;'}, + {data: ''}, + {data: nil} + ] + end + let(:accumulator){ [] } + let(:test_job){ Helpers::TestJob.new(input: input, accumulator: accumulator, transforms: 
transforms) } + let(:result){ test_job.accumulator } + + context 'no delimiter given' do + let(:input) do + [ + {data: 'abc|||d e f'}, + {data: '||abc'}, + {data: 'def||||'}, + {data: '|||||'}, + {data: '|||%NULLVALUE%||'}, + {data: ''}, + {data: nil} + ] + end + let(:transforms) do + Kiba.job_segment do + transform Delete::EmptyFieldValues, fields: :data + end + end + + let(:expected) do + [ + {data: 'abc|d e f'}, + {data: 'abc'}, + {data: 'def'}, + {data: ''}, + {data: '%NULLVALUE%'}, + {data: ''}, + {data: nil} + ] + end + + it 'transforms as expected' do + expect(result).to eq(expected) + end + end + + context 'with default usenull argument' do + let(:transforms) do + Kiba.job_segment do + transform Delete::EmptyFieldValues, fields: [:data], delim: ';' + end + end + + let(:expected) do + [ + {data: 'abc;d e f'}, + {data: 'abc'}, + {data: 'def'}, + {data: ''}, + {data: '%NULLVALUE%'}, + {data: ''}, + {data: nil} + ] + end + + it 'transforms as expected' do + expect(result).to eq(expected) + end + end + + context 'with usenull = true' do + let(:transforms) do + Kiba.job_segment do + transform Delete::EmptyFieldValues, fields: [:data], delim: ';', usenull: true + end + end + + let(:expected) do + [ + {data: 'abc;d e f'}, + {data: 'abc'}, + {data: 'def'}, + {data: ''}, + {data: ''}, + {data: ''}, + {data: nil} + ] + end + + it 'transforms as expected' do + expect(result).to eq(expected) + end + + context 'with sep given' do + let(:transforms) do + Kiba.job_segment do + transform Delete::EmptyFieldValues, fields: [:data], sep: ';', usenull: true + end + end + + it 'transforms as expected' do + expect(result).to eq(expected) + end + + it 'puts warning to STDOUT' do + msg = %Q[#{Kiba::Extend.warning_label}: The `sep` keyword is being deprecated in a future version. 
Change it to `delim` in your ETL code.\n] + expect{ result }.to output(msg).to_stdout + end + end + end +end diff --git a/spec/kiba/extend/transforms/delete/empty_fields_spec.rb b/spec/kiba/extend/transforms/delete/empty_fields_spec.rb new file mode 100644 index 000000000..23b20d343 --- /dev/null +++ b/spec/kiba/extend/transforms/delete/empty_fields_spec.rb @@ -0,0 +1,118 @@ +# frozen_string_literal: true + +require 'spec_helper' + +RSpec.describe Kiba::Extend::Transforms::Delete::EmptyFields do + let(:accumulator){ [] } + let(:test_job){ Helpers::TestJob.new(input: input, accumulator: accumulator, transforms: transforms) } + let(:result){ test_job.accumulator } + + let(:input) do + [ + {a: 'a', b: '', c: 'ccc', d: ''}, + {a: '', b: nil, c: 'c', d: nil}, + {a: nil, b: '', c: 'ccc', d: ''}, + {a: 'a', b: '', c: '', d: ''} + ] + end + + let(:expected) do + [ + {a: 'a', c: 'ccc'}, + {a: '', c: 'c'}, + {a: nil, c: 'ccc'}, + {a: 'a', c: ''} + ] + end + + let(:transforms) do + Kiba.job_segment do + transform Delete::EmptyFields + end + end + + it 'transforms as expected' do + expect(result).to eq(expected) + end + + context 'with usenull true' do + let(:input) do + [ + {a: '', b: nil, c: 'c', d: nil, e: '%NULLVALUE%'}, + {a: 'a', b: '', c: 'ccc', d: '', e: ''}, + {a: nil, b: '', c: 'ccc', d: '', e: '%NULLVALUE%'}, + {a: 'a', b: '', c: '', d: '', e: ''} + ] + end + + let(:expected) do + [ + {a: '', c: 'c'}, + {a: 'a', c: 'ccc'}, + {a: nil, c: 'ccc'}, + {a: 'a', c: ''} + ] + end + + let(:transforms) do + Kiba.job_segment do + transform Delete::EmptyFields, usenull: true + end + end + + it 'transforms as expected' do + expect(result).to eq(expected) + end + end + + context 'with consider_blank config' do + let(:input) do + [ + {a: '', b: nil, c: '', d: nil, e: '0'}, + {a: 'a', b: '', c: '%NULLVALUE%', d: '', e: 'false'}, + {a: nil, b: 'false', c: 'nope', d: '', e: '0'}, + {a: 'a', b: '', c: nil, d: '', e: ''} + ] + end + + let(:expected) do + [ + {a: '', c: ''}, + {a: 'a', 
c: '%NULLVALUE%'}, + {a: nil, c: 'nope'}, + {a: 'a', c: nil} + ] + end + + let(:transforms) do + Kiba.job_segment do + transform Delete::EmptyFields, consider_blank: {b: 'false', c: 'nope', e: "0#{Kiba::Extend.delim}false"} + end + end + + it 'transforms as expected' do + expect(result).to eq(expected) + end + + context 'with usenull true' do + let(:expected) do + [ + {a: ''}, + {a: 'a'}, + {a: nil}, + {a: 'a'} + ] + end + + let(:transforms) do + Kiba.job_segment do + transform Delete::EmptyFields, usenull: true, consider_blank: {b: 'false', c: 'nope', e: "0#{Kiba::Extend.delim}false"} + end + end + + it 'transforms as expected' do + expect(result).to eq(expected) + end + end + end +end diff --git a/spec/kiba/extend/transforms/delete/field_value_containing_string_spec.rb b/spec/kiba/extend/transforms/delete/field_value_containing_string_spec.rb new file mode 100644 index 000000000..76070e662 --- /dev/null +++ b/spec/kiba/extend/transforms/delete/field_value_containing_string_spec.rb @@ -0,0 +1,65 @@ +# frozen_string_literal: true + +require 'spec_helper' + +RSpec.describe Kiba::Extend::Transforms::Delete::FieldValueContainingString do + let(:accumulator){ [] } + let(:test_job){ Helpers::TestJob.new(input: input, accumulator: accumulator, transforms: transforms) } + let(:result){ test_job.accumulator } + + let(:input) do + [ + {a: 'xxxx a thing', b: 'foo'}, + {a: 'thing xxxx 123', b: 'bar'}, + {a: 'x thing', b: 'xxxx'}, + {a: 'y thing', b: 'xXxX'}, + {a: 'xxxxxxx thing', b: 'baz'}, + {a: '', b: nil} + ] + end + + let(:transforms) do + Kiba.job_segment do + transform Delete::FieldValueContainingString, fields: %i[a b], match: 'xxxx' + end + end + + let(:expected) do + [ + {a: nil, b: 'foo'}, + {a: nil, b: 'bar'}, + {a: 'x thing', b: nil}, + {a: 'y thing', b: 'xXxX'}, + {a: nil, b: 'baz'}, + {a: '', b: nil} + ] + end + + it 'transforms as expected' do + expect(result).to eq(expected) + end + + context 'with casesensitive = false' do + let(:input) do + [ + {a: 'y 
thing', b: 'xXxXxXy'} + ] + end + + let(:transforms) do + Kiba.job_segment do + transform Delete::FieldValueContainingString, fields: :b, match: 'xxxx', casesensitive: false + end + end + + let(:expected) do + [ + {a: 'y thing', b: nil} + ] + end + + it 'transforms as expected' do + expect(result).to eq(expected) + end + end +end diff --git a/spec/kiba/extend/transforms/delete/field_value_if_equals_other_field_spec.rb b/spec/kiba/extend/transforms/delete/field_value_if_equals_other_field_spec.rb new file mode 100644 index 000000000..27b88ac44 --- /dev/null +++ b/spec/kiba/extend/transforms/delete/field_value_if_equals_other_field_spec.rb @@ -0,0 +1,113 @@ +# frozen_string_literal: true + +require 'spec_helper' + +RSpec.describe Kiba::Extend::Transforms::Delete::FieldValueIfEqualsOtherField do + let(:accumulator){ [] } + let(:test_job){ Helpers::TestJob.new(input: input, accumulator: accumulator, transforms: transforms) } + let(:result){ test_job.accumulator } + + let(:input) do + [ + { del: 'a', compare: 'b' }, + { del: 'c', compare: 'c' } + ] + end + + let(:transforms) do + Kiba.job_segment do + transform Delete::FieldValueIfEqualsOtherField, delete: :del, if_equal_to: :compare + end + end + + let(:expected) do + [ + { del: 'a', compare: 'b' }, + { del: nil, compare: 'c' } + ] + end + + it 'transforms as expected' do + expect(result).to eq(expected) + end + + context 'with grouped field(s)' do + context 'where groups are even as expected' do + let(:input) do + [ + {row: '1', del: 'A;C;d;c;e', compare: 'c', grpa: 'y;x;w;u;v', grpb: 'e;f;g;h;i'}, + {row: '2', del: 'a;b;c', compare: 'a;z;c', grpa: 'd;e;f', grpb: 'g;h;i' }, + {row: '3', del: 'a', compare: 'a;b', grpa: 'd', grpb: 'g'}, + {row: '4', del: 'a', compare: 'b', grpa: 'z', grpb: 'q'}, + {row: '5', del: 'a', compare: 'a', grpa: 'z', grpb: 'q'} + ] + end + + let(:transforms) do + Kiba.job_segment do + transform Delete::FieldValueIfEqualsOtherField, + delete: :del, + if_equal_to: :compare, + multival: true, + 
delim: ';', + grouped_fields: %i[grpa grpb], + casesensitive: false + end + end + + let(:expected) do + [ + {row: '1', del: 'A;d;e', compare: 'c', grpa: 'y;w;v', grpb: 'e;g;i'}, + {row: '2', del: 'b', compare: 'a;z;c', grpa: 'e', grpb: 'h' }, + {row: '3', del: nil, compare: 'a;b', grpa: nil, grpb: nil}, + {row: '4', del: 'a', compare: 'b', grpa: 'z', grpb: 'q'}, + {row: '5', del: nil, compare: 'a', grpa: nil, grpb: nil} + ] + end + + it 'transforms as expected' do + expect(result).to eq(expected) + end + end + + context 'where groups are ragged' do + let(:input) do + [ + {del: 'A;C;d;e;c', compare: 'c', grpa: 'y;x;w;u', grpb: 'e;f;g;h;i'}, + ] + end + + let(:transforms) do + Kiba.job_segment do + transform Delete::FieldValueIfEqualsOtherField, + delete: :del, + if_equal_to: :compare, + multival: true, + delim: ';', + grouped_fields: %i[grpa grpb], + casesensitive: false + end + end + + let(:expected) do + [ + {del: 'A;d;e', compare: 'c', grpa: 'y;w;u', grpb: 'e;g;h'}, + ] + end + + it 'transforms as expected' do + Helpers::ExampleFormatter.new(input, expected) + expect(result).to eq(expected) + end + + it 'outputs warning to STDOUT' do + msg = Regexp.new( + /KIBA WARNING: One or more grouped fields \(grpa, grpb\) has different number of values than the others in \{.*\}/ + ) + expect{ result }.to output(msg).to_stdout + end + end + + end +end + diff --git a/spec/kiba/extend/transforms/delete/field_value_matching_regexp_spec.rb b/spec/kiba/extend/transforms/delete/field_value_matching_regexp_spec.rb new file mode 100644 index 000000000..08638fe09 --- /dev/null +++ b/spec/kiba/extend/transforms/delete/field_value_matching_regexp_spec.rb @@ -0,0 +1,65 @@ +# frozen_string_literal: true + +require 'spec_helper' + +RSpec.describe Kiba::Extend::Transforms::Delete::FieldValueMatchingRegexp do + let(:accumulator){ [] } + let(:test_job){ Helpers::TestJob.new(input: input, accumulator: accumulator, transforms: transforms) } + let(:result){ test_job.accumulator } + + 
let(:input) do + [ + {a: 'xxxx a thing', b: 'foo'}, + {a: 'thing xxxx 123', b: 'bar'}, + {a: 'x thing', b: 'xxxx'}, + {a: 'y thing', b: 'xXxX'}, + {a: 'xxxxxxx thing', b: 'baz'}, + {a: '', b: nil} + ] + end + + let(:transforms) do + Kiba.job_segment do + transform Delete::FieldValueMatchingRegexp, fields: %i[a b], match: 'xx+' + end + end + + let(:expected) do + [ + {a: nil, b: 'foo'}, + {a: nil, b: 'bar'}, + {a: 'x thing', b: nil}, + {a: 'y thing', b: 'xXxX'}, + {a: nil, b: 'baz'}, + {a: '', b: nil} + ] + end + + it 'transforms as expected' do + expect(result).to eq(expected) + end + + context 'with anchored pattern and casesensitive = false' do + let(:input) do + [ + {a: 'an xxxx', b: 'xXxXxXy'} + ] + end + + let(:transforms) do + Kiba.job_segment do + transform Delete::FieldValueMatchingRegexp, fields: %i[a b], match: '^xx+', casesensitive: false + end + end + + let(:expected) do + [ + {a: 'an xxxx', b: nil} + ] + end + + it 'transforms as expected' do + expect(result).to eq(expected) + end + end +end diff --git a/spec/kiba/extend/transforms/delete/fields_except_spec.rb b/spec/kiba/extend/transforms/delete/fields_except_spec.rb new file mode 100644 index 000000000..372d21c07 --- /dev/null +++ b/spec/kiba/extend/transforms/delete/fields_except_spec.rb @@ -0,0 +1,108 @@ +# frozen_string_literal: true + +require 'spec_helper' + +RSpec.describe Kiba::Extend::Transforms::Delete::Fields do + let(:input) do + [ + {a: '1', b: '2', c: '3'} + ] + end + let(:accumulator){ [] } + let(:test_job){ Helpers::TestJob.new(input: input, accumulator: accumulator, transforms: transforms) } + let(:result){ test_job.accumulator } + + context 'with multiple fields in array' do + let(:transforms) do + Kiba.job_segment do + transform Delete::FieldsExcept, fields: %i[a c] + end + end + + let(:expected) do + [ + {a: '1', c: '3'}, + ] + end + + it 'transforms as expected' do + expect(result).to eq(expected) + end + end + + context 'with single field given' do + let(:transforms) do + 
Kiba.job_segment do + transform Delete::FieldsExcept, fields: :b + end + end + + let(:expected) do + [ + {b: '2'}, + ] + end + + it 'transforms as expected' do + expect(result).to eq(expected) + end + end + + context 'with keepfields given' do + let(:transforms) do + Kiba.job_segment do + transform Delete::FieldsExcept, keepfields: %i[a c] + end + end + + let(:expected) do + [ + {a: '1', c: '3'}, + ] + end + + it 'transforms as expected' do + expect(result).to eq(expected) + end + + it 'puts warning to STDOUT' do + msg = %Q[#{Kiba::Extend.warning_label}: The `keepfields` keyword is being deprecated in a future version. Change it to `fields` in your ETL code.\n] + expect{ result }.to output(msg).to_stdout + end + end + + context 'with fields and keepfields given' do + let(:transforms) do + Kiba.job_segment do + transform Delete::FieldsExcept, fields: :b, keepfields: %i[a c] + end + end + + let(:expected) do + [ + {b: '2'}, + ] + end + + it 'transforms as expected' do + expect(result).to eq(expected) + end + + it 'puts warning to STDOUT' do + msg = %Q[#{Kiba::Extend.warning_label}: Do not use both `keepfields` and `fields`. 
Defaulting to process using `fields`\n] + expect{ result }.to output(msg).to_stdout + end + end + + context 'with neither fields and keepfields given' do + let(:transforms) do + Kiba.job_segment do + transform Delete::FieldsExcept + end + end + + it 'puts raises MissingKeywordArgumentError' do + expect{ result }.to raise_error(Delete::FieldsExcept::MissingKeywordArgumentError) + end + end +end diff --git a/spec/kiba/extend/transforms/delete/fields_spec.rb b/spec/kiba/extend/transforms/delete/fields_spec.rb new file mode 100644 index 000000000..e25a72944 --- /dev/null +++ b/spec/kiba/extend/transforms/delete/fields_spec.rb @@ -0,0 +1,50 @@ +# frozen_string_literal: true + +require 'spec_helper' + +RSpec.describe Kiba::Extend::Transforms::Delete::Fields do + let(:input) do + [ + {a: '1', b: '2', c: '3'} + ] + end + let(:accumulator){ [] } + let(:test_job){ Helpers::TestJob.new(input: input, accumulator: accumulator, transforms: transforms) } + let(:result){ test_job.accumulator } + + context 'with multiple fields in array' do + let(:transforms) do + Kiba.job_segment do + transform Delete::Fields, fields: %i[a c] + end + end + + let(:expected) do + [ + {b: '2'}, + ] + end + + it 'transforms as expected' do + expect(result).to eq(expected) + end + end + + context 'with single field given' do + let(:transforms) do + Kiba.job_segment do + transform Delete::Fields, fields: :c + end + end + + let(:expected) do + [ + {a: '1', b: '2'}, + ] + end + + it 'transforms as expected' do + expect(result).to eq(expected) + end + end +end diff --git a/spec/kiba/extend/transforms/delete_spec.rb b/spec/kiba/extend/transforms/delete_spec.rb deleted file mode 100644 index d153bd2ad..000000000 --- a/spec/kiba/extend/transforms/delete_spec.rb +++ /dev/null @@ -1,244 +0,0 @@ -# frozen_string_literal: true - -require 'spec_helper' - -RSpec.describe Kiba::Extend::Transforms::Delete do - describe 'EmptyFieldValues' do - rows = [ - %w[id data], - [1, 'abc;;;d e f'], - [2, ';;abc'], - [3, 
'def;;;;'], - [4, ';;;;;'] - ] - - before do - generate_csv(rows) - end - it 'deletes empty field values from multivalued field' do - expected = [ - { id: '1', data: 'abc;d e f' }, - { id: '2', data: 'abc' }, - { id: '3', data: 'def' }, - { id: '4', data: '' } - ] - result = execute_job(filename: test_csv, - xform: Delete::EmptyFieldValues, - xformopt: { fields: [:data], sep: ';' }) - expect(result).to eq(expected) - end - end - - describe 'Fields' do - rows = [ - %w[id name sex source], - [1, 'Weddy', 'm', 'adopted'], - [2, 'Kernel', 'f', 'adopted'] - ] - - before do - generate_csv(rows) - end - it 'deletes fields' do - expected = [ - { id: '1', name: 'Weddy' }, - { id: '2', name: 'Kernel' } - ] - result = execute_job(filename: test_csv, - xform: Delete::Fields, - xformopt: { fields: %i[sex source] }) - expect(result).to eq(expected) - end - end - - describe 'FieldsExcept' do - rows = [ - %w[id name sex source], - [1, 'Weddy', 'm', 'adopted'], - [2, 'Kernel', 'f', 'adopted'] - ] - - before do - generate_csv(rows) - end - it 'deletes all fields except ones given as keepfields' do - expected = [ - { name: 'Weddy' }, - { name: 'Kernel' } - ] - result = execute_job(filename: test_csv, - xform: Delete::FieldsExcept, - xformopt: { keepfields: %i[name] }) - expect(result).to eq(expected) - end - end - - describe 'FieldValueMatchingRegexp' do - after { File.delete(test_csv) if File.exist?(test_csv) } - it 'Deletes whole field value if it matches given regexp' do - rows = [ - %w[id val], - ['1', 'xxxxxx a thing'], - ['2', 'thing xxxx 123'], - ['3', 'x files'] - ] - generate_csv(rows) - result = execute_job(filename: test_csv, - xform: Delete::FieldValueMatchingRegexp, - xformopt: { fields: [:val], - match: 'xx+' }).map { |h| h[:val] } - expected = [nil, nil, 'x files'] - expect(result).to eq(expected) - end - it 'Can do case insensitive match' do - rows = [ - %w[id val], - ['1', 'XxXx a thing'], - ['2', 'thing xxxx 123'], - ['3', 'x files'] - ] - generate_csv(rows) - 
result = execute_job(filename: test_csv, - xform: Delete::FieldValueMatchingRegexp, - xformopt: { fields: [:val], - match: '^xxxx ', - casesensitive: false }).map { |h| h[:val] } - expected = [nil, 'thing xxxx 123', 'x files'] - expect(result).to eq(expected) - end - it 'Skips nil values' do - rows = [ - %w[id val], - ['1', 'XxXx a thing'], - ['2', 'thing xxxx 123'], - ['3', nil] - ] - generate_csv(rows) - result = execute_job(filename: test_csv, - xform: Delete::FieldValueMatchingRegexp, - xformopt: { fields: [:val], - match: 'xxxx ', - casesensitive: false }).map { |h| h[:val] } - expected = [nil, nil, nil] - expect(result).to eq(expected) - end - end - - describe 'FieldValueIfEqualsOtherField' do - after { File.delete(test_csv) if File.exist?(test_csv) } - it 'deletes data in specified field if it duplicates data in other given field' do - rows2 = [ - %w[id val chk], - [1, 'a', 'b'], - [2, 'c', 'c'] - ] - generate_csv(rows2) - result = execute_job(filename: test_csv, - xform: Delete::FieldValueIfEqualsOtherField, - xformopt: { - delete: :val, - if_equal_to: :chk - }).map { |h| h[:val] } - expected = ['a', ''] - expect(result).to eq(expected) - end - - context 'when `multival` = true and `sep` given' do - it 'deletes individual value matching `if_equal_to` field' do - rows2 = [ - %w[id val chk], - [1, 'a', 'b'], - [2, 'a;c', 'c'] - ] - generate_csv(rows2) - result = execute_job(filename: test_csv, - xform: Delete::FieldValueIfEqualsOtherField, - xformopt: { - delete: :val, - if_equal_to: :chk, - multival: true, - sep: ';' - }).map { |h| h[:val] } - expected = %w[a a] - expect(result).to eq(expected) - end - end - - context 'when `grouped_fields` given' do - it 'deletes corresponding values from grouped fields' do - rows2 = [ - %w[id val chk valgrp valgrp2], - [2, 'a;C;d;c;e', 'c', 'y;x;w;u;v', 'e;f;g;h;i'], - [1, 'a', 'b', 'z', 'q'] - ] - generate_csv(rows2) - result = execute_job(filename: test_csv, - xform: Delete::FieldValueIfEqualsOtherField, - xformopt: { - 
delete: :val, - if_equal_to: :chk, - multival: true, - sep: ';', - grouped_fields: %i[valgrp valgrp2], - case_sensitive: false - }) - expected = [ - { id: '2', val: 'a;d;e', chk: 'c', valgrp: 'y;w;v', valgrp2: 'e;g;i' }, - { id: '1', val: 'a', chk: 'b', valgrp: 'z', valgrp2: 'q' } - ] - expect(result).to eq(expected) - end - end - end - - describe 'FieldValueContainingString' do - after { File.delete(test_csv) if File.exist?(test_csv) } - it 'Deletes whole field value if it contains given string' do - rows = [ - %w[id val], - ['1', 'xxxx a thing'], - ['2', 'thing xxxx 123'], - ['3', 'x files'] - ] - generate_csv(rows) - result = execute_job(filename: test_csv, - xform: Delete::FieldValueContainingString, - xformopt: { fields: [:val], - match: ' xxxx ' }).map { |h| h[:val] } - expected = ['xxxx a thing', nil, 'x files'] - expect(result).to eq(expected) - end - it 'Can do case insensitive match' do - rows = [ - %w[id val], - ['1', 'XxXx a thing'], - ['2', 'thing xxxx 123'], - ['3', 'x files'] - ] - generate_csv(rows) - result = execute_job(filename: test_csv, - xform: Delete::FieldValueContainingString, - xformopt: { fields: [:val], - match: 'xxxx ', - casesensitive: false }).map { |h| h[:val] } - expected = [nil, nil, 'x files'] - expect(result).to eq(expected) - end - it 'Skips nil values' do - rows = [ - %w[id val], - ['1', 'XxXx a thing'], - ['2', 'thing xxxx 123'], - ['3', nil] - ] - generate_csv(rows) - result = execute_job(filename: test_csv, - xform: Delete::FieldValueContainingString, - xformopt: { fields: [:val], - match: 'xxxx ', - casesensitive: false }).map { |h| h[:val] } - expected = [nil, nil, nil] - expect(result).to eq(expected) - end - end -end diff --git a/spec/kiba/extend/transforms/helpers_spec.rb b/spec/kiba/extend/transforms/helpers_spec.rb index 73d854b12..21fcb3451 100644 --- a/spec/kiba/extend/transforms/helpers_spec.rb +++ b/spec/kiba/extend/transforms/helpers_spec.rb @@ -83,6 +83,68 @@ end end + describe '#empty?' 
do + let(:result){ empty?(val) } + + context 'with non-empty string' do + let(:val){ 'something' } + + it 'returns false' do + expect(result).to be false + end + end + + context 'with nil' do + let(:val){ nil } + + it 'returns true' do + expect(result).to be true + end + end + + context 'with empty string' do + let(:val){ '' } + + it 'returns true' do + expect(result).to be true + end + end + + context 'with space-only string' do + let(:val){ ' ' } + + it 'returns true' do + expect(result).to be true + end + end + + context 'with space-and-tab string' do + let(:val){ " \t " } + + it 'returns true' do + expect(result).to be true + end + end + + context 'with config.nullvalue string' do + let(:val){ Kiba::Extend.nullvalue } + + context 'with usenull false (default)' do + it 'returns false' do + expect(result).to be false + end + end + + context 'with usenull true' do + let(:result){ empty?(val, true) } + + it 'returns true' do + expect(result).to be true + end + end + end + end + describe '#field_values' do let(:row) { {