From f835ac035f79a607bebcbb944b006268577c93ef Mon Sep 17 00:00:00 2001 From: Kristina Spurgin Date: Tue, 7 Sep 2021 15:06:04 -0400 Subject: [PATCH 1/2] fix typo --- lib/kiba/extend/transforms/deduplicate.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/kiba/extend/transforms/deduplicate.rb b/lib/kiba/extend/transforms/deduplicate.rb index 1e46989e0..83d1a296e 100644 --- a/lib/kiba/extend/transforms/deduplicate.rb +++ b/lib/kiba/extend/transforms/deduplicate.rb @@ -366,7 +366,7 @@ def get_value_frequency(fv) # Used in pipeline as: # # ``` - # transform Deduplicate::Table, fields: :combined, delete_field: true + # transform Deduplicate::Table, field: :combined, delete_field: true # ``` # # Results in: From bf187115c3e6617a1fbfb1aafb7b5e010aeeb414 Mon Sep 17 00:00:00 2001 From: Kristina Spurgin Date: Tue, 7 Sep 2021 19:04:20 -0400 Subject: [PATCH 2/2] add new Reshape::SimplePivot transformation --- Gemfile.lock | 2 +- lib/kiba/extend/transforms/reshape.rb | 109 +++++++++++++ lib/kiba/extend/version.rb | 2 +- spec/kiba/extend/transforms/reshape_spec.rb | 169 ++++++++++++++------ 4 files changed, 235 insertions(+), 47 deletions(-) diff --git a/Gemfile.lock b/Gemfile.lock index 4289f29bc..131e747ec 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -1,7 +1,7 @@ PATH remote: . specs: - kiba-extend (2.2.1) + kiba-extend (2.3.0) activesupport (~> 6.1.4) csv (~> 3.0) dry-configurable (~> 0.11) diff --git a/lib/kiba/extend/transforms/reshape.rb b/lib/kiba/extend/transforms/reshape.rb index b08b915bd..74ab5cca2 100644 --- a/lib/kiba/extend/transforms/reshape.rb +++ b/lib/kiba/extend/transforms/reshape.rb @@ -108,6 +108,115 @@ def process(row) row end end + + # Dynamically pivots your data into a new shape, based on values of the given fields. 
+ + # @note This transformation runs in memory, so it may bog down or crash on extremely large + # data sources + # @note This transformation has some pretty strong assumptions and limitations that can be + # quite destructive, so examine the example below carefully. + # + # # Examples + # + # Input table: + # + # ``` + # | authority | norm | term | unrelated | + # |-----------+---------+-------------+-----------| + # | person | fred | Fred Q. | foo | + # | org | fred | Fred, Inc. | bar | + # | location | unknown | Unknown | baz | + # | person | unknown | Unknown | fuz | + # | org | unknown | Unknown | aaa | + # | work | book | Book | eee | + # | location | book | | zee | + # | | book | Book | squeee | + # | nil | ghost | Ghost | boo | + # | location | | Ghost | zoo | + # | location | ghost | nil | poo | + # | org | fred | Fred, Corp. | bar | + # | issues | nil | nil | bah | + # ``` + # + # Used in pipeline as: + # + # ``` + # transform Reshape::SimplePivot, + # field_to_columns: :authority, + # field_to_rows: :norm, + # field_to_col_vals: :term + # ``` + # + # Results in: + # + # ``` + # | norm | person | org | location | work | issues | + # |---------+---------+-------------+----------+------+--------| + # | fred | Fred Q. | Fred, Corp. | nil | nil | nil | + # | unknown | Unknown | Unknown | Unknown | nil | nil | + # | book | nil | nil | nil | Book | nil | + # ``` + # + # **NOTE** + # + # - A new column has been created for each unique value in the `field_to_columns` field + # - A single row has been generated for each unique value in the `field_to_rows` field + # - The value from the `field_to_col_vals` field is in the appropriate column + # - When more than one row has the same values for `field_to_columns` and `field_to_rows`, + # the value of the last row processed's `field_to_col_vals` will be used (we get Fred, Corp. + # instead of Fred, Inc.) + # - Only data from the three involved fields is kept! 
Note that the `unrelated` field from + # the input has been lost + # - Rows lacking a value for any of the three fields will be skipped, in terms of populating + # the dynamically created column (see the Ghost examples) + # - However, a dynamically created column will still be created even if it is given no data + # (See issues example) + class SimplePivot + def initialize(field_to_columns:, field_to_rows:, field_to_col_vals:) + @col_field = field_to_columns + @row_field = field_to_rows + @col_val_field = field_to_col_vals + @rows = {} + @columns = {} + end + + # @private + def process(row) + gather_column_field(row) + nil + end + + # @private + def close + @rows.each do |fieldval, data| + row = {@row_field => fieldval} + row = row.merge(data) + row_fields = row.keys.freeze + @columns.keys.each{ |field| row[field] = nil unless row_fields.any?(field) } + yield row + end + end + + private + + def gather_column_field(row) + col_value = row.fetch(@col_field, nil) + return if col_value.blank? + + col_name = col_value.to_sym + @columns[col_name] = nil + record_column_value_for_row(row, col_name) + end + + def record_column_value_for_row(row, column) + row_field_val = row.fetch(@row_field, nil) + col_val = row.fetch(@col_val_field, nil) + return if row_field_val.blank? || col_val.blank? 
+ + @rows[row_field_val] = {} unless @rows.keys.any?(row_field_val) + @rows[row_field_val][column] = col_val + end + end end end end diff --git a/lib/kiba/extend/version.rb b/lib/kiba/extend/version.rb index 0add0ab10..deb1f1e94 100644 --- a/lib/kiba/extend/version.rb +++ b/lib/kiba/extend/version.rb @@ -2,6 +2,6 @@ module Kiba module Extend - VERSION = '2.2.1' + VERSION = '2.3.0' end end diff --git a/spec/kiba/extend/transforms/reshape_spec.rb b/spec/kiba/extend/transforms/reshape_spec.rb index 0e71b322b..8256fe20e 100644 --- a/spec/kiba/extend/transforms/reshape_spec.rb +++ b/spec/kiba/extend/transforms/reshape_spec.rb @@ -3,61 +3,140 @@ require 'spec_helper' RSpec.describe Kiba::Extend::Transforms::Reshape do + let(:test_job_config){ { source: input, destination: output } } + let(:test_job) { Kiba::Extend::Jobs::TestingJob.new(files: test_job_config, transformer: test_job_transforms) } + let(:output){ [] } + describe 'CollapseMultipleFieldsToOneTypedFieldPair' do context 'when source field may be multivalued' do - rows = [ - %w[homephone workphone mobilephone otherphone unrelated], - ['2', '1', '3;4', '5', 'foo'] - ] - before do - generate_csv(rows) + let(:input) do + [{ + homephone: '2', + workphone: '1', + mobilephone: '3|4', + otherphone: '5', + unrelated: 'foo' + }] + end + + let(:expected) do + [{ + phoneNumber: '1;2;3;4;5', + phoneType: 'business;personal;mobile;mobile;', + unrelated: 'foo' + }] end + + let(:test_job_transforms) do + Kiba.job_segment do + transform Reshape::CollapseMultipleFieldsToOneTypedFieldPair, + sourcefieldmap: { + workphone: 'business', + homephone: 'personal', + mobilephone: 'mobile', + otherphone: '' + }, + datafield: :phoneNumber, + typefield: :phoneType, + sourcesep: '|', + targetsep: ';' + end + end + it 'reshapes the columns as specified after splitting source' do - expected = [ - { phoneNumber: '1;2;3;4;5', phoneType: 'business;personal;mobile;mobile;', unrelated: 'foo' } - ] - result = execute_job(filename: test_csv, - 
xform: Reshape::CollapseMultipleFieldsToOneTypedFieldPair, - xformopt: { sourcefieldmap: { - workphone: 'business', - homephone: 'personal', - mobilephone: 'mobile', - otherphone: '' - }, - datafield: :phoneNumber, - typefield: :phoneType, - sourcesep: DELIM, - targetsep: DELIM }) - expect(result).to eq(expected) + test_job + expect(output).to eq(expected) end end + context 'when source field is not multivalued' do - rows = [ - %w[workphone homephone mobilephone otherphone unrelated], - ['123', '234', '345;456', '567', 'foo'], - ['123', '234', '345 456', '567', 'bar'] - ] - before do - generate_csv(rows) + let(:input) do + [{ + homephone: '123', + workphone: '234', + mobilephone: '345|456', + otherphone: '567', + unrelated: 'foo' + }, + { + homephone: '123', + workphone: '234', + mobilephone: '345 456', + otherphone: '567', + unrelated: 'bar' + }] end - it 'reshapes the columns as specified' do - expected = [ - { phoneNumber: '123;234;345;456;567', phoneType: 'business;personal;mobile;', unrelated: 'foo' }, - { phoneNumber: '123;234;345 456;567', phoneType: 'business;personal;mobile;', unrelated: 'bar' } + + let(:expected) do + [ + { phoneNumber: '234;123;345|456;567', phoneType: 'business;personal;mobile;', unrelated: 'foo'}, + { phoneNumber: '234;123;345 456;567', phoneType: 'business;personal;mobile;', unrelated: 'bar'}, ] - result = execute_job(filename: test_csv, - xform: Reshape::CollapseMultipleFieldsToOneTypedFieldPair, - xformopt: { sourcefieldmap: { - workphone: 'business', - homephone: 'personal', - mobilephone: 'mobile', - otherphone: '' - }, - datafield: :phoneNumber, - typefield: :phoneType, - targetsep: DELIM }) - expect(result).to eq(expected) end + + let(:test_job_transforms) do + Kiba.job_segment do + transform Reshape::CollapseMultipleFieldsToOneTypedFieldPair, + sourcefieldmap: { + workphone: 'business', + homephone: 'personal', + mobilephone: 'mobile', + otherphone: '' + }, + datafield: :phoneNumber, + typefield: :phoneType, + targetsep: ';' + 
end + end + + + it 'reshapes the columns as specified' do + test_job + expect(output).to eq(expected) + end + end + end + + describe 'SimplePivot' do + let(:input) do + [ + {authority: 'person', norm: 'fred', term: 'Fred Q.', unrelated: 'foo'}, + {authority: 'org', norm: 'fred', term: 'Fred, Inc.', unrelated: 'bar'}, + {authority: 'location', norm: 'unknown', term: 'Unknown', unrelated: 'baz'}, + {authority: 'person', norm: 'unknown', term: 'Unknown', unrelated: 'fuz'}, + {authority: 'org', norm: 'unknown', term: 'Unknown', unrelated: 'aaa'}, + {authority: 'work', norm: 'book', term: 'Book', unrelated: 'eee'}, + {authority: 'location', norm: 'book', term: '', unrelated: 'zee'}, + {authority: '', norm: 'book', term: 'Book', unrelated: 'squeee'}, + {authority: nil, norm: 'ghost', term: 'Ghost', unrelated: 'boo'}, + {authority: 'location', norm: '', term: 'Ghost', unrelated: 'zoo'}, + {authority: 'location', norm: 'ghost', term: nil, unrelated: 'poo'}, + {authority: 'org', norm: 'fred', term: 'Fred, Corp.', unrelated: 'bar'}, + {authority: 'issues', norm: nil, term: nil, unrelated: 'bah'}, + ] + end + + let(:expected) do + [ + {norm: 'fred', person: 'Fred Q.', org: 'Fred, Corp.', location: nil, work: nil, issues: nil}, + {norm: 'unknown', person: 'Unknown', org: 'Unknown', location: 'Unknown', work: nil, issues: nil}, + {norm: 'book', person: nil, org: nil, location: nil, work: 'Book', issues: nil} + ] + end + + let(:test_job_transforms) do + Kiba.job_segment do + transform Reshape::SimplePivot, + field_to_columns: :authority, + field_to_rows: :norm, + field_to_col_vals: :term + end + end + + it 'reshapes the columns as specified after splitting source' do + Helpers::ExampleFormatter.new(input, expected) + test_job + expect(output).to eq(expected) end end end