Skip to content

Commit

Permalink
Merge pull request #36 from lyrasis/crosstab
Browse files Browse the repository at this point in the history
new Reshape::SimplePivot transformation
  • Loading branch information
kspurgin authored Sep 7, 2021
2 parents f6fe7d0 + 34e4e5e commit 597a074
Show file tree
Hide file tree
Showing 5 changed files with 236 additions and 48 deletions.
2 changes: 1 addition & 1 deletion Gemfile.lock
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
PATH
remote: .
specs:
kiba-extend (2.2.1)
kiba-extend (2.3.0)
activesupport (~> 6.1.4)
csv (~> 3.0)
dry-configurable (~> 0.11)
Expand Down
2 changes: 1 addition & 1 deletion lib/kiba/extend/transforms/deduplicate.rb
Original file line number Diff line number Diff line change
Expand Up @@ -366,7 +366,7 @@ def get_value_frequency(fv)
# Used in pipeline as:
#
# ```
# transform Deduplicate::Table, fields: :combined, delete_field: true
# transform Deduplicate::Table, field: :combined, delete_field: true
# ```
#
# Results in:
Expand Down
109 changes: 109 additions & 0 deletions lib/kiba/extend/transforms/reshape.rb
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,115 @@ def process(row)
row
end
end

# Dynamically pivots your data into a new shape, based on values of the given fields.
#
# @note This transformation runs in memory, so it may bog down or crash on extremely large
# data sources
# @note This transformation has some pretty strong assumptions and limitations that can be
# quite destructive, so examine the example below carefully.
#
# # Examples
#
# Input table:
#
# ```
# | authority | norm | term | unrelated |
# |-----------+---------+-------------+-----------|
# | person | fred | Fred Q. | foo |
# | org | fred | Fred, Inc. | bar |
# | location | unknown | Unknown | baz |
# | person | unknown | Unknown | fuz |
# | org | unknown | Unknown | aaa |
# | work | book | Book | eee |
# | location | book | | zee |
# | | book | Book | squeee |
# | nil | ghost | Ghost | boo |
# | location | | Ghost | zoo |
# | location | ghost | nil | poo |
# | org | fred | Fred, Corp. | bar |
# | issues | nil | nil | bah |
# ```
#
# Used in pipeline as:
#
# ```
# transform Reshape::SimplePivot,
# field_to_columns: :authority,
# field_to_rows: :norm,
# field_to_col_vals: :term
# ```
#
# Results in:
#
# ```
# | norm | person | org | location | work | issues |
# |---------+---------+-------------+----------+------+--------|
# | fred | Fred Q. | Fred, Corp. | nil | nil | nil |
# | unknown | Unknown | Unknown | Unknown | nil | nil |
# | book | nil | nil | nil | Book | nil |
# ```
#
# **NOTE**
#
# - A new column has been created for each unique value in the `field_to_columns` field
# - A single row has been generated for each unique value in the `field_to_rows` field
# - The value from the `field_to_col_vals` field is in the appropriate column
# - When more than one row has the same values for `field_to_columns` and `field_to_rows`,
# the `field_to_col_vals` value of the last such row processed will be used (we get
# Fred, Corp. instead of Fred, Inc.)
# - Only data from the three involved fields is kept! Note that the `unrelated` field from
# the input has been lost
# - Rows lacking a value for any of the three fields will be skipped, in terms of populating
# the dynamically created column (see the Ghost examples)
# - However, a dynamically created column will still be created even if it is given no data
# (See issues example)
class SimplePivot
  # @param field_to_columns [Symbol] name of the field whose unique values become new column names
  # @param field_to_rows [Symbol] name of the field whose unique values each produce one output row
  # @param field_to_col_vals [Symbol] name of the field whose value is written into the pivoted column
  def initialize(field_to_columns:, field_to_rows:, field_to_col_vals:)
    @col_field = field_to_columns
    @row_field = field_to_rows
    @col_val_field = field_to_col_vals
    # row-field value => { column name => value }; insertion order is the output row order
    @rows = {}
    # Used as an insertion-ordered set of column names (values are always nil)
    @columns = {}
  end

  # Accumulates the row's data and always returns nil, so no input row is passed along;
  # the pivoted rows are emitted from {#close}.
  #
  # @private
  def process(row)
    gather_column_field(row)
    nil
  end

  # Yields one pivoted row per gathered row-field value. Every row carries the row field
  # first, followed by every gathered column in first-seen order, so all emitted rows
  # share the same key order. Columns with no value for a row are set to nil.
  #
  # @private
  def close
    column_names = @columns.keys
    @rows.each do |fieldval, data|
      row = { @row_field => fieldval }
      column_names.each do |column|
        # The second condition only matters if a column is named the same as the row
        # field: the row-field value must not be replaced by nil in that case.
        row[column] = data[column] if data.key?(column) || !row.key?(column)
      end
      yield row
    end
  end

  private

  # Registers the column named by the row's column-field value. Rows with a blank
  # column-field value are ignored entirely.
  def gather_column_field(row)
    col_value = row.fetch(@col_field, nil)
    return if col_value.blank?

    col_name = col_value.to_sym
    # Register the column even if no value ends up recorded for it
    @columns[col_name] = nil
    record_column_value_for_row(row, col_name)
  end

  # Stores the row's value under the given column for its row-field value. Nothing is
  # stored when either the row-field value or the column value is blank. A later row
  # with the same row-field value and column overwrites an earlier one.
  def record_column_value_for_row(row, column)
    row_field_val = row.fetch(@row_field, nil)
    col_val = row.fetch(@col_val_field, nil)
    return if row_field_val.blank? || col_val.blank?

    # Direct hash lookup instead of scanning an array of all keys for every row
    (@rows[row_field_val] ||= {})[column] = col_val
  end
end
end
end
end
Expand Down
2 changes: 1 addition & 1 deletion lib/kiba/extend/version.rb
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,6 @@

module Kiba
module Extend
VERSION = '2.2.1'
VERSION = '2.3.0'
end
end
169 changes: 124 additions & 45 deletions spec/kiba/extend/transforms/reshape_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -3,61 +3,140 @@
require 'spec_helper'

RSpec.describe Kiba::Extend::Transforms::Reshape do
let(:test_job_config){ { source: input, destination: output } }
let(:test_job) { Kiba::Extend::Jobs::TestingJob.new(files: test_job_config, transformer: test_job_transforms) }
let(:output){ [] }

describe 'CollapseMultipleFieldsToOneTypedFieldPair' do
context 'when source field may be multivalued' do
rows = [
%w[homephone workphone mobilephone otherphone unrelated],
['2', '1', '3;4', '5', 'foo']
]
before do
generate_csv(rows)
let(:input) do
[{
homephone: '2',
workphone: '1',
mobilephone: '3|4',
otherphone: '5',
unrelated: 'foo'
}]
end

let(:expected) do
[{
phoneNumber: '1;2;3;4;5',
phoneType: 'business;personal;mobile;mobile;',
unrelated: 'foo'
}]
end

let(:test_job_transforms) do
Kiba.job_segment do
transform Reshape::CollapseMultipleFieldsToOneTypedFieldPair,
sourcefieldmap: {
workphone: 'business',
homephone: 'personal',
mobilephone: 'mobile',
otherphone: ''
},
datafield: :phoneNumber,
typefield: :phoneType,
sourcesep: '|',
targetsep: ';'
end
end

it 'reshapes the columns as specified after splitting source' do
expected = [
{ phoneNumber: '1;2;3;4;5', phoneType: 'business;personal;mobile;mobile;', unrelated: 'foo' }
]
result = execute_job(filename: test_csv,
xform: Reshape::CollapseMultipleFieldsToOneTypedFieldPair,
xformopt: { sourcefieldmap: {
workphone: 'business',
homephone: 'personal',
mobilephone: 'mobile',
otherphone: ''
},
datafield: :phoneNumber,
typefield: :phoneType,
sourcesep: DELIM,
targetsep: DELIM })
expect(result).to eq(expected)
test_job
expect(output).to eq(expected)
end
end

context 'when source field is not multivalued' do
rows = [
%w[workphone homephone mobilephone otherphone unrelated],
['123', '234', '345;456', '567', 'foo'],
['123', '234', '345 456', '567', 'bar']
]
before do
generate_csv(rows)
let(:input) do
[{
homephone: '123',
workphone: '234',
mobilephone: '345|456',
otherphone: '567',
unrelated: 'foo'
},
{
homephone: '123',
workphone: '234',
mobilephone: '345 456',
otherphone: '567',
unrelated: 'bar'
}]
end
it 'reshapes the columns as specified' do
expected = [
{ phoneNumber: '123;234;345;456;567', phoneType: 'business;personal;mobile;', unrelated: 'foo' },
{ phoneNumber: '123;234;345 456;567', phoneType: 'business;personal;mobile;', unrelated: 'bar' }

let(:expected) do
[
{ phoneNumber: '234;123;345|456;567', phoneType: 'business;personal;mobile;', unrelated: 'foo'},
{ phoneNumber: '234;123;345 456;567', phoneType: 'business;personal;mobile;', unrelated: 'bar'},
]
result = execute_job(filename: test_csv,
xform: Reshape::CollapseMultipleFieldsToOneTypedFieldPair,
xformopt: { sourcefieldmap: {
workphone: 'business',
homephone: 'personal',
mobilephone: 'mobile',
otherphone: ''
},
datafield: :phoneNumber,
typefield: :phoneType,
targetsep: DELIM })
expect(result).to eq(expected)
end

let(:test_job_transforms) do
Kiba.job_segment do
transform Reshape::CollapseMultipleFieldsToOneTypedFieldPair,
sourcefieldmap: {
workphone: 'business',
homephone: 'personal',
mobilephone: 'mobile',
otherphone: ''
},
datafield: :phoneNumber,
typefield: :phoneType,
targetsep: ';'
end
end


it 'reshapes the columns as specified' do
test_job
expect(output).to eq(expected)
end
end
end

describe 'SimplePivot' do
  # Mirrors the example table in the Reshape::SimplePivot documentation.
  let(:input) do
    [
      {authority: 'person', norm: 'fred', term: 'Fred Q.', unrelated: 'foo'},
      # Superseded by the later org/fred row (last value wins)
      {authority: 'org', norm: 'fred', term: 'Fred, Inc.', unrelated: 'bar'},
      {authority: 'location', norm: 'unknown', term: 'Unknown', unrelated: 'baz'},
      {authority: 'person', norm: 'unknown', term: 'Unknown', unrelated: 'fuz'},
      {authority: 'org', norm: 'unknown', term: 'Unknown', unrelated: 'aaa'},
      {authority: 'work', norm: 'book', term: 'Book', unrelated: 'eee'},
      # Blank term: contributes no value
      {authority: 'location', norm: 'book', term: '', unrelated: 'zee'},
      # Blank/nil authority: row is ignored
      {authority: '', norm: 'book', term: 'Book', unrelated: 'squeee'},
      {authority: nil, norm: 'ghost', term: 'Ghost', unrelated: 'boo'},
      # Blank norm / nil term: contribute no value, so no `ghost` row is produced
      {authority: 'location', norm: '', term: 'Ghost', unrelated: 'zoo'},
      {authority: 'location', norm: 'ghost', term: nil, unrelated: 'poo'},
      {authority: 'org', norm: 'fred', term: 'Fred, Corp.', unrelated: 'bar'},
      # Creates the `issues` column even though it never receives a value
      {authority: 'issues', norm: nil, term: nil, unrelated: 'bah'},
    ]
  end

  let(:expected) do
    [
      {norm: 'fred', person: 'Fred Q.', org: 'Fred, Corp.', location: nil, work: nil, issues: nil},
      {norm: 'unknown', person: 'Unknown', org: 'Unknown', location: 'Unknown', work: nil, issues: nil},
      {norm: 'book', person: nil, org: nil, location: nil, work: 'Book', issues: nil}
    ]
  end

  let(:test_job_transforms) do
    Kiba.job_segment do
      transform Reshape::SimplePivot,
        field_to_columns: :authority,
        field_to_rows: :norm,
        field_to_col_vals: :term
    end
  end

  # Description previously read "after splitting source", copied from the multivalue
  # collapse spec; no splitting happens in this transform.
  it 'pivots rows into one column per field_to_columns value' do
    # NOTE(review): the return value is unused; presumably this helper renders the
    # input/expected tables for documentation. Confirm it is needed here, or remove it.
    Helpers::ExampleFormatter.new(input, expected)
    test_job
    expect(output).to eq(expected)
  end
end
end

0 comments on commit 597a074

Please sign in to comment.