diff --git a/doc/iterative_cleanup.md b/doc/iterative_cleanup.md index 2bcf7212b..0d5d8a1fa 100644 --- a/doc/iterative_cleanup.md +++ b/doc/iterative_cleanup.md @@ -93,3 +93,7 @@ This line should be added before any `registry.transform`, Calling `RegistryData.register` (or whatever method triggers the registration of all your jobs) must be done ***after*** the `config_namespaces` are set. + +## The process + +![Flowchart](https://github.com/lyrasis/kiba-extend/blob/main/doc/iterative_cleanup_flowchart.png?raw=true) diff --git a/doc/iterative_cleanup_flowchart.mmd b/doc/iterative_cleanup_flowchart.mmd new file mode 100644 index 000000000..d144cfdb1 --- /dev/null +++ b/doc/iterative_cleanup_flowchart.mmd @@ -0,0 +1,67 @@ +graph TD; + base_job["`**base_job**`"] + + BaseJobCleaned["`**BaseJobCleaned** + Adds mod.worksheet_add_fields + If cleanup done, merges from Corrections + Adds :clean_fingerprint`"] + + CleanedUniq["`**CleanedUniq** + Deduplicate on :clean_fingerprint + Delete mod.cleaned_uniq_collate_fields + Collate mod.cleaned_uniq_collate_fields`"] + + Worksheet["`**Worksheet** + Add :clean_fingerprint + If worksheet already provided: + - add :to_review + - populate it based on KnownWorksheetValues`"] + + Returned[/"`**Returned_n** + One CSV per cleanup worksheet returned`"\] + + Provided[/"`**Provided_n** + One CSV per cleanup worksheet provided to client`"\] + + ReturnedCompiled["`**ReturnedCompiled** + Delete :to_review + Flag changes via :clean_fingerprint + Deletes fp_ fields`"] + + KnownWorksheetValues[["`**KnownWorksheetValues** + Splits collated mod.orig_values_identifier + Produces list of uniq orig values in previous worksheets`"]] + + Corrections["`**Corrections** + Keep only rows with corrections + Explode collated mod.orig_values_identifier + Deduplicate on full row match`"] + + base_job-->BaseJobCleaned; + + Corrections-. 
+ "`lkup on + mod.orig_values_identifier`" .-> + BaseJobCleaned; + + BaseJobCleaned-->CleanedUniq; + + BaseJobCleaned-- + "`lkup on + :clean_fingerprint`" --> + CleanedUniq; + + CleanedUniq-->Worksheet; + + Worksheet-. + "`returned + if cleanup done`" .-> + Returned; + + Returned-.->ReturnedCompiled; + + ReturnedCompiled-.->Corrections; + + Provided-->KnownWorksheetValues; + + KnownWorksheetValues-->Worksheet; diff --git a/lib/kiba/extend/mixins/iterative_cleanup.rb b/lib/kiba/extend/mixins/iterative_cleanup.rb index 62dc08d5f..9234be6c7 100644 --- a/lib/kiba/extend/mixins/iterative_cleanup.rb +++ b/lib/kiba/extend/mixins/iterative_cleanup.rb @@ -80,7 +80,6 @@ module Mixins # # - {collation_delim} # - {orig_values_identifier} - # - {cleaned_values_identifier} # - {cleaned_uniq_collate_fields} # # ## What extending this module does @@ -165,17 +164,6 @@ def orig_values_identifier :fingerprint end - # Field used in cleanup process to deduplicate cleaned values and as - # a matchpoint for collating orig_values_identifiers (and, - # optionally, other field data) associated with cleaned values - # - # @note Optional: override in extending module after extending - # - # @return [:clean_fingerprint] - def cleaned_values_identifier - :clean_fingerprint - end - # Fields from base_job_cleaned that will be deleted in cleaned_uniq, # and then merged back into the deduplicated data from # base_job_cleaned. 
I.e., fields whose values will be collated @@ -385,7 +373,7 @@ def base_job_cleaned_job_hash(mod) args: {mod: mod} }, tags: mod.job_tags, - lookup_on: mod.cleaned_values_identifier + lookup_on: :clean_fingerprint } end private :base_job_cleaned_job_hash diff --git a/lib/kiba/extend/mixins/iterative_cleanup/jobs/base_job_cleaned.rb b/lib/kiba/extend/mixins/iterative_cleanup/jobs/base_job_cleaned.rb index c19d06b03..75b75854b 100644 --- a/lib/kiba/extend/mixins/iterative_cleanup/jobs/base_job_cleaned.rb +++ b/lib/kiba/extend/mixins/iterative_cleanup/jobs/base_job_cleaned.rb @@ -47,11 +47,6 @@ def xforms(mod) transform Append::NilFields, fields: mod.worksheet_add_fields - # Add :fingerprint (orig values) before merging any cleanup in - transform Fingerprint::Add, - target: :fingerprint, - fields: mod.fingerprint_fields - if mod.cleanup_done? && lookups.any?(mod.corrections_job_key) transform Fingerprint::MergeCorrected, lookup: method(mod.corrections_job_key).call, diff --git a/lib/kiba/extend/mixins/iterative_cleanup/jobs/cleaned_uniq.rb b/lib/kiba/extend/mixins/iterative_cleanup/jobs/cleaned_uniq.rb index cba5e082d..76711ce0a 100644 --- a/lib/kiba/extend/mixins/iterative_cleanup/jobs/cleaned_uniq.rb +++ b/lib/kiba/extend/mixins/iterative_cleanup/jobs/cleaned_uniq.rb @@ -60,13 +60,13 @@ def cleaned_xforms(mod) job = bind.receiver transform Deduplicate::Table, - field: mod.cleaned_values_identifier, + field: :clean_fingerprint, delete_field: false transform Delete::Fields, fields: mod.cleaned_uniq_collate_fields transform Merge::MultiRowLookup, lookup: send(mod.base_job_cleaned_job_key), - keycolumn: mod.cleaned_values_identifier, + keycolumn: :clean_fingerprint, fieldmap: job.send(:fieldmap, mod), delim: mod.collation_delim end diff --git a/lib/kiba/extend/mixins/iterative_cleanup/jobs/worksheet.rb b/lib/kiba/extend/mixins/iterative_cleanup/jobs/worksheet.rb index 2933d7bc7..f1999f6ab 100644 --- a/lib/kiba/extend/mixins/iterative_cleanup/jobs/worksheet.rb +++ 
b/lib/kiba/extend/mixins/iterative_cleanup/jobs/worksheet.rb @@ -18,16 +18,6 @@ def job(mod:) ) end - def get_lookups(mod) - if mod.cleanup_done? - # todo - elsif mod.worksheet_sent_not_done? - # todo - else - [] - end - end - def get_xforms(mod) base = [] if mod.respond_to?(:worksheet_pre_xforms) @@ -42,8 +32,6 @@ def get_xforms(mod) def xforms(mod) Kiba.job_segment do - transform Append::NilFields, - fields: mod.worksheet_add_fields transform Fingerprint::Add, target: :clean_fingerprint, fields: mod.fingerprint_fields