From 5fb396b52962bd2cf802ceb28ffc85bb2449ec6f Mon Sep 17 00:00:00 2001 From: Kristina Spurgin Date: Mon, 11 Sep 2023 11:22:39 -0400 Subject: [PATCH 1/5] Add IterativeCleanup mixin and support --- lib/kiba/extend.rb | 30 +- lib/kiba/extend/error.rb | 8 + lib/kiba/extend/mixins/iterative_cleanup.rb | 409 ++++++++++++++++++ .../iterative_cleanup/base_job_cleaned.rb | 62 +++ .../mixins/iterative_cleanup/cleaned_uniq.rb | 80 ++++ .../mixins/iterative_cleanup/corrections.rb | 49 +++ .../known_worksheet_values.rb | 38 ++ .../iterative_cleanup/returned_compiled.rb | 44 ++ .../mixins/iterative_cleanup/worksheet.rb | 67 +++ .../utils/iterative_cleanup_job_registrar.rb | 30 ++ .../extend/mixins/iterative_cleanup_spec.rb | 56 +++ 11 files changed, 872 insertions(+), 1 deletion(-) create mode 100644 lib/kiba/extend/mixins/iterative_cleanup.rb create mode 100644 lib/kiba/extend/mixins/iterative_cleanup/base_job_cleaned.rb create mode 100644 lib/kiba/extend/mixins/iterative_cleanup/cleaned_uniq.rb create mode 100644 lib/kiba/extend/mixins/iterative_cleanup/corrections.rb create mode 100644 lib/kiba/extend/mixins/iterative_cleanup/known_worksheet_values.rb create mode 100644 lib/kiba/extend/mixins/iterative_cleanup/returned_compiled.rb create mode 100644 lib/kiba/extend/mixins/iterative_cleanup/worksheet.rb create mode 100644 lib/kiba/extend/utils/iterative_cleanup_job_registrar.rb create mode 100644 spec/kiba/extend/mixins/iterative_cleanup_spec.rb diff --git a/lib/kiba/extend.rb b/lib/kiba/extend.rb index 275b0765c..61f940a45 100644 --- a/lib/kiba/extend.rb +++ b/lib/kiba/extend.rb @@ -76,7 +76,17 @@ def reload! @loader.reload end - # @return [Hash] default options used for CSV sources/destinations + # Ruby modules that serve as namespaces under which config + # modules for a project are nested. + # @note You must set this from + # an individual project if you wish to use the + # {Kiba::Extend::Mixins::IterativeCleanup} mixin. 
+ # @return [Array] + setting :config_namespaces, default: [], reader: true + + # Default options used for CSV sources/destinations + # + # @return [Hash] setting :csvopts, default: {headers: true, header_converters: %i[symbol downcase]}, reader: true @@ -187,6 +197,24 @@ def reload! # - :minimal - bare minimum setting :job_verbosity, default: :normal, reader: true + # List of config modules in project namespaces set in {config_namespaces} + # setting + # + # @return [Array] + def project_configs + config_namespaces.map { |ns| get_config_mods(ns, ns.constants) } + .flatten + .select { |obj| obj.is_a?(Module) && obj.respond_to?(:config) } + end + + # @param ns [Module] + # @param constants [Array] + # @return [Array] + def get_config_mods(ns, constants) + constants.map { |const| ns.const_get(const) } + end + private_class_method :get_config_mods + # The section below is for backward comapatibility only # @since 3.2.1 diff --git a/lib/kiba/extend/error.rb b/lib/kiba/extend/error.rb index 6b9fecacc..2c12700bd 100644 --- a/lib/kiba/extend/error.rb +++ b/lib/kiba/extend/error.rb @@ -43,6 +43,14 @@ def initialize(msg = "Action must be :keep or :reject") end end + class IterativeCleanupSettingUndefinedError < StandardError + include Kiba::Extend::ErrMod + end + + class ProjectSettingUndefinedError < StandardError + include Kiba::Extend::ErrMod + end + class PathRequiredError < ArgumentError include Kiba::Extend::ErrMod def initialize(klass) diff --git a/lib/kiba/extend/mixins/iterative_cleanup.rb b/lib/kiba/extend/mixins/iterative_cleanup.rb new file mode 100644 index 000000000..1e0ce2724 --- /dev/null +++ b/lib/kiba/extend/mixins/iterative_cleanup.rb @@ -0,0 +1,409 @@ +# frozen_string_literal: true + +# Mixin module for setting up iterative cleanup based on a source table. +# +# "Iterative cleanup" means the client may provide the worksheet more +# than once, or that you may need to produce a fresh worksheet for +# the client after a new database export is provided. 
+# +# Your project must follow some setup/configuration conventions in order to use +# this mixin: +# +# - Each cleanup process must be configured in its own config module. +# - A config module is a Ruby module that responds to `:config`. +# +# Refer to todo:link Kiba::Tms::AltNumsForObjTypeCleanup as an example config +# module extending this mixin module in a simple way. See +# todo:link Kiba::Tms::PlacesCleanupInitial for a more complex usage with +# default overrides and custom pre/post transforms. +# +# ## Implementation details +# +# ### Define before extending this module +# +# These can be defined as Dry::Configurable settings or as public methods. The +# section below lists the method/setting name the extending module should +# respond to, each preceded by its YARD signature. +# +# ``` +# # @return [Symbol] registry entry job key for the job whose output +# # will be used as the base for generating the cleanup worksheet. +# # Iterations of cleanup will be layered over this output in the +# # auto-generated jobs. **NOTE: This job's output should include a field +# # which combines/identifies the original values that may be +# # affected by the cleanup process. The default expectation is that +# # this field is named :fingerprint, but this can be overridden by +# # defining a custom `orig_values_identifier` method in the +# # extending module after extension. This field is used as a +# # matchpoint for merging cleaned up data back into the migration, +# # and identifying whether a given value in subsequent worksheet +# # iterations has been previously included in a worksheet** +# # base_job +# # +# # @return [Array] tags assigned to all jobs generated by extending +# # IterativeCleanup +# # job_tags +# # +# # @return [Array] nil/empty fields to be added to worksheet +# # worksheet_add_fields +# # +# # @return [Array] order of fields (in worksheet output). Will be used +# # to set destination special options/initial headers on the worksheet job.
+# # worksheet_field_order +# # +# # @return [Array] fields included in the fingerprint value +# # fingerprint_fields +# # +# # @return [Symbol, Array, nil] field or fields included in +# # the fingerprint value that should be ignored when flagging +# # changes +# # fingerprint_flag_ignore_fields +# ``` +# +# ### Then, extend this module +# +# `extend Kiba::Extend::Mixins::IterativeCleanup` +# +# ### Optional settings/methods in extending module +# +# Default values for the following methods are defined in this mixin +# module. If you want to override the values, define these methods +# in your config module after extending this module. +# +# - {collation_delim} +# - {orig_values_identifier} +# - {cleaned_values_identifier} +# - {cleaned_uniq_collate_fields} +# +# ## What extending this module does +# +# ### Defines settings in the extending config module +# +# These are empty settings with constructors that will use the values in a +# client-specific project config file to build the data expected for cleanup +# processing. +# +# - **:provided_worksheets** - Array of filenames of cleanup +# worksheets provided to the client. Files should be listed +# oldest-to-newest. Assumes files are in the `to_client` +# subdirectory of the migration base directory. **Define actual +# values in client config file.** +# - **:returned_files** - Array of filenames of completed worksheets +# returned by client. Files should be listed oldest-to-newest. +# Assumes files are in the `supplied` subdirectory of the migration +# base directory. **Define actual values in client config file.** +# +# ### Defines methods in the extending config module +# +# See method documentation inline below. +# +# ### Prepares registry entries for iterative cleanup jobs +# +# When the application loads, {Kiba::Tms::RegistryData.register} calls +# {Kiba::Extend::Utils::IterativeCleanupJobRegistrar}.
This util class calls +# the {register_cleanup_jobs} method of each config module extending this +# module, adding the cleanup jobs to the registry dynamically. +# +# The jobs themselves (i.e. the sources, lookups, transforms) are +# defined in {Kiba::Tms::Jobs::IterativeCleanup}. See that module's +# documentation for how to set up custom pre/post transforms to customize +# specific cleanup routines. +module Kiba::Extend::Mixins::IterativeCleanup + def self.extended(mod) + check_required_settings(mod) + define_provided_worksheets_setting(mod) + define_returned_files_setting(mod) + end + + # OVERRIDEABLE PUBLIC METHODS + + # Used as the namespace for auto-generated registry entries and the + # base for output file names. By default, this will be the name of + # the extending module, converted to snake case. + # + # @return [String] + def cleanup_base_name + name.split("::")[-1] + .gsub(/([A-Z])/, '_\1') + .delete_prefix("_") + .downcase + end + + # Delimiting string used to join collated-on-deduplicated values. Should be + # distinct from normal application delimiters since the field values being + # joined/split may contain the normal application delimiters. + # @note Optional: override in extending module after extending + # + # @return ["////"] + def collation_delim + "////" + end + + # Field in base job that combines/identifies the original field + # values entering the cleanup process. 
This field is used as a + # matchpoint for merging cleaned up data back into the migration, + # and identifying whether a given value in subsequent worksheet + # iterations has been previously included in a worksheet + # @note Optional: override in extending module after extending + # + # @return [:fingerprint] + def orig_values_identifier + :fingerprint + end + + # Field used in cleanup process to deduplicate cleaned values and as + # a matchpoint for collating orig_values_identifiers (and, + # optionally, other field data) associated with cleaned values + # @note Optional: override in extending module after extending + # + # @return [:clean_fingerprint] + def cleaned_values_identifier + :clean_fingerprint + end + + # Fields from base_job_cleaned that will be deleted in cleaned_uniq, + # and then merged back into the deduplicated data from + # base_job_cleaned. I.e., fields whose values will be collated + # into multivalued fields on the deduplicated values + # @note Optional: override in extending module after extending + # + # @return [Array<:fingerprint>] + def cleaned_uniq_collate_fields + [orig_values_identifier] + end + + # DO NOT OVERRIDE REMAINING METHODS + + # @return [Array] supplied registry entry job keys corresponding to + # returned cleanup files + def returned_file_jobs + returned_files.map.with_index do |filename, idx| + "#{cleanup_base_name}__file_returned_#{idx}".to_sym + end + end + + # @return [Boolean] + def cleanup_done? + true unless returned_files.empty? + end + alias_method :cleanup_done, :cleanup_done? + + # @return [Boolean] + def worksheet_sent_not_done? + true if !cleanup_done? && !provided_worksheets.empty? 
+ end + + # @return [Symbol] the registry entry job key for the base job with cleanup + # merged in + def base_job_cleaned_job_key + "#{cleanup_base_name}__base_job_cleaned".to_sym + end + + # @return [Symbol] the registry entry job key for the job that deduplicates + # the clean base job data + def cleaned_uniq_job_key + "#{cleanup_base_name}__cleaned_uniq".to_sym + end + + # @return [Symbol] the registry entry job key for the worksheet prep job + def worksheet_job_key + "#{cleanup_base_name}__worksheet".to_sym + end + + # @return [Symbol] the registry entry job key for the compiled corrections job + def returned_compiled_job_key + "#{cleanup_base_name}__returned_compiled".to_sym + end + + # @return [Symbol] the registry entry job key for the corrections job + def corrections_job_key + "#{cleanup_base_name}__corrections".to_sym + end + + # Appends "s" to module's `orig_values_identifier`. Used to manage joining, + # collating, and splitting/exploding on this value, while clarifying that + # any collated field in output is collated (not expected to be a single + # value). + def collated_orig_values_id_field + "#{orig_values_identifier}s".to_sym + end + + def self.check_required_settings(mod) + %i[base_job job_tags + worksheet_add_fields + worksheet_field_order fingerprint_fields + fingerprint_flag_ignore_fields].each do |setting| + unless mod.respond_to?(setting) + raise Kiba::Extend::IterativeCleanupSettingUndefinedError, setting + end + end + end + private_class_method :check_required_settings + + def self.datadir(mod) + dir = nil + parents = mod.module_parents + + until dir || parents.empty? + parent = parents.shift + dir = parent.datadir if parent.respond_to?(:datadir) + end + + raise Kiba::Extend::ProjectSettingUndefinedError, :datadir unless dir + + dir + end + + def self.define_provided_worksheets_setting(mod) + provided_worksheets = <<~CODE + # Filenames of cleanup worksheets provided to the client. Should be + # ordered oldest-to-newest.
Assumes files are in the `to_client` + # subdirectory of the migration base directory + # + # @return Array + setting :provided_worksheets, + default: [], + reader: true, + constructor: proc { |value| + value.map do |filename| + File.join(Kiba::Extend::Mixins::IterativeCleanup.datadir(mod), + "to_client", filename) + end + } + CODE + mod.module_eval(provided_worksheets, __FILE__, __LINE__) + end + private_class_method :define_provided_worksheets_setting + + def self.define_returned_files_setting(mod) + returned_files = <<~CODE + # Filenames of cleanup worksheets returned by the client. Should be + # ordered oldest-to-newest. Assumes files are in the `supplied` + # subdirectory of the migration base directory + # + # @return Array + setting :returned_files, + default: [], + reader: true, + constructor: proc { |value| + value.map do |filename| + File.join(Kiba::Extend::Mixins::IterativeCleanup.datadir(mod), + "supplied", filename) + end + } + CODE + mod.module_eval(returned_files, __FILE__, __LINE__) + end + private_class_method :define_returned_files_setting + + def register_cleanup_jobs + ns = build_namespace + Kiba::Extend.registry.import(ns) + end + + def build_namespace + bind = binding + + Dry::Container::Namespace.new(cleanup_base_name) do + mod = bind.receiver + register mod.send(:job_name, mod.send(:base_job_cleaned_job_key)), + mod.send(:base_job_cleaned_job_hash, mod) + register mod.send(:job_name, mod.send(:cleaned_uniq_job_key)), + mod.send(:cleaned_uniq_job_hash, mod) + register mod.send(:job_name, mod.send(:worksheet_job_key)), + mod.send(:worksheet_job_hash, mod) + if mod.cleanup_done? 
+ returned = mod.send(:returned_files) + returned_jobs = mod.send(:returned_file_jobs) + .map { |job| mod.send(:job_name, job) } + returned.each_with_index do |file, idx| + register returned_jobs[idx], { + path: file, + supplied: true, + tags: mod.send(:job_tags) + } + end + register mod.send(:job_name, mod.send(:returned_compiled_job_key)), + mod.send(:returned_compiled_job_hash, mod) + register mod.send(:job_name, mod.send(:corrections_job_key)), + mod.send(:corrections_job_hash, mod) + end + end + end + private :build_namespace + + def job_name(full_job_key) + full_job_key.to_s + .delete_prefix("#{cleanup_base_name}__") + .to_sym + end + private :job_name + + def base_job_cleaned_job_hash(mod) + { + path: File.join(Kiba::Extend::Mixins::IterativeCleanup.datadir(mod), "working", + "#{mod.cleanup_base_name}_base_job_cleaned.csv"), + creator: { + callee: Kiba::Extend::Mixins::IterativeCleanup::BaseJobCleaned, + args: {mod: mod} + }, + tags: mod.job_tags, + lookup_on: mod.cleaned_values_identifier + } + end + private :base_job_cleaned_job_hash + + def cleaned_uniq_job_hash(mod) + { + path: File.join(Kiba::Extend::Mixins::IterativeCleanup.datadir(mod), "working", + "#{mod.cleanup_base_name}_cleaned_uniq.csv"), + creator: { + callee: Kiba::Extend::Mixins::IterativeCleanup::CleanedUniq, + args: {mod: mod} + }, + tags: mod.job_tags + } + end + private :cleaned_uniq_job_hash + + def worksheet_job_hash(mod) + { + path: File.join(Kiba::Extend::Mixins::IterativeCleanup.datadir(mod), "to_client", + "#{mod.cleanup_base_name}_worksheet.csv"), + creator: { + callee: Kiba::Extend::Mixins::IterativeCleanup::Worksheet, + args: {mod: mod} + }, + tags: mod.job_tags, + dest_special_opts: {initial_headers: mod.worksheet_field_order} + } + end + private :worksheet_job_hash + + def returned_compiled_job_hash(mod) + { + path: File.join(Kiba::Extend::Mixins::IterativeCleanup.datadir(mod), + "working", "#{mod.cleanup_base_name}_returned_compiled.csv"), + creator: { + callee: 
Kiba::Extend::Mixins::IterativeCleanup::ReturnedCompiled, + args: {mod: mod} + }, + tags: mod.job_tags + } + end + private :returned_compiled_job_hash + + def corrections_job_hash(mod) + { + path: File.join(Kiba::Extend::Mixins::IterativeCleanup.datadir(mod), + "working", "#{mod.cleanup_base_name}_corrections.csv"), + creator: { + callee: Kiba::Extend::Mixins::IterativeCleanup::Corrections, + args: {mod: mod} + }, + tags: mod.job_tags, + lookup_on: mod.orig_values_identifier + } + end + private :corrections_job_hash +end diff --git a/lib/kiba/extend/mixins/iterative_cleanup/base_job_cleaned.rb b/lib/kiba/extend/mixins/iterative_cleanup/base_job_cleaned.rb new file mode 100644 index 000000000..c6762d803 --- /dev/null +++ b/lib/kiba/extend/mixins/iterative_cleanup/base_job_cleaned.rb @@ -0,0 +1,62 @@ +# frozen_string_literal: true + +module Kiba::Extend::Mixins::IterativeCleanup::BaseJobCleaned + module_function + + def job(mod:) + Kiba::Extend::Jobs::Job.new( + files: { + source: mod.base_job, + destination: mod.base_job_cleaned_job_key, + lookup: get_lookups(mod) + }, + transformer: get_xforms(mod) + ) + end + + def get_lookups(mod) + base = [] + base << mod.corrections_job_key if mod.cleanup_done? + base.select { |job| Kiba::Extend::Job.output?(job) } + end + + def get_xforms(mod) + base = [] + if mod.respond_to?(:base_job_cleaned_pre_xforms) + base << mod.base_job_cleaned_pre_xforms + end + base << xforms(mod) + if mod.respond_to?(:base_job_cleaned_post_xforms) + base << mod.base_job_cleaned_post_xforms + end + base + end + + def xforms(mod) + bind = binding + + Kiba.job_segment do + job = bind.receiver + lookups = job.send(:get_lookups, mod) + + transform Append::NilFields, + fields: mod.worksheet_add_fields + + # Add :fingerprint (orig values) before merging any cleanup in + transform Fingerprint::Add, + target: :fingerprint, + fields: mod.fingerprint_fields + + if mod.cleanup_done? 
&& lookups.any?(mod.corrections_job_key) + transform Fingerprint::MergeCorrected, + lookup: method(mod.corrections_job_key).call, + keycolumn: mod.orig_values_identifier, + todofield: :corrected + end + + transform Fingerprint::Add, + target: :clean_fingerprint, + fields: mod.fingerprint_fields + end + end +end diff --git a/lib/kiba/extend/mixins/iterative_cleanup/cleaned_uniq.rb b/lib/kiba/extend/mixins/iterative_cleanup/cleaned_uniq.rb new file mode 100644 index 000000000..85d166cac --- /dev/null +++ b/lib/kiba/extend/mixins/iterative_cleanup/cleaned_uniq.rb @@ -0,0 +1,80 @@ +# frozen_string_literal: true + +module Kiba::Extend::Mixins::IterativeCleanup::CleanedUniq + module_function + + def job(mod:) + Kiba::Extend::Jobs::Job.new( + files: { + source: mod.base_job_cleaned_job_key, + destination: mod.cleaned_uniq_job_key, + lookup: get_lookups(mod) + }, + transformer: get_xforms(mod) + ) + end + + def get_lookups(mod) + base = [mod.base_job_cleaned_job_key] + base.select { |job| Kiba::Extend::Job.output?(job) } + end + + def get_xforms(mod) + base = [] + if mod.respond_to?(:cleaned_uniq_pre_xforms) + base << mod.cleaned_uniq_pre_xforms + end + + base << (mod.cleanup_done? ? 
cleaned_xforms(mod) : orig_xforms(mod)) + + if mod.respond_to?(:cleaned_uniq_post_xforms) + base << mod.cleaned_uniq_post_xforms + end + base + end + + def orig_xforms(mod) + bind = binding + + Kiba.job_segment do + transform Rename::Fields, + fieldmap: bind.receiver.send(:fieldmap, mod) + .invert + .reject { |key, val| key == val } + end + end + + def cleaned_xforms(mod) + bind = binding + + Kiba.job_segment do + job = bind.receiver + + transform Deduplicate::Table, + field: mod.cleaned_values_identifier, + delete_field: false + transform Delete::Fields, + fields: mod.cleaned_uniq_collate_fields + transform Merge::MultiRowLookup, + lookup: send(mod.base_job_cleaned_job_key), + keycolumn: mod.cleaned_values_identifier, + fieldmap: job.send(:fieldmap, mod), + delim: mod.collation_delim + end + end + + def fieldmap(mod) + mod.cleaned_uniq_collate_fields.map do |field| + field_mapping(field) + end.to_h + end + + def field_mapping(field) + if field.to_s.end_with?("s") + [field, field] + else + ["#{field}s".to_sym, field] + end + end + private :field_mapping +end diff --git a/lib/kiba/extend/mixins/iterative_cleanup/corrections.rb b/lib/kiba/extend/mixins/iterative_cleanup/corrections.rb new file mode 100644 index 000000000..f95ab0911 --- /dev/null +++ b/lib/kiba/extend/mixins/iterative_cleanup/corrections.rb @@ -0,0 +1,49 @@ +# frozen_string_literal: true + +module Kiba::Extend::Mixins::IterativeCleanup::Corrections + module_function + + def job(mod:) + return unless mod.cleanup_done? 
+ + Kiba::Extend::Jobs::Job.new( + files: { + source: mod.returned_compiled_job_key, + destination: mod.corrections_job_key + }, + transformer: get_xforms(mod) + ) + end + + def get_xforms(mod) + base = [] + if mod.respond_to?(:corrections_pre_xforms) + base << mod.corrections_pre_xforms + end + + base << xforms(mod) + + if mod.respond_to?(:corrections_post_xforms) + base << mod.corrections_post_xforms + end + base + end + + def xforms(mod) + Kiba.job_segment do + transform FilterRows::FieldPopulated, + action: :keep, + field: :corrected + transform Explode::RowsFromMultivalField, + field: mod.collated_orig_values_id_field, + delim: mod.collation_delim + transform Rename::Field, + from: mod.collated_orig_values_id_field, + to: mod.orig_values_identifier + transform CombineValues::FullRecord + transform Deduplicate::Table, + field: :index, + delete_field: true + end + end +end diff --git a/lib/kiba/extend/mixins/iterative_cleanup/known_worksheet_values.rb b/lib/kiba/extend/mixins/iterative_cleanup/known_worksheet_values.rb new file mode 100644 index 000000000..bcc17e548 --- /dev/null +++ b/lib/kiba/extend/mixins/iterative_cleanup/known_worksheet_values.rb @@ -0,0 +1,38 @@ +# frozen_string_literal: true + +require "csv" +require "set" + +module Kiba::Extend::Mixins::IterativeCleanup + class KnownWorksheetValues + def initialize(mod) + @mod = mod + @field = mod.collated_orig_values_id_field + .to_s + @values = nil + end + + def call + return values if values + + @values = Set.new + mod.provided_worksheets.each { |file| extract_values(file) } + values + end + + private + + attr_reader :mod, :field, :values + + def extract_values(file) + CSV.foreach(file, headers: true) do |row| + vals = row[field] + next if vals.blank? 
+ + vals.split(mod.collation_delim).each do |val| + values << val + end + end + end + end +end diff --git a/lib/kiba/extend/mixins/iterative_cleanup/returned_compiled.rb b/lib/kiba/extend/mixins/iterative_cleanup/returned_compiled.rb new file mode 100644 index 000000000..28a728bc0 --- /dev/null +++ b/lib/kiba/extend/mixins/iterative_cleanup/returned_compiled.rb @@ -0,0 +1,44 @@ +# frozen_string_literal: true + +module Kiba::Extend::Mixins::IterativeCleanup::ReturnedCompiled + module_function + + def job(mod:) + Kiba::Extend::Jobs::Job.new( + files: { + source: mod.returned_file_jobs, + destination: mod.returned_compiled_job_key + }, + transformer: get_xforms(mod) + ) + end + + def get_xforms(mod) + base = [] + if mod.respond_to?(:returned_compiled_pre_xforms) + base << mod.returned_compiled_pre_xforms + end + + base << xforms(mod) + + if mod.respond_to?(:returned_compiled_post_xforms) + base << mod.returned_compiled_post_xforms + end + base + end + + def xforms(mod) + Kiba.job_segment do + transform Delete::Fields, + fields: :to_review + transform Fingerprint::FlagChanged, + fingerprint: :clean_fingerprint, + source_fields: mod.fingerprint_fields, + delete_fp: true, + target: :corrected + transform Delete::FieldnamesStartingWith, + prefix: "fp_" + transform Clean::EnsureConsistentFields + end + end +end diff --git a/lib/kiba/extend/mixins/iterative_cleanup/worksheet.rb b/lib/kiba/extend/mixins/iterative_cleanup/worksheet.rb new file mode 100644 index 000000000..fab1b0111 --- /dev/null +++ b/lib/kiba/extend/mixins/iterative_cleanup/worksheet.rb @@ -0,0 +1,67 @@ +# frozen_string_literal: true + +module Kiba::Extend::Mixins::IterativeCleanup::Worksheet + module_function + + def job(mod:) + Kiba::Extend::Jobs::Job.new( + files: { + source: mod.cleaned_uniq_job_key, + destination: mod.worksheet_job_key + }, + transformer: get_xforms(mod) + ) + end + + def get_lookups(mod) + if mod.cleanup_done? + # todo + elsif mod.worksheet_sent_not_done? 
+ # todo + else + [] + end + end + + def get_xforms(mod) + base = [] + if mod.respond_to?(:worksheet_pre_xforms) + base << mod.worksheet_pre_xforms + end + base << xforms(mod) + if mod.respond_to?(:worksheet_post_xforms) + base << mod.worksheet_post_xforms + end + base + end + + def xforms(mod) + Kiba.job_segment do + transform Append::NilFields, + fields: mod.worksheet_add_fields + transform Fingerprint::Add, + target: :clean_fingerprint, + fields: mod.fingerprint_fields + + unless mod.provided_worksheets.empty? + known_vals = + Kiba::Extend::Mixins::IterativeCleanup::KnownWorksheetValues.new(mod) + .call + transform Append::NilFields, + fields: :to_review + transform do |row| + ids = row[mod.collated_orig_values_id_field] + next row if ids.blank? + + known = ids.split(mod.collation_delim) + .map { |id| known_vals.include?(id) } + .all? + next row if known + + row[:to_review] = "y" + row + end + end + end + end +end diff --git a/lib/kiba/extend/utils/iterative_cleanup_job_registrar.rb b/lib/kiba/extend/utils/iterative_cleanup_job_registrar.rb new file mode 100644 index 000000000..aa8716845 --- /dev/null +++ b/lib/kiba/extend/utils/iterative_cleanup_job_registrar.rb @@ -0,0 +1,30 @@ +# frozen_string_literal: true + +module Kiba::Extend::Utils + class IterativeCleanupJobRegistrar + def self.call + new.call + end + + def initialize + @to_register = gather + end + + def call + puts "Registering iterative cleanup jobs" + to_register.each do |mod| + mod.register_cleanup_jobs + end + end + + private + + attr_reader :to_register + + def gather + Kiba::Extend.project_configs.select do |config| + config.is_a?(Kiba::Extend::Mixins::IterativeCleanup) + end + end + end +end diff --git a/spec/kiba/extend/mixins/iterative_cleanup_spec.rb b/spec/kiba/extend/mixins/iterative_cleanup_spec.rb new file mode 100644 index 000000000..3dde27123 --- /dev/null +++ b/spec/kiba/extend/mixins/iterative_cleanup_spec.rb @@ -0,0 +1,56 @@ +# frozen_string_literal: true + +require "spec_helper" + 
+module WithoutBaseJob + module_function + + extend Dry::Configurable + setting :cleanup_base_name, default: :test__me, reader: true +end + +module WithSetup + module_function + + extend Dry::Configurable + setting :base_job, default: :base__job, reader: true + setting :job_tags, default: %i[test cleanup], reader: true + setting :worksheet_add_fields, + default: %i[type note], + reader: true + setting :worksheet_field_order, + default: %i[value type note], + reader: true + setting :fingerprint_fields, + default: %i[value type note], + reader: true + setting :fingerprint_flag_ignore_fields, default: nil, reader: true +end + +RSpec.describe Kiba::Extend::Mixins::IterativeCleanup do + let(:subject) { described_class } + + describe ".extended" do + context "when extended without :base_job" do + let(:mod) { WithoutBaseJob } + + it "raises error" do + expect { mod.extend(subject) }.to raise_error( + Kiba::Extend::IterativeCleanupSettingUndefinedError + ) + end + end + + context "when extended with required setup" do + let(:mod) { WithSetup } + + it "extends IterativeCleanup" do + mod.extend(subject) + expect(mod).to be_a(subject) + expect(mod).to respond_to(:provided_worksheets, :returned_files, + :returned_file_jobs, :cleanup_done?) 
+ expect(mod.cleanup_base_name).to eq("with_setup") + end + end + end +end From 3103712dca33b98832f7181b67a53c732888f286 Mon Sep 17 00:00:00 2001 From: Kristina Spurgin Date: Mon, 11 Sep 2023 11:23:10 -0400 Subject: [PATCH 2/5] noblame: Whitespace and/or formatting changes only --- lib/kiba/extend.rb | 165 +++++++++++++++++++++++------------- lib/kiba/extend/registry.rb | 14 ++- 2 files changed, 112 insertions(+), 67 deletions(-) diff --git a/lib/kiba/extend.rb b/lib/kiba/extend.rb index 61f940a45..04f402c0f 100644 --- a/lib/kiba/extend.rb +++ b/lib/kiba/extend.rb @@ -1,7 +1,5 @@ # frozen_string_literal: true -# rubocop:todo Layout/LineLength - require "amazing_print" require "active_support" require "active_support/core_ext/object" @@ -24,19 +22,30 @@ module Kiba # Handles: # # - auto-loading of the code - # - extending `Kiba` with `Kiba::Extend::Jobs::JobSegmenter` so we can call `Kiba.job_segment` - # - defining config settings, all of which can be overridden by project applications using - # `kiba-extend` + # - extending `Kiba` with `Kiba::Extend::Jobs::JobSegmenter` so we + # can call `Kiba.job_segment` + # - defining config settings, all of which can be overridden by + # project applications using `kiba-extend` # # Also defines some CSV converters: # - # - `:stripextra` -- strips leading/trailing spaces, collapses multiple spaces, removes terminal commas, - # strips again - # - `:nulltonil` -- replaces any values that are a literal string NULL with a nil value - # - `:stripplus` -- strips leading/trailing spaces, collapses multiple spaces, removes terminal commas, - # strips again, removes "NULL" (i.e. 
literal string "NULL" becomes a `nilValue` + # - `:stripextra` -- strips leading/trailing spaces, collapses + # multiple spaces, removes terminal commas, strips again + # - `:nulltonil` -- replaces any values that are a literal string + # NULL with a nil value + # - `:stripplus` -- strips leading/trailing spaces, collapses + # multiple spaces, removes terminal commas, strips again, removes + # "NULL" (i.e. literal string "NULL" becomes a `nilValue` + # + # Note that `:stripplus` combines the functionality of `:stripextra` + # and `:nulltonil` # - # Note that `:stripplus` combines the functionality of `:stripextra` and `:nulltonil` + # ## About pre-job task settings + # + # If configured properly, the pre-job task is run when a job is run + # via Thor invocation. This includes `run:job`, `run:jobs`, and + # `jobs:tagged -r tagvalue`. The task is run once when the Thor + # task is invoked. module Extend module_function @@ -88,9 +97,11 @@ def reload! # # @return [Hash] setting :csvopts, - default: {headers: true, header_converters: %i[symbol downcase]}, reader: true + default: {headers: true, header_converters: %i[symbol downcase]}, + reader: true - # @return [Hash] default settings for Lambda destination + # Default settings for Lambda destination + # @return [Hash] setting :lambdaopts, default: {on_write: ->(r) { accumulator << r }}, reader: true @@ -103,8 +114,8 @@ def reload! # ``` setting :delim, default: "|", reader: true - # @return [String] - # Default subgrouping delimiter for splitting/joining values in multi-valued fields + # Default subgrouping delimiter for splitting/joining values in multi-valued + # fields # # ``` # orig = 'a^^y|b^^z' @@ -112,89 +123,124 @@ def reload! # sgdelim_split = delim_split.map{ |val| val.split(sgdelim) } # sgdelim_split => [['a', 'y'], ['b', 'z']] # ``` + # + # @return [String] setting :sgdelim, default: "^^", reader: true - # @return [String] default string to be treated as though it were a null/empty value. 
+ # Default string to be treated as though it were a null/empty value. + # + # @return [String] setting :nullvalue, default: "%NULLVALUE%", reader: true + # Used to join nested namespaces and registered keys in + # FileRegistry. With namespace 'ns' and registered key 'foo': + # 'ns\__foo'. With parent namespace 'ns', child namespace + # 'child', and registered key 'foo': 'ns\__child\__foo' + # # @return [String] - # Used to join nested namespaces and registered keys in FileRegistry. With namespace 'ns' and registered - # key 'foo': 'ns\__foo'. With parent namespace 'ns', child namespace 'child', and registered key 'foo': - # 'ns\__child\__foo' setting :registry_namespace_separator, default: "__", reader: true - # @!method source - # Default source class for jobs. Must meet implementation criteria in [Kiba wiki](https://github.com/thbar/kiba/wiki/Implementing-ETL-sources) + # Default source class for jobs. Must meet implementation criteria + # in [Kiba + # wiki](https://github.com/thbar/kiba/wiki/Implementing-ETL-sources) + # + # @return [Class] setting :source, constructor: proc { Kiba::Extend::Sources::CSV }, reader: true - # @!method destination - # Default destination class for jobs. Must meet implementation criteria in [Kiba wiki](https://github.com/thbar/kiba/wiki/Implementing-ETL-destinations) + # Default destination class for jobs. 
Must meet implementation + # criteria in [Kiba + # wiki](https://github.com/thbar/kiba/wiki/Implementing-ETL-destinations) + # + # @return [Class] setting :destination, constructor: proc { Kiba::Extend::Destinations::CSV }, reader: true - # @return [String] prefix for warnings from the ETL + # Prefix for warnings from the ETL + # + # @return [String] setting :warning_label, default: "KIBA WARNING", reader: true - # @return [Kiba::Extend::Registry::FileRegistry] A customized - # [dry-container](https://dry-rb.org/gems/dry-container/main/) for registering and resolving - # jobs + # A customized + # [dry-container](https://dry-rb.org/gems/dry-container/main/) + # for registering and resolving jobs + # + # @return [Kiba::Extend::Registry::FileRegistry] setting :registry, constructor: proc { Kiba::Extend::Registry::FileRegistry.new }, reader: true - # @return [Symbol] the job definition module method expected to be present if you [define a registry - # entry hash creator as a Module](https://lyrasis.github.io/kiba-extend/file.file_registry_entry.html#module-creator-example-since-2-7-2) + # The job definition module method expected to be present if you + # [define a registry entry hash creator as a + # Module](https://lyrasis.github.io/kiba-extend/file.file_registry_entry.html#module-creator-example-since-2-7-2) + # + # @return [Symbol] setting :default_job_method_name, default: :job, reader: true - # ## Pre-job task settings + # Whether to use Kiba::Extend's pre-job task functionality. The + # default is `false` for backward compatibility, as existing + # projects may not have the required settings configured. # - # If configured properly, the pre-job task is run when a job is run via Thor invocation. This includes - # `run:job`, `run:jobs`, and `jobs:tagged -r tagvalue`. The task is run once when the Thor task is - # invoked. - - # @return [Boolean] whether to use Kiba::Extend's pre-job task functionality. 
The default is `false` - # for backward compatibility, as existing projects may not have the required settings configured. + # @return [Boolean] setting :pre_job_task_run, default: false, reader: true - # @return [String] full path to directory to which files will be moved if `pre_job_task_action == - # :backup`. The directory will be created if it does not exist. + # Full path to directory to which files will be moved if + # `pre_job_task_action == :backup`. The directory will be + # created if it does not exist. + # + # @return [String] setting :pre_job_task_backup_dir, default: nil, reader: true - # @return [Array] full paths to directories that will be affected by the specified pre-task action + # Full paths to directories that will be affected by the specified pre-task + # action + # @return [Array] setting :pre_job_task_directories, default: [], reader: true - # @return [:backup, :nuke] Controls what happens when pre-job task is run + # Controls what happens when pre-job task is run + # + # - :backup - Moves all existing files in specified directories to backup + # directory created in your `:datadir` + # - :nuke - Deletes all existing files in specified directories + # when a job is run. **Make sure you only specify directories + # that contain derived/generated files!** # - # - :backup - Moves all existing files in specified directories to backup directory created in your `:datadir` - # - :nuke - Deletes all existing files in specified directories when a job is run. **Make sure you only - # specify directories that contain derived/generated files!** + # @return [:backup, :nuke] setting :pre_job_task_action, default: :backup, reader: true - # @return [:job, nil, anyValue] - # # Controls whether pre-job task is run # - # - :job - runs pre-job task specified above whenever you invoke `thor run:job ...`. All dependency jobs - # required for the invoked job will be run. 
This mode is recommended during development when you want - # any change in the dependency chain to get picked up. - # - any other value - only regenerates missing dependency files. Useful when your data is really big - # and/or your jobs are more stable + # - :job - runs pre-job task specified above whenever you invoke + # `thor run:job ...`. All dependency jobs required for the + # invoked job will be run. This mode is recommended during + # development when you want any change in the dependency chain + # to get picked up. + # - any other value - only regenerates missing dependency files. + # Useful when your data is really big and/or your jobs are more + # stable + # + # @return [:job, nil, anyValue] setting :pre_job_task_mode, default: :job, reader: true - # @return [Boolean] whether to output results to STDOUT for debugging + # Whether to output results to STDOUT for debugging + # + # @return [Boolean] setting :job_show_me, default: false, reader: true - # @return [Boolean] whether to have computer audibly say something when job is complete + # Whether to have computer audibly say something when job is complete + # + # @return [Boolean] setting :job_tell_me, default: false, reader: true - # @return [:debug, :normal, :minimal] how much output about jobs to output to STDOUT + # How much output about jobs to output to STDOUT # - # - :debug - tells you A LOT - helpful when developing pipelines and debugging + # - :debug - tells you A LOT - helpful when developing pipelines and + # debugging # - :normal - reports what is running, from where, and the results # - :minimal - bare minimum + # + # @return [:debug, :normal, :minimal] setting :job_verbosity, default: :normal, reader: true # List of config modules in project namespaces set in {config_namespaces} @@ -218,10 +264,12 @@ def get_config_mods(ns, constants) # The section below is for backward comapatibility only # @since 3.2.1 - # Warns that nested job config settings will be deprecated and gives new setting to 
use + # Warns that nested job config settings will be deprecated and gives new + # setting to use def warn_unnested(name, value) rep_by = "job_#{name}" - msg = "Kiba::Extend.config.job.#{name} setting has been replaced by Kiba::Extend.config.#{rep_by}" + msg = "Kiba::Extend.config.job.#{name} setting has been replaced by "\ + "Kiba::Extend.config.#{rep_by}" warn("#{Kiba::Extend.warning_label}: #{msg}") value end @@ -241,7 +289,7 @@ def warn_unnested(name, value) } end - # strips, collapses multiple spaces, removes terminal commas, strips again + # Strips, collapses multiple spaces, removes terminal commas, strips again # removes "NULL"/treats as nilValue CSV::Converters[:stripplus] = lambda { |s| begin @@ -262,7 +310,7 @@ def warn_unnested(name, value) end } - # strips, collapses multiple spaces, removes terminal commas, strips again + # Strips, collapses multiple spaces, removes terminal commas, strips again CSV::Converters[:stripextra] = lambda { |s| begin if s.nil? @@ -298,4 +346,3 @@ def warn_unnested(name, value) Kiba::Extend.loader # So we can call Kiba.job_segment Kiba.extend(Kiba::Extend::Jobs::JobSegmenter) -# rubocop:enable Layout/LineLength diff --git a/lib/kiba/extend/registry.rb b/lib/kiba/extend/registry.rb index 392f1c94e..8191b61ba 100644 --- a/lib/kiba/extend/registry.rb +++ b/lib/kiba/extend/registry.rb @@ -1,21 +1,19 @@ # frozen_string_literal: true -# rubocop:todo Layout/LineLength - module Kiba module Extend # Support for defining project-specific file registry # - # This DRYs up the process of setting up job configs (i.e. the source, lookup, and destination files - # for that job. + # This DRYs up the process of setting up job configs (i.e. the + # source, lookup, and destination files for that job). # - # This also allows for automated calling of dependencies instead of having to redundantly - # hard code them for every job. If the file(s) needed as sources or lookups do not exist, - # their creator jobs will be run to create them.
+ # This also allows for automated calling of dependencies instead + # of having to redundantly hard code them for every job. If the + # file(s) needed as sources or lookups do not exist, their + # creator jobs will be run to create them. # # @since 2.2.0 module Registry end end end -# rubocop:enable Layout/LineLength From 3d9ba860b7f4bc9eb17723a1769e12bde257e6a5 Mon Sep 17 00:00:00 2001 From: Kristina Spurgin Date: Mon, 11 Sep 2023 11:23:57 -0400 Subject: [PATCH 3/5] Start on IterativeCleanup documentation page --- doc/iterative_cleanup.md | 95 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 95 insertions(+) create mode 100644 doc/iterative_cleanup.md diff --git a/doc/iterative_cleanup.md b/doc/iterative_cleanup.md new file mode 100644 index 000000000..e73a7bb14 --- /dev/null +++ b/doc/iterative_cleanup.md @@ -0,0 +1,95 @@ +# Using the iterative cleanup mixin + +"Iterative cleanup" means the client may provide the worksheet more +than once, or that you may need to produce a fresh worksheet for the +client after a new database export is provided. + +There is no reason you can't use the pattern for expected one-round +cleanup. How often does one round of cleanup turn into more, after +all? + +## Examples + +[kiba-extend-project](https://github.com/lyrasis/kiba-extend-project) +has been updated to reflect usage of the `IterativeCleanup` mixin. + +Refer to todo:link Kiba::Tms::AltNumsForObjTypeCleanup as an example config + module extending this mixin module in a simple way. See + todo:link Kiba::Tms::PlacesCleanupInitial for a more complex usage with + default overrides and custom pre/post transforms. + +## Project setup assumptions + +Your project must follow some setup/configuration conventions in order + to use this mixin: + +### Each cleanup process must be configured in its own config module + +A config module is a Ruby module that responds to `:config`. 
 + +Extending `Dry::Configurable` adds a `config` method to a module: + +```ruby +module Project::NameCategorization + module_function + extend Dry::Configurable +end +``` + +Or you can manually define a `config` class method on the module: + +```ruby +module Project::PersonCleanup + module_function + + def config + true + end +end +``` + +### `Kiba::Extend` `config_namespaces` setting must be set from your project + +After your project's base file has called the project's `loader`, it +must set the `Kiba::Extend.config.config_namespaces` setting. + +This setting lists the namespace(s) where your config modules live. + +In most of my projects, all of my config modules are in one namespace. +For example, for the above project, I would add: + +```ruby +Kiba::Extend.config.config_namespaces = [Project] +``` + +Note that the +setting takes an array, so you can list multiple namespaces if you +have organized your project differently and your configs are not all +in one namespace. For example, a migration for a Tms client may have +client-specific cleanups in the client-specific migration code +project (config namespace: `TmsClientName`). That code project will +make use of the kiba-tms application, which also defines cleanup +configs in the namespace `Kiba::Tms`. Such a project would do this +at the bottom of `lib/tms_client_name.rb`: + +```ruby +Kiba::Extend.config.config_namespaces = [Kiba::Tms, TmsClientName] +``` + +### Add cleanup job registration to your `RegistryData` registration method + +Add the following to `RegistryData.register` (or whatever method +triggers the registration of all your jobs): + +```ruby +Kiba::Extend::Utils::IterativeCleanupJobRegistrar.call +``` + +This line must come before any call to `registry.transform`, +`registry.freeze`, or `registry.finalize`.
 + +### `config_namespaces` setting must be populated before `RegistryData` registration + +Calling `RegistryData.register` (or whatever method triggers the +registration of all your jobs) must be done ***after*** the +`config_namespaces` setting is populated. From b2ed77169f568b3d46482a1c0707d9ca85257711 Mon Sep 17 00:00:00 2001 From: Kristina Spurgin Date: Mon, 11 Sep 2023 11:28:36 -0400 Subject: [PATCH 4/5] noblame: Whitespace and/or formatting changes only --- lib/kiba/extend/mixins/iterative_cleanup.rb | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/lib/kiba/extend/mixins/iterative_cleanup.rb b/lib/kiba/extend/mixins/iterative_cleanup.rb index 1e0ce2724..27d5a7aa8 100644 --- a/lib/kiba/extend/mixins/iterative_cleanup.rb +++ b/lib/kiba/extend/mixins/iterative_cleanup.rb @@ -341,8 +341,8 @@ def job_name(full_job_key) def base_job_cleaned_job_hash(mod) { - path: File.join(Kiba::Extend::Mixins::IterativeCleanup.datadir(mod), "working", - "#{mod.cleanup_base_name}_base_job_cleaned.csv"), + path: File.join(Kiba::Extend::Mixins::IterativeCleanup.datadir(mod), + "working", "#{mod.cleanup_base_name}_base_job_cleaned.csv"), creator: { callee: Kiba::Extend::Mixins::IterativeCleanup::BaseJobCleaned, args: {mod: mod} @@ -355,8 +355,8 @@ def base_job_cleaned_job_hash(mod) def cleaned_uniq_job_hash(mod) { - path: File.join(Kiba::Extend::Mixins::IterativeCleanup.datadir(mod), "working", - "#{mod.cleanup_base_name}_cleaned_uniq.csv"), + path: File.join(Kiba::Extend::Mixins::IterativeCleanup.datadir(mod), + "working", "#{mod.cleanup_base_name}_cleaned_uniq.csv"), creator: { callee: Kiba::Extend::Mixins::IterativeCleanup::CleanedUniq, args: {mod: mod} @@ -368,8 +368,8 @@ def cleaned_uniq_job_hash(mod) def worksheet_job_hash(mod) { - path: File.join(Kiba::Extend::Mixins::IterativeCleanup.datadir(mod), "to_client", - "#{mod.cleanup_base_name}_worksheet.csv"), + path: File.join(Kiba::Extend::Mixins::IterativeCleanup.datadir(mod), + "to_client",
"#{mod.cleanup_base_name}_worksheet.csv"), creator: { callee: Kiba::Extend::Mixins::IterativeCleanup::Worksheet, args: {mod: mod} From b00541f370f990f019d5bfc649d7e20411cbb128 Mon Sep 17 00:00:00 2001 From: Kristina Spurgin Date: Mon, 11 Sep 2023 11:29:55 -0400 Subject: [PATCH 5/5] Update changelog --- CHANGELOG.adoc | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.adoc b/CHANGELOG.adoc index b52f10c51..7f8971bda 100644 --- a/CHANGELOG.adoc +++ b/CHANGELOG.adoc @@ -76,6 +76,7 @@ These changes are merged into the `main` branch, but have not been released. Aft * Utility classes to clean ISBD trailing punctuation from name and role term values extracted from MARC data (PR#141) * `Kiba::Extend::Job.output?` convenience method (PR#150) * Job duration report (added to normal and verbose job run) (PR#154, PR#157) +* `IterativeCleanup` mixin (PR#180) === Changed * Transforms that take an `action` argument now mix in the new `ActionArgumentable` module and validate the argument values in a consistent way (PR#138)