feat(transactions): make type agnostic - can be any type as long as consistent
JuliaHealth · Feb 28, 2019 · ac1dee6 · ac1dee6
1 parent 3721b3a
commit ac1dee6
Show file tree

Hide file tree

Showing 3 changed files with 12 additions and 15 deletions.
diff --git a/src/frequent_itemset_tree.jl b/src/frequent_itemset_tree.jl
@@ -85,7 +85,7 @@ end
 
 # This function is used internally by the frequent() function to create the
 # initial bitarrays used to represent the first "children" in the itemset tree.
-function occurrence(transactions::Array{Array{String, 1}, 1}, uniq_items::Array{String, 1})
+function occurrence(transactions::Array{Array{S, 1}, 1}, uniq_items::Array{S, 1}) where S
     n = length(transactions)
     p = length(uniq_items)
 
@@ -110,7 +110,7 @@ The tree is built recursively using calls to the growtree!() function. The
 `minsupp` and `maxdepth` parameters control the minimum support needed for an
 itemset to be called "frequent", and the max depth of the tree, respectively
 """
-function frequent_item_tree(transactions::Array{Array{String, 1}, 1}, uniq_items::Array{String, 1}, minsupp::Int, maxdepth::Int)
+function frequent_item_tree(transactions::Array{Array{S, 1}, 1}, uniq_items::Array{S, 1}, minsupp::Int, maxdepth::Int) where S
     occ = occurrence(transactions, uniq_items)
 
     # Have to initialize `itms` array like this because type inference
@@ -166,10 +166,10 @@ item sets and their support count (integer) when given and array of transactions
 basically just wraps frequent_item_tree() but gives back the plain text of the items,
 rather than their Int16 representation.
 """
-function frequent(transactions::Array{Array{String, 1}, 1}, minsupp::T, maxdepth) where T <: Real
+function frequent(transactions::Array{Array{S, 1}, 1}, minsupp::T, maxdepth) where {T <: Real, S}
     n = length(transactions)
     uniq_items = unique_items(transactions)
-    item_lkup = Dict{Int16, String}()
+    item_lkup = Dict{Int16, S}()
     for (i, itm) in enumerate(uniq_items)
         item_lkup[i] = itm
     end

diff --git a/src/rule_generation.jl b/src/rule_generation.jl
@@ -93,19 +93,19 @@ function gen_rules(root::Node, supp_dict::Dict{Array{Int16, 1}, Int}, num_transa
 end
 
 
-function rules_to_dataframe(rules::Array{Rule, 1}, item_lkup::Dict{T, String}; join_str = " | ") where T <: Integer
+function rules_to_dataframe(rules::Array{Rule, 1}, item_lkup::Dict{T, S}; join_str = " | ") where {T <: Integer, S}
     n_rules = length(rules)
     dt = DataFrame(lhs = fill("", n_rules),
                    rhs = fill("", n_rules),
                    supp = zeros(n_rules),
                    conf = zeros(n_rules),
                    lift = zeros(n_rules))
     for i = 1:n_rules
-        lhs_items = map(x -> item_lkup[x], rules[i].p)
+        lhs_items = map(x -> string.(item_lkup[x]), rules[i].p)
 
         lhs_string = "{" * join(lhs_items, join_str) * "}"
         dt[i, :lhs] = lhs_string
-        dt[i, :rhs] = item_lkup[rules[i].q]
+        dt[i, :rhs] = string.(item_lkup[rules[i].q])
         dt[i, :supp] = rules[i].supp
         dt[i, :conf] = rules[i].conf
         dt[i, :lift] = rules[i].lift
@@ -123,10 +123,10 @@ required for an itemset to be considered frequent. The `conf` argument allows us
 association rules without at least `conf` level of confidence. The `maxlen` argument stipulates
 the maximum length of an association rule (i.e., total items on left- and right-hand sides)
 """
-function apriori(transactions::Array{Array{String, 1}, 1}; supp::Float64 = 0.01, conf = 0.8, maxlen::Int = 5)
+function apriori(transactions::Array{Array{S, 1}, 1}; supp::Float64 = 0.01, conf = 0.8, maxlen::Int = 5) where S
     n = length(transactions)
     uniq_items = unique_items(transactions)
-    item_lkup = Dict{Int16, String}()
+    item_lkup = Dict{Int16, S}()
     for (i, itm) in enumerate(uniq_items)
         item_lkup[i] = itm
     end
@@ -145,7 +145,7 @@ end
 """
 apriori(occurrences, item_lkup; supp, conf, maxlen)
 
-Given an boolean occurrence matrix of transactions (rows are transactions, columns are items) and 
+Given a boolean occurrence matrix of transactions (rows are transactions, columns are items) and
 a lookup dictionary of column-index to items-string, this function runs the a-priori
 algorithm for generating frequent item sets. These frequent items are then used to generate
 association rules. The `supp` argument allows us to stipulate the minimum support

diff --git a/src/utils.jl b/src/utils.jl
@@ -1,7 +1,7 @@
 
 function shownodes(node::Node, k::Int = 0)
     if has_children(node)
-        for nd in node.children 
+        for nd in node.children
             print("k = $(k + 1): ")
             println(nd.item_ids)
         end
@@ -11,11 +11,8 @@ function shownodes(node::Node, k::Int = 0)
     end
 end
 
-
-
-
 function randstr(n::Int, len::Int = 16)
-    vals = ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", 
+    vals = ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m",
             "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z"]
     upper = map(uppercase, vals)
     append!(vals, upper)