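Fixed a bug in the factor counting: per-node factor counts are now summed across nodes instead of being overwritten by each subsequent node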

IKNL · Feb 1, 2024 · 198cdc5
1 parent a1fa288
commit 198cdc5
Show file tree

Hide file tree

Showing 2 changed files with 21 additions and 10 deletions.
diff --git a/vtg.summary/src/R/dsummary.R b/vtg.summary/src/R/dsummary.R
@@ -191,16 +191,25 @@ combine_node_statistics <- function(summary_per_node, columns) {
   }
 
   vtg::log$debug("unique_levels: {unique_levels}")
+  # Initialize global_factor_counts
+  for (column in names(all_levels)) {
+    global_factor_counts[[column]] <- list()
+    for (levels in unique_levels[[column]]) {
+      for (level in levels) {
+        global_factor_counts[[column]][[level]] <- 0
+      }
+    }
+  }
 
+  vtg::log$debug("Global levels initialized: {global_factor_counts}")
   # Compute the factor counts per column
   for (node in factor_counts_per_node) {
     for (column in factor_columns) {
       for (levels in unique_levels[[column]]) {
         for (level in levels) {
           if (level %in% names(node[[column]])) {
-            global_factor_counts[[column]][[level]] <- node[[column]][[level]]
-          } else {
-            global_factor_counts[[column]][[level]] <- 0
+            global_factor_counts[[column]][[level]] <-
+              global_factor_counts[[column]][[level]] + node[[column]][[level]]
           }
         }
       }

diff --git a/vtg.summary/src/test.R b/vtg.summary/src/test.R
@@ -5,21 +5,23 @@ devtools::load_all("./vtg.preprocessing")
 # create fake data. Three columns with random numbers, two columns with factors
 set.seed(123L);
 columns = c("A", "B", "C", "D", "E")
+l <- 250
 data <- data.frame(
-  "A" = sample(1:10, size = 1000, replace = TRUE),
-  "B" = sample(c(1:3, NA), size= 1000, replace = TRUE),
-  "C" = sample(c(6:19, NA), size= 1000, replace = TRUE),
-  "D" = sample(gl(10, 100), size = 1000, replace = TRUE),
+  "A" = sample(1:10, size = l, replace = TRUE),
+  "B" = sample(c(1:3, NA), size= l, replace = TRUE),
+  "C" = sample(c(6:19, NA), size= l, replace = TRUE),
+  "D" = sample(gl(4, 1), size = l, replace = TRUE),
   "E" = sample(as.character(c("female", "male", NA)),
-               size = 1000, replace = TRUE),
+               size = l, replace = TRUE),
   "F" = sample(as.character(c("other")),
-               size = 1000, replace = TRUE)
+               size = l, replace = TRUE)
 )
 
 
 # Split the dataframe into two sets
 n_rows <- nrow(data)
-set_size <- floor(n_rows / 2)
+set_size <- floor(l / 3)
+# set_size <- 200
 
 d1 <- data[1:set_size, ]
 d2 <- data[(set_size + 1):n_rows, ]