Skip to content

Commit

Permalink
Made factors a seperate output, levels are optional in types
Browse files Browse the repository at this point in the history
  • Loading branch information
frankcorneliusmartin committed Jan 24, 2024
1 parent 7cecb96 commit f94e08b
Show file tree
Hide file tree
Showing 4 changed files with 66 additions and 20 deletions.
8 changes: 6 additions & 2 deletions vtg.summary/src/R/RPC_summary.R
Original file line number Diff line number Diff line change
Expand Up @@ -29,11 +29,12 @@ RPC_summary <- function(data, columns, types = NULL, subset_rules = NULL,
data <- vtg.preprocessing::extend_data(data)
}
data <- vtg.preprocessing::subset_data(data, subset_rules)

vtg::log$debug("Factorizing character data...")
data <- vtg.preprocessing::factorize(data)

# execute checks that are common to all RPCs
vtg::log$debug("Checking data...")
vtg::log$debug("Checking data & Apply types")
data <- vtg.summary::common_checks_rpc(data, columns, types)
if ("error" %in% names(data)) {
# Return error message
Expand Down Expand Up @@ -61,6 +62,8 @@ RPC_summary <- function(data, columns, types = NULL, subset_rules = NULL,

# compute data range
vtg::log$debug("Computing column ranges...")
# FIXME FM 24-01-24: in case of a factor column, the range is not computed but the
# count of each factor is returned. This is not a range
column_ranges <- get_column_ranges(data, columns)

# check if there are disclosure risks for factors in column ranges. If so,
Expand Down Expand Up @@ -90,7 +93,8 @@ RPC_summary <- function(data, columns, types = NULL, subset_rules = NULL,
"nan_count" = nan_count,
"column_lengths" = column_lengths,
"column_sums" = column_sums,
"column_ranges" = column_ranges,
"column_ranges" = column_ranges[setdiff(names(column_ranges), factor_columns)],
"factor_counts" = as.list(column_ranges[factor_columns]),
"complete_rows" = complete_rows
)
)
Expand Down
9 changes: 7 additions & 2 deletions vtg.summary/src/R/assign_types.R
Original file line number Diff line number Diff line change
Expand Up @@ -24,11 +24,16 @@ assign_types <- function(data, types) {
data[[column_name]] <- as.numeric(data[[column_name]])
} else if (type_ == "factor") {
# TODO check if this is what we want: we basically filter the data here!
data <- data[data[[column_name]] %in% specs$levels,]
data[[column_name]] <- factor(data[[column_name]], levels = specs$levels)

if (!is.null(specs$levels)) {
data <- data[data[[column_name]] %in% specs$levels,]
data[[column_name]] <- factor(data[[column_name]], levels = specs$levels)
}

if (!is.null(specs$ref)) {
data[[column_name]] <- relevel(data[[column_name]], ref = specs$ref)
}

} else {
# TODO error message, wrong type
}
Expand Down
53 changes: 43 additions & 10 deletions vtg.summary/src/R/dsummary.R
Original file line number Diff line number Diff line change
Expand Up @@ -161,18 +161,50 @@ combine_node_statistics <- function(summary_per_node, columns) {
ranges_per_node <- lapply(summary_per_node, function(results) {
results[["column_ranges"]]
})
vtg::log$debug("ranges_per_node: {ranges_per_node}")
factor_counts_per_node <- lapply(summary_per_node, function(results) {
results[["factor_counts"]]
})
vtg::log$debug("factor_counts_per_node2: {factor_counts_per_node}")
vtg::log$debug("names: {names(factor_counts_per_node)}")

global_ranges <- list()
for (column in columns) {
global_factor_counts <- list()
# for (column in columns) {
# combine ranges per column
combined_ranges <- lapply(ranges_per_node, function(node_range) {
node_range[[column]]
})
if (all(sapply(combined_ranges, class, simplify = FALSE) == "table")) {
# column is a factor, so sum the occurrences of each value
global_ranges[[column]] <- Reduce("+", combined_ranges)
} else {
# column is numeric, so the range is the range of the ranges
global_ranges[[column]] <- Reduce("range", combined_ranges)


global_ranges <- Reduce("range", ranges_per_node)

# Collect all levels from all nodes
all_levels <- list()
for (node in factor_counts_per_node) {
factor_columns <- names(node)
for (column in factor_columns) {
all_levels[[column]] <- names(node[[column]])
}
}

# Compute the unique levels per column
unique_levels <- list()
for (column in names(all_levels)) {
unique_levels[[column]] <- unique(all_levels[column])
}

vtg::log$debug("unique_levels: {unique_levels}")

# Compute the factor counts per column
for (node in factor_counts_per_node) {
for (column in factor_columns) {
for (levels in unique_levels[[column]]) {
for (level in levels) {
if (level %in% names(node[[column]])) {
global_factor_counts[[column]][[level]] <- node[[column]][[level]]
} else {
global_factor_counts[[column]][[level]] <- 0
}
}
}
}
}

Expand All @@ -199,6 +231,7 @@ combine_node_statistics <- function(summary_per_node, columns) {
"nan_count" = global_nan_count,
"length" = global_column_length,
"range" = global_ranges,
"factor_counts" = global_factor_counts,
"mean" = global_means,
"complete_rows" = global_complete_rows,
"complete_rows_per_node" = complete_rows_per_node
Expand Down
16 changes: 10 additions & 6 deletions vtg.summary/src/test.R
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,16 @@ devtools::load_all("./vtg.preprocessing")
# create fake data. Three columns with random numbers, two columns with factors
set.seed(123L);
columns = c("A", "B", "C", "D", "E")
data <- data.frame("A" = sample(1:10, size = 1000, replace = TRUE),
"B" = sample(c(1:3, NA), size= 1000, replace = TRUE),
"C" = sample(c(6:19, NA), size= 1000, replace = TRUE),
"D" = sample(gl(10, 100), size = 1000, replace = TRUE),
"E" = sample(as.character(c("female", "male", NA)),
size = 1000, replace = TRUE))
data <- data.frame(
"A" = sample(1:10, size = 1000, replace = TRUE),
"B" = sample(c(1:3, NA), size= 1000, replace = TRUE),
"C" = sample(c(6:19, NA), size= 1000, replace = TRUE),
"D" = sample(gl(10, 100), size = 1000, replace = TRUE),
"E" = sample(as.character(c("female", "male", NA)),
size = 1000, replace = TRUE),
"F" = sample(as.character(c("other")),
size = 1000, replace = TRUE)
)


# Split the dataframe into two sets
Expand Down

0 comments on commit f94e08b

Please sign in to comment.