diff --git a/processing_hd.R b/processing_hd.R index 87ecc6a..355f514 100644 --- a/processing_hd.R +++ b/processing_hd.R @@ -1,5 +1,6 @@ # Load libraries library(tidyverse) +library(arrow) # Set seed for reproducibility set.seed(2024) @@ -18,7 +19,65 @@ health_base <- read_csv(paste0(directory, "manchester/simulationResults/ForUrban # Same for quicker run # shd <- hd |> slice_sample(n = 100) - +####### Referecen data ####### + +# Loop through each row +for (i in 1:nrow(health_base)){ + # Create a long df for each row + ur <- pivot_longer(health_base[i,], cols = -id) + # Get a table for each disease state with the first year when it was appeared + fv <- ur |> group_by(value) |> summarise(fy = first(name)) |> arrange(fy) + # Loop through the first value dataset with first year for the disease appearance + for (j in 1:nrow(fv)){ + # Ignore healty and null states + if (!fv$value[j] %in% c("healthy", "null")) + { + # Do not carry forward the healthy state to any other state + if (fv$value[j - 1] != "healthy") + # Join previous state with the current one with a | as a separator + fv$value[j] <- paste(fv$value[j], fv$value[j - 1], sep = "|") + } + } + # Update the long format for each ID with the new states + for (k in 1:nrow(fv)){ + if (!fv$value[k] %in% c("healthy", "null")) + ur[(ur$name >= as.numeric(fv$fy[k]) & ur$name < (as.numeric(fv$fy[k + 1]))),]$value <- fv$value[k] + } + + # Update the states + health_base <- rows_update(health_base, pivot_wider(ur, id_cols=id)) +} + + + +####### Scenario data ####### + +# Loop through each row +for (i in 1:nrow(health_intervention)){ + # Create a long df for each row + ur <- pivot_longer(health_intervention[i,], cols = -id) + # Get a table for each disease state with the first year when it was appeared + fv <- ur |> group_by(value) |> summarise(fy = first(name)) |> arrange(fy) + # Loop through the first value dataset with first year for the disease appearance + for (j in 1:nrow(fv)){ + # Ignore healty and null states + if (!fv$value[j] %in% c("healthy", "null")) + { + # Do not carry forward the healthy state to any other state + if (fv$value[j - 1] != "healthy") + # Join previous state with the current one with a | as a separator + fv$value[j] <- paste(fv$value[j], fv$value[j - 1], sep = "|") + } + } + # Update the long format for each ID with the new states + for (k in 1:nrow(fv)){ + if (!fv$value[k] %in% c("healthy", "null")) + ur[(ur$name >= as.numeric(fv$fy[k]) & ur$name < (as.numeric(fv$fy[k + 1]))),]$value <- fv$value[k] + } + + # Update the states + health_intervention <- rows_update(health_intervention, pivot_wider(ur, id_cols=id)) +} ### Optimised version @@ -32,34 +91,37 @@ health_base_dt <- as.data.table(health_base) # Melt (pivot_longer) the entire dataset once to create the long format ur <- melt(health_base_dt, id.vars = "id", variable.name = "year", value.name = "state") -# For each id, determine the first year each disease state appears +# For each id, determine the first year each disease state appears, excluding "healthy" and "null" fv <- ur[!(state %in% c("healthy", "null")), .(first_year = min(as.numeric(year))), by = .(id, state)] setorder(fv, id, first_year) -# Concatenate previous states in a cumulative fashion within each id +# Concatenate previous states cumulatively within each id fv[, cumulative_state := Reduce(function(x, y) paste(y, x, sep = "|"), state, accumulate = TRUE), by = id] -# Join the cumulative state information back to the main long format (ur) +# Merge cumulative states back to the main dataset ur <- merge(ur, fv[, .(id, state, cumulative_state, first_year)], by = c("id", "state"), all.x = TRUE) -# Fill cumulative states forward based on the first year appearance within each id -ur[, cumulative_state := ifelse( - as.numeric(year) >= first_year, cumulative_state, NA_character_ +# Create a final state column preserving "healthy" where applicable +ur[, final_state := ifelse( + state == "healthy" | is.na(cumulative_state), + state, # Keep "healthy" or original if cumulative_state is NA + cumulative_state # Otherwise use cumulative state ), by = id] -# Use dplyr::fill to carry forward cumulative states (filling down within each id) +# Use dplyr::fill to carry forward cumulative states within each id ur <- ur %>% group_by(id) %>% arrange(id, year) %>% - mutate(final_state = cumulative_state) %>% + mutate(final_state = ifelse(final_state == "null", NA, final_state)) %>% # Treat "null" as NA fill(final_state, .direction = "down") %>% + replace_na(list(final_state = "healthy")) %>% # Replace any remaining NA with "healthy" ungroup() # Reshape back to wide format, retaining the modified states health_base_updated <- dcast(setDT(ur), id ~ year, value.var = "final_state") -# Replace NA values with original "healthy" and "null" states if needed +# Ensure the original "healthy" and "null" states are preserved where they belong health_base_updated[is.na(health_base_updated)] <- health_base[is.na(health_base_updated)] # Assign the result back to the original health_base variable if desired -health_base <- health_base_updated +health_base_plot <- health_base_updated