From 86aef4320fec0d604395527eb43fcb53189029b9 Mon Sep 17 00:00:00 2001 From: "B. Zapata-Diomedi" Date: Wed, 6 Nov 2024 06:31:08 +0000 Subject: [PATCH] optimised data prep --- processing_hd.R | 86 ++++++++++++++++++++++++++++++++----------------- 1 file changed, 56 insertions(+), 30 deletions(-) diff --git a/processing_hd.R b/processing_hd.R index 3feef06..87ecc6a 100644 --- a/processing_hd.R +++ b/processing_hd.R @@ -1,39 +1,65 @@ # Load libraries library(tidyverse) -library(arrow) # Set seed for reproducibility set.seed(2024) # Read cycleIntervention health output as an example -hd <- read_csv("C:/Users/ajnab/RMIT University/JIBE working group - General/manchester/simulationResults/ForUrbanTransition/cycleIntervention/health/04_death_and_disease/pp_healthDiseaseTracker_2029.csv") +# hd <- read_csv("C:/Users/ajnab/RMIT University/JIBE working group - General/manchester/simulationResults/ForUrbanTransition/cycleIntervention/health/04_death_and_disease/pp_healthDiseaseTracker_2029.csv") + +## Belen folder +directory <- "C:/Users/mbzd2/OneDrive - RMIT University/JIBE/JIBE-WP6/healthmicrosim/" + +health_base <- read_csv(paste0(directory, "manchester/simulationResults/ForUrbanTransition/reference/health/04_death_and_disease/pp_healthDiseaseTracker_2039.csv")) # %>% +# left_join(person_base) +# health_intervention <- read_csv(paste0(directory,"manchester/simulationResults/ForUrbanTransition/cycleIntervention/health/04_death_and_disease/pp_healthDiseaseTracker_2029.csv")) + # Same for quicker run -shd <- hd |> slice_sample(n = 100) - -# Loop through each row -for (i in 1:nrow(shd)){ - # Create a long df for each row - ur <- pivot_longer(shd[i,], cols = -id) - # Get a table for each disease state with the first year when it was appeared - fv <- ur |> group_by(value) |> summarise(fy = first(name)) |> arrange(fy) - # Loop through the first value dataset with first year for the disease appearance - for (j in 1:nrow(fv)){ - # Ignore healty and null states - if (!fv$value[j] %in% c("healthy", "null")) - { - # Do not carry forward the healthy state to any other state - if (fv$value[j - 1] != "healthy") - # Join previous state with the current one with a | as a separator - fv$value[j] <- paste(fv$value[j], fv$value[j - 1], sep = "|") - } - } - # Update the long format for each ID with the new states - for (k in 1:nrow(fv)){ - if (!fv$value[k] %in% c("healthy", "null")) - ur[(ur$name >= as.numeric(fv$fy[k]) & ur$name < (as.numeric(fv$fy[k + 1]))),]$value <- fv$value[k] - } - - # Update the states - shd <- rows_update(shd, pivot_wider(ur, id_cols=id)) -} +# shd <- hd |> slice_sample(n = 100) + + + + +### Optimised version + +library(data.table) +library(dplyr) + +# Convert health_base to data.table for fast processing +health_base_dt <- as.data.table(health_base) + +# Melt (pivot_longer) the entire dataset once to create the long format +ur <- melt(health_base_dt, id.vars = "id", variable.name = "year", value.name = "state") + +# For each id, determine the first year each disease state appears +fv <- ur[!(state %in% c("healthy", "null")), .(first_year = min(as.numeric(year))), by = .(id, state)] +setorder(fv, id, first_year) + +# Concatenate previous states in a cumulative fashion within each id +fv[, cumulative_state := Reduce(function(x, y) paste(y, x, sep = "|"), state, accumulate = TRUE), by = id] + +# Join the cumulative state information back to the main long format (ur) +ur <- merge(ur, fv[, .(id, state, cumulative_state, first_year)], by = c("id", "state"), all.x = TRUE) + +# Fill cumulative states forward based on the first year appearance within each id +ur[, cumulative_state := ifelse( + as.numeric(year) >= first_year, cumulative_state, NA_character_ +), by = id] + +# Use dplyr::fill to carry forward cumulative states (filling down within each id) +ur <- ur %>% + group_by(id) %>% + arrange(id, year) %>% + mutate(final_state = cumulative_state) %>% + fill(final_state, .direction = "down") %>% + ungroup() + +# Reshape back to wide format, retaining the modified states +health_base_updated <- dcast(setDT(ur), id ~ year, value.var = "final_state") + +# Replace NA values with original "healthy" and "null" states if needed +health_base_updated[is.na(health_base_updated)] <- health_base[is.na(health_base_updated)] + +# Assign the result back to the original health_base variable if desired +health_base <- health_base_updated