optimised data prep

jibeproject · Nov 6, 2024 · 86aef43 · 86aef43
1 parent ad4dc10
commit 86aef43
Showing 1 changed file with 56 additions and 30 deletions.
diff --git a/processing_hd.R b/processing_hd.R
@@ -1,39 +1,65 @@
 # Load libraries
 library(tidyverse)
-library(arrow)
 
 # Set seed for reproducibility
 set.seed(2024)
 
 # Read cycleIntervention health output as an example
-hd <- read_csv("C:/Users/ajnab/RMIT University/JIBE working group - General/manchester/simulationResults/ForUrbanTransition/cycleIntervention/health/04_death_and_disease/pp_healthDiseaseTracker_2029.csv")
+# hd <- read_csv("C:/Users/ajnab/RMIT University/JIBE working group - General/manchester/simulationResults/ForUrbanTransition/cycleIntervention/health/04_death_and_disease/pp_healthDiseaseTracker_2029.csv")
+
+## Belen folder
+directory <- "C:/Users/mbzd2/OneDrive - RMIT University/JIBE/JIBE-WP6/healthmicrosim/"
+
+health_base <- read_csv(paste0(directory, "manchester/simulationResults/ForUrbanTransition/reference/health/04_death_and_disease/pp_healthDiseaseTracker_2039.csv")) # %>%
+# left_join(person_base)
+# health_intervention <- read_csv(paste0(directory,"manchester/simulationResults/ForUrbanTransition/cycleIntervention/health/04_death_and_disease/pp_healthDiseaseTracker_2029.csv"))
+
 
 # Same for quicker run
-shd <- hd |> slice_sample(n = 100)
-
-# Loop through each row
-for (i in 1:nrow(shd)){
-  # Create a long df for each row
-  ur <- pivot_longer(shd[i,], cols = -id)
-  # Get a table for each disease state with the first year when it was appeared
-  fv <- ur |> group_by(value) |> summarise(fy = first(name)) |> arrange(fy)
-  # Loop through the first value dataset with first year for the disease appearance
-  for (j in 1:nrow(fv)){
-    # Ignore healty and null states
-    if (!fv$value[j] %in% c("healthy", "null"))
-    {
-      # Do not carry forward the healthy state to any other state
-      if (fv$value[j - 1] != "healthy")
-        # Join previous state with the current one with a | as a separator
-        fv$value[j] <- paste(fv$value[j], fv$value[j - 1], sep = "|")
-    }
-  }
-  # Update the long format for each ID with the new states
-  for (k in 1:nrow(fv)){
-    if (!fv$value[k] %in% c("healthy", "null"))
-        ur[(ur$name >= as.numeric(fv$fy[k]) & ur$name < (as.numeric(fv$fy[k + 1]))),]$value <- fv$value[k]
-  }
-
-  # Update the states
-  shd <- rows_update(shd, pivot_wider(ur, id_cols=id))
-}
+# shd <- hd |> slice_sample(n = 100)
+
+
+
+
+### Optimised version
+
+library(data.table)
+library(dplyr)
+
+# Convert health_base to data.table for fast processing
+health_base_dt <- as.data.table(health_base)
+
+# Melt (pivot_longer) the entire dataset once to create the long format
+ur <- melt(health_base_dt, id.vars = "id", variable.name = "year", value.name = "state")
+
+# For each id, determine the first year each disease state appears
+fv <- ur[!(state %in% c("healthy", "null")), .(first_year = min(as.numeric(year))), by = .(id, state)]
+setorder(fv, id, first_year)
+
+# Concatenate previous states in a cumulative fashion within each id
+fv[, cumulative_state := Reduce(function(x, y) paste(y, x, sep = "|"), state, accumulate = TRUE), by = id]
+
+# Join the cumulative state information back to the main long format (ur)
+ur <- merge(ur, fv[, .(id, state, cumulative_state, first_year)], by = c("id", "state"), all.x = TRUE)
+
+# Fill cumulative states forward based on the first year appearance within each id
+ur[, cumulative_state := ifelse(
+  as.numeric(year) >= first_year, cumulative_state, NA_character_
+), by = id]
+
+# Use dplyr::fill to carry forward cumulative states (filling down within each id)
+ur <- ur %>%
+  group_by(id) %>%
+  arrange(id, year) %>%
+  mutate(final_state = cumulative_state) %>%
+  fill(final_state, .direction = "down") %>%
+  ungroup()
+
+# Reshape back to wide format, retaining the modified states
+health_base_updated <- dcast(setDT(ur), id ~ year, value.var = "final_state")
+
+# Replace NA values with original "healthy" and "null" states if needed
+health_base_updated[is.na(health_base_updated)] <- health_base[is.na(health_base_updated)]
+
+# Assign the result back to the original health_base variable if desired
+health_base <- health_base_updated