Skip to content

Commit

Permalink
optimised data prep
Browse files Browse the repository at this point in the history
  • Loading branch information
B. Zapata-Diomedi committed Nov 6, 2024
1 parent ad4dc10 commit 86aef43
Showing 1 changed file with 56 additions and 30 deletions.
86 changes: 56 additions & 30 deletions processing_hd.R
Original file line number Diff line number Diff line change
@@ -1,39 +1,65 @@
# Load libraries
library(tidyverse)
library(arrow)

# Set seed for reproducibility
set.seed(2024)

# Read cycleIntervention health output as an example
hd <- read_csv("C:/Users/ajnab/RMIT University/JIBE working group - General/manchester/simulationResults/ForUrbanTransition/cycleIntervention/health/04_death_and_disease/pp_healthDiseaseTracker_2029.csv")
# hd <- read_csv("C:/Users/ajnab/RMIT University/JIBE working group - General/manchester/simulationResults/ForUrbanTransition/cycleIntervention/health/04_death_and_disease/pp_healthDiseaseTracker_2029.csv")

## Belen folder
directory <- "C:/Users/mbzd2/OneDrive - RMIT University/JIBE/JIBE-WP6/healthmicrosim/"

health_base <- read_csv(paste0(directory, "manchester/simulationResults/ForUrbanTransition/reference/health/04_death_and_disease/pp_healthDiseaseTracker_2039.csv")) # %>%
# left_join(person_base)
# health_intervention <- read_csv(paste0(directory,"manchester/simulationResults/ForUrbanTransition/cycleIntervention/health/04_death_and_disease/pp_healthDiseaseTracker_2029.csv"))


# Same for quicker run
shd <- hd |> slice_sample(n = 100)

# Loop through each row
for (i in 1:nrow(shd)){
# Create a long df for each row
ur <- pivot_longer(shd[i,], cols = -id)
# Get a table for each disease state with the first year when it was appeared
fv <- ur |> group_by(value) |> summarise(fy = first(name)) |> arrange(fy)
# Loop through the first value dataset with first year for the disease appearance
for (j in 1:nrow(fv)){
# Ignore healty and null states
if (!fv$value[j] %in% c("healthy", "null"))
{
# Do not carry forward the healthy state to any other state
if (fv$value[j - 1] != "healthy")
# Join previous state with the current one with a | as a separator
fv$value[j] <- paste(fv$value[j], fv$value[j - 1], sep = "|")
}
}
# Update the long format for each ID with the new states
for (k in 1:nrow(fv)){
if (!fv$value[k] %in% c("healthy", "null"))
ur[(ur$name >= as.numeric(fv$fy[k]) & ur$name < (as.numeric(fv$fy[k + 1]))),]$value <- fv$value[k]
}

# Update the states
shd <- rows_update(shd, pivot_wider(ur, id_cols=id))
}
# shd <- hd |> slice_sample(n = 100)




### Optimised version

library(data.table)
library(dplyr)

# Convert health_base to data.table for fast processing
health_base_dt <- as.data.table(health_base)

# Melt (pivot_longer) the entire dataset once to create the long format
ur <- melt(health_base_dt, id.vars = "id", variable.name = "year", value.name = "state")

# For each id, determine the first year each disease state appears
fv <- ur[!(state %in% c("healthy", "null")), .(first_year = min(as.numeric(year))), by = .(id, state)]
setorder(fv, id, first_year)

# Concatenate previous states in a cumulative fashion within each id
fv[, cumulative_state := Reduce(function(x, y) paste(y, x, sep = "|"), state, accumulate = TRUE), by = id]

# Join the cumulative state information back to the main long format (ur)
ur <- merge(ur, fv[, .(id, state, cumulative_state, first_year)], by = c("id", "state"), all.x = TRUE)

# Fill cumulative states forward based on the first year appearance within each id
ur[, cumulative_state := ifelse(
as.numeric(year) >= first_year, cumulative_state, NA_character_
), by = id]

# Use dplyr::fill to carry forward cumulative states (filling down within each id)
ur <- ur %>%
group_by(id) %>%
arrange(id, year) %>%
mutate(final_state = cumulative_state) %>%
fill(final_state, .direction = "down") %>%
ungroup()

# Reshape back to wide format, retaining the modified states
health_base_updated <- dcast(setDT(ur), id ~ year, value.var = "final_state")

# Replace NA values with original "healthy" and "null" states if needed
health_base_updated[is.na(health_base_updated)] <- health_base[is.na(health_base_updated)]

# Assign the result back to the original health_base variable if desired
health_base <- health_base_updated

0 comments on commit 86aef43

Please sign in to comment.