Skip to content

Commit

Permalink
update data processing and dummy data
Browse files Browse the repository at this point in the history
  • Loading branch information
emprestige committed Feb 14, 2024
1 parent edd1324 commit b300964
Show file tree
Hide file tree
Showing 5 changed files with 90 additions and 16 deletions.
66 changes: 58 additions & 8 deletions analysis/data_processing.R
Original file line number Diff line number Diff line change
Expand Up @@ -11,11 +11,12 @@ fs::dir_create(here("analysis"))
#define study start date and study end date
source(here("analysis", "design", "design.R"))
args <- commandArgs(trailingOnly = TRUE)
study_start_date <- as.Date(as.numeric(study_dates[args[2]]), format = "%Y-%m-%d", origin = "1970-01-01")
study_end_date <- as.Date(as.numeric(study_dates[args[3]]), format = "%Y-%m-%d", origin = "1970-01-01")
study_start_date <- study_dates[[args[[2]]]]
study_end_date <- study_dates[[args[[3]]]]
cohort <- args[[1]]

df_input <- read_feather(
here::here("output", paste0("input_", args[[1]], "_", year(study_start_date),
here::here("output", paste0("input_", cohort, "_", year(study_start_date),
"_", year(study_end_date), ".arrow")))

#assign ethnicity group
Expand All @@ -29,22 +30,22 @@ df_input <- df_input %>%
))

#calculate age bands
if(args[[1]] == "older_adults") {
if(cohort == "older_adults") {
df_input <- df_input %>%
mutate(age_band = case_when(
df_input$age >= 65 & df_input$age <= 74 ~ "65-74y",
df_input$age >= 75 & df_input$age <= 89 ~ "75-89y",
df_input$age >= 90 ~ "90+y",
TRUE ~ NA_character_
))
} else if(args[[1]] == "adults") {
} else if(cohort == "adults") {
df_input <- df_input %>%
mutate(age_band = case_when(
df_input$age >= 18 & df_input$age <= 39 ~ "18-29y",
df_input$age >= 40 & df_input$age <= 64 ~ "40-64y",
TRUE ~ NA_character_
))
} else if(args[[1]] == "children_adults") {
} else if(cohort == "children_adults") {
df_input <- df_input %>%
mutate(age_band = case_when(
df_input$age >= 2 & df_input$age <= 5 ~ "2-5y",
Expand All @@ -67,15 +68,64 @@ df_input <- df_input %>%
#calculate IMD quintile
df_input <- df_input %>%
mutate(imd_quintile = case_when(
df_input$imd_rounded >= 0 & df_input$imd_rounded < as.integer(32800 * 1 / 5) ~ "1",
df_input$imd_rounded >= 0 & df_input$imd_rounded < as.integer(32800 * 1 / 5) ~ "1 (most deprived)",
df_input$imd_rounded < as.integer(32800 * 2 / 5) ~ "2",
df_input$imd_rounded < as.integer(32800 * 3 / 5) ~ "3",
df_input$imd_rounded < as.integer(32800 * 4 / 5) ~ "4",
df_input$imd_rounded < as.integer(32800 * 5 / 5) ~ "5 (least deprived)",
TRUE ~ NA_character_
))

#define seasons for covid
covid_season_min = as.Date("2019-09-01", format = "%Y-%m-%d")

#create variable for survival time
df_input$end_time_mild <- study_end_date
df_input$end_time_severe <- study_end_date

#calculate follow-up end date for mild outcomes
df_input <- df_input %>%
rowwise() %>%
mutate(end_time_mild = if (study_start_date >= covid_season_min) {
if (rsv_primary == TRUE) {
rsv_primary_date
} else if (covid_primary == TRUE) {
covid_primary_date
} else if (flu_primary == TRUE) {
flu_primary_date
} else {study_end_date}
} else {
if (rsv_primary == TRUE) {
rsv_primary_date
} else if (flu_primary == TRUE) {
flu_primary_date
} else {study_end_date}}
)

#calculate follow-up end date for severe outcomes
df_input <- df_input %>%
rowwise() %>%
mutate(end_time_severe = if (study_start_date >= covid_season_min) {
if (rsv_secondary == TRUE) {
rsv_secondary_date
} else if (covid_secondary == TRUE) {
covid_secondary_date
} else if (flu_secondary == TRUE) {
flu_secondary_date
} else {study_end_date}
} else {
if (rsv_secondary == TRUE) {
rsv_secondary_date
} else if (flu_secondary == TRUE) {
flu_secondary_date
} else {study_end_date}}
)

#calculate survival time for both outcomes (in weeks)
df_input$time_mild <- difftime(df_input$end_time_mild, study_start_date, df_input, "weeks")
df_input$time_severe <- difftime(df_input$end_time_severe, study_start_date, df_input, "weeks")

#write the new input file
write_feather(df_input, here::here("output",
paste0("input_processed_", args[[1]], "_", year(study_start_date),
paste0("input_processed_", cohort, "_", year(study_start_date),
"_", year(study_end_date), ".arrow")))
2 changes: 1 addition & 1 deletion analysis/dummydata/dummydata_older_adults.R
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ sim_list = lst(

#age of the patient
age = bn_node(
~ as.integer(rnormTrunc(n = ..n, mean = 60, sd = 14, min = 65)),
~ as.integer(rnormTrunc(n = ..n, mean = 60, sd = 14, min = 65, max = 120)),
missing_rate = ~ 0.001
),

Expand Down
Binary file modified analysis/dummydata/dummyextract_older_adults_2016_2017.arrow
Binary file not shown.
17 changes: 10 additions & 7 deletions analysis/report.R
Original file line number Diff line number Diff line change
Expand Up @@ -10,21 +10,24 @@ fs::dir_create(here("analysis"))
#define study start date and study end date
source(here("analysis", "design", "design.R"))
args <- commandArgs(trailingOnly = TRUE)
study_start_date <- as.Date(as.numeric(study_dates[args[2]]), format = "%Y-%m-%d", origin = "1970-01-01")
study_end_date <- as.Date(as.numeric(study_dates[args[3]]), format = "%Y-%m-%d", origin = "1970-01-01")
study_start_date <- study_dates[[args[[2]]]]
study_end_date <- study_dates[[args[[3]]]]
cohort <- args[[1]]

df_input <- read_feather(
here::here("output", paste0("input_processed_", args[[1]], "_",
year(study_start_date), "_", year(study_end_date), ".arrow")))
here::here("output", paste0("input_processed_", cohort, "_", year(study_start_date),
"_", year(study_end_date), ".arrow")))

lab <- ifelse(args[1] == "infants", "Age (Months)",
ifelse(args[1] == "infants_subgroup", "Age (Months)", "Age (Years)"))
lab <- ifelse(cohort == "infants", "Age (Months)",
ifelse(cohort == "infants_subgroup", "Age (Months)", "Age (Years)"))

plot_age <- ggplot(data = df_input, aes(age, frequency(age))) + geom_col(width = 0.9) +
xlab(lab) + ylab("Frequency")

ggsave(
plot = plot_age,
filename = paste0("descriptive_", args[[1]], "_", year(study_start_date),
filename = paste0("descriptive_", cohort, "_", year(study_start_date),
"_", year(study_end_date), ".png"), path = here::here("output"),
)


21 changes: 21 additions & 0 deletions project.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,13 @@ actions:
outputs:
highly_sensitive:
cohort: output/input_processed_adults_2016_2017.arrow

describe_dataset_adults_s1:
run: r:latest analysis/report.R adults season1_start_date season1_end_date
needs: [process_dataset_adults_s1]
outputs:
moderately_sensitive:
cohort: output/descriptive_adults_2016_2017.png

generate_dataset_children_adolescents_s1:
run: >
Expand All @@ -63,6 +70,13 @@ actions:
outputs:
highly_sensitive:
cohort: output/input_processed_children_adolescents_2016_2017.arrow

describe_dataset_children_adolescents_s1:
run: r:latest analysis/report.R children_adolescents season1_start_date season1_end_date
needs: [process_dataset_children_adolescents_s1]
outputs:
moderately_sensitive:
cohort: output/descriptive_children_adolescents_2016_2017.png

generate_dataset_infants_s1:
run: >
Expand All @@ -80,4 +94,11 @@ actions:
outputs:
highly_sensitive:
cohort: output/input_processed_infants_2016_2017.arrow

describe_dataset_infants_s1:
run: r:latest analysis/report.R infants season1_start_date season1_end_date
needs: [process_dataset_infants_s1]
outputs:
moderately_sensitive:
cohort: output/descriptive_infants_2016_2017.png

0 comments on commit b300964

Please sign in to comment.