Skip to content

Commit

Permalink
update survival times
Browse files Browse the repository at this point in the history
  • Loading branch information
emprestige committed Mar 20, 2024
1 parent 5b384f8 commit 2468987
Show file tree
Hide file tree
Showing 15 changed files with 576 additions and 295 deletions.
3 changes: 3 additions & 0 deletions analysis/codelists.py
Original file line number Diff line number Diff line change
Expand Up @@ -181,6 +181,9 @@
column = "code",
)

# covid secondary - sensitive
coronavirus_unspecified = ["B972", "B342"]

# covid secondary exclusion
covid_secondary_exclusion_codelist = codelist_from_csv(
"codelists/user-emprestige-covid-19-exclusion-secondary-care-maximal-sensitivity.csv",
Expand Down
250 changes: 171 additions & 79 deletions analysis/data_processing.R
Original file line number Diff line number Diff line change
Expand Up @@ -30,16 +30,6 @@ df_input <- read_feather(
year(study_start_date), "_", year(study_end_date), "_",
codelist_type, "_", investigation_type,".arrow")))

#assign ethnicity group
df_input <- df_input %>%
mutate(
latest_ethnicity_group = ifelse(df_input$latest_ethnicity_code == "1", "White",
ifelse(df_input$latest_ethnicity_code == "2", "Mixed",
ifelse(df_input$latest_ethnicity_code == "3", "Asian or Asian British",
ifelse(df_input$latest_ethnicity_code == "4", "Black or Black British",
ifelse(df_input$latest_ethnicity_code == "5", "Other Ethnic Groups", "Unknown"))))
))

#calculate age bands
if(cohort == "older_adults") {
df_input <- df_input %>%
Expand Down Expand Up @@ -76,91 +66,193 @@ df_input <- df_input %>%
))
}

#calculate IMD quintile
#data manipulation
df_input <- df_input %>%
mutate(imd_quintile = case_when(
df_input$imd_rounded >= 0 & df_input$imd_rounded < as.integer(32800 * 1 / 5) ~ "1 (most deprived)",
df_input$imd_rounded < as.integer(32800 * 2 / 5) ~ "2",
df_input$imd_rounded < as.integer(32800 * 3 / 5) ~ "3",
df_input$imd_rounded < as.integer(32800 * 4 / 5) ~ "4",
df_input$imd_rounded < as.integer(32800 * 5 / 5) ~ "5 (least deprived)",
TRUE ~ NA_character_
))
mutate(

#assign ethnicity group
latest_ethnicity_group = case_when(
df_input$latest_ethnicity_code == "1" ~ "White",
df_input$latest_ethnicity_code == "2" ~ "Mixed",
df_input$latest_ethnicity_code == "3" ~ "Asian or Asian British",
df_input$latest_ethnicity_code == "4" ~ "Black or Black British",
df_input$latest_ethnicity_code == "5" ~ "Other Ethnic Groups",
TRUE ~ "Unknown"),

#calculate IMD quintile
imd_quintile = case_when(
df_input$imd_rounded >= 0 & df_input$imd_rounded < as.integer(32800 * 1 / 5) ~ "1 (most deprived)",
df_input$imd_rounded < as.integer(32800 * 2 / 5) ~ "2",
df_input$imd_rounded < as.integer(32800 * 3 / 5) ~ "3",
df_input$imd_rounded < as.integer(32800 * 4 / 5) ~ "4",
df_input$imd_rounded < as.integer(32800 * 5 / 5) ~ "5 (least deprived)",
TRUE ~ NA_character_
)
)

#reverse order of IMD classfications
recode(df_input$imd_quintile, "1 (most deprived)" = "5 (most deprived)",
"2" = "4", "3" = "3", "4" = "2", "5 (least deprived)" = "1 (least deprived)")

#recode rurality to 5 levels
#more data manipulation
df_input <- df_input %>%
mutate(
#recode rurality to 5 levels
rurality_code = recode(rural_urban_classification, "1" = "1", "2" = "2",
"3" = "3", "4" = "3", "5" = "4", "6" = "4",
"7" = "5", "8" = "5")
)

#assign rurality classification
df_input <- df_input %>%
mutate(
rurality_classification = ifelse(df_input$rurality_code == "1", "Urban Major Conurbation",
ifelse(df_input$rurality_code == "2", "Urban Minor Conurbation",
ifelse(df_input$rurality_code == "3", "Urban City and Town",
ifelse(df_input$rurality_code == "4", "Rural Town",
ifelse(df_input$rurality_code == "5", "Rural Village", "Unknown"))))
))

#define household size categories
df_input <- df_input %>%
mutate(
household_size_cat = ifelse(df_input$household_size >= 1 & df_input$household_size <= 2, "1",
ifelse(df_input$household_size >= 3 & household_size <= 5, "2",
ifelse(df_input$household_size >= 6, "3", "Unknown")))
"7" = "5", "8" = "5"),
#assign rurality classification
rurality_classification = case_when(
df_input$rurality_code == "1" ~ "Urban Major Conurbation",
df_input$rurality_code == "2" ~ "Urban Minor Conurbation",
df_input$rurality_code == "3" ~ "Urban City and Town",
df_input$rurality_code == "4" ~ "Rural Town and Fringe",
df_input$rurality_code == "5" ~ "Rural Village and Dispersed",
TRUE ~ "Unknown"
),
#define household size categories
household_size_cat = case_when(
df_input$household_size >= 1 & df_input$household_size <= 2 ~ "1",
df_input$household_size >= 3 & household_size <= 5 ~ "2",
df_input$household_size >= 6 ~ "3",
TRUE ~ "Unknown"
),
)

#define seasons for covid
covid_season_min = as.Date("2019-09-01", format = "%Y-%m-%d")

# #create variable for survival time
# df_input$end_time_mild <- study_end_date
# df_input$end_time_severe <- study_end_date
#
# #calculate follow-up end date for mild outcomes
# df_input <- df_input %>%
# rowwise() %>%
# mutate(end_time_mild = case_when(
# study_start_date >= covid_season_min & covid_primary ~ covid_primary_date,
# rsv_primary ~ rsv_primary_date,
# flu_primary ~ flu_primary_date,
# TRUE ~ study_end_date
# ))
#
# #calculate follow-up end date for severe outcomes
# df_input <- df_input %>%
# rowwise() %>%
# mutate(end_time_severe = case_when(
# study_start_date >= covid_season_min & covid_secondary ~ covid_secondary_date,
# rsv_secondary ~ rsv_secondary_date,
# flu_secondary ~ flu_secondary_date,
# TRUE ~ study_end_date
# ))
#define event time
if (study_start_date < covid_season_min) {
df_input <- df_input %>%
mutate(
#infer mild case date for rsv
rsv_primary_inf_date = case_when(
is.na(rsv_primary_date) & is.na(deregistration_date) & is.na(death_date) ~ study_end_date,
is.na(rsv_primary_date) & is.na(deregistration_date) & !is.na(death_date) ~ death_date,
is.na(rsv_primary_date) & !is.na(deregistration_date) ~ deregistration_date,
is.na(rsv_primary_date) & !is.na(rsv_secondary_date) ~ rsv_secondary_date,
TRUE ~ rsv_primary_date
),
#assign censoring indicator
rsv_primary_censor = case_when(
rsv_primary_inf_date == rsv_primary_date ~ 0,
rsv_primary_inf_date == rsv_secondary_date ~ 0,
TRUE ~ 1
),
#infer rsv outcome
rsv_primary_inf = ifelse(rsv_primary_censor == 0, TRUE, FALSE),
#infer mild case date for flu
flu_primary_inf_date = case_when(
is.na(flu_primary_date) & is.na(deregistration_date) & is.na(death_date) ~ study_end_date,
is.na(flu_primary_date) & is.na(deregistration_date) & !is.na(death_date) ~ death_date,
is.na(flu_primary_date) & !is.na(deregistration_date) ~ deregistration_date,
is.na(flu_primary_date) & !is.na(flu_secondary_date) ~ flu_secondary_date,
TRUE ~ flu_primary_date
),
#assign censoring indicator
flu_primary_censor = case_when(
flu_primary_inf_date == flu_primary_date ~ 0,
flu_primary_inf_date == flu_secondary_date ~ 0,
TRUE ~ 1
),
#infer flu outcome
flu_primary_inf = ifelse(flu_primary_censor == 0, TRUE, FALSE)
)
} else {
df_input <- df_input %>%
mutate(
#infer mild case date for rsv
rsv_primary_inf_date = case_when(
is.na(rsv_primary_date) & is.na(deregistration_date) & is.na(death_date) ~ study_end_date,
is.na(rsv_primary_date) & is.na(deregistration_date) & !is.na(death_date) ~ death_date,
is.na(rsv_primary_date) & !is.na(deregistration_date) ~ deregistration_date,
is.na(rsv_primary_date) & !is.na(rsv_secondary_date) ~ rsv_secondary_date,
TRUE ~ rsv_primary_date
),
#assign censoring indicator
rsv_primary_censor = case_when(
rsv_primary_inf_date == rsv_primary_date ~ 0,
rsv_primary_inf_date == rsv_secondary_date ~ 0,
TRUE ~ 1
),
#infer rsv outcome
rsv_primary_inf = ifelse(rsv_primary_censor == 0, TRUE, FALSE),
#infer mild case date for flu
flu_primary_inf_date = case_when(
is.na(flu_primary_date) & is.na(deregistration_date) & is.na(death_date) ~ study_end_date,
is.na(flu_primary_date) & is.na(deregistration_date) & !is.na(death_date) ~ death_date,
is.na(flu_primary_date) & !is.na(deregistration_date) ~ deregistration_date,
is.na(flu_primary_date) & !is.na(flu_secondary_date) ~ flu_secondary_date,
TRUE ~ flu_primary_date
),
#assign censoring indicator
flu_primary_censor = case_when(
flu_primary_inf_date == flu_primary_date ~ 0,
flu_primary_inf_date == flu_secondary_date ~ 0,
TRUE ~ 1
),
#infer flu outcome
flu_primary_inf = ifelse(flu_primary_censor == 0, TRUE, FALSE),
#infer mild case date for covid
covid_primary_inf_date = case_when(
is.na(covid_primary_date) & is.na(deregistration_date) & is.na(death_date) ~ study_end_date,
is.na(covid_primary_date) & is.na(deregistration_date) & !is.na(death_date) ~ death_date,
is.na(covid_primary_date) & !is.na(deregistration_date) ~ deregistration_date,
is.na(covid_primary_date) & !is.na(covid_secondary_date) ~ covid_secondary_date,
TRUE ~ covid_primary_date
),
#assign censoring indicator
covid_primary_censor = case_when(
covid_primary_inf_date == covid_primary_date ~ 0,
covid_primary_inf_date == covid_secondary_date ~ 0,
TRUE ~ 1
),
#infer covid outcome
covid_primary_inf = ifelse(covid_primary_censor == 0, TRUE, FALSE)
)
}

#calculate survival time for both outcomes (in years)
# df_input$time_mild <- as.numeric(difftime(df_input$end_time_mild,
# study_start_date, df_input, "weeks"))/52.25
# df_input$time_severe <- as.numeric(difftime(df_input$end_time_severe,
# study_start_date, df_input, "weeks"))/52.25
df_input$time_rsv_primary <- as.numeric(difftime(df_input$rsv_primary_date,
study_start_date, df_input, "weeks"))/52.25
df_input$time_rsv_secondary <- as.numeric(difftime(df_input$rsv_secondary_date,
study_start_date, df_input, "weeks"))/52.25
df_input$time_flu_primary <- as.numeric(difftime(df_input$flu_primary_date,
study_start_date, df_input, "weeks"))/52.25
df_input$time_flu_secondary <- as.numeric(difftime(df_input$flu_secondary_date,
study_start_date, df_input, "weeks"))/52.25
df_input$time_covid_primary <- as.numeric(difftime(df_input$covid_primary_date,
study_start_date, df_input, "weeks"))/52.25
df_input$time_covid_secondary <- as.numeric(difftime(df_input$covid_secondary_date,
study_start_date, df_input, "weeks"))/52.25
#calculate time to event
if (study_start_date < covid_season_min) {
df_input <- df_input %>%
mutate(
#time until mild RSV outcome
time_rsv_primary = as.numeric(difftime(rsv_primary_inf_date,
study_start_date, "weeks"))/52.25,
#time until severe rsv outcome
time_rsv_secondary = as.numeric(difftime(rsv_secondary_date,
study_start_date, "weeks"))/52.25,
#time until mild flu outcome
time_flu_primary = as.numeric(difftime(flu_primary_inf_date,
study_start_date, "weeks"))/52.25,
#time until severe flu outcome
time_flu_secondary = as.numeric(difftime(flu_secondary_date,
study_start_date, "weeks"))/52.25
)
} else {
df_input <- df_input %>%
mutate(
#time until mild RSV outcome
time_rsv_primary = as.numeric(difftime(rsv_primary_inf_date,
study_start_date, "weeks"))/52.25,
#time until severe rsv outcome
time_rsv_secondary = as.numeric(difftime(rsv_secondary_date,
study_start_date, "weeks"))/52.25,
#time until mild flu outcome
time_flu_primary = as.numeric(difftime(flu_primary_inf_date,
study_start_date, "weeks"))/52.25,
#time until severe flu outcome
time_flu_secondary = as.numeric(difftime(flu_secondary_date,
study_start_date, "weeks"))/52.25,
#time until mild covid outcome
time_covid_primary = as.numeric(difftime(covid_primary_inf_date,
study_start_date, "weeks"))/52.25,
#time until severe covid outcome
time_covid_secondary = as.numeric(difftime(covid_secondary_date,
study_start_date, "weeks"))/52.25
)
}

## create output directories ----
fs::dir_create(here("output", "data"))
Expand Down
Loading

0 comments on commit 2468987

Please sign in to comment.