diff --git a/analysis/codelists.py b/analysis/codelists.py index 8b75590..bd54813 100644 --- a/analysis/codelists.py +++ b/analysis/codelists.py @@ -181,6 +181,9 @@ column = "code", ) +# covid secondary - sensitive +coronavirus_unspecified = ["B972", "B342"] + # covid secondary exclusion covid_secondary_exclusion_codelist = codelist_from_csv( "codelists/user-emprestige-covid-19-exclusion-secondary-care-maximal-sensitivity.csv", diff --git a/analysis/data_processing.R b/analysis/data_processing.R index 2135946..d2aae09 100644 --- a/analysis/data_processing.R +++ b/analysis/data_processing.R @@ -30,16 +30,6 @@ df_input <- read_feather( year(study_start_date), "_", year(study_end_date), "_", codelist_type, "_", investigation_type,".arrow"))) -#assign ethnicity group -df_input <- df_input %>% - mutate( - latest_ethnicity_group = ifelse(df_input$latest_ethnicity_code == "1", "White", - ifelse(df_input$latest_ethnicity_code == "2", "Mixed", - ifelse(df_input$latest_ethnicity_code == "3", "Asian or Asian British", - ifelse(df_input$latest_ethnicity_code == "4", "Black or Black British", - ifelse(df_input$latest_ethnicity_code == "5", "Other Ethnic Groups", "Unknown")))) - )) - #calculate age bands if(cohort == "older_adults") { df_input <- df_input %>% @@ -76,91 +66,193 @@ df_input <- df_input %>% )) } -#calculate IMD quintile +#data manipulation df_input <- df_input %>% - mutate(imd_quintile = case_when( - df_input$imd_rounded >= 0 & df_input$imd_rounded < as.integer(32800 * 1 / 5) ~ "1 (most deprived)", - df_input$imd_rounded < as.integer(32800 * 2 / 5) ~ "2", - df_input$imd_rounded < as.integer(32800 * 3 / 5) ~ "3", - df_input$imd_rounded < as.integer(32800 * 4 / 5) ~ "4", - df_input$imd_rounded < as.integer(32800 * 5 / 5) ~ "5 (least deprived)", - TRUE ~ NA_character_ - )) + mutate( + + #assign ethnicity group + latest_ethnicity_group = case_when( + df_input$latest_ethnicity_code == "1" ~ "White", + df_input$latest_ethnicity_code == "2" ~ "Mixed", + 
df_input$latest_ethnicity_code == "3" ~ "Asian or Asian British", + df_input$latest_ethnicity_code == "4" ~ "Black or Black British", + df_input$latest_ethnicity_code == "5" ~ "Other Ethnic Groups", + TRUE ~ "Unknown"), + + #calculate IMD quintile + imd_quintile = case_when( + df_input$imd_rounded >= 0 & df_input$imd_rounded < as.integer(32800 * 1 / 5) ~ "1 (most deprived)", + df_input$imd_rounded < as.integer(32800 * 2 / 5) ~ "2", + df_input$imd_rounded < as.integer(32800 * 3 / 5) ~ "3", + df_input$imd_rounded < as.integer(32800 * 4 / 5) ~ "4", + df_input$imd_rounded < as.integer(32800 * 5 / 5) ~ "5 (least deprived)", + TRUE ~ NA_character_ + ) + ) #reverse order of IMD classfications recode(df_input$imd_quintile, "1 (most deprived)" = "5 (most deprived)", "2" = "4", "3" = "3", "4" = "2", "5 (least deprived)" = "1 (least deprived)") -#recode rurality to 5 levels +#more data manipulation df_input <- df_input %>% mutate( + #recode rurality to 5 levels rurality_code = recode(rural_urban_classification, "1" = "1", "2" = "2", "3" = "3", "4" = "3", "5" = "4", "6" = "4", - "7" = "5", "8" = "5") - ) - -#assign rurality classification -df_input <- df_input %>% - mutate( - rurality_classification = ifelse(df_input$rurality_code == "1", "Urban Major Conurbation", - ifelse(df_input$rurality_code == "2", "Urban Minor Conurbation", - ifelse(df_input$rurality_code == "3", "Urban City and Town", - ifelse(df_input$rurality_code == "4", "Rural Town", - ifelse(df_input$rurality_code == "5", "Rural Village", "Unknown")))) - )) - -#define household size categories -df_input <- df_input %>% - mutate( - household_size_cat = ifelse(df_input$household_size >= 1 & df_input$household_size <= 2, "1", - ifelse(df_input$household_size >= 3 & household_size <= 5, "2", - ifelse(df_input$household_size >= 6, "3", "Unknown"))) + "7" = "5", "8" = "5"), + #assign rurality classification (rurality_code is referenced bare because it is created earlier in this same mutate) + rurality_classification = case_when( + rurality_code == "1" ~ "Urban Major Conurbation", +
rurality_code == "2" ~ "Urban Minor Conurbation", + rurality_code == "3" ~ "Urban City and Town", + rurality_code == "4" ~ "Rural Town and Fringe", + rurality_code == "5" ~ "Rural Village and Dispersed", + TRUE ~ "Unknown" + ), + #define household size categories + household_size_cat = case_when( + df_input$household_size >= 1 & df_input$household_size <= 2 ~ "1", + df_input$household_size >= 3 & household_size <= 5 ~ "2", + df_input$household_size >= 6 ~ "3", + TRUE ~ "Unknown" + ), ) #define seasons for covid covid_season_min = as.Date("2019-09-01", format = "%Y-%m-%d") -# #create variable for survival time -# df_input$end_time_mild <- study_end_date -# df_input$end_time_severe <- study_end_date -# -# #calculate follow-up end date for mild outcomes -# df_input <- df_input %>% -# rowwise() %>% -# mutate(end_time_mild = case_when( -# study_start_date >= covid_season_min & covid_primary ~ covid_primary_date, -# rsv_primary ~ rsv_primary_date, -# flu_primary ~ flu_primary_date, -# TRUE ~ study_end_date -# )) -# -# #calculate follow-up end date for severe outcomes -# df_input <- df_input %>% -# rowwise() %>% -# mutate(end_time_severe = case_when( -# study_start_date >= covid_season_min & covid_secondary ~ covid_secondary_date, -# rsv_secondary ~ rsv_secondary_date, -# flu_secondary ~ flu_secondary_date, -# TRUE ~ study_end_date -# )) +#define event time +if (study_start_date < covid_season_min) { + df_input <- df_input %>% + mutate( + #infer mild case date for rsv (secondary care date takes priority over censoring dates) + rsv_primary_inf_date = case_when( + is.na(rsv_primary_date) & !is.na(rsv_secondary_date) ~ rsv_secondary_date, + is.na(rsv_primary_date) & is.na(deregistration_date) & is.na(death_date) ~ study_end_date, + is.na(rsv_primary_date) & is.na(deregistration_date) & !is.na(death_date) ~ death_date, + is.na(rsv_primary_date) & !is.na(deregistration_date) ~ deregistration_date, + TRUE ~ rsv_primary_date + ), + #assign censoring indicator + rsv_primary_censor = case_when( +
rsv_primary_inf_date == rsv_primary_date ~ 0, + rsv_primary_inf_date == rsv_secondary_date ~ 0, + TRUE ~ 1 + ), + #infer rsv outcome + rsv_primary_inf = ifelse(rsv_primary_censor == 0, TRUE, FALSE), + #infer mild case date for flu (secondary care date takes priority over censoring dates) + flu_primary_inf_date = case_when( + is.na(flu_primary_date) & !is.na(flu_secondary_date) ~ flu_secondary_date, + is.na(flu_primary_date) & is.na(deregistration_date) & is.na(death_date) ~ study_end_date, + is.na(flu_primary_date) & is.na(deregistration_date) & !is.na(death_date) ~ death_date, + is.na(flu_primary_date) & !is.na(deregistration_date) ~ deregistration_date, + TRUE ~ flu_primary_date + ), + #assign censoring indicator + flu_primary_censor = case_when( + flu_primary_inf_date == flu_primary_date ~ 0, + flu_primary_inf_date == flu_secondary_date ~ 0, + TRUE ~ 1 + ), + #infer flu outcome + flu_primary_inf = ifelse(flu_primary_censor == 0, TRUE, FALSE) + ) +} else { + df_input <- df_input %>% + mutate( + #infer mild case date for rsv (secondary care date takes priority over censoring dates) + rsv_primary_inf_date = case_when( + is.na(rsv_primary_date) & !is.na(rsv_secondary_date) ~ rsv_secondary_date, + is.na(rsv_primary_date) & is.na(deregistration_date) & is.na(death_date) ~ study_end_date, + is.na(rsv_primary_date) & is.na(deregistration_date) & !is.na(death_date) ~ death_date, + is.na(rsv_primary_date) & !is.na(deregistration_date) ~ deregistration_date, + TRUE ~ rsv_primary_date + ), + #assign censoring indicator + rsv_primary_censor = case_when( + rsv_primary_inf_date == rsv_primary_date ~ 0, + rsv_primary_inf_date == rsv_secondary_date ~ 0, + TRUE ~ 1 + ), + #infer rsv outcome + rsv_primary_inf = ifelse(rsv_primary_censor == 0, TRUE, FALSE), + #infer mild case date for flu (secondary care date takes priority over censoring dates) + flu_primary_inf_date = case_when( + is.na(flu_primary_date) & !is.na(flu_secondary_date) ~ flu_secondary_date, + is.na(flu_primary_date) & is.na(deregistration_date) & is.na(death_date) ~ study_end_date, + is.na(flu_primary_date) & is.na(deregistration_date) & !is.na(death_date) ~ death_date, +
is.na(flu_primary_date) & !is.na(deregistration_date) ~ deregistration_date, + TRUE ~ flu_primary_date + ), + #assign censoring indicator + flu_primary_censor = case_when( + flu_primary_inf_date == flu_primary_date ~ 0, + flu_primary_inf_date == flu_secondary_date ~ 0, + TRUE ~ 1 + ), + #infer flu outcome + flu_primary_inf = ifelse(flu_primary_censor == 0, TRUE, FALSE), + #infer mild case date for covid (secondary care date takes priority over censoring dates) + covid_primary_inf_date = case_when( + is.na(covid_primary_date) & !is.na(covid_secondary_date) ~ covid_secondary_date, + is.na(covid_primary_date) & is.na(deregistration_date) & is.na(death_date) ~ study_end_date, + is.na(covid_primary_date) & is.na(deregistration_date) & !is.na(death_date) ~ death_date, + is.na(covid_primary_date) & !is.na(deregistration_date) ~ deregistration_date, + TRUE ~ covid_primary_date + ), + #assign censoring indicator + covid_primary_censor = case_when( + covid_primary_inf_date == covid_primary_date ~ 0, + covid_primary_inf_date == covid_secondary_date ~ 0, + TRUE ~ 1 + ), + #infer covid outcome + covid_primary_inf = ifelse(covid_primary_censor == 0, TRUE, FALSE) + ) +} -#calculate survival time for both outcomes (in years) -# df_input$time_mild <- as.numeric(difftime(df_input$end_time_mild, -# study_start_date, df_input, "weeks"))/52.25 -# df_input$time_severe <- as.numeric(difftime(df_input$end_time_severe, -# study_start_date, df_input, "weeks"))/52.25 -df_input$time_rsv_primary <- as.numeric(difftime(df_input$rsv_primary_date, - study_start_date, df_input, "weeks"))/52.25 -df_input$time_rsv_secondary <- as.numeric(difftime(df_input$rsv_secondary_date, - study_start_date, df_input, "weeks"))/52.25 -df_input$time_flu_primary <- as.numeric(difftime(df_input$flu_primary_date, - study_start_date, df_input, "weeks"))/52.25 -df_input$time_flu_secondary <- as.numeric(difftime(df_input$flu_secondary_date, - study_start_date, df_input, "weeks"))/52.25 -df_input$time_covid_primary <- as.numeric(difftime(df_input$covid_primary_date, -
study_start_date, df_input, "weeks"))/52.25 -df_input$time_covid_secondary <- as.numeric(difftime(df_input$covid_secondary_date, - study_start_date, df_input, "weeks"))/52.25 +#calculate time to event in years (weeks / 52.25); units is passed by name because the third positional argument of difftime is tz +if (study_start_date < covid_season_min) { + df_input <- df_input %>% + mutate( + #time until mild RSV outcome + time_rsv_primary = as.numeric(difftime(rsv_primary_inf_date, + study_start_date, units = "weeks"))/52.25, + #time until severe rsv outcome + time_rsv_secondary = as.numeric(difftime(rsv_secondary_date, + study_start_date, units = "weeks"))/52.25, + #time until mild flu outcome + time_flu_primary = as.numeric(difftime(flu_primary_inf_date, + study_start_date, units = "weeks"))/52.25, + #time until severe flu outcome + time_flu_secondary = as.numeric(difftime(flu_secondary_date, + study_start_date, units = "weeks"))/52.25 + ) +} else { + df_input <- df_input %>% + mutate( + #time until mild RSV outcome + time_rsv_primary = as.numeric(difftime(rsv_primary_inf_date, + study_start_date, units = "weeks"))/52.25, + #time until severe rsv outcome + time_rsv_secondary = as.numeric(difftime(rsv_secondary_date, + study_start_date, units = "weeks"))/52.25, + #time until mild flu outcome + time_flu_primary = as.numeric(difftime(flu_primary_inf_date, + study_start_date, units = "weeks"))/52.25, + #time until severe flu outcome + time_flu_secondary = as.numeric(difftime(flu_secondary_date, + study_start_date, units = "weeks"))/52.25, + #time until mild covid outcome + time_covid_primary = as.numeric(difftime(covid_primary_inf_date, + study_start_date, units = "weeks"))/52.25, + #time until severe covid outcome + time_covid_secondary = as.numeric(difftime(covid_secondary_date, + study_start_date, units = "weeks"))/52.25 + ) +} ## create output directories ---- fs::dir_create(here("output", "data")) diff --git a/analysis/dataset_definition.py b/analysis/dataset_definition.py index 52294ab..024e610 100644 --- a/analysis/dataset_definition.py +++ b/analysis/dataset_definition.py @@ -195,13 +195,13 @@ def first_infection_event(codelist, where = True): )
#care home resident -if cohort == "older_adults" : - care_home_tpp = ( - addresses.for_patient_on(index_date) - .care_home_is_potential_match.when_null_then(False) - ) - care_home_code = (has_prior_event(codelists.carehome_codelist)) - care_home = care_home_tpp | care_home_code +#if cohort == "older_adults" : +care_home_tpp = ( + addresses.for_patient_on(index_date) + .care_home_is_potential_match.when_null_then(False) +) +care_home_code = (has_prior_event(codelists.carehome_codelist)) +care_home = care_home_tpp | care_home_code #define population if cohort == "infants" or cohort == "infants_subgroup" : @@ -212,6 +212,7 @@ def first_infection_event(codelist, where = True): & has_imd & (~severe_immunodeficiency) & (~risk_group_infants) + & (~care_home) ) if cohort == "older_adults" : @@ -229,6 +230,7 @@ def first_infection_event(codelist, where = True): & is_female_or_male & is_appropriate_age & has_imd + & (~care_home) ) #registration, sex and age @@ -265,12 +267,27 @@ def first_infection_event(codelist, where = True): dataset.household_pseudo_id = household_memberships_2020.household_pseudo_id dataset.household_size = household_memberships_2020.household_size -# #get patients practice's pseudonymised identifier -# dataset.practice = practice_registrations.for_patient_on(index_date).practice_pseudo_id +#get patients practice's pseudonymised identifier +dataset.practice_pseudo_id = ( + (practice_registrations.for_patient_on(index_date)) + .practice_pseudo_id +) #practice and patient information -dataset.region = (practice_registrations.for_patient_on(index_date)).practice_nuts1_region_name -dataset.stp = (practice_registrations.for_patient_on(index_date)).practice_stp +dataset.region = ( + (practice_registrations.for_patient_on(index_date)) + .practice_nuts1_region_name +) +dataset.stp = ( + (practice_registrations.for_patient_on(index_date)) + .practice_stp +) + +#date deregistered from practice +dataset.deregistration_date = ( + (practice_registrations + 
.for_patient_on(index_date)).end_date +) ##comorbidities @@ -396,12 +413,11 @@ def first_infection_event(codelist, where = True): #occurance of event in exclusion list within one month of secondary care diagnosis rsv_secondary_sens_date = ( - apcs.sort_by(apcs.admission_date) - .where( - ((hospitalisation_diagnosis_matches(codelists + apcs.sort_by(apcs.admission_date).where( + (hospitalisation_diagnosis_matches(codelists .rsv_secondary_codelist).exists_for_patient()) |(hospitalisation_diagnosis_matches(codelists - .unspecified_lrti).exists_for_patient()))) + .unspecified_lrti).exists_for_patient())) .first_for_patient().admission_date ) rsv_exclusion_secondary = (case( @@ -571,218 +587,315 @@ def first_infection_event(codelist, where = True): ##outcomes - covid -#count number of clinical codes in covid symptom list -covid_code_number = ( - (clinical_events.where(clinical_events - .date.is_on_or_between(first_infection_event(codelists - .covid_sensitive_codelist).date, first_infection_event(codelists - .covid_sensitive_codelist).date + days(14))) - .where(clinical_events.snomedct_code - .is_in(codelists.covid_sensitive_codelist))) - .snomedct_code.count_distinct_for_patient() -) - -#date of first occurance of two of the above codes within 2 weeks -covid_codes_date = ( - case(when(covid_code_number > 1) - .then(first_infection_event(codelists - .covid_sensitive_codelist).date)) -) - -#occurance of event in exclusion list within one month of covid_codes_date -covid_exclusion_primary = (case( - when((first_infection_event(codelists.covid_primary_exclusion_codelist) - .date.is_on_or_between(covid_codes_date - days(30), covid_codes_date + days(30))) - |(medications.where(medications.dmd_code - .is_in(codelists.covid_prescriptions_codelist)).date.minimum_for_patient() - .is_on_or_between(covid_codes_date - days(30), covid_codes_date + days(30)))) - .then(has_infection_event(codelists.covid_primary_exclusion_codelist)) -)) - -#covid primary care -if codelist_type == 
"specific" : - dataset.covid_primary = has_infection_event(codelists.covid_primary_codelist) - -if codelist_type == "sensitive" : - dataset.covid_primary = ( - (has_infection_event(codelists.covid_primary_codelist)) - |(covid_code_number >1)|(medications.where(medications.dmd_code - .is_in(codelists.covid_prescriptions_codelist)).exists_for_patient()) - &(~covid_exclusion_primary) - ) +if datetime.strptime(study_start_date, "%Y-%m-%d") >= covid_season_min : -#covid primary care date -if codelist_type == "specific" : - dataset.covid_primary_date = ( - first_infection_event(codelists - .covid_primary_codelist).date + #count number of clinical codes in covid symptom list + covid_code_number = ( + (clinical_events.where(clinical_events + .date.is_on_or_between(first_infection_event(codelists + .covid_sensitive_codelist).date, first_infection_event(codelists + .covid_sensitive_codelist).date + days(14))) + .where(clinical_events.snomedct_code + .is_in(codelists.covid_sensitive_codelist))) + .snomedct_code.count_distinct_for_patient() ) - -if codelist_type == "sensitive" : - dataset.covid_primary_date = (case( - when(~covid_exclusion_primary).then( - minimum_of((first_infection_event(codelists - .covid_primary_codelist).date), - (covid_codes_date),(medications.where(medications.dmd_code - .is_in(codelists.covid_prescriptions_codelist)) - .date.minimum_for_patient())))) + + #date of first occurance of two of the above codes within 2 weeks + covid_codes_date = ( + case(when(covid_code_number > 1) + .then(first_infection_event(codelists + .covid_sensitive_codelist).date)) ) - -#occurance of event in exclusion list within one month of secondary care diagnosis -covid_secondary_sens_date = ( - apcs.sort_by(apcs.admission_date) - .where( - ((hospitalisation_diagnosis_matches(codelists - .covid_secondary_codelist).exists_for_patient()))) - .first_for_patient().admission_date -) -covid_exclusion_secondary = (case( - when((hospitalisation_diagnosis_matches(codelists - 
.covid_secondary_exclusion_codelist)) - .admission_date.minimum_for_patient() - .is_on_or_between(covid_secondary_sens_date - days(30), - covid_secondary_sens_date + days(30))) - .then((hospitalisation_diagnosis_matches(codelists - .covid_secondary_exclusion_codelist)) - .exists_for_patient()) -)) - -#covid secondary care -if codelist_type == "specific" : - dataset.covid_secondary = ( - apcs.where(apcs.primary_diagnosis - .is_in(codelists.covid_secondary_codelist)) - .exists_for_patient() - |apcs.where(apcs.secondary_diagnosis - .is_in(codelists.covid_secondary_codelist)) - .exists_for_patient() + + #occurance of event in exclusion list within one month of covid_codes_date + covid_exclusion_primary = (case( + when((first_infection_event(codelists.covid_primary_exclusion_codelist) + .date.is_on_or_between(covid_codes_date - days(30), covid_codes_date + days(30))) + |(medications.where(medications.dmd_code + .is_in(codelists.covid_prescriptions_codelist)).date.minimum_for_patient() + .is_on_or_between(covid_codes_date - days(30), covid_codes_date + days(30)))) + .then(has_infection_event(codelists.covid_primary_exclusion_codelist)) + )) + + #covid primary care + if codelist_type == "specific" : + dataset.covid_primary = has_infection_event(codelists.covid_primary_codelist) + + if codelist_type == "sensitive" : + dataset.covid_primary = ( + (has_infection_event(codelists.covid_primary_codelist)) + |(covid_code_number >1)|(medications.where(medications.dmd_code + .is_in(codelists.covid_prescriptions_codelist)).exists_for_patient()) + &(~covid_exclusion_primary) + ) + + #covid primary care date + if codelist_type == "specific" : + dataset.covid_primary_date = ( + first_infection_event(codelists + .covid_primary_codelist).date + ) + + if codelist_type == "sensitive" : + dataset.covid_primary_date = (case( + when(~covid_exclusion_primary).then( + minimum_of((first_infection_event(codelists + .covid_primary_codelist).date), + 
(covid_codes_date),(medications.where(medications.dmd_code + .is_in(codelists.covid_prescriptions_codelist)) + .date.minimum_for_patient())))) + ) + + #occurance of event in exclusion list within one month of secondary care diagnosis + covid_secondary_sens_date = ( + apcs.sort_by(apcs.admission_date).where( + (hospitalisation_diagnosis_matches(codelists + .covid_secondary_codelist).exists_for_patient()) + |(hospitalisation_diagnosis_matches(codelists + .coronavirus_unspecified).exists_for_patient())) + .first_for_patient().admission_date ) -if codelist_type == "sensitive" : - dataset.covid_secondary = ( - (hospitalisation_diagnosis_matches(codelists - .covid_secondary_codelist) + covid_exclusion_secondary = (case( + when((hospitalisation_diagnosis_matches(codelists + .covid_secondary_exclusion_codelist)) + .admission_date.minimum_for_patient() + .is_on_or_between(covid_secondary_sens_date - days(30), + covid_secondary_sens_date + days(30))) + .then((hospitalisation_diagnosis_matches(codelists + .covid_secondary_exclusion_codelist)) .exists_for_patient()) - &(~covid_exclusion_secondary) - ) + )) -#covid secondary care date -if codelist_type == "specific" : - dataset.covid_secondary_date = ( - apcs.sort_by(apcs.admission_date) - .where(apcs.primary_diagnosis - .is_in(codelists.covid_secondary_codelist) - |apcs.secondary_diagnosis - .is_in(codelists.covid_secondary_codelist)) - .first_for_patient() - .admission_date - ) -if codelist_type == "sensitive" : - dataset.covid_secondary_date = (case( - when(~covid_exclusion_secondary) - .then(covid_secondary_sens_date)) - ) + #covid secondary care + if codelist_type == "specific" : + dataset.covid_secondary = ( + apcs.where(apcs.primary_diagnosis + .is_in(codelists.covid_secondary_codelist)) + .exists_for_patient() + |apcs.where(apcs.secondary_diagnosis + .is_in(codelists.covid_secondary_codelist)) + .exists_for_patient() + ) + if codelist_type == "sensitive" : + dataset.covid_secondary = ( + 
(hospitalisation_diagnosis_matches(codelists + .covid_secondary_codelist) + .exists_for_patient()) + |(hospitalisation_diagnosis_matches(codelists + .coronavirus_unspecified).exists_for_patient()) + &(~covid_exclusion_secondary) + ) + + #covid secondary care date + if codelist_type == "specific" : + dataset.covid_secondary_date = ( + apcs.sort_by(apcs.admission_date) + .where(apcs.primary_diagnosis + .is_in(codelists.covid_secondary_codelist) + |apcs.secondary_diagnosis + .is_in(codelists.covid_secondary_codelist)) + .first_for_patient() + .admission_date + ) + if codelist_type == "sensitive" : + dataset.covid_secondary_date = (case( + when(~covid_exclusion_secondary) + .then(covid_secondary_sens_date)) + ) ##outcomes - unspecified respiratory infection #unspecified respiratory virus primary care if codelist_type == "sensitive" : if cohort == "older_adults" : - dataset.overall_resp_primary = ( - (dataset.rsv_primary)|(dataset.flu_primary) - |(dataset.covid_primary)|(has_infection_event(codelists. - respiratory_virus_primary_codelist)) - |(emergency_care_diagnosis_matches(codelists. - rtri_attendance).exists_for_patient()) - |(emergency_care_diagnosis_matches(codelists. - copd_exacerbation_attendance).exists_for_patient()) - |(has_infection_event(codelists - .copd_exacerbation_primary_codelist)) - |(has_infection_event(codelists - .asthma_exacerbation_primary_codelist)) - ) + if datetime.strptime(study_start_date, "%Y-%m-%d") >= covid_season_min : + dataset.overall_resp_primary = ( + (dataset.rsv_primary)|(dataset.flu_primary)|(dataset.covid_primary) + |(has_infection_event(codelists. + respiratory_virus_primary_codelist)) + |(emergency_care_diagnosis_matches(codelists. + rtri_attendance).exists_for_patient()) + |(emergency_care_diagnosis_matches(codelists. 
+ copd_exacerbation_attendance).exists_for_patient()) + |(has_infection_event(codelists + .copd_exacerbation_primary_codelist)) + |(has_infection_event(codelists + .asthma_exacerbation_primary_codelist)) + ) + else: + dataset.overall_resp_primary = ( + (dataset.rsv_primary)|(dataset.flu_primary) + |(has_infection_event(codelists. + respiratory_virus_primary_codelist)) + |(emergency_care_diagnosis_matches(codelists. + rtri_attendance).exists_for_patient()) + |(emergency_care_diagnosis_matches(codelists. + copd_exacerbation_attendance).exists_for_patient()) + |(has_infection_event(codelists + .copd_exacerbation_primary_codelist)) + |(has_infection_event(codelists + .asthma_exacerbation_primary_codelist)) + ) else: - dataset.overall_resp_primary = ( - (dataset.rsv_primary)|(dataset.flu_primary) - |(dataset.covid_primary)|(has_infection_event(codelists. - respiratory_virus_primary_codelist)) - |(emergency_care_diagnosis_matches(codelists. - rtri_attendance).exists_for_patient()) - ) + if datetime.strptime(study_start_date, "%Y-%m-%d") >= covid_season_min : + dataset.overall_resp_primary = ( + (dataset.rsv_primary)|(dataset.flu_primary)|(dataset.covid_primary) + |(has_infection_event(codelists. + respiratory_virus_primary_codelist)) + |(emergency_care_diagnosis_matches(codelists. + rtri_attendance).exists_for_patient()) + ) + else: + dataset.overall_resp_primary = ( + (dataset.rsv_primary)|(dataset.flu_primary) + |(has_infection_event(codelists. + respiratory_virus_primary_codelist)) + |(emergency_care_diagnosis_matches(codelists. + rtri_attendance).exists_for_patient()) + ) #unspecified respiratory virus primary care date if codelist_type == "sensitive" : if cohort == "older_adults" : - dataset.overall_resp_primary_date = ( - minimum_of(dataset.rsv_primary_date, dataset.flu_primary_date, - dataset.covid_primary_date, first_infection_event(codelists. - respiratory_virus_primary_codelist).date, - emergency_care_diagnosis_matches(codelists. 
- rtri_attendance).arrival_date.minimum_for_patient(), - emergency_care_diagnosis_matches(codelists. - copd_exacerbation_attendance) - .arrival_date.minimum_for_patient(), - first_infection_event(codelists - .copd_exacerbation_primary_codelist).date, - first_infection_event(codelists - .asthma_exacerbation_primary_codelist).date) - ) + if datetime.strptime(study_start_date, "%Y-%m-%d") >= covid_season_min : + dataset.overall_resp_primary_date = ( + minimum_of(dataset.rsv_primary_date, dataset.flu_primary_date, + dataset.covid_primary_date, first_infection_event(codelists. + respiratory_virus_primary_codelist).date, + emergency_care_diagnosis_matches(codelists. + rtri_attendance).arrival_date.minimum_for_patient(), + emergency_care_diagnosis_matches(codelists. + copd_exacerbation_attendance) + .arrival_date.minimum_for_patient(), + first_infection_event(codelists + .copd_exacerbation_primary_codelist).date, + first_infection_event(codelists + .asthma_exacerbation_primary_codelist).date) + ) + else: + dataset.overall_resp_primary_date = ( + minimum_of(dataset.rsv_primary_date, dataset.flu_primary_date, + first_infection_event(codelists. + respiratory_virus_primary_codelist).date, + emergency_care_diagnosis_matches(codelists. + rtri_attendance).arrival_date.minimum_for_patient(), + emergency_care_diagnosis_matches(codelists. + copd_exacerbation_attendance) + .arrival_date.minimum_for_patient(), + first_infection_event(codelists + .copd_exacerbation_primary_codelist).date, + first_infection_event(codelists + .asthma_exacerbation_primary_codelist).date) + ) else: - dataset.overall_resp_primary_date = ( - minimum_of(dataset.rsv_primary_date, dataset.flu_primary_date, - dataset.covid_primary_date, first_infection_event(codelists. - respiratory_virus_primary_codelist).date, - emergency_care_diagnosis_matches(codelists. 
- rtri_attendance).arrival_date.minimum_for_patient()) - ) + if datetime.strptime(study_start_date, "%Y-%m-%d") >= covid_season_min : + dataset.overall_resp_primary_date = ( + minimum_of(dataset.rsv_primary_date, dataset.flu_primary_date, + dataset.covid_primary_date, first_infection_event(codelists. + respiratory_virus_primary_codelist).date, + emergency_care_diagnosis_matches(codelists. + rtri_attendance).arrival_date.minimum_for_patient()) + ) + else: + dataset.overall_resp_primary_date = ( + minimum_of(dataset.rsv_primary_date, dataset.flu_primary_date, + first_infection_event(codelists. + respiratory_virus_primary_codelist).date, + emergency_care_diagnosis_matches(codelists. + rtri_attendance).arrival_date.minimum_for_patient()) + ) #unspecified respiratory virus secondary care if codelist_type == "sensitive" : if cohort == "older_adults" : - dataset.overall_resp_secondary = ( - (dataset.rsv_secondary)|(dataset.flu_secondary) - |(dataset.covid_secondary) - |(hospitalisation_diagnosis_matches(codelists. - respiratory_virus_secondary_codelist) - .exists_for_patient()) - |(hospitalisation_diagnosis_matches(codelists - .copd_exacerbation_secondary_codelist) - .exists_for_patient()) - |(hospitalisation_diagnosis_matches(codelists - .asthma_exacerbation_secondary_codelist) - .exists_for_patient()) - ) + if datetime.strptime(study_start_date, "%Y-%m-%d") >= covid_season_min : + dataset.overall_resp_secondary = ( + (dataset.rsv_secondary)|(dataset.flu_secondary) + |(dataset.covid_secondary) + |(hospitalisation_diagnosis_matches(codelists. 
+ respiratory_virus_secondary_codelist) + .exists_for_patient()) + |(hospitalisation_diagnosis_matches(codelists + .copd_exacerbation_secondary_codelist) + .exists_for_patient()) + |(hospitalisation_diagnosis_matches(codelists + .asthma_exacerbation_secondary_codelist) + .exists_for_patient()) + ) + else: + dataset.overall_resp_secondary = ( + (dataset.rsv_secondary)|(dataset.flu_secondary) + |(hospitalisation_diagnosis_matches(codelists. + respiratory_virus_secondary_codelist) + .exists_for_patient()) + |(hospitalisation_diagnosis_matches(codelists + .copd_exacerbation_secondary_codelist) + .exists_for_patient()) + |(hospitalisation_diagnosis_matches(codelists + .asthma_exacerbation_secondary_codelist) + .exists_for_patient()) + ) else: - dataset.overall_resp_secondary = ( - (dataset.rsv_secondary)|(dataset.flu_secondary) - |(dataset.covid_secondary) - |(hospitalisation_diagnosis_matches(codelists. - respiratory_virus_secondary_codelist) - .exists_for_patient()) - ) + if datetime.strptime(study_start_date, "%Y-%m-%d") >= covid_season_min : + dataset.overall_resp_secondary = ( + (dataset.rsv_secondary)|(dataset.flu_secondary) + |(dataset.covid_secondary) + |(hospitalisation_diagnosis_matches(codelists. + respiratory_virus_secondary_codelist) + .exists_for_patient()) + ) + else: + dataset.overall_resp_secondary = ( + (dataset.rsv_secondary)|(dataset.flu_secondary) + |(hospitalisation_diagnosis_matches(codelists. + respiratory_virus_secondary_codelist) + .exists_for_patient()) + ) #unspecified respiratory virus secondary care date if codelist_type == "sensitive" : if cohort == "older_adults" : - dataset.overall_resp_secondary_date = ( - minimum_of(dataset.rsv_secondary_date, - dataset.flu_secondary_date, dataset.covid_secondary_date, - hospitalisation_diagnosis_matches(codelists. 
- respiratory_virus_secondary_codelist).admission_date - .minimum_for_patient(), hospitalisation_diagnosis_matches(codelists - .copd_exacerbation_secondary_codelist) - .admission_date.minimum_for_patient(), - hospitalisation_diagnosis_matches(codelists - .asthma_exacerbation_secondary_codelist) - .admission_date.minimum_for_patient()) - ) + if datetime.strptime(study_start_date, "%Y-%m-%d") >= covid_season_min : + dataset.overall_resp_secondary_date = ( + minimum_of(dataset.rsv_secondary_date, + dataset.flu_secondary_date, dataset.covid_secondary_date, + hospitalisation_diagnosis_matches(codelists. + respiratory_virus_secondary_codelist).admission_date + .minimum_for_patient(), hospitalisation_diagnosis_matches(codelists + .copd_exacerbation_secondary_codelist) + .admission_date.minimum_for_patient(), + hospitalisation_diagnosis_matches(codelists + .asthma_exacerbation_secondary_codelist) + .admission_date.minimum_for_patient()) + ) + else: + dataset.overall_resp_secondary_date = ( + minimum_of(dataset.rsv_secondary_date, + dataset.flu_secondary_date, hospitalisation_diagnosis_matches(codelists + .respiratory_virus_secondary_codelist).admission_date + .minimum_for_patient(), hospitalisation_diagnosis_matches(codelists + .copd_exacerbation_secondary_codelist) + .admission_date.minimum_for_patient(), + hospitalisation_diagnosis_matches(codelists + .asthma_exacerbation_secondary_codelist) + .admission_date.minimum_for_patient()) + ) else: - dataset.overall_resp_secondary_date = ( - minimum_of(dataset.rsv_secondary_date, - dataset.flu_secondary_date, dataset.covid_secondary_date, - hospitalisation_diagnosis_matches(codelists. - respiratory_virus_secondary_codelist) - .admission_date.minimum_for_patient()) - ) + if datetime.strptime(study_start_date, "%Y-%m-%d") >= covid_season_min : + dataset.overall_resp_secondary_date = ( + minimum_of(dataset.rsv_secondary_date, + dataset.flu_secondary_date, dataset.covid_secondary_date, + hospitalisation_diagnosis_matches(codelists. 
+ respiratory_virus_secondary_codelist) + .admission_date.minimum_for_patient()) + ) + else: + dataset.overall_resp_secondary_date = ( + minimum_of(dataset.rsv_secondary_date, dataset.flu_secondary_date, + hospitalisation_diagnosis_matches(codelists + .respiratory_virus_secondary_codelist).admission_date + .minimum_for_patient()) + ) ## outcomes - mortality @@ -810,25 +923,36 @@ def first_infection_event(codelist, where = True): .then(ons_deaths.date)) ) -#covid mortality -dataset.covid_mortality = ( - cause_of_death_matches(codelists - .covid_secondary_codelist) -) +if datetime.strptime(study_start_date, "%Y-%m-%d") >= covid_season_min : -#covid mortality date -dataset.covid_mortality_date = (case(when( - dataset.covid_mortality) - .then(ons_deaths.date)) -) + #covid mortality + dataset.covid_mortality = ( + cause_of_death_matches(codelists + .covid_secondary_codelist) + ) + + #covid mortality date + dataset.covid_mortality_date = (case(when( + dataset.covid_mortality) + .then(ons_deaths.date)) + ) #overall mortality -dataset.overall_resp_mortality = ( - (dataset.rsv_mortality)|(dataset.flu_mortality) - |(dataset.covid_mortality) - |(cause_of_death_matches(codelists - .respiratory_virus_secondary_codelist)) -) +if datetime.strptime(study_start_date, "%Y-%m-%d") >= covid_season_min : + + dataset.overall_resp_mortality = ( + (dataset.rsv_mortality)|(dataset.flu_mortality) + |(dataset.covid_mortality)|(cause_of_death_matches(codelists + .respiratory_virus_secondary_codelist)) + ) + +else: + + dataset.overall_resp_mortality = ( + (dataset.rsv_mortality)|(dataset.flu_mortality) + |(cause_of_death_matches(codelists + .respiratory_virus_secondary_codelist)) + ) #overall mortality date dataset.overall_resp_mortality_date = (case(when( diff --git a/analysis/dummydata/dummydata_adults.R b/analysis/dummydata/dummydata_adults.R index 435c788..f6288f7 100644 --- a/analysis/dummydata/dummydata_adults.R +++ b/analysis/dummydata/dummydata_adults.R @@ -45,6 +45,12 @@ sim_list = 
lst( ~ rbernoulli(n = ..n, p = 0.99), ), + #date of deregistration + deregistration_day = bn_node( + ~ as.integer(runif(n = ..n, index_day, index_day + 365)), + missing_rate = ~ 0.99 + ), + #sex of the patient sex = bn_node( ~ rfactor(n = ..n, levels = c("female", "male", "intersex", "unknown"), @@ -365,6 +371,13 @@ sim_list = lst( #date all_cause_mortality_day = bn_node( ~ as.integer(runif(n = ..n, index_day, index_day + 365)) + ), + + ##exclusion criteria + + #care home resident + care_home = bn_node( + ~ rbernoulli(n = ..n, p = 0.1) ) ) diff --git a/analysis/dummydata/dummydata_children_and_adolescents.R b/analysis/dummydata/dummydata_children_and_adolescents.R index 6fe6f8e..7d1553b 100644 --- a/analysis/dummydata/dummydata_children_and_adolescents.R +++ b/analysis/dummydata/dummydata_children_and_adolescents.R @@ -45,6 +45,12 @@ sim_list = lst( ~ rbernoulli(n = ..n, p = 0.99), ), + #date of deregistration + deregistration_day = bn_node( + ~ as.integer(runif(n = ..n, index_day, index_day + 365)), + missing_rate = ~ 0.99 + ), + #sex of the patient sex = bn_node( ~ rfactor(n = ..n, levels = c("female", "male", "intersex", "unknown"), @@ -275,6 +281,11 @@ sim_list = lst( #date all_cause_mortality_day = bn_node( ~ as.integer(runif(n = ..n, index_day, index_day + 365)) + ), + + #care home resident + care_home = bn_node( + ~ rbernoulli(n = ..n, p = 0.1) ) ) diff --git a/analysis/dummydata/dummydata_infants.R b/analysis/dummydata/dummydata_infants.R index e55b416..5a37a95 100644 --- a/analysis/dummydata/dummydata_infants.R +++ b/analysis/dummydata/dummydata_infants.R @@ -45,6 +45,12 @@ sim_list = lst( ~ rbernoulli(n = ..n, p = 0.99), ), + #date of deregistration + deregistration_day = bn_node( + ~ as.integer(runif(n = ..n, index_day, index_day + 365)), + missing_rate = ~ 0.99 + ), + #sex of the patient sex = bn_node( ~ rfactor(n = ..n, levels = c("female", "male", "intersex", "unknown"), @@ -270,6 +276,11 @@ sim_list = lst( #severe immunodeficiency 
severe_immunodeficiency = bn_node( ~ rbernoulli(n = ..n, p = 0.1) + ), + + #care home resident + care_home = bn_node( + ~ rbernoulli(n = ..n, p = 0.1) ) ) diff --git a/analysis/dummydata/dummydata_infants_subgroup.R b/analysis/dummydata/dummydata_infants_subgroup.R index 9b91c2c..5215b4e 100644 --- a/analysis/dummydata/dummydata_infants_subgroup.R +++ b/analysis/dummydata/dummydata_infants_subgroup.R @@ -45,6 +45,12 @@ sim_list = lst( ~ rbernoulli(n = ..n, p = 0.99), ), + #date of deregistration + deregistration_day = bn_node( + ~ as.integer(runif(n = ..n, index_day, index_day + 365)), + missing_rate = ~ 0.99 + ), + #sex of the patient sex = bn_node( ~ rfactor(n = ..n, levels = c("female", "male", "intersex", "unknown"), @@ -317,6 +323,11 @@ sim_list = lst( #severe immunodeficiency severe_immunodeficiency = bn_node( ~ rbernoulli(n = ..n, p = 0.1) + ), + + #care home resident + care_home = bn_node( + ~ rbernoulli(n = ..n, p = 0.1) ) ) diff --git a/analysis/dummydata/dummydata_older_adults.R b/analysis/dummydata/dummydata_older_adults.R index 759731d..47c7f98 100644 --- a/analysis/dummydata/dummydata_older_adults.R +++ b/analysis/dummydata/dummydata_older_adults.R @@ -45,6 +45,12 @@ sim_list = lst( ~ rbernoulli(n = ..n, p = 0.99), ), + #date of deregistration + deregistration_day = bn_node( + ~ as.integer(runif(n = ..n, index_day, index_day + 365)), + missing_rate = ~ 0.99 + ), + #sex of the patient sex = bn_node( ~ rfactor(n = ..n, levels = c("female", "male", "intersex", "unknown"), diff --git a/analysis/dummydata/dummyextract_adults_2016_2017.arrow b/analysis/dummydata/dummyextract_adults_2016_2017.arrow index 667e680..7bf4eb3 100644 Binary files a/analysis/dummydata/dummyextract_adults_2016_2017.arrow and b/analysis/dummydata/dummyextract_adults_2016_2017.arrow differ diff --git a/analysis/dummydata/dummyextract_adults_2022_2023.arrow b/analysis/dummydata/dummyextract_adults_2022_2023.arrow new file mode 100644 index 0000000..59d68a3 Binary files /dev/null and 
b/analysis/dummydata/dummyextract_adults_2022_2023.arrow differ diff --git a/analysis/dummydata/dummyextract_children_and_adolescents_2016_2017.arrow b/analysis/dummydata/dummyextract_children_and_adolescents_2016_2017.arrow index e1a931f..d8ae900 100644 Binary files a/analysis/dummydata/dummyextract_children_and_adolescents_2016_2017.arrow and b/analysis/dummydata/dummyextract_children_and_adolescents_2016_2017.arrow differ diff --git a/analysis/dummydata/dummyextract_infants_2016_2017.arrow b/analysis/dummydata/dummyextract_infants_2016_2017.arrow index 97f52c0..982622a 100644 Binary files a/analysis/dummydata/dummyextract_infants_2016_2017.arrow and b/analysis/dummydata/dummyextract_infants_2016_2017.arrow differ diff --git a/analysis/dummydata/dummyextract_infants_subgroup_2016_2017.arrow b/analysis/dummydata/dummyextract_infants_subgroup_2016_2017.arrow index b1b1af4..14ccdb3 100644 Binary files a/analysis/dummydata/dummyextract_infants_subgroup_2016_2017.arrow and b/analysis/dummydata/dummyextract_infants_subgroup_2016_2017.arrow differ diff --git a/analysis/dummydata/dummyextract_older_adults_2016_2017.arrow b/analysis/dummydata/dummyextract_older_adults_2016_2017.arrow index 7ed895f..e25e8ca 100644 Binary files a/analysis/dummydata/dummyextract_older_adults_2016_2017.arrow and b/analysis/dummydata/dummyextract_older_adults_2016_2017.arrow differ diff --git a/project.yaml b/project.yaml index c36a939..4ffe7c7 100644 --- a/project.yaml +++ b/project.yaml @@ -410,4 +410,14 @@ actions: needs: [process_dataset_infants_s1_sens_sec] outputs: moderately_sensitive: - png: output/models/descriptive_infants_2016_2017_sensitive_secondary.png \ No newline at end of file + png: output/models/descriptive_infants_2016_2017_sensitive_secondary.png + + generate_dataset_adults_s7_spec: + run: > + ehrql:v1 generate-dataset analysis/dataset_definition.py + --output output/data/input_adults_2022_2023_specific_primary.arrow + --dummy-data-file 
analysis/dummydata/dummyextract_adults_2022_2023.arrow + -- adults season7_start_date season7_end_date specific primary + outputs: + highly_sensitive: + dataset: output/data/input_adults_2022_2023_specific_primary.arrow