From d5f3558a1ec16d8e1f0b83ac751b34422540a4df Mon Sep 17 00:00:00 2001
From: Andree Valle Campos <avallecam@gmail.com>
Date: Wed, 9 Oct 2024 04:41:04 +0100
Subject: [PATCH] fix solution given new data

---
 learners/epikinetics-descriptive.md | 186 +++++++++++++++-------------
 1 file changed, 100 insertions(+), 86 deletions(-)

diff --git a/learners/epikinetics-descriptive.md b/learners/epikinetics-descriptive.md
index 3443ac46..3e981e2d 100644
--- a/learners/epikinetics-descriptive.md
+++ b/learners/epikinetics-descriptive.md
@@ -20,74 +20,40 @@ library(incidence2)
 # read data ---------------------------------------------------------------
 
 # rawdata <- "data-raw/delta.csv"
-rawdata <- "https://raw.githubusercontent.com/seroanalytics/epikinetics/refs/heads/main/inst/delta_full.rds"
+# rawdata <- "https://raw.githubusercontent.com/seroanalytics/epikinetics/refs/heads/main/inst/delta_full.rds"
+rawdata <- "data-out/delta_full-messy.csv" # fix this path
 
 dat <- read_csv(rawdata)
 
-dat %>% glimpse()
+dat #%>% glimpse()
 
-# what these columns mean? ------------------------------------------------
-
-#' data dictionary: https://seroanalytics.org/epikinetics/articles/data.html
-#' reference paper: https://www.thelancet.com/journals/laninf/article/PIIS1473-3099(24)00484-5/fulltext
-#' location: https://github.com/seroanalytics/epikinetics/tree/main/inst
-
-# 335 subjects where followed up
-dat %>% count(pid)
-
-## what "titre type" means? -------------------------------------------------
-
-#' In the time series, 
-#' each subject had monthly serum measurements 
-#' for three types of antigens ("titre_type").
-#' 
-#' Serum samples where challenged against Ancestral, Alpha and Delta antigens.
-#' 
-#' The column "value" measures the titre of 
-#' the neutralizing effect of each sample against each antigen 
+# dat %>% 
+#   # arrange columns
+#   dplyr::select(
+#     pid, infection_history, exp_num, last_exp_date, last_vax_type,
+#     dplyr::everything()
+#   ) %>% 
+#   # arrange rows
+#   dplyr::arrange(
+#     pid, infection_history, exp_num, last_exp_date, last_vax_type, date
+#   ) %>% 
+#   write_csv("data-out/delta_full-messy.csv")
 
-dat %>%
-  dplyr::filter(pid == 2) %>% 
-  dplyr::arrange(date) %>% 
-  # select time invariant columns
-  dplyr::select(
-    pid, infection_history, exp_num, last_exp_date, last_vax_type,
-    dplyr::everything()
-  )
+# datatagr 
 
-## what "censored" means? ----------------------------------------------------
-
-# context: censored regression model
-# the "value" as the outcome is censored above or below
-# because the it was measured outside the limits of detection
-# threshold limit below: 5
-# threshold limit above: 2560
-
-dat %>% 
-  ggplot(aes(value, fill = as.factor(censored))) + 
-  geom_histogram()
-
-# datatagr ----------------------------------------------------------------
-
-datatagr::lost_labels_action()
-datatagr::get_lost_labels_action()
-# datatagr::lost_labels_action(action = "error")
+# datatagr::lost_labels_action()
+# datatagr::get_lost_labels_action()
+# # datatagr::lost_labels_action(action = "error")
 
 # cleanepi ----------------------------------------------------------------
 
 # check sequence of events
 
 dat_clean <- dat %>% 
-  # arrange columns
-  dplyr::select(
-    pid, infection_history, exp_num, last_exp_date, last_vax_type,
-    dplyr::everything()
-  ) %>% 
-  # arrange rows
-  dplyr::arrange(
-    pid, infection_history, exp_num, last_exp_date, last_vax_type, date
-  ) %>% 
   # cleanepi
+  cleanepi::standardize_column_names() %>% 
+  cleanepi::standardize_dates(target_columns = "date") %>%
+  cleanepi::convert_to_numeric(target_columns = "exp_num") %>% 
   cleanepi::check_date_sequence(
     target_columns = c("last_exp_date", "date")
   ) %>% 
@@ -106,53 +72,101 @@ dat_clean <- dat %>%
     titre_type = forcats::fct_relevel(titre_type,"Ancestral", "Alpha"),
     censored = forcats::as_factor(censored)
   ) %>% 
-  # tag with {datatagr}
-  datatagr::make_datatagr(
-    pid = "subject id",
-    infection_history = "subject infection history",
-    exp_num = "number of vaccine exposures",
-    last_exp_date = "date of last exposure",
-    last_vax_type = "type of vaccine in the last exposure",
-    date = "date of observation of titre in serum sample",
-    titre_type = "type of antigen challenged against serum sample",
-    value = "titre value",
-    censored = "censored titre value out of limit of detection [5 - 2560] bellow (-1) or above (+1)",
-    t_since_last_exp = "time interval between last vaccine exposure and observed serum sample titre"
+  # tag with {linelist}
+  linelist::make_linelist( # ISSUE: make_linelist can rearrange columns
+    id = "pid",
+    allow_extra = TRUE,
+    infection_history = "infection_history",
+    exp_num = "exp_num",
+    last_exp_date = "last_exp_date",
+    last_vax_type = "last_vax_type",
+    date = "date",
+    titre_type = "titre_type",
+    value = "value",
+    censored = "censored",
+    # last_vax_type = "last_vax_type", # ISSUE: can tolerate replicates
+    t_since_last_exp = "t_since_last_exp" # it is possible to pass validation without tagging?
   ) %>% 
-  # validate with {datatagr}
-  datatagr::validate_datatagr(
-    pid = "numeric",
-    infection_history = "character",
-    exp_num = "factor",
-    last_exp_date="Date",
-    last_vax_type = "factor",
-    date = "Date",
-    titre_type = "factor",
-    value = "numeric",
-    censored = "factor",
-    t_since_last_exp = "numeric"
+  # validate 
+  linelist::validate_linelist(
+    allow_extra = TRUE,
+    ref_types = linelist::tags_types(
+      infection_history = c("character"),
+      exp_num = c("factor"),
+      last_exp_date = c("Date"),
+      last_vax_type = c("factor"),
+      date = c("Date"),
+      titre_type = c("factor"),
+      value = c("numeric"),
+      censored = c("factor"),
+      t_since_last_exp = c("numeric"),
+      allow_extra = TRUE
+    )
   ) %>% 
-  # datatagr::labels_df() %>% # this extract labels as column names [affects downstream] 
-  identity()
+  # keep tags data frame
+  linelist::tags_df()
 
 dat_clean
 
 # distribution of the time from the last vaccine to first observation
 dat_clean %>% 
-  group_by(pid) %>% 
+  group_by(id) %>% 
   filter(date == min(date)) %>% 
   slice(1) %>% 
   ungroup() %>% 
   ggplot(aes(t_since_last_exp)) + 
   geom_histogram()
 
+# what these columns mean? ------------------------------------------------
+
+#' data dictionary: https://seroanalytics.org/epikinetics/articles/data.html
+#' reference paper: https://www.thelancet.com/journals/laninf/article/PIIS1473-3099(24)00484-5/fulltext
+#' location: https://github.com/seroanalytics/epikinetics/tree/main/inst
+
+# 335 subjects were followed up
+dat_clean %>% count(id) # linelist::tags_df() names columns by tag: pid is now id
+
+
+## what "titre type" means? -------------------------------------------------
+
+#' In the time series, 
+#' each subject had monthly serum measurements 
+#' for three types of antigens ("titre_type").
+#' 
+#' Serum samples were challenged against Ancestral, Alpha and Delta antigens.
+#' 
+#' The column "value" measures the titre of 
+#' the neutralizing effect of each sample against each antigen 
+
+dat_clean %>%
+  dplyr::filter(id == 2) %>% 
+  dplyr::arrange(date) #%>% 
+  # # select time invariant columns
+  # dplyr::select(
+  #   id, infection_history, exp_num, last_exp_date, last_vax_type,
+  #   dplyr::everything()
+  # )
+
+## what "censored" means? ----------------------------------------------------
+
+# context: censored regression model
+# the "value" as the outcome is censored above or below
+# because it was measured outside the limits of detection
+# threshold limit below: 5
+# threshold limit above: 2560
+
+dat_clean %>% 
+  ggplot(aes(value, fill = as.factor(censored))) + 
+  geom_histogram()
+
+
 ## subject table -----------------------------------------------------------
 
 # subject time-invariant data
 dat_subject <- dat_clean %>% 
   # {datatagr} reacts with dplyr::select() but not with dplyr::count() when losing tags
-  dplyr::select(pid, infection_history, exp_num, last_exp_date, last_vax_type) %>% 
-  dplyr::count(pid, infection_history, exp_num, last_exp_date, last_vax_type)
+  dplyr::select(id, infection_history, exp_num, last_exp_date, last_vax_type) %>% 
+  dplyr::count(id, infection_history, exp_num, last_exp_date, last_vax_type)
   
 # table 1: time-invariant columns
 dat_subject %>% 
@@ -187,7 +201,7 @@ dat_subject %>%
   # incidence2::cumulate() %>% 
   # plot
   incidence2:::plot.incidence2(
-    fill = "last_vax_type"
+    fill = "last_vax_type" # change: "infection_history", "titre_type", or "last_vax_type"
   )
 
 # observations ------------------------------------------------------------
@@ -207,7 +221,7 @@ dat_clean %>%
     # complete_dates = TRUE # relevant to downstream analysis [time-series data]
   ) %>% 
   incidence2:::plot.incidence2(
-    fill = "censored"
+    fill = "censored" # change: "censored" or "infection_history", "titre_type", or "last_vax_type"
   )
 
 ```