multiple tables copy over

aim-rsf · Jul 24, 2024 · 8cca66c · 8cca66c
1 parent 2eb7838
commit 8cca66c
Show file tree

Hide file tree

Showing 3 changed files with 58 additions and 34 deletions.
diff --git a/R/domain_mapping.R b/R/domain_mapping.R
@@ -9,7 +9,7 @@
 #' @param domain_file The domain list file. This should be a csv file created by the user, with each domain listed on a separate line. See 'data-raw/domain_list_demo.csv' for a template.
 #' @param look_up_file The look-up table file, with auto-categorisations. By default, the code uses 'data/look-up.rda'. The user can provide their own look-up table in the same format as 'data-raw/look-up.csv'.
 #' @param output_dir The path to the directory where the two csv output files will be saved. By default, the current working directory is used.
-#' @param table_copy Turn on copying between tables (TRUE or FALSE, default TRUE). If TRUE, categorisations you make for the last table you processed will be carried over to another, as long as the csv files share an output_dir.
+#' @param table_copy Turn on copying between tables (TRUE or FALSE, default TRUE). If TRUE, categorisations you made for all other tables in this dataset will be copied over (if 'OUTPUT_' files are found in output_dir).
 #' @return The function will return two csv files: 'OUTPUT_' which contains the mappings and 'LOG_' which contains details about the dataset and session.
 #' @examples
 #' # Run in demo mode by providing no inputs: domain_mapping()
@@ -132,19 +132,24 @@ domain_mapping <- function(json_file = NULL, domain_file = NULL, look_up_file =
     cli_h1("Table Last Updated")
     cat(meta_json$dataModel$childDataClasses[[dc]]$lastUpdated, "\n", fill = TRUE)
 
-    # Check if previous table output exists in this output_dir (for table copying)
+    # Check if previous table output(s) exist in this output_dir (for table copying)
     if (table_copy == TRUE){
       dataset_search = paste0("^OUTPUT_",gsub(" ", "", meta_json$dataModel$label),'*')
       csv_list <- data.frame(file = list.files(output_dir,pattern = dataset_search))
       if (nrow(csv_list) != 0){
-        csv_list$date <- as.POSIXct(substring(csv_list$file,nchar(csv_list$file)-22,nchar(csv_list$file)-4), format="%Y-%m-%d-%H-%M-%S")
-        csv_last_filename <- csv_list[which.min(csv_list$date),]
-        csv_last <- read.csv(paste0(output_dir,'/',csv_last_filename$file))
-        csv_last_exist <- TRUE
+        df_list <- lapply(paste0(output_dir,'/',csv_list$file), read.csv)
+        df_combined <- do.call("rbind", df_list) #combine all df
+        df_combined$timestamp2 <- as.POSIXct(df_combined$timestamp, format="%Y-%m-%d-%H-%M-%S") #create new date column
+        df_combined <- df_combined[order(df_combined$timestamp2),] #order by earliest datetime
+        df_combined <- df_combined %>% distinct(DataElement, .keep_all = TRUE) #remove duplicates, keep earliest categorisation
+        df_combined <- df_combined[-(which(df_combined$Note %in% "AUTO CATEGORISED")),] #remove auto categorised
+        df_combined_exist <- TRUE
         cat("\n")
-        cli_alert_info(paste0("Copying from previous session: ",csv_last_filename$file))
-      } else {csv_last_exist <- FALSE}
-      } else {csv_last_exist <- FALSE}
+        cli_alert_info(paste0("Copying from previous session(s): "))
+        cat("\n")
+        print(csv_list$file)
+      } else {df_combined_exist <- FALSE}
+      } else {df_combined_exist <- FALSE}
 
     table_desc <- ""
     while (table_desc != "Y" & table_desc != "y" & table_desc != "N" & table_desc != "n") {
@@ -194,6 +199,7 @@ domain_mapping <- function(json_file = NULL, domain_file = NULL, look_up_file =
 
     row_Output <- data.frame(
       timestamp = character(0),
+      Table = character(0),
       DataElement_N = character(0),
       DataElement = character(0),
       Domain_code = character(0),
@@ -222,24 +228,25 @@ domain_mapping <- function(json_file = NULL, domain_file = NULL, look_up_file =
       # prepare output
       this_Output <- row_Output
       this_Output[nrow(this_Output) + 1 , ] <- NA
+      this_Output$Table[1] <- meta_json$dataModel$childDataClasses[[dc]]$label
       this_Output$DataElement[1] <- selectTable_df$Label[datavar]
       this_Output$DataElement_N[1] <- paste(as.character(datavar),'of',as.character(nrow(selectTable_df)))
       # search if this data element matches with auto categorisations in lookup
       datavar_index <- which(lookup$DataElement == selectTable_df$Label[datavar]) #we should code this to ignore the case
       lookup_subset <- lookup[datavar_index,]
       # search if this data element matches with any data elements processed in previous table
-      if (csv_last_exist == TRUE) {
-        datavar_index <- which(csv_last$DataElement == selectTable_df$Label[datavar])
-        csv_last_subset <- csv_last[datavar_index,]
-        } else {csv_last_subset <- data.frame()}
+      if (df_combined_exist == TRUE) {
+        datavar_index <- which(df_combined$DataElement == selectTable_df$Label[datavar])
+        df_combined_subset <- df_combined[datavar_index,]
+        } else {df_combined_subset <- data.frame()}
       # decide how to process the data element out of 3 options
       if (nrow(lookup_subset) == 1) { # 1 - auto categorisation
         this_Output$Domain_code[1] <- lookup_subset$DomainCode
         this_Output$Note[1] <- "AUTO CATEGORISED"
         Output <- rbind(Output,this_Output)
-        } else if (csv_last_exist == TRUE & nrow(csv_last_subset) == 1){ # 2 - copy from previous table
-          this_Output$Domain_code[1] <- csv_last_subset$Domain_code
-          suppressWarnings(this_Output$Note[1] <- paste0("COPIED FROM: ",csv_last_filename))
+        } else if (df_combined_exist == TRUE & nrow(df_combined_subset) == 1){ # 2 - copy from previous table
+          this_Output$Domain_code[1] <- df_combined_subset$Domain_code
+          suppressWarnings(this_Output$Note[1] <- paste0("COPIED FROM: ",df_combined_subset$Table))
           Output <- rbind(Output,this_Output)
         } else { # 3 - collect user responses
           decision_output <- user_categorisation(selectTable_df$Label[datavar],selectTable_df$Description[datavar],selectTable_df$Type[datavar],max(Code$Code))

diff --git a/README.md b/README.md
@@ -45,7 +45,7 @@ To speed up this process, the function automatically categorises some variables
 that regularly appear in health datasets (e.g. ID, Sex, Age). The function also 
 accounts for the same data element appearing in multiple tables across a dataset, 
 and allows the user to activate a table copying function which copies categorisations
-they've done for one table, onto the current table they are processing. 
+they've done for previous tables into the current table they are processing. 
 
 🚧 :warning: This package is in early development, and has only been
 tested on a limited number of metadata files. In theory, this package
@@ -347,44 +347,61 @@ Unlike in demo mode, it will ask you to specify the range of variables you want
 
 #### table_copy: 
 - default is TRUE, so set this to FALSE if you want to deactivate table copying 
-- table copying means that the categorisations you make for the last table you processed will be carried over to this table, as long as the csv files share an output_dir 
+- table copying means that the categorisations you made for previous tables will be carried over to this table, as long as the csv files share an output_dir 
 - this can be useful because the same data elements (variables) appear across multiple tables within one dataset
 - copying from one table to the next will save the user time, and ensure consistency of categorisations across tables 
 - the 'Note' column in the output csv file will indicate that the categorisation has been copied and where from
 - a typical session could look like this: 
 
-*Run 1, select table 'EXAM'*
+
+*Run 1, select table 'CHILD'*
 
 ```
- domain_mapping()
+ℹ Processing Table 6 of 13
 
-ℹ Running domain_mapping in demo mode using package data files
-ℹ Using the default look-up table in data/look-up.rda
+── Table Name ──
+CHILD
+
+── Table Last Updated ──
+[datetime]
 ```
 
-*Run 2, select table 'CHILD' (the function notices we have already run the table 'EXAM')*
+*Run 2, select table 'CHILD_BIRTHS' (the function notices we have already run the table 'CHILD')*
 
 ```
- domain_mapping()
+ℹ Processing Table 7 of 13
 
-ℹ Running domain_mapping in demo mode using package data files
-ℹ Using the default look-up table in data/look-up.rda
+── Table Name ──
+CHILD_BIRTHS
+
+── Table Last Updated ──
+[datetime]
 ...
-ℹ Copying from previous session: OUTPUT_NationalCommunityChildHealthDatabase(NCCHD)_EXAM_[datetime].csv
+ℹ Copying from previous session(s): 
+
+[1] "OUTPUT_NationalCommunityChildHealthDatabase(NCCHD)_CHILD_[datetime].csv"
+
 ```
 
-*Run 3, select table 'REFR_IMM_VAC' (the function notices we have already run the table 'CHILD')*
+*Run 3, select table 'PATH_BLOOD_TESTS' (the function notices we have already run the tables 'CHILD' and 'CHILD_BIRTHS')*
 
 ```
- domain_mapping()
+ℹ Processing Table 8 of 13
 
-ℹ Running domain_mapping in demo mode using package data files
-ℹ Using the default look-up table in data/look-up.rda
+── Table Name ──
+PATH_BLOOD_TESTS
+
+── Table Last Updated ──
+[datetime]
 ...
-ℹ Copying from previous session: OUTPUT_NationalCommunityChildHealthDatabase(NCCHD)_CHILD_[datetime].csv
+ℹ Copying from previous session(s): 
+
+[1] "OUTPUT_NationalCommunityChildHealthDatabase(NCCHD)_CHILD_[datetime].csv"
+[2] "OUTPUT_NationalCommunityChildHealthDatabase(NCCHD)_CHILD_BIRTHS_[datetime].csv"
+
 ```
 
-*And so on ...*  Each run has the potential to be shorter for the user to complete because if there are the same data elements that appear across tables, the user will not be asked to categorise them twice. 
+*And so on ...*  Each run in which you process a table can be shorter to complete: if the same data elements appear across tables, the user will not be asked to categorise them twice. 
 
 
 ### Potential use-cases for the output files

diff --git a/man/domain_mapping.Rd b/man/domain_mapping.Rd