Merge pull request #30 from tcrombie/dev/WF

package down
AndersenLab · Oct 9, 2023 · 41af4a5 · 41af4a5
2 parents 2d55c73 + 498f055
commit 41af4a5
Show file tree

Hide file tree

Showing 31 changed files with 413 additions and 63 deletions.
diff --git a/.Rbuildignore b/.Rbuildignore
@@ -2,3 +2,7 @@
 ^\.Rproj\.user$
 ^doc$
 ^Meta$
+^vignettes/articles$
+^_pkgdown\.yml$
+^docs$
+^pkgdown$
diff --git a/.gitignore b/.gitignore
@@ -6,3 +6,4 @@
 inst/doc
 doc
 Meta
+docs
diff --git a/R/checkModels.R b/R/checkModels.R
@@ -67,6 +67,7 @@ checkModels <- function(data, ..., modelName = "MDHD", OF = "filter", length_thr
 
   # nest the data
   nest <- of.data %>%
+    dplyr::mutate(well.id = paste(Metadata_Experiment, Metadata_Plate, Metadata_Well, sep = "_")) %>%
     dplyr::group_by(...) %>%
     tidyr::nest() %>%
     tidyr::unite(col = group, -data)
@@ -77,7 +78,7 @@ checkModels <- function(data, ..., modelName = "MDHD", OF = "filter", length_thr
     d <- nest %>%
       dplyr::filter(group == i) %>%
       tidyr::unnest(cols = data) %>%
-      dplyr::group_by(Metadata_Experiment, Metadata_Plate, Metadata_Well) %>%
+      dplyr::group_by(well.id) %>%
       dplyr::mutate(mean_wormlength_um = mean(worm_length_um, na.rm = TRUE)) %>%
       dplyr::group_by(strain) %>%
       dplyr::mutate(mean_strain_length = mean(mean_wormlength_um, na.rm = TRUE)) %>%
@@ -93,20 +94,19 @@ checkModels <- function(data, ..., modelName = "MDHD", OF = "filter", length_thr
 
     # get lowest wellN wells for those strains
     wells <- d %>%
-      dplyr::distinct(Metadata_Experiment, Metadata_Plate, Metadata_Well, strain, mean_wormlength_um) %>%
+      dplyr::distinct(well.id, strain, mean_wormlength_um) %>%
       dplyr::filter(strain %in% strains) %>%
       dplyr::group_by(strain) %>%
       dplyr::arrange(mean_wormlength_um) %>%
       dplyr::mutate(slice_index = 1:dplyr::n()) %>%
       dplyr::filter(slice_index <= wellN) %>%
       dplyr::ungroup() %>%
-      dplyr::mutate(well_id = paste(Metadata_Experiment, Metadata_Plate, Metadata_Well, sep = "_")) %>%
-      dplyr::pull(well_id)
+      dplyr::pull(well.id)
 
     # filter down d
     f.d <- d %>%
       dplyr::filter(strain %in% strains) %>%
-      dplyr::mutate(checkMV_well_id = paste(Metadata_Experiment, Metadata_Plate, Metadata_Well, sep = "_")) %>%
+      dplyr::mutate(checkMV_well_id = well.id) %>%
       dplyr::filter(checkMV_well_id %in% wells) %>%
       dplyr::arrange(strain)
 

diff --git a/R/classifierOF.R b/R/classifierOF.R
@@ -6,28 +6,66 @@
 #' @param model Specify one of the built in models. Currently only "gbm2x" is available. This classifier was
 #' trained on over 1,000 worm objects selected from a large GWAS experiment and classifies objects as "worm" or "non-worm" with ~90% accuracy.
 #' NOTE: The model was trained to classify poorly segmented worms as 'non-worm' so true worms are often classified as "non-worm".
+#' @param thresh The probability threshold for flagging objects based on the classifier. By default the thresh is st to 0.6.
+#' Only the objects the classifier predicts to be improperly segmented, with a probability greater than \code{thres}, will be flagged.
 #' @return A single data frame identical to the input data with the \code{classifier_ObjectFlag} variable added.
 #' The \code{classifier_ObjectFlag} variable is coded as \code{"classifier"} for objects that are called non-worm by the 2X classifier.
-#' All other objects are coded as \code{NA_character}. The \code{gbm2x_worm_prob} variable provides the probability that the object is a properly segmented worm.
+#' All other objects are coded as \code{NA_character}, or if there are NAs in any of the variables used to classify objects they are coded as \code{"classErr"}. The \code{gbm2x_worm_prob} variable provides the probability that the object is a properly segmented worm.
 #' @export
 
-classifierOF <- function(data, model = "gbm2x"){
+classifierOF <- function(data, model = "gbm2x", thresh = 0.6){
   if(model == "gbm2x") {
     # get the classifier
     gbm2x <- easyXpress::gbm2x
 
+    # look for missing values or columns in expected model data
+    # Specify column names to check
+    check <- names(gbm2x$trainingData)[-1]
+
+    # send error if one or more variables are missing
+    if(F %in% (check %in% names(data))) {
+      message("ERROR: The data does not contain the variables expected for the gbm2x classifier. Consider running the latest version of cellprofiler-nf. You can find the expected variables with <names(easyXpress::gbm2x$trainingData)>")
+      stop()
+    }
+
+    # Filter rows with NAs in any of the specified columns
+    nas <- data %>%
+      dplyr::filter(rowSums(is.na(dplyr::select(., dplyr::all_of(check)))) > 0) %>%
+      dplyr::mutate(gbm2x_worm_prob = NA_real_,
+                    gbm2x_class = NA_character_,
+                    classifier_ObjectFlag = "classErr")
+
+    if(nrow(nas) > 0) {
+      # filter to rows with no NAs in expected variables
+      f.data <- data %>%
+        dplyr::filter(rowSums(!is.na(dplyr::select(., dplyr::all_of(check)))) == length(check))
+
+      # message about it
+      message(glue::glue('WARNING: There are {nrow(nas)} rows with NAs in one or more object variables used for the classifier. These rows are flagged as "classErr" in the output.'))
+
+    } else {
+      # assign to data
+      f.data <- data
+    }
+
     # use it
-    gbm2x_prob <- stats::predict(gbm2x, newdata = data, type = "prob")
-    gbm2x_class <- stats::predict(gbm2x, newdata = data)
+    gbm2x_prob <- stats::predict(gbm2x, newdata = f.data, type = "prob")
+    gbm2x_class <- stats::predict(gbm2x, newdata = f.data)
 
     # add these predictions
-    classifier_df <- data %>%
+    classifier_df <- f.data %>%
       dplyr::bind_cols(gbm2x_prob, gbm2x_class = gbm2x_class) %>%
-      dplyr::mutate(classifier_ObjectFlag = dplyr::case_when(gbm2x_class == "non-worm" ~ "classifier",
-                                                      gbm2x_class == "worm" ~ NA_character_,
-                                                      TRUE ~ "ERROR")) %>%
+      # dplyr::mutate(classifier_ObjectFlag = dplyr::case_when(gbm2x_class == "non-worm" ~ "classifier",
+      #                                                 gbm2x_class == "worm" ~ NA_character_,
+      #                                                 TRUE ~ "ERROR")) %>%
+      dplyr::mutate(classifier_ObjectFlag = dplyr::case_when(`non-worm` > thresh ~ "classifier",
+                                                             `non-worm` <= thresh ~ NA_character_,
+                                                              TRUE ~ "ERROR")) %>%
       dplyr::select(-`non-worm`) %>%
-      dplyr::rename(gbm2x_worm_prob = worm)
+      dplyr::rename(gbm2x_worm_prob = worm) %>%
+      dplyr::bind_rows(nas) %>%
+      dplyr::arrange(Metadata_Experiment, Metadata_Plate, Metadata_Well, Parent_WormObjects)
+
 
     # test for unclassified objects
     if("ERROR" %in% classifier_df$classifier_ObjectFlag) {

diff --git a/R/delta.R b/R/delta.R
@@ -9,22 +9,45 @@
 #' @param WF Select \code{"filter"} or \code{"ignore"}. The default, \code{"filter"}, will filter out all flagged wells before calculating the delta from control.
 #' \code{"ignore"} will calculate the delta including all flagged data. Be careful using \code{"ignore"}, it is only included for diagnostic purposes.
 #' @param vars The well summary statistics to perform the delta calculation on. These are supplied in a character vector. For example, the default is set to \code{c("median_wormlength_um", "cv_wormlength_um")}.
+#' @param doseR Logical, is this dose response data? The default, \code{doseR = FALSE},
+#' expects control data to be recorded in the design file a particular way. Specifically, the drug and diluent variables should be identical for controls,
+#' e.g, \code{drug = DMSO, diluent = DMSO, concentration_um = 0}. If \code{doseR = TRUE}, the controls are expected to be coded differently,
+#' .e.g, \code{drug = ABZ, diluent = DMSO, concentration_um = 0}. Warning messages are produced if the controls do not fit expectations, but try to ensure the
+#' controls are coded properly without relying this function to catch all edge cases.
 #' @return A data frame identical to the input but with control delta variables added, i.e., \code{median_wormlength_um_delta} and \code{cv_wormlength_um_delta}.
 #' The median values for the control conditions are also added as \code{control_median_wormlength_um} and \code{control_cv_wormlength_um}.
 #' @export
 
-delta <- function(data, ..., WF = "filter", vars = c("median_wormlength_um", "cv_wormlength_um")) {
+delta <- function(data, ..., WF = "filter", vars = c("median_wormlength_um", "cv_wormlength_um"), doseR = F) {
+  # expecting message
+  if(doseR == T){
+    message("You set doseR = TRUE. Expecting controls to be coded as for a dose response.")
+  }
+  if(doseR == F){
+    message("You set doseR = FALSE. Not expecting controls to be coded for a dose reponse.")
+  }
   # check on control coding
   controls <- unique(data$diluent)
   drugs <- unique(data$drug)
 
-  if(F %in% (controls %in% drugs)) {
-    # send a stop message
-    message(glue::glue("ERROR: the controls are not configured as expected. Control conditions should have the same value for drug and diluent and a 0 for concentration_um. Please correct the control condition coding before using easyXpress well flag functions.
+  # check on expected dcontrols
+  if(doseR == F & F %in% (controls %in% drugs)) {
+    # send a warning.
+    message(glue::glue("ERROR: the controls are not configured as expected in the design file doseR = FALSE. Do you want doseR = TRUE? If not, control conditions should have the same value for drug and diluent and a 0 for concentration_um. Please correct the control condition coding before using easyXpress well flag (WF) functions.
                            For example:"))
-    example <- tibble::tibble(drug = c("DMSO", "Water", "death juice", "seizure sauce"),
+    example <- tibble::tibble(drug = c("DMSO", "water", "death juice", "seizure sauce"),
                               concentration_um = c(0, 0, 10, 100),
-                              diluent = c("DMSO", "Water", "DMSO", "Water"))
+                              diluent = c("DMSO", "water", "DMSO", "water"))
+    stop(message(message(paste0(capture.output(knitr::kable(example)), collapse = "\n"))))
+  }
+  # warn about doseR=T
+  if(doseR == T & T %in% (controls %in% drugs)){
+    # send a warning.
+    message(glue::glue("ERROR: the controls are not configured as expected in the design file for doseR = TRUE. Do you want doseR = FALSE? If not, control conditions should have 0 for concentration_um and be named for the drug not the diluent. Please correct the control condition coding before using easyXpress well flag (WF) functions.
+                           For example:"))
+    example <- tibble::tibble(drug = c("death juice", "death juice", "seizure sauce", "seizure sauce"),
+                              concentration_um = c(0, 10, 0, 100),
+                              diluent = c("DMSO", "DMSO", "water", "water"))
     stop(message(message(paste0(capture.output(knitr::kable(example)), collapse = "\n"))))
   }
 
@@ -42,14 +65,24 @@ delta <- function(data, ..., WF = "filter", vars = c("median_wormlength_um", "cv
   message(glue::glue("The data are grouped by, {group_by}."))
 
   # setup means
+  if(doseR == F) {
   # get control values for each group
   control_values <- d %>%
     dplyr::filter(drug %in% controls) %>% # filter to control wells
     dplyr::group_by(...) %>%
     dplyr::mutate(dplyr::across(dplyr::all_of(vars), ~mean(.), .names = "control_{.col}")) %>% # get control values
     dplyr::distinct(dplyr::across(dplyr::matches("_delta|control_"))) %>%
     dplyr::ungroup()
-
+  }
+  if(doseR == T) {
+    # get control values for each group
+    control_values <- d %>%
+      dplyr::filter(concentration_um == 0) %>% # filter to control wells
+      dplyr::group_by(...) %>%
+      dplyr::mutate(dplyr::across(dplyr::all_of(vars), ~mean(.), .names = "control_{.col}")) %>% # get control values
+      dplyr::distinct(dplyr::across(dplyr::matches("_delta|control_"))) %>%
+      dplyr::ungroup()
+  }
   # calculate the delta
   suppressMessages(delta <- d %>%
     dplyr::left_join(., control_values) %>%

diff --git a/R/modelSelection.R b/R/modelSelection.R
@@ -125,7 +125,7 @@ modelSelection <- function(df) {
     dplyr::group_by(Metadata_Experiment, Metadata_Plate, Metadata_Well, Parent_WormObjects) %>%
     dplyr::distinct(model, .keep_all = T) %>%
     dplyr::ungroup() %>%
-    dplyr::select(Metadata_Experiment, Metadata_Plate, Metadata_Well,
+    dplyr::select(well.id, Metadata_Experiment, Metadata_Plate, Metadata_Well,
                   Parent_WormObjects, model, num_worms) %>%
     tidyr::spread(model, num_worms) %>%
     dplyr::mutate_at(dplyr::vars(tidyselect::one_of(model_names)),

diff --git a/R/outlierOF.R b/R/outlierOF.R
@@ -1,6 +1,7 @@
 #' outlierOF
 #'
-#' This function will flag outlier objects based the Worm_Legnth variable.
+#' This function will flag outlier objects based the worm_length_um variable.
+#'
 #' @param data A data frame output from the \code{modelSelection} function.
 #' @param iqr Logical, if \code{TRUE}, objects in a well are called outliers if their worm_length_um is outside the range \code{median(worm_length_um) +/- (thresh * IQR)}.
 #' If \code{FALSE}, objects in a well are called outliers if their worm_length_um is outside the range \code{mean(worm_length_um) +/- (thresh * SD)}.
-Original file line number
+Diff line change
@@ Expand Up / @@ -6,3 +6,4 @@ @@
     inst/doc
     doc
     Meta
+    docs