Merge branch 'develop' into more_dataset_tests

traitecoevo · Nov 16, 2023 · 5d127b7 · 5d127b7
2 parents 4e50cf7 + 256ab7a
commit 5d127b7
Show file tree

Hide file tree

Showing 10 changed files with 107 additions and 52 deletions.
diff --git a/R/get.R b/R/get.R
@@ -47,5 +47,8 @@ util_get_version <- function(path =  "config/metadata.yml") {
 #' @return 40-digit SHA character string for the latest commit to the repository
 #' @export
 util_get_SHA <- function(path = ".") {
-  git2r::sha(git2r::last_commit(git2r::repository(path)))
+  sha <- tryCatch({
+      git2r::sha(git2r::last_commit(git2r::repository(path)))
+    }, error = function(cond) {NA})
+  sha
 }
diff --git a/R/plot_trait_beeswarm.R → R/plot.R b/R/plot_trait_beeswarm.R → R/plot.R
@@ -2,7 +2,7 @@
 #' @description Plots distribution of trait values by a  grouping variable using ggbeeswarm package
 #'
 #' @param austraits austraits data object
-#' @param trait_name Name of trait to plot
+#' @param trait Name of trait to plot
 #' @param y_axis_category One of `dataset_id`, `family`
 #' @param highlight specify a group to highlight
 #' @param hide_ids add label on y_axis?
@@ -17,22 +17,22 @@
 #' @export
 
 #
-plot_trait_distribution_beeswarm <- function(austraits, trait_name, y_axis_category, highlight=NA, hide_ids = FALSE) {
+plot_trait_distribution_beeswarm <- function(austraits, trait, y_axis_category, highlight = NA, hide_ids = FALSE) {
 
   # Subset data to this trait
   austraits_trait <-
-    austraits$traits %>% filter(trait_name == trait_name) %>%
-    mutate(value = as.numeric(.data$value))
+    austraits$traits %>% dplyr::filter(.data$trait_name == trait) %>%
+    dplyr::mutate(value = as.numeric(.data$value))
 
-  my_shapes = c("_min" = 60, "_mean" = 16, "_max" =62, "unknown" = 18)
+  my_shapes <- c("_min" = 60, "_mean" = 16, "_max" = 62, "unknown" = 18)
 
   as_shape <- function(value_type) {
     p <- rep("unknown", length(value_type))
 
     p[grepl("mean", value_type)] <- "_mean" #16
     p[grepl("min", value_type)] <- "_min" #60
     p[grepl("max", value_type)] <- "_max" #62
-    factor(p, levels=names(my_shapes))
+    factor(p, levels = names(my_shapes))
   }
 
   tax_info <- austraits$taxa
@@ -43,57 +43,58 @@ plot_trait_distribution_beeswarm <- function(austraits, trait_name, y_axis_categ
     dplyr::left_join(by = "taxon_name", tax_info)
 
   # Define grouping variables and derivatives
-  if(!y_axis_category %in% names(data)){
+  if (!y_axis_category %in% names(data)) {
     stop("Incorrect grouping variable! Currently implemented for `family` or `dataset_id`")
   }
 
   # define grouping variable, ordered by group-level by mean values
   # use log_value where possible
-  if(min(data$value, na.rm=TRUE) > 0 ) {
+  if (min(data$value, na.rm = TRUE) > 0) {
     data$value2 <- log10(data$value)
   } else {
     data$value2 <- data$value
   }
-  data$Group = forcats::fct_reorder(data[[y_axis_category]], data$value2, na.rm=TRUE)
+
+  data$Group <- forcats::fct_reorder(data[[y_axis_category]], data$value2, na.rm = TRUE)
 
   n_group <- levels(data$Group) %>% length()
 
-  # set colour to be alternating
-  data$colour = ifelse(data$Group %in% levels(data$Group)[seq(1, n_group, by=2)],
+  # Set colour to be alternating
+  data$colour <- ifelse(data$Group %in% levels(data$Group)[seq(1, n_group, by = 2)],
                        "a", "b")
 
-  # set colour of group to highlight
-  if(!is.na(highlight) & highlight %in% data$Group) {
+  # Set colour of group to highlight
+  if (!is.na(highlight) && highlight %in% data$Group) {
     data <- dplyr::mutate(data, colour = ifelse(.data$Group %in% highlight, "c", .data$colour))
   }
 
-  vals <- list(minimum = purrr::pluck(austraits, "definitions", trait_name, "allowed_values_min"),
-           maximum = purrr::pluck(austraits, "definitions", trait_name, "allowed_values_max"))
+  vals <- list(minimum = purrr::pluck(austraits, "definitions", trait, "allowed_values_min"),
+           maximum = purrr::pluck(austraits, "definitions", trait, "allowed_values_max"))
 
   range <- (vals$maximum/vals$minimum)
 
   # Check range on y-axis
   y.text <- ifelse(n_group > 20, 0.75, 1)
-  heights = c(1, max(1, n_group/7))
+  heights <- c(1, max(1, n_group / 7))
 
   # Top plot - plain histogram of data
   p1 <-
-    ggplot2::ggplot(data, ggplot2::aes(x=.data$value)) +
-    ggplot2::geom_histogram(ggplot2::aes(y = ..density..), color="darkgrey", fill="darkgrey", bins=50) +
-    ggplot2::geom_density(color="black") +
+    ggplot2::ggplot(data, ggplot2::aes(x = .data$value)) +
+    ggplot2::geom_histogram(ggplot2::aes(y = ..density..), color = "darkgrey", fill = "darkgrey", bins = 50) +
+    ggplot2::geom_density(color = "black") +
     ggplot2::xlab("") + ggplot2::ylab("All data") +
     ggplot2::theme_bw()  +
     ggplot2::theme(legend.position = "none",
           panel.grid.minor = ggplot2::element_blank(),
           panel.grid.major = ggplot2::element_blank(),
-          axis.ticks.y= ggplot2::element_blank(),
-          axis.text= ggplot2::element_blank(),
+          axis.ticks.y = ggplot2::element_blank(),
+          axis.text = ggplot2::element_blank(),
           panel.background = ggplot2::element_blank()
     )
   # Second plot -- dots by groups, using ggbeeswarm package
   p2 <-
     ggplot2::ggplot(data, ggplot2::aes(x = .data$value, y = .data$Group, colour = .data$colour, shape = .data$shapes)) +
-    ggbeeswarm::geom_quasirandom(groupOnX=FALSE) +
+    ggbeeswarm::geom_quasirandom(groupOnX = FALSE) +
     ggplot2::ylab(paste("By ", y_axis_category)) +
     # inclusion of custom shapes: for min, mean, unknown
     # NB: this single line of code makes function about 4-5 slower for some reason
@@ -102,45 +103,45 @@ plot_trait_distribution_beeswarm <- function(austraits, trait_name, y_axis_categ
     ggplot2::theme(legend.position = "none",
           panel.grid.major.x = ggplot2::element_blank(),
           panel.grid.minor.x = ggplot2::element_blank(),
-          axis.text.x = ggplot2::element_text(size=ggplot2::rel(1.25)),
-          axis.text.y = ggplot2::element_text(size=ggplot2::rel(y.text))
+          axis.text.x = ggplot2::element_text(size = ggplot2::rel(1.25)),
+          axis.text.y = ggplot2::element_text(size = ggplot2::rel(y.text))
     ) #+
    # guides(colour=FALSE)
 
 
-  if(hide_ids) {
+  if (hide_ids) {
     p2 <- p2 + ggplot2::theme(axis.text.y = ggplot2::element_blank())
   }
 
   #Sourced from https://gist.github.com/bbolker/5ba6a37d64b06a176e320b2b696b6733
-  scientific_10 <- function(x,suppress_ones=TRUE) {
+  scientific_10 <- function(x, suppress_ones = TRUE) {
     s <- scales::scientific_format()(x)
     ## substitute for exact zeros
-    s[s=="0e+00"] <- "0"
+    s[s == "0e+00"] <- "0"
     ## regex: [+]?  = "zero or one occurrences of '+'"
-    s2 <- gsub("e[+]?", " %*% 10^", s )
+    s2 <- gsub("e[+]?", " %*% 10^", s)
     ## suppress 1 x
-    if (suppress_ones) s2 <- gsub("1 %\\*% +","",s2)
-    parse(text=s2)
+    if (suppress_ones) s2 <- gsub("1 %\\*% +", "", s2)
+    parse(text = s2)
   }
 
   # Define scale on x-axis and transform to log if required
-  if(vals$minimum !=0 & range > 20) {
+  if (vals$minimum != 0 && range > 20) {
     #log transformation
     p1 <- p1 +
-      ggplot2::scale_x_log10(name="",
+      ggplot2::scale_x_log10(name = "",
                     breaks = scales::breaks_log(),
                     labels = scientific_10,
-                    limits=c(vals$minimum, vals$maximum))
+                    limits = c(vals$minimum, vals$maximum))
     p2 <- p2 +
-      ggplot2::scale_x_log10(name=paste(trait_name, ' (', data$unit[1], ')'),
+      ggplot2::scale_x_log10(name = paste(trait, " (", data$unit[1], ")"),
                     breaks = scales::breaks_log(),
                     labels = scientific_10,
-                    limits=c(vals$minimum, vals$maximum))
+                    limits = c(vals$minimum, vals$maximum))
   } else {
-    p1 <- p1 + ggplot2::scale_x_continuous(limits=c(vals$minimum, vals$maximum))
-    p2 <- p2 + ggplot2::scale_x_continuous(limits=c(vals$minimum, vals$maximum)) +
-      ggplot2::xlab(paste(trait_name, ' (', data$unit[1], ')'))
+    p1 <- p1 + ggplot2::scale_x_continuous(limits = c(vals$minimum, vals$maximum))
+    p2 <- p2 + ggplot2::scale_x_continuous(limits = c(vals$minimum, vals$maximum)) +
+      ggplot2::xlab(paste(trait, " (", data$unit[1], ")"))
 
   }
 
@@ -152,5 +153,5 @@ plot_trait_distribution_beeswarm <- function(austraits, trait_name, y_axis_categ
   p2 <- f(p2)
   # Fix width of second plot to be same as bottom using ggplot_table
   p1$widths[2:3] <- p2$widths[2:3]
-  gridExtra::grid.arrange(p1, p2, nrow=2, widths=c(1), heights=heights)
+  gridExtra::grid.arrange(p1, p2, nrow = 2, widths = c(1), heights = heights)
 }
diff --git a/R/process.R b/R/process.R
@@ -250,10 +250,28 @@ dataset_process <- function(filename_data_raw,
       traits %>% dplyr::filter(!(!is.na(.data$error) & (.data$error == "Missing value")))
   }
 
-  # Todo - resource_metadata
-  # - Add contributors
+  # Update metadata
+  metadata <- resource_metadata
+
+  if (is.null(metadata[["related_identifiers"]][1])) {
+    metadata[["related_identifiers"]] <- list()
+  }
+
+  metadata[["related_identifiers"]] <-
+    util_append_to_list(
+      metadata[["related_identifiers"]],
+      list(
+        related_identifier_type = "url",
+        identifier = "https://github.com/traitecoevo/traits.build",
+        relation_type = "isCompiledBy",
+        resource_type = "software",
+        version = as.character(packageVersion("traits.build"))
+      )
+    )
+
 
   # Combine for final output
+  ret <- 
   list(
     traits = traits %>% dplyr::filter(is.na(.data$error)) %>% dplyr::select(-dplyr::all_of(c("error", "unit_in"))),
     locations = locations,
@@ -272,9 +290,13 @@ dataset_process <- function(filename_data_raw,
     sources = sources,
     definitions = definitions,
     schema = schema,
-    metadata = resource_metadata,
+    metadata = metadata,
     build_info = list(session_info = utils::sessionInfo())
   )
+
+  class(ret) <- c("list", "traits.build")
+
+  ret
 }
 
 #' Build dataset
@@ -1859,6 +1881,9 @@ build_combine <- function(..., d = list(...)) {
                       session_info = utils::sessionInfo()
                       )
               )
+
+  class(ret) <- c("list", "traits.build")
+
   ret
 }
 

diff --git a/R/testdata.R b/R/testdata.R
@@ -305,7 +305,7 @@ dataset_test_worker <-
     }
 
     expect_list <- function(data, info) {
-      expect_true(class(data) == "list", info = sprintf("%s - is not a list", info))
+      expect_true("list" %in% class(data), info = sprintf("%s - is not a list", info))
     }
 
     expect_list_names_valid <- function(data, info, label) {

diff --git a/inst/support/report_dataset.Rmd b/inst/support/report_dataset.Rmd
@@ -567,7 +567,7 @@ if (nrow(filter(data_study$excluded_data, error == "Value out of allowable range
     writeLines()
 }
 
- plot_trait_distribution_beeswarm(austraits, trait, "dataset_id", highlight = dataset_id, hide_ids = TRUE)
+  traits.build::plot_trait_distribution_beeswarm(austraits, trait, "dataset_id", highlight = dataset_id, hide_ids = TRUE)
 
   writeLines(c(""))
 

diff --git a/man/plot_trait_distribution_beeswarm.Rd b/man/plot_trait_distribution_beeswarm.Rd
diff --git a/tests/testthat/functions.R b/tests/testthat/functions.R
@@ -54,7 +54,7 @@ expect_not_NA <- function(object, info = NULL, label = NULL) {
 
 
 expect_list <- function(data, info) {
-  expect_true(class(data) == "list", info = info)
+  expect_true("list" %in% class(data), info = info)
 }
 
 

diff --git a/tests/testthat/test-process.R b/tests/testthat/test-process.R
@@ -24,6 +24,7 @@ test_that("`dataset_process` is working", {
   expect_no_error(austraits_names <- schema$austraits$elements %>% names())
   expect_no_error(x <- dataset_process(test_data, test_config, schema, resource_metadata, unit_conversions))
   expect_type(x, "list")
+  expect_equal(class(x), c("list", "traits.build"))
   expect_length(x, 13)
   expect_named(x, austraits_names)
   expect_equal(nrow(x$excluded_data), 0)

diff --git a/tests/testthat/test-setup.R b/tests/testthat/test-setup.R
@@ -594,7 +594,9 @@ test_that("`build_setup_pipeline` is working", {
   expect_true(file.copy("data/Test_2022/test-metadata.yml", "data/Test_2022/metadata.yml", overwrite = TRUE))
 
   expect_no_error(zip::unzip("config/testgit.zip"))
-  expect_no_error(sha <- git2r::sha(git2r::last_commit()))
+  # Expect no error if not within a git repo
+  expect_no_error(sha <- util_get_SHA("../../.."))
+  expect_no_error(sha <- util_get_SHA())
   # Expect error if path or method is wrong
   expect_error(build_setup_pipeline(path = "Datas"))
   expect_error(build_setup_pipeline(method = "grrrr"))
@@ -666,8 +668,29 @@ test_that("`build_setup_pipeline` is working", {
   expect_equal(austraits$build_info$git_SHA, sha)
   expect_equal(austraits$build_info$git_SHA, "6c73238d8d048781d9a4f5239a03813be313f0dd")
 
+  # Check output lists have required parts
+  ## Todo add mode here
+
+  ## sources
+
+  ## metadata
+
+  ## schema
+  expect_equal(austraits$schema, get_schema())
+
+  ## Compiled by traits.build
+  traits.build_tag <- last(austraits$metadata$related_identifiers)
+  expected_output <- list(
+    related_identifier_type = "url",
+    identifier = "https://github.com/traitecoevo/traits.build",
+    relation_type = "isCompiledBy",
+    resource_type = "software",
+    version = as.character(packageVersion("traits.build"))
+  )
+  expect_equal(traits.build_tag, expected_output)
   #expect_length(austraits_raw$taxa, 14) #not valid test with new `dataset_update_taxonomy setup`
   #expect_length(austraits$taxa, 14) #not valid test with new `dataset_update_taxonomy setup`
+
   expect_equal(nrow(austraits$taxa), nrow(austraits_raw$taxa))
 
   # Compare products from three methods, except `build_info`
@@ -708,7 +731,7 @@ test_that("reports and plots are produced", {
   # Not testing right now
   #expect_no_error(
     #p <-
-      #plot_trait_distribution_beeswarm(
+      #traits.build::plot_trait_distribution_beeswarm(
         #austraits, "huber_value", "dataset_id", highlight = "Test_2022", hide_ids = TRUE)
   #)
   expect_silent(

diff --git a/tests/testthat/test-usage.R b/tests/testthat/test-usage.R
@@ -5,5 +5,7 @@ austraits <- readRDS("test_austraits.rds")
 # Note, requires existnec of "test_austraits.rds", generated from `test-process.R`
 
 test_that("plots", {
-  expect_invisible(suppressMessages(austraits %>% plot_trait_distribution_beeswarm("wood_density", "dataset_id", "Test_2022")))
+  expect_invisible(suppressMessages(
+    austraits %>% traits.build::plot_trait_distribution_beeswarm("wood_density", "dataset_id", "Test_2022")
+  ))
 })