#201 basic ratio estimator based on BV and CL weight data
rix133 committed Oct 17, 2024
1 parent 13b9e67 commit 4ccb8f7
Showing 8 changed files with 279 additions and 134 deletions.
17 changes: 3 additions & 14 deletions R/addCLtoLowerCS.R
@@ -10,11 +10,11 @@
#' The names of the list should match column names in any of the `CS` tables.
#' @param strataListCL A named list of filter criteria for subsetting the `CL` data in the `RDBESDataObject`.
#' The names of the list should match column names in the `CL` table.
#' @param samplingEventsTable A character string specifying the sampling events table name in the `RDBESDataObject`.
#' @param combineStrata Logical, if `TRUE`, strata in the `strataListCS` are combined using a vertical bar (`|`).
#' @param lowerHierarchy A character string specifying the level of the lower hierarchy table to which the CL data will be added.
#' Currently, only "C" is supported, i.e. BV data only.
#' @param CLfields A character vector of field names from the `CL` table that will be summed and added as new columns to the lower-level biological data.
#' @param verbose Logical, if `TRUE`, the function prints informative text.
#'
#' @return A data.table containing the biological data from the lower hierarchy with added strata information from the `CL` table and
#' the sum of the specified fields from the `CL` data.
@@ -35,22 +35,14 @@
#' CLmetier6 = "OTM_SPF_16-31_0_0",
#' CLspecFAO = "SPR")
#' biolCL <- addCLtoLowerCS(rdbesObject, strataListCS, strataListCL,
#' samplingEventsTable = "VS",
#' combineStrata = TRUE,
#' lowerHierarchy = "C",
#' CLfields = c("CLoffWeight"))
#' }
#'
#' @seealso \code{\link{getLowerTableSubsets}}, \code{\link{upperTblData}}
#' @export
addCLtoLowerCS <- function(rdbes, strataListCS, strataListCL, samplingEventsTable, combineStrata =T, lowerHierarchy = "C", CLfields = c("CLoffWeight")){
if(inherits(rdbes, "RDBESDataObject")) {
tableNames <- names(summary(rdbes)$data)
#select only the relevant hierarchy table names
rdbesTbl <- rdbes[tableNames]
} else {
stop("rdbes must be of class RDBESDataObject")
}
addCLtoLowerCS <- function(rdbes, strataListCS, strataListCL, combineStrata = TRUE, lowerHierarchy = "C", CLfields = c("CLoffWeight"), verbose = FALSE){

# Function to subset data.table based on criteria list
subset_dt <- function(dt, criteria) {
@@ -78,10 +70,6 @@ addCLtoLowerCS <- function(rdbes, strataListCS, strataListCL, samplingEventsTabl
#get the sampling data
biolData <- getLowerTableSubsets(strataListCS, tblName, rdbes, combineStrata = combineStrata)

#get the trip count
biolTrips <- upperTblData(tblId, biolData[[tblId]], rdbesTbl,samplingEventsTable)
biolTrips$samplingEvents <- nrow(biolTrips)


#get the CL data
CL <- subset_dt(rdbes$CL, strataListCL)
@@ -91,6 +79,7 @@ addCLtoLowerCS <- function(rdbes, strataListCS, strataListCL, samplingEventsTabl
x_new <- ifelse(length(x) == 1, x, paste0(x, collapse = "|"))
rep(x_new, nrow(biolData))
})

strataList <- as.data.table(strataList)
biolData <- cbind(biolData, strataList)

92 changes: 92 additions & 0 deletions R/doBVestimCACNUM.R
@@ -0,0 +1,92 @@
#' Estimate Catch at Number (CANUM) for Biological Variables
#'
#' This function estimates catch at number (CANUM) for a specified biological variable, such as age or length. It aggregates data based on specified columns and generates a "plus group" for the highest value in the defined classes. The function supports grouping by various units (e.g., age, length, weight) and calculates required indices, totals, and proportions for the groups.
#'
#' @param bv A `data.table` containing biological data, with columns for the biological variable, class units (e.g., `Ageyear`, `Lengthmm`, `Weightg`), and other relevant variables.
#' @param addColumns A character vector of additional column names used to group the data for aggregation (e.g., `BVfishId` and other identifiers).
#' @param classUnits A character string specifying the class units of the biological variable to use for grouping (e.g., "Ageyear", "Lengthmm", "Weightg"). Default is "Ageyear".
#' @param classBreaks A numeric vector specifying the breakpoints for classifying the biological variable. The last value defines the lower bound of the "plus group". Default is `1:8` for age groups.
#' @param verbose Logical, if `TRUE`, prints detailed information about the process. Default is `FALSE`.
#'
#' @return A `data.table` containing the aggregated results, including groupings, calculated means, proportions, indices, and totals for the specified biological variable.
#'
#' @details The function performs the following steps:
#' \itemize{
#' \item Validates the presence of the `classUnits` in the biological variable data.
#' \item Reshapes the input data using `dcast` and groups the biological variable into classes using `cut()`.
#' \item Aggregates mean weights and lengths by the defined classes, along with calculating proportions and indices based on the sample size.
#' \item A "plus group" is created for values exceeding the highest `classBreaks` value.
#' \item Calculates total weights, catch numbers, and performs a sanity check to ensure there are no rounding errors in the final results.
#' }
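#'
#' The sketch below is illustrative only and is not run: `bv` is assumed to be a
#' long-format BV data.table (one row per measurement) that already carries a
#' `sumCLoffWeight` column, e.g. as produced by `addCLtoLowerCS()`.
#' @examples
#' \dontrun{
#' lenCANUM <- doBVestimCANUM(bv,
#'                            addColumns = c("sumCLoffWeight"),
#'                            classUnits = "Lengthmm",
#'                            classBreaks = seq(70, 130, 10))
#' }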
#' @export
doBVestimCANUM <- function(bv, addColumns,
classUnits = "Ageyear",
classBreaks = 1:8,
verbose = FALSE){
rightF <- "BVvalUnitScale"
# the class unit must be one of "Sex", "Lengthmm", "Ageyear", "Weightg", "SMSF"
if(!(classUnits %in% unique(bv[[rightF]]))){
stop("The class unit must be present in data column BVvalUnitScale ",
"the available values are: ", paste0(unique(bv[[rightF]]), collapse = ", "))
}

#extract raw values
leftF <- paste0(c("BVfishId", addColumns), collapse = "+")

bv_wide <- data.table::dcast(bv, formula(paste0(leftF, "~", rightF)), value.var = "BVvalueMeas")
bv_wide$target <- as.numeric(bv_wide[[classUnits]])
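# bv_wide should now hold one row per fish, with one column per value of
# BVvalUnitScale (e.g. Ageyear, Lengthmm, Weightg); 'target' is the chosen
# class variable coerced to numeric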

classLabs <- switch(classUnits,
Ageyear = c(classBreaks[-length(classBreaks)], paste0(max(classBreaks), "+")),
c(paste0(classBreaks[-length(classBreaks)], "-", classBreaks[-1]), paste0(max(classBreaks), "+")))

# Create the 'plus group' by using cut() to assign groups based on classBreaks
bv_wide$Group <- cut(bv_wide$target, breaks = c(classBreaks, Inf),
include.lowest = TRUE, right = FALSE,
labels = classLabs)
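# e.g. with classBreaks = 1:8 a 9-year-old fish falls in the "8+" group: the last
# interval is [8, Inf) and right = FALSE keeps every class closed on the left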

bv_wide$Lengthmm <- as.numeric(bv_wide$Lengthmm)
bv_wide$Weightg <- as.numeric(bv_wide$Weightg)

#aggregate values
a <- bv_wide[, .(WeightgMean = mean(Weightg, na.rm = TRUE),
WeightgLen = sum(!is.na(Weightg)),
LengthmmMean = mean(Lengthmm, na.rm = TRUE)),
by = c("Group",addColumns)]

b <- bv_wide[, .(lenMeas = sum(!is.na(Lengthmm)),
targetMeas = sum(!is.na(Group))),
by = addColumns]

targetWeights <- merge(a, b, by = addColumns)

#remove the NA row
targetWeights <- targetWeights[!is.na(targetWeights$Group), ]

#add extra columns
#targetWeights$MeanLengthCm <- targetWeights$Lengthmm / 10
targetWeights$plusGroup <- classBreaks[length(classBreaks)]

#calculate required values
targetWeights$propSample <- targetWeights$WeightgLen / targetWeights$targetMeas
targetWeights$WeightIndex <- targetWeights$propSample * (targetWeights$WeightgMean / 1000)

# Calculate the sum of WeightIndex for each group defined by addColumns
targetWeights[, WeightIndexSum := sum(WeightIndex), by = addColumns]

targetWeights$TWCoef <- targetWeights$sumCLoffWeight / targetWeights$WeightIndexSum
targetWeights$totWeight <- targetWeights$WeightIndex * targetWeights$TWCoef
targetWeights$totNum <- targetWeights$totWeight / (targetWeights$WeightgMean / 1000)
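# In ratio-estimator terms (assuming sumCLoffWeight, the CL landed weight for the
# stratum, is carried in via addColumns): WeightIndex is each group's expected
# sample weight per measured fish (kg), TWCoef raises these shares so that they
# sum to the CL landed weight, totWeight is the landed weight attributed to the
# group, and totNum converts it back to numbers using the group mean weight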

# Sanity check with tolerance to avoid rounding error
weights <- targetWeights$totNum * (targetWeights$WeightgMean / 1000)
expected_sum <- sum(unique(targetWeights$sumCLoffWeight))


# Use all.equal to compare with tolerance or manually check the difference
if(!isTRUE(all.equal(sum(weights), expected_sum))) {
stop("Strange problem: sums do not match within tolerance")
}

targetWeights
}
42 changes: 27 additions & 15 deletions R/getLowerTableSubsets.R
@@ -11,6 +11,7 @@
#' @param combineStrata A logical value indicating whether to include the strata information in the result.
#' If `TRUE`, and any stratum has more than one value, those values are collapsed into a single string
#' and appended to the result with a warning.
#' @param verbose A logical value indicating whether to print informative text.
#'
#' @return A unique data frame containing the rows of the target lower level table that are associated with
#' the given values of the upper table field in each subset. If `combineStrata = TRUE`, the result will also include
@@ -20,12 +21,11 @@
#' the values from each subset in the upper tables. It then ensures that only unique rows are returned,
#' based on the ID column of the target table.
#'
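#' A hedged usage sketch (not run); `rdbesObject` and the filter fields below are
#' placeholders rather than guaranteed RDBES column names:
#' @examples
#' \dontrun{
#' strataListCS <- list(TEstratumName = c("Q1", "Q2"), VSencrVessCode = "VES1")
#' leRows <- getLowerTableSubsets(strataListCS, "LE", rdbesObject,
#'                                combineStrata = TRUE)
#' }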
#' @export
getLowerTableSubsets <- function(subsets, tblName, rdbesTables, combineStrata=F){
#check if tbls is of class "RDBESDataObject"
getLowerTableSubsets <- function(subsets, tblName, rdbesTables, combineStrata = TRUE, verbose = FALSE) {
# Check if rdbesTables is of class "RDBESDataObject"
if(inherits(rdbesTables, "RDBESDataObject")) {
tableNames <- names(summary(rdbesTables)$data)
#select only the relevant hierarchy table names
# Select only the relevant hierarchy table names
rdbesTables <- rdbesTables[tableNames]
} else {
stop("rdbesTables must be of class RDBESDataObject")
@@ -35,42 +35,54 @@ getLowerTableSubsets <- function(subsets, tblName, rdbesTables, combineStrata=F)
}

res <- list()
if(verbose) print("Getting lower table data")

# Process each subset
for(ss in names(subsets)){
res[[ss]] <- lowerTblData(ss, subsets[[ss]], rdbesTables, tblName,F)
res[[ss]] <- lowerTblData(ss, subsets[[ss]], rdbesTables, tblName, verbose)
}

# Function to get the intersection of IDs
intesectIDs <- function(x, y, tblName){
idCol <- paste0(tblName, "id")
if(is.data.frame(x)){ start <- x[[idCol]]} else{start <- x}

if(is.data.frame(x)){
start <- x[[idCol]]
} else {
start <- x
}
intersect(start, y[[idCol]])
}

# Get the IDs from the first subset if only one subset is present
if(length(res) == 1) {
ids <- res[[1]][[paste0(tblName, "id")]]
} else{
ids <- Reduce(function(x, y) {intesectIDs(x, y, tblName) }, res)
ids <- Reduce(function(x, y) {intesectIDs(x, y, tblName)}, res)
}

# Bind the data together and filter based on intersected IDs
res <- data.table::rbindlist(res)
res <- res[get(paste0(tblName, "id")) %in% ids]
res <- unique(res, by = paste0(tblName, "id"))

if(combineStrata){
# Check the length of each element in the list
if(combineStrata) {
# Collapse multiple strata values into a single string if combineStrata = TRUE
lengths <- sapply(subsets, length)
if(any(lengths>1)){
# If the length is more than 1, collapse it into a single string
if(any(lengths > 1)){
subsets[lengths > 1] <- sapply(subsets[lengths > 1], paste0, collapse = "|")
items <- subsets[lengths > 1]
for(i in 1:length(items)){
warning(names(items[i]),
" is collapsed in the result into: \"",items[i],"\"\n")
}
}

} else {
stop("combineStrata must be TRUE for now")
}

res <- cbind(res,as.data.frame(subsets))

# Combine the data with the strata subsets
res <- cbind(res, as.data.frame(subsets))

return(res)
}

10 changes: 5 additions & 5 deletions R/lowerTblData.R
@@ -14,11 +14,11 @@
#' the given values of the upper table field.
#'
#' @examples
#'DE <- data.table(DEid = c(1, 2, 3, 4), SDid = c(1, 2, 3, 4))
#'SD <- data.table(SDid = c(1, 2, 3, 4), TEid = c(1, 2, 3, 4))
#'TE <- data.table(SDid = c(1, 2, 3, 4), TEid = c(1, 2, 3, 4))
#'VS <- data.table(TEid = c(1, 2, 3, 4), VSid = c(1, 2, 3, 4))
#'LE <- data.table(VSid = 1:5, LEid = 1:5, value = c(10, 20, 3, 4, 6))
#'DE <- data.table::data.table(DEid = c(1, 2, 3, 4), SDid = c(1, 2, 3, 4))
#'SD <- data.table::data.table(SDid = c(1, 2, 3, 4), TEid = c(1, 2, 3, 4))
#'TE <- data.table::data.table(SDid = c(1, 2, 3, 4), TEid = c(1, 2, 3, 4))
#'VS <- data.table::data.table(TEid = c(1, 2, 3, 4), VSid = c(1, 2, 3, 4))
#'LE <- data.table::data.table(VSid = 1:5, LEid = 1:5, value = c(10, 20, 3, 4, 6))
#' tblsSprat <- list( DE = DE ,SD = SD, TE = TE, VS = VS, LE = LE )
#'
#' lowerTblData("TEid", c(4), tblsSprat, "LE", T)
10 changes: 5 additions & 5 deletions tests/testthat/test-addCLtoLowerCS.R
@@ -10,8 +10,8 @@ strataListCL <- list(CLarea="27.3.d.28.1", CLquar = 1, CLmetier6 = "OTM_SPF_16-3
# Test 1: Expect error if rdbes is not of class RDBESDataObject
test_that("addCLtoLowerCS throws an error if rdbes is not of class RDBESDataObject", {
expect_error(
addCLtoLowerCS(list(), strataListCS, strataListCL, samplingEventsTable = "VS", combineStrata = TRUE, lowerHierarchy = "C", CLfields = c("CLoffWeight")),
"rdbes must be of class RDBESDataObject"
addCLtoLowerCS(list(), strataListCS, strataListCL, combineStrata = TRUE, lowerHierarchy = "C", CLfields = c("CLoffWeight")),
"No CL data in the input RDBESDataObject"
)
})

@@ -20,14 +20,14 @@ test_that("addCLtoLowerCS throws an error if CL data is missing in rdbes", {
mock_rdbes_no_CL <- mock_rdbes
mock_rdbes_no_CL$CL <- NULL
expect_error(
addCLtoLowerCS(mock_rdbes_no_CL, strataListCS, strataListCL, samplingEventsTable = "VS", combineStrata = TRUE, lowerHierarchy = "C", CLfields = c("CLoffWeight")),
addCLtoLowerCS(mock_rdbes_no_CL, strataListCS, strataListCL, combineStrata = TRUE, lowerHierarchy = "C", CLfields = c("CLoffWeight")),
"No CL data in the input RDBESDataObject"
)
})

# Test 3: Check correct output with valid input
test_that("addCLtoLowerCS adds the correct sumCLoffWeight column with valid input", {
result <- expect_warning(addCLtoLowerCS(mock_rdbes, strataListCS, strataListCL, samplingEventsTable = "VS", combineStrata = TRUE, lowerHierarchy = "C", CLfields = c("CLoffWeight")))
result <- expect_warning(addCLtoLowerCS(mock_rdbes, strataListCS, strataListCL, combineStrata = TRUE, lowerHierarchy = "C", CLfields = c("CLoffWeight")))

# Check that the result has the correct columns
expect_true("sumCLoffWeight" %in% names(result))
@@ -37,7 +37,7 @@ test_that("addCLtoLowerCS adds the correct sumCLoffWeight column with valid inpu
# Test 4: Check that the function handles unsupported lowerHierarchy correctly
test_that("addCLtoLowerCS throws an error if lowerHierarchy is unsupported", {
expect_error(
addCLtoLowerCS(mock_rdbes, strataListCS, strataListCL, samplingEventsTable = "VS", combineStrata = TRUE, lowerHierarchy = "A", CLfields = c("CLoffWeight")),
addCLtoLowerCS(mock_rdbes, strataListCS, strataListCL, combineStrata = TRUE, lowerHierarchy = "A", CLfields = c("CLoffWeight")),
"Lower hierarchy A not yet supported"
)
})
84 changes: 84 additions & 0 deletions tests/testthat/test-doBVestimCACNUM.R
@@ -0,0 +1,84 @@
# Step 1: Generate independent random normal variables
set.seed(1) # fix the seed so the simulated test data are reproducible
n <- 100 # Number of samples
Lengthmm <- rnorm(n, mean = 100, sd = 1) # Lengthmm variable
Ageyear <- rnorm(n, mean = 2, sd = 1) # Ageyear variable
Weightg <- rnorm(n, mean = 10, sd = 1) # Weightg variable


biolCLQ1 <- data.table(
BVfishId = rep(1:100, each = 3),
# interleave the measurements so each fish gets its own length, age and weight
BVvalueMeas = c(rbind(Lengthmm, Ageyear, Weightg)),
BVvalUnitScale = rep(c("Lengthmm", "Ageyear", "Weightg"), 100),
sumCLoffWeight = 396210
)

# Test 1: Check the output structure of BVestimCANUM
test_that("doBVestimCANUM returns correct structure", {
lenCANUMQ1 <- doBVestimCANUM(biolCLQ1, c("sumCLoffWeight"),
classUnits = "Lengthmm",
classBreaks = seq(70, 130, 10),
verbose = FALSE)

# Check that the output is a data.table
expect_true(is.data.table(lenCANUMQ1))

# Check that the expected columns are present
expected_columns <- c("Group", "WeightgMean", "WeightgLen", "lenMeas", "targetMeas",
"propSample", "WeightIndex", "totWeight", "totNum")
expect_true(all(expected_columns %in% names(lenCANUMQ1)))
})

# Test 2: Check that Group is correctly classified
test_that("doBVestimCANUM correctly classifies Lengthmm groups", {
lenCANUMQ1 <- doBVestimCANUM(biolCLQ1, c("sumCLoffWeight"),
classUnits = "Lengthmm",
classBreaks = seq(70, 130, 10),
verbose = FALSE)

# Check that the Group column contains the correct labels
expected_groups <- c("70-80", "80-90", "90-100", "100-110", "110-120", "120-130", "130+")
actual_groups <- unique(lenCANUMQ1$Group)
expect_true(all(actual_groups %in% expected_groups))
})

# Test 3: Check WeightgMean calculation
test_that("doBVestimCANUM calculates WeightgMean correctly", {
lenCANUMQ1 <- doBVestimCANUM(biolCLQ1, c("sumCLoffWeight"),
classUnits = "Lengthmm",
classBreaks = seq(70, 130, 10),
verbose = FALSE)

# Ensure WeightgMean is numeric and positive
expect_true(is.numeric(lenCANUMQ1$WeightgMean))
expect_true(all(lenCANUMQ1$WeightgMean > 0))
})


# Test 5: Sanity check for total weights
test_that("doBVestimCANUM performs a sanity check for total weights", {
lenCANUMQ1 <- doBVestimCANUM(biolCLQ1, c("sumCLoffWeight"),
classUnits = "Lengthmm",
classBreaks = seq(70, 130, 10),
verbose = FALSE)

# Check that the sum of totWeight equals sumCLoffWeight (within tolerance)
total_weight <- sum(lenCANUMQ1$totWeight)
expected_sum <- sum(unique(lenCANUMQ1$sumCLoffWeight))

expect_true(isTRUE(all.equal(total_weight, expected_sum)))
})

# Test 6: Check TWCoef calculation
test_that("doBVestimCANUM calculates TWCoef correctly", {
lenCANUMQ1 <- doBVestimCANUM(biolCLQ1, c("sumCLoffWeight"),
classUnits = "Lengthmm",
classBreaks = seq(70, 130, 10),
verbose = FALSE)

# Recalculate TWCoef
lenCANUMQ1[, expected_TWCoef := sumCLoffWeight / WeightIndexSum]

# Ensure TWCoef is correctly calculated
expect_equal(lenCANUMQ1$TWCoef, lenCANUMQ1$expected_TWCoef)
})
