#201 basic ratio estimator based on BV and CL weight data
rix133 committed Oct 17, 2024
1 parent 13b9e67 commit 4ccb8f7
Showing 8 changed files with 279 additions and 134 deletions.
17 changes: 3 additions & 14 deletions R/addCLtoLowerCS.R
@@ -10,11 +10,11 @@
#' The names of the list should match column names in any of the `CS` tables.
#' @param strataListCL A named list of filter criteria for subsetting the `CL` data in the `RDBESDataObject`.
#' The names of the list should match column names in the `CL` table.
#' @param samplingEventsTable A character string specifying the sampling events table name in the `RDBESDataObject`.
#' @param combineStrata Logical, if `TRUE`, strata in the `strataListCS` are combined using a vertical bar (`|`).
#' @param lowerHierarchy A character string specifying the level of the lower hierarchy table to which the CL data will be added.
#' Currently, only "C" is supported, i.e. BV data only.
#' @param CLfields A character vector of field names from the `CL` table that will be summed and added as new columns to the lower-level biological data.
#' @param verbose Logical, if `TRUE`, the function prints informative text.
#'
#' @return A data.table containing the biological data from the lower hierarchy with added strata information from the `CL` table and
#' the sum of the specified fields from the `CL` data.
@@ -35,22 +35,14 @@
#' CLmetier6 = "OTM_SPF_16-31_0_0",
#' CLspecFAO = "SPR")
#' biolCL <- addCLtoLowerCS(rdbesObject, strataListCS, strataListCL,
#' samplingEventsTable = "VS",
#' combineStrata = TRUE,
#' lowerHierarchy = "C",
#' CLfields = c("CLoffWeight"))
#' }
#'
#' @seealso \code{\link{getLowerTableSubsets}}, \code{\link{upperTblData}}
#' @export
addCLtoLowerCS <- function(rdbes, strataListCS, strataListCL, samplingEventsTable, combineStrata =T, lowerHierarchy = "C", CLfields = c("CLoffWeight")){
if(inherits(rdbes, "RDBESDataObject")) {
tableNames <- names(summary(rdbes)$data)
#select only the relevant hierarchy table names
rdbesTbl <- rdbes[tableNames]
} else {
stop("rdbes must be of class RDBESDataObject")
}
addCLtoLowerCS <- function(rdbes, strataListCS, strataListCL, combineStrata = TRUE, lowerHierarchy = "C", CLfields = c("CLoffWeight"), verbose = FALSE){

# Function to subset data.table based on criteria list
subset_dt <- function(dt, criteria) {
@@ -78,10 +70,6 @@ addCLtoLowerCS <- function(rdbes, strataListCS, strataListCL, samplingEventsTabl
#get the sampling data
biolData <- getLowerTableSubsets(strataListCS, tblName, rdbes, combineStrata = combineStrata)

#get the trip count
biolTrips <- upperTblData(tblId, biolData[[tblId]], rdbesTbl,samplingEventsTable)
biolTrips$samplingEvents <- nrow(biolTrips)


#get the CL data
CL <- subset_dt(rdbes$CL, strataListCL)
@@ -91,6 +79,7 @@ addCLtoLowerCS <- function(rdbes, strataListCS, strataListCL, samplingEventsTabl
x_new <- ifelse(length(x) == 1, x, paste0(x, collapse = "|"))
rep(x_new, nrow(biolData))
})

strataList <- as.data.table(strataList)
biolData <- cbind(biolData, strataList)

92 changes: 92 additions & 0 deletions R/doBVestimCACNUM.R
@@ -0,0 +1,92 @@
#' Estimate Catch at Number (CANUM) for Biological Variables
#'
#' This function estimates catch at number (CANUM) for a specified biological variable, such as age or length. It aggregates data based on specified columns and generates a "plus group" for the highest value in the defined classes. The function supports grouping by various units (e.g., age, length, weight) and calculates required indices, totals, and proportions for the groups.
#'
#' @param bv A `data.table` containing biological data, with columns for the biological variable, class units (e.g., `Ageyear`, `Lengthmm`, `Weightg`), and other relevant variables.
#' @param addColumns A character vector of additional column names used to group the data for aggregation (e.g., `BVfishId` and other identifiers).
#' @param classUnits A character string specifying the class units of the biological variable to use for grouping (e.g., "Ageyear", "Lengthmm", "Weightg"). Default is "Ageyear".
#' @param classBreaks A numeric vector specifying the breakpoints for classifying the biological variable. The last value defines the lower bound of the "plus group". Default is `1:8` for age groups.
#' @param verbose Logical, if `TRUE`, prints detailed information about the process. Default is `FALSE`.
#'
#' @return A `data.table` containing the aggregated results, including groupings, calculated means, proportions, indices, and totals for the specified biological variable.
#'
#' @details The function performs the following steps:
#' \itemize{
#' \item Validates the presence of the `classUnits` in the biological variable data.
#' \item Reshapes the input data using `dcast` and groups the biological variable into classes using `cut()`.
#' \item Aggregates mean weights and lengths by the defined classes, along with calculating proportions and indices based on the sample size.
#' \item A "plus group" is created for values exceeding the highest `classBreaks` value.
#' \item Calculates total weights, catch numbers, and performs a sanity check to ensure there are no rounding errors in the final results.
#' }
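#'
#' The sketch below is illustrative only and is not run: `bv` is assumed to be a
#' long-format BV data.table (one row per measurement) that already carries a
#' `sumCLoffWeight` column, e.g. as produced by `addCLtoLowerCS()`.
#' @examples
#' \dontrun{
#' lenCANUM <- doBVestimCANUM(bv,
#'                            addColumns = c("sumCLoffWeight"),
#'                            classUnits = "Lengthmm",
#'                            classBreaks = seq(70, 130, 10))
#' }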
#' @export
doBVestimCANUM <- function(bv, addColumns,
classUnits = "Ageyear",
classBreaks = 1:8,
verbose = FALSE){
rightF <- "BVvalUnitScale"
# the class unit must be one of "Sex", "Lengthmm", "Ageyear", "Weightg", "SMSF"
if(!(classUnits %in% unique(bv[[rightF]]))){
stop("The class unit must be present in data column BVvalUnitScale ",
"the available values are: ", paste0(unique(bv[[rightF]]), collapse = ", "))
}

#extract raw values
leftF <- paste0(c("BVfishId", addColumns), collapse = "+")

bv_wide <- data.table::dcast(bv, formula(paste0(leftF, "~", rightF)), value.var = "BVvalueMeas")
bv_wide$target <- as.numeric(bv_wide[[classUnits]])
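# bv_wide should now hold one row per fish, with one column per value of
# BVvalUnitScale (e.g. Ageyear, Lengthmm, Weightg); 'target' is the chosen
# class variable coerced to numeric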

classLabs <- switch(classUnits,
Ageyear = c(classBreaks[-length(classBreaks)], paste0(max(classBreaks), "+")),
c(paste0(classBreaks[-length(classBreaks)], "-", classBreaks[-1]), paste0(max(classBreaks), "+")))

# Create the 'plus group' by using cut() to assign groups based on classBreaks
bv_wide$Group <- cut(bv_wide$target, breaks = c(classBreaks, Inf),
include.lowest = TRUE, right = FALSE,
labels = classLabs)
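# e.g. with classBreaks = 1:8 a 9-year-old fish falls in the "8+" group: the last
# interval is [8, Inf) and right = FALSE keeps every class closed on the left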

bv_wide$Lengthmm <- as.numeric(bv_wide$Lengthmm)
bv_wide$Weightg <- as.numeric(bv_wide$Weightg)

#aggregate values
a <- bv_wide[, .(WeightgMean = mean(Weightg, na.rm = TRUE),
WeightgLen = sum(!is.na(Weightg)),
LengthmmMean = mean(Lengthmm, na.rm = TRUE)),
by = c("Group",addColumns)]

b <- bv_wide[, .(lenMeas = sum(!is.na(Lengthmm)),
targetMeas = sum(!is.na(Group))),
by = addColumns]

targetWeights <- merge(a, b, by = addColumns)

#remove the NA row
targetWeights <- targetWeights[!is.na(targetWeights$Group), ]

#add extra columns
#targetWeights$MeanLengthCm <- targetWeights$Lengthmm / 10
targetWeights$plusGroup <- classBreaks[length(classBreaks)]

#calculate required values
targetWeights$propSample <- targetWeights$WeightgLen / targetWeights$targetMeas
targetWeights$WeightIndex <- targetWeights$propSample * (targetWeights$WeightgMean / 1000)

# Calculate the sum of WeightIndex for each group defined by addColumns
targetWeights[, WeightIndexSum := sum(WeightIndex), by = addColumns]

targetWeights$TWCoef <- targetWeights$sumCLoffWeight / targetWeights$WeightIndexSum
targetWeights$totWeight <- targetWeights$WeightIndex * targetWeights$TWCoef
targetWeights$totNum <- targetWeights$totWeight / (targetWeights$WeightgMean / 1000)
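# In ratio-estimator terms (assuming sumCLoffWeight, the CL landed weight for the
# stratum, is carried in via addColumns): WeightIndex is each group's expected
# sample weight per measured fish (kg), TWCoef raises these shares so that they
# sum to the CL landed weight, totWeight is the landed weight attributed to the
# group, and totNum converts it back to numbers using the group mean weight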

# Sanity check with tolerance to avoid rounding error
weights <- targetWeights$totNum * (targetWeights$WeightgMean / 1000)
expected_sum <- sum(unique(targetWeights$sumCLoffWeight))


# Use all.equal to compare with tolerance or manually check the difference
if(!isTRUE(all.equal(sum(weights), expected_sum))) {
stop("Strange problem: sums do not match within tolerance")
}

targetWeights
}
42 changes: 27 additions & 15 deletions R/getLowerTableSubsets.R
@@ -11,6 +11,7 @@
#' @param combineStrata A logical value indicating whether to include the strata information in the result.
#' If `TRUE`, and any stratum has more than one value, those values are collapsed into a single string
#' and appended to the result with a warning.
#' @param verbose A logical value indicating whether to print informative text.
#'
#' @return A unique data frame containing the rows of the target lower level table that are associated with
#' the given values of the upper table field in each subset. If `combineStrata = TRUE`, the result will also include
@@ -20,12 +21,11 @@
#' the values from each subset in the upper tables. It then ensures that only unique rows are returned,
#' based on the ID column of the target table.
#'
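#' A hedged usage sketch (not run); `rdbesObject` and the filter fields below are
#' placeholders rather than guaranteed RDBES column names:
#' @examples
#' \dontrun{
#' strataListCS <- list(TEstratumName = c("Q1", "Q2"), VSencrVessCode = "VES1")
#' leRows <- getLowerTableSubsets(strataListCS, "LE", rdbesObject,
#'                                combineStrata = TRUE)
#' }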
#' @export
getLowerTableSubsets <- function(subsets, tblName, rdbesTables, combineStrata=F){
#check if tbls is of class "RDBESDataObject"
getLowerTableSubsets <- function(subsets, tblName, rdbesTables, combineStrata = TRUE, verbose = FALSE) {
# Check if rdbesTables is of class "RDBESDataObject"
if(inherits(rdbesTables, "RDBESDataObject")) {
tableNames <- names(summary(rdbesTables)$data)
#select only the relevant hierarchy table names
# Select only the relevant hierarchy table names
rdbesTables <- rdbesTables[tableNames]
} else {
stop("rdbesTables must be of class RDBESDataObject")
@@ -35,42 +35,54 @@ getLowerTableSubsets <- function(subsets, tblName, rdbesTables, combineStrata=F)
}

res <- list()
if(verbose) print("Getting lower table data")

# Process each subset
for(ss in names(subsets)){
res[[ss]] <- lowerTblData(ss, subsets[[ss]], rdbesTables, tblName,F)
res[[ss]] <- lowerTblData(ss, subsets[[ss]], rdbesTables, tblName, verbose)
}

# Function to get the intersection of IDs
intesectIDs <- function(x, y, tblName){
idCol <- paste0(tblName, "id")
if(is.data.frame(x)){ start <- x[[idCol]]} else{start <- x}

if(is.data.frame(x)){
start <- x[[idCol]]
} else {
start <- x
}
intersect(start, y[[idCol]])
}

# Get the IDs from the first subset if only one subset is present
if(length(res) == 1) {
ids <- res[[1]][[paste0(tblName, "id")]]
} else{
ids <- Reduce(function(x, y) {intesectIDs(x, y, tblName) }, res)
ids <- Reduce(function(x, y) {intesectIDs(x, y, tblName)}, res)
}

# Bind the data together and filter based on intersected IDs
res <- data.table::rbindlist(res)
res <- res[get(paste0(tblName, "id")) %in% ids]
res <- unique(res, by = paste0(tblName, "id"))

if(combineStrata){
# Check the length of each element in the list
if(combineStrata) {
# Collapse multiple strata values into a single string if combineStrata = TRUE
lengths <- sapply(subsets, length)
if(any(lengths>1)){
# If the length is more than 1, collapse it into a single string
if(any(lengths > 1)){
subsets[lengths > 1] <- sapply(subsets[lengths > 1], paste0, collapse = "|")
items <- subsets[lengths > 1]
for(i in 1:length(items)){
warning(names(items[i]),
" is collapsed in the result into: \"",items[i],"\"\n")
}
}

} else {
stop("combineStrata must be TRUE for now")
}

res <- cbind(res,as.data.frame(subsets))

# Combine the data with the strata subsets
res <- cbind(res, as.data.frame(subsets))

return(res)
}

10 changes: 5 additions & 5 deletions R/lowerTblData.R
@@ -14,11 +14,11 @@
#' the given values of the upper table field.
#'
#' @examples
#'DE <- data.table(DEid = c(1, 2, 3, 4), SDid = c(1, 2, 3, 4))
#'SD <- data.table(SDid = c(1, 2, 3, 4), TEid = c(1, 2, 3, 4))
#'TE <- data.table(SDid = c(1, 2, 3, 4), TEid = c(1, 2, 3, 4))
#'VS <- data.table(TEid = c(1, 2, 3, 4), VSid = c(1, 2, 3, 4))
#'LE <- data.table(VSid = 1:5, LEid = 1:5, value = c(10, 20, 3, 4, 6))
#'DE <- data.table::data.table(DEid = c(1, 2, 3, 4), SDid = c(1, 2, 3, 4))
#'SD <- data.table::data.table(SDid = c(1, 2, 3, 4), TEid = c(1, 2, 3, 4))
#'TE <- data.table::data.table(SDid = c(1, 2, 3, 4), TEid = c(1, 2, 3, 4))
#'VS <- data.table::data.table(TEid = c(1, 2, 3, 4), VSid = c(1, 2, 3, 4))
#'LE <- data.table::data.table(VSid = 1:5, LEid = 1:5, value = c(10, 20, 3, 4, 6))
#' tblsSprat <- list( DE = DE ,SD = SD, TE = TE, VS = VS, LE = LE )
#'
#' lowerTblData("TEid", c(4), tblsSprat, "LE", T)
10 changes: 5 additions & 5 deletions tests/testthat/test-addCLtoLowerCS.R
@@ -10,8 +10,8 @@ strataListCL <- list(CLarea="27.3.d.28.1", CLquar = 1, CLmetier6 = "OTM_SPF_16-3
# Test 1: Expect error if rdbes is not of class RDBESDataObject
test_that("addCLtoLowerCS throws an error if rdbes is not of class RDBESDataObject", {
expect_error(
addCLtoLowerCS(list(), strataListCS, strataListCL, samplingEventsTable = "VS", combineStrata = TRUE, lowerHierarchy = "C", CLfields = c("CLoffWeight")),
"rdbes must be of class RDBESDataObject"
addCLtoLowerCS(list(), strataListCS, strataListCL, combineStrata = TRUE, lowerHierarchy = "C", CLfields = c("CLoffWeight")),
"No CL data in the input RDBESDataObject"
)
})

@@ -20,14 +20,14 @@ test_that("addCLtoLowerCS throws an error if CL data is missing in rdbes", {
mock_rdbes_no_CL <- mock_rdbes
mock_rdbes_no_CL$CL <- NULL
expect_error(
addCLtoLowerCS(mock_rdbes_no_CL, strataListCS, strataListCL, samplingEventsTable = "VS", combineStrata = TRUE, lowerHierarchy = "C", CLfields = c("CLoffWeight")),
addCLtoLowerCS(mock_rdbes_no_CL, strataListCS, strataListCL, combineStrata = TRUE, lowerHierarchy = "C", CLfields = c("CLoffWeight")),
"No CL data in the input RDBESDataObject"
)
})

# Test 3: Check correct output with valid input
test_that("addCLtoLowerCS adds the correct sumCLoffWeight column with valid input", {
result <- expect_warning(addCLtoLowerCS(mock_rdbes, strataListCS, strataListCL, samplingEventsTable = "VS", combineStrata = TRUE, lowerHierarchy = "C", CLfields = c("CLoffWeight")))
result <- expect_warning(addCLtoLowerCS(mock_rdbes, strataListCS, strataListCL, combineStrata = TRUE, lowerHierarchy = "C", CLfields = c("CLoffWeight")))

# Check that the result has the correct columns
expect_true("sumCLoffWeight" %in% names(result))
@@ -37,7 +37,7 @@ test_that("addCLtoLowerCS adds the correct sumCLoffWeight column with valid inpu
# Test 4: Check that the function handles unsupported lowerHierarchy correctly
test_that("addCLtoLowerCS throws an error if lowerHierarchy is unsupported", {
expect_error(
addCLtoLowerCS(mock_rdbes, strataListCS, strataListCL, samplingEventsTable = "VS", combineStrata = TRUE, lowerHierarchy = "A", CLfields = c("CLoffWeight")),
addCLtoLowerCS(mock_rdbes, strataListCS, strataListCL, combineStrata = TRUE, lowerHierarchy = "A", CLfields = c("CLoffWeight")),
"Lower hierarchy A not yet supported"
)
})
84 changes: 84 additions & 0 deletions tests/testthat/test-doBVestimCACNUM.R
@@ -0,0 +1,84 @@
# Step 1: Generate independent random normal variables
set.seed(1) # fix the seed so the simulated test data are reproducible
n <- 100 # Number of samples
Lengthmm <- rnorm(n, mean = 100, sd = 1) # Lengthmm variable
Ageyear <- rnorm(n, mean = 2, sd = 1) # Ageyear variable
Weightg <- rnorm(n, mean = 10, sd = 1) # Weightg variable


biolCLQ1 <- data.table(
BVfishId = rep(1:100, each = 3),
# interleave the measurements so each fish gets its own length, age and weight
BVvalueMeas = c(rbind(Lengthmm, Ageyear, Weightg)),
BVvalUnitScale = rep(c("Lengthmm", "Ageyear", "Weightg"), 100),
sumCLoffWeight = 396210
)

# Test 1: Check the output structure of BVestimCANUM
test_that("doBVestimCANUM returns correct structure", {
lenCANUMQ1 <- doBVestimCANUM(biolCLQ1, c("sumCLoffWeight"),
classUnits = "Lengthmm",
classBreaks = seq(70, 130, 10),
verbose = FALSE)

# Check that the output is a data.table
expect_true(is.data.table(lenCANUMQ1))

# Check that the expected columns are present
expected_columns <- c("Group", "WeightgMean", "WeightgLen", "lenMeas", "targetMeas",
"propSample", "WeightIndex", "totWeight", "totNum")
expect_true(all(expected_columns %in% names(lenCANUMQ1)))
})

# Test 2: Check that Group is correctly classified
test_that("doBVestimCANUM correctly classifies Lengthmm groups", {
lenCANUMQ1 <- doBVestimCANUM(biolCLQ1, c("sumCLoffWeight"),
classUnits = "Lengthmm",
classBreaks = seq(70, 130, 10),
verbose = FALSE)

# Check that the Group column contains the correct labels
expected_groups <- c("70-80", "80-90", "90-100", "100-110", "110-120", "120-130", "130+")
actual_groups <- unique(lenCANUMQ1$Group)
expect_true(all(actual_groups %in% expected_groups))
})

# Test 3: Check WeightgMean calculation
test_that("doBVestimCANUM calculates WeightgMean correctly", {
lenCANUMQ1 <- doBVestimCANUM(biolCLQ1, c("sumCLoffWeight"),
classUnits = "Lengthmm",
classBreaks = seq(70, 130, 10),
verbose = FALSE)

# Ensure WeightgMean is numeric and positive
expect_true(is.numeric(lenCANUMQ1$WeightgMean))
expect_true(all(lenCANUMQ1$WeightgMean > 0))
})


# Test 5: Sanity check for total weights
test_that("doBVestimCANUM performs a sanity check for total weights", {
lenCANUMQ1 <- doBVestimCANUM(biolCLQ1, c("sumCLoffWeight"),
classUnits = "Lengthmm",
classBreaks = seq(70, 130, 10),
verbose = FALSE)

# Check that the sum of totWeight equals sumCLoffWeight (within tolerance)
total_weight <- sum(lenCANUMQ1$totWeight)
expected_sum <- sum(unique(lenCANUMQ1$sumCLoffWeight))

expect_true(isTRUE(all.equal(total_weight, expected_sum)))
})

# Test 6: Check TWCoef calculation
test_that("doBVestimCANUM calculates TWCoef correctly", {
lenCANUMQ1 <- doBVestimCANUM(biolCLQ1, c("sumCLoffWeight"),
classUnits = "Lengthmm",
classBreaks = seq(70, 130, 10),
verbose = FALSE)

# Recalculate TWCoef
lenCANUMQ1[, expected_TWCoef := sumCLoffWeight / WeightIndexSum]

# Ensure TWCoef is correctly calculated
expect_equal(lenCANUMQ1$TWCoef, lenCANUMQ1$expected_TWCoef)
})
