diff --git a/R/import_check_functions.R b/R/import_check_functions.R index bf044bf..897f97a 100644 --- a/R/import_check_functions.R +++ b/R/import_check_functions.R @@ -155,7 +155,7 @@ ctsm.check.basis.biota <- function(data, info) { new[!ok] <- "W" }) - id <- data$determinand %in% c("EXLIP%", "FATWT%", "LIPIDWT%") + id <- data$determinand %in% c("EXLIP%", "FATWT%", "LIPIDWT%", "WTMEA") if (any(id)) data[id,] <- within(data[id,], { ok <- basis %in% c("W", "D") @@ -164,8 +164,8 @@ ctsm.check.basis.biota <- function(data, info) { }) id <- data$determinand %in% c( - "VDS", "IMPS", "INTS", "VDSI", "PCI", "INTSI", "%FEMALEPOP", "SURVT", "NRR", "LP", "%DNATAIL", - "MNC", "CMT-QC-NR", "MNC-QC-NR") + "VDS", "IMPS", "INTS", "VDSI", "PCI", "INTSI", "%FEMALEPOP", "SURVT", "NRR", + "LP", "%DNATAIL", "MNC", "CMT-QC-NR", "MNC-QC-NR") if (any(id)) data[id,] <- within(data[id,], { ok <- is.na(basis) @@ -250,7 +250,7 @@ ctsm.check.matrix.biota <- function(data, info) { # actually LNMEA could be feather length (but have not allowed for this at present) # NB procedures for merging with LNMEA are similary complicated for birds - id <- data$determinand %in% "LNMEA" + id <- data$determinand %in% c("LNMEA", "WTMEA") if (any(id)) data[id,] <- within(data[id,], { ok <- (species_group %in% c("Fish", "Mammal") & matrix %in% "WO") | @@ -450,25 +450,25 @@ ctsm.check.species_group.biota <- function(data, info) { ctsm.check.sex.biota <- function(data, info) { + # check sex has valid ICES codes + # extra checks for imposex determinands and EROD + # silence non-standard evaluation warnings sex <- NULL - # NB any changes should really be made at the sub-sample level - id <- ctsm_is_contaminant(data$pargroup) | - data$group %in% "Metabolites" | - data$determinand %in% c("AGMEA", "LNMEA", "DRYWT%", "EXLIP%", "FATWT%", "LIPIDWT%") | - data$determinand %in% c("ALAD", "SFG", "ACHE", "GST", "SURVT", "NRR", "LP", "%DNATAIL", - "MNC", "CMT-QC-NR", "MNC-QC-NR") - - if (any(id)) - data[id,] <- within(data[id,], { - ok <- sex %in% c("F", "I", "M", "U", "X", NA) - action <- ifelse(ok, "none", "warning") - new[!ok] <- NA - }) - + # global check of ICES codes + + data <- dplyr::mutate( + data, + ok = .data$sex %in% c("F", "H", "I", "M", "T", "U", "X", NA), + action = dplyr::if_else(ok, "none", "warning"), + new[!ok] <- NA + ) + + # imposex + id <- data$determinand %in% c("VDS", "IMPS", "INTS") if (any(id)) data[id,] <- within(data[id,], { @@ -476,7 +476,7 @@ ctsm.check.sex.biota <- function(data, info) { action <- ifelse(ok, "none", ifelse(sex %in% NA, "warning", "error")) new[sex %in% NA] <- "F" }) - + id <- data$determinand %in% c("VDSI", "PCI", "INTSI", "%FEMALEPOP") if (any(id)) data[id,] <- within(data[id,], { @@ -485,16 +485,27 @@ ctsm.check.sex.biota <- function(data, info) { new[sex %in% NA] <- "X" }) + + # EROD + id <- data$determinand %in% "EROD" if (any(id)) data[id,] <- within(data[id,], { ok <- sex %in% c("F", "M") - ok.delete <- sex %in% c("U", "I", "X") + ok.delete <- sex %in% c("H", "I", "T", "U", "X") action <- ifelse(ok, "none", ifelse(ok.delete, "delete", "error")) if (any(ok.delete)) cat(" Dropping EROD data with immature or unidentifiable sex\n") }) + # id <- is.na(data$ok) + # if (any(id)) + # data[id,] <- within(data[id,], { + # ok <- sex %in% c("F", "I", "M", "T", "U", "X", NA) + # action <- ifelse(ok, "none", "warning") + # new[!ok] <- NA + # }) + data } @@ -502,8 +513,8 @@ ctsm.check.sex.biota <- function(data, info) { ctsm.check.unit.biota <- function(data, info) { standard_unit <- c( - "g/g", "mg/mg", "ug/ug", "ng/ng", "pg/pg", "mg/g", "ug/g", "ng/g", "pg/g", "g/kg", "mg/kg", - "ug/kg", "ng/kg", "pg/kg") + "g/g", "mg/mg", "ug/ug", "ng/ng", "pg/pg", "mg/g", "ug/g", "ng/g", "pg/g", + "g/kg", "mg/kg", "ug/kg", "ng/kg", "pg/kg") id <- ctsm_is_contaminant(data$pargroup, exclude = "I-RNC") & data$determinand != "TEQDFP" if (any(id)) @@ -551,6 +562,13 @@ ctsm.check.unit.biota <- function(data, info) { action <- ifelse(ok, "none", "error") }) + id <- data$determinand %in% c("WTMEA") + if (any(id)) + data[id,] <- within(data[id,], { + ok <- unit %in% c("kg", "g", "mg") + action <- ifelse(ok, "none", "error") + }) + id <- data$determinand %in% c("DRYWT%", "EXLIP%", "FATWT%", "LIPIDWT%") if (any(id)) data[id,] <- within(data[id,], { @@ -775,7 +793,7 @@ ctsm.check.value.biota <- function(data, info) { id <- ctsm_is_contaminant(data$pargroup, exclude = "I-MTC") | data$group %in% "Metabolites" | - data$determinand %in% c("EROD", "ALAD", "ACHE", "GST", "AGMEA", "LNMEA") + data$determinand %in% c("EROD", "ALAD", "ACHE", "GST", "AGMEA", "LNMEA", "WTMEA") if (any(id)) data[id,] <- within(data[id,], { diff --git a/R/import_functions.R b/R/import_functions.R index d865779..19ea3b1 100644 --- a/R/import_functions.R +++ b/R/import_functions.R @@ -68,10 +68,36 @@ library(readxl) #' #' ## Control parameters #' -#' Many aspects of the assessment process can be controlled through the -#' parameters stored in `info$control`. This is a list populated with default -#' values which can then be overwritten, if required, using the `control` -#' argument. +#' Many aspects of the assessment process can be controlled using parameters +#' which are stored in the `info` component of the harsat data object. The +#' default control values can be overwritten using the `control` argument. +#' +#' * `reporting_window` A scalar (default 6) which determines whether +#' timeseries are excluded because they have no 'recent' data. Formally, +#' timeseries are excluded if they have no data in the period +#' `max_year - reporting_window + 1` and `max_year`, so the default approach +#' is to exclude timeseries if they have no dat in the most recent six +#' monitoring years. The value of 6 is chosen to match with Marine Strategy +#' Framework Directive reporting periods. +#' * `region` +#' * `add_stations` +#' * `bivalve_spawning_season` +#' * `use_stage` +#' * `relative_uncertainty` +#' * `auxiliary` A list which allows flexibility in the treatment of auxiliary +#' variables. At present, there is just one component `by_matrix`, a +#' character vector that determines which auxiliary variables are matched to +#' the contaminant data by `sample` and `matrix` as opposed to just `sample`. +#' For sediment and water, the default is `all`; i.e. all variables are +#' matched by `sample` and `matrix`. This ensures, for example, that +#' sediment normalisers such as aluminium and organic carbon content are +#' matched to chemical measurements in the same grain fraction. For biota, the +#' default is `c("DRYWT%", "LIPIDWT%)`, so these variables are matched by +#' `sample` and `matrix` and all other variables (e.g. LNMEA or %FEMALEPOP) +#' are matched by `sample`. Thus, dry weight and lipid weight contents are +#' matched to chemical measurements in the same tissue. However, mean length +#' (which is usually the lenght of the whole organism) is matched to all +#' tissue types. #' #' ## External data #' @@ -272,7 +298,7 @@ safe_read_file <- function(file, header=TRUE, sep=",", quote="\"", dec=".", fill control_default <- function(purpose, compartment) { - # import functions + # import_functions.R # sets up default values that control the assessment # reporting_window is set to 6 to match the MSFD reporting cycle @@ -295,6 +321,12 @@ control_default <- function(purpose, compartment) { # relative uncertainties for log-normally distributed data; the default is # to accept relative uncertainties greater than (but not equal) to 1% and # less than (but not equal to) 100% + + # auxiliary is a list (to be extended) that allows flexibility in the + # treatment of auxiliary variables + # by_matrix determines which variables are merged by matrix in addition to + # by sample; default is DRYWT% and LIPIDWT% for biota and everything for + # sediment and water region <- list() @@ -373,14 +405,22 @@ control_default <- function(purpose, compartment) { ) ) - + auxiliary = list( + by_matrix = switch( + compartment, + biota = c("DRYWT%", "LIPIDWT%"), + "all" + ) + ) + list( reporting_window = 6L, region = region, add_stations = add_stations, bivalve_spawning_season = bivalve_spawning_season, use_stage = use_stage, - relative_uncertainty = relative_uncertainty + relative_uncertainty = relative_uncertainty, + auxiliary = auxiliary ) } @@ -2227,7 +2267,7 @@ tidy_contaminants <- function(data, info) { - +# create timeseries ---- #' Create a time series #' @@ -2600,7 +2640,7 @@ create_timeseries <- function( # merge auxiliary data with determinand data - data <- ctsm_merge_auxiliary(data, info) + data <- merge_auxiliary(data, info) # impute %femalepop when missing and sex = 1 - write out remaining @@ -4006,15 +4046,18 @@ check_subseries <- function(data, info) { } -ctsm_merge_auxiliary <- function(data, info) { +merge_auxiliary <- function(data, info) { # import_functions.R # merge auxiliary variables with data + control <- info$auxiliary + + # identify auxiliary variables and split data set accordingly - + auxiliary_var <- ctsm_get_auxiliary(data$determinand, info) - + id <- data$determinand %in% auxiliary_var auxiliary <- data[id, ] @@ -4032,9 +4075,14 @@ ctsm_merge_auxiliary <- function(data, info) { auxiliary <- split(auxiliary, auxiliary$determinand) - # catch for LNMEA measured in WO and ES for birds - probably shouldn't happen because the - # sample will differ? need to check + # update control + + if (length(control$by_matrix) == 1L && control$by_matrix == "all") { + control$by_matrix <- auxiliary_var + } + + # merge auxiliary variables one at a time for (aux_id in names(auxiliary)) { @@ -4049,7 +4097,7 @@ ctsm_merge_auxiliary <- function(data, info) { # additional variables for some auxiliaries # need to make this more flexible - issue raised - if (aux_id %in% c("DRYWT%", "LIPIDWT%", "CORG", "LOIGN", "AL", "LI")) { + if (aux_id %in% control$by_matrix) { merge_id <- c(merge_id, "matrix") @@ -4110,6 +4158,7 @@ ctsm_merge_auxiliary <- function(data, info) { data <- droplevels(data) } + ctsm_convert_to_target_basis <- function(data, info, get_basis) { # location: import_functions.R diff --git a/man/read_data.Rd b/man/read_data.Rd index e406751..c74be86 100644 --- a/man/read_data.Rd +++ b/man/read_data.Rd @@ -98,10 +98,37 @@ parameters that dictate the assessment process. \details{ \subsection{Control parameters}{ -Many aspects of the assessment process can be controlled through the -parameters stored in \code{info$control}. This is a list populated with default -values which can then be overwritten, if required, using the \code{control} -argument. +Many aspects of the assessment process can be controlled using parameters +which are stored in the \code{info} component of the harsat data object. The +default control values can be overwritten using the \code{control} argument. +\itemize{ +\item \code{reporting_window} A scalar (default 6) which determines whether +timeseries are excluded because they have no 'recent' data. Formally, +timeseries are excluded if they have no data in the period +\code{max_year - reporting_window + 1} and \code{max_year}, so the default approach +is to exclude timeseries if they have no dat in the most recent six +monitoring years. The value of 6 is chosen to match with Marine Strategy +Framework Directive reporting periods. +\item \code{region} +\item \code{add_stations} +\item \code{bivalve_spawning_season} +\item \code{use_stage} +\item \code{relative_uncertainty} +\item \code{auxiliary} A list which allows flexibility in the treatment of auxiliary +variables. At present, there is just one component \code{by_matrix}, a +character vector that determines which auxiliary variables are matched to +the contaminant data by \code{sample} and \code{matrix} as opposed to just \code{sample}. +For sediment and water, the default is \code{all}; i.e. all variables are +matched by \code{sample} and \code{matrix}. This ensures, for example, that +sediment normalisers such as aluminium and organic carbon content are +matched to chemical measurements in the same grain fraction. For biota, the +default is \verb{c("DRYWT\%", "LIPIDWT\%)}, so these variables are matched by +\code{sample} and \code{matrix} and all other variables (e.g. LNMEA or \%FEMALEPOP) +are matched by \code{sample}. Thus, dry weight and lipid weight contents are +matched to chemical measurements in the same tissue. However, mean length +(which is usually the lenght of the whole organism) is matched to all +tissue types. +} } \subsection{External data}{