From aa8717cbf431f982f73fc8642c0dcdfed95f0656 Mon Sep 17 00:00:00 2001 From: "Stephanie M. Gogarten" Date: Thu, 20 Jun 2024 21:25:41 -0700 Subject: [PATCH 1/9] allow multiple types of GSR analyses GSR validation now allows multiple pairs of analysis/file tables. Naming convention is expected to be [type]_analysis and [type]_file, where [type] is the analysis type (e.g. association_analysis and association_file) --- gsr_data_report.R | 4 +- gsr_data_report.wdl | 4 ++ prep_gsr.R | 86 ++++++++++++++++++++---------------- select_gsr_files.R | 30 +++++++++---- testdata/gsr_data_model.json | 8 ++-- testdata/table_files_gsr.tsv | 4 +- validate_gsr_model.wdl | 13 +++--- 7 files changed, 91 insertions(+), 58 deletions(-) diff --git a/gsr_data_report.R b/gsr_data_report.R index f3dca5b..946600b 100644 --- a/gsr_data_report.R +++ b/gsr_data_report.R @@ -5,17 +5,19 @@ library(readr) argp <- arg_parser("report") argp <- add_argument(argp, "--data_file", help="tsv file with data") argp <- add_argument(argp, "--dd_file", help="json file with GSR data dictionary") +argp <- add_argument(argp, "--dd_table_name", help="name of data dictionary table in dd_file") argp <- add_argument(argp, "--analysis_file", help="tsv file with analysis table") argp <- add_argument(argp, "--stop_on_fail", flag=TRUE, help="return an error code if data_file does not pass checks") argv <- parse_args(argp) # argv <- list(data_file="testdata/gsr_chr1.tsv", # dd_file="testdata/gsr_data_model.json", +# dd_table_name="gsr_files_dd", # analysis_file="output_analysis_table.tsv") # read data model dd <- json_to_dm(argv$dd_file) -dd_table_name <- "gsr_files_dd" +dd_table_name <- argv$dd_table_name stopifnot(dd_table_name %in% names(dd)) # read 1000 rows for checking data against expected type diff --git a/gsr_data_report.wdl b/gsr_data_report.wdl index d394209..9cb499e 100644 --- a/gsr_data_report.wdl +++ b/gsr_data_report.wdl @@ -4,12 +4,14 @@ workflow gsr_data_report { input { File data_file String dd_url + String dd_table_name File analysis_file } call validate_data { input: data_file = data_file, dd_url 
= dd_url, + dd_table_name = dd_table_name, analysis_file = analysis_file } @@ -28,6 +30,7 @@ task validate_data { input { File data_file String dd_url + String dd_table_name File analysis_file } @@ -35,6 +38,7 @@ task validate_data { Rscript /usr/local/primed-file-checks/gsr_data_report.R \ --data_file ~{data_file} \ --dd_file ~{dd_url} \ + --dd_table_name ~{dd_table_name} \ --analysis_file ~{analysis_file} \ --stop_on_fail >>> diff --git a/prep_gsr.R b/prep_gsr.R index 37f4567..cb121b2 100644 --- a/prep_gsr.R +++ b/prep_gsr.R @@ -10,7 +10,8 @@ argp <- add_argument(argp, "--hash_id_nchar", default=16, help="number of charac argv <- parse_args(argp) # argv <- list(table_files="testdata/table_files_gsr.tsv", -# model_file="testdata/gsr_data_model.json") +# model_file="testdata/gsr_data_model.json", +# hash_id_nchar=16) # read data model model <- json_to_dm(argv$model_file) @@ -18,44 +19,55 @@ model <- json_to_dm(argv$model_file) # read tables table_files <- read_tsv(argv$table_files, col_names=c("names", "files"), col_types="cc") -# read analysis field,value pairs -analysis_file <- table_files$files[table_files$names == "analysis"] -if (length(analysis_file) == 0) stop("analysis table not found in table_files") -fv <- read_tsv(analysis_file, col_types=cols(.default=col_character())) +# identify analyses +analysis_files <- table_files %>% + filter(grepl("_analysis$", names) | grepl("_file$", names)) %>% + separate(names, into=c("type", "table"), sep="_") %>% + pivot_wider(names_from=table, values_from=files) -# transpose -transpose_fv <- function(fv) { - stopifnot(setequal(names(fv), c("field", "value"))) - lapply(setNames(1:nrow(fv), fv$field), function(i) { - v <- fv$value[i] - return(v) - }) %>% - bind_cols() +if (nrow(analysis_files == 0)) stop("no valid analysis/file table pairs found") +for (i in 1:nrow(analysis_files)) { + type <- analysis_files$type[i] + analysis_table_name <- paste0(type, "_analysis") + file_table_name <- paste0(type, "_file") + + # read 
analysis field,value pairs + fv <- read_tsv(analysis_files$analysis[i], col_types=cols(.default=col_character())) + + # transpose + transpose_fv <- function(fv) { + stopifnot(setequal(names(fv), c("field", "value"))) + lapply(setNames(1:nrow(fv), fv$field), function(i) { + v <- fv$value[i] + return(v) + }) %>% + bind_cols() + } + analysis <- transpose_fv(fv) + + # add analysis_id + analysis_id <- hash_id(paste(analysis, collapse=""), nchar=argv$hash_id_nchar) + analysis <- bind_cols(analysis_id=analysis_id, analysis) + + # read file table + file <- read_tsv(analysis_files$file[i], col_types=cols(.default=col_character())) + + # add analysis_id + file <- bind_cols(analysis_id=analysis$analysis_id, file) + + # add file_id + file <- add_auto_columns(file, table_name=file_table_name, model=model, + error_on_missing=FALSE, nchar=argv$hash_id_nchar) + + # write tsv files + analysis_file <- paste0("output_", type, "_analysis_table.tsv") + write_tsv(analysis, analysis_file) + file_file <- paste0("output_", type, "_file_table.tsv") + write_tsv(file, file_file) + + table_files$files[table_files$names == analysis_table_name] <- analysis_file + table_files$files[table_files$names == file_table_name] <- file_file } -analysis <- transpose_fv(fv) - -# add analysis_id -analysis_id <- hash_id(paste(analysis, collapse=""), nchar=argv$hash_id_nchar) -analysis <- bind_cols(analysis_id=analysis_id, analysis) - -# read file table -file_file <- table_files$files[table_files$names == "gsr_file"] -if (length(file_file) == 0) stop("gsr_file table not found in table_files") -file <- read_tsv(file_file, col_types=cols(.default=col_character())) - -# add analysis_id -file <- bind_cols(analysis_id=analysis$analysis_id, file) - -# add file_id -file <- add_auto_columns(file, table_name="gsr_file", model=model, - error_on_missing=FALSE, nchar=argv$hash_id_nchar) - -# write tsv files -analysis_file <- "output_analysis_table.tsv" -write_tsv(analysis, analysis_file) -file_file <- 
"output_gsr_file_table.tsv" -write_tsv(file, file_file) # write new version of table_files -table_files <- tibble(c("analysis", "gsr_file"), c(analysis_file, file_file)) write_tsv(table_files, "output_table_files.tsv", col_names=FALSE) diff --git a/select_gsr_files.R b/select_gsr_files.R index d052c12..4df20c3 100644 --- a/select_gsr_files.R +++ b/select_gsr_files.R @@ -1,6 +1,8 @@ library(argparser) library(AnvilDataModels) library(readr) +library(dplyr) +library(tidyr) argp <- arg_parser("select") argp <- add_argument(argp, "--table_files", help="2-column tsv file with (table name, table tsv file)") @@ -9,14 +11,26 @@ argv <- parse_args(argp) # read tables table_files <- read_tsv(argv$table_files, col_names=c("names", "files"), col_types="cc") tables <- read_data_tables(table_files$files, table_names=table_files$names) -stopifnot(setequal(names(tables), c("analysis", "gsr_file"))) +stopifnot(all(grepl("analysis$", names(tables)) | grepl("file$", names(tables)))) -analysis_id <- tables[["analysis"]]$analysis_id -stopifnot(length(analysis_id) == 1) -writeLines(analysis_id, "analysis_id.txt") +analyses <- table_files %>% + separate_wider_delim(names, delim="_", names=c("type", "table")) %>% + pivot_wider(names_from=table, values_from=files) -data_files <- tables[["gsr_file"]]$file_path -writeLines(data_files, "data_files.txt") +data_files <- list() +analysis_files <- list() +md5 <- list() +for (t in analyses$type) { + file_table_name <- paste0(t, "_file") + md5[[t]] <- tables[[file_table_name]]$md5sum + data_files[[t]] <- tables[[file_table_name]]$file_path + analysis_files[[t]] <- analyses %>% + filter(type == t) %>% + select(analysis) %>% + unlist() %>% + rep(length(data_files[[t]])) +} -md5 <- tables[["gsr_file"]]$md5sum -writeLines(md5, "md5sum.txt") +writeLines(unlist(md5), "md5sum.txt") +writeLines(unlist(data_files), "data_files.txt") +writeLines(unlist(analysis_files), "analysis_files.txt") diff --git a/testdata/gsr_data_model.json 
b/testdata/gsr_data_model.json index 5ad2898..9724dfb 100644 --- a/testdata/gsr_data_model.json +++ b/testdata/gsr_data_model.json @@ -4,10 +4,10 @@ "version": "0", "tables": [ { - "table": "analysis", + "table": "gsr_analysis", "columns": [ { - "column": "analysis_id", + "column": "gsr_analysis_id", "primary_key": true, "description": "unique identifier for a gwas in primed", "data_type": "string" @@ -100,10 +100,10 @@ }, { - "column": "analysis_id", + "column": "gsr_analysis_id", "required": true, "data_type": "string", - "references": "> analysis.analysis_id" + "references": "> gsr_analysis.gsr_analysis_id" }, { "column": "chromosome", diff --git a/testdata/table_files_gsr.tsv b/testdata/table_files_gsr.tsv index 5e91807..13d0d7e 100644 --- a/testdata/table_files_gsr.tsv +++ b/testdata/table_files_gsr.tsv @@ -1,2 +1,2 @@ -analysis testdata/gsr_analysis_table.tsv -gsr_file testdata/gsr_file.tsv +association_analysis testdata/gsr_analysis_table.tsv +association_file testdata/gsr_file.tsv diff --git a/validate_gsr_model.wdl b/validate_gsr_model.wdl index 89fbe80..eaedbc3 100644 --- a/validate_gsr_model.wdl +++ b/validate_gsr_model.wdl @@ -36,10 +36,10 @@ workflow validate_gsr_model { md5_check = md5check.md5_check } - scatter (f in validate.data_files) { + scatter (pair in zip(validate.data_files, validate.analysis_files)) { call gsr.validate_data { - input: data_file = f, - analysis_file = validate.analysis_file, + input: data_file = pair.left, + analysis_file = pair.right, dd_url = model_url } } @@ -52,7 +52,7 @@ workflow validate_gsr_model { output { File validation_report = validate.validation_report - Array[File] tables = [validate.analysis_file, validate.gsr_file] + Array[File] tables = flatten([validate.analysis_file, validate.gsr_file]) String? md5_check_summary = summarize_md5_check.summary File? md5_check_details = summarize_md5_check.details String? 
data_report_summary = summarize_data_check.summary @@ -106,8 +106,9 @@ task validate { output { File validation_report = "data_model_validation.html" - File analysis_file = "output_analysis_table.tsv" - File gsr_file = "output_gsr_file_table.tsv" + Array[File] analysis_file = glob("output_*_analysis_table.tsv") + Array[File] gsr_file = glob("output_*_file_table.tsv") + Array[File] analysis_files = read_lines("analysis_files.txt") Array[File] data_files = read_lines("data_files.txt") Array[String] md5sum = read_lines("md5sum.txt") } From c7dd8a303f8cbc4fc39a10d5d28749f78f203b24 Mon Sep 17 00:00:00 2001 From: "Stephanie M. Gogarten" Date: Fri, 21 Jun 2024 00:07:50 -0700 Subject: [PATCH 2/9] new docker image --- validate_gsr_model.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/validate_gsr_model.wdl b/validate_gsr_model.wdl index eaedbc3..5561358 100644 --- a/validate_gsr_model.wdl +++ b/validate_gsr_model.wdl @@ -114,6 +114,6 @@ task validate { } runtime { - docker: "uwgac/primed-file-checks:0.5.1" + docker: "uwgac/primed-file-checks:0.5.1-1" } } From 4b71f2e1c5bb9ca383d87fc7323fb97674f13ceb Mon Sep 17 00:00:00 2001 From: "Stephanie M. Gogarten" Date: Fri, 21 Jun 2024 08:53:46 -0700 Subject: [PATCH 3/9] missing R library --- prep_gsr.R | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/prep_gsr.R b/prep_gsr.R index cb121b2..3a5f900 100644 --- a/prep_gsr.R +++ b/prep_gsr.R @@ -1,7 +1,8 @@ library(argparser) library(AnvilDataModels) -library(dplyr) library(readr) +library(dplyr) +library(tidyr) argp <- arg_parser("report") argp <- add_argument(argp, "--table_files", help="2-column tsv file with (table name, table tsv file)") From 96b168e4602de5f7aa6a12037b6d47bdc2758bdc Mon Sep 17 00:00:00 2001 From: "Stephanie M. 
Gogarten" Date: Fri, 21 Jun 2024 15:37:43 -0700 Subject: [PATCH 4/9] fix parentheses --- prep_gsr.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/prep_gsr.R b/prep_gsr.R index 3a5f900..01d10c6 100644 --- a/prep_gsr.R +++ b/prep_gsr.R @@ -26,7 +26,7 @@ analysis_files <- table_files %>% separate(names, into=c("type", "table"), sep="_") %>% pivot_wider(names_from=table, values_from=files) -if (nrow(analysis_files == 0)) stop("no valid analysis/file table pairs found") +if (nrow(analysis_files) == 0) stop("no valid analysis/file table pairs found") for (i in 1:nrow(analysis_files)) { type <- analysis_files$type[i] analysis_table_name <- paste0(type, "_analysis") From 988459b8669e470bf7fb870a025fdf9c3d9c2c5d Mon Sep 17 00:00:00 2001 From: "Stephanie M. Gogarten" Date: Fri, 21 Jun 2024 16:37:57 -0700 Subject: [PATCH 5/9] id column names depend on type --- prep_gsr.R | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/prep_gsr.R b/prep_gsr.R index 01d10c6..1e87794 100644 --- a/prep_gsr.R +++ b/prep_gsr.R @@ -47,14 +47,17 @@ for (i in 1:nrow(analysis_files)) { analysis <- transpose_fv(fv) # add analysis_id + analysis_id_name <- paste0(analysis_table_name, "_id") analysis_id <- hash_id(paste(analysis, collapse=""), nchar=argv$hash_id_nchar) - analysis <- bind_cols(analysis_id=analysis_id, analysis) + analysis <- bind_cols(as_tibble(setNames(list(analysis_id), analysis_id_name)), analysis) # read file table file <- read_tsv(analysis_files$file[i], col_types=cols(.default=col_character())) # add analysis_id - file <- bind_cols(analysis_id=analysis$analysis_id, file) + file <- analysis %>% + select(!!analysis_id_name) %>% + bind_cols(file) # add file_id file <- add_auto_columns(file, table_name=file_table_name, model=model, From 04fdc5c70f6a3316df6514325e562e9c42f52aa0 Mon Sep 17 00:00:00 2001 From: "Stephanie M. 
Gogarten" Date: Mon, 24 Jun 2024 17:08:50 -0700 Subject: [PATCH 6/9] scatter over pairs of analysis and file tables table paths will be in a glob* directory, so we can't use the file paths directly output from the validate task. Instead, move select_gsr_files to a separate task that is inside a scatter() over pairs of analysis and file tables. --- select_gsr_files.R | 33 ++++---------------- validate_gsr_model.wdl | 69 +++++++++++++++++++++++++++--------------- 2 files changed, 50 insertions(+), 52 deletions(-) diff --git a/select_gsr_files.R b/select_gsr_files.R index 4df20c3..555443c 100644 --- a/select_gsr_files.R +++ b/select_gsr_files.R @@ -1,36 +1,15 @@ library(argparser) library(AnvilDataModels) library(readr) -library(dplyr) -library(tidyr) argp <- arg_parser("select") -argp <- add_argument(argp, "--table_files", help="2-column tsv file with (table name, table tsv file)") +argp <- add_argument(argp, "--file_table", help="tsv file with file table") argv <- parse_args(argp) -# read tables -table_files <- read_tsv(argv$table_files, col_names=c("names", "files"), col_types="cc") -tables <- read_data_tables(table_files$files, table_names=table_files$names) -stopifnot(all(grepl("analysis$", names(tables)) | grepl("file$", names(tables)))) +file_table <- read_tsv(argv$file_table) -analyses <- table_files %>% - separate_wider_delim(names, delim="_", names=c("type", "table")) %>% - pivot_wider(names_from=table, values_from=files) +data_files <- file_table$file_path +writeLines(data_files, "data_files.txt") -data_files <- list() -analysis_files <- list() -md5 <- list() -for (t in analyses$type) { - file_table_name <- paste0(t, "_file") - md5[[t]] <- tables[[file_table_name]]$md5sum - data_files[[t]] <- tables[[file_table_name]]$file_path - analysis_files[[t]] <- analyses %>% - filter(type == t) %>% - select(analysis) %>% - unlist() %>% - rep(length(data_files[[t]])) -} - -writeLines(unlist(md5), "md5sum.txt") -writeLines(unlist(data_files), "data_files.txt") 
-writeLines(unlist(analysis_files), "analysis_files.txt") +md5 <- file_table$md5sum +writeLines(md5, "md5sum.txt") diff --git a/validate_gsr_model.wdl b/validate_gsr_model.wdl index 5561358..314b850 100644 --- a/validate_gsr_model.wdl +++ b/validate_gsr_model.wdl @@ -24,35 +24,42 @@ workflow validate_gsr_model { import_tables = import_tables } - scatter (pair in zip(validate.data_files, validate.md5sum)) { - call md5.md5check { - input: file = pair.left, - md5sum = pair.right + scatter (pair in zip(validate.analysis_tables, validate.file_tables)) { + call select_gsr_files { + input: file_table = pair.right } - } - call md5.summarize_md5_check { - input: file = validate.data_files, - md5_check = md5check.md5_check - } + scatter (data_pair in zip(select_gsr_files.data_files, select_gsr_files.md5sum)) { + call md5.md5check { + input: file = data_pair.left, + md5sum = data_pair.right + } + } - scatter (pair in zip(validate.data_files, validate.analysis_files)) { - call gsr.validate_data { - input: data_file = pair.left, - analysis_file = pair.right, - dd_url = model_url + scatter (f in select_gsr_files.data_files) { + call gsr.validate_data { + input: data_file = f, + analysis_file = pair.left, + dd_table_name = sub(basename(pair.left, "_table.tsv"), "output_", ""), + dd_url = model_url + } } } + call md5.summarize_md5_check { + input: file = flatten(select_gsr_files.data_files), + md5_check = flatten(md5check.md5_check) + } + call gsr.summarize_data_check { - input: file = validate.data_files, - data_check = validate_data.pass_checks, - validation_report = validate_data.validation_report + input: file = flatten(select_gsr_files.data_files), + data_check = flatten(validate_data.pass_checks), + validation_report = flatten(validate_data.validation_report) } output { File validation_report = validate.validation_report - Array[File] tables = flatten([validate.analysis_file, validate.gsr_file]) + Array[File] tables = flatten([validate.analysis_tables, validate.file_tables]) 
String? md5_check_summary = summarize_md5_check.summary File? md5_check_details = summarize_md5_check.details String? data_report_summary = summarize_data_check.summary @@ -100,20 +107,32 @@ task validate { --workspace_name ~{workspace_name} \ --workspace_namespace ~{workspace_namespace} fi - Rscript /usr/local/primed-file-checks/select_gsr_files.R \ - --table_files output_tables.tsv >>> output { File validation_report = "data_model_validation.html" - Array[File] analysis_file = glob("output_*_analysis_table.tsv") - Array[File] gsr_file = glob("output_*_file_table.tsv") - Array[File] analysis_files = read_lines("analysis_files.txt") - Array[File] data_files = read_lines("data_files.txt") - Array[String] md5sum = read_lines("md5sum.txt") + Array[File] analysis_tables = glob("output_*_analysis_table.tsv") + Array[File] file_tables = glob("output_*_file_table.tsv") } runtime { docker: "uwgac/primed-file-checks:0.5.1-1" } } + + +task select_gsr_files { + input { + File file_table + } + + command <<< + Rscript /usr/local/primed-file-checks/select_gsr_files.R \ + --file_table ~{file_table} + >>> + + output { + Array[File] data_files = read_lines("data_files.txt") + Array[String] md5sum = read_lines("md5sum.txt") + } +} From 7b24aed13495f73046ee8883b18ff7717dfdb08e Mon Sep 17 00:00:00 2001 From: "Stephanie M. Gogarten" Date: Mon, 24 Jun 2024 21:29:57 -0700 Subject: [PATCH 7/9] add runtime --- validate_gsr_model.wdl | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/validate_gsr_model.wdl b/validate_gsr_model.wdl index 314b850..df29ef6 100644 --- a/validate_gsr_model.wdl +++ b/validate_gsr_model.wdl @@ -135,4 +135,8 @@ task select_gsr_files { Array[File] data_files = read_lines("data_files.txt") Array[String] md5sum = read_lines("md5sum.txt") } + + runtime { + docker: "uwgac/primed-file-checks:0.5.1-1" + } } From 1238e9d9e29e6ebab8d660f63dba87d23c4fa43d Mon Sep 17 00:00:00 2001 From: "Stephanie M. 
Gogarten" Date: Tue, 25 Jun 2024 11:32:35 -0700 Subject: [PATCH 8/9] use new docker image --- gsr_data_report.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gsr_data_report.wdl b/gsr_data_report.wdl index 9cb499e..7d2b834 100644 --- a/gsr_data_report.wdl +++ b/gsr_data_report.wdl @@ -49,7 +49,7 @@ task validate_data { } runtime { - docker: "uwgac/primed-file-checks:0.5.1" + docker: "uwgac/primed-file-checks:0.5.1-1" } } From e12fe539841862a5c6afa10b9073563ec7bcf825 Mon Sep 17 00:00:00 2001 From: "Stephanie M. Gogarten" Date: Tue, 25 Jun 2024 14:06:54 -0700 Subject: [PATCH 9/9] fix DD table name --- validate_gsr_model.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/validate_gsr_model.wdl b/validate_gsr_model.wdl index df29ef6..66cc797 100644 --- a/validate_gsr_model.wdl +++ b/validate_gsr_model.wdl @@ -40,7 +40,7 @@ workflow validate_gsr_model { call gsr.validate_data { input: data_file = f, analysis_file = pair.left, - dd_table_name = sub(basename(pair.left, "_table.tsv"), "output_", ""), + dd_table_name = sub(basename(pair.left, "_analysis_table.tsv"), "output_", "") + "_files_dd", dd_url = model_url } }