diff --git a/gsr_data_report.R b/gsr_data_report.R index f3dca5b..946600b 100644 --- a/gsr_data_report.R +++ b/gsr_data_report.R @@ -5,17 +5,19 @@ library(readr) argp <- arg_parser("report") argp <- add_argument(argp, "--data_file", help="tsv file with data") argp <- add_argument(argp, "--dd_file", help="json file with GSR data dictionary") +argp <- add_argument(argp, "--dd_table_name", help="name of data dictionary table in dd_file") argp <- add_argument(argp, "--analysis_file", help="tsv file with analysis table") argp <- add_argument(argp, "--stop_on_fail", flag=TRUE, help="return an error code if data_file does not pass checks") argv <- parse_args(argp) # argv <- list(data_file="testdata/gsr_chr1.tsv", # dd_file="testdata/gsr_data_model.json", +# dd_table_name="gsr_files_dd", # analysis_file="output_analysis_table.tsv") # read data model dd <- json_to_dm(argv$dd_file) -dd_table_name <- "gsr_files_dd" +dd_table_name <- argv$dd_table_name stopifnot(dd_table_name %in% names(dd)) # read 1000 rows for checking data against expected type diff --git a/gsr_data_report.wdl b/gsr_data_report.wdl index d394209..7d2b834 100644 --- a/gsr_data_report.wdl +++ b/gsr_data_report.wdl @@ -4,12 +4,14 @@ workflow gsr_data_report { input { File data_file String dd_url + String dd_table_name File analysis_file } call validate_data { input: data_file = data_file, dd_url = dd_url, + dd_table_name = dd_table_name, analysis_file = analysis_file } @@ -28,6 +30,7 @@ task validate_data { input { File data_file String dd_url + String dd_table_name File analysis_file } @@ -35,6 +38,7 @@ task validate_data { Rscript /usr/local/primed-file-checks/gsr_data_report.R \ --data_file ~{data_file} \ --dd_file ~{dd_url} \ + --dd_table_name ~{dd_table_name} \ --analysis_file ~{analysis_file} \ --stop_on_fail >>> @@ -45,7 +49,7 @@ task validate_data { } runtime { - docker: "uwgac/primed-file-checks:0.5.1" + docker: "uwgac/primed-file-checks:0.5.1-1" } } diff --git a/prep_gsr.R b/prep_gsr.R index 37f4567..1e87794 100644 --- a/prep_gsr.R +++ b/prep_gsr.R @@ -1,7 +1,8 @@ library(argparser) library(AnvilDataModels) -library(dplyr) library(readr) +library(dplyr) +library(tidyr) argp <- arg_parser("report") argp <- add_argument(argp, "--table_files", help="2-column tsv file with (table name, table tsv file)") @@ -10,7 +11,8 @@ argp <- add_argument(argp, "--hash_id_nchar", default=16, help="number of charac argv <- parse_args(argp) # argv <- list(table_files="testdata/table_files_gsr.tsv", -# model_file="testdata/gsr_data_model.json") +# model_file="testdata/gsr_data_model.json", +# hash_id_nchar=16) # read data model model <- json_to_dm(argv$model_file) @@ -18,44 +20,58 @@ model <- json_to_dm(argv$model_file) # read tables table_files <- read_tsv(argv$table_files, col_names=c("names", "files"), col_types="cc") -# read analysis field,value pairs -analysis_file <- table_files$files[table_files$names == "analysis"] -if (length(analysis_file) == 0) stop("analysis table not found in table_files") -fv <- read_tsv(analysis_file, col_types=cols(.default=col_character())) +# identify analyses +analysis_files <- table_files %>% + filter(grepl("_analysis$", names) | grepl("_file$", names)) %>% + separate(names, into=c("type", "table"), sep="_") %>% + pivot_wider(names_from=table, values_from=files) -# transpose -transpose_fv <- function(fv) { - stopifnot(setequal(names(fv), c("field", "value"))) - lapply(setNames(1:nrow(fv), fv$field), function(i) { - v <- fv$value[i] - return(v) - }) %>% - bind_cols() +if (nrow(analysis_files) == 0) stop("no valid analysis/file table pairs found") +for (i in 1:nrow(analysis_files)) { + type <- analysis_files$type[i] + analysis_table_name <- paste0(type, "_analysis") + file_table_name <- paste0(type, "_file") + + # read analysis field,value pairs + fv <- read_tsv(analysis_files$analysis[i], col_types=cols(.default=col_character())) + + # transpose + transpose_fv <- function(fv) { + stopifnot(setequal(names(fv), c("field", "value"))) + lapply(setNames(1:nrow(fv), fv$field), function(i) { + v <- fv$value[i] + return(v) + }) %>% + bind_cols() + } + analysis <- transpose_fv(fv) + + # add analysis_id + analysis_id_name <- paste0(analysis_table_name, "_id") + analysis_id <- hash_id(paste(analysis, collapse=""), nchar=argv$hash_id_nchar) + analysis <- bind_cols(as_tibble(setNames(list(analysis_id), analysis_id_name)), analysis) + + # read file table + file <- read_tsv(analysis_files$file[i], col_types=cols(.default=col_character())) + + # add analysis_id + file <- analysis %>% + select(!!analysis_id_name) %>% + bind_cols(file) + + # add file_id + file <- add_auto_columns(file, table_name=file_table_name, model=model, + error_on_missing=FALSE, nchar=argv$hash_id_nchar) + + # write tsv files + analysis_file <- paste0("output_", type, "_analysis_table.tsv") + write_tsv(analysis, analysis_file) + file_file <- paste0("output_", type, "_file_table.tsv") + write_tsv(file, file_file) + + table_files$files[table_files$names == analysis_table_name] <- analysis_file + table_files$files[table_files$names == file_table_name] <- file_file } -analysis <- transpose_fv(fv) - -# add analysis_id -analysis_id <- hash_id(paste(analysis, collapse=""), nchar=argv$hash_id_nchar) -analysis <- bind_cols(analysis_id=analysis_id, analysis) - -# read file table -file_file <- table_files$files[table_files$names == "gsr_file"] -if (length(file_file) == 0) stop("gsr_file table not found in table_files") -file <- read_tsv(file_file, col_types=cols(.default=col_character())) - -# add analysis_id -file <- bind_cols(analysis_id=analysis$analysis_id, file) - -# add file_id -file <- add_auto_columns(file, table_name="gsr_file", model=model, - error_on_missing=FALSE, nchar=argv$hash_id_nchar) - -# write tsv files -analysis_file <- "output_analysis_table.tsv" -write_tsv(analysis, analysis_file) -file_file <- "output_gsr_file_table.tsv" -write_tsv(file, file_file) # write new version of table_files -table_files <- tibble(c("analysis", "gsr_file"), c(analysis_file, file_file)) write_tsv(table_files, "output_table_files.tsv", col_names=FALSE) diff --git a/select_gsr_files.R b/select_gsr_files.R index d052c12..555443c 100644 --- a/select_gsr_files.R +++ b/select_gsr_files.R @@ -3,20 +3,13 @@ library(AnvilDataModels) library(readr) argp <- arg_parser("select") -argp <- add_argument(argp, "--table_files", help="2-column tsv file with (table name, table tsv file)") +argp <- add_argument(argp, "--file_table", help="tsv file with file table") argv <- parse_args(argp) -# read tables -table_files <- read_tsv(argv$table_files, col_names=c("names", "files"), col_types="cc") -tables <- read_data_tables(table_files$files, table_names=table_files$names) -stopifnot(setequal(names(tables), c("analysis", "gsr_file"))) +file_table <- read_tsv(argv$file_table) -analysis_id <- tables[["analysis"]]$analysis_id -stopifnot(length(analysis_id) == 1) -writeLines(analysis_id, "analysis_id.txt") - -data_files <- tables[["gsr_file"]]$file_path +data_files <- file_table$file_path writeLines(data_files, "data_files.txt") -md5 <- tables[["gsr_file"]]$md5sum +md5 <- file_table$md5sum writeLines(md5, "md5sum.txt") diff --git a/testdata/gsr_data_model.json b/testdata/gsr_data_model.json index 5ad2898..9724dfb 100644 --- a/testdata/gsr_data_model.json +++ b/testdata/gsr_data_model.json @@ -4,10 +4,10 @@ "version": "0", "tables": [ { - "table": "analysis", + "table": "gsr_analysis", "columns": [ { - "column": "analysis_id", + "column": "gsr_analysis_id", "primary_key": true, "description": "unique identifier for a gwas in primed", "data_type": "string" @@ -100,10 +100,10 @@ }, { - "column": "analysis_id", + "column": "gsr_analysis_id", "required": true, "data_type": "string", - "references": "> analysis.analysis_id" + "references": "> gsr_analysis.gsr_analysis_id" }, { "column": "chromosome", diff --git a/testdata/table_files_gsr.tsv b/testdata/table_files_gsr.tsv index 5e91807..13d0d7e 100644 --- a/testdata/table_files_gsr.tsv +++ b/testdata/table_files_gsr.tsv @@ -1,2 +1,2 @@ -analysis testdata/gsr_analysis_table.tsv -gsr_file testdata/gsr_file.tsv +association_analysis testdata/gsr_analysis_table.tsv +association_file testdata/gsr_file.tsv diff --git a/validate_gsr_model.wdl b/validate_gsr_model.wdl index 89fbe80..66cc797 100644 --- a/validate_gsr_model.wdl +++ b/validate_gsr_model.wdl @@ -24,35 +24,42 @@ workflow validate_gsr_model { import_tables = import_tables } - scatter (pair in zip(validate.data_files, validate.md5sum)) { - call md5.md5check { - input: file = pair.left, - md5sum = pair.right + scatter (pair in zip(validate.analysis_tables, validate.file_tables)) { + call select_gsr_files { + input: file_table = pair.right } - } - call md5.summarize_md5_check { - input: file = validate.data_files, - md5_check = md5check.md5_check - } + scatter (data_pair in zip(select_gsr_files.data_files, select_gsr_files.md5sum)) { + call md5.md5check { + input: file = data_pair.left, + md5sum = data_pair.right + } + } - scatter (f in validate.data_files) { - call gsr.validate_data { - input: data_file = f, - analysis_file = validate.analysis_file, - dd_url = model_url + scatter (f in select_gsr_files.data_files) { + call gsr.validate_data { + input: data_file = f, + analysis_file = pair.left, + dd_table_name = sub(basename(pair.left, "_analysis_table.tsv"), "output_", "") + "_files_dd", + dd_url = model_url + } } } + call md5.summarize_md5_check { + input: file = flatten(select_gsr_files.data_files), + md5_check = flatten(md5check.md5_check) + } + call gsr.summarize_data_check { - input: file = validate.data_files, - data_check = validate_data.pass_checks, - validation_report = validate_data.validation_report + input: file = flatten(select_gsr_files.data_files), + data_check = flatten(validate_data.pass_checks), + validation_report = flatten(validate_data.validation_report) } output { File validation_report = validate.validation_report - Array[File] tables = [validate.analysis_file, validate.gsr_file] + Array[File] tables = flatten([validate.analysis_tables, validate.file_tables]) String? md5_check_summary = summarize_md5_check.summary File? md5_check_details = summarize_md5_check.details String? data_report_summary = summarize_data_check.summary @@ -100,19 +107,36 @@ task validate { --workspace_name ~{workspace_name} \ --workspace_namespace ~{workspace_namespace} fi - Rscript /usr/local/primed-file-checks/select_gsr_files.R \ - --table_files output_tables.tsv >>> output { File validation_report = "data_model_validation.html" - File analysis_file = "output_analysis_table.tsv" - File gsr_file = "output_gsr_file_table.tsv" + Array[File] analysis_tables = glob("output_*_analysis_table.tsv") + Array[File] file_tables = glob("output_*_file_table.tsv") + } + + runtime { + docker: "uwgac/primed-file-checks:0.5.1-1" + } +} + + +task select_gsr_files { + input { + File file_table + } + + command <<< + Rscript /usr/local/primed-file-checks/select_gsr_files.R \ + --file_table ~{file_table} + >>> + + output { Array[File] data_files = read_lines("data_files.txt") Array[String] md5sum = read_lines("md5sum.txt") } runtime { - docker: "uwgac/primed-file-checks:0.5.1" + docker: "uwgac/primed-file-checks:0.5.1-1" } }