Skip to content

Commit

Permalink
Merge pull request #25 from UW-GAC/new_gsr_tables
Browse files Browse the repository at this point in the history
New gsr tables
  • Loading branch information
smgogarten authored Jun 26, 2024
2 parents 7f53be6 + e12fe53 commit 9088b4e
Show file tree
Hide file tree
Showing 7 changed files with 119 additions and 80 deletions.
4 changes: 3 additions & 1 deletion gsr_data_report.R
Original file line number Diff line number Diff line change
Expand Up @@ -5,17 +5,19 @@ library(readr)
# Define the command-line interface of the report script.
argp <- arg_parser("report")
# Inputs: the data file to check, the JSON data dictionary, the name of the
# table inside that dictionary to check against, and the analysis table.
argp <- add_argument(argp, "--data_file", help="tsv file with data")
argp <- add_argument(argp, "--dd_file", help="json file with GSR data dictionary")
argp <- add_argument(argp, "--dd_table_name", help="name of data dictionary table in dd_file")
argp <- add_argument(argp, "--analysis_file", help="tsv file with analysis table")
# Boolean flag (declared with flag=TRUE, so it takes no value on the command line).
argp <- add_argument(argp, "--stop_on_fail", flag=TRUE, help="return an error code if data_file does not pass checks")
# Parse the command line; values are read below as argv$<argument name>.
argv <- parse_args(argp)

# argv <- list(data_file="testdata/gsr_chr1.tsv",
# dd_file="testdata/gsr_data_model.json",
# dd_table_name="gsr_files_dd",
# analysis_file="output_analysis_table.tsv")

# read data model
dd <- json_to_dm(argv$dd_file)

# The data dictionary table to check against comes from the command line and
# must name one of the tables in the data model. Fail with a message that says
# what was asked for and what is available.
dd_table_name <- argv$dd_table_name
if (length(dd_table_name) != 1 || is.na(dd_table_name)) {
  stop("--dd_table_name is required", call.=FALSE)
}
if (!(dd_table_name %in% names(dd))) {
  stop("table '", dd_table_name, "' not found in ", argv$dd_file,
       "; available tables: ", paste(names(dd), collapse=", "),
       call.=FALSE)
}

# read 1000 rows for checking data against expected type
Expand Down
6 changes: 5 additions & 1 deletion gsr_data_report.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,14 @@ workflow gsr_data_report {
input {
File data_file
String dd_url
String dd_table_name
File analysis_file
}

call validate_data {
input: data_file = data_file,
dd_url = dd_url,
dd_table_name = dd_table_name,
analysis_file = analysis_file
}

Expand All @@ -28,13 +30,15 @@ task validate_data {
input {
File data_file
String dd_url
String dd_table_name
File analysis_file
}

command <<<
Rscript /usr/local/primed-file-checks/gsr_data_report.R \
--data_file ~{data_file} \
--dd_file ~{dd_url} \
--dd_table_name ~{dd_table_name} \
--analysis_file ~{analysis_file} \
--stop_on_fail
>>>
Expand All @@ -45,7 +49,7 @@ task validate_data {
}

runtime {
docker: "uwgac/primed-file-checks:0.5.1"
docker: "uwgac/primed-file-checks:0.5.1-1"
}
}

Expand Down
92 changes: 54 additions & 38 deletions prep_gsr.R
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
library(argparser)
library(AnvilDataModels)
library(dplyr)
library(readr)
library(dplyr)
library(tidyr)

argp <- arg_parser("report")
argp <- add_argument(argp, "--table_files", help="2-column tsv file with (table name, table tsv file)")
Expand All @@ -10,52 +11,67 @@ argp <- add_argument(argp, "--hash_id_nchar", default=16, help="number of charac
argv <- parse_args(argp)

# argv <- list(table_files="testdata/table_files_gsr.tsv",
# model_file="testdata/gsr_data_model.json")
# model_file="testdata/gsr_data_model.json",
# hash_id_nchar=16)

# read data model
model <- json_to_dm(argv$model_file)

# read tables
table_files <- read_tsv(argv$table_files, col_names=c("names", "files"), col_types="cc")

# read analysis field,value pairs
analysis_file <- table_files$files[table_files$names == "analysis"]
if (length(analysis_file) == 0) stop("analysis table not found in table_files")
fv <- read_tsv(analysis_file, col_types=cols(.default=col_character()))
# identify analyses
# Keep only tables named <type>_analysis or <type>_file, then build one row
# per type with the two file paths in columns "analysis" and "file".
# The name is split on the trailing suffix only, so a type that itself
# contains an underscore is kept intact.
table_suffix <- "_(analysis|file)$"
analysis_files <- table_files %>%
  filter(grepl(table_suffix, names)) %>%
  mutate(type=sub(table_suffix, "", names),
         table=sub("^.*_", "", names)) %>%
  select(type, table, files) %>%
  pivot_wider(names_from=table, values_from=files)

# transpose
transpose_fv <- function(fv) {
stopifnot(setequal(names(fv), c("field", "value")))
lapply(setNames(1:nrow(fv), fv$field), function(i) {
v <- fv$value[i]
return(v)
}) %>%
bind_cols()
if (nrow(analysis_files) == 0) stop("no valid analysis/file table pairs found")

# Every type needs both a <type>_analysis and a <type>_file entry. Fail here
# with a clear message rather than passing a missing path to read_tsv below.
if (!all(c("analysis", "file") %in% names(analysis_files)) ||
    anyNA(analysis_files$analysis) || anyNA(analysis_files$file)) {
  stop("each type in table_files needs both an _analysis and a _file table")
}

# Transpose a two-column (field, value) table into a one-row table with one
# column per field. Defined once, outside the loop.
transpose_fv <- function(fv) {
  stopifnot(setequal(names(fv), c("field", "value")))
  as.list(setNames(fv$value, fv$field)) %>%
    bind_cols()
}

# Process each analysis type in turn; seq_len() is safe for zero rows.
for (i in seq_len(nrow(analysis_files))) {
  type <- analysis_files$type[i]
  analysis_table_name <- paste0(type, "_analysis")
  file_table_name <- paste0(type, "_file")

  # read analysis field,value pairs
  fv <- read_tsv(analysis_files$analysis[i], col_types=cols(.default=col_character()))

  # transpose
  analysis <- transpose_fv(fv)

  # add analysis_id as the first column; the id is produced by hash_id() from
  # the concatenated field values, and the column is named <type>_analysis_id
  analysis_id_name <- paste0(analysis_table_name, "_id")
  analysis_id <- hash_id(paste(analysis, collapse=""), nchar=argv$hash_id_nchar)
  analysis <- bind_cols(as_tibble(setNames(list(analysis_id), analysis_id_name)), analysis)

  # read file table
  file <- read_tsv(analysis_files$file[i], col_types=cols(.default=col_character()))

  # add analysis_id (the single id is repeated for every row of the file table)
  file <- analysis %>%
    select(!!analysis_id_name) %>%
    bind_cols(file)

  # add file_id
  file <- add_auto_columns(file, table_name=file_table_name, model=model,
                           error_on_missing=FALSE, nchar=argv$hash_id_nchar)

  # write tsv files
  analysis_file <- paste0("output_", type, "_analysis_table.tsv")
  write_tsv(analysis, analysis_file)
  file_file <- paste0("output_", type, "_file_table.tsv")
  write_tsv(file, file_file)

  # point table_files at the newly written versions of both tables
  table_files$files[table_files$names == analysis_table_name] <- analysis_file
  table_files$files[table_files$names == file_table_name] <- file_file
}
analysis <- transpose_fv(fv)

# add analysis_id
analysis_id <- hash_id(paste(analysis, collapse=""), nchar=argv$hash_id_nchar)
analysis <- bind_cols(analysis_id=analysis_id, analysis)

# read file table
file_file <- table_files$files[table_files$names == "gsr_file"]
if (length(file_file) == 0) stop("gsr_file table not found in table_files")
file <- read_tsv(file_file, col_types=cols(.default=col_character()))

# add analysis_id
file <- bind_cols(analysis_id=analysis$analysis_id, file)

# add file_id
file <- add_auto_columns(file, table_name="gsr_file", model=model,
error_on_missing=FALSE, nchar=argv$hash_id_nchar)

# write tsv files
analysis_file <- "output_analysis_table.tsv"
write_tsv(analysis, analysis_file)
file_file <- "output_gsr_file_table.tsv"
write_tsv(file, file_file)

# write new version of table_files
table_files <- tibble(c("analysis", "gsr_file"), c(analysis_file, file_file))
write_tsv(table_files, "output_table_files.tsv", col_names=FALSE)
15 changes: 4 additions & 11 deletions select_gsr_files.R
Original file line number Diff line number Diff line change
Expand Up @@ -3,20 +3,13 @@ library(AnvilDataModels)
library(readr)

# Command-line interface: a single file table is given per invocation.
argp <- arg_parser("select")
argp <- add_argument(argp, "--file_table", help="tsv file with file table")
argv <- parse_args(argp)

# Read the file table with every column as character, so that paths and
# checksums are written out exactly as they appear in the file.
file_table <- read_tsv(argv$file_table, col_types=cols(.default=col_character()))

# Both columns are needed below; fail early if either is absent.
stopifnot(all(c("file_path", "md5sum") %in% names(file_table)))

# One data file path per line.
data_files <- file_table$file_path
writeLines(data_files, "data_files.txt")

# One md5 checksum per line, in the same row order as data_files.txt.
md5 <- file_table$md5sum
writeLines(md5, "md5sum.txt")
8 changes: 4 additions & 4 deletions testdata/gsr_data_model.json
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,10 @@
"version": "0",
"tables": [
{
"table": "analysis",
"table": "gsr_analysis",
"columns": [
{
"column": "analysis_id",
"column": "gsr_analysis_id",
"primary_key": true,
"description": "unique identifier for a gwas in primed",
"data_type": "string"
Expand Down Expand Up @@ -100,10 +100,10 @@

},
{
"column": "analysis_id",
"column": "gsr_analysis_id",
"required": true,
"data_type": "string",
"references": "> analysis.analysis_id"
"references": "> gsr_analysis.gsr_analysis_id"
},
{
"column": "chromosome",
Expand Down
4 changes: 2 additions & 2 deletions testdata/table_files_gsr.tsv
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
analysis testdata/gsr_analysis_table.tsv
gsr_file testdata/gsr_file.tsv
association_analysis testdata/gsr_analysis_table.tsv
association_file testdata/gsr_file.tsv
70 changes: 47 additions & 23 deletions validate_gsr_model.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -24,35 +24,42 @@ workflow validate_gsr_model {
import_tables = import_tables
}

scatter (pair in zip(validate.data_files, validate.md5sum)) {
call md5.md5check {
input: file = pair.left,
md5sum = pair.right
scatter (pair in zip(validate.analysis_tables, validate.file_tables)) {
call select_gsr_files {
input: file_table = pair.right
}
}

call md5.summarize_md5_check {
input: file = validate.data_files,
md5_check = md5check.md5_check
}
scatter (data_pair in zip(select_gsr_files.data_files, select_gsr_files.md5sum)) {
call md5.md5check {
input: file = data_pair.left,
md5sum = data_pair.right
}
}

scatter (f in validate.data_files) {
call gsr.validate_data {
input: data_file = f,
analysis_file = validate.analysis_file,
dd_url = model_url
scatter (f in select_gsr_files.data_files) {
call gsr.validate_data {
input: data_file = f,
analysis_file = pair.left,
dd_table_name = sub(basename(pair.left, "_analysis_table.tsv"), "output_", "") + "_files_dd",
dd_url = model_url
}
}
}

call md5.summarize_md5_check {
input: file = flatten(select_gsr_files.data_files),
md5_check = flatten(md5check.md5_check)
}

call gsr.summarize_data_check {
input: file = validate.data_files,
data_check = validate_data.pass_checks,
validation_report = validate_data.validation_report
input: file = flatten(select_gsr_files.data_files),
data_check = flatten(validate_data.pass_checks),
validation_report = flatten(validate_data.validation_report)
}

output {
File validation_report = validate.validation_report
Array[File] tables = [validate.analysis_file, validate.gsr_file]
Array[File] tables = flatten([validate.analysis_tables, validate.file_tables])
String? md5_check_summary = summarize_md5_check.summary
File? md5_check_details = summarize_md5_check.details
String? data_report_summary = summarize_data_check.summary
Expand Down Expand Up @@ -100,19 +107,36 @@ task validate {
--workspace_name ~{workspace_name} \
--workspace_namespace ~{workspace_namespace}
fi
Rscript /usr/local/primed-file-checks/select_gsr_files.R \
--table_files output_tables.tsv
>>>

output {
File validation_report = "data_model_validation.html"
File analysis_file = "output_analysis_table.tsv"
File gsr_file = "output_gsr_file_table.tsv"
Array[File] analysis_tables = glob("output_*_analysis_table.tsv")
Array[File] file_tables = glob("output_*_file_table.tsv")
}

runtime {
docker: "uwgac/primed-file-checks:0.5.1-1"
}
}


# Run select_gsr_files.R on one file table and expose the two text files it
# writes as task outputs.
task select_gsr_files {
    input {
        # tsv file table passed to the script as --file_table
        File file_table
    }

    command <<<
        Rscript /usr/local/primed-file-checks/select_gsr_files.R \
            --file_table ~{file_table}
    >>>

    output {
        # one array element per line of each file written by the script
        Array[File] data_files = read_lines("data_files.txt")
        Array[String] md5sum = read_lines("md5sum.txt")
    }

    runtime {
        # a single docker attribute; a runtime key must not be repeated
        docker: "uwgac/primed-file-checks:0.5.1-1"
    }
}

0 comments on commit 9088b4e

Please sign in to comment.