Skip to content

Commit

Permalink
Merge pull request #2 from UW-GAC/feature/add-genotype-inventory
Browse files Browse the repository at this point in the history
Genotype inventory
  • Loading branch information
amstilp authored Jul 25, 2024
2 parents 929707b + 5d9a2e2 commit ae7c8cb
Show file tree
Hide file tree
Showing 6 changed files with 219 additions and 0 deletions.
10 changes: 10 additions & 0 deletions .dockstore.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,13 @@ workflows:
primaryDescriptorPath: /primed_phenotype_inventory.wdl
testParameterFiles:
- /primed_phenotype_inventory.json
# Workflow entry for the genotype inventory: a WDL descriptor plus its
# example inputs JSON used as the test parameter file.
- name: primed_genotype_inventory
subclass: WDL
primaryDescriptorPath: /primed_genotype_inventory.wdl
testParameterFiles:
- /primed_genotype_inventory.json
# Workflow entry for the combined inventories workflow (primed_inventories.wdl),
# with its example inputs JSON used as the test parameter file.
- name: primed_inventories
subclass: WDL
primaryDescriptorPath: /primed_inventories.wdl
testParameterFiles:
- /primed_inventories.json
10 changes: 10 additions & 0 deletions primed_genotype_inventory.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
{
"primed_genotype_inventory.workspaces": {
"primed-data-prevent-1/PRIMED_ARIC_DBGAP_PHS000280_V8_P2_HMB-IRB": "ARIC",
"primed-data-dprism-1/PRIMED_RPGEH_DBGAP_PHS000788_V2_P3_HMB-IRB-NPU": "GERA, RPGEH",
"primed-data-topmed-1/PRIMED_CARDIA_TOPMED_DBGAP_PHS001612_V1_P1_HMB-IRB": "CARDIA"
},
"primed_genotype_inventory.output_workspace_namespace": "primed-adrienne",
"primed_genotype_inventory.output_workspace_name": "primed-sandbox",
"primed_genotype_inventory.output_table": "genotype_inventory"
}
44 changes: 44 additions & 0 deletions primed_genotype_inventory.wdl
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
version 1.0

# Build the genotype inventory for a set of shared workspaces and write it to
# a data table in the output workspace. All work is delegated to the
# write_primed_genotype_inventory_table task defined in this file.
workflow primed_genotype_inventory {
    input {
        # Keys are "namespace/name" workspace identifiers; values are the
        # study name(s) associated with that workspace (e.g. "GERA, RPGEH").
        Map[String, String] workspaces
        # Name of the workspace that receives the inventory table.
        String output_workspace_name
        # Namespace of the workspace that receives the inventory table.
        String output_workspace_namespace
        # Name of the data table to write the inventory to.
        String output_table
    }

    call write_primed_genotype_inventory_table {
        input:
            workspaces = workspaces,
            output_workspace_name = output_workspace_name,
            output_workspace_namespace = output_workspace_namespace,
            output_table = output_table
    }

    meta {
        author: "Adrienne Stilp"
        email: "[email protected]"
    }
}

# Run write_primed_genotype_inventory_table.R inside the
# primed-inventory-workflows container. The workspaces map is serialised with
# write_map() to a two-column TSV (workspace, studies) that the script reads.
task write_primed_genotype_inventory_table {
    input {
        # Keys are "namespace/name" workspace identifiers; values are study name(s).
        Map[String, String] workspaces
        # Name / namespace of the workspace that receives the table.
        String output_workspace_name
        String output_workspace_namespace
        # Name of the data table to write.
        String output_table
    }

    command <<<
        set -e
        # Every interpolated value is single-quoted so that whitespace or shell
        # metacharacters in a user-supplied string cannot split the argument
        # or be interpreted by the shell.
        Rscript /usr/local/primed-inventory-workflows/write_primed_genotype_inventory_table.R \
            --workspaces-file '~{write_map(workspaces)}' \
            --output-workspace-name '~{output_workspace_name}' \
            --output-workspace-namespace '~{output_workspace_namespace}' \
            --output-table-name '~{output_table}'
    >>>

    runtime {
        docker: "uwgac/primed-inventory-workflows:0.2.0"
    }
}
9 changes: 9 additions & 0 deletions primed_inventories.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
{
"primed_inventories.workspaces": {
"primed-data-prevent-1/PRIMED_ARIC_DBGAP_PHS000280_V8_P2_HMB-IRB": "ARIC",
"primed-data-dprism-1/PRIMED_RPGEH_DBGAP_PHS000788_V2_P3_HMB-IRB-NPU": "GERA, RPGEH",
"primed-data-topmed-1/PRIMED_CARDIA_TOPMED_DBGAP_PHS001612_V1_P1_HMB-IRB": "CARDIA"
},
"primed_inventories.output_workspace_namespace": "primed-adrienne",
"primed_inventories.output_workspace_name": "primed-sandbox"
}
31 changes: 31 additions & 0 deletions primed_inventories.wdl
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
version 1.0

import "primed_phenotype_inventory.wdl" as phenotype_inventory
import "primed_genotype_inventory.wdl" as genotype_inventory

# Write both inventory tables (phenotype and genotype) for the same set of
# workspaces into one output workspace. The two calls do not depend on each
# other's outputs. The output table names are fixed here rather than exposed
# as workflow inputs.
workflow primed_inventories {
    input {
        # Keys are "namespace/name" workspace identifiers; values are the
        # study name(s) associated with that workspace.
        Map[String, String] workspaces
        # Name of the workspace that receives both inventory tables.
        String output_workspace_name
        # Namespace of the workspace that receives both inventory tables.
        String output_workspace_namespace
    }

    call phenotype_inventory.write_primed_phenotype_inventory_table {
        input:
            workspaces = workspaces,
            output_workspace_name = output_workspace_name,
            output_workspace_namespace = output_workspace_namespace,
            output_table = "phenotype_inventory"
    }

    call genotype_inventory.write_primed_genotype_inventory_table {
        input:
            workspaces = workspaces,
            output_workspace_name = output_workspace_name,
            output_workspace_namespace = output_workspace_namespace,
            output_table = "genotype_inventory"
    }

    meta {
        author: "Adrienne Stilp"
        email: "[email protected]"
    }
}
115 changes: 115 additions & 0 deletions write_primed_genotype_inventory_table.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
library(argparser)
library(AnVIL)
library(AnvilDataModels)
library(dplyr)
library(readr)
library(tidyr)
library(stringr)


# Command-line interface. The parser name matches this script's file name so
# that the usage/help text refers to the right script.
argp <- arg_parser("write_primed_genotype_inventory_table.R", description="Write genotype inventory table for shared workspaces.")
argp <- add_argument(argp, "--workspaces-file", help="2-column tsv file with (workspace, studies)")
argp <- add_argument(argp, "--output-workspace-namespace", help="Namespace of the AnVIL workspace to write the genotype inventory table to.")
argp <- add_argument(argp, "--output-workspace-name", help="Name of the AnVIL workspace to write the genotype inventory table to.")
argp <- add_argument(argp, "--output-table-name", help="Name of the data table to write the genotype inventory table to.")
argv <- parse_args(argp)

# All four options are needed further down. An option that was not supplied
# shows up as NULL/NA in argv (NOTE(review): argparser is expected to use NA
# for unset options -- confirm), so fail here with a clear message instead of
# an obscure error from read_tsv() or the AnVIL calls.
required_args <- c(
  "workspaces_file",
  "output_workspace_namespace",
  "output_workspace_name",
  "output_table_name"
)
is_missing <- vapply(
  required_args,
  function(arg_name) is.null(argv[[arg_name]]) || is.na(argv[[arg_name]]),
  logical(1)
)
if (any(is_missing)) {
  stop(
    "Missing required argument(s): ",
    paste0("--", gsub("_", "-", required_args[is_missing]), collapse=", ")
  )
}

# Load the (workspace, studies) pairs; the file has no header row.
workspace_studies <- read_tsv(argv$workspaces_file, col_names=c("workspace", "studies"))
# Example input, handy when running interactively:
# workspace_studies <- tribble(
# ~workspace, ~studies,
# "primed-data-prevent-1/PRIMED_ARIC_DBGAP_PHS000280_V8_P2_HMB-IRB", "ARIC",
# "primed-data-dprism-1/PRIMED_RPGEH_DBGAP_PHS000788_V2_P3_HMB-IRB-NPU", "GERA, RPGEH",
# "primed-data-topmed-1/PRIMED_CARDIA_TOPMED_DBGAP_PHS001612_V1_P1_HMB-IRB", "CARDIA"
# )

# Data tables that may hold genotype datasets in a workspace.
genotype_tables <- c("array_dataset", "imputation_dataset", "sequencing_dataset", "simulation_dataset")

# "namespace/name" -> separate namespace and name columns. The combined
# workspace column is kept (remove=FALSE) because it is used as a key later.
workspaces <- separate(
  workspace_studies,
  workspace,
  into=c("workspace_namespace", "workspace_name"),
  sep="/",
  remove=FALSE
)

# Echo the parsed workspaces to the log as a sanity check.
print(workspaces)

# Loop over workspaces and pull the genotype inventory information.
# results_list is keyed by the full "namespace/name" workspace string; each
# element holds that workspace's rows from every genotype table it contains.
results_list <- list()
for (i in seq_along(workspaces$workspace)) {
  workspace <- workspaces$workspace[i]
  workspace_namespace <- workspaces$workspace_namespace[i]
  workspace_name <- workspaces$workspace_name[i]
  print(paste("Processing workspace:", workspace))

  # The table listing does not depend on which genotype table is being
  # processed, so request it once per workspace instead of once per table.
  available_tables <- avtables(namespace=workspace_namespace, name=workspace_name)$table

  # Per-sample-set sample counts. Fetched lazily so that sample_set is only
  # requested when the workspace has at least one genotype table, and then
  # reused for the remaining tables of the same workspace.
  number_of_samples <- NULL

  # Loop over the different genotype tables.
  workspace_results_list <- list()
  for (input_table_name in genotype_tables) {
    if (input_table_name %in% available_tables) {
      # Subset to and rename the id column ("<table>_id" -> dataset_id).
      id_column_name <- paste0(input_table_name, "_id")
      dataset <- avtable(input_table_name, namespace=workspace_namespace, name=workspace_name) %>%
        select(
          dataset_id = all_of(id_column_name),
          reference_assembly,
          sample_set_id
        )
      # Pull the sample set table and calculate the number of samples.
      if (is.null(number_of_samples)) {
        number_of_samples <- avtable("sample_set", namespace=workspace_namespace, name=workspace_name) %>%
          unnest_set_table() %>%
          count(sample_set_id, name="n_samples")
      }
      dataset <- dataset %>% left_join(number_of_samples, by="sample_set_id")
    } else {
      # Table not present in this workspace: contribute no rows.
      dataset <- tibble()
    }
    workspace_results_list[[input_table_name]] <- dataset
  }
  results_list[[workspace]] <- bind_rows(workspace_results_list, .id="genotype_dataset_table")
}

# Stack the per-workspace results (the list names become the "workspace"
# column) and attach the namespace, name and studies for each workspace.
combined_results <- bind_rows(results_list, .id="workspace")
results <- left_join(combined_results, workspaces, by="workspace")

# Output destination, taken from the command line.
# For interactive use the current workspace could be used instead:
# output_workspace = avworkspace() # This will be different when we actually run the script.
# output_workspace_namespace = str_split_1(output_workspace, pattern="/")[1]
# output_workspace_name = str_split_1(output_workspace, pattern="/")[2]
# output_table_name <- "tmp_genotype_inventory"
output_workspace_namespace <- argv$output_workspace_namespace
output_workspace_name <- argv$output_workspace_name
output_table_name <- argv$output_table_name

# The id column of the output table is named "<output table>_id".
id_column_name <- quo_name(paste0(output_table_name, "_id"))

# Put the id column first and give it its final name.
results <- select(results, dataset_id, everything())
results <- rename(results, !!id_column_name := dataset_id)
# workspace was already split into namespace and name, so the combined
# column is redundant in the output.
results <- select(results, -workspace)
print(results)

# If the output table already exists, remove its current rows first so the
# table ends up containing only the new inventory.
existing_tables <- avtables(namespace=output_workspace_namespace, name=output_workspace_name)
if (output_table_name %in% existing_tables$table) {
  original_results <- avtable(
    output_table_name,
    namespace=output_workspace_namespace,
    name=output_workspace_name
  )
  avtable_delete_values(
    output_table_name,
    original_results[[id_column_name]],
    namespace=output_workspace_namespace,
    name=output_workspace_name
  )
}

# anvil_import_tables takes a named list of tables; the name is the target table.
tables_to_import <- setNames(list(results), output_table_name)

# Write the new results to the table.
# Note: anvil_import_tables will check job status and timeout after an hour (by default).
anvil_import_tables(
  tables_to_import,
  namespace=output_workspace_namespace,
  name=output_workspace_name,
  overwrite=TRUE
)

0 comments on commit ae7c8cb

Please sign in to comment.