-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathprep_phenotypes.R
122 lines (100 loc) · 4.15 KB
/
prep_phenotypes.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
library(argparser)
library(AnVIL)
library(AnvilDataModels)
library(dplyr)
library(readr)
library(jsonlite)
argp <- arg_parser("prep")
argp <- add_argument(argp, "--table_files", help="2-column tsv file with (table name, table tsv file)")
argp <- add_argument(argp, "--model_file", help="json file with data model")
argp <- add_argument(argp, "--workspace_name", help="name of AnVIL workspace to import data to")
argp <- add_argument(argp, "--workspace_namespace", help="namespace of AnVIL workspace to import data to")
argv <- parse_args(argp)
# argv <- list(table_files="testdata/table_files_phenotype.tsv",
# model_file="testdata/data_model_phen.json",
# workspace_name="PRIMED_genotype_test", workspace_namespace="primed-stephanie")
# read data model
model <- fromJSON(argv$model_file)
table_versions <- model$tables %>%
select(table, data_model_version=version) %>%
filter(!is.na(data_model_version))
# read tables
table_files <- read_tsv(argv$table_files, col_names=c("names", "files"), col_types="cc")
stopifnot(all(table_files$names %in% c("subject", "phenotype_harmonized", "phenotype_unharmonized")))
get_table <- function(x) {
table_files %>%
filter(names == x) %>%
select(files) %>%
unlist() %>%
read_tsv()
}
# make sure we have a subject table
if ("subject" %in% table_files$names) {
subj <- get_table("subject")
} else {
existing_table_names <- avtables(namespace=argv$workspace_namespace, name=argv$workspace_name)$table
if ("subject" %in% existing_table_names) {
subj <- avtable("subject", namespace=argv$workspace_namespace, name=argv$workspace_name)
} else {
stop("subject table must be provided if not present in workspace")
}
}
# check subject id in files
check_subject_id <- function(phen_table) {
for (i in 1:nrow(phen_table)) {
message("checking subjects in file ", phen_table$file_path)
phen <- read_tsv(phen_table$file_path[i])
if (!("subject_id" %in% names(phen))) {
stop("no subject_id column found")
}
extra <- setdiff(phen$subject_id, subj$subject_id)
if (length(extra) > 0) {
stop("subject_id values not present in subject table: ", paste(extra, collapse=", "))
}
ns1 <- phen_table$n_subjects[i]
ns2 <- length(unique(phen$subject_id))
if (ns1 != ns2) {
stop("reported ", ns1, " subjects but counted ", ns2)
}
nr1 <- phen_table$n_rows[i]
nr2 <- nrow(phen)
if (nr1 != nr2) {
stop("reported ", nr1, " rows but counted ", nr2)
}
}
}
# columns in common between harmonized and unharmonized tables
common_cols <- c("md5sum", "file_path", "n_subjects", "n_rows")
# tables to validate - to be updated below
validate_files <- table_files
if ("phenotype_harmonized" %in% table_files$names) {
# read phenotype table
phen_table <- get_table("phenotype_harmonized")
stopifnot(all(c(common_cols, "domain", "file_readme_path") %in% names(phen_table)))
# add model version
phen_table <- phen_table %>%
left_join(table_versions, by=c("domain"="table"))
outfile <- "output_phenotype_harmonized.tsv"
write_tsv(phen_table, outfile)
table_files$files[table_files$names == "phenotype_harmonized"] <- outfile
# copy files to local instance
gsutil_cp(phen_table$file_path, ".")
phen_table$file_path <- basename(phen_table$file_path)
# check subject_id in tables
check_subject_id(phen_table)
# tables to validate - add harmonized tables
validate_files <- phen_table %>%
select(names=domain, files=file_path) %>%
bind_rows(table_files)
}
if ("phenotype_unharmonized" %in% table_files$names) {
# read phenotype table
phen_table <- get_table("phenotype_unharmonized")
stopifnot(all(c(common_cols, "description", "file_dd_path") %in% names(phen_table)))
# copy files to local instance
gsutil_cp(phen_table$file_path, ".")
phen_table$file_path <- basename(phen_table$file_path)
# check subject_id
check_subject_id(phen_table)
}
write_tsv(validate_files, "output_table_files_validate.tsv", col_names=FALSE)