Skip to content

Commit

Permalink
Merge pull request #27 from UW-GAC/bucket_path
Browse files Browse the repository at this point in the history
add is_bucket_path to data model
  • Loading branch information
smgogarten authored Nov 7, 2023
2 parents d00a894 + b0f28a1 commit 824bcdb
Show file tree
Hide file tree
Showing 6 changed files with 62 additions and 39 deletions.
5 changes: 3 additions & 2 deletions PRIMED_GSR_data_model.json
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
{
"name": "PRIMED GSR Data Model",
"description": "Data model for Genomic Summary Results in the PRIMED consortium",
"version": "1.0",
"version": "1.1",
"tables": [
{
"table": "analysis",
Expand Down Expand Up @@ -423,7 +423,8 @@
"column": "file_path",
"required": true,
"description": "File path in cloud storage",
"data_type": "string"
"data_type": "string",
"is_bucket_path": true
},
{
"column": "file_type",
Expand Down
14 changes: 9 additions & 5 deletions PRIMED_genotype_data_model.json
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
{
"name": "PRIMED Genotype Data Model",
"description": "Data model for genotype data in the PRIMED consortium",
"version": "1.2",
"version": "1.3",
"tables": [
{
"table": "subject",
Expand Down Expand Up @@ -198,7 +198,8 @@
"column": "file_path",
"required": true,
"description": "absolute file path in cloud storage",
"data_type": "string"
"data_type": "string",
"is_bucket_path": true
},
{
"column": "file_type",
Expand Down Expand Up @@ -305,7 +306,8 @@
"column": "file_path",
"required": true,
"description": "absolute file path in cloud storage",
"data_type": "string"
"data_type": "string",
"is_bucket_path": true
},
{
"column": "file_type",
Expand Down Expand Up @@ -441,7 +443,8 @@
"column": "file_path",
"required": true,
"description": "absolute file path in cloud storage",
"data_type": "string"
"data_type": "string",
"is_bucket_path": true
},
{
"column": "file_type",
Expand Down Expand Up @@ -541,7 +544,8 @@
"column": "file_path",
"required": true,
"description": "absolute file path in cloud storage",
"data_type": "string"
"data_type": "string",
"is_bucket_path": true
},
{
"column": "file_type",
Expand Down
24 changes: 18 additions & 6 deletions PRIMED_phenotype_data_model.json
Original file line number Diff line number Diff line change
Expand Up @@ -129,13 +129,15 @@
"column": "file_path",
"required": true,
"description": "absolute file path in cloud storage",
"data_type": "string"
"data_type": "string",
"is_bucket_path": true
},
{
"column": "file_readme_path",
"required": true,
"description": "path to the README",
"data_type": "string"
"data_type": "string",
"is_bucket_path": true
},
{
"column": "n_subjects",
Expand All @@ -148,6 +150,12 @@
"required": true,
"description": "Number of rows in file (may be > n_subjects for longitudinal data)",
"data_type": "integer"
},
{
"column": "data_model_version",
"description": "data model version for this table",
"data_type": "float",
"notes": "added automatically by validation workflow"
}
]
},
Expand Down Expand Up @@ -179,13 +187,15 @@
"column": "file_path",
"required": true,
"description": "absolute file path in cloud storage",
"data_type": "string"
"data_type": "string",
"is_bucket_path": true
},
{
"column": "file_dd_path",
"required": true,
"description": "path to the data dictionary",
"data_type": "string"
"data_type": "string",
"is_bucket_path": true
},
{
"column": "n_subjects",
Expand Down Expand Up @@ -821,6 +831,8 @@
"required": true,
"description": "the age at which the observation or measurement for the phenotype(s) was taken",
"data_type": "float",
"min": 0,
"max": 89,
"examples": "56.2"
},
{
Expand Down Expand Up @@ -942,7 +954,7 @@
{
"column": "year_at_diagnosis_1",
"description": "year participant was diagnosed",
"data_type": "float",
"data_type": "integer",
"examples": "1999"
},
{
Expand Down Expand Up @@ -1206,7 +1218,7 @@
{
"column": "year_at_diagnosis",
"description": "year participant was diagnosed",
"data_type": "float",
"data_type": "integer",
"examples": "1999"
},
{
Expand Down
4 changes: 3 additions & 1 deletion sheets_to_JSON_genotype.R
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ library(jsonlite)
url <- "https://docs.google.com/spreadsheets/d/1lwVMGT-TQaWbMWvi3hdqWuEthZvaKGOImINAqXguPaM"
model_name <- "PRIMED Genotype Data Model"
model_description <- "Data model for genotype data in the PRIMED consortium"
model_version <- "1.2"
model_version <- "1.3"


# read in the data
Expand All @@ -27,13 +27,15 @@ rm(list = c("table_names", "url"))
for (i in 1:length(tables)) {
tables[[i]] <- tables[[i]] %>%
mutate(primary_key = ifelse(paste0(names(tables)[i], "_id") == Column, TRUE, NA)) %>%
mutate(is_bucket_path = ifelse(Column == "file_path", TRUE, NA)) %>%
select(column = Column,
primary_key,
required = Required,
description = Description,
data_type = `Data type`,
references = References,
enumerations = Enumerations,
is_bucket_path,
examples = Examples,
notes = `Notes/comments`) %>%
mutate(description=gsub('"', "'", description),
Expand Down
52 changes: 27 additions & 25 deletions sheets_to_JSON_gsr.R
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ library(jsonlite)
url <- "https://docs.google.com/spreadsheets/d/1xfSQqRQIq6pGkJ5jzzv2QhetmX5boaEZoNECpDwXe5I"
model_name <- "PRIMED GSR Data Model"
model_description <- "Data model for Genomic Summary Results in the PRIMED consortium"
model_version <- "1.0"
model_version <- "1.1"


# table metadata
Expand All @@ -28,35 +28,37 @@ for (i in 1:length(tables)) {
tmp <- tables[[i]] %>%
filter(!is.na(`Data type`)) %>% # keep only valid rows
mutate(primary_key = ifelse(paste0(names(tables)[i], "_id") == Column, TRUE, NA)) %>%
mutate(is_bucket_path = ifelse(Column == "file_path", TRUE, NA)) %>%
mutate(Description=gsub('"', "'", Description), # replace double with single quote
Description=gsub('\n', ' ', Description), # replace newline with space
`Notes/comments`=gsub('"', "'", `Notes/comments`), # replace double with single quote
`Notes/comments`=gsub('\n', ' ', `Notes/comments`), # replace newline with space
References=ifelse(grepl("omop_concept", References), NA, References)) # remove external table reference
if ("Multi-value delimiter" %in% names(tmp)) {
tables[[i]] <- tmp %>%
select(column = Column,
primary_key,
required = Required,
description = Description,
data_type = `Data type`,
references = References,
enumerations = Enumerations,
multi_value_delimiter = `Multi-value delimiter`,
examples = Examples,
notes = `Notes/comments`)
} else {
tables[[i]] <- tmp %>%
select(column = Column,
primary_key,
required = Required,
description = Description,
data_type = `Data type`,
references = References,
enumerations = Enumerations,
examples = Examples,
notes = `Notes/comments`)
}

lookup <- c(
data_type = "Data type",
multi_value_delimiter = "Multi-value delimiter",
notes = "Notes/comments"
)
tmp <- tmp %>%
rename(any_of(lookup)) %>%
rename_with(tolower)

keep_cols <- c(
"column",
"primary_key",
"required",
"description",
"data_type",
"references",
"enumerations",
"is_bucket_path",
"multi_value_delimiter",
"examples",
"notes"
)
tables[[i]] <- tmp %>%
select(any_of(keep_cols))
}
rm(list = c("tmp"))

Expand Down
2 changes: 2 additions & 0 deletions sheets_to_JSON_phenotype.R
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ rm(list = c("table_info", "url"))
for (i in 1:length(tables)) {
tmp <- tables[[i]] %>%
filter(!is.na(`Data type`)) %>% # keep only valid rows
mutate(is_bucket_path = ifelse(grepl("file_.*path", Column), TRUE, NA)) %>%
mutate(Required=as.logical(Required), # non-T/F values will be NA
Description=gsub('"', "'", Description), # replace double with single quote
Description=gsub('\n', ' ', Description), # replace newline with space
Expand Down Expand Up @@ -70,6 +71,7 @@ for (i in 1:length(tables)) {
"max",
"references",
"enumerations",
"is_bucket_path",
"multi_value_delimiter",
"examples",
"notes"
Expand Down

0 comments on commit 824bcdb

Please sign in to comment.