Merge pull request #27 from UW-GAC/bucket_path

add is_bucket_path to data model
UW-GAC · Nov 7, 2023 · 824bcdb
2 parents d00a894 + b0f28a1
commit 824bcdb
Show file tree

Hide file tree

Showing 6 changed files with 62 additions and 39 deletions.
diff --git a/PRIMED_GSR_data_model.json b/PRIMED_GSR_data_model.json
@@ -1,7 +1,7 @@
 {
   "name": "PRIMED GSR Data Model",
   "description": "Data model for Genomic Summary Results in the PRIMED consortium",
-  "version": "1.0",
+  "version": "1.1",
   "tables": [
     {
       "table": "analysis",
@@ -423,7 +423,8 @@
           "column": "file_path",
           "required": true,
           "description": "File path in cloud storage",
-          "data_type": "string"
+          "data_type": "string",
+          "is_bucket_path": true
         },
         {
           "column": "file_type",

diff --git a/PRIMED_genotype_data_model.json b/PRIMED_genotype_data_model.json
@@ -1,7 +1,7 @@
 {
   "name": "PRIMED Genotype Data Model",
   "description": "Data model for genotype data in the PRIMED consortium",
-  "version": "1.2",
+  "version": "1.3",
   "tables": [
     {
       "table": "subject",
@@ -198,7 +198,8 @@
           "column": "file_path",
           "required": true,
           "description": "absolute file path in cloud storage",
-          "data_type": "string"
+          "data_type": "string",
+          "is_bucket_path": true
         },
         {
           "column": "file_type",
@@ -305,7 +306,8 @@
           "column": "file_path",
           "required": true,
           "description": "absolute file path in cloud storage",
-          "data_type": "string"
+          "data_type": "string",
+          "is_bucket_path": true
         },
         {
           "column": "file_type",
@@ -441,7 +443,8 @@
           "column": "file_path",
           "required": true,
           "description": "absolute file path in cloud storage",
-          "data_type": "string"
+          "data_type": "string",
+          "is_bucket_path": true
         },
         {
           "column": "file_type",
@@ -541,7 +544,8 @@
           "column": "file_path",
           "required": true,
           "description": "absolute file path in cloud storage",
-          "data_type": "string"
+          "data_type": "string",
+          "is_bucket_path": true
         },
         {
           "column": "file_type",

diff --git a/PRIMED_phenotype_data_model.json b/PRIMED_phenotype_data_model.json
@@ -129,13 +129,15 @@
           "column": "file_path",
           "required": true,
           "description": "absolute file path in cloud storage",
-          "data_type": "string"
+          "data_type": "string",
+          "is_bucket_path": true
         },
         {
           "column": "file_readme_path",
           "required": true,
           "description": "path to the README",
-          "data_type": "string"
+          "data_type": "string",
+          "is_bucket_path": true
         },
         {
           "column": "n_subjects",
@@ -148,6 +150,12 @@
           "required": true,
           "description": "Number of rows in file (may be > n_subjects for longitudinal data)",
           "data_type": "integer"
+        },
+        {
+          "column": "data_model_version",
+          "description": "data model version for this table",
+          "data_type": "float",
+          "notes": "added automatically by validation workflow"
         }
       ]
     },
@@ -179,13 +187,15 @@
           "column": "file_path",
           "required": true,
           "description": "absolute file path in cloud storage",
-          "data_type": "string"
+          "data_type": "string",
+          "is_bucket_path": true
         },
         {
           "column": "file_dd_path",
           "required": true,
           "description": "path to the data dictionary",
-          "data_type": "string"
+          "data_type": "string",
+          "is_bucket_path": true
         },
         {
           "column": "n_subjects",
@@ -821,6 +831,8 @@
           "required": true,
           "description": "the age at which the observation or measurement for the phenotype(s) were taken",
           "data_type": "float",
+          "min": 0,
+          "max": 89,
           "examples": "56.2"
         },
         {
@@ -942,7 +954,7 @@
         {
           "column": "year_at_diagnosis_1",
           "description": "year participant was diagnosed",
-          "data_type": "float",
+          "data_type": "integer",
           "examples": "1999"
         },
         {
@@ -1206,7 +1218,7 @@
         {
           "column": "year_at_diagnosis",
           "description": "year participant was diagnosed",
-          "data_type": "float",
+          "data_type": "integer",
           "examples": "1999"
         },
         {

diff --git a/sheets_to_JSON_genotype.R b/sheets_to_JSON_genotype.R
@@ -8,7 +8,7 @@ library(jsonlite)
 url <- "https://docs.google.com/spreadsheets/d/1lwVMGT-TQaWbMWvi3hdqWuEthZvaKGOImINAqXguPaM"
 model_name <- "PRIMED Genotype Data Model"
 model_description <- "Data model for genotype data in the PRIMED consortium"
-model_version <- "1.2"
+model_version <- "1.3"
 
 
 # read in the data
@@ -27,13 +27,15 @@ rm(list = c("table_names", "url"))
 for (i in 1:length(tables)) {
     tables[[i]] <- tables[[i]] %>%
         mutate(primary_key = ifelse(paste0(names(tables)[i], "_id") == Column, TRUE, NA)) %>%
+        mutate(is_bucket_path = ifelse(Column == "file_path", TRUE, NA)) %>%
         select(column = Column, 
                primary_key,
                required = Required,
                description = Description, 
                data_type = `Data type`, 
                references = References, 
                enumerations = Enumerations, 
+               is_bucket_path,
                examples = Examples, 
                notes = `Notes/comments`) %>%
         mutate(description=gsub('"', "'", description),

diff --git a/sheets_to_JSON_gsr.R b/sheets_to_JSON_gsr.R
@@ -9,7 +9,7 @@ library(jsonlite)
 url <- "https://docs.google.com/spreadsheets/d/1xfSQqRQIq6pGkJ5jzzv2QhetmX5boaEZoNECpDwXe5I"
 model_name <- "PRIMED GSR Data Model"
 model_description <- "Data model for Genomic Summary Results in the PRIMED consortium"
-model_version <- "1.0"
+model_version <- "1.1"
 
 
 # table metadata
@@ -28,35 +28,37 @@ for (i in 1:length(tables)) {
     tmp <- tables[[i]] %>%
         filter(!is.na(`Data type`)) %>% # keep only valid rows
         mutate(primary_key = ifelse(paste0(names(tables)[i], "_id") == Column, TRUE, NA)) %>%
+        mutate(is_bucket_path = ifelse(Column == "file_path", TRUE, NA)) %>%
         mutate(Description=gsub('"', "'", Description), # replace double with single quote
                Description=gsub('\n', ' ', Description), # replace newline with space
                `Notes/comments`=gsub('"', "'", `Notes/comments`), # replace double with single quote
                `Notes/comments`=gsub('\n', ' ', `Notes/comments`), # replace newline with space
                References=ifelse(grepl("omop_concept", References), NA, References)) # remove external table reference
-    if ("Multi-value delimiter" %in% names(tmp)) {
-        tables[[i]] <- tmp %>%
-            select(column = Column, 
-               primary_key,
-               required = Required,
-               description = Description, 
-               data_type = `Data type`, 
-               references = References, 
-               enumerations = Enumerations, 
-               multi_value_delimiter = `Multi-value delimiter`,
-               examples = Examples, 
-               notes = `Notes/comments`)
-    } else {
-        tables[[i]] <- tmp %>%
-            select(column = Column, 
-               primary_key,
-               required = Required,
-               description = Description, 
-               data_type = `Data type`, 
-               references = References, 
-               enumerations = Enumerations, 
-               examples = Examples, 
-               notes = `Notes/comments`)
-    }
+
+    lookup <- c(
+        data_type = "Data type", 
+        multi_value_delimiter = "Multi-value delimiter",
+        notes = "Notes/comments"
+    )
+    tmp <- tmp %>%
+        rename(any_of(lookup)) %>%
+        rename_with(tolower)
+
+    keep_cols <- c(
+        "column", 
+        "primary_key",
+        "required",
+        "description", 
+        "data_type", 
+        "references", 
+        "enumerations", 
+        "is_bucket_path",
+        "multi_value_delimiter",
+        "examples", 
+        "notes"
+    )
+    tables[[i]] <- tmp %>%
+        select(any_of(keep_cols))
 }
 rm(list = c("tmp"))
 

diff --git a/sheets_to_JSON_phenotype.R b/sheets_to_JSON_phenotype.R
@@ -37,6 +37,7 @@ rm(list = c("table_info", "url"))
 for (i in 1:length(tables)) {
     tmp <- tables[[i]] %>%
         filter(!is.na(`Data type`)) %>% # keep only valid rows
+        mutate(is_bucket_path = ifelse(grepl("file_.*path", Column), TRUE, NA)) %>%
         mutate(Required=as.logical(Required), # non-T/F values will be NA
                Description=gsub('"', "'", Description), # replace double with single quote
                Description=gsub('\n', ' ', Description), # replace newline with space
@@ -70,6 +71,7 @@ for (i in 1:length(tables)) {
         "max",
         "references", 
         "enumerations", 
+        "is_bucket_path",
         "multi_value_delimiter",
         "examples", 
         "notes"