From 1df9b2c51213742c59aebfbea8fab17b7da28e26 Mon Sep 17 00:00:00 2001
From: Alyna Khan <alynak@uw.edu>
Date: Fri, 1 Sep 2023 11:09:09 -0700
Subject: [PATCH 1/9] changed pheno_cad_id reference from age_at_observation
 to age_at_obs

---
 PRIMED_phenotype_data_model.json | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/PRIMED_phenotype_data_model.json b/PRIMED_phenotype_data_model.json
index 66ffffc..39918de 100755
--- a/PRIMED_phenotype_data_model.json
+++ b/PRIMED_phenotype_data_model.json
@@ -846,8 +846,8 @@
         {
           "column": "pheno_cad_id",
           "data_type": "string",
-          "references": "from: subject_id, age_at_observation",
-          "notes": "values auto-generated by CC using subject_id and age_at_observation"
+          "references": "from: subject_id, age_at_obs",
+          "notes": "values auto-generated by CC using subject_id and age_at_obs"
         },
         {
           "column": "subject_id",

From 346ea5dab2c9d3beda5a8539d7b942e84d53cd45 Mon Sep 17 00:00:00 2001
From: "Stephanie M. Gogarten" <sdmorris@uw.edu>
Date: Fri, 8 Sep 2023 16:13:24 -0700
Subject: [PATCH 2/9] don't auto-generate additional primary keys for pheno
 tables

Since pheno tables aren't imported into AnVIL data tables, they
do not need a single column as the primary key. Adding this column
only adds confusion and leads to multiple versions of the data
file.
---
 PRIMED_phenotype_data_model.json | 79 ++++++++++----------------------
 sheets_to_JSON_phenotype.R       | 17 +++++--
 2 files changed, 38 insertions(+), 58 deletions(-)

diff --git a/PRIMED_phenotype_data_model.json b/PRIMED_phenotype_data_model.json
index 39918de..717e635 100755
--- a/PRIMED_phenotype_data_model.json
+++ b/PRIMED_phenotype_data_model.json
@@ -1,7 +1,7 @@
 {
   "name": "PRIMED Phenotype Data Model",
   "description": "Data model for phenotype data in the PRIMED consortium",
-  "version": "1.2",
+  "version": "1.3",
   "tables": [
     {
       "table": "subject",
@@ -279,14 +279,9 @@
       "table": "cmqt_flags",
       "url": "https://docs.google.com/spreadsheets/d/1vQE4gHvKQUOLPaRt1bpLLfNUFp2st9qOl56zD57gob0/edit#gid=0",
       "columns": [
-        {
-          "column": "pheno_flag_id",
-          "data_type": "string",
-          "references": "from: subject_id, age_at_obs",
-          "notes": "values auto-generated by CC using subject_id and age_at_obs"
-        },
         {
           "column": "subject_id",
+          "primary_key": true,
           "required": true,
           "description": "the PRIMED subject id",
           "data_type": "string",
@@ -295,6 +290,7 @@
         },
         {
           "column": "age_at_obs",
+          "primary_key": true,
           "required": true,
           "description": "the age at which the observation or measurement for the phenotype(s) were taken",
           "data_type": "float"
@@ -427,14 +423,9 @@
       "table": "cmqt_anthropometry",
       "url": "https://docs.google.com/spreadsheets/d/1vQE4gHvKQUOLPaRt1bpLLfNUFp2st9qOl56zD57gob0/edit#gid=1206657525",
       "columns": [
-        {
-          "column": "pheno_anthropometry_id",
-          "data_type": "string",
-          "references": "from: subject_id, age_at_obs",
-          "notes": "values auto-generated by CC using subject_id and age_at_obs"
-        },
         {
           "column": "subject_id",
+          "primary_key": true,
           "required": true,
           "description": "the PRIMED subject id",
           "data_type": "string",
@@ -443,6 +434,7 @@
         },
         {
           "column": "age_at_obs",
+          "primary_key": true,
           "required": true,
           "description": "the age at which the observation or measurement for the phenotype(s) were taken",
           "data_type": "float",
@@ -472,6 +464,11 @@
           "description": "body mass index calculated",
           "data_type": "float",
           "examples": "26.45"
+        },
+        {
+          "column": "waist_hip_ratio_1",
+          "description": "waist hip ratio calculated",
+          "data_type": "float"
         }
       ]
     },
@@ -479,14 +476,9 @@
       "table": "cmqt_blood_pressure",
       "url": "https://docs.google.com/spreadsheets/d/1vQE4gHvKQUOLPaRt1bpLLfNUFp2st9qOl56zD57gob0/edit#gid=455811479",
       "columns": [
-        {
-          "column": "pheno_blood_pressure_id",
-          "data_type": "string",
-          "references": "from: subject_id, age_at_obs",
-          "notes": "values auto-generated by CC using subject_id and age_at_obs"
-        },
         {
           "column": "subject_id",
+          "primary_key": true,
           "required": true,
           "description": "the PRIMED subject id",
           "data_type": "string",
@@ -495,6 +487,7 @@
         },
         {
           "column": "age_at_obs",
+          "primary_key": true,
           "required": true,
           "description": "the age at which the observation or measurement for the phenotype(s) were taken",
           "data_type": "float",
@@ -525,14 +518,9 @@
       "table": "cmqt_lipids",
       "url": "https://docs.google.com/spreadsheets/d/1vQE4gHvKQUOLPaRt1bpLLfNUFp2st9qOl56zD57gob0/edit#gid=1052869785",
       "columns": [
-        {
-          "column": "pheno_lipids_id",
-          "data_type": "string",
-          "references": "from: subject_id, age_at_obs",
-          "notes": "values auto-generated by CC using subject_id and age_at_obs"
-        },
         {
           "column": "subject_id",
+          "primary_key": true,
           "required": true,
           "description": "the PRIMED subject id",
           "data_type": "string",
@@ -541,6 +529,7 @@
         },
         {
           "column": "age_at_obs",
+          "primary_key": true,
           "required": true,
           "description": "the age at which the observation or measurement for the phenotype(s) were taken",
           "data_type": "float",
@@ -588,14 +577,9 @@
       "table": "cmqt_hematology",
       "url": "https://docs.google.com/spreadsheets/d/1vQE4gHvKQUOLPaRt1bpLLfNUFp2st9qOl56zD57gob0/edit#gid=1535206686",
       "columns": [
-        {
-          "column": "pheno_hematology_id",
-          "data_type": "string",
-          "references": "from: subject_id, age_at_obs",
-          "notes": "values auto-generated by CC using subject_id and age_at_obs"
-        },
         {
           "column": "subject_id",
+          "primary_key": true,
           "required": true,
           "description": "the PRIMED subject id",
           "data_type": "string",
@@ -604,6 +588,7 @@
         },
         {
           "column": "age_at_obs",
+          "primary_key": true,
           "required": true,
           "description": "the age at which the observation or measurement for the phenotype(s) were taken",
           "data_type": "float",
@@ -697,14 +682,9 @@
       "table": "cmqt_glycemic",
       "url": "https://docs.google.com/spreadsheets/d/1vQE4gHvKQUOLPaRt1bpLLfNUFp2st9qOl56zD57gob0/edit#gid=2078107573",
       "columns": [
-        {
-          "column": "pheno_glycemic_id",
-          "data_type": "string",
-          "references": "from: subject_id, age_at_obs",
-          "notes": "values auto-generated by CC using subject_id and age_at_obs"
-        },
         {
           "column": "subject_id",
+          "primary_key": true,
           "required": true,
           "description": "the PRIMED subject id",
           "data_type": "string",
@@ -713,6 +693,7 @@
         },
         {
           "column": "age_at_obs",
+          "primary_key": true,
           "required": true,
           "description": "the age at which the observation or measurement for the phenotype(s) were taken",
           "data_type": "float",
@@ -751,14 +732,9 @@
       "table": "cmqt_kidney_function",
       "url": "https://docs.google.com/spreadsheets/d/1vQE4gHvKQUOLPaRt1bpLLfNUFp2st9qOl56zD57gob0/edit#gid=746527944",
       "columns": [
-        {
-          "column": "pheno_kidney_function_id",
-          "data_type": "string",
-          "references": "from: subject_id, age_at_obs",
-          "notes": "values auto-generated by CC using subject_id and age_at_obs"
-        },
         {
           "column": "subject_id",
+          "primary_key": true,
           "required": true,
           "description": "the PRIMED subject id",
           "data_type": "string",
@@ -767,6 +743,7 @@
         },
         {
           "column": "age_at_obs",
+          "primary_key": true,
           "required": true,
           "description": "the age at which the observation or measurement for the phenotype(s) were taken",
           "data_type": "float",
@@ -795,14 +772,9 @@
       "table": "diabetes_diabetes",
       "url": "https://docs.google.com/spreadsheets/d/1vQE4gHvKQUOLPaRt1bpLLfNUFp2st9qOl56zD57gob0/edit#gid=1891810958",
       "columns": [
-        {
-          "column": "pheno_diabetes_id",
-          "data_type": "string",
-          "references": "from: subject_id, age_at_obs",
-          "notes": "values auto-generated by CC using subject_id and age_at_obs"
-        },
         {
           "column": "subject_id",
+          "primary_key": true,
           "required": true,
           "description": "the PRIMED subject id",
           "data_type": "string",
@@ -811,6 +783,7 @@
         },
         {
           "column": "age_at_obs",
+          "primary_key": true,
           "required": true,
           "description": "the age at which the observation or measurement for the phenotype(s) were taken",
           "data_type": "float",
@@ -843,14 +816,9 @@
       "table": "cvd_cad",
       "url": "https://docs.google.com/spreadsheets/d/1gchIrBIPt2s_3uVEloUK1c1Rst6IxnbBm7Zwps2k31I/edit#gid=253559161",
       "columns": [
-        {
-          "column": "pheno_cad_id",
-          "data_type": "string",
-          "references": "from: subject_id, age_at_obs",
-          "notes": "values auto-generated by CC using subject_id and age_at_obs"
-        },
         {
           "column": "subject_id",
+          "primary_key": true,
           "required": true,
           "description": "the PRIMED subject id",
           "data_type": "string",
@@ -859,6 +827,7 @@
         },
         {
           "column": "age_at_obs",
+          "primary_key": true,
           "required": true,
           "description": "the age at which the observation or measurement for the phenotype(s) were taken",
           "data_type": "float",
diff --git a/sheets_to_JSON_phenotype.R b/sheets_to_JSON_phenotype.R
index 56a8ae8..b6103bd 100644
--- a/sheets_to_JSON_phenotype.R
+++ b/sheets_to_JSON_phenotype.R
@@ -9,12 +9,12 @@ library(jsonlite)
 url <- "https://docs.google.com/spreadsheets/d/1kpWz-6QfjMPVtm62fQwm4hoxzXhR0dnKxVt02fbx9ks"
 model_name <- "PRIMED Phenotype Data Model"
 model_description <- "Data model for phenotype data in the PRIMED consortium"
-model_version <-"1.2"
-
+model_version <-"1.3"
 
 # table metadata
 meta <- read_sheet(url, sheet="Description", skip=1) %>%
-    select(table=Table, required=Required, url=Link)
+    select(table=Table, required=Required, url=Link) %>%
+    filter(!is.na(url)) # only keep tables with links
 
 #table_names <- meta$table
 #tables <- lapply(table_names, function(x) read_sheet(url, sheet=x, skip=1))
@@ -42,6 +42,11 @@ for (i in 1:length(tables)) {
                Description=gsub('\n', ' ', Description), # replace newline with space
                `Notes/comments`=gsub('"', "'", `Notes/comments`), # replace double with single quote
                `Notes/comments`=gsub('\n', ' ', `Notes/comments`)) # replace newline with space
+    
+    
+    nofix <- c("pilot", "subject", "population_descriptor", 
+                   "phenotype_harmonized", "phenotype_unharmonized")
+    if (names(tables)[i] %in% nofix) { # temporary
     if ("Primary key" %in% names(tmp)) {
         tmp <- tmp %>%
             rename(primary_key = `Primary key`)
@@ -49,6 +54,12 @@ for (i in 1:length(tables)) {
         tmp <- tmp %>%
             mutate(primary_key = ifelse(paste0(names(tables)[i], "_id") == Column, TRUE, NA))
     }
+    } else { # temporary
+        tmp <- tmp %>%
+            mutate(primary_key = ifelse(Column %in% c("subject_id", "age_at_obs"), TRUE, NA))
+        tmp <- tmp[-1,] # remove auto-generated primary key
+    }
+    
     if ("Multi-value delimiter" %in% names(tmp)) {
         tables[[i]] <- tmp %>%
             select(column = Column, 

From e146717dcb0bf1812b84cbbab510d367e8565b43 Mon Sep 17 00:00:00 2001
From: "Stephanie M. Gogarten" <sdmorris@uw.edu>
Date: Mon, 11 Sep 2023 12:49:47 -0700
Subject: [PATCH 3/9] remove temporary code after updating phenotype google
 sheets

---
 sheets_to_JSON_phenotype.R | 9 ---------
 1 file changed, 9 deletions(-)

diff --git a/sheets_to_JSON_phenotype.R b/sheets_to_JSON_phenotype.R
index b6103bd..8b270be 100644
--- a/sheets_to_JSON_phenotype.R
+++ b/sheets_to_JSON_phenotype.R
@@ -43,10 +43,6 @@ for (i in 1:length(tables)) {
                `Notes/comments`=gsub('"', "'", `Notes/comments`), # replace double with single quote
                `Notes/comments`=gsub('\n', ' ', `Notes/comments`)) # replace newline with space
     
-    
-    nofix <- c("pilot", "subject", "population_descriptor", 
-                   "phenotype_harmonized", "phenotype_unharmonized")
-    if (names(tables)[i] %in% nofix) { # temporary
     if ("Primary key" %in% names(tmp)) {
         tmp <- tmp %>%
             rename(primary_key = `Primary key`)
@@ -54,11 +50,6 @@ for (i in 1:length(tables)) {
         tmp <- tmp %>%
             mutate(primary_key = ifelse(paste0(names(tables)[i], "_id") == Column, TRUE, NA))
     }
-    } else { # temporary
-        tmp <- tmp %>%
-            mutate(primary_key = ifelse(Column %in% c("subject_id", "age_at_obs"), TRUE, NA))
-        tmp <- tmp[-1,] # remove auto-generated primary key
-    }
     
     if ("Multi-value delimiter" %in% names(tmp)) {
         tables[[i]] <- tmp %>%

From c9ff382bad75c366a4e7915bc9a0218ce1645b17 Mon Sep 17 00:00:00 2001
From: "Stephanie M. Gogarten" <sdmorris@uw.edu>
Date: Fri, 29 Sep 2023 13:15:01 -0700
Subject: [PATCH 4/9] add version numbers to phenotype tables

---
 PRIMED_phenotype_data_model.json | 19 ++++++++++++++++++-
 sheets_to_JSON_phenotype.R       |  6 +++---
 2 files changed, 21 insertions(+), 4 deletions(-)

diff --git a/PRIMED_phenotype_data_model.json b/PRIMED_phenotype_data_model.json
index 717e635..6f7062d 100755
--- a/PRIMED_phenotype_data_model.json
+++ b/PRIMED_phenotype_data_model.json
@@ -1,7 +1,7 @@
 {
   "name": "PRIMED Phenotype Data Model",
   "description": "Data model for phenotype data in the PRIMED consortium",
-  "version": "1.3",
+  "version": "1.4",
   "tables": [
     {
       "table": "subject",
@@ -57,6 +57,7 @@
     {
       "table": "population_descriptor",
       "url": "https://docs.google.com/spreadsheets/d/1kpWz-6QfjMPVtm62fQwm4hoxzXhR0dnKxVt02fbx9ks/edit#gid=1733510035",
+      "version": "1.0",
       "columns": [
         {
           "column": "population_descriptor_id",
@@ -203,6 +204,7 @@
     {
       "table": "pilot",
       "url": "https://docs.google.com/spreadsheets/d/1bo_I8_yOx0sXK9UcNAJ8b8DoKe8O3W4S9zzZG-y0thM/edit#gid=0",
+      "version": "1.0",
       "columns": [
         {
           "column": "subject_id",
@@ -278,6 +280,7 @@
     {
       "table": "cmqt_flags",
       "url": "https://docs.google.com/spreadsheets/d/1vQE4gHvKQUOLPaRt1bpLLfNUFp2st9qOl56zD57gob0/edit#gid=0",
+      "version": "1.0",
       "columns": [
         {
           "column": "subject_id",
@@ -422,6 +425,7 @@
     {
       "table": "cmqt_anthropometry",
       "url": "https://docs.google.com/spreadsheets/d/1vQE4gHvKQUOLPaRt1bpLLfNUFp2st9qOl56zD57gob0/edit#gid=1206657525",
+      "version": "1.0",
       "columns": [
         {
           "column": "subject_id",
@@ -475,6 +479,7 @@
     {
       "table": "cmqt_blood_pressure",
       "url": "https://docs.google.com/spreadsheets/d/1vQE4gHvKQUOLPaRt1bpLLfNUFp2st9qOl56zD57gob0/edit#gid=455811479",
+      "version": "1.0",
       "columns": [
         {
           "column": "subject_id",
@@ -511,12 +516,19 @@
           "description": "Resting diastolic blood pressure from the upper arm in a clinical setting",
           "data_type": "float",
           "examples": "80"
+        },
+        {
+          "column": "hypertension_1",
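+          "description": "Indicator of whether or not a participant has a combined systolic and diastolic reading of greater than or equal to 140/90mmHg",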
+          "data_type": "enumeration",
+          "enumerations": ["0", "1"]
         }
       ]
     },
     {
       "table": "cmqt_lipids",
       "url": "https://docs.google.com/spreadsheets/d/1vQE4gHvKQUOLPaRt1bpLLfNUFp2st9qOl56zD57gob0/edit#gid=1052869785",
+      "version": "1.0",
       "columns": [
         {
           "column": "subject_id",
@@ -576,6 +588,7 @@
     {
       "table": "cmqt_hematology",
       "url": "https://docs.google.com/spreadsheets/d/1vQE4gHvKQUOLPaRt1bpLLfNUFp2st9qOl56zD57gob0/edit#gid=1535206686",
+      "version": "1.0",
       "columns": [
         {
           "column": "subject_id",
@@ -681,6 +694,7 @@
     {
       "table": "cmqt_glycemic",
       "url": "https://docs.google.com/spreadsheets/d/1vQE4gHvKQUOLPaRt1bpLLfNUFp2st9qOl56zD57gob0/edit#gid=2078107573",
+      "version": "1.0",
       "columns": [
         {
           "column": "subject_id",
@@ -731,6 +745,7 @@
     {
       "table": "cmqt_kidney_function",
       "url": "https://docs.google.com/spreadsheets/d/1vQE4gHvKQUOLPaRt1bpLLfNUFp2st9qOl56zD57gob0/edit#gid=746527944",
+      "version": "1.0",
       "columns": [
         {
           "column": "subject_id",
@@ -771,6 +786,7 @@
     {
       "table": "diabetes_diabetes",
       "url": "https://docs.google.com/spreadsheets/d/1vQE4gHvKQUOLPaRt1bpLLfNUFp2st9qOl56zD57gob0/edit#gid=1891810958",
+      "version": "1.0",
       "columns": [
         {
           "column": "subject_id",
@@ -815,6 +831,7 @@
     {
       "table": "cvd_cad",
       "url": "https://docs.google.com/spreadsheets/d/1gchIrBIPt2s_3uVEloUK1c1Rst6IxnbBm7Zwps2k31I/edit#gid=253559161",
+      "version": "1.0",
       "columns": [
         {
           "column": "subject_id",
diff --git a/sheets_to_JSON_phenotype.R b/sheets_to_JSON_phenotype.R
index 8b270be..ee8458e 100644
--- a/sheets_to_JSON_phenotype.R
+++ b/sheets_to_JSON_phenotype.R
@@ -9,11 +9,11 @@ library(jsonlite)
 url <- "https://docs.google.com/spreadsheets/d/1kpWz-6QfjMPVtm62fQwm4hoxzXhR0dnKxVt02fbx9ks"
 model_name <- "PRIMED Phenotype Data Model"
 model_description <- "Data model for phenotype data in the PRIMED consortium"
-model_version <-"1.3"
+model_version <-"1.4"
 
 # table metadata
-meta <- read_sheet(url, sheet="Description", skip=1) %>%
-    select(table=Table, required=Required, url=Link) %>%
+meta <- read_sheet(url, sheet="Description", skip=1, col_types="c") %>%
+    select(table=Table, required=Required, url=Link, version=Version) %>%
     filter(!is.na(url)) # only keep tables with links
 
 #table_names <- meta$table

From e71227dcdfd965cb3af1ac6df9c90591de6586df Mon Sep 17 00:00:00 2001
From: Alyna Khan <alynak@uw.edu>
Date: Tue, 17 Oct 2023 11:24:14 -0700
Subject: [PATCH 5/9] added breast cancer and prostate cancer data tables

---
 PRIMED_phenotype_data_model.json | 547 ++++++++++++++++++++++++++++---
 1 file changed, 493 insertions(+), 54 deletions(-)

diff --git a/PRIMED_phenotype_data_model.json b/PRIMED_phenotype_data_model.json
index 66ffffc..f11ffb2 100755
--- a/PRIMED_phenotype_data_model.json
+++ b/PRIMED_phenotype_data_model.json
@@ -279,14 +279,9 @@
       "table": "cmqt_flags",
       "url": "https://docs.google.com/spreadsheets/d/1vQE4gHvKQUOLPaRt1bpLLfNUFp2st9qOl56zD57gob0/edit#gid=0",
       "columns": [
-        {
-          "column": "pheno_flag_id",
-          "data_type": "string",
-          "references": "from: subject_id, age_at_obs",
-          "notes": "values auto-generated by CC using subject_id and age_at_obs"
-        },
         {
           "column": "subject_id",
+          "primary_key": true,
           "required": true,
           "description": "the PRIMED subject id",
           "data_type": "string",
@@ -295,6 +290,7 @@
         },
         {
           "column": "age_at_obs",
+          "primary_key": true,
           "required": true,
           "description": "the age at which the observation or measurement for the phenotype(s) were taken",
           "data_type": "float"
@@ -427,14 +423,9 @@
       "table": "cmqt_anthropometry",
       "url": "https://docs.google.com/spreadsheets/d/1vQE4gHvKQUOLPaRt1bpLLfNUFp2st9qOl56zD57gob0/edit#gid=1206657525",
       "columns": [
-        {
-          "column": "pheno_anthropometry_id",
-          "data_type": "string",
-          "references": "from: subject_id, age_at_obs",
-          "notes": "values auto-generated by CC using subject_id and age_at_obs"
-        },
         {
           "column": "subject_id",
+          "primary_key": true,
           "required": true,
           "description": "the PRIMED subject id",
           "data_type": "string",
@@ -443,6 +434,7 @@
         },
         {
           "column": "age_at_obs",
+          "primary_key": true,
           "required": true,
           "description": "the age at which the observation or measurement for the phenotype(s) were taken",
           "data_type": "float",
@@ -472,6 +464,11 @@
           "description": "body mass index calculated",
           "data_type": "float",
           "examples": "26.45"
+        },
+        {
+          "column": "waist_hip_ratio_1",
+          "description": "waist hip ratio calculated",
+          "data_type": "float"
         }
       ]
     },
@@ -479,14 +476,9 @@
       "table": "cmqt_blood_pressure",
       "url": "https://docs.google.com/spreadsheets/d/1vQE4gHvKQUOLPaRt1bpLLfNUFp2st9qOl56zD57gob0/edit#gid=455811479",
       "columns": [
-        {
-          "column": "pheno_blood_pressure_id",
-          "data_type": "string",
-          "references": "from: subject_id, age_at_obs",
-          "notes": "values auto-generated by CC using subject_id and age_at_obs"
-        },
         {
           "column": "subject_id",
+          "primary_key": true,
           "required": true,
           "description": "the PRIMED subject id",
           "data_type": "string",
@@ -495,6 +487,7 @@
         },
         {
           "column": "age_at_obs",
+          "primary_key": true,
           "required": true,
           "description": "the age at which the observation or measurement for the phenotype(s) were taken",
           "data_type": "float",
@@ -518,6 +511,13 @@
           "description": "Resting diastolic blood pressure from the upper arm in a clinical setting",
           "data_type": "float",
           "examples": "80"
+        },
+        {
+          "column": "hypertension_1",
+          "description": "Indicator of whether or not a participant has a combined systolic and diastolic reading of greater than or equal to 140/90mmHg",
+          "data_type": "enumeration",
+          "enumerations": ["0", "1"],
+          "notes": "0 = 'no' 1 = 'yes' Missing values can either be left blank or set to NA."
         }
       ]
     },
@@ -525,14 +525,9 @@
       "table": "cmqt_lipids",
       "url": "https://docs.google.com/spreadsheets/d/1vQE4gHvKQUOLPaRt1bpLLfNUFp2st9qOl56zD57gob0/edit#gid=1052869785",
       "columns": [
-        {
-          "column": "pheno_lipids_id",
-          "data_type": "string",
-          "references": "from: subject_id, age_at_obs",
-          "notes": "values auto-generated by CC using subject_id and age_at_obs"
-        },
         {
           "column": "subject_id",
+          "primary_key": true,
           "required": true,
           "description": "the PRIMED subject id",
           "data_type": "string",
@@ -541,6 +536,7 @@
         },
         {
           "column": "age_at_obs",
+          "primary_key": true,
           "required": true,
           "description": "the age at which the observation or measurement for the phenotype(s) were taken",
           "data_type": "float",
@@ -588,14 +584,9 @@
       "table": "cmqt_hematology",
       "url": "https://docs.google.com/spreadsheets/d/1vQE4gHvKQUOLPaRt1bpLLfNUFp2st9qOl56zD57gob0/edit#gid=1535206686",
       "columns": [
-        {
-          "column": "pheno_hematology_id",
-          "data_type": "string",
-          "references": "from: subject_id, age_at_obs",
-          "notes": "values auto-generated by CC using subject_id and age_at_obs"
-        },
         {
           "column": "subject_id",
+          "primary_key": true,
           "required": true,
           "description": "the PRIMED subject id",
           "data_type": "string",
@@ -604,6 +595,7 @@
         },
         {
           "column": "age_at_obs",
+          "primary_key": true,
           "required": true,
           "description": "the age at which the observation or measurement for the phenotype(s) were taken",
           "data_type": "float",
@@ -697,14 +689,9 @@
       "table": "cmqt_glycemic",
       "url": "https://docs.google.com/spreadsheets/d/1vQE4gHvKQUOLPaRt1bpLLfNUFp2st9qOl56zD57gob0/edit#gid=2078107573",
       "columns": [
-        {
-          "column": "pheno_glycemic_id",
-          "data_type": "string",
-          "references": "from: subject_id, age_at_obs",
-          "notes": "values auto-generated by CC using subject_id and age_at_obs"
-        },
         {
           "column": "subject_id",
+          "primary_key": true,
           "required": true,
           "description": "the PRIMED subject id",
           "data_type": "string",
@@ -713,6 +700,7 @@
         },
         {
           "column": "age_at_obs",
+          "primary_key": true,
           "required": true,
           "description": "the age at which the observation or measurement for the phenotype(s) were taken",
           "data_type": "float",
@@ -751,14 +739,9 @@
       "table": "cmqt_kidney_function",
       "url": "https://docs.google.com/spreadsheets/d/1vQE4gHvKQUOLPaRt1bpLLfNUFp2st9qOl56zD57gob0/edit#gid=746527944",
       "columns": [
-        {
-          "column": "pheno_kidney_function_id",
-          "data_type": "string",
-          "references": "from: subject_id, age_at_obs",
-          "notes": "values auto-generated by CC using subject_id and age_at_obs"
-        },
         {
           "column": "subject_id",
+          "primary_key": true,
           "required": true,
           "description": "the PRIMED subject id",
           "data_type": "string",
@@ -767,6 +750,7 @@
         },
         {
           "column": "age_at_obs",
+          "primary_key": true,
           "required": true,
           "description": "the age at which the observation or measurement for the phenotype(s) were taken",
           "data_type": "float",
@@ -795,14 +779,9 @@
       "table": "diabetes_diabetes",
       "url": "https://docs.google.com/spreadsheets/d/1vQE4gHvKQUOLPaRt1bpLLfNUFp2st9qOl56zD57gob0/edit#gid=1891810958",
       "columns": [
-        {
-          "column": "pheno_diabetes_id",
-          "data_type": "string",
-          "references": "from: subject_id, age_at_obs",
-          "notes": "values auto-generated by CC using subject_id and age_at_obs"
-        },
         {
           "column": "subject_id",
+          "primary_key": true,
           "required": true,
           "description": "the PRIMED subject id",
           "data_type": "string",
@@ -811,6 +790,7 @@
         },
         {
           "column": "age_at_obs",
+          "primary_key": true,
           "required": true,
           "description": "the age at which the observation or measurement for the phenotype(s) were taken",
           "data_type": "float",
@@ -843,14 +823,9 @@
       "table": "cvd_cad",
       "url": "https://docs.google.com/spreadsheets/d/1gchIrBIPt2s_3uVEloUK1c1Rst6IxnbBm7Zwps2k31I/edit#gid=253559161",
       "columns": [
-        {
-          "column": "pheno_cad_id",
-          "data_type": "string",
-          "references": "from: subject_id, age_at_observation",
-          "notes": "values auto-generated by CC using subject_id and age_at_observation"
-        },
         {
           "column": "subject_id",
+          "primary_key": true,
           "required": true,
           "description": "the PRIMED subject id",
           "data_type": "string",
@@ -859,6 +834,7 @@
         },
         {
           "column": "age_at_obs",
+          "primary_key": true,
           "required": true,
           "description": "the age at which the observation or measurement for the phenotype(s) were taken",
           "data_type": "float",
@@ -886,6 +862,469 @@
           "notes": "0 = 'no' 1 = 'yes' Missing values can either be left blank or set to NA."
         }
       ]
+    },
+    {
+      "table": "cancer_breast",
+      "url": "https://docs.google.com/spreadsheets/d/1Gfj_EoPuYWhiNk7AaR6DOPjeTC7Nn06sF8qcoMGQ9KM/edit#gid=0",
+      "columns": [
+        {
+          "column": "subject_id",
+          "primary_key": true,
+          "required": true,
+          "description": "the PRIMED subject id",
+          "data_type": "string",
+          "references": "> subject.subject_id",
+          "notes": "references subject_id in participant table"
+        },
+        {
+          "column": "age_at_obs",
+          "primary_key": true,
+          "required": true,
+          "description": "the age at which the observation or measurement for the phenotype(s) were taken",
+          "data_type": "float",
+          "examples": "56.2"
+        },
+        {
+          "column": "visit",
+          "description": "indicator of visit or time of observation",
+          "data_type": "string",
+          "examples": ["visit_2", "baseline", "median"],
+          "notes": "this can be any value that is used consistently within the study"
+        },
+        {
+          "column": "breast_cancer_status_1",
+          "description": "whether or not the participant had breast cancer",
+          "data_type": "enumeration",
+          "enumerations": ["0", "1"],
+          "notes": "0 = 'no' 1 = 'yes' Missing values can either be left blank or set to NA."
+        },
+        {
+          "column": "age_at_diagnosis_1",
+          "description": "age at which participant was diagnosed with breast cancer",
+          "data_type": "float",
+          "notes": "may be the same as age_at_obs or not"
+        },
+        {
+          "column": "year_at_diagnosis_1",
+          "description": "year participant was diagnosed",
+          "data_type": "float",
+          "examples": "1999"
+        },
+        {
+          "column": "breast_cancer_type_1",
+          "description": "breast cancer type upon diagnosis",
+          "data_type": "enumeration",
+          "enumerations": ["unilateral", "bilateral"]
+        },
+        {
+          "column": "cancer_behavior_1",
+          "description": "behavior of the tumor",
+          "data_type": "enumeration",
+          "enumerations": ["benign", "borderline", "in_situ", "invasive"],
+          "notes": "Missing values can be left blank or set to NA"
+        },
+        {
+          "column": "her2_1",
+          "description": "human epidermal growth factor receptor 2",
+          "data_type": "enumeration",
+          "enumerations": ["positive", "negative", "unknown"]
+        },
+        {
+          "column": "pr_1",
+          "description": "progesterone receptor breast cancer cells",
+          "data_type": "enumeration",
+          "enumerations": ["positive", "negative", "unknown"]
+        },
+        {
+          "column": "er_1",
+          "description": "estrogen receptor breast cancer cells",
+          "data_type": "enumeration",
+          "enumerations": ["positive", "negative", "unknown"]
+        },
+        {
+          "column": "T_stage_clinical_1",
+          "description": "staging per diagnosis or biopsy. Note that this refers to T stage measuring tumor size.",
+          "data_type": "enumeration",
+          "enumerations": ["stage 1", "stage 2", "stage 3", "stage 4", "unstaged", "unknown"]
+        },
+        {
+          "column": "T_stage_pathological_1",
+          "description": "staging per diagnosis or biopsy. Note that this refers to T stage measuring tumor size.",
+          "data_type": "enumeration",
+          "enumerations": ["stage 1", "stage 2", "stage 3", "stage 4", "unstaged", "unknown"]
+        },
+        {
+          "column": "T_stage_uknown_1",
+          "description": "staging (clinical, pathological) is unknown",
+          "data_type": "enumeration",
+          "enumerations": ["stage 1", "stage 2", "stage 3", "stage 4", "unstaged", "unknown"]
+        },
+        {
+          "column": "T_stage_clinical_2",
+          "description": "staging per diagnosis or biopsy. Note that this refers to the general T stage and should be used if numeric T stage is not available.",
+          "data_type": "enumeration",
+          "enumerations": ["localized", "regional", "distant", "in_situ", "unknown", "unstaged"],
+          "notes": "numeric stage is preferred, if available. missing values can either be left blank or set to NA"
+        },
+        {
+          "column": "T_stage_pathological_2",
+          "description": "staging per diagnosis or biopsy. Note that this refers to the general T stage and should be used if numeric T stage is not available.",
+          "data_type": "enumeration",
+          "enumerations": ["localized", "regional", "distant", "in_situ", "unknown", "unstaged"],
+          "notes": "numeric stage is preferred, if available. missing values can either be left blank or set to NA"
+        },
+        {
+          "column": "T_stage_unknown_2",
+          "description": "staging (clinical, pathological) is unknown",
+          "data_type": "enumeration",
+          "enumerations": ["localized", "regional", "distant", "in_situ", "unknown", "unstaged"],
+          "notes": "numeric stage is preferred, if available. missing values can either be left blank or set to NA"
+        },
+        {
+          "column": "nodal_involvement_1",
+          "description": "regional lymph nodes (N)",
+          "data_type": "enumeration",
+          "enumerations": ["NX", "N0", "N1", "N2", "N3"]
+        },
+        {
+          "column": "distant_metastasis_1",
+          "description": "distant metastasis (M)",
+          "data_type": "enumeration",
+          "enumerations": ["MX", "M0", "M1"]
+        },
+        {
+          "column": "stage_system",
+          "description": "definition of staging system used (e.g., SEER, AJCC) and time period (e.g., year), if applicable",
+          "data_type": "string",
+          "examples": ["AJCC 2003", "SEER 1999"],
+          "notes": "missing values can either be left blank or set to NA"
+        },
+        {
+          "column": "grade_clinical_1",
+          "description": "grading per diagnosis or biopsy",
+          "data_type": "enumeration",
+          "enumerations": ["grade 1", "grade 2", "grade 3"],
+          "notes": "missing values can either be left blank or set to NA"
+        },
+        {
+          "column": "grade_pathological_1",
+          "description": "grading per surgical pathology",
+          "data_type": "enumeration",
+          "enumerations": ["grade 1", "grade 2", "grade 3"],
+          "notes": "missing values can either be left blank or set to NA"
+        },
+        {
+          "column": "grade_unknown_1",
+          "description": "grading determination unknown",
+          "data_type": "enumeration",
+          "enumerations": ["grade 1", "grade 2", "grade 3"],
+          "notes": "missing values can either be left blank or set to NA"
+        },
+        {
+          "column": "screening_history_1",
+          "description": "whether or not participant underwent screening for breast cancer",
+          "data_type": "enumeration",
+          "enumerations": ["0", "1"],
+          "notes": "0 = 'no' 1 = 'yes' Missing values can either be left blank or set to NA."
+        },
+        {
+          "column": "recurrence_1",
+          "description": "whether or not a recurrence occurred",
+          "data_type": "enumeration",
+          "enumerations": ["recurrence_primary", "recurrence_second_primary", "unknown", "none"]
+        },
+        {
+          "column": "surgery_1",
+          "description": "whether or not participant received surgery",
+          "data_type": "enumeration",
+          "enumerations": ["0", "1"],
+          "notes": "0 = 'no' 1 = 'yes' Missing values can either be left blank or set to NA."
+        },
+        {
+          "column": "radiotherapy_1",
+          "description": "whether or not participant received radiotherapy",
+          "data_type": "enumeration",
+          "enumerations": ["0", "1"],
+          "notes": "0 = 'no' 1 = 'yes' Missing values can either be left blank or set to NA."
+        },
+        {
+          "column": "chemotherapy_1",
+          "description": "whether or not participant received chemotherapy",
+          "data_type": "enumeration",
+          "enumerations": ["0", "1"],
+          "notes": "0 = 'no' 1 = 'yes' Missing values can either be left blank or set to NA."
+        },
+        {
+          "column": "hormone_therapy_1",
+          "description": "whether or not participant received hormone therapy. Includes oophorectomy, hysterectomy",
+          "data_type": "enumeration",
+          "enumerations": ["pharmaceutical", "surgical", "both", "none", "unknown"],
+          "notes": "Indicate hormone therapy type in the analyst comments"
+        },
+        {
+          "column": "NSAID_1",
+          "description": "use of non-steroidal anti-inflammatory drugs",
+          "data_type": "enumeration",
+          "enumerations": ["0", "1"],
+          "notes": "0 = 'no' 1 = 'yes' Missing values can either be left blank or set to NA."
+        },
+        {
+          "column": "age_at_natural_menopause_1",
+          "description": "age at which natural menopause occurred",
+          "data_type": "float",
+          "notes": "NA if menopause has not occurred"
+        },
+        {
+          "column": "post_menopausal_hormone_use_1",
+          "description": "whether or not hormone use occurred post menopause (not cancer-related use)",
+          "data_type": "enumeration",
+          "enumerations": ["0", "1"],
+          "notes": "0 = 'no' 1 = 'yes' Missing values can either be left blank or set to NA."
+        },
+        {
+          "column": "parity_1",
+          "description": "the number of pregnancies carried for at least 20 weeks",
+          "data_type": "integer"
+        },
+        {
+          "column": "age_at_first_birth_1",
+          "description": "age at which the first birth occurred",
+          "data_type": "float"
+        },
+        {
+          "column": "age_at_menarche_1",
+          "description": "age at which menarche occurred",
+          "data_type": "float"
+        },
+        {
+          "column": "deceased_1",
+          "description": "indication of whether individual is deceased",
+          "data_type": "enumeration",
+          "enumerations": ["0", "1"],
+          "notes": "0 = 'no' 1 = 'yes' Missing values can either be left blank or set to NA."
+        },
+        {
+          "column": "cause_of_death_breast_cancer_1",
+          "description": "indication of whether cause of death was due to breast cancer or other cause",
+          "data_type": "enumeration",
+          "enumerations": ["0", "1"],
+          "notes": "0 = 'no, cause of death not due to breast cancer' 1 = 'yes, cause of death due to breast cancer' Missing values indicate unknown cause of death or not deceased and can either be left blank or set to NA."
+        },
+        {
+          "column": "age_at_death_1",
+          "description": "age at death of individual",
+          "data_type": "float",
+          "notes": "Set to missing (either blank or NA) if individual is not deceased"
+        }
+      ]
+    },
+    {
+      "table": "cancer_prostate",
+      "url": "https://docs.google.com/spreadsheets/d/1Gfj_EoPuYWhiNk7AaR6DOPjeTC7Nn06sF8qcoMGQ9KM/edit#gid=1811888649",
+      "columns": [
+        {
+          "column": "subject_id",
+          "primary_key": true,
+          "required": true,
+          "description": "the PRIMED subject id",
+          "data_type": "string",
+          "references": "> subject.subject_id",
+          "notes": "references subject_id in participant table"
+        },
+        {
+          "column": "age_at_obs",
+          "primary_key": true,
+          "required": true,
+          "description": "the age at which the observation or measurement for the phenotype(s) were taken",
+          "data_type": "float",
+          "examples": "56.2"
+        },
+        {
+          "column": "visit",
+          "description": "indicator of visit or time of observation",
+          "data_type": "string",
+          "examples": ["visit_2", "baseline", "median"],
+          "notes": "this can be any value that is used consistently within the study"
+        },
+        {
+          "column": "prostate_cancer_status_1",
+          "description": "whether or not the participant had prostate cancer",
+          "data_type": "enumeration",
+          "enumerations": ["0", "1"],
+          "notes": "0 = 'no' 1 = 'yes' Missing values can either be left blank or set to NA."
+        },
+        {
+          "column": "age_at_diagnosis_1",
+          "description": "age at which participant was diagnosed with prostate cancer",
+          "data_type": "float",
+          "notes": "may be the same as age_at_obs or not"
+        },
+        {
+          "column": "year_at_diagnosis",
+          "description": "year participant was diagnosed",
+          "data_type": "float",
+          "examples": "1999"
+        },
+        {
+          "column": "cancer_behavior_1",
+          "description": "behavior of the tumor",
+          "data_type": "enumeration",
+          "enumerations": ["benign", "borderline", "in_situ", "invasive"],
+          "notes": "Missing values can be left blank or set to NA"
+        },
+        {
+          "column": "T_stage_clinical_1",
+          "description": "staging per diagnosis or biopsy. Note that this refers to T stage measuring tumor size.",
+          "data_type": "enumeration",
+          "enumerations": ["stage 1", "stage 2", "stage 3", "stage 4", "unstaged", "unknown"]
+        },
+        {
+          "column": "T_stage_pathological_1",
+          "description": "staging per diagnosis or biopsy. Note that this refers to T stage measuring tumor size.",
+          "data_type": "enumeration",
+          "enumerations": ["stage 1", "stage 2", "stage 3", "stage 4", "unstaged", "unknown"]
+        },
+        {
+          "column": "T_stage_uknown_1",
+          "description": "staging (clinical, pathological) is unknown",
+          "data_type": "enumeration",
+          "enumerations": ["stage 1", "stage 2", "stage 3", "stage 4", "unstaged", "unknown"]
+        },
+        {
+          "column": "T_stage_clinical_2",
+          "description": "staging per diagnosis or biopsy. Note that this refers to the general T stage and should be used if numeric T stage is not available.",
+          "data_type": "enumeration",
+          "enumerations": ["localized", "regional", "distant", "in_situ", "unknown", "unstaged"],
+          "notes": "numeric stage is preferred, if available. missing values can either be left blank or set to NA"
+        },
+        {
+          "column": "T_stage_pathological_2",
+          "description": "staging per diagnosis or biopsy. Note that this refers to the general T stage and should be used if numeric T stage is not available.",
+          "data_type": "enumeration",
+          "enumerations": ["localized", "regional", "distant", "in_situ", "unknown", "unstaged"],
+          "notes": "numeric stage is preferred, if available. missing values can either be left blank or set to NA"
+        },
+        {
+          "column": "T_stage_unknown_2",
+          "description": "staging (clinical, pathological) is unknown",
+          "data_type": "enumeration",
+          "enumerations": ["localized", "regional", "distant", "in_situ", "unknown", "unstaged"],
+          "notes": "numeric stage is preferred, if available. missing values can either be left blank or set to NA"
+        },
+        {
+          "column": "nodal_involvement_1",
+          "description": "regional lymph nodes (N)",
+          "data_type": "enumeration",
+          "enumerations": ["NX", "N0", "N1", "N2", "N3"]
+        },
+        {
+          "column": "distant_metastasis_1",
+          "description": "distant metastasis (M)",
+          "data_type": "enumeration",
+          "enumerations": ["MX", "M0", "M1"]
+        },
+        {
+          "column": "stage_system",
+          "description": "definition of staging system (e.g., SEER, AJCC) and time period (e.g., year), if applicable",
+          "data_type": "string",
+          "examples": ["AJCC 2003", "SEER 1999"],
+          "notes": "missing values can either be left blank or set to NA"
+        },
+        {
+          "column": "gleason_score_clinical_1",
+          "description": "grading per diagnosis or biopsy",
+          "data_type": "enumeration",
+          "enumerations": ["2", "3", "4", "5", "6", "7", "8", "9", "10"],
+          "notes": "missing values can either be left blank or set to NA"
+        },
+        {
+          "column": "gleason_score_pathological_1",
+          "description": "grading per surgical pathology",
+          "data_type": "enumeration",
+          "enumerations": ["2", "3", "4", "5", "6", "7", "8", "9", "10"],
+          "notes": "missing values can either be left blank or set to NA"
+        },
+        {
+          "column": "gleason_score_unknown_1",
+          "description": "grading determination unknown",
+          "data_type": "enumeration",
+          "enumerations": ["2", "3", "4", "5", "6", "7", "8", "9", "10"],
+          "notes": "missing values can either be left blank or set to NA"
+        },
+        {
+          "column": "psa_at_diagnosis_1",
+          "description": "psa score at diagnosis of prostate cancer",
+          "data_type": "float"
+        },
+        {
+          "column": "screening_history_1",
+          "description": "whether or not participant underwent screening for prostate cancer",
+          "data_type": "enumeration",
+          "enumerations": ["0", "1"],
+          "notes": "0 = 'no' 1 = 'yes' Missing values can either be left blank or set to NA."
+        },
+        {
+          "column": "recurrence_1",
+          "description": "whether or not a recurrence occurred",
+          "data_type": "enumeration",
+          "enumerations": ["recurrence_primary", "recurrence_second_primary", "unknown", "none"]
+        },
+        {
+          "column": "surgery_1",
+          "description": "whether or not participant received surgery",
+          "data_type": "enumeration",
+          "enumerations": ["0", "1"],
+          "notes": "0 = 'no' 1 = 'yes' Missing values can either be left blank or set to NA."
+        },
+        {
+          "column": "radiotherapy_1",
+          "description": "whether or not participant received radiotherapy",
+          "data_type": "enumeration",
+          "enumerations": ["0", "1"],
+          "notes": "0 = 'no' 1 = 'yes' Missing values can either be left blank or set to NA."
+        },
+        {
+          "column": "chemotherapy_1",
+          "description": "whether or not participant received chemotherapy",
+          "data_type": "enumeration",
+          "enumerations": ["0", "1"],
+          "notes": "0 = 'no' 1 = 'yes' Missing values can either be left blank or set to NA."
+        },
+        {
+          "column": "hormone_therapy_1",
+          "description": "whether or not participant received hormone therapy (e.g., androgen deprivation therapy or ADT)",
+          "data_type": "enumeration",
+          "enumerations": ["0", "1"],
+          "notes": "0 = 'no' 1 = 'yes' Missing values can either be left blank or set to NA."
+        },
+        {
+          "column": "NSAID_1",
+          "description": "use of non-steroidal anti-inflammatory drugs",
+          "data_type": "enumeration",
+          "enumerations": ["0", "1"],
+          "notes": "0 = 'no' 1 = 'yes' Missing values can either be left blank or set to NA."
+        },
+        {
+          "column": "deceased_1",
+          "description": "indicator of whether individual is deceased",
+          "data_type": "enumeration",
+          "enumerations": ["0", "1"],
+          "notes": "0 = 'no' 1 = 'yes' Missing values can either be left blank or set to NA."
+        },
+        {
+          "column": "cause_of_death_prostate_cancer_1",
+          "description": "indication of whether cause of death was due to prostate cancer or other cause",
+          "data_type": "enumeration",
+          "enumerations": ["0", "1"],
+          "notes": "0 = 'no, cause of death not due to prostate cancer' 1 = 'yes, cause of death due to prostate cancer' Missing values indicate unknown cause of death or not deceased and can either be left blank or set to NA."
+        },
+        {
+          "column": "age_at_death_1",
+          "description": "age at death of individual",
+          "data_type": "float",
+          "notes": "Set to missing (either blank or NA) if individual is not deceased"
+        }
+      ]
     }
   ]
 }

From 210600fd0c3e5ca782f45d86d45bccbaa4900817 Mon Sep 17 00:00:00 2001
From: "Stephanie M. Gogarten" <sdmorris@uw.edu>
Date: Tue, 31 Oct 2023 12:36:38 -0700
Subject: [PATCH 6/9] add min and max values for some columns

---
 PRIMED_phenotype_data_model.json | 41 +++++++++++++++++++++---
 sheets_to_JSON_phenotype.R       | 53 ++++++++++++++++----------------
 2 files changed, 64 insertions(+), 30 deletions(-)

diff --git a/PRIMED_phenotype_data_model.json b/PRIMED_phenotype_data_model.json
index a4046b3..ebfbc8a 100755
--- a/PRIMED_phenotype_data_model.json
+++ b/PRIMED_phenotype_data_model.json
@@ -1,7 +1,7 @@
 {
   "name": "PRIMED Phenotype Data Model",
   "description": "Data model for phenotype data in the PRIMED consortium",
-  "version": "1.4",
+  "version": "1.5",
   "tables": [
     {
       "table": "subject",
@@ -296,7 +296,9 @@
           "primary_key": true,
           "required": true,
           "description": "the age at which the observation or measurement for the phenotype(s) were taken",
-          "data_type": "float"
+          "data_type": "float",
+          "min": 0,
+          "max": 89
         },
         {
           "column": "visit",
@@ -442,6 +444,8 @@
           "required": true,
           "description": "the age at which the observation or measurement for the phenotype(s) were taken",
           "data_type": "float",
+          "min": 0,
+          "max": 89,
           "examples": "56.2"
         },
         {
@@ -455,18 +459,21 @@
           "column": "height_1",
           "description": "standing body height",
           "data_type": "float",
+          "min": 0,
           "examples": "165.1"
         },
         {
           "column": "weight_1",
           "description": "body weight at baseline",
           "data_type": "float",
+          "min": 0,
           "examples": "72.574"
         },
         {
           "column": "bmi_1",
           "description": "body mass index calculated",
           "data_type": "float",
+          "min": 0,
           "examples": "26.45"
         },
         {
@@ -496,6 +503,8 @@
           "required": true,
           "description": "the age at which the observation or measurement for the phenotype(s) were taken",
           "data_type": "float",
+          "min": 0,
+          "max": 89,
           "examples": "56.2"
         },
         {
@@ -546,6 +555,8 @@
           "required": true,
           "description": "the age at which the observation or measurement for the phenotype(s) were taken",
           "data_type": "float",
+          "min": 0,
+          "max": 89,
           "examples": "56.2"
         },
         {
@@ -606,6 +617,8 @@
           "required": true,
           "description": "the age at which the observation or measurement for the phenotype(s) were taken",
           "data_type": "float",
+          "min": 0,
+          "max": 89,
           "examples": "56.2"
         },
         {
@@ -712,6 +725,8 @@
           "required": true,
           "description": "the age at which the observation or measurement for the phenotype(s) were taken",
           "data_type": "float",
+          "min": 0,
+          "max": 89,
           "examples": "56.2"
         },
         {
@@ -763,6 +778,8 @@
           "required": true,
           "description": "the age at which the observation or measurement for the phenotype(s) were taken",
           "data_type": "float",
+          "min": 0,
+          "max": 89,
           "examples": "56.2"
         },
         {
@@ -849,6 +866,8 @@
           "required": true,
           "description": "the age at which the observation or measurement for the phenotype(s) were taken",
           "data_type": "float",
+          "min": 0,
+          "max": 89,
           "examples": "56.2"
         },
         {
@@ -877,6 +896,7 @@
     {
       "table": "cancer_breast",
       "url": "https://docs.google.com/spreadsheets/d/1Gfj_EoPuYWhiNk7AaR6DOPjeTC7Nn06sF8qcoMGQ9KM/edit#gid=0",
+      "version": "1.0",
       "columns": [
         {
           "column": "subject_id",
@@ -893,6 +913,8 @@
           "required": true,
           "description": "the age at which the observation or measurement for the phenotype(s) were taken",
           "data_type": "float",
+          "min": 0,
+          "max": 89,
           "examples": "56.2"
         },
         {
@@ -913,6 +935,8 @@
           "column": "age_at_diagnosis_1",
           "description": "age at which participant was diagnosed with breast cancer",
           "data_type": "float",
+          "min": 0,
+          "max": 89,
           "notes": "may be the same as age_at_obs or not"
         },
         {
@@ -1083,6 +1107,8 @@
           "column": "age_at_natural_menopause_1",
           "description": "age at which natural menopause occurred",
           "data_type": "float",
+          "min": 0,
+          "max": 89,
           "notes": "NA if menopause has not occurred"
         },
         {
@@ -1100,12 +1126,16 @@
         {
           "column": "age_at_first_birth_1",
           "description": "age at which the first birth occurred",
-          "data_type": "float"
+          "data_type": "float",
+          "min": 0,
+          "max": 89
         },
         {
           "column": "age_at_menarche_1",
           "description": "age at which menarche occurred",
-          "data_type": "float"
+          "data_type": "float",
+          "min": 0,
+          "max": 89
         },
         {
           "column": "deceased_1",
@@ -1125,6 +1155,8 @@
           "column": "age_at_death_1",
           "description": "age at death of individual",
           "data_type": "float",
+          "min": 0,
+          "max": 89,
           "notes": "Set to missing (either blank or NA) if individual is not deceased"
         }
       ]
@@ -1132,6 +1164,7 @@
     {
       "table": "cancer_prostate",
       "url": "https://docs.google.com/spreadsheets/d/1Gfj_EoPuYWhiNk7AaR6DOPjeTC7Nn06sF8qcoMGQ9KM/edit#gid=1811888649",
+      "version": "1.0",
       "columns": [
         {
           "column": "subject_id",
diff --git a/sheets_to_JSON_phenotype.R b/sheets_to_JSON_phenotype.R
index ee8458e..b9b6d9a 100644
--- a/sheets_to_JSON_phenotype.R
+++ b/sheets_to_JSON_phenotype.R
@@ -9,11 +9,11 @@ library(jsonlite)
 url <- "https://docs.google.com/spreadsheets/d/1kpWz-6QfjMPVtm62fQwm4hoxzXhR0dnKxVt02fbx9ks"
 model_name <- "PRIMED Phenotype Data Model"
 model_description <- "Data model for phenotype data in the PRIMED consortium"
-model_version <-"1.4"
+model_version <-"1.5"
 
 # table metadata
 meta <- read_sheet(url, sheet="Description", skip=1, col_types="c") %>%
-    select(table=Table, required=Required, url=Link, version=Version) %>%
+    select(table=Table, required=Required, url=Link, version=`Table version`) %>%
     filter(!is.na(url)) # only keep tables with links
 
 #table_names <- meta$table
@@ -51,30 +51,31 @@ for (i in 1:length(tables)) {
             mutate(primary_key = ifelse(paste0(names(tables)[i], "_id") == Column, TRUE, NA))
     }
     
-    if ("Multi-value delimiter" %in% names(tmp)) {
-        tables[[i]] <- tmp %>%
-            select(column = Column, 
-                   primary_key,
-                   required = Required,
-                   description = Description, 
-                   data_type = `Data type`, 
-                   references = References, 
-                   enumerations = Enumerations, 
-                   multi_value_delimiter = `Multi-value delimiter`,
-                   examples = Examples, 
-                   notes = `Notes/comments`)
-    } else {
-        tables[[i]] <- tmp %>%
-            select(column = Column, 
-                   primary_key,
-                   required = Required,
-                   description = Description, 
-                   data_type = `Data type`, 
-                   references = References, 
-                   enumerations = Enumerations, 
-                   examples = Examples, 
-                   notes = `Notes/comments`)
-    }
+    lookup <- c(
+        data_type = "Data type", 
+        multi_value_delimiter = "Multi-value delimiter",
+        notes = "Notes/comments"
+    )
+    tmp <- tmp %>%
+        rename(any_of(lookup)) %>%
+        rename_with(tolower)
+    
+    keep_cols <- c(
+        "column", 
+        "primary_key",
+        "required",
+        "description", 
+        "data_type", 
+        "min",
+        "max",
+        "references", 
+        "enumerations", 
+        "multi_value_delimiter",
+        "examples", 
+        "notes"
+    )
+    tables[[i]] <- tmp %>%
+        select(any_of(keep_cols))
 }
 
 

From 9f4a0317f2df7775e3014edbde86e6c067763d1e Mon Sep 17 00:00:00 2001
From: "Stephanie M. Gogarten" <sdmorris@uw.edu>
Date: Thu, 2 Nov 2023 15:41:55 -0700
Subject: [PATCH 7/9] add is_bucket_path to file_path columns

---
 PRIMED_genotype_data_model.json | 14 +++++++++-----
 sheets_to_JSON_genotype.R       |  4 +++-
 2 files changed, 12 insertions(+), 6 deletions(-)

diff --git a/PRIMED_genotype_data_model.json b/PRIMED_genotype_data_model.json
index a4eef72..4b6e04a 100644
--- a/PRIMED_genotype_data_model.json
+++ b/PRIMED_genotype_data_model.json
@@ -1,7 +1,7 @@
 {
   "name": "PRIMED Genotype Data Model",
   "description": "Data model for genotype data in the PRIMED consortium",
-  "version": "1.2",
+  "version": "1.3",
   "tables": [
     {
       "table": "subject",
@@ -198,7 +198,8 @@
           "column": "file_path",
           "required": true,
           "description": "absolute file path in cloud storage",
-          "data_type": "string"
+          "data_type": "string",
+          "is_bucket_path": true
         },
         {
           "column": "file_type",
@@ -305,7 +306,8 @@
           "column": "file_path",
           "required": true,
           "description": "absolute file path in cloud storage",
-          "data_type": "string"
+          "data_type": "string",
+          "is_bucket_path": true
         },
         {
           "column": "file_type",
@@ -441,7 +443,8 @@
           "column": "file_path",
           "required": true,
           "description": "absolute file path in cloud storage",
-          "data_type": "string"
+          "data_type": "string",
+          "is_bucket_path": true
         },
         {
           "column": "file_type",
@@ -541,7 +544,8 @@
           "column": "file_path",
           "required": true,
           "description": "absolute file path in cloud storage",
-          "data_type": "string"
+          "data_type": "string",
+          "is_bucket_path": true
         },
         {
           "column": "file_type",
diff --git a/sheets_to_JSON_genotype.R b/sheets_to_JSON_genotype.R
index 9c75628..8ec0080 100644
--- a/sheets_to_JSON_genotype.R
+++ b/sheets_to_JSON_genotype.R
@@ -8,7 +8,7 @@ library(jsonlite)
 url <- "https://docs.google.com/spreadsheets/d/1lwVMGT-TQaWbMWvi3hdqWuEthZvaKGOImINAqXguPaM"
 model_name <- "PRIMED Genotype Data Model"
 model_description <- "Data model for genotype data in the PRIMED consortium"
-model_version <- "1.2"
+model_version <- "1.3"
 
 
 # read in the data
@@ -27,6 +27,7 @@ rm(list = c("table_names", "url"))
 for (i in 1:length(tables)) {
     tables[[i]] <- tables[[i]] %>%
         mutate(primary_key = ifelse(paste0(names(tables)[i], "_id") == Column, TRUE, NA)) %>%
+        mutate(is_bucket_path = ifelse(Column == "file_path", TRUE, NA)) %>%
         select(column = Column, 
                primary_key,
                required = Required,
@@ -34,6 +35,7 @@ for (i in 1:length(tables)) {
                data_type = `Data type`, 
                references = References, 
                enumerations = Enumerations, 
+               is_bucket_path,
                examples = Examples, 
                notes = `Notes/comments`) %>%
         mutate(description=gsub('"', "'", description),

From feefbcc1d8c3e999fae5bec42798d9060df83d82 Mon Sep 17 00:00:00 2001
From: "Stephanie M. Gogarten" <sdmorris@uw.edu>
Date: Sat, 4 Nov 2023 12:49:24 -0700
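Subject: [PATCH 8/9] add is_bucket_path to GSR and phenotype models

The phenotype script flags every column matching "file_.*path"
(file_path, file_readme_path, file_dd_path); the GSR script flags only
file_path. The GSR column selection is also refactored to use
rename(any_of()) and select(any_of()), so optional columns such as
"Multi-value delimiter" no longer need a separate branch. The GSR
model version is bumped from 1.0 to 1.1.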

---
 PRIMED_GSR_data_model.json       |  5 +--
 PRIMED_phenotype_data_model.json | 12 +++++---
 sheets_to_JSON_gsr.R             | 52 +++++++++++++++++---------------
 sheets_to_JSON_phenotype.R       |  2 ++
 4 files changed, 40 insertions(+), 31 deletions(-)

diff --git a/PRIMED_GSR_data_model.json b/PRIMED_GSR_data_model.json
index 00135a9..d54d118 100644
--- a/PRIMED_GSR_data_model.json
+++ b/PRIMED_GSR_data_model.json
@@ -1,7 +1,7 @@
 {
   "name": "PRIMED GSR Data Model",
   "description": "Data model for Genomic Summary Results in the PRIMED consortium",
-  "version": "1.0",
+  "version": "1.1",
   "tables": [
     {
       "table": "analysis",
@@ -423,7 +423,8 @@
           "column": "file_path",
           "required": true,
           "description": "File path in cloud storage",
-          "data_type": "string"
+          "data_type": "string",
+          "is_bucket_path": true
         },
         {
           "column": "file_type",
diff --git a/PRIMED_phenotype_data_model.json b/PRIMED_phenotype_data_model.json
index ebfbc8a..d663eca 100755
--- a/PRIMED_phenotype_data_model.json
+++ b/PRIMED_phenotype_data_model.json
@@ -129,13 +129,15 @@
           "column": "file_path",
           "required": true,
           "description": "absolute file path in cloud storage",
-          "data_type": "string"
+          "data_type": "string",
+          "is_bucket_path": true
         },
         {
           "column": "file_readme_path",
           "required": true,
           "description": "path to the README",
-          "data_type": "string"
+          "data_type": "string",
+          "is_bucket_path": true
         },
         {
           "column": "n_subjects",
@@ -179,13 +181,15 @@
           "column": "file_path",
           "required": true,
           "description": "absolute file path in cloud storage",
-          "data_type": "string"
+          "data_type": "string",
+          "is_bucket_path": true
         },
         {
           "column": "file_dd_path",
           "required": true,
           "description": "path to the data dictionary",
-          "data_type": "string"
+          "data_type": "string",
+          "is_bucket_path": true
         },
         {
           "column": "n_subjects",
diff --git a/sheets_to_JSON_gsr.R b/sheets_to_JSON_gsr.R
index 7b30f2d..d79da87 100644
--- a/sheets_to_JSON_gsr.R
+++ b/sheets_to_JSON_gsr.R
@@ -9,7 +9,7 @@ library(jsonlite)
 url <- "https://docs.google.com/spreadsheets/d/1xfSQqRQIq6pGkJ5jzzv2QhetmX5boaEZoNECpDwXe5I"
 model_name <- "PRIMED GSR Data Model"
 model_description <- "Data model for Genomic Summary Results in the PRIMED consortium"
-model_version <- "1.0"
+model_version <- "1.1"
 
 
 # table metadata
@@ -28,35 +28,37 @@ for (i in 1:length(tables)) {
     tmp <- tables[[i]] %>%
         filter(!is.na(`Data type`)) %>% # keep only valid rows
         mutate(primary_key = ifelse(paste0(names(tables)[i], "_id") == Column, TRUE, NA)) %>%
+        mutate(is_bucket_path = ifelse(Column == "file_path", TRUE, NA)) %>%
         mutate(Description=gsub('"', "'", Description), # replace double with single quote
                Description=gsub('\n', ' ', Description), # replace newline with space
                `Notes/comments`=gsub('"', "'", `Notes/comments`), # replace double with single quote
                `Notes/comments`=gsub('\n', ' ', `Notes/comments`), # replace newline with space
                References=ifelse(grepl("omop_concept", References), NA, References)) # remove external table reference
-    if ("Multi-value delimiter" %in% names(tmp)) {
-        tables[[i]] <- tmp %>%
-            select(column = Column, 
-               primary_key,
-               required = Required,
-               description = Description, 
-               data_type = `Data type`, 
-               references = References, 
-               enumerations = Enumerations, 
-               multi_value_delimiter = `Multi-value delimiter`,
-               examples = Examples, 
-               notes = `Notes/comments`)
-    } else {
-        tables[[i]] <- tmp %>%
-            select(column = Column, 
-               primary_key,
-               required = Required,
-               description = Description, 
-               data_type = `Data type`, 
-               references = References, 
-               enumerations = Enumerations, 
-               examples = Examples, 
-               notes = `Notes/comments`)
-    }
+    
+    lookup <- c(
+        data_type = "Data type", 
+        multi_value_delimiter = "Multi-value delimiter",
+        notes = "Notes/comments"
+    )
+    tmp <- tmp %>%
+        rename(any_of(lookup)) %>%
+        rename_with(tolower)
+    
+    keep_cols <- c(
+        "column", 
+        "primary_key",
+        "required",
+        "description", 
+        "data_type", 
+        "references", 
+        "enumerations", 
+        "is_bucket_path",
+        "multi_value_delimiter",
+        "examples", 
+        "notes"
+    )
+    tables[[i]] <- tmp %>%
+        select(any_of(keep_cols))
 }
 rm(list = c("tmp"))
 
diff --git a/sheets_to_JSON_phenotype.R b/sheets_to_JSON_phenotype.R
index b9b6d9a..92dd03a 100644
--- a/sheets_to_JSON_phenotype.R
+++ b/sheets_to_JSON_phenotype.R
@@ -37,6 +37,7 @@ rm(list = c("table_info", "url"))
 for (i in 1:length(tables)) {
     tmp <- tables[[i]] %>%
         filter(!is.na(`Data type`)) %>% # keep only valid rows
+        mutate(is_bucket_path = ifelse(grepl("file_.*path", Column), TRUE, NA)) %>%
         mutate(Required=as.logical(Required), # non-T/F values will be NA
                Description=gsub('"', "'", Description), # replace double with single quote
                Description=gsub('\n', ' ', Description), # replace newline with space
@@ -70,6 +71,7 @@ for (i in 1:length(tables)) {
         "max",
         "references", 
         "enumerations", 
+        "is_bucket_path",
         "multi_value_delimiter",
         "examples", 
         "notes"

From b0f28a1757b5cf19be307ba5a230567fa7ab09a7 Mon Sep 17 00:00:00 2001
From: "Stephanie M. Gogarten" <sdmorris@uw.edu>
Date: Mon, 6 Nov 2023 15:33:18 -0800
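Subject: [PATCH 9/9] add data model version to harmonized table

Also bound an age-at-observation column to the range 0-89 and change
year_at_diagnosis and year_at_diagnosis_1 from float to integer.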

---
 PRIMED_phenotype_data_model.json | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/PRIMED_phenotype_data_model.json b/PRIMED_phenotype_data_model.json
index d663eca..e53d6a5 100755
--- a/PRIMED_phenotype_data_model.json
+++ b/PRIMED_phenotype_data_model.json
@@ -150,6 +150,12 @@
           "required": true,
           "description": "Number of rows in file (may be > n_subjects for longitudinal data)",
           "data_type": "integer"
+        },
+        {
+          "column": "data_model_version",
+          "description": "data model version for this table",
+          "data_type": "float",
+          "notes": "added automatically by validation workflow"
         }
       ]
     },
@@ -825,6 +831,8 @@
           "required": true,
           "description": "the age at which the observation or measurement for the phenotype(s) were taken",
           "data_type": "float",
+          "min": 0,
+          "max": 89,
           "examples": "56.2"
         },
         {
@@ -946,7 +954,7 @@
         {
           "column": "year_at_diagnosis_1",
           "description": "year participant was diagnosed",
-          "data_type": "float",
+          "data_type": "integer",
           "examples": "1999"
         },
         {
@@ -1210,7 +1218,7 @@
         {
           "column": "year_at_diagnosis",
           "description": "year participant was diagnosed",
-          "data_type": "float",
+          "data_type": "integer",
           "examples": "1999"
         },
         {