From 1df9b2c51213742c59aebfbea8fab17b7da28e26 Mon Sep 17 00:00:00 2001 From: Alyna Khan Date: Fri, 1 Sep 2023 11:09:09 -0700 Subject: [PATCH 1/9] changed pheno_cad_id reference from age_at_observation to age_at_obs --- PRIMED_phenotype_data_model.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/PRIMED_phenotype_data_model.json b/PRIMED_phenotype_data_model.json index 66ffffc..39918de 100755 --- a/PRIMED_phenotype_data_model.json +++ b/PRIMED_phenotype_data_model.json @@ -846,8 +846,8 @@ { "column": "pheno_cad_id", "data_type": "string", - "references": "from: subject_id, age_at_observation", - "notes": "values auto-generated by CC using subject_id and age_at_observation" + "references": "from: subject_id, age_at_obs", + "notes": "values auto-generated by CC using subject_id and age_at_obs" }, { "column": "subject_id", From 346ea5dab2c9d3beda5a8539d7b942e84d53cd45 Mon Sep 17 00:00:00 2001 From: "Stephanie M. Gogarten" Date: Fri, 8 Sep 2023 16:13:24 -0700 Subject: [PATCH 2/9] don't auto-generate additional primary keys for pheno tables Since pheno tables aren't imported into AnVIL data tables, they do not need a single column as the primary key. Adding this column only adds confusion and leads to multiple versions of the data file. 
--- PRIMED_phenotype_data_model.json | 79 ++++++++++---------------------- sheets_to_JSON_phenotype.R | 17 +++++-- 2 files changed, 38 insertions(+), 58 deletions(-) diff --git a/PRIMED_phenotype_data_model.json b/PRIMED_phenotype_data_model.json index 39918de..717e635 100755 --- a/PRIMED_phenotype_data_model.json +++ b/PRIMED_phenotype_data_model.json @@ -1,7 +1,7 @@ { "name": "PRIMED Phenotype Data Model", "description": "Data model for phenotype data in the PRIMED consortium", - "version": "1.2", + "version": "1.3", "tables": [ { "table": "subject", @@ -279,14 +279,9 @@ "table": "cmqt_flags", "url": "https://docs.google.com/spreadsheets/d/1vQE4gHvKQUOLPaRt1bpLLfNUFp2st9qOl56zD57gob0/edit#gid=0", "columns": [ - { - "column": "pheno_flag_id", - "data_type": "string", - "references": "from: subject_id, age_at_obs", - "notes": "values auto-generated by CC using subject_id and age_at_obs" - }, { "column": "subject_id", + "primary_key": true, "required": true, "description": "the PRIMED subject id", "data_type": "string", @@ -295,6 +290,7 @@ }, { "column": "age_at_obs", + "primary_key": true, "required": true, "description": "the age at which the observation or measurement for the phenotype(s) were taken", "data_type": "float" @@ -427,14 +423,9 @@ "table": "cmqt_anthropometry", "url": "https://docs.google.com/spreadsheets/d/1vQE4gHvKQUOLPaRt1bpLLfNUFp2st9qOl56zD57gob0/edit#gid=1206657525", "columns": [ - { - "column": "pheno_anthropometry_id", - "data_type": "string", - "references": "from: subject_id, age_at_obs", - "notes": "values auto-generated by CC using subject_id and age_at_obs" - }, { "column": "subject_id", + "primary_key": true, "required": true, "description": "the PRIMED subject id", "data_type": "string", @@ -443,6 +434,7 @@ }, { "column": "age_at_obs", + "primary_key": true, "required": true, "description": "the age at which the observation or measurement for the phenotype(s) were taken", "data_type": "float", @@ -472,6 +464,11 @@ "description": "body 
mass index calculated", "data_type": "float", "examples": "26.45" + }, + { + "column": "waist_hip_ratio_1", + "description": "wait hip ratio calculated", + "data_type": "float" } ] }, @@ -479,14 +476,9 @@ "table": "cmqt_blood_pressure", "url": "https://docs.google.com/spreadsheets/d/1vQE4gHvKQUOLPaRt1bpLLfNUFp2st9qOl56zD57gob0/edit#gid=455811479", "columns": [ - { - "column": "pheno_blood_pressure_id", - "data_type": "string", - "references": "from: subject_id, age_at_obs", - "notes": "values auto-generated by CC using subject_id and age_at_obs" - }, { "column": "subject_id", + "primary_key": true, "required": true, "description": "the PRIMED subject id", "data_type": "string", @@ -495,6 +487,7 @@ }, { "column": "age_at_obs", + "primary_key": true, "required": true, "description": "the age at which the observation or measurement for the phenotype(s) were taken", "data_type": "float", @@ -525,14 +518,9 @@ "table": "cmqt_lipids", "url": "https://docs.google.com/spreadsheets/d/1vQE4gHvKQUOLPaRt1bpLLfNUFp2st9qOl56zD57gob0/edit#gid=1052869785", "columns": [ - { - "column": "pheno_lipids_id", - "data_type": "string", - "references": "from: subject_id, age_at_obs", - "notes": "values auto-generated by CC using subject_id and age_at_obs" - }, { "column": "subject_id", + "primary_key": true, "required": true, "description": "the PRIMED subject id", "data_type": "string", @@ -541,6 +529,7 @@ }, { "column": "age_at_obs", + "primary_key": true, "required": true, "description": "the age at which the observation or measurement for the phenotype(s) were taken", "data_type": "float", @@ -588,14 +577,9 @@ "table": "cmqt_hematology", "url": "https://docs.google.com/spreadsheets/d/1vQE4gHvKQUOLPaRt1bpLLfNUFp2st9qOl56zD57gob0/edit#gid=1535206686", "columns": [ - { - "column": "pheno_hematology_id", - "data_type": "string", - "references": "from: subject_id, age_at_obs", - "notes": "values auto-generated by CC using subject_id and age_at_obs" - }, { "column": "subject_id", + 
"primary_key": true, "required": true, "description": "the PRIMED subject id", "data_type": "string", @@ -604,6 +588,7 @@ }, { "column": "age_at_obs", + "primary_key": true, "required": true, "description": "the age at which the observation or measurement for the phenotype(s) were taken", "data_type": "float", @@ -697,14 +682,9 @@ "table": "cmqt_glycemic", "url": "https://docs.google.com/spreadsheets/d/1vQE4gHvKQUOLPaRt1bpLLfNUFp2st9qOl56zD57gob0/edit#gid=2078107573", "columns": [ - { - "column": "pheno_glycemic_id", - "data_type": "string", - "references": "from: subject_id, age_at_obs", - "notes": "values auto-generated by CC using subject_id and age_at_obs" - }, { "column": "subject_id", + "primary_key": true, "required": true, "description": "the PRIMED subject id", "data_type": "string", @@ -713,6 +693,7 @@ }, { "column": "age_at_obs", + "primary_key": true, "required": true, "description": "the age at which the observation or measurement for the phenotype(s) were taken", "data_type": "float", @@ -751,14 +732,9 @@ "table": "cmqt_kidney_function", "url": "https://docs.google.com/spreadsheets/d/1vQE4gHvKQUOLPaRt1bpLLfNUFp2st9qOl56zD57gob0/edit#gid=746527944", "columns": [ - { - "column": "pheno_kidney_function_id", - "data_type": "string", - "references": "from: subject_id, age_at_obs", - "notes": "values auto-generated by CC using subject_id and age_at_obs" - }, { "column": "subject_id", + "primary_key": true, "required": true, "description": "the PRIMED subject id", "data_type": "string", @@ -767,6 +743,7 @@ }, { "column": "age_at_obs", + "primary_key": true, "required": true, "description": "the age at which the observation or measurement for the phenotype(s) were taken", "data_type": "float", @@ -795,14 +772,9 @@ "table": "diabetes_diabetes", "url": "https://docs.google.com/spreadsheets/d/1vQE4gHvKQUOLPaRt1bpLLfNUFp2st9qOl56zD57gob0/edit#gid=1891810958", "columns": [ - { - "column": "pheno_diabetes_id", - "data_type": "string", - "references": "from: 
subject_id, age_at_obs", - "notes": "values auto-generated by CC using subject_id and age_at_obs" - }, { "column": "subject_id", + "primary_key": true, "required": true, "description": "the PRIMED subject id", "data_type": "string", @@ -811,6 +783,7 @@ }, { "column": "age_at_obs", + "primary_key": true, "required": true, "description": "the age at which the observation or measurement for the phenotype(s) were taken", "data_type": "float", @@ -843,14 +816,9 @@ "table": "cvd_cad", "url": "https://docs.google.com/spreadsheets/d/1gchIrBIPt2s_3uVEloUK1c1Rst6IxnbBm7Zwps2k31I/edit#gid=253559161", "columns": [ - { - "column": "pheno_cad_id", - "data_type": "string", - "references": "from: subject_id, age_at_obs", - "notes": "values auto-generated by CC using subject_id and age_at_obs" - }, { "column": "subject_id", + "primary_key": true, "required": true, "description": "the PRIMED subject id", "data_type": "string", @@ -859,6 +827,7 @@ }, { "column": "age_at_obs", + "primary_key": true, "required": true, "description": "the age at which the observation or measurement for the phenotype(s) were taken", "data_type": "float", diff --git a/sheets_to_JSON_phenotype.R b/sheets_to_JSON_phenotype.R index 56a8ae8..b6103bd 100644 --- a/sheets_to_JSON_phenotype.R +++ b/sheets_to_JSON_phenotype.R @@ -9,12 +9,12 @@ library(jsonlite) url <- "https://docs.google.com/spreadsheets/d/1kpWz-6QfjMPVtm62fQwm4hoxzXhR0dnKxVt02fbx9ks" model_name <- "PRIMED Phenotype Data Model" model_description <- "Data model for phenotype data in the PRIMED consortium" -model_version <-"1.2" - +model_version <-"1.3" # table metadata meta <- read_sheet(url, sheet="Description", skip=1) %>% - select(table=Table, required=Required, url=Link) + select(table=Table, required=Required, url=Link) %>% + filter(!is.na(url)) # only keep tables with links #table_names <- meta$table #tables <- lapply(table_names, function(x) read_sheet(url, sheet=x, skip=1)) @@ -42,6 +42,11 @@ for (i in 1:length(tables)) { 
Description=gsub('\n', ' ', Description), # replace newline with space `Notes/comments`=gsub('"', "'", `Notes/comments`), # replace double with single quote `Notes/comments`=gsub('\n', ' ', `Notes/comments`)) # replace newline with space + + + nofix <- c("pilot", "subject", "population_descriptor", + "phenotype_harmonized", "phenotype_unharmonized") + if (names(tables)[i] %in% nofix) { # temporary if ("Primary key" %in% names(tmp)) { tmp <- tmp %>% rename(primary_key = `Primary key`) @@ -49,6 +54,12 @@ for (i in 1:length(tables)) { tmp <- tmp %>% mutate(primary_key = ifelse(paste0(names(tables)[i], "_id") == Column, TRUE, NA)) } + } else { # temporary + tmp <- tmp %>% + mutate(primary_key = ifelse(Column %in% c("subject_id", "age_at_obs"), TRUE, NA)) + tmp <- tmp[-1,] # remove auto-generated primary key + } + if ("Multi-value delimiter" %in% names(tmp)) { tables[[i]] <- tmp %>% select(column = Column, From e146717dcb0bf1812b84cbbab510d367e8565b43 Mon Sep 17 00:00:00 2001 From: "Stephanie M. 
Gogarten" Date: Mon, 11 Sep 2023 12:49:47 -0700 Subject: [PATCH 3/9] remove temporary code after updating phenotype google sheets --- sheets_to_JSON_phenotype.R | 9 --------- 1 file changed, 9 deletions(-) diff --git a/sheets_to_JSON_phenotype.R b/sheets_to_JSON_phenotype.R index b6103bd..8b270be 100644 --- a/sheets_to_JSON_phenotype.R +++ b/sheets_to_JSON_phenotype.R @@ -43,10 +43,6 @@ for (i in 1:length(tables)) { `Notes/comments`=gsub('"', "'", `Notes/comments`), # replace double with single quote `Notes/comments`=gsub('\n', ' ', `Notes/comments`)) # replace newline with space - - nofix <- c("pilot", "subject", "population_descriptor", - "phenotype_harmonized", "phenotype_unharmonized") - if (names(tables)[i] %in% nofix) { # temporary if ("Primary key" %in% names(tmp)) { tmp <- tmp %>% rename(primary_key = `Primary key`) @@ -54,11 +50,6 @@ for (i in 1:length(tables)) { tmp <- tmp %>% mutate(primary_key = ifelse(paste0(names(tables)[i], "_id") == Column, TRUE, NA)) } - } else { # temporary - tmp <- tmp %>% - mutate(primary_key = ifelse(Column %in% c("subject_id", "age_at_obs"), TRUE, NA)) - tmp <- tmp[-1,] # remove auto-generated primary key - } if ("Multi-value delimiter" %in% names(tmp)) { tables[[i]] <- tmp %>% From c9ff382bad75c366a4e7915bc9a0218ce1645b17 Mon Sep 17 00:00:00 2001 From: "Stephanie M. 
Gogarten" Date: Fri, 29 Sep 2023 13:15:01 -0700 Subject: [PATCH 4/9] add version numbers to phenotype tables --- PRIMED_phenotype_data_model.json | 19 ++++++++++++++++++- sheets_to_JSON_phenotype.R | 6 +++--- 2 files changed, 21 insertions(+), 4 deletions(-) diff --git a/PRIMED_phenotype_data_model.json b/PRIMED_phenotype_data_model.json index 717e635..6f7062d 100755 --- a/PRIMED_phenotype_data_model.json +++ b/PRIMED_phenotype_data_model.json @@ -1,7 +1,7 @@ { "name": "PRIMED Phenotype Data Model", "description": "Data model for phenotype data in the PRIMED consortium", - "version": "1.3", + "version": "1.4", "tables": [ { "table": "subject", @@ -57,6 +57,7 @@ { "table": "population_descriptor", "url": "https://docs.google.com/spreadsheets/d/1kpWz-6QfjMPVtm62fQwm4hoxzXhR0dnKxVt02fbx9ks/edit#gid=1733510035", + "version": "1.0", "columns": [ { "column": "population_descriptor_id", @@ -203,6 +204,7 @@ { "table": "pilot", "url": "https://docs.google.com/spreadsheets/d/1bo_I8_yOx0sXK9UcNAJ8b8DoKe8O3W4S9zzZG-y0thM/edit#gid=0", + "version": "1.0", "columns": [ { "column": "subject_id", @@ -278,6 +280,7 @@ { "table": "cmqt_flags", "url": "https://docs.google.com/spreadsheets/d/1vQE4gHvKQUOLPaRt1bpLLfNUFp2st9qOl56zD57gob0/edit#gid=0", + "version": "1.0", "columns": [ { "column": "subject_id", @@ -422,6 +425,7 @@ { "table": "cmqt_anthropometry", "url": "https://docs.google.com/spreadsheets/d/1vQE4gHvKQUOLPaRt1bpLLfNUFp2st9qOl56zD57gob0/edit#gid=1206657525", + "version": "1.0", "columns": [ { "column": "subject_id", @@ -475,6 +479,7 @@ { "table": "cmqt_blood_pressure", "url": "https://docs.google.com/spreadsheets/d/1vQE4gHvKQUOLPaRt1bpLLfNUFp2st9qOl56zD57gob0/edit#gid=455811479", + "version": "1.0", "columns": [ { "column": "subject_id", @@ -511,12 +516,19 @@ "description": "Resting diastolic blood pressure from the upper arm in a clinical setting", "data_type": "float", "examples": "80" + }, + { + "column": "hypertension_1", + "description": "", + "data_type": 
"enumeration", + "enumerations": ["0", "1"] } ] }, { "table": "cmqt_lipids", "url": "https://docs.google.com/spreadsheets/d/1vQE4gHvKQUOLPaRt1bpLLfNUFp2st9qOl56zD57gob0/edit#gid=1052869785", + "version": "1.0", "columns": [ { "column": "subject_id", @@ -576,6 +588,7 @@ { "table": "cmqt_hematology", "url": "https://docs.google.com/spreadsheets/d/1vQE4gHvKQUOLPaRt1bpLLfNUFp2st9qOl56zD57gob0/edit#gid=1535206686", + "version": "1.0", "columns": [ { "column": "subject_id", @@ -681,6 +694,7 @@ { "table": "cmqt_glycemic", "url": "https://docs.google.com/spreadsheets/d/1vQE4gHvKQUOLPaRt1bpLLfNUFp2st9qOl56zD57gob0/edit#gid=2078107573", + "version": "1.0", "columns": [ { "column": "subject_id", @@ -731,6 +745,7 @@ { "table": "cmqt_kidney_function", "url": "https://docs.google.com/spreadsheets/d/1vQE4gHvKQUOLPaRt1bpLLfNUFp2st9qOl56zD57gob0/edit#gid=746527944", + "version": "1.0", "columns": [ { "column": "subject_id", @@ -771,6 +786,7 @@ { "table": "diabetes_diabetes", "url": "https://docs.google.com/spreadsheets/d/1vQE4gHvKQUOLPaRt1bpLLfNUFp2st9qOl56zD57gob0/edit#gid=1891810958", + "version": "1.0", "columns": [ { "column": "subject_id", @@ -815,6 +831,7 @@ { "table": "cvd_cad", "url": "https://docs.google.com/spreadsheets/d/1gchIrBIPt2s_3uVEloUK1c1Rst6IxnbBm7Zwps2k31I/edit#gid=253559161", + "version": "1.0", "columns": [ { "column": "subject_id", diff --git a/sheets_to_JSON_phenotype.R b/sheets_to_JSON_phenotype.R index 8b270be..ee8458e 100644 --- a/sheets_to_JSON_phenotype.R +++ b/sheets_to_JSON_phenotype.R @@ -9,11 +9,11 @@ library(jsonlite) url <- "https://docs.google.com/spreadsheets/d/1kpWz-6QfjMPVtm62fQwm4hoxzXhR0dnKxVt02fbx9ks" model_name <- "PRIMED Phenotype Data Model" model_description <- "Data model for phenotype data in the PRIMED consortium" -model_version <-"1.3" +model_version <-"1.4" # table metadata -meta <- read_sheet(url, sheet="Description", skip=1) %>% - select(table=Table, required=Required, url=Link) %>% +meta <- read_sheet(url, sheet="Description", 
skip=1, col_types="c") %>% + select(table=Table, required=Required, url=Link, version=Version) %>% filter(!is.na(url)) # only keep tables with links #table_names <- meta$table From e71227dcdfd965cb3af1ac6df9c90591de6586df Mon Sep 17 00:00:00 2001 From: Alyna Khan Date: Tue, 17 Oct 2023 11:24:14 -0700 Subject: [PATCH 5/9] added breast cancer and prostate cancer data tables --- PRIMED_phenotype_data_model.json | 547 ++++++++++++++++++++++++++++--- 1 file changed, 493 insertions(+), 54 deletions(-) diff --git a/PRIMED_phenotype_data_model.json b/PRIMED_phenotype_data_model.json index 66ffffc..f11ffb2 100755 --- a/PRIMED_phenotype_data_model.json +++ b/PRIMED_phenotype_data_model.json @@ -279,14 +279,9 @@ "table": "cmqt_flags", "url": "https://docs.google.com/spreadsheets/d/1vQE4gHvKQUOLPaRt1bpLLfNUFp2st9qOl56zD57gob0/edit#gid=0", "columns": [ - { - "column": "pheno_flag_id", - "data_type": "string", - "references": "from: subject_id, age_at_obs", - "notes": "values auto-generated by CC using subject_id and age_at_obs" - }, { "column": "subject_id", + "primary_key": true, "required": true, "description": "the PRIMED subject id", "data_type": "string", @@ -295,6 +290,7 @@ }, { "column": "age_at_obs", + "primary_key": true, "required": true, "description": "the age at which the observation or measurement for the phenotype(s) were taken", "data_type": "float" @@ -427,14 +423,9 @@ "table": "cmqt_anthropometry", "url": "https://docs.google.com/spreadsheets/d/1vQE4gHvKQUOLPaRt1bpLLfNUFp2st9qOl56zD57gob0/edit#gid=1206657525", "columns": [ - { - "column": "pheno_anthropometry_id", - "data_type": "string", - "references": "from: subject_id, age_at_obs", - "notes": "values auto-generated by CC using subject_id and age_at_obs" - }, { "column": "subject_id", + "primary_key": true, "required": true, "description": "the PRIMED subject id", "data_type": "string", @@ -443,6 +434,7 @@ }, { "column": "age_at_obs", + "primary_key": true, "required": true, "description": "the age at which 
the observation or measurement for the phenotype(s) were taken", "data_type": "float", @@ -472,6 +464,11 @@ "description": "body mass index calculated", "data_type": "float", "examples": "26.45" + }, + { + "column": "waist_hip_ratio_1", + "description": "wait hip ratio calculated", + "data_type": "float" } ] }, @@ -479,14 +476,9 @@ "table": "cmqt_blood_pressure", "url": "https://docs.google.com/spreadsheets/d/1vQE4gHvKQUOLPaRt1bpLLfNUFp2st9qOl56zD57gob0/edit#gid=455811479", "columns": [ - { - "column": "pheno_blood_pressure_id", - "data_type": "string", - "references": "from: subject_id, age_at_obs", - "notes": "values auto-generated by CC using subject_id and age_at_obs" - }, { "column": "subject_id", + "primary_key": true, "required": true, "description": "the PRIMED subject id", "data_type": "string", @@ -495,6 +487,7 @@ }, { "column": "age_at_obs", + "primary_key": true, "required": true, "description": "the age at which the observation or measurement for the phenotype(s) were taken", "data_type": "float", @@ -518,6 +511,13 @@ "description": "Resting diastolic blood pressure from the upper arm in a clinical setting", "data_type": "float", "examples": "80" + }, + { + "column": "hypertension_1", + "description": "Indicator of whether or not a participant has a combined systolic and diastolic reading of greater than or equal to 140/90mmHg", + "data_type": "enumeration", + "enumerations": ["0", "1"], + "notes": "0 = 'no' 1 = 'yes' Missing values can either be left blank or set to NA." 
} ] }, @@ -525,14 +525,9 @@ "table": "cmqt_lipids", "url": "https://docs.google.com/spreadsheets/d/1vQE4gHvKQUOLPaRt1bpLLfNUFp2st9qOl56zD57gob0/edit#gid=1052869785", "columns": [ - { - "column": "pheno_lipids_id", - "data_type": "string", - "references": "from: subject_id, age_at_obs", - "notes": "values auto-generated by CC using subject_id and age_at_obs" - }, { "column": "subject_id", + "primary_key": true, "required": true, "description": "the PRIMED subject id", "data_type": "string", @@ -541,6 +536,7 @@ }, { "column": "age_at_obs", + "primary_key": true, "required": true, "description": "the age at which the observation or measurement for the phenotype(s) were taken", "data_type": "float", @@ -588,14 +584,9 @@ "table": "cmqt_hematology", "url": "https://docs.google.com/spreadsheets/d/1vQE4gHvKQUOLPaRt1bpLLfNUFp2st9qOl56zD57gob0/edit#gid=1535206686", "columns": [ - { - "column": "pheno_hematology_id", - "data_type": "string", - "references": "from: subject_id, age_at_obs", - "notes": "values auto-generated by CC using subject_id and age_at_obs" - }, { "column": "subject_id", + "primary_key": true, "required": true, "description": "the PRIMED subject id", "data_type": "string", @@ -604,6 +595,7 @@ }, { "column": "age_at_obs", + "primary_key": true, "required": true, "description": "the age at which the observation or measurement for the phenotype(s) were taken", "data_type": "float", @@ -697,14 +689,9 @@ "table": "cmqt_glycemic", "url": "https://docs.google.com/spreadsheets/d/1vQE4gHvKQUOLPaRt1bpLLfNUFp2st9qOl56zD57gob0/edit#gid=2078107573", "columns": [ - { - "column": "pheno_glycemic_id", - "data_type": "string", - "references": "from: subject_id, age_at_obs", - "notes": "values auto-generated by CC using subject_id and age_at_obs" - }, { "column": "subject_id", + "primary_key": true, "required": true, "description": "the PRIMED subject id", "data_type": "string", @@ -713,6 +700,7 @@ }, { "column": "age_at_obs", + "primary_key": true, "required": true, 
"description": "the age at which the observation or measurement for the phenotype(s) were taken", "data_type": "float", @@ -751,14 +739,9 @@ "table": "cmqt_kidney_function", "url": "https://docs.google.com/spreadsheets/d/1vQE4gHvKQUOLPaRt1bpLLfNUFp2st9qOl56zD57gob0/edit#gid=746527944", "columns": [ - { - "column": "pheno_kidney_function_id", - "data_type": "string", - "references": "from: subject_id, age_at_obs", - "notes": "values auto-generated by CC using subject_id and age_at_obs" - }, { "column": "subject_id", + "primary_key": true, "required": true, "description": "the PRIMED subject id", "data_type": "string", @@ -767,6 +750,7 @@ }, { "column": "age_at_obs", + "primary_key": true, "required": true, "description": "the age at which the observation or measurement for the phenotype(s) were taken", "data_type": "float", @@ -795,14 +779,9 @@ "table": "diabetes_diabetes", "url": "https://docs.google.com/spreadsheets/d/1vQE4gHvKQUOLPaRt1bpLLfNUFp2st9qOl56zD57gob0/edit#gid=1891810958", "columns": [ - { - "column": "pheno_diabetes_id", - "data_type": "string", - "references": "from: subject_id, age_at_obs", - "notes": "values auto-generated by CC using subject_id and age_at_obs" - }, { "column": "subject_id", + "primary_key": true, "required": true, "description": "the PRIMED subject id", "data_type": "string", @@ -811,6 +790,7 @@ }, { "column": "age_at_obs", + "primary_key": true, "required": true, "description": "the age at which the observation or measurement for the phenotype(s) were taken", "data_type": "float", @@ -843,14 +823,9 @@ "table": "cvd_cad", "url": "https://docs.google.com/spreadsheets/d/1gchIrBIPt2s_3uVEloUK1c1Rst6IxnbBm7Zwps2k31I/edit#gid=253559161", "columns": [ - { - "column": "pheno_cad_id", - "data_type": "string", - "references": "from: subject_id, age_at_observation", - "notes": "values auto-generated by CC using subject_id and age_at_observation" - }, { "column": "subject_id", + "primary_key": true, "required": true, "description": "the 
PRIMED subject id", "data_type": "string", @@ -859,6 +834,7 @@ }, { "column": "age_at_obs", + "primary_key": true, "required": true, "description": "the age at which the observation or measurement for the phenotype(s) were taken", "data_type": "float", @@ -886,6 +862,469 @@ "notes": "0 = 'no' 1 = 'yes' Missing values can either be left blank or set to NA." } ] + }, + { + "table": "cancer_breast", + "url": "https://docs.google.com/spreadsheets/d/1Gfj_EoPuYWhiNk7AaR6DOPjeTC7Nn06sF8qcoMGQ9KM/edit#gid=0", + "columns": [ + { + "column": "subject_id", + "primary_key": true, + "required": true, + "description": "the PRIMED subject id", + "data_type": "string", + "references": "> subject.subject_id", + "notes": "references subject_id in participant table" + }, + { + "column": "age_at_obs", + "primary_key": true, + "required": true, + "description": "the age at which the observation or measurement for the phenotype(s) were taken", + "data_type": "float", + "examples": "56.2" + }, + { + "column": "visit", + "description": "indicator of visit or time of observation", + "data_type": "string", + "examples": ["visit_2", "baseline", "median"], + "notes": "this can be any value that is used consistently within the study" + }, + { + "column": "breast_cancer_status_1", + "description": "whether or not the participant had breast cancer", + "data_type": "enumeration", + "enumerations": ["0", "1"], + "notes": "0 = 'no' 1 = 'yes' Missing values can either be left blank or set to NA." 
+ }, + { + "column": "age_at_diagnosis_1", + "description": "age at which participant was diagnosed with breast cancer", + "data_type": "float", + "notes": "may be the same as age_at_obs or not" + }, + { + "column": "year_at_diagnosis_1", + "description": "year participant was diagnosed", + "data_type": "float", + "examples": "1999" + }, + { + "column": "breast_cancer_type_1", + "description": "breast cancer type upon diagnosis", + "data_type": "enumeration", + "enumerations": ["unilateral", "bilateral"] + }, + { + "column": "cancer_behavior_1", + "description": "behavior of the tumor", + "data_type": "enumeration", + "enumerations": ["benign", "borderline", "in_situ", "invasive"], + "notes": "Missing values can be left blank or set to NA" + }, + { + "column": "her2_1", + "description": "human epidermal growth factor receptor 2", + "data_type": "enumeration", + "enumerations": ["positive", "negative", "unknown"] + }, + { + "column": "pr_1", + "description": "progesterone receptor breast cancer cells", + "data_type": "enumeration", + "enumerations": ["positive", "negative", "unknown"] + }, + { + "column": "er_1", + "description": "estrogen receptor breast cancer cells", + "data_type": "enumeration", + "enumerations": ["positive", "negative", "unknown"] + }, + { + "column": "T_stage_clinical_1", + "description": "staging per diagnosis or biopsy. Note that this refers to T stage measuring tumor size.", + "data_type": "enumeration", + "enumerations": ["stage 1", "stage 2", "stage 3", "stage 4", "unstaged", "unknown"] + }, + { + "column": "T_stage_pathological_1", + "description": "staging per diagnosis or biopsy. 
Note that this refers to T stage measuring tumor size.", + "data_type": "enumeration", + "enumerations": ["stage 1", "stage 2", "stage 3", "stage 4", "unstaged", "unknown"] + }, + { + "column": "T_stage_uknown_1", + "description": "staging (clinical, pathological) is uknown", + "data_type": "enumeration", + "enumerations": ["stage 1", "stage 2", "stage 3", "stage 4", "unstaged", "unknown"] + }, + { + "column": "T_stage_clinical_2", + "description": "staging per diagnosis or biopsy. Note that this refers to the general T stage and should be used if numeric T stage is not available.", + "data_type": "enumeration", + "enumerations": ["localized", "regional", "distant", "in_situ", "unknown", "unstaged"], + "notes": "numeric stage is preferred, if available. missing values can either be left blank or set to NA" + }, + { + "column": "T_stage_pathological_2", + "description": "staging per diagnosis or biopsy. Note that this refers to the general T stage and should be used if numeric T stage is not available.", + "data_type": "enumeration", + "enumerations": ["localized", "regional", "distant", "in_situ", "unknown", "unstaged"], + "notes": "numeric stage is preferred, if available. missing values can either be left blank or set to NA" + }, + { + "column": "T_stage_unknown_2", + "description": "staging (clinical, pathological) is uknown", + "data_type": "enumeration", + "enumerations": ["localized", "regional", "distant", "in_situ", "unknown", "unstaged"], + "notes": "numeric stage is preferred, if available. 
missing values can either be left blank or set to NA" + }, + { + "column": "nodal_involvement_1", + "description": "regional lymph nodes (N)", + "data_type": "enumeration", + "enumerations": ["NX", "N0", "N1", "N2", "N3"] + }, + { + "column": "distant_metastasis_1", + "description": "distant metastasis (M)", + "data_type": "enumeration", + "enumerations": ["MX", "M0", "M1"] + }, + { + "column": "stage_system", + "description": "definition of staging system used (e.g., SEER, AJCC) and time period (e.g., year), if applicable", + "data_type": "string", + "examples": ["AJCC 2003", "SEER 1999"], + "notes": "missing values can either be left blank or set to NA" + }, + { + "column": "grade_clinical_1", + "description": "grading per diagnosis or biopsy", + "data_type": "enumeration", + "enumerations": ["grade 1", "grade 2", "grade 3"], + "notes": "missing values can either be left blank or set to NA" + }, + { + "column": "grade_pathological_1", + "description": "grading per surgical pathology", + "data_type": "enumeration", + "enumerations": ["grade 1", "grade 2", "grade 3"], + "notes": "missing values can either be left blank or set to NA" + }, + { + "column": "grade_unknown_1", + "description": "grading determination unknown", + "data_type": "enumeration", + "enumerations": ["grade 1", "grade 2", "grade 3"], + "notes": "missing values can either be left blank or set to NA" + }, + { + "column": "screening_history_1", + "description": "whether or not participant underwent screening for breast cancer", + "data_type": "enumeration", + "enumerations": ["0", "1"], + "notes": "0 = 'no' 1 = 'yes' Missing values can either be left blank or set to NA." 
+ }, + { + "column": "recurrence_1", + "description": "whether or not a recurrence occurred", + "data_type": "enumeration", + "enumerations": ["recurrence_primary", "recurrence_second_primary", "unknown", "none"] + }, + { + "column": "surgery_1", + "description": "whether or not participant received surgery", + "data_type": "enumeration", + "enumerations": ["0", "1"], + "notes": "0 = 'no' 1 = 'yes' Missing values can either be left blank or set to NA." + }, + { + "column": "radiotherapy_1", + "description": "whether or not participant received radiotherapy", + "data_type": "enumeration", + "enumerations": ["0", "1"], + "notes": "0 = 'no' 1 = 'yes' Missing values can either be left blank or set to NA." + }, + { + "column": "chemotherapy_1", + "description": "whether or not participant received chemotherapy", + "data_type": "enumeration", + "enumerations": ["0", "1"], + "notes": "0 = 'no' 1 = 'yes' Missing values can either be left blank or set to NA." + }, + { + "column": "hormone_therapy_1", + "description": "whether or not participant received hormone therapy. Includes oopherectomy, hysterectomy", + "data_type": "enumeration", + "enumerations": ["pharmaceutical", "surgical", "both", "none", "unknown"], + "notes": "Indicate hormone therapy type in the analyst comments" + }, + { + "column": "NSAID_1", + "description": "use of non-steroidal anti-inflammatory drugs", + "data_type": "enumeration", + "enumerations": ["0", "1"], + "notes": "0 = 'no' 1 = 'yes' Missing values can either be left blank or set to NA." 
+ }, + { + "column": "age_at_natural_menopause_1", + "description": "age at which natural menopause occurred", + "data_type": "float", + "notes": "NA if menopause has not occurred" + }, + { + "column": "post_menopausal_hormone_use_1", + "description": "whether or not hormone use occurred post menopause (not cancer-related use)", + "data_type": "enumeration", + "enumerations": ["0", "1"], + "notes": "0 = 'no' 1 = 'yes' Missing values can either be left blank or set to NA." + }, + { + "column": "parity_1", + "description": "the number of pregnancies carried for at least 20 weeks", + "data_type": "integer" + }, + { + "column": "age_at_first_birth_1", + "description": "age at which the first birth occurred", + "data_type": "float" + }, + { + "column": "age_at_menarche_1", + "description": "age at which menarche occurred", + "data_type": "float" + }, + { + "column": "deceased_1", + "description": "indication of whether individual is deceased", + "data_type": "enumeration", + "enumerations": ["0", "1"], + "notes": "0 = 'no' 1 = 'yes' Missing values can either be left blank or set to NA." + }, + { + "column": "cause_of_death_breast_cancer_1", + "description": "indication of whether cause of death was due to breast cancer or other cause", + "data_type": "enumeration", + "enumerations": ["0", "1"], + "notes": "0 = 'no, cause of death not due to breast cancer' 1 = 'yes, cause of death due to breast cancer' Missing values indicate unknown cause of death not deceased and can either be left blank or set to NA." 
+ }, + { + "column": "age_at_death_1", + "description": "age at death of individual", + "data_type": "float", + "notes": "Set to missing (either blank or NA) if individual is not deceased" + } + ] + }, + { + "table": "cancer_prostate", + "url": "https://docs.google.com/spreadsheets/d/1Gfj_EoPuYWhiNk7AaR6DOPjeTC7Nn06sF8qcoMGQ9KM/edit#gid=1811888649", + "columns": [ + { + "column": "subject_id", + "primary_key": true, + "required": true, + "description": "the PRIMED subject id", + "data_type": "string", + "references": "> subject.subject_id", + "notes": "references subject_id in participant table" + }, + { + "column": "age_at_obs", + "primary_key": true, + "required": true, + "description": "the age at which the observation or measurement for the phenotype(s) were taken", + "data_type": "float", + "examples": "56.2" + }, + { + "column": "visit", + "description": "indicator of visit or time of observation", + "data_type": "string", + "examples": ["visit_2", "baseline", "median"], + "notes": "this can be any value that is used consistently within the study" + }, + { + "column": "prostate_cancer_status_1", + "description": "whether or not the participant had prostate cancer", + "data_type": "enumeration", + "enumerations": ["0", "1"], + "notes": "0 = 'no' 1 = 'yes' Missing values can either be left blank or set to NA." 
+ }, + { + "column": "age_at_diagnosis_1", + "description": "age at which participant was diagnosed with prostate cancer", + "data_type": "float", + "notes": "may be the same as age_at_obs or not" + }, + { + "column": "year_at_diagnosis", + "description": "year participant was diagnosed", + "data_type": "float", + "examples": "1999" + }, + { + "column": "cancer_behavior_1", + "description": "behavior of the tumor", + "data_type": "enumeration", + "enumerations": ["benign", "borderline", "in_situ", "invasive"], + "notes": "Missing values can be left blank or set to NA" + }, + { + "column": "T_stage_clinical_1", + "description": "staging per diagnosis or biopsy. Note that this refers to T stage measuring tumor size.", + "data_type": "enumeration", + "enumerations": ["stage 1", "stage 2", "stage 3", "stage 4", "unstaged", "unknown"] + }, + { + "column": "T_stage_pathological_1", + "description": "staging per diagnosis or biopsy. Note that this refers to T stage measuring tumor size.", + "data_type": "enumeration", + "enumerations": ["stage 1", "stage 2", "stage 3", "stage 4", "unstaged", "unknown"] + }, + { + "column": "T_stage_uknown_1", + "description": "staging (clinical, pathological) is unknown", + "data_type": "enumeration", + "enumerations": ["stage 1", "stage 2", "stage 3", "stage 4", "unstaged", "unknown"] + }, + { + "column": "T_stage_clinical_2", + "description": "staging per diagnosis or biopsy. Note that this refers to the general T stage and should be used if numeric T stage is not available.", + "data_type": "enumeration", + "enumerations": ["localized", "regional", "distant", "in_situ", "unknown", "unstaged"], + "notes": "numeric stage is preferred, if available. missing values can either be left blank or set to NA" + }, + { + "column": "T_stage_pathological_2", + "description": "staging per diagnosis or biopsy.
Note that this refers to the general T stage and should be used if numeric T stage is not available.", + "data_type": "enumeration", + "enumerations": ["localized", "regional", "distant", "in_situ", "unknown", "unstaged"], + "notes": "numeric stage is preferred, if available. missing values can either be left blank or set to NA" + }, + { + "column": "T_stage_unknown_2", + "description": "staging (clinical, pathological) is unknown", + "data_type": "enumeration", + "enumerations": ["localized", "regional", "distant", "in_situ", "unknown", "unstaged"], + "notes": "numeric stage is preferred, if available. missing values can either be left blank or set to NA" + }, + { + "column": "nodal_involvement_1", + "description": "regional lymph nodes (N)", + "data_type": "enumeration", + "enumerations": ["NX", "N0", "N1", "N2", "N3"] + }, + { + "column": "distant_metastasis_1", + "description": "distant metastasis (M)", + "data_type": "enumeration", + "enumerations": ["MX", "M0", "M1"] + }, + { + "column": "stage_system", + "description": "definition of staging system (e.g., SEER, AJCC) and time period (e.g., year), if applicable", + "data_type": "string", + "examples": ["AJCC 2003", "SEER 1999"], + "notes": "missing values can either be left blank or set to NA" + }, + { + "column": "gleason_score_clinical_1", + "description": "grading per diagnosis or biopsy", + "data_type": "enumeration", + "enumerations": ["2", "3", "4", "5", "6", "7", "8", "9", "10"], + "notes": "missing values can either be left blank or set to NA" + }, + { + "column": "gleason_score_pathological_1", + "description": "grading per surgical pathology", + "data_type": "enumeration", + "enumerations": ["2", "3", "4", "5", "6", "7", "8", "9", "10"], + "notes": "missing values can either be left blank or set to NA" + }, + { + "column": "gleason_score_unknown_1", + "description": "grading determination unknown", + "data_type": "enumeration", + "enumerations": ["2", "3", "4", "5", "6", "7", "8", "9", "10"], +
"notes": "missing values can either be left blank or set to NA" + }, + { + "column": "psa_at_diagnosis_1", + "description": "psa score at diagnosis of prostate cancer", + "data_type": "float" + }, + { + "column": "screening_history_1", + "description": "whether or not participant underwent screening for prostate cancer", + "data_type": "enumeration", + "enumerations": ["0", "1"], + "notes": "0 = 'no' 1 = 'yes' Missing values can either be left blank or set to NA." + }, + { + "column": "recurrence_1", + "description": "whether or not a recurrence occurred", + "data_type": "enumeration", + "enumerations": ["recurrence_primary", "recurrence_second_primary", "unknown", "none"] + }, + { + "column": "surgery_1", + "description": "whether or not participant received surgery", + "data_type": "enumeration", + "enumerations": ["0", "1"], + "notes": "0 = 'no' 1 = 'yes' Missing values can either be left blank or set to NA." + }, + { + "column": "radiotherapy_1", + "description": "whether or not participant received radiotherapy", + "data_type": "enumeration", + "enumerations": ["0", "1"], + "notes": "0 = 'no' 1 = 'yes' Missing values can either be left blank or set to NA." + }, + { + "column": "chemotherapy_1", + "description": "whether or not participant received chemotherapy", + "data_type": "enumeration", + "enumerations": ["0", "1"], + "notes": "0 = 'no' 1 = 'yes' Missing values can either be left blank or set to NA." + }, + { + "column": "hormone_therapy_1", + "description": "whether or not participant received hormone therapy (e.g., androgen deprivation therapy or ADT)", + "data_type": "enumeration", + "enumerations": ["0", "1"], + "notes": "0 = 'no' 1 = 'yes' Missing values can either be left blank or set to NA." + }, + { + "column": "NSAID_1", + "description": "use of non-steroidal anti-inflammatory drugs", + "data_type": "enumeration", + "enumerations": ["0", "1"], + "notes": "0 = 'no' 1 = 'yes' Missing values can either be left blank or set to NA."
+ }, + { + "column": "deceased_1", + "description": "indicator of whether individual is deceased", + "data_type": "enumeration", + "enumerations": ["0", "1"], + "notes": "0 = 'no' 1 = 'yes' Missing values can either be left blank or set to NA." + }, + { + "column": "cause_of_death_prostate_cancer_1", + "description": "indication of whether cause of death was due to prostate cancer or other cause", + "data_type": "enumeration", + "enumerations": ["0", "1"], + "notes": "0 = 'no, cause of death not due to prostate cancer' 1 = 'yes, cause of death due to prostate cancer' Missing values indicate unknown cause of death not deceased and can either be left blank or set to NA." + }, + { + "column": "age_at_death_1", + "description": "age at death of individual", + "data_type": "float", + "notes": "Set to missing (either blank or NA) if individual is not deceased" + } + ] } ] } From 210600fd0c3e5ca782f45d86d45bccbaa4900817 Mon Sep 17 00:00:00 2001 From: "Stephanie M. Gogarten" Date: Tue, 31 Oct 2023 12:36:38 -0700 Subject: [PATCH 6/9] add min and max values for some columns --- PRIMED_phenotype_data_model.json | 41 +++++++++++++++++++++--- sheets_to_JSON_phenotype.R | 53 ++++++++++++++++---------------- 2 files changed, 64 insertions(+), 30 deletions(-) diff --git a/PRIMED_phenotype_data_model.json b/PRIMED_phenotype_data_model.json index a4046b3..ebfbc8a 100755 --- a/PRIMED_phenotype_data_model.json +++ b/PRIMED_phenotype_data_model.json @@ -1,7 +1,7 @@ { "name": "PRIMED Phenotype Data Model", "description": "Data model for phenotype data in the PRIMED consortium", - "version": "1.4", + "version": "1.5", "tables": [ { "table": "subject", @@ -296,7 +296,9 @@ "primary_key": true, "required": true, "description": "the age at which the observation or measurement for the phenotype(s) were taken", - "data_type": "float" + "data_type": "float", + "min": " 0", + "max": "89" }, { "column": "visit", @@ -442,6 +444,8 @@ "required": true, "description": "the age at which the 
observation or measurement for the phenotype(s) were taken", "data_type": "float", + "min": 0, + "max": 89, "examples": "56.2" }, { @@ -455,18 +459,21 @@ "column": "height_1", "description": "standing body height", "data_type": "float", + "min": 0, "examples": "165.1" }, { "column": "weight_1", "description": "body weight at baseline", "data_type": "float", + "min": 0, "examples": "72.574" }, { "column": "bmi_1", "description": "body mass index calculated", "data_type": "float", + "min": 0, "examples": "26.45" }, { @@ -496,6 +503,8 @@ "required": true, "description": "the age at which the observation or measurement for the phenotype(s) were taken", "data_type": "float", + "min": 0, + "max": 89, "examples": "56.2" }, { @@ -546,6 +555,8 @@ "required": true, "description": "the age at which the observation or measurement for the phenotype(s) were taken", "data_type": "float", + "min": 0, + "max": 89, "examples": "56.2" }, { @@ -606,6 +617,8 @@ "required": true, "description": "the age at which the observation or measurement for the phenotype(s) were taken", "data_type": "float", + "min": 0, + "max": 89, "examples": "56.2" }, { @@ -712,6 +725,8 @@ "required": true, "description": "the age at which the observation or measurement for the phenotype(s) were taken", "data_type": "float", + "min": 0, + "max": 89, "examples": "56.2" }, { @@ -763,6 +778,8 @@ "required": true, "description": "the age at which the observation or measurement for the phenotype(s) were taken", "data_type": "float", + "min": 0, + "max": 89, "examples": "56.2" }, { @@ -849,6 +866,8 @@ "required": true, "description": "the age at which the observation or measurement for the phenotype(s) were taken", "data_type": "float", + "min": 0, + "max": 89, "examples": "56.2" }, { @@ -877,6 +896,7 @@ { "table": "cancer_breast", "url": "https://docs.google.com/spreadsheets/d/1Gfj_EoPuYWhiNk7AaR6DOPjeTC7Nn06sF8qcoMGQ9KM/edit#gid=0", + "version": "1.0", "columns": [ { "column": "subject_id", @@ -893,6 +913,8 @@ 
"required": true, "description": "the age at which the observation or measurement for the phenotype(s) were taken", "data_type": "float", + "min": 0, + "max": 89, "examples": "56.2" }, { @@ -913,6 +935,8 @@ "column": "age_at_diagnosis_1", "description": "age at which participant was diagnosed with breast cancer", "data_type": "float", + "min": 0, + "max": 89, "notes": "may be the same as age_at_obs or not" }, { @@ -1083,6 +1107,8 @@ "column": "age_at_natural_menopause_1", "description": "age at which natural menopause occurred", "data_type": "float", + "min": 0, + "max": 89, "notes": "NA if menopause has not occurred" }, { @@ -1100,12 +1126,16 @@ { "column": "age_at_first_birth_1", "description": "age at which the first birth occurred", - "data_type": "float" + "data_type": "float", + "min": 0, + "max": 89 }, { "column": "age_at_menarche_1", "description": "age at which menarche occurred", - "data_type": "float" + "data_type": "float", + "min": 0, + "max": 89 }, { "column": "deceased_1", @@ -1125,6 +1155,8 @@ "column": "age_at_death_1", "description": "age at death of individual", "data_type": "float", + "min": 0, + "max": 89, "notes": "Set to missing (either blank or NA) if individual is not deceased" } ] @@ -1132,6 +1164,7 @@ { "table": "cancer_prostate", "url": "https://docs.google.com/spreadsheets/d/1Gfj_EoPuYWhiNk7AaR6DOPjeTC7Nn06sF8qcoMGQ9KM/edit#gid=1811888649", + "version": "1.0", "columns": [ { "column": "subject_id", diff --git a/sheets_to_JSON_phenotype.R b/sheets_to_JSON_phenotype.R index ee8458e..b9b6d9a 100644 --- a/sheets_to_JSON_phenotype.R +++ b/sheets_to_JSON_phenotype.R @@ -9,11 +9,11 @@ library(jsonlite) url <- "https://docs.google.com/spreadsheets/d/1kpWz-6QfjMPVtm62fQwm4hoxzXhR0dnKxVt02fbx9ks" model_name <- "PRIMED Phenotype Data Model" model_description <- "Data model for phenotype data in the PRIMED consortium" -model_version <-"1.4" +model_version <-"1.5" # table metadata meta <- read_sheet(url, sheet="Description", skip=1, col_types="c") 
%>% - select(table=Table, required=Required, url=Link, version=Version) %>% + select(table=Table, required=Required, url=Link, version=`Table version`) %>% filter(!is.na(url)) # only keep tables with links #table_names <- meta$table @@ -51,30 +51,31 @@ for (i in 1:length(tables)) { mutate(primary_key = ifelse(paste0(names(tables)[i], "_id") == Column, TRUE, NA)) } - if ("Multi-value delimiter" %in% names(tmp)) { - tables[[i]] <- tmp %>% - select(column = Column, - primary_key, - required = Required, - description = Description, - data_type = `Data type`, - references = References, - enumerations = Enumerations, - multi_value_delimiter = `Multi-value delimiter`, - examples = Examples, - notes = `Notes/comments`) - } else { - tables[[i]] <- tmp %>% - select(column = Column, - primary_key, - required = Required, - description = Description, - data_type = `Data type`, - references = References, - enumerations = Enumerations, - examples = Examples, - notes = `Notes/comments`) - } + lookup <- c( + data_type = "Data type", + multi_value_delimiter = "Multi-value delimiter", + notes = "Notes/comments" + ) + tmp <- tmp %>% + rename(any_of(lookup)) %>% + rename_with(tolower) + + keep_cols <- c( + "column", + "primary_key", + "required", + "description", + "data_type", + "min", + "max", + "references", + "enumerations", + "multi_value_delimiter", + "examples", + "notes" + ) + tables[[i]] <- tmp %>% + select(any_of(keep_cols)) } From 9f4a0317f2df7775e3014edbde86e6c067763d1e Mon Sep 17 00:00:00 2001 From: "Stephanie M. 
Gogarten" Date: Thu, 2 Nov 2023 15:41:55 -0700 Subject: [PATCH 7/9] add is_bucket_path to file_path columns --- PRIMED_genotype_data_model.json | 14 +++++++++----- sheets_to_JSON_genotype.R | 4 +++- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/PRIMED_genotype_data_model.json b/PRIMED_genotype_data_model.json index a4eef72..4b6e04a 100644 --- a/PRIMED_genotype_data_model.json +++ b/PRIMED_genotype_data_model.json @@ -1,7 +1,7 @@ { "name": "PRIMED Genotype Data Model", "description": "Data model for genotype data in the PRIMED consortium", - "version": "1.2", + "version": "1.3", "tables": [ { "table": "subject", @@ -198,7 +198,8 @@ "column": "file_path", "required": true, "description": "absolute file path in cloud storage", - "data_type": "string" + "data_type": "string", + "is_bucket_path": true }, { "column": "file_type", @@ -305,7 +306,8 @@ "column": "file_path", "required": true, "description": "absolute file path in cloud storage", - "data_type": "string" + "data_type": "string", + "is_bucket_path": true }, { "column": "file_type", @@ -441,7 +443,8 @@ "column": "file_path", "required": true, "description": "absolute file path in cloud storage", - "data_type": "string" + "data_type": "string", + "is_bucket_path": true }, { "column": "file_type", @@ -541,7 +544,8 @@ "column": "file_path", "required": true, "description": "absolute file path in cloud storage", - "data_type": "string" + "data_type": "string", + "is_bucket_path": true }, { "column": "file_type", diff --git a/sheets_to_JSON_genotype.R b/sheets_to_JSON_genotype.R index 9c75628..8ec0080 100644 --- a/sheets_to_JSON_genotype.R +++ b/sheets_to_JSON_genotype.R @@ -8,7 +8,7 @@ library(jsonlite) url <- "https://docs.google.com/spreadsheets/d/1lwVMGT-TQaWbMWvi3hdqWuEthZvaKGOImINAqXguPaM" model_name <- "PRIMED Genotype Data Model" model_description <- "Data model for genotype data in the PRIMED consortium" -model_version <- "1.2" +model_version <- "1.3" # read in the data @@ -27,6 +27,7 @@ 
rm(list = c("table_names", "url")) for (i in 1:length(tables)) { tables[[i]] <- tables[[i]] %>% mutate(primary_key = ifelse(paste0(names(tables)[i], "_id") == Column, TRUE, NA)) %>% + mutate(is_bucket_path = ifelse(Column == "file_path", TRUE, NA)) %>% select(column = Column, primary_key, required = Required, @@ -34,6 +35,7 @@ for (i in 1:length(tables)) { data_type = `Data type`, references = References, enumerations = Enumerations, + is_bucket_path, examples = Examples, notes = `Notes/comments`) %>% mutate(description=gsub('"', "'", description), From feefbcc1d8c3e999fae5bec42798d9060df83d82 Mon Sep 17 00:00:00 2001 From: "Stephanie M. Gogarten" Date: Sat, 4 Nov 2023 12:49:24 -0700 Subject: [PATCH 8/9] add bucket path to gsr and phenotype models --- PRIMED_GSR_data_model.json | 5 +-- PRIMED_phenotype_data_model.json | 12 +++++--- sheets_to_JSON_gsr.R | 52 +++++++++++++++++--------------- sheets_to_JSON_phenotype.R | 2 ++ 4 files changed, 40 insertions(+), 31 deletions(-) diff --git a/PRIMED_GSR_data_model.json b/PRIMED_GSR_data_model.json index 00135a9..d54d118 100644 --- a/PRIMED_GSR_data_model.json +++ b/PRIMED_GSR_data_model.json @@ -1,7 +1,7 @@ { "name": "PRIMED GSR Data Model", "description": "Data model for Genomic Summary Results in the PRIMED consortium", - "version": "1.0", + "version": "1.1", "tables": [ { "table": "analysis", @@ -423,7 +423,8 @@ "column": "file_path", "required": true, "description": "File path in cloud storage", - "data_type": "string" + "data_type": "string", + "is_bucket_path": true }, { "column": "file_type", diff --git a/PRIMED_phenotype_data_model.json b/PRIMED_phenotype_data_model.json index ebfbc8a..d663eca 100755 --- a/PRIMED_phenotype_data_model.json +++ b/PRIMED_phenotype_data_model.json @@ -129,13 +129,15 @@ "column": "file_path", "required": true, "description": "absolute file path in cloud storage", - "data_type": "string" + "data_type": "string", + "is_bucket_path": true }, { "column": "file_readme_path", "required": 
true, "description": "path to the README", - "data_type": "string" + "data_type": "string", + "is_bucket_path": true }, { "column": "n_subjects", @@ -179,13 +181,15 @@ "column": "file_path", "required": true, "description": "absolute file path in cloud storage", - "data_type": "string" + "data_type": "string", + "is_bucket_path": true }, { "column": "file_dd_path", "required": true, "description": "path to the data dictionary", - "data_type": "string" + "data_type": "string", + "is_bucket_path": true }, { "column": "n_subjects", diff --git a/sheets_to_JSON_gsr.R b/sheets_to_JSON_gsr.R index 7b30f2d..d79da87 100644 --- a/sheets_to_JSON_gsr.R +++ b/sheets_to_JSON_gsr.R @@ -9,7 +9,7 @@ library(jsonlite) url <- "https://docs.google.com/spreadsheets/d/1xfSQqRQIq6pGkJ5jzzv2QhetmX5boaEZoNECpDwXe5I" model_name <- "PRIMED GSR Data Model" model_description <- "Data model for Genomic Summary Results in the PRIMED consortium" -model_version <- "1.0" +model_version <- "1.1" # table metadata @@ -28,35 +28,37 @@ for (i in 1:length(tables)) { tmp <- tables[[i]] %>% filter(!is.na(`Data type`)) %>% # keep only valid rows mutate(primary_key = ifelse(paste0(names(tables)[i], "_id") == Column, TRUE, NA)) %>% + mutate(is_bucket_path = ifelse(Column == "file_path", TRUE, NA)) %>% mutate(Description=gsub('"', "'", Description), # replace double with single quote Description=gsub('\n', ' ', Description), # replace newline with space `Notes/comments`=gsub('"', "'", `Notes/comments`), # replace double with single quote `Notes/comments`=gsub('\n', ' ', `Notes/comments`), # replace newline with space References=ifelse(grepl("omop_concept", References), NA, References)) # remove external table reference - if ("Multi-value delimiter" %in% names(tmp)) { - tables[[i]] <- tmp %>% - select(column = Column, - primary_key, - required = Required, - description = Description, - data_type = `Data type`, - references = References, - enumerations = Enumerations, - multi_value_delimiter = `Multi-value 
delimiter`, - examples = Examples, - notes = `Notes/comments`) - } else { - tables[[i]] <- tmp %>% - select(column = Column, - primary_key, - required = Required, - description = Description, - data_type = `Data type`, - references = References, - enumerations = Enumerations, - examples = Examples, - notes = `Notes/comments`) - } + + lookup <- c( + data_type = "Data type", + multi_value_delimiter = "Multi-value delimiter", + notes = "Notes/comments" + ) + tmp <- tmp %>% + rename(any_of(lookup)) %>% + rename_with(tolower) + + keep_cols <- c( + "column", + "primary_key", + "required", + "description", + "data_type", + "references", + "enumerations", + "is_bucket_path", + "multi_value_delimiter", + "examples", + "notes" + ) + tables[[i]] <- tmp %>% + select(any_of(keep_cols)) } rm(list = c("tmp")) diff --git a/sheets_to_JSON_phenotype.R b/sheets_to_JSON_phenotype.R index b9b6d9a..92dd03a 100644 --- a/sheets_to_JSON_phenotype.R +++ b/sheets_to_JSON_phenotype.R @@ -37,6 +37,7 @@ rm(list = c("table_info", "url")) for (i in 1:length(tables)) { tmp <- tables[[i]] %>% filter(!is.na(`Data type`)) %>% # keep only valid rows + mutate(is_bucket_path = ifelse(grepl("file_.*path", Column), TRUE, NA)) %>% mutate(Required=as.logical(Required), # non-T/F values will be NA Description=gsub('"', "'", Description), # replace double with single quote Description=gsub('\n', ' ', Description), # replace newline with space @@ -70,6 +71,7 @@ for (i in 1:length(tables)) { "max", "references", "enumerations", + "is_bucket_path", "multi_value_delimiter", "examples", "notes" From b0f28a1757b5cf19be307ba5a230567fa7ab09a7 Mon Sep 17 00:00:00 2001 From: "Stephanie M. 
Gogarten" Date: Mon, 6 Nov 2023 15:33:18 -0800 Subject: [PATCH 9/9] add data model version to harmonized table --- PRIMED_phenotype_data_model.json | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/PRIMED_phenotype_data_model.json b/PRIMED_phenotype_data_model.json index d663eca..e53d6a5 100755 --- a/PRIMED_phenotype_data_model.json +++ b/PRIMED_phenotype_data_model.json @@ -150,6 +150,12 @@ "required": true, "description": "Number of rows in file (may be > n_subjects for longitudinal data)", "data_type": "integer" + }, + { + "column": "data_model_version", + "description": "data model version for this table", + "data_type": "float", + "notes": "added automatically by validation workflow" } ] }, @@ -825,6 +831,8 @@ "required": true, "description": "the age at which the observation or measurement for the phenotype(s) were taken", "data_type": "float", + "min": 0, + "max": 89, "examples": "56.2" }, { @@ -946,7 +954,7 @@ { "column": "year_at_diagnosis_1", "description": "year participant was diagnosed", - "data_type": "float", + "data_type": "integer", "examples": "1999" }, { @@ -1210,7 +1218,7 @@ { "column": "year_at_diagnosis", "description": "year participant was diagnosed", - "data_type": "float", + "data_type": "integer", "examples": "1999" }, {