diff --git a/PRIMED_GSR_data_model.json b/PRIMED_GSR_data_model.json index 00135a9..d54d118 100644 --- a/PRIMED_GSR_data_model.json +++ b/PRIMED_GSR_data_model.json @@ -1,7 +1,7 @@ { "name": "PRIMED GSR Data Model", "description": "Data model for Genomic Summary Results in the PRIMED consortium", - "version": "1.0", + "version": "1.1", "tables": [ { "table": "analysis", @@ -423,7 +423,8 @@ "column": "file_path", "required": true, "description": "File path in cloud storage", - "data_type": "string" + "data_type": "string", + "is_bucket_path": true }, { "column": "file_type", diff --git a/PRIMED_genotype_data_model.json b/PRIMED_genotype_data_model.json index a4eef72..4b6e04a 100644 --- a/PRIMED_genotype_data_model.json +++ b/PRIMED_genotype_data_model.json @@ -1,7 +1,7 @@ { "name": "PRIMED Genotype Data Model", "description": "Data model for genotype data in the PRIMED consortium", - "version": "1.2", + "version": "1.3", "tables": [ { "table": "subject", @@ -198,7 +198,8 @@ "column": "file_path", "required": true, "description": "absolute file path in cloud storage", - "data_type": "string" + "data_type": "string", + "is_bucket_path": true }, { "column": "file_type", @@ -305,7 +306,8 @@ "column": "file_path", "required": true, "description": "absolute file path in cloud storage", - "data_type": "string" + "data_type": "string", + "is_bucket_path": true }, { "column": "file_type", @@ -441,7 +443,8 @@ "column": "file_path", "required": true, "description": "absolute file path in cloud storage", - "data_type": "string" + "data_type": "string", + "is_bucket_path": true }, { "column": "file_type", @@ -541,7 +544,8 @@ "column": "file_path", "required": true, "description": "absolute file path in cloud storage", - "data_type": "string" + "data_type": "string", + "is_bucket_path": true }, { "column": "file_type", diff --git a/PRIMED_phenotype_data_model.json b/PRIMED_phenotype_data_model.json index ebfbc8a..e53d6a5 100755 --- a/PRIMED_phenotype_data_model.json +++ b/PRIMED_phenotype_data_model.json @@ -129,13 +129,15 @@ "column": "file_path", "required": true, "description": "absolute file path in cloud storage", - "data_type": "string" + "data_type": "string", + "is_bucket_path": true }, { "column": "file_readme_path", "required": true, "description": "path to the README", - "data_type": "string" + "data_type": "string", + "is_bucket_path": true }, { "column": "n_subjects", @@ -148,6 +150,12 @@ "required": true, "description": "Number of rows in file (may be > n_subjects for longitudinal data)", "data_type": "integer" + }, + { + "column": "data_model_version", + "description": "data model version for this table", + "data_type": "float", + "notes": "added automatically by validation workflow" } ] }, @@ -179,13 +187,15 @@ "column": "file_path", "required": true, "description": "absolute file path in cloud storage", - "data_type": "string" + "data_type": "string", + "is_bucket_path": true }, { "column": "file_dd_path", "required": true, "description": "path to the data dictionary", - "data_type": "string" + "data_type": "string", + "is_bucket_path": true }, { "column": "n_subjects", @@ -821,6 +831,8 @@ "required": true, "description": "the age at which the observation or measurement for the phenotype(s) were taken", "data_type": "float", + "min": 0, + "max": 89, "examples": "56.2" }, { @@ -942,7 +954,7 @@ { "column": "year_at_diagnosis_1", "description": "year participant was diagnosed", - "data_type": "float", + "data_type": "integer", "examples": "1999" }, { @@ -1206,7 +1218,7 @@ { "column": "year_at_diagnosis", "description": "year participant was diagnosed", - "data_type": "float", + "data_type": "integer", "examples": "1999" }, { diff --git a/sheets_to_JSON_genotype.R b/sheets_to_JSON_genotype.R index 9c75628..8ec0080 100644 --- a/sheets_to_JSON_genotype.R +++ b/sheets_to_JSON_genotype.R @@ -8,7 +8,7 @@ library(jsonlite) url <- "https://docs.google.com/spreadsheets/d/1lwVMGT-TQaWbMWvi3hdqWuEthZvaKGOImINAqXguPaM" model_name <- "PRIMED Genotype Data Model" model_description <- "Data model for genotype data in the PRIMED consortium" -model_version <- "1.2" +model_version <- "1.3" # read in the data @@ -27,6 +27,7 @@ rm(list = c("table_names", "url")) for (i in 1:length(tables)) { tables[[i]] <- tables[[i]] %>% mutate(primary_key = ifelse(paste0(names(tables)[i], "_id") == Column, TRUE, NA)) %>% + mutate(is_bucket_path = ifelse(Column == "file_path", TRUE, NA)) %>% select(column = Column, primary_key, required = Required, @@ -34,6 +35,7 @@ for (i in 1:length(tables)) { data_type = `Data type`, references = References, enumerations = Enumerations, + is_bucket_path, examples = Examples, notes = `Notes/comments`) %>% mutate(description=gsub('"', "'", description), diff --git a/sheets_to_JSON_gsr.R b/sheets_to_JSON_gsr.R index 7b30f2d..d79da87 100644 --- a/sheets_to_JSON_gsr.R +++ b/sheets_to_JSON_gsr.R @@ -9,7 +9,7 @@ library(jsonlite) url <- "https://docs.google.com/spreadsheets/d/1xfSQqRQIq6pGkJ5jzzv2QhetmX5boaEZoNECpDwXe5I" model_name <- "PRIMED GSR Data Model" model_description <- "Data model for Genomic Summary Results in the PRIMED consortium" -model_version <- "1.0" +model_version <- "1.1" # table metadata @@ -28,35 +28,37 @@ for (i in 1:length(tables)) { tmp <- tables[[i]] %>% filter(!is.na(`Data type`)) %>% # keep only valid rows mutate(primary_key = ifelse(paste0(names(tables)[i], "_id") == Column, TRUE, NA)) %>% + mutate(is_bucket_path = ifelse(Column == "file_path", TRUE, NA)) %>% mutate(Description=gsub('"', "'", Description), # replace double with single quote Description=gsub('\n', ' ', Description), # replace newline with space `Notes/comments`=gsub('"', "'", `Notes/comments`), # replace double with single quote `Notes/comments`=gsub('\n', ' ', `Notes/comments`), # replace newline with space References=ifelse(grepl("omop_concept", References), NA, References)) # remove external table reference - if ("Multi-value delimiter" %in% names(tmp)) { - tables[[i]] <- tmp %>% - select(column = Column, - primary_key, - required = Required, - description = Description, - data_type = `Data type`, - references = References, - enumerations = Enumerations, - multi_value_delimiter = `Multi-value delimiter`, - examples = Examples, - notes = `Notes/comments`) - } else { - tables[[i]] <- tmp %>% - select(column = Column, - primary_key, - required = Required, - description = Description, - data_type = `Data type`, - references = References, - enumerations = Enumerations, - examples = Examples, - notes = `Notes/comments`) - } + + lookup <- c( + data_type = "Data type", + multi_value_delimiter = "Multi-value delimiter", + notes = "Notes/comments" + ) + tmp <- tmp %>% + rename(any_of(lookup)) %>% + rename_with(tolower) + + keep_cols <- c( + "column", + "primary_key", + "required", + "description", + "data_type", + "references", + "enumerations", + "is_bucket_path", + "multi_value_delimiter", + "examples", + "notes" + ) + tables[[i]] <- tmp %>% + select(any_of(keep_cols)) } rm(list = c("tmp")) diff --git a/sheets_to_JSON_phenotype.R b/sheets_to_JSON_phenotype.R index b9b6d9a..92dd03a 100644 --- a/sheets_to_JSON_phenotype.R +++ b/sheets_to_JSON_phenotype.R @@ -37,6 +37,7 @@ rm(list = c("table_info", "url")) for (i in 1:length(tables)) { tmp <- tables[[i]] %>% filter(!is.na(`Data type`)) %>% # keep only valid rows + mutate(is_bucket_path = ifelse(grepl("file_.*path", Column), TRUE, NA)) %>% mutate(Required=as.logical(Required), # non-T/F values will be NA Description=gsub('"', "'", Description), # replace double with single quote Description=gsub('\n', ' ', Description), # replace newline with space @@ -70,6 +71,7 @@ for (i in 1:length(tables)) { "max", "references", "enumerations", + "is_bucket_path", "multi_value_delimiter", "examples", "notes"