From c37d4e2027a0c2b4df361c6b7ec33e3b72c5728d Mon Sep 17 00:00:00 2001
From: "Stephanie M. Gogarten" <sdmorris@uw.edu>
Date: Fri, 21 Jun 2024 00:10:21 -0700
Subject: [PATCH 1/2] allow multiple types of GSR analyses

GSR data model now allows multiple pairs of analysis/file tables.
Naming convention is expected to be <type>_analysis and
<type>_file
---
 PRIMED_GSR_data_model.json | 184 +++++++++++++++++++++++++++++++++----
 json_to_dbml.R             |   4 +-
 sheets_to_JSON_gsr.R       |  13 +--
 3 files changed, 169 insertions(+), 32 deletions(-)
diff --git a/PRIMED_GSR_data_model.json b/PRIMED_GSR_data_model.json
index d54d118..3600ace 100644
--- a/PRIMED_GSR_data_model.json
+++ b/PRIMED_GSR_data_model.json
@@ -1,14 +1,14 @@
 {
   "name": "PRIMED GSR Data Model",
   "description": "Data model for Genomic Summary Results in the PRIMED consortium",
-  "version": "1.1",
+  "version": "2.0",
   "tables": [
     {
-      "table": "analysis",
-      "required": true,
+      "table": "association_analysis",
+      "required": "CONDITIONAL (association_file)",
       "columns": [
         {
-          "column": "analysis_id",
+          "column": "association_analysis_id",
           "primary_key": true,
           "description": "unique identifier for a gwas in primed",
           "data_type": "string",
@@ -137,9 +137,8 @@
         },
         {
           "column": "concept_id",
-          "description": "concept_id for trait from OMOP concept table",
-          "data_type": "string",
-          "notes": "discuss in PRIMED phenotype WG"
+          "description": "OMOP concept_id",
+          "data_type": "string"
         },
         {
           "column": "mapped_trait",
@@ -389,15 +388,28 @@
           "data_type": "string",
           "multi_value_delimiter": "|",
           "examples": ["GCTA", "PLINK", "GENESIS", "SAIGE"]
+        },
+        {
+          "column": "primed_dataset_id",
+          "description": "For GWAS that used a dataset in primed individual data model indicate its dataset_id",
+          "data_type": "string",
+          "multi_value_delimiter": "|",
+          "examples": "a60fb66cd539ad2c",
+          "notes": "From Genotype data model: array_dataset_id, imputation_dataset_id, or sequencing_dataset_id"
+        },
+        {
+          "column": "analysis_workspace_id",
+          "description": "workspace identifier for the GWAS that was generated in PRIMED",
+          "data_type": "string",
+          "examples": "primed-analysis/PRIMED_ANALYSIS_GERA-topmed-v3-imputation"
         }
       ]
     },
     {
-      "table": "gsr_file",
-      "required": true,
+      "table": "association_file",
       "columns": [
         {
-          "column": "gsr_file_id",
+          "column": "association_file_id",
           "primary_key": true,
           "data_type": "string",
           "references": "from: md5sum",
@@ -412,11 +424,11 @@
           "notes": "(32-digit hexadecimal number)"
         },
         {
-          "column": "analysis_id",
+          "column": "association_analysis_id",
           "required": true,
           "description": "unique identifier for a gwas in primed",
           "data_type": "string",
-          "references": "> analysis.analysis_id",
+          "references": "> association_analysis.association_analysis_id",
           "notes": "AnVIL upload workflow would generate this"
         },
         {
@@ -453,8 +465,7 @@
       ]
     },
     {
-      "table": "gsr_files_dd",
-      "required": false,
+      "table": "association_files_dd",
       "columns": [
         {
           "column": "SNPID",
@@ -468,8 +479,7 @@
           "description": "the chromosome that the variant is located on",
           "data_type": "enumeration",
           "enumerations": ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "15", "16", "17", "18", "19", "20", "21", "22", "X", "Y", "MT"],
-          "examples": "13",
-          "notes": "confirm the enumerations with WG"
+          "examples": "13"
         },
         {
           "column": "position",
@@ -497,16 +507,15 @@
           "required": true,
           "description": "effect allele of the variant",
           "data_type": "string",
-          "examples": "A",
-          "notes": "allow for I/D as enum and QC"
+          "examples": ["A", "I", "ATTCG+"],
+          "notes": "Full strings encouraged. For long strings, they can be truncated with a '+' appended. I/D allowed for insertions/deletions."
         },
         {
           "column": "other_allele",
           "required": true,
           "description": "the other allele to the effect allele",
           "data_type": "string",
-          "examples": "G",
-          "notes": "allow for I/D as enum and QC"
+          "examples": "G"
         },
         {
           "column": "ref_allele",
@@ -665,6 +674,141 @@
           "data_type": "float"
         }
       ]
+    },
+    {
+      "table": "genetic_ancestry_analysis",
+      "required": "CONDITIONAL (genetic_ancestry_file)",
+      "columns": [
+        {
+          "column": "genetic_ancestry_analysis_id",
+          "primary_key": true,
+          "description": "unique identifier for an analysis in primed",
+          "data_type": "string",
+          "notes": "PRIMED upload workflow would generate this"
+        },
+        {
+          "column": "gsr_source",
+          "required": true,
+          "description": "Information about source of GSR data. Include additional details in README",
+          "data_type": "string",
+          "examples": ["dbGaP", "Colorado Biobank", "PRIMED"],
+          "notes": "free text short label; useful for unreleased GSR"
+        },
+        {
+          "column": "gsr_source_url",
+          "description": "URL of source (if applicable)",
+          "data_type": "string"
+        },
+        {
+          "column": "dbgap_analysis_accession",
+          "description": "Analysis accession identifier for GSR downloaded from dbGaP",
+          "data_type": "string",
+          "examples": "pha003690.1",
+          "notes": "identifier in phaXXXXXX.v format"
+        },
+        {
+          "column": "pubmed_id",
+          "description": "Pubmed ID identifier of the publication",
+          "data_type": "string",
+          "examples": "33568819",
+          "notes": "PMID identifier"
+        },
+        {
+          "column": "first_author",
+          "description": "Last name and initials of the first author",
+          "data_type": "string"
+        },
+        {
+          "column": "publication_url",
+          "description": "External link to the publication",
+          "data_type": "string",
+          "notes": "URL of publication"
+        },
+        {
+          "column": "release_date",
+          "description": "Date on which the analysis was released publicly",
+          "data_type": "date",
+          "notes": "e.g. on dbGaP or GWAS Catalog"
+        },
+        {
+          "column": "consent_code",
+          "required": true,
+          "description": "consent abbreviation (https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4721915/table/pgen.1005772.t001/?report=objectonly)",
+          "data_type": "string",
+          "notes": "NRES is the code for no restrictions on data use (i.e. open access)"
+        },
+        {
+          "column": "upload_date",
+          "required": true,
+          "description": "Date GSR was uploaded to PRIMED AnVIL workspace",
+          "data_type": "date",
+          "examples": "2023-03-21",
+          "notes": "YYYY-MM-DD format"
+        },
+        {
+          "column": "contributor_contact",
+          "required": true,
+          "description": "email of the PRIMED contributor who can be contacted for data related questions",
+          "data_type": "string",
+          "notes": "recommended by WG. helpful when GSR is not publicly released and is from a Biobank or some other source"
+        }
+      ]
+    },
+    {
+      "table": "genetic_ancestry_file",
+      "columns": [
+        {
+          "column": "genetic_ancestry_file_id",
+          "primary_key": true,
+          "data_type": "string",
+          "references": "from: md5sum",
+          "notes": "AnVIL requires entity id with naming convention <table_name>_id"
+        },
+        {
+          "column": "md5sum",
+          "required": true,
+          "description": "A valid md5 checksum",
+          "data_type": "string",
+          "examples": "49ea8cf53801c7f1e2f11336fb8a29c8",
+          "notes": "(32-digit hexadecimal number)"
+        },
+        {
+          "column": "genetic_ancestry_analysis_id",
+          "required": true,
+          "description": "unique identifier for an analysis in primed",
+          "data_type": "string",
+          "references": "> genetic_ancestry_analysis.genetic_ancestry_analysis_id",
+          "notes": "AnVIL upload workflow would generate this"
+        },
+        {
+          "column": "file_path",
+          "required": true,
+          "description": "File path in cloud storage",
+          "data_type": "string",
+          "is_bucket_path": true
+        },
+        {
+          "column": "file_type",
+          "required": true,
+          "description": "Type of the file",
+          "data_type": "string",
+          "examples": ["SNP loadings", "PCs", "admixture proportions"]
+        },
+        {
+          "column": "n_variants",
+          "description": "Count of variants in the GSR data file",
+          "data_type": "integer",
+          "examples": "15281216",
+          "notes": "If applicable (e.g. SNP loading files)"
+        },
+        {
+          "column": "n_samp",
+          "description": "Count of samples in the GSR data file",
+          "data_type": "integer",
+          "examples": "4500",
+          "notes": "If applicable (e.g. individual-level PC files)"
+        }
+      ]
     }
   ]
 }
diff --git a/json_to_dbml.R b/json_to_dbml.R
index 9f1cf9a..e67bd4c 100644
--- a/json_to_dbml.R
+++ b/json_to_dbml.R
@@ -1,8 +1,8 @@
 #remotes::install_github("UW-GAC/AnvilDataModels")
 
 #prefix <- "PRIMED_genotype_data_model"
-prefix <- "PRIMED_phenotype_data_model"
-#prefix <- "PRIMED_GSR_data_model"
+#prefix <- "PRIMED_phenotype_data_model"
+prefix <- "PRIMED_GSR_data_model"
 
 # check that data model object can be created
 AnvilDataModels::json_to_dm(paste0(prefix, ".json"))
diff --git a/sheets_to_JSON_gsr.R b/sheets_to_JSON_gsr.R
index d79da87..b942054 100644
--- a/sheets_to_JSON_gsr.R
+++ b/sheets_to_JSON_gsr.R
@@ -9,18 +9,15 @@ library(jsonlite)
 url <- "https://docs.google.com/spreadsheets/d/1xfSQqRQIq6pGkJ5jzzv2QhetmX5boaEZoNECpDwXe5I"
 model_name <- "PRIMED GSR Data Model"
 model_description <- "Data model for Genomic Summary Results in the PRIMED consortium"
-model_version <- "1.1"
+model_version <- "2.0"
 
 
 # table metadata
-meta <- tibble(
-    table=c("analysis", "gsr_file", "gsr_files_dd"),
-    required=c("TRUE", "TRUE", "FALSE")
-)
+meta <- read_sheet(url, sheet="Tables", skip=1, col_types="c") %>%
+    select(table=Table, required=Required)
 table_names <- meta$table
 tables <- lapply(table_names, function(x) read_sheet(url, sheet=x, skip=1, col_types="c"))
 names(tables) <- table_names
-rm(list = c("table_names", "url"))
 
 
 # rename and reorder columns
@@ -60,7 +57,6 @@ for (i in 1:length(tables)) {
     tables[[i]] <- tmp %>%
         select(any_of(keep_cols))
 }
-rm(list = c("tmp"))
 
 
 # call in the sheets_to_list function that accepts two arguments:
@@ -68,7 +64,6 @@ rm(list = c("tmp"))
 # 2) the list of data tables corresponding to the first argument
 source("sheets_to_list.R")
 tab_list <- sheets_to_list(apply(meta, 1, as.list), tables)
-rm(list = c("meta", "tables", "sheets_to_list"))
 
 
 # initialize leading text
@@ -81,7 +76,6 @@ master <- list(
     # Data Table Details
     tables = tab_list
 )
-rm(list = c("tab_list"))
 
 
 # compile master file in JSON format
@@ -89,7 +83,6 @@ out <- toJSON(x = master,
               pretty = TRUE,
               auto_unbox = TRUE,
               unbox = TRUE)
-rm(list = c("master"))
 
 
 # unquote the logical parameters TRUE and FALSE

From 5ef562640e8d7df06cb3758255837acde926a45e Mon Sep 17 00:00:00 2001
From: "Stephanie M. Gogarten" <sdmorris@uw.edu>
Date: Wed, 26 Jun 2024 13:59:06 -0700
Subject: [PATCH 2/2] updates to genetic ancestry analysis

---
 PRIMED_GSR_data_model.json | 185 ++++++++++++++++++++++++++++++++++---
 1 file changed, 173 insertions(+), 12 deletions(-)

diff --git a/PRIMED_GSR_data_model.json b/PRIMED_GSR_data_model.json
index 3600ace..1d45627 100644
--- a/PRIMED_GSR_data_model.json
+++ b/PRIMED_GSR_data_model.json
@@ -156,7 +156,7 @@
         },
         {
           "column": "dbsnp_build_version",
-          "description": "dbSNP build for the rsIDS included in GSR files",
+          "description": "dbSNP build for the rsIDs included in GSR files",
           "data_type": "string"
         },
         {
@@ -395,7 +395,7 @@
           "data_type": "string",
           "multi_value_delimiter": "|",
           "examples": "a60fb66cd539ad2c",
-          "notes": "From Genotype data model: array_dataset_id, imputation_dataset_id, or sequencing_dataset_id"
+          "notes": "From PRIMED inventory workspace"
         },
         {
           "column": "analysis_workspace_id",
@@ -751,6 +751,175 @@
           "description": "email of the PRIMED contributor who can be contacted for data related questions",
           "data_type": "string",
           "notes": "recommended by WG. helpful when GSR is not publicly released and is from a Biobank or some other source"
+        },
+        {
+          "column": "reference_assembly",
+          "required": true,
+          "description": "Reference genome assembly that the submitted data is mapped to",
+          "data_type": "enumeration",
+          "enumerations": ["GRCh38", "GRCh37", "NCBI36", "NCBI35", "NCBI34"],
+          "examples": "GRCh38"
+        },
+        {
+          "column": "dbsnp_build_version",
+          "description": "dbSNP build for the rsIDs included in GSR files",
+          "data_type": "string"
+        },
+        {
+          "column": "n_variants",
+          "required": true,
+          "description": "Total number of variants in the analysis results across all chromosomes",
+          "data_type": "integer",
+          "examples": "1000000",
+          "notes": "This will be used for a QC step to check data integrity of submitted data"
+        },
+        {
+          "column": "min_MAF_filter",
+          "description": "minimum minor allele frequency filter",
+          "data_type": "float",
+          "examples": "0.01"
+        },
+        {
+          "column": "min_MAC_filter",
+          "description": "minimum minor allele count filter",
+          "data_type": "integer",
+          "examples": "20"
+        },
+        {
+          "column": "LD_max_r2_filter",
+          "description": "maximum r^2 for SNPs kept in LD pruning",
+          "data_type": "float",
+          "examples": "0.1"
+        },
+        {
+          "column": "genotyping_technology",
+          "required": true,
+          "description": "The genotyping technology used for detecting variants",
+          "data_type": "enumeration",
+          "enumerations": ["genome-wide array", "WGS", "exome array", "WES", "other array"],
+          "multi_value_delimiter": "|"
+        },
+        {
+          "column": "genotyping_platform",
+          "required": true,
+          "description": "Genotyping platform description including manufacturer, array name, sequencer name",
+          "data_type": "string",
+          "multi_value_delimiter": "|",
+          "examples": ["Illumina GSA", "Illumina Omni-2.5", "Illumina HiSeq", "Affymetrix"],
+          "notes": "put 'unavailable' if unknown"
+        },
+        {
+          "column": "is_imputed",
+          "required": true,
+          "description": "Indicator of whether the analysis was performed using imputed genotypes or dosages",
+          "data_type": "boolean",
+          "enumerations": ["TRUE", "FALSE"],
+          "examples": true
+        },
+        {
+          "column": "imputation_reference_panel",
+          "required": "CONDITIONAL (is_imputed = TRUE)",
+          "description": "Reference panel used for imputation",
+          "data_type": "enumeration",
+          "enumerations": ["1000 Genomes", "HRC", "TOPMed", "Other"],
+          "examples": "TOPMed",
+          "notes": "Put 'Other' if not one of the common reference panels listed. Can include further information in imputation_reference_panel_detail or README"
+        },
+        {
+          "column": "imputation_reference_panel_detail",
+          "required": "CONDITIONAL (is_imputed = TRUE)",
+          "description": "Details of the imputation reference panel; e.g. version number or name of panel when imputation_reference_panel = 'Other'",
+          "data_type": "string",
+          "examples": "TOPMed r2",
+          "notes": "version number or name of 'other' include N/A"
+        },
+        {
+          "column": "imputation_quality_filter",
+          "required": "CONDITIONAL (is_imputed = TRUE)",
+          "description": "minimum imputation quality value (e.g. Rsq, info) for filtering imputed variants",
+          "data_type": "float",
+          "examples": "0.3",
+          "notes": "If no filter, enter value of 0"
+        },
+        {
+          "column": "n_samp",
+          "required": true,
+          "description": "Total sample size in the analysis",
+          "data_type": "integer",
+          "notes": "When different markers have different sample sizes, e.g, due to missing genotypes, use max sample size across markers"
+        },
+        {
+          "column": "cohorts",
+          "required": true,
+          "description": "A list of cohorts that collected the samples.",
+          "data_type": "string",
+          "multi_value_delimiter": "|"
+        },
+        {
+          "column": "population_descriptor",
+          "required": true,
+          "description": "the concept or classification scheme used to categorize people into populations for this analysis",
+          "data_type": "string",
+          "examples": "reported ancestry"
+        },
+        {
+          "column": "population_labels",
+          "required": true,
+          "description": "name given to a population that describes or classifies it according to the dimension along which it was identified",
+          "data_type": "string",
+          "multi_value_delimiter": "|",
+          "examples": "Chinese Americans | European Americans | African Americans"
+        },
+        {
+          "column": "population_proportions",
+          "description": "proportion of participants from each population in the same order mapping to the values in the population_labels variable",
+          "data_type": "string",
+          "multi_value_delimiter": "|",
+          "examples": "0.7 | 0.1 | 0.2"
+        },
+        {
+          "column": "countries_of_recruitment",
+          "required": true,
+          "description": "Reported countries of recruitment",
+          "data_type": "string",
+          "multi_value_delimiter": "|",
+          "examples": "Ghana | Kenya | Nigeria"
+        },
+        {
+          "column": "countries_of_birth",
+          "description": "Reported countries of birth",
+          "data_type": "string",
+          "multi_value_delimiter": "|",
+          "examples": "Ghana | Kenya | Nigeria"
+        },
+        {
+          "column": "analysis_method",
+          "required": true,
+          "description": "The name or description of the method or computational algorithm used for genetic ancestry analysis.",
+          "data_type": "string",
+          "multi_value_delimiter": "|",
+          "examples": ["linear regression", "logistic regression", "LMM", "GLMM", "meta-analysis"]
+        },
+        {
+          "column": "analysis_software",
+          "description": "The name of the software used for the genetic ancestry analysis",
+          "data_type": "string",
+          "multi_value_delimiter": "|",
+          "examples": ["GCTA", "PLINK", "GENESIS", "SAIGE"]
+        },
+        {
+          "column": "primed_dataset_id",
+          "description": "For analyses that used a dataset in primed individual data model indicate its dataset_id",
+          "data_type": "string",
+          "multi_value_delimiter": "|",
+          "examples": "a60fb66cd539ad2c",
+          "notes": "From PRIMED inventory workspace"
+        },
+        {
+          "column": "analysis_workspace_id",
+          "description": "workspace identifier for the analysis that was generated in PRIMED",
+          "data_type": "string",
+          "examples": "primed-analysis/PRIMED_ANALYSIS_GERA-topmed-v3-imputation"
         }
       ]
     },
@@ -792,21 +961,13 @@
           "required": true,
           "description": "Type of the file",
           "data_type": "string",
-          "examples": ["SNP loadings", "PCs", "admixture proportions"]
+          "examples": ["SNP loadings", "allele frequencies"]
         },
         {
           "column": "n_variants",
           "description": "Count of variants in the GSR data file",
           "data_type": "integer",
-          "examples": "15281216",
-          "notes": "If applicable (e.g. SNP loading files)"
-        },
-        {
-          "column": "n_samp",
-          "description": "Count of samples in the GSR data file",
-          "data_type": "integer",
-          "examples": "4500",
-          "notes": "If applicable (e.g. individual-level PC files)"
+          "examples": "15281216"
         }
       ]
     }