From 5ef562640e8d7df06cb3758255837acde926a45e Mon Sep 17 00:00:00 2001 From: "Stephanie M. Gogarten" Date: Wed, 26 Jun 2024 13:59:06 -0700 Subject: [PATCH] updates to genetic ancestry analysis --- PRIMED_GSR_data_model.json | 185 ++++++++++++++++++++++++++++++++++--- 1 file changed, 173 insertions(+), 12 deletions(-) diff --git a/PRIMED_GSR_data_model.json b/PRIMED_GSR_data_model.json index 3600ace..1d45627 100644 --- a/PRIMED_GSR_data_model.json +++ b/PRIMED_GSR_data_model.json @@ -156,7 +156,7 @@ }, { "column": "dbsnp_build_version", - "description": "dbSNP build for the rsIDS included in GSR files", + "description": "dbSNP build for the rsIDs included in GSR files", "data_type": "string" }, { @@ -395,7 +395,7 @@ "data_type": "string", "multi_value_delimiter": "|", "examples": "a60fb66cd539ad2c", - "notes": "From Genotype data model: array_dataset_id, imputation_dataset_id, or sequencing_dataset_id" + "notes": "From PRIMED inventory workspace" }, { "column": "analysis_workspace_id", @@ -751,6 +751,175 @@ "description": "email of the PRIMED contributor who can be contacted for data related questions", "data_type": "string", "notes": "recommended by WG. 
helpful when GSR is not publicly released and is from a Biobank or some other source" + }, + { + "column": "reference_assembly", + "required": true, + "description": "Reference genome assembly that the submitted data is mapped to", + "data_type": "enumeration", + "enumerations": ["GRCh38", "GRCh37", "NCBI36", "NCBI35", "NCBI34"], + "examples": "GRCh38" + }, + { + "column": "dbsnp_build_version", + "description": "dbSNP build for the rsIDs included in GSR files", + "data_type": "string" + }, + { + "column": "n_variants", + "required": true, + "description": "Total number of variants in the analysis results across all chromosomes", + "data_type": "integer", + "examples": "1000000", + "notes": "This will be used for a QC step to check data integrity of submitted data" + }, + { + "column": "min_MAF_filter", + "description": "minimum minor allele frequency filter", + "data_type": "float", + "examples": "0.01" + }, + { + "column": "min_MAC_filter", + "description": "minimum minor allele count filter", + "data_type": "integer", + "examples": "20" + }, + { + "column": "LD_max_r2_filter", + "description": "maximum r^2 for SNPs kept in LD pruning", + "data_type": "float", + "examples": "0.1" + }, + { + "column": "genotyping_technology", + "required": true, + "description": "The genotyping technology used for detecting variants", + "data_type": "enumeration", + "enumerations": ["genome-wide array", "WGS", "exome array", "WES", "other array"], + "multi_value_delimiter": "|" + }, + { + "column": "genotyping_platform", + "required": true, + "description": "Genotyping platform description including manufacturer, array name, sequencer name", + "data_type": "string", + "multi_value_delimiter": "|", + "examples": ["Illumina GSA", "Illumina Omni-2.5", "Illumina HiSeq", "Affymetrix"], + "notes": "put 'unavailable' if unknown" + }, + { + "column": "is_imputed", + "required": true, + "description": "Indicator of whether the analysis was performed using imputed genotypes or dosages", + 
"data_type": "boolean", + "enumerations": ["TRUE", "FALSE"], + "examples": true + }, + { + "column": "imputation_reference_panel", + "required": "CONDITIONAL (is_imputed = TRUE)", + "description": "Reference panel used for imputation", + "data_type": "enumeration", + "enumerations": ["1000 Genomes", "HRC", "TOPMed", "Other"], + "examples": "TOPMed", + "notes": "Put 'Other' if not one of the common reference panels listed. Can include further information in imputation_reference_panel_detail or README" + }, + { + "column": "imputation_reference_panel_detail", + "required": "CONDITIONAL (is_imputed = TRUE)", + "description": "Details of the imputation reference panel; e.g. version number or name of panel when imputation_reference_panel = 'Other'", + "data_type": "string", + "examples": "TOPMed r2", + "notes": "version number or name of 'other' include N/A" + }, + { + "column": "imputation_quality_filter", + "required": "CONDITIONAL (is_imputed = TRUE)", + "description": "minimum imputation quality value (e.g. 
Rsq, info) for filtering imputed variants", + "data_type": "float", + "examples": "0.3", + "notes": "If no filter, enter value of 0" + }, + { + "column": "n_samp", + "required": true, + "description": "Total sample size in the analysis", + "data_type": "integer", + "notes": "When different markers have different sample sizes, e.g., due to missing genotypes, use max sample size across markers" + }, + { + "column": "cohorts", + "required": true, + "description": "A list of cohorts that collected the samples.", + "data_type": "string", + "multi_value_delimiter": "|" + }, + { + "column": "population_descriptor", + "required": true, + "description": "the concept or classification scheme used to categorize people into populations for this analysis", + "data_type": "string", + "examples": "reported ancestry" + }, + { + "column": "population_labels", + "required": true, + "description": "name given to a population that describes or classifies it according to the dimension along which it was identified", + "data_type": "string", + "multi_value_delimiter": "|", + "examples": "Chinese Americans | European Americans | African Americans" + }, + { + "column": "population_proportions", + "description": "proportion of participants from each population in the same order mapping to the values in the population_labels variable", + "data_type": "string", + "multi_value_delimiter": "|", + "examples": "0.7 | 0.1 | 0.2" + }, + { + "column": "countries_of_recruitment", + "required": true, + "description": "Reported countries of recruitment", + "data_type": "string", + "multi_value_delimiter": "|", + "examples": "Ghana | Kenya | Nigeria" + }, + { + "column": "countries_of_birth", + "description": "Reported countries of birth", + "data_type": "string", + "multi_value_delimiter": "|", + "examples": "Ghana | Kenya | Nigeria" + }, + { + "column": "analysis_method", + "required": true, + "description": "The name or description of the method or computational algorithm used for genetic ancestry 
analysis.", + "data_type": "string", + "multi_value_delimiter": "|", + "examples": ["linear regression", "logistic regression", "LMM", "GLMM", "meta-analysis"] + }, + { + "column": "analysis_software", + "description": "The name of the software used for the genetic ancestry analysis", + "data_type": "string", + "multi_value_delimiter": "|", + "examples": ["GCTA", "PLINK", "GENESIS", "SAIGE"] + }, + { + "column": "primed_dataset_id", + "description": "For analyses that used a dataset in primed individual data model indicate its dataset_id", + "data_type": "string", + "multi_value_delimiter": "|", + "examples": "a60fb66cd539ad2c", + "notes": "From PRIMED inventory workspace" + }, + { + "column": "analysis_workspace_id", + "description": "workspace identifier for the GWAS that was generated in PRIMED", + "data_type": "string", + "examples": "primed-analysis/PRIMED_ANALYSIS_GERA-topmed-v3-imputation" } ] }, @@ -792,21 +961,13 @@ "required": true, "description": "Type of the file", "data_type": "string", - "examples": ["SNP loadings", "PCs", "admixture proportions"] + "examples": ["SNP loadings", "allele frequencies"] }, { "column": "n_variants", "description": "Count of variants in the GSR data file", "data_type": "integer", - "examples": "15281216", - "notes": "If applicable (e.g. SNP loading files)" - }, - { - "column": "n_samp", - "description": "Count of samples in the GSR data file", - "data_type": "integer", - "examples": "4500", - "notes": "If applicable (e.g. individual-level PC files)" + "examples": "15281216" } ] }