diff --git a/CHANGELOG.md b/CHANGELOG.md index 8631b36..69fc7c7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,58 +1,90 @@ # Change log +## 1.5 + +|Table |Field |Change notes | +|:-------------------------|:------------------------|:------------------------------------------------------| +|genetic_findings |variant_type |added SV as allowable type | +|genetic_findings |sv_type |added; not required | +|genetic_findings |chrom |changed from required to conditional | +|genetic_findings |chrom_end |added; not required | +|genetic_findings |pos |changed from required to conditional | +|genetic_findings |pos_end |added; not required | +|genetic_findings |ref |changed from required to conditional | +|genetic_findings |alt |changed from required to conditional | +|genetic_findings |copy_number |added; not required | +|genetic_findings |gene_of_interest |renamed 'gene' field, made conditional, expanded notes | +|genetic_findings |hgvs |added; not required | +|genetic_findings |zygosity |added to list of allowable enumerations | +|genetic_findings |gene_disease_validity |added; not required | +|experiment_dna_short_read |sequencing_event_details |added; not required | + + +## 1.4.2 + +|Table |Field |Change notes | +|:-----------|:-------------|:----------------------------| +|participant |gregor_center |add UCI to enumerated values | + + +## 1.4.1 + +|Table |Field |Change notes | +|:-----------|:------------|:---------------------------| +|participant |solve_status |change to enumerated values | + + ## 1.4 -|Table |Field |Change notes | -|:--------------------------|:---------------------------|:--------------------------------------------------------------------------------------------------| -|experiement_rna_short_read |total_reads |change data type to float to accommodate validation for numbers of reads >2^31 | -|phenotype |syndromic |added; not required | -|aligned |aligned_file |will be populated by DCC in post-processing - no impact on data submitters. 
| -|aligned |aligned_file_index |will be populated by DCC in post-processing - no impact on data submitters. | -|experiment_dna_short_read |experiment_sample_id |field now required | -|experiment_dna_short_read |sequencing_event_details |added; not required | -|experiment_rna_short_read |total_reads |changed data type to float for large number support | -|experiment_nanopore |seq_library_prep_kit_method |added 'Unknown' ennumeration to allow for external data received where library prep kit is unknown | -|aligned_nanopore |quality_issues |added; not required | -|experiment_pac_bio |size_selection_method |added; not required | -|experiment_pac_bio |library_size |added; not required | -|experiment_pac_bio |smrt_cell_kit |added; not required | -|experiment_pac_bio |smrt_cell_id |added; not required | -|experiment_pac_bio |movie_name |added; not required | -|experiment_pac_bio |polymerase_kit |added; not required | -|experiment_pac_bio |sequencing_kit |added; not required | -|experiment_pac_bio |movie_length_hours |added; not required | -|experiment_pac_bio |includes_kinetics |added; not required | -|experiment_pac_bio |includes_CpG_methylation |added; not required | -|experiment_pac_bio |by_strand |added; not required | -|participant |solve_status |change to enumerated values | -|participant |gregor_center |add UCI to enumerated values | +|Table |Field |Change notes | +|:--------------------------|:------------------------------|:--------------------------------------------------------------------------------------------------| +|experiement_rna_short_read |total_reads |change data type to float to accommodate validation for numbers of reads >2^31 | +|genetic_findings |hgvsc |added; not required | +|genetic_findings |hgvsp |added; not required | +|genetic_findings |zygosity |added; not required | +|genetic_findings |known_condition_name |required if gene_known_for_phenotype = Known | +|genetic_findings |condition_id |required if gene_known_for_phenotype = Known | 
+|genetic_findings |condition_inheritance |required if gene_known_for_phenotype = Known | +|genetic_findings |partial_contribution_explained |required if phenotype_contribution = Partial | +|phenotype |syndromic |added; not required | +|aligned |aligned_file |will be populated by DCC in post-processing - no impact on data submitters. | +|aligned |aligned_file_index |will be populated by DCC in post-processing - no impact on data submitters. | +|experiment_dna_short_read |experiment_sample_id |field now required | +|experiment_dna_short_read |sequencing_event_details |added; not required | +|experiment_rna_short_read |total_reads |changed data type to float for large number support | +|experiment_nanopore |seq_library_prep_kit_method |added 'Unknown' ennumeration to allow for external data received where library prep kit is unknown | +|aligned_nanopore |quality_issues |added; not required | +|experiment_pac_bio |size_selection_method |added; not required | +|experiment_pac_bio |library_size |added; not required | +|experiment_pac_bio |smrt_cell_kit |added; not required | +|experiment_pac_bio |smrt_cell_id |added; not required | +|experiment_pac_bio |movie_name |added; not required | +|experiment_pac_bio |polymerase_kit |added; not required | +|experiment_pac_bio |sequencing_kit |added; not required | +|experiment_pac_bio |movie_length_hours |added; not required | +|experiment_pac_bio |includes_kinetics |added; not required | +|experiment_pac_bio |includes_CpG_methylation |added; not required | +|experiment_pac_bio |by_strand |added; not required | ## 1.3 -|Table |Field |Change notes | -|:-------------------------------|:------------------------------|:---------------------------------------------------------------------------------------------------------------------------------------------| -|genetic_findings |all |table added | -|experiment_pac_bio |all |table added | -|aligned_pac_bio |all |table added | -|aligned_pac_bio_set |all |table added | 
-|called_variants_pac_bio |all |table added | -|experiment_atac_short_read |all |table added | -|aligned_atac_short_read |all |table added | -|called_peaks_atac_short_read |all |table added | -|allele_specific_atac_short_read |all |table added | -|experiment |table_name |DCC added 'experiment_nanopore, experiment_pac_bio, and experiment_atac_short_read' as valid experiment types - no impact to data submitters. | -|aligned |all |table added - will be populated by DCC in post-processing - no impact on data submitters. | -|participant |solve_status |required field added | -|participant |missing_variant_case |required field added | -|participant |missing_variant_details |optional field added | -|genetic_findings |hgvsc |added; not required | -|genetic_findings |hgvsp |added; not required | -|genetic_findings |zygosity |added; not required | -|genetic_findings |known_condition_name |required if gene_known_for_phenotype = Known | -|genetic_findings |condition_id |required if gene_known_for_phenotype = Known | -|genetic_findings |condition_inheritance |required if gene_known_for_phenotype = Known | -|genetic_findings |partial_contribution_explained |required if phenotype_contribution = Partial | +|Table |Field |Change notes | +|:-------------------------------|:-----------------------|:---------------------------------------------------------------------------------------------------------------------------------------------| +|genetic_findings |all |table added | +|experiment_pac_bio |all |table added | +|aligned_pac_bio |all |table added | +|aligned_pac_bio_set |all |table added | +|called_variants_pac_bio |all |table added | +|experiment_atac_short_read |all |table added | +|aligned_atac_short_read |all |table added | +|called_peaks_atac_short_read |all |table added | +|allele_specific_atac_short_read |all |table added | +|experiment |table_name |DCC added 'experiment_nanopore, experiment_pac_bio, and experiment_atac_short_read' as valid experiment types - no impact to 
data submitters. | +|aligned |all |table added - will be populated by DCC in post-processing - no impact on data submitters. | +|participant |solve_status |required field added | +|participant |missing_variant_case |required field added | +|participant |missing_variant_details |optional field added | ## 1.2 diff --git a/GREGoR_data_model.json b/GREGoR_data_model.json index 48ea1f1..7ea2fef 100644 --- a/GREGoR_data_model.json +++ b/GREGoR_data_model.json @@ -1,7 +1,7 @@ { "name": "GREGoR Data Model", "description": "Data model for the GREGoR consortium", - "version": "1.4.2", + "version": "1.5", "tables": [ { "table": "participant", @@ -343,9 +343,15 @@ "column": "variant_type", "required": true, "data_type": "enumeration", - "enumerations": ["SNV/INDEL", "SV", "CNV", "RE", "MEI"], + "enumerations": ["SNV/INDEL", "RE", "SV"], "examples": "SNV/INDEL", - "notes": "Current data model only supports SNV/INDEL variant types. \n-Other variant type in process." + "notes": "SNV/INDEL: single nucleotide variants or short insertion/deletions (<50bp)\nRE: repeat elements\nSV: structural variants, including copy number variants and mobile element insertions" + }, + { + "column": "sv_type", + "data_type": "enumeration", + "enumerations": ["BND", "CNV", "CPX", "CTX", "DEL", "DUP", "INS", "INS:ME", "INS:ME:ALU", "INS:ME:LINE1", "INS:ME:SVA", "INS:UNK", "INV"], + "notes": "SV-vcf code descriptions:\nBND = 'Translocation'\nCNV = 'Copy Number Variant'\nCPX = 'Complex SV'\nCTX = 'Reciprocal chromosomal translocation'\nDEL = 'Deletion'\nDUP = 'Duplication'\nINS = 'Insertion'\nINS:ME = 'Mobile element insertion of unspecified ME class'\nINS:ME:ALU = 'Alu element insertion'\nINS:ME:LINE1 = 'LINE1 element insertion'\nINS:ME:SVA = 'SVA element insertion'\nINS:UNK = 'Sequence insertion of unspecified origin'\nINV = 'Inversion'" }, { "column": "variant_reference_assembly", @@ -358,33 +364,51 @@ }, { "column": "chrom", - "required": true, + "required": "CONDITIONAL (variant_type = SNV/INDEL,
variant_type = RE)", "description": "Chromosome of the variant", "data_type": "enumeration", "enumerations": ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "15", "16", "17", "18", "19", "20", "21", "22", "X", "Y", "MT"], "examples": "4" }, + { + "column": "chrom_end", + "description": "End position chromosome of SV", + "data_type": "enumeration", + "enumerations": ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "15", "16", "17", "18", "19", "20", "21", "22", "X", "Y", "MT"], + "notes": "Optional field to report second chromosome in SV" + }, { "column": "pos", - "required": true, + "required": "CONDITIONAL (variant_type = SNV/INDEL, variant_type = RE)", "description": "Start position of the variant", "data_type": "integer", "examples": "25145092" }, + { + "column": "pos_end", + "description": "End position of SV", + "data_type": "integer" + }, { "column": "ref", - "required": true, + "required": "CONDITIONAL (variant_type = SNV/INDEL, variant_type = RE)", "description": "Reference allele of the variant", "data_type": "string", "examples": "C" }, { "column": "alt", - "required": true, + "required": "CONDITIONAL (variant_type = SNV/INDEL, variant_type = RE)", "description": "Alternate position of the variant", "data_type": "string", "examples": "T" }, + { + "column": "copy_number", + "description": "CNV copy number", + "data_type": "integer", + "notes": "To indicate copy number for CNVs that aren't covered by DUP or DEL, such as a triplication" + }, { "column": "ClinGen_allele_ID", "description": "ClinGen Allele ID for cross table refrence", @@ -392,12 +416,13 @@ "examples": "CA2877328" }, { - "column": "gene", - "required": true, - "description": "HGNC approved gene symbol, if known", + "column": "gene_of_interest", + "required": "CONDITIONAL (variant_type = SNV/INDEL, variant_type = RE)", + "description": "HGNC approved symbol of the known or candidate gene(s) that are relevant for the observed phenotype.", 
"data_type": "string", + "multi_value_delimiter": "|", "examples": "SEPSECS", - "notes": "HGNC symbol related to the identified variant in context of the proposed condition \nIf non-coding and intergenic, use 'intergenic'. \nIf non-coding and not intergenic, use 'other'. \n\n- Currently only single genes are allowed. Multi-gene and multi-locus variants are being discussed with the SV/CNV data model. \n- In situations where the variant overlaps two equally viable gene candidates on a single site, we suggest listing a separate discovery entry for each gene." + "notes": "- If the SNV/INDEL or RE is intergenic with no clear gene of interest, use 'intergenic'\n- If the SV has multiple genes of interest, use a multi-value delimiter\n- If the SV has no specific gene of interest, leave blank" }, { "column": "transcript", @@ -419,12 +444,18 @@ "examples": "p.Leu282=", "notes": "May use 'p.?' for non-coding variants" }, + { + "column": "hgvs", + "description": "genomic HGVS description of the variant", + "data_type": "string", + "notes": "Strongly encouraged for complex SVs (SV_type=CPX). \nCan be used to show uncertainty in SV breakends (g.(?_234567)_(345678_?)del)" + }, { "column": "zygosity", "required": true, "description": "Zygosity of variant", "data_type": "enumeration", - "enumerations": ["Heterozygous", "Homozygous", "Hemizygous", "Heteroplasmy", "Homoplasmy", "Mosaic"], + "enumerations": ["Heterozygous", "Homozygous", "Hemizygous", "Heteroplasmy", "Homoplasmy", "Mosaic", "Unknown"], "examples": "Heterozygous" }, { @@ -504,6 +535,13 @@ "data_type": "string", "notes": "Field to be updated in subsequent data submissions. TBD requirement timeline. Currently optional, to consider whether required if already submitted." 
}, + { + "column": "gene_disease_validity", + "description": "Validity assessment of the gene-disease relationship", + "data_type": "enumeration", + "enumerations": ["Definitive", "Strong", "Moderate", "Limited", "Disputed", "Animal Model Only", "Refuted"], + "notes": "Gene-disease validity as defined by the ClinGen framework. \nEach RC can look it up in GenCC (search.thegencc.org; PMID 35507016) or curate it themselves. \nAcceptable to use unpublished evidence/case data if an RC would like to curate a novel gene-disease association. \n\nCurrently optional; may change to required if gene_known_for_phenotype = Known" + }, { "column": "public_database_other", "description": "Public databases that this variant in this participant has been submitted by the RC.", @@ -829,6 +867,7 @@ }, { "column": "sequencing_event_details", + "description": "describe if there are any sequencing-specific issues that would be important to note", "data_type": "string" } ] @@ -1987,7 +2026,8 @@ "required": true, "description": "identifier for experiment set", "data_type": "string", - "references": "> aligned_pac_bio_set.aligned_pac_bio_set_id" + "references": "> aligned_pac_bio_set.aligned_pac_bio_set_id", + "is_unique": true }, { "column": "called_variants_dna_file", @@ -2251,7 +2291,8 @@ "required": true, "description": "identifier for aligned ATAC-seq data", "data_type": "string", - "references": "> aligned_atac_short_read.aligned_atac_short_read_id" + "references": "> aligned_atac_short_read.aligned_atac_short_read_id", + "is_unique": true }, { "column": "called_peaks_file", diff --git a/change_log.R b/change_log.R index 60fa94c..49974ad 100644 --- a/change_log.R +++ b/change_log.R @@ -2,13 +2,14 @@ library(googlesheets4) library(dplyr) library(readr) -url <- "https://docs.google.com/spreadsheets/d/1NcB5pz7rWr2AJpjmDFu6v5E5pGVWmAoaS2kH-pF86H8" +url <- "https://docs.google.com/spreadsheets/d/1rC5ZgpzVMXiWDbzO8SQx_LyH566ZFoxkZk7O9xjOYr4" -log <- read_sheet(url, sheet="Change Log") %>% +log <-
read_sheet(url, sheet="Change Log", col_types="cccc") %>% filter(!is.na(Version)) %>% + mutate(Version = as.character(Version)) %>% mutate(`Change notes` = gsub('"', "'", `Change notes`)) %>% filter(!(Version == "1.3" & Table == "genetic_findings")) %>% - mutate(Version = ifelse(Table == "genetic_findings", 1.3, Version)) %>% + mutate(Version = ifelse(Table == "genetic_findings" & Version == "1.2", "1.3", Version)) %>% arrange(Version) con <- file("CHANGELOG.md", "w") diff --git a/sheets_to_JSON.R b/sheets_to_JSON.R index fa22fd9..3dd28c3 100644 --- a/sheets_to_JSON.R +++ b/sheets_to_JSON.R @@ -4,10 +4,10 @@ library(tidyr) library(stringr) library(jsonlite) -url <- "https://docs.google.com/spreadsheets/d/1NcB5pz7rWr2AJpjmDFu6v5E5pGVWmAoaS2kH-pF86H8" +url <- "https://docs.google.com/spreadsheets/d/1rC5ZgpzVMXiWDbzO8SQx_LyH566ZFoxkZk7O9xjOYr4" model_name = "GREGoR Data Model" model_description = "Data model for the GREGoR consortium" -model_version = "1.4.2" +model_version = "1.5" # table metadata meta <- read_sheet(url, sheet="Table overview/status", skip=1)