From ac6ccc2fdec13e4f3bdf0f55b456f0d5a9faed9c Mon Sep 17 00:00:00 2001 From: "Stephanie M. Gogarten" Date: Thu, 19 Oct 2023 11:02:31 -0700 Subject: [PATCH 1/4] add genetic_findings, pac_bio, and atac_short_read --- GREGoR_data_model.json | 944 ++++++++++++++++++++++++++++++++++++++++- sheets_to_JSON.R | 16 +- 2 files changed, 946 insertions(+), 14 deletions(-) diff --git a/GREGoR_data_model.json b/GREGoR_data_model.json index ee47edc..1d7064d 100644 --- a/GREGoR_data_model.json +++ b/GREGoR_data_model.json @@ -1,7 +1,7 @@ { "name": "GREGoR Data Model", "description": "Data model for the GREGoR consortium", - "version": "1.2.1", + "version": "1.3", "tables": [ { "table": "participant", @@ -166,6 +166,27 @@ "description": "age in years at which consent was originally obtained", "data_type": "float", "examples": "42" + }, + { + "column": "solve_status", + "required": true, + "description": "Indication of whether the submitting RC considers this case 'solved'", + "data_type": "enumeration", + "enumerations": ["Yes", "Likely", "No", "Partial"], + "examples": "partial" + }, + { + "column": "missing_variant_case", + "required": true, + "description": "Indication of whether this is known to be a missing variant case.", + "data_type": "enumeration", + "enumerations": ["Yes", "No", "Unknown"], + "notes": "MVP cases can include both those initially ascertained prospectively based on expectation of a missing pathogenic variant and those ascertained retrospectively after analysis identifies an expectation of a missing pathogenic variant within a narrow search space.\n\nThe case should meet one of the following criteria:\n\n- Has a LP/P variant identified in a gene underlying a recessive condition consistent with the phenotype observed in the patient, and suspected to be the correct diagnosis, for which a second explanatory LP/P variant has not been identified by prior genetic testing\n- Has a specific clinical diagnosis supported by evidence (e.g. 
biochemical testing) for a dominant condition known to be of single gene etiology for which an explanatory LP/P variant has not been identified by prior genetic testing (very uncommon) \n- Has no LP/P variants in a gene underlying a recessive condition consistent with the phenotype observed in the patient and the recessive condition is associated with a specific clinical diagnosis and has only been attributed to variants in a single gene (e.g. an individual clinically diagnosed with cystic fibrosis but no variants identified in CFTR)\n\nAny cases with potentially explanatory VUS identified as the 'missing variant,' are eligible and should be included (e.g. an individual with a LP/P variant and VUS in trans identified in a gene underlying a recessive condition consistent with the phenotype observed should be flagged for MVP)." + }, + { + "column": "missing_variant_details", + "description": "Text description of what's missing, including a description of region or gene of interest when available", + "data_type": "string" } ] }, @@ -278,6 +299,251 @@ } ] }, + { + "table": "genetic_findings", + "columns": [ + { + "column": "genetic_findings_id", + "primary_key": true, + "required": true, + "description": "Unique ID of this variant in this participant (primary key)", + "data_type": "string", + "examples": "4_25145092_Broad_RGP_1432", + "notes": "RC submitted unique variant ID\n- Useful for tracking in downstream functional studies (i.e. V2F clickup database). 
\n- RC submitted but encourage conventional naming (genomic coordinate + participant id)" + }, + { + "column": "participant_id", + "required": true, + "description": "Subject/Participant Identifier within project", + "data_type": "string", + "references": "> participant.participant_id", + "examples": "Broad_RGP_1432_3", + "notes": "RC submitted" + }, + { + "column": "experiment_id", + "required": true, + "description": "The experiment table and experiment ID(s) in which discovery was identified:\nexperiment_table.id_in_table.\nShould correspond to an experiment_id in the DCC-generated experiment table.", + "data_type": "string", + "multi_value_delimiter": "|", + "examples": ["experiment_dna_short_read.GSS201938-01-021-SG-1", "experiment_nanopore.BCM_00001"], + "notes": "RC submitted. \n- If experiment identifier is not available due to current data model limitations, use line 35 'original_method_of_discovery'\n- If variant is identified in multiple datasets, list the assigned primary experiment_id first and fill the following rows (e.g. allele balance) with data from this experiment\n- Additional information on orthogonal experiments can be noted in line 35 or line 36" + }, + { + "column": "variant_type", + "required": true, + "data_type": "enumeration", + "enumerations": ["SNV/INDEL", "SV", "CNV", "RE", "MEI"], + "examples": "SNV/INDEL", + "notes": "Current data model only supports SNV/INDEL variant types. \n- Other variant types in process." + }, + { + "column": "variant_reference_assembly", + "required": true, + "description": "The genome build for identifying the variant position", + "data_type": "enumeration", + "enumerations": ["GRCh38", "CHM13"], + "examples": ["GRCh38", "CHM13"], + "notes": "Supports hg38 and future human references. If your pipeline uses a reference_assembly not currently listed, please contact the tiger team/DSA WG." 
+ }, + { + "column": "chrom", + "required": true, + "description": "Chromosome of the variant", + "data_type": "enumeration", + "enumerations": ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "15", "16", "17", "18", "19", "20", "21", "22", "X", "Y", "MT"], + "examples": "4" + }, + { + "column": "pos", + "required": true, + "description": "Start position of the variant", + "data_type": "integer", + "examples": "25145092" + }, + { + "column": "ref", + "required": true, + "description": "Reference allele of the variant", + "data_type": "string", + "examples": "C" + }, + { + "column": "alt", + "required": true, + "description": "Alternate allele of the variant", + "data_type": "string", + "examples": "T" + }, + { + "column": "ClinGen_allele_ID", + "description": "ClinGen Allele ID for cross-table reference", + "data_type": "string", + "examples": "CA2877328" + }, + { + "column": "gene", + "required": true, + "description": "HGNC approved gene symbol, if known", + "data_type": "string", + "examples": "SEPSECS", + "notes": "HGNC symbol related to the identified variant in context of the proposed condition \nIf non-coding and intergenic, use 'intergenic'. \nIf non-coding and not intergenic, use 'other'. \n\n- Currently only single genes are allowed. Multi-gene and multi-locus variants are being discussed with the SV/CNV data model. \n- In situations where the variant overlaps two equally viable gene candidates on a single site, we suggest listing a separate discovery entry for each gene." + }, + { + "column": "transcript", + "description": "Text description of transcript overlapping the variant", + "data_type": "string", + "examples": "ENST00000382103.7", + "notes": "Encourage to use MANE transcripts with version number.\n- If your variant does not have MANE transcript, add most relevant transcript." + }, + { + "column": "hgvsc", + "description": "HGVS c. description of the variant (m. 
for mitochondrial)", + "data_type": "string", + "examples": "c.846G>A" + }, + { + "column": "hgvsp", + "description": "HGVS p. description of the variant", + "data_type": "string", + "examples": "p.Leu282=", + "notes": "Use 'p.?' for non-coding variants" + }, + { + "column": "zygosity", + "required": true, + "description": "Zygosity of variant", + "data_type": "enumeration", + "enumerations": ["Heterozygous", "Homozygous", "Hemizygous", "Heteroplasmy", "Homoplasmy", "Mosaic"], + "examples": "Heterozygous" + }, + { + "column": "allele_balance_or_heteroplasmy_percentage", + "description": "Reported allele balance (mosaic) or heteroplasmy percentage (mitochondrial)", + "data_type": "float", + "notes": "Optional reporting of allele balance for mosaic or mitochondrial variants. \n- If the variant is found mosaic in a parent sample, use the free notes line 36 to report parental allele balance." + }, + { + "column": "variant_inheritance", + "description": "Detection of variant in parents", + "data_type": "enumeration", + "enumerations": ["de novo", "maternal", "paternal", "biparental", "nonmaternal", "nonpaternal", "unknown"], + "examples": "nonmaternal", + "notes": "- For duos where the variant is only confirmed to be not maternally or paternally inherited, use nonmaternal or nonpaternal, respectively.\n- If the variant is found mosaic in a parent sample, report the patient allele balance in line 20 and use the free notes line 36 to report parental allele balance. 
\n- in scenario where variant is present in parent A and parent B's sequencing is not available (so cannot rule out possibility parent B also carries the variant), this should be 'unknown', since line 34 (additional_family_members_with_variant) should also contain the info that parent A has this variant" + }, + { + "column": "linked_variant", + "description": "Second variant in recessive cases", + "data_type": "string", + "examples": "4_25156922_Broad_RGP_1432", + "notes": "genetic_findings_id of a second variant in the same gene to complete a compound het." + }, + { + "column": "linked_variant_phase", + "data_type": "enumeration", + "enumerations": ["in trans", "in cis", "unknown"], + "examples": "unknown", + "notes": "optional description of phase if inheritance is unknown" + }, + { + "column": "gene_known_for_phenotype", + "required": true, + "description": "Indicate if the gene listed is a candidate or known disease gene. Known disease genes can be identified using OMIM or MONDO or MitoMap. Variant/phenotype in proband should be consistent with the described MOD/phenotype to be considered a known gene for condition.", + "data_type": "enumeration", + "enumerations": ["Known", "Candidate"], + "examples": "Known", + "notes": "- Use OMIM or MONDO or MitoMap as truth source for known disease genes \n- Variant/phenotype/inheritance in the patient should be consistent with the disease entry to be considered 'Known', otherwise use 'Candidate' (e.g. phenotype expansions or novel phenotypes are considered 'Candidate' )" + }, + { + "column": "known_condition_name", + "description": "Free text of condition name. Variant/phenotype/inheritance in proband should be consistent with the condition.", + "data_type": "string", + "examples": "Pontocerebellar hypoplasia type 2D", + "notes": "- Condition name should match the OMIM or MONDO entry listed in line 26 for known disease genes." 
+ }, + { + "column": "condition_id", + "description": "MONDO/OMIM number for condition used for variant interpretation.", + "data_type": "string", + "examples": ["OMIM:613811", "MONDO:0013438"], + "notes": "Only list a MONDO or OMIM ID if the patient's phenotype and inheritance is consistent with the established disease mechanism and gene-phenotype association." + }, + { + "column": "condition_inheritance", + "description": "Description of the expected inheritance of condition used for variant interpretation", + "data_type": "enumeration", + "enumerations": ["Autosomal recessive", "Autosomal dominant", "X-linked", "Mitochondrial", "Y-linked", "Contiguous gene syndrome", "Somatic mosaicism", "Digenic", "Other", "Unknown"], + "multi_value_delimiter": "|", + "examples": "Autosomal recessive", + "notes": "Inheritance description should match the OMIM or MONDO entry listed in line 26. \n- If multiple inheritance types listed in OMIM/MONDO, select that which is suspected for this proband\n- In cases where multiple inheritance patterns are being considered, it is possible to select multiple here" + }, + { + "column": "GREGoR_variant_classification", + "description": "Clinical significance of variant described to condition listed as determined by the RC's variant curation.", + "data_type": "enumeration", + "enumerations": ["Benign", "Likely benign", "Uncertain significance - low", "Uncertain significance - moderate", "Uncertain significance - high", "Uncertain significance", "Likely pathogenic", "Pathogenic", "Curation in progress", "Well-established P/LP"], + "examples": "Curation in progress", + "notes": "Field to be updated in subsequent data submissions. TBD requirement timeline." + }, + { + "column": "GREGoR_ClinVar_SCV", + "description": "ClinVar accession number for the variant curation submitted by your center", + "data_type": "string", + "notes": "Field to be updated in subsequent data submissions. TBD requirement timeline." 
+ }, + { + "column": "public_database_other", + "description": "Public databases to which this variant in this participant has been submitted by the RC.", + "data_type": "string", + "multi_value_delimiter": "|" + }, + { + "column": "public_database_ID_other", + "description": "Public database variant/case ID", + "data_type": "string", + "multi_value_delimiter": "|" + }, + { + "column": "phenotype_contribution", + "description": "Contribution of variant-linked condition to participant's phenotype.", + "data_type": "enumeration", + "enumerations": ["Partial", "Full", "Uncertain"], + "examples": "Full", + "notes": "- Each discovery in a recessive pair is considered full contribution.\n- If the associated gene is not known ('candidate' on line 24), use Uncertain.\n- If the associated gene is known but a second hit is missing, use either Uncertain or Full (depending on your confidence of the gene in the diagnosis)" + }, + { + "column": "partial_contribution_explained", + "description": "List of specific phenotypes (HPO IDs) explained by the condition associated with this variant/gene in cases of partial contribution", + "data_type": "string", + "references": "> phenotype.term_id", + "multi_value_delimiter": "|", + "examples": "HP:0000365", + "notes": "Field used when a genetic finding only partially explains participant phenotype and a second genetic finding is expected. For example, a syndromic case with a well-established pathogenic non-syndromic hearing loss variant. \n- For recessive variants with partial contribution, only one of the linked variants needs to list HPO IDs explained by the linked candidate set." 
+ }, + { + "column": "additional_family_members_with_variant", + "description": "List of related participant IDs carrying the same variant", + "data_type": "string", + "references": "> participant.participant_id", + "multi_value_delimiter": "|" + }, + { + "column": "method_of_discovery", + "description": "The method/assay(s) used to identify the candidate", + "data_type": "enumeration", + "enumerations": ["SR-ES", "SR-GS", "LR-GS", "SNP array", "Optical mapping", "Karyotype", "SR RNA-seq", "LR RNA-seq", "SR-ES-reanalysis", "SR-GS-reanalysis", "LR-GS-reanalysis", "SNP array-reanalysis", "Optical mapping-reanalysis", "Karyotype-reanalysis", "SR RNA-seq-reanalysis", "LR RNA-seq-reanalysis"], + "multi_value_delimiter": "|", + "examples": "SR-GS", + "notes": "Field to be used to capture method of discovery if experiment not currently supported by data model. (eg. >experiment_table_name.id_in_table)" + }, + { + "column": "notes", + "description": "Free text field to explain edge cases or discovery updates or list parallel experiment IDs or list parental allele balance when mosaic... etc.", + "data_type": "string", + "notes": "Data collected in this section will be used to determine data model updates needed for future submission rounds." 
+ } + ] + }, { "table": "analyte", "required": true, @@ -402,7 +668,7 @@ "column": "table_name", "required": true, "data_type": "enumeration", - "enumerations": ["experiment_dna_short_read", "experiment_rna_short_read", "experiment_nanopore"], + "enumerations": ["experiment_dna_short_read", "experiment_rna_short_read", "experiment_nanopore", "experiment_pac_bio", "experiment_atac_short_read"], "examples": "experiment_dna_short_read" }, { @@ -435,7 +701,7 @@ "column": "table_name", "required": true, "data_type": "enumeration", - "enumerations": ["aligned_dna_short_read", "aligned_rna_short_read", "aligned_nanopore"], + "enumerations": ["aligned_dna_short_read", "aligned_rna_short_read", "aligned_nanopore", "aligned_pac_bio", "aligned_atac_short_read"], "examples": "aligned_dna_short_read" }, { @@ -450,6 +716,16 @@ "data_type": "string", "references": "> participant.participant_id", "enumerations": "BCM_H7YG5DSX2" + }, + { + "column": "aligned_file", + "data_type": "string", + "is_bucket_path": true + }, + { + "column": "aligned_index_file", + "data_type": "string", + "is_bucket_path": true } ] }, @@ -843,7 +1119,7 @@ { "column": "5prime3prime_bias", "data_type": "float", - "examples": "?" + "examples": "1.09" }, { "column": "percent_GC", @@ -1246,14 +1522,14 @@ "required": true, "description": "identifier for a set of experiments (primary key)", "data_type": "string", - "notes": "RCs make their own IDs (these must begin with center-specific prefix). \naligned_dna__set_id links the aligned_dna_ table to the called_variants_dna_ table. For centers that are only uploading single sample files, the aligned__set_id and aligned__id values can be identical. For centers uploading multi-sample files, they will need to come up with a value for aligned__set_id that makes sense to them for indicating the sample group for a multi-sample callset, and use that same value in called_variants_." + "notes": "RCs make their own IDs (these must begin with center-specific prefix). 
\naligned_dna_short_read_set_id links the aligned_dna_short_read table to the called_variants_dna_short_read table. For centers that are only uploading single sample files, the aligned_short_read_set_id and aligned_short_read_id values can be identical. For centers uploading multi-sample files, they will need to come up with a value for aligned_short_read_set_id that makes sense to them for indicating the sample group for a multi-sample callset, and use that same value in called_variants_short_read." }, { "column": "aligned_nanopore_id", "required": true, "data_type": "string", "references": "> aligned_nanopore.aligned_nanopore_id", - "notes": "the identifier for a single-sample aligned_dna_ included in the read_set (one per row)" + "notes": "the identifier for a single-sample aligned_dna_short_read included in the read_set (one per row)" } ] }, @@ -1317,6 +1593,662 @@ "data_type": "string" } ] + }, + { + "table": "experiment_pac_bio", + "required": "CONDITIONAL (aligned_pac_bio, aligned_pac_bio_set, called_variants_pac_bio)", + "columns": [ + { + "column": "experiment_pac_bio_id", + "primary_key": true, + "required": true, + "description": "identifier for experiment_short_read (primary key)", + "data_type": "string", + "examples": ["Broad_E1", "Broad_E2", "GSS201938-01-021-SG-1"], + "notes": "RCs make their own IDs, must begin with center abbreviation as defined in participant table; need to be globally unique in consortium; may be generated by prepending experiment_sample_id with center abbreviation" + }, + { + "column": "analyte_id", + "required": true, + "data_type": "string", + "references": "> analyte.analyte_id" + }, + { + "column": "experiment_sample_id", + "description": "identifier used in the data file (e.g. VCF header / LIMS ID)", + "data_type": "string", + "examples": "12339D-SA", + "notes": "may be the same as experiment_short_read_id if the file does contain sample identifiers\nshould be present if downstream file contains a sample_id (e.g. 
BAM, VCF)\nsome centers have one id for the sample (tube) and a diff ID for the sample as named in the VCF; experiment_sample_id = ID in the VCF file; analyte_id = ID for the tube/aliquot/whatever" + }, + { + "column": "seq_library_prep_kit_method", + "required": true, + "description": "Library prep kit used", + "data_type": "enumeration", + "enumerations": ["SMRTbell prep kit 3.0", "HiFI express template prep kit 2.0"], + "examples": "SMRTbell prep kit 3.0", + "notes": "Can be missing if RC receives external data" + }, + { + "column": "fragmentation_method", + "description": "method used for shearing/fragmentation", + "data_type": "string", + "examples": "NA, Covaris g-tube (DNA volume/concentration, RCF's, etc.)" + }, + { + "column": "experiment_type", + "required": true, + "data_type": "enumeration", + "enumerations": ["targeted", "genome", "fiberseq", "isoseq", "masseq"], + "examples": "targeted", + "notes": "While the most common use of PacBio is whole-genome sequencing, targeted sequencing is possible and may be of interest to the consortium." 
+ }, + { + "column": "targeted_regions_method", + "description": "Capture method used.", + "data_type": "string", + "examples": "Twist Alliance Dark Genes Panel" + }, + { + "column": "targeted_region_bed_file", + "description": "name and path of bed file uploaded to workspace", + "data_type": "string", + "is_bucket_path": true, + "examples": "gs://fc-eb352699-d849-483f-aefe-9d35ce2b21ac/LR_experiment.bed", + "notes": "Can be missing if RC receives external data" + }, + { + "column": "date_data_generation", + "description": "Date of data generation (First sequencing date)", + "data_type": "date", + "examples": "2022-06-29", + "notes": "Can be missing if RC receives external data; ISO 8601 date format" + }, + { + "column": "sequencing_platform", + "description": "sequencing platform used for the experiment", + "data_type": "enumeration", + "enumerations": ["PacBio Revio", "PacBio Sequel IIe", "PacBio Sequel II"], + "examples": "PacBio Revio", + "notes": "Can be missing if RC receives external data" + }, + { + "column": "was_barcoded", + "required": true, + "description": "indicates whether samples were barcoded on this flowcell", + "data_type": "boolean", + "examples": true + }, + { + "column": "barcode_kit", + "description": "Barcode kit used", + "data_type": "string", + "enumerations": ["SMRTbell barcoded adapter plate 3.0", "Barcoded M13 primer plate", "Barcoded overhang adapter kit - 8A/8B"], + "examples": "SMRTbell barcoded adapter plate 3.0" + }, + { + "column": "application_kit", + "description": "Library prep kits for special applications", + "data_type": "enumeration", + "enumerations": ["MAS-Seq for 10x Single Cell 3' kit", "Iso-Seq express oligo kit", "SMRTbell gDNA amplification kit"], + "examples": "MAS-Seq for 10x Single Cell 3' kit" + }, + { + "column": "smrtlink_server_version", + "required": true, + "description": "Version number of PacBio SMRTLink software", + "data_type": "string", + "examples": "12.0.0.174552" + }, + { + "column": 
"instrument_ics_version", + "required": true, + "description": "Version number of PacBio instrument control software", + "data_type": "string", + "examples": "12.0.0.173080" + } + ] + }, + { + "table": "aligned_pac_bio", + "required": "CONDITIONAL (aligned_pac_bio_set, called_variants_pac_bio)", + "columns": [ + { + "column": "aligned_pac_bio_id", + "primary_key": true, + "required": true, + "description": "identifier for aligned_short_read (primary key)", + "data_type": "string", + "examples": "BCM_H7YG5DSX2-3-IDUDI0014-1", + "notes": "experiment_short_read_id + alignment indicator" + }, + { + "column": "experiment_pac_bio_id", + "required": true, + "description": "identifier for experiment", + "data_type": "string", + "references": "> experiment_pac_bio.experiment_pac_bio_id" + }, + { + "column": "aligned_pac_bio_file", + "required": true, + "description": "name and path of file with aligned reads", + "data_type": "string", + "is_bucket_path": true, + "examples": "gs://fc-eb352699-d849-483f-aefe-9d35ce2b21ac/Broad_COL_FAM1_1_D1.bam" + }, + { + "column": "aligned_pac_bio_index_file", + "required": true, + "description": "name and path of index file corresponding to aligned reads file", + "data_type": "string", + "is_bucket_path": true, + "examples": "gs://fc-eb352699-d849-483f-aefe-9d35ce2b21ac/Broad_COL_FAM1_1_D1.bam.bai" + }, + { + "column": "md5sum", + "required": true, + "description": "md5 checksum for file", + "data_type": "string", + "examples": "129c28163df082" + }, + { + "column": "reference_assembly", + "required": true, + "data_type": "enumeration", + "enumerations": ["chm13", "GRCh38_noalt", "GRCh38", "GRCh37", "NCBI36", "NCBI35", "NCBI34"], + "examples": "GRCh38" + }, + { + "column": "alignment_software", + "required": true, + "description": "Software including version number", + "data_type": "string", + "examples": "Minimap2-2.24", + "notes": "Can be unknown if RC receives external data" + }, + { + "column": "analysis_details", + "description": 
"brief description of the analysis pipeline used for producing the file; perhaps a DOI or link to something like a WDL file or github repository", + "data_type": "string", + "examples": "DOI:10.5281/zenodo.4469317" + }, + { + "column": "mean_coverage", + "description": "Mean coverage of either the genome or the targeted regions", + "data_type": "float", + "examples": "100", + "notes": "Can be unknown if RC receives external data" + }, + { + "column": "genome_coverage", + "description": "e.g. ≥90% at 10x or 20x; per consortium decision", + "data_type": "integer", + "examples": "93" + }, + { + "column": "contamination", + "description": "Contamination level estimate, e.g. <1% (display raw fraction not percent)", + "data_type": "float", + "examples": "0.01" + }, + { + "column": "sex_concordance", + "description": "Comparison between reported sex vs genotype sex; Other if ploidy NOT XX or XY and Other if sex at birth is not known, thus unable to perform sex concordance", + "data_type": "boolean", + "examples": true + }, + { + "column": "num_reads", + "description": "Total reads (before/ignoring alignment)", + "data_type": "integer", + "examples": "11946552" + }, + { + "column": "num_bases", + "description": "Number of bases (before/ignoring alignment)", + "data_type": "integer", + "examples": "101126719147" + }, + { + "column": "read_length_mean", + "description": "Mean length of all reads (before/ignoring alignment)", + "data_type": "integer", + "examples": "18232" + }, + { + "column": "num_aligned_reads", + "description": "Total aligned reads", + "data_type": "integer", + "examples": "10751896" + }, + { + "column": "num_aligned_bases", + "description": "Number of bases in aligned reads", + "data_type": "integer", + "examples": "91014047232" + }, + { + "column": "aligned_read_length_mean", + "description": "Mean length of aligned reads", + "data_type": "integer", + "examples": "18104" + }, + { + "column": "read_error_rate", + "description": "Mean empirical per-base 
error rate of aligned reads", + "data_type": "float", + "examples": "0.01" + }, + { + "column": "mapped_reads_pct", + "description": "Number between 1 and 100, na", + "data_type": "float", + "examples": "90" + }, + { + "column": "methylation_called", + "required": true, + "description": "Indicates whether 5mC methylation has been called and annotated in the BAM file's MM and ML tags", + "data_type": "boolean", + "examples": true + } + ] + }, + { + "table": "aligned_pac_bio_set", + "required": "CONDITIONAL (called_variants_pac_bio)", + "columns": [ + { + "column": "aligned_pac_bio_set_id", + "primary_key": true, + "required": true, + "description": "identifier for a set of experiments (primary key)", + "data_type": "string", + "notes": "RCs make their own IDs (these must begin with center-specific prefix). \naligned_dna_short_read_set_id links the aligned_dna_short_read table to the called_variants_dna_short_read table. For centers that are only uploading single sample files, the aligned_short_read_set_id and aligned_short_read_id values can be identical. For centers uploading multi-sample files, they will need to come up with a value for aligned_short_read_set_id that makes sense to them for indicating the sample group for a multi-sample callset, and use that same value in called_variants_short_read." 
+ }, + { + "column": "aligned_pac_bio_id", + "required": true, + "data_type": "string", + "references": "> aligned_pac_bio.aligned_pac_bio_id", + "notes": "the identifier for a single-sample aligned_dna_short_read included in the read_set (one per row)" + } + ] + }, + { + "table": "called_variants_pac_bio", + "columns": [ + { + "column": "called_variants_pac_bio_id", + "primary_key": true, + "description": "unique key for table (anvil requirement)", + "data_type": "string", + "references": "from:md5sum" + }, + { + "column": "aligned_pac_bio_set_id", + "required": true, + "description": "identifier for experiment set", + "data_type": "string", + "references": "> aligned_pac_bio_set.aligned_pac_bio_set_id" + }, + { + "column": "called_variants_dna_file", + "required": true, + "description": "name and path of the file with variant calls", + "data_type": "string", + "is_bucket_path": true, + "examples": ["gs://fc-fed09429-e563-44a7-aaeb-776c8336ba02/COL_FAM1_1_D1.SV.vcf", "gs://fc-fed09429-e563-44a7-aaeb-776c8336ba02/COL_FAM1_1_D1.SNV.gvcf", "gs://fc-fed09429-e563-44a7-aaeb-776c8336ba02/COL_FAM1_1_D1.EH.vcf"] + }, + { + "column": "md5sum", + "required": true, + "description": "md5 checksum for file", + "data_type": "string", + "examples": "129c28163df082", + "notes": "md5sum computed prior to upload (used to verify file integrity)" + }, + { + "column": "caller_software", + "required": true, + "description": "variant calling software used including version number", + "data_type": "string", + "multi_value_delimiter": "|", + "examples": ["pepper-margin-deepvariant-r0.8", "sniffles-v2.0.7", "pbsv-2.8.0", "pav-v2.0.1"] + }, + { + "column": "variant_types", + "required": true, + "description": "types of variants called", + "data_type": "string", + "enumerations": ["SNV", "INDEL", "SV", "CNV", "RE", "MEI"], + "multi_value_delimiter": "|", + "examples": "SNV|INDEL", + "notes": "can add more values as the need arises\nif there are two VCFs for SNV and Indels, there would be two 
different lines in this table; if combined in one VCF, a |-delimited entry" + }, + { + "column": "analysis_details", + "description": "brief description of the analysis pipeline used for producing the file; perhaps a link to something like a WDL file or github repository", + "data_type": "string" + } + ] + }, + { + "table": "experiment_atac_short_read", + "required": "CONDITIONAL (aligned_atac_short_read, called_peaks_atac_short_read, allele_specific_atac_short_read)", + "columns": [ + { + "column": "experiment_atac_short_read_id", + "primary_key": true, + "required": true, + "description": "identifier for experiment_atac_short_read (primary key)", + "data_type": "string", + "examples": ["Broad_E1", "Broad_E2", "GSS201938-01-021-SG-1"], + "notes": "RCs make their own IDs, must begin with center abbreviation as defined in participant table; need to be globally unique in consortium; may be generated by prepending experiment_sample_id with center abbreviation" + }, + { + "column": "analyte_id", + "required": true, + "data_type": "string", + "references": "> analyte.analyte_id" + }, + { + "column": "experiment_sample_id", + "description": "identifier used in the data file (e.g. VCF header / LIMS ID)", + "data_type": "string", + "examples": "12339D-SA", + "notes": "may be the same as experiment_short_read_id if the file does contain sample identifiers\nshould be present if downstream file contains a sample_id (e.g. 
BAM, VCF)\nsome centers have one id for the sample (tube) and a diff ID for the sample as named in the VCF; experiment_sample_id = ID in the VCF file; analyte_id = ID for the tube/aliquot/whatever" + }, + { + "column": "seq_library_prep_kit_method", + "description": "Library prep kit used", + "data_type": "string", + "examples": ["Kappa Hyper PCR plus ", "Kappa Hyper PCR free"], + "notes": "Can be missing if RC receives external data" + }, + { + "column": "read_length", + "description": "sequenced read length (bp); GREGoR RCs do paired end sequencing, so the example of 100bp indicates 2x100bp.", + "data_type": "integer", + "examples": "100", + "notes": "Can be missing if RC receives external data; all RCs are doing paired-end reads." + }, + { + "column": "experiment_type", + "required": true, + "data_type": "enumeration", + "enumerations": ["targeted", "genome", "exome"], + "examples": "targeted", + "notes": "facilitates having exome and GS-SR experiments in the same experiment_details table" + }, + { + "column": "targeted_regions_method", + "description": "Which capture kit is used. 
Can be missing if RC receives external data", + "data_type": "string", + "examples": "NimbleGen SeqCap EZ Human Exome Library v2.0" + }, + { + "column": "targeted_region_bed_file", + "description": "name and path of bed file uploaded to workspace", + "data_type": "string", + "is_bucket_path": true, + "examples": "gs://fc-eb352699-d849-483f-aefe-9d35ce2b21ac/SR_experiment.bed", + "notes": "Can be missing if RC receives external data" + }, + { + "column": "date_data_generation", + "description": "Date of data generation (First sequencing date)", + "data_type": "date", + "examples": "2022-06-29", + "notes": "Can be missing if RC receives external data; ISO 8601 date format" + }, + { + "column": "target_insert_size", + "description": "insert size the protocol targets for DNA fragments", + "data_type": "integer", + "examples": "500", + "notes": "Can be missing if RC receives external data" + }, + { + "column": "sequencing_platform", + "description": "sequencing platform used for the experiment", + "data_type": "string", + "examples": ["HiSeq2000", "HiSeq2500", "HiSeqX", "NovaSeq"], + "notes": "Can be missing if RC receives external data" + } + ] + }, + { + "table": "aligned_atac_short_read", + "required": "CONDITIONAL (called_peaks_atac_short_read, allele_specific_atac_short_read)", + "columns": [ + { + "column": "aligned_atac_short_read_id", + "primary_key": true, + "required": true, + "description": "identifier for aligned_atac_short_read (primary key)", + "data_type": "string", + "examples": "BCM_H7YG5DSX2-3-IDUDI0014-1", + "notes": "experiment_atac_short_read_id + alignment indicator" + }, + { + "column": "experiment_atac_short_read_id", + "required": true, + "description": "identifier for experiment", + "data_type": "string", + "references": "> experiment_atac_short_read.experiment_atac_short_read_id" + }, + { + "column": "aligned_atac_short_read_file", + "required": true, + "description": "name and path of file with aligned reads", + "data_type": "string", + 
"is_bucket_path": true, + "examples": "gs://fc-eb352699-d849-483f-aefe-9d35ce2b21ac/Broad_COL_FAM1_1_D1.cram" + }, + { + "column": "aligned_atac_short_read_index_file", + "required": true, + "description": "name and path of index file corresponding to aligned reads file", + "data_type": "string", + "is_bucket_path": true, + "examples": "gs://fc-eb352699-d849-483f-aefe-9d35ce2b21ac/Broad_COL_FAM1_1_D1.crai" + }, + { + "column": "md5sum", + "required": true, + "description": "md5 checksum for file", + "data_type": "string", + "examples": "129c28163df082" + }, + { + "column": "reference_assembly", + "required": true, + "data_type": "enumeration", + "enumerations": ["GRCh38", "GRCh37", "NCBI36", "NCBI35", "NCBI34"], + "examples": "GRCh38" + }, + { + "column": "reference_assembly_uri", + "data_type": "string", + "examples": "https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_33/GRCh38.primary_assembly.genome.fa.gz" + }, + { + "column": "reference_assembly_details", + "data_type": "string", + "examples": "primary, chrY-masked" + }, + { + "column": "alignment_software", + "required": true, + "description": "Software including version number", + "data_type": "string", + "examples": "bowtie2" + }, + { + "column": "gene_annotation_details", + "data_type": "enumeration", + "enumerations": "gencode_comprehensive_chr gencode_comprehensive_all gencode_comprehensive_pri gencode_basic_chr gencode_basic_all gencode_basic_pri lncRNA_annotation polyA_feature_annotation consensus_pseudogenes predicted_tRNA_genes", + "examples": ["examples corresponding to GENCODE options", "gencode_comprehensive_chr", "gencode_comprehensive_all", "gencode_comprehensive_pri"] + }, + { + "column": "alignment_log_file", + "description": "path of (log) file with all parameters for alignment software", + "data_type": "string", + "is_bucket_path": true + }, + { + "column": "alignment_postprocessing", + "description": "If any post processing was applied", + "data_type": "string", + "examples": 
"multimapped reads removed, reads trimmed" + }, + { + "column": "mean_coverage", + "description": "Mean coverage of either the genome or the targeted regions", + "data_type": "float", + "examples": "100", + "notes": "Can be unknown if RC receives external data" + }, + { + "column": "percent_uniquely_aligned", + "description": "how many reads aligned to just one place", + "data_type": "float", + "examples": "81" + }, + { + "column": "percent_multimapped", + "description": "how many reads aligned to multiple places", + "data_type": "float", + "examples": "10" + }, + { + "column": "percent_unaligned", + "description": "how many reads didn't align", + "data_type": "float", + "examples": "9" + } + ] + }, + { + "table": "called_peaks_atac_short_read", + "required": "CONDITIONAL (allele_specific_atac_short_read)", + "columns": [ + { + "column": "called_peaks_atac_short_read_id", + "primary_key": true, + "description": "unique key for table (anvil requirement)", + "data_type": "string", + "references": "from:peaks_md5sum" + }, + { + "column": "aligned_atac_short_read_id", + "required": true, + "description": "identifier for aligned ATAC-seq data", + "data_type": "string", + "references": "> aligned_atac_short_read.aligned_atac_short_read_id" + }, + { + "column": "called_peaks_file", + "required": true, + "description": "name and path of the bed file with open chromatin peaks after QC filtering", + "data_type": "string", + "is_bucket_path": true, + "examples": "gs://fc-fed09429-e563-44a7-aaeb-776c8336ba02/COL_FAM1_1_D1.ATAC.IDR.bed" + }, + { + "column": "peaks_md5sum", + "required": true, + "description": "md5 checksum for called_peaks_file", + "data_type": "string", + "examples": "129c28163df082", + "notes": "md5sum computed prior to upload (used to verify file integrity)" + }, + { + "column": "peak_caller_software", + "required": true, + "description": "peak calling software used including version number", + "data_type": "string", + "multi_value_delimiter": "|", + 
"examples": ["hotspot2", "overlap_peaks"] + }, + { + "column": "peak_set_type", + "required": true, + "description": "peak set type, according to ENCODE descriptors", + "data_type": "enumeration", + "enumerations": ["narrowPeak", "gappedPeak", "IDR"], + "examples": ["narrowPeak", "gappedPeak", "IDR"] + }, + { + "column": "analysis_details", + "description": "brief description of the analysis pipeline used for producing the called_peaks_file; perhaps a link to something like a WDL file or github repository", + "data_type": "string" + } + ] + }, + { + "table": "allele_specific_atac_short_read", + "columns": [ + { + "column": "asc_atac_short_read_id", + "description": "unique key for table (anvil requirement)", + "data_type": "string", + "references": "from:asc_md5sum" + }, + { + "column": "called_peaks_atac_short_read_id", + "required": true, + "description": "identifier for called peaks", + "data_type": "string" + }, + { + "column": "asc_file", + "required": true, + "description": "name and path of the tsv file with allele-specific chromatin accessibility measures (logFC) at heterozygous sites after QC and significance testing", + "data_type": "string", + "is_bucket_path": true, + "examples": "gs://fc-fed09429-e563-44a7-aaeb-776c8336ba02/COL_FAM1_1_D1.SV.bed" + }, + { + "column": "asc_md5sum", + "required": true, + "description": "md5 checksum for called_peaks_file", + "data_type": "string", + "examples": "129c28163df082", + "notes": "md5sum computed prior to upload (used to verify file integrity)" + }, + { + "column": "peak_set_type", + "required": true, + "description": "peak set type, according to ENCODE descriptors", + "data_type": "enumeration", + "enumerations": ["narrowPeak", "gappedPeak", "IDR"], + "examples": ["narrowPeak", "gappedPeak", "IDR"] + }, + { + "column": "het_sites_file", + "required": true, + "description": "VCF file containing prefiltered heterozygous sites used for reference alignment bias testing and calling allele-specific chromatin 
accessibility events", + "data_type": "string", + "is_bucket_path": true, + "examples": "gs://fc-fed09429-e563-44a7-aaeb-776c8336ba02/COL_FAM1_1_D1.het_sites.vcf" + }, + { + "column": "het_sites_md5sum", + "required": true, + "description": "md5 checksum for het_sites_file", + "data_type": "string", + "examples": "129c28163df082", + "notes": "md5sum computed prior to upload (used to verify file integrity)" + }, + { + "column": "analysis_details", + "description": "brief description of the analysis pipeline used for producing the asc_file; perhaps a link to something like a WDL file or github repository", + "data_type": "string" + } + ] } ] } diff --git a/sheets_to_JSON.R b/sheets_to_JSON.R index 6ed3d73..857fc9e 100644 --- a/sheets_to_JSON.R +++ b/sheets_to_JSON.R @@ -4,16 +4,16 @@ library(tidyr) library(stringr) library(jsonlite) -url <- "https://docs.google.com/spreadsheets/d/1-TcKKMJMSMV_zhkmJGswmFY2ECGi_S2heqAjBkat5tM" +url <- "https://docs.google.com/spreadsheets/d/1mZlJ9IauaVNiJ6f2Q14pg6rGEpWVrAjMJmAZ_FioVTY" model_name = "GREGoR Data Model" model_description = "Data model for the GREGoR consortium" -model_version = "1.2.1" +model_version = "1.3" # table metadata -meta <- read_sheet(url, sheet="Table overview/status") +meta <- read_sheet(url, sheet="Table overview/status", skip=1) # can't validate genetics_findings table -meta <- filter(meta, !(Table %in% "genetic_findings")) +#meta <- filter(meta, !(Table %in% "genetic_findings")) meta <- meta %>% mutate(required=ifelse(tolower(Required) == "yes", TRUE, Required)) %>% @@ -24,7 +24,7 @@ meta <- meta %>% table_names <- meta$table tables <- lapply(table_names, function(x) read_sheet(url, sheet=x, skip=1, col_types="c")) names(tables) <- table_names -rm(list = c("table_names", "url")) +#rm(list = c("table_names", "url")) # rename and reorder columns @@ -64,7 +64,7 @@ for (i in 1:length(tables)) { # 2) the list of data tables corresponding to the first argument source("sheets_to_list.R") tab_list <- 
sheets_to_list(apply(meta, 1, as.list), tables) -rm(list = c("meta", "tables", "sheets_to_list")) +#rm(list = c("meta", "tables", "sheets_to_list")) # initialize leading text @@ -77,7 +77,7 @@ master <- list( # Data Table Details tables = tab_list ) -rm(list = c("tab_list")) +#rm(list = c("tab_list")) # compile master file in JSON format @@ -85,7 +85,7 @@ out <- toJSON(x = master, pretty = TRUE, auto_unbox = TRUE, unbox = TRUE) -rm(list = c("master")) +#rm(list = c("master")) # unquote the logical parameters TRUE and FALSE From 5a9be798965410016fa054427c0b546357121856 Mon Sep 17 00:00:00 2001 From: "Stephanie M. Gogarten" Date: Fri, 20 Oct 2023 14:58:07 -0700 Subject: [PATCH 2/4] add note for column --- GREGoR_data_model.json | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/GREGoR_data_model.json b/GREGoR_data_model.json index 1d7064d..2b9a9f7 100644 --- a/GREGoR_data_model.json +++ b/GREGoR_data_model.json @@ -2084,7 +2084,8 @@ "required": true, "description": "Software including version number", "data_type": "string", - "examples": "bowtie2" + "examples": "bowtie2", + "notes": "Can be unknown if RC receives external data" }, { "column": "gene_annotation_details", From fd5dc5e887b10521e70b632ba945b0c53531bd1a Mon Sep 17 00:00:00 2001 From: "Stephanie M. 
Gogarten" Date: Fri, 20 Oct 2023 16:36:09 -0700 Subject: [PATCH 3/4] add details on missing variant columns, add reference --- GREGoR_data_model.json | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/GREGoR_data_model.json b/GREGoR_data_model.json index 2b9a9f7..cdcd7fa 100644 --- a/GREGoR_data_model.json +++ b/GREGoR_data_model.json @@ -178,15 +178,16 @@ { "column": "missing_variant_case", "required": true, - "description": "Indication of whether this is known to be a missing variant case.", + "description": "Indication of whether this is known to be a missing variant case, see notes for a description of the Missing Variant Project and inclusion criteria.", "data_type": "enumeration", "enumerations": ["Yes", "No", "Unknown"], - "notes": "MVP cases can include both those initially ascertained prospectively based on expectation of a missing pathogenic variant and those ascertained retrospectively after analysis identifies an expectation of a missing pathogenic variant within a narrow search space.\n\nThe case should meet one of the following criteria:\n\n- Has a LP/P variant identified in a gene underlying a recessive condition consistent with the phenotype observed in the patient, and suspected to be the correct diagnosis, for which a second explanatory LP/P variant has not been identified by prior genetic testing\n- Has a specific clinical diagnosis supported by evidence (e.g. biochemical testing) for a dominant condition known to be of single gene etiology for which an explanatory LP/P variant has not been identified by prior genetic testing(very uncommon) \n- Has no LP/P variants in a gene underlying a recessive condition consistent with the phenotype observed in the patient and the recessive condition is associated with a specific clinical diagnosis and has only been attributed to variants in a single gene (e.g. 
an individual clinically diagnosed with cystic fibrosis but no variants identified in CFTR)\n\nAny cases with potentially explanatory VUS identified as the 'missing variant,' are eligible and should be included (e.g. an individual with a LP/P variant and VUS in trans identified in a gene underlying a recessive condition consistent with the phenotype observed should be flagged for MVP)." + "notes": "The goal of the Missing Variant Project (MVP) is to identify a cohort of cases for which there is an extremely high expectation of the presence of one or two 'missing' pathogenic variants within a narrow and specific search space, i.e. a single gene/locus of interest for which the known phenotype is the suspected/likely diagnosis for the affected individual.\n\nMVP cases can include both those initially ascertained prospectively based on expectation of a missing pathogenic variant and those ascertained retrospectively after analysis identifies an expectation of a missing pathogenic variant within a narrow search space.\n\nThe case should meet one of the following criteria:\n- Has a LP/P variant identified in a gene underlying a recessive condition consistent with the phenotype observed in the patient, and suspected to be the correct diagnosis, for which a second explanatory LP/P variant has not been identified by prior genetic testing\n- Has a specific clinical diagnosis supported by evidence (e.g. biochemical testing) for a dominant condition known to be of single gene etiology for which an explanatory LP/P variant has not been identified by prior genetic testing(very uncommon) \n- Has no LP/P variants in a gene underlying a recessive condition consistent with the phenotype observed in the patient and the recessive condition is associated with a specific clinical diagnosis and has only been attributed to variants in a single gene (e.g. 
an individual clinically diagnosed with cystic fibrosis but no variants identified in CFTR)\n\nAny cases with potentially explanatory VUS identified as the 'missing variant,' are eligible and should be included (e.g. an individual with a LP/P variant and VUS in trans identified in a gene underlying a recessive condition consistent with the phenotype observed should be flagged for MVP)." }, { "column": "missing_variant_details", - "description": "Text description of what’s missing , including a description of region or gene of interest when available", - "data_type": "string" + "description": "For missing variant cases, indicate gene(s) or region of interest and reason for inclusion in MVP.", + "data_type": "string", + "examples": ["Werner syndrome case with single pathogenic hit in WRN ", "Clinical diagnosis of Marfan syndrome with no pathogenic or likely pathogenic variant identified in FBN1", "Clinical diagnosis of neurofibromatosis type 1 with no pathogenic or likely pathogenic variant identified in NF1 "] } ] }, @@ -2202,7 +2203,8 @@ "column": "called_peaks_atac_short_read_id", "required": true, "description": "identifier for called peaks", - "data_type": "string" + "data_type": "string", + "references": "> called_peaks_atac_short_read.called_peaks_atac_short_read_id" }, { "column": "asc_file", From 4d491bb6e5282e4c241031ebc5c9f37b0e7ae397 Mon Sep 17 00:00:00 2001 From: "Stephanie M. 
Gogarten" Date: Mon, 23 Oct 2023 10:12:22 -0700 Subject: [PATCH 4/4] comment out references from genetic findings table Some references are causing validation errors: 1) phenotype.term_id is not unique so cannot be a foreign key 2) if a column referencing a foreign key has all missing values, get an error of incompatible types --- GREGoR_data_model.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/GREGoR_data_model.json b/GREGoR_data_model.json index cdcd7fa..24aa7cd 100644 --- a/GREGoR_data_model.json +++ b/GREGoR_data_model.json @@ -516,7 +516,7 @@ "column": "partial_contribution_explained", "description": "List of specific phenotypes (HPO IDs) explained by the condition associated with this variant/gene in cases of partial contribution", "data_type": "string", - "references": "> phenotype.term_id", + //"references": "> phenotype.term_id", "multi_value_delimiter": "|", "examples": "HP:0000365", "notes": "Field used when a genetic finding only partially explains participant phenotype and a second genetic finding is expected. For example, a syndromic case with a well-established pathogenic non-syndromic hearing loss variant. \n- For recesssive variants with partial contribution, only one of the linked variants needs to list HPO IDs explained by the linked candidate set." @@ -525,7 +525,7 @@ "column": "additional_family_members_with_variant", "description": "List of related participant IDs carrying the same variant", "data_type": "string", - "references": "> participant.participant_id", + //"references": "> participant.participant_id", "multi_value_delimiter": "|" }, {