Added more patient.yaml examples

AG-Boerries · Jun 17, 2022 · 574c93c · 574c93c
1 parent 40a742c
commit 574c93c
Show file tree

Hide file tree

Showing 3 changed files with 214 additions and 1 deletion.
diff --git a/.gitignore b/.gitignore
@@ -17,7 +17,8 @@ databases/*
 
 # assets/input
 assets/input/*
-!assets/input/example_patient.yaml
+!assets/input/example_WES_patient.yaml
+!assets/input/example_TSO500_patient.yaml
 !assets/input/.gitkeep
 
 # assets/references

diff --git a/assets/input/example_TSO500_patient.yaml b/assets/input/example_TSO500_patient.yaml
@@ -0,0 +1,212 @@
+common:
+  author: MIRACUM-Pipe
+  center: Freiburg
+  memory: 150g
+  cpucores: 12
+  files:
+    # filenames of the input files as fastq.gz
+    panel:
+        tumor: MT21-28037-DNA_S7_L00 #MT21_19902-RNA_S12_L001_R1_001.fastq.gz
+        numberOfFiles: 4
+  entity: ESCA
+  RNA:
+    # folder contatining RNA fastq files for RNA fusions
+    folder: RNA
+  protocol: panel
+  # process and annotate germline file, allowed values: yes, no
+  # set to no if the germline sample should be used only to identify somatic variants
+  germline: yes
+  ucscServer: genome-euro-mysql.soe.ucsc.edu
+
+sex: XY
+
+general:
+  # minimum base quality to call a variant
+  minBaseQual: 28
+  # minimum
+  maf_cutoff: 0.01
+  # minimum variant allele / tumor frequency to call a variant
+  minVAF: 0.05
+
+## Panel Parameters
+panel:
+  samtools:
+    mpileup2snp:
+      #--min-coverage  Minimum read depth at a position to make a call [8]
+      minCoverage: 20
+      #--min-reads2    Minimum supporting reads at a position to call variants [2]
+      minReads2: 2
+      #--min-freq-for-hom      Minimum frequency to call homozygote [0.75]
+      minFreqForHom: 0.75
+      #--p-value       Default p-value threshold for calling variants [99e-02]
+      pValue: 0.99
+      #--strand-filter Ignore variants with >90% support on one strand [1]
+      strandFilter: 1
+      #--min-var-freq  Minimum variant allele frequency threshold [0.01] ->  minVAF
+      #--min-avg-qual  Minimum base quality at a position to count a read [15] ->  minBaseQual
+
+    mpileup2indel:
+      #--min-coverage  Minimum read depth at a position to make a call [8]
+      minCoverage: 20
+      #--min-reads2    Minimum supporting reads at a position to call variants [2]
+      minReads2: 2
+      #--min-freq-for-hom      Minimum frequency to call homozygote [0.75]
+      minFreqForHom: 0.75
+      #--p-value       Default p-value threshold for calling variants [99e-02]
+      pValue: 0.99
+      #--strand-filter Ignore variants with >90% support on one strand [1]
+      strandFilter: 1
+      #--min-var-freq  Minimum variant allele frequency threshold [0.01] ->  minVAF
+      #--min-avg-qual  Minimum base quality at a position to count a read [15] ->  minBaseQual
+
+  varscan:
+    fpfilter:
+      #--min-var-count		Minimum number of variant-supporting reads [4]
+      minVarCount: 50
+      #--min-var-count-lc	Minimum number of variant-supporting reads when depth below somaticPdepth [2]
+      minVarCountLC: 2
+      #--max-somatic-p		Maximum somatic p-value [0.05]
+      maxSomaticP: 0.05
+      #--max-somatic-p-depth	Depth required to test max somatic p-value [10]
+      maxSomaticPDepth: 10
+      #--min-ref-readpos	Minimum average read position of ref-supporting reads [0.1]
+      minRefReadpos: 0.1
+      #--min-var-readpos	Minimum average read position of var-supporting reads [0.1]
+      minVarReadpos: 0.1
+      #--min-ref-dist3		Minimum average distance to effective 3' end (ref) [0.1]
+      minRefDist3: 0.1
+      #--min-var-dist3		Minimum average distance to effective 3' end (var) [0.1]
+      minVarDist3: 0.1
+      #--min-strandedness	Minimum fraction of variant reads from each strand [0.01]
+      minStrandedness: 0.01
+      #--min-strand-reads	Minimum allele depth required to perform the strand tests [5]
+      minStrandReads: 5
+      #--max-basequal-diff		Maximum average base quality diff (ref - var) [50]
+      maxBasequalDiff: 50
+      #--min-ref-avgrl		Minimum average trimmed read length for ref allele [90]
+      minRefAVGRL: 90
+      #--min-var-avgrl		Minimum average trimmed read length for var allele [90]
+      minVarAVGRL: 90
+      #--max-rl-diff		Maximum average relative read length difference (ref - var) [0.25]
+      maxRlDiff: 0.25
+      #--max-ref-mmqs		Maximum mismatch quality sum of reference-supporting reads [100]
+      maxRefMMQS: 100
+      #--max-var-mmqs		Maximum mismatch quality sum of variant-supporting reads [100]
+      maxVarMMQS: 100
+      #--min-mmqs-diff		Minimum average mismatch quality sum (var - ref) [0]
+      minMMQSDiff: 0
+      #--max-mmqs-diff		Maximum average mismatch quality sum (var - ref) [50]
+      maxMMQSDiff: 50
+      #--min-ref-mapqual	Minimum average mapping quality for ref allele [15]
+      minRefMapQual: 15
+      #--min-var-mapqual	Minimum average mapping quality for var allele [15]
+      minVarMapQual: 15
+      #--max-mapqual-diff	Maximum average mapping quality (ref - var) [50]
+      maxMapQualDiff: 50
+
+  mutect:
+    #--base-quality-score-threshold 28; Base qualities below this threshold will be reduced to the minimum (6)  Default value: 18
+    #--callable-depth 8; Minimum depth to be considered callable for Mutect stats.  Does not affect genotyping. Default value: 10
+    callableDepth: 8
+    #--intervals V6UTR.bed; One or more genomic intervals over which to operate  This argument may be specified 0 or more times. Default value: null
+    #--min-base-quality-score 28; Minimum base quality required to consider a base for calling  Default value: 10
+
+  filterMutectCalls:
+    #--contamination-estimate 0.5?; Estimate of contamination.  Default value: 0.0 (evtl. tumor purity)
+    #--intervals V6UTR.bed; One or more genomic intervals over which to operate  This argument may be specified 0 or more times. Default value: null
+    #--min-allele-fraction 0.05; Minimum allele fraction required  Default value: 0.0 (VAF?)
+    #--min-median-base-quality 28; Minimum median base quality of alt reads  Default value: 20
+
+  ControlFREEC:
+    # desired behavior in the ambiguous regions (poly-N or low mappability regions between two different copy number values)
+    # 0: the "unknown" region is attached to the "known" region on the right
+    # 1: make a separate fragment of this unknown region and then attaches it to the left or to the right region choosing the longer one
+    # 2: make a separate fragment of this unknown region and then attaches it to the left or to the right region but the ploidy copy number has a priority
+    # 3: make a separate fragment of this unknown region and then attaches it to the left or to the right region choosing the longer one but this known region should make at least half-size of the unknown region
+    # 4: make a separate fragment of this unknown region and do not assign any copy number to this region at all
+    breakPointType: 4
+    # positive value of threshold for segmentation of normalized profiles
+    breakPointThreshold: 1.2
+    # set to 1 or 2 to correct the Read Count (RC) for GC-content bias and low mappability even when you have a control sample
+    # 0: simply model "sample RC ~ Control RC"
+    # 1: normalize the sample and the control RC using GC-content and then calculate the ratio "Sample RC/contol RC"
+    # 2: model "sample RC ~ Control RC" bias, and then normalize for GC-content
+    forceGCcontentNormalization: 1
+    # intercept of polynomial
+    # 1 - with GC-content,
+    # 0 - with a control dataset
+    intercept: 1
+    # minimal number of consecutive windows to call a CNA
+    # 3 for WES and 1 for WGS
+    minCNAlength: 3
+    # maxThreads: 12 -> cpucores
+    # set TRUE for target resequencing data (e.g., exome-seq) to avoid false positive predictions due to non-uniform capture
+    noisyData: TRUE
+    # genome ploidy; In case of doubt, you can set different values and Control-FREEC will select the one that explains most observed CNAs
+    ploidy: 2
+    # set FALSE to avoid printing "-1" to the _ratio.txt files. Useful for exome-seq or targeted sequencing data
+    printNA: FALSE
+    # threshold on the minimal number of reads per window in the control sample
+    # Useful for exome-seq or targeted sequencing data
+    # recommended value >=50 for for exome data
+    readCountThreshold: 50
+    # step (used only when "window" is specified); do not use for exome sequencing (instead set "window=0")
+    step: 0
+    # explicit window size (higher priority than coefficientOfVariation);
+    # for whole genome sequencing: "window=50000"
+    # for whole exome sequencing: "window=0"
+    window: 0
+    # Use a mappability profile to correct read counts (in this case a mappability file must be provided with "gemMappabilityFile" )
+    uniqueMatch: TRUE
+    # set TRUE to correct for contamination by normal cells.
+    # If "contamination" is not provided, it will automatically evaluate the level of contamination
+    contaminationAdjustment: TRUE
+
+  fusions:
+    fusionGenes: TSO500_fusions.txt
+  amplification:
+    amplificationGenes: TSO500_amplifications.txt
+
+annovar:
+  # define annotation databases, together with their type, used. The databases have to be installed with annovar
+  protocol: 'refGene,gnomad211_genome,avsnp150,clinvar_20210501,intervar_20180118,dbnsfp42a,cosmic_coding,cosmic_noncoding'
+  argop: 'g,f,f,f,f,f,f,f'
+
+sequenza:
+  # define sequenza-utils parameter
+  # --window WINDOW; Window size used for binning the original seqz file. Default is 50.
+  window: 50
+  # if used for samples without mathcing normal as control a separate non-matching-normal file in bam format is needed;
+  # file should be stored in references/sequenza
+  nonMatchingNormal: non_matching_normal.bam
+  chromosomes: chr1 chr2 chr3 chr4 chr5 chr6 chr7 chr8 chr9 chr10 chr11 chr12 chr13 chr14 chr15 chr16 chr17 chr18 chr19 chr20 chr21 chr22 chrX
+
+reference:
+  # define reference genome (located in references/Genome incl. indices)
+  genome: genome.fa
+  # file containing the length of the chromosomes (located in references/chromosomes)
+  length: hg19.len
+  # reference wig file for HRD calculation; if not present its generated (to be put/created in databases)
+  hrdRef: hg19all.wig.gz
+  # microsatellite sites for msisnesor-pro; if not present its generated (to be put/created in databases)
+  microsatelliteSites: microsatellite_sites_hg19
+  # database of known SNPs (located in databses/dbSNP)
+  dbSNP: snp150hg19.vcf.gz
+  # Mappability file needed for calling CNVs with ControlFREEC (located in reference/mappability)
+  mappability: out100m2_hg19.gem
+  sequencing: # all located in references/sequencing
+    # target region covered by the sequencer in .bed format
+    captureRegions: TSO500.bed
+    # file containing all the covered genes as HUGO Symbols
+    captureGenes: TSO500_Targets.txt
+    # target region in Mega bases covered
+    coveredRegion: 1.94
+    # target / capture region kit name
+    captureRegionName: TSO500
+    # target capture correlation factors for mutation signature analysis
+    captureCorFactors : targetCapture_cor_factors.rda
+    # covered exons by capture kit; basically intersection of exome with capture regions
+    # e.g. bedtools intersect -a targets.bed -b bed_exons_hg19.bed > exons_Routine.bed -u
+    coveredExons: exons_TSO500.bed
+    # positive list for genes in germline to be reported (e.g. by ACMG)
+    # actionableGenes: actionable_genes.txt
diff --git a/assets/input/example_patient.yaml → assets/input/example_WES_patient.yaml b/assets/input/example_patient.yaml → assets/input/example_WES_patient.yaml