diff --git a/.gitignore b/.gitignore index 8deb9bf..cef8139 100644 --- a/.gitignore +++ b/.gitignore @@ -17,7 +17,8 @@ databases/* # assets/input assets/input/* -!assets/input/example_patient.yaml +!assets/input/example_WES_patient.yaml +!assets/input/example_TSO500_patient.yaml !assets/input/.gitkeep # assets/references diff --git a/assets/input/example_TSO500_patient.yaml b/assets/input/example_TSO500_patient.yaml new file mode 100644 index 0000000..0737a96 --- /dev/null +++ b/assets/input/example_TSO500_patient.yaml @@ -0,0 +1,212 @@ +common: + author: MIRACUM-Pipe + center: Freiburg + memory: 150g + cpucores: 12 + files: + # filenames of the input files as fastq.gz + panel: + tumor: MT21-28037-DNA_S7_L00 #MT21_19902-RNA_S12_L001_R1_001.fastq.gz + numberOfFiles: 4 + entity: ESCA + RNA: + # folder contatining RNA fastq files for RNA fusions + folder: RNA + protocol: panel + # process and annotate germline file, allowed values: yes, no + # set to no if the germline sample should be used only to identify somatic variants + germline: yes + ucscServer: genome-euro-mysql.soe.ucsc.edu + +sex: XY + +general: + # minimum base quality to call a variant + minBaseQual: 28 + # minimum + maf_cutoff: 0.01 + # minimum variant allele / tumor frequency to call a variant + minVAF: 0.05 + +## Panel Parameters +panel: + samtools: + mpileup2snp: + #--min-coverage Minimum read depth at a position to make a call [8] + minCoverage: 20 + #--min-reads2 Minimum supporting reads at a position to call variants [2] + minReads2: 2 + #--min-freq-for-hom Minimum frequency to call homozygote [0.75] + minFreqForHom: 0.75 + #--p-value Default p-value threshold for calling variants [99e-02] + pValue: 0.99 + #--strand-filter Ignore variants with >90% support on one strand [1] + strandFilter: 1 + #--min-var-freq Minimum variant allele frequency threshold [0.01] -> minVAF + #--min-avg-qual Minimum base quality at a position to count a read [15] -> minBaseQual + + mpileup2indel: + #--min-coverage Minimum read depth at a position to make a call [8] + minCoverage: 20 + #--min-reads2 Minimum supporting reads at a position to call variants [2] + minReads2: 2 + #--min-freq-for-hom Minimum frequency to call homozygote [0.75] + minFreqForHom: 0.75 + #--p-value Default p-value threshold for calling variants [99e-02] + pValue: 0.99 + #--strand-filter Ignore variants with >90% support on one strand [1] + strandFilter: 1 + #--min-var-freq Minimum variant allele frequency threshold [0.01] -> minVAF + #--min-avg-qual Minimum base quality at a position to count a read [15] -> minBaseQual + + varscan: + fpfilter: + #--min-var-count Minimum number of variant-supporting reads [4] + minVarCount: 50 + #--min-var-count-lc Minimum number of variant-supporting reads when depth below somaticPdepth [2] + minVarCountLC: 2 + #--max-somatic-p Maximum somatic p-value [0.05] + maxSomaticP: 0.05 + #--max-somatic-p-depth Depth required to test max somatic p-value [10] + maxSomaticPDepth: 10 + #--min-ref-readpos Minimum average read position of ref-supporting reads [0.1] + minRefReadpos: 0.1 + #--min-var-readpos Minimum average read position of var-supporting reads [0.1] + minVarReadpos: 0.1 + #--min-ref-dist3 Minimum average distance to effective 3' end (ref) [0.1] + minRefDist3: 0.1 + #--min-var-dist3 Minimum average distance to effective 3' end (var) [0.1] + minVarDist3: 0.1 + #--min-strandedness Minimum fraction of variant reads from each strand [0.01] + minStrandedness: 0.01 + #--min-strand-reads Minimum allele depth required to perform the strand tests [5] + minStrandReads: 5 + #--max-basequal-diff Maximum average base quality diff (ref - var) [50] + maxBasequalDiff: 50 + #--min-ref-avgrl Minimum average trimmed read length for ref allele [90] + minRefAVGRL: 90 + #--min-var-avgrl Minimum average trimmed read length for var allele [90] + minVarAVGRL: 90 + #--max-rl-diff Maximum average relative read length difference (ref - var) [0.25] + maxRlDiff: 0.25 + #--max-ref-mmqs Maximum mismatch quality sum of reference-supporting reads [100] + maxRefMMQS: 100 + #--max-var-mmqs Maximum mismatch quality sum of variant-supporting reads [100] + maxVarMMQS: 100 + #--min-mmqs-diff Minimum average mismatch quality sum (var - ref) [0] + minMMQSDiff: 0 + #--max-mmqs-diff Maximum average mismatch quality sum (var - ref) [50] + maxMMQSDiff: 50 + #--min-ref-mapqual Minimum average mapping quality for ref allele [15] + minRefMapQual: 15 + #--min-var-mapqual Minimum average mapping quality for var allele [15] + minVarMapQual: 15 + #--max-mapqual-diff Maximum average mapping quality (ref - var) [50] + maxMapQualDiff: 50 + + mutect: + #--base-quality-score-threshold 28; Base qualities below this threshold will be reduced to the minimum (6) Default value: 18 + #--callable-depth 8; Minimum depth to be considered callable for Mutect stats. Does not affect genotyping. Default value: 10 + callableDepth: 8 + #--intervals V6UTR.bed; One or more genomic intervals over which to operate This argument may be specified 0 or more times. Default value: null + #--min-base-quality-score 28; Minimum base quality required to consider a base for calling Default value: 10 + + filterMutectCalls: + #--contamination-estimate 0.5?; Estimate of contamination. Default value: 0.0 (evtl. tumor purity) + #--intervals V6UTR.bed; One or more genomic intervals over which to operate This argument may be specified 0 or more times. Default value: null + #--min-allele-fraction 0.05; Minimum allele fraction required Default value: 0.0 (VAF?) + #--min-median-base-quality 28; Minimum median base quality of alt reads Default value: 20 + + ControlFREEC: + # desired behavior in the ambiguous regions (poly-N or low mappability regions between two different copy number values) + # 0: the "unknown" region is attached to the "known" region on the right + # 1: make a separate fragment of this unknown region and then attaches it to the left or to the right region choosing the longer one + # 2: make a separate fragment of this unknown region and then attaches it to the left or to the right region but the ploidy copy number has a priority + # 3: make a separate fragment of this unknown region and then attaches it to the left or to the right region choosing the longer one but this known region should make at least half-size of the unknown region + # 4: make a separate fragment of this unknown region and do not assign any copy number to this region at all + breakPointType: 4 + # positive value of threshold for segmentation of normalized profiles + breakPointThreshold: 1.2 + # set to 1 or 2 to correct the Read Count (RC) for GC-content bias and low mappability even when you have a control sample + # 0: simply model "sample RC ~ Control RC" + # 1: normalize the sample and the control RC using GC-content and then calculate the ratio "Sample RC/contol RC" + # 2: model "sample RC ~ Control RC" bias, and then normalize for GC-content + forceGCcontentNormalization: 1 + # intercept of polynomial + # 1 - with GC-content, + # 0 - with a control dataset + intercept: 1 + # minimal number of consecutive windows to call a CNA + # 3 for WES and 1 for WGS + minCNAlength: 3 + # maxThreads: 12 -> cpucores + # set TRUE for target resequencing data (e.g., exome-seq) to avoid false positive predictions due to non-uniform capture + noisyData: TRUE + # genome ploidy; In case of doubt, you can set different values and Control-FREEC will select the one that explains most observed CNAs + ploidy: 2 + # set FALSE to avoid printing "-1" to the _ratio.txt files. Useful for exome-seq or targeted sequencing data + printNA: FALSE + # threshold on the minimal number of reads per window in the control sample + # Useful for exome-seq or targeted sequencing data + # recommended value >=50 for for exome data + readCountThreshold: 50 + # step (used only when "window" is specified); do not use for exome sequencing (instead set "window=0") + step: 0 + # explicit window size (higher priority than coefficientOfVariation); + # for whole genome sequencing: "window=50000" + # for whole exome sequencing: "window=0" + window: 0 + # Use a mappability profile to correct read counts (in this case a mappability file must be provided with "gemMappabilityFile" ) + uniqueMatch: TRUE + # set TRUE to correct for contamination by normal cells. + # If "contamination" is not provided, it will automatically evaluate the level of contamination + contaminationAdjustment: TRUE + + fusions: + fusionGenes: TSO500_fusions.txt + amplification: + amplificationGenes: TSO500_amplifications.txt + +annovar: + # define annotation databases, together with their type, used. The databases have to be installed with annovar + protocol: 'refGene,gnomad211_genome,avsnp150,clinvar_20210501,intervar_20180118,dbnsfp42a,cosmic_coding,cosmic_noncoding' + argop: 'g,f,f,f,f,f,f,f' + +sequenza: + # define sequenza-utils parameter + # --window WINDOW; Window size used for binning the original seqz file. Default is 50. + window: 50 + # if used for samples without mathcing normal as control a separate non-matching-normal file in bam format is needed; + # file should be stored in references/sequenza + nonMatchingNormal: non_matching_normal.bam + chromosomes: chr1 chr2 chr3 chr4 chr5 chr6 chr7 chr8 chr9 chr10 chr11 chr12 chr13 chr14 chr15 chr16 chr17 chr18 chr19 chr20 chr21 chr22 chrX + +reference: + # define reference genome (located in references/Genome incl. indices) + genome: genome.fa + # file containing the length of the chromosomes (located in references/chromosomes) + length: hg19.len + # reference wig file for HRD calculation; if not present its generated (to be put/created in databases) + hrdRef: hg19all.wig.gz + # microsatellite sites for msisnesor-pro; if not present its generated (to be put/created in databases) + microsatelliteSites: microsatellite_sites_hg19 + # database of known SNPs (located in databses/dbSNP) + dbSNP: snp150hg19.vcf.gz + # Mappability file needed for calling CNVs with ControlFREEC (located in reference/mappability) + mappability: out100m2_hg19.gem + sequencing: # all located in references/sequencing + # target region covered by the sequencer in .bed format + captureRegions: TSO500.bed + # file containing all the covered genes as HUGO Symbols + captureGenes: TSO500_Targets.txt + # target region in Mega bases covered + coveredRegion: 1.94 + # target / capture region kit name + captureRegionName: TSO500 + # target capture correlation factors for mutation signature analysis + captureCorFactors : targetCapture_cor_factors.rda + # covered exons by capture kit; basically intersection of exome with capture regions + # e.g. bedtools intersect -a targets.bed -b bed_exons_hg19.bed > exons_Routine.bed -u + coveredExons: exons_TSO500.bed + # positive list for genes in germline to be reported (e.g. by ACMG) + # actionableGenes: actionable_genes.txt \ No newline at end of file diff --git a/assets/input/example_patient.yaml b/assets/input/example_WES_patient.yaml similarity index 100% rename from assets/input/example_patient.yaml rename to assets/input/example_WES_patient.yaml