diff --git a/README.md b/README.md index 68882fb..6ed64a3 100644 --- a/README.md +++ b/README.md @@ -77,11 +77,11 @@ Create temporary shell variables pointing to where we'll store VEP and its cache export VEP_PATH=~/vep export VEP_DATA=~/.vep -Download the v82 release of VEP: +Download the v83 release of VEP: mkdir $VEP_PATH $VEP_DATA; cd $VEP_PATH - curl -LO https://github.com/Ensembl/ensembl-tools/archive/release/82.tar.gz - tar -zxf 82.tar.gz --starting-file variant_effect_predictor --transform='s|.*/|./|g' + curl -LO https://github.com/Ensembl/ensembl-tools/archive/release/83.tar.gz + tar -zxf 83.tar.gz --starting-file variant_effect_predictor --transform='s|.*/|./|g' Add that path to `PERL5LIB`, and the htslib subfolder to `PATH` where `tabix` will be installed: @@ -90,18 +90,19 @@ Add that path to `PERL5LIB`, and the htslib subfolder to `PATH` where `tabix` wi Download and unpack VEP's offline cache for GRCh37, GRCh38, and GRCm38: - rsync -zvh rsync://ftp.ensembl.org/ensembl/pub/release-82/variation/VEP/homo_sapiens_vep_82_GRCh{37,38}.tar.gz $VEP_DATA - rsync -zvh rsync://ftp.ensembl.org/ensembl/pub/release-82/variation/VEP/mus_musculus_vep_82_GRCm38.tar.gz $VEP_DATA - cat $VEP_DATA/*_vep_82_GRC{h37,h38,m38}.tar.gz | tar -izxf - -C $VEP_DATA + rsync -zvh rsync://ftp.ensembl.org/ensembl/pub/release-83/variation/VEP/homo_sapiens_vep_83_GRCh{37,38}.tar.gz $VEP_DATA + rsync -zvh rsync://ftp.ensembl.org/ensembl/pub/release-83/variation/VEP/mus_musculus_vep_83_GRCm38.tar.gz $VEP_DATA + cat $VEP_DATA/*_vep_83_GRC{h37,h38,m38}.tar.gz | tar -izxf - -C $VEP_DATA Install the Ensembl API, the reference FASTAs for GRCh37/GRCh38/GRCm38, and some neat VEP plugins: - perl INSTALL.pl --AUTO afp --SPECIES homo_sapiens,mus_musculus --ASSEMBLY GRCh38,GRCm38 --PLUGINS CADD,ExAC,dbNSFP,UpDownDistance --DESTDIR $VEP_PATH --CACHEDIR $VEP_DATA perl INSTALL.pl --AUTO afp --SPECIES homo_sapiens --ASSEMBLY GRCh37 --PLUGINS CADD,ExAC,dbNSFP,UpDownDistance --DESTDIR $VEP_PATH --CACHEDIR $VEP_DATA + perl INSTALL.pl --AUTO afp --SPECIES homo_sapiens --ASSEMBLY GRCh38 --PLUGINS CADD,ExAC,dbNSFP,UpDownDistance --DESTDIR $VEP_PATH --CACHEDIR $VEP_DATA + perl INSTALL.pl --AUTO afp --SPECIES mus_musculus --ASSEMBLY GRCm38 --PLUGINS CADD,UpDownDistance --DESTDIR $VEP_PATH --CACHEDIR $VEP_DATA Convert the offline cache for use with tabix, that significantly speeds up the lookup of known variants: - perl convert_cache.pl --species homo_sapiens,mus_musculus --version 82_GRCh37,82_GRCh38,82_GRCm38 --dir $VEP_DATA + perl convert_cache.pl --species homo_sapiens,mus_musculus --version 83_GRCh37,83_GRCh38,83_GRCm38 --dir $VEP_DATA Download and index a custom ExAC r0.3 VCF, that skips variants overlapping known somatic hotspots: @@ -110,7 +111,7 @@ Download and index a custom ExAC r0.3 VCF, that skips variants overlapping known Test running VEP in offline mode with the ExAC plugin, on the provided sample GRCh37 VCF: - perl variant_effect_predictor.pl --species homo_sapiens --assembly GRCh37 --offline --no_progress --everything --shift_hgvs 1 --check_existing --check_alleles --total_length --allele_number --no_escape --xref_refseq --dir $VEP_DATA --fasta $VEP_DATA/homo_sapiens/82_GRCh37/Homo_sapiens.GRCh37.75.dna.primary_assembly.fa --plugin ExAC,$VEP_DATA/ExAC.r0.3.sites.minus_somatic.vcf.gz --input_file example_GRCh37.vcf --output_file example_GRCh37.vep.txt + perl variant_effect_predictor.pl --species homo_sapiens --assembly GRCh37 --offline --no_progress --everything --shift_hgvs 1 --check_existing --check_alleles --total_length --allele_number --no_escape --xref_refseq --dir $VEP_DATA --fasta $VEP_DATA/homo_sapiens/83_GRCh37/Homo_sapiens.GRCh37.75.dna.primary_assembly.fa.gz --plugin ExAC,$VEP_DATA/ExAC.r0.3.sites.minus_somatic.vcf.gz --input_file example_GRCh37.vcf --output_file example_GRCh37.vep.txt Authors ------- diff --git a/maf2maf.pl b/maf2maf.pl index 3885f4f..95e4fd3 100644 --- a/maf2maf.pl +++ b/maf2maf.pl @@ -16,8 +16,8 @@ my ( $tum_depth_col, $tum_rad_col, $tum_vad_col ) = qw( t_depth t_ref_count t_alt_count ); my ( $nrm_depth_col, $nrm_rad_col, $nrm_vad_col ) = qw( n_depth n_ref_count n_alt_count ); my ( $vep_path, $vep_data, $vep_forks, $ref_fasta ) = ( "$ENV{HOME}/vep", "$ENV{HOME}/.vep", 4, - "$ENV{HOME}/.vep/homo_sapiens/82_GRCh37/Homo_sapiens.GRCh37.75.dna.primary_assembly.fa.gz" ); -my ( $species, $ncbi_build, $maf_center, $min_hom_vaf ) = ( "homo_sapiens", "GRCh37", ".", 0.7 ); + "$ENV{HOME}/.vep/homo_sapiens/83_GRCh37/Homo_sapiens.GRCh37.75.dna.primary_assembly.fa.gz" ); +my ( $species, $ncbi_build, $cache_version, $maf_center, $min_hom_vaf ) = ( "homo_sapiens", "GRCh37", "", ".", 0.7 ); my $perl_bin = $Config{perlpath}; # Columns that can be safely borrowed from the input MAF @@ -68,6 +68,7 @@ 'vep-forks=s' => \$vep_forks, 'species=s' => \$species, 'ncbi-build=s' => \$ncbi_build, + 'cache-version=s' => \$cache_version, 'ref-fasta=s' => \$ref_fasta, ) or pod2usage( -verbose => 1, -input => \*DATA, -exitval => 2 ); pod2usage( -verbose => 1, -input => \*DATA, -exitval => 0 ) if( $help ); @@ -111,10 +112,15 @@ ( -s $ref_fasta ) or die "ERROR: Reference FASTA not found: $ref_fasta\n"; # Contruct VEP command using some default options and run it - my $vep_cmd = "$perl_bin $vep_path/variant_effect_predictor.pl --species $species --assembly $ncbi_build --offline --no_progress --no_stats --sift b --ccds --uniprot --hgvs --symbol --numbers --domains --gene_phenotype --regulatory --canonical --protein --biotype --uniprot --tsl --pubmed --variant_class --shift_hgvs 1 --check_existing --check_alleles --check_ref --total_length --allele_number --no_escape --xref_refseq --failed 1 --vcf --minimal --flag_pick_allele --pick_order canonical,tsl,biotype,rank,ccds,length --dir $vep_data --fasta $ref_fasta --input_file $vcf_file --output_file $vep_anno"; - $vep_cmd .= " --fork $vep_forks" if( $vep_forks > 1 ); # VEP barks if it's set to 1 + my $vep_cmd = "$perl_bin $vep_path/variant_effect_predictor.pl --species $species --assembly $ncbi_build --offline --no_progress --no_stats --sift b --ccds --uniprot --hgvs --symbol --numbers --domains --gene_phenotype --canonical --protein --biotype --uniprot --tsl --pubmed --variant_class --shift_hgvs 1 --check_existing --check_alleles --check_ref --total_length --allele_number --no_escape --xref_refseq --failed 1 --vcf --minimal --flag_pick_allele --pick_order canonical,tsl,biotype,rank,ccds,length --dir $vep_data --fasta $ref_fasta --input_file $vcf_file --output_file $vep_anno"; + # VEP barks if --fork is set to 1. So don't use this argument unless it's >1 + $vep_cmd .= " --fork $vep_forks" if( $vep_forks > 1 ); + # Add --cache-version only if the user specifically asked for a version + $vep_cmd .= " --cache_version $cache_version" if( $cache_version ); # Add options that only work on human variants $vep_cmd .= " --polyphen b --gmaf --maf_1kg --maf_esp" if( $species eq "homo_sapiens" ); + # Add options that work for most species, except a few + $vep_cmd .= " --regulatory" unless( $species eq "canis_familiaris" ); # Add options that only work on human variants mapped to the GRCh37 reference genome $vep_cmd .= " --plugin ExAC,$vep_data/ExAC.r0.3.sites.minus_somatic.vcf.gz" if( $species eq "homo_sapiens" and $ncbi_build eq "GRCh37" ); @@ -351,7 +357,8 @@ =head1 OPTIONS --vep-forks Number of forked processes to use when running VEP [4] --species Ensembl-friendly name of species (e.g. mus_musculus for mouse) [homo_sapiens] --ncbi-build NCBI reference assembly of variants MAF (e.g. GRCm38 for mouse) [GRCh37] - --ref-fasta Reference FASTA file [~/.vep/homo_sapiens/82_GRCh37/Homo_sapiens.GRCh37.75.dna.primary_assembly.fa.gz] + --cache-version Version of offline cache to use with VEP (e.g. 75, 82, 83) [Default: Installed version] + --ref-fasta Reference FASTA file [~/.vep/homo_sapiens/83_GRCh37/Homo_sapiens.GRCh37.75.dna.primary_assembly.fa.gz] --help Print a brief help message and quit --man Print the detailed manual diff --git a/maf2vcf.pl b/maf2vcf.pl index adbe967..5eedb25 100644 --- a/maf2vcf.pl +++ b/maf2vcf.pl @@ -9,7 +9,7 @@ use Pod::Usage qw( pod2usage ); # Set any default paths and constants -my $ref_fasta = "$ENV{HOME}/.vep/homo_sapiens/82_GRCh37/Homo_sapiens.GRCh37.75.dna.primary_assembly.fa.gz"; +my $ref_fasta = "$ENV{HOME}/.vep/homo_sapiens/83_GRCh37/Homo_sapiens.GRCh37.75.dna.primary_assembly.fa.gz"; my ( $tum_depth_col, $tum_rad_col, $tum_vad_col ) = qw( t_depth t_ref_count t_alt_count ); my ( $nrm_depth_col, $nrm_rad_col, $nrm_vad_col ) = qw( n_depth n_ref_count n_alt_count ); @@ -263,7 +263,7 @@ =head1 OPTIONS --input-maf Path to input file in MAF format --output-dir Path to output directory where VCFs will be stored, one per TN-pair --output-vcf Path to output multi-sample VCF containing all TN-pairs [/.vcf] - --ref-fasta Path to reference Fasta file [~/.vep/homo_sapiens/82_GRCh37/Homo_sapiens.GRCh37.75.dna.primary_assembly.fa.gz] + --ref-fasta Path to reference Fasta file [~/.vep/homo_sapiens/83_GRCh37/Homo_sapiens.GRCh37.75.dna.primary_assembly.fa.gz] --per-tn-vcfs Specify this to generate VCFs per-TN pair, in addition to the multi-sample VCF --tum-depth-col Name of MAF column for read depth in tumor BAM [t_depth] --tum-rad-col Name of MAF column for reference allele depth in tumor BAM [t_ref_count] diff --git a/vcf2maf.pl b/vcf2maf.pl index c009bac..6dd3773 100644 --- a/vcf2maf.pl +++ b/vcf2maf.pl @@ -11,8 +11,8 @@ # Set any default paths and constants my ( $tumor_id, $normal_id ) = ( "TUMOR", "NORMAL" ); -my ( $vep_path, $vep_data, $vep_forks, $ref_fasta ) = ( "$ENV{HOME}/vep", "$ENV{HOME}/.vep", 4, "$ENV{HOME}/.vep/homo_sapiens/82_GRCh37/Homo_sapiens.GRCh37.75.dna.primary_assembly.fa.gz" ); -my ( $species, $ncbi_build, $maf_center, $min_hom_vaf ) = ( "homo_sapiens", "GRCh37", ".", 0.7 ); +my ( $vep_path, $vep_data, $vep_forks, $ref_fasta ) = ( "$ENV{HOME}/vep", "$ENV{HOME}/.vep", 4, "$ENV{HOME}/.vep/homo_sapiens/83_GRCh37/Homo_sapiens.GRCh37.75.dna.primary_assembly.fa.gz" ); +my ( $species, $ncbi_build, $cache_version, $maf_center, $min_hom_vaf ) = ( "homo_sapiens", "GRCh37", "", ".", 0.7 ); my $perl_bin = $Config{perlpath}; # Hash to convert 3-letter amino-acid codes to their 1-letter codes @@ -222,10 +222,15 @@ sub GetBiotypePriority { ( -s $ref_fasta ) or die "ERROR: Reference FASTA not found: $ref_fasta\n"; # Contruct VEP command using some default options and run it - my $vep_cmd = "$perl_bin $vep_path/variant_effect_predictor.pl --species $species --assembly $ncbi_build --offline --no_progress --no_stats --sift b --ccds --uniprot --hgvs --symbol --numbers --domains --gene_phenotype --regulatory --canonical --protein --biotype --uniprot --tsl --pubmed --variant_class --shift_hgvs 1 --check_existing --check_alleles --check_ref --total_length --allele_number --no_escape --xref_refseq --failed 1 --vcf --minimal --flag_pick_allele --pick_order canonical,tsl,biotype,rank,ccds,length --dir $vep_data --fasta $ref_fasta --input_file $input_vcf --output_file $output_vcf"; - $vep_cmd .= " --fork $vep_forks" if( $vep_forks > 1 ); # VEP barks if it's set to 1 + my $vep_cmd = "$perl_bin $vep_path/variant_effect_predictor.pl --species $species --assembly $ncbi_build --offline --no_progress --no_stats --sift b --ccds --uniprot --hgvs --symbol --numbers --domains --gene_phenotype --canonical --protein --biotype --uniprot --tsl --pubmed --variant_class --shift_hgvs 1 --check_existing --check_alleles --check_ref --total_length --allele_number --no_escape --xref_refseq --failed 1 --vcf --minimal --flag_pick_allele --pick_order canonical,tsl,biotype,rank,ccds,length --dir $vep_data --fasta $ref_fasta --input_file $input_vcf --output_file $output_vcf"; + # VEP barks if --fork is set to 1. So don't use this argument unless it's >1 + $vep_cmd .= " --fork $vep_forks" if( $vep_forks > 1 ); + # Add --cache-version only if the user specifically asked for a version + $vep_cmd .= " --cache_version $cache_version" if( $cache_version ); # Add options that only work on human variants $vep_cmd .= " --polyphen b --gmaf --maf_1kg --maf_esp" if( $species eq "homo_sapiens" ); + # Add options that work for most species, except a few we know about + $vep_cmd .= " --regulatory" unless( $species eq "canis_familiaris" ); # Add options that only work on human variants mapped to the GRCh37 reference genome $vep_cmd .= " --plugin ExAC,$vep_data/ExAC.r0.3.sites.minus_somatic.vcf.gz" if( $species eq "homo_sapiens" and $ncbi_build eq "GRCh37" ); @@ -694,9 +699,10 @@ =head1 OPTIONS --vep-path Folder containing variant_effect_predictor.pl [~/vep] --vep-data VEP's base cache/plugin directory [~/.vep] --vep-forks Number of forked processes to use when running VEP [4] - --ref-fasta Reference FASTA file [~/.vep/homo_sapiens/82_GRCh37/Homo_sapiens.GRCh37.75.dna.primary_assembly.fa.gz] + --ref-fasta Reference FASTA file [~/.vep/homo_sapiens/83_GRCh37/Homo_sapiens.GRCh37.75.dna.primary_assembly.fa.gz] --species Ensembl-friendly name of species (e.g. mus_musculus for mouse) [homo_sapiens] --ncbi-build NCBI reference assembly of variants MAF (e.g. GRCm38 for mouse) [GRCh37] + --cache-version Version of offline cache to use with VEP (e.g. 75, 82, 83) [Default: Installed version] --maf-center Variant calling center to report in MAF [.] --min-hom-vaf If GT undefined in VCF, minimum allele fraction to call a variant homozygous [0.7] --help Print a brief help message and quit