From a6076f1706faed22044d901bb15b67b170100422 Mon Sep 17 00:00:00 2001 From: yunfeiguo Date: Thu, 13 Aug 2015 23:54:57 -0700 Subject: [PATCH] fix bug in venn diagram plotting --- bin/secondary/stats | 94 +++++++++++++++------------------ doc/User Guide/Manuals/stats.md | 78 +++++++++++++++++++++++++++ lib/SeqMule/Utils.pm | 27 ++++++++++ 3 files changed, 149 insertions(+), 50 deletions(-) create mode 100644 doc/User Guide/Manuals/stats.md diff --git a/bin/secondary/stats b/bin/secondary/stats index a25fa41..c2e5a07 100755 --- a/bin/secondary/stats +++ b/bin/secondary/stats @@ -24,6 +24,12 @@ use Getopt::Long qw/GetOptions/; my $ARGCOUNT=100000; #number of regions to extract using command line, max number if restricted by kernel config my $INDEL_EXTENSION = 10; #when determining indel overlapping, interalize indels by extending indel INDEL_EXTENSION bp towards both ends my $TMPDIR = $ENV{TMPDIR} || "/tmp"; +my %LEGAL_VAR_FILETYPE = ( + 'avinput' => 1, + 'vcf' => 1, + 'consensus' => 1, + 'vcf.gz' => 1, +); ################################################## my ($prefix, $bam,$vcf, @@ -111,18 +117,22 @@ if ($union_files) &conv2anno(split /,/,$union_files) ); } -if ($consensus_files) -{ +if ($consensus_files) { &var_consensus( &conv2anno(split /,/,$consensus_files) ); } -if ($venn) -{ +if ($venn) { my @files=split /,/,$venn; - @files=&SeqMule::Utils::getNonEmptyVCF(@files); - &venn( \@files,&conv2anno(&normalize_vcf(@files)) ); + &SeqMule::Utils::checkFileType(\%LEGAL_VAR_FILETYPE,1,@files); + + if(&SeqMule::Utils::getSuffix($files[0]) =~ /^vcf$|^vcf\.gz$/) { + @files=&SeqMule::Utils::getNonEmptyVCF(@files); + &venn( \@files,&conv2anno(&normalize_vcf(@files)) ); + } else { + &venn( \@files,&conv2anno(@files) ); + } } if (defined $capture) @@ -809,8 +819,7 @@ sub coverage_stat } #generate Venn Diagram -sub venn -{ +sub venn { warn "NOTICE: Do not support more than 5 files right now\n" and return undef unless (@_<=5); warn "NOTICE: No need to plot for single file\n" and return undef if (@_==1); @@ -838,16 +847,14 @@ sub venn my $non_snp_index_pool; #remove empty file - for (@avinput_pool) - { + for (@avinput_pool) { $_=undef unless (-s $_ >0); } @avinput_pool=grep {defined $_} @avinput_pool; warn "WARNING: failed to plot Venn diagram, number of nonempty files is more than 5 or smaller than 2.\n" and return unless @avinput_pool<=5 && @avinput_pool>=2; #generate index matrix - for my $count(0..$#avinput_pool) - { + for my $count(0..$#avinput_pool) { #*_index_matrix is array of array #first layer is array of file index #2nd layer is array of file name plus variants @@ -857,18 +864,19 @@ sub venn open IN,"<",$input or die "Cannot open $input:$!\n"; $snp_index_matrix[$count]=[]; $non_snp_index_matrix[$count]=[]; - while () - { + my $illegal_input_count = 0; + while () { #chr start end ref obs ... #1 2 3 4 5 - /^([^\t]+)\t([^\t]+)\t([^\t]+)\t([^\t]+)\t([^\t]+)\t.*$/ or next; + unless(/^([^\t]+)\t([^\t]+)\t([^\t]+)\t([^\t]+)\t([^\t]+)\t?.*$/) { + $illegal_input_count++; + next; + } - if ($2 == $3 and length($4) == 1 and length($5) == 1 ) #start == end means this is a SNP - { + if ($2 == $3 and length($4) == 1 and length($5) == 1 ) {#start == end means this is a SNP my $key=[$1,$2,$3,$4,$5]; push @{$snp_index_matrix[$count]},$key and $snp_appear{$key}=1 unless $snp_appear{$key}; - } else - { + } else { #for indels, MVNs #use intervalization of 20bp (10bp in each direction) #as long as there's an overlap of interval @@ -879,11 +887,11 @@ sub venn } } close IN; + warn "WARNING: $input has $illegal_input_count illegal lines, expect >= 5 tab delimited fields\n" if $illegal_input_count > 0; } @non_snp_index_matrix = &gen_index_for_interval(\@non_snp_index_matrix); - for my $count(0..$#avinput_name) - { + for my $count(0..$#avinput_name) { #add file name to head of array my $input = $avinput_name[$count]; my ($name_prefix) = $input=~/^(.*)(\.avinput|\.tmp|\.vcf|\.consensus)$/i; @@ -904,8 +912,7 @@ sub venn &genPlot("SNV",$snp_index_pool,scalar @avinput_pool); &genPlot("NON-SNV",$non_snp_index_pool,scalar @avinput_pool); } -sub gen_index_for_interval -{ +sub gen_index_for_interval { #be careful here #we are dealing with references #so any change to the content may have unexpected side effects @@ -992,14 +999,12 @@ sub gen_index_for_interval #and there shouldn't be duplicates return @new; } -sub write_index_pool -{ +sub write_index_pool { my $max=0; my $outfile = shift; my @index_matrix = @_; - for my $i(0..$#index_matrix) - { + for my $i(0..$#index_matrix) { $max = @{$index_matrix[$i]} if @{$index_matrix[$i]} > $max; } @@ -1009,9 +1014,8 @@ sub write_index_pool my @line; for my $j(0..$#index_matrix) { - if(defined $index_matrix[$j][$i]) - { - push @line,(join "+",@{$index_matrix[$j][$i]}); + if(defined $index_matrix[$j][$i] and @{$index_matrix[$j][$i]}>0) { + push @line,(join "+",(grep {defined} @{$index_matrix[$j][$i]})); } else { push @line,"NA"; @@ -1021,8 +1025,7 @@ sub write_index_pool } close OUT; } -sub genPlot -{ +sub genPlot { my $category = shift; #SNP or NON-SNP die "ERROR: category missing" unless defined $category; my $variant_index_pool=shift; @@ -1044,8 +1047,7 @@ sub genPlot "y=list()\n". "title='Overlapping of $category variants'\n"; #convert dataframe to list such that na=remove works - for my $no(1..$count) - { + for my $no(1..$count) { $rscript.= "y[[$no]]=x[[$no]]\nnames(y)[$no]=$no\n". "tmp=paste($no,names(x)[$no],sep=':')\n". @@ -1066,41 +1068,35 @@ sub genPlot }\n"; #high resolution tiff output - if ($count==5) - { + if ($count==5) { $rscript.=" venn.diagram(y,fill=rainbow(length(y)),filename='$tiff',na=\"remove\",cex=0.65,main=title,main.cex=0.7,main.pos=c(0.5,1.2),main.just=c(0,1),ext.dist=-0.1,ext.length=0.9) "; - } elsif ($count==4) - { + } elsif ($count==4) { $rscript.=" venn.diagram(y,fill=rainbow(length(y)),filename='$tiff',na=\"remove\",cex=0.8,main=title,main.cex=0.7,main.pos=c(0.5,1.2),main.just=c(0,1),ext.dist=-0.1,ext.length=0.9) "; - } else - { + } else { $rscript.=" venn.diagram(y,fill=rainbow(length(y)),filename='$tiff',na=\"remove\",cex=1,main=title,main.cex=0.7,main.pos=c(0.5,1.2),main.just=c(0,1),ext.dist=0,ext.length=0.9) "; } #jpeg output for html report - if ($count==5) - { + if ($count==5) { $rscript.=" obj=venn.diagram(y,fill=rainbow(length(y)),filename=NULL,na=\"remove\",cex=0.65,main=title,main.cex=0.7,main.pos=c(0.5,1.2),main.just=c(0,1),ext.dist=-0.1,ext.length=0.9) jpeg('$jpg',width=5,height=5,units='in',res=500) grid.draw(obj) garbage <- dev.off() "; - } elsif ($count==4) - { + } elsif ($count==4) { $rscript.=" obj=venn.diagram(y,fill=rainbow(length(y)),filename=NULL,na=\"remove\",cex=0.8,main=title,main.cex=0.7,main.pos=c(0.5,1.2),main.just=c(0,1),ext.dist=-0.1,ext.length=0.9) jpeg('$jpg',width=5,height=5,units='in',res=500) grid.draw(obj) garbage <- dev.off() "; - } else - { + } else { $rscript.=" obj=venn.diagram(y,fill=rainbow(length(y)),filename=NULL,na=\"remove\",cex=1,main=title,main.cex=0.7,main.pos=c(0.5,1.2),main.just=c(0,1),ext.dist=0,ext.length=0.9) jpeg('$jpg',width=5,height=5,units='in',res=500) @@ -1117,13 +1113,11 @@ sub genPlot } #convert files to annovar input format for easy handling -sub conv2anno -{ +sub conv2anno { my @files=@_; my @return_files; my $exe_conver=&getExe("convert2annovar.pl"); - for (@files) - { + for (@files) { if (/^(.*)\.vcf(\.gz)?$/i) { my $local_prefix=$1; diff --git a/doc/User Guide/Manuals/stats.md b/doc/User Guide/Manuals/stats.md new file mode 100644 index 0000000..9812c2f --- /dev/null +++ b/doc/User Guide/Manuals/stats.md @@ -0,0 +1,78 @@ +# NAME + +SeqMule an automatic pipeline for next-generation sequencing data analysis + +# SYNOPSIS + +seqmule stats + +For details, please use 'seqmule stats -h': + +Options: + + --prefix,-p output prefix. Mandatory for multiple input files. + --bam a sorted BAM file (used with --capture, --aln) + --capture [BED] a BED file for capture regions (or any other regions of interest). Effective for --bam and --vcf. + --vcf output variant stats for a VCF file. If a BED file is supplied, extract variants based on the BED file. + --aln output alignment stats for a BAM file + --consensus,-c comma separated list of files for extracting consensus calls. + VCF4 and SOAPsnp *.consensus format or ANNOVAR *.avinput required. + --union,-u comma separated list of files for pooling variants (same format as above). + --venn comma separated list of files for Venn diagram plotting (same format as above). + --c-vcf comma separated list of SORTED VCF files for extracting consensus calls. *.vcf or *.vcf.gz suffix required + --u-vcf comma separated list of SORTED VCF files for extracting union calls. *.vcf or *.vcf.gz suffix required + --ref reference file in FASTA format. Effective for --c-vcf and --u-vcf. + -s,--sample sample name for VCF file, used for -vcf, -u, -venn, -c options. + --plink convert VCF to PLINK format (PED,MAP). Only works with --vcf option. + --mendel-stat generate Mendelian error statistics + --paternal sample ID for paternal ID (case-sensitive). Rest are either maternal or offspring. Only one family allowed. + --maternal sample ID for maternal ID (case-sensitive). Rest are either paternal or offspring. Only one family allowed. + -N extract variants appearing in at least N input files. Currently only effective for --c-vcf option. + --jmem max java memory. Only effective for --c-vcf and --u-vcf. Default: 1750m + --jexe Java executable path. Default: java + -t number of threads. Only effective for --aln, --c-vcf and --u-vcf. Default: 1 + --tmpdir use DIR for storing large temporary files. Default: $TMPDIR(in your ENV variables) or /tmp + --nofilter If specified, consider all variants, otherwise, only unfiltered variants. + -h,--help help + --noclean do not clean temporary files + -v,--verbose verbose + + + EXAMPLE + + #draw Venn Diagram to examine overlapping between different VCF files + seqmule stats -p gatk-soap-varscan -venn gatk.vcf,soap.avinput,varscan.vcf + + #extract union of all variants, ouput in ANNOVAR format + seqmule stats -p gatk-soap-varscan -u gatk.vcf,soap.avinput,varscan.vcf + + #extract consensus of all variants, output in ANNOVAR format + seqmule stats -p gatk_soap_varscan -c gatk.vcf,soap.avinput,varscan.vcf + + #extract consensus of all variants, output in VCF format + seqmule stats -p gatk_soap_varscan -c-vcf gatk.vcf,soapsnp.vcf,varscan.vcf -ref hg19.fa + + #extract union of all variants, output in VCF format + seqmule stats -p gatk_soap_varscan -u-vcf gatk.vcf,soapsnp.vcf,varscan.vcf -ref hg19.fa + + #generate coverage statistics for specified region (region.bed) + seqmule stats -p sample -capture region.bed --bam sample.bam + + #generate alignment statistics + seqmule stats -bam sample.bam -aln + + #generate variant statistics + seqmule stats -vcf sample.vcf + + #extract variants in specified region generate variant statistics + seqmule stats -vcf sample.vcf -capture region.bed + + #generate Mendelian error statistics + #NOTE, sample.vcf contains 3 samples! + seqmule stats -vcf sample.vcf --plink --mendel-stat --paternal father --maternal mother + +# OPTIONS + +- **--capture** + + SeqMule automatizes analysis of next-generation sequencing data by simplifying program installation, downloading of various databases, generation of analysis script, and customization of your pipeline. diff --git a/lib/SeqMule/Utils.pm b/lib/SeqMule/Utils.pm index 0f691a1..e1af41b 100644 --- a/lib/SeqMule/Utils.pm +++ b/lib/SeqMule/Utils.pm @@ -13,6 +13,33 @@ no warnings 'File::Find'; use SeqMule::SeqUtils; +sub checkFileType { + #take accepted filetype (hash ref) + #and a list of files + #make sure file types are all accepted and are the same + my $ft = shift; + my $same_flag = shift; #require all files to be the same? + my @files = @_; + my $first = shift @files; + my $first_ft = &getSuffix($first); + unless ($ft->{$first_ft}) { + croak("ERROR: illegal file type ($first_ft), only accept ".(join(",",keys %$ft))."\n"); + } + if($same_flag) { + for my $i(@files) { + my $i_ft = &getSuffix($i); + croak("ERROR: suffix $i_ft is not same as suffix $first_ft\n") unless $i_ft eq $first_ft; + } + } +} +sub getSuffix { + my $file = shift; + my ($suffix) = $file=~/\.([^\.]+)$/ or croak("ERROR: did not find suffix ($file)\n"); + if($suffix eq 'gz') { + return &getSuffix($file=~/(.*?)\.$suffix$/).".$suffix"; + } + return $suffix; +} sub get_rank_by_mergingrule { #given index for sample, index for file, and merging rule