From a6076f1706faed22044d901bb15b67b170100422 Mon Sep 17 00:00:00 2001
From: yunfeiguo <yunfeigu@usc.edu>
Date: Thu, 13 Aug 2015 23:54:57 -0700
Subject: [PATCH] fix bug in venn diagram plotting

---
 bin/secondary/stats             | 94 +++++++++++++++------------------
 doc/User Guide/Manuals/stats.md | 78 +++++++++++++++++++++++++++
 lib/SeqMule/Utils.pm            | 27 ++++++++++
 3 files changed, 149 insertions(+), 50 deletions(-)
 create mode 100644 doc/User Guide/Manuals/stats.md
diff --git a/bin/secondary/stats b/bin/secondary/stats
index a25fa41..c2e5a07 100755
--- a/bin/secondary/stats
+++ b/bin/secondary/stats
@@ -24,6 +24,12 @@ use Getopt::Long qw/GetOptions/;
 my $ARGCOUNT=100000; #number of regions to extract using command line, max number if restricted by kernel config
 my $INDEL_EXTENSION = 10; #when determining indel overlapping, interalize indels by extending indel INDEL_EXTENSION bp towards both ends
 my $TMPDIR = $ENV{TMPDIR} || "/tmp";
+my %LEGAL_VAR_FILETYPE = (
+    'avinput'	=>	1,
+    'vcf'	=>	1,
+    'consensus'	=>	1,
+    'vcf.gz'	=>	1,
+);
 ##################################################
 my ($prefix,
     $bam,$vcf,
@@ -111,18 +117,22 @@ if ($union_files)
 	&conv2anno(split /,/,$union_files) );
 }
 
-if ($consensus_files)
-{
+if ($consensus_files) {
     &var_consensus(
 	&conv2anno(split /,/,$consensus_files) 
     );
 }
 
-if ($venn)
-{
+if ($venn) {
     my @files=split /,/,$venn;
-    @files=&SeqMule::Utils::getNonEmptyVCF(@files);
-    &venn( \@files,&conv2anno(&normalize_vcf(@files)) );
+    &SeqMule::Utils::checkFileType(\%LEGAL_VAR_FILETYPE,1,@files);
+
+    if(&SeqMule::Utils::getSuffix($files[0]) =~ /^vcf$|^vcf\.gz$/) {
+	@files=&SeqMule::Utils::getNonEmptyVCF(@files);
+	&venn( \@files,&conv2anno(&normalize_vcf(@files)) );
+    } else {
+	&venn( \@files,&conv2anno(@files) );
+    }
 }
 
 if (defined $capture) 
@@ -809,8 +819,7 @@ sub coverage_stat
 }
 
 #generate Venn Diagram
-sub venn 
-{
+sub venn {
     warn "NOTICE: Do not support more than 5 files right now\n" and return undef unless (@_<=5);
     warn "NOTICE: No need to plot for single file\n" and return undef if (@_==1);
 
@@ -838,16 +847,14 @@ sub venn
     my $non_snp_index_pool;
 
     #remove empty file
-    for (@avinput_pool)
-    {
+    for (@avinput_pool) {
 	$_=undef unless (-s $_ >0);
     }
     @avinput_pool=grep {defined $_} @avinput_pool;
     warn "WARNING: failed to plot Venn diagram, number of nonempty files is more than 5 or smaller than 2.\n" and return unless @avinput_pool<=5 && @avinput_pool>=2;
 
     #generate index matrix
-    for my $count(0..$#avinput_pool)
-    {
+    for my $count(0..$#avinput_pool) {
 	#*_index_matrix is array of array
 	#first layer is array of file index
 	#2nd layer is array of file name plus variants
@@ -857,18 +864,19 @@ sub venn
 	open IN,"<",$input or die "Cannot open $input:$!\n";
 	$snp_index_matrix[$count]=[];
 	$non_snp_index_matrix[$count]=[];
-	while (<IN>) 
-	{
+	my $illegal_input_count = 0;
+	while (<IN>) {
 	    #chr	start	end	ref	obs	...
 	    #1		2	3	4	5
-	    /^([^\t]+)\t([^\t]+)\t([^\t]+)\t([^\t]+)\t([^\t]+)\t.*$/ or next;
+	    unless(/^([^\t]+)\t([^\t]+)\t([^\t]+)\t([^\t]+)\t([^\t]+)\t?.*$/) {
+		$illegal_input_count++;
+		next;
+	    }
 
-	    if ($2 == $3 and length($4) == 1 and length($5) == 1 ) #start == end means this is a SNP
-	    {
+	    if ($2 == $3 and length($4) == 1 and length($5) == 1 ) {#start == end means this is a SNP 
 		my $key=[$1,$2,$3,$4,$5];
 		push @{$snp_index_matrix[$count]},$key and $snp_appear{$key}=1 unless $snp_appear{$key};
-	    } else
-	    {
+	    } else {
 		#for indels, MVNs
 		#use intervalization of 20bp (10bp in each direction)
 		#as long as there's an overlap of interval
@@ -879,11 +887,11 @@ sub venn
 	    }
 	}
 	close IN;
+	warn "WARNING: $input has $illegal_input_count illegal lines, expect >= 5 tab delimited fields\n" if $illegal_input_count > 0;
     }
 
     @non_snp_index_matrix = &gen_index_for_interval(\@non_snp_index_matrix);
-    for my $count(0..$#avinput_name)
-    {
+    for my $count(0..$#avinput_name) {
 	#add file name to head of array
 	my $input = $avinput_name[$count];
 	my ($name_prefix) = $input=~/^(.*)(\.avinput|\.tmp|\.vcf|\.consensus)$/i;
@@ -904,8 +912,7 @@ sub venn
     &genPlot("SNV",$snp_index_pool,scalar @avinput_pool);
     &genPlot("NON-SNV",$non_snp_index_pool,scalar @avinput_pool);
 }
-sub gen_index_for_interval
-{
+sub gen_index_for_interval {
     #be careful here
     #we are dealing with references
     #so any change to the content may have unexpected side effects
@@ -992,14 +999,12 @@ sub gen_index_for_interval
     #and there shouldn't be duplicates
     return @new;
 }
-sub write_index_pool
-{
+sub write_index_pool {
     my $max=0;
     my $outfile = shift;
     my @index_matrix = @_;
 
-    for my $i(0..$#index_matrix)
-    {
+    for my $i(0..$#index_matrix) {
 	$max = @{$index_matrix[$i]} if @{$index_matrix[$i]} > $max;
     }
 
@@ -1009,9 +1014,8 @@ sub write_index_pool
 	my @line;
 	for my $j(0..$#index_matrix)
 	{
-	    if(defined $index_matrix[$j][$i])
-	    {
-		push @line,(join "+",@{$index_matrix[$j][$i]});
+	    if(defined $index_matrix[$j][$i] and @{$index_matrix[$j][$i]}>0) {
+		push @line,(join "+",(grep {defined} @{$index_matrix[$j][$i]}));
 	    } else
 	    {
 		push @line,"NA";
@@ -1021,8 +1025,7 @@ sub write_index_pool
     }
     close OUT;
 }
-sub genPlot
-{
+sub genPlot {
     my $category = shift; #SNP or NON-SNP
     die "ERROR: category missing" unless defined $category;
     my $variant_index_pool=shift;
@@ -1044,8 +1047,7 @@ sub genPlot
     "y=list()\n".
     "title='Overlapping of $category variants'\n";
     #convert dataframe to list such that na=remove works
-    for my $no(1..$count) 
-    {
+    for my $no(1..$count) {
 	$rscript.=
 	"y[[$no]]=x[[$no]]\nnames(y)[$no]=$no\n".
 	"tmp=paste($no,names(x)[$no],sep=':')\n".
@@ -1066,41 +1068,35 @@ sub genPlot
     }\n";
 
     #high resolution tiff output
-    if ($count==5)
-    {
+    if ($count==5) {
 	$rscript.="
 	venn.diagram(y,fill=rainbow(length(y)),filename='$tiff',na=\"remove\",cex=0.65,main=title,main.cex=0.7,main.pos=c(0.5,1.2),main.just=c(0,1),ext.dist=-0.1,ext.length=0.9)
 	";
-    } elsif ($count==4) 
-    {
+    } elsif ($count==4) {
 	$rscript.="
 	venn.diagram(y,fill=rainbow(length(y)),filename='$tiff',na=\"remove\",cex=0.8,main=title,main.cex=0.7,main.pos=c(0.5,1.2),main.just=c(0,1),ext.dist=-0.1,ext.length=0.9)
 	";
-    } else
-    {
+    } else {
 	$rscript.="
 	venn.diagram(y,fill=rainbow(length(y)),filename='$tiff',na=\"remove\",cex=1,main=title,main.cex=0.7,main.pos=c(0.5,1.2),main.just=c(0,1),ext.dist=0,ext.length=0.9)
 	";
     }
 #jpeg output for html report
-    if ($count==5)
-    {
+    if ($count==5) {
 	$rscript.="
 	obj=venn.diagram(y,fill=rainbow(length(y)),filename=NULL,na=\"remove\",cex=0.65,main=title,main.cex=0.7,main.pos=c(0.5,1.2),main.just=c(0,1),ext.dist=-0.1,ext.length=0.9)
 	jpeg('$jpg',width=5,height=5,units='in',res=500)
 	grid.draw(obj)
 	garbage <- dev.off()
 	";
-    } elsif ($count==4) 
-    {
+    } elsif ($count==4) {
 	$rscript.="
 	obj=venn.diagram(y,fill=rainbow(length(y)),filename=NULL,na=\"remove\",cex=0.8,main=title,main.cex=0.7,main.pos=c(0.5,1.2),main.just=c(0,1),ext.dist=-0.1,ext.length=0.9)
 	jpeg('$jpg',width=5,height=5,units='in',res=500)
 	grid.draw(obj)
 	garbage <- dev.off()
 	";
-    } else
-    {
+    } else {
 	$rscript.="
 	obj=venn.diagram(y,fill=rainbow(length(y)),filename=NULL,na=\"remove\",cex=1,main=title,main.cex=0.7,main.pos=c(0.5,1.2),main.just=c(0,1),ext.dist=0,ext.length=0.9)
 	jpeg('$jpg',width=5,height=5,units='in',res=500)
@@ -1117,13 +1113,11 @@ sub genPlot
 }
 
 #convert files to annovar input format for easy handling
-sub conv2anno 
-{
+sub conv2anno {
     my @files=@_;
     my @return_files;
     my $exe_conver=&getExe("convert2annovar.pl");
-    for (@files) 
-    {
+    for (@files) {
 	if (/^(.*)\.vcf(\.gz)?$/i)
 	{
 	    my $local_prefix=$1;
diff --git a/doc/User Guide/Manuals/stats.md b/doc/User Guide/Manuals/stats.md
new file mode 100644
index 0000000..9812c2f
--- /dev/null
+++ b/doc/User Guide/Manuals/stats.md	
@@ -0,0 +1,78 @@
+# NAME
+
+SeqMule an automatic pipeline for next-generation sequencing data analysis
+
+# SYNOPSIS
+
+seqmule stats <options>
+
+For details, please use 'seqmule stats -h':
+
+Options:
+
+        --prefix,-p <STRING>      output prefix. Mandatory for multiple input files.
+        --bam <BAM>               a sorted BAM file (used with --capture, --aln)
+        --capture [BED]           a BED file for capture regions (or any other regions of interest). Effective for --bam and --vcf.
+        --vcf <VCF>               output variant stats for a VCF file. If a BED file is supplied, extract variants based on the BED file.
+        --aln                     output alignment stats for a BAM file
+        --consensus,-c <LIST>     comma separated list of files for extracting consensus calls. 
+                                  VCF4 and SOAPsnp *.consensus format or ANNOVAR *.avinput required.
+        --union,-u <LIST>         comma separated list of files for pooling variants (same format as above).
+        --venn <LIST>             comma separated list of files for Venn diagram plotting (same format as above).
+        --c-vcf <LIST>            comma separated list of SORTED VCF files for extracting consensus calls. *.vcf or *.vcf.gz suffix required
+        --u-vcf <LIST>            comma separated list of SORTED VCF files for extracting union calls. *.vcf or *.vcf.gz suffix required
+        --ref <FASTA>             reference file in FASTA format. Effective for --c-vcf and --u-vcf.
+        -s,--sample <STRING>      sample name for VCF file, used for -vcf, -u, -venn, -c options.
+        --plink                   convert VCF to PLINK format (PED,MAP). Only works with --vcf option.
+        --mendel-stat             generate Mendelian error statistics
+        --paternal <STRING>       sample ID for paternal ID (case-sensitive). Rest are either maternal or offspring. Only one family allowed.
+        --maternal <STRING>       sample ID for maternal ID (case-sensitive). Rest are either paternal or offspring. Only one family allowed.
+        -N <INT>                  extract variants appearing in at least N input files. Currently only effective for --c-vcf option.
+        --jmem <STRING>           max java memory. Only effective for --c-vcf and --u-vcf. Default: 1750m
+        --jexe <STRING>           Java executable path. Default: java
+        -t <INT>                  number of threads. Only effective for --aln, --c-vcf and --u-vcf. Default: 1
+        --tmpdir <DIR>            use DIR for storing large temporary files. Default: $TMPDIR(in your ENV variables) or /tmp
+        --nofilter                If specified, consider all variants, otherwise, only unfiltered variants.
+        -h,--help                 help
+        --noclean                 do not clean temporary files
+        -v,--verbose              verbose
+
+
+        EXAMPLE 
+
+        #draw Venn Diagram to examine overlapping between different VCF files
+        seqmule stats -p gatk-soap-varscan -venn gatk.vcf,soap.avinput,varscan.vcf
+
+        #extract union of all variants, ouput in ANNOVAR format
+        seqmule stats -p gatk-soap-varscan -u gatk.vcf,soap.avinput,varscan.vcf
+
+        #extract consensus of all variants, output in ANNOVAR format
+        seqmule stats -p gatk_soap_varscan -c gatk.vcf,soap.avinput,varscan.vcf
+
+        #extract consensus of all variants, output in VCF format
+        seqmule stats -p gatk_soap_varscan -c-vcf gatk.vcf,soapsnp.vcf,varscan.vcf -ref hg19.fa
+
+        #extract union of all variants, output in VCF format
+        seqmule stats -p gatk_soap_varscan -u-vcf  gatk.vcf,soapsnp.vcf,varscan.vcf -ref hg19.fa
+
+        #generate coverage statistics for specified region (region.bed)
+        seqmule stats -p sample -capture region.bed --bam sample.bam
+
+        #generate alignment statistics
+        seqmule stats -bam sample.bam -aln
+
+        #generate variant statistics
+        seqmule stats -vcf sample.vcf
+
+        #extract variants in specified region generate variant statistics
+        seqmule stats -vcf sample.vcf -capture region.bed
+
+        #generate Mendelian error statistics
+        #NOTE, sample.vcf contains 3 samples!
+        seqmule stats -vcf sample.vcf --plink --mendel-stat --paternal father --maternal mother
+
+# OPTIONS
+
+- **--capture**
+
+    SeqMule automatizes analysis of next-generation sequencing data by simplifying program installation, downloading of various databases, generation of analysis script, and customization of your pipeline.
diff --git a/lib/SeqMule/Utils.pm b/lib/SeqMule/Utils.pm
index 0f691a1..e1af41b 100644
--- a/lib/SeqMule/Utils.pm
+++ b/lib/SeqMule/Utils.pm
@@ -13,6 +13,33 @@ no warnings 'File::Find';
 use SeqMule::SeqUtils;
 
 
+sub checkFileType {
+    #take accepted filetype (hash ref)
+    #and a list of files
+    #make sure file types are all accepted and are the same
+    my $ft = shift; 
+    my $same_flag = shift; #require all files to be the same?
+    my @files = @_;
+    my $first = shift @files;
+    my $first_ft = &getSuffix($first);
+    unless ($ft->{$first_ft}) {
+	croak("ERROR: illegal file type ($first_ft), only accept ".(join(",",keys %$ft))."\n");
+    }
+    if($same_flag) {
+	for my $i(@files) {
+	    my $i_ft = &getSuffix($i);
+	    croak("ERROR: suffix $i_ft is not same as suffix $first_ft\n") unless $i_ft eq $first_ft;
+	}
+    }
+}
+sub getSuffix {
+    my $file = shift;
+    my ($suffix) = $file=~/\.([^\.]+)$/ or croak("ERROR: did not find suffix ($file)\n");
+    if($suffix eq 'gz') {
+	return &getSuffix($file=~/(.*?)\.$suffix$/).".$suffix";
+    }
+    return $suffix;
+}
 sub get_rank_by_mergingrule
 {
     #given index for sample, index for file, and merging rule