-
Notifications
You must be signed in to change notification settings - Fork 0
/
cleanSamBWT.pl
92 lines (81 loc) · 2.58 KB
/
cleanSamBWT.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
#!/usr/bin/perl
use Getopt::Long;
use Data::Dumper;
use Cwd 'abs_path';
GetOptions(\%opts, "infile|i=s", "backbone|b=s", "minOvlp=s", "keepEnds!", "help|h!");
sub usage(){
die "USAGE :: perl cleanSamBWT.pl -i inFile -b backboneFile.fasta -minOvlp percentage100Scale [-help|-h]\n\n
-infile|i\n\tSam file of reads vs the backbone [String]\n\n
-backbone|b\n\tFASTA of backbone sequence the backbone [String]\n\n
-minOvlp\n\tMinimum alignment overlap percentage to keep [Real, (0,100] ]\n\n
-keepEnds\n\tKeep alignments shorter than minAln length if they are at the ends (make sense for expanding backbones) [Boolean]\n\n
-help or -help\n\tGet help for using this script\n\n";
}
if ($opts{'help'} || !$opts{'infile'} || !$opts{'minOvlp'}){
if (!$opts{'infile'}){
print "infile missing, please check usage manual\n\n";
}
if (!$opts{'minOvlp'}){
print "No minimum alignement length specified, won't run the script cause it doesn't make sense, sorry, please check usage manual\n\n";
}
&usage;
}
# Variable definition
## Capture options
my $inFile= $opts{'infile'};
my $backboneFile= $opts{'backbone'};
my $minOverlap= $opts{'minOvlp'};
my $keepEnds= $opts{'keepEnds'};
## Other variables
my %backboneLength=();
my $line='';
my $cigarString='';
my $start=0;
my $strand=0;
my $sequence='';
my $id='';
my $skippedLeft=0;
my $skippedRight=0;
# Main program
open (BACKBONE, "$backboneFile") or die ("Unable to open file for reading: $backboneFile\n$!\n");
while ($line=<BACKBONE>){
chomp ($line);
if ($line =~ /^>(\S+)/){
$id=$1;
$backboneLength{$id}= 0;
next;
}
$backboneLength{$id}+= length($line);
}
close (BACKBONE);
open (OUTSAM , ">$inFile.clean") or die ("Unable to create file for writting: $inFile.clean\n$!\n");
open (INSAM, "$inFile") or die ("Unable to open file for reading: $inFile\n$!\n");
while ($line=<INSAM>){
if ($line =~ /^@/){
next;
}
if ($line=~ m/^\S+\s+(\S+)\s+(\S+)\s+(\S+)\s+\S+\s+(\S+)\s+\S+\s+\S+\s+\S+\s+(\S+)/){
$strand=$1;
$id=$2;
$start=$3;
$cigarString=$4;
$sequence=$5;
$alnLength=0;
while ($cigarString=~ m/(\d+)[MI]/g){
$alnLength+=$1;
}
$skippedLeft=0;
$skippedRight=0;
if ($cigarString=~ m/^(\d+)S/){
$skippedLeft=$1;
}
if ($cigarString=~ m/(\d+)S$/){
$skippedRight=$1;
}
if ($keepEnds && ($alnLength/length($sequence))*100 >= $minOverlap || (($alnLength/length($sequence))*100 < $minOverlap && ( ($start - $skippedLeft <= 0 && $skippedRight<=4) || ($start+$skippedLeft+$alnLength+$skippedRight >= $backboneLength{$id} && $skippedLeft<=4) ) )){
print OUTSAM $line;
}
}
}
close (INSAM);
close (OUTSAM);