-
Notifications
You must be signed in to change notification settings - Fork 2
/
Gene_overlap_check.pm
135 lines (84 loc) · 3.23 KB
/
Gene_overlap_check.pm
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
#!/usr/bin/env perl
package Gene_overlap_check;
use strict;
use warnings;
use Carp;
my %GENE_TO_SPAN_INFO;
my $singleton_obj = undef;
sub new {
my ($packagename, $gene_span_file) = @_;
if ($singleton_obj) {
return($singleton_obj);
}
else {
$singleton_obj = {};
bless ($singleton_obj, $packagename);
&_parse_gene_span_info($gene_span_file);
return($singleton_obj);
}
}
####
sub get_gene_span_info {
my ($self, $gene_id) = @_;
my $struct = $GENE_TO_SPAN_INFO{$gene_id} or confess "Error, no gene span info stored for gene $gene_id";
return($struct);
}
####
sub are_genes_overlapping {
my ($self, $geneA, $geneB) = @_;
my $genome_span_A_href = $GENE_TO_SPAN_INFO{$geneA} or confess "Error, no gene span info found for $geneA";
my $genome_span_B_href = $GENE_TO_SPAN_INFO{$geneB} or confess "Error, no gene span info found for $geneB";
if ($genome_span_A_href->{chr} eq $genome_span_B_href->{chr}
&&
## coordinate overlap testing
$genome_span_A_href->{lend} < $genome_span_B_href->{rend}
&&
$genome_span_A_href->{rend} > $genome_span_B_href->{lend}
) {
return(1);
}
else {
return(0);
}
}
####
sub pct_overlap_shorter_length {
my ($self, $geneA, $geneB) = @_;
if (! $self->are_genes_overlapping($geneA, $geneB)) {
return(0);
}
my $genome_span_A_href = $GENE_TO_SPAN_INFO{$geneA} or confess "Error, no gene span info found for $geneA";
my $genome_span_B_href = $GENE_TO_SPAN_INFO{$geneB} or confess "Error, no gene span info found for $geneB";
my $len_gene_A = $genome_span_A_href->{rend} - $genome_span_A_href->{lend} + 1;
my $len_gene_B = $genome_span_B_href->{rend} - $genome_span_B_href->{lend} + 1;
my $shorter_len = ($len_gene_A < $len_gene_B) ? $len_gene_A : $len_gene_B;
my @coords = sort {$a<=>$b} ($genome_span_A_href->{lend}, $genome_span_A_href->{rend},
$genome_span_B_href->{lend}, $genome_span_B_href->{rend});
# overlap is based on internal sorted coords.
my $overlap_len = $coords[2] - $coords[1] + 1;
my $pct_overlap = $overlap_len / $shorter_len * 100;
return($pct_overlap);
}
############ PRIVATE
####
sub _parse_gene_span_info {
my ($gene_spans_file) = @_;
open (my $fh, $gene_spans_file) or confess "Error, cannot open file: $gene_spans_file .... be sure to have run prep_genome_lib.pl to generate it";
while (<$fh>) {
chomp;
my @x = split(/\t/);
my $gene_id = $x[0];
my $gene_symbol = $x[5];
my $chr = $x[1];
my ($lend, $rend) = sort {$a<=>$b} ($x[2], $x[3]); # should already be sorted, but just in case.
$GENE_TO_SPAN_INFO{$gene_symbol} = { chr => $chr,
lend => $lend,
rend => $rend };
$GENE_TO_SPAN_INFO{$gene_id} = { chr => $chr,
lend => $lend,
rend => $rend };
}
close $fh;
return;
}
1; #EOM