-
Notifications
You must be signed in to change notification settings - Fork 1
/
oligosParse.m
119 lines (99 loc) · 3.5 KB
/
oligosParse.m
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
function oligos = oligosParse(params)
% params = struct('species','Mouse','verbose',1,...
% 'number',48,'seqNum',1000,'thres',30,'querySize',30,...
% 'blastArgs','-S 2','parallel', 0,...
% 'specialTranscripts','C:\FISHerMan\Db\Mouse.STList.fas');
if params(1).verbose
disp('reading the result file from OligoArray');
end
oligos = [params(1).species '.tempoligos.txt'];
if exist(oligos, 'file')
fid = fopen(oligos,'r');
fmt = '%s %f %f %f %f %f %f %s %s %s %*[^\n]';
temp = textscan(fid,fmt,'CollectOutput',true,'delimiter','\t','TreatAsEmpty','NA');
fclose(fid);
else
warning('missing important files from OligoArray');
return;
end
%% Remove non-specific oligos
if params(1).verbose
disp('removing non-specific oligos');
end
geneNames = temp{1,1};
nonspecificHits = temp{1,3}(:,1);
specificHits = temp{1,3}(:,2);
index = [];
for n = 1:length(geneNames)
if params(1).verbose && mod(n, 1000) == 1
disp([' analyzing oligo entry no. ' num2str(n)]);
end
% all Header here are two-segment Header
pos = regexp(geneNames{n,1}, ':');
if ~isempty(pos)
geneName=geneNames{n,1}(pos(1)+1:end);
end
if length(regexp(nonspecificHits{n,1}, geneName)) < ...
length(regexp(nonspecificHits{n,1}, ':'))
index = [index;n];
end
end
index = unique(index,'stable');
geneNames(index) = [];
nonspecificHits(index) = [];
specificHits(index) = [];
%% Convert oligos into their complementary sequences
% OligoArray generates same sequences as transcriptome sequences, not
% their complementary
for n = 1:length(specificHits)
specificHits{n,1} = seqrcomplement(specificHits{n,1});
end
%% Blast oligos against abundant rna database and remove non-specific oligos
[geneNames,specificHits,nonspecificHits]...
=blastAbundantRNASimple(geneNames,specificHits,nonspecificHits,params);
%% Remove transcripts without enough oligos
if params(1).verbose
disp(['there are ' num2str(length(unique(geneNames))) ' transcripts']);
disp('removing transcripts without enough oligos');
end
pos = regexp(geneNames, ':');
trimNames = {};
for n = 1:length(geneNames)
trimNames{end+1,1} = geneNames{n,1}(1:pos{n,1}(1)-1);
end
uniqueNames = unique(trimNames, 'stable');
indexTotal = zeros(length(trimNames),1);
for n = 1:length(uniqueNames)
index = ismember(trimNames, uniqueNames{n,1});
if sum(index) < params(1).number &&...
checkSpecialTranscripts(uniqueNames{n,1},params) % check for Bin's special sequences
indexTotal = indexTotal+index;
if params(1).verbose
disp([' transcript ' uniqueNames{n,1} ...
' has less than ' num2str(params(1).number) ' probes']);
end
end
end
indexTotal = logical(indexTotal);
geneNames(indexTotal) = [];
nonspecificHits(indexTotal) = [];
specificHits(indexTotal) = [];
%% Rearrange the list of oligos by grouping same genes together
if params(1).verbose
disp('rearranging the oligo fasta file');
end
[geneNames,specificHits,nonspecificHits]...
=rearrangeOligos(geneNames,specificHits,nonspecificHits,params);
%% Save the list of oligos into files
if params(1).verbose
disp('saving the oligo fasta file');
end
% delete(oligos);
oligos = [params(1).species '.oligos.txt'];
if exist(oligos, 'file')
delete(oligos);
end
fastawrite(oligos,geneNames,specificHits);
if params(1).verbose
disp('done parsing OligoArray results');
end