-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathdropseq_aln_v2.lib
226 lines (190 loc) · 5.81 KB
/
dropseq_aln_v2.lib
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
# dropseq_aln.lib
STD_MSG="Useful function for Drop-seq data"
fastq2sam()
{
declare local mem=$1
declare local r1=$2
declare local r2=$3
declare local sampleID=$4
declare local outDir=$5
declare local outBAM="${outDir}/${sampleID}_unl_read_pairs.bam"
ionice -c 3 java -Xmx${mem} -jar ${picard_tool} FastqToSam F1=${read1} F2=${read2} SM=${sampleID} SO=queryname O=${outBAM} QUIET=true
if [ ! -e ${outBAM} ]
then
echo "Failed fastq2sam function"
exit 1
fi
}
sam2fastq()
{
declare local mem=$1
declare local inDir=$2
declare local sampleID=$3
declare local outFASTQ="${inDir}/${sampleID}_filtered_and_trimmed_R2.fastq.gz"
ionice -c 3 java -Xmx${xmx} -jar ${picard_tool} SamToFastq \
I="${inDir}/${sampleID}_unl_read_pairs_filtered_adapter_polyA_trimmed.bam" \
F="${inDir}/${sampleID}_filtered_and_trimmed_R2.fastq.gz" \
QUIET=true
if [ ! -e ${outFASTQ} ]
then
echo "Failed sam2fastq function"
exit 1
fi
}
tagLowQualityReads()
{
declare local codetype=$1
declare local range=$2
declare local qual=$3
declare local mem=$4
declare local inDir=$5
declare local statDir=$6
declare local sampleID=$7
declare local outBAM=/dev/null
declare local statFile=/dev/null
if [ "${codetype}" = 'bc' ]
then
echo "BC mode ..."
outBAM="${inDir}/${sampleID}_unl_read_pairs_tagged_cells.bam"
statFile="${statDir}/${sampleID}_unl_read_pairs_stats_BC.txt"
discard='False'
inBAM="${inDir}/${sampleID}_unl_read_pairs.bam"
tgName='XC'
else
echo "UMI mode ..."
outBAM="${inDir}/${sampleID}_unl_read_pairs_tagged_cells_umi.bam"
statFile="${statDir}/${sampleID}_unl_read_pairs_stats_UMI.txt"
discard='True'
inBAM="${inDir}/${sampleID}_unl_read_pairs_tagged_cells.bam"
tgName='XM'
fi
ionice -c 3 java -Xmx${mem} -jar ${dropseq_tool} TagBamWithReadSequenceExtended \
I=${inBAM} \
SUMMARY=${statFile} \
BASE_RANGE=${range} \
BASE_QUALITY=${qual} \
NUM_BASES_BELOW_QUALITY=1 \
BARCODED_READ=1 \
DISCARD_READ=${discard} \
TAG_NAME=${tgName} \
O=${outBAM} \
QUIET=true
if [ ! -e ${outBAM} ]
then
echo "Failed tagLowQualityReads function"
exit 1
fi
}
filterBAM()
{
declare local mem=$1
declare local inDir=$2
declare local sampleID=$3
declare local outBAM="${inDir}/${sampleID}_unl_read_pairs_filtered.bam"
ionice -c 3 java -Xmx${mem} -jar ${dropseq_tool} FilterBam \
I="${inDir}/${sampleID}_unl_read_pairs_tagged_cells_umi.bam" \
TAG_REJECT=XQ \
O=${outBAM} \
QUIET=true
if [ ! -e ${outBAM} ]
then
echo "Failed filterBAM function"
exit 1
fi
}
trimReads()
{
declare local codetype=$1
declare local mem=$2
declare local inDir=$3
declare local statDir=$4
declare local sampleID=$5
declare local outBAM=/dev/null
if [ "${codetype}" = 'adapter' ]
then
echo "Timming 5' adapter..."
outBAM="${inDir}/${sampleID}_unl_read_pairs_filtered_adapter_trimmed.bam"
ionice -c 3 java -Xmx${mem} -jar ${dropseq_tool} TrimStartingSequence \
I="${inDir}/${sampleID}_unl_read_pairs_filtered.bam" \
OUTPUT_SUMMARY="${statDir}/${sampleID}_adapter_trimming.txt" \
SEQUENCE="AAGCAGTGGTATCAACGCAGAGTGAATGGG" \
MISMATCHES=0 \
NUM_BASES=5 \
O=${outBAM} \
QUIET=true
fi
if [ "${codetype}" = 'polyA' ]
then
echo "Timming 3' polyA..."
outBAM="${inDir}/${sampleID}_unl_read_pairs_filtered_adapter_polyA_trimmed.bam"
ionice -c 3 java -Xmx${mem} -jar ${dropseq_tool} PolyATrimmer \
I="${inDir}/${sampleID}_unl_read_pairs_filtered_adapter_trimmed.bam" \
OUTPUT_SUMMARY="${statDir}/${sampleID}_polyA_trimming.txt" \
MISMATCHES=0 \
NUM_BASES=6 \
O=${outBAM} \
USE_NEW_TRIMMER=true \
QUIET=true
fi
if [ ! -e ${outBAM} ]
then
echo "Failed trimAdapter function"
exit 1
fi
}
intersectFASTQ()
{
# to avoid memory problem is better to split reads names in smaller chunks of nlines
declare local r1=$1
declare local nlines=$2
declare local mem=$3
declare local inDir=$4
declare local sampleID=$5
declare local currentDir=$( 'pwd' )
declare local outFASTQ="${inDir}/${sampleID}_R1.fastq.gz"
declare local tmpdir="${inDir}/tmp_split_fastq"
if [ -e "${outFASTQ}" ]
then
ionice -c 3 rm ${outFASTQ}
ionice -c 3 rm -r ${tmpdir}
fi
mkdir -p ${tmpdir}
# exstract all reads names
cd ${tmpdir}
echo "Step 1: reads names exstraction ..."
ionice -c 3 zcat "${inDir}/${sampleID}_filtered_and_trimmed_R2.fastq.gz" | awk -F "\t" '{ printf("%s",$0); n++; if(n%4==0) { printf("\n");} else { printf("\t");} }' | awk -F "\t" '{print substr($1,2)}' | ionice -c 3 split -l ${nlines} -d - "reads_names_"
# get the names of splitted files
shopt -s nullglob
nm=(${tmpdir}/reads_names_*)
shopt -u nullglob
# Use BBmap tool to subset reads
echo "Step 2: Use BBmap for subsetting ..."
declare -i c=0
for i in ${nm[@]}
do
echo "Processing file ${i}"
ionice -c 3 ${bbmap}filterbyname.sh -Xmx${xmx} app=t in=${r1} out=${outFASTQ} include=t prefix=f names=${i}
c=$(expr $c+1)
done
cd ${currentDir}
if [ ! -e ${outFASTQ} ]
then
echo "Failed intersectFASTQ function"
exit 1
else
# sort by name as reads 2
echo "Step 3: Sort the generated fastq by names ..."
ionice -c 3 ${bbmap}sortbyname.sh -Xmx${xmx} in=${outFASTQ} out="${inDir}/${sampleID}_filtered_and_trimmed_R1.fastq.gz"
ionice -c 3 rm ${outFASTQ}
ionice -c 3 rm -r ${tmpdir}
fi
}
groupBC()
{
declare local r1=$1
declare local sampleID=$2
declare local outDir=$3
declare local outFile="${outDir}/${sampleID}_BC_counts.gz"
mkdir -p ${outDir}
ionice -c 3 zcat ${r1} | awk -F "\t" '{ printf("%s",$0); n++; if(n%4==0) { printf("\n");} else { printf("\t");} }' | awk -F "\t" '{print substr($2,1,12)}' | sort | uniq -c | sort -r -k 1 | gzip -c > ${outFile}
}