-
Notifications
You must be signed in to change notification settings - Fork 20
/
earlGrey
executable file
·589 lines (527 loc) · 25.2 KB
/
earlGrey
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
#!/bin/bash
usage()
{
echo " #############################
earlGrey version 5.0.0
Required Parameters:
-g == genome.fasta
-s == species name
-o == output directory
Optional Parameters:
-t == Number of Threads (DO NOT specify more than are available)
-r == RepeatMasker search term (e.g arthropoda/eukarya)
-l == Starting consensus library for an inital mask (in fasta format)
-i == Number of Iterations to BLAST, Extract, Extend (Default: 10)
-f == Number flanking basepairs to extract (Default: 1000)
-c == Cluster TE library to reduce redundancy? (yes/no, Default: no)
-m == Remove putative spurious TE annotations <100bp? (yes/no, Default: no)
-d == Create soft-masked genome at the end? (yes/no, Default: no)
-n == Max number of sequences used to generate consensus sequences (Default: 20)
-a == minimum number of sequences required to build a consensus sequence (Default: 3)
-e == Run HELIANO as an optional step to detect Helitrons (yes/no, Default: no)
-h == Show help
Example Usage:
earlGrey -g bombyxMori.fasta -s bombyxMori -o /home/toby/bombyxMori/repeatAnnotation/ -t 16
Queries can be sent to:
tobias.baril[at]unine.ch
Please make use of the GitHub Issues and Discussion Tabs at: https://github.com/TobyBaril/EarlGrey
#############################"
}
# Subprocess Make Directories #
makeDirectory()
{
directory=$(realpath ${directory})
mkdir -p ${directory}/${species}_EarlGrey/ && OUTDIR=${directory}/${species}_EarlGrey
if [ ! -z "$RepSpec" ] || [ ! -z "$startCust" ] ; then
mkdir -p $OUTDIR/${species}_RepeatMasker/
fi
mkdir -p $OUTDIR/${species}_Database/
mkdir -p $OUTDIR/${species}_RepeatModeler/
mkdir -p $OUTDIR/${species}_strainer/
mkdir -p $OUTDIR/${species}_Curated_Library/
mkdir -p $OUTDIR/${species}_RepeatMasker_Against_Custom_Library/
mkdir -p $OUTDIR/${species}_RepeatLandscape/
mkdir -p $OUTDIR/${species}_mergedRepeats/
mkdir -p ${OUTDIR}/${species}_summaryFiles/
if [ ! -z "$heli" ]; then
mkdir -p ${OUTDIR}/${species}_heliano/
fi
}
# Subprocess PrepGenome #
prepGenome()
{
if [ ! -L ${genome} ]; then
genome=$(realpath ${genome})
fi
if [ -L $genome ]; then
genome=$(realpath -s ${genome})
fi
if [ ! -f ${genome}.prep ] || [ ! -f ${genome}.dict ]; then
cp ${genome} ${genome}.bak && gzip -f ${genome}.bak
sed '/>/ s/ .*//g; /^$/d' ${genome} > ${genome}.tmp
${SCRIPT_DIR}/headSwap.sh -i ${genome}.tmp -o ${genome}.prep && rm ${genome}.tmp
mv ${genome}.tmp.dict ${genome}.dict
sed -i '/^>/! s/[DVHBPE]/N/g' ${genome}.prep
dict=${genome}.dict
genOrig=${genome}
genome=${genome}.prep
else
dict=${genome}.dict
genOrig=${genome}
genome=${genome}.prep
fi
}
# Subprocess getRepeatMaskerFasta
# Generate a copy of the RepeatMasker library subset used for the initial TE mask in FASTA format (only used if a RepeatMasker run is specified)
getRepeatMaskerFasta()
{
if [[ $RepSpec = *" "* ]]; then
echo "ERROR: You have entered a species name that contains a space, please use the NCBI TaxID rather than name. E.G In place of \"Homo sapiens\" use \"9606\""
exit 2
fi
if [[ $(which RepeatMasker) == *"bin"* ]]; then
libpath="$(which RepeatMasker | sed 's|bin.*|share/RepeatMasker/Libraries/RepeatMaskerLib.h5|g')"
else
libpath="$(which RepeatMasker | sed 's|/[^/]*$||g')/Libraries/RepeatMaskerLib.h5"
fi
if [[ $(which RepeatMasker) == *"bin"* ]]; then
PATH=$PATH:"$(which RepeatMasker | sed 's|bin.*|share/RepeatMasker/|g')"
fi
famdb.py -i $libpath families -f fasta_name --include-class-in-name -a -d $RepSpec > ${OUTDIR}/${species}_Curated_Library/${RepSpec}.RepeatMasker.lib
RepSub=${OUTDIR}/${species}_Curated_Library/${RepSpec}.RepeatMasker.lib
}
# Subprocess firstMask
# Run the initial RepeatMasker run with the specified species subset (only used if a RepeatMasker run is specified)
firstMask()
{
cd ${OUTDIR}/${species}_RepeatMasker
rmthreads=$(expr $ProcNum / 4)
RepeatMasker -species $RepSpec -norna -lcambig -s -a -pa $rmthreads -dir $OUTDIR/${species}_RepeatMasker $genome
if [ ! -f ${OUTDIR}/${species}_RepeatMasker/*.masked ]; then
echo "ERROR: RepeatMasker failed, please check logs. This is likely because of an invalid species search term, if issue persists please use NCBI Taxids (E.G Drosophila is replaced with 7125)"; exit 2
fi
}
# Subprocess firstMaskCustomLib
# Run the initial RepeatMasker run with the specific fasta consensus library
firstMaskCustomLib()
{
cd ${OUTDIR}/${species}_RepeatMasker
rmthreads=$(expr $ProcNum / 4)
RepeatMasker -lib ${startCust} -norna -lcambig -s -a -pa $rmthreads -dir $OUTDIR/${species}_RepeatMasker $genome
RepSub=${startCust}
if [ ! -f ${OUTDIR}/${species}_RepeatMasker/*.masked ]; then
echo "ERROR: RepeatMasker failed, please check logs. This is likely because of an invalid custom consensus file, check the fasta file looks as expected"; exit 2
fi
}
# Subprocess buildDB
# if masked genome exists, build database from this. If not, build database from original input genome
buildDB()
{
if [ -f ${OUTDIR}/${species}_RepeatMasker/*.masked ]; then
cd ${OUTDIR}/${species}_Database
BuildDatabase -name ${species} -engine ncbi ${OUTDIR}/${species}_RepeatMasker/*.masked
else
cd ${OUTDIR}/${species}_Database
BuildDatabase -name ${species} -engine ncbi $genome
fi
}
# Subprocess deNovo1
# Run the initial de novo annotation run with RepeatModeler2, with error catches for weird genome sizes
deNovo1()
{
cd ${OUTDIR}/${species}_RepeatModeler
RepeatModeler -engine ncbi -threads ${ProcNum} -database ${OUTDIR}/${species}_Database/${species}
if [ ! -e ${OUTDIR}/${species}_Database/${species}-families.fa ]; then
echo "ERROR: RepeatModeler Failed, Retrying with limit set as Round 5"
RepeatModeler -engine ncbi -threads ${ProcNum} -database ${OUTDIR}/${species}_Database/${species} -genomeSampleSizeMax 81000000
if [ ! -e ${OUTDIR}/${species}_Database/${species}-families.fa ]; then
echo "ERROR: RepeatModeler Failed, Retrying with limit set as Round 4"
RepeatModeler -engine ncbi -threads ${ProcNum} -database ${OUTDIR}/${species}_Database/${species} -genomeSampleSizeMax 27000000
if [ ! -e ${OUTDIR}/${species}_Database/${species}-families.fa ]; then
echo "ERROR: RepeatModeler Failed"
exit 2
fi
fi
fi
}
# Subprocess strainer
# contains the BLAST, Extract, Extend, Trim pipeline from James Galbraith
strainer()
{
cd ${OUTDIR}/${species}_strainer/
${SCRIPT_DIR}/TEstrainer/TEstrainer_for_earlGrey.sh -g $genome -l ${OUTDIR}/${species}_Database/${species}-families.fa -t ${ProcNum} -f $Flank -r $num -n $no_seq -m $min_seq
latestFile="$(realpath $(ls -td -- ${OUTDIR}/${species}_strainer/*/ | head -n 1))/${species}-families.fa.strained"
cp $latestFile ${OUTDIR}/${species}_strainer/
}
# Subprocess clust
# optional wicker clustering of TE library consensus sequences to reduce redundancy
clust()
{
cd ${OUTDIR}/${species}_strainer/
cd-hit-est -d 0 -aS 0.8 -c 0.8 -G 0 -g 1 -b 500 -r 1 -i $latestFile -o ${latestFile}.clstrd.fa
latestFile=$latestFile.clstrd.fa
}
# Subprocess novoMask
# Run RepeatMasker with the final processed library
novoMask()
{
if [ -z "$RepSpec" ] && [ -z "$startCust" ]; then
cd ${OUTDIR}/${species}_RepeatMasker_Against_Custom_Library/
rmthreads=$(expr $ProcNum / 4)
RepeatMasker -lib $latestFile -norna -lcambig -s -a -pa $rmthreads -dir ${OUTDIR}/${species}_RepeatMasker_Against_Custom_Library/ $genome
else
cd ${OUTDIR}/${species}_Curated_Library/
cat $latestFile $RepSub > ${species}_combined_library.fasta
cd ${OUTDIR}/${species}_RepeatMasker_Against_Custom_Library/
rmthreads=$(expr $ProcNum / 4)
RepeatMasker -lib ${OUTDIR}/${species}_Curated_Library/${species}_combined_library.fasta -norna -lcambig -s -a -pa $rmthreads -dir ${OUTDIR}/${species}_RepeatMasker_Against_Custom_Library/ $genome
fi
}
# Subprocess heliano
# Run HELIANO as an optional step, then replace overlapping repeats in the merged output with HELIANO outputs
### TODO:
heliano_optional()
{
cd ${OUTDIR}/${species}_heliano/
heliano -g $genome --nearest -dn 6000 -flank_sim 0.5 -o ${OUTDIR}/${species}_heliano/HEL_${timestamp} -w 10000 -n $ProcNum
awk '{OFS="\t"}{print $1, "HELIANO", "RC/Helitron", $2+1, $3, $5, $6, ".", "ID="$9"_"$11";shortTE=F"}' ${OUTDIR}/${species}_heliano/HEL_${timestamp}/RC.representative.bed > ${OUTDIR}/${species}_heliano/HEL_${timestamp}/RC.representative.gff
helitron_gff=${OUTDIR}/${species}_heliano/HEL_${timestamp}/RC.representative.gff
}
# Subprocess rcMergeRepeats
# Defragment repeat sequences to adjust for insertion times
mergeRep()
{
mkdir ${OUTDIR}/${species}_mergedRepeats/looseMerge
if [ -s "$helitron_gff" ]; then
echo "Running loose merge with HELIANO output"
${SCRIPT_DIR}/rcMergeRepeatsLoose -f $genome -s $species -d ${OUTDIR}/${species}_mergedRepeats/looseMerge -u ${OUTDIR}/${species}_RepeatMasker_Against_Custom_Library/$(basename $genome).out -q ${OUTDIR}/${species}_RepeatMasker_Against_Custom_Library/$(basename $genome).tbl -t $ProcNum -b ${dict} -m $margin -e $helitron_gff
else
${SCRIPT_DIR}/rcMergeRepeatsLoose -f $genome -s $species -d ${OUTDIR}/${species}_mergedRepeats/looseMerge -u ${OUTDIR}/${species}_RepeatMasker_Against_Custom_Library/$(basename $genome).out -q ${OUTDIR}/${species}_RepeatMasker_Against_Custom_Library/$(basename $genome).tbl -t $ProcNum -b ${dict} -m $margin
fi
if [ -f "${OUTDIR}/${species}_mergedRepeats/looseMerge/${species}.filteredRepeats.bed" ]; then
awk '{OFS="\t"}{print $1, $2, $3, $4, $5, $6, $7, $8, toupper($9)}' ${OUTDIR}/${species}_mergedRepeats/looseMerge/${species}.filteredRepeats.gff > ${OUTDIR}/${species}_mergedRepeats/looseMerge/${species}.filteredRepeats.gff.1 && mv ${OUTDIR}/${species}_mergedRepeats/looseMerge/${species}.filteredRepeats.gff{.1,}
fi
if [ ! -f "${OUTDIR}/${species}_mergedRepeats/looseMerge/${species}.filteredRepeats.bed" ]; then
echo "ERROR: loose merge defragmentation failed, trying strict merge..."
cd ${OUTDIR}/${species}_mergedRepeats/
if [ -s "$helitron_gff" ]; then
${SCRIPT_DIR}/rcMergeRepeats -f $genome -s $species -d ${OUTDIR}/${species}_mergedRepeats/ -u ${OUTDIR}/${species}_RepeatMasker_Against_Custom_Library/$(basename $genome).out -q ${OUTDIR}/${species}_RepeatMasker_Against_Custom_Library/$(basename $genome).tbl -t $ProcNum -b ${dict} -m $margin -e $helitron_gff
else
${SCRIPT_DIR}/rcMergeRepeats -f $genome -s $species -d ${OUTDIR}/${species}_mergedRepeats/ -u ${OUTDIR}/${species}_RepeatMasker_Against_Custom_Library/$(basename $genome).out -q ${OUTDIR}/${species}_RepeatMasker_Against_Custom_Library/$(basename $genome).tbl -t $ProcNum -b ${dict} -m $margin
fi
if [ ! -f "${OUTDIR}/${species}_mergedRepeats/${species}.filteredRepeats.bed" ]; then
echo "ERROR: strict merge also failed, check ${OUTDIR}/${species}_RepeatMasker_Against_Custom_Library/$(basename $genome).out looks as expected"
exit 2
fi
fi
}
# Subprocess pieChart
# Generate a pie chart summary of TE content
charts()
{
cd ${OUTDIR}/${species}_summaryFiles/
if [ -f "${OUTDIR}/${species}_mergedRepeats/looseMerge/${species}.filteredRepeats.bed" ]; then
${SCRIPT_DIR}/autoPie.sh -i ${OUTDIR}/${species}_mergedRepeats/looseMerge/${species}.filteredRepeats.bed -t ${OUTDIR}/${species}_RepeatMasker_Against_Custom_Library/$(basename $genome).tbl -p ${OUTDIR}/${species}_summaryFiles/${species}.summaryPie.pdf -o ${OUTDIR}/${species}_summaryFiles/${species}.highLevelCount.txt
else
${SCRIPT_DIR}/autoPie.sh -i ${OUTDIR}/${species}_mergedRepeats/${species}.filteredRepeats.bed -t ${OUTDIR}/${species}_RepeatMasker_Against_Custom_Library/$(basename $genome).tbl -p ${OUTDIR}/${species}_summaryFiles/${species}.summaryPie.pdf -o ${OUTDIR}/${species}_summaryFiles/${species}.highLevelCount.txt
fi
}
# Subprocess calcDivRL
# Calculate divergence estimates
calcDivRL()
{
cd ${OUTDIR}/${species}_RepeatLandscape
if [ -z "$RepSpec" ] && [ -z "$startCust" ]; then
python ${SCRIPT_DIR}/divergenceCalc/divergence_calc.py -l $latestFile -g $genOrig -i ${OUTDIR}/${species}_mergedRepeats/looseMerge/${species}.filteredRepeats.gff -o ${OUTDIR}/${species}_RepeatLandscape/${species}.filteredRepeats.withDivergence.gff -t $ProcNum
Rscript ${SCRIPT_DIR}/divergenceCalc/divergence_plot.R -s $species -g ${OUTDIR}/${species}_RepeatLandscape/${species}.filteredRepeats.withDivergence.gff -o ${OUTDIR}/${species}_RepeatLandscape/ && \
mv ${OUTDIR}/${species}_RepeatLandscape/${species}.filteredRepeats.withDivergence.gff ${OUTDIR}/${species}_mergedRepeats/looseMerge/${species}.filteredRepeats.gff && \
rm -rf ${OUTDIR}/${species}_RepeatLandscape/tmp/
cp ${OUTDIR}/${species}_RepeatLandscape/*.pdf ${OUTDIR}/${species}_summaryFiles/
cp ${OUTDIR}/${species}_RepeatLandscape/*_summary_table.tsv ${OUTDIR}/${species}_summaryFiles/${species}_divergence_summary_table.tsv
else
python ${SCRIPT_DIR}/divergenceCalc/divergence_calc.py -l ${OUTDIR}/${species}_Curated_Library/${species}_combined_library.fasta -g $genOrig -i ${OUTDIR}/${species}_mergedRepeats/looseMerge/${species}.filteredRepeats.gff -o ${OUTDIR}/${species}_RepeatLandscape/${species}.filteredRepeats.withDivergence.gff -t $ProcNum
Rscript ${SCRIPT_DIR}/divergenceCalc/divergence_plot.R -s $species -g ${OUTDIR}/${species}_RepeatLandscape/${species}.filteredRepeats.withDivergence.gff -o ${OUTDIR}/${species}_RepeatLandscape/ && \
mv ${OUTDIR}/${species}_RepeatLandscape/${species}.filteredRepeats.withDivergence.gff ${OUTDIR}/${species}_mergedRepeats/looseMerge/${species}.filteredRepeats.gff && \
rm -rf ${OUTDIR}/${species}_RepeatLandscape/tmp/
cp ${OUTDIR}/${species}_RepeatLandscape/*.pdf ${OUTDIR}/${species}_summaryFiles/
cp ${OUTDIR}/${species}_RepeatLandscape/*_summary_table.tsv ${OUTDIR}/${species}_summaryFiles/${species}_divergence_summary_table.tsv
fi
}
# Subprocess sweepUp
# Puts required files into a summary folder
sweepUp()
{
cd ${OUTDIR}/${species}_summaryFiles/
if [ -f "${OUTDIR}/${species}_mergedRepeats/looseMerge/${species}.filteredRepeats.bed" ] && [ ! -f "${OUTDIR}/${species}_Curated_Library/${species}_combined_library.fasta" ]; then
cp ${OUTDIR}/${species}_mergedRepeats/looseMerge/${species}.filteredRepeats.bed ${OUTDIR}/${species}_mergedRepeats/looseMerge/${species}.filteredRepeats.gff $latestFile ${OUTDIR}/${species}_summaryFiles/
elif [ -f "${OUTDIR}/${species}_mergedRepeats/looseMerge/${species}.filteredRepeats.bed" ] && [ -f "${OUTDIR}/${species}_Curated_Library/${species}_combined_library.fasta" ]; then
cp ${OUTDIR}/${species}_mergedRepeats/looseMerge/${species}.filteredRepeats.bed ${OUTDIR}/${species}_mergedRepeats/looseMerge/${species}.filteredRepeats.gff ${OUTDIR}/${species}_Curated_Library/${species}_combined_library.fasta $latestFile ${OUTDIR}/${species}_summaryFiles/
elif [ ! -f "${OUTDIR}/${species}_mergedRepeats/looseMerge/${species}.filteredRepeats.bed" ] && [ ! -f "${OUTDIR}/${species}_Curated_Library/${species}_combined_library.fasta" ]; then
cp ${OUTDIR}/${species}_mergedRepeats/${species}.filteredRepeats.bed ${OUTDIR}/${species}_mergedRepeats/${species}.filteredRepeats.gff $latestFile ${OUTDIR}/${species}_summaryFiles/
elif [ ! -f "${OUTDIR}/${species}_mergedRepeats/looseMerge/${species}.filteredRepeats.bed" ] && [ -f "${OUTDIR}/${species}_Curated_Library/${species}_combined_library.fasta" ]; then
cp ${OUTDIR}/${species}_mergedRepeats/${species}.filteredRepeats.bed ${OUTDIR}/${species}_mergedRepeats/${species}.filteredRepeats.gff ${OUTDIR}/${species}_Curated_Library/${species}_combined_library.fasta $latestFile ${OUTDIR}/${species}_summaryFiles/
else
echo "Error: Cannot find required summary files, please check one of the following is present: ${OUTDIR}/${species}_mergedRepeats/looseMerge/${species}.filteredRepeats.bed and/or ${OUTDIR}/${species}_mergedRepeats/${species}.filteredRepeats.bed"
fi
}
# Subprocess runningTea
runningTea()
{
echo "
) (
( ) )
) ( (
_______)_
.-'---------|
( C|/\/\/\/\/|
'-./\/\/\/\/|
'_________'
'-------'
<<< $stage >>>"
}
# Subprocess GetTime
convertsecs()
{
h=$(bc <<< "${1}/3600")
m=$(bc <<< "(${1}%3600)/60")
s=$(bc <<< "${1}%60")
printf "%02d:%02d:%05.2f\n" $h $m $s
}
# Subprocess Checks
Checks()
{
if [ -z "$genome" ] || [ -z "$species" ] || [ -z "$directory" ] ; then
usage; exit 1
fi
if [ -z "$ProcNum" ] ; then
ProcNum=1; echo "$ProcNum Cores Will Be Used"
else
echo "$ProcNum Cores Will Be Used"
fi
if [ -z "$RepSpec" ] && [ -z "$startCust" ] ; then
echo "RepeatMasker species not specified, running Earl Grey without an initial mask with known repeats"
else
echo "Running Earl Grey with an intial mask with known repeats"
fi
if [ -z "$num" ] ; then
num=10; echo "De Novo Sequences Will Be Extended Through a Maximum of $num Iterations"
else
echo "De Novo Sequences Will Be Extended Through a Maximum of $num Iterations"
fi
if [ -z "$no_seq" ] ; then
no_seq=20; echo "$no_seq sequences will be used in BEAT consensus generation"
else
echo "$no_seq sequences will be used in BEAT consensus generation"
fi
if [ -z "$cluster" ] || [ "$cluster" == "no" ] ; then
cluster=no; echo "TE library consensus sequences will not be clustered"
elif [ "$cluster" == "yes" ]; then
cluster=yes; echo "TE library consensus sequences will be clustered, be aware of the impact this can have on subfamilies and chimeric TEs!"
else
clustering=no; echo "Clustering not specified using (yes/no). Using default parameter (no)."
fi
if [ -z "$softMask" ] || [ "$softMask" == "no" ] ; then
softMask=no; echo "Softmasked genome will not be generated"
elif [ "$softMask" == "yes" ]; then
softMask=yes; echo "Softmasked genome will be generated"
else
softMask=no; echo "Softmask not specified using (yes/no). Using default parameter (no)."
fi
if [ "$margin" == "yes" ] ; then
margin=yes; echo "Short TE sequences <100bp will be removed from annotation"
elif [ -z "$margin" ] || [ "$margin" == "no" ]; then
margin=no; echo "Short TE sequences <100bp will not be removed from annotation"
else
margin=no; echo "Margin not specified using (yes/no). Using default parameter (no)."
fi
if [ -z "$Flank" ]; then
Flank=1000; echo "Blast, Extend, Align, Trim Process Will Add 1000bp to Each End in Each Iteration"
else
echo "Blast, Extract, Extend, Trim Process Will Add $Flank to Each End in Each Iteration"
fi
if [ -z "$min_seq" ]; then
min_seq=3; echo "Blast, Extend, Align, Trim Process Will Require 3 Sequences to Generate a New Consensus Sequence"
else
echo "Blast, Extend, Align, Trim Process Will Require $min_seq Sequences to Generate a New Consensus Sequence"
fi
if [ ! -d "$SCRIPT_DIR" ]; then
echo "ERROR: Script directory variable not set, please run the configure script in the Earl Grey directory before attempting to run Earl Grey"; exit 1
fi
if [ ! -d "$SCRIPT_DIR/TEstrainer/" ]; then
echo "ERROR: teStrainer module not found, please check all modules are present and run the configure script in the Earl Grey directory before attempting to run Earl Grey"; exit 1
fi
if [ "$heli" == "yes" ] ; then
heli=yes; echo "HELITRON detection will be run using HELIANO"
elif [ -z "$heli" ] || [ "$heli" == "no" ]; then
heli=no; echo "HELITRON detection will not be run"
else
heli=no; echo "HELITRON detection not specified using (yes/no). Using default parameter (no)."
fi
# biocontainer checks
if [ -d "/usr/local/share/RepeatMasker/Libraries/" ]; then
if grep -q Placeholder /usr/local/share/RepeatMasker/Libraries/Dfam.h5 ; then
while true; do
read -p "Are you using the biocontainer installation? If yes, we can configure RepeatMasker to work for you:[YyNn]" yn
case $yn in
[Yy]* ) cd /usr/local/share/RepeatMasker/Libraries/
curl -O https://dfam.org/releases/Dfam_3.7/families/Dfam_curatedonly.h5.gz
gunzip Dfam_curatedonly.h5.gz && mv Dfam.h5 Dfam.h5.bak
mv Dfam_curatedonly.h5 Dfam.h5
cd /usr/local/share/RepeatMasker/
perl configure -rmblast_dir=/usr/local/bin -libdir=/usr/local/share/RepeatMasker/Libraries -trf_prgm=/usr/local/bin/trf -default_search_engine=rmblast
sed -i 's|${SCRIPT_DIR}/LTR_FINDER_parallel|perl ${SCRIPT_DIR}/LTR_FINDER_parallel|g' /usr/local/share/earlgrey*/scripts/rcMergeRepeatsLoose
sed -i 's|${SCRIPT_DIR}/LTR_FINDER_parallel|perl ${SCRIPT_DIR}/LTR_FINDER_parallel|g' /usr/local/share/earlgrey*/scripts/rcMergeRepeats
break;;
[Nn]* ) exit;;
* ) echo "Please answer yes or no.";;
esac
done
fi
fi
}
# Main #
while getopts g:s:o:t:f:i:r:c:l:m:d:n:a:e:h option
do
case "${option}"
in
g) genome=${OPTARG};;
s) species=${OPTARG};;
o) directory=${OPTARG};;
t) ProcNum=${OPTARG};;
f) Flank=${OPTARG};;
i) num=${OPTARG};;
r) RepSpec=${OPTARG};;
l) startCust=${OPTARG};;
c) cluster=${OPTARG};;
m) margin=${OPTARG};;
d) softMask=${OPTARG};;
n) no_seq=${OPTARG};;
a) min_seq=${OPTARG};;
e) heli=${OPTARG};;
h) usage; exit 0;;
esac
done
SECONDS=0
# Step 1 - set up the directories and make sure all modules are present
stage="Checking Parameters" && runningTea
SCRIPT_DIR=/data/toby/EarlGrey/scripts/
Checks
stage="Making Directories" && runningTea
makeDirectory
# Start Logs - only bother starting if everything is present and correct
exec > >(tee "${OUTDIR}/${species}EarlGrey.log") 2>&1
stage="Cleaning Genome" && runningTea
prepGenome
sleep 1
# Step 2 - initial annotation, either de novo, or if required an initial RepeatMasker followed by a de novo run
if [ -z "$RepSpec" ]; then
sleep 1
else
if [ ! -f ${OUTDIR}/${species}_RepeatMasker/*.masked ]; then
stage="Getting RepeatMasker Sequences for $RepSpec and Saving as Fasta" && runningTea
getRepeatMaskerFasta
stage="Running Initial Mask with Known Repeats" && runningTea
firstMask
else
stage="Genome has already been masked, skipping..." && runningTea
getRepeatMaskerFasta
fi
fi
if [ -z "$startCust" ]; then
sleep 1
else
if [ ! -f ${OUTDIR}/${species}_RepeatMasker/*.masked ]; then
stage="Running Initial Mask with Known Repeats in Custom Library" && runningTea
firstMaskCustomLib
else
stage="Genome has already been masked, skipping..." && runningTea
fi
fi
# Step 3 - make a database from the genome, and then run a de novo TE annotation using RepeatModeler2
if [ ! -f ${OUTDIR}/${species}_Database/${species}-families.fa ]; then
stage="Detecting Novel Repeats" && runningTea
buildDB
deNovo1
sleep 1
else
stage="De novo repeats have already been detected, skipping..." && runningTea
sleep 1
fi
# Step 4 - run TEstrainer, which runs a BLAST, extract, extent, trim, on the the de novo repeat library to refine consensus sequences
if [ ! -s ${OUTDIR}/${species}_strainer/${species}-families.fa.strained ]; then
stage="Straining TEs and Refining de novo Consensus Sequences" && runningTea
if [ -f ${OUTDIR}/${species}_strainer/${species}-families.fa.strained ]; then
rm -r ${OUTDIR}/${species}_strainer/${species}-families.fa.strained ${OUTDIR}/${species}_strainer/TS*
fi
strainer
if [ ! -s ${OUTDIR}/${species}_strainer/${species}-families.fa.strained ]; then
echo "WARNING: TEstrainer failed to produce a strain file, please check the log file for more information. If you have run an intial mask with known repeats, this could be due RepeatModeler2 failing to identify any new repeats. Please check if this is expected."
fi
sleep 1
else
stage="TEs already strained, skipping..." && runningTea
latestFile=${OUTDIR}/${species}_strainer/${species}-families.fa.strained
sleep 1
fi
# Step 4.5 - cluster TE library IF REQUIRED
if [ "$cluster" == "yes" ]; then
stage="Clustering TE sequences to Wicker TE Family Level (80-80-80 rule)" && runningTea
clust
sleep 1
fi
# Stage 5 - run the final RepeatMasker annotation using the refined TE consensus library
if [ ! -f ${OUTDIR}/${species}_RepeatMasker_Against_Custom_Library/*.tbl ]; then
stage="Identifying Repeats Using Species-Specific Library" && runningTea
novoMask
if [ ! -f ${OUTDIR}/${species}_RepeatMasker_Against_Custom_Library/*.tbl ]; then
echo "ERROR: RepeatMasker failed, please check logs" && exit 2
fi
sleep 1
else
stage="Final masking already complete, skipping..." && runningTea
sleep 1
fi
# Stage 5.5 - Run HELIANO as an optional step
if [ "$heli" == "yes" ]; then
if [ ! -s ${OUTDIR}/${species}_heliano/*/RC.representative.gff ]; then
stage="Running HELIANO to Detect Helitrons" && runningTea
timestamp=$(date +"%Y%m%d_%H%M")
heliano_optional
else
stage="HELITRON detection already complete, skipping..." && runningTea
helitron_gff="$(realpath $(ls -td -- ${OUTDIR}/${species}_heliano/*/ | head -n 1))/RC.representative.gff"
echo "HELITRON GFF: $helitron_gff"
fi
sleep 1
fi
# Stage 6
if [ ! -f ${OUTDIR}/${species}_mergedRepeats/looseMerge/${species}.filteredRepeats.bed ] ; then
stage="Defragmenting Repeats" && runningTea
mergeRep
sleep 1
else
stage="Repeats already defragmented, skipping..." && runningTea
sleep 1
fi
# Stage 7
stage="Generating Summary Plots" && runningTea
charts
calcDivRL
sleep 1
# Stage 8
stage="Tidying Directories and Organising Important Files" && runningTea
sweepUp
sleep 1
# Stage 8.5 Optional Softmask
if [ "$softMask" == "yes" ]; then
stage="Generating Softmasked Genome" && runningTea
gunzip ${genome%.prep}.bak.gz && bedtools maskfasta -fi ${genome%.prep}.bak -bed ${OUTDIR}/${species}_mergedRepeats/looseMerge/${species}.filteredRepeats.bed -fo ${OUTDIR}/${species}_summaryFiles/${species}.softmasked.fasta -soft && gzip ${genome%.prep}.bak
sleep 1
fi
# Stage 9
time=$(convertsecs $SECONDS)
stage="Done in $time" && runningTea
sleep 5
# Stage 10
stage="TE library, Summary Figures, and TE Quantifications in Standard Formats Can Be Found in ${OUTDIR}/${species}_summaryFiles/" && runningTea
sleep 5