-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathgenbank2refseq.sh
46 lines (39 loc) · 1.63 KB
/
genbank2refseq.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
#!/bin/bash
#Scripts requires the following tools:
# Pigz
# wget
# Internet! :)
echo "This tools will rename chromsomes for ARS-UI_Ramb_v2.0 (genBank > RefSeq)"
echo "The link to assembly_report.txt is hard coded. Change it manually"
echo "Input the common suffix of your BED6 input files e.g. dmr.gz followed by [ENTER]:"
read JOB
#Downloading manifest from NCBI
echo "$(date) ===== Start..."
wget -q https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/016/772/045/GCF_016772045.1_ARS-UI_Ramb_v2.0/GCF_016772045.1_ARS-UI_Ramb_v2.0_assembly_report.txt
echo "$(date) ===== Assembly report downloaded"
#Creating dictionary for Genbank\tRefSeq names
grep -v '#' GCF_016772045.1_ARS-UI_Ramb_v2.0_assembly_report.txt | cut -f5,7 | sed 's/na/CM028731.1/' | head -n -1 > dict
echo "$(date) ===== Processing input BED6 format file...";sleep 2
#Taking in files
files_array=(*$JOB)
for f in "${files_array[@]}";do
name=$(basename -s .dmr.gz $f)
echo "$(date) ===== $name"
#Reading every line in the dmr file as an array
zcat $f | while read -ra LINE;do
#First element of the array genbank chr
chrom=${LINE[0]}
#Finding the refseq match in the dictionary
match=$(grep "$chrom" dict | cut -f2)
#Reconstructing BED6 format for each line
printf '%s\t%s\t%s\t%s\t%s\t%s\n' "${match} ${LINE[@]:1}" >> ${name}.bed
done;
#Fix for later found issue of extra tabs isntead of '%s\t%s\n'
sed -i -e 's/ /\t/' -e 's/\t$//' ${name}.bed
echo "$(date) ===== Finished chr renaming for $name"
echo "$(date) Gz compression."
pigz ${name}.bed
done;
#GC
rm -f GCF_016772045.1_ARS-UI_Ramb_v2.0_assembly_report.txt dict
echo "$(date) ===== Finished with no errors."