-
Notifications
You must be signed in to change notification settings - Fork 1
/
protocol-paper-obtain-data.sh
62 lines (49 loc) · 3.42 KB
/
protocol-paper-obtain-data.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
#!/usr/bin/env bash
# Download genome and annotation
mkdir ./genome
cd ./genome
wget ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_mouse/release_M23/gencode.vM23.basic.annotation.gtf.gz
wget ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_mouse/release_M23/GRCm38.primary_assembly.genome.fa.gz
find . -type f -name '*.gz' -exec bash -c 'gzip -d {}' \;
# index the genome and annotation with STAR aligner
STAR --runMode genomeGenerate --genomeDir . --genomeFastaFiles ./GRCm38.primary_assembly.genome.fa --sjdbGTFfile ./gencode.vM23.basic.annotation.gtf --runThreadN 32
cd ..
# Obtain raw data
SRR='SRR315616 SRR315617 SRR315618 SRR315619 SRR315601 SRR315602 SRR315612 SRR315613 SRR315614 SRR315615 SRR315604 SRR315605 SRR315606 SRR315607 SRR315608 SRR315609 SRR315610 SRR315611';
prefetch $SRR;
mkdir raw;
for SRR_FILE in $SRR; do fastq-dump -O ./raw --gzip $SRR_FILE --split-files; done;
mkdir fastq;
cat ./raw/SRR315616_1.fastq.gz ./raw/SRR315617_1.fastq.gz ./raw/SRR315618_1.fastq.gz ./raw/SRR315619_1.fastq.gz > ./fastq/ES_noHR_noCH_ribo.fastq.gz;
cat ./raw/SRR315601_1.fastq.gz ./raw/SRR315602_1.fastq.gz > ./fastq/ES_noHR_60sCH_ribo.fastq.gz;
cat ./raw/SRR315612_1.fastq.gz ./raw/SRR315613_1.fastq.gz ./raw/SRR315614_1.fastq.gz ./raw/SRR315615_1.fastq.gz > ./fastq/ES_90sHR_60sCH_ribo.fastq.gz;
cat ./raw/SRR315604_1.fastq.gz ./raw/SRR315605_1.fastq.gz ./raw/SRR315606_1.fastq.gz > ./fastq/ES_120sHR_60sCH_ribo.fastq.gz;
cat ./raw/SRR315607_1.fastq.gz ./raw/SRR315608_1.fastq.gz ./raw/SRR315609_1.fastq.gz > ./fastq/ES_150sHR_60sCH_ribo.fastq.gz;
cat ./raw/SRR315610_1.fastq.gz ./raw/SRR315611_1.fastq.gz > ./fastq/ES_180sHR_60sCH_ribo.fastq.gz;
# Trimming
mkdir ./trim
find ./fastq/*.fastq.gz -maxdepth 1 -type f -exec bash -c \
'bn=$(basename {}); cutadapt -a CTGTAGGCACCATCAAT -j 32 --minimum-length 20 -q 20 --trimmed-only {} -o trim/$bn' \; \
> cutadapt.log
# Alignment
# !!!!! ATTENTION !!!!!
# Here one should make a transcriptomic alignment, NOT a genomic alignment
# !!!!! !!!!!
#
# NOTE! Genome-level multi-mapping reads are excluded at the alignment step.
# It might be problematic to correctly filter out multi-mapping reads later from the transcriptomic
# alignment as there are often many overlapping transcripts of a single gene
mkdir align
find ./fastq/*.fastq.gz -maxdepth 1 -type f -exec bash -c \
's=$(basename {});bn=${s%.fastq.gz}; mkdir align/$bn' \;
find ./trim/*.fastq.gz -maxdepth 1 -type f -exec bash -c \
's=$(basename {});bn=${s%.fastq.gz}; STAR --genomeDir ./genome/ --readFilesCommand zcat --quantMode TranscriptomeSAM --readFilesIn {} --outFileNamePrefix ./align/$bn/ --alignSJDBoverhangMin 1 --outFilterMultimapNmax 1 --runThreadN 32 --outSAMtype BAM Unsorted' \;
# Sort and index the read alignment.
# !NOTE! The earlier version of the protocol suggested filtering multi-mapping reads with samtools.
# We no longer recommend such filtering as it will exclude the reads mapped to alternative
# overlapping transcripts of a single gene and significantly reduce the effective read coverage.
# Instead, when considering only uniquely-mapped reads, we suggest to perform read mapping allowing
# only single mapping locations, see STAR parameter `--outFilterMultimapNmax 1` above.
find ./trim/*.fastq.gz -maxdepth 1 -type f -exec bash -c \
's=$(basename {});bn=${s%.fastq.gz}; samtools sort ./align/$bn/Aligned.toTranscriptome.out.bam > ./align/$bn.bam ' \;
find ./align/*.bam -maxdepth 1 -type f -exec samtools index {} \;