-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathMakefile
228 lines (159 loc) · 7.61 KB
/
Makefile
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
######## COMMON FLAGS FOR ALL MAKE FILES ##########
# Operating system name reported by `uname`; used to tell the home machine
# apart from Tak.
UNAME := $(shell uname)
# Let make run recipes in parallel (-j given without a job limit).
MAKEFLAGS = -j
# Job submission prefix:
#   -K      bsub does not return until the submitted job has finished
#   -o, -e  files that receive the job's stdout and stderr
BSUB = bsub -K -q bartel \
       -o /lab/bartel1_ata/koppstein/bjob_output \
       -e /lab/bartel1_ata/koppstein/bjob_stderr
######## USEFUL FUNCTIONS #######################
# Rule template: make $(1) a symlink to $(2) -- useful for "pulling into the
# directory" from locations on the filesystem.
#   $(1)  path of the symlink to create (the rule's target)
#   $(2)  existing file to link to (the rule's prerequisite)
# The $$ escapes are there so that $@ / $< survive the $(call ...) expansion;
# the template is meant to be instantiated with $(eval $(call link-file,...)).
#
# Recipe steps:
#   1. create the directory that will hold the link;
#   2. drop a dangling symlink left at the target, otherwise the rule would
#      "succeed" forever without ever producing a usable file;
#   3. create the link unless one is already there.  The link points at the
#      absolute path of the prerequisite so that it also resolves when the
#      target lives in a different directory than the one make runs in.
define link-file
$(1): $(2)
	mkdir -p $$(dir $$@)
	if [ -L "$$@" ] && [ ! -e "$$@" ]; then rm -f "$$@"; fi
	if [ ! -L "$$@" ]; then ln -s "$$(abspath $$<)" "$$@"; fi
endef
# Pull one sequence out of the influenza transcripts fasta file.
#   $(1)  value handed to the extractor's -d option
# Runs $(EXTRACT_FASTA) through $(shell ...) with -n $(EXTRACT_FASTA_NUCS) and
# -i $(INFLUENZA_TRANSCRIPTS); the call expands to the command's output.
extract-influenza-fasta = $(shell $(EXTRACT_FASTA) -d $(1) -n $(EXTRACT_FASTA_NUCS) -i $(INFLUENZA_TRANSCRIPTS))
# Files made by bowtie-build for the index named $(1): one path under
# $(BOWTIE_DIR) for every suffix listed in $(BOWTIE_BUILD_BASE).
define bowtie-target-pattern
$(addprefix $(BOWTIE_DIR)/$(1),$(BOWTIE_BUILD_BASE))
endef
####### COMMON DIRECTORY PATHS ##########
# Absolute location on the lab filesystem
SOLEXA_BARTEL := /lab/solexa_public/Bartel
# Working directories, relative to the directory make runs in
ANNO_DIR := anno
BOWTIE_DIR := $(ANNO_DIR)/bowtie_indices
DATA_DIR := data
GRAPH_DIR := graphs
INT_DIR := intermediate
# Files collecting the stdout / stderr of submitted jobs
BJOB_OUTPUT = /lab/bartel1_ata/koppstein/bjob_output
BJOB_STDERR = /lab/bartel1_ata/koppstein/bjob_stderr
# Don't delete intermediate files
.SECONDARY:

# Without this, the goal of a bare `make` is the first rule in the file, which
# is the destructive clean_to_collapsed.  Print the usage text instead.
.DEFAULT_GOAL := help

# You can use these sub-commands to generate intermediates.
# All of them are commands rather than files, so they are declared phony.
# This includes help and remove_almost_empty, which are defined in this file.
.PHONY: help \
        get_data clean_data \
        collapsed clean_collapsed clean_to_collapsed \
        vfiltered clean_vfiltered \
        trimmed cleantrimmed trimmed_summary \
        qfiltered cleanqfiltered \
        par clean_par par_5R_nogs clean_par_graphs \
        distributions cleandistributions \
        cage_hmm test_command \
        shuffle clean_shuffle \
        rnapet_targets clean_rnapet_targets \
        remove_empty remove_almost_empty \
        five_prime_targets clean_five_prime_targets \
        influenza_rnaseq clean_influenza_rnaseq \
        bedtools_graph_targets clean_bedtools_graph_targets \
        tables clean_tables \
        mapped_only_tables clean_mapped_only_tables \
        paper clean_paper
# Empty out $(INT_DIR) while sparing the collapsed fastq files
# (*_collapsed.fastq.gz), which take a long time to regenerate.
# The second find removes symlinks that no longer resolve: with -L, only
# broken links are still reported as type l.
clean_to_collapsed:
	cd $(INT_DIR) && \
		find . -type f ! -name "*_collapsed.fastq.gz" -delete && \
		find -L . -type l -delete && \
		cd ..
# Remove empty regular files sitting directly in the intermediate, graph and
# annotation directories (make usually deletes these itself; unfortunately,
# calls to bsub complicate things, so the empty files sometimes have to be
# deleted by hand).
# Each directory is handled in its own subshell, so the loop always continues
# from the starting directory, even when a directory is missing or find fails
# part-way through.
remove_empty:
	for DIRECTORY in $(INT_DIR) $(GRAPH_DIR) $(ANNO_DIR); do \
		( cd "$$DIRECTORY" && find . -maxdepth 1 -type f -empty -delete ); \
	done
# Remove *.gz files matching `-size -2` (fewer than two of find's default
# 512-byte blocks) anywhere below the listed directories.
# Each directory is visited in its own subshell, so the loop does not depend
# on `cd ..` leading back to the starting directory.  That would not hold if
# find failed, or if a directory were given as a multi-component or absolute
# path.
remove_almost_empty:
	for DIRECTORY in $(INT_DIR) $(GRAPH_DIR) $(ANNO_DIR) $(PELCHAT_DIR); do \
		( cd "$$DIRECTORY" && find . -name "*.gz" -size -2 -delete ); \
	done
# Print a short usage summary.  The target names shown are spelled the way
# they are declared in .PHONY (get_data, clean_data).
help:
	@echo "Please use \`make <target>' where <target> is one of"
	@echo " get_data to get data from GEO, link files on TAK, and unzip FASTQ"
	@echo " clean_data to remove the data directory"
	@echo " collapsed to collapse the initial reads"
	@echo " clean_collapsed to get rid of the collapsed files "
	@echo " vfiltered"
	@echo " "
	@echo " All of the intermediates can be preceded by clean_"
	@echo " to remove those intermediate files specifically. "
###### SYMLINK THE SCRIPTS ########
include pipeline/scripts.mk
# Interpreters, chosen by operating system.
# On Linux, use the my2.7 virtualenv (has pandas 0.13.1 installed).
ifeq ($(UNAME),Linux)
MY_PYTHON := /lab/bartel1_ata/koppstein/virtualenvs/my2.7/bin/python
MY_IPYTHON := /lab/bartel1_ata/koppstein/virtualenvs/my2.7/bin/ipython
endif
# On Darwin, use whatever python / ipython is found on PATH.
ifeq ($(UNAME),Darwin)
MY_PYTHON := python
MY_IPYTHON := ipython
endif
# weblogo from the same virtualenv, set regardless of operating system
MY_WEBLOGO := /lab/bartel1_ata/koppstein/virtualenvs/my2.7/bin/weblogo
####### CONSTANTS ########
# Suffixes of the index files written by the bowtie-build executable
BOWTIE_BUILD_BASE = $(addsuffix .ebwt,.1 .2 .3 .4 .rev.1 .rev.2)
####### SEQUENCING DATA BASE NAMES #######
# Second-generation time course datasets
GEN2_TIME_COURSE_BNS := $(addprefix NS1_,30 45 60 90 120 240)
# Second-generation template-switching datasets (strand-specific)
GEN2_TS_BNS := $(addsuffix _TS,HA MP NA NP PA PB1)
# 5'-RACE-like datasets
GEN1_5R_BNS := $(addsuffix _5R,PB2 NS1)
# First-generation template-switching datasets (paired end 200x200)
GEN1_TS_BNS := $(addsuffix _TS,PB2 NS1)
# 5'-RACE-like datasets that get their guanosines trimmed in parallel, for a
# fair comparison with the template-switching datasets
GEN1_5R_NOGS_BNS := $(addsuffix _RACE_NOGS,PB2 NS1)
# Every template-switching dataset
TS_BNS := $(GEN2_TIME_COURSE_BNS) $(GEN2_TS_BNS) $(GEN1_TS_BNS)

INFLUENZA_GENES := HA MP NA NP NS1 PA PB1 PB2

# Datasets per gene; TARGET_BNS looks these up as $(<gene>_BNS)
HA_BNS  := HA_TS
MP_BNS  := MP_TS
NA_BNS  := NA_TS
NP_BNS  := NP_TS
NS1_BNS := NS1_TS $(GEN2_TIME_COURSE_BNS) NS1_5R NS1_RACE_NOGS
PA_BNS  := PA_TS
PB1_BNS := PB1_TS
PB2_BNS := PB2_TS PB2_5R PB2_RACE_NOGS

# Datasets grouped under the two sequence labels below
# (NOTE(review): presumably the conserved viral end sequence -- confirm)
GCAAAAGCAG_BNS := $(NS1_BNS) $(HA_BNS) $(NP_BNS) $(NA_BNS) \
                  $(addsuffix _5GTRIMMED,NS1_TS HA_TS NP_TS NA_TS) \
                  $(addsuffix _5GTRIMMED,$(GEN2_TIME_COURSE_BNS)) \
                  $(foreach rep,1 2,$(foreach gene,HA MP NA NP NS1 PA,PELCHAT_$(gene)_rep_$(rep))) \
                  DUMMY_NS1
GCGAAAGCAG_BNS := $(PB2_BNS) $(PB1_BNS) $(PA_BNS) $(MP_BNS) \
                  $(addsuffix _5GTRIMMED,PB2_TS PB1_TS PA_TS MP_TS) \
                  $(foreach rep,1 2,$(addsuffix _rep_$(rep),PELCHAT_PB1 PELCHAT_PB2))

# One PELCHAT dataset per gene and replicate (gene-major order)
PELCHAT_BNS := $(foreach g,$(INFLUENZA_GENES),\
                 $(foreach r,1 2,\
                   PELCHAT_$(g)_rep_$(r)))
# Hand-picked subset: exactly one replicate per gene
PELCHAT_BNS_REAL := PELCHAT_HA_rep_2 PELCHAT_MP_rep_2 \
                    PELCHAT_NA_rep_1 PELCHAT_NP_rep_1 \
                    PELCHAT_NS1_rep_2 PELCHAT_PA_rep_1 \
                    PELCHAT_PB1_rep_2 PELCHAT_PB2_rep_1

RNASEQ_BNS = $(addprefix RNASEQ_,MOCK 30 45 60 90 120 240 ZERO)

# All real datasets: the per-gene lists, concatenated in INFLUENZA_GENES order
TARGET_BNS = $(foreach g,$(INFLUENZA_GENES),$($(g)_BNS))
# Make the paper
paper: paper.docx

# Remove everything the paper build (current or former pythontex-based one)
# leaves behind, including the merged bibliography.
clean_paper:
	rm -rf paper/*.aux paper/*.depytx paper/*.log paper/*.pdf paper/*.pytxcode \
		paper/pythontex-files-paper paper/temp.tex paper/temp_nohead.tex \
		paper/figure/all_length_distributions.eps \
		paper/final_papers.bib paper.docx

# Earlier pythontex-based build steps, kept for reference only:
#   $(MY_IPYTHON) nbconvert $< --to latex --SphinxTransformer.author='David Koppstein'
#   cd paper && pdflatex $(notdir $<) && $(MY_PYTHON) ../$(PYTHONTEX) $(notdir $<) && pdflatex $(notdir $<) && $(MY_PYTHON) ../$(DEPYTHONTEX) $(notdir $<) -o temp.tex && tail -n +2 temp.tex > temp_nohead.tex && cd ..
#
# Convert paper/paper.tex to paper.docx with pandoc.
# The two .bib files are real inputs (they are merged into final_papers.bib),
# so they are listed as prerequisites and a bibliography edit triggers a
# rebuild.  The CSL style is listed only when it exists under paper/.
# pandoc runs inside paper/ so that the bibliography and CSL names resolve
# there; the result is then moved to the top level.  A copy is also dropped
# into the author's dklib directory.
paper.docx: paper/paper.tex paper/papers.bib paper/extras.bib $(wildcard paper/nar_revised.csl)
	cat paper/papers.bib paper/extras.bib > paper/final_papers.bib
	cd paper && pandoc --bibliography=final_papers.bib --csl=nar_revised.csl -t docx -o paper.docx paper.tex
	mv paper/paper.docx $@
	cp -f $@ ~/science/software/dklib/influenza/compiled_paper.docx
###### GET THE DATA #########
include pipeline/get_data.mk
###### COLLAPSE FASTQ FILES ########
include pipeline/collapse_fastq.mk
###### KEEP ONLY VIRAL SEQUENCES ########
include pipeline/vfiltered.mk
###### READ QUALITY FILTERING #########
include pipeline/qfiltered.mk
###### TRIM THE VIRAL SEQUENCES #######
# for now, don't trim; we want to see
# include pipeline/trim.mk
# FURTHER COLLAPSE PRIME-AND-REALIGNED SEQUENCES
include pipeline/par.mk
# GRAPH EVERYTHING
# include pipeline/graphs.mk
# FIND ALL TRANSCRIPTION START SITES
include pipeline/find_starts.mk
# DO CAGE DATA
include pipeline/cage.mk
# MAP READS TO PET DATA
include pipeline/pet.mk
# MAP READS TO TRANSCRIPTION START SITES
include pipeline/five_prime_library.mk
# MAP RNA-SEQ DATA TO THE GENOME
include pipeline/rnaseq.mk
# MAP INFLUENZA DIRECTLY TO THE GENOME
include pipeline/map.mk
# MAKE FIGURES
include pipeline/figure.mk
# GROSEQ ANALYSIS
include pipeline/groseq.mk
# PELCHAT ANALYSIS
include pipeline/pelchat.mk