forked from tanlabcode/scATAC-pro
-
Notifications
You must be signed in to change notification settings - Fork 0
/
scATAC-pro
executable file
·325 lines (275 loc) · 17.3 KB
/
scATAC-pro
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
#!/bin/bash
## scATAC-pro
#########################
## usage ####
#########################
SOFT="scATAC-pro"
VERSION="1.5.0"
function usage {
echo -e "usage : $SOFT -s STEP -i INPUT -c CONFIG [-o] [-h] [-v] [-b]"
echo -e "Use option -b|--verbose to print running message on screen"
echo -e "Use option -h|--help for more information"
}
function help {
usage;
echo
echo "$SOFT $VERSION"
echo "---------------"
echo "OPTIONS"
echo
echo " [-s|--step ANALYSIS_STEP]: run an analysis module (or an arbitrary combination of several modules) of the scATAC-pro workflow, supported modules include"
echo " demplx_fastq: perform demultiplexing
input: Either fastq files for both reads and index, separated by comma or folder for 10x
fastq files like:
PE1_fastq,PE2_fastq,index1_fastq,inde2_fastq,index3_fastq... or
folder_path4_10xfastq
output: Demultiplexed fastq1 and fastq2 files with index information embedded
in the read name as: @index3_index2_index1:original_read_name, saved in output/demplxed _fastq/ "
echo " trimming: trim read adapter
input: demultiplexed fastq1 and fastq2 files
output: trimmed demultiplexed fastq1 and fastq2 files, saved in output/trimmed_fastq/"
echo " mapping: perform reads alignment
input: fastq files, separated by comma for each paired end
output: position sorted bam file saved in output/mapping_result/, mapping qc stat and fragment.txt saved in output/summary/"
echo " call_peak: call peaks using aggregated data
input: BAM file, outputted from the mapping module
output: peaks in bed file format, saved in output/peaks/PEAK_CALLER/OUTPUT_PREFIX_features_Black list_Removed.bed"
echo " get_mtx: build raw peak-by-cell matrix
input: fragment.txt file, outputted from the mapping module, and features/peaks file path,
outputted from the call_peak module, separated by a comma
output: raw sparse peak-by-cell matrix in Matrix Market format, barcodes and feature files, saved output/raw_matrix/PEAK_CALLER/"
echo " aggr_signal: generate aggregated signal, which can be uploaded to and viewed
in genome browser
input: BAM file, outputted from the mapping module
output: Aggregated data in .bw and .bedgraph format, saved in output/signal/"
echo " qc_per_barcode: generate quality control metrics for each barcode
input: fragment.txt file, outputted from the mapping module, and feature/peak file, outputt ed from the call_peak module, separated by comma
output: qc_per_barcode.txt file, saved in output/summary/"
echo " call_cell: perform cell calling
input: raw peak-by-barcode matrix file path, outputted from the get_mtx module
output: filtered peak-by-cell matrix, saved in output/filtered_matrix/PEAK_CALLER/CELL_CALLER/"
echo " get_bam4Cells: extract bam file for cell barcodes and calculate mapping stats
input: A bam file for aggregated data outputted from the mapping module and a barcodes.txt
file outputted from module call_cell, separated by comma
output: A bam file saved in output/mapping_results and mapping stats (optional) saved
in output/summary for cell barcodes "
echo " process: processing data - including demplx_fastq, mapping, call_peak, get_mtx,
aggr_signal, qc_per_barcode, call_cell and get_bam4Cells
input: either fastq files for both reads and index, separated by comma or file folder for 10x
fastq files per sample like:
fastq1,fastq2,index_fastq1,index_fastq2, index_fastq3..., or
folder4_10x_fastq
output: peak-by-cell matrix and all intermediate results "
echo " process_no_dex: processing data without demultiplexing
input: demultiplexed fastq files for both reads, separated by a comma like:
fastq1,fastq2;
output: peak-by-cell and all intermediate results "
echo " process_from_align: processing data from the alignment step (including alignment)
input: demultiplexed and trimmed fastq files for both reads, separated by a comma like:
fastq1,fastq2;
output: peak-by-cell and all intermediate results "
echo " process_with_bam: processing from bam file
input: bam file for aggregated data, outputted from the mapping module
output: peak-by-cell matrix and all intermediate results "
echo " clustering: cell clustering
input: filtered peak-by-cell matrix file
output: seurat objects with clustering label in the metadata (.rds file) and
file of barcodes with cluster labels (cell_cluster_table.txt file), saved in output/down stream_analysis/PEAK_CALLER/CELL_CALLER/"
echo " motif_analysis: preform motif analysis
input: filtered peak-by-cell matrix file, outputted from call_cell module, or the seurat_obj.rds file outputted
from clustering module
output: TF-by-cell enrichment chromVAR object, a table and a heatmap indicating TF enrichment
for each cell clusters"
echo " runDA: perform differential accessibility analysis
input: path_to_seurat_object with two groups of clusters to compare, could be like:
seurat_obj.rds,0:1,2 (will compare cells in cluster 0 or cluster 1 with cells in cluster2
for the given seurat object) or
seurat_obj.rds,0,rest (will compare cells in cluster 0 with the rest of cells) or
seurat_obj.rds,one,rest (will compare cells in any one of the clusters with the rest of the cells)
output: differential accessibility feutures in plain text format saved in
output/dstream_analysis/PEAK_CALLER/CELL_CALLER/"
echo " runGO: perform GO term enrichment analysis
input: differential accessible features file, outputted from runDA module
output: enriched GO terms in .xlsx file saved in the same directory as the input file"
echo " runCicero: run cicero for calculating gene activity score and predicting cis chromatin interactions
input: seurat_obj.rds file, outputted from the clustering module
output: cicero gene activity in .rds format and predicted interactions in .txt format, saved in
output/dstream_analysis/PEAK_CALLER/CELL_CALLER/"
echo " split_bam: split bam file into different clusters
input: barcodes with cluster label (cell_cluster_table.txt file, outputed from clustering)
output: bam file (saved in output/downstream/CELL_CALLER/data_by_cluster), .bw, .bedgr aph (saved in output/signal/) file for each cluster"
echo " footprint: perform footprinting analysis, supports comparison between two clusters and one cluster vs
the rest of clusters (one-vs-rest)
input: 0,1 ## or '0,rest' (means cluster1 vs rest) or 'one,rest' (all one-vs-rest)
output: footprint summary statistics in table and heatmap
(saved in output/downstream/PEAK_CALLER/CELL_CALLER/)"
echo " downstream: perform all downstream analyses, including clustering, motif_analysis,
split_bam (optional) and footprinting analysis (optional)
input: filtered peak-by-cell matrix file, outputted from the call_cell module
output: all outputs from each module"
echo " report: generate summary report in html file
input: directory to outputted QC files, output/summary as default
output: summary report in html format, saved in output/summary, and .eps figures in ouput/summary/
Figures/"
echo " convert10xbam: convert bam file in 10x genomics format to bam file in scATAC-pro format
input: bam file (position sorted) in 10x genomics format
output: position sorted bam file in scATAC-pro format in output/mapping_result, mapping qc stat
and fragment.txt file in output/summary/"
echo " mergePeaks: merge peaks (called from different data sets) if the distance is
less than a given size in basepairs (200 for instance),filtering peaks by qvalue
input: peak files and a distance parameter separated by comma:
peakFile1,peakFile2,...,peakFileN,200,0.01
output: merged peaks saved in file output/peaks/merged.bed"
echo " reConstMtx: reconstruct peak-by-cell matrix given peak file, fragments.txt file, barcodes.txt and
an optional path for reconstructed matrix
input: different files separated by comma:
peakFilePath,fragmentFilePath,barcodesPath,reconstructMatrixPath
output: reconstructed peak-by-cell matrix saved in reconstructMatrixPath,
if reconstructMatrixPath not specified, a sub-folder reConstruct_matrix will be created
under the same path as the input barcodes.txt file"
echo " integrate: perform integration of two ore more data sets
input: peak/feature files and a optional distance parameter, separated by comma: peak_file1,peak_file2,...,peak_fileN,200,0.01
output: merged peaks, reconstructed matrix, integrated seurat obj and umap plot, saved in
output/integrated/"
echo " integrate_mtx: perform integration of two ore more matrices given the reconstructed peak-by-cell matrix
input: mtx1,mtx2, separated by comma like, mtx1_filepath,mtx2_filepath
output: integrated seurat obj and umap plot, saved in output/integrated"
echo " visualize: interactively visualize the data using VisCello
input: VisCello_obj directory, outputted from the clustering module
output: launch VisCello through web browser window for interactively visualization"
echo " rmDoublets: remove potential doublets
input: a peak-by-cell matrix file or a seurat object file in .rds format
output: doublets removed matrix.rds and barcodes.txt file and seurat objects w/ and w/o doublets saved in the input directory (and a umap colored by singlet/doubet)"
echo " addCB2bam: add cell barcode tag to bam file
input: a bam file generated by scATAC-pro
output: the bam file with column 'CB:Z:cellbarcode' added (saved in the same directory as
the input bam file)"
echo " label transfer (cell annotation) from scRNA-seq data
input: paths for a seurat object for scATAC-seq, a seurat object for scRNA-seq data in .rds format, and an optional .gtf file for gene annotation, separated by a comma.
output: a updated seurat object for atac with the Predicted_Cell_Type as a metadata variable and
an umap plot colored by Predicted_Cell_Type, saved in the same directory as the input atac
seurat object.
*Note*: the cell annotation should be given as a metadata of the seurat object of
scRNA-seq. Both seurat objects should have pca and umap dimemsion reduction
done."
echo " reprocess_cellreanger_output: re-process cellranger results
input: cellranger_bam_file,cellranger_fragments.tsv.gz
output: all outputs of the data processing "
echo " -i|--input INPUT : input data, different types of input data are required for different analysis;"
echo " -c|--conf CONFIG : configuration file for parameters (if exist) for each analysis module"
echo " [-o|--output_dir : folder to save results, default 'output/' under current directory. sub-folder will be created automatically for each module"
echo " [-h|--help]: print help information on screen"
echo " [-v|--version]: display current version number of scATAC-pro on screen"
echo " [-b|--verbose]: print the running message on screen"
exit;
}
function version {
echo -e "$SOFT version $VERSION"
exit
}
vb=0
function verbose {
vb=1
}
function opts_error {
echo -e "Error : invalid parameters !" >&2
echo -e "Use $SOFT -h for help"
exit
}
#####################
## Set PATHS and defaults
#####################
SOFT_PATH=`dirname $0`
ABS_SOFT_PATH=`cd "$SOFT_PATH"; pwd`
SCRIPTS_PATH="$ABS_SOFT_PATH/scripts"
CUR_PATH=$PWD
CLUSTER=0
MAKE_OPTS=""
STEP=""
INPUT=""
OUTPUT_DIR="output"
CONF="configure_user.txt"
#####################
## Inputs
#####################
if [ $# -lt 1 ]
then
usage
exit
fi
# Transform long options to short ones
for arg in "$@"; do
shift
case "$arg" in
"--step") set -- "$@" "-s" ;;
"--input") set -- "$@" "-i" ;;
"--conf") set -- "$@" "-c" ;;
"--output_dir") set -- "$@" "-o" ;;
"--help") set -- "$@" "-h" ;;
"--version") set -- "$@" "-v" ;;
"--verbose") set -- "$@" "-b" ;;
*) set -- "$@" "$arg"
esac
done
while getopts ":s:i:c:o:vbh" OPT
do
case $OPT in
s) STEP=$OPTARG;;
i) INPUT=$OPTARG;;
c) CONFIG=$OPTARG;;
o) OUTPUT_DIR=$OPTARG ;;
v) version ;;
b) verbose ;;
h) help ;;
\?)
echo "Invalid option: -$OPTARG" >&2
usage
exit 1
;;
:)
echo "Option -$OPTARG requires an argument." >&2
usage
exit 1
;;
esac
done
if [[ -z $INPUT || -z $CONFIG ]]; then
usage
exit
fi
################################ check valid STEPs #####
############################
AVAILABLE_STEP_ARRAY=("demplx_fastq" "trimming" "mapping" "mapping_qc" "aggr_signal" "call_peak" "recall_peak" "get_mtx" "qc_per_barcode" "call_cell" "get_bam4Cells" "clustering" "motif_analysis" "runDA" "runGO" "runCicero" "split_bam" "footprint" "report" "process" "process_no_dex" "process_with_bam" "integrate" "downstream" "all" "integrate_seu" "integrate_mtx" "convert10xbam" "mergePeaks" "reConstMtx" "visualize" "addCB2bam" "rmDoublets" "labelTransfer" "reprocess_cellranger_output" "process_from_align")
check_s=0
for i in ${AVAILABLE_STEP_ARRAY[@]}; do
if [[ "$i" = "$STEP" ]]; then check_s=1; fi
done
if [[ $check_s = 0 ]]; then
echo "Unknown step name '$STEP' found. Use $0 --help for usage information." >&2
exit 1
fi
############################
## make output_dir
############################
echo "Run scATAC-pro "${VERSION}
###################################################
##Run scATAC-pro
###################################################
declare -x OUTPUT_DIR
declare -x logDir
#if [ "$STEP" = "integrate" ]; then
# OUTPUT_DIR=${OUTPUT_DIR}/integrated
#fi
mkdir -p $OUTPUT_DIR
logDir=${OUTPUT_DIR}/logs
mkdir -p $logDir
###################################################
##Run scATAC-pro
###################################################
#make --file ${SCRIPTS_PATH}/Makefile INPUT_FILE=$INPUT OUTPUT_DIR=$OUTPUT_DIR $STEP 2>&1
config_sys=${ABS_SOFT_PATH}/configure_system.txt
if [ "$vb" = '0' ]; then
make --file ${SCRIPTS_PATH}/Makefile INPUT_FILE=$INPUT CONFIG_FILE=$CONFIG CONFIG_SYS=$config_sys $STEP > ${logDir}/$STEP.log 2>&1
else
make --file ${SCRIPTS_PATH}/Makefile INPUT_FILE=$INPUT CONFIG_FILE=$CONFIG CONFIG_SYS=$config_sys $STEP 2>&1 | tee ${logDir}/$STEP.log
fi