scATAC-pro

#!/bin/bash

## scATAC-pro           


#########################
## usage ####
#########################                                                                   

SOFT="scATAC-pro"
VERSION="1.5.0"

function usage {
    echo -e "usage : $SOFT -s STEP -i INPUT -c CONFIG [-o] [-h] [-v] [-b]"
    echo -e "Use option -b|--verbose to print running message on screen"
    echo -e "Use option -h|--help for more information"
}

function help {
    usage;
    echo 
    echo "$SOFT $VERSION"
    echo "---------------"
    echo "OPTIONS"
    echo
    echo "   [-s|--step ANALYSIS_STEP]: run an analysis module (or an arbitrary combination of several modules) of the scATAC-pro workflow, supported modules include" 
    echo "      demplx_fastq: perform demultiplexing
                           input: Either fastq files for both reads and index, separated by comma or folder for 10x 
                                  fastq files like:
                                  PE1_fastq,PE2_fastq,index1_fastq,inde2_fastq,index3_fastq... or
                                  folder_path4_10xfastq
                           output: Demultiplexed fastq1 and fastq2 files with index information embedded 
                                   in the read name as:  @index3_index2_index1:original_read_name, saved in output/demplxed                                   _fastq/ "
    echo "      trimming: trim read adapter
                           input: demultiplexed fastq1 and fastq2 files
                           output: trimmed demultiplexed fastq1 and fastq2 files, saved in output/trimmed_fastq/"
    echo "      mapping: perform reads alignment
                         input: fastq files, separated by comma for each paired end
                         output: position sorted bam file saved in output/mapping_result/, mapping qc stat and fragment.txt                                 saved in output/summary/"
    echo "      call_peak: call peaks using aggregated data
                           input: BAM file, outputted from the mapping module
                           output: peaks in bed file format, saved in output/peaks/PEAK_CALLER/OUTPUT_PREFIX_features_Black                                   list_Removed.bed"
    echo "      get_mtx: build raw peak-by-cell matrix
                         input: fragment.txt file, outputted from the mapping module, and features/peaks file path, 
                                outputted from the call_peak module, separated by a comma
                         output: raw sparse peak-by-cell matrix in Matrix Market format, barcodes and feature files, saved                                  output/raw_matrix/PEAK_CALLER/"
    echo "      aggr_signal: generate aggregated signal, which can be uploaded to and viewed
                             in genome browser
                             input: BAM file, outputted from the mapping module
                             output: Aggregated data in .bw and .bedgraph format, saved in output/signal/"
    echo "      qc_per_barcode: generate quality control metrics for each barcode
                                input: fragment.txt file, outputted from the mapping module, and feature/peak file, outputt                                       ed from the call_peak module,  separated by comma
                                output: qc_per_barcode.txt file, saved in output/summary/"
    echo "      call_cell: perform cell calling
                           input: raw peak-by-barcode matrix file path, outputted from the get_mtx module
                           output: filtered peak-by-cell matrix, saved in output/filtered_matrix/PEAK_CALLER/CELL_CALLER/"
    echo "      get_bam4Cells: extract bam file for cell barcodes and calculate mapping stats
                               input: A bam file for aggregated data outputted from the mapping module and a barcodes.txt 
                                     file outputted from module call_cell, separated by comma
                               output: A bam file saved in output/mapping_results and mapping stats (optional) saved 
                                         in output/summary for cell barcodes "
    echo "      process: processing data - including demplx_fastq, mapping, call_peak, get_mtx,
                            aggr_signal, qc_per_barcode, call_cell and get_bam4Cells
                            input: either fastq files for both reads and index, separated by comma or file folder for 10x
                                   fastq files per sample like:
                                   fastq1,fastq2,index_fastq1,index_fastq2, index_fastq3..., or 
                                   folder4_10x_fastq
                            output: peak-by-cell matrix and all intermediate results "
    echo "      process_no_dex: processing data without demultiplexing
                            input: demultiplexed fastq files for both reads, separated by a comma like:
                                   fastq1,fastq2; 
                            output: peak-by-cell and all intermediate results "
    echo "      process_from_align: processing data from the alignment step (including alignment)
                            input: demultiplexed and trimmed fastq files for both reads, separated by a comma like:
                                   fastq1,fastq2; 
                            output: peak-by-cell and all intermediate results "
    echo "      process_with_bam: processing from bam file
                            input: bam file for aggregated data, outputted from the mapping module 
                            output: peak-by-cell matrix and all intermediate results "
    echo "      clustering: cell clustering
                           input: filtered peak-by-cell matrix file
                           output: seurat objects with clustering label in the metadata (.rds file) and 
                                   file of barcodes with cluster labels (cell_cluster_table.txt file), saved in output/down                                   stream_analysis/PEAK_CALLER/CELL_CALLER/"
    echo "      motif_analysis: preform motif analysis
                           input: filtered peak-by-cell matrix file, outputted from call_cell module, or the seurat_obj.rds file outputted
                                  from clustering module
                           output: TF-by-cell enrichment chromVAR object, a table and a heatmap indicating TF enrichment 
                                   for each cell clusters"
    echo "      runDA: perform differential accessibility analysis
                           input: path_to_seurat_object with two groups of clusters to compare, could be like:
                                  seurat_obj.rds,0:1,2 (will compare cells in cluster 0 or cluster 1 with cells in cluster2
                                  for the given seurat object) or
                                  seurat_obj.rds,0,rest (will compare cells in cluster 0 with the rest of cells) or
                                  seurat_obj.rds,one,rest (will compare cells in any one of the clusters with the rest of the cells) 
                                 
                           output: differential accessibility feutures in plain text format saved in 
                                   output/dstream_analysis/PEAK_CALLER/CELL_CALLER/"
    echo "      runGO: perform GO term enrichment analysis
                           input: differential accessible features file, outputted from runDA module
                           output: enriched GO terms in .xlsx file saved in the same directory as the input file"
    echo "      runCicero: run cicero for calculating gene activity score and predicting cis chromatin interactions
                           input: seurat_obj.rds file, outputted from the clustering module
                           output: cicero gene activity in .rds format and predicted interactions in .txt format, saved in
                                   output/dstream_analysis/PEAK_CALLER/CELL_CALLER/"
    echo "      split_bam: split bam file into different clusters
                           input: barcodes with cluster label (cell_cluster_table.txt file, outputed from clustering)
                           output: bam file (saved in output/downstream/CELL_CALLER/data_by_cluster), .bw, .bedgr                                                            aph (saved in output/signal/) file for each cluster"
    echo "      footprint: perform footprinting analysis, supports comparison between two clusters and one cluster vs 
                           the rest of clusters (one-vs-rest)
                           input: 0,1  ## or '0,rest' (means cluster1 vs rest) or 'one,rest' (all one-vs-rest)
                           output: footprint summary statistics in table and heatmap 
                                       (saved in output/downstream/PEAK_CALLER/CELL_CALLER/)"
    echo "      downstream: perform all downstream analyses, including clustering, motif_analysis, 
                            split_bam (optional) and footprinting analysis (optional)
                            input: filtered peak-by-cell matrix file, outputted from the call_cell module
                            output: all outputs from each module"
    echo "      report: generate summary report in html file
                        input: directory to outputted QC files, output/summary as default
                        output: summary report in html format, saved in output/summary, and .eps figures in ouput/summary/
                                Figures/"
    echo "      convert10xbam: convert bam file in 10x genomics format to bam file in scATAC-pro format 
                         input: bam file (position sorted) in 10x genomics format
                         output: position sorted bam file in scATAC-pro format in output/mapping_result, mapping qc stat
                                 and fragment.txt file in output/summary/"
    echo "      mergePeaks: merge peaks (called from different data sets) if the distance is
                            less than a given size in basepairs (200 for instance),filtering peaks by qvalue 
                         input: peak files and a distance parameter separated by comma:
                                peakFile1,peakFile2,...,peakFileN,200,0.01
                         output: merged peaks saved in file output/peaks/merged.bed"
    echo "      reConstMtx: reconstruct peak-by-cell matrix given peak file, fragments.txt file, barcodes.txt and
                            an optional path for reconstructed matrix
                         input: different files separated by comma:
                                peakFilePath,fragmentFilePath,barcodesPath,reconstructMatrixPath
                         output: reconstructed peak-by-cell matrix saved in reconstructMatrixPath,
                                 if reconstructMatrixPath not specified, a sub-folder reConstruct_matrix will be created
                                 under the same path as the input barcodes.txt file" 
    echo "      integrate: perform integration of two ore more data sets
                           input: peak/feature files and a optional distance parameter, separated by comma: peak_file1,peak_file2,...,peak_fileN,200,0.01
                           output: merged peaks, reconstructed matrix, integrated seurat obj and umap plot, saved in 
                                   output/integrated/"
    echo "      integrate_mtx: perform integration of two ore more matrices given the reconstructed peak-by-cell matrix
                           input: mtx1,mtx2, separated by comma like, mtx1_filepath,mtx2_filepath
                           output: integrated seurat obj and umap plot, saved in output/integrated"
    echo "      visualize: interactively visualize the data using VisCello
                         input: VisCello_obj directory, outputted from the clustering module
                         output: launch VisCello through web browser window for interactively visualization"
    echo "      rmDoublets: remove potential doublets
                         input: a peak-by-cell matrix file or a seurat object file in .rds format
                         output: doublets removed matrix.rds and barcodes.txt file and seurat objects w/ and w/o doublets saved in the input directory (and a umap colored by singlet/doubet)"

    echo "      addCB2bam: add cell barcode tag to bam file
                         input: a bam file generated by scATAC-pro
                         output: the bam file with column 'CB:Z:cellbarcode' added (saved in the same directory as
                                 the input bam file)"

   echo "       label transfer (cell annotation) from scRNA-seq data
                         input: paths for a seurat object for scATAC-seq, a seurat object for scRNA-seq data in .rds format,                                   and an optional .gtf file for gene annotation, separated by a comma.
                         output: a updated seurat object for atac with the Predicted_Cell_Type as a metadata variable and
                                 an umap plot colored by Predicted_Cell_Type, saved in the same directory as the input atac
                                 seurat object.
                         *Note*: the cell annotation should be given as a metadata of the seurat object of
                                 scRNA-seq. Both seurat objects should have pca and umap dimemsion reduction
                                 done."

    echo "      reprocess_cellreanger_output: re-process cellranger results 
                            input: cellranger_bam_file,cellranger_fragments.tsv.gz
                            output: all outputs of the data processing "
    echo "   -i|--input INPUT : input data, different types of input data are required for different analysis;"
    echo "   -c|--conf CONFIG : configuration file for parameters (if exist) for each analysis module"
    echo "   [-o|--output_dir : folder to save results, default 'output/' under current directory. sub-folder will be created automatically for each module"
    echo "   [-h|--help]: print help information on screen"
    echo "   [-v|--version]: display current version number of scATAC-pro on screen"
    echo "   [-b|--verbose]: print the running message on screen"
    exit;
}

function version {
    echo -e "$SOFT version $VERSION"
    exit
}
vb=0
function verbose {
 vb=1
}


function opts_error {
    echo -e "Error : invalid parameters !" >&2
    echo -e "Use $SOFT -h for help"
    exit
}

#####################
## Set PATHS and defaults
#####################

SOFT_PATH=`dirname $0`
ABS_SOFT_PATH=`cd "$SOFT_PATH"; pwd`
SCRIPTS_PATH="$ABS_SOFT_PATH/scripts"
CUR_PATH=$PWD

CLUSTER=0
MAKE_OPTS=""
STEP=""
INPUT=""
OUTPUT_DIR="output"
CONF="configure_user.txt"


#####################
## Inputs
#####################
if [ $# -lt 1 ]
then
    usage
    exit
fi

# Transform long options to short ones
for arg in "$@"; do
  shift
  case "$arg" in
      "--step")   set -- "$@" "-s" ;;
      "--input") set -- "$@" "-i" ;;
      "--conf")   set -- "$@" "-c" ;;
      "--output_dir")   set -- "$@" "-o" ;;
      "--help")   set -- "$@" "-h" ;;
      "--version")   set -- "$@" "-v" ;;
      "--verbose")   set -- "$@" "-b" ;;
      *)        set -- "$@" "$arg"
  esac
done

while getopts ":s:i:c:o:vbh" OPT
do
    case $OPT in
	s) STEP=$OPTARG;;
	i) INPUT=$OPTARG;;
	c) CONFIG=$OPTARG;;
	o) OUTPUT_DIR=$OPTARG ;;
	v) version ;;
	b) verbose ;;
	h) help ;;
	\?)
	     echo "Invalid option: -$OPTARG" >&2
	     usage
	     exit 1
	     ;;
	 :)
	     echo "Option -$OPTARG requires an argument." >&2
	     usage
	     exit 1
	     ;;
    esac
done


if [[ -z $INPUT || -z $CONFIG ]]; then
    usage
    exit
fi


################################ check valid STEPs #####
############################
AVAILABLE_STEP_ARRAY=("demplx_fastq" "trimming" "mapping" "mapping_qc" "aggr_signal" "call_peak" "recall_peak" "get_mtx" "qc_per_barcode" "call_cell" "get_bam4Cells" "clustering" "motif_analysis" "runDA" "runGO" "runCicero" "split_bam" "footprint" "report" "process" "process_no_dex" "process_with_bam" "integrate" "downstream" "all" "integrate_seu" "integrate_mtx" "convert10xbam" "mergePeaks" "reConstMtx" "visualize" "addCB2bam" "rmDoublets" "labelTransfer" "reprocess_cellranger_output" "process_from_align")


check_s=0
for i in ${AVAILABLE_STEP_ARRAY[@]}; do
	    if [[ "$i" = "$STEP" ]]; then check_s=1; fi
done


if [[ $check_s = 0 ]]; then 
    echo "Unknown step name '$STEP' found. Use $0 --help for usage information." >&2
    exit 1
fi


############################
## make output_dir
############################


echo "Run scATAC-pro "${VERSION}


###################################################
##Run scATAC-pro
###################################################


declare -x OUTPUT_DIR
declare -x logDir

#if [ "$STEP" = "integrate" ]; then
#    OUTPUT_DIR=${OUTPUT_DIR}/integrated
#fi

mkdir -p $OUTPUT_DIR
logDir=${OUTPUT_DIR}/logs
mkdir -p $logDir

###################################################
##Run scATAC-pro
###################################################

#make --file ${SCRIPTS_PATH}/Makefile INPUT_FILE=$INPUT OUTPUT_DIR=$OUTPUT_DIR $STEP 2>&1
config_sys=${ABS_SOFT_PATH}/configure_system.txt

if [ "$vb" = '0' ]; then
    make --file ${SCRIPTS_PATH}/Makefile INPUT_FILE=$INPUT CONFIG_FILE=$CONFIG CONFIG_SYS=$config_sys $STEP > ${logDir}/$STEP.log 2>&1
else
    make --file ${SCRIPTS_PATH}/Makefile INPUT_FILE=$INPUT CONFIG_FILE=$CONFIG CONFIG_SYS=$config_sys $STEP 2>&1 | tee ${logDir}/$STEP.log
fi