upgrade to v1.2.0

wbaopaul · Oct 29, 2020 · f20a3a0 · f20a3a0
2 parents 12daf06 + 41ef496
commit f20a3a0
Show file tree

Hide file tree

Showing 12 changed files with 264 additions and 198 deletions.
diff --git a/README.md b/README.md
@@ -37,7 +37,7 @@ Installation
 ------------
 
 -   Note: It is not necessary to install scATAC-pro from scratch. You can use the docker or singularity version if you prefer (see [Run scATAC-pro through docker or singularity](#run-scATAC-pro-through-docker-or-singularity) )
--   Run the following command in your terminal, scATAC-pro will be installed in YOUR\_INSTALL\_PATH/scATAC-pro\_1.1.4
+-   Run the following command in your terminal, scATAC-pro will be installed in YOUR\_INSTALL\_PATH/scATAC-pro\_1.2.0
 
 <!-- -->
 
@@ -49,10 +49,13 @@ Installation
 Updates
 ------------
 - Now provide [scATAC-pro tutorial in R](https://scatacpro-in-r.netlify.app/index.html) for access QC metrics and perform downstream analysis
-- Current version: 1.1.4
+- Current version: 1.2.0
 - Recent updates
+    * updated footprinting analysis dependent module *rgt-hint* to python3
+    * saved qc statistics in html report into tables, and added peak calling summary in the report
+    * added qc per cell to metadata of the seurat object as: total.unique.frags, frac.peak, frac.mito,
+      frac.tss, frac.promoter, and frac.enhancer
     * *demplx_fastq*: the input supports PATH to the DIRECTORY of 10x fastq files
-    * *runGO*: update background genes to be all genes associated with any peak
     * *integrate*: add VFACS (Variable Features Across ClusterS) option for the integration module,
       **which reselect variable features across cell clusters after an initial clustering, followed by 
         another round of dimension reduction and clustering**, specify *Integrate_By = VFACS* in configure file
@@ -87,7 +90,7 @@ Dependencies
 -   bedtools (&gt;=2.27.1)
 -   deepTools (&gt;=3.2.1)
 -   trim\_galore (&gt;=0.6.3), Trimmomatic (&gt;=0.6.3)
--   Regulratory Genomics Toolbox (RGT, for footprinting analysis, will ask whether you want to install it since the installation is done through conda, which takes a while and you may not want to conduct footprinting analysis)
+-   Regulatory Genomics Toolbox (RGT, for footprinting analysis)
 -   g++ compiler, bzip2, ncurses-devel
 -   R packaages: devtools, flexdashboard, png, data.table, Matirx, Rcpp, ggplot2, flexmix, optparse, magrittr, readr, Seurat, bedr, gridExtra, ggrepel, kableExtra, viridis, xlsx, RColorBrewer,pheatmap,motifmatchr, chromVAR, chromVARmotifs, SummarizedExperiment, BiocParallel, DESeq2, clusterProfiler, BSgenome.Hsapiens.UCSC.hg38, BSgenome.Mmusculus.UCSC.mm10, VisCello.atac
 
@@ -278,7 +281,7 @@ Detailed Usage
     usage : scATAC-pro -s STEP -i INPUT -c CONFIG [-o] [-h] [-v]
     Use option -h|--help for more information
 
-    scATAC-pro 1.1.4
+    scATAC-pro 1.2.0
     ---------------
     OPTIONS
 
@@ -419,12 +422,12 @@ Run scATAC-pro through docker or singularity
 ----------------------------------
 In case you have problem in installing dependencies, you can run scATAC-pro without installing dependencies in **one of** the following ways:
 
-1. Run the pre-built dockerized version, pull the docker image [here](https://hub.docker.com/r/wbaopaul/scatac-pro)
+1. Run the pre-built dockerized version, pull the docker image [here](https://hub.docker.com/r/wbaopaul/scatac-pro) 
 
 2. Run it through singularity (which is more friendly with high performance cluster or HPC, and linux server) by running the following command:
 
 ```
-$ singularity pull -F docker://wbaopaul/scatac-pro 
+$ singularity pull -F docker://wbaopaul/scatac-pro:latest 
 ## will generate scatac-pro_latest.sif in current directory
 
 $ singularity exec -H YOUR_WORK_DIR --cleanenv scatac-pro_latest.sif scATAC-pro -s XXX -i XXX -c XXX
@@ -438,11 +441,11 @@ $ singularity exec -H YOUR_WORK_DIR --cleanenv scatac-pro_latest.sif scATAC-pro
 #!/bin/bash
 module load singularity
 
-singularity pull -F docker://wbaopaul/scatac-pro  ## you just need run this line once
+singularity pull -F docker://wbaopaul/scatac-pro:latest  ## you only need to run this line once
 ## will generate scatac-pro_latest.sif in the current directory
 
 singularity exec --cleanenv -H /mnt/isilon/tan_lab/yuw1/run_scATAC-pro/PBMC10k scatac-pro_latest.sif \ 
-scATAC-pro -s mapping -i fastq_file1,fastq_file2 -c configure_user.txt
+scATAC-pro -s mapping -i fastq_PE1_file,fastq_PE2_file -c configure_user.txt
 
 # and then qsub mapping.sh
 ```

diff --git a/complete_update_history.md b/complete_update_history.md
@@ -1,7 +1,12 @@
 ## Complete Update History
-- Current version: 1.1.4
+- Current version: 1.2.0
+    * update footprint dependency *rgt-hint* module to python3
+    * save qc statistics in html report into tables, and peak calling summary info added in the report
+    * add qc per cell to seurat obj metadata as: total.unique.frags, frac.peak, frac.mito,
+      frac.tss, frac.promoter, and frac.enhancer
+- VERSION **1.1.4** released
     * *demplx_fastq*: the input supports directory path of 10x fastq files
-- Current version: 1.1.3
+- VERSION **1.1.3** released
     * *runGO*: update background genes to be all genes associated with any peak
 - May, 2020 --VERSION **1.1.2** released
     * *integrate*: add VFACS (Variable Features Across ClusterS) option for the integration module,

diff --git a/configure_system.txt b/configure_system.txt
@@ -17,7 +17,8 @@ BOWTIE2_PATH = /home/yuw1/.local/bin/bowtie2.2.9
 BOWTIE_PATH = /home/yuw1/.local/bin/bowtie-1.2.2
 MACS2_PATH = /home/yuw1/.local/bin
 PERL_PATH = /usr/bin
+CUTADAPT_PATH = /home/yuw1/.local/bin
 TRIM_GALORE_PATH = /mnt/isilon/tan_lab/yuw1/local_tools/bin/TrimGalore-0.6.3
-HINT_PATH =  /mnt/isilon/tan_lab/yuw1/local_tools/bin/conda3/envs/py2/bin
+HINT_PATH =  /home/yuw1/.local/bin
 TRIMMOMATIC_PATH =  /mnt/isilon/tan_lab/yuw1/local_tools/bin/Trimmomatic-0.39
 GEM_PATH =  /mnt/isilon/tan_lab/yuw1/local_tools/bin/gem
diff --git a/configure_user.txt b/configure_user.txt
@@ -97,7 +97,7 @@ Top_Variable_Features = 10000 ## number/fraction of variable features used for s
 REDUCTION = pca  ## pca/lda, note that UMAP and TSNE will be automatically calculated correspondly
 nREDUCTION = 30 ## the reduced dimension
 CLUSTERING_METHOD = seurat  ## seurat/cisTopic/kmeans/LSI/SCRAT/chromVAR/scABC
-K_CLUSTERS = NULL  ## the number of clusters, will set resolution as 0.2 if it's NULL
+K_CLUSTERS = 0.2  ## the number of clusters (as an integer) or the resolution parameter (as a float) for the louvain algorithm (implemented by seurat)
 prepCello = TRUE  ## generate object for VisCello (for visualization)
 
 

diff --git a/scATAC-pro b/scATAC-pro
@@ -9,7 +9,7 @@
 #########################                                                                   
 
 SOFT="scATAC-pro"
-VERSION="1.1.4"
+VERSION="1.2.0"
 
 function usage {
     echo -e "usage : $SOFT -s STEP -i INPUT -c CONFIG [-o] [-h] [-v] [-b]"

diff --git a/scripts/addQC2seurat.sh b/scripts/addQC2seurat.sh
@@ -0,0 +1,24 @@
+#!/bin/bash
+
+## pass a seurat object file and a qc file to src/addQC2seurat.R
+## the first argument gives the two file paths, separated by comma
+## (seurat object file first, qc file second)
+## the second and third arguments are configure files read by read_conf
+
+set -e
+
+inputs=$1 
+
+inputs=(${inputs//,/ })
+seurat_file=${inputs[0]}
+qc_file=${inputs[1]}
+
+# reading configure file
+curr_dir=`dirname $0`
+source ${curr_dir}/read_conf.sh
+read_conf "$2"
+read_conf "$3"
+
+
+${R_PATH}/R --vanilla --args $seurat_file $qc_file < ${curr_dir}/src/addQC2seurat.R 
+
diff --git a/scripts/clustering.sh b/scripts/clustering.sh
@@ -13,9 +13,11 @@ abs_out_dir=`cd ${OUTPUT_DIR}; pwd`
 abs_down_dir=${abs_out_dir}/downstream_analysis/${PEAK_CALLER}/${CELL_CALLER}
 mkdir -p $abs_down_dir
 
+
+bc_stat_file=${abs_out_dir}/summary/${OUTPUT_PREFIX}.${PEAK_CALLER}.qc_per_barcode.txt
 curr_dir=`dirname $0`
 
-${R_PATH}/Rscript --vanilla ${curr_dir}/src/clustering.R $mtx_file $CLUSTERING_METHOD $K_CLUSTERS $abs_down_dir $GENOME_NAME $TSS $norm_by $REDUCTION $nREDUCTION $Top_Variable_Features 
+${R_PATH}/Rscript --vanilla ${curr_dir}/src/clustering.R $mtx_file $CLUSTERING_METHOD $K_CLUSTERS $abs_down_dir $GENOME_NAME $TSS $norm_by $REDUCTION $nREDUCTION $Top_Variable_Features $bc_stat_file 
 
 
 if [ "$prepCello" = "TRUE" ]; then

diff --git a/scripts/install/install_dependencies.sh b/scripts/install/install_dependencies.sh
@@ -435,6 +435,7 @@ if [ $wasInstalled == 0 ]; then
             conda install cutadapt -y --channel bioconda
         else
             pip install --user --upgrade cutadapt
+            export PATH=~/.local/bin:$PATH
         fi
     fi
 
@@ -498,79 +499,13 @@ wasInstalled=0
 ##########################
 which rgt-hint > /dev/null 2>&1
 if [ $? != "0" ]; then
-    echo -e "$RED""rgt not installed..."
-    echo -e -n "Do you want to install RGT for footprinting analysis (it will take a while to install it) ? (y/n) [n] : " "$NORMAL"
-    read ans
-    if [ XX${ans} = XXy ]; then
-
-        unset PYTHONPATH
-        unset PYTHONHOME
-        echo -e "$RED""OK, trying to install it through conda...""$NORMAL"
-        which conda > /dev/null 2>&1
-        if [ $? != "0" ]; then
-            echo "Install miniconda3:"
-                if [ "$os" = "Darwin" ]; then
-                    $get tmp.sh https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh 
-                else
-                    $get tmp.sh https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh 
-                fi
-                bash tmp.sh -b -f -p $PREFIX_BIN/conda3
-                conda_path=$PREFIX_BIN/conda3/bin
-                export PATH=$conda_path:$PATH
-                conda init bash
-                source ~/.bashrc
-        else
-            conda_path=$(dirname `which conda`)
-            pver=`conda --version 2>&1 | cut -d" " -f2`
-            vercomp $pver "4.7.0"
-            if [[ $? == 2 ]]; then
-                echo -e "$RED""conda v4.7.0 or higher is needed [$pver detected], I will updated it now...""$NORMAL"
-
-                if [ "$os" = "Darwin" ]; then
-                    $get tmp.sh https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh 
-                else
-                    $get tmp.sh https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh 
-                fi
-                bash tmp.sh -b -f -p $PREFIX_BIN/conda3
-                conda_path=$PREFIX_BIN/conda3/bin
-                export PATH=$conda_path:$PATH
-
-                conda init bash
-                source ~/.bashrc
-            fi
-       fi
-        unset PYTHONPATH
-        unset PYTHONHOME
-        ${conda_path}/conda --help > /dev/null 2>&1
-        if [ $? != "0" ]; then
-            "Cannot install Miniconda3, please install it manually!"
-        else
-
-            conda create -y --name py2 python=2.7
-            conda activate py2
-            pip install --upgrade pip
-            pip install pytz pyparsing subprocess32
-            pip install python-dateutil==2.5.0
-            pip install --upgrade cython scipy numpy
-            pip install --upgrade RGT 
-
-            HINT_PATH=$(dirname `which rgt-hint`)
-            if [ $? != '0' ]; then
-                echo -e  "$RED"" I cannot install RGT (for footprint analysis), please install it manually! ].""$NORMAL"
-                exit 
-            fi
-            #echo "install dependent data for rgt, this will take a while"
-            #cd ~/rgtdata
-            #python setupGenomicData.py --mm9
-            #python setupGenomicData.py --mm10
-            #python setupGenomicData.py --hg19
-            #python setupGenomicData.py --hg38
-            #conda deactivate 
-            #conda deactivate
-      fi
-    fi
+    echo -e "$RED""rgt not detected, trying install it now..."
+    pip install --user cython numpy scipy
+    pip install --user RGT
+    export PATH=~/.local/bin:$PATH
+    HINT_PATH=$(dirname `which rgt-hint`)
 else
-        HINT_PATH=$(dirname `which rgt-hint`)
+    HINT_PATH=$(dirname `which rgt-hint`)
 fi
 
 ##check whether hint is installed correctly
@@ -668,6 +603,7 @@ fi
 
 which trim_galore > /dev/null 2>&1
 if [ $? = "0" ]; then
+    echo "CUTADAPT_PATH = "`dirname $(which cutadapt)` >> configure_system.txt
     echo "TRIM_GALORE_PATH = "`dirname $(which trim_galore)` >> configure_system.txt
 fi
 

diff --git a/scripts/src/clustering.R b/scripts/src/clustering.R
@@ -19,6 +19,7 @@ norm_by = args[7]
 REDUCTION = args[8]
 nREDUCTION = as.integer(args[9])
 top_variable_features = as.numeric(args[10])
+qc_stat_file = args[11]
 
 mtx = read_mtx_scATACpro(mtx_file)
 
@@ -37,6 +38,21 @@ mtx = mtx[, cfreqs > 0]
 
 seurat.obj = runSeurat_Atac(mtx, npc = nREDUCTION, norm_by = norm_by, 
                                top_variable_features = top_variable_features, reg.var = 'nCount_ATAC')
+
+## add qc stat to each cell
+qc_singlecell = fread(qc_stat_file)
+qc_singlecell = qc_singlecell[bc %in% colnames(seurat.obj)]
+qc_singlecell = data.frame(qc_singlecell)
+rownames(qc_singlecell) = qc_singlecell$bc
+qc_singlecell$bc = NULL
+names(qc_singlecell) =  c("total.unique.frags", "frac.mito",  "frac.peak",
+                         "frac.promoter", "frac.tss", "frac.enhancer")
+seurat.obj <- AddMetaData(seurat.obj, metadata = qc_singlecell)
+
+
+
+
+
 if(REDUCTION != 'lda'){
     seurat.obj = RunTSNE(seurat.obj, dims = 1:nREDUCTION, reduction = 'pca', check_duplicates = FALSE)
     seurat.obj = RunUMAP(seurat.obj, dims = 1:nREDUCTION, reduction = 'pca', verbose = F)