merge dev

wbaopaul · Apr 20, 2020 · 8964476 · 8964476
2 parents 950ad99 + a62836a
commit 8964476
Show file tree

Hide file tree

Showing 9 changed files with 482 additions and 51 deletions.
diff --git a/.nfs000000056817ba0600001376 b/.nfs000000056817ba0600001376
diff --git a/README.md b/README.md
@@ -48,7 +48,7 @@ Updates
 ------------
 - Current version: 1.1.1
 - March, 2020
-    * *get_mtx* requires two fragments.txt and peak file, separated by comma
+    * *get_mtx* requires two input files: a fragments.txt file and a peak file, separated by comma
     * annotate peak as overlapped with a gene Tss if the corresponding distance <= 1000bp; mark peak with a gene if their distance <= 100kb 
 - Feb, 2020
     * *integrate* module enables 3 options: seurat, harmony and pool
@@ -61,7 +61,6 @@ Updates
     * added new parameters in the configuration file: Top_Variable_Features, REDUCTION, nREDUCTION
     * enabled all clustering methods mentioned in the manuscript, along with kmeans clustering on principal components
     * file path changed to like downstreame_analysis/PEAK_CALLER/CELL_CALLER/..., indicating peak caller
-    * qc_per_barcode requires two input files, separated by comma, see example and detailed usage
 - Jan, 2020 
     * added a new module *mergePeaks* to merge different peak files called from different data sets
     * added a new module *reConstMtx* to reconstruct peak-by-cell matrix given a peak file, a fragment file and a barcodes.txt file
@@ -82,7 +81,7 @@ Dependencies
 
 ### Programming language users should install
 
--   R (&gt;=3.6.0)
+-   R (&gt;=3.6.1)
 -   Python (&gt;=3.6.0)
 
 ### Software packages required
@@ -298,7 +297,7 @@ Detailed Usage
           qc_per_barcode: generate quality control metrics for each barcode
                                     input: fragment.txt file (outputted from module mapping) and peak/feature file, 
                                            (outputted from module call_peak), separated by comma
-                                     output: qc_per_barcode.summary in plain text format, saved in output/summary/
+                                     output: qc_per_barcode.txt file, saved in output/summary/
           call_cell: perform cell calling
                                input: raw peak-by-barcode matrix file, outputted from the get_mtx module
                                output: filtered peak-by-cell matrix in Market Matrix format, barcodes and features,
@@ -445,5 +444,4 @@ FAQs
 
 Citation
 --------------------------------------
-Yu W, Uzun Y, Zhu Q, Chen C, Tan K. [*scATAC-pro: a comprehensive workbench for single-cell chromatin accessibility sequencing data.*](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-020-02008-0) Genome Biology; 2020
-
+Yu W, Uzun Y, Zhu Q, Chen C, Tan K. [*scATAC-pro: a comprehensive workbench for single-cell chromatin accessibility sequencing data.*](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-020-02008-0) Genome Biology; 2020 
diff --git a/job_pub2 b/job_pub2
diff --git a/scATAC-pro b/scATAC-pro
@@ -49,7 +49,7 @@ function help {
                              output: Aggregated data in .bw and .bedgraph format, saved in output/signal/"
     echo "      qc_per_barcode: generate quality control metrics for each barcode
                                 input: fragment.txt file, outputted from the mapping module, and feature/peak file, outputt                                       ed from the call_peak module,  separated by comma
-                                output: qc_per_barcode.summary file of plain text format, saved in output/summary/"
+                                output: qc_per_barcode.txt file, saved in output/summary/"
     echo "      call_cell: perform cell calling
                            input: raw peak-by-barcode matrix file path, outputted from the get_mtx module
                            output: filtered peak-by-cell matrix, saved in output/filtered_matrix/PEAK_CALLER/CELL_CALLER/"

diff --git a/scripts/install/install_Rpackages.R b/scripts/install/install_Rpackages.R
@@ -5,7 +5,7 @@ if(!require(BiocManager)){
 }
 
 pks = c('devtools', 'flexdashboard', 'png', 'data.table', 'Matirx', 'Rcpp', 'ggplot2', 'flexmix',
-  'optparse', 'magrittr', 'readr', 'Seurat', 'bedr', 'gridExtra', 'ggrepel', 'kableExtra', 'viridis', 'writexl', 'xlsx')
+  'optparse', 'magrittr', 'readr', 'Seurat', 'bedr', 'gridExtra', 'ggrepel', 'kableExtra', 'viridis', 'writexl', 'xlsx', 'mefa4')
 
 for(pk in pks){
     if(!require(pk, character.only = T)) {

diff --git a/scripts/install/install_dependencies.sh b/scripts/install/install_dependencies.sh
@@ -152,9 +152,9 @@ if [ $? != "0" ]; then
     exit 1;
 else
     pver=`R --version 2>&1 | head -1 | cut -d" " -f3`
-    vercomp $pver "3.6.0"
+    vercomp $pver "3.6.1"
     if [[ $? == 2 ]]; then
-    echo -e "$RED""R v3.6.0 or higher is needed [$pver detected].""$NORMAL"
+    echo -e "$RED""R v3.6.1 or higher is needed [$pver detected].""$NORMAL"
     exit 1;
     fi
 fi

diff --git a/scripts/src/dsAnalysis_utilities.R b/scripts/src/dsAnalysis_utilities.R
@@ -1074,11 +1074,11 @@ doCicero_gascore <- function(seurat.obj, reduction = 'tsne', chr_sizes,
   rownames(mtx) <- new.rnames
 
 
-
-  dt = reshape2::melt(as.matrix(mtx), value.name = 'count')
+  #dt = reshape2::melt(as.matrix(mtx), value.name = 'count')
+  #dt = dt[dt$count > 0, ]
+  dt = mefa4::Melt(mtx)
   rm(mtx)
-  dt = dt[dt$count > 0, ]
-  names(dt) = c('Peak', 'Cell', 'Count')
+  names(dt) = c('Peak', 'Cell', 'Count') 
   dt$Cell = as.character(dt$Cell)
   dt$Peak = as.character(dt$Peak)
 
@@ -1145,9 +1145,10 @@ doCicero_conn <- function(seurat.obj, reduction = 'tsne',
   new.rnames = sapply(new.rnames, function(x) gsub('-', '_', x))
   rownames(mtx) <- new.rnames
 
-  dt = reshape2::melt(as.matrix(mtx), value.name = 'count')
+  #dt = reshape2::melt(as.matrix(mtx), value.name = 'count')
+  #dt = dt[dt$count > 0, ]
+  dt = mefa4::Melt(mtx)
   rm(mtx)
-  dt = dt[dt$count > 0, ]
   names(dt) = c('Peak', 'Cell', 'Count')
   dt$Cell = as.character(dt$Cell)
   dt$Peak = as.character(dt$Peak)

diff --git a/scripts/src/runDA.R b/scripts/src/runDA.R
@@ -17,55 +17,37 @@ test_use = args[5]
 seurat.obj = readRDS(seuratObj_file)
 
 seurat.obj$active_clusters = as.character(seurat.obj$active_clusters)
+Idents(seurat.obj) <- seurat.obj$active_clusters
 
 group1 = tolower(group1)
 group2 = tolower(group2)
 
-confVar = 'nCount_ATAC'
-if(test_use == 'wilcox' || test_use == 'DESeq2') confVar = NULL
-
-mtx = seurat.obj@assays$ATAC@data
-
-if(test_use %in% c('DESeq2', 'negbinom')){
-  mtx = seurat.obj@assays$ATAC@counts
+confVar = NULL
+slot = 'data'
+if(test_use %in% c('DESeq2', 'LR', 'negbinom')){
+  slot = 'counts'
+  confVar = 'nCount_ATAC'
 }
 
-## features that are only expressed less than 15% in any of the cluster
-cls = unique(seurat.obj$active_clusters)
-mean_frac_cls = sapply(cls, function(x){
-    dat0 = mtx[, seurat.obj$active_clusters == x]
-    Matrix::rowMeans(dat0 > 0)
-})
-
-sele.features = rownames(mean_frac_cls)[rowSums(mean_frac_cls > 0.1) > 1]
-
-
-## select variable features
-#seurat.obj <- FindVariableFeatures(object = seurat.obj,
-#                                         selection.method = 'vst',
-#                                         nfeatures = floor(nrow(mtx) * 0.5))
-#vFeatures = VariableFeatures(seurat.obj)
-rnames = rownames(mtx)
-mtx = mtx[rnames %in% sele.features, ]
-
+cls = sort(unique(seurat.obj$active_clusters))
 group1 = unlist(strsplit(group1, ':'))
 group2 = unlist(strsplit(group2, ':'))
-
 cn = colnames(seurat.obj)
 
 if(group1[1] == 'one') {
-   cls = sort(unique(seurat.obj$active_clusters))
    markers = NULL
    for(cluster0 in cls){
         cells1 = cn[which(seurat.obj$active_clusters == cluster0)]
         if(group2[1] == 'rest') {
           cells2 = cn[which(seurat.obj$active_clusters != cluster0)]
+          id2 = NULL
         }else{
           cells2 = cn[which(seurat.obj$active_clusters %in% group2)]
+          id2 = group2
         }
         if(length(cells1) <= 10 || length(cells2) <= 10) next
-        mm = FindMarkers(mtx, 
-                         cells.1 = cells1, cells.2 = cells2,
+        mm = FindMarkers(seurat.obj, slot = slot, 
+                         ident.1 = cluster0, ident.2 = id2,
                          test.use = test_use, logfc.threshold = 0.0, 
                          max.cells.per.ident = 500, 
                          only.pos = T, latent.vars = confVar)
@@ -81,17 +63,18 @@ if(group1[1] == 'one') {
   if(length(cells1) <= 10) stop('Not enough cells in group1')
     if(group2[1] == 'rest'){
         cells2 = cn[which(!seurat.obj$active_clusters %in% group1)]
-        markers = FindMarkers(mtx, 
-                              cells.1 = cells1, cells.2 = cells2, test.use = test_use, 
+        id2 = setdiff(cls, group1)
+        markers = FindMarkers(seurat.obj, slot = slot,
+                              ident.1 = group1, ident.2 = id2, test.use = test_use, 
                               logfc.threshold = 0.0, max.cells.per.ident = 500,
                               only.pos = F, latent.vars = confVar)
         markers$cluster = ifelse(markers$avg_logFC > 0, group1, group2)
         markers$fdr = p.adjust(markers$p_val, method = 'fdr')
     }else{
         cells2 = cn[which(seurat.obj$active_clusters %in% group2)]
         if(length(cells2) <= 10) stop('Not enough cells in group2')
-        markers = FindMarkers(mtx,  
-                              cells.1 = cells1, cells.2 = cells2, test.use = test_use,
+        markers = FindMarkers(seurat.obj, slot = slot, 
+                              ident.1 = group1, ident.2 = group2, test.use = test_use,
                               max.cells.per.ident = 500, logfc.threshold = 0.0, 
                               only.pos = F, latent.vars = confVar)
         markers$cluster = ifelse(markers$avg_logFC > 0, group1, group2)
@@ -111,8 +94,7 @@ markers[, 'end' := unlist(strsplit(peak0, '-'))[3], by = peak0]
 setcolorder(markers, c('chr', 'start', 'end', 'p_val','avg_logFC','pct.1','pct.2', 
                        'p_val_adj', 'fdr', 'cluster', 'peak', 'peak0'))
 
-#markers = markers[abs(avg_logFC) > 0, ]
 markers = markers[fdr <= 0.05, ]
-write.table(markers, file = paste0(output_dir, '/differential_accessible_features_', group1, '_vs_', group2, '.txt'), sep = '\t',
+write.table(markers, file = paste0(output_dir, '/differential_accessible_features_', args[3], '_vs_', args[4], '.txt'), sep = '\t',
             quote = F, row.names = F)
 
diff --git a/update_note.tmp → update_note.txt b/update_note.tmp → update_note.txt
@@ -1,3 +1,4 @@
+## using mefa4::Melt instead of melt -- better for large sparse matrix
 ## Mar14, 2020
 -- add PEAK_CALLER prefix to qc_per_barcode.txt filename
 ## Mar15, 2020
@@ -8,3 +9,5 @@
 -- get_mtx inputs: fragments.txt,feature.txt
 -- annotate peaks update: a peak within tss+/- 1000bp will be annotated with geneName_tss and
    if the nearest distance to a gene is larger than 100kb, the peak will not be annotated 
+## April14, 2020
+-- update DA, fix bug of using covariate