diff --git a/README.md b/README.md index f6efe7a..a7dba89 100644 --- a/README.md +++ b/README.md @@ -37,7 +37,7 @@ Installation ------------ - Note: It is not necessary to install scATAC-pro from scratch. You can use the docker or singularity version if your system support (see [Run scATAC-pro through docker or singularity](#run-scATAC-pro-through-docker-or-singularity) ) -- Run the following command in your terminal, scATAC-pro will be installed in YOUR\_INSTALL\_PATH/scATAC-pro\_1.4.3 +- Run the following command in your terminal, scATAC-pro will be installed in YOUR\_INSTALL\_PATH/scATAC-pro\_1.4.4 @@ -49,9 +49,9 @@ Installation Updates ------------ - Now provide [scATAC-pro tutorial in R](https://scatacpro-in-r.netlify.app/index.html) for access QC metrics and perform downstream analysis -- Current version: 1.4.3 +- Current version: 1.4.4 - Highlighted updates - * **New module *reprocess_cellranger_output* added, to reprocess 10x scATAC-seq data (including atac in 10x multiome assay) originally processed by cellranger, taking cellranger processed .bam and .fragments.tsv.gz files as input (v1.4.3)** + * **New module *reprocess_cellranger_output* added, to reprocess 10x scATAC-seq data (including atac in 10x multiome assay) originally processed by cellranger, taking cellranger processed .bam and .fragments.tsv.gz files as input (v1.4.4)** * More friendly to single-end sequencing data (v1.4.2) * New module *labelTransfer* added, to do label trasfer (for cell annotation) from cell annotation of scRNA-seq data. First construct a gene by cell activity matrix, then use *FindTransferAnchors* and *TransferData* function from Seurat R package to predicted cell type annotation from the cell annotaiton in scRNA-seq data (v1.4.0) * New module *rmDoublets* added,to remove potential doublets using [DoubletFinder](https://github.com/chris-mcginnis-ucsf/DoubletFinder) algorithm (v1.3.1) @@ -302,7 +302,7 @@ See [here](https://scatacpro-in-r.netlify.app/note_module) or in your terminal: usage : scATAC-pro -s STEP -i INPUT -c CONFIG [-o] [-h] [-v] Use option -h|--help for more information - scATAC-pro 1.4.3 + scATAC-pro 1.4.4 --------------- OPTIONS diff --git a/complete_update_history.md b/complete_update_history.md index 93a58ce..6b07832 100644 --- a/complete_update_history.md +++ b/complete_update_history.md @@ -1,4 +1,9 @@ ## Complete Update History +- Version 1.4.4 released + * Only consider standard chromosomes in the *qc_per_barcode* module + * Correct a minor bug in the *qc_per_barcode* module + * Add version# in the html report + * Clean and correct a minor bug in the *trimming* module - Version 1.4.3 released * add new module *reprocess_cellranger_output* to reprocess scATAC-seq data originally processed by cellranger - Version 1.4.2 released diff --git a/scATAC-pro b/scATAC-pro index 789d8a7..3141f8b 100755 --- a/scATAC-pro +++ b/scATAC-pro @@ -9,7 +9,7 @@ ######################### SOFT="scATAC-pro" -VERSION="1.4.3" +VERSION="1.4.4" function usage { echo -e "usage : $SOFT -s STEP -i INPUT -c CONFIG [-o] [-h] [-v] [-b]" diff --git a/scripts/bam2qc.sh b/scripts/bam2qc.sh index 7ce9694..7a94818 100755 --- a/scripts/bam2qc.sh +++ b/scripts/bam2qc.sh @@ -63,6 +63,7 @@ ${SAMTOOLS_PATH}/samtools index -@ $ncore ${mapRes_dir}/${OUTPUT_PREFIX}.positio if [ $MAPQ -ne 30 ]; then ${SAMTOOLS_PATH}/samtools view -f $flag0 -b -h -q $MAPQ -@ $ncore $position_sort_bam -o ${mapRes_dir}/${OUTPUT_PREFIX}.positionsort.MAPQ${MAPQ}.bam + ${SAMTOOLS_PATH}/samtools index -@ $ncore ${mapRes_dir}/${OUTPUT_PREFIX}.positionsort.MAPQ${MAPQ}.bam fi ## mapping stats diff --git a/scripts/mapping.sh b/scripts/mapping.sh index 7f94832..cd1f38c 100755 --- a/scripts/mapping.sh +++ b/scripts/mapping.sh @@ -93,7 +93,7 @@ ${R_PATH}/Rscript --vanilla ${curr_dir}/src/sort_frags.R ${qc_dir}/${OUTPUT_PREF ## index fragment file #sort -k1,1 -k2,2n -T ${mapRes_dir}/tmp/ ${qc_dir}/${OUTPUT_PREFIX}.fragments.tsv > ${qc_dir}/${OUTPUT_PREFIX}.fragments.sorted.tsv #mv ${qc_dir}/${OUTPUT_PREFIX}.fragments.sorted.tsv ${qc_dir}/${OUTPUT_PREFIX}.fragments.tsv -${TABIX_PATH}/bgzip ${qc_dir}/${OUTPUT_PREFIX}.fragments.tsv -${TABIX_PATH}/tabix -p bed ${qc_dir}/${OUTPUT_PREFIX}.fragments.tsv.gz +${TABIX_PATH}/bgzip -f ${qc_dir}/${OUTPUT_PREFIX}.fragments.tsv +${TABIX_PATH}/tabix -f -p bed ${qc_dir}/${OUTPUT_PREFIX}.fragments.tsv.gz diff --git a/scripts/process_with_bam.sh b/scripts/process_with_bam.sh index 2bbc9ab..7ae9b61 100755 --- a/scripts/process_with_bam.sh +++ b/scripts/process_with_bam.sh @@ -29,7 +29,7 @@ ${R_PATH}/Rscript --vanilla ${curr_dir}/src/sort_frags.R ${qc_dir}/${OUTPUT_PREF #sort -k1,1 -k2,2n -T ${mapRes_dir}/tmp/ ${qc_dir}/${OUTPUT_PREFIX}.fragments.tsv > ${qc_dir}/${OUTPUT_PREFIX}.fragments.sorted.tsv #mv ${qc_dir}/${OUTPUT_PREFIX}.fragments.sorted.tsv ${qc_dir}/${OUTPUT_PREFIX}.fragments.tsv ${TABIX_PATH}/bgzip -f ${qc_dir}/${OUTPUT_PREFIX}.fragments.tsv -${TABIX_PATH}/tabix -p bed ${qc_dir}/${OUTPUT_PREFIX}.fragments.tsv.gz +${TABIX_PATH}/tabix -f -p bed ${qc_dir}/${OUTPUT_PREFIX}.fragments.tsv.gz ## 2.call peak diff --git a/scripts/report.sh b/scripts/report.sh index c9e7b04..8ae14c9 100755 --- a/scripts/report.sh +++ b/scripts/report.sh @@ -23,9 +23,8 @@ configure_file=`basename ${2}` abs_configure_dir=`cd ${configure_dir}; pwd` abs_configure_file=${abs_configure_dir}/${configure_file} -#${R_PATH}/Rscript --vanilla ${curr_dir}/src/render2report.R \ -# ${abs_report_dir}/scATAC-pro_report_${OUTPUT_PREFIX}.html $abs_out_dir ${work_dir}/${2} +scatacpro_version=`scATAC-pro --version | cut -d " " -f3` ${R_PATH}/Rscript --vanilla ${curr_dir}/src/render2report.R \ - ${abs_report_dir}/scATAC-pro_report_${OUTPUT_PREFIX}.html $abs_out_dir $abs_configure_file + ${abs_report_dir}/scATAC-pro_report_${OUTPUT_PREFIX}.html $abs_out_dir $abs_configure_file $scatacpro_version $OUTPUT_PREFIX echo "Report generation Done!" diff --git a/scripts/src/get_qc_per_barcode.R b/scripts/src/get_qc_per_barcode.R index e64302e..39eb537 100644 --- a/scripts/src/get_qc_per_barcode.R +++ b/scripts/src/get_qc_per_barcode.R @@ -3,7 +3,7 @@ library(data.table) library(Rcpp) library(Matrix) - +library(GenomicRanges) #sourceCpp(paste0('getOverlaps.cpp')) @@ -122,11 +122,14 @@ frags = fread(frags.file, select=1:4, header = F) names(frags) = c('chr', 'start', 'end', 'bc') setkey(frags, chr, start) +## only keep reads in standard chrs +chrs = standardChromosomes(makeGRangesFromDataFrame(frags)) +frags = frags[chr %in% chrs] frags[, 'total_frags' := .N, by = bc] frags = frags[total_frags > 5] -frags = frags[!grepl(chr, pattern = 'random', ignore.case = T)] -frags = frags[!grepl(chr, pattern ='un', ignore.case = T)] +#frags = frags[!grepl(chr, pattern = 'random', ignore.case = T)] +#frags = frags[!grepl(chr, pattern ='un', ignore.case = T)] peaks = fread(peaks.file, select=1:3, header = F) tss = fread(tss.file, select=1:3, header = F) @@ -147,7 +150,7 @@ if(file.exists(enhs.file)) { setkey(peaks, chr, start) setkey(tss, chr, start) -chrs = unique(frags$chr) +#chrs = unique(frags$chr) ## calculate tss enrichment score if(T){ diff --git a/scripts/src/labelTransfer.R b/scripts/src/labelTransfer.R index 8364b42..6e55f92 100644 --- a/scripts/src/labelTransfer.R +++ b/scripts/src/labelTransfer.R @@ -15,7 +15,7 @@ gene_gtf_file = args[4] ## if gtf file not provided, using R bioconductor packages for gene annotation if(!file.exists(gene_gtf_file)){ if(!grepl(GENOME_NAME, pattern = 'hg19|hg38|mm10|mm9', ignore.case = T)){ - stop('Genome is not belong to any of hg19,hg38,mm9 or mm10, + stop('Genome does not belong to any of hg19,hg38,mm9 or mm10, please provide .gtf file for gene annotation!') } if(grepl(GENOME_NAME, pattern = 'mm10', ignore.case = T)) { @@ -61,64 +61,10 @@ if(!file.exists(gene_gtf_file)){ gene_ann[, 'gene_name' := unlist(strsplit(gene_name, ' '))[3], by = gene_name] names(gene_ann)[1] = 'chr' gene_ann = subset(gene_ann, select = c(chr, V4, V5, V7, gene_name)) - chrs = 1:22 - chrs = c(chrs, 'X', 'Y', 'M') + chrs = standardChromosomes(makeGRangesFromDataFrame(gene_ann)) gene_ann = gene_ann[chr %in% chrs] gene_ann = gene_ann[!duplicated(gene_name)] names(gene_ann)[2:4] = c('gene_start', 'gene_end', 'strand') - gene_ann[, 'chr' := paste0('chr', chr)] - -} - - -if(F){ - ## download gtf file if not provided - if(!file.exists(gene_gtf_file)){ - print('gene annotation gtf file not provided, I will try to download one:') - err = 0 - if(grepl(GENOME_NAME, pattern = 'mm9', ignore.case = T)) { - err <- tryCatch(download.file('ftp://ftp.ensembl.org/pub/release-67/gtf/mus_musculus/Mus_musculus.NCBIM37.67.gtf.gz', temp), - error = function(e) { - print("Cannot download ftp://ftp.ensembl.org/pub/release-67/gtf/mus_musculus/Mus_musculus.NCBIM37.67.gtf.gz!") - return(1)}) - } - if(grepl(GENOME_NAME, pattern = 'mm10', ignore.case = T)) { - err <- tryCatch(download.file('ftp://ftp.ensembl.org/pub/release-95/gtf/mus_musculus/Mus_musculus.GRCm38.95.gtf.gz', temp), - error = function(e) { - print("Cannot download ftp://ftp.ensembl.org/pub/release-95/gtf/mus_musculus/Mus_musculus.GRCm38.95.gtf.gz!") - return(1)}) - } - if(grepl(GENOME_NAME, pattern = 'hg38', ignore.case = T)) { - err <- tryCatch(download.file('ftp://ftp.ensembl.org/pub/release-95/gtf/homo_sapiens/Homo_sapiens.GRCh38.95.gtf.gz', temp), - error = function(e) { - print("Cannot download ftp://ftp.ensembl.org/pub/release-95/gtf/homo_sapiens/Homo_sapiens.GRCh38.95.gtf.gz!") - return(1)}) - } - - if(grepl(GENOME_NAME, pattern = 'hg19', ignore.case = T)) { - err <- tryCatch(download.file('ftp://ftp.ensembl.org/pub/release-67/gtf/homo_sapiens/Homo_sapiens.GRCh37.67.gtf.gz', temp), - error = function(e) { - print("Cannot download ftp://ftp.ensembl.org/pub/release-67/gtf/homo_sapiens/Homo_sapiens.GRCh37.67.gtf.gz!") - return(1)}) - } - - if(err == 1) stop('Download failed! Please provide a gtf file to run this module!') - } - - gene_ann = fread(gene_gtf_file, sep = '\t') - gene_ann = gene_ann[V3 == 'gene'] - gene_ann[, 'gene_name' := unlist(strsplit(V9, ';'))[3], by = V9] - gene_ann[, 'gene_name' := gsub("\"", "", gene_name), by = gene_name] - gene_ann[, 'gene_name' := unlist(strsplit(gene_name, ' '))[3], by = gene_name] - names(gene_ann)[1] = 'chr' - gene_ann = subset(gene_ann, select = c(chr, V4, V5, V7, gene_name)) - chrs = 1:22 - chrs = c(chrs, 'X', 'Y') - gene_ann = gene_ann[chr %in% chrs] - gene_ann = gene_ann[!duplicated(gene_name)] - names(gene_ann)[2:4] = c('gene_start', 'gene_end', 'strand') - gene_ann[, 'chr' := paste0('chr', chr)] - } seurat.rna = readRDS(inputSeurat_rna) diff --git a/scripts/src/render2report.R b/scripts/src/render2report.R index 4307bca..74f094a 100644 --- a/scripts/src/render2report.R +++ b/scripts/src/render2report.R @@ -4,10 +4,13 @@ args = commandArgs(T) output_file = args[1] result_dir = args[2] configure_file = args[3] +scatacpro_version = args[4] +sample_name = args[5] argv <- commandArgs(trailingOnly = FALSE) curr_dir <- dirname(substring(argv[grep("--file=", argv)], 8)) out_dir = dirname(output_file) rmarkdown::render(paste0(curr_dir, "/scATAC-pro_report.Rmd"), output_file=output_file, intermediates_dir = out_dir, - params = list(output_dir = result_dir, configure_user = configure_file)) + params = list(set_title = scatacpro_version, set_sample = sample_name, + output_dir = result_dir, configure_user = configure_file)) diff --git a/scripts/src/scATAC-pro_report.Rmd b/scripts/src/scATAC-pro_report.Rmd index b8ee309..a80daeb 100644 --- a/scripts/src/scATAC-pro_report.Rmd +++ b/scripts/src/scATAC-pro_report.Rmd @@ -1,14 +1,15 @@ --- -title: scATAC-pro Report output: flexdashboard::flex_dashboard: vertical_layout: fill social: menu theme: united params: + set_title: scATAC-pro Report + set_sample: PMBC10K output_dir: /mnt/isilon/tan_lab/yuw1/run_scATAC-pro/PBMC10k/output configure_user: /mnt/isilon/tan_lab/yuw1/run_scATAC-pro/PBMC10k/configure_user.txt - +title: "scATAC-pro `r params$set_title`: `r params$set_sample`" ---