From 66e8860ea2fb99a895754c24c682560b9f58fe62 Mon Sep 17 00:00:00 2001
From: Ryan Mulqueen <mulqueenr@gmail.com>
Date: Mon, 4 Sep 2023 15:36:49 -0500
Subject: [PATCH] Update gccACT_ont.md

---
 docs/MD anderson analysis/gccACT_ont.md | 183 ++++++++----------------
 1 file changed, 63 insertions(+), 120 deletions(-)

diff --git a/docs/MD anderson analysis/gccACT_ont.md b/docs/MD anderson analysis/gccACT_ont.md
index 2315474..7381c07 100644
--- a/docs/MD anderson analysis/gccACT_ont.md	
+++ b/docs/MD anderson analysis/gccACT_ont.md	
@@ -17,9 +17,7 @@ First testing pipeline on test data
 
 ```bash
 ssh r1prpsciapp13
-wget -O demo_data.tar.gz \
-    https://ont-exd-int-s3-euwst1-epi2me-labs.s3.amazonaws.com/wf-human-variation/demo_data.tar.gz
-tar -xzvf demo_data.tar.gz
+
 ```
 
 Basecalling: https://github.com/nanoporetech/dorado
@@ -32,19 +30,29 @@ Now our data:
 ## Set up seadragon for data processing
 Download data via transfer node
 ```bash
-wget https://cdn.oxfordnanoportal.com/software/analysis/dorado-0.3.4-linux-x64.tar.gz
-
 bsub -Is -W 4:00 -q transfer -n 1 -M 16 -R rusage[mem=16] /bin/bash #get interactive transfer node this has internet access for environment set up
 
+
+
 #ONT data
-rsync  -LPr mulqueen@10.132.80.157:/volumes/seq/projects/gccACT/230808_mdamb231_ONT ~/projects/gccACT
+#rsync  -LPr mulqueen@10.132.80.157:/volumes/seq/projects/gccACT/230808_mdamb231_ONT ~/projects/gccACT
+#just focusing on the big pod5 files
+rsync -LPr mulqueen@10.132.80.157:/volumes/seq/projects/gccACT/230808_mdamb231_ONT/MDA_MB_231_2/MDA_MB_231/20230802_1920_2D_PAO38925_a09c109d/pod5_pass  ~/projects/gccACT
 
 #dorado prebuilt
-rsync -LPr mulqueen@10.132.80.157:/volumes/USR2/Ryan/tools/dorado-0.3.4-linux-x64.tar.gz ~/tools
+#rsync -LPr mulqueen@10.132.80.157:/volumes/USR2/Ryan/tools/dorado-0.3.4-linux-x64.tar.gz ~/tools
+#or
+wget https://cdn.oxfordnanoportal.com/software/analysis/dorado-0.3.4-linux-x64.tar.gz
+
+#epi2me test data
+wget -O demo_data.tar.gz \
+    https://ont-exd-int-s3-euwst1-epi2me-labs.s3.amazonaws.com/wf-human-variation/demo_data.tar.gz
+tar -xzvf demo_data.tar.gz
+
 #dorado reference genome
 #download model for base calling
-#wget https://cdn.oxfordnanoportal.com/software/analysis/dorado-0.3.4-linux-x64.tar.gz
-#dorado download --model dna_r10.4.1_e8.2_400bps_hac@v4.2.0_5mCG_5hmCG@v2 #5khz #cpg??
+#ran on navin 10.132.80.157 cluster first and pulled to seadragon
+#dorado download --model dna_r10.4.1_e8.2_400bps_hac@v4.2.0_5mCG_5hmCG@v2 
 #dorado download --model dna_r10.4.1_e8.2_400bps_hac@v4.2.0
 rsync -LPr mulqueen@10.132.80.157:/volumes/USR2/Ryan/dna_r10.4.1_e8.2_400bps_hac@v4.2.0 ~/
 rsync -LPr mulqueen@10.132.80.157:/volumes/USR2/Ryan/dna_r10.4.1_e8.2_400bps_hac@v4.2.0_5mCG_5hmCG@v2 ~/
@@ -55,13 +63,27 @@ rsync -LPr mulqueen@10.132.80.157:/volumes/USR2/Ryan/wf-human-variation-master ~
 #my 10.132.80.157 references
 rsync -LPr mulqueen@10.132.80.157:/volumes/USR2/Ryan/ref ~/
 
+mkdir -p "$HOME/singularity" #-p so rerunning does not fail if the cache directory already exists
+export NXF_SINGULARITY_CACHEDIR="$HOME/singularity/" #use $HOME here: a ~ inside double quotes is not expanded and would be exported literally
+
+cd ~/singularity
+module load singularity/3.7.0
+
+#manual pull of singularity containers so i can run on gpu nodes (taking these from output log of test data ran on seadragon transfer node to see what docker containers it was pulling.)
+singularity pull docker://ontresearch/wf-human-variation-sv:shabc3ac908a14705f248cdf49f218956ec33e93ef9 
+singularity pull docker://ontresearch/wf-human-variation:sha0337567399c09ef14d1ab9cc114f77de86398e12 
+singularity pull docker://ontresearch/wf-cnv:sha428cb19e51370020ccf29ec2af4eead44c6a17c2 
+singularity pull docker://ontresearch/wf-human-variation-snp:sha0d7e7e8e8207d9d23fdf50a34ceb577da364373e 
 
 ```
 
-```bash
 ## Dorado basecalling and alignment
+Run these scripts with command:
+```bash
+bsub < 20230731_1851_3E_PAO38479_822d79b2.dorado.bsub 
+```
+or use the commented-out call for an interactive GPU node and run line by line.
 
-This one works! 
 20230731_1851_3E_PAO38479_822d79b2.dorado.bsub
 ```bash
 #BSUB -J 20230731_1851_3E_PAO38479_822d79b2
@@ -77,7 +99,7 @@ This one works!
 #BSUB -N
 #BSUB -u rmulqueen@mdanderson.org
 
-#bsub -Is -W 4:00 -q short -n 1 -gpu num=2:gmem=4 -M 160 -R rusage[mem=160] /bin/bash #get interactive gpu node
+#bsub -Is -W 4:00 -q gpu-medium -n 1 -gpu num=2:gmem=4 -M 160 -R rusage[mem=160] /bin/bash #get interactive gpu node
 
 pwd
 module load nextflow/23.04.3
@@ -114,7 +136,7 @@ pod5_dir="/rsrch4/home/genetics/rmulqueen/projects/gccACT/230808_mdamb231_ONT/MD
 #BSUB -N
 #BSUB -u rmulqueen@mdanderson.org
 
-#bsub -Is -W 4:00 -q short -n 1 -gpu num=2:gmem=4 -M 160 -R rusage[mem=160] /bin/bash #get interactive gpu node
+#bsub -Is -W 4:00 -q gpu-medium -n 1 -gpu num=2:gmem=4 -M 160 -R rusage[mem=160] /bin/bash #get interactive gpu node
 
 pwd
 module load nextflow/23.04.3
@@ -151,7 +173,7 @@ pod5_dir="/rsrch4/home/genetics/rmulqueen/projects/gccACT/230808_mdamb231_ONT/MD
 #BSUB -N
 #BSUB -u rmulqueen@mdanderson.org
 
-#bsub -Is -W 4:00 -q short -n 1 -gpu num=2:gmem=4 -M 160 -R rusage[mem=160] /bin/bash #get interactive gpu node
+#bsub -Is -W 4:00 -q gpu-medium -n 1 -gpu num=2:gmem=4 -M 160 -R rusage[mem=160] /bin/bash #get interactive gpu node
 
 pwd
 module load nextflow/23.04.3
@@ -168,41 +190,56 @@ pod5_dir="/rsrch4/home/genetics/rmulqueen/projects/gccACT/230808_mdamb231_ONT/MD
     --verbose \
     --reference ${ref} \
     --emit-sam \
+    --modified-bases-models dna_r10.4.1_e8.2_400bps_hac@v4.2.0_5mCG_5hmCG@v2 \
     dna_r10.4.1_e8.2_400bps_hac@v4.2.0 \
     ${pod5_dir}/pod5_pass/ | samtools view -b - > ${wd_out}/${output_name}.bam
 
 ```
 
+## Running ONT nextflow pipeline.
 
+Local install and run using GPUs on seadragon
 
+Note: for some, the pod5 files are so big (600 GB) that they need to be uploaded and then cleared from seadragon, since the disk quota is 1 TB.
 
+Written for an interactive node, but can be formatted for bsub job submission as well. Run it in a screen session so you don't have to keep an active terminal open throughout.
 
-
-Local install
 ```bash
-#bsub -Is -W 4:00 -q gpu-medium -n 1 -gpu num=2:gmem=4 -R rusage[mem=160] /bin/bash #get interactive gpu node
+bsub -Is -W 4:00 -q gpu -n 1 -gpu num=2:gmem=4 -M 160 -R rusage[mem=160] /bin/bash #get interactive gpu node
 
 module load nextflow/23.04.3
 module load cuda11.5/toolkit/11.5.1
+module load singularity/3.7.0
+module load samtools/1.15 
 
-ref="/Volumes/USR2/Ryan/ref/refdata-cellranger-arc-GRCh38-2020-A-2.0.0/fasta/genome.fa"
-wd_out="/Volumes/seq/projects/gccACT/230808_mdamb231_ONT"
+ref="/rsrch4/home/genetics/rmulqueen/ref/genome.fa"
+wd_out="/rsrch4/home/genetics/rmulqueen/projects/gccACT/230808_mdamb231_ONT"
 output_name="20230726_1239_2D_PAO38369_output" #change to each flowcell
-pod5_dir="/Volumes/seq/projects/gccACT/230808_mdamb231_ONT/MDA_MB_231/20230726_1239_2D_PAO38369_dde6ac95" #change to each flowcell
-
+pod5_dir="/rsrch4/home/genetics/rmulqueen/projects/gccACT/230808_mdamb231_ONT/MDA_MB_231/20230726_1239_2D_PAO38369_dde6ac95" #change to each flowcell
 #download model for base calling
 #dorado download --model dna_r10.4.1_e8.2_400bps_hac@v4.2.0_5mCG_5hmCG@v2 #5khz #cpg??
-dorado download --model dna_r10.4.1_e8.2_400bps_hac@v4.2.0
+#dorado download --model dna_r10.4.1_e8.2_400bps_hac@v4.2.0
 
-#untested for this part, but use bam from wf-basecalling output as input
-nextflow run epi2me-labs/wf-human-variation \
+#these make it run locally, and use the singularity containers we pulled manually above
+export NXF_SINGULARITY_CACHEDIR="/rsrch4/home/genetics/rmulqueen/singularity/"
+export SINGULARITY_CACHEDIR="/rsrch4/home/genetics/rmulqueen/singularity/"
+#mkdir $SINGULARITY_CACHEDIR #make sure these directories are made
+#mkdir $SINGULARITY_CACHEDIR/tmp
+#mkdir $SINGULARITY_CACHEDIR/pull
+export SINGULARITY_TMPDIR=$SINGULARITY_CACHEDIR/tmp
+export SINGULARITY_PULLDIR=$SINGULARITY_CACHEDIR/pull
+export CWL_SINGULARITY_CACHE=$SINGULARITY_PULLDIR
+
+#output bam file from dorado caller has to be sorted before it can be used in the pipeline.
+samtools sort -@ 10 -T $HOME ${wd_out}/${output_name}.bam > ${wd_out}/${output_name}.sorted.bam
+
+nextflow run /home/rmulqueen/wf-human-variation-master/main.nf \
     -w ${wd_out}/${output_name}/workspace \
     -profile singularity \
     --snp --sv --cnv --methyl \
     --ref ${ref} \
-    --bam ${wd_out}/${output_name}.bam \
+    --bam ${wd_out}/${output_name}.sorted.bam \
     --dorado_ext pod5 \
-    --basecaller_basemod_threads 40 \
     --basecaller_cfg 'dna_r10.4.1_e8.2_400bps_hac@v4.2.0'  \
     --remora_cfg 'dna_r10.4.1_e8.2_400bps_sup@v4.2.0_5mCG_5hmCG@v2' \
     --sample_name ${output_name} \
@@ -210,99 +247,5 @@ nextflow run epi2me-labs/wf-human-variation \
     -with-singularity \
     -without-docker
 
-```
-
-<!--
-
-#connect through Finder to seq and USR2 (for reference genome.fa)
-
-# cd ~
-# nextflow run epi2me-labs/wf-basecalling \
-#     -w ${wd_out}/${output_name}/workspace \
-#     --input $pod5_dir \
-#     --dorado_ext pod5 \
-#     --ref $ref \
-#     --out_dir ${wd_out}/${output_name} \
-#     --basecaller_cfg "dna_r10.4.1_e8.2_400bps_hac@v4.2.0" \
-#     --basecaller_basemod_threads 80 \
-#     --remora_cfg "dna_r10.4.1_e8.2_400bps_hac@v4.2.0_5mCG_5hmCG@v2"
-
 
 ```
-
-```bash
-ssh seadragon
-sftp mulqueen@10.132.80.157
-lcd ~/projects/gccACT #seadragon directory
-get -R /volumes/seq/projects/gccACT/230808_mdamb231_ONT/ #download ONT data
-get -R ~/wf-human-variation #downloaded 
-
-#transfer
-bqueues
-bsub -Is -W 4:00 -q gpu-medium -n 1 -gpu num=1:gmem=4 -M 16 -R rusage[mem=16] /bin/bash #get interactive gpu node
-
-bsub -Is -W 4:00 -q transfer -n 1 -M 16 -R rusage[mem=16] /bin/bash #get interactive gpu node
-
-
-#test demo data
-wget -O demo_data.tar.gz https://ont-exd-int-s3-euwst1-epi2me-labs.s3.amazonaws.com/wf-human-variation/demo_data.tar.gz #download
-tar -xzvf demo_data.tar.gz #extract
-
-
-module load singularity/3.5.2 #load singularity
-#module load cuda10.1/toolkit/10.1.243 #load cuda
-module load nextflow/22.10.6 #load nextflow
-#install nextflow instead of using module
-bsub -Is -W 4:00 -q transfer -n 1 -M 16 -R rusage[mem=16] /bin/bash #get interactive gpu node
-cd ~/
-
-#on 10.132.80.157  server do 
-curl -s "https://get.sdkman.io" | bash
-curl -s https://get.nextflow.io | bash
-#transfer to seadragon
-
-sftp mulqueen@10.132.80.157 #transfer sdk to home directory
-get -R .sdkman
-get -R .nextflow
-get nextflow
-source "$HOME/.sdkman/bin/sdkman-init.sh"
-sdk install java 17.0.6-amzn
-#transfer nextflow to home directory
-./nextflow self-update
-mv ~/nextflow ~/tools #moving to in PATH
-
-
-OUTPUT=output
-
-
-nextflow run epi2me-labs/wf-human-variation \
-    -w ${OUTPUT}/workspace \
-    -profile standard \
-    --snp --sv \
-    --bam demo_data/demo.bam \
-    --bed demo_data/demo.bed \
-    --ref demo_data/demo.fasta \
-    --basecaller_cfg 'dna_r10.4.1_e8.2_400bps_hac@v4.1.0'  \
-    --sample_name MY_SAMPLE \
-    --out_dir ${OUTPUT} \
-    -with-singularity \
-    -without-docker
-```
-
-
-
-
-
-
-Our servers don't have GPUs so I'm just going to use my macbook
-https://labs.epi2me.io/downloads/
-and install docker desktop
-https://www.docker.com/products/docker-desktop/
-
-installing nextflow
-curl -s "https://get.sdkman.io" | bash
-source "/Users/rmulqueen/.sdkman/bin/sdkman-init.sh"
-sdk install java 17.0.6-tem
-wget -qO- https://get.nextflow.io | bash
-
--->
\ No newline at end of file