From 66e8860ea2fb99a895754c24c682560b9f58fe62 Mon Sep 17 00:00:00 2001 From: Ryan Mulqueen Date: Mon, 4 Sep 2023 15:36:49 -0500 Subject: [PATCH] Update gccACT_ont.md --- docs/MD anderson analysis/gccACT_ont.md | 183 ++++++++---------------- 1 file changed, 63 insertions(+), 120 deletions(-) diff --git a/docs/MD anderson analysis/gccACT_ont.md b/docs/MD anderson analysis/gccACT_ont.md index 2315474..7381c07 100644 --- a/docs/MD anderson analysis/gccACT_ont.md +++ b/docs/MD anderson analysis/gccACT_ont.md @@ -17,9 +17,7 @@ First testing pipeline on test data ```bash ssh r1prpsciapp13 -wget -O demo_data.tar.gz \ - https://ont-exd-int-s3-euwst1-epi2me-labs.s3.amazonaws.com/wf-human-variation/demo_data.tar.gz -tar -xzvf demo_data.tar.gz + ``` Basecalling: https://github.com/nanoporetech/dorado @@ -32,19 +30,29 @@ Now our data: ## Set up seadragon for data processing Download data via transfer node ```bash -wget https://cdn.oxfordnanoportal.com/software/analysis/dorado-0.3.4-linux-x64.tar.gz - bsub -Is -W 4:00 -q transfer -n 1 -M 16 -R rusage[mem=16] /bin/bash #get interactive transfer node this has internet access for environment set up + + #ONT data -rsync -LPr mulqueen@10.132.80.157:/volumes/seq/projects/gccACT/230808_mdamb231_ONT ~/projects/gccACT +#rsync -LPr mulqueen@10.132.80.157:/volumes/seq/projects/gccACT/230808_mdamb231_ONT ~/projects/gccACT +#just focusing on the big pod5 files +rsync -LPr mulqueen@10.132.80.157:/volumes/seq/projects/gccACT/230808_mdamb231_ONT/MDA_MB_231_2/MDA_MB_231/20230802_1920_2D_PAO38925_a09c109d/pod5_pass ~/projects/gccACT #dorado prebuilt -rsync -LPr mulqueen@10.132.80.157:/volumes/USR2/Ryan/tools/dorado-0.3.4-linux-x64.tar.gz ~/tools +#rsync -LPr mulqueen@10.132.80.157:/volumes/USR2/Ryan/tools/dorado-0.3.4-linux-x64.tar.gz ~/tools +#or +wget https://cdn.oxfordnanoportal.com/software/analysis/dorado-0.3.4-linux-x64.tar.gz + +#epi2me test data +wget -O demo_data.tar.gz \ + 
https://ont-exd-int-s3-euwst1-epi2me-labs.s3.amazonaws.com/wf-human-variation/demo_data.tar.gz +tar -xzvf demo_data.tar.gz + #dorado reference genome #download model for base calling -#wget https://cdn.oxfordnanoportal.com/software/analysis/dorado-0.3.4-linux-x64.tar.gz -#dorado download --model dna_r10.4.1_e8.2_400bps_hac@v4.2.0_5mCG_5hmCG@v2 #5khz #cpg?? +#ran on navin 10.132.80.157 cluster first and pulled to seadragon +#dorado download --model dna_r10.4.1_e8.2_400bps_hac@v4.2.0_5mCG_5hmCG@v2 #dorado download --model dna_r10.4.1_e8.2_400bps_hac@v4.2.0 rsync -LPr mulqueen@10.132.80.157:/volumes/USR2/Ryan/dna_r10.4.1_e8.2_400bps_hac@v4.2.0 ~/ rsync -LPr mulqueen@10.132.80.157:/volumes/USR2/Ryan/dna_r10.4.1_e8.2_400bps_hac@v4.2.0_5mCG_5hmCG@v2 ~/ @@ -55,13 +63,27 @@ rsync -LPr mulqueen@10.132.80.157:/volumes/USR2/Ryan/wf-human-variation-master ~ #my 10.132.80.157 references rsync -LPr mulqueen@10.132.80.157:/volumes/USR2/Ryan/ref ~/ +mkdir ~/singularity +export NXF_SINGULARITY_CACHEDIR="~/singularity/" + +cd ~/singularity +module load singularity/3.7.0 + +#manual pull of singularity containers so i can run on gpu nodes (taking these from output log of test data ran on seadragon transfer node to see what docker containers it was pulling.) +singularity pull docker://ontresearch/wf-human-variation-sv:shabc3ac908a14705f248cdf49f218956ec33e93ef9 +singularity pull docker://ontresearch/wf-human-variation:sha0337567399c09ef14d1ab9cc114f77de86398e12 +singularity pull docker://ontresearch/wf-cnv:sha428cb19e51370020ccf29ec2af4eead44c6a17c2 +singularity pull docker://ontresearch/wf-human-variation-snp:sha0d7e7e8e8207d9d23fdf50a34ceb577da364373e ``` -```bash ## Dorado basecalling and alignment +Run these scripts with command: +```bash +bsub < 20230731_1851_3E_PAO38479_822d79b2.dorado.bsub +``` +or use the commented out call for an interactive gpu node and run line by line. -This one works! 
20230731_1851_3E_PAO38479_822d79b2.dorado.bsub ```bash #BSUB -J 20230731_1851_3E_PAO38479_822d79b2 @@ -77,7 +99,7 @@ This one works! #BSUB -N #BSUB -u rmulqueen@mdanderson.org -#bsub -Is -W 4:00 -q short -n 1 -gpu num=2:gmem=4 -M 160 -R rusage[mem=160] /bin/bash #get interactive gpu node +#bsub -Is -W 4:00 -q gpu-medium -n 1 -gpu num=2:gmem=4 -M 160 -R rusage[mem=160] /bin/bash #get interactive gpu node pwd module load nextflow/23.04.3 @@ -114,7 +136,7 @@ pod5_dir="/rsrch4/home/genetics/rmulqueen/projects/gccACT/230808_mdamb231_ONT/MD #BSUB -N #BSUB -u rmulqueen@mdanderson.org -#bsub -Is -W 4:00 -q short -n 1 -gpu num=2:gmem=4 -M 160 -R rusage[mem=160] /bin/bash #get interactive gpu node +#bsub -Is -W 4:00 -q gpu-medium -n 1 -gpu num=2:gmem=4 -M 160 -R rusage[mem=160] /bin/bash #get interactive gpu node pwd module load nextflow/23.04.3 @@ -151,7 +173,7 @@ pod5_dir="/rsrch4/home/genetics/rmulqueen/projects/gccACT/230808_mdamb231_ONT/MD #BSUB -N #BSUB -u rmulqueen@mdanderson.org -#bsub -Is -W 4:00 -q short -n 1 -gpu num=2:gmem=4 -M 160 -R rusage[mem=160] /bin/bash #get interactive gpu node +#bsub -Is -W 4:00 -q gpu-medium -n 1 -gpu num=2:gmem=4 -M 160 -R rusage[mem=160] /bin/bash #get interactive gpu node pwd module load nextflow/23.04.3 @@ -168,41 +190,56 @@ pod5_dir="/rsrch4/home/genetics/rmulqueen/projects/gccACT/230808_mdamb231_ONT/MD --verbose \ --reference ${ref} \ --emit-sam \ + --modified-bases-models dna_r10.4.1_e8.2_400bps_hac@v4.2.0_5mCG_5hmCG@v2 \ dna_r10.4.1_e8.2_400bps_hac@v4.2.0 \ ${pod5_dir}/pod5_pass/ | samtools view -b - > ${wd_out}/${output_name}.bam ``` +## Running ONT nextflow pipeline. +Local install and run using GPUs on seadragon +Note for some, the pod5 files are so big (600gb) that they need to be uploaded and cleared from seadragon since the disk quota is 1TB. +Written as interactive node, but can be formatted for bsub job submission as well. Run in a screen so you don't have to keep an active terminal through it. 
- -Local install ```bash -#bsub -Is -W 4:00 -q gpu-medium -n 1 -gpu num=2:gmem=4 -R rusage[mem=160] /bin/bash #get interactive gpu node +bsub -Is -W 4:00 -q gpu -n 1 -gpu num=2:gmem=4 -M 160 -R rusage[mem=160] /bin/bash #get interactive gpu node module load nextflow/23.04.3 module load cuda11.5/toolkit/11.5.1 +module load singularity/3.7.0 +module load samtools/1.15 -ref="/Volumes/USR2/Ryan/ref/refdata-cellranger-arc-GRCh38-2020-A-2.0.0/fasta/genome.fa" -wd_out="/Volumes/seq/projects/gccACT/230808_mdamb231_ONT" +ref="/rsrch4/home/genetics/rmulqueen/ref/genome.fa" +wd_out="/rsrch4/home/genetics/rmulqueen/projects/gccACT/230808_mdamb231_ONT" output_name="20230726_1239_2D_PAO38369_output" #change to each flowcell -pod5_dir="/Volumes/seq/projects/gccACT/230808_mdamb231_ONT/MDA_MB_231/20230726_1239_2D_PAO38369_dde6ac95" #change to each flowcell - +pod5_dir="/rsrch4/home/genetics/rmulqueen/projects/gccACT/230808_mdamb231_ONT/MDA_MB_231/20230726_1239_2D_PAO38369_dde6ac95" #change to each flowcell #download model for base calling #dorado download --model dna_r10.4.1_e8.2_400bps_hac@v4.2.0_5mCG_5hmCG@v2 #5khz #cpg?? 
-dorado download --model dna_r10.4.1_e8.2_400bps_hac@v4.2.0 +#dorado download --model dna_r10.4.1_e8.2_400bps_hac@v4.2.0 -#untested for this part, but use bam from wf-basecalling output as input -nextflow run epi2me-labs/wf-human-variation \ +#these make it run locally, and use the singularity containers we pulled manually above +export NXF_SINGULARITY_CACHEDIR="/rsrch4/home/genetics/rmulqueen/singularity/" +export SINGULARITY_CACHEDIR="/rsrch4/home/genetics/rmulqueen/singularity/" +#mkdir $SINGULARITY_CACHEDIR #make sure these directories are made +#mkdir $SINGULARITY_CACHEDIR/tmp +#mkdir $SINGULARITY_CACHEDIR/pull +export SINGULARITY_TMPDIR=$SINGULARITY_CACHEDIR/tmp +export SINGULARITY_PULLDIR=$SINGULARITY_CACHEDIR/pull +export CWL_SINGULARITY_CACHE=$SINGULARITY_PULLDIR + +#output bam file from dorado caller has to be sorted before it can be used in the pipeline. +samtools sort -@ 10 -T $HOME ${wd_out}/${output_name}.bam > ${wd_out}/${output_name}.sorted.bam + +nextflow run /home/rmulqueen/wf-human-variation-master/main.nf \ -w ${wd_out}/${output_name}/workspace \ -profile singularity \ --snp --sv --cnv --methyl \ --ref ${ref} \ - --bam ${wd_out}/${output_name}.bam \ + --bam ${wd_out}/${output_name}.sorted.bam \ --dorado_ext pod5 \ - --basecaller_basemod_threads 40 \ --basecaller_cfg 'dna_r10.4.1_e8.2_400bps_hac@v4.2.0' \ --remora_cfg 'dna_r10.4.1_e8.2_400bps_sup@v4.2.0_5mCG_5hmCG@v2' \ --sample_name ${output_name} \ @@ -210,99 +247,5 @@ nextflow run epi2me-labs/wf-human-variation \ -with-singularity \ -without-docker -``` - - \ No newline at end of file