First testing pipeline on test data
```bash
ssh r1prpsciapp13
wget -O demo_data.tar.gz \
  https://ont-exd-int-s3-euwst1-epi2me-labs.s3.amazonaws.com/wf-human-variation/demo_data.tar.gz
tar -xzvf demo_data.tar.gz
```

Basecalling: https://github.com/nanoporetech/dorado
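
The basic dorado pattern is to download a model, then point the basecaller at a pod5 directory. A minimal sketch (model name and pod5 path are illustrative; the real seadragon calls are below):

```bash
#minimal sketch; model name and pod5 path are illustrative
dorado download --model [email protected]
dorado basecaller --emit-sam [email protected] pod5_pass/ | samtools view -b - > calls.bam
```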

Now our data:

## Set up seadragon for data processing
Download data via transfer node
```bash
bsub -Is -W 4:00 -q transfer -n 1 -M 16 -R rusage[mem=16] /bin/bash #get an interactive transfer node; it has internet access for environment setup

#ONT data
#rsync -LPr [email protected]:/volumes/seq/projects/gccACT/230808_mdamb231_ONT ~/projects/gccACT
#just focusing on the big pod5 files
rsync -LPr [email protected]:/volumes/seq/projects/gccACT/230808_mdamb231_ONT/MDA_MB_231_2/MDA_MB_231/20230802_1920_2D_PAO38925_a09c109d/pod5_pass ~/projects/gccACT

#dorado prebuilt
#rsync -LPr [email protected]:/volumes/USR2/Ryan/tools/dorado-0.3.4-linux-x64.tar.gz ~/tools
#or
wget https://cdn.oxfordnanoportal.com/software/analysis/dorado-0.3.4-linux-x64.tar.gz

#epi2me test data
wget -O demo_data.tar.gz \
  https://ont-exd-int-s3-euwst1-epi2me-labs.s3.amazonaws.com/wf-human-variation/demo_data.tar.gz
tar -xzvf demo_data.tar.gz

#dorado models for base calling
#ran dorado download on the navin 10.132.80.157 cluster first and pulled the models to seadragon
#dorado download --model [email protected]_5mCG_5hmCG@v2
#dorado download --model [email protected]
rsync -LPr [email protected]:/volumes/USR2/Ryan/[email protected] ~/
rsync -LPr [email protected]:/volumes/USR2/Ryan/[email protected]_5mCG_5hmCG@v2 ~/

rsync -LPr [email protected]:/volumes/USR2/Ryan/wf-human-variation-master ~
#my 10.132.80.157 references
rsync -LPr [email protected]:/volumes/USR2/Ryan/ref ~/

mkdir ~/singularity
export NXF_SINGULARITY_CACHEDIR="$HOME/singularity/" #use $HOME here; "~" does not expand inside quotes

cd ~/singularity
module load singularity/3.7.0

#manual pull of singularity containers so I can run on GPU nodes (image tags taken from the output log of the test data run on a seadragon transfer node, to see which docker containers the pipeline pulls)
singularity pull docker://ontresearch/wf-human-variation-sv:shabc3ac908a14705f248cdf49f218956ec33e93ef9
singularity pull docker://ontresearch/wf-human-variation:sha0337567399c09ef14d1ab9cc114f77de86398e12
singularity pull docker://ontresearch/wf-cnv:sha428cb19e51370020ccf29ec2af4eead44c6a17c2
singularity pull docker://ontresearch/wf-human-variation-snp:sha0d7e7e8e8207d9d23fdf50a34ceb577da364373e
```
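
Before moving to a GPU node, it's worth confirming the pulls actually landed in the cache. A minimal sketch (the .sif filenames follow `singularity pull`'s default naming):

```bash
#list cached images and print basic metadata for each
ls -lh ~/singularity/*.sif
for img in ~/singularity/*.sif; do singularity inspect "$img"; done
```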

## Dorado basecalling and alignment
Run these scripts with the command:
```bash
bsub < 20230731_1851_3E_PAO38479_822d79b2.dorado.bsub
```
or use the commented-out call for an interactive GPU node and run line by line.
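
While the job runs, standard LSF commands can track it. A minimal sketch (the `-J` name matches the `#BSUB -J` line in the script):

```bash
bjobs -w                                      #list jobs with untruncated names
bpeek -J 20230731_1851_3E_PAO38479_822d79b2   #peek at the job's stdout so far
```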

This one works!
20230731_1851_3E_PAO38479_822d79b2.dorado.bsub
```bash
#BSUB -J 20230731_1851_3E_PAO38479_822d79b2

#BSUB -N
#BSUB -u [email protected]

#bsub -Is -W 4:00 -q gpu-medium -n 1 -gpu num=2:gmem=4 -M 160 -R rusage[mem=160] /bin/bash #get interactive gpu node

pwd
module load nextflow/23.04.3
```

The .bsub scripts for the other flowcells follow the same pattern, each with its own `pod5_dir` and output name. The dorado call at the end pipes straight into samtools to write a BAM:

```bash
--verbose \
--reference ${ref} \
--emit-sam \
[email protected] \
${pod5_dir}/pod5_pass/ | samtools view -b - > ${wd_out}/${output_name}.bam
```
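
After a flowcell finishes, a quick integrity check on the output BAM before moving on. A minimal sketch using standard samtools subcommands:

```bash
samtools quickcheck ${wd_out}/${output_name}.bam && echo "BAM looks intact"
samtools flagstat ${wd_out}/${output_name}.bam   #read and alignment counts
```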

## Running the ONT nextflow pipeline

Local install and run using GPUs on seadragon.

Note: for some flowcells the pod5 files are so big (600 GB) that they need to be uploaded and then cleared from seadragon, since the disk quota is 1 TB.

This is written for an interactive node, but it can be formatted for bsub job submission as well. Run it in a screen session so you don't have to keep an active terminal open throughout.
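
For example, a minimal screen workflow (the session name is arbitrary):

```bash
screen -S ont_pipeline   #start a named session, then launch the pipeline inside it
#detach with Ctrl-a d; the job keeps running
screen -r ont_pipeline   #reattach later to check progress
```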

Local install
```bash
#bsub -Is -W 4:00 -q gpu-medium -n 1 -gpu num=2:gmem=4 -R rusage[mem=160] /bin/bash #get interactive gpu node
bsub -Is -W 4:00 -q gpu -n 1 -gpu num=2:gmem=4 -M 160 -R rusage[mem=160] /bin/bash #get interactive gpu node

module load nextflow/23.04.3
module load cuda11.5/toolkit/11.5.1
module load singularity/3.7.0
module load samtools/1.15

ref="/rsrch4/home/genetics/rmulqueen/ref/genome.fa"
wd_out="/rsrch4/home/genetics/rmulqueen/projects/gccACT/230808_mdamb231_ONT"
output_name="20230726_1239_2D_PAO38369_output" #change for each flowcell
pod5_dir="/rsrch4/home/genetics/rmulqueen/projects/gccACT/230808_mdamb231_ONT/MDA_MB_231/20230726_1239_2D_PAO38369_dde6ac95" #change for each flowcell

#download model for base calling (already pulled to seadragon above)
#dorado download --model [email protected]_5mCG_5hmCG@v2
#dorado download --model [email protected]

#these make it run locally and use the singularity containers we pulled manually above
export NXF_SINGULARITY_CACHEDIR="/rsrch4/home/genetics/rmulqueen/singularity/"
export SINGULARITY_CACHEDIR="/rsrch4/home/genetics/rmulqueen/singularity/"
#mkdir $SINGULARITY_CACHEDIR #make sure these directories exist
#mkdir $SINGULARITY_CACHEDIR/tmp
#mkdir $SINGULARITY_CACHEDIR/pull
export SINGULARITY_TMPDIR=$SINGULARITY_CACHEDIR/tmp
export SINGULARITY_PULLDIR=$SINGULARITY_CACHEDIR/pull
export CWL_SINGULARITY_CACHE=$SINGULARITY_PULLDIR

#the bam output by the dorado caller has to be sorted before it can be used in the pipeline
samtools sort -@ 10 -T $HOME ${wd_out}/${output_name}.bam > ${wd_out}/${output_name}.sorted.bam

nextflow run /home/rmulqueen/wf-human-variation-master/main.nf \
-w ${wd_out}/${output_name}/workspace \
-profile singularity \
--snp --sv --cnv --methyl \
--ref ${ref} \
--bam ${wd_out}/${output_name}.sorted.bam \
--dorado_ext pod5 \
--basecaller_basemod_threads 40 \
--basecaller_cfg '[email protected]' \
--remora_cfg '[email protected]_5mCG_5hmCG@v2' \
--sample_name ${output_name} \
--out_dir ${wd_out}/${output_name}/ \
-with-singularity \
-without-docker
```
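
If the pipeline errors on a missing index, indexing the sorted BAM first is cheap. A minimal sketch (assuming wf-human-variation wants a .bai next to the input BAM):

```bash
samtools index -@ 10 ${wd_out}/${output_name}.sorted.bam   #writes ${output_name}.sorted.bam.bai alongside
```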