diff --git a/.github/workflows/align.yml b/.github/workflows/align.yml
index d005a95de..68b31ec71 100644
--- a/.github/workflows/align.yml
+++ b/.github/workflows/align.yml
@@ -23,6 +23,7 @@ on:
       - 'workflow/scripts/makewindows.py'
       - 'test/fastq/**'
       - '**align.yml'
+      - '**__main__.py'
 
   pull_request:
     branches:
@@ -46,6 +47,7 @@ on:
       - 'workflow/scripts/makewindows.py'
       - 'test/fastq/**'
       - '**align.yml'
+      - '**__main__.py'
 
 env:
   CACHE_NUMBER: 0  # increase to reset cache manually
@@ -78,7 +80,7 @@ jobs:
         shell: micromamba-shell {0}
       - name: test bwa
         shell: micromamba-shell {0}
-        run: harpy align bwa -g test/genome/genome.fasta.gz -q -s "--show-failed-logs" test/fastq
+        run: harpy align bwa -g test/genome/genome.fasta.gz -s "--show-failed-logs" test/fastq
   test_ema:
     name: ema
     runs-on: ubuntu-latest
@@ -106,4 +108,4 @@ jobs:
         shell: micromamba-shell {0}
       - name: test ema
         shell: micromamba-shell {0}
-        run: harpy align ema --ema-bins 20 -g test/genome/genome.fasta.gz -q -s "--show-failed-logs" test/fastq
\ No newline at end of file
+        run: harpy align ema --ema-bins 20 -g test/genome/genome.fasta.gz -s "--show-failed-logs" test/fastq
\ No newline at end of file
diff --git a/.github/workflows/demux.yml b/.github/workflows/demux.yml
index ae26fa89f..c55ac7def 100644
--- a/.github/workflows/demux.yml
+++ b/.github/workflows/demux.yml
@@ -17,6 +17,7 @@ on:
       - '**demux.yml'
       - 'BC_files.py'
       - 'test/data/demux/**'
+      - '**__main__.py'
 
   pull_request:
     branches:
@@ -34,7 +35,7 @@ on:
       - '**demux.yml'
       - 'BC_files.py'
       - 'test/data/demux/**'
-
+      - '**__main__.py'
 env:
   CACHE_NUMBER: 0  # increase to reset cache manually
diff --git a/.github/workflows/impute.yml b/.github/workflows/impute.yml
index 49d2db77e..fad5f8ada 100644
--- a/.github/workflows/impute.yml
+++ b/.github/workflows/impute.yml
@@ -19,6 +19,7 @@ on:
       - 'workflow/report/Impute.Rmd'
       - 'workflow/report/StitchCollate.Rmd'
       - 'workflow/scripts/stitch_impute.R'
+      - '**__main__.py'
 
   pull_request:
     branches:
@@ -38,6 +39,7 @@ on:
       - 'workflow/report/Impute.Rmd'
       - 'workflow/report/StitchCollate.Rmd'
       - 'workflow/scripts/stitch_impute.R'
+      - '**__main__.py'
 
 env:
   CACHE_NUMBER: 0  # increase to reset cache manually
diff --git a/.github/workflows/phase.yml b/.github/workflows/phase.yml
index 51efe3429..6edcc263c 100644
--- a/.github/workflows/phase.yml
+++ b/.github/workflows/phase.yml
@@ -20,6 +20,7 @@ on:
       - 'workflow/report/HapCut2.Rmd'
       - 'workflow/scripts/parsePhaseBlocks.py'
       - 'workflow/scripts/summarizeHaplobocks.py'
+      - '**__main__.py'
 
   pull_request:
     branches:
@@ -40,6 +41,7 @@ on:
       - 'workflow/report/HapCut2.Rmd'
       - 'workflow/scripts/parsePhaseBlocks.py'
       - 'workflow/scripts/summarizeHaplobocks.py'
+      - '**__main__.py'
 
 env:
   CACHE_NUMBER: 0  # increase to reset cache manually
diff --git a/.github/workflows/preflight.yml b/.github/workflows/preflight.yml
index 94d159562..2da384fe2 100644
--- a/.github/workflows/preflight.yml
+++ b/.github/workflows/preflight.yml
@@ -20,6 +20,7 @@ on:
       - 'workflow/report/PreflightFastq.Rmd'
       - 'workflow/report/PreflightBam.Rmd'
       - '**preflight.yml'
+      - '**__main__.py'
 
   pull_request:
     branches:
@@ -40,6 +41,7 @@ on:
       - 'workflow/report/PreflightFastq.Rmd'
       - 'workflow/report/PreflightBam.Rmd'
       - '**preflight.yml'
+      - '**__main__.py'
 
 env:
   CACHE_NUMBER: 0  # increase to reset cache manually
diff --git a/.github/workflows/qc.yml b/.github/workflows/qc.yml
index 400b2f545..bb324397a 100644
--- a/.github/workflows/qc.yml
+++ b/.github/workflows/qc.yml
@@ -15,6 +15,7 @@ on:
       - '**validations.py'
       - 'test/fastq/**'
       - '**qc.yml'
+      - '**__main__.py'
 
   pull_request:
     branches:
@@ -30,6 +31,7 @@ on:
       - '**validations.py'
       - 'test/fastq/**'
       - '**qc.yml'
+      - '**__main__.py'
 
 env:
   CACHE_NUMBER: 0  # increase to reset cache manually
diff --git a/.github/workflows/variants_snp.yml b/.github/workflows/variants_snp.yml
index 51c53eb0f..9e7bc6474 100644
--- a/.github/workflows/variants_snp.yml
+++ b/.github/workflows/variants_snp.yml
@@ -19,6 +19,7 @@ on:
       - 'workflow/scripts/makewindows.py'
       - 'test/bam/**'
       - '**variants_snp.yml'
+      - '**__main__.py'
 
   pull_request:
     branches:
@@ -38,6 +39,7 @@ on:
       - 'workflow/scripts/makewindows.py'
       - 'test/bam/**'
       - '**variants_snp.yml'
+      - '**__main__.py'
 
 env:
   CACHE_NUMBER: 0  # increase to reset cache manually
@@ -68,9 +70,12 @@ jobs:
          python3 -m pip install --upgrade build && python3 -m build
          pip install dist/*.whl
          resources/buildforCI.sh
-      - name: variants mpileup
+      - name: snp mpileup
        shell: micromamba-shell {0}
        run: harpy snp mpileup -w 150000 -g test/genome/genome.fasta.gz -q test/bam
+      - name: snp mpileup-pop
+        shell: micromamba-shell {0}
+        run: harpy snp mpileup -w 150000 -g test/genome/genome.fasta.gz -p test/samples.groups -q test/bam
   test_freebayes:
     name: freebayes
     runs-on: ubuntu-latest
@@ -96,6 +101,9 @@ jobs:
          python3 -m pip install --upgrade build && python3 -m build
          pip install dist/*.whl
          resources/buildforCI.sh
-      - name: variants freebayes
+      - name: snp freebayes
+        shell: micromamba-shell {0}
+        run: harpy snp freebayes -w 150000 -g test/genome/genome.fasta.gz -q test/bam
+      - name: snp freebayes-pop
         shell: micromamba-shell {0}
-        run: harpy snp freebayes -w 150000 -g test/genome/genome.fasta.gz -q test/bam
\ No newline at end of file
+        run: harpy snp freebayes -w 150000 -g test/genome/genome.fasta.gz -p test/samples.groups -q test/bam
\ No newline at end of file
diff --git a/.github/workflows/variants_sv.yml b/.github/workflows/variants_sv.yml
index f75cf824f..ee71b357e 100644
--- a/.github/workflows/variants_sv.yml
+++ b/.github/workflows/variants_sv.yml
@@ -21,6 +21,7 @@ on:
       - 'workflow/report/Leviathan**.Rmd'
       - 'workflow/report/Naibr**.Rmd'
       - 'workflow/scripts/inferSV.py'
+      - '**__main__.py'
 
   pull_request:
     branches:
@@ -42,6 +43,7 @@ on:
       - 'workflow/report/Leviathan**.Rmd'
       - 'workflow/report/Naibr**.Rmd'
       - 'workflow/scripts/inferSV.py'
+      - '**__main__.py'
 
 env:
   CACHE_NUMBER: 0  # increase to reset cache manually
@@ -79,7 +81,7 @@ jobs:
       - name: leviathan-pop
         if: always()
         shell: micromamba-shell {0}
-        run: harpy sv leviathan -g test/genome/genome.fasta.gz -p test/samples.groups -q -s "--show-failed-logs" test/bam
+        run: harpy sv leviathan -g test/genome/genome.fasta.gz -o SV/leviathanpop -p test/samples.groups -q -s "--show-failed-logs" test/bam
   test_naibr:
     name: naibr
     runs-on: ubuntu-latest
@@ -107,16 +109,16 @@ jobs:
         resources/buildforCI.sh
       - name: naibr
         shell: micromamba-shell {0}
-        run: harpy sv naibr -g test/genome/genome.fasta.gz -q -s "--show-failed-logs" test/bam_phased
+        run: harpy sv naibr -g test/genome/genome.fasta.gz -o SV/naibr -s "--show-failed-logs" test/bam_phased
       - name: naibr pop
         if: always()
         shell: micromamba-shell {0}
-        run: harpy sv naibr -g test/genome/genome.fasta.gz -p test/samples.groups -q -s "--show-failed-logs" test/bam_phased
+        run: harpy sv naibr -g test/genome/genome.fasta.gz -o SV/pop -p test/samples.groups -s "--show-failed-logs" test/bam_phased
       - name: naibr with phasing
         if: always()
         shell: micromamba-shell {0}
-        run: harpy sv naibr -g test/genome/genome.fasta.gz -v test/vcf/test.phased.bcf -q -s "--show-failed-logs" test/bam
+        run: harpy sv naibr -g test/genome/genome.fasta.gz -o SV/phase -v test/vcf/test.phased.bcf -s "--show-failed-logs" test/bam
       - name: naibr pop with phasing
         if: always()
         shell: micromamba-shell {0}
-        run: harpy sv naibr -g test/genome/genome.fasta.gz -v test/vcf/test.phased.bcf -p test/samples.groups -q -s "--show-failed-logs" test/bam
+        run: harpy sv naibr -g test/genome/genome.fasta.gz -o SV/phasepop -v test/vcf/test.phased.bcf -p test/samples.groups -s "--show-failed-logs" test/bam
diff --git a/.gitignore b/.gitignore
index 58986f677..0adbc7e24 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,12 +1,13 @@
 .snakemake/
 .vscode/
-harpyenvs/
+.harpy_envs/
 Seqs/
 QC/
 Demultiplex/
 Genome/
 Align/
-Variants/
+SNP/
+SV/
 Impute/
 Preflight/
 Phase/
diff --git a/pyproject.toml b/pyproject.toml
index c31289933..bc32ba5ae 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,12 +4,12 @@
 build-backend = "setuptools.build_meta"
 
 [project]
 name = "harpy"
-version = "0.7.3"
+version = "0.8.0"
 authors = [
     {name = "Pavel Dimens"}
 ]
 description = "An automated workflow to demultiplex sequences, trim reads, map sequences, call variants, impute genotypes, and phase haplotypes of Haplotagging data. Batteries included."
-requires-python = ">=3.7"
+requires-python = ">=3.9"
 readme = {file = "README.md", content-type = "text/markdown"}
 license = {text = "GPL-3.0"}
diff --git a/resources/createharpyenv.sh b/resources/createharpyenv.sh
index b82e8dd00..3ae0eb285 100644
--- a/resources/createharpyenv.sh
+++ b/resources/createharpyenv.sh
@@ -4,9 +4,9 @@
 mamba create -n $1 -c bioconda -c conda-forge \
     bcftools=1.19 \
-    pysam=0.22 \
     python \
+    pandas \
     rich-click \
-    snakemake \
+    snakemake-minimal>7 \
     samtools \
     seqtk
\ No newline at end of file
diff --git a/resources/meta.yaml b/resources/meta.yaml
index c5a29d899..06ad58bf6 100644
--- a/resources/meta.yaml
+++ b/resources/meta.yaml
@@ -20,15 +20,14 @@ build:
 
 requirements:
   host:
-    - python =3.10
+    - python =3.12
     - pip
   run:
     - bcftools =1.19
     - pandas
-    - pysam =0.22
-    - python =3.10
+    - python >3.10
     - rich-click
-    - snakemake-minimal
+    - snakemake-minimal >7
     - samtools
     - seqtk
     - tabix
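The packaging files above raise harpy's floors in lockstep: Python >=3.9 for the wheel and `snakemake-minimal >7` in both conda recipes. A hypothetical runtime guard expressing the same constraint (harpy does not ship this check; pip enforces `requires-python` at install time):

```python
# Hypothetical guard mirroring the new pyproject.toml floor; illustration only.
import sys

if sys.version_info < (3, 9):
    raise SystemExit("harpy 0.8.0 requires Python >= 3.9")
```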
"--snakemake", "--quiet", "--help"], }, ], "harpy qc": [ { - "name": "Configuration", - "options": ["--directory", "--max-length", "--ignore-adapters", "--extra-params"], + "name": "Module Parameters", + "options": ["--max-length", "--ignore-adapters", "--extra-params"], }, { "name": "Other Options", - "options": ["--threads", "--skipreports", "--snakemake", "--quiet", "--print-only", "--help"], + "options": ["--output-dir", "--threads", "--skipreports", "--snakemake", "--quiet", "--help"], }, ], "harpy align bwa": [ { - "name": "Configuration", + "name": "Module Parameters", "options": ["--genome", "--quality-filter", "--molecule-distance", "--extra-params"], }, { "name": "Other Options", - "options": ["--threads", "--skipreports", "--snakemake", "--quiet", "--print-only", "--help"], + "options": ["--output-dir", "--threads", "--skipreports", "--snakemake", "--quiet", "--help"], }, ], "harpy align ema": [ { - "name": "Configuration", + "name": "Module Parameters", "options": ["--platform", "--whitelist", "--genome", "--quality-filter", "--ema-bins", "--extra-params"], }, { "name": "Other Options", - "options": ["--threads", "--skipreports", "--snakemake", "--quiet", "--print-only", "--help"], + "options": ["--output-dir", "--threads", "--skipreports", "--snakemake", "--quiet", "--help"], }, ], "harpy snp mpileup": [ { - "name": "Configuration", + "name": "Module Parameters", "options": ["--genome", "--populations", "--ploidy", "--windowsize", "--extra-params"], }, { "name": "Other Options", - "options": ["--threads", "--skipreports", "--snakemake", "--quiet", "--print-only", "--help"], + "options": ["--output-dir", "--threads", "--skipreports", "--snakemake", "--quiet", "--help"], }, ], "harpy snp freebayes": [ { - "name": "Configuration", + "name": "Module Parameters", "options": ["--genome", "--populations", "--ploidy", "--windowsize", "--extra-params"], }, { "name": "Other Options", - "options": ["--threads", "--skipreports", "--snakemake", "--quiet", "--print-only", "--help"], + "options": ["--output-dir", "--threads", "--skipreports", "--snakemake", "--quiet", "--help"], }, ], "harpy sv leviathan": [ { - "name": "Configuration", + "name": "Module Parameters", "options": ["--genome", "--populations", "--extra-params"], }, { "name": "Other Options", - "options": ["--threads", "--skipreports", "--snakemake", "--quiet", "--print-only", "--help"], + "options": ["--output-dir", "--threads", "--skipreports", "--snakemake", "--quiet", "--help"], }, ], "harpy sv naibr": [ { - "name": "Configuration", + "name": "Module Parameters", "options": ["--genome", "--vcf", "--molecule-distance", "--populations", "--extra-params"], }, { "name": "Other Options", - "options": ["--threads", "--skipreports", "--snakemake", "--quiet", "--print-only", "--help"], + "options": ["--output-dir", "--threads", "--skipreports", "--snakemake", "--quiet", "--help"], }, ], "harpy impute": [ { - "name": "Configuration", + "name": "Module Parameters", "options": ["--vcf", "--parameters", "--extra-params", "--vcf-samples"], }, { "name": "Other Options", - "options": ["--threads", "--skipreports", "--snakemake", "--quiet", "--print-only", "--help"], + "options": ["--output-dir", "--threads", "--skipreports", "--snakemake", "--quiet", "--help"], }, ], "harpy phase": [ { - "name": "Configuration", + "name": "Module Parameters", "options": ["--vcf", "--molecule-distance", "--genome", "--prune-threshold", "--ignore-bx", "--extra-params", "--vcf-samples"], }, { "name": "Other Options", - "options": ["--threads", 
"--skipreports", "--snakemake", "--quiet", "--print-only", "--help"], + "options": ["--output-dir", "--threads", "--skipreports", "--snakemake", "--quiet", "--help"], }, ] } diff --git a/src/harpy/align.py b/src/harpy/align.py index 5ad0c1904..5204ce803 100644 --- a/src/harpy/align.py +++ b/src/harpy/align.py @@ -17,9 +17,10 @@ @click.option('-s', '--snakemake', type = str, metavar = "String", help = 'Additional Snakemake parameters, in quotes') @click.option('-q', '--quiet', is_flag = True, show_default = True, default = False, metavar = "Toggle", help = 'Don\'t show output text while running') @click.option('-r', '--skipreports', is_flag = True, show_default = True, default = False, metavar = "Toggle", help = 'Don\'t generate any HTML reports') -@click.option('--print-only', is_flag = True, show_default = True, default = False, metavar = "Toggle", help = 'Print the generated snakemake command and exit') +@click.option('-o', '--output-dir', type = str, default = "Align/bwa", show_default=True, metavar = "String", help = 'Name of output directory') +@click.option('--print-only', is_flag = True, hidden = True, default = False, metavar = "Toggle", help = 'Print the generated snakemake command and exit') @click.argument('input', required=True, type=click.Path(exists=True), nargs=-1) -def bwa(input, genome, threads, extra_params, quality_filter, molecule_distance, snakemake, skipreports, quiet, print_only): +def bwa(input, output_dir, genome, threads, extra_params, quality_filter, molecule_distance, snakemake, skipreports, quiet, print_only): """ Align sequences to genome using BWA MEM @@ -30,8 +31,9 @@ def bwa(input, genome, threads, extra_params, quality_filter, molecule_distance, Instead, Harpy post-processes the alignments using the specified `--molecule-distance` to assign alignments to unique molecules. 
""" - workflowdir = "Align/bwa/workflow" - command = f'snakemake --rerun-incomplete --nolock --use-conda --conda-prefix ./.snakemake/conda --cores {threads} --directory .'.split() + output_dir = output_dir.rstrip("/") + workflowdir = f"{output_dir}/workflow" + command = f'snakemake --rerun-incomplete --nolock --software-deployment-method conda --conda-prefix ./.snakemake/conda --cores {threads} --directory .'.split() command.append('--snakefile') command.append(f'{workflowdir}/align-bwa.smk') command.append("--configfile") @@ -50,12 +52,15 @@ def bwa(input, genome, threads, extra_params, quality_filter, molecule_distance, sn = parse_fastq_inputs(input, f"{workflowdir}/input") samplenames = get_samples_from_fastq(f"{workflowdir}/input") fetch_file("align-bwa.smk", f"{workflowdir}/") + fetch_file("assignMI.py", f"{workflowdir}/scripts/") + fetch_file("bxStats.py", f"{workflowdir}/scripts/") for i in ["BxStats", "BwaGencov"]: fetch_file(f"{i}.Rmd", f"{workflowdir}/report/") with open(f"{workflowdir}/config.yml", "w") as config: config.write(f"genomefile: {genome}\n") config.write(f"seq_directory: {workflowdir}/input\n") + config.write(f"output_directory: {output_dir}\n") config.write(f"samplenames: {samplenames}\n") config.write(f"quality: {quality_filter}\n") config.write(f"molecule_distance: {molecule_distance}\n") @@ -66,7 +71,7 @@ def bwa(input, genome, threads, extra_params, quality_filter, molecule_distance, generate_conda_deps() print_onstart( - f"Samples: {len(samplenames)}\nOutput Directory: Align/bwa/", + f"Samples: {len(samplenames)}\nOutput Directory: {output_dir}", "align bwa" ) _module = subprocess.run(command) @@ -85,9 +90,10 @@ def bwa(input, genome, threads, extra_params, quality_filter, molecule_distance, @click.option('-s', '--snakemake', type = str, metavar = "String", help = 'Additional Snakemake parameters, in quotes') @click.option('-q', '--quiet', is_flag = True, show_default = True, default = False, metavar = "Toggle", help = 'Don\'t show output text while running') @click.option('-r', '--skipreports', is_flag = True, show_default = True, default = False, metavar = "Toggle", help = 'Don\'t generate any HTML reports') -@click.option('--print-only', is_flag = True, show_default = True, default = False, metavar = "Toggle", help = 'Print the generated snakemake command and exit') +@click.option('-o', '--output-dir', type = str, default = "Align/ema", show_default=True, metavar = "String", help = 'Name of output directory') +@click.option('--print-only', is_flag = True, hidden = True, default = False, metavar = "Toggle", help = 'Print the generated snakemake command and exit') @click.argument('input', required=True, type=click.Path(exists=True), nargs=-1) -def ema(input, platform, whitelist, genome, threads, ema_bins, skipreports, extra_params, quality_filter, snakemake, quiet, print_only): +def ema(input, output_dir, platform, whitelist, genome, threads, ema_bins, skipreports, extra_params, quality_filter, snakemake, quiet, print_only): """ Align sequences to a genome using EMA @@ -98,8 +104,9 @@ def ema(input, platform, whitelist, genome, threads, ema_bins, skipreports, extr EMA may improve mapping, but it also marks split reads as secondary reads, making it less useful for variant calling with leviathan. 
""" - workflowdir = "Align/ema/workflow" - command = f'snakemake --rerun-incomplete --nolock --use-conda --cores {threads} --directory .'.split() + output_dir = output_dir.rstrip("/") + workflowdir = f"{output_dir}/workflow" + command = f'snakemake --rerun-incomplete --nolock --software-deployment-method conda --cores {threads} --directory .'.split() command.append('--snakefile') command.append(f'{workflowdir}/align-ema.smk') command.append("--configfile") @@ -116,7 +123,7 @@ def ema(input, platform, whitelist, genome, threads, ema_bins, skipreports, extr exit(0) platform = platform.lower() - # the tellseq stuff isn't impremented yet, but this is a placeholder for that, wishful thinking + # the tellseq stuff isn't impremented yet, but this is a placeholder for that... wishful thinking if platform in ["tellseq", "10x"] and not whitelist: print_error(f"{platform} technology requires the use of a barcode whitelist.") if platform == "10x": @@ -132,12 +139,14 @@ def ema(input, platform, whitelist, genome, threads, ema_bins, skipreports, extr sn = parse_fastq_inputs(input, f"{workflowdir}/input") samplenames = get_samples_from_fastq(f"{workflowdir}/input") fetch_file("align-ema.smk", f"{workflowdir}/") + fetch_file("bxStats.py", f"{workflowdir}/scripts/") for i in ["EmaCount", "EmaGencov", "BxStats"]: fetch_file(f"{i}.Rmd", f"{workflowdir}/report/") with open(f"{workflowdir}/config.yml", "w") as config: config.write(f"genomefile: {genome}\n") config.write(f"seq_directory: {workflowdir}/input\n") + config.write(f"output_directory: {output_dir}\n") config.write(f"samplenames: {samplenames}\n") config.write(f"quality: {quality_filter}\n") config.write(f"platform: {platform}\n") @@ -151,7 +160,7 @@ def ema(input, platform, whitelist, genome, threads, ema_bins, skipreports, extr generate_conda_deps() print_onstart( - f"Samples: {len(samplenames)}\nPlatform: {platform}\nOutput Directory: Align/ema/", + f"Samples: {len(samplenames)}\nPlatform: {platform}\nOutput Directory: {output_dir}/", "align ema" ) _module = subprocess.run(command) diff --git a/src/harpy/demultiplex.py b/src/harpy/demultiplex.py index 3585ab074..8d68e4a50 100644 --- a/src/harpy/demultiplex.py +++ b/src/harpy/demultiplex.py @@ -8,23 +8,25 @@ import re @click.command(no_args_is_help = True, epilog = "read the docs for more information: https://pdimens.github.io/harpy/modules/demultiplex/") -@click.option('-b', '--samplesheet', required = True, type=click.Path(exists=True, dir_okay=False), metavar = "File Path", help = 'Tab-delimited file of samplebarcode') +@click.option('-b', '--samplesheet', required = True, type=click.Path(exists=True, dir_okay=False), metavar = "File Path", help = 'Tab-delimited file of sample\barcode') @click.option('-t', '--threads', default = 4, show_default = True, type = click.IntRange(min = 1, max_open = True), metavar = "Integer", help = 'Number of threads to use') @click.option('-s', '--snakemake', type = str, metavar = "String", help = 'Additional Snakemake parameters, in quotes') @click.option('-r', '--skipreports', is_flag = True, show_default = True, default = False, metavar = "Toggle", help = 'Don\'t generate any HTML reports') @click.option('-q', '--quiet', is_flag = True, show_default = True, default = False, metavar = "Toggle", help = 'Don\'t show output text while running') -@click.option('--print-only', is_flag = True, show_default = True, default = False, metavar = "Toggle", help = 'Print the generated snakemake command and exit') -@click.argument('input', required=True, 
diff --git a/src/harpy/demultiplex.py b/src/harpy/demultiplex.py
index 3585ab074..8d68e4a50 100644
--- a/src/harpy/demultiplex.py
+++ b/src/harpy/demultiplex.py
@@ -8,23 +8,25 @@
 import re
 
 @click.command(no_args_is_help = True, epilog = "read the docs for more information: https://pdimens.github.io/harpy/modules/demultiplex/")
-@click.option('-b', '--samplesheet', required = True, type=click.Path(exists=True, dir_okay=False), metavar = "File Path", help = 'Tab-delimited file of sample<tab>barcode')
+@click.option('-b', '--samplesheet', required = True, type=click.Path(exists=True, dir_okay=False), metavar = "File Path", help = 'Tab-delimited file of sample\<tab\>barcode')
 @click.option('-t', '--threads', default = 4, show_default = True, type = click.IntRange(min = 1, max_open = True), metavar = "Integer", help = 'Number of threads to use')
 @click.option('-s', '--snakemake', type = str, metavar = "String", help = 'Additional Snakemake parameters, in quotes')
 @click.option('-r', '--skipreports', is_flag = True, show_default = True, default = False, metavar = "Toggle", help = 'Don\'t generate any HTML reports')
 @click.option('-q', '--quiet', is_flag = True, show_default = True, default = False, metavar = "Toggle", help = 'Don\'t show output text while running')
-@click.option('--print-only', is_flag = True, show_default = True, default = False, metavar = "Toggle", help = 'Print the generated snakemake command and exit')
-@click.argument('input', required=True, type=click.Path(exists=True, dir_okay=False))
-def gen1(input, samplesheet, threads, snakemake, skipreports, quiet, print_only):
+@click.option('-o', '--output-dir', type = str, default = "Demultiplex", show_default=True, metavar = "String", help = 'Name of output directory')
+@click.option('--print-only', is_flag = True, hidden = True, default = False, metavar = "Toggle", help = 'Print the generated snakemake command and exit')
+@click.argument('FASTQ_INPUT', required=True, type=click.Path(exists=True, dir_okay=False))
+def gen1(fastq_input, output_dir, samplesheet, threads, snakemake, skipreports, quiet, print_only):
     """
     Demultiplex Generation 1 haplotagged FASTQ files
 
     Use one of the four gzipped FASTQ files provided by the sequencer (I1, I2, R1, R2).fastq.gz
     for the `FASTQ_INPUT` argument; Harpy will infer the other three. Note: the `--samplesheet` must be tab (or space)
     delimited and have no header (i.e. no column names).
     """
-    inprefix = re.sub(r"[\_\.][IR][12]?(?:\_00[0-9])*\.f(?:ast)?q(?:\.gz)?$", "", os.path.basename(input))
-    workflowdir = "{workflowdir}"
-    command = f'snakemake --rerun-incomplete --nolock --use-conda --conda-prefix ./.snakemake/conda --cores {threads} --directory .'.split()
+    inprefix = re.sub(r"[\_\.][IR][12]?(?:\_00[0-9])*\.f(?:ast)?q(?:\.gz)?$", "", os.path.basename(fastq_input))
+    output_dir = output_dir.rstrip("/") + f"/{inprefix}"
+    workflowdir = f"{output_dir}/workflow"
+    command = f'snakemake --rerun-incomplete --nolock --software-deployment-method conda --conda-prefix ./.snakemake/conda --cores {threads} --directory .'.split()
     command.append('--snakefile')
     command.append(f'{workflowdir}/demultiplex.gen1.smk')
     command.append("--configfile")
@@ -41,12 +43,13 @@ def gen1(input, samplesheet, threads, snakemake, skipreports, quiet, print_only)
         click.echo(call_SM)
         exit()
 
-    check_demux_fastq(input)
+    check_demux_fastq(fastq_input)
     validate_demuxschema(samplesheet)
     fetch_file("demultiplex.gen1.smk", f"{workflowdir}/")
 
     with open(f"{workflowdir}/config.yml", "w") as config:
-        config.write(f"infile: {input}\n")
+        config.write(f"infile: {fastq_input}\n")
+        config.write(f"output_directory: {output_dir}\n")
         config.write(f"infile_prefix: {inprefix}\n")
         config.write(f"samplefile: {samplesheet}\n")
         config.write(f"skipreports: {skipreports}\n")
@@ -54,7 +57,7 @@ def gen1(input, samplesheet, threads, snakemake, skipreports, quiet, print_only)
 
     generate_conda_deps()
     print_onstart(
-        f"Input Prefix: {inprefix}\nDemultiplex Schema: {samplesheet}\nOutput Directory: Demultiplex/{inprefix}",
+        f"Input Prefix: {inprefix}\nDemultiplex Schema: {samplesheet}\nOutput Directory: {output_dir}",
         "demultiplex gen1"
     )
     _module = subprocess.run(command)
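A quick way to sanity-check the prefix inference above: the regex strips one read/index designation (I1, I2, R1, R2), an optional lane suffix, and the fastq extension. The file names below are made-up examples:

```python
# Demonstration of the inprefix regex from demultiplex.py; filenames are examples.
import re

pattern = r"[\_\.][IR][12]?(?:\_00[0-9])*\.f(?:ast)?q(?:\.gz)?$"
for name in ("Plate1_R1_001.fastq.gz", "Plate1_I2.fq.gz", "Plate1.R2.fastq"):
    print(re.sub(pattern, "", name))  # -> Plate1, Plate1, Plate1
```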
diff --git a/src/harpy/helperfunctions.py b/src/harpy/helperfunctions.py
index 0b3327ce2..63a98b1ba 100644
--- a/src/harpy/helperfunctions.py
+++ b/src/harpy/helperfunctions.py
@@ -73,7 +73,7 @@ def generate_conda_deps():
     """Create the YAML files of the workflow conda dependencies"""
     condachannels = ["conda-forge", "bioconda", "defaults"]
     environ = {
-        "qc" : ["falco", "fastp", "multiqc"],
+        "qc" : ["falco", "fastp", "multiqc", "pysam=0.22"],
         "align": ["bwa", "ema","icu","libzlib", "samtools=1.19", "seqtk", "xz"],
         "variants.snp": ["bcftools=1.19", "freebayes=1.3.6"],
         "variants.sv": ["leviathan", "naibr-plus"],
@@ -81,12 +81,12 @@ def generate_conda_deps():
         "r-env" : ["bioconductor-complexheatmap", "r-circlize", "r-dt", "r-flexdashboard", "r-ggplot2", "r-ggridges", "r-plotly", "r-tidyr", "r-stitch"]
     }
-    os.makedirs("harpyenvs", exist_ok = True)
+    os.makedirs(".harpy_envs", exist_ok = True)
     for i in environ:
         # don't overwrite existing
-        if not os.path.isfile(f"harpyenvs/{i}.yaml"):
-            with open(f"harpyenvs/{i}.yaml", "w") as yml:
+        if not os.path.isfile(f".harpy_envs/{i}.yaml"):
+            with open(f".harpy_envs/{i}.yaml", "w") as yml:
                 yml.write(f"name: {i}\n")
                 yml.write("channels:\n  - ")
                 yml.write("\n  - ".join(condachannels))
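For orientation, here is a sketch of what one file in the renamed `.harpy_envs/` directory would contain, assuming the writes truncated above finish with a standard conda `dependencies:` section (only the `name:` and `channels:` writes are visible in the hunk):

```python
# Sketch: reproduce the "qc" environment file the loop above would emit.
# The dependencies section is an assumption based on conda env YAML conventions.
import os

condachannels = ["conda-forge", "bioconda", "defaults"]
deps = ["falco", "fastp", "multiqc", "pysam=0.22"]  # note the new pysam pin from this diff

os.makedirs(".harpy_envs", exist_ok=True)
with open(".harpy_envs/qc.yaml", "w") as yml:
    yml.write("name: qc\n")
    yml.write("channels:\n  - " + "\n  - ".join(condachannels) + "\n")
    yml.write("dependencies:\n  - " + "\n  - ".join(deps) + "\n")
```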
diff --git a/src/harpy/impute.py b/src/harpy/impute.py
index 301aa02d1..212fb3f22 100644
--- a/src/harpy/impute.py
+++ b/src/harpy/impute.py
@@ -16,9 +16,10 @@
 @click.option('-s', '--snakemake', type = str, metavar = "String", help = 'Additional Snakemake parameters, in quotes')
 @click.option('-r', '--skipreports', is_flag = True, show_default = True, default = False, metavar = "Toggle", help = 'Don\'t generate any HTML reports')
 @click.option('-q', '--quiet', is_flag = True, show_default = True, default = False, metavar = "Toggle", help = 'Don\'t show output text while running')
-@click.option('--print-only', is_flag = True, show_default = True, default = False, metavar = "Toggle", help = 'Print the generated snakemake command and exit')
+@click.option('-o', '--output-dir', type = str, default = "Impute", show_default=True, metavar = "String", help = 'Name of output directory')
+@click.option('--print-only', is_flag = True, hidden = True, default = False, metavar = "Toggle", help = 'Print the generated snakemake command and exit')
 @click.argument('input', required=True, type=click.Path(exists=True), nargs=-1)
-def impute(input, parameters, threads, vcf, vcf_samples, extra_params, snakemake, skipreports, quiet, print_only):
+def impute(input, output_dir, parameters, threads, vcf, vcf_samples, extra_params, snakemake, skipreports, quiet, print_only):
     """
     Impute genotypes using variants and sequences
 
@@ -35,8 +36,9 @@ def impute(input, parameters, threads, vcf, vcf_samples, extra_params, snakemake
     harpy ... -x 'switchModelIteration = 39, splitReadIterations = NA, reference_populations = c("CEU","GBR")'
     ```
     """
-    workflowdir = "Impute/workflow"
-    command = f'snakemake --rerun-incomplete --nolock --use-conda --conda-prefix ./.snakemake/conda --cores {threads} --directory .'.split()
+    output_dir = output_dir.rstrip("/")
+    workflowdir = f"{output_dir}/workflow"
+    command = f'snakemake --rerun-incomplete --nolock --software-deployment-method conda --conda-prefix ./.snakemake/conda --cores {threads} --directory .'.split()
     command.append('--snakefile')
     command.append(f'{workflowdir}/impute.smk')
     command.append("--configfile")
@@ -68,6 +70,7 @@ def impute(input, parameters, threads, vcf, vcf_samples, extra_params, snakemake
     contigs = biallelic_contigs(vcf)
 
     with open(f"{workflowdir}/config.yml", "w") as config:
         config.write(f"seq_directory: {workflowdir}/input\n")
+        config.write(f"output_directory: {output_dir}\n")
         config.write(f"samplenames: {samplenames}\n")
         config.write(f"variantfile: {vcf}\n")
         config.write(f"paramfile: {parameters}\n")
@@ -79,7 +82,7 @@ def impute(input, parameters, threads, vcf, vcf_samples, extra_params, snakemake
     generate_conda_deps()
     print_onstart(
-        f"Input VCF: {vcf}\nSamples in VCF: {len(samplenames)}\nAlignments Provided: {len(sn)}\nContigs Considered: {len(contigs)}\nOutput Directory: Impute/",
+        f"Input VCF: {vcf}\nSamples in VCF: {len(samplenames)}\nAlignments Provided: {len(sn)}\nContigs Considered: {len(contigs)}\nOutput Directory: {output_dir}/",
         "impute"
     )
     _module = subprocess.run(command)
""" - workflowdir = "Phase/workflow" - command = f'snakemake --rerun-incomplete --nolock --use-conda --conda-prefix ./.snakemake/conda --cores {threads} --directory .'.split() + output_dir = output_dir.rstrip("/") + workflowdir = f"{output_dir}/workflow" + command = f'snakemake --rerun-incomplete --nolock --software-deployment-method conda --conda-prefix ./.snakemake/conda --cores {threads} --directory .'.split() command.append('--snakefile') command.append(f"{workflowdir}/phase-pop.smk") command.append("--configfile") @@ -60,6 +62,7 @@ def phase(input, vcf, threads, molecule_distance, prune_threshold, vcf_samples, with open(f"{workflowdir}/config.yml", "w") as config: config.write(f"seq_directory: {workflowdir}/input\n") + config.write(f"output_directory: {output_dir}\n") config.write(f"samplenames: {samplenames}\n") config.write(f"variantfile: {vcf}\n") config.write(f"noBX: {ignore_bx}\n") @@ -76,7 +79,7 @@ def phase(input, vcf, threads, molecule_distance, prune_threshold, vcf_samples, generate_conda_deps() print_onstart( - f"Input VCF: {vcf}\nSamples in VCF: {len(samplenames)}\nAlignments Provided: {len(sn)}\nOutput Directory: Phase/", + f"Input VCF: {vcf}\nSamples in VCF: {len(samplenames)}\nAlignments Provided: {len(sn)}\nOutput Directory: {output_dir}/", "phase" ) _module = subprocess.run(command) diff --git a/src/harpy/preflight.py b/src/harpy/preflight.py index db6463416..c34314705 100755 --- a/src/harpy/preflight.py +++ b/src/harpy/preflight.py @@ -9,13 +9,13 @@ import glob @click.command(no_args_is_help = True, epilog = "read the docs for more information: https://pdimens.github.io/harpy/modules/preflight/") -#@click.option('-o', '--output', default = "current directory", show_default=True, type=click.Path(exists=False, writable=True, file_okay=False), metavar = "Folder Path", help = 'Output directory') @click.option('-t', '--threads', default = 4, show_default = True, type = click.IntRange(min = 1, max_open = True), metavar = "Integer", help = 'Number of threads to use') @click.option('-s', '--snakemake', type = str, metavar = "String", help = 'Additional Snakemake parameters, in quotes') @click.option('-q', '--quiet', is_flag = True, show_default = True, default = False, metavar = "Toggle", help = 'Don\'t show output text while running') -@click.option('--print-only', is_flag = True, show_default = True, default = False, metavar = "Toggle", help = 'Print the generated snakemake command and exit') +@click.option('-o', '--output-dir', type = str, default = "Preflight/fastq", show_default=True, metavar = "String", help = 'Name of output directory') +@click.option('--print-only', is_flag = True, hidden = True, default = False, metavar = "Toggle", help = 'Print the generated snakemake command and exit') @click.argument('input', required=True, type=click.Path(exists=True), nargs=-1) -def fastq(input, threads, snakemake, quiet, print_only): +def fastq(input, output_dir, threads, snakemake, quiet, print_only): """ Run validity checks on haplotagged FASTQ files. @@ -28,11 +28,13 @@ def fastq(input, threads, snakemake, quiet, print_only): fix your data, but it will report the number of reads that feature errors to help you diagnose if file formatting will cause downstream issues. 
""" - command = f'snakemake --rerun-incomplete --nolock --use-conda --conda-prefix ./.snakemake/conda --cores {threads} --directory .'.split() + output_dir = output_dir.rstrip("/") + workflowdir = f"{output_dir}/workflow" + command = f'snakemake --rerun-incomplete --nolock --software-deployment-method conda --conda-prefix ./.snakemake/conda --cores {threads} --directory .'.split() command.append('--snakefile') - command.append(f'Preflight/fastq/workflow/preflight-fastq.smk') + command.append(f'{workflowdir}/preflight-fastq.smk') command.append('--configfile') - command.append(f"Preflight/fastq/workflow/config.yml") + command.append(f"{workflowdir}/config.yml") if quiet: command.append("--quiet") command.append("all") @@ -43,18 +45,20 @@ def fastq(input, threads, snakemake, quiet, print_only): click.echo(call_SM) exit() - os.makedirs("{workflowdir}/", exist_ok= True) - sn = parse_fastq_inputs(input, "Preflight/fastq/workflow/input") - fetch_file("preflight-fastq.smk", f"Preflight/fastq/workflow/") - fetch_file("PreflightFastq.Rmd", f"Preflight/fastq/workflow/report/") + os.makedirs(f"{workflowdir}/", exist_ok= True) + sn = parse_fastq_inputs(input, f"{workflowdir}/input") + fetch_file("preflight-fastq.smk", f"{workflowdir}") + fetch_file("PreflightFastq.Rmd", f"{workflowdir}/report/") + fetch_file("checkFASTQ.py", f"{workflowdir}/scripts/") - with open(f"Preflight/fastq/workflow/config.yml", "w") as config: - config.write(f"seq_directory: Preflight/fastq/workflow/input\n") + with open(f"{workflowdir}/config.yml", "w") as config: + config.write(f"seq_directory: {workflowdir}/input\n") + config.write(f"output_directory: {output_dir}\n") config.write(f"workflow_call: {call_SM}\n") generate_conda_deps() print_onstart( - f"Files: {len(sn)}\nOutput Directory: Preflight/fastq", + f"Files: {len(sn)}\nOutput Directory: {output_dir}/", "preflight fastq" ) _module = subprocess.run(command) @@ -64,9 +68,10 @@ def fastq(input, threads, snakemake, quiet, print_only): @click.option('-t', '--threads', default = 4, show_default = True, type = click.IntRange(min = 1, max_open = True), metavar = "Integer", help = 'Number of threads to use') @click.option('-s', '--snakemake', type = str, metavar = "String", help = 'Additional Snakemake parameters, in quotes') @click.option('-q', '--quiet', is_flag = True, show_default = True, default = False, metavar = "Toggle", help = 'Don\'t show output text while running') -@click.option('--print-only', is_flag = True, show_default = True, default = False, metavar = "Toggle", help = 'Print the generated snakemake command and exit') +@click.option('-o', '--output-dir', type = str, default = "Preflight/bam", show_default=True, metavar = "String", help = 'Name of output directory') +@click.option('--print-only', is_flag = True, hidden = True, default = False, metavar = "Toggle", help = 'Print the generated snakemake command and exit') @click.argument('input', required=True, type=click.Path(exists=True), nargs=-1) -def bam(input, threads, snakemake, quiet, print_only): +def bam(input, output_dir, threads, snakemake, quiet, print_only): """ Run validity checks on haplotagged BAM files @@ -78,8 +83,9 @@ def bam(input, threads, snakemake, quiet, print_only): This **will not** fix your data, but it will report the number of records that feature errors to help you diagnose if file formatting will cause downstream issues. 
""" - workflowdir = "Preflight/bam/workflow" - command = f'snakemake --rerun-incomplete --nolock --use-conda --conda-prefix ./.snakemake/conda --cores {threads} --directory .'.split() + output_dir = output_dir.rstrip("/") + workflowdir = f"{output_dir}/workflow" + command = f'snakemake --rerun-incomplete --nolock --software-deployment-method conda --conda-prefix ./.snakemake/conda --cores {threads} --directory .'.split() command.append('--snakefile') command.append(f'{workflowdir}/preflight-bam.smk') command.append('--configfile') @@ -98,14 +104,16 @@ def bam(input, threads, snakemake, quiet, print_only): sn = parse_alignment_inputs(input, f"{workflowdir}/input") fetch_file("preflight-bam.smk", f"{workflowdir}/") fetch_file("PreflightBam.Rmd", f"{workflowdir}/report/") + fetch_file("checkBAM.py", f"{workflowdir}/scripts/") with open(f"{workflowdir}/config.yml", "w") as config: config.write(f"seq_directory: {workflowdir}/input\n") + config.write(f"output_directory: {output_dir}\n") config.write(f"workflow_call: {call_SM}\n") generate_conda_deps() print_onstart( - f"Samples: {len(sn)}\nOutput Directory: Preflight/bam", + f"Samples: {len(sn)}\nOutput Directory: {output_dir}/", "preflight bam" ) _module = subprocess.run(command) diff --git a/src/harpy/printfunctions.py b/src/harpy/printfunctions.py index 154a65add..ec69d01f3 100644 --- a/src/harpy/printfunctions.py +++ b/src/harpy/printfunctions.py @@ -6,20 +6,20 @@ def print_onstart(text, title): """Print a panel of info on workflow run""" print("") - print(Panel(text, title = f"[bold]Harpy {title}", title_align = "left", border_style = "light_steel_blue", subtitle= "Running Workflow"), file = sys.stderr) + print(Panel(text, title = f"[bold]Harpy {title}", title_align = "left", border_style = "light_steel_blue", subtitle = "Running Workflow", width = 75), file = sys.stderr) def print_error(errortext): """Print a yellow panel with error text""" - print(Panel(errortext, title = "[bold]Error", title_align = "left", border_style = "yellow"), file = sys.stderr) + print(Panel(errortext, title = "[bold]Error", title_align = "left", border_style = "yellow", width = 75), file = sys.stderr) def print_solution(solutiontext): """Print a blue panel with solution text""" - print(Panel(solutiontext, title = "[bold]Solution", title_align = "left", border_style = "blue"), file = sys.stderr) + print(Panel(solutiontext, title = "[bold]Solution", title_align = "left", border_style = "blue", width = 75), file = sys.stderr) def print_solution_with_culprits(solutiontext, culprittext): """Print a blue panel with solution text and the list of offenders below it""" - print(Panel(solutiontext, title = "[bold]Solution", title_align = "left", subtitle = culprittext, border_style = "blue"), file = sys.stderr) + print(Panel(solutiontext, title = "[bold]Solution", title_align = "left", subtitle = culprittext, border_style = "blue", width = 75), file = sys.stderr) def print_notice(noticetext): """Print a white panel with information text text""" - print(Panel(noticetext, title = "Notice", title_align = "left", border_style = "dim"), file = sys.stderr) \ No newline at end of file + print(Panel(noticetext, title = "Notice", title_align = "left", border_style = "dim", width = 75), file = sys.stderr) \ No newline at end of file diff --git a/src/harpy/qc.py b/src/harpy/qc.py index e2fba707e..1b8ff3978 100644 --- a/src/harpy/qc.py +++ b/src/harpy/qc.py @@ -16,9 +16,10 @@ @click.option('-s', '--snakemake', type = str, metavar = "String", help = 'Additional Snakemake parameters, in 
diff --git a/src/harpy/qc.py b/src/harpy/qc.py
index e2fba707e..1b8ff3978 100644
--- a/src/harpy/qc.py
+++ b/src/harpy/qc.py
@@ -16,9 +16,10 @@
 @click.option('-s', '--snakemake', type = str, metavar = "String", help = 'Additional Snakemake parameters, in quotes')
 @click.option('-r', '--skipreports', is_flag = True, show_default = True, default = False, metavar = "Toggle", help = 'Don\'t generate any HTML reports')
 @click.option('-q', '--quiet', is_flag = True, show_default = True, default = False, metavar = "Toggle", help = 'Don\'t show output text while running')
-@click.option('--print-only', is_flag = True, show_default = True, default = False, metavar = "Toggle", help = 'Print the generated snakemake command and exit')
+@click.option('--print-only', is_flag = True, hidden = True, show_default = True, default = False, metavar = "Toggle", help = 'Print the generated snakemake command and exit')
+@click.option('-o', '--output-dir', type = str, default = "QC", show_default=True, metavar = "String", help = 'Name of output directory')
 @click.argument('input', required=True, type=click.Path(exists=True), nargs=-1)
-def qc(input, max_length, ignore_adapters, extra_params, threads, snakemake, skipreports, quiet, print_only):
+def qc(input, output_dir, max_length, ignore_adapters, extra_params, threads, snakemake, skipreports, quiet, print_only):
     """
     Remove adapters and quality trim sequences
 
@@ -31,8 +32,9 @@ def qc(input, max_length, ignore_adapters, extra_params, threads, snakemake, ski
     - minimum 15bp length filter
     - poly-G tail removal
     """
-    workflowdir = "QC/workflow"
-    command = f'snakemake --rerun-incomplete --nolock --use-conda --conda-prefix ./.snakemake/conda --cores {threads} --directory .'.split()
+    output_dir = output_dir.rstrip("/")
+    workflowdir = f"{output_dir}/workflow"
+    command = f'snakemake --rerun-incomplete --nolock --software-deployment-method conda --conda-prefix ./.snakemake/conda --cores {threads} --directory .'.split()
     command.append('--snakefile')
     command.append(f'{workflowdir}/qc.smk')
     command.append('--configfile')
@@ -52,8 +54,10 @@ def qc(input, max_length, ignore_adapters, extra_params, threads, snakemake, ski
     sn = get_samples_from_fastq(f"{workflowdir}/input")
     fetch_file("qc.smk", f"{workflowdir}/")
     fetch_file("BxCount.Rmd", f"{workflowdir}/report/")
+    fetch_file("countBX.py", f"{workflowdir}/scripts/")
 
     with open(f"{workflowdir}/config.yml", "w") as config:
         config.write(f"seq_directory: {workflowdir}/input\n")
+        config.write(f"output_directory: {output_dir}\n")
         config.write(f"adapters: {ignore_adapters}\n")
         config.write(f"maxlen: {max_length}\n")
         if extra_params is not None:
@@ -63,7 +67,7 @@ def qc(input, max_length, ignore_adapters, extra_params, threads, snakemake, ski
 
     generate_conda_deps()
     print_onstart(
-        f"Samples: {len(sn)}\nOutput Directory: QC/",
+        f"Samples: {len(sn)}\nOutput Directory: {output_dir}/",
         "qc"
     )
     _module = subprocess.run(command)
diff --git a/src/harpy/snp.py b/src/harpy/snp.py
index 0f1a2d4e0..a67859c43 100644
--- a/src/harpy/snp.py
+++ b/src/harpy/snp.py
@@ -9,7 +9,7 @@
 
 @click.command(no_args_is_help = True, epilog = "read the docs for more information: https://pdimens.github.io/harpy/modules/snp")
 @click.option('-g', '--genome', type=click.Path(exists=True), required = True, metavar = "File Path", help = 'Genome assembly for variant calling')
-@click.option('-p', '--populations', type=click.Path(exists = True), metavar = "File Path", help = 'Tab-delimited file of sample<tab>population (optional)')
+@click.option('-p', '--populations', type=click.Path(exists = True), metavar = "File Path", help = "Tab-delimited file of sample\<tab\>population")
 @click.option('-x', '--ploidy', default = 2, show_default = True, type=int, metavar = "Integer", help = 'Ploidy of samples')
 @click.option('-w', '--windowsize', default = 50000, show_default = True, type = int, metavar = "Integer", help = "Interval size for parallel variant calling")
 @click.option('-x', '--extra-params', type = str, metavar = "String", help = 'Additional variant caller parameters, in quotes')
@@ -17,9 +17,10 @@
 @click.option('-s', '--snakemake', type = str, metavar = "String", help = 'Additional Snakemake parameters, in quotes')
 @click.option('-r', '--skipreports', is_flag = True, show_default = True, default = False, metavar = "Toggle", help = 'Don\'t generate any HTML reports')
 @click.option('-q', '--quiet', is_flag = True, show_default = True, default = False, metavar = "Toggle", help = 'Don\'t show output text while running')
-@click.option('--print-only', is_flag = True, show_default = True, default = False, metavar = "Toggle", help = 'Print the generated snakemake command and exit')
+@click.option('-o', '--output-dir', type = str, default = "SNP/mpileup", show_default=True, metavar = "String", help = 'Name of output directory')
+@click.option('--print-only', is_flag = True, hidden = True, default = False, metavar = "Toggle", help = 'Print the generated snakemake command and exit')
 @click.argument('input', required=True, type=click.Path(exists=True), nargs=-1)
-def mpileup(input, genome, threads, populations, ploidy, windowsize, extra_params, snakemake, skipreports, quiet, print_only):
+def mpileup(input, output_dir, genome, threads, populations, ploidy, windowsize, extra_params, snakemake, skipreports, quiet, print_only):
     """
     Call variants using bcftools mpileup
 
@@ -31,8 +32,9 @@ def mpileup(input, genome, threads, populations, ploidy, windowsize, extra_param
     Use **harpy popgroup** to create a sample grouping file to use as input for `--populations`.
     """
-    workflowdir = "Variants/mpileup/workflow"
-    command = (f'snakemake --rerun-incomplete --nolock --use-conda --conda-prefix ./.snakemake/conda --cores {threads} --directory .').split()
+    output_dir = output_dir.rstrip("/")
+    workflowdir = f"{output_dir}/workflow"
+    command = (f'snakemake --rerun-incomplete --nolock --software-deployment-method conda --conda-prefix ./.snakemake/conda --cores {threads} --directory .').split()
     command.append('--snakefile')
     command.append(f'{workflowdir}/snp-mpileup.smk')
     command.append('--configfile')
@@ -57,6 +59,7 @@ def mpileup(input, genome, threads, populations, ploidy, windowsize, extra_param
 
     with open(f"{workflowdir}/config.yml", "w") as config:
         config.write(f"seq_directory: {workflowdir}/input\n")
+        config.write(f"output_directory: {output_dir}\n")
         config.write(f"samplenames: {samplenames}\n")
         popgroupings = ""
         if populations is not None:
@@ -76,7 +79,7 @@ def mpileup(input, genome, threads, populations, ploidy, windowsize, extra_param
 
     generate_conda_deps()
     print_onstart(
-        f"Samples: {len(samplenames)}{popgroupings}\nOutput Directory: Variants/mpileup/",
+        f"Samples: {len(samplenames)}{popgroupings}\nOutput Directory: {output_dir}/",
         "snp mpileup"
     )
     _module = subprocess.run(command)
@@ -84,7 +87,7 @@ def mpileup(input, genome, threads, populations, ploidy, windowsize, extra_param
 
 @click.command(no_args_is_help = True, epilog = "read the docs for more information: https://pdimens.github.io/harpy/modules/snp")
 @click.option('-g', '--genome', type=click.Path(exists=True, dir_okay=False), required = True, metavar = "File Path", help = 'Genome assembly for variant calling')
-@click.option('-p', '--populations', type=click.Path(exists = True, dir_okay=False), metavar = "File Path", help = 'Tab-delimited file of sample<tab>population (optional)')
+@click.option('-p', '--populations', type=click.Path(exists = True, dir_okay=False), metavar = "File Path", help = "Tab-delimited file of sample\<tab\>population")
 @click.option('-x', '--ploidy', default = 2, show_default = True, type=int, metavar = "Integer", help = 'Ploidy of samples')
 @click.option('-w', '--windowsize', default = 50000, show_default = True, type = int, metavar = "Integer", help = "Interval size for parallel variant calling")
 @click.option('-x', '--extra-params', type = str, metavar = "String", help = 'Additional variant caller parameters, in quotes')
@@ -92,9 +95,10 @@ def mpileup(input, genome, threads, populations, ploidy, windowsize, extra_param
 @click.option('-r', '--skipreports', is_flag = True, show_default = True, default = False, metavar = "Toggle", help = 'Don\'t generate any HTML reports')
 @click.option('-s', '--snakemake', type = str, metavar = "String", help = 'Additional Snakemake parameters, in quotes')
 @click.option('-q', '--quiet', is_flag = True, show_default = True, default = False, metavar = "Toggle", help = 'Don\'t show output text while running')
-@click.option('--print-only', is_flag = True, show_default = True, default = False, metavar = "Toggle", help = 'Print the generated snakemake command and exit')
+@click.option('-o', '--output-dir', type = str, default = "SNP/freebayes", show_default=True, metavar = "String", help = 'Name of output directory')
+@click.option('--print-only', is_flag = True, hidden = True, default = False, metavar = "Toggle", help = 'Print the generated snakemake command and exit')
 @click.argument('input', required=True, type=click.Path(exists=True), nargs=-1)
-def freebayes(input, genome, threads, populations, ploidy, windowsize, extra_params, snakemake, skipreports, quiet, print_only):
+def freebayes(input, output_dir, genome, threads, populations, ploidy, windowsize, extra_params, snakemake, skipreports, quiet, print_only):
     """
     Call variants using freebayes
 
@@ -106,8 +110,9 @@ def freebayes(input, genome, threads, populations, ploidy, windowsize, extra_par
     Use **harpy popgroup** to create a sample grouping file to use as input for `--populations`.
     """
-    workflowdir = "Variants/freebayes/workflow"
-    command = (f'snakemake --rerun-incomplete --nolock --use-conda --conda-prefix ./.snakemake/conda --cores {threads} --directory .').split()
+    output_dir = output_dir.rstrip("/")
+    workflowdir = f"{output_dir}/workflow"
+    command = (f'snakemake --rerun-incomplete --nolock --software-deployment-method conda --conda-prefix ./.snakemake/conda --cores {threads} --directory .').split()
     command.append('--snakefile')
     command.append(f'{workflowdir}/snp-freebayes.smk')
     command.append('--configfile')
@@ -132,6 +137,7 @@ def freebayes(input, genome, threads, populations, ploidy, windowsize, extra_par
 
     with open(f"{workflowdir}/config.yml", "w") as config:
         config.write(f"seq_directory: {workflowdir}/input\n")
+        config.write(f"output_directory: {output_dir}\n")
         config.write(f"samplenames: {samplenames}\n")
         popgroupings = ""
         if populations is not None:
@@ -151,7 +157,7 @@ def freebayes(input, genome, threads, populations, ploidy, windowsize, extra_par
 
     generate_conda_deps()
     print_onstart(
-        f"Samples: {len(samplenames)}{popgroupings}\nOutput Directory: Variants/freebayes",
+        f"Samples: {len(samplenames)}{popgroupings}\nOutput Directory: {output_dir}/",
         "snp freebayes"
     )
     _module = subprocess.run(command)
""" - workflowdir = "Variants/freebayes/workflow" - command = (f'snakemake --rerun-incomplete --nolock --use-conda --conda-prefix ./.snakemake/conda --cores {threads} --directory .').split() + output_dir = output_dir.rstrip("/") + workflowdir = f"{output_dir}/workflow" + command = (f'snakemake --rerun-incomplete --nolock --software-deployment-method conda --conda-prefix ./.snakemake/conda --cores {threads} --directory .').split() command.append('--snakefile') command.append(f'{workflowdir}/snp-freebayes.smk') command.append('--configfile') @@ -132,6 +137,7 @@ def freebayes(input, genome, threads, populations, ploidy, windowsize, extra_par with open(f"{workflowdir}/config.yml", "w") as config: config.write(f"seq_directory: {workflowdir}/input\n") + config.write(f"output_directory: {output_dir}\n") config.write(f"samplenames: {samplenames}\n") popgroupings = "" if populations is not None: @@ -151,7 +157,7 @@ def freebayes(input, genome, threads, populations, ploidy, windowsize, extra_par generate_conda_deps() print_onstart( - f"Samples: {len(samplenames)}{popgroupings}\nOutput Directory: Variants/freebayes", + f"Samples: {len(samplenames)}{popgroupings}\nOutput Directory: {output_dir}/", "snp freebayes" ) _module = subprocess.run(command) diff --git a/src/harpy/sv.py b/src/harpy/sv.py index 807c8f453..487de8e33 100644 --- a/src/harpy/sv.py +++ b/src/harpy/sv.py @@ -9,15 +9,16 @@ @click.command(no_args_is_help = True, epilog= "read the docs for more information: https://pdimens.github.io/harpy/modules/sv/leviathan/") @click.option('-g', '--genome', type=click.Path(exists=True, dir_okay=False), required = True, metavar = "File Path", help = 'Genome assembly for variant calling') -@click.option('-p', '--populations', type=click.Path(exists = True, dir_okay=False), metavar = "File Path", help = 'Tab-delimited file of samplepopulation (optional)') +@click.option('-p', '--populations', type=click.Path(exists = True, dir_okay=False), metavar = "File Path", help = "Tab-delimited file of sample\population") @click.option('-x', '--extra-params', type = str, metavar = "String", help = 'Additional variant caller parameters, in quotes') @click.option('-t', '--threads', default = 4, show_default = True, type = click.IntRange(min = 4, max_open = True), metavar = "Integer", help = 'Number of threads to use') @click.option('-s', '--snakemake', type = str, metavar = "String", help = 'Additional Snakemake parameters, in quotes') @click.option('-r', '--skipreports', is_flag = True, show_default = True, default = False, metavar = "Toggle", help = 'Don\'t generate any HTML reports') @click.option('-q', '--quiet', is_flag = True, show_default = True, default = False, metavar = "Toggle", help = 'Don\'t show output text while running') -@click.option('--print-only', is_flag = True, show_default = True, default = False, metavar = "Toggle", help = 'Print the generated snakemake command and exit') +@click.option('-o', '--output-dir', type = str, default = "SV/leviathan", show_default=True, metavar = "String", help = 'Name of output directory') +@click.option('--print-only', is_flag = True, hidden = True, default = False, metavar = "Toggle", help = 'Print the generated snakemake command and exit') @click.argument('input', required=True, type=click.Path(exists=True), nargs=-1) -def leviathan(input, genome, threads, populations, extra_params, snakemake, skipreports, quiet, print_only): +def leviathan(input, output_dir, genome, threads, populations, extra_params, snakemake, skipreports, quiet, print_only): """ Call 
structural variants using LEVIATHAN @@ -28,10 +29,10 @@ def leviathan(input, genome, threads, populations, extra_params, snakemake, skip Use **harpy popgroup** to create a sample grouping file to use as input for `--populations`. """ + output_dir = output_dir.rstrip("/") + workflowdir = f"{output_dir}/workflow" vcaller = "leviathan" if populations is None else "leviathan-pop" - workflowdir = f"Variants/{vcaller}/workflow" - - command = f'snakemake --rerun-incomplete --nolock --use-conda --conda-prefix ./.snakemake/conda --cores {threads} --directory .'.split() + command = f'snakemake --rerun-incomplete --nolock --software-deployment-method conda --conda-prefix ./.snakemake/conda --cores {threads} --directory .'.split() command.append('--snakefile') command.append(f'{workflowdir}/sv-{vcaller}.smk') command.append('--configfile') @@ -56,6 +57,7 @@ def leviathan(input, genome, threads, populations, extra_params, snakemake, skip with open(f'{workflowdir}/config.yml', "w") as config: config.write(f"seq_directory: {workflowdir}/input\n") + config.write(f"output_directory: {output_dir}\n") config.write(f"samplenames: {samplenames}\n") popgroupings = "" if populations is not None: @@ -73,7 +75,7 @@ def leviathan(input, genome, threads, populations, extra_params, snakemake, skip generate_conda_deps() print_onstart( - f"Samples: {len(samplenames)}{popgroupings}\nOutput Directory: Variants/{vcaller}/", + f"Samples: {len(samplenames)}{popgroupings}\nOutput Directory: {output_dir}/", "sv leviathan" ) _module = subprocess.run(command) @@ -82,16 +84,17 @@ def leviathan(input, genome, threads, populations, extra_params, snakemake, skip @click.command(no_args_is_help = True, epilog = "read the docs for more information: https://pdimens.github.io/harpy/modules/sv/naibr/") @click.option('-g', '--genome', required = True, type=click.Path(exists=True, dir_okay=False), metavar = "File Path", help = 'Genome assembly') @click.option('-v', '--vcf', type=click.Path(exists=True, dir_okay=False), metavar = "File Path", help = 'Path to phased bcf/vcf file') -@click.option('-p', '--populations', type=click.Path(exists = True, dir_okay=False), metavar = "File Path", help = 'Tab-delimited file of samplepopulation (optional)') +@click.option('-p', '--populations', type=click.Path(exists = True, dir_okay=False), metavar = "File Path", help = "Tab-delimited file of sample\population") @click.option('-m', '--molecule-distance', default = 100000, show_default = True, type = int, metavar = "Integer", help = 'Base-pair distance delineating separate molecules') @click.option('-x', '--extra-params', type = str, metavar = "String", help = 'Additional variant caller parameters, in quotes') @click.option('-t', '--threads', default = 4, show_default = True, type = click.IntRange(min = 4, max_open = True), metavar = "Integer", help = 'Number of threads to use') @click.option('-s', '--snakemake', type = str, metavar = "String", help = 'Additional Snakemake parameters, in quotes') @click.option('-r', '--skipreports', is_flag = True, show_default = True, default = False, metavar = "Toggle", help = 'Don\'t generate any HTML reports') @click.option('-q', '--quiet', is_flag = True, show_default = True, default = False, metavar = "Toggle", help = 'Don\'t show output text while running') -@click.option('--print-only', is_flag = True, show_default = True, default = False, metavar = "Toggle", help = 'Print the generated snakemake command and exit') +@click.option('-o', '--output-dir', type = str, default = "SV/naibr", show_default=True, metavar 
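The naibr command still chooses among four snakefiles; only the output directory logic changed. A small sketch mirroring the two conditionals above:

```python
# Mirrors the vcaller selection in sv.py's naibr() above.
def naibr_snakefile(populations, vcf):
    vcaller = "naibr" if populations is None else "naibr-pop"
    vcaller += "-phase" if vcf is not None else ""
    return f"sv-{vcaller}.smk"

assert naibr_snakefile(None, None) == "sv-naibr.smk"
assert naibr_snakefile("test/samples.groups", None) == "sv-naibr-pop.smk"
assert naibr_snakefile(None, "test.phased.bcf") == "sv-naibr-phase.smk"
assert naibr_snakefile("test/samples.groups", "test.phased.bcf") == "sv-naibr-pop-phase.smk"
```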
= "String", help = 'Name of output directory') +@click.option('--print-only', is_flag = True, hidden = True, default = False, metavar = "Toggle", help = 'Print the generated snakemake command and exit') @click.argument('input', required=True, type=click.Path(exists=True), nargs=-1) -def naibr(input, genome, vcf, threads, populations, molecule_distance, extra_params, snakemake, skipreports, quiet, print_only): +def naibr(input, output_dir, genome, vcf, threads, populations, molecule_distance, extra_params, snakemake, skipreports, quiet, print_only): """ Call structural variants using NAIBR @@ -108,11 +111,13 @@ def naibr(input, genome, vcf, threads, populations, molecule_distance, extra_par Use **harpy popgroup** to create a sample grouping file to use as input for `--populations`. """ + output_dir = output_dir.rstrip("/") + workflowdir = f"{output_dir}/workflow" vcaller = "naibr" if populations is None else "naibr-pop" - workflowdir = f"Variants/{vcaller}/workflow" + workflowdir = f"{output_dir}/workflow" vcaller += "-phase" if vcf is not None else "" - command = (f'snakemake --rerun-incomplete --nolock --use-conda --conda-prefix ./.snakemake/conda --cores {threads} --directory .').split() + command = (f'snakemake --rerun-incomplete --nolock --software-deployment-method conda --conda-prefix ./.snakemake/conda --cores {threads} --directory .').split() command.append('--snakefile') command.append(f"{workflowdir}/sv-{vcaller}.smk") command.append("--configfile") @@ -140,6 +145,7 @@ def naibr(input, genome, vcf, threads, populations, molecule_distance, extra_par with open(f'{workflowdir}/config.yml', "w") as config: config.write(f"seq_directory: {workflowdir}/input\n") + config.write(f"output_directory: {output_dir}\n") config.write(f"samplenames: {samplenames}\n") popgroupings = "" if populations is not None: @@ -161,7 +167,7 @@ def naibr(input, genome, vcf, threads, populations, molecule_distance, extra_par generate_conda_deps() print_onstart( - f"Samples: {len(samplenames)}{popgroupings}\nOutput Directory: Variants/{vcaller}/", + f"Samples: {len(samplenames)}{popgroupings}\nOutput Directory: {output_dir}/", "sv naibr" ) _module = subprocess.run(command) diff --git a/src/harpy/validations.py b/src/harpy/validations.py index be486bd1f..b77041d72 100644 --- a/src/harpy/validations.py +++ b/src/harpy/validations.py @@ -4,9 +4,9 @@ import subprocess from .fileparsers import getnames from .printfunctions import print_error, print_notice, print_solution, print_solution_with_culprits -from urllib.request import urlretrieve from pathlib import Path from rich.table import Table +from rich import box, print import rich_click as click def vcfcheck(vcf): @@ -72,9 +72,9 @@ def check_impute_params(parameters): # Validate each column culprits = dict() colerr = 0 - errtable = Table(title="Formatting Errors") + errtable = Table(show_footer=True, box=box.SIMPLE) errtable.add_column("Column", justify="right", style="white", no_wrap=True) - errtable.add_column("Expected Values", style="green") + errtable.add_column("Expected Values", style="blue") errtable.add_column("Rows with Issues", style = "white") culprits["model"] = [] @@ -91,7 +91,7 @@ def check_impute_params(parameters): culprits["usebx"].append(str(i + 1)) colerr += 1 if culprits["usebx"]: - errtable.add_row("usebx", "True, False", ", ".join(culprits["usebx"])) + errtable.add_row("usebx", "True, False", ",".join(culprits["usebx"])) for param in ["bxlimit","k","s","ngen"]: culprits[param] = [] @@ -100,11 +100,14 @@ def check_impute_params(parameters): 
culprits[param].append(str(i + 1)) colerr += 1 if culprits[param]: - errtable.add_row(param, "Integers", ", ".join(culprits[param])) + errtable.add_row(param, "Integers", ",".join(culprits[param])) if colerr > 0: print_error(f"Parameter file [bold]{parameters}[/bold] is formatted incorrectly. Not all columns have valid values.") - print_solution("Review the table below of what values are expected for each column and which rows are causing issues.") + print_solution_with_culprits( + "Review the table below of what values are expected for each column and which rows are causing issues.", + "Formatting Errors:" + ) print(errtable, file = sys.stderr) exit(1) diff --git a/workflow/envs/harpy.yaml b/workflow/envs/harpy.yaml index d8e332707..605d87dfd 100644 --- a/workflow/envs/harpy.yaml +++ b/workflow/envs/harpy.yaml @@ -6,10 +6,9 @@ dependencies: - bcftools =1.19 - mamba - pandas - - pysam =0.22 - python - rich-click - - snakemake-minimal + - snakemake-minimal >7 - samtools - seqtk - tabix \ No newline at end of file diff --git a/workflow/report/HapCut2.Rmd b/workflow/report/HapCut2.Rmd index 4d92a8608..36728614b 100644 --- a/workflow/report/HapCut2.Rmd +++ b/workflow/report/HapCut2.Rmd @@ -44,7 +44,11 @@ file <- snakemake@input[[1]] df <- read.table( file, header = T, colClasses = c("factor","factor","integer","integer", "integer") - ) +) +if(nrow(df) < 1){ + cat(paste("No phase blocks were observed in the input file", file, "\n")) + knitr::knit_exit() +} df$pos_end <- df$pos_start + df$block_length df <- df[, c(1,2,3,4,6,5)] ``` diff --git a/workflow/rules/align-bwa.smk b/workflow/rules/align-bwa.smk index 0ffd47000..857b9dd33 100644 --- a/workflow/rules/align-bwa.smk +++ b/workflow/rules/align-bwa.smk @@ -5,7 +5,7 @@ import sys from rich.panel import Panel from rich import print as rprint -outdir = "Align/bwa" +outdir = config["output_directory"] seq_dir = config["seq_directory"] genomefile = config["genomefile"] samplenames = config["samplenames"] @@ -112,7 +112,7 @@ rule genome_bwa_index: log: f"Genome/{bn}.idx.log" conda: - os.getcwd() + "/harpyenvs/align.yaml" + os.getcwd() + "/.harpy_envs/align.yaml" message: "Indexing {input}" shell: @@ -147,7 +147,7 @@ rule align: threads: min(10, workflow.cores) - 2 conda: - os.getcwd() + "/harpyenvs/align.yaml" + os.getcwd() + "/.harpy_envs/align.yaml" message: "Aligning sequences: {wildcards.sample}" shell: @@ -241,7 +241,7 @@ rule bxstats_report: params: molecule_distance conda: - os.getcwd() + "/harpyenvs/r-env.yaml" + os.getcwd() + "/.harpy_envs/r-env.yaml" message: "Generating summary of barcode alignment: {wildcards.sample}" script: @@ -256,10 +256,12 @@ rule assign_molecules: bai = outdir + "/{sample}.bam.bai" params: molecule_distance + conda: + os.getcwd() + "/.harpy_envs/qc.yaml" message: "Assigning barcodes to molecules: {wildcards.sample}" - shell: - "assignMI.py -c {params} -i {input.bam} -o {output.bam}" + script: + "scripts/assignMI.py" rule alignment_bxstats: input: @@ -269,10 +271,12 @@ rule alignment_bxstats: outdir + "/reports/BXstats/data/{sample}.bxstats.gz" params: sample = lambda wc: d[wc.sample] + conda: + os.getcwd() + "/.harpy_envs/qc.yaml" message: "Calculating barcode alignment statistics: {wildcards.sample}" - shell: - "bxStats.py {input.bam} | gzip > {output}" + script: + "scripts/bxStats.py" rule alignment_coverage: input: @@ -293,7 +297,7 @@ rule coverage_report: output: outdir + "/reports/coverage/{sample}.cov.html" conda: - os.getcwd() + "/harpyenvs/r-env.yaml" + os.getcwd() + "/.harpy_envs/r-env.yaml" message: 
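The recurring refactor in this Snakefile, seen above in assign_molecules and alignment_bxstats, swaps a shell call to a script on $PATH (e.g. assignMI.py) for the script: directive, so the file runs inside the rule's declared conda env and receives an injected snakemake object instead of parsing argv. A minimal sketch of the script side, with hypothetical file names; the snakemake object only exists when Snakemake executes the script:

    # scripts/example.py -- run via a `script:` directive; the `snakemake`
    # object is injected by Snakemake and is not importable on its own.
    import gzip

    bam_input = snakemake.input[0]    # first declared input of the rule
    stats_out = snakemake.output[0]   # first declared output
    cutoff    = snakemake.params[0]   # positional params entry, e.g. molecule_distance

    with gzip.open(stats_out, "wt") as out:
        # stand-in for the real per-alignment statistics
        out.write(f"{bam_input}\t{cutoff}\n")

This mirrors how the rewritten assignMI.py further down reads snakemake.input[0], snakemake.params[0], and snakemake.output[0].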
"Summarizing alignment coverage: {wildcards.sample}" script: @@ -319,13 +323,15 @@ rule samtools_reports: expand(outdir + "/reports/samtools_{ext}/{sample}.{ext}", sample = samplenames, ext = ["stats", "flagstat"]) output: outdir + "/reports/bwa.stats.html" + params: + outdir conda: - os.getcwd() + "/harpyenvs/qc.yaml" + os.getcwd() + "/.harpy_envs/qc.yaml" message: "Summarizing samtools stats and flagstat" shell: """ - multiqc Align/bwa/reports/samtools_stats Align/bwa/reports/samtools_flagstat --force --quiet --title "Basic Alignment Statistics" --comment "This report aggregates samtools stats and samtools flagstats results for all alignments." --no-data-dir --filename {output} 2> /dev/null + multiqc {params}/reports/samtools_stats {params}/reports/samtools_flagstat --force --quiet --title "Basic Alignment Statistics" --comment "This report aggregates samtools stats and samtools flagstats results for all alignments." --no-data-dir --filename {output} 2> /dev/null """ rule log_runtime: diff --git a/workflow/rules/align-ema.smk b/workflow/rules/align-ema.smk index a1ca12959..d049dcc2d 100644 --- a/workflow/rules/align-ema.smk +++ b/workflow/rules/align-ema.smk @@ -5,7 +5,7 @@ from pathlib import Path from rich.panel import Panel from rich import print as rprint -outdir = "Align/ema" +outdir = config["output_directory"] seq_dir = config["seq_directory"] nbins = config["EMA_bins"] binrange = ["%03d" % i for i in range(nbins)] @@ -113,7 +113,7 @@ rule genome_bwa_index: log: f"Genome/{bn}.idx.log" conda: - os.getcwd() + "/harpyenvs/align.yaml" + os.getcwd() + "/.harpy_envs/align.yaml" message: "Indexing {input}" shell: @@ -157,7 +157,7 @@ rule beadtag_count: message: "Counting barcode frequency: {wildcards.sample}" conda: - os.getcwd() + "/harpyenvs/align.yaml" + os.getcwd() + "/.harpy_envs/align.yaml" shell: """ mkdir -p {params.prefix} {params.logdir} @@ -170,7 +170,7 @@ rule beadtag_summary: output: outdir + "/reports/reads.bxcounts.html" conda: - os.getcwd() + "/harpyenvs/r-env.yaml" + os.getcwd() + "/.harpy_envs/r-env.yaml" message: "Creating sample barcode validation report" script: @@ -192,7 +192,7 @@ rule preprocess: threads: 2 conda: - os.getcwd() + "/harpyenvs/align.yaml" + os.getcwd() + "/.harpy_envs/align.yaml" message: "Preprocessing for EMA mapping: {wildcards.sample}" shell: @@ -217,7 +217,7 @@ rule align: threads: min(10, workflow.cores) - 2 conda: - os.getcwd() + "/harpyenvs/align.yaml" + os.getcwd() + "/.harpy_envs/align.yaml" message: "Aligning barcoded sequences: {wildcards.sample}" shell: @@ -266,7 +266,7 @@ rule align_nobarcode: threads: min(10, workflow.cores) - 2 conda: - os.getcwd() + "/harpyenvs/align.yaml" + os.getcwd() + "/.harpy_envs/align.yaml" message: "Aligning unbarcoded sequences: {wildcards.sample}" shell: @@ -375,7 +375,7 @@ rule coverage_report: output: outdir + "/reports/coverage/{sample}.cov.html" conda: - os.getcwd() + "/harpyenvs/r-env.yaml" + os.getcwd() + "/.harpy_envs/r-env.yaml" message: "Creating report of alignment coverage: {wildcards.sample}" script: @@ -416,10 +416,12 @@ rule bx_stats_alignments: bai = outdir + "/{sample}.bam.bai" output: outdir + "/reports/BXstats/data/{sample}.bxstats.gz" + conda: + os.getcwd() + "/.harpy_envs/qc.yaml" message: "Calculating barcode alignment statistics: {wildcards.sample}" - shell: - "bxStats.py {input.bam} | gzip > {output}" + script: + "scripts/bxStats.py" rule bx_stats_report: input: @@ -429,7 +431,7 @@ rule bx_stats_report: params: "none" conda: - os.getcwd() + "/harpyenvs/r-env.yaml" + os.getcwd() + 
"/.harpy_envs/r-env.yaml" message: "Generating summary of barcode alignment: {wildcards.sample}" script: @@ -455,13 +457,15 @@ rule collate_samtools_stats: expand(outdir + "/reports/samtools_{ext}/{sample}.{ext}", sample = samplenames, ext = ["stats", "flagstat"]), output: outdir + "/reports/ema.stats.html" + params: + outdir conda: - os.getcwd() + "/harpyenvs/qc.yaml" + os.getcwd() + "/.harpy_envs/qc.yaml" message: "Summarizing samtools stats and flagstat" shell: """ - multiqc Align/ema/reports/samtools_stats Align/ema/reports/samtools_flagstat --force --quiet --title "Basic Alignment Statistics" --comment "This report aggregates samtools stats and samtools flagstats results for all alignments." --no-data-dir --filename {output} 2> /dev/null + multiqc {outdir}/reports/samtools_stats {outdir}/reports/samtools_flagstat --force --quiet --title "Basic Alignment Statistics" --comment "This report aggregates samtools stats and samtools flagstats results for all alignments." --no-data-dir --filename {output} 2> /dev/null """ rule log_runtime: diff --git a/workflow/rules/demultiplex.gen1.smk b/workflow/rules/demultiplex.gen1.smk index b2da525eb..81526cedb 100644 --- a/workflow/rules/demultiplex.gen1.smk +++ b/workflow/rules/demultiplex.gen1.smk @@ -13,7 +13,7 @@ inprefix = config["infile_prefix"] inprefixfull = re.sub(r"[\_\.][IR][12]?(?:\_00[0-9])*\.f(?:ast)?q(?:\.gz)?$", "", infile) infiles = [f"{inprefixfull}_{i}{fq_extension}" for i in ["I1", "I2","R1","R2"]] indir = os.path.dirname(infile) -outdir = f"Demultiplex/{inprefix}/" +outdir = config["output_directory"] + f"/{inprefix}/" def barcodedict(smpl): d = dict() @@ -135,7 +135,7 @@ rule fastqc_F: threads: 1 conda: - os.getcwd() + "/harpyenvs/qc.yaml" + os.getcwd() + "/.harpy_envs/qc.yaml" message: "Performing quality assessment: {wildcards.sample}.F.fq.gz" shell: @@ -168,7 +168,7 @@ rule fastqc_R: threads: 1 conda: - os.getcwd() + "/harpyenvs/qc.yaml" + os.getcwd() + "/.harpy_envs/qc.yaml" message: "Performing quality assessment: {wildcards.sample}.R.fq.gz" shell: @@ -199,7 +199,7 @@ rule qc_report: params: outdir + "logs/.QC" conda: - os.getcwd() + "/harpyenvs/qc.yaml" + os.getcwd() + "/.harpy_envs/qc.yaml" message: "Creating final demultiplexing QC report" shell: diff --git a/workflow/rules/impute.smk b/workflow/rules/impute.smk index b5690a158..9ea67362b 100644 --- a/workflow/rules/impute.smk +++ b/workflow/rules/impute.smk @@ -12,6 +12,7 @@ variantfile = config["variantfile"] paramfile = config["paramfile"] contigs = config["contigs"] skipreports = config["skipreports"] +outdir = config["output_directory"] # declare a dataframe to be the paramspace paramspace = Paramspace(pd.read_csv(paramfile, sep='\s+').rename(columns=str.lower), param_sep = "", filename_params="*") @@ -34,7 +35,7 @@ onsuccess: print("") rprint( Panel( - "The workflow has finished successfully! Find the results in [bold]Impute/[/bold]", + f"The workflow has finished successfully! 
Find the results in [bold]{outdir}/[/bold]", title = "[bold]harpy impute", title_align = "left", border_style = "green" @@ -46,10 +47,10 @@ rule sort_bcf: input: variantfile output: - bcf = temp("Impute/stitch_input/input.sorted.bcf"), - idx = temp("Impute/stitch_input/input.sorted.bcf.csi") + bcf = temp(outdir + "/stitch_input/input.sorted.bcf"), + idx = temp(outdir + "/stitch_input/input.sorted.bcf.csi") log: - "Impute/stitch_input/input.sorted.log" + outdir + "/stitch_input/input.sorted.log" message: "Sorting input variant call file" shell: @@ -70,7 +71,7 @@ rule bam_list: bam = expand(bam_dir + "/{sample}.bam", sample = samplenames), bai = expand(bam_dir + "/{sample}.bam.bai", sample = samplenames) output: - "Impute/stitch_input/samples.list" + outdir + "/stitch_input/samples.list" message: "Creating list of alignment files" run: @@ -79,7 +80,7 @@ rule bam_list: rule samples_file: output: - "Impute/stitch_input/samples.names" + outdir + "/stitch_input/samples.names" message: "Creating file of sample names" run: @@ -88,13 +89,11 @@ rule samples_file: rule convert2stitch: input: - "Impute/stitch_input/input.sorted.bcf" + outdir + "/stitch_input/input.sorted.bcf" output: - "Impute/stitch_input/{part}.stitch" + outdir + "/stitch_input/{part}.stitch" threads: 3 - benchmark: - ".Benchmark/Impute/fileprep.{part}.txt" message: "Converting data to biallelic STITCH format: {wildcards.part}" shell: @@ -105,23 +104,23 @@ rule convert2stitch: rule impute: input: - bamlist = "Impute/stitch_input/samples.list", - infile = "Impute/stitch_input/{part}.stitch" + bamlist = outdir + "/stitch_input/samples.list", + infile = outdir + "/stitch_input/{part}.stitch" output: # format a wildcard pattern like "k{k}/s{s}/ngen{ngen}" # into a file path, with k, s, ngen being the columns of the data frame - f"Impute/{paramspace.wildcard_pattern}/contigs/" + "{part}/{part}.vcf.gz" + f"{outdir}/{paramspace.wildcard_pattern}/contigs/" + "{part}/{part}.vcf.gz" log: - f"Impute/{paramspace.wildcard_pattern}/contigs/" + "{part}/{part}.log" + f"{outdir}/{paramspace.wildcard_pattern}/contigs/" + "{part}/{part}.log" params: # automatically translate the wildcard values into an instance of the param space # in the form of a dict (here: {"k": ..., "s": ..., "ngen": ...}) parameters = paramspace.instance, extra = config.get("extra", "") conda: - os.getcwd() + "/harpyenvs/r-env.yaml" + os.getcwd() + "/.harpy_envs/r-env.yaml" benchmark: - f".Benchmark/Impute/stitch.{paramspace.wildcard_pattern}" + ".{part}.txt" + f".Benchmark/{outdir}/stitch.{paramspace.wildcard_pattern}" + ".{part}.txt" threads: 50 message: @@ -131,10 +130,10 @@ rule impute: rule index_vcf: input: - vcf = "Impute/{stitchparams}/contigs/{part}/{part}.vcf.gz" + vcf = outdir + "/{stitchparams}/contigs/{part}/{part}.vcf.gz" output: - idx = "Impute/{stitchparams}/contigs/{part}/{part}.vcf.gz.tbi", - stats = "Impute/{stitchparams}/contigs/{part}/{part}.stats" + idx = outdir + "/{stitchparams}/contigs/{part}/{part}.vcf.gz.tbi", + stats = outdir + "/{stitchparams}/contigs/{part}/{part}.stats" message: "Indexing: {wildcards.stitchparams}/{wildcards.part}" shell: @@ -145,40 +144,41 @@ rule index_vcf: rule collate_stitch_reports: input: - "Impute/{stitchparams}/contigs/{part}/{part}.stats" + outdir + "/{stitchparams}/contigs/{part}/{part}.stats" output: - "Impute/{stitchparams}/contigs/{part}/{part}.STITCH.html" + outdir + "/{stitchparams}/contigs/{part}/{part}.STITCH.html" message: "Generating STITCH report: {wildcards.part}" conda: - os.getcwd() + "/harpyenvs/r-env.yaml" 
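For context on the paramspace.wildcard_pattern and instance_patterns paths threaded through the impute.smk rules above: Paramspace wraps a pandas DataFrame and derives one output subpath per parameter combination. A small sketch with toy values; Harpy reads the real table from the parameter file named in the config:

    import pandas as pd
    from snakemake.utils import Paramspace

    df = pd.DataFrame(
        {"model": ["diploid"], "usebx": [True], "k": [10], "s": [5], "ngen": [50]}
    )
    paramspace = Paramspace(df, param_sep="", filename_params="*")

    print(paramspace.wildcard_pattern)        # wildcard template built from the columns
    for pattern in paramspace.instance_patterns:
        print(pattern)                        # one concrete instantiation per row

Rules then embed the template in their outputs, e.g. f"{outdir}/{paramspace.wildcard_pattern}/contigs/" + "{part}/{part}.vcf.gz", and Snakemake fills in one instance per row of the parameter table.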
+ os.getcwd() + "/.harpy_envs/r-env.yaml" script: "report/StitchCollate.Rmd" rule clean_stitch: input: - "Impute/{stitchparams}/contigs/{part}/{part}.STITCH.html" + outdir + "/{stitchparams}/contigs/{part}/{part}.STITCH.html" output: - temp("Impute/{stitchparams}/contigs/{part}/.cleaned") + temp(outdir + "/{stitchparams}/contigs/{part}/.cleaned") params: - lambda wc: wc.get("stitchparams") + stitch = lambda wc: wc.get("stitchparams"), + outdir = outdir priority: 2 message: "Cleaning up {wildcards.stitchparams}: {wildcards.part}" shell: """ - rm -rf Impute/{params}/contigs/{wildcards.part}/input - rm -rf Impute/{params}/contigs/{wildcards.part}/RData - rm -rf Impute/{params}/contigs/{wildcards.part}/plots + rm -rf {params.outdir}/{params.stitch}/contigs/{wildcards.part}/input + rm -rf {params.outdir}/{params.stitch}/contigs/{wildcards.part}/RData + rm -rf {params.outdir}/{params.stitch}/contigs/{wildcards.part}/plots touch {output} """ rule concat_list: input: - bcf = expand("Impute/{{stitchparams}}/contigs/{part}/{part}.vcf.gz", part = contigs) + bcf = expand(outdir + "/{{stitchparams}}/contigs/{part}/{part}.vcf.gz", part = contigs) output: - temp("Impute/{stitchparams}/bcf.files") + temp(outdir + "/{stitchparams}/bcf.files") message: "Creating list vcf files for concatenation" run: @@ -187,11 +187,11 @@ rule concat_list: rule merge_vcfs: input: - files = "Impute/{stitchparams}/bcf.files", - idx = expand("Impute/{{stitchparams}}/contigs/{part}/{part}.vcf.gz.tbi", part = contigs), - clean = expand("Impute/{{stitchparams}}/contigs/{part}/.cleaned", part = contigs) + files = outdir + "/{stitchparams}/bcf.files", + idx = expand(outdir + "/{{stitchparams}}/contigs/{part}/{part}.vcf.gz.tbi", part = contigs), + clean = expand(outdir + "/{{stitchparams}}/contigs/{part}/.cleaned", part = contigs) output: - "Impute/{stitchparams}/variants.imputed.bcf" + outdir + "/{stitchparams}/variants.imputed.bcf" threads: workflow.cores message: @@ -201,9 +201,9 @@ rule merge_vcfs: rule index_merged: input: - "Impute/{stitchparams}/variants.imputed.bcf" + outdir + "/{stitchparams}/variants.imputed.bcf" output: - "Impute/{stitchparams}/variants.imputed.bcf.csi" + outdir + "/{stitchparams}/variants.imputed.bcf.csi" message: "Indexing resulting file: {output}" shell: @@ -211,10 +211,10 @@ rule index_merged: rule stats: input: - bcf = "Impute/{stitchparams}/variants.imputed.bcf", - idx = "Impute/{stitchparams}/variants.imputed.bcf.csi" + bcf = outdir + "/{stitchparams}/variants.imputed.bcf", + idx = outdir + "/{stitchparams}/variants.imputed.bcf.csi" output: - "Impute/{stitchparams}/reports/variants.imputed.stats" + outdir + "/{stitchparams}/reports/variants.imputed.stats" message: "Calculating stats: {wildcards.stitchparams}/variants.imputed.bcf" shell: @@ -224,13 +224,13 @@ rule stats: rule comparestats: input: - orig = "Impute/stitch_input/input.sorted.bcf", - origidx = "Impute/stitch_input/input.sorted.bcf.csi", - impute = "Impute/{stitchparams}/variants.imputed.bcf", - idx = "Impute/{stitchparams}/variants.imputed.bcf.csi" + orig = outdir + "/stitch_input/input.sorted.bcf", + origidx = outdir + "/stitch_input/input.sorted.bcf.csi", + impute = outdir + "/{stitchparams}/variants.imputed.bcf", + idx = outdir + "/{stitchparams}/variants.imputed.bcf.csi" output: - compare = "Impute/{stitchparams}/reports/impute.compare.stats", - info_sc = temp("Impute/{stitchparams}/reports/impute.infoscore") + compare = outdir + "/{stitchparams}/reports/impute.compare.stats", + info_sc = temp(outdir + 
"/{stitchparams}/reports/impute.infoscore") message: "Computing post-imputation stats: {wildcards.stitchparams}" shell: @@ -241,14 +241,14 @@ rule comparestats: rule imputation_results_reports: input: - "Impute/{stitchparams}/reports/impute.compare.stats", - "Impute/{stitchparams}/reports/impute.infoscore" + outdir + "/{stitchparams}/reports/impute.compare.stats", + outdir + "/{stitchparams}/reports/impute.infoscore" output: - "Impute/{stitchparams}/variants.imputed.html" + outdir + "/{stitchparams}/variants.imputed.html" params: lambda wc: wc.get("stitchparams") conda: - os.getcwd() + "/harpyenvs/r-env.yaml" + os.getcwd() + "/.harpy_envs/r-env.yaml" message: "Generating imputation success report: {output}" script: @@ -256,7 +256,7 @@ rule imputation_results_reports: rule log_runtime: output: - "Impute/workflow/impute.workflow.summary" + outdir + "/workflow/impute.workflow.summary" message: "Creating record of relevant runtime parameters: {output}" run: @@ -296,12 +296,12 @@ rule log_runtime: _ = f.write(" " + str(config["workflow_call"]) + "\n") results = list() -results.append(expand("Impute/{stitchparams}/variants.imputed.bcf", stitchparams=paramspace.instance_patterns)) -results.append(expand("Impute/{stitchparams}/contigs/{part}/{part}.STITCH.html", stitchparams=paramspace.instance_patterns, part = contigs)) -results.append("Impute/workflow/impute.workflow.summary") +results.append(expand(outdir + "/{stitchparams}/variants.imputed.bcf", stitchparams=paramspace.instance_patterns)) +results.append(expand(outdir + "/{stitchparams}/contigs/{part}/{part}.STITCH.html", stitchparams=paramspace.instance_patterns, part = contigs)) +results.append(outdir + "/workflow/impute.workflow.summary") if not skipreports: - results.append(expand("Impute/{stitchparams}/variants.imputed.html", stitchparams=paramspace.instance_patterns)) + results.append(expand(outdir + "/{stitchparams}/variants.imputed.html", stitchparams=paramspace.instance_patterns)) rule all: default_target: True diff --git a/workflow/rules/phase-pop.smk b/workflow/rules/phase-pop.smk index 78ec99ca3..da0daa411 100644 --- a/workflow/rules/phase-pop.smk +++ b/workflow/rules/phase-pop.smk @@ -11,7 +11,7 @@ variantfile = config["variantfile"] pruning = config["prune"] molecule_distance = config["molecule_distance"] extra = config.get("extra", "") -outdir = "Phase.noBX"if config["noBX"] else "Phase" +outdir = config["output_directory"] fragfile = "Phase.noBX/extractHairs/{sample}.unlinked.frags" if config["noBX"] else "Phase/linkFragments/{sample}.linked.frags" linkarg = "--10x 0" if config["noBX"] else "--10x 1" skipreports = config["skipreports"] @@ -80,10 +80,21 @@ rule splitbysample: awk '/^#/;/CHROM/ {{OFS="\\t"}}; !/^#/ && $10~/^0\\/0/ {{$10="0|0:"substr($10,5);print $0}}; !/^#/ && $10~/^0\\/1/; !/^#/ && $10~/^1\\/1/ {{$10="1|1:"substr($10,5);print $0}}; !/^#/ {{print $0}}' > {output} """ +rule index_alignment: + input: + bam_dir + "/{sample}.bam" + output: + bam_dir + "/{sample}.bam.bai" + message: + "Indexing alignment: {wildcards.sample}" + shell: + "samtools index {input} {output} 2> /dev/null" + rule extractHairs: input: vcf = outdir + "/input/{sample}.het.bcf", - bam = bam_dir + "/{sample}.bam" + bam = bam_dir + "/{sample}.bam", + bai = bam_dir + "/{sample}.bam.bai" output: outdir + "/extractHairs/{sample}.unlinked.frags" log: @@ -92,7 +103,7 @@ rule extractHairs: indels = indelarg, bx = linkarg conda: - os.getcwd() + "/harpyenvs/phase.yaml" + os.getcwd() + "/.harpy_envs/phase.yaml" benchmark: 
".Benchmark/Phase/extracthairs.{sample}.txt" message: @@ -112,7 +123,7 @@ rule linkFragments: params: d = molecule_distance conda: - os.getcwd() + "/harpyenvs/phase.yaml" + os.getcwd() + "/.harpy_envs/phase.yaml" benchmark: ".Benchmark/Phase/linkfrag.{sample}.txt" message: @@ -133,7 +144,7 @@ rule phaseBlocks: prune = f"--threshold {pruning}" if pruning > 0 else "--no_prune 1", extra = extra conda: - os.getcwd() + "/harpyenvs/phase.yaml" + os.getcwd() + "/.harpy_envs/phase.yaml" benchmark: ".Benchmark/Phase/phase.{sample}.txt" message: @@ -204,14 +215,21 @@ rule mergeSamples: output: bcf = outdir + "/variants.phased.bcf", idx = outdir + "/variants.phased.bcf.csi" - benchmark: - ".Benchmark/Phase/mergesamples.txt" + params: + "true" if len(samplenames) > 1 else "false" threads: 30 message: - "Combining samples into a single BCF file" + "Combining results into {output.bcf}" if len(samplenames) > 1 else "Copying results to {output.bcf}" shell: - "bcftools merge --threads {threads} -Ob -o {output.bcf} --write-index {input.bcf}" + """ + if [ "{params}" = true ]; then + bcftools merge --threads {threads} -Ob -o {output.bcf} --write-index {input.bcf} + else + cp {input.bcf} {output.bcf} + cp {input.idx} {output.idx} + fi + """ rule summarize_blocks: input: @@ -237,7 +255,7 @@ rule phase_report: output: outdir + "/reports/phase.html" conda: - os.getcwd() + "/harpyenvs/r-env.yaml" + os.getcwd() + "/.harpy_envs/r-env.yaml" message: "Summarizing phasing results" script: diff --git a/workflow/rules/phase-sample.smk b/workflow/rules/phase-sample.smk index 43d896ba9..73597ee87 100644 --- a/workflow/rules/phase-sample.smk +++ b/workflow/rules/phase-sample.smk @@ -79,7 +79,7 @@ rule extractHairs: benchmark: ".Benchmark/Phase/extracthairs.{sample}.txt" conda: - os.getcwd() + "/harpyenvs/phase.yaml" + os.getcwd() + "/.harpy_envs/phase.yaml" message: "Converting to compact fragment format: {wildcards.sample}" shell: @@ -99,7 +99,7 @@ rule linkFragments: params: d = molecule_distance conda: - os.getcwd() + "/harpyenvs/phase.yaml" + os.getcwd() + "/.harpy_envs/phase.yaml" message: "Linking fragments: {wildcards.sample}" shell: @@ -121,7 +121,7 @@ rule phaseBlocks: prune = f"--threshold {pruning}" if pruning > 0 else "--no_prune 1", extra = extra conda: - os.getcwd() + "/harpyenvs/phase.yaml" + os.getcwd() + "/.harpy_envs/phase.yaml" message: "Creating phased haplotype blocks: {wildcards.sample}" shell: @@ -180,7 +180,7 @@ rule phase_report: output: outdir + "/reports/phase.html" conda: - os.getcwd() + "/harpyenvs/r-env.yaml" + os.getcwd() + "/.harpy_envs/r-env.yaml" message: "Summarizing phasing results" script: diff --git a/workflow/rules/preflight-bam.smk b/workflow/rules/preflight-bam.smk index 0a5b111f5..d9ad11d77 100644 --- a/workflow/rules/preflight-bam.smk +++ b/workflow/rules/preflight-bam.smk @@ -6,7 +6,7 @@ import sys import glob seq_dir = config["seq_directory"] -out_dir = f"Preflight/bam/" +out_dir = config["output_directory"] bamlist = [os.path.basename(i) for i in glob.iglob(f"{seq_dir}/*") if not os.path.isdir(i) and i.lower().endswith(".bam")] samplenames = set([os.path.splitext(i)[0] for i in bamlist]) @@ -54,10 +54,12 @@ rule checkBam: bai = seq_dir + "/{sample}.bam.bai" output: temp(out_dir + "{sample}.log") + conda: + os.getcwd() + "/.harpy_envs/qc.yaml" message: "Processing: {wildcards.sample}" - shell: - "checkBAM.py {input.bam} > {output}" + script: + "scripts/checkBAM.py" rule mergeChecks: input: @@ -95,7 +97,7 @@ rule createReport: params: seq_dir conda: - os.getcwd() + 
"/harpyenvs/r-env.yaml" + os.getcwd() + "/.harpy_envs/r-env.yaml" message: "Producing report" script: diff --git a/workflow/rules/preflight-fastq.smk b/workflow/rules/preflight-fastq.smk index f6825c360..bc4e9eb3d 100644 --- a/workflow/rules/preflight-fastq.smk +++ b/workflow/rules/preflight-fastq.smk @@ -6,7 +6,7 @@ import sys import glob seq_dir = config["seq_directory"] -out_dir = f"Preflight/fastq/" +out_dir = config["output_directory"] wildcard_constraints: sample = "[a-zA-Z0-9._-]+" @@ -60,10 +60,12 @@ rule checkForward: get_fq1 output: temp(out_dir + "{sample}.F.log") + conda: + os.getcwd() + "/.harpy_envs/qc.yaml" message: "Processing forward reads: {wildcards.sample}" - shell: - "checkFASTQ.py {input} > {output}" + script: + "scripts/checkFASTQ.py" rule checkReverse: input: @@ -72,8 +74,10 @@ rule checkReverse: temp(out_dir + "{sample}.R.log") message: "Processing reverse reads: {wildcards.sample}" - shell: - "checkFASTQ.py {input} > {output}" + conda: + os.getcwd() + "/.harpy_envs/qc.yaml" + script: + "scripts/checkFASTQ.py" rule mergeChecks: input: @@ -111,7 +115,7 @@ rule createReport: params: seq_dir conda: - os.getcwd() + "/harpyenvs/r-env.yaml" + os.getcwd() + "/.harpy_envs/r-env.yaml" message: "Producing report" script: diff --git a/workflow/rules/qc.smk b/workflow/rules/qc.smk index 8f630c472..c7690e1da 100644 --- a/workflow/rules/qc.smk +++ b/workflow/rules/qc.smk @@ -10,7 +10,7 @@ extra = config.get("extra", "") seq_dir = config["seq_directory"] adapters = config["adapters"] skipreports = config["skipreports"] - +outdir = config["output_directory"] flist = [os.path.basename(i) for i in glob.iglob(f"{seq_dir}/*") if not os.path.isdir(i)] r = re.compile(r".*\.f(?:ast)?q(?:\.gz)?$", flags=re.IGNORECASE) fqlist = list(filter(r.match, flist)) @@ -24,7 +24,7 @@ onsuccess: print("") rprint( Panel( - "The workflow has finished successfully! Find the results in [bold]QC/[/bold]", + f"The workflow has finished successfully! Find the results in [bold]{outdir}/[/bold]", title = "[bold]harpy qc", title_align = "left", border_style = "green" @@ -44,8 +44,6 @@ onerror: file = sys.stderr ) - - def get_fq1(wildcards): # code that returns a list of fastq files for read 1 based on *wildcards.sample* e.g. 
lst = sorted(glob.glob(seq_dir + "/" + wildcards.sample + "*")) @@ -65,14 +63,12 @@ rule trimFastp: fw = get_fq1, rv = get_fq2 output: - fw = "QC/{sample}.R1.fq.gz", - rv = "QC/{sample}.R2.fq.gz", - json = "QC/logs/json/{sample}.fastp.json" + fw = outdir + "/{sample}.R1.fq.gz", + rv = outdir + "/{sample}.R2.fq.gz", + json = outdir + "/logs/json/{sample}.fastp.json" log: - html = "QC/reports/fastp_reports/{sample}.html", - serr = "QC/logs/fastp_logs/{sample}.log" - benchmark: - ".Benchmark/QC/{sample}.txt" + html = outdir + "/reports/fastp_reports/{sample}.html", + serr = outdir + "/logs/fastp_logs/{sample}.log" params: maxlen = f"--max_len1 {maxlen}", tim_adapters = "--detect_adapter_for_pe" if adapters else "--disable_adapter_trimming", @@ -80,7 +76,7 @@ rule trimFastp: threads: 2 conda: - os.getcwd() + "/harpyenvs/qc.yaml" + os.getcwd() + "/.harpy_envs/qc.yaml" message: "Removing adapters + quality trimming: {wildcards.sample}" shell: @@ -90,21 +86,23 @@ rule trimFastp: rule count_beadtags: input: - "QC/{sample}.R1.fq.gz" + outdir + "/{sample}.R1.fq.gz" output: - temp("QC/logs/bxcount/{sample}.count.log") + temp(outdir + "/logs/bxcount/{sample}.count.log") message: "Counting barcode frequency: {wildcards.sample}" - shell: - "countBX.py {input} > {output}" + conda: + os.getcwd() + "/.harpy_envs/qc.yaml" + script: + "scripts/countBX.py" rule beadtag_counts_summary: input: - countlog = expand("QC/logs/bxcount/{sample}.count.log", sample = samplenames) + countlog = expand(outdir + "/logs/bxcount/{sample}.count.log", sample = samplenames) output: - "QC/reports/barcode.summary.html" + outdir + "/reports/barcode.summary.html" conda: - os.getcwd() + "/harpyenvs/r-env.yaml" + os.getcwd() + "/.harpy_envs/r-env.yaml" message: "Summarizing sample barcode validation" script: @@ -112,7 +110,7 @@ rule beadtag_counts_summary: rule log_runtime: output: - "QC/workflow/qc.workflow.summary" + outdir + "/workflow/qc.workflow.summary" params: maxlen = f"--max_len1 {maxlen}", tim_adapters = "--detect_adapter_for_pe" if adapters else "--disable_adapter_trimming", @@ -129,23 +127,25 @@ rule log_runtime: _ = f.write(" " + str(config["workflow_call"]) + "\n") results = list() -results.append(expand("QC/logs/json/{sample}.fastp.json", sample = samplenames)) -results.append(expand("QC/{sample}.{FR}.fq.gz", FR = ["R1", "R2"], sample = samplenames)) -results.append("QC/workflow/qc.workflow.summary") +results.append(expand(outdir + "/logs/json/{sample}.fastp.json", sample = samplenames)) +results.append(expand(outdir + "/{sample}.{FR}.fq.gz", FR = ["R1", "R2"], sample = samplenames)) +results.append(outdir + "/workflow/qc.workflow.summary") if not skipreports: - results.append("QC/reports/barcode.summary.html") + results.append(outdir + "/reports/barcode.summary.html") rule createReport: default_target: True input: results output: - "QC/reports/qc.report.html" + outdir + "/reports/qc.report.html" + params: + outdir conda: - os.getcwd() + "/harpyenvs/qc.yaml" + os.getcwd() + "/.harpy_envs/qc.yaml" message: "Sequencing quality filtering and trimming is complete!" 
shell: """ - multiqc QC/logs/json -m fastp --force --filename {output} --quiet --title "QC Summary" --comment "This report aggregates trimming and quality control metrics reported by fastp" --no-data-dir 2>/dev/null + multiqc {params}/logs/json -m fastp --force --filename {output} --quiet --title "QC Summary" --comment "This report aggregates trimming and quality control metrics reported by fastp" --no-data-dir 2>/dev/null """ \ No newline at end of file diff --git a/workflow/rules/snp-freebayes.smk b/workflow/rules/snp-freebayes.smk index bd9636e1d..1b1bf78bf 100644 --- a/workflow/rules/snp-freebayes.smk +++ b/workflow/rules/snp-freebayes.smk @@ -6,14 +6,14 @@ import gzip bam_dir = config["seq_directory"] genomefile = config["genomefile"] -groupings = config.get("groupings", None) +groupings = config.get("groupings", []) bn = os.path.basename(genomefile) ploidy = config["ploidy"] samplenames = config["samplenames"] extra = config.get("extra", "") chunksize = config["windowsize"] intervals = config["intervals"] -outdir = "Variants/freebayes" +outdir = config["output_directory"] regions = dict(zip(intervals, intervals)) skipreports = config["skipreports"] @@ -44,17 +44,16 @@ onsuccess: file = sys.stderr ) -if groupings: - rule copy_groupings: - input: - groupings - output: - outdir + "/logs/sample.groups" - message: - "Logging {input}" - run: - with open(input[0], "r") as infile, open(output[0], "w") as outfile: - _ = [outfile.write(i) for i in infile.readlines() if not i.lstrip().startswith("#")] +rule copy_groupings: + input: + groupings + output: + outdir + "/logs/sample.groups" + message: + "Logging {input}" + run: + with open(input[0], "r") as infile, open(output[0], "w") as outfile: + _ = [outfile.write(i) for i in infile.readlines() if not i.lstrip().startswith("#")] rule index_alignments: input: @@ -89,47 +88,27 @@ rule bam_list: for bamfile in input.bam: _ = fout.write(bamfile + "\n") -if groupings: - rule call_variants_pop: - input: - bam = expand(bam_dir + "/{sample}.bam", sample = samplenames), - bai = expand(bam_dir + "/{sample}.bam.bai", sample = samplenames), - groupings = outdir + "/logs/sample.groups", - ref = f"Genome/{bn}", - ref_idx = f"Genome/{bn}.fai", - samples = outdir + "/logs/samples.files" - output: - pipe(outdir + "/regions/{part}.vcf") - params: - region = lambda wc: "-r " + regions[wc.part], - ploidy = f"-p {ploidy}", - extra = extra - conda: - os.getcwd() + "/harpyenvs/variants.snp.yaml" - message: - "Calling variants: {wildcards.part}" - shell: - "freebayes -f {input.ref} -L {input.samples} --populations {input.groupings} {params}" -else: - rule call_variants: - input: - bam = expand(bam_dir + "/{sample}.bam", sample = samplenames), - bai = expand(bam_dir + "/{sample}.bam.bai", sample = samplenames), - ref = f"Genome/{bn}", - ref_idx = f"Genome/{bn}.fai", - samples = outdir + "/logs/samples.files" - output: - pipe(outdir + "/regions/{part}.vcf") - params: - region = lambda wc: "-r " + regions[wc.part], - ploidy = f"-p {ploidy}", - extra = extra - conda: - os.getcwd() + "/harpyenvs/variants.snp.yaml" - message: - "Calling variants: {wildcards.part}" - shell: - "freebayes -f {input.ref} -L {input.samples} {params} > {output}" +rule call_variants: + input: + bam = expand(bam_dir + "/{sample}.bam", sample = samplenames), + bai = expand(bam_dir + "/{sample}.bam.bai", sample = samplenames), + groupfile = outdir + "/logs/sample.groups" if groupings else [], + ref = f"Genome/{bn}", + ref_idx = f"Genome/{bn}.fai", + samples = outdir + "/logs/samples.files" + output: + 
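The call_variants rule continuing below folds the former if groupings:/else rule duplication into one rule: an input declared as an empty list simply drops out of the DAG, and the matching param contributes an empty string. A schematic sketch of the pattern with toy names, not the actual Harpy rule; groupings and samplenames are assumed to be defined earlier in the Snakefile:

    rule call_variants_sketch:
        input:
            bam = expand("bam/{sample}.bam", sample = samplenames),
            groupfile = "logs/sample.groups" if groupings else []   # [] means: no such input
        output:
            "regions/{part}.vcf"
        params:
            populations = "--populations logs/sample.groups" if groupings else ""
        shell:
            "freebayes -f genome.fasta {params.populations} {input.bam} > {output}"

The mpileup counterpart below leans on the same idea plus bcftools call's --group-samples FILE|- syntax: with a group file the flag is emitted bare so the file path supplied via {input} lands directly after it, and without one the literal - requests single-sample calling. The consolidated genome_faidx rules later in the diff apply the same trick to an optional .gzi output.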
pipe(outdir + "/regions/{part}.vcf") + params: + region = lambda wc: "-r " + regions[wc.part], + ploidy = f"-p {ploidy}", + populations = "--populations " + rules.copy_groupings.output[0] if groupings else "", + extra = extra + conda: + os.getcwd() + "/.harpy_envs/variants.snp.yaml" + message: + "Calling variants: {wildcards.part}" + shell: + "freebayes -f {input.ref} -L {input.samples} {params} > {output}" rule sort_variants: input: @@ -221,7 +200,7 @@ rule bcfreport: output: outdir + "/reports/variants.{type}.html" conda: - os.getcwd() + "/harpyenvs/r-env.yaml" + os.getcwd() + "/.harpy_envs/r-env.yaml" message: "Generating bcftools report: variants.{wildcards.type}.bcf" script: @@ -234,7 +213,7 @@ rule log_runtime: "Creating record of relevant runtime parameters: {output}" params: ploidy = f"-p {ploidy}", - populations = '' if groupings is None else f"--populations {groupings}", + populations = f"--populations {groupings}" if groupings else '', extra = extra run: with open(output[0], "w") as f: diff --git a/workflow/rules/snp-mpileup.smk b/workflow/rules/snp-mpileup.smk index 572b7ad8d..9213bb1d4 100644 --- a/workflow/rules/snp-mpileup.smk +++ b/workflow/rules/snp-mpileup.smk @@ -6,13 +6,13 @@ import os bam_dir = config["seq_directory"] genomefile = config["genomefile"] bn = os.path.basename(genomefile) -groupings = config.get("groupings", None) +groupings = config.get("groupings", []) ploidy = config["ploidy"] samplenames = config["samplenames"] mp_extra = config.get("extra", "") chunksize = config["windowsize"] intervals = config["intervals"] -outdir = "Variants/mpileup" +outdir = config["output_directory"] regions = dict(zip(intervals, intervals)) skipreports = config["skipreports"] @@ -43,17 +43,16 @@ onsuccess: file = sys.stderr ) -if groupings: - rule copy_groupings: - input: - groupings - output: - outdir + "/logs/sample.groups" - message: - "Logging {input}" - run: - with open(input[0], "r") as infile, open(output[0], "w") as outfile: - _ = [outfile.write(i) for i in infile.readlines() if not i.lstrip().startswith("#")] +rule copy_groupings: + input: + groupings + output: + outdir + "/logs/sample.groups" + message: + "Logging {input}" + run: + with open(input[0], "r") as infile, open(output[0], "w") as outfile: + _ = [outfile.write(i) for i in infile.readlines() if not i.lstrip().startswith("#")] rule index_alignments: input: @@ -103,43 +102,25 @@ rule mpileup: shell: "bcftools mpileup --fasta-ref {input.genome} --bam-list {input.bamlist} --annotate AD --output-type b {params} > {output.bcf} 2> {output.logfile}" -if groupings: - rule call_genotypes_pop: - input: - bcf = outdir + "/{part}.mp.bcf", - groupings = outdir + "/logs/sample.groups" - output: - bcf = temp(outdir + "/call/{part}.bcf"), - idx = temp(outdir + "/call/{part}.bcf.csi") - params: - f"--ploidy {ploidy}" - threads: - 2 - message: - "Calling genotypes: {wildcards.part}" - shell: - """ - bcftools call --multiallelic-caller {params} --variants-only --output-type b {input} | - bcftools sort - --output {output.bcf} --write-index 2> /dev/null - """ -else: - rule call_genotypes: - input: - bcf = outdir + "/{part}.mp.bcf" - output: - bcf = temp(outdir + "/call/{part}.bcf"), - idx = temp(outdir + "/call/{part}.bcf.csi") - params: - f"--ploidy {ploidy}" - threads: - 2 - message: - "Calling genotypes: {wildcards.part}" - shell: - """ - bcftools call --multiallelic-caller {params} --variants-only --output-type b {input} | - bcftools sort - --output {output.bcf} --write-index 2> /dev/null - """ +rule call_genotypes_pop: + 
input: + groupfile = outdir + "/logs/sample.groups" if groupings else [], + bcf = outdir + "/{part}.mp.bcf" + output: + bcf = temp(outdir + "/call/{part}.bcf"), + idx = temp(outdir + "/call/{part}.bcf.csi") + params: + f"--ploidy {ploidy}", + "--group-samples" if groupings else "--group-samples -" + threads: + 2 + message: + "Calling genotypes: {wildcards.part}" + shell: + """ + bcftools call --multiallelic-caller --variants-only --output-type b {params} {input} | + bcftools sort - --output {output.bcf} --write-index 2> /dev/null + """ rule concat_list: input: @@ -237,7 +218,7 @@ rule bcfreport: output: outdir + "/reports/variants.{type}.html" conda: - os.getcwd() + "/harpyenvs/r-env.yaml" + os.getcwd() + "/.harpy_envs/r-env.yaml" message: "Generating bcftools report: variants.{wildcards.type}.bcf" script: @@ -248,7 +229,7 @@ rule log_runtime: outdir + "/workflow/snp.mpileup.workflow.summary" params: ploidy = f"--ploidy {ploidy}", - populations = '' if groupings is None else f"--populations {groupings}" + populations = f"--populations {groupings}" if groupings else "--populations -" message: "Creating record of relevant runtime parameters: {output}" run: diff --git a/workflow/rules/sv-leviathan-pop.smk b/workflow/rules/sv-leviathan-pop.smk index a19dcfd11..1dbf7a06e 100644 --- a/workflow/rules/sv-leviathan-pop.smk +++ b/workflow/rules/sv-leviathan-pop.smk @@ -8,7 +8,7 @@ genomefile = config["genomefile"] samplenames = config["samplenames"] extra = config.get("extra", "") groupfile = config["groupings"] -outdir = "Variants/leviathan-pop" +outdir = config["output_directory"] skipreports = config["skipreports"] bn = os.path.basename(genomefile) genome_zip = True if bn.lower().endswith(".gz") else False @@ -114,7 +114,7 @@ rule index_barcode: threads: 4 conda: - os.getcwd() + "/harpyenvs/variants.sv.yaml" + os.getcwd() + "/.harpy_envs/variants.sv.yaml" shell: "LRez index bam -p -b {input.bam} -o {output} --threads {threads}" @@ -161,7 +161,7 @@ rule index_bwa_genome: message: "Indexing {input}" conda: - os.getcwd() + "/harpyenvs/align.yaml" + os.getcwd() + "/.harpy_envs/align.yaml" shell: "bwa index {input} 2> {log}" @@ -182,7 +182,7 @@ rule leviathan_variantcall: threads: 3 conda: - os.getcwd() + "/harpyenvs/variants.sv.yaml" + os.getcwd() + "/.harpy_envs/variants.sv.yaml" message: "Calling variants: Population {wildcards.population}" benchmark: @@ -229,7 +229,7 @@ rule sv_report_bypop: message: "Generating SV report: population {wildcards.population}" conda: - os.getcwd() + "/harpyenvs/r-env.yaml" + os.getcwd() + "/.harpy_envs/r-env.yaml" script: "report/Leviathan.Rmd" @@ -243,7 +243,7 @@ rule sv_report: message: "Generating SV report for all populations" conda: - os.getcwd() + "/harpyenvs/r-env.yaml" + os.getcwd() + "/.harpy_envs/r-env.yaml" script: "report/LeviathanPop.Rmd" diff --git a/workflow/rules/sv-leviathan.smk b/workflow/rules/sv-leviathan.smk index 1923232a8..182189d52 100644 --- a/workflow/rules/sv-leviathan.smk +++ b/workflow/rules/sv-leviathan.smk @@ -7,7 +7,7 @@ bam_dir = config["seq_directory"] genomefile = config["genomefile"] samplenames = config["samplenames"] extra = config.get("extra", "") -outdir = "Variants/leviathan" +outdir = config["output_directory"] skipreports = config["skipreports"] bn = os.path.basename(genomefile) genome_zip = True if bn.lower().endswith(".gz") else False @@ -64,7 +64,7 @@ rule index_barcode: benchmark: ".Benchmark/Variants/leviathan/indexbc.{sample}.txt" conda: - os.getcwd() + "/harpyenvs/variants.sv.yaml" + os.getcwd() + 
"/.harpy_envs/variants.sv.yaml" shell: "LRez index bam --threads {threads} -p -b {input.bam} -o {output}" @@ -109,7 +109,7 @@ rule index_bwa_genome: log: f"Genome/{bn}.idx.log" conda: - os.getcwd() + "/harpyenvs/align.yaml" + os.getcwd() + "/.harpy_envs/align.yaml" message: "Indexing {input}" shell: @@ -132,7 +132,7 @@ rule leviathan_variantcall: threads: 3 conda: - os.getcwd() + "/harpyenvs/variants.sv.yaml" + os.getcwd() + "/.harpy_envs/variants.sv.yaml" message: "Calling variants: {wildcards.sample}" benchmark: @@ -177,7 +177,7 @@ rule sv_report: output: outdir + "/reports/{sample}.SV.html" conda: - os.getcwd() + "/harpyenvs/r-env.yaml" + os.getcwd() + "/.harpy_envs/r-env.yaml" message: "Generating SV report: {wildcards.sample}" script: diff --git a/workflow/rules/sv-naibr-phase.smk b/workflow/rules/sv-naibr-phase.smk index 55065a309..9d424e96e 100644 --- a/workflow/rules/sv-naibr-phase.smk +++ b/workflow/rules/sv-naibr-phase.smk @@ -9,7 +9,7 @@ samplenames = config["samplenames"] extra = config.get("extra", "") genomefile = config["genomefile"] molecule_distance = config["molecule_distance"] -outdir = "Variants/naibr" +outdir = config["output_directory"] skipreports = config["skipreports"] wildcard_constraints: @@ -35,7 +35,7 @@ def process_args(args): "k" : 3 } if args: - words = [i for i in re.split("\s|=", args) if len(i) > 0] + words = [i for i in re.split(r"\s|=", args) if len(i) > 0] for i in zip(words[::2], words[1::2]): argsDict[i[0]] = i[1] return argsDict @@ -142,7 +142,7 @@ rule phase_alignments: params: extra = lambda wc: "--ignore-read-groups --sample " + wc.get("sample") +" --tag-supplementary" conda: - os.getcwd() + "/harpyenvs/phase.yaml" + os.getcwd() + "/.harpy_envs/phase.yaml" threads: 4 message: @@ -164,7 +164,7 @@ rule log_phasing: for i in {input}; do SAMP=$(basename $i .phaselog) echo -e "${{SAMP}}\\t$(grep "Total alignments" $i)\\t$(grep "could be tagged" $i)" | - sed 's/ \+ /\\t/g' | cut -f1,3,5 >> {output} + sed 's/ \\+ /\\t/g' | cut -f1,3,5 >> {output} done """ @@ -183,7 +183,7 @@ rule create_config: with open(output[0], "w") as conf: _ = conf.write(f"bam_file={input[0]}\n") _ = conf.write(f"prefix={params[0]}\n") - _ = conf.write(f"outdir=Variants/naibr/{params[0]}\n") + _ = conf.write(f"outdir={outdir}/{params[0]}\n") _ = conf.write(f"threads={params[1]}\n") for i in argdict: _ = conf.write(f"{i}={argdict[i]}\n") @@ -212,7 +212,7 @@ rule call_sv: threads: min(10, workflow.cores) conda: - os.getcwd() + "/harpyenvs/variants.sv.yaml" + os.getcwd() + "/.harpy_envs/variants.sv.yaml" message: "Calling variants: {wildcards.sample}" shell: @@ -247,7 +247,7 @@ rule report: output: outdir + "/reports/{sample}.naibr.html" conda: - os.getcwd() + "/harpyenvs/r-env.yaml" + os.getcwd() + "/.harpy_envs/r-env.yaml" message: "Creating report: {wildcards.sample}" script: diff --git a/workflow/rules/sv-naibr-pop-phase.smk b/workflow/rules/sv-naibr-pop-phase.smk index 616cc94af..8e0d99320 100644 --- a/workflow/rules/sv-naibr-pop-phase.smk +++ b/workflow/rules/sv-naibr-pop-phase.smk @@ -10,7 +10,7 @@ extra = config.get("extra", "") groupfile = config["groupings"] genomefile = config["genomefile"] molecule_distance = config["molecule_distance"] -outdir = "Variants/naibr-pop" +outdir = config["output_directory"] skipreports = config["skipreports"] wildcard_constraints: @@ -36,7 +36,7 @@ def process_args(args): "k" : 3, } if args: - words = [i for i in re.split("\s|=", args) if len(i) > 0] + words = [i for i in re.split(r"\s|=", args) if len(i) > 0] for i in zip(words[::2], 
words[1::2]): argsDict[i[0]] = i[1] return argsDict @@ -154,7 +154,7 @@ rule phase_alignments: params: extra = lambda wc: "--ignore-read-groups --sample " + wc.get("sample") + " --tag-supplementary" conda: - os.getcwd() + "/harpyenvs/phase.yaml" + os.getcwd() + "/.harpy_envs/phase.yaml" message: "Phasing: {input.aln}" shell: @@ -173,7 +173,7 @@ rule log_phasing: for i in {input}; do SAMP=$(basename $i .phaselog) echo -e "${{SAMP}}\\t$(grep "Total alignments" $i)\\t$(grep "could be tagged" $i)" | - sed 's/ \+ /\\t/g' | cut -f1,3,5 >> {output} + sed 's/ \\+ /\\t/g' | cut -f1,3,5 >> {output} done """ @@ -230,7 +230,7 @@ rule create_config: argdict = process_args(extra) with open(output[0], "w") as conf: _ = conf.write(f"bam_file={input[0]}\n") - _ = conf.write(f"outdir=Variants/naibr-pop/{params[0]}\n") + _ = conf.write(f"outdir={outdir}/{params[0]}\n") _ = conf.write(f"prefix={params[0]}\n") _ = conf.write(f"threads={params[1]}\n") for i in argdict: @@ -250,7 +250,7 @@ rule call_sv: threads: min(10, workflow.cores) conda: - os.getcwd() + "/harpyenvs/variants.sv.yaml" + os.getcwd() + "/.harpy_envs/variants.sv.yaml" message: "Calling variants: {wildcards.population}" shell: @@ -287,7 +287,7 @@ rule report: message: "Creating report: {wildcards.population}" conda: - os.getcwd() + "/harpyenvs/r-env.yaml" + os.getcwd() + "/.harpy_envs/r-env.yaml" script: "report/Naibr.Rmd" @@ -300,7 +300,7 @@ rule report_pop: message: "Creating summary report" conda: - os.getcwd() + "/harpyenvs/r-env.yaml" + os.getcwd() + "/.harpy_envs/r-env.yaml" script: "report/NaibrPop.Rmd" diff --git a/workflow/rules/sv-naibr-pop.smk b/workflow/rules/sv-naibr-pop.smk index b0ec099aa..aa2cc9a2d 100644 --- a/workflow/rules/sv-naibr-pop.smk +++ b/workflow/rules/sv-naibr-pop.smk @@ -12,7 +12,7 @@ genomefile = config["genomefile"] molecule_distance = config["molecule_distance"] skipreports = config["skipreports"] bn = os.path.basename(genomefile) -outdir = "Variants/naibr-pop" +outdir = config["output_directory"] genome_zip = True if bn.lower().endswith(".gz") else False if genome_zip: bn = bn[:-3] @@ -28,7 +28,7 @@ def process_args(args): "k" : 3, } if args != "": - words = [i for i in re.split("\s|=", args) if len(i) > 0] + words = [i for i in re.split(r"\s|=", args) if len(i) > 0] for i in zip(words[::2], words[1::2]): argsDict[i[0]] = i[1] return argsDict @@ -129,7 +129,7 @@ rule create_config: argdict = process_args(extra) with open(output[0], "w") as conf: _ = conf.write(f"bam_file={input[0]}\n") - _ = conf.write(f"outdir=Variants/naibr-pop/{params[0]}\n") + _ = conf.write(f"outdir={outdir}/{params[0]}\n") _ = conf.write(f"prefix={params[0]}\n") _ = conf.write(f"threads={params[1]}\n") for i in argdict: @@ -149,7 +149,7 @@ rule call_sv: threads: min(10, workflow.cores) conda: - os.getcwd() + "/harpyenvs/variants.sv.yaml" + os.getcwd() + "/.harpy_envs/variants.sv.yaml" message: "Calling variants: {wildcards.population}" shell: @@ -198,31 +198,26 @@ rule genome_link: fi """ -if genome_zip: - rule genome_compressed_faidx: - input: - f"Genome/{bn}" - output: - gzi = f"Genome/{bn}.gzi", - fai = f"Genome/{bn}.fai" - log: - f"Genome/{bn}.faidx.gzi.log" - message: - "Indexing {input}" - shell: - "samtools faidx --gzi-idx {output.gzi} --fai-idx {output.fai} {input} 2> {log}" -else: - rule genome_faidx: - input: - f"Genome/{bn}" - output: - f"Genome/{bn}.fai" - log: - f"Genome/{bn}.faidx.log" - message: - "Indexing {input}" - shell: - "samtools faidx --fai-idx {output} {input} 2> {log}" +rule genome_faidx: + input: + f"Genome/{bn}" + 
output: + fai = f"Genome/{bn}.fai", + gzi = f"Genome/{bn}.gzi" if genome_zip else [] + log: + f"Genome/{bn}.faidx.gzi.log" + params: + genome_zip + message: + "Indexing {input}" + shell: + """ + if [ "{params}" = "True" ]; then + samtools faidx --gzi-idx {output.gzi} --fai-idx {output.fai} {input} 2> {log} + else + samtools faidx --fai-idx {output.fai} {input} 2> {log} + fi + """ rule report: input: @@ -231,7 +226,7 @@ rule report: output: outdir + "/reports/{population}.naibr.html" conda: - os.getcwd() + "/harpyenvs/r-env.yaml" + os.getcwd() + "/.harpy_envs/r-env.yaml" message: "Creating report: {wildcards.population}" script: @@ -244,7 +239,7 @@ rule report_pop: output: outdir + "/reports/naibr.pop.summary.html" conda: - os.getcwd() + "/harpyenvs/r-env.yaml" + os.getcwd() + "/.harpy_envs/r-env.yaml" message: "Creating summary report" script: diff --git a/workflow/rules/sv-naibr.smk b/workflow/rules/sv-naibr.smk index f55c19818..61285caab 100644 --- a/workflow/rules/sv-naibr.smk +++ b/workflow/rules/sv-naibr.smk @@ -9,7 +9,7 @@ samplenames = config["samplenames"] extra = config.get("extra", "") genomefile = config["genomefile"] molecule_distance = config["molecule_distance"] -outdir = "Variants/naibr" +outdir = config["output_directory"] skipreports = config["skipreports"] bn = os.path.basename(genomefile) genome_zip = True if (bn.endswith(".gz") or bn.endswith(".GZ")) else False @@ -26,7 +26,7 @@ def process_args(args): "k" : 3 } if args != "": - words = [i for i in re.split("\s|=", args) if len(i) > 0] + words = [i for i in re.split(r"\s|=", args) if len(i) > 0] for i in zip(words[::2], words[1::2]): argsDict[i[0]] = i[1] return argsDict @@ -80,7 +80,7 @@ rule create_config: with open(output[0], "w") as conf: _ = conf.write(f"bam_file={input[0]}\n") _ = conf.write(f"prefix={params[0]}\n") - _ = conf.write(f"outdir=Variants/naibr/{params[0]}\n") + _ = conf.write(f"outdir={outdir}/{params[0]}\n") _ = conf.write(f"threads={params[1]}\n") for i in argdict: _ = conf.write(f"{i}={argdict[i]}\n") @@ -99,7 +99,7 @@ rule call_sv: threads: min(10, workflow.cores) conda: - os.getcwd() + "/harpyenvs/variants.sv.yaml" + os.getcwd() + "/.harpy_envs/variants.sv.yaml" message: "Calling variants: {wildcards.sample}" shell: @@ -139,40 +139,32 @@ rule genome_link: if (file {input} | grep -q compressed ) ;then # is regular gzipped, needs to be BGzipped zcat {input} | bgzip -c > {output} - elif (file {input} | grep -q BGZF ); then - # is bgzipped, just linked - ln -sr {input} {output} else - # isn't compressed, just linked + # if BZgipped or isn't compressed, just linked ln -sr {input} {output} fi """ -if genome_zip: - rule genome_compressed_faidx: - input: - f"Genome/{bn}" - output: - gzi = f"Genome/{bn}.gzi", - fai = f"Genome/{bn}.fai" - log: - f"Genome/{bn}.faidx.gzi.log" - message: - "Indexing {input}" - shell: - "samtools faidx --gzi-idx {output.gzi} --fai-idx {output.fai} {input} 2> {log}" -else: - rule genome_faidx: - input: - f"Genome/{bn}" - output: - f"Genome/{bn}.fai" - log: - f"Genome/{bn}.faidx.log" - message: - "Indexing {input}" - shell: - "samtools faidx --fai-idx {output} {input} 2> {log}" +rule genome_faidx: + input: + f"Genome/{bn}" + output: + fai = f"Genome/{bn}.fai", + gzi = f"Genome/{bn}.gzi" if genome_zip else [] + log: + f"Genome/{bn}.faidx.gzi.log" + params: + genome_zip + message: + "Indexing {input}" + shell: + """ + if [ "{params}" = "True" ]; then + samtools faidx --gzi-idx {output.gzi} --fai-idx {output.fai} {input} 2> {log} + else + samtools faidx --fai-idx {output.fai} 
{input} 2> {log} + fi + """ rule report: input: @@ -181,7 +173,7 @@ rule report: output: outdir + "/reports/{sample}.naibr.html" conda: - os.getcwd() + "/harpyenvs/r-env.yaml" + os.getcwd() + "/.harpy_envs/r-env.yaml" message: "Creating report: {wildcards.sample}" script: @@ -215,7 +207,8 @@ if not skipreports: rule all: default_target: True - input: + input: + results message: "Checking for expected workflow output" shell: diff --git a/workflow/scripts/assignMI.py b/workflow/scripts/assignMI.py index 4fb3b9589..b9af6130e 100755 --- a/workflow/scripts/assignMI.py +++ b/workflow/scripts/assignMI.py @@ -4,28 +4,28 @@ import os import sys import pysam -import argparse - -parser = argparse.ArgumentParser( - prog = 'assignMI.py', - description = - """ - Assign an MI:i: (Molecular Identifier) tag to each barcoded - record based on a molecular distance cutoff. Unmapped records - are discarded in the output. Records without a BX:Z: tag or - with an invalid barcode (00 as one of its segments) are presevered - but are not assigned an MI:i tag. Input file MUST BE COORDINATE SORTED. - """, - usage = "assignMI.py -c cutoff -i input.bam -o output.bam", - exit_on_error = False - ) -parser.add_argument('-c','--cutoff', type=int, default = 100000, help = "Distance in base pairs at which alignments with the same barcode should be considered different molecules. (default: 100000)") -parser.add_argument('-i', '--input', help = "Input coordinate-sorted bam/sam file. If bam, a matching index file should be in the same directory.") -parser.add_argument('-o', '--output', help = "Output bam file. Will also create an index file.") - -if len(sys.argv) == 1: - parser.print_help(sys.stderr) - sys.exit(1) +#import argparse + +#parser = argparse.ArgumentParser( +# prog = 'assignMI.py', +# description = +# """ +# Assign an MI:i: (Molecular Identifier) tag to each barcoded +# record based on a molecular distance cutoff. Unmapped records +# are discarded in the output. Records without a BX:Z: tag or +# with an invalid barcode (00 as one of its segments) are presevered +# but are not assigned an MI:i tag. Input file MUST BE COORDINATE SORTED. +# """, +# usage = "assignMI.py -c cutoff -i input.bam -o output.bam", +# exit_on_error = False +# ) +#parser.add_argument('-c','--cutoff', type=int, default = 100000, help = "Distance in base pairs at which alignments with the same barcode should be considered different molecules. (default: 100000)") +#parser.add_argument('-i', '--input', help = "Input coordinate-sorted bam/sam file. If bam, a matching index file should be in the same directory.") +#parser.add_argument('-o', '--output', help = "Output bam file. 
Will also create an index file.") +# +#if len(sys.argv) == 1: +# parser.print_help(sys.stderr) +# sys.exit(1) def write_validbx(bam, alnrecord, molID): ''' @@ -98,8 +98,8 @@ def write_missingbx(bam, alnrecord): # write record to output file bam.write(alnrecord) -args = parser.parse_args() - +#args = parser.parse_args() +bam_input = snakemake.input[0] # initialize the dict d = dict() # chromlast keeps track of the last chromosome so we can @@ -108,21 +108,21 @@ def write_missingbx(bam, alnrecord): # MI is the name of the current molecule, starting a 1 (0+1) MI = 0 -if os.path.exists(args.input) and args.input.lower().endswith(".sam"): - alnfile = pysam.AlignmentFile(args.input) -elif os.path.exists(args.input) and args.input.lower().endswith(".bam"): - if os.path.exists(args.input + ".bai"): - alnfile = pysam.AlignmentFile(args.input) +if os.path.exists(bam_input) and bam_input.lower().endswith(".sam"): + alnfile = pysam.AlignmentFile(bam_input) +elif os.path.exists(bam_input) and bam_input.lower().endswith(".bam"): + if os.path.exists(bam_input + ".bai"): + alnfile = pysam.AlignmentFile(bam_input) else: - print(f"Error: {args.input} requires a matching {args.input}.bai index file, but one wasn\'t found.", file = sys.stderr) + print(f"Error: {bam_input} requires a matching {bam_input}.bai index file, but one wasn\'t found.", file = sys.stderr) exit(1) else: - print(f"Error: {args.input} not found", file = sys.stderr) + print(f"Error: {bam_input} not found", file = sys.stderr) exit(1) # iniitalize output file #alnfile = pysam.AlignmentFile("/home/pdimens/Documents/harpy/test/bam/sample1.bam") -outfile = pysam.AlignmentFile(args.output, "wb", template = alnfile) +outfile = pysam.AlignmentFile(snakemake.output[0], "wb", template = alnfile) #outfile = pysam.AlignmentFile("/home/pdimens/Documents/harpy/test/bam/test.bam", "w", template = alnfile) for record in alnfile.fetch(): @@ -188,7 +188,7 @@ def write_missingbx(bam, alnrecord): # if the distance between alignments is > cutoff, it's a different molecule # so we'll +1 the suffix of the original barcode and relabel this one as # BX + suffix. Since it's a new entry, we initialize it and move on - if dist > args.cutoff: + if dist > snakemake.params[0]: # increment MI b/c it's a new molecule MI += 1 # increment original barcode's suffix @@ -220,7 +220,7 @@ def write_missingbx(bam, alnrecord): outfile.close() # index the output file -pysam.index(args.output) +pysam.index(snakemake.output[0]) # exit gracefully exit(0) \ No newline at end of file diff --git a/workflow/scripts/bxStats.old.py b/workflow/scripts/bxStats.old.py deleted file mode 100755 index bd2001ad3..000000000 --- a/workflow/scripts/bxStats.old.py +++ /dev/null @@ -1,143 +0,0 @@ -#! /usr/bin/env python3 - -import re -import sys -import pysam -import argparse - -parser = argparse.ArgumentParser( - prog = 'bxStats.py', - description = 'Calculate BX molecule length and reads per molecule from BAM file.', - usage = "bxStats.py input.bam -c cutoff > output.bxstats", - exit_on_error = False - ) -parser.add_argument('input', help = "Input bam/sam file. 
If bam, a matching index file should be in the same directory.") -parser.add_argument('-c','--cutoff', type=int, default = 100000, help = "Distance in base pairs at which alignments with the same barcode should be considered different molecules.") - -if len(sys.argv) == 1: - parser.print_help(sys.stderr) - sys.exit(1) - -args = parser.parse_args() - -d = dict() -chromlast = False -alnfile = pysam.AlignmentFile(args.input) - -# define write function -# it will only be called when the current alignment's chromosome doesn't -# match the chromosome from the previous alignment -def writestats(x,chr): - for bx in x: - x[bx]["inferred"] = x[bx]["end"] - x[bx]["start"] - if x[bx]["mindist"] < 0: - x[bx]["mindist"] = 0 - outtext = f"{chr}\t{bx}\t" + "\t".join([str(x[bx][i]) for i in ["n", "start","end", "inferred", "bp", "mindist"]]) - print(outtext, file = sys.stdout) - -print("contig\tbx\treads\tstart\tend\tlength_inferred\taligned_bp\tmindist", file = sys.stdout) - -for read in alnfile.fetch(): - chrm = read.reference_name - bp = read.query_alignment_length - # check if the current chromosome is different from the previous one - # if so, print the dict to file and empty it (a consideration for RAM usage) - if chromlast != False and chrm != chromlast: - writestats(d, chromlast) - d = dict() - if read.is_duplicate or read.is_unmapped: - chromlast = chrm - continue - try: - bx = read.get_tag("BX") - validBX = True - # do a regex search to find X00 pattern in the BX - if re.search("[ABCD]0{2,4}", bx): - # if found, invalid - bx = "invalidBX" - validBX = False - except: - # There is no bx tag - bx = "noBX" - validBX = False - - aln = read.get_blocks() - if not aln: - # unaligned, skip - chromlast = chrm - continue - - if validBX: - # logic to accommodate split reads - # start position of first alignment - pos_start = aln[0][0] - # end position of last alignment - pos_end = aln[-1][1] - else: - pos_start = 0 - pos_end = 0 - - # create bx entry if it's not present - if bx not in d.keys(): - d[bx] = { - "start": pos_start, - "end": pos_end, - "bp": bp, - "n": 1, - "lastpos" : pos_end, - "mindist" : -1, - "current_suffix": 0 - } - chromlast = chrm - continue - - # if invalid/absent BX, skip the distance stuff - if bx in ["noBX", "invalidBX"]: - chromlast = chrm - continue - - # store the original barcode as `orig` b/c we might need to suffix it - orig = bx - # if there is a suffix, append it to the barcode name - if d[orig]["current_suffix"] > 0: - bx = orig + "." + str(d[orig]["current_suffix"]) - - # distance from last alignment = current aln start - previous aln end - dist = pos_start - d[bx]["lastpos"] - # if the distance between alignments is > cutoff, it's a different molecule - # so we'll +1 the suffix of the original barcode and relabel this one as - # BX + suffix. Since it's a new entry, we initialize it and move on - if dist > args.cutoff: - d[orig]["current_suffix"] += 1 - bx = orig + "." 
+ str(d[orig]["current_suffix"]) - d[bx] = { - "start": pos_start, - "end": pos_end, - "bp": bp, - "n": 1, - "lastpos" : pos_end, - "mindist" : -1, - "current_suffix": 0 - } - chromlast = chrm - continue - # only calculate the minimum distance between alignments - # if it's a forward read or an unpaired reverse read - if read.is_forward or (read.is_reverse and not read.is_paired): - if dist < d[bx]["mindist"] or d[bx]["mindist"] < 0: - d[bx]["mindist"] = dist - - # update the basic alignment info of the barcode - d[bx]["bp"] += bp - d[bx]["n"] += 1 - chromlast = chrm - - # only if low < currentlow or high > currenthigh - if pos_start < d[bx]["start"]: - d[bx]["start"] = pos_start - if pos_end > d[bx]["end"]: - d[bx]["end"] = pos_end - - if read.is_reverse or (read.is_forward and not read.is_paired): - # set the last position to be the end of current alignment - d[bx]["lastpos"] = pos_end \ No newline at end of file diff --git a/workflow/scripts/bxStats.py b/workflow/scripts/bxStats.py index 6834d5912..9a2aec3d0 100755 --- a/workflow/scripts/bxStats.py +++ b/workflow/scripts/bxStats.py @@ -2,26 +2,28 @@ import re import sys +import gzip import pysam -import argparse - -parser = argparse.ArgumentParser( - prog = 'bxStats.py', - description = 'Calculate BX molecule length and reads per molecule from BAM file.', - usage = "bxStats.py input.bam > output.bxstats", - exit_on_error = False - ) -parser.add_argument('input', help = "Input bam/sam file. If bam, a matching index file should be in the same directory.") - -if len(sys.argv) == 1: - parser.print_help(sys.stderr) - sys.exit(1) - -args = parser.parse_args() +#import argparse + +#parser = argparse.ArgumentParser( +# prog = 'bxStats.py', +# description = 'Calculate BX molecule length and reads per molecule from BAM file.', +# usage = "bxStats.py input.bam > output.bxstats", +# exit_on_error = False +# ) +#parser.add_argument('input', help = "Input bam/sam file. 
If bam, a matching index file should be in the same directory.") +# +#if len(sys.argv) == 1: +# parser.print_help(sys.stderr) +# sys.exit(1) +# +#args = parser.parse_args() d = dict() chromlast = False -alnfile = pysam.AlignmentFile(args.input) +alnfile = pysam.AlignmentFile(snakemake.input[0]) +outfile = gzip.open(snakemake.output[0], "wb") # define write function # it will only be called when the current alignment's chromosome doesn't @@ -32,9 +34,9 @@ def writestats(x,chr): if x[mi]["mindist"] < 0: x[mi]["mindist"] = 0 outtext = f"{chr}\t{mi}\t" + "\t".join([str(x[mi][i]) for i in ["n", "start","end", "inferred", "bp", "mindist"]]) - print(outtext, file = sys.stdout) + outfile.write(outtext.encode() + b"\n") -print("contig\tmolecule\treads\tstart\tend\tlength_inferred\taligned_bp\tmindist", file = sys.stdout) +outfile.write(b"contig\tmolecule\treads\tstart\tend\tlength_inferred\taligned_bp\tmindist\n") for read in alnfile.fetch(): chrm = read.reference_name @@ -127,4 +129,6 @@ def writestats(x,chr): if read.is_reverse or (read.is_forward and not read.is_paired): # set the last position to be the end of current alignment - d[mi]["lastpos"] = pos_end \ No newline at end of file + d[mi]["lastpos"] = pos_end + +outfile.close() \ No newline at end of file diff --git a/workflow/scripts/checkBAM.py b/workflow/scripts/checkBAM.py index 559af82b8..967e07fb4 100755 --- a/workflow/scripts/checkBAM.py +++ b/workflow/scripts/checkBAM.py @@ -4,24 +4,24 @@ import sys import re import os.path -import argparse +#import argparse -parser = argparse.ArgumentParser( - prog = 'checkBAM.py', - description = 'Do a haplotag format validity check on a BAM file.', - usage = "checkBAM.py bamfile", - exit_on_error = False - ) +#parser = argparse.ArgumentParser( +# prog = 'checkBAM.py', +# description = 'Do a haplotag format validity check on a BAM file.', +# usage = "checkBAM.py bamfile", +# exit_on_error = False +# ) -parser.add_argument("bamfile", help = "Input BAM file. Must have corresponding index (.bai) file present.") +#parser.add_argument("bamfile", help = "Input BAM file. 
Must have corresponding index (.bai) file present.") +# +#if len(sys.argv) == 1: +# parser.print_help(sys.stderr) +# sys.exit(1) +# +#args = parser.parse_args() -if len(sys.argv) == 1: - parser.print_help(sys.stderr) - sys.exit(1) - -args = parser.parse_args() - -bam_in = args.bamfile +bam_in = snakemake.input[0] # regex for EXACTLY AXXCXXBXXDXX haplotag = re.compile('^A[0-9][0-9]C[0-9][0-9]B[0-9][0-9]D[0-9][0-9]$') @@ -64,4 +64,5 @@ values = [str(i) for i in [os.path.basename(bam_in), nameMismatch, n_reads, noMI, noBX, badBX, bxNotLast]] -print("\t".join(values), file = sys.stdout) \ No newline at end of file +with open(snakemake.output[0], "w") as fout: + print("\t".join(values), file = fout) \ No newline at end of file diff --git a/workflow/scripts/checkFASTQ.py b/workflow/scripts/checkFASTQ.py index 61a36a359..bb02d6ca8 100755 --- a/workflow/scripts/checkFASTQ.py +++ b/workflow/scripts/checkFASTQ.py @@ -4,24 +4,25 @@ import sys import re import os.path -import argparse - -parser = argparse.ArgumentParser( - prog = 'checkFASTQ.py', - description = 'Do a haplotag format validity check on a FASTQ file.', - usage = "checkFASTQ.py fastqfile", - exit_on_error = False - ) - -parser.add_argument("fastqfile", help = "Input FASTQ file.") - -if len(sys.argv) == 1: - parser.print_help(sys.stderr) - sys.exit(1) - -args = parser.parse_args() - -fq_in = args.fastqfile +#import argparse + +#parser = argparse.ArgumentParser( +# prog = 'checkFASTQ.py', +# description = 'Do a haplotag format validity check on a FASTQ file.', +# usage = "checkFASTQ.py fastqfile", +# exit_on_error = False +# ) +# +#parser.add_argument("fastqfile", help = "Input FASTQ file.") + +#if len(sys.argv) == 1: +# parser.print_help(sys.stderr) +# sys.exit(1) +# +#args = parser.parse_args() + +#fq_in = args.fastqfile +fq_in = snakemake.input[0] #bxz = re.compile('BX:Z:') samspec = re.compile('[A-Z][A-Z]:[AifZHB]:') @@ -56,4 +57,5 @@ noBX += 1 values = [str(i) for i in [os.path.basename(fq_in), n_reads, noBX, badBX, badSamSpec, bxNotLast]] -print("\t".join(values), file = sys.stdout) \ No newline at end of file +with open(snakemake.output[0], "w") as fout: + print("\t".join(values), file = fout) \ No newline at end of file diff --git a/workflow/scripts/countBX.py b/workflow/scripts/countBX.py index 9daa101ba..ee3487584 100755 --- a/workflow/scripts/countBX.py +++ b/workflow/scripts/countBX.py @@ -2,25 +2,25 @@ import pysam import re -import argparse +#import argparse import sys -parser = argparse.ArgumentParser( - prog = 'countBX.py', - description = 'Count the number of valid Haplotag BX tags in a FASTQ file.', - usage = "countBX.py fastqfile", - exit_on_error = False - ) - -parser.add_argument("fastqfile", help = "Input FASTQ file.") - -# parser.add_argument("outfile", help = "Output bam file. A corresponding index file will be created for it.") - -if len(sys.argv) == 1: - parser.print_help(sys.stderr) - sys.exit(1) - -args = parser.parse_args() +#parser = argparse.ArgumentParser( +# prog = 'countBX.py', +# description = 'Count the number of valid Haplotag BX tags in a FASTQ file.', +# usage = "countBX.py fastqfile", +# exit_on_error = False +# ) +# +#parser.add_argument("fastqfile", help = "Input FASTQ file.") +# +## parser.add_argument("outfile", help = "Output bam file. 
A corresponding index file will be created for it.") +# +#if len(sys.argv) == 1: +# parser.print_help(sys.stderr) +# sys.exit(1) +# +#args = parser.parse_args() n_reads = 0 n_bx = 0 @@ -36,7 +36,7 @@ "C" : 0, "D" : 0 } -with pysam.FastxFile(args.fastqfile) as fh: +with pysam.FastxFile(snakemake.input[0]) as fh: for entry in fh: n_reads += 1 comments = entry.comment.split() @@ -58,11 +58,12 @@ continue n_valid += 1 -print(f"totalReads\t{n_reads}") -print(f"bxTagCount\t{n_bx}") -print(f"bxValid\t{n_valid}") -print(f"bxInvalid\t{n_bx - n_valid}") -print("A00\t",str(inv_dict["A"])) -print("C00\t",str(inv_dict["C"])) -print("B00\t",str(inv_dict["B"])) -print("D00\t",str(inv_dict["D"])) \ No newline at end of file +with open(snakemake.output[0], "w") as fout: + print(f"totalReads\t{n_reads}", file = fout) + print(f"bxTagCount\t{n_bx}", file = fout) + print(f"bxValid\t{n_valid}", file = fout) + print(f"bxInvalid\t{n_bx - n_valid}", file = fout) + print("A00\t",str(inv_dict["A"]), file = fout) + print("C00\t",str(inv_dict["C"]), file = fout) + print("B00\t",str(inv_dict["B"]), file = fout) + print("D00\t",str(inv_dict["D"]), file = fout) \ No newline at end of file
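

Every script touched above (assignMI.py, bxStats.py, checkBAM.py, checkFASTQ.py, countBX.py) gets the same conversion: the argparse interface is commented out and the script instead reads its inputs, outputs, and parameters from the snakemake object that Snakemake injects when a rule invokes the file through its script: directive. A minimal sketch of that pairing, with hypothetical rule, path, and parameter names:

# Snakefile side (hypothetical rule) -- the script: directive injects a
# `snakemake` object into the executed script; nothing is imported for it.
rule count_barcodes:
    input:
        "fastq/{sample}.R1.fq.gz"
    output:
        "results/{sample}.counts.txt"
    params:
        cutoff = 100000
    script:
        "scripts/my_script.py"

# scripts/my_script.py side -- indexes and attribute names mirror the rule.
with open(snakemake.output[0], "w") as fout:
    print(f"input={snakemake.input[0]}", file = fout)
    print(f"cutoff={snakemake.params.cutoff}", file = fout)  # or snakemake.params[0]

The cost of this conversion, and presumably why the argparse blocks were commented out rather than deleted, is that the scripts can no longer be run standalone from a shell: outside of a script: directive the snakemake object does not exist.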