diff --git a/.deprecated/align-minimap.smk b/.deprecated/align-minimap.smk index 478b851b2..3b18512a4 100644 --- a/.deprecated/align-minimap.smk +++ b/.deprecated/align-minimap.smk @@ -54,7 +54,7 @@ def get_fq(wildcards): r = re.compile(fr".*/({re.escape(wildcards.sample)}){bn_r}", flags = re.IGNORECASE) return sorted(list(filter(r.match, fqlist))[:2]) -rule genome_setup: +rule setup_genome: input: genomefile output: @@ -73,7 +73,7 @@ rule genome_setup: fi """ -rule genome_faidx: +rule faidx_genome: input: f"Genome/{bn}" output: diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 01b0a7499..3fd74bab4 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -106,7 +106,7 @@ jobs: id: singularity shell: micromamba-shell {0} if: ${{ needs.changes.outputs.modules == 'true' }} - run: harpy qc --skipreports -a test/fastq/sample1.*.fq.gz + run: harpy qc --skipreports --quiet test/fastq/sample1.*.fq.gz - name: Create Singularity Artifact if: ${{ steps.singularity.outcome == 'success' }} uses: actions/upload-artifact@v4 @@ -148,7 +148,7 @@ jobs: path: .snakemake/singularity - name: harpy demultiplex shell: micromamba-shell {0} - run: harpy demultiplex gen1 --schema test/demux/samples.schema test/demux/Undetermined_S0_L004_R* test/demux/Undetermined_S0_L004_I* + run: harpy demultiplex gen1 --quiet --schema test/demux/samples.schema test/demux/Undetermined_S0_L004_R* test/demux/Undetermined_S0_L004_I* preflight: needs: [changes, pkgbuild] @@ -190,7 +190,7 @@ jobs: - name: test preflight bam if: always() shell: micromamba-shell {0} - run: harpy preflight bam test/bam + run: harpy preflight bam --quiet test/bam qc: needs: [changes, pkgbuild] @@ -228,10 +228,10 @@ jobs: path: .snakemake/singularity - name: harpy qc shell: micromamba-shell {0} - run: harpy qc -x "--trim_poly_g" test/fastq + run: harpy qc -x "--trim_poly_g" --quiet test/fastq - name: harpy qc all options shell: micromamba-shell {0} - run: harpy qc -a -d -c test/fastq + run: harpy qc -a -d -c --quiet test/fastq deconvolve: needs: [changes, pkgbuild] if: ${{ needs.changes.outputs.deconvolve == 'true' && needs.pkgbuild.result == 'success' }} @@ -268,7 +268,7 @@ jobs: path: .snakemake/singularity - name: harpy deconvolve shell: micromamba-shell {0} - run: harpy deconvolve test/fastq + run: harpy deconvolve --quiet test/fastq bwa: needs: [changes, pkgbuild] if: ${{ needs.changes.outputs.bwa == 'true' && needs.pkgbuild.result == 'success' }} @@ -305,7 +305,7 @@ jobs: path: .snakemake/singularity - name: test bwa shell: micromamba-shell {0} - run: harpy align bwa -g test/genome/genome.fasta.gz -x "-A 2" test/fastq + run: harpy align bwa --quiet -g test/genome/genome.fasta.gz -x "-A 2" test/fastq ema: needs: [changes, pkgbuild] @@ -343,7 +343,7 @@ jobs: path: .snakemake/singularity - name: test ema shell: micromamba-shell {0} - run: harpy align ema --ema-bins 150 -g test/genome/genome.fasta.gz -x "-d" test/fastq + run: harpy align ema --quiet --ema-bins 150 -g test/genome/genome.fasta.gz -x "-d" test/fastq strobe: needs: [changes, pkgbuild] @@ -381,7 +381,7 @@ jobs: path: .snakemake/singularity - name: test strobealign shell: micromamba-shell {0} - run: harpy align strobe -l 125 -g test/genome/genome.fasta.gz test/fastq + run: harpy align strobe --quiet -l 125 -g test/genome/genome.fasta.gz test/fastq mpileup: needs: [changes, pkgbuild] @@ -419,10 +419,10 @@ jobs: path: .snakemake/singularity - name: snp mpileup shell: micromamba-shell {0} - run: harpy snp mpileup -r test/positions.bed -g 
test/genome/genome.fasta.gz -x "--ignore-RG" test/bam + run: harpy snp mpileup --quiet -r test/positions.bed -g test/genome/genome.fasta.gz -x "--ignore-RG" test/bam - name: snp mpileup-pop shell: micromamba-shell {0} - run: harpy snp mpileup -r test/positions.bed -o SNP/poptest -g test/genome/genome.fasta.gz -p test/samples.groups test/bam + run: harpy snp mpileup --quiet -r test/positions.bed -o SNP/poptest -g test/genome/genome.fasta.gz -p test/samples.groups test/bam freebayes: needs: [changes, pkgbuild] @@ -460,10 +460,10 @@ jobs: path: .snakemake/singularity - name: snp freebayes shell: micromamba-shell {0} - run: harpy snp freebayes -r test/positions.bed -g test/genome/genome.fasta.gz -x "-g 200" test/bam + run: harpy snp freebayes --quiet -r test/positions.bed -g test/genome/genome.fasta.gz -x "-g 200" test/bam - name: snp freebayes-pop shell: micromamba-shell {0} - run: harpy snp freebayes -r test/positions.bed -o SNP/poptest -g test/genome/genome.fasta.gz -p test/samples.groups test/bam + run: harpy snp freebayes --quiet -r test/positions.bed -o SNP/poptest -g test/genome/genome.fasta.gz -p test/samples.groups test/bam impute: needs: [changes, pkgbuild] @@ -501,11 +501,11 @@ jobs: path: .snakemake/singularity - name: impute shell: micromamba-shell {0} - run: harpy impute --vcf test/vcf/test.bcf -p test/stitch.params test/bam + run: harpy impute --quiet --vcf test/vcf/test.bcf -p test/stitch.params test/bam - name: impute from vcf shell: micromamba-shell {0} if: always() - run: harpy impute --vcf-samples -o vcfImpute --vcf test/vcf/test.bcf -p test/stitch.params test/bam + run: harpy impute --quiet --vcf-samples -o vcfImpute --vcf test/vcf/test.bcf -p test/stitch.params test/bam phase: needs: [changes, pkgbuild] @@ -543,17 +543,17 @@ jobs: path: .snakemake/singularity - name: phase shell: micromamba-shell {0} - run: harpy phase --vcf test/vcf/test.bcf -x "--max_iter 10001" test/bam + run: harpy phase --quiet --vcf test/vcf/test.bcf -x "--max_iter 10001" test/bam - name: phase with indels shell: micromamba-shell {0} if: always() - run: harpy phase --vcf test/vcf/test.bcf -o phaseindel -g test/genome/genome.fasta.gz test/bam + run: harpy phase --quiet --vcf test/vcf/test.bcf -o phaseindel -g test/genome/genome.fasta.gz test/bam - name: phase from vcf shell: micromamba-shell {0} if: always() run: | cp test/bam/sample1.bam test/bam/pineapple.bam && rename_bam -d test/bam/pineapple.bam pineapple1 - harpy phase --vcf-samples -o phasevcf --vcf test/vcf/test.bcf test/bam + harpy phase --quiet --vcf-samples -o phasevcf --vcf test/vcf/test.bcf test/bam leviathan: needs: [changes, pkgbuild] @@ -591,12 +591,12 @@ jobs: path: .snakemake/singularity - name: leviathan shell: micromamba-shell {0} - run: harpy sv leviathan -s 100 -b 1 -g test/genome/genome.fasta.gz -x "-M 2002" test/bam + run: harpy sv leviathan --quiet -s 100 -b 1 -g test/genome/genome.fasta.gz -x "-M 2002" test/bam continue-on-error: true - name: leviathan-pop if: always() shell: micromamba-shell {0} - run: harpy sv leviathan -s 100 -b 1 -g test/genome/genome.fasta.gz -o SV/leviathanpop -p test/samples.groups test/bam + run: harpy sv leviathan --quiet -s 100 -b 1 -g test/genome/genome.fasta.gz -o SV/leviathanpop -p test/samples.groups test/bam naibr: needs: [changes, pkgbuild] @@ -634,20 +634,20 @@ jobs: path: .snakemake/singularity - name: naibr shell: micromamba-shell {0} - run: harpy sv naibr -g test/genome/genome.fasta.gz -o SV/naibr -x "-min_sv 5000" test/bam_phased && rm -r Genome + run: harpy sv naibr --quiet -g 
test/genome/genome.fasta.gz -o SV/naibr -x "-min_sv 5000" test/bam_phased && rm -r Genome - name: naibr pop if: always() shell: micromamba-shell {0} - run: harpy sv naibr -g test/genome/genome.fasta.gz -o SV/pop -p test/samples.groups test/bam_phased && rm -r Genome + run: harpy sv naibr --quiet -g test/genome/genome.fasta.gz -o SV/pop -p test/samples.groups test/bam_phased && rm -r Genome - name: naibr with phasing if: always() shell: micromamba-shell {0} run: | - harpy sv naibr -g test/genome/genome.fasta.gz -o SV/phase -v test/vcf/test.phased.bcf test/bam && rm -r Genome + harpy sv naibr --quiet -g test/genome/genome.fasta.gz -o SV/phase -v test/vcf/test.phased.bcf test/bam && rm -r Genome - name: naibr pop with phasing if: always() shell: micromamba-shell {0} - run: harpy sv naibr -g test/genome/genome.fasta.gz -o SV/phasepop -v test/vcf/test.phased.bcf -p test/samples.groups test/bam && rm -r Genome + run: harpy sv naibr --quiet -g test/genome/genome.fasta.gz -o SV/phasepop -v test/vcf/test.phased.bcf -p test/samples.groups test/bam && rm -r Genome simulate_variants: @@ -687,26 +687,26 @@ jobs: - name: simulate random snps/indels shell: micromamba-shell {0} run: | - harpy simulate snpindel --snp-count 10 --indel-count 10 -z 0.5 test/genome/genome.fasta.gz - harpy simulate snpindel --prefix Simulate/snpvcf --snp-vcf Simulate/snpindel/sim.snpindel.snp.hap1.vcf --indel-vcf Simulate/snpindel/sim.snpindel.indel.hap1.vcf test/genome/genome.fasta.gz + harpy simulate snpindel --quiet --snp-count 10 --indel-count 10 -z 0.5 test/genome/genome.fasta.gz + harpy simulate snpindel --quiet --prefix Simulate/snpvcf --snp-vcf Simulate/snpindel/sim.snpindel.snp.hap1.vcf --indel-vcf Simulate/snpindel/sim.snpindel.indel.hap1.vcf test/genome/genome.fasta.gz - name: simulate inversions shell: micromamba-shell {0} if: always() run: | - harpy simulate inversion --count 10 -z 0.5 test/genome/genome.fasta.gz - harpy simulate inversion --prefix Simulate/invvcf --vcf Simulate/inversion/sim.inversion.hap1.vcf test/genome/genome.fasta.gz + harpy simulate inversion --quiet --count 10 -z 0.5 test/genome/genome.fasta.gz + harpy simulate inversion --quiet --prefix Simulate/invvcf --vcf Simulate/inversion/sim.inversion.hap1.vcf test/genome/genome.fasta.gz - name: simulate cnv shell: micromamba-shell {0} if: always() run: | - harpy simulate cnv --count 10 -z 0.5 test/genome/genome.fasta.gz - harpy simulate cnv --prefix Simulate/cnvvcf --vcf Simulate/cnv/sim.cnv.hap1.vcf test/genome/genome.fasta.gz + harpy simulate cnv --quiet --count 10 -z 0.5 test/genome/genome.fasta.gz + harpy simulate cnv --quiet --prefix Simulate/cnvvcf --vcf Simulate/cnv/sim.cnv.hap1.vcf test/genome/genome.fasta.gz - name: simulate translocations shell: micromamba-shell {0} if: always() run: | - harpy simulate translocation --count 10 -z 0.5 test/genome/genome.fasta.gz - harpy simulate translocation --prefix Simulate/transvcf --vcf Simulate/translocation/sim.translocation.hap1.vcf test/genome/genome.fasta.gz + harpy simulate translocation --quiet --count 10 -z 0.5 test/genome/genome.fasta.gz + harpy simulate translocation --quiet --prefix Simulate/transvcf --vcf Simulate/translocation/sim.translocation.hap1.vcf test/genome/genome.fasta.gz simulate_linkedreads: needs: [changes, pkgbuild] @@ -744,7 +744,7 @@ jobs: path: .snakemake/singularity - name: simulate linked reads shell: micromamba-shell {0} - run: harpy simulate linkedreads -t 4 -n 2 -l 100 -p 50 test/genome/genome.fasta.gz test/genome/genome2.fasta.gz + run: harpy simulate linkedreads 
--quiet -t 4 -n 2 -l 100 -p 50 test/genome/genome.fasta.gz test/genome/genome2.fasta.gz extras: needs: [changes, pkgbuild] diff --git a/harpy/align.py b/harpy/align.py index 363fdc51f..a63e071bd 100644 --- a/harpy/align.py +++ b/harpy/align.py @@ -94,8 +94,6 @@ def bwa(inputs, output_dir, genome, depth_window, threads, keep_unmapped, extra_ command += f"--configfile {workflowdir}/config.yaml " if hpc: command += f"--workflow-profile {hpc} " - if quiet: - command += "--quiet all " if snakemake is not None: command += snakemake @@ -131,7 +129,7 @@ def bwa(inputs, output_dir, genome, depth_window, threads, keep_unmapped, extra_ generate_conda_deps() start_text = f"Samples: {sample_count}\nOutput Directory: {output_dir}\nLog: {sm_log}" - launch_snakemake(command, "align_bwa", start_text, output_dir, sm_log) + launch_snakemake(command, "align_bwa", start_text, output_dir, sm_log, quiet) @click.command(no_args_is_help = True, epilog = "See the documentation for more information: https://pdimens.github.io/harpy/modules/align/ema") @click.option('-x', '--extra-params', type = str, help = 'Additional ema align parameters, in quotes') @@ -172,8 +170,6 @@ def ema(inputs, output_dir, platform, whitelist, genome, depth_window, keep_unma command += f"--configfile {workflowdir}/config.yaml " if hpc: command += f"--workflow-profile {hpc} " - if quiet: - command += "--quiet all " if snakemake is not None: command += snakemake @@ -225,7 +221,7 @@ def ema(inputs, output_dir, platform, whitelist, genome, depth_window, keep_unma generate_conda_deps() start_text = f"Samples: {sample_count}\nPlatform: {platform}\nOutput Directory: {output_dir}/\nLog: {sm_log}" - launch_snakemake(command, "align_ema", start_text, output_dir, sm_log) + launch_snakemake(command, "align_ema", start_text, output_dir, sm_log, quiet) @click.command(no_args_is_help = True, epilog= "See the documentation for more information: https://pdimens.github.io/harpy/modules/align/minimap/") @click.option('-g', '--genome', type=click.Path(exists=True, dir_okay=False, readable=True), required = True, help = 'Genome assembly for read mapping') @@ -266,10 +262,8 @@ def strobe(inputs, output_dir, genome, read_length, keep_unmapped, depth_window, command += f"--configfile {workflowdir}/config.yaml " if hpc: command += f"--workflow-profile {hpc} " - if quiet: - command += "--quiet all " if snakemake is not None: - command += snakemake + command += snakemake os.makedirs(f"{workflowdir}/", exist_ok= True) fqlist, sample_count = parse_fastq_inputs(inputs) @@ -305,7 +299,7 @@ def strobe(inputs, output_dir, genome, read_length, keep_unmapped, depth_window, generate_conda_deps() start_text = f"Samples: {sample_count}\nOutput Directory: {output_dir}\nLog: {sm_log}" - launch_snakemake(command, "align_strobe", start_text, output_dir, sm_log) + launch_snakemake(command, "align_strobe", start_text, output_dir, sm_log, quiet) align.add_command(bwa) align.add_command(ema) diff --git a/harpy/deconvolve.py b/harpy/deconvolve.py index 0d04bd3a5..61062a1d8 100644 --- a/harpy/deconvolve.py +++ b/harpy/deconvolve.py @@ -51,8 +51,6 @@ def deconvolve(inputs, output_dir, kmer_length, window_size, density, dropout, t command += f"--configfile {workflowdir}/config.yaml " if hpc: command += f"--workflow-profile {hpc} " - if quiet: - command += "--quiet all " if snakemake is not None: command += snakemake @@ -79,4 +77,4 @@ def deconvolve(inputs, output_dir, kmer_length, window_size, density, dropout, t generate_conda_deps() start_text = f"Samples: {sample_count}\nOutput 
Directory: {output_dir}/\nLog: {sm_log}" - launch_snakemake(command, "deconvolve", start_text, output_dir, sm_log) + launch_snakemake(command, "deconvolve", start_text, output_dir, sm_log, quiet) diff --git a/harpy/demultiplex.py b/harpy/demultiplex.py index 8e4397aef..1d67df2c4 100644 --- a/harpy/demultiplex.py +++ b/harpy/demultiplex.py @@ -64,8 +64,6 @@ def gen1(r1_fq, r2_fq, i1_fq, i2_fq, output_dir, schema, threads, snakemake, ski command += f"--configfile {workflowdir}/config.yaml " if hpc: command += f"--workflow-profile {hpc} " - if quiet: - command += "--quiet all " if snakemake is not None: command += snakemake @@ -92,6 +90,6 @@ def gen1(r1_fq, r2_fq, i1_fq, i2_fq, output_dir, schema, threads, snakemake, ski generate_conda_deps() start_text = f"Haplotag Type: Generation I\nDemultiplex Schema: {schema}\nOutput Directory: {output_dir}\nLog: {sm_log}" - launch_snakemake(command, "demultiplex_gen1", start_text, output_dir, sm_log) + launch_snakemake(command, "demultiplex_gen1", start_text, output_dir, sm_log, quiet) demultiplex.add_command(gen1) diff --git a/harpy/helperfunctions.py b/harpy/helperfunctions.py index f3fb40b3b..55d08b6fd 100644 --- a/harpy/helperfunctions.py +++ b/harpy/helperfunctions.py @@ -5,11 +5,13 @@ import sys import glob import subprocess +from time import sleep from datetime import datetime from pathlib import Path from collections import Counter from rich import print as rprint from rich.progress import Progress, BarColumn, TextColumn, TimeElapsedColumn +from rich.console import Console from importlib_resources import files import harpy.scripts import harpy.reports @@ -74,47 +76,149 @@ def biallelic_contigs(vcf, workdir): else: with open(f"{workdir}/{vbn}.biallelic", "r", encoding="utf-8") as f: contigs = [line.rstrip() for line in f] - if len(contigs) == 0: print_error("No contigs with at least 2 biallelic SNPs identified. Cannot continue with imputation.") sys.exit(1) else: return contigs -def launch_snakemake(sm_args, workflow, starttext, outdir, sm_logfile): +def snakemake_log(outdir, workflow): + """Return a snakemake logfile name. Iterates logfile run number if one exists.""" + attempts = glob.glob(f"{outdir}/logs/snakemake/*.log") + if not attempts: + return f"{outdir}/logs/snakemake/{workflow}.run1." + datetime.now().strftime("%d_%m_%Y") + ".log" + increment = sorted([int(i.split(".")[1].replace("run","")) for i in attempts])[-1] + 1 + return f"{outdir}/logs/snakemake/{workflow}.run{increment}." + datetime.now().strftime("%d_%m_%Y") + ".log" + +def launch_snakemake(sm_args, workflow, starttext, outdir, sm_logfile, quiet): """launch snakemake with the given commands""" - print_onstart(starttext, workflow) + print_onstart(starttext, workflow.replace("_", " ")) try: with Progress( TextColumn("[progress.description]{task.description}"), - BarColumn(bar_width=50), + BarColumn(), TextColumn("[progress.percentage]{task.percentage:>3.0f}%"), TimeElapsedColumn(), - transient=True + transient=True, + disable=quiet ) as progress: # Add a task with a total value of 100 (representing 100%) - task = progress.add_task("Processing...", total=100) # Start a subprocess process = subprocess.Popen(sm_args.split(), stdout=subprocess.PIPE, stderr=subprocess.PIPE, text = True) - err = 0 + err = False + deps = False + # read up to the job summary, but break early if dependency text appears + while True: + output = process.stderr.readline() + return_code = process.poll() + if return_code == 1: + print_error("There is an error in the Snakefile. 
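Worth illustrating from the hunk above: the new `snakemake_log()` increments a run number based on existing log files. A minimal standalone sketch of that naming logic, assuming a hypothetical output directory `Align/bwa` that contains no other `.` characters (the run number is recovered with `i.split(".")[1]`):

```python
import glob
from datetime import datetime

def next_log(outdir: str, workflow: str) -> str:
    # mirrors snakemake_log(): run1 on the first call, otherwise highest run + 1
    attempts = glob.glob(f"{outdir}/logs/snakemake/*.log")
    if not attempts:
        return f"{outdir}/logs/snakemake/{workflow}.run1." + datetime.now().strftime("%d_%m_%Y") + ".log"
    increment = sorted(int(i.split(".")[1].replace("run", "")) for i in attempts)[-1] + 1
    return f"{outdir}/logs/snakemake/{workflow}.run{increment}." + datetime.now().strftime("%d_%m_%Y") + ".log"

# no prior logs   -> Align/bwa/logs/snakemake/align_bwa.run1.<dd_mm_YYYY>.log
# run1 log exists -> Align/bwa/logs/snakemake/align_bwa.run2.<dd_mm_YYYY>.log
print(next_log("Align/bwa", "align_bwa"))
```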
Try running the Snakefile independently to diagnose it. If you did not edit the Snakefile manually, it's probably a bug! Please submit an issue on GitHub: [bold]https://github.com/pdimens/harpy/issues") + sys.exit(1) + if output == '' and return_code is not None: + break + # print dependency text only once + if "Downloading and installing remote packages" in output: + deps = True + deploy_text = "[magenta]Downloading and installing workflow dependencies" + break + if "Pulling singularity image" in output: + deps = True + deploy_text = "[magenta]Downloading software container" + break + if output.startswith("Job stats:"): + # read and ignore the next two lines + process.stderr.readline() + process.stderr.readline() + break + # if dependency text present, print console log with spinner and read up to the job stats + if deps: + if not quiet: + console = Console() + with console.status(deploy_text, spinner = "point") as status: + while True: + output = process.stderr.readline() + if output == '' and process.poll() is not None: + break + sleep(2) + if output.startswith("Job stats:"): + # read and ignore the next two lines + process.stderr.readline() + process.stderr.readline() + break + else: + while True: + output = process.stderr.readline() + if output == '' and process.poll() is not None: + break + if output.startswith("Job stats:"): + # read and ignore the next two lines + process.stderr.readline() + process.stderr.readline() + break + job_inventory = {} + task_ids = {"total_progress" : progress.add_task("[bold blue]Total", total=100)} + # process the job summary + while True: + output = process.stderr.readline() + # stop parsing on "total" since it's always the last entry + if output.startswith("total"): + break + if output.startswith("workflow_summary"): + continue + try: + rule,count = output.split() + rule_desc = rule.replace("_", " ") + # rule : display_name, count, set of job_id's + job_inventory[rule] = [rule_desc, int(count), set()] + except ValueError: + pass + while True: output = process.stderr.readline() if output == '' and process.poll() is not None: break if output: - if err > 0: + # prioritizing printing the error and quitting + if err: if "Shutting down, this" in output: sys.exit(1) - rprint(f"[red]{output.strip()}") + rprint(f"[red]{output.strip()}", file = sys.stderr) if "Error in rule" in output: progress.stop() print_onerror(sm_logfile) - rprint(f"[yellow bold]{output.strip()}") - err += 1 - match = re.search(r"\(\d+%\)", output) + rprint(f"[yellow bold]{output.strip()}", file = sys.stderr) + err = True + # find the % progress text to update Total progress bar + match = re.search(r"\(\d+%\) done", output) if match: percent = int(re.sub(r'\D', '', match.group())) - progress.update(task, completed=percent) + progress.update(task_ids["total_progress"], completed=percent) + continue + # add new progress bar track if the rule doesn't have one yet + rulematch = re.search(r"rule\s\w+:", output) + if rulematch: + rule = rulematch.group().replace(":","").split()[-1] + if rule not in task_ids: + task_ids[rule] = progress.add_task(job_inventory[rule][0], total=job_inventory[rule][1]) + continue + # store the job id in the inventory so we can later look up which rule it's associated with + jobidmatch = re.search(r"jobid:\s\d+", string = output) + if jobidmatch: + job_id = int(re.search(r"\d+",output).group()) + # rule should be the most previous rule recorded + job_inventory[rule][2].add(job_id) + continue + # check which rule the job is associated with and update the corresponding 
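The progress tracking added to `launch_snakemake()` rests on a handful of regexes applied line by line to Snakemake's stderr. A self-contained sketch with fabricated log lines (the patterns are copied from the hunk; the inputs are invented for illustration):

```python
import re

lines = [
    "rule align:",
    "    jobid: 7",
    "Finished job 7.",
    "12 of 24 steps (50%) done",
]
for output in lines:
    if (m := re.search(r"\(\d+%\) done", output)):
        print("total progress:", int(re.sub(r"\D", "", m.group())))     # 50
    elif (m := re.search(r"rule\s\w+:", output)):
        print("new rule:", m.group().replace(":", "").split()[-1])      # align
    elif re.search(r"jobid:\s\d+", output):
        print("job id:", int(re.search(r"\d+", output).group()))        # 7
    elif re.search(r"Finished\sjob\s\d+", output):
        print("finished job:", int(re.search(r"\d+", output).group()))  # 7
```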
progress bar + finishmatch = re.search(r"Finished\sjob\s\d+", output) + if finishmatch: + completed = int(re.search(r"\d+", output).group()) + for job,details in job_inventory.items(): + if completed in details[2]: + progress.update(task_ids[job], advance = 1) + # remove the job to save memory. won't be seen again + details[2].discard(completed) + break + if process.returncode < 1: print_onsuccess(outdir) except KeyboardInterrupt: @@ -123,11 +227,3 @@ process.terminate() process.wait() sys.exit(1) - -def snakemake_log(outdir, workflow): - """Return a snakemake logfile name. Iterates logfile run number if one exists.""" - attempts = glob.glob(f"{outdir}/logs/snakemake/*.snakelog") - if not attempts: - return f"{outdir}/logs/snakemake/{workflow}.run1." + datetime.now().strftime("%d_%m_%Y") + ".snakelog" - increment = sorted([int(i.split(".")[1].replace("run","")) for i in attempts])[-1] + 1 - return f"{outdir}/logs/snakemake/{workflow}.run{increment}." + datetime.now().strftime("%d_%m_%Y") + ".snakelog" diff --git a/harpy/impute.py b/harpy/impute.py index 87e2fa042..a2400319e 100644 --- a/harpy/impute.py +++ b/harpy/impute.py @@ -60,8 +60,6 @@ def impute(inputs, output_dir, parameters, threads, vcf, vcf_samples, extra_para command += f"--configfile {workflowdir}/config.yaml " if hpc: command += f"--workflow-profile {hpc} " - if quiet: - command += "--quiet all " if snakemake is not None: command += snakemake @@ -101,4 +99,4 @@ def impute(inputs, output_dir, parameters, threads, vcf, vcf_samples, extra_para generate_conda_deps() start_text = f"Input VCF: {vcf}\nSamples in VCF: {len(samplenames)}\nAlignments Provided: {n}\nContigs with ≥2 Biallelic SNPs: {n_biallelic}\nOutput Directory: {output_dir}/\nSnakemake Log: {sm_log}" - launch_snakemake(command, "impute", start_text, output_dir, sm_log) + launch_snakemake(command, "impute", start_text, output_dir, sm_log, quiet) diff --git a/harpy/phase.py b/harpy/phase.py index b8b74836f..1f0d076fa 100644 --- a/harpy/phase.py +++ b/harpy/phase.py @@ -58,8 +58,6 @@ def phase(inputs, output_dir, vcf, threads, molecule_distance, prune_threshold, command += f"--configfile {workflowdir}/config.yaml " if hpc: command += f"--workflow-profile {hpc} " - if quiet: - command += "--quiet all " if snakemake is not None: command += snakemake @@ -99,4 +97,4 @@ def phase(inputs, output_dir, vcf, threads, molecule_distance, prune_threshold, generate_conda_deps() start_text = f"Input VCF: {vcf}\nSamples in VCF: {len(samplenames)}\nAlignments Provided: {n}\nOutput Directory: {output_dir}/\nLog: {sm_log}" - launch_snakemake(command, "phase", start_text, output_dir, sm_log) + launch_snakemake(command, "phase", start_text, output_dir, sm_log, quiet) diff --git a/harpy/preflight.py b/harpy/preflight.py index 5d08d043a..83a9590cb 100755 --- a/harpy/preflight.py +++ b/harpy/preflight.py @@ -62,8 +62,6 @@ def fastq(inputs, output_dir, threads, snakemake, quiet, hpc, conda, config_only command += f"--configfile {workflowdir}/config.yaml " if hpc: command += f"--workflow-profile {hpc} " - if quiet: - command += "--quiet all " if snakemake is not None: command += snakemake @@ -87,7 +85,7 @@ generate_conda_deps() start_text = f"Files: {n}\nOutput Directory: {output_dir}/\nLog: {sm_log}" - launch_snakemake(command, "preflight_fastq", start_text, output_dir, sm_log) + launch_snakemake(command, "preflight_fastq", start_text, 
output_dir, sm_log, quiet) @click.command(no_args_is_help = True, epilog = "See the documentation for more information: https://pdimens.github.io/harpy/modules/preflight/") @click.option('-t', '--threads', default = 4, show_default = True, type = click.IntRange(min = 1, max_open = True), help = 'Number of threads to use') @@ -118,8 +116,6 @@ def bam(inputs, output_dir, threads, snakemake, quiet, hpc, conda, config_only): command += f"--configfile {workflowdir}/config.yaml " if hpc: command += f"--workflow-profile {hpc} " - if quiet: - command += "--quiet all " if snakemake is not None: command += snakemake @@ -143,7 +139,7 @@ def bam(inputs, output_dir, threads, snakemake, quiet, hpc, conda, config_only): generate_conda_deps() start_text = f"Samples: {n}\nOutput Directory: {output_dir}/\nLog: {sm_log}" - launch_snakemake(command, "preflight_bam", start_text, output_dir, sm_log) + launch_snakemake(command, "preflight_bam", start_text, output_dir, sm_log, quiet) preflight.add_command(fastq) preflight.add_command(bam) diff --git a/harpy/printfunctions.py b/harpy/printfunctions.py index 630cd83fc..66cbd4863 100644 --- a/harpy/printfunctions.py +++ b/harpy/printfunctions.py @@ -43,7 +43,7 @@ def print_onerror(logfile): """Print a red panel with error text. To be used in place of onsuccess: inside a snakefile. Expects the erroring rule printed after it.""" rprint( Panel( - f"The workflow has terminated due to an error. See the log for more info:\n[bold]{logfile}[/bold]", + f"The workflow terminated from an error. See the full log for more info:\n[bold]{logfile}[/bold]", title = "[bold]workflow error", title_align = "left", border_style = "red", diff --git a/harpy/qc.py b/harpy/qc.py index 302f1487a..cc42b3420 100644 --- a/harpy/qc.py +++ b/harpy/qc.py @@ -61,8 +61,6 @@ def qc(inputs, output_dir, min_length, max_length, trim_adapters, deduplicate, d command += f"--configfile {workflowdir}/config.yaml " if hpc: command += f"--workflow-profile {hpc} " - if quiet: - command += "--quiet all " if snakemake is not None: command += snakemake @@ -113,4 +111,4 @@ def qc(inputs, output_dir, min_length, max_length, trim_adapters, deduplicate, d generate_conda_deps() start_text = f"Samples: {sample_count}\nOutput Directory: {output_dir}/\n{tasks}\nLog: {sm_log}" - launch_snakemake(command, "qc", start_text, output_dir, sm_log) + launch_snakemake(command, "qc", start_text, output_dir, sm_log, quiet) diff --git a/harpy/reports/align_bxstats.Rmd b/harpy/reports/align_bxstats.Rmd index b946de02c..6302b1844 100644 --- a/harpy/reports/align_bxstats.Rmd +++ b/harpy/reports/align_bxstats.Rmd @@ -211,7 +211,7 @@ hchart(aggregate_df, "xrange", animation = F, groupPadding = 0.0001, hcaes(x = 0, x2 = totalreads, y = rev(0:(n_samples-1)), partialFill = totalvalidpercent/100), dataLabels = list(enabled = TRUE)) |> hc_xAxis(title = list(text = "Total Alignments")) |> hc_yAxis(title = FALSE, gridLineWidth = 0, categories = rev(aggregate_df$sample)) |> - hc_caption(text = "Percentage represents the percent of alignments with a valid BX barcode") |> + hc_caption(text = "Percentage represents the percent of alignments with a valid BX barcode, shown in dark green. 
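Because the same two-line deletion recurs in every command module, a compact sketch of the new quiet pathway may be clearer than reading it hunk by hunk. Previously `--quiet all` silenced Snakemake itself, i.e. the very stderr stream `launch_snakemake()` now parses; instead, `quiet` only disables Harpy's rich output. `demo_launch` below is a hypothetical stand-in, not Harpy's API:

```python
from rich.progress import Progress

def demo_launch(quiet: bool) -> None:
    # quiet no longer touches the Snakemake command; it only mutes the UI
    with Progress(transient=True, disable=quiet) as progress:
        task = progress.add_task("[bold blue]Total", total=100)
        progress.update(task, completed=100)

demo_launch(quiet=True)   # nothing drawn; Snakemake stderr still parsed
demo_launch(quiet=False)  # progress bars render as usual
```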
Light green represents invalid BX barcodes.") |> hc_tooltip(enabled = FALSE) |> hc_colors(color = "#95d8a7") |> hc_title(text = "Percent Alignments with Valid BX Barcode") |> @@ -226,7 +226,7 @@ hchart(aggregate_df, "xrange", animation = F, groupPadding = 0.0001, hcaes(x = 0, x2 = multiplex + singletons, y = rev(0:(n_samples-1)), partialFill = round(multiplex / (multiplex + singletons), 4)), dataLabels = list(enabled = TRUE)) |> hc_xAxis(title = list(text = "Total Alignments with Valid Barcodes")) |> hc_yAxis(title = FALSE, gridLineWidth = 0, categories = rev(aggregate_df$sample)) |> - hc_caption(text = "Percentage represents the percent of molecules composed of more than 2 single-end or paired-end sequences") |> + hc_caption(text = "Percentage represents the percent of molecules composed of more than 2 single-end or paired-end sequences, shown in dark blue. Light blue represents the singletons.") |> hc_tooltip(enabled = FALSE) |> hc_colors(color = "#8dc6f5") |> hc_title(text = "Percent Non-Singleton Molecules") |> diff --git a/harpy/resume.py b/harpy/resume.py index a06e69d9d..8f715ec26 100644 --- a/harpy/resume.py +++ b/harpy/resume.py @@ -11,8 +11,9 @@ @click.command(no_args_is_help = True, epilog = "See the documentation for more information: https://pdimens.github.io/harpy/modules/other") @click.option('-c', '--conda', is_flag = True, default = False, help = 'Recreate the conda environments into .harpy_envs/') +@click.option('--quiet', is_flag = True, default = False, help = 'Don\'t show output text while running') @click.argument('directory', required=True, type=click.Path(exists=True, file_okay=False, readable=True), nargs=1) -def resume(directory, conda): +def resume(directory, conda, quiet): """ Resume a workflow from an existing Harpy directory @@ -40,7 +41,7 @@ def resume(directory, conda): sm_log = snakemake_log(directory, workflow) command = harpy_config["workflow_call"] + f" --config snakemake_log={sm_log}" start_text = f"Output Directory: {directory}\nLog: {sm_log}" - launch_snakemake(command, workflow, start_text, directory, sm_log) + launch_snakemake(command, workflow, start_text, directory, sm_log, quiet) diff --git a/harpy/simulate.py b/harpy/simulate.py index f89eefbfa..821e62a3b 100644 --- a/harpy/simulate.py +++ b/harpy/simulate.py @@ -140,8 +140,6 @@ def linkedreads(genome_hap1, genome_hap2, output_dir, outer_distance, mutation_r command += f"--configfile {workflowdir}/config.yaml " if hpc: command += f"--workflow-profile {hpc} " - if quiet: - command += "--quiet all " if snakemake is not None: command += snakemake @@ -178,8 +176,9 @@ def linkedreads(genome_hap1, genome_hap2, output_dir, outer_distance, mutation_r start_text = f"Genome Haplotype 1: {os.path.basename(genome_hap1)}\n" start_text += f"Genome Haplotype 2: {os.path.basename(genome_hap2)}\n" start_text += f"Barcodes: {os.path.basename(barcodes)}\n" if barcodes else "Barcodes: 10X Default\n" - start_text += f"Output Directory: {output_dir}/" - launch_snakemake(command, "simulate_linkedreads", start_text, output_dir, sm_log) + start_text += f"Output Directory: {output_dir}/\n" + start_text += f"Log: {sm_log}" + launch_snakemake(command, "simulate_linkedreads", start_text, output_dir, sm_log, quiet) @click.command(no_args_is_help = True, epilog = "This workflow can be quite technical, please read the docs for more information: https://pdimens.github.io/harpy/modules/simulate/simulate-variants") @click.option('-s', '--snp-vcf', type=click.Path(exists=True, dir_okay=False, readable=True), help = 'VCF file of 
known snps to simulate') @@ -232,8 +231,6 @@ def snpindel(genome, snp_vcf, indel_vcf, output_dir, prefix, snp_count, indel_co command += f"--configfile {workflowdir}/config.yaml " if hpc: command += f"--workflow-profile {hpc} " - if quiet: - command += "--quiet all " if snakemake is not None: command += snakemake @@ -296,7 +293,8 @@ def snpindel(genome, snp_vcf, indel_vcf, output_dir, prefix, snp_count, indel_co sys.exit(0) generate_conda_deps() - launch_snakemake(command, "simulate_snpindel", start_text.rstrip("\n"), output_dir, sm_log) + start_text += f"Log: {sm_log}" + launch_snakemake(command, "simulate_snpindel", start_text, output_dir, sm_log, quiet) @click.command(no_args_is_help = True, epilog = "Please See the documentation for more information: https://pdimens.github.io/harpy/modules/simulate/simulate-variants") @@ -337,8 +335,6 @@ def inversion(genome, vcf, prefix, output_dir, count, min_size, max_size, centro command += f"--configfile {workflowdir}/config.yaml " if hpc: command += f"--workflow-profile {hpc} " - if quiet: - command += "--quiet all " if snakemake is not None: command += snakemake @@ -392,7 +388,8 @@ def inversion(genome, vcf, prefix, output_dir, count, min_size, max_size, centro sys.exit(0) generate_conda_deps() - launch_snakemake(command, "simulate_inversion", start_text.rstrip("\n"), output_dir, sm_log) + start_text += f"Log: {sm_log}" + launch_snakemake(command, "simulate_inversion", start_text, output_dir, sm_log, quiet) @click.command(no_args_is_help = True, epilog = "Please See the documentation for more information: https://pdimens.github.io/harpy/modules/simulate/simulate-variants") @@ -442,8 +439,6 @@ def cnv(genome, output_dir, vcf, prefix, count, min_size, max_size, dup_ratio, m command += f"--configfile {workflowdir}/config.yaml " if hpc: command += f"--workflow-profile {hpc} " - if quiet: - command += "--quiet all " if snakemake is not None: command += snakemake @@ -500,7 +495,8 @@ def cnv(genome, output_dir, vcf, prefix, count, min_size, max_size, dup_ratio, m sys.exit(0) generate_conda_deps() - launch_snakemake(command, "simulate_cnv", start_text.rstrip("\n"), output_dir, sm_log) + start_text += f"Log: {sm_log}" + launch_snakemake(command, "simulate_cnv", start_text, output_dir, sm_log, quiet) @click.command(no_args_is_help = True, epilog = "Please See the documentation for more information: https://pdimens.github.io/harpy/modules/simulate/simulate-variants") @click.option('-v', '--vcf', type=click.Path(exists=True, dir_okay=False, readable=True), help = 'VCF file of known translocations to simulate') @@ -538,8 +534,6 @@ def translocation(genome, output_dir, prefix, vcf, count, centromeres, genes, he command += f"--configfile {workflowdir}/config.yaml " if hpc: command += f"--workflow-profile {hpc} " - if quiet: - command += "--quiet all " if snakemake is not None: command += snakemake @@ -592,7 +586,8 @@ def translocation(genome, output_dir, prefix, vcf, count, centromeres, genes, he sys.exit(0) generate_conda_deps() - launch_snakemake(command, "simulate_translocation", start_text.rstrip("\n"), output_dir, sm_log) + start_text += f"Log: {sm_log}" + launch_snakemake(command, "simulate_translocation", start_text, output_dir, sm_log, quiet) simulate.add_command(linkedreads) simulate.add_command(snpindel) diff --git a/harpy/snakefiles/align_bwa.smk b/harpy/snakefiles/align_bwa.smk index 368ae4580..9e568cbcb 100644 --- a/harpy/snakefiles/align_bwa.smk +++ b/harpy/snakefiles/align_bwa.smk @@ -35,15 +35,13 @@ def get_fq(wildcards): r = 
re.compile(fr".*/({re.escape(wildcards.sample)}){bn_r}", flags = re.IGNORECASE) return sorted(list(filter(r.match, fqlist))[:2]) -rule genome_setup: +rule setup_genome: input: genomefile output: f"Genome/{bn}" container: None - message: - "Copying {input} to Genome/" shell: """ if (file {input} | grep -q compressed ) ;then @@ -54,7 +52,7 @@ rule genome_setup: fi """ -rule genome_faidx: +rule samtools_faidx: input: f"Genome/{bn}" output: @@ -66,8 +64,6 @@ rule genome_faidx: genome_zip container: None - message: - "Indexing {input}" shell: """ if [ "{params}" = "True" ]; then @@ -77,7 +73,7 @@ rule genome_faidx: fi """ -rule genome_bwa_index: +rule bwa_index: input: f"Genome/{bn}" output: @@ -86,8 +82,6 @@ rule genome_bwa_index: f"Genome/{bn}.idx.log" conda: f"{envdir}/align.yaml" - message: - "Indexing {input}" shell: "bwa index {input} 2> {log}" @@ -111,8 +105,6 @@ rule align: 10 conda: f"{envdir}/align.yaml" - message: - "Aligning sequences: {wildcards.sample}" shell: """ bwa mem -C -v 2 -t {threads} {params.extra} -R \"@RG\\tID:{wildcards.sample}\\tSM:{wildcards.sample}\" {input.genome} {input.fastq} 2> {log} | @@ -136,8 +128,6 @@ rule mark_duplicates: None threads: 4 - message: - "Marking duplicates: {wildcards.sample}" shell: """ samtools collate -O -u {input.sam} | @@ -147,15 +137,13 @@ rule mark_duplicates: rm -rf {params.tmpdir} """ -rule markdups_index: +rule index_duplicates: input: outdir + "/samples/{sample}/{sample}.markdup.bam" output: temp(outdir + "/samples/{sample}/{sample}.markdup.bam.bai") container: None - message: - "Indexing duplicate-marked alignments: {wildcards.sample}" shell: "samtools index {input}" @@ -170,12 +158,10 @@ rule assign_molecules: molecule_distance container: None - message: - "Assigning barcodes to molecules: {wildcards.sample}" shell: "assign_mi.py -o {output.bam} -c {params} {input.bam}" -rule bxstats: +rule barcode_stats: input: bam = outdir + "/{sample}.bam", bai = outdir + "/{sample}.bam.bai" @@ -185,12 +171,10 @@ rule bxstats: sample = lambda wc: d[wc.sample] container: None - message: - "Calculating barcoded alignment statistics: {wildcards.sample}" shell: "bx_stats.py -o {output} {input.bam}" -rule coverage: +rule calculate_depth: input: bam = outdir + "/{sample}.bam", bai = outdir + "/{sample}.bam.bai" @@ -200,12 +184,10 @@ rule coverage: windowsize container: None - message: - "Calculating genomic coverage: {wildcards.sample}" shell: "samtools depth -a {input.bam} | depth_windows.py {params} | gzip > {output}" -rule report_persample: +rule sample_reports: input: outdir + "/reports/data/bxstats/{sample}.bxstats.gz", outdir + "/reports/data/coverage/{sample}.cov.gz" @@ -215,12 +197,10 @@ rule report_persample: molecule_distance conda: f"{envdir}/r.yaml" - message: - "Creating alignment report: {wildcards.sample}" script: "report/align_stats.Rmd" -rule stats: +rule general_stats: input: bam = outdir + "/{sample}.bam", bai = outdir + "/{sample}.bam.bai" @@ -229,15 +209,13 @@ rule stats: flagstat = temp(outdir + "/reports/data/samtools_flagstat/{sample}.flagstat") container: None - message: - "Calculating alignment stats: {wildcards.sample}" shell: """ samtools stats -d {input.bam} > {output.stats} samtools flagstat {input.bam} > {output.flagstat} """ -rule report_samtools: +rule samtools_report: input: collect(outdir + "/reports/data/samtools_{ext}/{sample}.{ext}", sample = samplenames, ext = ["stats", "flagstat"]) output: @@ -249,20 +227,16 @@ rule report_samtools: comment = "--comment \"This report aggregates samtools stats and samtools 
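The `get_fq()` helper at the top of this snakefile pairs a sample name with its two FASTQ files via a case-insensitive regex. A sketch of that matching, with an assumed `bn_r` suffix pattern and invented file names (the real `bn_r` is defined elsewhere in the workflow):

```python
import re

bn_r = r"\.R[12]\.(fq|fastq)\.gz$"  # assumed suffix pattern for illustration
fqlist = ["data/sample1.R1.fq.gz", "data/sample1.R2.fq.gz", "data/sample10.R1.fq.gz"]
sample = "sample1"
r = re.compile(fr".*/({re.escape(sample)}){bn_r}", flags=re.IGNORECASE)
# re.escape guards against regex metacharacters in sample names;
# "sample10" is excluded because the suffix must follow the name immediately
print(sorted(list(filter(r.match, fqlist))[:2]))
# ['data/sample1.R1.fq.gz', 'data/sample1.R2.fq.gz']
```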
flagstats results for all alignments. Samtools stats ignores alignments marked as duplicates.\"" conda: f"{envdir}/qc.yaml" - message: - "Summarizing samtools stats and flagstat" shell: "multiqc {params} --filename {output} 2> /dev/null" -rule report_bx: +rule barcode_report: input: collect(outdir + "/reports/data/bxstats/{sample}.bxstats.gz", sample = samplenames) output: outdir + "/reports/barcodes.summary.html" conda: f"{envdir}/r.yaml" - message: - "Summarizing all barcode information from alignments" script: "report/align_bxstats.Rmd" diff --git a/harpy/snakefiles/align_ema.smk b/harpy/snakefiles/align_ema.smk index 53b6d3994..990768eb6 100644 --- a/harpy/snakefiles/align_ema.smk +++ b/harpy/snakefiles/align_ema.smk @@ -39,15 +39,13 @@ def get_fq(wildcards): r = re.compile(fr".*/({re.escape(wildcards.sample)}){bn_r}", flags = re.IGNORECASE) return sorted(list(filter(r.match, fqlist))[:2]) -rule genome_setup: +rule setup_genome: input: genomefile output: f"Genome/{bn}" container: None - message: - "Copying {input} to Genome/" shell: """ if (file {input} | grep -q compressed ) ;then @@ -58,7 +56,7 @@ rule genome_setup: fi """ -rule genome_faidx: +rule samtools_faidx: input: f"Genome/{bn}" output: @@ -70,8 +68,6 @@ rule genome_faidx: genome_zip container: None - message: - "Indexing {input}" shell: """ if [ "{params}" = "True" ]; then @@ -81,7 +77,7 @@ rule genome_faidx: fi """ -rule genome_bwa_index: +rule bwa_index: input: f"Genome/{bn}" output: @@ -90,8 +86,6 @@ rule genome_bwa_index: f"Genome/{bn}.idx.log" conda: f"{envdir}/align.yaml" - message: - "Indexing {input}" shell: "bwa index {input} 2> {log}" @@ -105,8 +99,6 @@ rule ema_count: prefix = lambda wc: outdir + "/ema_count/" + wc.get("sample"), beadtech = "-p" if platform == "haplotag" else f"-w {whitelist}", logdir = f"{outdir}/logs/ema_count/" - message: - "Counting barcode frequency: {wildcards.sample}" conda: f"{envdir}/align.yaml" shell: @@ -133,8 +125,6 @@ rule ema_preprocess: 2 conda: f"{envdir}/align.yaml" - message: - "Preprocessing for EMA mapping: {wildcards.sample}" shell: """ seqtk mergepe {input.reads} | @@ -142,7 +132,7 @@ rule ema_preprocess: cat - > {log} """ -rule ema_align: +rule align_ema: input: readbin = collect(outdir + "/ema_preproc/{{sample}}/ema-bin-{bin}", bin = binrange), genome = f"Genome/{bn}", @@ -166,8 +156,6 @@ rule ema_align: 10 conda: f"{envdir}/align.yaml" - message: - "Aligning barcoded sequences: {wildcards.sample}" shell: """ ema align -t {threads} {params.extra} -d {params.bxtype} -r {input.genome} -R \"@RG\\tID:{wildcards.sample}\\tSM:{wildcards.sample}\" -x {input.readbin} 2> {log.ema} | @@ -176,7 +164,7 @@ rule ema_align: rm -rf {params.tmpdir} """ -rule bwa_align: +rule align_bwa: input: reads = outdir + "/ema_preproc/{sample}/ema-nobc", genome = f"Genome/{bn}", @@ -195,15 +183,13 @@ rule bwa_align: 10 conda: f"{envdir}/align.yaml" - message: - "Aligning unbarcoded sequences: {wildcards.sample}" shell: """ bwa mem -t {threads} -v2 -C -R \"@RG\\tID:{wildcards.sample}\\tSM:{wildcards.sample}\" {input.genome} {input.reads} 2> {log} | samtools view -h {params.unmapped} -q {params.quality} > {output} """ -rule bwa_markdups: +rule mark_duplicates: input: sam = outdir + "/bwa_align/{sample}.bwa.nobc.sam", genome = f"Genome/{bn}", @@ -220,8 +206,6 @@ rule bwa_markdups: None threads: 2 - message: - "Marking duplicates: {wildcards.sample}" shell: """ samtools collate -O -u {input.sam} | @@ -231,19 +215,17 @@ rule bwa_markdups: rm -rf {params.tmpdir} """ -rule bwa_markdups_index: +rule 
index_duplicates: input: outdir + "/bwa_align/{sample}.markdup.nobc.bam" output: temp(outdir + "/bwa_align/{sample}.markdup.nobc.bam.bai") container: None - message: - "Indexing duplicate-marked alignments: {wildcards.sample}" shell: "samtools index {input}" -rule concatenate_alignments: +rule concat_alignments: input: aln_bc = outdir + "/ema_align/{sample}.bc.bam", idx_bc = outdir + "/ema_align/{sample}.bc.bam.bai", @@ -259,15 +241,13 @@ rule concatenate_alignments: mem_mb = 500 container: None - message: - "Concatenating barcoded and unbarcoded alignments: {wildcards.sample}" shell: """ samtools cat -@ 1 {input.aln_bc} {input.aln_nobc} | samtools sort -@ 1 -O bam --reference {input.genome} -m {resources.mem_mb}M --write-index -o {output.bam}##idx##{output.bai} - """ -rule coverage: +rule calculate_depth: input: bam = outdir + "/{sample}.bam", bai = outdir + "/{sample}.bam.bai" @@ -277,12 +257,10 @@ rule coverage: windowsize container: None - message: - "Calculating genomic coverage: {wildcards.sample}" shell: "samtools depth -a {input.bam} | depth_windows.py {params} | gzip > {output}" -rule bx_stats: +rule barcode_stats: input: bam = outdir + "/{sample}.bam", bai = outdir + "/{sample}.bam.bai" @@ -290,12 +268,10 @@ rule bx_stats: outdir + "/reports/data/bxstats/{sample}.bxstats.gz" container: None - message: - "Calculating barcode alignment statistics: {wildcards.sample}" shell: "bx_stats.py -o {output} {input.bam}" -rule report_persample: +rule sample_reports: input: outdir + "/reports/data/bxstats/{sample}.bxstats.gz", outdir + "/reports/data/coverage/{sample}.cov.gz" @@ -303,12 +279,10 @@ rule report_persample: outdir + "/reports/{sample}.html" conda: f"{envdir}/r.yaml" - message: - "Generating summary of barcode alignment: {wildcards.sample}" script: "report/align_stats.Rmd" -rule stats: +rule general_stats: input: bam = outdir + "/{sample}.bam", bai = outdir + "/{sample}.bam.bai" @@ -317,15 +291,13 @@ rule stats: flagstat = temp(outdir + "/reports/data/samtools_flagstat/{sample}.flagstat") container: None - message: - "Calculating alignment stats: {wildcards.sample}" shell: """ samtools stats -d {input.bam} > {output.stats} samtools flagstat {input.bam} > {output.flagstat} """ -rule report_samtools: +rule samtools_report: input: collect(outdir + "/reports/data/samtools_{ext}/{sample}.{ext}", sample = samplenames, ext = ["stats", "flagstat"]), output: @@ -337,20 +309,16 @@ rule report_samtools: comment = "--comment \"This report aggregates samtools stats and samtools flagstats results for all alignments. 
Samtools stats ignores alignments marked as duplicates.\"" conda: f"{envdir}/qc.yaml" - message: - "Summarizing samtools stats and flagstat" shell: "multiqc {params} --filename {output} 2> /dev/null" -rule report_bx: +rule barcode_report: input: collect(outdir + "/reports/data/bxstats/{sample}.bxstats.gz", sample = samplenames) output: outdir + "/reports/barcodes.summary.html" conda: f"{envdir}/r.yaml" - message: - "Summarizing all barcode information from alignments" script: "report/align_bxstats.Rmd" diff --git a/harpy/snakefiles/align_strobealign.smk b/harpy/snakefiles/align_strobealign.smk index 21525da55..19fc7ae98 100644 --- a/harpy/snakefiles/align_strobealign.smk +++ b/harpy/snakefiles/align_strobealign.smk @@ -37,19 +37,17 @@ def get_fq(wildcards): r = re.compile(fr".*/({re.escape(wildcards.sample)}){bn_r}", flags = re.IGNORECASE) return sorted(list(filter(r.match, fqlist))[:2]) -rule genome_setup: +rule setup_genome: input: genomefile output: f"Genome/{bn}" container: None - message: - "Copying {input} to Genome/" shell: "seqtk seq {input} > {output}" -rule genome_faidx: +rule faidx_genome: input: f"Genome/{bn}" output: @@ -58,12 +56,10 @@ rule genome_faidx: f"Genome/{bn}.faidx.log" container: None - message: - "Indexing {input}" shell: "samtools faidx --fai-idx {output} {input} 2> {log}" -rule strobeindex: +rule strobe_index: input: f"Genome/{bn}" output: @@ -76,8 +72,6 @@ rule strobeindex: f"{envdir}/align.yaml" threads: 2 - message: - "Indexing {input}" shell: "strobealign --create-index -t {threads} -r {params} {input} 2> {log}" @@ -103,15 +97,13 @@ rule align: 10 conda: f"{envdir}/align.yaml" - message: - "Aligning sequences: {wildcards.sample}" shell: """ strobealign {params.readlen} -t {threads} {params.unmapped_strobe} -C --rg-id={wildcards.sample} --rg=SM:{wildcards.sample} {params.extra} {input.genome} {input.fastq} 2> {log} | samtools view -h {params.unmapped} -q {params.quality} > {output} """ -rule markduplicates: +rule mark_duplicates: input: sam = outdir + "/samples/{sample}/{sample}.strobe.sam", genome = f"Genome/{bn}", @@ -128,8 +120,6 @@ rule markduplicates: 2 container: None - message: - "Marking duplicates: {wildcards.sample}" shell: """ samtools collate -O -u {input.sam} | @@ -139,15 +129,13 @@ rule markduplicates: rm -rf {params.tmpdir} """ -rule markdups_index: +rule index_duplicates: input: outdir + "/samples/{sample}/{sample}.markdup.bam" output: temp(outdir + "/samples/{sample}/{sample}.markdup.bam.bai") container: None - message: - "Indexing duplicate-marked alignments: {wildcards.sample}" shell: "samtools index {input}" @@ -162,12 +150,10 @@ rule assign_molecules: molecule_distance container: None - message: - "Assigning barcodes to molecules: {wildcards.sample}" shell: "assign_mi.py -o {output.bam} -c {params} {input.bam}" -rule bxstats: +rule barcode_stats: input: bam = outdir + "/{sample}.bam", bai = outdir + "/{sample}.bam.bai" @@ -177,12 +163,10 @@ rule bxstats: sample = lambda wc: d[wc.sample] container: None - message: - "Calculating barcode alignment statistics: {wildcards.sample}" shell: "bx_stats.py -o {output} {input.bam}" -rule coverage: +rule calculate_depth: input: bam = outdir + "/{sample}.bam", bai = outdir + "/{sample}.bam.bai" @@ -192,12 +176,10 @@ rule coverage: windowsize container: None - message: - "Calculating genomic coverage: {wildcards.sample}" shell: "samtools depth -a {input.bam} | depth_windows.py {params} | gzip > {output}" -rule report_persample: +rule sample_reports: input: outdir + 
"/reports/data/bxstats/{sample}.bxstats.gz", outdir + "/reports/data/coverage/{sample}.cov.gz" @@ -207,12 +189,10 @@ rule report_persample: molecule_distance conda: f"{envdir}/r.yaml" - message: - "Summarizing barcoded alignments: {wildcards.sample}" script: "report/align_stats.Rmd" -rule stats: +rule general_stats: input: bam = outdir + "/{sample}.bam", bai = outdir + "/{sample}.bam.bai" @@ -221,15 +201,13 @@ rule stats: flagstat = temp(outdir + "/reports/data/samtools_flagstat/{sample}.flagstat") container: None - message: - "Calculating alignment stats: {wildcards.sample}" shell: """ samtools stats -d {input.bam} > {output.stats} samtools flagstat {input.bam} > {output.flagstat} """ -rule report_samtools: +rule samtools_report: input: collect(outdir + "/reports/data/samtools_{ext}/{sample}.{ext}", sample = samplenames, ext = ["stats", "flagstat"]) output: @@ -241,20 +219,16 @@ rule report_samtools: comment = "--comment \"This report aggregates samtools stats and samtools flagstats results for all alignments. Samtools stats ignores alignments marked as duplicates.\"" conda: f"{envdir}/qc.yaml" - message: - "Summarizing samtools stats and flagstat" shell: "multiqc {params} --filename {output} 2> /dev/null" -rule report_bx: +rule barcode_report: input: collect(outdir + "/reports/data/bxstats/{sample}.bxstats.gz", sample = samplenames) output: outdir + "/reports/barcodes.summary.html" conda: f"{envdir}/r.yaml" - message: - "Summarizing all barcode information from alignments" script: "report/align_bxstats.Rmd" diff --git a/harpy/snakefiles/deconvolve.smk b/harpy/snakefiles/deconvolve.smk index c87967c5c..0220fbc10 100644 --- a/harpy/snakefiles/deconvolve.smk +++ b/harpy/snakefiles/deconvolve.smk @@ -40,8 +40,6 @@ rule interleave: rv = get_fq2 output: temp(outdir + "/workflow/input/{sample}.fastq"), - message: - "Interleaving: {wildcards.sample}" container: None shell: @@ -63,12 +61,10 @@ rule deconvolve: 2 conda: f"{envdir}/qc.yaml" - message: - "Performing deconvolution: {wildcards.sample}" shell: "QuickDeconvolution -t {threads} -i {input} -o {output} {params} > {log} 2>&1" -rule recover_forward: +rule extract_forward: input: outdir + "/{sample}.fastq" output: @@ -77,12 +73,10 @@ rule recover_forward: "-1" container: None - message: - "Extracting deconvolved forward reads: {wildcards.sample}" shell: "seqtk seq {params} {input} | gzip > {output}" -use rule recover_forward as recover_reverse with: +use rule extract_forward as extract_reverse with: output: outdir + "/{sample}.R2.fq.gz" params: diff --git a/harpy/snakefiles/demultiplex_gen1.smk b/harpy/snakefiles/demultiplex_gen1.smk index d1fb1b44e..f397fe560 100644 --- a/harpy/snakefiles/demultiplex_gen1.smk +++ b/harpy/snakefiles/demultiplex_gen1.smk @@ -42,8 +42,6 @@ rule link_R1: temp(outdir + "/DATA_R1_001.fastq.gz") container: None - message: - "Linking {input} to output directory" shell: "ln -sr {input} {output}" @@ -59,19 +57,17 @@ use rule link_R1 as link_I2 with: input: I2 output: temp(outdir + "/DATA_I2_001.fastq.gz") -rule bx_files: +rule barcode_segments: output: temp(collect(outdir + "/BC_{letter}.txt", letter = ["A","C","B","D"])) params: outdir container: None - message: - "Creating the Gen I barcode files for barcode demultiplexing" shell: "haplotag_acbd.py {params}" -rule demux_bx: +rule demux_barcodes: input: collect(outdir + "/DATA_{IR}{ext}_001.fastq.gz", IR = ["R","I"], ext = [1,2]), collect(outdir + "/BC_{letter}.txt", letter = ["A","C","B","D"]) @@ -82,8 +78,6 @@ rule demux_bx: logdir = outdir +"/logs/demux" 
container: None - message: - "Moving barcodes into read headers" shell: """ mkdir -p {params.logdir} @@ -92,7 +86,7 @@ rule demux_bx: mv demux*BC.log logs """ -rule split_samples_fw: +rule demux_samples_F: input: f"{outdir}/demux_R1_001.fastq.gz" output: @@ -101,20 +95,16 @@ rule split_samples_fw: c_barcode = lambda wc: samples[wc.get("sample")] container: None - message: - "Extracting forward reads:\n sample: {wildcards.sample}\n barcode: {params}" shell: """ ( zgrep -A3 "A..{params}B..D" {input} | grep -v "^--$" | gzip -q > {output} ) || touch {output} """ -use rule split_samples_fw as split_samples_rv with: +use rule demux_samples_F as demux_samples_R with: input: f"{outdir}/demux_R2_001.fastq.gz" output: outdir + "/{sample}.R.fq.gz" - message: - "Extracting reverse reads:\n sample: {wildcards.sample}\n barcode: {params}" rule fastqc_F: input: @@ -127,8 +117,6 @@ rule fastqc_F: 1 conda: f"{envdir}/qc.yaml" - message: - "Performing quality assessment: {wildcards.sample}.F.fq.gz" shell: """ mkdir -p {params} @@ -156,8 +144,6 @@ use rule fastqc_F as fastqc_R with: temp(outdir + "/logs/{sample}_R/fastqc_data.txt") params: lambda wc: f"{outdir}/logs/" + wc.get("sample") + "_R" - message: - "Performing quality assessment: {wildcards.sample}.R.fq.gz" rule qc_report: input: @@ -171,8 +157,6 @@ rule qc_report: comment = "--comment \"This report aggregates the QC results created by falco.\"" conda: f"{envdir}/qc.yaml" - message: - "Creating final demultiplexing QC report" shell: "multiqc {params} --filename {output} 2> /dev/null" diff --git a/harpy/snakefiles/impute.smk b/harpy/snakefiles/impute.smk index c62fa2db6..05111d99a 100644 --- a/harpy/snakefiles/impute.smk +++ b/harpy/snakefiles/impute.smk @@ -43,8 +43,6 @@ rule sort_bcf: f"{outdir}/workflow/input/vcf/input.sorted.log" container: None - message: - "Sorting input variant call file" shell: "bcftools sort -Ob --write-index -o {output.bcf} {input} 2> {log}" @@ -56,8 +54,6 @@ rule index_alignments: [f"{i}.bai" for i in bamlist] threads: workflow.cores - message: - "Indexing alignment files" run: with multiprocessing.Pool(processes=threads) as pool: pool.map(sam_index, input) @@ -68,13 +64,11 @@ rule alignment_list: bailist = [f"{i}.bai" for i in bamlist] output: outdir + "/workflow/input/samples.list" - message: - "Creating list of alignment files" run: with open(output[0], "w") as fout: _ = [fout.write(f"{bamfile}\n") for bamfile in input["bam"]] -rule convert_stitch: +rule stitch_conversion: input: bcf = f"{outdir}/workflow/input/vcf/input.sorted.bcf", idx = f"{outdir}/workflow/input/vcf/input.sorted.bcf.csi" @@ -84,8 +78,6 @@ rule convert_stitch: 3 container: None - message: - "Converting data to biallelic STITCH format: {wildcards.part}" shell: """ bcftools view --types snps -M2 --regions {wildcards.part} {input.bcf} | @@ -113,8 +105,6 @@ rule impute: f".Benchmark/{outdir}/stitch.{paramspace.wildcard_pattern}" + ".{part}.txt" threads: workflow.cores - message: - "Imputing {wildcards.part}:\nmodel: {wildcards.model}, useBX: {wildcards.usebx}, k: {wildcards.k}, bxLimit: {wildcards.bxlimit}, s: {wildcards.s}, nGen: {wildcards.ngen}" script: "scripts/stitch_impute.R" @@ -126,27 +116,23 @@ rule index_vcf: stats = outdir + "/{stitchparams}/contigs/{part}/{part}.stats" container: None - message: - "Indexing: {wildcards.stitchparams}/{wildcards.part}" shell: """ tabix {input.vcf} bcftools stats -s "-" {input.vcf} > {output.stats} """ -rule collate_stitch_reports: +rule stitch_reports: input: outdir + "/{stitchparams}/contigs/{part}/{part}.stats" 
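The sample-splitting rules in demultiplex_gen1.smk above lean on a simple positional pattern: a Gen I haplotag barcode `AxxCxxBxxDxx` in which only the C segment identifies the sample. A Python rendering of the `zgrep` pattern used in `demux_samples_F`, with a hypothetical `C04` barcode (in the rule it comes from `samples[wildcards.sample]`):

```python
import re

c_barcode = "C04"                            # hypothetical; per-sample in the rule
matcher = re.compile(f"A..{c_barcode}B..D")  # same pattern handed to zgrep
header = "@read_1 BX:Z:A13C04B09D87"
print(bool(matcher.search(header)))          # True -> read belongs to this sample
```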
output: outdir + "/{stitchparams}/contigs/{part}/{part}.STITCH.html" - message: - "Generating STITCH report: {wildcards.part}" conda: f"{envdir}/r.yaml" script: "report/stitch_collate.Rmd" -rule clean_stitch_plots: +rule clean_plots: input: outdir + "/{stitchparams}/contigs/{part}/{part}.STITCH.html" output: @@ -158,8 +144,6 @@ rule clean_stitch_plots: None priority: 2 - message: - "Cleaning up {wildcards.stitchparams}: {wildcards.part}" shell: "rm -rf {params.outdir}/{params.stitch}/contigs/{wildcards.part}/plots" @@ -168,13 +152,11 @@ rule concat_list: bcf = collect(outdir + "/{{stitchparams}}/contigs/{part}/{part}.vcf.gz", part = contigs) output: temp(outdir + "/{stitchparams}/bcf.files") - message: - "Creating list vcf files for concatenation" run: with open(output[0], "w") as fout: _ = fout.write("\n".join(input.bcf)) -rule merge_vcfs: +rule merge_vcf: input: files = outdir + "/{stitchparams}/bcf.files", idx = collect(outdir + "/{{stitchparams}}/contigs/{part}/{part}.vcf.gz.tbi", part = contigs) @@ -184,8 +166,6 @@ rule merge_vcfs: workflow.cores container: None - message: - "Merging VCFs: {wildcards.stitchparams}" shell: "bcftools concat --threads {threads} -O b -o {output} -f {input.files} 2> /dev/null" @@ -196,12 +176,10 @@ rule index_merged: outdir + "/{stitchparams}/variants.imputed.bcf.csi" container: None - message: - "Indexing resulting file: {output}" shell: "bcftools index {input}" -rule stats: +rule general_stats: input: bcf = outdir + "/{stitchparams}/variants.imputed.bcf", idx = outdir + "/{stitchparams}/variants.imputed.bcf.csi" @@ -209,8 +187,6 @@ rule stats: outdir + "/{stitchparams}/reports/variants.imputed.stats" container: None - message: - "Calculating stats: {wildcards.stitchparams}/variants.imputed.bcf" shell: """ bcftools stats -s "-" {input.bcf} > {output} @@ -227,15 +203,13 @@ rule compare_stats: info_sc = temp(outdir + "/{stitchparams}/reports/data/impute.infoscore") container: None - message: - "Computing post-imputation stats: {wildcards.stitchparams}" shell: """ bcftools stats -s "-" {input.orig} {input.impute} | grep \"GCTs\" > {output.compare} bcftools query -f '%CHROM\\t%POS\\t%INFO/INFO_SCORE\\n' {input.impute} > {output.info_sc} """ -rule imputation_results_reports: +rule impute_reports: input: outdir + "/{stitchparams}/reports/data/impute.compare.stats", outdir + "/{stitchparams}/reports/data/impute.infoscore" @@ -245,8 +219,6 @@ rule imputation_results_reports: lambda wc: wc.get("stitchparams") conda: f"{envdir}/r.yaml" - message: - "Assessing imputation results: {output}" script: "report/impute.Rmd" diff --git a/harpy/snakefiles/phase.smk b/harpy/snakefiles/phase.smk index 6e1823d34..8252990fe 100644 --- a/harpy/snakefiles/phase.smk +++ b/harpy/snakefiles/phase.smk @@ -77,29 +77,25 @@ def get_align_index(wildcards): aln = list(filter(r.match, bamlist)) return aln[0] + ".bai" -rule extract_heterozygous: +rule extract_het: input: vcf = variantfile output: outdir + "/workflow/input/vcf/{sample}.het.vcf" container: None - message: - "Extracting heterozygous variants: {wildcards.sample}" shell: """ bcftools view -s {wildcards.sample} -Ou -m 2 -M 2 {input.vcf} | bcftools view -i 'GT="het"' > {output} """ -rule extract_homozygous: +rule extract_hom: input: vcf = variantfile output: outdir + "/workflow/input/vcf/{sample}.hom.vcf" container: None - message: - "Extracting variants: {wildcards.sample}" shell: """ bcftools view -s {wildcards.sample} -Ou {input.vcf} | bcftools view -i 'GT="hom"' > {output} @@ -113,26 +109,22 @@ rule index_alignments: 
[f"{i}.bai" for i in bamlist] threads: workflow.cores - message: - "Indexing alignment files" run: with multiprocessing.Pool(processes=threads) as pool: pool.map(sam_index, input) if indels: - rule genome_setup: + rule setup_genome: input: genomefile output: geno container: None - message: - "Copying {input} to Genome/" shell: "seqtk seq {input} > {output}" - rule genome_faidx: + rule faidx_genome: input: geno output: @@ -141,8 +133,6 @@ if indels: f"Genome/{bn}.faidx.log" container: None - message: - "Indexing {input}" shell: "samtools faidx --fai-idx {output} {input} 2> {log}" @@ -164,8 +154,6 @@ rule extract_hairs: f"{envdir}/phase.yaml" benchmark: ".Benchmark/Phase/extracthairs.{sample}.txt" - message: - "Converting to compact fragment format: {wildcards.sample}" shell: "extractHAIRS {params} --nf 1 --bam {input.bam} --VCF {input.vcf} --out {output} > {log} 2>&1" @@ -184,12 +172,10 @@ rule link_fragments: f"{envdir}/phase.yaml" benchmark: ".Benchmark/Phase/linkfrag.{sample}.txt" - message: - "Linking fragments: {wildcards.sample}" shell: "LinkFragments.py --bam {input.bam} --VCF {input.vcf} --fragments {input.fragments} --out {output} -d {params} > {log} 2>&1" -rule phase_blocks: +rule phase: input: vcf = outdir + "/workflow/input/vcf/{sample}.het.vcf", fragments = fragfile @@ -205,8 +191,6 @@ rule phase_blocks: f"{envdir}/phase.yaml" benchmark: ".Benchmark/Phase/phase.{sample}.txt" - message: - "Creating phased haplotype blocks: {wildcards.sample}" shell: "HAPCUT2 --fragments {input.fragments} --vcf {input.vcf} {params} --out {output.blocks} --nf 1 --error_analysis_mode 1 --call_homozygous 1 --outvcf 1 > {log} 2>&1" @@ -217,18 +201,16 @@ rule compress_phaseblock: outdir + "/phase_blocks/{sample}.phased.vcf.gz" container: None - message: - "Compressing vcf: {wildcards.sample}" shell: "bcftools view -Oz6 -o {output} --write-index {input}" -use rule compress_phaseblock as compress vcf with: +use rule compress_phaseblock as compress_vcf with: input: outdir + "/workflow/input/vcf/{sample}.hom.vcf" output: outdir + "/workflow/input/gzvcf/{sample}.hom.vcf.gz" -rule merge_annotations: +rule merge_het_hom: input: phase = outdir + "/phase_blocks/{sample}.phased.vcf.gz", orig = outdir + "/workflow/input/gzvcf/{sample}.hom.vcf.gz" @@ -241,8 +223,6 @@ rule merge_annotations: ".Benchmark/Phase/mergeAnno.{sample}.txt" container: None - message: - "Merging annotations: {wildcards.sample}" shell: "bcftools annotate -Ob -o {output.bcf} --write-index -a {input.phase} -c CHROM,POS,FMT/GT,FMT/PS,FMT/PQ,FMT/PD -m +HAPCUT {input.orig}" @@ -259,8 +239,6 @@ rule merge_samples: workflow.cores container: None - message: - "Combining results into {output.bcf}" if len(samplenames) > 1 else "Copying results to {output.bcf}" shell: """ if [ "{params}" = true ]; then @@ -280,8 +258,6 @@ rule summarize_blocks: outdir + "/reports/blocks.summary" container: None - message: - "Summarizing phasing results" shell: """ echo -e "sample\\tcontig\\tn_snp\\tpos_start\\tblock_length" > {params} @@ -298,8 +274,6 @@ rule phase_report: outdir + "/reports/phase.html" conda: f"{envdir}/r.yaml" - message: - "Summarizing phasing results" script: "report/hapcut.Rmd" diff --git a/harpy/snakefiles/preflight_bam.smk b/harpy/snakefiles/preflight_bam.smk index f1912c42c..3fc6f4de1 100644 --- a/harpy/snakefiles/preflight_bam.smk +++ b/harpy/snakefiles/preflight_bam.smk @@ -45,8 +45,6 @@ rule index_alignments: [f"{i}.bai" for i in bamlist] threads: workflow.cores - message: - "Indexing alignment files" run: with 
multiprocessing.Pool(processes=threads) as pool: pool.map(sam_index, input) @@ -59,12 +57,10 @@ rule check_bam: temp(outdir + "/{sample}.log") container: None - message: - "Processing: {wildcards.sample}" shell: "check_bam.py {input.bam} > {output}" -rule merge_checks: +rule concat_results: input: collect(outdir + "/{sample}.log", sample = samplenames) output: @@ -72,8 +68,6 @@ rule merge_checks: final = outdir + "/filecheck.bam.tsv" container: None - message: - "Concatenating results" shell: """ cat {input} | sort -k1 > {output.tmp} @@ -87,8 +81,6 @@ rule create_report: outdir + "/filecheck.bam.html" conda: f"{envdir}/r.yaml" - message: - "Producing report" script: "report/preflight_bam.Rmd" diff --git a/harpy/snakefiles/preflight_fastq.smk b/harpy/snakefiles/preflight_fastq.smk index 6d5d6ceba..d28bc836c 100644 --- a/harpy/snakefiles/preflight_fastq.smk +++ b/harpy/snakefiles/preflight_fastq.smk @@ -38,8 +38,6 @@ rule check_forward: temp(outdir + "/{sample}.F.log") container: None - message: - "Processing forward reads: {wildcards.sample}" shell: "check_fastq.py {input} > {output}" @@ -48,14 +46,12 @@ rule check_reverse: get_fq2 output: temp(outdir + "/{sample}.R.log") - message: - "Processing reverse reads: {wildcards.sample}" container: None shell: "check_fastq.py {input} > {output}" -rule merge_checks: +rule concat_results: input: collect(outdir + "/{sample}.{FR}.log", sample = samplenames, FR = ["F","R"]) output: @@ -63,8 +59,6 @@ rule merge_checks: final = outdir + "/filecheck.fastq.tsv" container: None - message: - "Concatenating results" shell: """ cat {input} | sort -k1 > {output.tmp} @@ -78,8 +72,6 @@ rule create_report: outdir + "/filecheck.fastq.html" conda: f"{envdir}/r.yaml" - message: - "Producing report" script: "report/preflight_fastq.Rmd" diff --git a/harpy/snakefiles/qc.smk b/harpy/snakefiles/qc.smk index dfe4a2ece..ad74d344b 100644 --- a/harpy/snakefiles/qc.smk +++ b/harpy/snakefiles/qc.smk @@ -43,16 +43,16 @@ def get_fq2(wildcards): return list(filter(r.match, fqlist)) if not deconvolve: - rule qc_fastp: + rule fastp: input: fw = get_fq1, rv = get_fq2 output: fw = outdir + "/{sample}.R1.fq.gz", rv = outdir + "/{sample}.R2.fq.gz", + html = outdir + "/reports/{sample}.html", json = outdir + "/reports/data/fastp/{sample}.fastp.json" log: - html = outdir + "/reports/{sample}.html", serr = outdir + "/logs/fastp/{sample}.log" params: minlen = f"--length_required {min_len}", @@ -64,22 +64,20 @@ if not deconvolve: workflow.cores conda: f"{envdir}/qc.yaml" - message: - "Quality trimming" + (", removing adapters" if not trimadapters else "") + (", removing PCR duplicates" if dedup else "") + ": {wildcards.sample}" shell: """ - fastp --trim_poly_g --cut_right {params} --thread {threads} -i {input.fw} -I {input.rv} -o {output.fw} -O {output.rv} -h {log.html} -j {output.json} -R "{wildcards.sample} QC Report" 2> {log.serr} + fastp --trim_poly_g --cut_right {params} --thread {threads} -i {input.fw} -I {input.rv} -o {output.fw} -O {output.rv} -h {output.html} -j {output.json} -R "{wildcards.sample} QC Report" 2> {log.serr} """ else: - rule qc_fastp: + rule fastp: input: fw = get_fq1, rv = get_fq2 output: fq = temp(outdir + "/fastp/{sample}.fastq"), + html = outdir + "/reports/{sample}.html", json = outdir + "/reports/data/fastp/{sample}.fastp.json" log: - html = outdir + "/reports/{sample}.html", serr = outdir + "/logs/fastp/{sample}.log" params: minlen = f"--length_required {min_len}", @@ -91,11 +89,9 @@ else: workflow.cores conda: f"{envdir}/qc.yaml" - message: - "Quality 
trimming" + (", removing adapters" if not trimadapters else "") + (", removing PCR duplicates" if dedup else "") + ": {wildcards.sample}" shell: """ - fastp --trim_poly_g --cut_right {params} --thread {threads} -i {input.fw} -I {input.rv} --stdout -h {log.html} -j {output.json} -R "{wildcards.sample} QC Report" 2> {log.serr} > {output.fq} + fastp --trim_poly_g --cut_right {params} --thread {threads} -i {input.fw} -I {input.rv} --stdout -h {output.html} -j {output.json} -R "{wildcards.sample} QC Report" 2> {log.serr} > {output.fq} """ rule deconvolve: @@ -114,12 +110,10 @@ else: workflow.cores conda: f"{envdir}/qc.yaml" - message: - "Performing deconvolution: {wildcards.sample}" shell: "QuickDeconvolution -t {threads} -i {input} -o {output} {params} > {log} 2>&1" - rule recover_forward: + rule extract_forward: input: outdir + "/{sample}.fastq" output: @@ -128,42 +122,36 @@ else: "-1" container: None - message: - "Extracting deconvolved forward reads: {wildcards.sample}" shell: "seqtk seq {params} {input} | gzip > {output}" - use rule recover_forward as recover_reverse with: + use rule extract_forward as extract_reverse with: output: outdir + "/{sample}.R2.fq.gz" params: "-2" -rule count_beadtags: +rule check_barcodes: input: outdir + "/{sample}.R1.fq.gz" output: temp(outdir + "/logs/bxcount/{sample}.count.log") - message: - "Counting barcode frequency: {wildcards.sample}" container: None shell: "count_bx.py {input} > {output}" -rule beadtag_counts_summary: +rule barcode_report: input: countlog = collect(outdir + "/logs/bxcount/{sample}.count.log", sample = samplenames) output: outdir + "/reports/barcode.summary.html" conda: f"{envdir}/r.yaml" - message: - "Summarizing sample barcode validation" script: "report/bx_count.Rmd" -rule create_report: +rule qc_report: input: collect(outdir + "/reports/data/fastp/{sample}.fastp.json", sample = samplenames) output: @@ -176,8 +164,6 @@ rule create_report: comment = "--comment \"This report aggregates trimming and quality control metrics reported by fastp.\"" conda: f"{envdir}/qc.yaml" - message: - "Aggregating fastp reports" shell: "multiqc {params} --filename {output}" diff --git a/harpy/snakefiles/simulate_linkedreads.smk b/harpy/snakefiles/simulate_linkedreads.smk index 7e7b7cfcf..3a13fcd2b 100644 --- a/harpy/snakefiles/simulate_linkedreads.smk +++ b/harpy/snakefiles/simulate_linkedreads.smk @@ -13,22 +13,17 @@ gen_hap2 = config["inputs"]["genome_hap2"] envdir = os.getcwd() + "/.harpy_envs" snakemake_log = config["snakemake_log"] barcodes = config.get("barcodes", None) -if barcodes: - barcodefile = barcodes -else: - barcodefile = f"{outdir}/workflow/input/4M-with-alts-february-2016.txt" +barcodefile = barcodes if barcodes else f"{outdir}/workflow/input/4M-with-alts-february-2016.txt" onstart: extra_logfile_handler = pylogging.FileHandler(snakemake_log) logger.logger.addHandler(extra_logfile_handler) -rule link_haplotype: +rule link_1st_geno: input: gen_hap1 output: f"{outdir}/workflow/input/hap.0.fasta" - message: - "Decompressing {input}" if gen_hap1.lower().endswith("gz") else "Symlinking {input}" run: if input[0].lower().endswith("gz"): with open(input[0], 'rb') as inf, open(output[0], 'w', encoding='utf8') as outf: @@ -38,21 +33,19 @@ rule link_haplotype: if not (Path(output[0]).is_symlink() or Path(output[0]).exists()): Path(output[0]).symlink_to(Path(input[0]).resolve()) -use rule link_haplotype as link_second_haplotype with: +use rule link_1st_geno as link_2nd_geno with: input: gen_hap2 output: f"{outdir}/workflow/input/hap.1.fasta" -rule 
genome_faidx: +rule faidx_genome: input: outdir + "/workflow/input/hap.{hap}.fasta" output: outdir + "/workflow/input/hap.{hap}.fasta.fai" container: None - message: - "Indexing haplotype {wildcards.hap}" shell: "samtools faidx --fai-idx {output} {input}" @@ -60,13 +53,11 @@ if not barcodes: rule download_barcodes: output: barcodefile - message: - "Downloading list of standard 10X barcodes" run: from urllib.request import urlretrieve _ = urlretrieve("https://raw.githubusercontent.com/aquaskyline/LRSIM/master/4M-with-alts-february-2016.txt", output[0]) -rule create_molecules_hap: +rule create_reads: input: outdir + "/workflow/input/hap.{hap}.fasta" output: @@ -81,22 +72,18 @@ rule create_molecules_hap: prefix = lambda wc: outdir + "/dwgsim_simulated/dwgsim." + wc.get("hap") + ".12" conda: f"{envdir}/simulations.yaml" - message: - "Simulating reads reads from {input}" shell: """ dwgsim -N {params.readpairs} -e 0.0001,0.0016 -E 0.0001,0.0016 -d {params.outerdist} -s {params.distsd} -1 135 -2 151 -H -y 0 -S 0 -c 0 -R 0 -r {params.mutationrate} -F 0 -o 1 -m /dev/null {input} {params.prefix} 2> {log} """ -rule interleave_dwgsim_output: +rule interleave_dwgsim: input: collect(outdir + "/dwgsim_simulated/dwgsim.{{hap}}.12.bwa.read{rd}.fastq.gz", rd = [1,2]) output: outdir + "/dwgsim_simulated/dwgsim.{hap}.12.fastq" container: None - message: - "Interleaving dwgsim output: {wildcards.hap}" shell: "seqtk mergepe {input} > {output}" @@ -126,8 +113,6 @@ rule lrsim: workflow.cores conda: f"{envdir}/simulations.yaml" - message: - "Running LRSIM to generate linked reads from\n haplotype 1: {input.hap1}\n haplotype 2: {input.hap2}" shell: """ perl {params.lrsim} -g {input.hap1},{input.hap2} -p {params.proj_dir}/lrsim/sim \\ @@ -143,8 +128,6 @@ rule sort_manifest: outdir + "/lrsim/sim.{hap}.sort.manifest" conda: f"{envdir}/simulations.yaml" - message: - "Sorting read manifest: haplotype {wildcards.hap}" shell: "msort -kn1 {input} > {output}" @@ -161,12 +144,10 @@ rule extract_reads: lambda wc: f"""{outdir}/10X/sim_hap{wc.get("hap")}_10x""" container: None - message: - "Extracting linked reads for haplotype {wildcards.hap}" shell: "extractReads {input} {params} 2> {log}" -rule convert_haplotag: +rule convert_to_haplotag: input: fw = outdir + "/10X/sim_hap{hap}_10x_R1_001.fastq.gz", rv = outdir + "/10X/sim_hap{hap}_10x_R2_001.fastq.gz", @@ -180,8 +161,6 @@ rule convert_haplotag: lambda wc: f"""{outdir}/sim_hap{wc.get("hap")}_haplotag""" container: None - message: - "Converting 10X barcodes to haplotag format: haplotype {wildcards.hap}" shell: "10xtoHaplotag.py -f {input.fw} -r {input.rv} -b {input.barcodes} -p {params} > {log}" diff --git a/harpy/snakefiles/simulate_snpindel.smk b/harpy/snakefiles/simulate_snpindel.smk index 987b23b9f..9ac477b87 100644 --- a/harpy/snakefiles/simulate_snpindel.smk +++ b/harpy/snakefiles/simulate_snpindel.smk @@ -64,28 +64,24 @@ onstart: logger.logger.addHandler(extra_logfile_handler) if snp_vcf: - rule convert_snpvcf: + rule convert_snp_vcf: input: snp_vcf output: snp_vcf_correct container: None - message: - "Converting {input} to compressed VCF format" shell: "bcftools view -Oz {input} > {output}" if indel_vcf: - rule convert_indelvcf: + rule convert_indel_vcf: input: indel_vcf output: indel_vcf_correct container: None - message: - "Converting {input} to compressed VCF format" shell: "bcftools view -Oz {input} > {output}" @@ -104,12 +100,10 @@ rule simulate_variants: parameters = variant_params conda: f"{envdir}/simulations.yaml" - message: - "Simulating snps and/or indels" 
shell: "perl {params.simuG} -refseq {input.geno} -prefix {params.prefix} {params.parameters} > {log}" -rule create_heterozygote_snp_vcf: +rule snp_vcf_het: input: f"{outdir}/{outprefix}.snp.vcf" output: @@ -117,8 +111,6 @@ rule create_heterozygote_snp_vcf: f"{outdir}/{outprefix}.snp.hap2.vcf" params: heterozygosity - message: - "Creating snp diploid variant files for heterozygosity = {params}" run: random.seed(6969) hap1_vcf, hap2_vcf = open(output[0], "w"), open(output[1], "w") @@ -142,16 +134,14 @@ rule create_heterozygote_snp_vcf: else: hap2_vcf.write(line) -use rule create_heterozygote_snp_vcf as create_heterozygote_indel_vcf with: +use rule snp_vcf_het as indel_vcf_het with: input: f"{outdir}/{outprefix}.indel.vcf" output: f"{outdir}/{outprefix}.indel.hap1.vcf", f"{outdir}/{outprefix}.indel.hap2.vcf" - message: - "Creating indel diploid variant files for heterozygosity = {params}" -rule all: +rule workflow_summary: default_target: True input: multiext(f"{outdir}/{outprefix}", ".bed", ".fasta"), diff --git a/harpy/snakefiles/simulate_variants.smk b/harpy/snakefiles/simulate_variants.smk index dd13c110c..1169eb699 100644 --- a/harpy/snakefiles/simulate_variants.smk +++ b/harpy/snakefiles/simulate_variants.smk @@ -57,8 +57,6 @@ if vcf: vcf_correct container: None - message: - "Converting {input} to compressed VCF format" shell: "bcftools view -Oz {input} > {output}" @@ -76,12 +74,10 @@ rule simulate_variants: parameters = variant_params conda: f"{envdir}/simulations.yaml" - message: - f"Simulating {variant}s onto genome" shell: "perl {params.simuG} -refseq {input.geno} -prefix {params.prefix} {params.parameters} > {log}" -rule create_heterozygote_vcf: +rule create_het_vcf: input: f"{outdir}/{outprefix}.vcf" output: @@ -89,8 +85,6 @@ rule create_heterozygote_vcf: f"{outdir}/{outprefix}.hap2.vcf" params: heterozygosity - message: - "Creating diploid variant files for heterozygosity = {params}" run: random.seed(6969) hap1_vcf, hap2_vcf = open(output[0], "w"), open(output[1], "w") @@ -114,7 +108,7 @@ rule create_heterozygote_vcf: else: hap2_vcf.write(line) -rule all: +rule workflow_summary: default_target: True input: multiext(f"{outdir}/{outprefix}", ".vcf", ".bed", ".fasta"), diff --git a/harpy/snakefiles/snp_freebayes.smk b/harpy/snakefiles/snp_freebayes.smk index 7bd0e3fb2..88e791b08 100644 --- a/harpy/snakefiles/snp_freebayes.smk +++ b/harpy/snakefiles/snp_freebayes.smk @@ -53,30 +53,26 @@ def sam_index(infile): if not os.path.exists(f"{infile}.bai"): subprocess.run(f"samtools index {infile} {infile}.bai".split()) -rule copy_groupings: +rule preproc_groups: input: groupings output: outdir + "/logs/sample.groups" - message: - "Logging {input}" run: with open(input[0], "r") as infile, open(output[0], "w") as outfile: _ = [outfile.write(i) for i in infile.readlines() if not i.lstrip().startswith("#")] -rule genome_setup: +rule setup_genome: input: genomefile output: f"Genome/{bn}" container: None - message: - "Copying {input} to Genome/" shell: "seqtk seq {input} > {output}" -rule genome_faidx: +rule faidx_genome: input: f"Genome/{bn}" output: @@ -85,8 +81,6 @@ rule genome_faidx: f"Genome/{bn}.faidx.log" container: None - message: - "Indexing {input}" shell: "samtools faidx --fai-idx {output} {input} 2> {log}" @@ -97,30 +91,16 @@ rule index_alignments: [f"{i}.bai" for i in bamlist] threads: workflow.cores - message: - "Indexing alignment files" run: with multiprocessing.Pool(processes=threads) as pool: pool.map(sam_index, input) -rule samplenames: - output: - outdir + "/logs/samples.names" 
- message: - "Creating list of sample names" - run: - with open(output[0], "w") as fout: - for samplename in samplenames: - _ = fout.write(samplename + "\n") - rule bam_list: input: bam = bamlist, bai = [f"{i}.bai" for i in bamlist] output: outdir + "/logs/samples.files" - message: - "Creating list of alignment files" run: with open(output[0], "w") as fout: for bamfile in input.bam: @@ -139,16 +119,14 @@ rule call_variants: params: region = lambda wc: "-r " + regions[wc.part], ploidy = f"-p {ploidy}", - populations = "--populations " + rules.copy_groupings.output[0] if groupings else "", + populations = f"--populations {outdir}/logs/sample.groups" if groupings else "", extra = extra conda: f"{envdir}/snp.yaml" - message: - "Calling variants: {wildcards.part}" shell: "freebayes -f {input.ref} -L {input.samples} {params} > {output}" -rule sort_variants: +rule sort_sample_variants: input: outdir + "/regions/{part}.vcf" output: @@ -156,8 +134,6 @@ rule sort_variants: idx = temp(outdir + "/regions/{part}.bcf.csi") container: None - message: - "Sorting: {wildcards.part}" shell: "bcftools sort -Ob --write-index --output {output.bcf} {input} 2> /dev/null" @@ -166,14 +142,12 @@ rule concat_list: bcfs = collect(outdir + "/regions/{part}.bcf", part = intervals), output: outdir + "/logs/bcf.files" - message: - "Creating list of region-specific vcf files" run: with open(output[0], "w") as fout: for bcf in input.bcfs: _ = fout.write(f"{bcf}\n") -rule merge_vcfs: +rule concat_variants: input: bcfs = collect(outdir + "/regions/{part}.{ext}", part = intervals, ext = ["bcf", "bcf.csi"]), filelist = outdir + "/logs/bcf.files" @@ -185,12 +159,10 @@ rule merge_vcfs: workflow.cores container: None - message: - "Combining vcfs into a single file" shell: "bcftools concat -f {input.filelist} --threads {threads} --naive -Ob -o {output} 2> {log}" -rule sort_vcf: +rule sort_all_variants: input: outdir + "/variants.raw.unsort.bcf" output: @@ -198,34 +170,10 @@ rule sort_vcf: csi = outdir + "/variants.raw.bcf.csi" container: None - message: - "Sorting and indexing final variants" shell: "bcftools sort --write-index -Ob -o {output.bcf} {input} 2> /dev/null" - -#rule normalize_bcf: -# input: -# genome = f"Genome/{bn}", -# ref_idx = f"Genome/{bn}.fai", -# bcf = outdir + "/variants.raw.bcf" -# output: -# bcf = outdir + "/variants.normalized.bcf", -# idx = outdir + "/variants.normalized.bcf.csi" -# log: -# outdir + "/logs/normalize.log" -# threads: -# 2 -# message: -# "Normalizing the called variants" -# shell: -# """ -# bcftools norm -d exact -f {input.genome} {input.bcf} 2> {log}.tmp1 | -# bcftools norm -m -any -N -Ob --write-index -o {output.bcf} 2> {log}.tmp2 -# cat {log}.tmp1 {log}.tmp2 > {log} && rm {log}.tmp1 {log}.tmp2 -# """ - -rule variants_stats: +rule general_stats: input: genome = f"Genome/{bn}", ref_idx = f"Genome/{bn}.fai", @@ -235,22 +183,18 @@ rule variants_stats: outdir + "/reports/variants.{type}.stats", container: None - message: - "Calculating variant stats: variants.{wildcards.type}.bcf" shell: """ bcftools stats -s "-" --fasta-ref {input.genome} {input.bcf} > {output} 2> /dev/null """ -rule bcf_report: +rule variant_report: input: outdir + "/reports/variants.{type}.stats" output: outdir + "/reports/variants.{type}.html" conda: f"{envdir}/r.yaml" - message: - "Generating bcftools report: variants.{wildcards.type}.bcf" script: "report/bcftools_stats.Rmd" diff --git a/harpy/snakefiles/snp_mpileup.smk b/harpy/snakefiles/snp_mpileup.smk index 6b588db12..5a0595f24 100644 --- 
a/harpy/snakefiles/snp_mpileup.smk +++ b/harpy/snakefiles/snp_mpileup.smk @@ -13,6 +13,7 @@ regiontype = config["regiontype"] windowsize = config.get("windowsize", None) outdir = config["output_directory"] skipreports = config["skip_reports"] +snakemake_log = config["snakemake_log"] bamlist = config["inputs"]["alignments"] genomefile = config["inputs"]["genome"] bn = os.path.basename(genomefile) @@ -47,15 +48,13 @@ def sam_index(infile): if not os.path.exists(f"{infile}.bai"): subprocess.run(f"samtools index {infile} {infile}.bai".split()) -rule genome_setup: +rule setup_genome: input: genomefile output: f"Genome/{bn}" container: None - message: - "Symlinking {input}" shell: """ if (file {input} | grep -q compressed ) ;then @@ -66,7 +65,7 @@ rule genome_setup: fi """ -rule genome_faidx: +rule faidx_genome: input: f"Genome/{bn}" output: @@ -78,8 +77,6 @@ rule genome_faidx: genome_zip container: None - message: - "Indexing {input}" shell: """ if [ "{params}" = "True" ]; then @@ -89,13 +86,11 @@ rule genome_faidx: fi """ -rule copy_groupings: +rule preproc_groups: input: groupings output: outdir + "/logs/sample.groups" - message: - "Logging {input}" run: with open(input[0], "r") as infile, open(output[0], "w") as outfile: _ = [outfile.write(i) for i in infile.readlines() if not i.lstrip().startswith("#")] @@ -107,8 +102,6 @@ rule index_alignments: [f"{i}.bai" for i in bamlist] threads: workflow.cores - message: - "Indexing alignment files" run: with multiprocessing.Pool(processes=threads) as pool: pool.map(sam_index, input) @@ -119,23 +112,11 @@ rule bam_list: bai = [f"{i}.bai" for i in bamlist] output: outdir + "/logs/samples.files" - message: - "Creating list of alignment files" run: with open(output[0], "w") as fout: for bamfile in input.bam: _ = fout.write(bamfile + "\n") -rule samplenames: - output: - outdir + "/logs/samples.names" - message: - "Creating list of sample names" - run: - with open(output[0], "w") as fout: - for samplename in samplenames: - _ = fout.write(samplename + "\n") - rule mpileup: input: bamlist = outdir + "/logs/samples.files", @@ -149,8 +130,6 @@ rule mpileup: extra = mp_extra container: None - message: - "Finding variants: {wildcards.part}" shell: "bcftools mpileup --fasta-ref {input.genome} --bam-list {input.bamlist} --annotate AD --output-type b {params} > {output.bcf} 2> {output.logfile}" @@ -168,8 +147,6 @@ rule call_genotypes: 2 container: None - message: - "Calling genotypes: {wildcards.part}" shell: """ bcftools call --multiallelic-caller --variants-only --output-type b {params} {input} | @@ -181,8 +158,6 @@ rule concat_list: bcfs = collect(outdir + "/call/{part}.bcf", part = intervals), output: outdir + "/logs/bcf.files" - message: - "Creating list of region-specific vcf files" run: with open(output[0], "w") as fout: for bcf in input.bcfs: @@ -193,8 +168,6 @@ rule concat_logs: collect(outdir + "/logs/{part}.mpileup.log", part = intervals) output: outdir + "/logs/mpileup.log" - message: - "Combining mpileup logs" run: with open(output[0], "w") as fout: for file in input: @@ -204,7 +177,7 @@ rule concat_logs: fout.write(f"{interval}\t{line}") fin.close() -rule merge_vcfs: +rule concat_variants: input: vcfs = collect(outdir + "/call/{part}.{ext}", part = intervals, ext = ["bcf", "bcf.csi"]), filelist = outdir + "/logs/bcf.files" @@ -216,12 +189,10 @@ rule merge_vcfs: workflow.cores container: None - message: - "Combining vcfs into a single file" shell: "bcftools concat -f {input.filelist} --threads {threads} --naive -Ob -o {output} 2> {log}" -rule sort_vcf: 
+rule sort_variants: input: outdir + "/variants.raw.unsort.bcf" output: @@ -229,33 +200,10 @@ rule sort_vcf: csi = outdir + "/variants.raw.bcf.csi" container: None - message: - "Sorting and indexing final variants" shell: "bcftools sort --write-index -Ob -o {output.bcf} {input} 2> /dev/null" -#rule normalize_bcf: -# input: -# genome = f"Genome/{bn}", -# bcf = outdir + "/variants.raw.bcf", -# samples = outdir + "/logs/samples.names" -# output: -# bcf = outdir + "/variants.normalized.bcf", -# idx = outdir + "/variants.normalized.bcf.csi" -# log: -# outdir + "/logs/normalize.log" -# threads: -# 2 -# message: -# "Normalizing the called variants" -# shell: -# """ -# bcftools norm -d exact -f {input.genome} {input.bcf} 2> {log}.tmp1 | -# bcftools norm -m -any -N -Ob --write-index -o {output.bcf} 2> {log}.tmp2 -# cat {log}.tmp1 {log}.tmp2 > {log} && rm {log}.tmp1 {log}.tmp2 -# """ - -rule variants_stats: +rule general_stats: input: genome = f"Genome/{bn}", bcf = outdir + "/variants.{type}.bcf", @@ -264,22 +212,18 @@ rule variants_stats: outdir + "/reports/data/variants.{type}.stats" container: None - message: - "Calculating variant stats: variants.{wildcards.type}.bcf" shell: """ bcftools stats -s "-" --fasta-ref {input.genome} {input.bcf} > {output} 2> /dev/null """ -rule bcf_report: +rule variant_report: input: outdir + "/reports/data/variants.{type}.stats" output: outdir + "/reports/variants.{type}.html" conda: f"{envdir}/r.yaml" - message: - "Generating bcftools report: variants.{wildcards.type}.bcf" script: "report/bcftools_stats.Rmd" diff --git a/harpy/snakefiles/sv_leviathan.smk b/harpy/snakefiles/sv_leviathan.smk index e02e469ce..e606576b6 100644 --- a/harpy/snakefiles/sv_leviathan.smk +++ b/harpy/snakefiles/sv_leviathan.smk @@ -54,8 +54,6 @@ rule index_alignments: [f"{i}.bai" for i in bamlist] threads: workflow.cores - message: - "Indexing alignment files" run: with multiprocessing.Pool(processes=threads) as pool: pool.map(sam_index, input) @@ -72,24 +70,20 @@ rule index_barcode: max(10, workflow.cores) conda: f"{envdir}/sv.yaml" - message: - "Indexing barcodes: {wildcards.sample}" shell: "LRez index bam --threads {threads} -p -b {input.bam} -o {output}" -rule genome_setup: +rule setup_genome: input: genomefile output: f"Genome/{bn}" container: None - message: - "Creating {output}" shell: "seqtk seq {input} > {output}" -rule genome_faidx: +rule faidx_genome: input: f"Genome/{bn}" output: @@ -98,12 +92,10 @@ rule genome_faidx: f"Genome/{bn}.faidx.log" container: None - message: - "Indexing {input}" shell: "samtools faidx --fai-idx {output} {input} 2> {log}" -rule index_bwa_genome: +rule bwa_index_genome: input: f"Genome/{bn}" output: @@ -112,12 +104,10 @@ rule index_bwa_genome: f"Genome/{bn}.idx.log" conda: f"{envdir}/align.yaml" - message: - "Indexing {input}" shell: "bwa index {input} 2> {log}" -rule call_sv: +rule call_variants: input: bam = get_alignments, bai = get_align_index, @@ -140,12 +130,10 @@ rule call_sv: f"{envdir}/sv.yaml" benchmark: ".Benchmark/leviathan/{sample}.variantcall" - message: - "Calling variants: {wildcards.sample}" shell: "LEVIATHAN -b {input.bam} -i {input.bc_idx} {params} -g {input.genome} -o {output} -t {threads} --candidates {log.candidates} 2> {log.runlog}" -rule sort_bcf: +rule sort_variants: input: outdir + "/vcf/{sample}.vcf" output: @@ -154,27 +142,23 @@ rule sort_bcf: "{wildcards.sample}" container: None - message: - "Sorting and converting to BCF: {wildcards.sample}" shell: "bcftools sort -Ob --output {output} {input} 2> /dev/null" -rule sv_stats: 
+rule variant_stats: input: outdir + "/vcf/{sample}.bcf" output: temp(outdir + "/reports/data/{sample}.sv.stats") container: None - message: - "Getting SV stats for {wildcards.sample}" shell: """ echo -e "sample\\tcontig\\tposition_start\\tposition_end\\tlength\\ttype\\tn_barcodes\\tn_pairs" > {output} bcftools query -f '{wildcards.sample}\\t%CHROM\\t%POS\\t%END\\t%SVLEN\\t%SVTYPE\\t%BARCODES\\t%PAIRS\\n' {input} >> {output} """ -rule merge_variants: +rule aggregate_variants: input: collect(outdir + "/reports/data/{sample}.sv.stats", sample = samplenames) output: @@ -182,8 +166,6 @@ rule merge_variants: outdir + "/deletions.bedpe", outdir + "/duplications.bedpe", outdir + "/breakends.bedpe" - message: - "Aggregating the detected variants" run: with open(output[0], "w") as inversions, open(output[1], "w") as deletions, open(output[2], "w") as duplications, open(output[3], "w") as breakends: header = ["sample","contig","position_start","position_end","length","type","n_barcodes","n_pairs"] @@ -209,7 +191,7 @@ rule merge_variants: elif record[5] == "BND": _ = breakends.write(line) -rule sv_report: +rule sample_reports: input: faidx = f"Genome/{bn}.fai", statsfile = outdir + "/reports/data/{sample}.sv.stats" @@ -217,12 +199,9 @@ rule sv_report: outdir + "/reports/{sample}.SV.html" conda: f"{envdir}/r.yaml" - message: - "Generating SV report: {wildcards.sample}" script: "report/leviathan.Rmd" - rule workflow_summary: default_target: True input: diff --git a/harpy/snakefiles/sv_leviathan_pop.smk b/harpy/snakefiles/sv_leviathan_pop.smk index b6dd333f7..53275273c 100644 --- a/harpy/snakefiles/sv_leviathan_pop.smk +++ b/harpy/snakefiles/sv_leviathan_pop.smk @@ -48,30 +48,26 @@ def pop_manifest(groupingfile, filelist): popdict = pop_manifest(groupfile, bamlist) populations = popdict.keys() -rule copy_groupings: +rule preproc_groups: input: groupfile output: outdir + "/workflow/sample.groups" - message: - "Logging {input}" run: with open(input[0], "r") as infile, open(output[0], "w") as outfile: _ = [outfile.write(i) for i in infile.readlines() if not i.lstrip().startswith("#")] -rule merge_list: +rule concat_list: input: outdir + "/workflow/sample.groups" output: outdir + "/workflow/merge_samples/{population}.list" - message: - "Creating population file list: {wildcards.population}" run: with open(output[0], "w") as fout: for bamfile in popdict[wildcards.population]: _ = fout.write(bamfile + "\n") -rule merge_populations: +rule concat_groups: input: bamlist = outdir + "/workflow/merge_samples/{population}.list", bamfiles = lambda wc: collect("{sample}", sample = popdict[wc.population]) @@ -83,12 +79,10 @@ rule merge_populations: 1 container: None - message: - "Merging alignments: {wildcards.population}" shell: "concatenate_bam.py -o {output} -b {input.bamlist} 2> {log}" -rule sort_merged: +rule sort_groups: input: outdir + "/workflow/input/{population}.unsort.bam" output: @@ -102,8 +96,6 @@ rule sort_merged: 10 container: None - message: - "Sorting alignments: {wildcards.population}" shell: "samtools sort -@ {threads} -O bam -l 0 -m {resources.mem_mb}M --write-index -o {output.bam}##idx##{output.bai} {input} 2> {log}" @@ -119,24 +111,20 @@ rule index_barcode: max(10, workflow.cores) conda: f"{envdir}/sv.yaml" - message: - "Indexing barcodes: Population {wildcards.population}" shell: "LRez index bam -p -b {input.bam} -o {output} --threads {threads}" -rule genome_setup: +rule setup_genome: input: genomefile output: f"Genome/{bn}" container: None - message: - "Creating {output}" shell: "seqtk seq 
{input} > {output}" -rule genome_faidx: +rule faidx_genome: input: f"Genome/{bn}" output: @@ -145,26 +133,22 @@ rule genome_faidx: f"Genome/{bn}.faidx.log" container: None - message: - "Indexing {input}" shell: "samtools faidx --fai-idx {output} {input} 2> {log}" -rule index_bwa_genome: +rule bwa_index_genome: input: f"Genome/{bn}" output: multiext(f"Genome/{bn}", ".ann", ".bwt", ".pac", ".sa", ".amb") log: f"Genome/{bn}.idx.log" - message: - "Indexing {input}" conda: f"{envdir}/align.yaml" shell: "bwa index {input} 2> {log}" -rule call_sv: +rule call_variants: input: bam = outdir + "/workflow/input/{population}.bam", bai = outdir + "/workflow/input/{population}.bam.bai", @@ -185,14 +169,12 @@ rule call_sv: workflow.cores conda: f"{envdir}/sv.yaml" - message: - "Calling variants: {wildcards.population}" benchmark: ".Benchmark/leviathan-pop/{population}.variantcall" shell: "LEVIATHAN -b {input.bam} -i {input.bc_idx} {params} -g {input.genome} -o {output} -t {threads} --candidates {log.candidates} 2> {log.runlog}" -rule sort_bcf: +rule sort_variants: input: outdir + "/vcf/{population}.vcf" output: @@ -201,27 +183,23 @@ rule sort_bcf: "{wildcards.population}" container: None - message: - "Sorting and converting to BCF: Population {wildcards.population}" shell: "bcftools sort -Ob --output {output} {input} 2> /dev/null" -rule sv_stats: +rule variant_stats: input: outdir + "/vcf/{population}.bcf" output: temp(outdir + "/reports/data/{population}.sv.stats") container: None - message: - "Getting stats: Population {input}" shell: """ echo -e "population\\tcontig\\tposition_start\\tposition_end\\tlength\\ttype\\tn_barcodes\\tn_pairs" > {output} bcftools query -f '{wildcards.population}\\t%CHROM\\t%POS\\t%END\\t%SVLEN\\t%SVTYPE\\t%BARCODES\\t%PAIRS\\n' {input} >> {output} """ -rule merge_variants: +rule aggregate_variants: input: collect(outdir + "/reports/data/{population}.sv.stats", population = populations) output: @@ -229,8 +207,6 @@ rule merge_variants: outdir + "/deletions.bedpe", outdir + "/duplications.bedpe", outdir + "/breakends.bedpe" - message: - "Aggregating the detected variants" run: with open(output[0], "w") as inversions, open(output[1], "w") as deletions, open(output[2], "w") as duplications, open(output[3], "w") as breakends: header = ["population","contig","position_start","position_end","length","type","n_barcodes","n_pairs"] @@ -256,28 +232,24 @@ rule merge_variants: elif record[5] == "BND": _ = breakends.write(line) -rule sv_report: +rule group_reports: input: statsfile = outdir + "/reports/data/{population}.sv.stats", bcf = outdir + "/vcf/{population}.bcf", faidx = f"Genome/{bn}.fai" output: outdir + "/reports/{population}.sv.html" - message: - "Generating SV report: population {wildcards.population}" conda: f"{envdir}/r.yaml" script: "report/leviathan.Rmd" -rule sv_report_aggregate: +rule aggregate_report: input: faidx = f"Genome/{bn}.fai", statsfiles = collect(outdir + "/reports/data/{pop}.sv.stats", pop = populations) output: outdir + "/reports/leviathan.summary.html" - message: - "Generating SV report for all populations" conda: f"{envdir}/r.yaml" script: diff --git a/harpy/snakefiles/sv_naibr.smk b/harpy/snakefiles/sv_naibr.smk index 98485b0a2..5f0049f86 100644 --- a/harpy/snakefiles/sv_naibr.smk +++ b/harpy/snakefiles/sv_naibr.smk @@ -68,13 +68,11 @@ rule index_alignments: [f"{i}.bai" for i in bamlist] threads: workflow.cores - message: - "Indexing alignment files" run: with multiprocessing.Pool(processes=threads) as pool: pool.map(sam_index, input) -rule create_config: 
+rule naibr_config: input: get_alignments output: @@ -82,8 +80,6 @@ rule create_config: params: lambda wc: wc.get("sample"), min(10, workflow.cores) - message: - "Creating naibr config file: {wildcards.sample}" run: argdict = process_args(extra) with open(output[0], "w") as conf: @@ -94,7 +90,7 @@ rule create_config: for i in argdict: _ = conf.write(f"{i}={argdict[i]}\n") -rule call_sv: +rule call_variants: input: bam = get_alignments, bai = get_align_index, @@ -109,12 +105,10 @@ rule call_sv: 10 conda: f"{envdir}/sv.yaml" - message: - "Calling variants: {wildcards.sample}" shell: "naibr {input.conf} > {log} 2>&1" -rule infer_sv: +rule infer_variants: input: bedpe = outdir + "/{sample}/{sample}.bedpe", refmt = outdir + "/{sample}/{sample}.reformat.bedpe", @@ -128,8 +122,6 @@ rule infer_sv: outdir = lambda wc: outdir + "/" + wc.get("sample") container: None - message: - "Inferring variants from naibr output: {wildcards.sample}" shell: """ infer_sv.py {input.bedpe} -f {output.fail} > {output.bedpe} @@ -138,15 +130,13 @@ rule infer_sv: rm -rf {params.outdir} """ -rule merge_variants: +rule aggregate_variants: input: collect(outdir + "/bedpe/{sample}.bedpe", sample = samplenames) output: outdir + "/inversions.bedpe", outdir + "/deletions.bedpe", outdir + "/duplications.bedpe" - message: - "Aggregating the detected variants" run: from pathlib import Path with open(output[0], "w") as inversions, open(output[1], "w") as deletions, open(output[2], "w") as duplications: @@ -172,15 +162,13 @@ rule merge_variants: elif record[-1] == "duplication": _ = duplications.write(f"{samplename}\t{line}") -rule genome_setup: +rule setup_genome: input: genomefile output: f"Genome/{bn}" container: None - message: - "Symlinking {input}" shell: """ if (file {input} | grep -q compressed ) ;then @@ -192,7 +180,7 @@ rule genome_setup: fi """ -rule genome_faidx: +rule faidx_genome: input: f"Genome/{bn}" output: @@ -204,8 +192,6 @@ rule genome_faidx: genome_zip container: None - message: - "Indexing {input}" shell: """ if [ "{params}" = "True" ]; then @@ -215,7 +201,7 @@ rule genome_faidx: fi """ -rule create_report: +rule sample_reports: input: bedpe = outdir + "/bedpe/{sample}.bedpe", fai = f"Genome/{bn}.fai" @@ -223,8 +209,6 @@ rule create_report: outdir + "/reports/{sample}.naibr.html" conda: f"{envdir}/r.yaml" - message: - "Creating report: {wildcards.sample}" script: "report/naibr.Rmd" diff --git a/harpy/snakefiles/sv_naibr_phase.smk b/harpy/snakefiles/sv_naibr_phase.smk index 9e2c3bbe5..4e45c2a01 100644 --- a/harpy/snakefiles/sv_naibr_phase.smk +++ b/harpy/snakefiles/sv_naibr_phase.smk @@ -69,19 +69,17 @@ def get_align_index(wildcards): aln = list(filter(r.match, bamlist)) return aln[0] + ".bai" -rule genome_setup: +rule setup_genome: input: genomefile output: f"Genome/{validgenome}" container: None - message: - "Preprocessing {input}" shell: "seqtk seq {input} > {output}" -rule genome_faidx: +rule faidx_genome: input: f"Genome/{validgenome}" output: @@ -90,45 +88,37 @@ rule genome_faidx: f"Genome/{validgenome}.faidx.log" container: None - message: - "Indexing {input}" shell: "samtools faidx --fai-idx {output} {input} 2> {log}" -rule index_original_alignments: +rule index_alignments: input: bamlist output: [f"{i}.bai" for i in bamlist] threads: workflow.cores - message: - "Indexing alignment files" run: with multiprocessing.Pool(processes=threads) as pool: pool.map(sam_index, input) -rule index_bcf: +rule index_snps: input: vcffile output: vcffile + ".csi" container: None - message: - "Indexing {input}" shell: 
"bcftools index {input}" -rule index_vcfgz: +rule index_snps_gz: input: vcffile output: vcffile + ".tbi" container: None - message: - "Indexing {input}" shell: "tabix {input}" @@ -149,8 +139,6 @@ rule phase_alignments: f"{envdir}/phase.yaml" threads: 4 - message: - "Phasing alignments: {wildcards.sample}" shell: "whatshap haplotag --sample {wildcards.sample} --linked-read-distance-cutoff {params} --ignore-read-groups --tag-supplementary --output-threads={threads} -o {output.bam} --reference {input.ref} {input.vcf} {input.aln} 2> {output.log}" @@ -161,8 +149,6 @@ rule log_phasing: outdir + "/logs/whatshap-haplotag.log" container: None - message: - "Creating log of alignment phasing" shell: """ echo -e "sample\\ttotal_alignments\\tphased_alignments" > {output} @@ -173,7 +159,7 @@ rule log_phasing: done """ -rule create_config: +rule naibr_config: input: outdir + "/phasedbam/{sample}.bam" output: @@ -181,8 +167,6 @@ rule create_config: params: lambda wc: wc.get("sample"), min(10, workflow.cores) - message: - "Creating naibr config file: {wildcards.sample}" run: argdict = process_args(extra) with open(output[0], "w") as conf: @@ -193,19 +177,17 @@ rule create_config: for i in argdict: _ = conf.write(f"{i}={argdict[i]}\n") -rule index_phased_alignment: +rule index_phased: input: outdir + "/phasedbam/{sample}.bam" output: outdir + "/phasedbam/{sample}.bam.bai" container: None - message: - "Indexing alignment: {wildcards.sample}" shell: "samtools index {input} {output} 2> /dev/null" -rule call_sv: +rule call_variants: input: bam = outdir + "/phasedbam/{sample}.bam", bai = outdir + "/phasedbam/{sample}.bam.bai", @@ -220,12 +202,10 @@ rule call_sv: 10 conda: f"{envdir}/sv.yaml" - message: - "Calling variants: {wildcards.sample}" shell: "naibr {input.conf} > {log} 2>&1" -rule infer_sv: +rule infer_variants: input: bedpe = outdir + "/{sample}/{sample}.bedpe", refmt = outdir + "/{sample}/{sample}.reformat.bedpe", @@ -239,8 +219,6 @@ rule infer_sv: outdir = lambda wc: outdir + "/" + wc.get("sample") container: None - message: - "Inferring variants from naibr output: {wildcards.sample}" shell: """ infer_sv.py {input.bedpe} -f {output.fail} > {output.bedpe} @@ -249,15 +227,13 @@ rule infer_sv: rm -rf {params.outdir} """ -rule merge_variants: +rule aggregate_variants: input: collect(outdir + "/bedpe/{sample}.bedpe", sample = samplenames) output: outdir + "/inversions.bedpe", outdir + "/deletions.bedpe", outdir + "/duplications.bedpe" - message: - "Aggregating the detected variants" run: from pathlib import Path with open(output[0], "w") as inversions, open(output[1], "w") as deletions, open(output[2], "w") as duplications: @@ -283,7 +259,7 @@ rule merge_variants: elif record[-1] == "duplication": _ = duplications.write(f"{samplename}\t{line}") -rule create_report: +rule variant_report: input: bedpe = outdir + "/bedpe/{sample}.bedpe", fai = f"Genome/{validgenome}.fai" @@ -291,8 +267,6 @@ rule create_report: outdir + "/reports/{sample}.naibr.html" conda: f"{envdir}/r.yaml" - message: - "Creating report: {wildcards.sample}" script: "report/naibr.Rmd" diff --git a/harpy/snakefiles/sv_naibr_pop.smk b/harpy/snakefiles/sv_naibr_pop.smk index eef77feee..58e76a81c 100644 --- a/harpy/snakefiles/sv_naibr_pop.smk +++ b/harpy/snakefiles/sv_naibr_pop.smk @@ -65,30 +65,26 @@ def pop_manifest(groupingfile, filelist): popdict = pop_manifest(groupfile, bamlist) populations = popdict.keys() -rule copy_groupings: +rule preproc_groups: input: groupfile output: outdir + "/workflow/sample.groups" - message: - "Logging {input}" 
run: with open(input[0], "r") as infile, open(output[0], "w") as outfile: _ = [outfile.write(i) for i in infile.readlines() if not i.lstrip().startswith("#")] -rule merge_list: +rule concat_list: input: outdir + "/workflow/sample.groups" output: outdir + "/workflow/merge_samples/{population}.list" - message: - "Creating population file list: {wildcards.population}" run: with open(output[0], "w") as fout: for bamfile in popdict[wildcards.population]: _ = fout.write(bamfile + "\n") -rule merge_populations: +rule concat_groups: input: bamlist = outdir + "/workflow/merge_samples/{population}.list", bamfiles = lambda wc: collect("{sample}", sample = popdict[wc.population]) @@ -100,12 +96,10 @@ 1 container: None - message: - "Merging alignments: {wildcards.population}" shell: "concatenate_bam.py -o {output} -b {input.bamlist} 2> {log}" -rule sort_merged: +rule sort_groups: input: outdir + "/workflow/input/{population}.unsort.bam" output: @@ -119,12 +113,10 @@ 10 container: None - message: - "Sorting alignments: {wildcards.population}" shell: "samtools sort -@ {threads} -O bam -l 0 -m {resources.mem_mb}M --write-index -o {output.bam}##idx##{output.bai} {input} 2> {log}" -rule create_config: +rule naibr_config: input: outdir + "/workflow/input/{population}.bam" output: @@ -132,8 +124,6 @@ params: lambda wc: wc.get("population"), min(10, workflow.cores) - message: - "Creating naibr config file: {wildcards.population}" run: argdict = process_args(extra) with open(output[0], "w") as conf: @@ -144,7 +134,7 @@ for i in argdict: _ = conf.write(f"{i}={argdict[i]}\n") -rule call_sv: +rule call_variants: input: bam = outdir + "/workflow/input/{population}.bam", bai = outdir + "/workflow/input/{population}.bam.bai", @@ -159,12 +149,10 @@ 10 conda: f"{envdir}/sv.yaml" - message: - "Calling variants: {wildcards.population}" shell: "naibr {input.conf} > {log} 2>&1" -rule infer_sv: +rule infer_variants: input: bedpe = outdir + "/{population}/{population}.bedpe", refmt = outdir + "/{population}/{population}.reformat.bedpe", @@ -178,8 +166,6 @@ outdir = lambda wc: outdir + "/" + wc.get("population") container: None - message: - "Inferring variants from naibr output: {wildcards.population}" shell: """ infer_sv.py {input.bedpe} -f {output.fail} > {output.bedpe} @@ -188,15 +174,13 @@ rm -rf {params.outdir} """ -rule merge_variants: +rule aggregate_variants: input: collect(outdir + "/bedpe/{population}.bedpe", population = populations) output: outdir + "/inversions.bedpe", outdir + "/deletions.bedpe", outdir + "/duplications.bedpe" - message: - "Aggregating the detected variants" run: from pathlib import Path with open(output[0], "w") as inversions, open(output[1], "w") as deletions, open(output[2], "w") as duplications: @@ -222,19 +206,17 @@ elif record[-1] == "duplication": _ = duplications.write(f"{samplename}\t{line}") -rule genome_setup: +rule setup_genome: input: genomefile output: f"Genome/{bn}" container: None - message: - "Preprocessing {input}" shell: "seqtk seq {input} > {output}" -rule genome_faidx: +rule faidx_genome: input: f"Genome/{bn}" output: @@ -243,12 +225,10 @@ f"Genome/{bn}.faidx.log" container: None - message: - "Indexing {input}" shell: "samtools faidx --fai-idx {output} {input} 2> {log}" -rule sv_report: +rule group_reports: input: fai = f"Genome/{bn}.fai", bedpe = outdir + "/bedpe/{population}.bedpe" @@ -256,12 +236,10
@@ rule sv_report: outdir + "/reports/{population}.naibr.html" conda: f"{envdir}/r.yaml" - message: - "Creating report: {wildcards.population}" script: "report/naibr.Rmd" -rule sv_report_aggregate: +rule aggregate_report: input: fai = f"Genome/{bn}.fai", bedpe = collect(outdir + "/bedpe/{pop}.bedpe", pop = populations) @@ -269,8 +247,6 @@ rule sv_report_aggregate: outdir + "/reports/naibr.pop.summary.html" conda: f"{envdir}/r.yaml" - message: - "Creating summary report" script: "report/naibr_pop.Rmd" diff --git a/harpy/snakefiles/sv_naibr_pop_phase.smk b/harpy/snakefiles/sv_naibr_pop_phase.smk index 489e7935f..56dc067c0 100644 --- a/harpy/snakefiles/sv_naibr_pop_phase.smk +++ b/harpy/snakefiles/sv_naibr_pop_phase.smk @@ -88,65 +88,55 @@ def get_align_index(wildcards): aln = list(filter(r.match, bamlist)) return aln[0] + ".bai" -rule genome_setup: +rule setup_genome: input: genomefile output: f"Genome/{bn}" container: None - message: - "Preprocessing {input}" shell: "seqtk seq {input} > {output}" -rule genome_faidx: +rule faidx_genome: input: f"Genome/{bn}" output: f"Genome/{bn}.fai" container: None - message: - "Indexing {input}" log: f"Genome/{bn}.faidx.log" shell: "samtools faidx --fai-idx {output} {input} 2> {log}" -rule index_bcf: +rule index_snps: input: vcffile output: vcffile + ".csi" container: None - message: - "Indexing {input}" shell: "bcftools index {input}" -rule index_vcfgz: +rule index_snps_gz: input: vcffile output: vcffile + ".tbi" container: None - message: - "Indexing {input}" shell: "tabix {input}" -rule index_original_alignments: +rule index_alignments: input: bamlist output: [f"{i}.bai" for i in bamlist] threads: workflow.cores - message: - "Indexing alignment files" run: with multiprocessing.Pool(processes=threads) as pool: pool.map(sam_index, input) @@ -168,8 +158,6 @@ rule phase_alignments: 4 conda: f"{envdir}/phase.yaml" - message: - "Phasing alignments: {wildcards.sample}" shell: "whatshap haplotag --sample {wildcards.sample} --linked-read-distance-cutoff {params} --ignore-read-groups --tag-supplementary --output-threads={threads} -o {output.bam} --reference {input.ref} {input.vcf} {input.aln} 2> {output.log}" @@ -180,8 +168,6 @@ rule log_phasing: outdir + "/logs/whatshap-haplotag.log" container: None - message: - "Creating log of alignment phasing" shell: """ echo -e "sample\\ttotal_alignments\\tphased_alignments" > {output} @@ -192,30 +178,26 @@ rule log_phasing: done """ -rule copy_groupings: +rule preproc_groups: input: groupfile output: outdir + "/workflow/sample.groups" - message: - "Logging {input}" run: with open(input[0], "r") as infile, open(output[0], "w") as outfile: _ = [outfile.write(i) for i in infile.readlines() if not i.lstrip().startswith("#")] -rule merge_list: +rule concat_list: input: outdir + "/workflow/sample.groups" output: outdir + "/workflow/pool_samples/{population}.list" - message: - "Creating pooling file list: {wildcards.population}" run: with open(output[0], "w") as fout: for bamfile in popdict[wildcards.population]: _ = fout.write(f"{outdir}/phasedbam/{Path(bamfile).stem}.bam\n") -rule merge_populations: +rule concat_groups: input: bamlist = outdir + "/workflow/pool_samples/{population}.list", bamfiles = lambda wc: collect(outdir + "/phasedbam/{sample}", sample = popdict[wc.population]) @@ -227,12 +209,10 @@ rule merge_populations: 1 container: None - message: - "Merging alignments: {wildcards.population}" shell: "concatenate_bam.py -o {output} -b {input.bamlist} 2> {log}" -rule sort_merged: +rule sort_groups: input: outdir + 
"/workflow/input/concat/{population}.unsort.bam" output: @@ -246,12 +226,10 @@ rule sort_merged: 10 container: None - message: - "Sorting alignments: {wildcards.population}" shell: "samtools sort -@ {threads} -O bam -l 0 -m {resources.mem_mb}M --write-index -o {output.bam}##idx##{output.bai} {input} 2> {log}" -rule create_config: +rule naibr_config: input: outdir + "/workflow/input/{population}.bam" output: @@ -259,8 +237,6 @@ rule create_config: params: lambda wc: wc.get("population"), min(10, workflow.cores) - message: - "Creating naibr config file: {wildcards.population}" run: argdict = process_args(extra) with open(output[0], "w") as conf: @@ -271,7 +247,7 @@ rule create_config: for i in argdict: _ = conf.write(f"{i}={argdict[i]}\n") -rule call_sv: +rule call_variants: input: bam = outdir + "/workflow/input/{population}.bam", bai = outdir + "/workflow/input/{population}.bam.bai", @@ -286,12 +262,10 @@ rule call_sv: 10 conda: f"{envdir}/sv.yaml" - message: - "Calling variants: {wildcards.population}" shell: "naibr {input.conf} > {log} 2>&1" -rule infer_sv: +rule infer_variants: input: bedpe = outdir + "/{population}/{population}.bedpe", refmt = outdir + "/{population}/{population}.reformat.bedpe", @@ -305,8 +279,6 @@ rule infer_sv: outdir = lambda wc: outdir + "/" + wc.get("population") container: None - message: - "Inferring variants from naibr output: {wildcards.population}" shell: """ infer_sv.py {input.bedpe} -f {output.fail} > {output.bedpe} @@ -315,15 +287,13 @@ rule infer_sv: rm -rf {params.outdir} """ -rule merge_variants: +rule aggregate_variants: input: collect(outdir + "/bedpe/{population}.bedpe", population = populations) output: outdir + "/inversions.bedpe", outdir + "/deletions.bedpe", outdir + "/duplications.bedpe" - message: - "Aggregating the detected variants" run: from pathlib import Path with open(output[0], "w") as inversions, open(output[1], "w") as deletions, open(output[2], "w") as duplications: @@ -349,27 +319,23 @@ rule merge_variants: elif record[-1] == "duplication": _ = duplications.write(f"{samplename}\t{line}") -rule infer_sv_report: +rule group_reports: input: fai = f"Genome/{bn}.fai", bedpe = outdir + "/bedpe/{population}.bedpe" output: outdir + "/reports/{population}.naibr.html" - message: - "Creating report: {wildcards.population}" conda: f"{envdir}/r.yaml" script: "report/naibr.Rmd" -rule sv_report_aggregate: +rule aggregate_report: input: fai = f"Genome/{bn}.fai", bedpe = collect(outdir + "/bedpe/{pop}.bedpe", pop = populations) output: outdir + "/reports/naibr.pop.summary.html" - message: - "Creating summary report" conda: f"{envdir}/r.yaml" script: diff --git a/harpy/snp.py b/harpy/snp.py index e1580af24..9bbf05a3e 100644 --- a/harpy/snp.py +++ b/harpy/snp.py @@ -87,8 +87,6 @@ def mpileup(inputs, output_dir, regions, genome, threads, populations, ploidy, e command += f"--configfile {workflowdir}/config.yaml " if hpc: command += f"--workflow-profile {hpc} " - if quiet: - command += "--quiet all " if snakemake is not None: command += snakemake @@ -143,7 +141,7 @@ def mpileup(inputs, output_dir, regions, genome, threads, populations, ploidy, e generate_conda_deps() start_text = f"Samples: {n}{popgroupings}\nOutput Directory: {output_dir}/\nLog: {sm_log}" - launch_snakemake(command, "snp_mpileup", start_text, output_dir, sm_log) + launch_snakemake(command, "snp_mpileup", start_text, output_dir, sm_log, quiet) @click.command(no_args_is_help = True, epilog = "See the documentation for more information: https://pdimens.github.io/harpy/modules/snp") 
@click.option('-x', '--extra-params', type = str, help = 'Additional variant caller parameters, in quotes') @@ -186,8 +184,6 @@ def freebayes(inputs, output_dir, genome, threads, populations, ploidy, regions, command += f"--configfile {workflowdir}/config.yaml " if hpc: command += f"--workflow-profile {hpc} " - if quiet: - command += "--quiet all " if snakemake is not None: command += snakemake @@ -242,7 +238,7 @@ def freebayes(inputs, output_dir, genome, threads, populations, ploidy, regions, generate_conda_deps() start_text = f"Samples: {n}{popgroupings}\nOutput Directory: {output_dir}/\nLog: {sm_log}" - launch_snakemake(command, "snp_freebayes", start_text, output_dir, sm_log) + launch_snakemake(command, "snp_freebayes", start_text, output_dir, sm_log, quiet) snp.add_command(mpileup) snp.add_command(freebayes) diff --git a/harpy/sv.py b/harpy/sv.py index ed8b1d82e..283a7095c 100644 --- a/harpy/sv.py +++ b/harpy/sv.py @@ -82,8 +82,6 @@ def leviathan(inputs, output_dir, genome, min_sv, min_barcodes, iterations, thre command += f"--configfile {workflowdir}/config.yaml " if hpc: command += f"--workflow-profile {hpc} " - if quiet: - command += "--quiet all " if snakemake is not None: command += snakemake @@ -99,7 +97,7 @@ def leviathan(inputs, output_dir, genome, min_sv, min_barcodes, iterations, thre with open(f'{workflowdir}/config.yaml', "w", encoding="utf-8") as config: config.write("workflow: sv leviathan\n") - config.write(f"snakemake_log: {sm_log}") + config.write(f"snakemake_log: {sm_log}\n") config.write(f"output_directory: {output_dir}\n") config.write(f"min_barcodes: {min_barcodes}\n") config.write(f"min_sv: {min_sv}\n") @@ -127,7 +125,7 @@ def leviathan(inputs, output_dir, genome, min_sv, min_barcodes, iterations, thre generate_conda_deps() modetext = "pool-by-group" if populations else "sample-by-sample" start_text = f"Samples: {n}{popgroupings}\nOutput Directory: {output_dir}/\nMode: {modetext}\nLog: {sm_log}" - launch_snakemake(command, "sv_leviathan", start_text, output_dir, sm_log) + launch_snakemake(command, "sv_leviathan", start_text, output_dir, sm_log, quiet) @click.command(no_args_is_help = True, epilog = "See the documentation for more information: https://pdimens.github.io/harpy/modules/sv/naibr/") @click.option('-x', '--extra-params', type = str, help = 'Additional variant caller parameters, in quotes') @@ -174,8 +172,6 @@ def naibr(inputs, output_dir, genome, vcf, min_sv, min_barcodes, min_quality, th command += f"--configfile {workflowdir}/config.yaml " if hpc: command += f"--workflow-profile {hpc} " - if quiet: - command += "--quiet all " if snakemake is not None: command += snakemake @@ -233,7 +229,7 @@ def naibr(inputs, output_dir, genome, vcf, min_sv, min_barcodes, min_quality, th generate_conda_deps() start_text = f"Samples: {n}{popgroupings}\nOutput Directory: {output_dir}/\nMode: {modetext}\nLog: {sm_log}" - launch_snakemake(command, "sv_naibr", start_text, output_dir, sm_log) + launch_snakemake(command, "sv_naibr", start_text, output_dir, sm_log, quiet) sv.add_command(leviathan) sv.add_command(naibr) diff --git a/resources/meta.yaml b/resources/meta.yaml index 98f0e964f..bf223605b 100644 --- a/resources/meta.yaml +++ b/resources/meta.yaml @@ -10,32 +10,34 @@ source: sha256: '{{ sha256 }}' build: - skip: True # [osx] number: 0 - missing_dso_whitelist: - - /lib64/libstdc++.so.6 - - /lib64/libc.so.6 + skip: True # [osx or py < 39] + script_env: + - SETUPTOOLS_SCM_PRETEND_VERSION={{ version }} run_exports: - - {{ pin_subpackage('harpy', max_pin="x.x") }} + - {{ 
pin_subpackage('harpy', max_pin="x") }} script: | mkdir -p ${PREFIX}/bin "${CXX}" harpy/bin/extractReads.cpp -O3 -o ${PREFIX}/bin/extractReads - ${PYTHON} -m pip install . --no-deps -vvv - chmod +x harpy/bin/* - cp harpy/bin/* ${PREFIX}/bin/ + ${PYTHON} -m pip install . --no-deps --no-build-isolation --no-cache-dir -vvv + chmod 0755 harpy/bin/* + cp -rf harpy/bin/* ${PREFIX}/bin/ + entry_points: + - harpy = harpy.__main__:cli requirements: build: - {{ compiler('cxx') }} host: - - python =3.12 + - python - pip + - setuptools-scm >=3.4 run: - apptainer - bcftools =1.20 - pandas - pysam - - python >3.10 + - python - rich-click - snakemake-minimal >7 - samtools =1.20 @@ -48,7 +50,7 @@ test: - "harpy --version" about: - home: "https://github.com/pdimens/harpy/" + home: "https://github.com/pdimens/harpy" license: GPL-3.0-or-later license_family: GPL3 license_file: LICENSE @@ -60,11 +62,11 @@ about: to Snakemake directly. With an emphasis on user-friendliness, parallelization, transparency, and reproducibility, Harpy aims to quickly handle data processing so that you can focus more on analyzing your data. - doc_url: https://pdimens.github.io/harpy/ - dev_url: https://github.com/pdimens/harpy + doc_url: "https://pdimens.github.io/harpy" + dev_url: "https://github.com/pdimens/harpy" extra: + additional-platforms: + - linux-aarch64 recipe-maintainers: - pdimens - skip-lints: - - should_be_noarch_generic \ No newline at end of file diff --git a/test/fakebam/sample1.F.fq.gz b/test/fakebam/sample1.F.fq.gz deleted file mode 100644 index 68f77d794..000000000 Binary files a/test/fakebam/sample1.F.fq.gz and /dev/null differ diff --git a/test/fakebam/sample1.R.fq.gz b/test/fakebam/sample1.R.fq.gz deleted file mode 100644 index 5c2b430b1..000000000 Binary files a/test/fakebam/sample1.R.fq.gz and /dev/null differ
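
Note: the per-rule message: directives removed throughout this patch are superseded by file-based run logging. Below is a minimal sketch of that pattern, assuming a snakemake_log path supplied through the workflow config (as in simulate_linkedreads.smk and snp_mpileup.smk above); the rule and file paths are hypothetical placeholders for illustration, not part of the patch.

    import logging as pylogging

    snakemake_log = config["snakemake_log"]

    onstart:
        # Mirror Snakemake's console log into a persistent per-run log file,
        # standing in for the per-rule message: directives this patch removes.
        extra_logfile_handler = pylogging.FileHandler(snakemake_log)
        logger.logger.addHandler(extra_logfile_handler)

    rule example:  # hypothetical rule, for illustration only
        input:
            "data/in.txt"
        output:
            "results/out.txt"
        shell:
            "cp {input} {output}"

With this handler attached, rule start/finish events that Snakemake already prints are captured in the log file, so suppressing console output (the --quiet handling moved into launch_snakemake in the CLI changes above) loses no information.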