diff --git a/.deprecated/align-minimap.smk b/.deprecated/align-minimap.smk index 07fcfc459..f4adac488 100644 --- a/.deprecated/align-minimap.smk +++ b/.deprecated/align-minimap.smk @@ -14,7 +14,7 @@ extra = config.get("extra", "") bn = os.path.basename(genomefile) genome_zip = True if bn.lower().endswith(".gz") else False bn_idx = f"{bn}.gzi" if genome_zip else f"{bn}.fai" -skipreports = config["skip_reports"] +skip_reports = config["skip_reports"] windowsize = config["depth_windowsize"] molecule_distance = config["molecule_distance"] @@ -323,8 +323,8 @@ rule workflow_summary: default_target: True input: bams = collect(outdir + "/{sample}.{ext}", sample = samplenames, ext = ["bam","bam.bai"]), - samtools = outdir + "/reports/minimap.stats.html" if not skipreports else [] , - bx_reports = collect(outdir + "/reports/{sample}.html", sample = samplenames) if not skipreports else [] + samtools = outdir + "/reports/minimap.stats.html" if not skip_reports else [] , + bx_reports = collect(outdir + "/reports/{sample}.html", sample = samplenames) if not skip_reports else [] params: quality = config["quality"], extra = extra diff --git a/.deprecated/phase.bak.smk b/.deprecated/phase.bak.smk index be147cf09..4afd76175 100644 --- a/.deprecated/phase.bak.smk +++ b/.deprecated/phase.bak.smk @@ -22,7 +22,7 @@ else: fragfile = outdir + "/linkFragments/{sample}.linked.frags" linkarg = "--10x 0" if config["noBX"] else "--10x 1" -skipreports = config["skip_reports"] +skip_reports = config["skip_reports"] try: indelarg = "--indels 1 --ref " + config["indels"] @@ -289,7 +289,7 @@ rule workflow_summary: default_target: True input: vcf = outdir + "/variants.phased.bcf", - reports = outdir + "/reports/phase.html" if not skipreports else [] + reports = outdir + "/reports/phase.html" if not skip_reports else [] params: prune = f"--threshold {pruning}" if pruning > 0 else "--no_prune 1", extra = extra diff --git a/.github/filters.yml b/.github/filters.yml index 32f10ce09..0b1f12112 100644 --- a/.github/filters.yml +++ b/.github/filters.yml @@ -149,6 +149,11 @@ simreads: &simreads - 'extractReads.cpp' - 'harpy/bin/10xtoHaplotag.py' - 'harpy/scripts/LRSIM_harpy.pl' +metassembly: &metassembly + - *common + - *container + - 'harpy/metassembly.py' + - 'harpy/snakefiles/metassembly.smk' other: &other - 'harpy/stitchparams.py' - 'harpy/popgroup.py' @@ -170,4 +175,5 @@ modules: - *naibr - *simvars - *simreads + - *metassembly - *other diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 9dc0b2c22..61b52867d 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -41,6 +41,7 @@ jobs: phase: ${{ steps.filter.outputs.phase }} simvars: ${{ steps.filter.outputs.simvars }} simreads: ${{ steps.filter.outputs.simreads }} + metassembly: ${{ steps.filter.outputs.metassembly }} other: ${{ steps.filter.outputs.other }} container: ${{ steps.filter.outputs.container }} modules: ${{ steps.filter.outputs.modules }} @@ -106,7 +107,7 @@ jobs: id: singularity shell: micromamba-shell {0} if: ${{ needs.changes.outputs.modules == 'true' }} - run: harpy qc --skipreports --quiet test/fastq/sample1.*.fq.gz + run: harpy qc --skip-reports --quiet test/fastq/sample1.*.fq.gz - name: Create Singularity Artifact if: ${{ steps.singularity.outcome == 'success' }} uses: actions/upload-artifact@v4 @@ -231,7 +232,7 @@ jobs: run: harpy qc -x "--trim_poly_g" --quiet test/fastq - name: harpy qc all options shell: micromamba-shell {0} - run: harpy qc -a -d -c --quiet test/fastq + run: harpy qc -a -d -c 21,40,3,0 
--quiet test/fastq deconvolve: needs: [changes, pkgbuild] if: ${{ needs.changes.outputs.deconvolve == 'true' && needs.pkgbuild.result == 'success' }} @@ -787,4 +788,43 @@ jobs: harpy hpc slurm harpy hpc googlebatch harpy hpc lsf - harpy hpc htcondor \ No newline at end of file + harpy hpc htcondor + metassembly: + needs: [changes, pkgbuild] + if: ${{ needs.changes.outputs.metassembly == 'true' && needs.pkgbuild.result == 'success' }} + name: metassembly + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + fetch-depth: 1 + - name: Setup Mamba + uses: mamba-org/setup-micromamba@v1 + env: + ACTIONS_STEP_DEBUG: true + with: + init-shell: bash + generate-run-shell: true + environment-file: resources/harpy.yaml + cache-environment: true + post-cleanup: 'all' + log-level: error + - name: Install Harpy + shell: micromamba-shell {0} + run: | + python3 -m pip install --upgrade build && python3 -m build + pip install dist/*.whl + resources/buildforCI.sh + - name: Clear Space + run: rm -rf /opt/hostedtoolcache && mkdir -p .snakemake/singularity + - name: Download Singularity Artifact + uses: actions/download-artifact@v4 + with: + name: deps-image + path: .snakemake/singularity + - name: Create a metassembly + shell: micromamba-shell {0} + run: harpy metassembly --quiet -m 6000 test/fastq/sample1.* + + diff --git a/Assembly/workflow/assembly.smk b/Assembly/workflow/assembly.smk new file mode 100644 index 000000000..c8010fe68 --- /dev/null +++ b/Assembly/workflow/assembly.smk @@ -0,0 +1,61 @@ +import os +import logging + +onstart: + logger.logger.addHandler(logging.FileHandler(config["snakemake_log"])) +onsuccess: + os.remove(logger.logfile) +onerror: + os.remove(logger.logfile) + +FQ1 = config["inputs"]["fastq_r1"] +FQ2 = config["inputs"]["fastq_r2"] +outdir = config["output_directory"] +envdir = os.getcwd() + "/.harpy_envs" +max_mem = config["spades"]["max_memory"] +k_param = config["spades"]["k"] +extra = config["spades"].get("extra", "") + +rule cloudspades: + input: + FQ_R1 = FQ1, + FQ_R2 = FQ2 + output: + f"{outdir}/contigs.fasta", + f"{outdir}/scaffolds.fasta" + params: + outdir = outdir, + k = k_param, + mem = max_mem // 1000, + extra = extra + log: + outdir + "/logs/assembly.log" + conda: + f"{envdir}/assembly.yaml" + threads: + workflow.cores + resources: + mem_mb=max_mem + shell: + "spades.py -t {threads} -m {params.mem} -k {params.k} {params.extra} --gemcode1-1 {input.FQ_R1} --gemcode1-2 {input.FQ_R2} -o {params.outdir} --isolate > {log}" + +rule workflow_summary: + default_target: True + input: + f"{outdir}/scaffolds.fasta" + params: + k_param = k_param, + max_mem = max_mem // 1000, + extra = extra + run: + summary_template = f""" +The harpy assemble workflow ran using these parameters: + +Reads were assembled using cloudspades: + spades.py -t THREADS -m {params.max_mem} --gemcode1-1 FQ1 --gemcode1-2 FQ2 --isolate -k {params.k_param} {params.extra} + +The Snakemake workflow was called via command line: + {config["workflow_call"]} +""" + with open(outdir + "/workflow/assembly.summary", "w") as f: + f.write(summary_template) \ No newline at end of file diff --git a/Assembly/workflow/config.yaml b/Assembly/workflow/config.yaml new file mode 100644 index 000000000..4035a8418 --- /dev/null +++ b/Assembly/workflow/config.yaml @@ -0,0 +1,13 @@ +workflow: assembly +snakemake_log: Assembly/logs/snakemake/assembly.1.15_10_2024.log +output_directory: Assembly +barcode_tag: BX +spades: + assembler: None + max_memory: 250000 + k: auto +skip_reports: False
+workflow_call: snakemake --rerun-incomplete --show-failed-logs --rerun-triggers input mtime params --nolock --software-deployment-method conda --conda-prefix .snakemake/conda --cores 4 --directory . --snakefile Assembly/workflow/assembly.smk --configfile Assembly/workflow/config.yaml +inputs: + fastq_r1: test/fastq/sample1.F.fq.gz + fastq_r2: test/fastq/sample1.R.fq.gz diff --git a/README.md b/README.md index 6409890ed..b35c786cf 100644 --- a/README.md +++ b/README.md @@ -52,6 +52,6 @@ harpy ``` ## 🌈 Get Started -No data? No problem! Harpy lets you [simulate genomic variants](https://pdimens.github.io/harpy/modules/simulate/simulate-variants/) -from an existing genome and can also [create haplotag data](https://pdimens.github.io/harpy/modules/simulate/simulate-linkedreads/) +No data? No problem! Harpy lets you [simulate genomic variants](https://pdimens.github.io/harpy/workflows/simulate/simulate-variants/) +from an existing genome and can also [create haplotag data](https://pdimens.github.io/harpy/workflows/simulate/simulate-linkedreads/) from an existing genome! You can see what haplotag data (and Harpy) are like without paying a single cent! \ No newline at end of file diff --git a/harpy/__main__.py b/harpy/__main__.py index 0289567b8..f1bb369dc 100644 --- a/harpy/__main__.py +++ b/harpy/__main__.py @@ -58,13 +58,13 @@ def cli(): cli.add_command(deconvolve.deconvolve) cli.add_command(metassembly.metassembly) -## the modules ## +## the workflows ## click.rich_click.COMMAND_GROUPS = { "harpy": [ { - "name": "Modules", - "commands": ["demultiplex", "metassembly","qc", "align","snp","sv","impute","phase", "simulate"], + "name": "workflows", + "commands": ["demultiplex","qc", "align","snp","sv","impute","phase", "simulate", "metassembly"], }, { "name": "Other Commands", diff --git a/harpy/_conda.py b/harpy/_conda.py index d7231afa2..efe8ba934 100644 --- a/harpy/_conda.py +++ b/harpy/_conda.py @@ -17,11 +17,11 @@ def create_conda_recipes(): "conda-forge::libzlib", "conda-forge::xz" ], - "metassembly" : [ - "bioconda::spades=4.0" + "assembly": [ + "conda-forge::python=3" ], - "athena": [ - "bioconda::athena_meta" + "metassembly": [ + "bioconda::athena_meta=1.2" ], "phase" : [ "bioconda::hapcut2", @@ -79,3 +79,10 @@ def create_conda_recipes(): yml.write("\n - ".join(condachannels)) yml.write("\ndependencies:\n - ") yml.write("\n - ".join(deps) + "\n") + + # post-deployment scripts + with open(".harpy_envs/assembly.post-deploy.sh", "w", encoding="utf-8") as shellscript: + shellscript.write("wget -O .spades.tar.gz https://github.com/ablab/spades/releases/download/v4.0.0/SPAdes-4.0.0-Linux.tar.gz\n") + shellscript.write("tar -xvzf .spades.tar.gz && rm .spades.tar.gz\n") + shellscript.write("mv SPAdes-4.0.0-Linux/bin/* ${CONDA_PREFIX}/bin && mv SPAdes-4.0.0-Linux/share/* ${CONDA_PREFIX}/share\n") + shellscript.write("rm -r SPAdes-4.0.0-Linux\n") \ No newline at end of file diff --git a/harpy/_launch.py b/harpy/_launch.py index 6a05b3c15..6a9855780 100644 --- a/harpy/_launch.py +++ b/harpy/_launch.py @@ -6,11 +6,15 @@ import glob import subprocess from rich import print as rprint -from rich.progress import Progress, BarColumn, TextColumn, TimeElapsedColumn, SpinnerColumn from rich.console import Console -from ._misc import gzip_file +from ._misc import gzip_file, harpy_progressbar, harpy_pulsebar from ._printing import print_onsuccess, print_onstart, print_onerror, print_setup_error +EXIT_CODE_SUCCESS = 0 +EXIT_CODE_GENERIC_ERROR = 1 +EXIT_CODE_CONDA_ERROR = 2 +EXIT_CODE_RUNTIME_ERROR = 3 + def 
iserror(text): """logical check for erroring trigger words in snakemake output""" return "Exception" in text or "Error" in text or "MissingOutputException" in text @@ -38,8 +42,8 @@ def launch_snakemake(sm_args, workflow, starttext, outdir, sm_logfile, quiet): output = process.stderr.readline() # check for syntax errors at the very beginning if process.poll() or iserror(output): - exitcode = 0 if process.poll() == 0 else 1 - exitcode = 2 if iserror(output) else exitcode + exitcode = EXIT_CODE_SUCCESS if process.poll() == 0 else EXIT_CODE_GENERIC_ERROR + exitcode = EXIT_CODE_CONDA_ERROR if "Conda" in output else exitcode break if not quiet: console = Console() @@ -50,12 +54,11 @@ def launch_snakemake(sm_args, workflow, starttext, outdir, sm_logfile, quiet): while output.startswith("Building DAG of jobs...") or output.startswith("Assuming"): output = process.stderr.readline() if process.poll() or iserror(output): - exitcode = 0 if process.poll() == 0 else 1 + exitcode = EXIT_CODE_SUCCESS if process.poll() == 0 else EXIT_CODE_GENERIC_ERROR break - while not output.startswith("Job stats:"): # print dependency text only once - if "Downloading and installing remote packages" in output: + if "Downloading and installing remote packages" in output or "Running post-deploy" in output: deps = True deploy_text = "[dim]Installing software dependencies" break @@ -64,40 +67,29 @@ deploy_text = "[dim]Downloading software container" break if "Nothing to be" in output: - exitcode = 0 + exitcode = EXIT_CODE_SUCCESS + break + if "MissingInput" in output: + exitcode = EXIT_CODE_GENERIC_ERROR break output = process.stderr.readline() # if dependency text present, print pulsing progress bar if deps: - with Progress( - TextColumn("[progress.description]{task.description}"), - BarColumn(bar_width= 70 - len(deploy_text), pulse_style = "grey46"), - TimeElapsedColumn(), - transient=True, - disable=quiet - ) as progress: + with harpy_pulsebar(quiet, deploy_text) as progress: progress.add_task("[dim]" + deploy_text, total = None) while not output.startswith("Job stats:"): output = process.stderr.readline() if process.poll() or iserror(output): - exitcode = 0 if process.poll() == 0 else 2 + exitcode = EXIT_CODE_SUCCESS if process.poll() == 0 else EXIT_CODE_CONDA_ERROR break progress.stop() if process.poll() or exitcode: break if "Nothing to be" in output: - exitcode = 0 + exitcode = EXIT_CODE_SUCCESS break - with Progress( - SpinnerColumn(spinner_name = "arc", style = "dim"), - TextColumn("[progress.description]{task.description}"), - BarColumn(complete_style="yellow", finished_style="blue"), - TextColumn("[progress.remaining]{task.completed}/{task.total}", style = "magenta"), - TimeElapsedColumn(), - transient=True, - disable=quiet - ) as progress: + with harpy_progressbar(quiet) as progress: # process the job summary job_inventory = {} while True: @@ -114,7 +106,7 @@ def launch_snakemake(sm_args, workflow, starttext, outdir, sm_logfile, quiet): pass # checkpoint if process.poll() or iserror(output): - exitcode = 0 if process.poll() == 0 else 1 + exitcode = EXIT_CODE_SUCCESS if process.poll() == 0 else EXIT_CODE_GENERIC_ERROR break if process.poll() or exitcode: break @@ -124,11 +116,11 @@ def launch_snakemake(sm_args, workflow, starttext, outdir, sm_logfile, quiet): output = process.stderr.readline() if iserror(output) or process.poll() == 1: progress.stop() - exitcode = 3 + exitcode = EXIT_CODE_RUNTIME_ERROR break if process.poll() == 0 or
output.startswith("Complete log:") or output.startswith("Nothing to be"): progress.stop() - exitcode = 0 if process.poll() == 0 else 3 + exitcode = EXIT_CODE_SUCCESS if process.poll() == 0 else EXIT_CODE_RUNTIME_ERROR break # add new progress bar track if the rule doesn't have one yet rulematch = re.search(r"rule\s\w+:", output) diff --git a/harpy/_misc.py b/harpy/_misc.py index 618edc883..64583aaec 100644 --- a/harpy/_misc.py +++ b/harpy/_misc.py @@ -9,11 +9,39 @@ from pathlib import Path from importlib_resources import files import click +from rich.progress import Progress, BarColumn, TextColumn, TimeElapsedColumn, SpinnerColumn import harpy.scripts import harpy.reports import harpy.snakefiles from ._printing import print_error, print_solution +def harpy_progressbar(quiet): + """ + The pre-configured transient progress bar that workflows and validations use + """ + return Progress( + SpinnerColumn(spinner_name = "arc", style = "dim"), + TextColumn("[progress.description]{task.description}"), + BarColumn(complete_style="yellow", finished_style="blue"), + TextColumn("[progress.remaining]{task.completed}/{task.total}", style = "magenta"), + TimeElapsedColumn(), + transient=True, + disable=quiet + ) + +def harpy_pulsebar(quiet, desc_text): + """ + The pre-configured transient pulsing progress bar that workflows use, typically for + installing the software dependencies/container + """ + return Progress( + TextColumn("[progress.description]{task.description}"), + BarColumn(bar_width= 70 - len(desc_text), pulse_style = "grey46"), + TimeElapsedColumn(), + transient=True, + disable=quiet + ) + def symlink(original, destination): """Create a symbolic link from original -> destination if the destination doesn't already exist.""" if not (Path(destination).is_symlink() or Path(destination).exists()): @@ -73,26 +101,38 @@ def gzip_file(infile): shutil.copyfileobj(f_in, f_out) os.remove(infile) -class IntPair(click.ParamType): - """A class for a click type which accepts 2 integers, separated by a comma.""" - name = "int_pair" +class IntList(click.ParamType): + """A class for a click type which accepts an arbitrary number of integers, separated by a comma.""" + name = "int_list" + def __init__(self, max_entries): + super().__init__() + self.max_entries = max_entries + def convert(self, value, param, ctx): try: - parts = value.split(',') - if len(parts) != 2: - self.fail(f"{value} is not a valid int pair. The value should be two integers separated by a comma.", param, ctx) + parts = [i.strip() for i in value.split(',')] + if len(parts) != self.max_entries: + raise ValueError + for i in parts: + try: + int(i) + except: + raise ValueError return [int(i) for i in parts] except ValueError: - self.fail(f"{value} is not a valid int pair. The value should be two integers separated by a comma.", param, ctx) + self.fail(f"{value} is not a valid list of integers. 
The value should be {self.max_entries} integers separated by a comma.", param, ctx) -class IntQuartet(click.ParamType): - """A class for a click type which accepts 4 integers, separated by a comma.""" - name = "int_pair" +class KParam(click.ParamType): + """A class for a click type which accepts any number of odd integers separated by a comma, or the word auto.""" + name = "k_param" def convert(self, value, param, ctx): try: - parts = value.split(',') - if len(parts) != 4: - self.fail(f"{value} is not a valid set of 4 integers separated by a comma.", param, ctx) + if value == "auto": + return value + parts = [i.strip() for i in value.split(',')] + for i in parts: + if int(i) % 2 == 0 or int(i) > 128: + raise ValueError return [int(i) for i in parts] except ValueError: - self.fail(f"{value} is not a valid set of 4 integers separated by a comma.", param, ctx) + self.fail(f"{value} is not 'auto' or odd integers <128 separated by a comma.", param, ctx) \ No newline at end of file diff --git a/harpy/_validations.py b/harpy/_validations.py index 5e0f46bf9..479d26641 100644 --- a/harpy/_validations.py +++ b/harpy/_validations.py @@ -11,6 +11,26 @@ from rich.table import Table import rich_click as click from ._printing import print_error, print_notice, print_solution, print_solution_with_culprits +from ._misc import harpy_progressbar +from concurrent.futures import ThreadPoolExecutor, as_completed + +def is_gzip(file_path): + """helper function to determine if a file is gzipped""" + try: + with gzip.open(file_path, 'rt') as f: + f.read(10) + return True + except gzip.BadGzipFile: + return False + +def is_plaintext(file_path): + """helper function to determine if a file is plaintext""" + try: + with open(file_path, 'r') as f: + f.read(10) + return True + except UnicodeDecodeError: + return False def check_envdir(dirpath): """Check that the provided dir exists and contains the necessary environment definitions""" @@ -18,7 +38,7 @@ def check_envdir(dirpath): print_error("missing conda files", "This working directory does not contain the expected directory of conda environment definitions ([blue bold].harpy_envs/[/blue bold])\n - use [green bold]--conda[/green bold] to recreate it") sys.exit(1) envlist = os.listdir(dirpath) - envs = ["align", "phase", "qc", "r", "simulations", "stitch", "variants"] + envs = ["align", "assembly", "metassembly", "phase", "qc", "r", "simulations", "stitch", "variants"] errcount = 0 errtable = Table(show_footer=True, box=box.SIMPLE) errtable.add_column("File", justify="left", style="blue", no_wrap=True) @@ -146,16 +166,26 @@ def check_impute_params(parameters): rprint(errtable, file = sys.stderr) sys.exit(1) -def validate_bam_RG(bamlist): +def validate_bam_RG(bamlist, threads, quiet): """Validate BAM files bamlist to make sure the sample name inferred from the file matches the @RG tag within the file""" culpritfiles = [] culpritIDs = [] - for i in bamlist: - samplename = Path(i).stem - samview = subprocess.run(f"samtools samples {i}".split(), stdout = subprocess.PIPE).stdout.decode('utf-8').split() + def check_RG(bamfile): + samplename = Path(bamfile).stem + samview = subprocess.run(f"samtools samples {bamfile}".split(), stdout = subprocess.PIPE).stdout.decode('utf-8').split() if samplename != samview[0]: - culpritfiles.append(os.path.basename(i)) - culpritIDs.append(samview[0]) + return os.path.basename(bamfile), samview[0] + + with harpy_progressbar(quiet) as progress: + task_progress = progress.add_task("[green]Checking RG tags...", total=len(bamlist)) + with 
ThreadPoolExecutor(max_workers=threads) as executor: + futures = [executor.submit(check_RG, i) for i in bamlist] + for future in as_completed(futures): + result = future.result() + progress.update(task_progress, advance=1) + if result: + culpritfiles.append(result[0]) + culpritIDs.append(result[1]) if len(culpritfiles) > 0: print_error("sample ID mismatch",f"There are [bold]{len(culpritfiles)}[/bold] alignment files whose RG tags do not match their filenames.") @@ -309,10 +339,9 @@ def validate_regions(regioninput, genome): # check if the region is in the genome contigs = {} - if genome.lower().endswith("gz"): - with gzip.open(genome, "r") as fopen: + if is_gzip(genome): + with gzip.open(genome, "rt") as fopen: for line in fopen: - line = line.decode() if line.startswith(">"): cn = line.rstrip("\n").lstrip(">").split()[0] contigs[cn] = 0 @@ -372,33 +401,14 @@ def validate_regions(regioninput, genome): sys.exit(1) return "file" - -def is_gzip(file_path): - """helper function to determine if a file is gzipped""" - try: - with gzip.open(file_path, 'rt') as f: - f.read(10) - return True - except: - return False - -def is_plaintext(file_path): - """helper function to determine if a file is plaintext""" - try: - with open(file_path, 'r') as f: - f.read(10) - return True - except: - return False - -def check_fasta(genofile): +def check_fasta(genofile, quiet): """perform validations on fasta file for extensions and file contents""" ext_options = [".fasta", ".fas", ".fa", ".fna", ".ffn", ".faa", ".mpfa", ".frn"] ext_correct = 0 for i in ext_options: if genofile.lower().endswith(i) or genofile.lower().endswith(i + ".gz"): ext_correct += 1 - if ext_correct == 0: + if ext_correct == 0 and not quiet: print_notice(f"[blue]{genofile}[/blue] has an unfamiliar FASTA file extension. Common FASTA file extensions are: [green]" + ", ".join(ext_options) + "[/green] and may also be gzipped.") # validate fasta file contents @@ -450,3 +460,39 @@ def check_fasta(genofile): print_error("sequences absent", f"No sequences detected in [blue]{genofile}[/blue].") print_solution(solutiontext + "\nSee the FASTA file spec and try again after making the appropriate changes: https://www.ncbi.nlm.nih.gov/genbank/fastaformat/") sys.exit(1) + + +def validate_fastq_bx(fastq_list, threads, quiet): + def validate(fastq): + BX = False + BC = False + if is_gzip(fastq): + fq = gzip.open(fastq, "rt") + else: + fq = open(fastq, "r") + with fq: + while True: + line = fq.readline() + if not line: + break + if not line.startswith("@"): + continue + BX = True if "BX:Z" in line else BX + BC = True if "BC:Z" in line else BC + if BX and BC: + print_error("clashing barcode tags", f"Both [green bold]BC:Z[/green bold] and [green bold]BX:Z[/green bold] tags were detected in the read headers for [blue]{os.path.basename(fastq)}[/blue]. Athena accepts [bold]only[/bold] one of [green bold]BC:Z[/green bold] or [green bold]BX:Z[/green bold].") + print_solution("Check why your data has both tags in use and remove/rename one of the tags.") + sys.exit(1) + # check for one or the other after parsing is done + if not BX and not BC: + print_error("no barcodes found",f"No [green bold]BC:Z[/green bold] or [green bold]BX:Z[/green bold] tags were detected in read headers for [blue]{os.path.basename(fastq)}[/blue]. 
Athena requires the linked-read barcode to be present as either [green bold]BC:Z[/green bold] or [green bold]BX:Z[/green bold] tags.") + print_solution("Check that this is linked-read data and that the barcode is demultiplexed from the sequence line into the read header as either a `BX:Z` or `BC:Z` tag.") + sys.exit(1) + + # parallelize over the fastq list + with harpy_progressbar(quiet) as progress: + task_progress = progress.add_task("[green]Validating FASTQ inputs...", total=len(fastq_list)) + with ThreadPoolExecutor(max_workers=threads) as executor: + futures = [executor.submit(validate, i) for i in fastq_list] + for future in as_completed(futures): + progress.update(task_progress, advance=1) \ No newline at end of file diff --git a/harpy/align.py b/harpy/align.py index 4b4393613..e2cae6bd7 100644 --- a/harpy/align.py +++ b/harpy/align.py @@ -37,7 +37,7 @@ def align(): }, { "name": "Workflow Controls", - "options": ["--conda", "--depth-window", "--hpc", "--output-dir", "--quiet", "--skipreports", "--snakemake", "--threads", "--help"], + "options": ["--conda", "--depth-window", "--hpc", "--output-dir", "--quiet", "--skip-reports", "--snakemake", "--threads", "--help"], }, ], "harpy align ema": [ @@ -47,7 +47,7 @@ def align(): }, { "name": "Workflow Controls", - "options": ["--conda", "--depth-window", "--hpc", "--output-dir", "--quiet", "--skipreports", "--snakemake", "--threads", "--help"], + "options": ["--conda", "--depth-window", "--hpc", "--output-dir", "--quiet", "--skip-reports", "--snakemake", "--threads", "--help"], }, ], "harpy align strobe": [ @@ -57,12 +57,12 @@ def align(): }, { "name": "Workflow Controls", - "options": ["--conda", "--depth-window", "--hpc", "--output-dir", "--quiet", "--skipreports", "--snakemake", "--threads", "--help"], + "options": ["--conda", "--depth-window", "--hpc", "--output-dir", "--quiet", "--skip-reports", "--snakemake", "--threads", "--help"], }, ] } -@click.command(no_args_is_help = True, epilog= "See the documentation for more information: https://pdimens.github.io/harpy/modules/align/bwa/") +@click.command(no_args_is_help = True, epilog= "See the documentation for more information: https://pdimens.github.io/harpy/workflows/align/bwa/") @click.option('-g', '--genome', type=click.Path(exists=True, dir_okay=False, readable=True), required = True, help = 'Genome assembly for read mapping') @click.option('-w', '--depth-window', default = 50000, show_default = True, type = int, help = 'Interval size (in bp) for depth stats') @click.option('-x', '--extra-params', type = str, help = 'Additional bwa mem parameters, in quotes') @@ -75,10 +75,10 @@ @click.option('--setup-only', is_flag = True, hidden = True, default = False, help = 'Setup the workflow and exit') @click.option('--hpc', type = click.Path(exists = True, file_okay = False, readable=True), help = 'Directory with HPC submission `config.yaml` file') @click.option('--quiet', is_flag = True, show_default = True, default = False, help = 'Don\'t show output text while running') -@click.option('--skipreports', is_flag = True, show_default = True, default = False, help = 'Don\'t generate HTML reports') +@click.option('--skip-reports', is_flag = True, show_default = True, default = False, help = 'Don\'t generate HTML reports') @click.option('--snakemake', type = str, help = 'Additional Snakemake parameters, in quotes') @click.argument('inputs', required=True, type=click.Path(exists=True, readable=True), nargs=-1) -def bwa(inputs, output_dir, genome, depth_window, threads,
keep_unmapped, extra_params, min_quality, molecule_distance, snakemake, skipreports, quiet, hpc, conda, setup_only): +def bwa(inputs, output_dir, genome, depth_window, threads, keep_unmapped, extra_params, min_quality, molecule_distance, snakemake, skip_reports, quiet, hpc, conda, setup_only): """ Align sequences to genome using `BWA MEM` @@ -102,7 +102,7 @@ def bwa(inputs, output_dir, genome, depth_window, threads, keep_unmapped, extra_ os.makedirs(f"{workflowdir}/", exist_ok= True) fqlist, sample_count = parse_fastq_inputs(inputs) - check_fasta(genome) + check_fasta(genome, quiet) fetch_rule(workflowdir, "align_bwa.smk") fetch_report(workflowdir, "align_stats.Rmd") fetch_report(workflowdir, "align_bxstats.Rmd") @@ -117,7 +117,7 @@ def bwa(inputs, output_dir, genome, depth_window, threads, keep_unmapped, extra_ config.write(f"keep_unmapped: {keep_unmapped}\n") config.write(f"molecule_distance: {molecule_distance}\n") config.write(f"depth_windowsize: {depth_window}\n") - config.write(f"skip_reports: {skipreports}\n") + config.write(f"skip_reports: {skip_reports}\n") if extra_params is not None: config.write(f"extra: {extra_params}\n") config.write(f"workflow_call: {command}\n") @@ -140,7 +140,7 @@ def bwa(inputs, output_dir, genome, depth_window, threads, keep_unmapped, extra_ start_text.add_row("Workflow Log:", sm_log.replace(f"{output_dir}/", "") + "[dim].gz") launch_snakemake(command, "align_bwa", start_text, output_dir, sm_log, quiet) -@click.command(no_args_is_help = True, context_settings=dict(allow_interspersed_args=False), epilog = "See the documentation for more information: https://pdimens.github.io/harpy/modules/align/ema") +@click.command(no_args_is_help = True, context_settings=dict(allow_interspersed_args=False), epilog = "See the documentation for more information: https://pdimens.github.io/harpy/workflows/align/ema") @click.option('-x', '--extra-params', type = str, help = 'Additional ema align parameters, in quotes') @click.option('-w', '--depth-window', default = 50000, show_default = True, type = int, help = 'Interval size (in bp) for depth stats') @click.option('-b', '--ema-bins', default = 500, show_default = True, type = click.IntRange(1,1000), help="Number of barcode bins") @@ -155,10 +155,10 @@ def bwa(inputs, output_dir, genome, depth_window, threads, keep_unmapped, extra_ @click.option('--conda', is_flag = True, default = False, help = 'Use conda/mamba instead of container') @click.option('--hpc', type = click.Path(exists = True, file_okay = False, readable=True), help = 'Directory with HPC submission `config.yaml` file') @click.option('--quiet', is_flag = True, show_default = True, default = False, help = 'Don\'t show output text while running') -@click.option('--skipreports', is_flag = True, show_default = True, default = False, help = 'Don\'t generate HTML reports') +@click.option('--skip-reports', is_flag = True, show_default = True, default = False, help = 'Don\'t generate HTML reports') @click.option('--snakemake', type = str, help = 'Additional Snakemake parameters, in quotes') @click.argument('inputs', required=True, type=click.Path(exists=True, readable=True), nargs=-1) -def ema(inputs, output_dir, platform, barcode_list, genome, depth_window, keep_unmapped, threads, ema_bins, skipreports, extra_params, min_quality, snakemake, quiet, hpc, conda, setup_only): +def ema(inputs, output_dir, platform, barcode_list, genome, depth_window, keep_unmapped, threads, ema_bins, skip_reports, extra_params, min_quality, snakemake, quiet, hpc, conda, setup_only): """ 
Align sequences to genome using `EMA` @@ -197,7 +197,7 @@ def ema(inputs, output_dir, platform, barcode_list, genome, depth_window, keep_u os.makedirs(f"{workflowdir}/", exist_ok= True) fqlist, sample_count = parse_fastq_inputs(inputs) - check_fasta(genome) + check_fasta(genome, quiet) fetch_rule(workflowdir, "align_ema.smk") fetch_report(workflowdir, "align_stats.Rmd") fetch_report(workflowdir, "align_bxstats.Rmd") @@ -213,7 +213,7 @@ def ema(inputs, output_dir, platform, barcode_list, genome, depth_window, keep_u config.write(f"platform: {platform}\n") config.write(f"EMA_bins: {ema_bins}\n") config.write(f"depth_windowsize: {depth_window}\n") - config.write(f"skip_reports: {skipreports}\n") + config.write(f"skip_reports: {skip_reports}\n") if extra_params is not None: config.write(f"extra: {extra_params}\n") config.write(f"workflow_call: {command}\n") @@ -239,7 +239,7 @@ def ema(inputs, output_dir, platform, barcode_list, genome, depth_window, keep_u start_text.add_row("Workflow Log:", sm_log.replace(f"{output_dir}/", "") + "[dim].gz") launch_snakemake(command, "align_ema", start_text, output_dir, sm_log, quiet) -@click.command(no_args_is_help = True, epilog= "See the documentation for more information: https://pdimens.github.io/harpy/modules/align/minimap/") +@click.command(no_args_is_help = True, epilog= "See the documentation for more information: https://pdimens.github.io/harpy/workflows/align/minimap/") @click.option('-g', '--genome', type=click.Path(exists=True, dir_okay=False, readable=True), required = True, help = 'Genome assembly for read mapping') @click.option('-w', '--depth-window', default = 50000, show_default = True, type = int, help = 'Interval size (in bp) for depth stats') @click.option('-x', '--extra-params', type = str, help = 'Additional aligner parameters, in quotes') @@ -253,10 +253,10 @@ def ema(inputs, output_dir, platform, barcode_list, genome, depth_window, keep_u @click.option('--setup-only', is_flag = True, hidden = True, default = False, help = 'Setup the workflow and exit') @click.option('--hpc', type = click.Path(exists = True, file_okay = False, readable=True), help = 'Directory with HPC submission `config.yaml` file') @click.option('--quiet', is_flag = True, show_default = True, default = False, help = 'Don\'t show output text while running') -@click.option('--skipreports', is_flag = True, show_default = True, default = False, help = 'Don\'t generate HTML reports') +@click.option('--skip-reports', is_flag = True, show_default = True, default = False, help = 'Don\'t generate HTML reports') @click.option('--snakemake', type = str, help = 'Additional Snakemake parameters, in quotes') @click.argument('inputs', required=True, type=click.Path(exists=True, readable=True), nargs=-1) -def strobe(inputs, output_dir, genome, read_length, keep_unmapped, depth_window, threads, extra_params, min_quality, molecule_distance, snakemake, skipreports, quiet, hpc, conda, setup_only): +def strobe(inputs, output_dir, genome, read_length, keep_unmapped, depth_window, threads, extra_params, min_quality, molecule_distance, snakemake, skip_reports, quiet, hpc, conda, setup_only): """ Align sequences to genome using `strobealign` @@ -283,7 +283,7 @@ def strobe(inputs, output_dir, genome, read_length, keep_unmapped, depth_window, os.makedirs(f"{workflowdir}/", exist_ok= True) fqlist, sample_count = parse_fastq_inputs(inputs) - check_fasta(genome) + check_fasta(genome, quiet) fetch_rule(workflowdir, "align_strobealign.smk") fetch_report(workflowdir, "align_stats.Rmd") 
fetch_report(workflowdir, "align_bxstats.Rmd") @@ -300,7 +300,7 @@ def strobe(inputs, output_dir, genome, read_length, keep_unmapped, depth_window, config.write(f"molecule_distance: {molecule_distance}\n") config.write(f"average_read_length: {read_length}\n") config.write(f"depth_windowsize: {depth_window}\n") - config.write(f"skip_reports: {skipreports}\n") + config.write(f"skip_reports: {skip_reports}\n") if extra_params is not None: config.write(f"extra: {extra_params}\n") config.write(f"workflow_call: {command}\n") diff --git a/harpy/assembly.py b/harpy/assembly.py new file mode 100644 index 000000000..813417866 --- /dev/null +++ b/harpy/assembly.py @@ -0,0 +1,100 @@ +"""Perform a linked-read aware metassembly""" + +import os +import sys +from rich import box +from rich.table import Table +import rich_click as click +from ._conda import create_conda_recipes +from ._launch import launch_snakemake +from ._misc import fetch_rule, snakemake_log, KParam +from ._validations import validate_fastq_bx + +docstring = { + "harpy assemble": [ + { + "name": "Parameters", + "options": ["--bx-tag", "--extra-params", "--kmer-length", "--max-memory", "--metassembly"], + }, + { + "name": "Workflow Controls", + "options": ["--hpc", "--output-dir", "--quiet", "--skip-reports", "--snakemake", "--threads", "--help"], + }, + ] +} + +@click.command(no_args_is_help = True, context_settings=dict(allow_interspersed_args=False), epilog = "See the documentation for more information: https://pdimens.github.io/harpy/workflows/qc") +@click.option('-b', '--bx-tag', type = click.Choice(['BX', 'BC'], case_sensitive=False), default = "BX", show_default=True, help = "The header tag with the barcode (`BX` or `BC`)") +@click.option('-x', '--extra-params', type = str, help = 'Additional spades parameters, in quotes') +@click.option('-m', '--max-memory', type = click.IntRange(min = 1000, max_open = True), show_default = True, default = 250000, help = 'Maximum memory for metaSPADES to use, in megabytes') +@click.option('-a', '--metassembly', is_flag = True, show_default = True, default = False, help = 'Perform a metagenomic assembly') +@click.option('-k', '--kmer-length', type = KParam(), show_default = True, default = "auto", help = 'K values to use for assembly (`odd` and `<128`)') +@click.option('-o', '--output-dir', type = click.Path(exists = False), default = "Metassembly", show_default=True, help = 'Output directory name') +@click.option('-t', '--threads', default = 4, show_default = True, type = click.IntRange(min = 1, max_open = True), help = 'Number of threads to use') +@click.option('--setup-only', is_flag = True, hidden = True, default = False, help = 'Setup the workflow and exit') +@click.option('--hpc', type = click.Path(exists = True, file_okay = False, readable=True), help = 'Directory with HPC submission `config.yaml` file') +@click.option('--quiet', is_flag = True, show_default = True, default = False, help = 'Don\'t show output text while running') +@click.option('--skip-reports', is_flag = True, show_default = True, default = False, help = 'Don\'t generate HTML reports') +@click.option('--snakemake', type = str, help = 'Additional Snakemake parameters, in quotes') +@click.argument('fastq_r1', required=True, type=click.Path(exists=True, readable=True), nargs=1) +@click.argument('fastq_r2', required=True, type=click.Path(exists=True, readable=True), nargs=1) +def assemble(fastq_r1, fastq_r2, bx_tag, kmer_length, max_memory, metassembly, output_dir, extra_params, threads, snakemake, skip_reports, quiet, hpc, 
setup_only): + """ + Perform an assembly from linked-read sequences + + Use the `--metassembly` flag to perform a metagenome assembly (defaults to an assembly of a single sample). + The linked-read barcodes must be in either a `BX:Z` or `BC:Z` FASTQ header tag, specified with `--bx-tag`. + If specifying `K` values, they must be separated by commas and without spaces (e.g. `-k 15,23,51`). + """ + output_dir = output_dir.rstrip("/") + asm = "metassembly" if metassembly else "assembly" + workflowdir = f"{output_dir}/workflow" + sdm = "conda" #if conda else "conda apptainer" + command = f'snakemake --rerun-incomplete --show-failed-logs --rerun-triggers input mtime params --nolock --software-deployment-method {sdm} --conda-prefix .snakemake/conda --cores {threads} --directory . ' + command += f"--snakefile {workflowdir}/{asm}.smk " + command += f"--configfile {workflowdir}/config.yaml " + if hpc: + command += f"--workflow-profile {hpc} " + if snakemake is not None: + command += snakemake + + validate_fastq_bx([fastq_r1, fastq_r2], threads, quiet) + os.makedirs(workflowdir, exist_ok=True) + fetch_rule(workflowdir, f"{asm}.smk") + os.makedirs(f"{output_dir}/logs/snakemake", exist_ok = True) + sm_log = snakemake_log(output_dir, asm) + + with open(f"{workflowdir}/config.yaml", "w", encoding="utf-8") as config: + config.write(f"workflow: {asm}\n") + config.write(f"snakemake_log: {sm_log}\n") + config.write(f"output_directory: {output_dir}\n") + config.write(f"barcode_tag: {bx_tag.upper()}\n") + config.write("spades:\n") + config.write(f" max_memory: {max_memory}\n") + if kmer_length == "auto": + config.write(f" k: auto\n") + else: + config.write(f" k: " + ",".join(map(str,kmer_length)) + "\n") + if extra_params: + config.write(f" extra: {extra_params}\n") + config.write(f"skip_reports: {skip_reports}\n") + config.write(f"workflow_call: {command}\n") + config.write("inputs:\n") + config.write(f" fastq_r1: {fastq_r1}\n") + config.write(f" fastq_r2: {fastq_r2}\n") + + create_conda_recipes() + if setup_only: + sys.exit(0) + + start_text = Table(show_header=False,pad_edge=False, show_edge=False, padding = (0,0), box=box.SIMPLE) + start_text.add_column("detail", justify="left", style="light_steel_blue", no_wrap=True) + start_text.add_column("value", justify="left") + start_text.add_row("Barcode Tag: ", bx_tag.upper()) + if kmer_length == "auto": + start_text.add_row(f"Kmer Length: ", "auto") + else: + start_text.add_row(f"Kmer Length: ", ",".join(map(str,kmer_length))) + start_text.add_row("Output Folder:", f"{output_dir}/") + start_text.add_row("Workflow Log:", sm_log.replace(f"{output_dir}/", "") + "[dim].gz") + launch_snakemake(command, asm, start_text, output_dir, sm_log, quiet) diff --git a/harpy/deconvolve.py b/harpy/deconvolve.py index 14f793911..3c8bc1261 100644 --- a/harpy/deconvolve.py +++ b/harpy/deconvolve.py @@ -23,7 +23,7 @@ ] } -@click.command(no_args_is_help = True, context_settings=dict(allow_interspersed_args=False), epilog = "See the documentation for more information: https://pdimens.github.io/harpy/modules/qc") +@click.command(no_args_is_help = True, context_settings=dict(allow_interspersed_args=False), epilog = "See the documentation for more information: https://pdimens.github.io/harpy/workflows/qc") @click.option('-k', '--kmer-length', default = 21, show_default = True, type=int, help = 'Size of kmers') @click.option('-w', '--window-size', default = 40, show_default = True, type=int, help = 'Size of window guaranteed to contain at least one kmer') @click.option('-d', '--density', 
default = 3, show_default = True, type = click.IntRange(min = 1, max_open = True), help = 'On average, 1/2^d kmers are indexed') diff --git a/harpy/demultiplex.py b/harpy/demultiplex.py index 8a2ed4d37..1f08b3ddd 100644 --- a/harpy/demultiplex.py +++ b/harpy/demultiplex.py @@ -31,13 +31,13 @@ def demultiplex(): }, { "name": "Workflow Controls", - "options": ["--conda", "--hpc", "--output-dir", "--quiet", "--skipreports", "--snakemake", "--threads", "--help"], + "options": ["--conda", "--hpc", "--output-dir", "--quiet", "--skip-reports", "--snakemake", "--threads", "--help"], }, ] } -@click.command(no_args_is_help = True, context_settings=dict(allow_interspersed_args=False), epilog = "See the documentation for more information: https://pdimens.github.io/harpy/modules/demultiplex/") +@click.command(no_args_is_help = True, context_settings=dict(allow_interspersed_args=False), epilog = "See the documentation for more information: https://pdimens.github.io/harpy/workflows/demultiplex/") @click.option('-s', '--schema', required = True, type=click.Path(exists=True, dir_okay=False, readable=True), help = 'Tab-delimited file of sample\barcode') @click.option('-t', '--threads', default = 4, show_default = True, type = click.IntRange(min = 1, max_open = True), help = 'Number of threads to use') @click.option('-o', '--output-dir', type = click.Path(exists = False), default = "Demultiplex", show_default=True, help = 'Output directory name') @@ -45,13 +45,13 @@ def demultiplex(): @click.option('--setup-only', is_flag = True, hidden = True, default = False, help = 'Setup the workflow and exit') @click.option('--hpc', type = click.Path(exists = True, file_okay = False, readable=True), help = 'Directory with HPC submission `config.yaml` file') @click.option('--quiet', is_flag = True, show_default = True, default = False, help = 'Don\'t show output text while running') -@click.option('--skipreports', is_flag = True, show_default = True, default = False, help = 'Don\'t generate HTML reports') +@click.option('--skip-reports', is_flag = True, show_default = True, default = False, help = 'Don\'t generate HTML reports') @click.option('--snakemake', type = str, help = 'Additional Snakemake parameters, in quotes') @click.argument('R1_FQ', required=True, type=click.Path(exists=True, dir_okay=False, readable=True)) @click.argument('R2_FQ', required=True, type=click.Path(exists=True, dir_okay=False, readable=True)) @click.argument('I1_FQ', required=True, type=click.Path(exists=True, dir_okay=False, readable=True)) @click.argument('I2_FQ', required=True, type=click.Path(exists=True, dir_okay=False, readable=True)) -def gen1(r1_fq, r2_fq, i1_fq, i2_fq, output_dir, schema, threads, snakemake, skipreports, quiet, hpc, conda, setup_only): +def gen1(r1_fq, r2_fq, i1_fq, i2_fq, output_dir, schema, threads, snakemake, skip_reports, quiet, hpc, conda, setup_only): """ Demultiplex Generation I haplotagged FASTQ files @@ -80,7 +80,7 @@ def gen1(r1_fq, r2_fq, i1_fq, i2_fq, output_dir, schema, threads, snakemake, ski config.write("workflow: demultiplex gen1\n") config.write(f"snakemake_log: {sm_log}\n") config.write(f"output_directory: {output_dir}\n") - config.write(f"skip_reports: {skipreports}\n") + config.write(f"skip_reports: {skip_reports}\n") config.write(f"workflow_call: {command}\n") config.write("inputs:\n") config.write(f" demultiplex_schema: {Path(schema).resolve()}\n") diff --git a/harpy/impute.py b/harpy/impute.py index cfb4f63f0..11bc5c511 100644 --- a/harpy/impute.py +++ b/harpy/impute.py @@ -20,12 +20,12 @@ }, { 
"name": "Workflow Controls", - "options": ["--conda", "--hpc", "--output-dir", "--quiet", "--skipreports", "--snakemake", "--threads", "--help"], + "options": ["--conda", "--hpc", "--output-dir", "--quiet", "--skip-reports", "--snakemake", "--threads", "--help"], }, ] } -@click.command(no_args_is_help = True, context_settings=dict(allow_interspersed_args=False), epilog = "See the documentation for more information: https://pdimens.github.io/harpy/modules/impute/") +@click.command(no_args_is_help = True, context_settings=dict(allow_interspersed_args=False), epilog = "See the documentation for more information: https://pdimens.github.io/harpy/workflows/impute/") @click.option('-x', '--extra-params', type = str, help = 'Additional STITCH parameters, in quotes') @click.option('-o', '--output-dir', type = click.Path(exists = False), default = "Impute", show_default=True, help = 'Output directory name') @click.option('-p', '--parameters', required = True, type=click.Path(exists=True, dir_okay=False), help = 'STITCH parameter file (tab-delimited)') @@ -35,11 +35,11 @@ @click.option('--setup-only', is_flag = True, hidden = True, default = False, help = 'Setup the workflow and exit') @click.option('--hpc', type = click.Path(exists = True, file_okay = False, readable=True), help = 'Directory with HPC submission `config.yaml` file') @click.option('--quiet', is_flag = True, show_default = True, default = False, help = 'Don\'t show output text while running') -@click.option('--skipreports', is_flag = True, show_default = True, default = False, help = 'Don\'t generate HTML reports') +@click.option('--skip-reports', is_flag = True, show_default = True, default = False, help = 'Don\'t generate HTML reports') @click.option('--snakemake', type = str, help = 'Additional Snakemake parameters, in quotes') @click.option('--vcf-samples', is_flag = True, show_default = True, default = False, help = 'Use samples present in vcf file for imputation rather than those found the inputs') @click.argument('inputs', required=True, type=click.Path(exists=True, readable=True), nargs=-1) -def impute(inputs, output_dir, parameters, threads, vcf, vcf_samples, extra_params, snakemake, skipreports, quiet, hpc, conda, setup_only): +def impute(inputs, output_dir, parameters, threads, vcf, vcf_samples, extra_params, snakemake, skip_reports, quiet, hpc, conda, setup_only): """ Impute genotypes using variants and alignments @@ -70,7 +70,7 @@ def impute(inputs, output_dir, parameters, threads, vcf, vcf_samples, extra_para validate_input_by_ext(vcf, "--vcf", ["vcf", "bcf", "vcf.gz"]) check_impute_params(parameters) bamlist, n = parse_alignment_inputs(inputs) - validate_bam_RG(bamlist) + validate_bam_RG(bamlist, threads, quiet) samplenames = vcf_samplematch(vcf, bamlist, vcf_samples) biallelic, n_biallelic = biallelic_contigs(vcf, f"{workflowdir}") @@ -88,7 +88,7 @@ def impute(inputs, output_dir, parameters, threads, vcf, vcf_samples, extra_para config.write(f"samples_from_vcf: {vcf_samples}\n") if extra_params is not None: config.write(f"extra: {extra_params}\n") - config.write(f"skip_reports: {skipreports}\n") + config.write(f"skip_reports: {skip_reports}\n") config.write(f"workflow_call: {command}\n") config.write("inputs:\n") config.write(f" paramfile: {Path(parameters).resolve()}\n") diff --git a/harpy/metassembly.py b/harpy/metassembly.py index f6e5ddc70..f63a12120 100644 --- a/harpy/metassembly.py +++ b/harpy/metassembly.py @@ -7,49 +7,46 @@ import rich_click as click from ._conda import create_conda_recipes from ._launch import 
launch_snakemake -from ._misc import fetch_rule, snakemake_log, IntPair +from ._misc import fetch_rule, snakemake_log, KParam +from ._validations import validate_fastq_bx docstring = { "harpy metassembly": [ { "name": "Parameters", - "options": ["--bx-tag", "--clusters", "--contig-cov", "--extra-params"], + "options": ["--bx-tag", "--extra-params", "--kmer-length", "--max-memory"], }, { "name": "Workflow Controls", - "options": ["--conda", "--hpc", "--output-dir", "--quiet", "--skipreports", "--snakemake", "--threads", "--help"], + "options": ["--hpc", "--output-dir", "--quiet", "--skip-reports", "--snakemake", "--threads", "--help"], }, ] } -@click.command(no_args_is_help = True, context_settings=dict(allow_interspersed_args=False), epilog = "See the documentation for more information: https://pdimens.github.io/harpy/modules/qc", hidden = True) -@click.option('-n', '--clusters', default = 35, show_default = True, type = int, help = 'Number of clusters') +@click.command(no_args_is_help = True, context_settings=dict(allow_interspersed_args=False), epilog = "See the documentation for more information: https://pdimens.github.io/harpy/workflows/qc") @click.option('-b', '--bx-tag', type = click.Choice(['BX', 'BC'], case_sensitive=False), default = "BX", show_default=True, help = "The header tag with the barcode (`BX` or `BC`)") -@click.option('-c', '--contig-cov', default = "10,30", show_default = True, type = IntPair(), help = "Coverage for low abundance contigs") -@click.option('-x', '--extra-params', type = str, help = 'Additional pagaea parameters, in quotes') +@click.option('-x', '--extra-params', type = str, help = 'Additional metaspades parameters, in quotes') +@click.option('-m', '--max-memory', type = click.IntRange(min = 1000, max_open = True), show_default = True, default = 250000, help = 'Maximum memory for metaSPADES to use, in megabytes') +@click.option('-k', '--kmer-length', type = KParam(), show_default = True, default = "auto", help = 'K values to use for metaspades (`odd` and `<128`)') @click.option('-o', '--output-dir', type = click.Path(exists = False), default = "Metassembly", show_default=True, help = 'Output directory name') @click.option('-t', '--threads', default = 4, show_default = True, type = click.IntRange(min = 1, max_open = True), help = 'Number of threads to use') -@click.option('--conda', is_flag = True, default = False, help = 'Use conda/mamba instead of container') @click.option('--setup-only', is_flag = True, hidden = True, default = False, help = 'Setup the workflow and exit') @click.option('--hpc', type = click.Path(exists = True, file_okay = False, readable=True), help = 'Directory with HPC submission `config.yaml` file') @click.option('--quiet', is_flag = True, show_default = True, default = False, help = 'Don\'t show output text while running') -@click.option('--skipreports', is_flag = True, show_default = True, default = False, help = 'Don\'t generate HTML reports') +@click.option('--skip-reports', is_flag = True, show_default = True, default = False, help = 'Don\'t generate HTML reports') @click.option('--snakemake', type = str, help = 'Additional Snakemake parameters, in quotes') -@click.argument('fastq', required=True, type=click.Path(exists=True, readable=True), nargs=1) -@click.argument('fastq2', required=False, type=click.Path(exists=True, readable=True), nargs=1) -def metassembly(fastq, fastq2, clusters, contig_cov, bx_tag, output_dir, extra_params, threads, snakemake, skipreports, quiet, hpc, conda, setup_only): +@click.argument('fastq_r1',
required=True, type=click.Path(exists=True, readable=True), nargs=1) +@click.argument('fastq_r2', required=True, type=click.Path(exists=True, readable=True), nargs=1) +def metassembly(fastq_r1, fastq_r2, bx_tag, kmer_length, max_memory, output_dir, extra_params, threads, snakemake, skip_reports, quiet, hpc, setup_only): """ - Perform a metassembly from linked-read sequences. + Perform a metassembly from linked-read sequences The linked-read barcode must be in either a `BX:Z` or `BC:Z` FASTQ header tag, specified with `--bx-tag`. - Input FASTQ files can be one of: - - one interleaved FASTQ - - one single-end FASTQ - - the two FASTQ's of paired-end reads + If specifying `K` values, they must be separated by commas and without spaces (e.g. `-k 15,23,51`). """ output_dir = output_dir.rstrip("/") workflowdir = f"{output_dir}/workflow" - sdm = "conda" if conda else "conda apptainer" + sdm = "conda" # if conda else "conda apptainer" command = f'snakemake --rerun-incomplete --show-failed-logs --rerun-triggers input mtime params --nolock --software-deployment-method {sdm} --conda-prefix .snakemake/conda --cores {threads} --directory . ' command += f"--snakefile {workflowdir}/metassembly.smk " command += f"--configfile {workflowdir}/config.yaml " @@ -58,9 +55,9 @@ def metassembly(fastq, fastq2, clusters, contig_cov, bx_tag, output_dir, extra_p if snakemake is not None: command += snakemake + validate_fastq_bx([fastq_r1, fastq_r2], threads, quiet) os.makedirs(workflowdir, exist_ok=True) fetch_rule(workflowdir, "metassembly.smk") - #fetch_report(workflowdir, "bx_count.Rmd") os.makedirs(f"{output_dir}/logs/snakemake", exist_ok = True) sm_log = snakemake_log(output_dir, "metassembly") @@ -69,29 +66,32 @@ def metassembly(fastq, fastq2, clusters, contig_cov, bx_tag, output_dir, extra_p config.write(f"snakemake_log: {sm_log}\n") config.write(f"output_directory: {output_dir}\n") config.write(f"barcode_tag: {bx_tag.upper()}\n") - config.write(f"clusters: {clusters}\n") - config.write(f"contig_coverage: {contig_cov[0]},{contig_cov[1]}\n") + config.write("spades:\n") + config.write(f" max_memory: {max_memory}\n") + if kmer_length == "auto": + config.write(f" k: auto\n") + else: + config.write(f" k: " + ",".join(map(str,kmer_length)) + "\n") if extra_params: - config.write(f"extra: {extra_params}\n") - config.write(f"skip_reports: {skipreports}\n") + config.write(f" extra: {extra_params}\n") + config.write(f"skip_reports: {skip_reports}\n") config.write(f"workflow_call: {command}\n") config.write("inputs:\n") - config.write(f" fastq: {fastq}\n") - if fastq2: - config.write(f" fastq2: {fastq2}\n") + config.write(f" fastq_r1: {fastq_r1}\n") + config.write(f" fastq_r2: {fastq_r2}\n") create_conda_recipes() if setup_only: sys.exit(0) - n_fq = 2 if fastq2 else 1 start_text = Table(show_header=False,pad_edge=False, show_edge=False, padding = (0,0), box=box.SIMPLE) start_text.add_column("detail", justify="left", style="light_steel_blue", no_wrap=True) start_text.add_column("value", justify="left") - start_text.add_row("FASTQ Inputs: ", f"{n_fq}") start_text.add_row("Barcode Tag: ", bx_tag.upper()) - start_text.add_row("Clusters: ", f"{clusters}") - start_text.add_row("Contig Cov. 
Thresh: ", f"{contig_cov[0]},{contig_cov[1]}") + if kmer_length == "auto": + start_text.add_row(f"Kmer Length: ", "auto") + else: + start_text.add_row(f"Kmer Length: ", ",".join(map(str,kmer_length))) start_text.add_row("Output Folder:", f"{output_dir}/") start_text.add_row("Workflow Log:", sm_log.replace(f"{output_dir}/", "") + "[dim].gz") launch_snakemake(command, "metassembly", start_text, output_dir, sm_log, quiet) diff --git a/harpy/phase.py b/harpy/phase.py index 424dc5f3e..41b6b5656 100644 --- a/harpy/phase.py +++ b/harpy/phase.py @@ -19,12 +19,12 @@ }, { "name": "Workflow Controls", - "options": ["--conda", "--hpc", "--output-dir", "--quiet", "--skipreports", "--snakemake", "--threads", "--help"], + "options": ["--conda", "--hpc", "--output-dir", "--quiet", "--skip-reports", "--snakemake", "--threads", "--help"], }, ] } -@click.command(no_args_is_help = True, context_settings=dict(allow_interspersed_args=False), epilog = "See the documentation for more information: https://pdimens.github.io/harpy/modules/phase") +@click.command(no_args_is_help = True, context_settings=dict(allow_interspersed_args=False), epilog = "See the documentation for more information: https://pdimens.github.io/harpy/workflows/phase") @click.option('-x', '--extra-params', type = str, help = 'Additional HapCut2 parameters, in quotes') @click.option('-g', '--genome', type=click.Path(exists=True, dir_okay=False, readable=True), help = 'Path to genome assembly if wanting to also extract reads spanning indels') @click.option('-b', '--ignore-bx', is_flag = True, show_default = True, default = False, help = 'Ignore barcodes when phasing') @@ -37,11 +37,11 @@ @click.option('--setup-only', is_flag = True, hidden = True, default = False, help = 'Setup the workflow and exit') @click.option('--hpc', type = click.Path(exists = True, file_okay = False, readable=True), help = 'Directory with HPC submission `config.yaml` file') @click.option('--quiet', is_flag = True, show_default = True, default = False, help = 'Don\'t show output text while running') -@click.option('--skipreports', is_flag = True, show_default = True, default = False, help = 'Don\'t generate HTML reports') +@click.option('--skip-reports', is_flag = True, show_default = True, default = False, help = 'Don\'t generate HTML reports') @click.option('--snakemake', type = str, help = 'Additional Snakemake parameters, in quotes') @click.option('--vcf-samples', is_flag = True, show_default = True, default = False, help = 'Use samples present in vcf file for phasing rather than those found the inputs') @click.argument('inputs', required=True, type=click.Path(exists=True, readable=True), nargs=-1) -def phase(inputs, output_dir, vcf, threads, molecule_distance, prune_threshold, vcf_samples, genome, snakemake, extra_params, ignore_bx, skipreports, quiet, hpc, conda, setup_only): +def phase(inputs, output_dir, vcf, threads, molecule_distance, prune_threshold, vcf_samples, genome, snakemake, extra_params, ignore_bx, skip_reports, quiet, hpc, conda, setup_only): """ Phase SNPs into haplotypes @@ -68,9 +68,9 @@ def phase(inputs, output_dir, vcf, threads, molecule_distance, prune_threshold, bamlist, n = parse_alignment_inputs(inputs) samplenames = vcf_samplematch(vcf, bamlist, vcf_samples) validate_input_by_ext(vcf, "--vcf", ["vcf", "bcf", "vcf.gz"]) - validate_bam_RG(bamlist) + validate_bam_RG(bamlist, threads, quiet) if genome: - check_fasta(genome) + check_fasta(genome, quiet) fetch_rule(workflowdir, "phase.smk") fetch_report(workflowdir, "hapcut.Rmd") 
os.makedirs(f"{output_dir}/logs/snakemake", exist_ok = True) @@ -86,7 +86,7 @@ def phase(inputs, output_dir, vcf, threads, molecule_distance, prune_threshold, config.write(f"samples_from_vcf: {vcf_samples}\n") if extra_params is not None: config.write(f"extra: {extra_params}\n") - config.write(f"skip_reports: {skipreports}\n") + config.write(f"skip_reports: {skip_reports}\n") config.write(f"workflow_call: {command}\n") config.write("inputs:\n") config.write(f" variantfile: {vcf}\n") diff --git a/harpy/popgroups.py b/harpy/popgroups.py index 06ae8fad6..1ed18c02c 100644 --- a/harpy/popgroups.py +++ b/harpy/popgroups.py @@ -8,7 +8,7 @@ from ._printing import print_error, print_solution, print_notice -@click.command(no_args_is_help = True, context_settings=dict(allow_interspersed_args=False), epilog = "See the documentation for more information: https://pdimens.github.io/harpy/modules/snp/#sample-grouping-file") +@click.command(no_args_is_help = True, context_settings=dict(allow_interspersed_args=False), epilog = "See the documentation for more information: https://pdimens.github.io/harpy/workflows/snp/#sample-grouping-file") @click.option('-o', '--output', type=str, default = "samples.groups", metavar = "Output file name", help = 'Output file name, will overwrite existing') @click.argument('inputdir', required=True, type=click.Path(exists=True, file_okay=False)) def popgroup(inputdir, output): diff --git a/harpy/preflight.py b/harpy/preflight.py index 24cdff589..c0c465b77 100755 --- a/harpy/preflight.py +++ b/harpy/preflight.py @@ -35,7 +35,7 @@ def preflight(): ] } -@click.command(no_args_is_help = True, context_settings=dict(allow_interspersed_args=False), epilog = "See the documentation for more information: https://pdimens.github.io/harpy/modules/preflight/") +@click.command(no_args_is_help = True, context_settings=dict(allow_interspersed_args=False), epilog = "See the documentation for more information: https://pdimens.github.io/harpy/workflows/preflight/") @click.option('-o', '--output-dir', type = click.Path(exists = False), default = "Preflight/fastq", show_default=True, help = 'Output directory name') @click.option('-t', '--threads', default = 4, show_default = True, type = click.IntRange(min = 1, max_open = True), help = 'Number of threads to use') @click.option('--conda', is_flag = True, default = False, help = 'Use conda/mamba instead of container') @@ -96,7 +96,7 @@ def fastq(inputs, output_dir, threads, snakemake, quiet, hpc, conda, setup_only) start_text.add_row("Workflow Log:", sm_log.replace(f"{output_dir}/", "") + "[dim].gz") launch_snakemake(command, "preflight_fastq", start_text, output_dir, sm_log, quiet) -@click.command(no_args_is_help = True, context_settings=dict(allow_interspersed_args=False), epilog = "See the documentation for more information: https://pdimens.github.io/harpy/modules/preflight/") +@click.command(no_args_is_help = True, context_settings=dict(allow_interspersed_args=False), epilog = "See the documentation for more information: https://pdimens.github.io/harpy/workflows/preflight/") @click.option('-t', '--threads', default = 4, show_default = True, type = click.IntRange(min = 1, max_open = True), help = 'Number of threads to use') @click.option('--quiet', is_flag = True, show_default = True, default = False, help = 'Don\'t show output text while running') @click.option('-o', '--output-dir', type = click.Path(exists = False), default = "Preflight/bam", show_default=True, help = 'Output directory name') diff --git a/harpy/qc.py b/harpy/qc.py index 
c698808c9..0e4013e8c 100644 --- a/harpy/qc.py +++ b/harpy/qc.py @@ -7,7 +7,7 @@ import rich_click as click from ._conda import create_conda_recipes from ._launch import launch_snakemake -from ._misc import fetch_report, fetch_rule, snakemake_log, IntQuartet +from ._misc import fetch_report, fetch_rule, snakemake_log, IntList from ._parsers import parse_fastq_inputs docstring = { @@ -18,14 +18,13 @@ }, { "name": "Workflow Controls", - "options": ["--conda", "--hpc", "--output-dir", "--quiet", "--skipreports", "--snakemake", "--threads", "--help"], + "options": ["--conda", "--hpc", "--output-dir", "--quiet", "--skip-reports", "--snakemake", "--threads", "--help"], }, ] } -@click.command(no_args_is_help = True, context_settings=dict(allow_interspersed_args=False), epilog = "See the documentation for more information: https://pdimens.github.io/harpy/modules/qc") -@click.option('-c', '--deconvolve', is_flag = True, default = False, help = 'Resolve barcode clashes between reads from different molecules.') -@click.option('-p', '--deconvolve-params', type = IntQuartet(), show_default = True, default = "21,40,3,0", help = ' Accepts the QuickDeconvolution parameters for k,w,d,a, in that order') +@click.command(no_args_is_help = True, context_settings=dict(allow_interspersed_args=False), epilog = "See the documentation for more information: https://pdimens.github.io/harpy/workflows/qc") +@click.option('-c', '--deconvolve', type = IntList(4), default = "0,0,0,0", help = 'Accepts the QuickDeconvolution parameters for `k`,`w`,`d`,`a` (in that order)') @click.option('-d', '--deduplicate', is_flag = True, default = False, help = 'Identify and remove PCR duplicates') @click.option('-x', '--extra-params', type = str, help = 'Additional Fastp parameters, in quotes') @click.option('-m', '--max-length', default = 150, show_default = True, type=int, help = 'Maximum length to trim sequences down to') @@ -37,10 +36,10 @@ @click.option('--setup-only', is_flag = True, hidden = True, show_default = True, default = False, help = 'Setup the workflow and exit') @click.option('--hpc', type = click.Path(exists = True, file_okay = False, readable=True), help = 'Directory with HPC submission `config.yaml` file') @click.option('--quiet', is_flag = True, default = False, help = 'Don\'t show output text while running') -@click.option('--skipreports', is_flag = True, default = False, help = 'Don\'t generate HTML reports') +@click.option('--skip-reports', is_flag = True, default = False, help = 'Don\'t generate HTML reports') @click.option('--snakemake', type = str, help = 'Additional Snakemake parameters, in quotes') @click.argument('inputs', required=True, type=click.Path(exists=True, readable=True), nargs=-1) -def qc(inputs, output_dir, min_length, max_length, trim_adapters, deduplicate, deconvolve, deconvolve_params, extra_params, threads, snakemake, skipreports, quiet, hpc, conda, setup_only): +def qc(inputs, output_dir, min_length, max_length, trim_adapters, deduplicate, deconvolve, extra_params, threads, snakemake, skip_reports, quiet, hpc, conda, setup_only): """ Remove adapters and quality-control sequences @@ -50,11 +49,12 @@ def qc(inputs, output_dir, min_length, max_length, trim_adapters, deduplicate, d The input reads will be quality trimmed using: - a sliding window from front to tail - poly-G tail removal - - use `-a` to automatically detect and remove adapters - - use `-d` to find and remove PCR duplicates - - use `-c` to resolve barcode clashing that may occur by unrelated sequences having the same barcode - - 
the parameters for `-p` are described [here](https://github.com/RolandFaure/QuickDeconvolution?tab=readme-ov-file#usage). - - you can use `harpy deconvolve` to perform this task separately + - `-a` automatically detects and removes adapters + - `-d` finds and removes PCR duplicates + - `-c` resolves barcode clashes between unrelated sequences + - off by default, activated with [4 integers](https://github.com/RolandFaure/QuickDeconvolution?tab=readme-ov-file#usage), separated by commas + - use `21,40,3,0` for QuickDeconvolution defaults (or adjust as needed) + - use `harpy deconvolve` to perform this task separately """ output_dir = output_dir.rstrip("/") workflowdir = f"{output_dir}/workflow" @@ -80,9 +80,9 @@ def qc(inputs, output_dir, min_length, max_length, trim_adapters, deduplicate, d config.write(f"output_directory: {output_dir}\n") config.write(f"trim_adapters: {trim_adapters}\n") config.write(f"deduplicate: {deduplicate}\n") - if deconvolve: + if sum(deconvolve) > 0: config.write("deconvolve:\n") - k,w,d,a = deconvolve_params + k,w,d,a = deconvolve config.write(f" kmer_length: {k}\n") config.write(f" window_size: {w}\n") config.write(f" density: {d}\n") @@ -91,7 +91,7 @@ config.write(f"max_len: {max_length}\n") if extra_params: config.write(f"extra: {extra_params}\n") - config.write(f"skip_reports: {skipreports}\n") + config.write(f"skip_reports: {skip_reports}\n") config.write(f"workflow_call: {command}\n") config.write("inputs:\n") for i in fqlist: @@ -100,14 +100,13 @@ create_conda_recipes() if setup_only: sys.exit(0) - start_text = Table(show_header=False,pad_edge=False, show_edge=False, padding = (0,0), box=box.SIMPLE) start_text.add_column("detail", justify="left", style="light_steel_blue", no_wrap=True) start_text.add_column("value", justify="left") start_text.add_row("Samples:", f"{sample_count}") start_text.add_row("Trim Adapters:", "yes" if trim_adapters else "no") start_text.add_row("Deduplicate:", "yes" if deduplicate else "no") - start_text.add_row("Deconvolve:", "yes" if deconvolve else "no") + start_text.add_row("Deconvolve:", "yes" if sum(deconvolve) > 0 else "no") start_text.add_row("Output Folder:", f"{output_dir}/") start_text.add_row("Workflow Log:", sm_log.replace(f"{output_dir}/", "") + "[dim].gz") launch_snakemake(command, "qc", start_text, output_dir, sm_log, quiet) diff --git a/harpy/resume.py b/harpy/resume.py index 8084f9506..526212380 100644 --- a/harpy/resume.py +++ b/harpy/resume.py @@ -13,7 +13,7 @@ from ._misc import snakemake_log from ._conda import create_conda_recipes -@click.command(no_args_is_help = True, context_settings=dict(allow_interspersed_args=False), epilog = "See the documentation for more information: https://pdimens.github.io/harpy/modules/other") +@click.command(no_args_is_help = True, context_settings=dict(allow_interspersed_args=False), epilog = "See the documentation for more information: https://pdimens.github.io/harpy/workflows/other") @click.option('-c', '--conda', is_flag = True, default = False, help = 'Recreate the conda environments into .harpy_envs/') @click.option('-t', '--threads', type = click.IntRange(min = 2, max_open = True), help = 'Change the number of threads (>1)') @click.option('--quiet', is_flag = True, default = False, help = 'Don\'t show output text while running') diff --git a/harpy/simulate.py b/harpy/simulate.py index 3ed05906d..b5566e1c0 100644
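# -- Editor's aside (not part of the patch) ----------------------------------
# qc.py now imports IntList in place of IntQuartet, but its definition (in
# harpy/_misc.py) is outside this diff. A plausible minimal click ParamType
# with the same contract -- parse "k,w,d,a" into exactly N integers, as in
# `-c 21,40,3,0` -- might look like this (hypothetical code, not the actual
# harpy implementation):
import click  # rich_click re-exports the same ParamType API

class IntListSketch(click.ParamType):
    name = "int_list"
    def __init__(self, n: int):
        self.n = n
    def convert(self, value, param, ctx):
        parts = str(value).split(",")
        if len(parts) != self.n:
            self.fail(f"expected {self.n} comma-separated integers, got {value!r}", param, ctx)
        try:
            return [int(p) for p in parts]
        except ValueError:
            self.fail(f"every value in {value!r} must be an integer", param, ctx)
# With default="0,0,0,0", the `sum(deconvolve) > 0` test seen in the qc()
# changes above then doubles as the "was -c requested?" check.
# -----------------------------------------------------------------------------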
--- a/harpy/simulate.py +++ b/harpy/simulate.py @@ -121,7 +121,7 @@ def simulate(): ] } -@click.command(no_args_is_help = True, context_settings=dict(allow_interspersed_args=False), epilog = "See the documentation for more information: https://pdimens.github.io/harpy/modules/simulate/simulate-linkedreads") +@click.command(no_args_is_help = True, context_settings=dict(allow_interspersed_args=False), epilog = "See the documentation for more information: https://pdimens.github.io/harpy/workflows/simulate/simulate-linkedreads") @click.option('-b', '--barcodes', type = click.Path(exists=True, dir_okay=False), help = "File of linked-read barcodes to add to reads") @click.option('-s', '--distance-sd', type = click.IntRange(min = 1), default = 15, show_default=True, help = "Standard deviation of read-pair distance") @click.option('-m', '--molecules-per', type = click.IntRange(min = 1), default = 10, show_default=True, help = "Average number of molecules per partition") @@ -144,7 +144,7 @@ def linkedreads(genome_hap1, genome_hap2, output_dir, outer_distance, mutation_r Create linked reads from a genome Two haplotype genomes (un/compressed fasta) need to be provided as inputs at the end of the command. If - you don't have a diploid genome, you can simulate one with `harpy simulate` as described [in the documentation](https://pdimens.github.io/harpy/modules/simulate/simulate-variants/#simulate-diploid-assembly). + you don't have a diploid genome, you can simulate one with `harpy simulate` as described [in the documentation](https://pdimens.github.io/harpy/workflows/simulate/simulate-variants/#simulate-diploid-assembly). If not providing a text file of `--barcodes`, Harpy will download the `4M-with-alts-february-2016.txt` file containing the standard 16-basepair 10X barcodes, which is available from 10X genomics and the @@ -162,8 +162,8 @@ def linkedreads(genome_hap1, genome_hap2, output_dir, outer_distance, mutation_r if snakemake is not None: command += snakemake - check_fasta(genome_hap1) - check_fasta(genome_hap2) + check_fasta(genome_hap1, quiet) + check_fasta(genome_hap2, quiet) os.makedirs(f"{workflowdir}/", exist_ok= True) fetch_rule(workflowdir, "simulate_linkedreads.smk") @@ -203,7 +203,7 @@ def linkedreads(genome_hap1, genome_hap2, output_dir, outer_distance, mutation_r start_text.add_row("Workflow Log:", sm_log.replace(f"{output_dir}/", "") + "[dim].gz") launch_snakemake(command, "simulate_linkedreads", start_text, output_dir, sm_log, quiet) -@click.command(no_args_is_help = True, context_settings=dict(allow_interspersed_args=False), epilog = "This workflow can be quite technical, please read the docs for more information: https://pdimens.github.io/harpy/modules/simulate/simulate-variants") +@click.command(no_args_is_help = True, context_settings=dict(allow_interspersed_args=False), epilog = "This workflow can be quite technical, please read the docs for more information: https://pdimens.github.io/harpy/workflows/simulate/simulate-variants") @click.option('-s', '--snp-vcf', type=click.Path(exists=True, dir_okay=False, readable=True), help = 'VCF file of known snps to simulate') @click.option('-i', '--indel-vcf', type=click.Path(exists=True, dir_okay=False, readable=True), help = 'VCF file of known indels to simulate') @click.option('-n', '--snp-count', type = click.IntRange(min = 0), default=0, show_default=False, help = "Number of random snps to simluate") @@ -268,7 +268,7 @@ def snpindel(genome, snp_vcf, indel_vcf, only_vcf, output_dir, prefix, snp_count # instantiate workflow directory 
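# -- Editor's aside (not part of the patch) ----------------------------------
# The snpindel body below gates VCF inputs with
# validate_input_by_ext(snp_vcf, "--snp-vcf", ["vcf", "vcf.gz", "bcf"]).
# That helper is defined elsewhere in harpy; a hedged, minimal equivalent
# (hypothetical name) would be:
def validate_by_ext_sketch(path: str, option: str, valid: list) -> None:
    # endswith handles multi-part extensions such as .vcf.gz
    if not any(path.lower().endswith(f".{ext}") for ext in valid):
        raise SystemExit(f"{option} expects one of {', '.join(valid)}, got: {path}")
# -----------------------------------------------------------------------------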
# move necessary files to workflow dir os.makedirs(f"{workflowdir}/input/", exist_ok= True) - check_fasta(genome) + check_fasta(genome, quiet) start_text.add_row("Input Genome:", os.path.basename(genome)) if snp_vcf: validate_input_by_ext(snp_vcf, "--snp-vcf", ["vcf","vcf.gz","bcf"]) @@ -334,7 +334,7 @@ def snpindel(genome, snp_vcf, indel_vcf, only_vcf, output_dir, prefix, snp_count start_text.add_row("Workflow Log:", sm_log.replace(f"{output_dir}/", "") + "[dim].gz") launch_snakemake(command, "simulate_snpindel", start_text, output_dir, sm_log, quiet) -@click.command(no_args_is_help = True, context_settings=dict(allow_interspersed_args=False), epilog = "Please See the documentation for more information: https://pdimens.github.io/harpy/modules/simulate/simulate-variants") +@click.command(no_args_is_help = True, context_settings=dict(allow_interspersed_args=False), epilog = "Please See the documentation for more information: https://pdimens.github.io/harpy/workflows/simulate/simulate-variants") @click.option('-v', '--vcf', type=click.Path(exists=True, dir_okay=False, readable=True), help = 'VCF file of known inversions to simulate') @click.option('-n', '--count', type = click.IntRange(min = 0), default=0, show_default=False, help = "Number of random inversions to simluate") @click.option('-m', '--min-size', type = click.IntRange(min = 1), default = 1000, show_default= True, help = "Minimum inversion size (bp)") @@ -381,7 +381,7 @@ def inversion(genome, vcf, only_vcf, prefix, output_dir, count, min_size, max_si # instantiate workflow directory # move necessary files to workflow dir os.makedirs(f"{workflowdir}/input/", exist_ok= True) - check_fasta(genome) + check_fasta(genome, quiet) start_text = Table(show_header=False,pad_edge=False, show_edge=False, padding = (0,0), box=box.SIMPLE) start_text.add_column("detail", justify="left", style="light_steel_blue", no_wrap=True) start_text.add_column(header="value", justify="left") @@ -442,7 +442,7 @@ def inversion(genome, vcf, only_vcf, prefix, output_dir, count, min_size, max_si launch_snakemake(command, "simulate_inversion", start_text, output_dir, sm_log, quiet) -@click.command(no_args_is_help = True, context_settings=dict(allow_interspersed_args=False), epilog = "Please See the documentation for more information: https://pdimens.github.io/harpy/modules/simulate/simulate-variants") +@click.command(no_args_is_help = True, context_settings=dict(allow_interspersed_args=False), epilog = "Please See the documentation for more information: https://pdimens.github.io/harpy/workflows/simulate/simulate-variants") @click.option('-v', '--vcf', type=click.Path(exists=True, dir_okay=False, readable=True), help = 'VCF file of known copy number variants to simulate') @click.option('-n', '--count', type = click.IntRange(min = 0), default=0, show_default=False, help = "Number of random variants to simluate") @click.option('-m', '--min-size', type = click.IntRange(min = 1), default = 1000, show_default= True, help = "Minimum variant size (bp)") @@ -498,7 +498,7 @@ def cnv(genome, output_dir, vcf, only_vcf, prefix, count, min_size, max_size, du # instantiate workflow directory # move necessary files to workflow dir os.makedirs(f"{workflowdir}/input/", exist_ok= True) - check_fasta(genome) + check_fasta(genome, quiet) start_text = Table(show_header=False,pad_edge=False, show_edge=False, padding = (0,0), box=box.SIMPLE) start_text.add_column("detail", justify="left", style="light_steel_blue", no_wrap=True) start_text.add_column(header="value", justify="left") @@ 
-561,7 +561,7 @@ def cnv(genome, output_dir, vcf, only_vcf, prefix, count, min_size, max_size, du start_text.add_row("Workflow Log:", sm_log.replace(f"{output_dir}/", "") + "[dim].gz") launch_snakemake(command, "simulate_cnv", start_text, output_dir, sm_log, quiet) -@click.command(no_args_is_help = True, context_settings=dict(allow_interspersed_args=False), epilog = "Please See the documentation for more information: https://pdimens.github.io/harpy/modules/simulate/simulate-variants") +@click.command(no_args_is_help = True, context_settings=dict(allow_interspersed_args=False), epilog = "Please See the documentation for more information: https://pdimens.github.io/harpy/workflows/simulate/simulate-variants") @click.option('-v', '--vcf', type=click.Path(exists=True, dir_okay=False, readable=True), help = 'VCF file of known translocations to simulate') @click.option('-n', '--count', type = click.IntRange(min = 0), default=0, show_default=False, help = "Number of random translocations to simluate") @click.option('-c', '--centromeres', type = click.Path(exists=True, dir_okay=False, readable=True), help = "GFF3 file of centromeres to avoid") @@ -605,7 +605,7 @@ def translocation(genome, output_dir, prefix, vcf, only_vcf, count, centromeres, # instantiate workflow directory # move necessary files to workflow dir os.makedirs(f"{workflowdir}/input/", exist_ok= True) - check_fasta(genome) + check_fasta(genome, quiet) start_text = Table(show_header=False,pad_edge=False, show_edge=False, padding = (0,0), box=box.SIMPLE) start_text.add_column("detail", justify="left", style="light_steel_blue", no_wrap=True) start_text.add_column(header="value", justify="left") diff --git a/harpy/snakefiles/align_bwa.smk b/harpy/snakefiles/align_bwa.smk index 34064ec5b..f2ec94cf5 100644 --- a/harpy/snakefiles/align_bwa.smk +++ b/harpy/snakefiles/align_bwa.smk @@ -23,7 +23,7 @@ extra = config.get("extra", "") bn = os.path.basename(genomefile) genome_zip = True if bn.lower().endswith(".gz") else False bn_idx = f"{bn}.gzi" if genome_zip else f"{bn}.fai" -skipreports = config["skip_reports"] +skip_reports = config["skip_reports"] windowsize = config["depth_windowsize"] bn_r = r"([_\.][12]|[_\.][FR]|[_\.]R[12](?:\_00[0-9])*)?\.((fastq|fq)(\.gz)?)$" samplenames = {re.sub(bn_r, "", os.path.basename(i), flags = re.IGNORECASE) for i in fqlist} @@ -247,24 +247,32 @@ rule workflow_summary: default_target: True input: bams = collect(outdir + "/{sample}.{ext}", sample = samplenames, ext = ["bam", "bam.bai"]), - reports = collect(outdir + "/reports/{sample}.html", sample = samplenames) if not skipreports else [], - agg_report = outdir + "/reports/bwa.stats.html" if not skipreports else [], - bx_report = outdir + "/reports/barcodes.summary.html" if (not skipreports or len(samplenames) == 1) else [] + reports = collect(outdir + "/reports/{sample}.html", sample = samplenames) if not skip_reports else [], + agg_report = outdir + "/reports/bwa.stats.html" if not skip_reports else [], + bx_report = outdir + "/reports/barcodes.summary.html" if (not skip_reports or len(samplenames) == 1) else [] params: + genomefile = genomefile, quality = config["alignment_quality"], unmapped = "" if keep_unmapped else "-F 4", extra = extra run: + summary_template = f""" +The harpy align bwa workflow ran using these parameters: + +The provided genome: {{params.genomefile}} + +Sequences were aligned with BWA using: + bwa mem -C -v 2 {{params.extra}} -R "@RG\\tID:SAMPLE\\tSM:SAMPLE" genome forward_reads reverse_reads | + samtools view -h {{params.unmapped}} -q 
{{params.quality}} + +Duplicates in the alignments were marked following: + samtools collate | + samtools fixmate | + samtools sort -T SAMPLE --reference {{params.genomefile}} -m 2000M | + samtools markdup -S --barcode-tag BX + +The Snakemake workflow was called via command line: + {config["workflow_call"]} +""" with open(outdir + "/workflow/align.bwa.summary", "w") as f: - _ = f.write("The harpy align bwa workflow ran using these parameters:\n\n") - _ = f.write(f"The provided genome: {genomefile}\n") - _ = f.write("Sequencing were aligned with BWA using:\n") - _ = f.write(f" bwa mem -C -v 2 {params.extra} -R \"@RG\\tID:SAMPLE\\tSM:SAMPLE\" genome forward_reads reverse_reads |\n") - _ = f.write(f" samtools view -h {params.unmapped} -q {params.quality}\n") - _ = f.write("Duplicates in the alignments were marked following:\n") - _ = f.write(" samtools collate |\n") - _ = f.write(" samtools fixmate |\n") - _ = f.write(" samtools sort -T SAMPLE --reference genome -m 2000M |\n") - _ = f.write(" samtools markdup -S --barcode-tag BX\n") - _ = f.write("\nThe Snakemake workflow was called via command line:\n") - _ = f.write(" " + str(config["workflow_call"]) + "\n") \ No newline at end of file + f.write(summary_template.format(params=params)) \ No newline at end of file diff --git a/harpy/snakefiles/align_ema.smk b/harpy/snakefiles/align_ema.smk index 12a7eca19..9f4ab6a59 100644 --- a/harpy/snakefiles/align_ema.smk +++ b/harpy/snakefiles/align_ema.smk @@ -27,7 +27,7 @@ bn_idx = f"{bn}.gzi" if genome_zip else f"{bn}.fai" envdir = os.getcwd() + "/.harpy_envs" windowsize = config["depth_windowsize"] keep_unmapped = config["keep_unmapped"] -skipreports = config["skip_reports"] +skip_reports = config["skip_reports"] bn_r = r"([_\.][12]|[_\.][FR]|[_\.]R[12](?:\_00[0-9])*)?\.((fastq|fq)(\.gz)?)$" samplenames = {re.sub(bn_r, "", os.path.basename(i), flags = re.IGNORECASE) for i in fqlist} d = dict(zip(samplenames, samplenames)) @@ -328,36 +328,48 @@ rule workflow_summary: default_target: True input: bams = collect(outdir + "/{sample}.{ext}", sample = samplenames, ext = [ "bam", "bam.bai"] ), - cov_report = collect(outdir + "/reports/{sample}.html", sample = samplenames) if not skipreports else [], - agg_report = f"{outdir}/reports/ema.stats.html" if not skipreports else [], - bx_report = outdir + "/reports/barcodes.summary.html" if (not skipreports or len(samplenames) == 1) else [] + cov_report = collect(outdir + "/reports/{sample}.html", sample = samplenames) if not skip_reports else [], + agg_report = f"{outdir}/reports/ema.stats.html" if not skip_reports else [], + bx_report = outdir + "/reports/barcodes.summary.html" if (not skip_reports or len(samplenames) == 1) else [] params: beadtech = "-p" if platform == "haplotag" else f"-w {barcode_list}", unmapped = "" if keep_unmapped else "-F 4" run: + summary_template = f""" +The harpy align ema workflow ran using these parameters: + +The provided genome: {bn} + +Barcodes were counted and validated with EMA using: + seqtk mergepe forward.fq.gz reverse.fq.gz | ema count {{params.beadtech}} + +Barcoded sequences were binned with EMA using: + seqtk mergepe forward.fq.gz reverse.fq.gz | ema preproc {{params.beadtech}} -n {nbins} + +Barcoded bins were aligned with ema align using: + ema align {extra} -d -p {platform} -R \"@RG\\tID:SAMPLE\\tSM:SAMPLE\" | + samtools view -h {{params.unmapped}} -q {config["quality"]} - | + samtools sort --reference genome -m 2000M + +Invalid/non-barcoded sequences were aligned with BWA using: + bwa mem -C -v2
-R \"@RG\\tID:SAMPLE\\tSM:SAMPLE\" genome forward_reads reverse_reads + samtools view -h {{params.unmapped}} -q {config["quality"]} - | + samtools sort --reference genome -m 2000M + +Duplicates in non-barcoded alignments were marked following: + samtools collate | + samtools fixmate | + samtools sort -m 2000M | + samtools markdup -S + +Alignments were merged using: + samtools cat barcode.bam nobarcode.bam > concat.bam + +Merged alignments were sorted using: + samtools sort -m 2000M concat.bam + +The Snakemake workflow was called via command line: + {config["workflow_call"]} +""" with open(outdir + "/workflow/align.ema.summary", "w") as f: - _ = f.write("The harpy align ema workflow ran using these parameters:\n\n") - _ = f.write(f"The provided genome: {bn}\n") - _ = f.write("Barcodes were counted and validated with EMA using:\n") - _ = f.write(f" seqtk mergepe forward.fq.gz reverse.fq.gz | ema count {params.beadtech}\n") - _ = f.write("Barcoded sequences were binned with EMA using:\n") - _ = f.write(f" seqtk mergepe forward.fq.gz reverse.fq.gz | ema preproc {params.beadtech} -n {nbins}\n") - _ = f.write("Barcoded bins were aligned with ema align using:\n") - _ = f.write(f" ema align " + extra + " -d -p " + platform + " -R \"@RG\\tID:SAMPLE\\tSM:SAMPLE\" |\n") - _ = f.write(" samtools view -h {params.unmapped} -q " + str(config["quality"]) + " - |\n") - _ = f.write(" samtools sort --reference genome -m 2000M\n\n") - _ = f.write("Invalid/non barcoded sequences were aligned with BWA using:\n") - _ = f.write(" bwa mem -C -v2 -R \"@RG\\tID:SAMPLE\\tSM:SAMPLE\" genome forward_reads reverse_reads\n") - _ = f.write(" samtools view -h {params.unmapped} -q " + str(config["quality"]) + " - |\n") - _ = f.write(" samtools sort --reference genome -m 2000M\n\n") - _ = f.write("Duplicates in non-barcoded alignments were marked following:\n") - _ = f.write(" samtools collate |\n") - _ = f.write(" samtools fixmate |\n") - _ = f.write(" samtools sort -m 2000M |\n") - _ = f.write(" samtools markdup -S\n") - _ = f.write("Alignments were merged using:\n") - _ = f.write(" samtools cat barcode.bam nobarcode.bam > concat.bam\n") - _ = f.write("Merged alignments were sorted using:\n") - _ = f.write(" samtools sort -m 2000M concat.bam\n") - _ = f.write("\nThe Snakemake workflow was called via command line:\n") - _ = f.write(" " + str(config["workflow_call"]) + "\n") \ No newline at end of file + f.write(summary_template.format(params=params)) diff --git a/harpy/snakefiles/align_strobealign.smk b/harpy/snakefiles/align_strobealign.smk index ae25a7bc3..91298e9df 100644 --- a/harpy/snakefiles/align_strobealign.smk +++ b/harpy/snakefiles/align_strobealign.smk @@ -21,7 +21,7 @@ extra = config.get("extra", "") bn = os.path.basename(genomefile) if bn.lower().endswith(".gz"): bn = bn[:-3] -skipreports = config["skip_reports"] +skip_reports = config["skip_reports"] windowsize = config["depth_windowsize"] molecule_distance = config["molecule_distance"] keep_unmapped = config["keep_unmapped"] @@ -239,9 +239,9 @@ rule workflow_summary: default_target: True input: bams = collect(outdir + "/{sample}.{ext}", sample = samplenames, ext = ["bam","bam.bai"]), - samtools = outdir + "/reports/strobealign.stats.html" if not skipreports else [] , - reports = collect(outdir + "/reports/{sample}.html", sample = samplenames) if not skipreports else [], - bx_report = outdir + "/reports/barcodes.summary.html" if (not skipreports or len(samplenames) == 1) else [] + samtools = outdir + "/reports/strobealign.stats.html" if not
skip_reports else [] , + reports = collect(outdir + "/reports/{sample}.html", sample = samplenames) if not skip_reports else [], + bx_report = outdir + "/reports/barcodes.summary.html" if (not skip_reports or len(samplenames) == 1) else [] params: readlen = readlen, quality = config["quality"], @@ -249,22 +249,30 @@ rule workflow_summary: unmapped = "" if keep_unmapped else "-F 4", extra = extra run: + if autolen: + strobetext = f"Sequences were aligned with strobealign using:\n" + strobetext += f" strobealign -U -C --rg-id=SAMPLE --rg=SM:SAMPLE {params.extra} genome reads.F.fq reads.R.fq |\n" + else: + strobetext = "The genome index was created using:\n" + strobetext += f" strobealign --create-index -r {params.readlen} genome\n\n" + strobetext += "Sequences were aligned with strobealign using:\n" + strobetext += f" strobealign --use-index {params.unmapped_strobe} -N 2 -C --rg=SM:SAMPLE {params.extra} genome reads.F.fq reads.R.fq |\n" + summary_template = f""" +The harpy align strobealign workflow ran using these parameters: + +The provided genome: {bn} + +{strobetext} + samtools view -h {params.unmapped} -q {params.quality} + +Duplicates in the alignments were marked following: + samtools collate | + samtools fixmate | + samtools sort -m 2000M | + samtools markdup -S --barcode-tag BX + +The Snakemake workflow was called via command line: + {config["workflow_call"]} +""" with open(outdir + "/workflow/align.strobealign.summary", "w") as f: - _ = f.write("The harpy align strobealign workflow ran using these parameters:\n\n") - _ = f.write(f"The provided genome: {bn}\n") - if autolen: - _ = f.write("Sequencing were aligned with strobealign using:\n") - _ = f.write(f" strobealign -U -C --rg-id=SAMPLE --rg=SM:SAMPLE {params.extra} genome reads.F.fq reads.R.fq |\n") - else: - _ = f.write("The genome index was created using:\n") - _ = f.write(f" strobealign --create-index -r {params.readlen} genome\n") - _ = f.write("Sequencing were aligned with strobealign using:\n") - _ = f.write(f" strobealign --use-index {params.unmapped_strobe} -N 2 -C --rg=SM:SAMPLE {params.extra} genome reads.F.fq reads.R.fq |\n") - _ = f.write(f" samtools view -h {params.unmapped} -q {params.quality}\n") - _ = f.write("Duplicates in the alignments were marked following:\n") - _ = f.write(" samtools collate |\n") - _ = f.write(" samtools fixmate |\n") - _ = f.write(" samtools sort -m 2000M |\n") - _ = f.write(" samtools markdup -S --barcode-tag BX\n") - _ = f.write("\nThe Snakemake workflow was called via command line:\n") - _ = f.write(" " + str(config["workflow_call"]) + "\n") \ No newline at end of file + f.write(summary_template) \ No newline at end of file diff --git a/harpy/snakefiles/assembly.smk b/harpy/snakefiles/assembly.smk new file mode 100644 index 000000000..c8010fe68 --- /dev/null +++ b/harpy/snakefiles/assembly.smk @@ -0,0 +1,61 @@ +import os +import logging + +onstart: + logger.logger.addHandler(logging.FileHandler(config["snakemake_log"])) +onsuccess: + os.remove(logger.logfile) +onerror: + os.remove(logger.logfile) + +FQ1 = config["inputs"]["fastq_r1"] +FQ2 = config["inputs"]["fastq_r2"] +outdir = config["output_directory"] +envdir = os.getcwd() + "/.harpy_envs" +max_mem = config["spades"]["max_memory"] +k_param = config["spades"]["k"] +extra = config["spades"].get("extra", "") + +rule cloudspades: + input: + FQ_R1 = FQ1, + FQ_R2 = FQ2 + output: + f"{outdir}/contigs.fasta", + f"{outdir}/scaffolds.fasta" + params: + outdir = outdir, + k = k_param, + mem = max_mem // 1000, + extra = extra + log: + outdir + 
"/logs/assembly.log" + conda: + f"{envdir}/assembly.yaml" + threads: + workflow.cores + resources: + mem_mb=max_mem + shell: + "spades.py -t {threads} -m {params.mem} -k {params.k} {params.extra} --gemcode1-1 {input.FQ_R1} --gemcode1-2 {input.FQ_R2} -o {params.outdir} --isolate > {log}" + +rule workflow_summary: + default_target: True + input: + f"{outdir}/athena/athena.asm.fa" + params: + k_param = k_param, + max_mem = max_mem // 1000, + extra = extra + run: + summary_template = f""" +The harpy assemble workflow ran using these parameters: + +Reads were assembled using cloudspades: + spades.py -t THREADS -m {params.max_mem} --gemcode1-1 FQ1 --gemcode1-2 FQ2 --isolate -k {params.k_param} {params.extra} + +The Snakemake workflow was called via command line: + {config["workflow_call"]} +""" + with open(outdir + "/workflow/metassembly.summary", "w") as f: + f.write(summary_template) \ No newline at end of file diff --git a/harpy/snakefiles/containerize.smk b/harpy/snakefiles/containerize.smk index 200762aee..687affa80 100644 --- a/harpy/snakefiles/containerize.smk +++ b/harpy/snakefiles/containerize.smk @@ -6,7 +6,7 @@ onsuccess: rule all: input: - collect("{conda}.env", conda = ["align", "phase", "qc", "r", "simulations", "snp", "stitch", "sv"]) + collect("{conda}.env", conda = ["align", "metassembly", "phase", "qc", "r", "simulations", "snp", "stitch", "sv"]) rule qc: output: "qc.env" @@ -47,3 +47,13 @@ rule simulations: output: "simulations.env" conda: os.getcwd() + "/.harpy_envs/simulations.yaml" shell: "touch {output}" + +rule assembly: + output: "assembly.env" + conda: os.getcwd() + "/.harpy_envs/assembly.yaml" + shell: "touch {output}" + +rule metassembly: + output: "metassembly.env" + conda: os.getcwd() + "/.harpy_envs/metassembly.yaml" + shell: "touch {output}" \ No newline at end of file diff --git a/harpy/snakefiles/deconvolve.smk b/harpy/snakefiles/deconvolve.smk index 2f662e24e..6789613ab 100644 --- a/harpy/snakefiles/deconvolve.smk +++ b/harpy/snakefiles/deconvolve.smk @@ -86,14 +86,21 @@ rule workflow_summary: input: collect(outdir + "/{sample}.{FR}.fq.gz", FR = ["R1", "R2"], sample = samplenames), run: - with open(outdir + "/workflow/deconvolve.summary", "w") as f: - _ = f.write("The harpy deconvolve workflow ran using these parameters:\n\n") - _ = f.write("fastq files were interleaved with seqtk:\n") - _ = f.write(" seqtk mergepe forward.fq reverse.fq\n") - _ = f.write("Deconvolution occurred using QuickDeconvolution:\n") - _ = f.write(f" QuickDeconvolution -t threads -i infile.fq -o output.fq -k {kmer_length} -w {window_size} -d {density} -a {dropout}\n") - _ = f.write("The interleaved output was split back into forward and reverse reads with seqtk:\n") - _ = f.write(" seqtk -1 interleaved.fq | gzip > file.R1.fq.gz\n") - _ = f.write(" seqtk -2 interleaved.fq | gzip > file.R2.fq.gz\n") - _ = f.write("\nThe Snakemake workflow was called via command line:\n") - _ = f.write(" " + str(config["workflow_call"]) + "\n") + summary_template = f""" +The harpy deconvolve workflow ran using these parameters: + +fastq files were interleaved with seqtk: + seqtk mergepe forward.fq reverse.fq + +Deconvolution occurred using QuickDeconvolution: + QuickDeconvolution -t threads -i infile.fq -o output.fq -k {kmer_length} -w {window_size} -d {density} -a {dropout} + +The interleaved output was split back into forward and reverse reads with seqtk: + seqtk -1 interleaved.fq | gzip > file.R1.fq.gz + seqtk -2 interleaved.fq | gzip > file.R2.fq.gz + +The Snakemake workflow was called via command line: 
+ {config["workflow_call"]} +""" + with open(outdir + "/workflow/deconvolve.summary", "w") as f: + f.write(summary_template) \ No newline at end of file diff --git a/harpy/snakefiles/demultiplex_gen1.smk b/harpy/snakefiles/demultiplex_gen1.smk index 5f60c3b77..93149ffdb 100644 --- a/harpy/snakefiles/demultiplex_gen1.smk +++ b/harpy/snakefiles/demultiplex_gen1.smk @@ -15,7 +15,7 @@ R2 = config["inputs"]["R2"] I1 = config["inputs"]["I1"] I2 = config["inputs"]["I2"] samplefile = config["inputs"]["demultiplex_schema"] -skipreports = config["skip_reports"] +skip_reports = config["skip_reports"] outdir = config["output_directory"] envdir = os.getcwd() + "/.harpy_envs" @@ -164,18 +164,30 @@ rule workflow_summary: default_target: True input: fq = collect(outdir + "/{sample}.{FR}.fq.gz", sample = samplenames, FR = ["F", "R"]), - reports = outdir + "/reports/demultiplex.QC.html" if not skipreports else [] + reports = outdir + "/reports/demultiplex.QC.html" if not skip_reports else [] run: os.makedirs(f"{outdir}/workflow/", exist_ok= True) + summary_template = f""" +The harpy demultiplex gen1 workflow ran using these parameters: + +Haplotag technology: Generation I + +The multiplexed input files: + - {R1} + - {R2} + - {I1} + - {I2} + +Barcodes were moved into the read headers using the command: + demuxGen1 DATA_ demux + +The delimited file associating CXX barcodes with samplenames: {samplefile} + +QC checks were performed on demultiplexed FASTQ files using: + falco -skip-report -skip-summary input.fq.gz + +The Snakemake workflow was called via command line: + {config["workflow_call"]} +""" with open(outdir + "/workflow/demux.gen1.summary", "w") as f: - _ = f.write("The harpy demultiplex gen1 workflow ran using these parameters:\n\n") - _ = f.write("Haplotag technology: Generation I\n") - _ = f.write(f"The multiplexed input files:\n -") - _ = f.write("\n -".join([R1,R2,I1,I2]) + "\n") - _ = f.write("Barcodes were moved into the read headers using the command:\n") - _ = f.write(f" demuxGen1 DATA_ demux\n") - _ = f.write(f"The delimited file associating CXX barcodes with samplenames:\n {samplefile}\n") - _ = f.write(f"QC checks were performed on demultiplexed FASTQ files using:\n") - _ = f.write(f" falco -skip-report -skip-summary input.fq.gz\n") - _ = f.write("\nThe Snakemake workflow was called via command line:\n") - _ = f.write(" " + str(config["workflow_call"]) + "\n") + f.write(summary_template) diff --git a/harpy/snakefiles/impute.smk b/harpy/snakefiles/impute.smk index 43858d5ae..65c612ba3 100644 --- a/harpy/snakefiles/impute.smk +++ b/harpy/snakefiles/impute.smk @@ -21,7 +21,7 @@ paramfile = config["inputs"]["paramfile"] biallelic = config["inputs"]["biallelic_contigs"] outdir = config["output_directory"] envdir = os.getcwd() + "/.harpy_envs" -skipreports = config["skip_reports"] +skip_reports = config["skip_reports"] paramspace = Paramspace(pd.read_csv(paramfile, sep=r"\s+", skip_blank_lines=True).rename(columns=str.lower), param_sep = "", filename_params = ["k", "s", "ngen", "bxlimit"]) with open(biallelic, "r") as f_open: contigs = [i.rstrip() for i in f_open.readlines()] @@ -216,39 +216,46 @@ rule workflow_summary: default_target: True input: vcf = collect(outdir + "/{stitchparams}/variants.imputed.bcf", stitchparams=paramspace.instance_patterns), - agg_report = collect(outdir + "/{stitchparams}/reports/variants.imputed.html", stitchparams=paramspace.instance_patterns) if not skipreports else [], - contig_report = collect(outdir + "/{stitchparams}/reports/{part}.stitch.html", 
stitchparams=paramspace.instance_patterns, part = contigs) if not skipreports else [], + agg_report = collect(outdir + "/{stitchparams}/reports/variants.imputed.html", stitchparams=paramspace.instance_patterns) if not skip_reports else [], + contig_report = collect(outdir + "/{stitchparams}/reports/{part}.stitch.html", stitchparams=paramspace.instance_patterns, part = contigs) if not skip_reports else [], run: + paramfiletext = "\t".join(open(paramfile, "r").readlines()) + summary_template = f""" +The harpy impute workflow ran using these parameters: + +The provided variant file: {variantfile} + +Preprocessing was performed with: + bcftools view -M2 -v snps --regions CONTIG INFILE | + bcftools query -i '(STRLEN(REF)==1) & (STRLEN(ALT[0])==1) & (REF!="N")' -f '%CHROM\\t%POS\\t%REF\\t%ALT\\n' + +The STITCH parameter file: {paramfile} + {paramfiletext} + +Within R, STITCH was invoked with the following parameters: + STITCH( + method = model, + posfile = posfile, + bamlist = bamlist, + nCores = ncores, + nGen = ngen, + chr = chr, + K = k, + S = s, + use_bx_tag = usebX, + bxTagUpperLimit = bxlimit, + niterations = 40, + switchModelIteration = 39, + splitReadIterations = NA, + outputdir = outdir, + output_filename = outfile + ) + +Additional STITCH parameters provided (overrides existing values above): + {config.get("extra", "None provided")} + +The Snakemake workflow was called via command line: + {config["workflow_call"]} +""" with open(outdir + "/workflow/impute.summary", "w") as f: - _ = f.write("The harpy impute workflow ran using these parameters:\n\n") - _ = f.write(f"The provided variant file: {variantfile}\n") - _ = f.write("Preprocessing was performed with:\n") - _ = f.write(" bcftools view -M2 -v snps --regions CONTIG INFILE |\n") - _ = f.write(""" bcftools query -i '(STRLEN(REF)==1) & (STRLEN(ALT[0])==1) & (REF!="N")' -f '%CHROM\\t%POS\\t%REF\\t%ALT\\n'\n""") - _ = f.write("\nThe STITCH parameters were governed by the rows of the input parameter table:\n") - with open(config["inputs"]["paramfile"], "r") as f1: - for line in f1: - _ = f.write(" " + line) - _ = f.write("\nWithin R, STITCH was invoked with the following parameters:\n") - _ = f.write( - " STITCH(\n" + - " method = model,\n" + - " posfile = posfile,\n" + - " bamlist = bamlist,\n" + - " nCores = ncores,\n" + - " nGen = ngen,\n" + - " chr = chr,\n" + - " K = k,\n" + - " S = s,\n" + - " use_bx_tag = usebX,\n" + - " bxTagUpperLimit = bxlimit,\n" + - " niterations = 40,\n" + - " switchModelIteration = 39,\n" + - " splitReadIterations = NA,\n" + - " outputdir = outdir,\n" + - " output_filename = outfile\n)\n" - ) - _ = f.write("Additional STITCH parameters provided (overrides existing values above):\n") - _ = f.write(" " + config.get("extra", "None provided") + "\n") - _ = f.write("\nThe Snakemake workflow was called via command line:\n") - _ = f.write(" " + str(config["workflow_call"]) + "\n") + f.write(summary_template) diff --git a/harpy/snakefiles/metassembly.smk b/harpy/snakefiles/metassembly.smk index 13aebd79e..fe42b1c70 100644 --- a/harpy/snakefiles/metassembly.smk +++ b/harpy/snakefiles/metassembly.smk @@ -8,61 +8,92 @@ onsuccess: os.remove(logger.logfile) onerror: os.remove(logger.logfile) -FASTQ1 = config["inputs"]["fastq"] -FASTQ2 = config["inputs"].get("fastq2") -cont_cov = config["contig_coverage"] -clusters = config["clusters"] +FQ1 = config["inputs"]["fastq_r1"] +FQ2 = config["inputs"]["fastq_r2"] outdir = config["output_directory"] envdir = os.getcwd() + "/.harpy_envs" +max_mem = config["spades"]["max_memory"] +k_param =
config["spades"]["k"] +extra = config["spades"].get("extra", "") -rule barcode_sort: +rule sort_by_barcode: input: - fq_f = FASTQ1, - fq_r = FASTQ2 + fq_f = FQ1, + fq_r = FQ2 output: - fq_f = temp(f"{outdir}/workflow/input.R1.fq"), - fq_r = temp(f"{outdir}/workflow/input.R2.fq") + fq_f = temp(f"{outdir}/fastq_preproc/tmp.R1.fq"), + fq_r = temp(f"{outdir}/fastq_preproc/tmp.R2.fq") params: - config["barcode_tag"].upper() + barcode_tag = config["barcode_tag"].upper() shell: """ samtools import -T "*" {input} | - samtools sort -O SAM -t {params} | + samtools sort -O SAM -t {params.barcode_tag} | samtools fastq -T "*" -1 {output.fq_f} -2 {output.fq_r} """ + +rule format_barcode: + input: + f"{outdir}/fastq_preproc/tmp.R{{FR}}.fq" + output: + temp(f"{outdir}/fastq_preproc/input.R{{FR}}.fq.gz") + params: + config["barcode_tag"].upper() + shell: + "sed 's/{params}:Z:[^[:space:]]*/&-1/g' {input} | bgzip > {output}" -rule metaspades: +rule error_correction: input: - reads = collect(outdir + "/workflow/input.R{X}.fq", X = [1,2]) + FQ_R1 = outdir + "/fastq_preproc/input.R1.fq.gz", + FQ_R2 = outdir + "/fastq_preproc/input.R2.fq.gz" output: - contigs = outdir + "/metaspades/contigs.fasta", - scaffolds = outdir + "/metaspades/scaffolds.fasta", - dir = directory(outdir + "/metaspades/intermediate_files"), - corrected_F = outdir + "/metaspades/intermediate_files/corrected/input.R100.0_0.cor.fastq.gz", - corrected_R = outdir + "/metaspades/intermediate_files/corrected/input.R200.0_0.cor.fastq.gz" + outdir + "/error_correction/corrected/input.R1.fq00.0_0.cor.fastq.gz", + outdir + "/error_correction/corrected/input.R2.fq00.0_0.cor.fastq.gz", + outdir + "/error_correction/corrected/input.R_unpaired00.0_0.cor.fastq.gz" params: - k="auto" - #extra="--only-assembler", + outdir = outdir + "/error_correction", + k = k_param, + mem = max_mem // 1000, + extra = extra log: - outdir + "/logs/spades.log", + outdir + "/logs/error_correct.log" + conda: + f"{envdir}/assembly.yaml" threads: workflow.cores resources: - mem_mem=250000, - time=60 * 24, - wrapper: - "v4.3.0/bio/spades/metaspades" + mem_mb=max_mem + shell: + "metaspades.py -t {threads} -m {params.mem} -k {params.k} {params.extra} -1 {input.FQ_R1} -2 {input.FQ_R2} -o {params.outdir} --only-error-correction > {log}" -rule all: - default_target: True +rule spades_assembly: input: - f"{outdir}/metaspades/contigs.fasta" + FQ_R1C = outdir + "/error_correction/corrected/input.R1.fq00.0_0.cor.fastq.gz", + FQ_R2C = outdir + "/error_correction/corrected/input.R2.fq00.0_0.cor.fastq.gz", + FQ_UNC = outdir + "/error_correction/corrected/input.R_unpaired00.0_0.cor.fastq.gz" + output: + f"{outdir}/metaspades_assembly/contigs.fasta" + params: + outdir = outdir + "/metaspades_assembly", + k = k_param, + mem = max_mem // 1000, + extra = extra + log: + outdir + "/logs/metaspades_assembly.log" + conda: + f"{envdir}/assembly.yaml" + threads: + workflow.cores + resources: + mem_mb=max_mem + shell: + "metaspades.py -t {threads} -m {params.mem} -k {params.k} {params.extra} -1 {input.FQ_R1C} -2 {input.FQ_R2C} -s {input.FQ_UNC} -o {params.outdir} --only-assembler > {log}" rule bwa_index: input: - f"{outdir}/metaspades/contigs.fasta" + f"{outdir}/metaspades_assembly/contigs.fasta" output: - multiext(f"{outdir}/metaspades/contigs.fasta.", "ann", "bwt", "pac", "sa", "amb") + multiext(f"{outdir}/metaspades_assembly/contigs.fasta.", "ann", "bwt", "pac", "sa", "amb") log: f"{outdir}/logs/bwa.index.log" conda: @@ -72,82 +103,126 @@ rule bwa_index: rule bwa_align: input: - fastq = 
f"{outdir}/workflow/input.fq", - contigs = f"{outdir}/metaspades/contigs.fasta", - indices = multiext(f"{outdir}/metaspades/contigs.fasta.", "ann", "bwt", "pac", "sa", "amb") + multiext(f"{outdir}/metaspades_assembly/contigs.fasta.", "ann", "bwt", "pac", "sa", "amb"), + fastq = collect(outdir + "/fastq_preproc/input.R{X}.fq.gz", X = [1,2]), + contigs = f"{outdir}/metaspades_assembly/contigs.fasta" output: - f"{outdir}/align/reads-to-metaspades.bam" + f"{outdir}/reads-to-metaspades.bam" log: bwa = f"{outdir}/logs/align.bwa.log", samsort = f"{outdir}/logs/sort.alignments.log" + threads: + workflow.cores conda: f"{envdir}/align.yaml" shell: - "bwa mem -C -p {input.contigs} /path/to/reads 2> {log.bwa} | samtools sort -O bam -o {output} - 2> {log.samsort}" + "bwa mem -C -t {threads} {input.contigs} {input.fastq} 2> {log.bwa} | samtools sort -O bam -o {output} - 2> {log.samsort}" rule index_alignment: input: - f"{outdir}/align/reads-to-metaspades.bam" + f"{outdir}/reads-to-metaspades.bam" output: - f"{outdir}/align/reads-to-metaspades.bam.bai" + f"{outdir}/reads-to-metaspades.bam.bai" log: f"{outdir}/logs/index.alignments.log" - conda: - f"{envdir}/align.yaml" shell: "samtools index {input} 2> {log}" +rule interleave_fastq: + input: + collect(outdir + "/fastq_preproc/input.R{FR}.fq.gz", FR = [1,2]) + output: + f"{outdir}/fastq_preproc/interleaved.fq" + shell: + "seqtk mergepe {input} > {output}" + rule athena_config: input: - fastq = f"{outdir}/workflow/input.fq", - bam = f"{outdir}/align/reads-to-metaspades.bam", - bai = f"{outdir}/align/reads-to-metaspades.bam.bai", - contigs = f"{outdir}/metaspades/contigs.fasta" + f"{outdir}/reads-to-metaspades.bam.bai", + fastq = f"{outdir}/fastq_preproc/interleaved.fq", + bam = f"{outdir}/reads-to-metaspades.bam", + contigs = f"{outdir}/metaspades_assembly/contigs.fasta" output: f"{outdir}/athena/athena.config" + params: + threads = workflow.cores run: - with open(output[0], "w") as conf: - _ = conf.write("{\n") - _ = conf.write("\"input_fqs\": \"/path/to/fq\",\n") - _ = conf.write("\"ctgfasta_path\": \"/path/to/seeds.fa\",\n") - _ = conf.write("\"reads_ctg_bam_path\": \"/path/to/reads_2_seeds.bam\"\n") - _ = conf.write("}\n") + import json + config_data = { + "input_fqs": input.fastq, + "ctgfasta_path": input.contigs, + "reads_ctg_bam_path": input.bam, + "cluster_settings": { + "processes": params.threads, + "cluster_options": { + "extra_params": {"run_local": "True"} + } + } + } + with open(output[0], "w") as conf: + json.dump(config_data, conf, indent=4) rule athena: input: - fastq = f"{outdir}/workflow/input.fq", - bam = f"{outdir}/align/reads-to-metaspades.bam", - bai = f"{outdir}/align/reads-to-metaspades.bam.bai", - contigs = f"{outdir}/metaspades/contigs.fasta", + multiext(f"{outdir}/reads-to-metaspades.", "bam", "bam.bai"), + f"{outdir}/fastq_preproc/interleaved.fq", + f"{outdir}/metaspades_assembly/contigs.fasta", config = f"{outdir}/athena/athena.config" output: - local_asm = f"{outdir}/athena/results/olc/flye-input-contigs.fa", - final_asm = f"{outdir}/athena/results/olc/athena.asm.fa" + temp(directory(collect(outdir + "/athena/{X}", X = ["results", "logs", "working"]))), + f"{outdir}/athena/flye-input-contigs.fa", + f"{outdir}/athena/athena.asm.fa", log: f"{outdir}/logs/athena.log" + params: + local_asm = f"{outdir}/athena/results/olc/flye-input-contigs.fa", + final_asm = f"{outdir}/athena/results/olc/athena.asm.fa", + result_dir = f"{outdir}/athena" conda: f"{envdir}/metassembly.yaml" shell: - "athena-meta --config {input.config}" + """ + 
athena-meta --config {input.config} 2> {log} &&\\ + mv {params.local_asm} {params.final_asm} {params.result_dir} + """ -#TODO figure this part out -# is it sorted reads, interleaved? Can I just use the starting ones? maybe just the corrected metaspades ones -rule pangaea: +rule workflow_summary: + default_target: True input: - F_fq = f"{outdir}/metaspades/corrected/corrected/input_100.0_0.cor.fastq.gz", - R_fq = f"{outdir}/metaspades/corrected/corrected/input_200.0_0.cor.fastq.gz", - spades_contigs = f"{outdir}/metaspades/corrected/contigs.fasta", - athena_local = f"{outdir}/athena/results/olc/flye-input-contigs.fa", - athena_hybrid = f"{outdir}/athena/results/olc/athena.asm.fa" - output: - f"{outdir}/pangaea/final.asm.fa" + f"{outdir}/athena/athena.asm.fa" params: - outdir = f"{outdir}/pangaea", - lt = cont_cov, - c = clusters - log: - f"{outdir}/logs/pangaea.log" - shell: - "python pangaea_path/pangaea.py -1 {input.F_fq} -2 {input.R_fq} -sp {input.spades_contigs} -lc {input.athena_local} -at {input.athena_hybrid} -lt {params.lt} -c {params.c} -o {params.outdir} 2> {log}" + bx = config["barcode_tag"].upper(), + k_param = k_param, + max_mem = max_mem, + extra = extra + run: + summary_template = f""" +The harpy metassembly workflow ran using these parameters: + +FASTQ inputs were sorted by their linked-read barcodes: + samtools import -T "*" FQ1 FQ2 | + samtools sort -O SAM -t {{params.bx}} | + samtools fastq -T "*" -1 FQ_out1 -2 FQ_out2 + +Barcode-sorted FASTQ files had "-1" appended to the barcode to make them Athena-compliant: + sed 's/{{params.bx}}:Z:[^[:space:]]*/&-1/g' FASTQ | bgzip > FASTQ_OUT + +Reads were assembled using metaspades: + k values: {{params.k_param}} + maximum memory: {{params.max_mem}} + extra parameters: {{params.extra}} + +The Athena-compliant FASTQ files were aligned to the metaspades assembly using BWA: + bwa mem -C -t THREADS metaspades.contigs FQ1 FQ2 | samtools sort -O bam - + +Barcode-sorted Athena-compliant sequences were interleaved with seqtk: + seqtk mergepe FQ1 FQ2 > INTERLEAVED.FQ + +Athena ran using the config file Harpy built from the outputs of the previous steps: + athena-meta --config athena.config -#rule workflow_summary: +The Snakemake workflow was called via command line: + {config["workflow_call"]} +""" + with open(outdir + "/workflow/metassembly.summary", "w") as f: + f.write(summary_template.format(params=params)) diff --git a/harpy/snakefiles/phase.smk b/harpy/snakefiles/phase.smk index caba10da2..531be1e57 100644 --- a/harpy/snakefiles/phase.smk +++ b/harpy/snakefiles/phase.smk @@ -22,7 +22,7 @@ molecule_distance = config["molecule_distance"] extra = config.get("extra", "") outdir = config["output_directory"] envdir = os.getcwd() + "/.harpy_envs" -skipreports = config["skip_reports"] +skip_reports = config["skip_reports"] samples_from_vcf = config["samples_from_vcf"] variantfile = config["inputs"]["variantfile"] bamlist = config["inputs"]["alignments"] @@ -272,22 +272,30 @@ rule workflow_summary: default_target: True input: vcf = outdir + "/variants.phased.bcf", - reports = outdir + "/reports/phase.html" if not skipreports else [] + reports = outdir + "/reports/phase.html" if not skip_reports else [] params: prune = f"--threshold {pruning}" if pruning > 0 else "--no_prune 1", extra = extra run: + summary_template = f""" +The harpy phase workflow ran using these parameters: + +The provided variant file: {variantfile} + +The variant file was split by sample and filtered for heterozygous sites using: + bcftools view -s SAMPLE | bcftools view -i 'GT="het"' +
+Phasing was performed using the components of HapCut2: + extractHAIRS {linkarg} --nf 1 --bam sample.bam --VCF sample.vcf --out sample.unlinked.frags + LinkFragments.py --bam sample.bam --VCF sample.vcf --fragments sample.unlinked.frags --out sample.linked.frags -d {molecule_distance} + HAPCUT2 --fragments sample.linked.frags --vcf sample.vcf --out sample.blocks --nf 1 --error_analysis_mode 1 --call_homozygous 1 --outvcf 1 {params.prune} {params.extra} + +Variant annotation was performed using: + bcftools annotate -a sample.phased.vcf -c CHROM,POS,FMT/GT,FMT/PS,FMT/PQ,FMT/PD -m +HAPCUT + bcftools merge --output-type b samples.annot.bcf + +The Snakemake workflow was called via command line: + {config["workflow_call"]} +""" with open(outdir + "/workflow/phase.summary", "w") as f: - _ = f.write("The harpy phase workflow ran using these parameters:\n\n") - _ = f.write(f"The provided variant file: {variantfile}\n") - _ = f.write("The variant file was split by sample and filtered for heterozygous sites using:\n") - _ = f.write(""" bcftools view -s SAMPLE | bcftools view -i 'GT="het"' \n""") - _ = f.write("Phasing was performed using the components of HapCut2:\n") - _ = f.write(" extractHAIRS " + linkarg + " --nf 1 --bam sample.bam --VCF sample.vcf --out sample.unlinked.frags\n") - _ = f.write(" LinkFragments.py --bam sample.bam --VCF sample.vcf --fragments sample.unlinked.frags --out sample.linked.frags -d " + f"{molecule_distance}" + "\n") - _ = f.write(" HAPCUT2 --fragments sample.linked.frags --vcf sample.vcf --out sample.blocks --nf 1 --error_analysis_mode 1 --call_homozygous 1 --outvcf 1" + f" {params[0]} {params[1]}" + "\n\n") - _ = f.write("Variant annotation was performed using:\n") - _ = f.write(" bcftools annotate -a sample.phased.vcf -c CHROM,POS,FMT/GT,FMT/PS,FMT/PQ,FMT/PD -m +HAPCUT \n") - _ = f.write(" bcftools merge --output-type b samples.annot.bcf\n\n") - _ = f.write("\nThe Snakemake workflow was called via command line:\n") - _ = f.write(" " + str(config["workflow_call"]) + "\n") + f.write(summary_template) diff --git a/harpy/snakefiles/preflight_bam.smk b/harpy/snakefiles/preflight_bam.smk index 3245c97c0..5d5cca669 100644 --- a/harpy/snakefiles/preflight_bam.smk +++ b/harpy/snakefiles/preflight_bam.smk @@ -85,9 +85,14 @@ rule workflow_summary: outdir + "/filecheck.bam.html" run: os.makedirs(f"{outdir}/workflow/", exist_ok= True) + summary_template = f""" +The harpy preflight bam workflow ran using these parameters: + +Validations were performed with: + check_bam.py sample.bam > sample.txt + +The Snakemake workflow was called via command line: + {config["workflow_call"]} +""" with open(outdir + "/workflow/preflight.bam.summary", "w") as f: - _ = f.write("The harpy preflight bam workflow ran using these parameters:\n\n") - _ = f.write("validations were performed with:\n") - _ = f.write(" check_bam.py sample.bam > sample.txt\n") - _ = f.write("\nThe Snakemake workflow was called via command line:\n") - _ = f.write(" " + str(config["workflow_call"]) + "\n") \ No newline at end of file + f.write(summary_template) diff --git a/harpy/snakefiles/preflight_fastq.smk b/harpy/snakefiles/preflight_fastq.smk index 93b9d1a0b..c7b28cd86 100644 --- a/harpy/snakefiles/preflight_fastq.smk +++ b/harpy/snakefiles/preflight_fastq.smk @@ -83,9 +83,14 @@ rule workflow_summary: outdir + "/filecheck.fastq.html" run: os.makedirs(f"{outdir}/workflow/", exist_ok= True) + summary_template = f""" +The harpy preflight fastq workflow ran using these parameters: + +Validations were performed with: + 
check_fastq.py sample.fastq > sample.txt + +The Snakemake workflow was called via command line: + {config["workflow_call"]} +""" with open(outdir + "/workflow/preflight.fastq.summary", "w") as f: - _ = f.write("The harpy preflight fastq workflow ran using these parameters:\n\n") - _ = f.write("validations were performed with:\n") - _ = f.write(" check_fastq.py sample.fastq > sample.txt\n") - _ = f.write("\nThe Snakemake workflow was called via command line:\n") - _ = f.write(" " + str(config["workflow_call"]) + "\n") \ No newline at end of file + f.write(summary_template) diff --git a/harpy/snakefiles/qc.smk b/harpy/snakefiles/qc.smk index f4e04dfcf..eff482a21 100644 --- a/harpy/snakefiles/qc.smk +++ b/harpy/snakefiles/qc.smk @@ -27,7 +27,7 @@ if deconvolve: decon_w = deconvolve["window_size"] decon_d = deconvolve["density"] decon_a = deconvolve["dropout"] -skipreports = config["skip_reports"] +skip_reports = config["skip_reports"] bn_r = r"([_\.][12]|[_\.][FR]|[_\.]R[12](?:\_00[0-9])*)?\.((fastq|fq)(\.gz)?)$" samplenames = {re.sub(bn_r, "", os.path.basename(i), flags = re.IGNORECASE) for i in fqlist} @@ -172,8 +172,8 @@ rule workflow_summary: default_target: True input: fq = collect(outdir + "/{sample}.{FR}.fq.gz", FR = ["R1", "R2"], sample = samplenames), - bx_report = outdir + "/reports/barcode.summary.html" if not skipreports else [], - agg_report = outdir + "/reports/qc.report.html" if not skipreports else [] + bx_report = outdir + "/reports/barcode.summary.html" if not skip_reports else [], + agg_report = outdir + "/reports/qc.report.html" if not skip_reports else [] params: minlen = f"--length_required {min_len}", maxlen = f"--max_len1 {max_len}", @@ -181,15 +181,24 @@ rule workflow_summary: dedup = "-D" if dedup else "", extra = extra run: + if deconvolve: + deconvolve_text = "Deconvolution occurred using QuickDeconvolution:\n" + deconvolve_text += f" QuickDeconvolution -t threads -i infile.fq -o output.fq -k {decon_k} -w {decon_w} -d {decon_d} -a {decon_a}\n" + deconvolve_text += "The interleaved output was split back into forward and reverse reads with seqtk:\n" + deconvolve_text += " seqtk -1 interleaved.fq | gzip > file.R1.fq.gz\n" + deconvolve_text += " seqtk -2 interleaved.fq | gzip > file.R2.fq.gz\n" + else: + deconvolve_text = "" + summary_template = f""" +The harpy qc workflow ran using these parameters: + +fastp ran using: + fastp --trim_poly_g --cut_right {params} + +{deconvolve_text} + +The Snakemake workflow was called via command line: + {config["workflow_call"]} +""" with open(outdir + "/workflow/qc.summary", "w") as f: - _ = f.write("The harpy qc workflow ran using these parameters:\n\n") - _ = f.write("fastp trimming ran using:\n") - _ = f.write(" fastp --trim_poly_g --cut_right " + " ".join(params) + "\n") - if deconvolve: - _ = f.write("Deconvolution occurred using QuickDeconvolution:\n") - _ = f.write(f" QuickDeconvolution -t threads -i infile.fq -o output.fq -k {decon_k} -w {decon_w} -d {decon_d} -a {decon_a}\n") - _ = f.write("The interleaved output was split back into forward and reverse reads with seqtk:\n") - _ = f.write(" seqtk -1 interleaved.fq | gzip > file.R1.fq.gz\n") - _ = f.write(" seqtk -2 interleaved.fq | gzip > file.R2.fq.gz\n") - _ = f.write("\nThe Snakemake workflow was called via command line:\n") - _ = f.write(" " + str(config["workflow_call"]) + "\n") \ No newline at end of file + f.write(summary_template) diff --git a/harpy/snakefiles/simulate_linkedreads.smk b/harpy/snakefiles/simulate_linkedreads.smk index b6e523b3f..b3de9a10e 100644 --- 
diff --git a/harpy/snakefiles/simulate_linkedreads.smk b/harpy/snakefiles/simulate_linkedreads.smk
index b6e523b3f..b3de9a10e 100644
--- a/harpy/snakefiles/simulate_linkedreads.smk
+++ b/harpy/snakefiles/simulate_linkedreads.smk
@@ -183,16 +183,24 @@ rule workflow_summary:
         dwgmutationrate = config["mutation_rate"],
         dwgprefix = outdir + "/dwgsim_simulated/dwgsim.hap.12"
     run:
+        summary_template = f"""
+The harpy simulate linkedreads workflow ran using these parameters:
+
+Genome haplotype 1: {gen_hap1}
+Genome haplotype 2: {gen_hap2}
+Barcode file: {barcodefile}
+
+Reads were simulated from the provided genomes using:
+    dwgsim -N {{params.dwgreadpairs}} -e 0.0001,0.0016 -E 0.0001,0.0016 -d {{params.dwgouterdist}} -s {{params.dwgdistsd}} -1 135 -2 151 -H -y 0 -S 0 -c 0 -R 0 -r {{params.dwgmutationrate}} -F 0 -o 1 -m /dev/null GENO PREFIX
+
+LRSIM was started from step 3 (-u 3) with these parameters:
+    LRSIM_harpy.pl -g genome1,genome2 -p {{params.lrsproj_dir}}/lrsim/sim -b BARCODES -r {{params.lrsproj_dir}} -i {{params.lrsoutdist}} -s {{params.lrsdist_sd}} -x {{params.lrsn_pairs}} -f {{params.lrsmol_len}} -t {{params.lrsparts}} -m {{params.lrsmols_per}} -z THREADS {{params.lrsstatic}}
+
+10X style barcodes were converted into haplotag BX:Z tags using:
+    10xtoHaplotag.py
+
+The Snakemake workflow was called via command line:
+    {config["workflow_call"]}
+"""
         with open(outdir + "/workflow/simulate.reads.summary", "w") as f:
-            _ = f.write("The harpy simulate linkedreas workflow ran using these parameters:\n\n")
-            _ = f.write(f"Genome haplotype 1: {gen_hap1}\n")
-            _ = f.write(f"Genome haplotype 2: {gen_hap2}\n")
-            _ = f.write(f"Barcode file: {barcodefile}\n")
-            _ = f.write("Reads were simulated from the provided genomes using:\n")
-            _ = f.write(f"    dwgsim -N {params.dwgreadpairs} -e 0.0001,0.0016 -E 0.0001,0.0016 -d {params.dwgouterdist} -s {params.dwgdistsd} -1 135 -2 151 -H -y 0 -S 0 -c 0 -R 0 -r {params.dwgmutationrate} -F 0 -o 1 -m /dev/null GENO PREFIX\n")
-            _ = f.write("LRSIM was started from step 3 (-u 3) with these parameters:\n")
-            _ = f.write(f"    LRSIM_harpy.pl -g genome1,genome2 -p {params.lrsproj_dir}/lrsim/sim -b BARCODES -r {params.lrsproj_dir} -i {params.lrsoutdist} -s {params.lrsdist_sd} -x {params.lrsn_pairs} -f {params.lrsmol_len} -t {params.lrsparts} -m {params.lrsmols_per} -z THREADS {params.lrsstatic}\n")
-            _ = f.write("10X style barcodes were converted in haplotag BX:Z tags using:\n")
-            _ = f.write("    10xtoHaplotag.py\n")
-            _ = f.write("The Snakemake workflow was called via command line:\n")
-            _ = f.write("    " + str(config["workflow_call"]) + "\n")
\ No newline at end of file
+            f.write(summary_template.format(params = params))
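[Editor's note] The simulate_linkedreads hunk above mixes two formatting passes: module-level names (gen_hap1, barcodefile, ...) are interpolated immediately by the f-string, while the doubled braces {{params.*}} survive as {params.*} placeholders for the later summary_template.format(params = params) call; str.format supports attribute lookup, so params.dwgreadpairs resolves at that second pass. A small sketch of the same two-pass idea (SimpleNamespace stands in for Snakemake's params object):

from types import SimpleNamespace

gen_hap1 = "hap1.fasta"                       # filled at f-string time
template = f"""
Genome haplotype 1: {gen_hap1}
Reads were simulated using:
    dwgsim -N {{params.dwgreadpairs}} -r {{params.dwgmutationrate}} GENO PREFIX
"""
# filled at .format() time, via attribute lookup on the passed object
params = SimpleNamespace(dwgreadpairs=500_000, dwgmutationrate=0.001)
print(template.format(params=params))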
collect(outdir + "/reports/variants.{file}.html", file = ["raw"]) if not skip_reports else [] params: ploidy = f"-p {ploidy}", populations = f"--populations {groupings}" if groupings else '', extra = extra run: + if windowsize: + windowtext = f"Size of intervals to split genome for variant calling: {windowsize}" + else: + windowtext = f"Genomic positions for which variants were called: {regioninput}" + + summary_template = f""" +The harpy snp freebayes workflow ran using these parameters: + +The provided genome: {bn} + +{windowtext} + +The freebayes parameters: + freebayes -f GENOME -L samples.list -r REGION {params} | + bcftools sort - + +The variants identified in the intervals were merged into the final variant file using: + bcftools concat -f bcf.files -a --remove-duplicates + +The variants were normalized using: + bcftools norm -m -both -d both + +The Snakemake workflow was called via command line: + {config["workflow_call"]} +""" with open(outdir + "/workflow/snp.freebayes.summary", "w") as f: - _ = f.write("The harpy snp freebayes workflow ran using these parameters:\n\n") - _ = f.write(f"The provided genome: {bn}\n") - _ = f.write(f"Size of intervals to split genome for variant calling: {windowsize}\n") - _ = f.write("The freebayes parameters:\n") - _ = f.write(" freebayes -f GENOME -L samples.list -r REGION " + " ".join(params) + " | bcftools sort -\n") - _ = f.write("The variants identified in the intervals were merged into the final variant file using:\n") - _ = f.write(" bcftools concat -f bcf.files -a --remove-duplicates\n") - _ = f.write("The variants were normalized using:\n") - _ = f.write(" bcftools norm -m -both -d both\n") - _ = f.write("\nThe Snakemake workflow was called via command line:\n") - _ = f.write(" " + str(config["workflow_call"]) + "\n") \ No newline at end of file + f.write(summary_template) diff --git a/harpy/snakefiles/snp_mpileup.smk b/harpy/snakefiles/snp_mpileup.smk index b2d227596..01750ffe5 100644 --- a/harpy/snakefiles/snp_mpileup.smk +++ b/harpy/snakefiles/snp_mpileup.smk @@ -19,7 +19,7 @@ mp_extra = config.get("extra", "") regiontype = config["regiontype"] windowsize = config.get("windowsize", None) outdir = config["output_directory"] -skipreports = config["skip_reports"] +skip_reports = config["skip_reports"] bamlist = config["inputs"]["alignments"] bamdict = dict(zip(bamlist, bamlist)) genomefile = config["inputs"]["genome"] @@ -245,25 +245,37 @@ rule workflow_summary: input: vcf = collect(outdir + "/variants.{file}.bcf", file = ["raw"]), agg_log = outdir + "/logs/mpileup.log", - reports = collect(outdir + "/reports/variants.{file}.html", file = ["raw", "normalized"]) if not skipreports else [] + reports = collect(outdir + "/reports/variants.{file}.html", file = ["raw", "normalized"]) if not skip_reports else [] params: ploidy = f"--ploidy {ploidy}", populations = f"--populations {groupings}" if groupings else "--populations -" run: + if windowsize: + windowtext = f"Size of intervals to split genome for variant calling: {windowsize}" + else: + windowtext = f"Genomic positions for which variants were called: {regioninput}" + summary_template = f""" +The harpy snp mpileup workflow ran using these parameters: + +The provided genome: {bn} + +{windowtext} + +The mpileup parameters: + bcftools mpileup --fasta-ref GENOME --region REGION --bam-list BAMS --annotate AD --output-type b {mp_extra} + +The bcftools call parameters: + bcftools call --multiallelic-caller {params} --variants-only --output-type b | + bcftools sort - + +The variants identified in 
the intervals were merged into the final variant file using: + bcftools concat -f bcf.files -a --remove-duplicates + +The variants were normalized using: + bcftools norm -m -both -d both + +The Snakemake workflow was called via command line: + {config["workflow_call"]} +""" with open(outdir + "/workflow/snp.mpileup.summary", "w") as f: - _ = f.write("The harpy snp mpileup workflow ran using these parameters:\n\n") - _ = f.write(f"The provided genome: {bn}\n") - if windowsize: - _ = f.write(f"Size of intervals to split genome for variant calling: {windowsize}\n") - else: - _ = f.write(f"Genomic positions for which variants were called: {regioninput}\n") - _ = f.write("The mpileup parameters:\n") - _ = f.write(" bcftools mpileup --fasta-ref GENOME --region REGION --bam-list BAMS --annotate AD --output-type b" + mp_extra + "\n") - _ = f.write("The bcftools call parameters:\n") - _ = f.write(" bcftools call --multiallelic-caller " + " ".join(params) + " --variants-only --output-type b | bcftools sort -\n") - _ = f.write("The variants identified in the intervals were merged into the final variant file using:\n") - _ = f.write(" bcftools concat -f bcf.files -a --remove-duplicates\n") - _ = f.write("The variants were normalized using:\n") - _ = f.write(" bcftools norm -m -both -d both\n") - _ = f.write("\nThe Snakemake workflow was called via command line:\n") - _ = f.write(" " + str(config["workflow_call"]) + "\n") \ No newline at end of file + f.write(summary_template) diff --git a/harpy/snakefiles/sv_leviathan.smk b/harpy/snakefiles/sv_leviathan.smk index 23f5f48d2..1bfb0ba3c 100644 --- a/harpy/snakefiles/sv_leviathan.smk +++ b/harpy/snakefiles/sv_leviathan.smk @@ -24,7 +24,7 @@ min_bc = config["min_barcodes"] iterations = config["iterations"] extra = config.get("extra", "") outdir = config["output_directory"] -skipreports = config["skip_reports"] +skip_reports = config["skip_reports"] bn = os.path.basename(genomefile) genome_zip = True if bn.lower().endswith(".gz") else False if genome_zip: @@ -203,19 +203,26 @@ rule workflow_summary: input: vcf = collect(outdir + "/vcf/{sample}.bcf", sample = samplenames), bedpe_agg = collect(outdir + "/{sv}.bedpe", sv = ["inversions", "deletions","duplications", "breakends"]), - reports = collect(outdir + "/reports/{sample}.SV.html", sample = samplenames) if not skipreports else [] + reports = collect(outdir + "/reports/{sample}.SV.html", sample = samplenames) if not skip_reports else [] params: min_sv = f"-v {min_sv}", min_bc = f"-c {min_bc}", iters = f"-B {iterations}", extra = extra run: + summary_template = f""" +The harpy sv leviathan workflow ran using these parameters: + +The provided genome: {bn} + +The barcodes were indexed using: + LRez index bam -p -b INPUT + +Leviathan was called using: + LEVIATHAN -b INPUT -i INPUT.BCI -g GENOME {params} + +The Snakemake workflow was called via command line: + {config["workflow_call"]} +""" with open(outdir + "/workflow/sv.leviathan.summary", "w") as f: - _ = f.write("The harpy sv leviathan workflow ran using these parameters:\n\n") - _ = f.write(f"The provided genome: {bn}\n") - _ = f.write("The barcodes were indexed using:\n") - _ = f.write(" LRez index bam -p -b INPUT\n") - _ = f.write("Leviathan was called using:\n") - _ = f.write(f" LEVIATHAN -b INPUT -i INPUT.BCI -g GENOME {params}\n") - _ = f.write("\nThe Snakemake workflow was called via command line:\n") - _ = f.write(" " + str(config["workflow_call"]) + "\n") \ No newline at end of file + f.write(summary_template) diff --git 
a/harpy/snakefiles/sv_leviathan_pop.smk b/harpy/snakefiles/sv_leviathan_pop.smk index 1d3751669..86f5191ad 100644 --- a/harpy/snakefiles/sv_leviathan_pop.smk +++ b/harpy/snakefiles/sv_leviathan_pop.smk @@ -23,7 +23,7 @@ min_sv = config["min_sv"] min_bc = config["min_barcodes"] iterations = config["iterations"] outdir = config["output_directory"] -skipreports = config["skip_reports"] +skip_reports = config["skip_reports"] bn = os.path.basename(genomefile) if bn.lower().endswith(".gz"): bn = bn[:-3] @@ -264,20 +264,27 @@ rule workflow_summary: input: vcf = collect(outdir + "/vcf/{pop}.bcf", pop = populations), bedpe_agg = collect(outdir + "/{sv}.bedpe", sv = ["inversions", "deletions","duplications", "breakends"]), - reports = collect(outdir + "/reports/{pop}.sv.html", pop = populations) if not skipreports else [], - agg_report = outdir + "/reports/leviathan.summary.html" if not skipreports else [] + reports = collect(outdir + "/reports/{pop}.sv.html", pop = populations) if not skip_reports else [], + agg_report = outdir + "/reports/leviathan.summary.html" if not skip_reports else [] params: min_sv = f"-v {min_sv}", min_bc = f"-c {min_bc}", iters = f"-B {iterations}", extra = extra run: + summary_template = f""" +The harpy sv leviathan workflow ran using these parameters: + +The provided genome: {bn} + +The barcodes were indexed using: + LRez index bam -p -b INPUT + +Leviathan was called using: + LEVIATHAN -b INPUT -i INPUT.BCI -g GENOME {params} + +The Snakemake workflow was called via command line: + {config["workflow_call"]} +""" with open(outdir + "/workflow/sv.leviathan.summary", "w") as f: - _ = f.write("The harpy sv leviathan workflow ran using these parameters:\n\n") - _ = f.write(f"The provided genome: {bn}\n") - _ = f.write("The barcodes were indexed using:\n") - _ = f.write(" LRez index bam -p -b INPUT\n") - _ = f.write("Leviathan was called using:\n") - _ = f.write(f" LEVIATHAN -b INPUT -i INPUT.BCI -g GENOME {params}\n") - _ = f.write("\nThe Snakemake workflow was called via command line:\n") - _ = f.write(" " + str(config["workflow_call"]) + "\n") \ No newline at end of file + f.write(summary_template) diff --git a/harpy/snakefiles/sv_naibr.smk b/harpy/snakefiles/sv_naibr.smk index 6d041ceac..eb3eb23c2 100644 --- a/harpy/snakefiles/sv_naibr.smk +++ b/harpy/snakefiles/sv_naibr.smk @@ -28,7 +28,7 @@ outdir = config["output_directory"] bn = os.path.basename(genomefile) genome_zip = True if bn.lower().endswith(".gz") else False bn_idx = f"{bn}.gzi" if genome_zip else f"{bn}.fai" -skipreports = config["skip_reports"] +skip_reports = config["skip_reports"] def process_args(args): argsDict = { @@ -44,6 +44,8 @@ def process_args(args): argsDict[i[0].lstrip("-")] = i[1] return argsDict +argdict = process_args(extra) + def get_alignments(wildcards): """returns a list with the bam file for the sample based on wildcards.sample""" r = re.compile(fr".*/({wildcards.sample})\.(bam|sam)$", flags = re.IGNORECASE) @@ -75,7 +77,6 @@ rule naibr_config: lambda wc: wc.get("sample"), min(10, workflow.cores) run: - argdict = process_args(extra) with open(output[0], "w") as conf: _ = conf.write(f"bam_file={input[0]}\n") _ = conf.write(f"prefix={params[0]}\n") @@ -213,18 +214,23 @@ rule workflow_summary: input: bedpe = collect(outdir + "/bedpe/{sample}.bedpe", sample = samplenames), bedpe_agg = collect(outdir + "/{sv}.bedpe", sv = ["inversions", "deletions","duplications"]), - reports = collect(outdir + "/reports/{sample}.naibr.html", sample = samplenames) if not skipreports else [] + reports = 
collect(outdir + "/reports/{sample}.naibr.html", sample = samplenames) if not skip_reports else [] run: os.system(f"rm -rf {outdir}/naibrlog") - argdict = process_args(extra) + argtext = [f"{k}={v}" for k,v in argdict.items()] + summary_template = f""" +The harpy sv naibr workflow ran using these parameters: + +The provided genome: {bn} + +naibr variant calling ran using these configurations: + bam_file=BAMFILE + prefix=PREFIX + outdir=Variants/naibr/PREFIX + {"\n\t".join(argtext)} + +The Snakemake workflow was called via command line: + {config["workflow_call"]} +""" with open(outdir + "/workflow/sv.naibr.summary", "w") as f: - _ = f.write("The harpy sv naibr workflow ran using these parameters:\n\n") - _ = f.write(f"The provided genome: {bn}\n") - _ = f.write("naibr variant calling ran using these configurations:\n") - _ = f.write(f" bam_file=BAMFILE\n") - _ = f.write(f" prefix=PREFIX\n") - _ = f.write(f" outdir=Variants/naibr/PREFIX\n") - for i in argdict: - _ = f.write(f" {i}={argdict[i]}\n") - _ = f.write("\nThe Snakemake workflow was called via command line:\n") - _ = f.write(" " + str(config["workflow_call"]) + "\n") \ No newline at end of file + f.write(summary_template) diff --git a/harpy/snakefiles/sv_naibr_phase.smk b/harpy/snakefiles/sv_naibr_phase.smk index 489ef208f..feca8d4dd 100644 --- a/harpy/snakefiles/sv_naibr_phase.smk +++ b/harpy/snakefiles/sv_naibr_phase.smk @@ -26,7 +26,7 @@ min_quality = config["min_quality"] min_sv = config["min_sv"] min_barcodes = config["min_barcodes"] outdir = config["output_directory"] -skipreports = config["skip_reports"] +skip_reports = config["skip_reports"] bn = os.path.basename(genomefile) if bn.lower().endswith(".gz"): validgenome = bn[:-3] @@ -51,6 +51,8 @@ def process_args(args): argsDict[i[0].lstrip("-")] = i[1] return argsDict +argdict = process_args(extra) + def get_alignments(wildcards): """returns a list with the bam file for the sample based on wildcards.sample""" r = re.compile(fr".*/({wildcards.sample})\.(bam|sam)$", flags = re.IGNORECASE) @@ -161,7 +163,6 @@ rule naibr_config: lambda wc: wc.get("sample"), min(10, workflow.cores - 1) run: - argdict = process_args(extra) with open(output[0], "w") as conf: _ = conf.write(f"bam_file={input[0]}\n") _ = conf.write(f"prefix={params[0]}\n") @@ -271,20 +272,26 @@ rule workflow_summary: bedpe = collect(outdir + "/bedpe/{sample}.bedpe", sample = samplenames), bedpe_agg = collect(outdir + "/{sv}.bedpe", sv = ["inversions", "deletions","duplications"]), phaselog = outdir + "/logs/whatshap-haplotag.log", - reports = collect(outdir + "/reports/{sample}.naibr.html", sample = samplenames) if not skipreports else [] + reports = collect(outdir + "/reports/{sample}.naibr.html", sample = samplenames) if not skip_reports else [] run: os.system(f"rm -rf {outdir}/naibrlog") - argdict = process_args(extra) + argtext = [f"{k}={v}" for k,v in argdict.items()] + summary_template = f""" +The harpy sv naibr workflow ran using these parameters: + +The provided genome: {bn} + +The alignment files were phased using: + whatshap haplotag --reference genome.fasta --linked-read-distance-cutoff {mol_dist} --ignore-read-groups --tag-supplementary --sample sample_x file.vcf sample_x.bam + +naibr variant calling ran using these configurations: + bam_file=BAMFILE + prefix=PREFIX + outdir=Variants/naibr/PREFIX + {"\n\t".join(argtext)} + +The Snakemake workflow was called via command line: + {config["workflow_call"]} +""" with open(outdir + "/workflow/sv.naibr.summary", "w") as f: - _ = f.write("The harpy sv naibr 
diff --git a/harpy/snakefiles/sv_naibr_pop.smk b/harpy/snakefiles/sv_naibr_pop.smk
index 859047344..517951882 100644
--- a/harpy/snakefiles/sv_naibr_pop.smk
+++ b/harpy/snakefiles/sv_naibr_pop.smk
@@ -25,7 +25,7 @@ min_barcodes = config["min_barcodes"]
 min_quality = config["min_quality"]
 mol_dist = config["molecule_distance"]
 outdir = config["output_directory"]
-skipreports = config["skip_reports"]
+skip_reports = config["skip_reports"]
 bn = os.path.basename(genomefile)
 if bn.lower().endswith(".gz"):
     bn = bn[:-3]
@@ -44,6 +44,8 @@ def process_args(args):
             argsDict[i[0].lstrip("-")] = i[1]
     return argsDict

+argdict = process_args(extra)
+
 # create dictionary of population => filenames
 ## this makes it easier to set the snakemake rules/wildcards
 def pop_manifest(groupingfile, filelist):
@@ -124,7 +126,6 @@ rule naibr_config:
         lambda wc: wc.get("population"),
         min(10, workflow.cores - 1)
     run:
-        argdict = process_args(extra)
         with open(output[0], "w") as conf:
             _ = conf.write(f"bam_file={input[0]}\n")
             _ = conf.write(f"outdir={outdir}/{params[0]}\n")
@@ -258,22 +259,29 @@ rule workflow_summary:
     input:
         bedpe = collect(outdir + "/bedpe/{pop}.bedpe", pop = populations),
         bedpe_agg = collect(outdir + "/{sv}.bedpe", sv = ["inversions", "deletions","duplications"]),
-        reports = collect(outdir + "/reports/{pop}.naibr.html", pop = populations) if not skipreports else [],
-        agg_report = outdir + "/reports/naibr.pop.summary.html" if not skipreports else []
+        reports = collect(outdir + "/reports/{pop}.naibr.html", pop = populations) if not skip_reports else [],
+        agg_report = outdir + "/reports/naibr.pop.summary.html" if not skip_reports else []
     run:
         os.system(f"rm -rf {outdir}/naibrlog")
         argdict = process_args(extra)
+        argtext = "\n    ".join(f"{k}={v}" for k, v in argdict.items())
+        summary_template = f"""
+The harpy sv naibr workflow ran using these parameters:
+
+The provided genome: {bn}
+The sample grouping file: {groupfile}
+
+The alignments were concatenated using:
+    concatenate_bam.py -o groupname.bam -b samples.list
+
+naibr variant calling ran using these configurations:
+    bam_file=BAMFILE
+    prefix=PREFIX
+    outdir=Variants/naibr/PREFIX
+    {argtext}
+
+The Snakemake workflow was called via command line:
+    {config["workflow_call"]}
+"""
         with open(outdir + "/workflow/sv.naibr.summary", "w") as f:
-            _ = f.write("The harpy sv naibr workflow ran using these parameters:\n\n")
-            _ = f.write(f"The provided genome: {bn}\n")
-            _ = f.write(f"The sample grouping file: {groupfile}\n\n")
-            _ = f.write("The alignments were concatenated using:\n")
-            _ = f.write("    concatenate_bam.py -o groupname.bam -b samples.list\n")
-            _ = f.write("naibr variant calling ran using these configurations:\n")
-            _ = f.write(f"    bam_file=BAMFILE\n")
-            _ = f.write(f"    prefix=PREFIX\n")
-            _ = f.write(f"    outdir=Variants/naibr/PREFIX\n")
-            for i in argdict:
-                _ = f.write(f"    {i}={argdict[i]}\n")
-            _ = f.write("\nThe Snakemake workflow was called via command line:\n")
-            _ = f.write("    " + str(config["workflow_call"]) + "\n")
+            f.write(summary_template)
= f.write(f" outdir=Variants/naibr/PREFIX\n") - for i in argdict: - _ = f.write(f" {i}={argdict[i]}\n") - _ = f.write("\nThe Snakemake workflow was called via command line:\n") - _ = f.write(" " + str(config["workflow_call"]) + "\n") \ No newline at end of file + f.write(summary_template) diff --git a/harpy/snp.py b/harpy/snp.py index ba6cdf50c..4fd8e78d9 100644 --- a/harpy/snp.py +++ b/harpy/snp.py @@ -30,7 +30,7 @@ def snp(): }, { "name": "Workflow Controls", - "options": ["--conda", "--hpc", "--output-dir", "--quiet", "--skipreports", "--snakemake", "--threads", "--help"], + "options": ["--conda", "--hpc", "--output-dir", "--quiet", "--skip-reports", "--snakemake", "--threads", "--help"], }, ], "harpy snp freebayes": [ @@ -40,12 +40,12 @@ def snp(): }, { "name": "Workflow Controls", - "options": ["--conda", "--hpc", "--output-dir", "--quiet", "--skipreports", "--snakemake", "--threads", "--help"], + "options": ["--conda", "--hpc", "--output-dir", "--quiet", "--skip-reports", "--snakemake", "--threads", "--help"], }, ] } -@click.command(no_args_is_help = True, context_settings=dict(allow_interspersed_args=False), epilog = "See the documentation for more information: https://pdimens.github.io/harpy/modules/snp") +@click.command(no_args_is_help = True, context_settings=dict(allow_interspersed_args=False), epilog = "See the documentation for more information: https://pdimens.github.io/harpy/workflows/snp") @click.option('-x', '--extra-params', type = str, help = 'Additional variant caller parameters, in quotes') @click.option('-g', '--genome', type=click.Path(exists=True, dir_okay=False, readable=True), required = True, help = 'Genome assembly for variant calling') @click.option('-o', '--output-dir', type = click.Path(exists = False), default = "SNP/mpileup", show_default=True, help = 'Output directory name') @@ -57,10 +57,10 @@ def snp(): @click.option('--conda', is_flag = True, default = False, help = 'Use conda/mamba instead of container') @click.option('--setup-only', is_flag = True, hidden = True, default = False, help = 'Setup the workflow and exit') @click.option('--quiet', is_flag = True, show_default = True, default = False, help = 'Don\'t show output text while running') -@click.option('--skipreports', is_flag = True, show_default = True, default = False, help = 'Don\'t generate HTML reports') +@click.option('--skip-reports', is_flag = True, show_default = True, default = False, help = 'Don\'t generate HTML reports') @click.option('--snakemake', type = str, help = 'Additional Snakemake parameters, in quotes') @click.argument('inputs', required=True, type=click.Path(exists=True, readable=True), nargs=-1) -def mpileup(inputs, output_dir, regions, genome, threads, populations, ploidy, extra_params, snakemake, skipreports, quiet, hpc, conda, setup_only): +def mpileup(inputs, output_dir, regions, genome, threads, populations, ploidy, extra_params, snakemake, skip_reports, quiet, hpc, conda, setup_only): """ Call variants from using bcftools mpileup @@ -91,8 +91,8 @@ def mpileup(inputs, output_dir, regions, genome, threads, populations, ploidy, e os.makedirs(f"{workflowdir}/", exist_ok= True) bamlist, n = parse_alignment_inputs(inputs) - validate_bam_RG(bamlist) - check_fasta(genome) + validate_bam_RG(bamlist, threads, quiet) + check_fasta(genome, quiet) # setup regions checks regtype = validate_regions(regions, genome) @@ -120,7 +120,7 @@ def mpileup(inputs, output_dir, regions, genome, threads, populations, ploidy, e config.write("windowsize: {regions}\n") if extra_params is not None: 
config.write(f"extra: {extra_params}\n") - config.write(f"skip_reports: {skipreports}\n") + config.write(f"skip_reports: {skip_reports}\n") config.write(f"workflow_call: {command}\n") config.write("inputs:\n") config.write(f" genome: {Path(genome).resolve()}\n") @@ -149,7 +149,7 @@ def mpileup(inputs, output_dir, regions, genome, threads, populations, ploidy, e start_text.add_row("Workflow Log:", sm_log.replace(f"{output_dir}/", "") + "[dim].gz") launch_snakemake(command, "snp_mpileup", start_text, output_dir, sm_log, quiet) -@click.command(no_args_is_help = True, context_settings=dict(allow_interspersed_args=False), epilog = "See the documentation for more information: https://pdimens.github.io/harpy/modules/snp") +@click.command(no_args_is_help = True, context_settings=dict(allow_interspersed_args=False), epilog = "See the documentation for more information: https://pdimens.github.io/harpy/workflows/snp") @click.option('-x', '--extra-params', type = str, help = 'Additional variant caller parameters, in quotes') @click.option('-g', '--genome', type=click.Path(exists=True, dir_okay=False, readable=True), required = True, help = 'Genome assembly for variant calling') @click.option('-o', '--output-dir', type = click.Path(exists = False), default = "SNP/freebayes", show_default=True, help = 'Output directory name') @@ -161,10 +161,10 @@ def mpileup(inputs, output_dir, regions, genome, threads, populations, ploidy, e @click.option('--setup-only', is_flag = True, hidden = True, default = False, help = 'Setup the workflow and exit') @click.option('--hpc', type = click.Path(exists = True, file_okay = False, readable=True), help = 'Directory with HPC submission `config.yaml` file') @click.option('--quiet', is_flag = True, show_default = True, default = False, help = 'Don\'t show output text while running') -@click.option('--skipreports', is_flag = True, show_default = True, default = False, help = 'Don\'t generate HTML reports') +@click.option('--skip-reports', is_flag = True, show_default = True, default = False, help = 'Don\'t generate HTML reports') @click.option('--snakemake', type = str, help = 'Additional Snakemake parameters, in quotes') @click.argument('inputs', required=True, type=click.Path(exists=True, readable=True), nargs=-1) -def freebayes(inputs, output_dir, genome, threads, populations, ploidy, regions, extra_params, snakemake, skipreports, quiet, hpc, conda, setup_only): +def freebayes(inputs, output_dir, genome, threads, populations, ploidy, regions, extra_params, snakemake, skip_reports, quiet, hpc, conda, setup_only): """ Call variants using freebayes @@ -195,8 +195,8 @@ def freebayes(inputs, output_dir, genome, threads, populations, ploidy, regions, os.makedirs(f"{workflowdir}/", exist_ok= True) bamlist, n = parse_alignment_inputs(inputs) - validate_bam_RG(bamlist) - check_fasta(genome) + validate_bam_RG(bamlist, threads, quiet) + check_fasta(genome, quiet) # setup regions checks regtype = validate_regions(regions, genome) @@ -224,7 +224,7 @@ def freebayes(inputs, output_dir, genome, threads, populations, ploidy, regions, config.write("windowsize: {regions}\n") if extra_params is not None: config.write(f"extra: {extra_params}\n") - config.write(f"skip_reports: {skipreports}\n") + config.write(f"skip_reports: {skip_reports}\n") config.write(f"workflow_call: {command}\n") config.write("inputs:\n") config.write(f" genome: {Path(genome).resolve()}\n") diff --git a/harpy/stitchparams.py b/harpy/stitchparams.py index 967b64651..784f0892f 100644 --- a/harpy/stitchparams.py +++ 
b/harpy/stitchparams.py @@ -5,7 +5,7 @@ import rich_click as click from ._printing import print_notice -@click.command(no_args_is_help = True, context_settings=dict(allow_interspersed_args=False), epilog = "See the documentation for more information: https://pdimens.github.io/harpy/modules/impute/#parameter-file") +@click.command(no_args_is_help = True, context_settings=dict(allow_interspersed_args=False), epilog = "See the documentation for more information: https://pdimens.github.io/harpy/workflows/impute/#parameter-file") @click.option('-o', '--output', type=str, required = True, metavar = "Output file name", help = 'Name of output STITCH parameter file') def stitchparams(output): """ diff --git a/harpy/sv.py b/harpy/sv.py index 5590d9d44..649404a28 100644 --- a/harpy/sv.py +++ b/harpy/sv.py @@ -34,7 +34,7 @@ def sv(): }, { "name": "Workflow Controls", - "options": ["--conda", "--hpc", "--output-dir", "--quiet", "--skipreports", "--snakemake", "--threads", "--help"], + "options": ["--conda", "--hpc", "--output-dir", "--quiet", "--skip-reports", "--snakemake", "--threads", "--help"], }, ], "harpy sv naibr": [ @@ -44,12 +44,12 @@ def sv(): }, { "name": "Workflow Controls", - "options": ["--conda", "--hpc", "--output-dir", "--quiet", "--skipreports", "--snakemake", "--threads", "--help"], + "options": ["--conda", "--hpc", "--output-dir", "--quiet", "--skip-reports", "--snakemake", "--threads", "--help"], }, ] } -@click.command(no_args_is_help = True, epilog= "See the documentation for more information: https://pdimens.github.io/harpy/modules/sv/leviathan/") +@click.command(no_args_is_help = True, epilog= "See the documentation for more information: https://pdimens.github.io/harpy/workflows/sv/leviathan/") @click.option('-x', '--extra-params', type = str, help = 'Additional variant caller parameters, in quotes') @click.option('-g', '--genome', type=click.Path(exists=True, dir_okay=False, readable=True), required = True, help = 'Genome assembly for variant calling') @click.option('-i', '--iterations', show_default = True, default=50, type = click.IntRange(min = 10, max_open = True), help = 'Number of iterations to perform through index (reduces memory)') @@ -62,10 +62,10 @@ def sv(): @click.option('--setup-only', is_flag = True, hidden = True, default = False, help = 'Setup the workflow and exit') @click.option('--hpc', type = click.Path(exists = True, file_okay = False, readable=True), help = 'Directory with HPC submission `config.yaml` file') @click.option('--quiet', is_flag = True, show_default = True, default = False, help = 'Don\'t show output text while running') -@click.option('--skipreports', is_flag = True, show_default = True, default = False, help = 'Don\'t generate HTML reports') +@click.option('--skip-reports', is_flag = True, show_default = True, default = False, help = 'Don\'t generate HTML reports') @click.option('--snakemake', type = str, help = 'Additional Snakemake parameters, in quotes') @click.argument('inputs', required=True, type=click.Path(exists=True, readable=True), nargs=-1) -def leviathan(inputs, output_dir, genome, min_sv, min_barcodes, iterations, threads, populations, extra_params, snakemake, skipreports, quiet, hpc, conda, setup_only): +def leviathan(inputs, output_dir, genome, min_sv, min_barcodes, iterations, threads, populations, extra_params, snakemake, skip_reports, quiet, hpc, conda, setup_only): """ Call structural variants using LEVIATHAN @@ -90,7 +90,7 @@ def leviathan(inputs, output_dir, genome, min_sv, min_barcodes, iterations, thre 
os.makedirs(f"{workflowdir}/", exist_ok= True) bamlist, n = parse_alignment_inputs(inputs) - check_fasta(genome) + check_fasta(genome, quiet) if populations is not None: fetch_report(workflowdir, "leviathan_pop.Rmd") fetch_report(workflowdir, "leviathan.Rmd") @@ -107,7 +107,7 @@ def leviathan(inputs, output_dir, genome, min_sv, min_barcodes, iterations, thre config.write(f"iterations: {iterations}\n") if extra_params is not None: config.write(f"extra: {extra_params}\n") - config.write(f"skip_reports: {skipreports}\n") + config.write(f"skip_reports: {skip_reports}\n") config.write(f"workflow_call: {command}\n") config.write("inputs:\n") popgroupings = "" @@ -136,7 +136,7 @@ def leviathan(inputs, output_dir, genome, min_sv, min_barcodes, iterations, thre start_text.add_row("Workflow Log:", sm_log.replace(f"{output_dir}/", "") + "[dim].gz") launch_snakemake(command, "sv_leviathan", start_text, output_dir, sm_log, quiet) -@click.command(no_args_is_help = True, context_settings=dict(allow_interspersed_args=False), epilog = "See the documentation for more information: https://pdimens.github.io/harpy/modules/sv/naibr/") +@click.command(no_args_is_help = True, context_settings=dict(allow_interspersed_args=False), epilog = "See the documentation for more information: https://pdimens.github.io/harpy/workflows/sv/naibr/") @click.option('-x', '--extra-params', type = str, help = 'Additional variant caller parameters, in quotes') @click.option('-g', '--genome', required = True, type=click.Path(exists=True, dir_okay=False, readable=True), help = 'Genome assembly') @click.option('-b', '--min-barcodes', show_default = True, default=2, type = click.IntRange(min = 1, max_open = True), help = 'Minimum number of barcode overlaps supporting candidate SV') @@ -151,10 +151,10 @@ def leviathan(inputs, output_dir, genome, min_sv, min_barcodes, iterations, thre @click.option('--setup-only', is_flag = True, hidden = True, default = False, help = 'Setup the workflow and exit') @click.option('--hpc', type = click.Path(exists = True, file_okay = False, readable=True), help = 'Directory with HPC submission `config.yaml` file') @click.option('--quiet', is_flag = True, show_default = True, default = False, help = 'Don\'t show output text while running') -@click.option('--skipreports', is_flag = True, show_default = True, default = False, help = 'Don\'t generate HTML reports') +@click.option('--skip-reports', is_flag = True, show_default = True, default = False, help = 'Don\'t generate HTML reports') @click.option('--snakemake', type = str, help = 'Additional Snakemake parameters, in quotes') @click.argument('inputs', required=True, type=click.Path(exists=True, readable=True), nargs=-1) -def naibr(inputs, output_dir, genome, vcf, min_sv, min_barcodes, min_quality, threads, populations, molecule_distance, extra_params, snakemake, skipreports, quiet, hpc, conda, setup_only): +def naibr(inputs, output_dir, genome, vcf, min_sv, min_barcodes, min_quality, threads, populations, molecule_distance, extra_params, snakemake, skip_reports, quiet, hpc, conda, setup_only): """ Call structural variants using NAIBR @@ -186,7 +186,7 @@ def naibr(inputs, output_dir, genome, vcf, min_sv, min_barcodes, min_quality, th os.makedirs(f"{workflowdir}/", exist_ok= True) bamlist, n = parse_alignment_inputs(inputs) - check_fasta(genome) + check_fasta(genome, quiet) if populations is not None: fetch_report(workflowdir, "naibr_pop.Rmd") fetch_report(workflowdir, "naibr.Rmd") @@ -206,7 +206,7 @@ def naibr(inputs, output_dir, genome, vcf, min_sv, 
min_barcodes, min_quality, th config.write(f"molecule_distance: {molecule_distance}\n") if extra_params is not None: config.write(f"extra: {extra_params}\n") - config.write(f"skip_reports: {skipreports}\n") + config.write(f"skip_reports: {skip_reports}\n") config.write(f"workflow_call: {command}\n") config.write("inputs:\n") if vcf is not None: diff --git a/resources/createharpyenv.sh b/resources/createharpyenv.sh index 024fa9b8e..2648d3aab 100644 --- a/resources/createharpyenv.sh +++ b/resources/createharpyenv.sh @@ -2,4 +2,9 @@ ## Use the first positional argument to set a name, usually `harpy` or `harpytest` -mamba create -n $1 -f harpy.yaml \ No newline at end of file +if command -v mamba &> /dev/null +then + mamba env create -n $1 -f resources/harpy.yaml +else + conda env create -n $1 -f resources/harpy.yaml +fi diff --git a/resources/harpy.yaml b/resources/harpy.yaml index 9caf76b81..76b0d8628 100644 --- a/resources/harpy.yaml +++ b/resources/harpy.yaml @@ -6,7 +6,6 @@ dependencies: - apptainer - bcftools =1.20 - conda >24.7 - - mamba - pandas - pysam - python diff --git a/test/fastq/spades/ecoli.F.fq.gz b/test/fastq/spades/ecoli.F.fq.gz deleted file mode 100644 index d361d5a6d..000000000 Binary files a/test/fastq/spades/ecoli.F.fq.gz and /dev/null differ diff --git a/test/fastq/spades/ecoli.R.fq.gz b/test/fastq/spades/ecoli.R.fq.gz deleted file mode 100644 index e49b07d07..000000000 Binary files a/test/fastq/spades/ecoli.R.fq.gz and /dev/null differ
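[Editor's note] The --skipreports to --skip-reports renames throughout this patch lean on click's standard mapping of hyphens in option names to underscores in the Python parameter, which is why the function signatures change in lockstep with the decorators. A minimal sketch (assumes rich_click is installed, as harpy's CLI does; this is not harpy code):

import rich_click as click

@click.command()
@click.option("--skip-reports", is_flag=True, default=False,
              help="Don't generate HTML reports")
def demo(skip_reports):
    # click delivers --skip-reports as the `skip_reports` keyword argument
    click.echo(f"skip_reports = {skip_reports}")

if __name__ == "__main__":
    demo()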