diff --git a/.deprecated/align-minimap.smk b/.deprecated/align-minimap.smk
index f4adac488..a54b53333 100644
--- a/.deprecated/align-minimap.smk
+++ b/.deprecated/align-minimap.smk
@@ -7,7 +7,7 @@ from rich.panel import Panel
 from rich import print as rprint
 
 outdir = config["output_directory"]
-envdir = os.getcwd() + "/.harpy_envs"
+envdir = os.path.join(os.getcwd(), ".harpy_envs")
 genomefile = config["inputs"]["genome"]
 fqlist = config["inputs"]["fastq"]
 extra = config.get("extra", "")
diff --git a/.deprecated/phase.bak.smk b/.deprecated/phase.bak.smk
index 4afd76175..cbb060750 100644
--- a/.deprecated/phase.bak.smk
+++ b/.deprecated/phase.bak.smk
@@ -14,7 +14,7 @@ pruning = config["prune"]
 molecule_distance = config["molecule_distance"]
 extra = config.get("extra", "")
 outdir = config["output_directory"]
-envdir = os.getcwd() + "/.harpy_envs"
+envdir = os.path.join(os.getcwd(), ".harpy_envs")
 
 if config["noBX"]:
     fragfile = outdir + "/extractHairs/{sample}.unlinked.frags"
diff --git a/.github/filters.yml b/.github/filters.yml
index 0b1f12112..11b8f84d5 100644
--- a/.github/filters.yml
+++ b/.github/filters.yml
@@ -1,8 +1,8 @@
 common: &common
   - 'harpy/_launch.py'
   - 'harpy/_misc.py'
-  - 'harpy/_printing.py'
   - 'harpy/_parsers.py'
+  - 'harpy/_printing.py'
   - 'harpy/_validations.py'
   - 'harpy/__main__.py'
   - 'resources/harpy.yaml'
@@ -11,6 +11,7 @@ common: &common
   - 'resources/buildforCI.sh'
 container: &container
   - 'harpy/_conda.py'
+  - 'harpy/container.py'
   - 'harpy/containerize.smk'
 preflight: &preflight
   - *common
@@ -149,10 +150,11 @@ simreads: &simreads
   - 'extractReads.cpp'
   - 'harpy/bin/10xtoHaplotag.py'
   - 'harpy/scripts/LRSIM_harpy.pl'
-metassembly: &metassembly
+assembly: &assembly
   - *common
   - *container
-  - 'harpy/metassembly.py'
+  - 'harpy/assembly.py'
+  - 'harpy/snakefiles/assembly.smk'
   - 'harpy/snakefiles/metassembly.smk'
 other: &other
   - 'harpy/stitchparams.py'
@@ -175,5 +177,5 @@ modules:
   - *naibr
   - *simvars
   - *simreads
-  - *metassembly
+  - *assembly
   - *other
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 61b52867d..4a044995f 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -41,7 +41,7 @@ jobs:
       phase: ${{ steps.filter.outputs.phase }}
       simvars: ${{ steps.filter.outputs.simvars }}
       simreads: ${{ steps.filter.outputs.simreads }}
-      metassembly: ${{ steps.filter.outputs.metassembly }}
+      assembly: ${{ steps.filter.outputs.assembly }}
       other: ${{ steps.filter.outputs.other }}
       container: ${{ steps.filter.outputs.container }}
       modules: ${{ steps.filter.outputs.modules }}
@@ -747,10 +747,10 @@ jobs:
         shell: micromamba-shell {0}
         run: harpy simulate linkedreads --quiet -t 4 -n 2 -l 100 -p 50 test/genome/genome.fasta.gz test/genome/genome2.fasta.gz
 
-  extras:
+  assembly:
     needs: [changes, pkgbuild]
-    if: ${{ needs.changes.outputs.other == 'true' && needs.pkgbuild.result == 'success' }}
-    name: harpy extras
+    if: ${{ needs.changes.outputs.assembly == 'true' && needs.pkgbuild.result == 'success' }}
+    name: assembly
     runs-on: ubuntu-latest
     steps:
       - name: Checkout
@@ -776,23 +776,25 @@
           resources/buildforCI.sh
       - name: Clear Space
         run: rm -rf /opt/hostedtoolcache && mkdir -p .snakemake/singularity
-      - name: harpy stitchparams
+      - name: Download Singularity Artifact
+        uses: actions/download-artifact@v4
+        with:
+          name: deps-image
+          path: .snakemake/singularity
+      - name: test assembly
         shell: micromamba-shell {0}
-        run: harpy stitchparams -o params.file
-      - name: harpy popgroup
+        run: harpy assembly --quiet -r 4000 test/fastq/sample1.*
+      - name: test metassembly
         shell: micromamba-shell {0}
-        run: harpy popgroup test/fastq
-      - name: harpy hpc
+        run: harpy assembly --metassembly spades --quiet -r 4000 test/fastq/sample1.*
+      - name: test metassembly cloud
         shell: micromamba-shell {0}
-        run: |
-          harpy hpc slurm
-          harpy hpc googlebatch
-          harpy hpc lsf
-          harpy hpc htcondor
-  metassembly:
+        run: harpy assembly --metassembly cloudspades --quiet -r 4000 test/fastq/sample1.*
+
+  extras:
     needs: [changes, pkgbuild]
-    if: ${{ needs.changes.outputs.metassembly == 'true' && needs.pkgbuild.result == 'success' }}
-    name: metassembly
+    if: ${{ needs.changes.outputs.other == 'true' && needs.pkgbuild.result == 'success' }}
+    name: harpy extras
     runs-on: ubuntu-latest
     steps:
       - name: Checkout
@@ -818,13 +820,16 @@
           resources/buildforCI.sh
       - name: Clear Space
         run: rm -rf /opt/hostedtoolcache && mkdir -p .snakemake/singularity
-      - name: Download Singularity Artifact
-        uses: actions/download-artifact@v4
-        with:
-          name: deps-image
-          path: .snakemake/singularity
-      - name: create a metassembly
+      - name: harpy stitchparams
         shell: micromamba-shell {0}
-        run: harpy metassembly --quiet -m 6000 test/fastq/sample1.*
-
-
+        run: harpy stitchparams -o params.file
+      - name: harpy popgroup
+        shell: micromamba-shell {0}
+        run: harpy popgroup test/fastq
+      - name: harpy hpc
+        shell: micromamba-shell {0}
+        run: |
+          harpy hpc slurm
+          harpy hpc googlebatch
+          harpy hpc lsf
+          harpy hpc htcondor
diff --git a/.gitignore b/.gitignore
index ded487a3d..9489c1938 100644
--- a/.gitignore
+++ b/.gitignore
@@ -15,6 +15,7 @@ Simulate/
 Impute/
 Preflight/
 Phase/
+Assembly/
 Metassembly/
 samples.populations
 *.fasta
diff --git a/Assembly/workflow/assembly.smk b/Assembly/workflow/assembly.smk
deleted file mode 100644
index c8010fe68..000000000
--- a/Assembly/workflow/assembly.smk
+++ /dev/null
@@ -1,61 +0,0 @@
-import os
-import logging
-
-onstart:
-    logger.logger.addHandler(logging.FileHandler(config["snakemake_log"]))
-onsuccess:
-    os.remove(logger.logfile)
-onerror:
-    os.remove(logger.logfile)
-
-FQ1 = config["inputs"]["fastq_r1"]
-FQ2 = config["inputs"]["fastq_r2"]
-outdir = config["output_directory"]
-envdir = os.getcwd() + "/.harpy_envs"
-max_mem = config["spades"]["max_memory"]
-k_param = config["spades"]["k"]
-extra = config["spades"].get("extra", "")
-
-rule cloudspades:
-    input:
-        FQ_R1 = FQ1,
-        FQ_R2 = FQ2
-    output:
-        f"{outdir}/contigs.fasta",
-        f"{outdir}/scaffolds.fasta"
-    params:
-        outdir = outdir,
-        k = k_param,
-        mem = max_mem // 1000,
-        extra = extra
-    log:
-        outdir + "/logs/assembly.log"
-    conda:
-        f"{envdir}/assembly.yaml"
-    threads:
-        workflow.cores
-    resources:
-        mem_mb=max_mem
-    shell:
-        "spades.py -t {threads} -m {params.mem} -k {params.k} {params.extra} --gemcode1-1 {input.FQ_R1} --gemcode1-2 {input.FQ_R2} -o {params.outdir} --isolate > {log}"
-
-rule workflow_summary:
-    default_target: True
-    input:
-        f"{outdir}/athena/athena.asm.fa"
-    params:
-        k_param = k_param,
-        max_mem = max_mem // 1000,
-        extra = extra
-    run:
-        summary_template = f"""
-The harpy assemble workflow ran using these parameters:
-
-Reads were assembled using cloudspades:
-    spades.py -t THREADS -m {params.max_mem} --gemcode1-1 FQ1 --gemcode1-2 FQ2 --isolate -k {params.k_param} {params.extra}
-
-The Snakemake workflow was called via command line:
-    {config["workflow_call"]}
-"""
-        with open(outdir + "/workflow/metassembly.summary", "w") as f:
-            f.write(summary_template)
\ No newline at end of file
diff --git a/Assembly/workflow/config.yaml b/Assembly/workflow/config.yaml
deleted file mode 100644
index 4035a8418..000000000
--- a/Assembly/workflow/config.yaml
+++ /dev/null
@@ -1,13 +0,0 @@
-workflow: assembly
-snakemake_log: Assembly/logs/snakemake/assembly.1.15_10_2024.log
-output_directory: Assembly
-barcode_tag: BX
-spades:
-  assembler: None
-  max_memory: 250000
-  k: auto
-skip_reports: False
-workflow_call: snakemake --rerun-incomplete --show-failed-logs --rerun-triggers input mtime params --nolock --software-deployment-method conda --conda-prefix .snakemake/conda --cores 4 --directory . --snakefile Assembly/workflow/assembly.smk --configfile Assembly/workflow/config.yaml
-inputs:
-  fastq_r1: test/fastq/sample1.F.fq.gz
-  fastq_r2: test/fastq/sample1.R.fq.gz
diff --git a/harpy/__main__.py b/harpy/__main__.py
index f1bb369dc..e70a5a88a 100644
--- a/harpy/__main__.py
+++ b/harpy/__main__.py
@@ -7,7 +7,7 @@ from . import container
 from . import hpc
 from . import impute
-from . import metassembly
+from . import assembly
 from . import qc
 from . import phase
 from . import preflight
@@ -56,7 +56,7 @@ def cli():
 cli.add_command(hpc.hpc)
 cli.add_command(resume.resume)
 cli.add_command(deconvolve.deconvolve)
-cli.add_command(metassembly.metassembly)
+cli.add_command(assembly.assembly)
 
 ## the workflows ##
 click.rich_click.COMMAND_GROUPS = {
@@ -64,7 +64,7 @@ def cli():
     [
         {
             "name": "workflows",
-            "commands": ["demultiplex","qc", "align","snp","sv","impute","phase", "simulate", "metassembly"],
+            "commands": ["demultiplex","qc", "align","snp","sv","impute","phase", "simulate", "assembly"],
         },
         {
             "name": "Other Commands",
@@ -73,5 +73,5 @@ def cli():
     ],
 } | simulate.commandstring | hpc.docstring
 
-for i in [align, deconvolve, demultiplex, impute, phase, preflight, qc, simulate, snp, sv, metassembly]:
+for i in [align, deconvolve, demultiplex, impute, phase, preflight, qc, simulate, snp, sv, assembly]:
     click.rich_click.OPTION_GROUPS |= i.docstring
diff --git a/harpy/_conda.py b/harpy/_conda.py
index efe8ba934..01a9a2697 100644
--- a/harpy/_conda.py
+++ b/harpy/_conda.py
@@ -1,4 +1,4 @@
-"""Module that sets up the conda environments and dependencies for all the Harpy workflows"""
+"""Creates environment recipes for all the Harpy workflows"""
 
 import os
 
@@ -16,9 +16,14 @@ def create_conda_recipes():
         "conda-forge::icu",
         "conda-forge::libzlib",
         "conda-forge::xz"
-        ],
-    "assembly": [
-        "conda-forge::python=3"
+        ],
+    "assembly":[
+        "bioconda::arcs",
+        "bioconda::bwa",
+        "bioconda::cloudspades",
+        "bioconda::links",
+        "bioconda::samtools",
+        "bioconda::tigmint"
         ],
     "metassembly": [
         "bioconda::athena_meta=1.2"
@@ -26,14 +31,13 @@ def create_conda_recipes():
     "phase" : [
         "bioconda::hapcut2",
         "bioconda::whatshap"
-        ],
+        ],
     "qc" : [
-        "bioconda::falco",
        "bioconda::fastp",
        "bioconda::multiqc=1.22",
        "bioconda::pysam=0.22",
        "bioconda::quickdeconvolution"
-        ],
+        ],
     "r" : [
        "conda-forge::pandoc",
        "conda-forge::r-dt",
@@ -49,7 +53,7 @@ def create_conda_recipes():
        "conda-forge::r-viridislite",
        "conda-forge::r-xml2",
        "r::r-biocircos"
-        ],
+        ],
     "simulations" : [
        "alienzj::msort",
        "bioconda::dwgsim=1.1.14",
@@ -58,16 +62,19 @@ def create_conda_recipes():
        "bioconda::perl-parse-recdescent",
        "conda-forge::numpy",
        "conda-forge::perl"
-        ],
+        ],
+    "spades": [
+        "conda-forge::python=3"
+        ],
+    "stitch" : [
+        "bioconda::r-stitch=1.6.10"
+        ],
     "variants": [
        "bioconda::bcftools=1.20",
        "bioconda::freebayes=1.3.6",
        "bioconda::leviathan",
        "bioconda::naibr-plus"
-        ],
-    "stitch" : [
-        "bioconda::r-stitch=1.6.10"
-        ]
+        ]
     }
 
     os.makedirs(".harpy_envs", exist_ok = True)
@@ -81,7 +88,7 @@ def create_conda_recipes():
         yml.write("\n - ".join(deps) + "\n")
 
     # post-deployment scripts
-    with open(".harpy_envs/assembly.post-deploy.sh", "w", encoding="utf-8") as shellscript:
+    with open(".harpy_envs/spades.post-deploy.sh", "w", encoding="utf-8") as shellscript:
         shellscript.write("wget -O .spades.tar.gz https://github.com/ablab/spades/releases/download/v4.0.0/SPAdes-4.0.0-Linux.tar.gz\n")
         shellscript.write("tar -xvzf .spades.tar.gz && rm .spades.tar.gz\n")
         shellscript.write("mv SPAdes-4.0.0-Linux/bin/* ${CONDA_PREFIX}/bin && mv SPAdes-4.0.0-Linux/share/* ${CONDA_PREFIX}/share\n")
diff --git a/harpy/_launch.py b/harpy/_launch.py
index 6a9855780..43d2d3cb7 100644
--- a/harpy/_launch.py
+++ b/harpy/_launch.py
@@ -28,7 +28,7 @@ def purge_empty_logs(target_dir):
         if os.path.isdir(logfile) and not os.listdir(logfile):
             os.rmdir(logfile)
 
-def launch_snakemake(sm_args, workflow, starttext, outdir, sm_logfile, quiet):
+def launch_snakemake(sm_args, workflow, starttext, outdir, sm_logfile, quiet, summaryfile = None):
     """launch snakemake with the given commands"""
     if not quiet:
         print_onstart(starttext, workflow.replace("_", " "))
@@ -56,22 +56,22 @@ def launch_snakemake(sm_args, workflow, starttext, outdir, sm_logfile, quiet):
             if process.poll() or iserror(output):
                 exitcode = EXIT_CODE_SUCCESS if process.poll() == 0 else EXIT_CODE_GENERIC_ERROR
                 break
-            while not output.startswith("Job stats:"):
+            while not output.startswith("Job stats:") and not exitcode:
                 # print dependency text only once
                 if "Downloading and installing remote packages" in output or "Running post-deploy" in output:
                     deps = True
-                    deploy_text = "[dim]Installing software dependencies"
+                    deploy_text = "[dim]Installing workflow dependencies"
                     break
                 if "Pulling singularity image" in output:
                     deps = True
-                    deploy_text = "[dim]Downloading software container"
+                    deploy_text = "[dim]Downloading harpy container"
                     break
                 if "Nothing to be" in output:
                     exitcode = EXIT_CODE_SUCCESS
-                    break
                 if "MissingInput" in output:
                     exitcode = EXIT_CODE_GENERIC_ERROR
-                    break
+                if "AmbiguousRuleException" in output or "Error" in output or "Exception" in output:
+                    exitcode = EXIT_CODE_RUNTIME_ERROR
                 output = process.stderr.readline()
 
     # if dependency text present, print pulsing progress bar
@@ -153,7 +153,7 @@ def launch_snakemake(sm_args, workflow, starttext, outdir, sm_logfile, quiet):
             gzip_file(sm_logfile)
             purge_empty_logs(outdir)
             if not quiet:
-                print_onsuccess(outdir)
+                print_onsuccess(outdir, summaryfile)
             sys.exit(0)
         else:
             if exitcode in (1,2):
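The `summaryfile` argument added above is forwarded to `print_onsuccess`, which is reworked in the next file. A minimal standalone sketch of the resulting message logic (paths are illustrative, not from the codebase):

    # Illustrative sketch of the new success message, not an excerpt:
    def success_text(outdir, summary=None):
        text = f"The workflow has finished successfully! Find the results in [bold]{outdir}/[/bold]"
        if summary:
            text += f" and an outline of the workflow in [bold]{summary}[/bold]"
        return text

    success_text("Align/bwa", "workflow/align.bwa.summary")  # new summary variant
    success_text("Align/bwa")  # summaryfile defaults to None, preserving the old message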
diff --git a/harpy/_printing.py b/harpy/_printing.py
index 2260bf1a7..f40b6c444 100644
--- a/harpy/_printing.py
+++ b/harpy/_printing.py
@@ -90,11 +90,14 @@ def print_onstart(text, title):
         file = sys.stderr
     )
 
-def print_onsuccess(outdir):
+def print_onsuccess(outdir, summary = None):
     """Print a green panel with success text. To be used in place of onsuccess: inside a snakefile"""
+    text = f"The workflow has finished successfully! Find the results in [bold]{outdir}/[/bold]"
+    if summary:
+        text += f" and an outline of the workflow in [bold]{summary}[/bold]"
     rprint(
         Panel(
-            f"The workflow has finished successfully! Find the results in [bold]{outdir}/[/bold]",
+            text,
             subtitle = "[bold]success!",
             title_align = "left",
             border_style = "green",
diff --git a/harpy/_validations.py b/harpy/_validations.py
index 479d26641..4d1fc81d0 100644
--- a/harpy/_validations.py
+++ b/harpy/_validations.py
@@ -73,98 +73,105 @@ def validate_input_by_ext(inputfile, option, ext):
 
 def check_impute_params(parameters):
     """Validate the STITCH parameter file for column names, order, types, missing values, etc."""
-    with open(parameters, "r", encoding="utf-8") as fp:
-        header = fp.readline().rstrip().lower()
+    with open(parameters, "r", encoding="utf-8") as paramfile:
+        header = paramfile.readline().rstrip().lower()
         headersplt = header.split()
-        correct_header = sorted(["model", "usebx", "bxlimit", "k", "s", "ngen"])
+        colnames = ["name", "model", "usebx", "bxlimit", "k", "s", "ngen"]
+        correct_header = sorted(colnames)
+        n_cols = len(colnames)
         row = 1
         badrows = []
         badlens = []
         if sorted(headersplt) != correct_header:
-            culprits = [i for i in headersplt if i not in correct_header]
-            print_error("invalid column names", f"Parameter file [bold]{parameters}[/bold] has incorrect column names. Valid names are:\n[green bold]model usebx bxlimit k s ngen[/green bold]")
+            invalid_col = [i for i in headersplt if i not in colnames]
+            missing_col = [i for i in colnames if i not in headersplt]
+            errtext = []
+            culprit_text = []
+            if invalid_col:
+                errtext.append("\n - invalid column names")
+                culprit_text.append(f"[red]Invalid columns:[/red] {', '.join(invalid_col)}")
+            if missing_col:
+                errtext.append("\n - missing columns")
+                culprit_text.append(f"[yellow]Missing columns:[/yellow] {', '.join(missing_col)}")
+
+            print_error("incorrect columns", f"Parameter file [bold]{parameters}[/bold] has incorrect column names{''.join(errtext)}\nValid names are: [green bold]{' '.join(colnames)}[/green bold]")
             print_solution_with_culprits(
                 f"Fix the headers in [bold]{parameters}[/bold] or use [blue bold]harpy stitchparams[/blue bold] to generate a valid parameter file and modify it with appropriate values.",
-                "Column names causing this error:"
+                "Columns causing this error:"
             )
-            click.echo(" ".join(culprits), file = sys.stderr)
+            rprint("\n".join(culprit_text), file = sys.stderr)
             sys.exit(1)
+        else:
+            # in case the columns are out of order, reorder the columns to match `colnames`
+            col_order = [headersplt.index(col) for col in colnames]
 
-        # instantiate dict with colnames
+        # instantiate dict to hold the parameter rows
         data = {}
-        for i in headersplt:
-            data[i] = []
         while True:
             # Get next line from file
-            line = fp.readline()
+            line = paramfile.readline()
             row += 1
             # if line is empty, end of file is reached
             if not line:
                 break
             if line == "\n":
-                break
-            # split the line by whitespace
-            rowvals = line.rstrip().split()
-            rowlen = len(rowvals)
-            if rowlen != 6:
-                badrows.append(row)
-                badlens.append(rowlen)
-            elif len(badrows) > 0:
+                # be lenient with empty rows
                 continue
+            # split the line by whitespace and reorder to match the expected colname order
+            splitline = line.rstrip().split()
+            if len(splitline) == n_cols:
+                row_values = [splitline[i] for i in col_order]
+                data[row_values[0]] = dict(zip(colnames[1:], row_values[1:]))
             else:
-                for j,k in zip(headersplt, rowvals):
-                    data[j].append(k)
-
+                badrows.append(row)
+                badlens.append(len(splitline))
         if len(badrows) > 0:
-            print_error("invalid rows", f"Parameter file [bold]{parameters}[/bold] is formatted incorrectly. Not all rows have the expected 6 columns.")
+            print_error("invalid rows", f"Parameter file [blue]{parameters}[/blue] is formatted incorrectly. Not all rows have the expected {n_cols} columns.")
             print_solution_with_culprits(
-                f"See the problematic rows below. Check that you are using a whitespace (space or tab) delimeter in [bold]{parameters}[/bold] or use [blue bold]harpy stitchparams[/blue bold] to generate a valid parameter file and modify it with appropriate values.",
+                f"See the problematic rows below. Check that you are using a whitespace (space or tab) delimiter in [blue]{parameters}[/blue] or use [blue bold]harpy stitchparams[/blue bold] to generate a valid parameter file and modify it with appropriate values.",
                 "Rows causing this error and their column count:"
             )
             for i in zip(badrows, badlens):
                 click.echo(f"{i[0]}\t{i[1]}", file = sys.stderr)
             sys.exit(1)
 
-        # Validate each column
-        culprits = {}
-        colerr = 0
+        # validate each row
+        row_error = False
         errtable = Table(show_footer=True, box=box.SIMPLE)
-        errtable.add_column("Column", justify="right", style="white", no_wrap=True)
-        errtable.add_column("Expected Values", style="blue")
-        errtable.add_column("Rows with Issues", style = "white")
-
-        culprits["model"] = []
-        for i,j in enumerate(data["model"]):
-            if j not in ["pseudoHaploid", "diploid","diploid-inbred"]:
-                culprits["model"].append(str(i + 1))
-                colerr += 1
-        if culprits["model"]:
-            errtable.add_row("model", "diploid, diploid-inbred, pseudoHaploid", ", ".join(culprits["model"]))
+        errtable.add_column("Row", justify="right", style="white", no_wrap=True)
+        errtable.add_column("Columns with Issues", style = "white")
 
-        culprits["usebx"] = []
-        for i,j in enumerate(data["usebx"]):
-            if j not in [True, "TRUE", "true", False, "FALSE", "false", "Y","y", "YES", "Yes", "yes", "N", "n", "NO", "No", "no"]:
-                culprits["usebx"].append(str(i + 1))
-                colerr += 1
-        if culprits["usebx"]:
-            errtable.add_row("usebx", "True, False", ",".join(culprits["usebx"]))
-
-        for param in ["bxlimit","k","s","ngen"]:
-            culprits[param] = []
-            for i,j in enumerate(data[param]):
-                if not j.isdigit():
-                    culprits[param].append(str(i + 1))
-                    colerr += 1
-            if culprits[param]:
-                errtable.add_row(param, "Integers", ",".join(culprits[param]))
-
-        if colerr > 0:
-            print_error("invalid column values", f"Parameter file [bold]{parameters}[/bold] is formatted incorrectly. Not all columns have valid values.")
+        row = 1
+        for k,v in data.items():
+            badcols = []
+            if re.search(r'[^a-z0-9\-_\.]', k, flags = re.IGNORECASE):
+                badcols.append("name")
+            if v["model"] not in ["pseudoHaploid", "diploid","diploid-inbred"]:
+                badcols.append("model")
+            if str(v["usebx"]).lower() not in ["true", "false", "yes", "y", "no", "n"]:
+                badcols.append("usebx")
+            else:
+                if str(v["usebx"]).lower() in ["true", "yes", "y"]:
+                    v["usebx"] = True
+                else:
+                    v["usebx"] = False
+            for param in ["bxlimit", "k", "s", "ngen"]:
+                if not v[param].isdigit():
+                    badcols.append(param)
+                else:
+                    v[param] = int(v[param])
+            if badcols:
+                row_error = True
+                errtable.add_row(f"{row}", ", ".join(badcols))
+            row += 1
+        if row_error:
+            print_error("invalid parameter values", f"Parameter file [bold]{parameters}[/bold] is formatted incorrectly. Some rows have incorrect values for one or more parameters.")
             print_solution_with_culprits(
-                "Review the table below of what values are expected for each column and which rows are causing issues.",
+                "Review the table below of which rows/columns are causing issues.",
                 "Formatting Errors:"
             )
             rprint(errtable, file = sys.stderr)
-            sys.exit(1)
+            sys.exit(1)
+        return data
 
 def validate_bam_RG(bamlist, threads, quiet):
     """Validate BAM files bamlist to make sure the sample name inferred from the file matches the @RG tag within the file"""
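For reference, a parameter file that passes the reworked `check_impute_params`: the new leading `name` column keys each row, headers may appear in any order since they are reordered to `colnames`, and `usebx`/integer columns are coerced. Values here are illustrative, not defaults:

    # Illustrative: a whitespace-delimited parameter file accepted by the new validator
    params_txt = (
        "name model usebx bxlimit k s ngen\n"
        "run1 diploid TRUE 50000 10 5 50\n"
        "run2 pseudoHaploid no 50000 15 1 100\n"
    )
    # and the coerced structure check_impute_params() now returns:
    expected = {
        "run1": {"model": "diploid", "usebx": True, "bxlimit": 50000, "k": 10, "s": 5, "ngen": 50},
        "run2": {"model": "pseudoHaploid", "usebx": False, "bxlimit": 50000, "k": 15, "s": 1, "ngen": 100},
    }

The returned dict is consumed by `harpy/impute.py` below, where it is embedded in the workflow config as `stitch_parameters`.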
diff --git a/harpy/align.py b/harpy/align.py
index e2cae6bd7..44c51cc15 100644
--- a/harpy/align.py
+++ b/harpy/align.py
@@ -2,6 +2,7 @@
 
 import os
 import sys
+import yaml
 from time import sleep
 from pathlib import Path
 from rich import box
@@ -62,13 +63,13 @@ def align():
     ]
 }
 
-@click.command(no_args_is_help = True, epilog= "See the documentation for more information: https://pdimens.github.io/harpy/workflows/align/bwa/")
+@click.command(no_args_is_help = True, epilog= "Documentation: https://pdimens.github.io/harpy/workflows/align/bwa/")
 @click.option('-g', '--genome', type=click.Path(exists=True, dir_okay=False, readable=True), required = True, help = 'Genome assembly for read mapping')
 @click.option('-w', '--depth-window', default = 50000, show_default = True, type = int, help = 'Interval size (in bp) for depth stats')
 @click.option('-x', '--extra-params', type = str, help = 'Additional bwa mem parameters, in quotes')
 @click.option('-u', '--keep-unmapped', is_flag = True, default = False, help = 'Retain unmapped sequences in the output')
 @click.option('-q', '--min-quality', default = 30, show_default = True, type = click.IntRange(min = 0, max = 40), help = 'Minimum mapping quality to pass filtering')
-@click.option('-d', '--molecule-distance', default = 100000, show_default = True, type = int, help = 'Base-pair distance threshold to separate molecules')
+@click.option('-d', '--molecule-distance', default = 100000, show_default = True, type = int, help = 'Distance cutoff to split molecules (bp)')
 @click.option('-o', '--output-dir', type = click.Path(exists = False), default = "Align/bwa", show_default=True, help = 'Output directory name')
 @click.option('-t', '--threads', default = 4, show_default = True, type = click.IntRange(min = 4, max_open = True), help = 'Number of threads to use')
 @click.option('--conda', is_flag = True, default = False, help = 'Use conda/mamba instead of container')
@@ -97,7 +98,7 @@ def bwa(inputs, output_dir, genome, depth_window, threads, keep_unmapped, extra_
     command += f"--configfile {workflowdir}/config.yaml "
     if hpc:
         command += f"--workflow-profile {hpc} "
-    if snakemake is not None:
+    if snakemake:
         command += snakemake
 
     os.makedirs(f"{workflowdir}/", exist_ok= True)
@@ -108,24 +109,24 @@ def bwa(inputs, output_dir, genome, depth_window, threads, keep_unmapped, extra_
     fetch_report(workflowdir, "align_bxstats.Rmd")
     os.makedirs(f"{output_dir}/logs/snakemake", exist_ok = True)
     sm_log = snakemake_log(output_dir, "align_bwa")
-
+    configs = {
+        "workflow" : "align bwa",
+        "snakemake_log" : sm_log,
+        "output_directory" : output_dir,
+        "alignment_quality" : min_quality,
+        "keep_unmapped" : keep_unmapped,
+        "molecule_distance" : molecule_distance,
+        "depth_windowsize" : depth_window,
+        "skip_reports" : skip_reports,
+        **({'extra': extra_params} if extra_params else {}),
+        "workflow_call" : command.rstrip(),
+        "inputs" : {
+            "genome": Path(genome).resolve().as_posix(),
+            "fastq": [i.as_posix() for i in fqlist]
+        }
+    }
     with open(f"{workflowdir}/config.yaml", "w", encoding="utf-8") as config:
-        config.write("workflow: align bwa\n")
-        config.write(f"snakemake_log: {sm_log}\n")
-        config.write(f"output_directory: {output_dir}\n")
-        config.write(f"alignment_quality: {min_quality}\n")
-        config.write(f"keep_unmapped: {keep_unmapped}\n")
-        config.write(f"molecule_distance: {molecule_distance}\n")
-        config.write(f"depth_windowsize: {depth_window}\n")
-        config.write(f"skip_reports: {skip_reports}\n")
-        if extra_params is not None:
-            config.write(f"extra: {extra_params}\n")
-        config.write(f"workflow_call: {command}\n")
-        config.write("inputs:\n")
-        config.write(f"  genome: {Path(genome).resolve()}\n")
-        config.write("  fastq:\n")
-        for i in fqlist:
-            config.write(f"    - {i}\n")
+        yaml.dump(configs, config, default_flow_style=False, sort_keys=False)
 
     create_conda_recipes()
     if setup_only:
@@ -138,9 +139,9 @@ def bwa(inputs, output_dir, genome, depth_window, threads, keep_unmapped, extra_
     start_text.add_row("Genome:", genome)
     start_text.add_row("Output Folder:", output_dir + "/")
     start_text.add_row("Workflow Log:", sm_log.replace(f"{output_dir}/", "") + "[dim].gz")
-    launch_snakemake(command, "align_bwa", start_text, output_dir, sm_log, quiet)
+    launch_snakemake(command, "align_bwa", start_text, output_dir, sm_log, quiet, "workflow/align.bwa.summary")
 
-@click.command(no_args_is_help = True, context_settings=dict(allow_interspersed_args=False), epilog = "See the documentation for more information: https://pdimens.github.io/harpy/workflows/align/ema")
+@click.command(no_args_is_help = True, context_settings=dict(allow_interspersed_args=False), epilog = "Documentation: https://pdimens.github.io/harpy/workflows/align/ema")
 @click.option('-x', '--extra-params', type = str, help = 'Additional ema align parameters, in quotes')
 @click.option('-w', '--depth-window', default = 50000, show_default = True, type = int, help = 'Interval size (in bp) for depth stats')
 @click.option('-b', '--ema-bins', default = 500, show_default = True, type = click.IntRange(1,1000), help="Number of barcode bins")
@@ -179,7 +180,7 @@ def ema(inputs, output_dir, platform, barcode_list, genome, depth_window, keep_u
     command += f"--configfile {workflowdir}/config.yaml "
     if hpc:
         command += f"--workflow-profile {hpc} "
-    if snakemake is not None:
+    if snakemake:
         command += snakemake
 
     platform = platform.lower()
@@ -203,27 +204,27 @@ def ema(inputs, output_dir, platform, barcode_list, genome, depth_window, keep_u
     fetch_report(workflowdir, "align_bxstats.Rmd")
     os.makedirs(f"{output_dir}/logs/snakemake", exist_ok = True)
     sm_log = snakemake_log(output_dir, "align_ema")
+    configs = {
+        "workflow" : "align ema",
+        "snakemake_log" : sm_log,
+        "output_directory" : output_dir,
+        "alignment_quality" : min_quality,
+        "keep_unmapped" : keep_unmapped,
+        "depth_windowsize" : depth_window,
+        "platform" : platform,
+        "EMA_bins" : ema_bins,
+        "skip_reports" : skip_reports,
+        **({'extra': extra_params} if extra_params else {}),
+        "workflow_call" : command.rstrip(),
+        "inputs" : {
+            "genome": Path(genome).resolve().as_posix(),
+            **({'barcode_list': Path(barcode_list).resolve().as_posix()} if barcode_list else {}),
+            "fastq": [i.as_posix() for i in fqlist]
+        }
+    }
     with open(f"{workflowdir}/config.yaml", "w", encoding="utf-8") as config:
-        config.write("workflow: align ema\n")
-        config.write(f"snakemake_log: {sm_log}\n")
-        config.write(f"output_directory: {output_dir}\n")
-        config.write(f"quality: {min_quality}\n")
-        config.write(f"keep_unmapped: {keep_unmapped}\n")
-        config.write(f"platform: {platform}\n")
-        config.write(f"EMA_bins: {ema_bins}\n")
-        config.write(f"depth_windowsize: {depth_window}\n")
-        config.write(f"skip_reports: {skip_reports}\n")
-        if extra_params is not None:
-            config.write(f"extra: {extra_params}\n")
-        config.write(f"workflow_call: {command}\n")
-        config.write("inputs:\n")
-        config.write(f"  genome: {Path(genome).resolve()}\n")
-        if barcode_list:
-            config.write(f"  barcode_list: {Path(barcode_list).resolve()}\n")
-        config.write("  fastq:\n")
-        for i in fqlist:
-            config.write(f"    - {i}\n")
+        yaml.dump(configs, config, default_flow_style=False, sort_keys=False)
 
     create_conda_recipes()
     if setup_only:
@@ -237,15 +238,15 @@ def ema(inputs, output_dir, platform, barcode_list, genome, depth_window, keep_u
     start_text.add_row("Platform:", platform)
     start_text.add_row("Output Folder:", output_dir + "/")
     start_text.add_row("Workflow Log:", sm_log.replace(f"{output_dir}/", "") + "[dim].gz")
-    launch_snakemake(command, "align_ema", start_text, output_dir, sm_log, quiet)
+    launch_snakemake(command, "align_ema", start_text, output_dir, sm_log, quiet, "workflow/align.ema.summary")
 
-@click.command(no_args_is_help = True, epilog= "See the documentation for more information: https://pdimens.github.io/harpy/workflows/align/minimap/")
+@click.command(no_args_is_help = True, epilog= "Documentation: https://pdimens.github.io/harpy/workflows/align/strobe/")
 @click.option('-g', '--genome', type=click.Path(exists=True, dir_okay=False, readable=True), required = True, help = 'Genome assembly for read mapping')
 @click.option('-w', '--depth-window', default = 50000, show_default = True, type = int, help = 'Interval size (in bp) for depth stats')
 @click.option('-x', '--extra-params', type = str, help = 'Additional aligner parameters, in quotes')
 @click.option('-u', '--keep-unmapped', is_flag = True, default = False, help = 'Retain unmapped sequences in the output')
 @click.option('-q', '--min-quality', default = 30, show_default = True, type = click.IntRange(min = 0, max = 40), help = 'Minimum mapping quality to pass filtering')
-@click.option('-d', '--molecule-distance', default = 100000, show_default = True, type = int, help = 'Base-pair distance threshold to separate molecules')
+@click.option('-d', '--molecule-distance', default = 100000, show_default = True, type = int, help = 'Distance cutoff to split molecules (bp)')
 @click.option('-o', '--output-dir', type = click.Path(exists = False), default = "Align/strobealign", show_default=True, help = 'Output directory name')
 @click.option('-l', '--read-length', default = "auto", show_default = True, type = click.Choice(["auto", "50", "75", "100", "125", "150", "250", "400"]), help = 'Average read length for creating index')
 @click.option('-t', '--threads', default = 4, show_default = True, type = click.IntRange(min = 4, max_open = True), help = 'Number of threads to use')
@@ -278,7 +279,7 @@ def strobe(inputs, output_dir, genome, read_length, keep_unmapped, depth_window,
     command += f"--configfile {workflowdir}/config.yaml "
     if hpc:
         command += f"--workflow-profile {hpc} "
-    if snakemake is not None:
+    if snakemake:
         command += snakemake
 
     os.makedirs(f"{workflowdir}/", exist_ok= True)
@@ -289,26 +290,25 @@ def strobe(inputs, output_dir, genome, read_length, keep_unmapped, depth_window,
     fetch_report(workflowdir, "align_bxstats.Rmd")
     os.makedirs(f"{output_dir}/logs/snakemake", exist_ok = True)
     sm_log = snakemake_log(output_dir, "align_strobe")
-
+    configs = {
+        "workflow" : "align strobe",
+        "snakemake_log" : sm_log,
+        "output_directory" : output_dir,
+        "alignment_quality" : min_quality,
+        "keep_unmapped" : keep_unmapped,
+        "molecule_distance" : molecule_distance,
+        "average_read_length": read_length,
+        "depth_windowsize" : depth_window,
+        "skip_reports" : skip_reports,
+        **({'extra': extra_params} if extra_params else {}),
+        "workflow_call" : command.rstrip(),
+        "inputs" : {
+            "genome": Path(genome).resolve().as_posix(),
+            "fastq": [i.as_posix() for i in fqlist]
+        }
+    }
     with open(f"{workflowdir}/config.yaml", "w", encoding="utf-8") as config:
-        config.write("workflow: align strobe\n")
-        config.write(f"snakemake_log: {sm_log}\n")
-        config.write(f"genomefile: {genome}\n")
-        config.write(f"keep_unmapped: {keep_unmapped}\n")
-        config.write(f"output_directory: {output_dir}\n")
-        config.write(f"quality: {min_quality}\n")
-        config.write(f"molecule_distance: {molecule_distance}\n")
-        config.write(f"average_read_length: {read_length}\n")
-        config.write(f"depth_windowsize: {depth_window}\n")
-        config.write(f"skip_reports: {skip_reports}\n")
-        if extra_params is not None:
-            config.write(f"extra: {extra_params}\n")
-        config.write(f"workflow_call: {command}\n")
-        config.write("inputs:\n")
-        config.write(f"  genome: {Path(genome).resolve()}\n")
-        config.write("  fastq:\n")
-        for i in fqlist:
-            config.write(f"    - {i}\n")
+        yaml.dump(configs, config, default_flow_style=False, sort_keys=False)
 
     create_conda_recipes()
     if setup_only:
@@ -321,7 +321,7 @@ def strobe(inputs, output_dir, genome, read_length, keep_unmapped, depth_window,
     start_text.add_row("Genome:", genome)
     start_text.add_row("Output Folder:", output_dir + "/")
     start_text.add_row("Workflow Log:", sm_log.replace(f"{output_dir}/", "") + "[dim].gz")
-    launch_snakemake(command, "align_strobe", start_text, output_dir, sm_log, quiet)
+    launch_snakemake(command, "align_strobe", start_text, output_dir, sm_log, quiet, "workflow/align.strobealign.summary")
 
 align.add_command(bwa)
 align.add_command(ema)
diff --git a/harpy/assembly.py b/harpy/assembly.py
index 813417866..05b30f17b 100644
--- a/harpy/assembly.py
+++ b/harpy/assembly.py
@@ -2,6 +2,7 @@
 
 import os
 import sys
+import yaml
 from rich import box
 from rich.table import Table
 import rich_click as click
@@ -11,51 +12,71 @@
 from ._validations import validate_fastq_bx
 
 docstring = {
-    "harpy assemble": [
+    "harpy assembly": [
         {
-            "name": "Parameters",
-            "options": ["--bx-tag", "--extra-params", "--kmer-length", "--max-memory", "--metassembly"],
+            "name": "Assembly/Metassembly (spades) Parameters",
+            "options": ["--bx-tag", "--kmer-length", "--max-memory", "--metassembly", "--spades-extra"],
+        },
+        {
+            "name": "Scaffolding Parameters (ignored for metassembly)",
+            "options": ["--arcs-extra", "--contig-length", "--links", "--min-aligned", "--min-quality", "--mismatch", "--molecule-distance", "--molecule-length", "--seq-identity", "--span"],
         },
         {
             "name": "Workflow Controls",
-            "options": ["--hpc", "--output-dir", "--quiet", "--skip-reports", "--snakemake", "--threads", "--help"],
+            "options": ["--conda", "--hpc", "--output-dir", "--quiet", "--skip-reports", "--snakemake", "--threads", "--help"],
         },
     ]
 }
 
-@click.command(no_args_is_help = True, context_settings=dict(allow_interspersed_args=False), epilog = "See the documentation for more information: https://pdimens.github.io/harpy/workflows/qc")
+@click.command(no_args_is_help = True, context_settings=dict(allow_interspersed_args=False), epilog = "Documentation: https://pdimens.github.io/harpy/workflows/assembly")
+# SPADES
 @click.option('-b', '--bx-tag', type = click.Choice(['BX', 'BC'], case_sensitive=False), default = "BX", show_default=True, help = "The header tag with the barcode (`BX` or `BC`)")
-@click.option('-x', '--extra-params', type = str, help = 'Additional spades parameters, in quotes')
-@click.option('-m', '--max-memory', type = click.IntRange(min = 1000, max_open = True), show_default = True, default = 250000, help = 'Maximum memory for metaSPADES to use, in megabytes')
-@click.option('-a', '--metassembly', is_flag = True, show_default = True, default = False, help = 'Perform a metagenomic assembly')
 @click.option('-k', '--kmer-length', type = KParam(), show_default = True, default = "auto", help = 'K values to use for assembly (`odd` and `<128`)')
-@click.option('-o', '--output-dir', type = click.Path(exists = False), default = "Metassembly", show_default=True, help = 'Output directory name')
+@click.option('-r', '--max-memory', type = click.IntRange(min = 1000, max_open = True), show_default = True, default = 10000, help = 'Maximum memory for spades to use, in megabytes')
+@click.option('--metassembly', type = click.Choice(["cloudspades", "spades"]), help = 'Perform a metagenome assembly [`spades`, `cloudspades`]')
+@click.option('-z', '--spades-extra', type = str, help = 'Additional spades parameters, in quotes')
+# TIGMINT/ARCS/LINKS
+@click.option('-y', '--arcs-extra', type = str, help = 'Additional ARCS parameters, in quotes')
+@click.option("-c","--contig-length", type = int, default = 500, show_default = True, help = "Minimum contig length")
+@click.option("-x", "--links", type = int, default = 5, show_default = True, help = "Minimum number of links to compute scaffold")
+@click.option("-a", "--min-aligned", type = int, default = 5, show_default = True, help = "Minimum aligned read pairs per barcode")
+@click.option("-q", "--min-quality", type = click.IntRange(0,40), default = 0, show_default = True, help = "Minimum mapping quality")
+@click.option("-m", "--mismatch", type = int, default = 5, show_default = True, help = "Maximum number of mismatches")
+@click.option("-d", "--molecule-distance", type = int, default = 50000, show_default = True, help = "Distance cutoff to split molecules (bp)")
+@click.option("-l", "--molecule-length", type = int, default = 2000, show_default = True, help = "Minimum molecule length (bp)")
+@click.option("-i", "--seq-identity", type = click.IntRange(0,100), default = 98, show_default = True, help = "Minimum sequence identity")
+@click.option("-s", "--span", type = int, default = 20, show_default = True, help = "Minimum number of spanning molecules to be considered assembled")
+# Common Workflow
+@click.option('-o', '--output-dir', type = click.Path(exists = False), default = "Assembly", show_default=True, help = 'Output directory name')
 @click.option('-t', '--threads', default = 4, show_default = True, type = click.IntRange(min = 1, max_open = True), help = 'Number of threads to use')
-@click.option('--setup-only', is_flag = True, hidden = True, default = False, help = 'Setup the workflow and exit')
+@click.option('--conda', is_flag = True, default = False, help = 'Use conda/mamba instead of container')
 @click.option('--hpc', type = click.Path(exists = True, file_okay = False, readable=True), help = 'Directory with HPC submission `config.yaml` file')
 @click.option('--quiet', is_flag = True, show_default = True, default = False, help = 'Don\'t show output text while running')
+@click.option('--setup-only', is_flag = True, hidden = True, default = False, help = 'Setup the workflow and exit')
 @click.option('--skip-reports', is_flag = True, show_default = True, default = False, help = 'Don\'t generate HTML reports')
 @click.option('--snakemake', type = str, help = 'Additional Snakemake parameters, in quotes')
 @click.argument('fastq_r1', required=True, type=click.Path(exists=True, readable=True), nargs=1)
 @click.argument('fastq_r2', required=True, type=click.Path(exists=True, readable=True), nargs=1)
-def assemble(fastq_r1, fastq_r2, bx_tag, kmer_length, max_memory, metassembly, output_dir, extra_params, threads, snakemake, skip_reports, quiet, hpc, setup_only):
+def assembly(fastq_r1, fastq_r2, bx_tag, kmer_length, max_memory, metassembly, output_dir, spades_extra, arcs_extra, contig_length, links, min_quality, min_aligned, mismatch, molecule_distance, molecule_length, seq_identity, span, conda, threads, snakemake, skip_reports, quiet, hpc, setup_only):
     """
-    Perform an assembly from linked-read sequences
+    Create an assembly/metassembly from linked-reads
 
-    Use the `--metassembly` flag to perform a metagenome assembly (defaults to an assembly of a single sample).
-    The linked-read barcodes must be in either a `BX:Z` or `BC:Z` FASTQ header tag, specified with `--bx-tag`.
-    If specifying `K` values, they must be separated by commas and without spaces (e.g. `-k 15,23,51`).
+    The linked-read barcodes must be in `BX:Z` or `BC:Z` FASTQ header tags. If provided, values for `-k` must be
+    separated by commas and without spaces (e.g. `-k 15,23,51`). It is strongly recommended to first deconvolve
+    the input FASTQ files with `harpy deconvolve`. Use `--metassembly` to perform a metagenome assembly:
+    - `spades`: ignores linked read information for the initial metassembly
+    - `cloudspades`: incorporates linked read data for the initial metassembly
     """
     output_dir = output_dir.rstrip("/")
     asm = "metassembly" if metassembly else "assembly"
     workflowdir = f"{output_dir}/workflow"
-    sdm = "conda" #if conda else "conda apptainer"
+    sdm = "conda" if conda else "conda apptainer"
     command = f'snakemake --rerun-incomplete --show-failed-logs --rerun-triggers input mtime params --nolock --software-deployment-method {sdm} --conda-prefix .snakemake/conda --cores {threads} --directory . '
     command += f"--snakefile {workflowdir}/{asm}.smk "
     command += f"--configfile {workflowdir}/config.yaml "
     if hpc:
         command += f"--workflow-profile {hpc} "
-    if snakemake is not None:
+    if snakemake:
         command += snakemake
 
     validate_fastq_bx([fastq_r1, fastq_r2], threads, quiet)
@@ -63,26 +84,48 @@ def assembly(fastq_r1, fastq_r2, bx_tag, kmer_length, max_memory, metassembly, o
     fetch_rule(workflowdir, f"{asm}.smk")
     os.makedirs(f"{output_dir}/logs/snakemake", exist_ok = True)
     sm_log = snakemake_log(output_dir, asm)
+    configs = {
+        "workflow" : asm,
+        "snakemake_log" : sm_log,
+        "output_directory" : output_dir,
+        "barcode_tag" : bx_tag.upper(),
+        "spades" : {
+            **({'assembler' : metassembly} if metassembly else {}),
+            "k" : 'auto' if kmer_length == "auto" else ",".join(map(str,kmer_length)),
+            "max_memory" : max_memory,
+            **({'extra' : spades_extra} if spades_extra else {})
+        },
+        **({'tigmint': {}} if not metassembly else {}),
+        **({'arcs' : {}} if not metassembly else {}),
+        **({'links' : {}} if not metassembly else {}),
+        "skip_reports" : skip_reports,
+        "workflow_call" : command.rstrip(),
+        "inputs": {
+            "fastq_r1" : fastq_r1,
+            "fastq_r2" : fastq_r2
+        }
+    }
+    if not metassembly:
+        configs["tigmint"] = {
+            "minimum_mapping_quality" : min_quality,
+            "mismatch" : mismatch,
+            "molecule_distance" : molecule_distance,
+            "molecule_length" : molecule_length,
+            "span" : span
+        }
+        configs["arcs"] = {
+            "minimum_aligned_reads" : min_aligned,
+            "minimum_contig_length" : contig_length,
+            "minimum_sequence_identity" : seq_identity,
+            **({'extra' : arcs_extra} if arcs_extra else {})
+        }
+        configs["links"] = {
+            "minimum_links" : links
+        }
     with open(f"{workflowdir}/config.yaml", "w", encoding="utf-8") as config:
-        config.write(f"workflow: {asm}\n")
-        config.write(f"snakemake_log: {sm_log}\n")
-        config.write(f"output_directory: {output_dir}\n")
-        config.write(f"barcode_tag: {bx_tag.upper()}\n")
-        config.write("spades:\n")
-        config.write(f"  max_memory: {max_memory}\n")
-        if kmer_length == "auto":
-            config.write(f"  k: auto\n")
-        else:
-            config.write(f"  k: " + ",".join(map(str,kmer_length)) + "\n")
-        if extra_params:
-            config.write(f"  extra: {extra_params}\n")
-        config.write(f"skip_reports: {skip_reports}\n")
-        config.write(f"workflow_call: {command}\n")
-        config.write("inputs:\n")
-        config.write(f"  fastq_r1: {fastq_r1}\n")
-        config.write(f"  fastq_r2: {fastq_r2}\n")
-
+        yaml.dump(configs, config, default_flow_style=False, sort_keys=False)
+
     create_conda_recipes()
     if setup_only:
         sys.exit(0)
@@ -90,11 +133,12 @@ def assembly(fastq_r1, fastq_r2, bx_tag, kmer_length, max_memory, metassembly, o
     start_text = Table(show_header=False,pad_edge=False, show_edge=False, padding = (0,0), box=box.SIMPLE)
     start_text.add_column("detail", justify="left", style="light_steel_blue", no_wrap=True)
     start_text.add_column("value", justify="left")
+    start_text.add_row("Metassembly: ", "True" if metassembly else "False")
     start_text.add_row("Barcode Tag: ", bx_tag.upper())
     if kmer_length == "auto":
-        start_text.add_row(f"Kmer Length: ", "auto")
+        start_text.add_row("Kmer Length: ", "auto")
     else:
-        start_text.add_row(f"Kmer Length: ", ",".join(map(str,kmer_length)))
+        start_text.add_row("Kmer Length: ", ",".join(map(str,kmer_length)))
     start_text.add_row("Output Folder:", f"{output_dir}/")
     start_text.add_row("Workflow Log:", sm_log.replace(f"{output_dir}/", "") + "[dim].gz")
-    launch_snakemake(command, asm, start_text, output_dir, sm_log, quiet)
+    launch_snakemake(command, asm, start_text, output_dir, sm_log, quiet, f"workflow/{asm}.summary")
\ No newline at end of file
f"workflow/{asm}.summary") \ No newline at end of file diff --git a/harpy/bin/infer_sv.py b/harpy/bin/infer_sv.py index e6631869f..121fc91fe 100755 --- a/harpy/bin/infer_sv.py +++ b/harpy/bin/infer_sv.py @@ -26,7 +26,7 @@ "-+": "duplication" } -if args.failfile is not None: +if args.failfile: with open(args.bedfile, "r", encoding="utf-8") as f, open(args.failfile, "w", encoding="utf-8") as failout: # first line, the header line = f.readline().strip().split("\t") diff --git a/harpy/deconvolve.py b/harpy/deconvolve.py index 3c8bc1261..307d56589 100644 --- a/harpy/deconvolve.py +++ b/harpy/deconvolve.py @@ -2,6 +2,7 @@ import os import sys +import yaml from rich import box from rich.table import Table import rich_click as click @@ -23,7 +24,7 @@ ] } -@click.command(no_args_is_help = True, context_settings=dict(allow_interspersed_args=False), epilog = "See the documentation for more information: https://pdimens.github.io/harpy/workflows/qc") +@click.command(no_args_is_help = True, context_settings=dict(allow_interspersed_args=False), epilog = "Documentation: https://pdimens.github.io/harpy/workflows/qc") @click.option('-k', '--kmer-length', default = 21, show_default = True, type=int, help = 'Size of kmers') @click.option('-w', '--window-size', default = 40, show_default = True, type=int, help = 'Size of window guaranteed to contain at least one kmer') @click.option('-d', '--density', default = 3, show_default = True, type = click.IntRange(min = 1, max_open = True), help = 'On average, 1/2^d kmers are indexed') @@ -54,7 +55,7 @@ def deconvolve(inputs, output_dir, kmer_length, window_size, density, dropout, t command += f"--configfile {workflowdir}/config.yaml " if hpc: command += f"--workflow-profile {hpc} " - if snakemake is not None: + if snakemake: command += snakemake os.makedirs(workflowdir, exist_ok=True) @@ -62,20 +63,20 @@ def deconvolve(inputs, output_dir, kmer_length, window_size, density, dropout, t fetch_rule(workflowdir, "deconvolve.smk") os.makedirs(f"{output_dir}/logs/snakemake", exist_ok = True) sm_log = snakemake_log(output_dir, "deconvolve") - + configs = { + "workflow": "deconvolve", + "snakemake_log" : sm_log, + "output_directory" : output_dir, + "kmer_length" : kmer_length, + "window_size" : window_size, + "density" : density, + "dropout" : dropout, + "workflow_call" : command.rstrip(), + "inputs": [i.as_posix() for i in fqlist] + } with open(f"{workflowdir}/config.yaml", "w", encoding="utf-8") as config: - config.write("workflow: deconvolve\n") - config.write(f"snakemake_log: {sm_log}\n") - config.write(f"output_directory: {output_dir}\n") - config.write(f"kmer_length: {kmer_length}\n") - config.write(f"window_size: {window_size}\n") - config.write(f"density: {density}\n") - config.write(f"dropout: {dropout}\n") - config.write(f"workflow_call: {command}\n") - config.write("inputs:\n") - for i in fqlist: - config.write(f" - {i}\n") - + yaml.dump(configs, config, default_flow_style= False, sort_keys=False) + create_conda_recipes() if setup_only: sys.exit(0) @@ -86,4 +87,4 @@ def deconvolve(inputs, output_dir, kmer_length, window_size, density, dropout, t start_text.add_row("Samples:", f"{sample_count}") start_text.add_row("Output Folder:", output_dir + "/") start_text.add_row("Workflow Log:", sm_log.replace(f"{output_dir}/", "") + "[dim].gz") - launch_snakemake(command, "deconvolve", start_text, output_dir, sm_log, quiet) + launch_snakemake(command, "deconvolve", start_text, output_dir, sm_log, quiet, "workflow/deconvolve.summary") diff --git a/harpy/demultiplex.py 
b/harpy/demultiplex.py index 1f08b3ddd..ebff9392c 100644 --- a/harpy/demultiplex.py +++ b/harpy/demultiplex.py @@ -2,6 +2,7 @@ import os import sys +import yaml from pathlib import Path from rich import box from rich.table import Table @@ -37,7 +38,7 @@ def demultiplex(): } -@click.command(no_args_is_help = True, context_settings=dict(allow_interspersed_args=False), epilog = "See the documentation for more information: https://pdimens.github.io/harpy/workflows/demultiplex/") +@click.command(no_args_is_help = True, context_settings=dict(allow_interspersed_args=False), epilog = "Documentation: https://pdimens.github.io/harpy/workflows/demultiplex/") @click.option('-s', '--schema', required = True, type=click.Path(exists=True, dir_okay=False, readable=True), help = 'Tab-delimited file of sample\barcode') @click.option('-t', '--threads', default = 4, show_default = True, type = click.IntRange(min = 1, max_open = True), help = 'Number of threads to use') @click.option('-o', '--output-dir', type = click.Path(exists = False), default = "Demultiplex", show_default=True, help = 'Output directory name') @@ -67,7 +68,7 @@ def gen1(r1_fq, r2_fq, i1_fq, i2_fq, output_dir, schema, threads, snakemake, ski command += f"--configfile {workflowdir}/config.yaml " if hpc: command += f"--workflow-profile {hpc} " - if snakemake is not None: + if snakemake: command += snakemake validate_demuxschema(schema) @@ -75,20 +76,23 @@ def gen1(r1_fq, r2_fq, i1_fq, i2_fq, output_dir, schema, threads, snakemake, ski fetch_rule(workflowdir, "demultiplex_gen1.smk") os.makedirs(f"{output_dir}/logs/snakemake", exist_ok = True) sm_log = snakemake_log(output_dir, "demultiplex_gen1") - + configs = { + "workflow" : "demultiplex gen1", + "snakemake_log" : sm_log, + "output_directory" : output_dir, + "skip_reports" : skip_reports, + "workflow_call" : command.rstrip(), + "inputs" : { + "demultiplex_schema" : Path(schema).resolve().as_posix(), + "R1": Path(r1_fq).resolve().as_posix(), + "R2": Path(r2_fq).resolve().as_posix(), + "I1": Path(i1_fq).resolve().as_posix(), + "I2": Path(i2_fq).resolve().as_posix() + } + } with open(f"{workflowdir}/config.yaml", "w", encoding= "utf-8") as config: - config.write("workflow: demultiplex gen1\n") - config.write(f"snakemake_log: {sm_log}\n") - config.write(f"output_directory: {output_dir}\n") - config.write(f"skip_reports: {skip_reports}\n") - config.write(f"workflow_call: {command}\n") - config.write("inputs:\n") - config.write(f" demultiplex_schema: {Path(schema).resolve()}\n") - config.write(f" R1: {Path(r1_fq).resolve()}\n") - config.write(f" R2: {Path(r2_fq).resolve()}\n") - config.write(f" I1: {Path(i1_fq).resolve()}\n") - config.write(f" I2: {Path(i2_fq).resolve()}\n") - + yaml.dump(configs, config, default_flow_style= False, sort_keys=False) + create_conda_recipes() if setup_only: sys.exit(0) @@ -100,6 +104,6 @@ def gen1(r1_fq, r2_fq, i1_fq, i2_fq, output_dir, schema, threads, snakemake, ski start_text.add_row("Demultiplex Schema:", schema) start_text.add_row("Output Folder:", output_dir + "/") start_text.add_row("Workflow Log:", sm_log.replace(f"{output_dir}/", "") + "[dim].gz") - launch_snakemake(command, "demultiplex_gen1", start_text, output_dir, sm_log, quiet) + launch_snakemake(command, "demultiplex_gen1", start_text, output_dir, sm_log, quiet, "workflow/demux.gen1.summary") demultiplex.add_command(gen1) diff --git a/harpy/impute.py b/harpy/impute.py index 11bc5c511..6e5151d32 100644 --- a/harpy/impute.py +++ b/harpy/impute.py @@ -2,6 +2,7 @@ import os import sys +import yaml from 
pathlib import Path from rich import box from rich.table import Table @@ -25,7 +26,7 @@ ] } -@click.command(no_args_is_help = True, context_settings=dict(allow_interspersed_args=False), epilog = "See the documentation for more information: https://pdimens.github.io/harpy/workflows/impute/") +@click.command(no_args_is_help = True, context_settings=dict(allow_interspersed_args=False), epilog = "Documentation: https://pdimens.github.io/harpy/workflows/impute/") @click.option('-x', '--extra-params', type = str, help = 'Additional STITCH parameters, in quotes') @click.option('-o', '--output-dir', type = click.Path(exists = False), default = "Impute", show_default=True, help = 'Output directory name') @click.option('-p', '--parameters', required = True, type=click.Path(exists=True, dir_okay=False), help = 'STITCH parameter file (tab-delimited)') @@ -63,12 +64,12 @@ def impute(inputs, output_dir, parameters, threads, vcf, vcf_samples, extra_para command += f"--configfile {workflowdir}/config.yaml " if hpc: command += f"--workflow-profile {hpc} " - if snakemake is not None: + if snakemake: command += snakemake os.makedirs(f"{workflowdir}/", exist_ok= True) validate_input_by_ext(vcf, "--vcf", ["vcf", "bcf", "vcf.gz"]) - check_impute_params(parameters) + params = check_impute_params(parameters) bamlist, n = parse_alignment_inputs(inputs) validate_bam_RG(bamlist, threads, quiet) samplenames = vcf_samplematch(vcf, bamlist, vcf_samples) @@ -80,23 +81,24 @@ def impute(inputs, output_dir, parameters, threads, vcf, vcf_samples, extra_para fetch_report(workflowdir, "stitch_collate.Rmd") os.makedirs(f"{output_dir}/logs/snakemake", exist_ok = True) sm_log = snakemake_log(output_dir, "impute") - + configs = { + "workflow" : "impute", + "snakemake_log" : sm_log, + "output_directory" : output_dir, + "samples_from_vcf" : vcf_samples, + **({'stitch_extra': extra_params} if extra_params else {}), + "skip_reports" : skip_reports, + "workflow_call" : command.rstrip(), + "stitch_parameters" : params, + "inputs" : { + "paramfile" : Path(parameters).resolve().as_posix(), + "variantfile" : Path(vcf).resolve().as_posix(), + "biallelic_contigs" : Path(biallelic).resolve().as_posix(), + "alignments" : [i.as_posix() for i in bamlist] + } + } with open(f"{workflowdir}/config.yaml", "w", encoding="utf-8") as config: - config.write("workflow: impute\n") - config.write(f"snakemake_log: {sm_log}\n") - config.write(f"output_directory: {output_dir}\n") - config.write(f"samples_from_vcf: {vcf_samples}\n") - if extra_params is not None: - config.write(f"extra: {extra_params}\n") - config.write(f"skip_reports: {skip_reports}\n") - config.write(f"workflow_call: {command}\n") - config.write("inputs:\n") - config.write(f" paramfile: {Path(parameters).resolve()}\n") - config.write(f" variantfile: {Path(vcf).resolve()}\n") - config.write(f" biallelic_contigs: {Path(biallelic).resolve()}\n") - config.write(" alignments:\n") - for i in bamlist: - config.write(f" - {i}\n") + yaml.dump(configs, config, default_flow_style= False, sort_keys=False) create_conda_recipes() if setup_only: @@ -109,7 +111,7 @@ def impute(inputs, output_dir, parameters, threads, vcf, vcf_samples, extra_para start_text.add_row("VCF Samples:", f"{len(samplenames)}") start_text.add_row("Alignment Files:", f"{n}") start_text.add_row("Parameter File:", parameters) - start_text.add_row("Usable Contigs:", f"{n_biallelic}") + start_text.add_row("Contigs:", f"{n_biallelic} [dim](with at least 2 biallelic SNPs)") start_text.add_row("Output Folder:", output_dir + "/") 
start_text.add_row("Workflow Log:", sm_log.replace(f"{output_dir}/", "") + "[dim].gz") - launch_snakemake(command, "impute", start_text, output_dir, sm_log, quiet) + launch_snakemake(command, "impute", start_text, output_dir, sm_log, quiet, "workflow/impute.summary") diff --git a/harpy/metassembly.py b/harpy/metassembly.py deleted file mode 100644 index f63a12120..000000000 --- a/harpy/metassembly.py +++ /dev/null @@ -1,97 +0,0 @@ -"""Perform a linked-read aware metassembly""" - -import os -import sys -from rich import box -from rich.table import Table -import rich_click as click -from ._conda import create_conda_recipes -from ._launch import launch_snakemake -from ._misc import fetch_rule, snakemake_log, KParam -from ._validations import validate_fastq_bx - -docstring = { - "harpy metassembly": [ - { - "name": "Parameters", - "options": ["--bx-tag", "--extra-params", "kmer-length", "--max-memory"], - }, - { - "name": "Workflow Controls", - "options": ["--hpc", "--output-dir", "--quiet", "--skip-reports", "--snakemake", "--threads", "--help"], - }, - ] -} - -@click.command(no_args_is_help = True, context_settings=dict(allow_interspersed_args=False), epilog = "See the documentation for more information: https://pdimens.github.io/harpy/workflows/qc") -@click.option('-b', '--bx-tag', type = click.Choice(['BX', 'BC'], case_sensitive=False), default = "BX", show_default=True, help = "The header tag with the barcode (`BX` or `BC`)") -@click.option('-x', '--extra-params', type = str, help = 'Additional metaspades parameters, in quotes') -@click.option('-m', '--max-memory', type = click.IntRange(min = 1000, max_open = True), show_default = True, default = 250000, help = 'Maximum memory for metaSPADES to use, in megabytes') -@click.option('-k', '--kmer-length', type = KParam(), show_default = True, default = "auto", help = 'K values to use for metaspades (`odd` and `<128`)') -@click.option('-o', '--output-dir', type = click.Path(exists = False), default = "Metassembly", show_default=True, help = 'Output directory name') -@click.option('-t', '--threads', default = 4, show_default = True, type = click.IntRange(min = 1, max_open = True), help = 'Number of threads to use') -@click.option('--setup-only', is_flag = True, hidden = True, default = False, help = 'Setup the workflow and exit') -@click.option('--hpc', type = click.Path(exists = True, file_okay = False, readable=True), help = 'Directory with HPC submission `config.yaml` file') -@click.option('--quiet', is_flag = True, show_default = True, default = False, help = 'Don\'t show output text while running') -@click.option('--skip-reports', is_flag = True, show_default = True, default = False, help = 'Don\'t generate HTML reports') -@click.option('--snakemake', type = str, help = 'Additional Snakemake parameters, in quotes') -@click.argument('fastq_r1', required=True, type=click.Path(exists=True, readable=True), nargs=1) -@click.argument('fastq_r2', required=True, type=click.Path(exists=True, readable=True), nargs=1) -def metassembly(fastq_r1, fastq_r2, bx_tag, kmer_length, max_memory, output_dir, extra_params, threads, snakemake, skip_reports, quiet, hpc, setup_only): - """ - Perform a metassembly from linked-read sequences - - The linked-read barcode must be in either a `BX:Z` or `BC:Z` FASTQ header tag, specified with `--bx-tag`. - If specifying `K` values, they must be separated by commas and without spaces (e.g. `-k 15,23,51`). 
- """ - output_dir = output_dir.rstrip("/") - workflowdir = f"{output_dir}/workflow" - sdm = "conda" # if conda else "conda apptainer" - command = f'snakemake --rerun-incomplete --show-failed-logs --rerun-triggers input mtime params --nolock --software-deployment-method {sdm} --conda-prefix .snakemake/conda --cores {threads} --directory . ' - command += f"--snakefile {workflowdir}/metassembly.smk " - command += f"--configfile {workflowdir}/config.yaml " - if hpc: - command += f"--workflow-profile {hpc} " - if snakemake is not None: - command += snakemake - - validate_fastq_bx([fastq_r1, fastq_r2], threads, quiet) - os.makedirs(workflowdir, exist_ok=True) - fetch_rule(workflowdir, "metassembly.smk") - os.makedirs(f"{output_dir}/logs/snakemake", exist_ok = True) - sm_log = snakemake_log(output_dir, "metassembly") - - with open(f"{workflowdir}/config.yaml", "w", encoding="utf-8") as config: - config.write("workflow: metassembly\n") - config.write(f"snakemake_log: {sm_log}\n") - config.write(f"output_directory: {output_dir}\n") - config.write(f"barcode_tag: {bx_tag.upper()}\n") - config.write("spades:\n") - config.write(f" max_memory: {max_memory}\n") - if kmer_length == "auto": - config.write(f" k: auto\n") - else: - config.write(f" k: " + ",".join(map(str,kmer_length)) + "\n") - if extra_params: - config.write(f" extra: {extra_params}\n") - config.write(f"skip_reports: {skip_reports}\n") - config.write(f"workflow_call: {command}\n") - config.write("inputs:\n") - config.write(f" fastq_r1: {fastq_r1}\n") - config.write(f" fastq_r2: {fastq_r2}\n") - - create_conda_recipes() - if setup_only: - sys.exit(0) - - start_text = Table(show_header=False,pad_edge=False, show_edge=False, padding = (0,0), box=box.SIMPLE) - start_text.add_column("detail", justify="left", style="light_steel_blue", no_wrap=True) - start_text.add_column("value", justify="left") - start_text.add_row("Barcode Tag: ", bx_tag.upper()) - if kmer_length == "auto": - start_text.add_row(f"Kmer Length: ", "auto") - else: - start_text.add_row(f"Kmer Length: ", ",".join(map(str,kmer_length))) - start_text.add_row("Output Folder:", f"{output_dir}/") - start_text.add_row("Workflow Log:", sm_log.replace(f"{output_dir}/", "") + "[dim].gz") - launch_snakemake(command, "metassembly", start_text, output_dir, sm_log, quiet) diff --git a/harpy/phase.py b/harpy/phase.py index 41b6b5656..8b16d04e8 100644 --- a/harpy/phase.py +++ b/harpy/phase.py @@ -2,6 +2,7 @@ import os import sys +import yaml from rich import box from rich.table import Table import rich_click as click @@ -24,11 +25,11 @@ ] } -@click.command(no_args_is_help = True, context_settings=dict(allow_interspersed_args=False), epilog = "See the documentation for more information: https://pdimens.github.io/harpy/workflows/phase") +@click.command(no_args_is_help = True, context_settings=dict(allow_interspersed_args=False), epilog = "Documentation: https://pdimens.github.io/harpy/workflows/phase") @click.option('-x', '--extra-params', type = str, help = 'Additional HapCut2 parameters, in quotes') @click.option('-g', '--genome', type=click.Path(exists=True, dir_okay=False, readable=True), help = 'Path to genome assembly if wanting to also extract reads spanning indels') @click.option('-b', '--ignore-bx', is_flag = True, show_default = True, default = False, help = 'Ignore barcodes when phasing') -@click.option('-d', '--molecule-distance', default = 100000, show_default = True, type = int, help = 'Base-pair distance threshold to separate molecules') +@click.option('-d', '--molecule-distance', 
default = 100000, show_default = True, type = int, help = 'Distance cutoff to split molecules (bp)') @click.option('-o', '--output-dir', type = click.Path(exists = False), default = "Phase", show_default=True, help = 'Output directory name') @click.option('-p', '--prune-threshold', default = 7, show_default = True, type = click.IntRange(0,100), help = 'PHRED-scale threshold (%) for pruning low-confidence SNPs (larger prunes more.)') @click.option('-t', '--threads', default = 4, show_default = True, type = click.IntRange(min = 2, max_open = True), help = 'Number of threads to use') @@ -61,7 +62,7 @@ def phase(inputs, output_dir, vcf, threads, molecule_distance, prune_threshold, command += f"--configfile {workflowdir}/config.yaml " if hpc: command += f"--workflow-profile {hpc} " - if snakemake is not None: + if snakemake: command += snakemake os.makedirs(f"{workflowdir}/input", exist_ok= True) @@ -75,26 +76,25 @@ def phase(inputs, output_dir, vcf, threads, molecule_distance, prune_threshold, fetch_report(workflowdir, "hapcut.Rmd") os.makedirs(f"{output_dir}/logs/snakemake", exist_ok = True) sm_log = snakemake_log(output_dir, "phase") - + configs = { + "workflow" : "phase", + "snakemake_log" : sm_log, + "output_directory" : output_dir, + "ignore_bx" : ignore_bx, + "prune" : prune_threshold/100, + "molecule_distance" : molecule_distance, + "samples_from_vcf" : vcf_samples, + **({'extra': extra_params} if extra_params else {}), + "skip_reports" : skip_reports, + "workflow_call" : command.rstrip(), + "inputs" : { + "variantfile" : vcf, + **({'genome': genome} if genome else {}), + "alignments" : [i.as_posix() for i in bamlist] + } + } with open(f"{workflowdir}/config.yaml", "w", encoding="utf-8") as config: - config.write("workflow: phase\n") - config.write(f"snakemake_log: {sm_log}\n") - config.write(f"output_directory: {output_dir}\n") - config.write(f"ignore_bx: {ignore_bx}\n") - config.write(f"prune: {prune_threshold/100}\n") - config.write(f"molecule_distance: {molecule_distance}\n") - config.write(f"samples_from_vcf: {vcf_samples}\n") - if extra_params is not None: - config.write(f"extra: {extra_params}\n") - config.write(f"skip_reports: {skip_reports}\n") - config.write(f"workflow_call: {command}\n") - config.write("inputs:\n") - config.write(f" variantfile: {vcf}\n") - if genome is not None: - config.write(f" genome: {genome}\n") - config.write(" alignments:\n") - for i in bamlist: - config.write(f" - {i}\n") + yaml.dump(configs, config, default_flow_style= False, sort_keys=False) create_conda_recipes() if setup_only: @@ -107,8 +107,8 @@ def phase(inputs, output_dir, vcf, threads, molecule_distance, prune_threshold, start_text.add_row("Samples in VCF:", f"{len(samplenames)}") start_text.add_row("Alignment Files:", f"{n}") start_text.add_row("Phase Indels:", "yes" if genome else "no") - if genome is not None: + if genome: start_text.add_row("Genome:", genome) start_text.add_row("Output Folder:", output_dir + "/") start_text.add_row("Workflow Log:", sm_log.replace(f"{output_dir}/", "") + "[dim].gz") - launch_snakemake(command, "phase", start_text, output_dir, sm_log, quiet) + launch_snakemake(command, "phase", start_text, output_dir, sm_log, quiet, "workflow/phase.summary") diff --git a/harpy/popgroups.py b/harpy/popgroups.py index 1ed18c02c..efa596270 100644 --- a/harpy/popgroups.py +++ b/harpy/popgroups.py @@ -8,7 +8,7 @@ from ._printing import print_error, print_solution, print_notice -@click.command(no_args_is_help = True, context_settings=dict(allow_interspersed_args=False), epilog = 
"See the documentation for more information: https://pdimens.github.io/harpy/workflows/snp/#sample-grouping-file") +@click.command(no_args_is_help = True, context_settings=dict(allow_interspersed_args=False), epilog = "Documentation: https://pdimens.github.io/harpy/workflows/snp/#sample-grouping-file") @click.option('-o', '--output', type=str, default = "samples.groups", metavar = "Output file name", help = 'Output file name, will overwrite existing') @click.argument('inputdir', required=True, type=click.Path(exists=True, file_okay=False)) def popgroup(inputdir, output): diff --git a/harpy/preflight.py b/harpy/preflight.py index c0c465b77..5cb7ac484 100755 --- a/harpy/preflight.py +++ b/harpy/preflight.py @@ -2,6 +2,7 @@ import os import sys +import yaml from rich import box from rich.table import Table import rich_click as click @@ -35,7 +36,7 @@ def preflight(): ] } -@click.command(no_args_is_help = True, context_settings=dict(allow_interspersed_args=False), epilog = "See the documentation for more information: https://pdimens.github.io/harpy/workflows/preflight/") +@click.command(no_args_is_help = True, context_settings=dict(allow_interspersed_args=False), epilog = "Documentation: https://pdimens.github.io/harpy/workflows/preflight/") @click.option('-o', '--output-dir', type = click.Path(exists = False), default = "Preflight/fastq", show_default=True, help = 'Output directory name') @click.option('-t', '--threads', default = 4, show_default = True, type = click.IntRange(min = 1, max_open = True), help = 'Number of threads to use') @click.option('--conda', is_flag = True, default = False, help = 'Use conda/mamba instead of container') @@ -65,7 +66,7 @@ def fastq(inputs, output_dir, threads, snakemake, quiet, hpc, conda, setup_only) command += f"--configfile {workflowdir}/config.yaml " if hpc: command += f"--workflow-profile {hpc} " - if snakemake is not None: + if snakemake: command += snakemake os.makedirs(f"{workflowdir}/", exist_ok= True) @@ -74,15 +75,15 @@ def fastq(inputs, output_dir, threads, snakemake, quiet, hpc, conda, setup_only) fetch_report(workflowdir, "preflight_fastq.Rmd") os.makedirs(f"{output_dir}/logs/snakemake", exist_ok = True) sm_log = snakemake_log(output_dir, "preflight_fastq") - + configs = { + "workflow" : "preflight fastq", + "snakemake_log" : sm_log, + "output_directory" : output_dir, + "workflow_call" : command.rstrip(), + "inputs" : [i.as_posix() for i in fqlist] + } with open(f"{workflowdir}/config.yaml", "w", encoding="utf-8") as config: - config.write("workflow: preflight fastq\n") - config.write(f"snakemake_log: {sm_log}\n") - config.write(f"output_directory: {output_dir}\n") - config.write(f"workflow_call: {command}\n") - config.write("inputs:\n") - for i in fqlist: - config.write(f" - {i}\n") + yaml.dump(configs, config, default_flow_style= False, sort_keys=False) create_conda_recipes() if setup_only: @@ -94,9 +95,9 @@ def fastq(inputs, output_dir, threads, snakemake, quiet, hpc, conda, setup_only) start_text.add_row("FASTQ Files:", f"{n}") start_text.add_row("Output Folder:", output_dir + "/") start_text.add_row("Workflow Log:", sm_log.replace(f"{output_dir}/", "") + "[dim].gz") - launch_snakemake(command, "preflight_fastq", start_text, output_dir, sm_log, quiet) + launch_snakemake(command, "preflight_fastq", start_text, output_dir, sm_log, quiet, "workflow/preflight.fastq.summary") -@click.command(no_args_is_help = True, context_settings=dict(allow_interspersed_args=False), epilog = "See the documentation for more information: 
https://pdimens.github.io/harpy/workflows/preflight/") +@click.command(no_args_is_help = True, context_settings=dict(allow_interspersed_args=False), epilog = "Documentation: https://pdimens.github.io/harpy/workflows/preflight/") @click.option('-t', '--threads', default = 4, show_default = True, type = click.IntRange(min = 1, max_open = True), help = 'Number of threads to use') @click.option('--quiet', is_flag = True, show_default = True, default = False, help = 'Don\'t show output text while running') @click.option('-o', '--output-dir', type = click.Path(exists = False), default = "Preflight/bam", show_default=True, help = 'Output directory name') @@ -125,7 +126,7 @@ def bam(inputs, output_dir, threads, snakemake, quiet, hpc, conda, setup_only): command += f"--configfile {workflowdir}/config.yaml " if hpc: command += f"--workflow-profile {hpc} " - if snakemake is not None: + if snakemake: command += snakemake os.makedirs(f"{workflowdir}/", exist_ok= True) @@ -134,15 +135,15 @@ def bam(inputs, output_dir, threads, snakemake, quiet, hpc, conda, setup_only): fetch_report(workflowdir, "preflight_bam.Rmd") os.makedirs(f"{output_dir}/logs/snakemake", exist_ok = True) sm_log = snakemake_log(output_dir, "preflight_bam") - + configs = { + "workflow" : "preflight bam", + "snakemake_log" : sm_log, + "output_directory" : output_dir, + "workflow_call" : command.rstrip(), + "inputs" : [i.as_posix() for i in bamlist] + } with open(f"{workflowdir}/config.yaml", "w", encoding="utf-8") as config: - config.write("workflow: preflight bam\n") - config.write(f"snakemake_log: {sm_log}\n") - config.write(f"output_directory: {output_dir}\n") - config.write(f"workflow_call: {command}\n") - config.write("inputs:\n") - for i in bamlist: - config.write(f" - {i}\n") + yaml.dump(configs, config, default_flow_style= False, sort_keys=False) create_conda_recipes() if setup_only: @@ -154,7 +155,7 @@ def bam(inputs, output_dir, threads, snakemake, quiet, hpc, conda, setup_only): start_text.add_row("Alignment Files:", f"{n}") start_text.add_row("Output Folder:", output_dir + "/") start_text.add_row("Workflow Log:", sm_log.replace(f"{output_dir}/", "") + "[dim].gz") - launch_snakemake(command, "preflight_bam", start_text, output_dir, sm_log, quiet) + launch_snakemake(command, "preflight_bam", start_text, output_dir, sm_log, quiet, "workflow/preflight.bam.summary") preflight.add_command(fastq) preflight.add_command(bam) diff --git a/harpy/qc.py b/harpy/qc.py index 0e4013e8c..c7ead810e 100644 --- a/harpy/qc.py +++ b/harpy/qc.py @@ -2,6 +2,7 @@ import os import sys +import yaml from rich import box from rich.table import Table import rich_click as click @@ -23,7 +24,7 @@ ] } -@click.command(no_args_is_help = True, context_settings=dict(allow_interspersed_args=False), epilog = "See the documentation for more information: https://pdimens.github.io/harpy/workflows/qc") +@click.command(no_args_is_help = True, context_settings=dict(allow_interspersed_args=False), epilog = "Documentation: https://pdimens.github.io/harpy/workflows/qc") @click.option('-c', '--deconvolve', type = IntList(4), default = "0,0,0,0", help = 'Accepts the QuickDeconvolution parameters for `k`,`w`,`d`,`a` (in that order)') @click.option('-d', '--deduplicate', is_flag = True, default = False, help = 'Identify and remove PCR duplicates') @click.option('-x', '--extra-params', type = str, help = 'Additional Fastp parameters, in quotes') @@ -64,7 +65,7 @@ def qc(inputs, output_dir, min_length, max_length, trim_adapters, deduplicate, d command += f"--configfile 
{workflowdir}/config.yaml " if hpc: command += f"--workflow-profile {hpc} " - if snakemake is not None: + if snakemake: command += snakemake fqlist, sample_count = parse_fastq_inputs(inputs) @@ -73,29 +74,28 @@ def qc(inputs, output_dir, min_length, max_length, trim_adapters, deduplicate, d fetch_report(workflowdir, "bx_count.Rmd") os.makedirs(f"{output_dir}/logs/snakemake", exist_ok = True) sm_log = snakemake_log(output_dir, "qc") - + k,w,d,a = deconvolve + configs = { + "workflow" : "qc", + "snakemake_log" : sm_log, + "output_directory" : output_dir, + "trim_adapters" : trim_adapters, + "deduplicate" : deduplicate, + "min_len" : min_length, + "max_len" : max_length, + **({'extra': extra_params} if extra_params else {}), + **({'deconvolve': { + "kmer_length" : k, + "window_size" : w, + "density" : d, + "dropout" : a + }} if sum(deconvolve) > 0 else {}), + "skip_reports" : skip_reports, + "workflow_call" : command.rstrip(), + "inputs" : [i.as_posix() for i in fqlist] + } with open(f"{workflowdir}/config.yaml", "w", encoding="utf-8") as config: - config.write("workflow: qc\n") - config.write(f"snakemake_log: {sm_log}\n") - config.write(f"output_directory: {output_dir}\n") - config.write(f"trim_adapters: {trim_adapters}\n") - config.write(f"deduplicate: {deduplicate}\n") - if sum(deconvolve) > 0: - config.write("deconvolve:\n") - k,w,d,a = deconvolve - config.write(f" kmer_length: {k}\n") - config.write(f" window_size: {w}\n") - config.write(f" density: {d}\n") - config.write(f" dropout: {a}\n") - config.write(f"min_len: {min_length}\n") - config.write(f"max_len: {max_length}\n") - if extra_params: - config.write(f"extra: {extra_params}\n") - config.write(f"skip_reports: {skip_reports}\n") - config.write(f"workflow_call: {command}\n") - config.write("inputs:\n") - for i in fqlist: - config.write(f" - {i}\n") + yaml.dump(configs, config, default_flow_style= False, sort_keys=False) create_conda_recipes() if setup_only: @@ -109,4 +109,4 @@ def qc(inputs, output_dir, min_length, max_length, trim_adapters, deduplicate, d start_text.add_row("Deconvolve:", "yes" if sum(deconvolve) > 0 else "no") start_text.add_row("Output Folder:", f"{output_dir}/") start_text.add_row("Workflow Log:", sm_log.replace(f"{output_dir}/", "") + "[dim].gz") - launch_snakemake(command, "qc", start_text, output_dir, sm_log, quiet) + launch_snakemake(command, "qc", start_text, output_dir, sm_log, quiet, "workflow/qc.summary") diff --git a/harpy/reports/stitch_collate.Rmd b/harpy/reports/stitch_collate.Rmd index 461f6fece..67e7a69b9 100644 --- a/harpy/reports/stitch_collate.Rmd +++ b/harpy/reports/stitch_collate.Rmd @@ -21,6 +21,25 @@ sink(logfile) sink(logfile, type = "message") ``` + +```{css zoom-lib-src} +script src = "https://ajax.googleapis.com/ajax/libs/jquery/3.7.1/jquery.min.js" +``` + +```{js zoom-jquery} + $(document).ready(function() { + $('body').prepend('
<div class="zoomDiv"><img src="" class="zoomImg"></div>'); + // onClick function for all plots (img's) + $('img:not(.zoomImg)').click(function() { + $('.zoomImg').attr('src', $(this).attr('src')).css({width: '100%'}); + $('.zoomDiv').css({opacity: '1', width: 'auto', border: '1px solid white', borderRadius: '5px', position: 'fixed', top: '50%', left: '50%', marginRight: '-50%', transform: 'translate(-50%, -50%)', boxShadow: '0px 0px 50px #888888', zIndex: '50', overflow: 'auto', maxHeight: '100%'}); + }); + // onClick function for zoomImg + $('img.zoomImg').click(function() { + $('.zoomDiv').css({opacity: '0', width: '0%'}); + }); + }); +``` ```{r setup environment} using<-function(...) { libs<-unlist(list(...)) @@ -35,23 +54,21 @@ using("flexdashboard","tidyr","magrittr","DT") ``` ```{r load data} -infile <- snakemake@input[[1]] +infile <- snakemake@input$statsfile #infile <- "~/contig1.stats" dataL <- readLines(infile) bcf <- gsub(".stats$", ".bcf", basename(infile)) chrom <- gsub(".bcf", "", bcf) basedir <- dirname(normalizePath(infile)) -plotdir <- normalizePath(snakemake@input[[2]]) +plotdir <- normalizePath(snakemake@input$plotdir) plotdir <- paste0(plotdir, "/") -# first by folder -params <- unlist(strsplit(infile, "[/]")) -model <- gsub("model", "", params[length(params) - 6]) -usebx <- gsub("usebx", "", params[length(params) - 5]) -otherparams <- unlist(strsplit(params[length(params) - 4], "_")) -k <- gsub("k", "", otherparams[1]) -s <- gsub("s", "", otherparams[2]) -ngen <- gsub("ngen", "", otherparams[3]) -bxlimit <- gsub("bxlimit", "", otherparams[4]) +model <- snakemake@params$model +usebx <- snakemake@params$usebx +bxlimit <- snakemake@params$bxlimit +k <- snakemake@params$k +s <- snakemake@params$s +ngen <- snakemake@params$ngen +extra <- snakemake@params$extra ``` ## Contig: `r chrom` @@ -75,7 +92,7 @@ sn <- as.data.frame(t(sn[2])) rownames(sn) <- NULL ``` -**model**: `r model` | **use barcodes**: `r usebx` | **k**: `r k` | **s**: `r s` | **ngen**: `r ngen` | **bx limit**: `r bxlimit` +**model**: `r model` | **use barcodes**: `r usebx` | **k**: `r k` | **s**: `r s` | **ngen**: `r ngen` | **bx limit**: `r format(bxlimit, scientific = F)` | **other parameters**: `r extra` **SAMPLES**: `r sn$samples[1]` | **TOTAL RECORDS**: `r sn$records[1]` | **no-ALTs**: `r sn[3,1]` | **SNPs**: `r sn$SNPs[1]` | **MNPs**: `r sn$MNPs[1]` diff --git a/harpy/resume.py b/harpy/resume.py index 526212380..7ddaf34bb 100644 --- a/harpy/resume.py +++ b/harpy/resume.py @@ -13,7 +13,7 @@ from ._misc import snakemake_log from ._conda import create_conda_recipes -@click.command(no_args_is_help = True, context_settings=dict(allow_interspersed_args=False), epilog = "See the documentation for more information: https://pdimens.github.io/harpy/workflows/other") +@click.command(no_args_is_help = True, context_settings=dict(allow_interspersed_args=False), epilog = "Documentation: https://pdimens.github.io/harpy/workflows/other") @click.option('-c', '--conda', is_flag = True, default = False, help = 'Recreate the conda environments into .harpy_envs/') @click.option('-t', '--threads', type = click.IntRange(min = 2, max_open = True), help = 'Change the number of threads (>1)') @click.option('--quiet', is_flag = True, default = False, help = 'Don\'t show output text while running') diff --git a/harpy/scripts/stitch_impute.R b/harpy/scripts/stitch_impute.R index f8ccbd48f..52228581b 100755 --- a/harpy/scripts/stitch_impute.R +++ b/harpy/scripts/stitch_impute.R @@ -1,24 +1,24 @@ suppressWarnings(suppressPackageStartupMessages(library("STITCH"))) # Params pulled in from 
Snakemake -bamlist <- snakemake@input[["bamlist"]] -posfile <- snakemake@input[["infile"]] +bamlist <- snakemake@input$bamlist +posfile <- snakemake@input$infile chr <- gsub(".stitch", "", basename(posfile)) -outdir <- normalizePath(dirname(snakemake@output[[1]])) -outfile <- basename(snakemake@output[[1]]) -logfile <- file(snakemake@log[[1]], open = "wt") +outvcf <- snakemake@output$vcf +outdir <- normalizePath(dirname(outvcf)) +outfile <- basename(outvcf) +logfile <- file(snakemake@log$logfile, open = "wt") # create tmpdir if not already present tmpdr <- paste(outdir, "tmp", sep = "/") dir.create(file.path(tmpdr), showWarnings = FALSE) # model parameters -parameters <- snakemake@params[["parameters"]] -modeltype <- parameters$model -K <- parameters$k -S <- parameters$s -bx <- toupper(parameters$usebx) %in% c("TRUE", "YES", "Y") -bxlim <- parameters$bxlimit -nGenerations <- parameters$ngen +modeltype <- snakemake@params$model +K <- snakemake@params$k +S <- snakemake@params$s +bx <- as.logical(snakemake@params$usebx) +bxlim <- snakemake@params$bxlimit +nGenerations <- snakemake@params$ngen nCores <- snakemake@threads inputBundleBlockSize <- NA cli_args <- list( @@ -40,7 +40,7 @@ cli_args <- list( tempdir = tmpdr ) # if there are any extra arguments provided to harpy by the -x argument -extra <- snakemake@params[["extra"]] +extra <- snakemake@params$extra if(extra != ""){ # convert the extra arguments into proper R types # converts numbers to numeric, vectors to vectors, leaves strings as-is diff --git a/harpy/simulate.py b/harpy/simulate.py index b5566e1c0..cc1a2529a 100644 --- a/harpy/simulate.py +++ b/harpy/simulate.py @@ -1,6 +1,7 @@ """Harpy workflows to simulate genomic variants and linked-reads""" import os import sys +import yaml from pathlib import Path from rich import box from rich.table import Table @@ -121,7 +122,7 @@ def simulate(): ] } -@click.command(no_args_is_help = True, context_settings=dict(allow_interspersed_args=False), epilog = "See the documentation for more information: https://pdimens.github.io/harpy/workflows/simulate/simulate-linkedreads") +@click.command(no_args_is_help = True, context_settings=dict(allow_interspersed_args=False), epilog = "Documentation: https://pdimens.github.io/harpy/workflows/simulate/simulate-linkedreads") @click.option('-b', '--barcodes', type = click.Path(exists=True, dir_okay=False), help = "File of linked-read barcodes to add to reads") @click.option('-s', '--distance-sd', type = click.IntRange(min = 1), default = 15, show_default=True, help = "Standard deviation of read-pair distance") @click.option('-m', '--molecules-per', type = click.IntRange(min = 1), default = 10, show_default=True, help = "Average number of molecules per partition") @@ -159,7 +160,7 @@ def linkedreads(genome_hap1, genome_hap2, output_dir, outer_distance, mutation_r command += f"--configfile {workflowdir}/config.yaml " if hpc: command += f"--workflow-profile {hpc} " - if snakemake is not None: + if snakemake: command += snakemake check_fasta(genome_hap1, quiet) @@ -170,25 +171,27 @@ def linkedreads(genome_hap1, genome_hap2, output_dir, outer_distance, mutation_r fetch_script(workflowdir, "LRSIM_harpy.pl") os.makedirs(f"{output_dir}/logs/snakemake", exist_ok = True) sm_log = snakemake_log(output_dir, "simulate_linkedreads") - + configs = { + "workflow" : "simulate linkedreads", + "snakemake_log" : sm_log, + "output_directory" : output_dir, + "outer_distance" : outer_distance, + "distance_sd" : distance_sd, + "read_pairs" : read_pairs, + "mutation_rate" : 
mutation_rate, + "molecule_length" : molecule_length, + "partitions" : partitions, + "molecules_per_partition" : molecules_per, + "workflow_call" : command.rstrip(), + "inputs" : { + "genome_hap1" : Path(genome_hap1).resolve().as_posix(), + "genome_hap2" : Path(genome_hap2).resolve().as_posix(), + **({'barcodes': Path(barcodes).resolve().as_posix()} if barcodes else {}), + } + } with open(f"{workflowdir}/config.yaml", "w", encoding="utf-8") as config: - config.write("workflow: simulate linkedreads\n") - config.write(f"snakemake_log: {sm_log}\n") - config.write(f"output_directory: {output_dir}\n") - if barcodes: - config.write(f"barcodes: {Path(barcodes).resolve()}\n") - config.write(f"outer_distance: {outer_distance}\n") - config.write(f"distance_sd: {distance_sd}\n") - config.write(f"read_pairs: {read_pairs}\n") - config.write(f"mutation_rate: {mutation_rate}\n") - config.write(f"molecule_length: {molecule_length}\n") - config.write(f"partitions: {partitions}\n") - config.write(f"molecules_per_partition: {molecules_per}\n") - config.write(f"workflow_call: {command}\n") - config.write("inputs:\n") - config.write(f" genome_hap1: {Path(genome_hap1).resolve()}\n") - config.write(f" genome_hap2: {Path(genome_hap2).resolve()}\n") - + yaml.dump(configs, config, default_flow_style= False, sort_keys=False) + create_conda_recipes() if setup_only: sys.exit(0) @@ -201,7 +204,7 @@ def linkedreads(genome_hap1, genome_hap2, output_dir, outer_distance, mutation_r start_text.add_row("Barcodes:", os.path.basename(barcodes) if barcodes else "Barcodes: 10X Default") start_text.add_row("Output Folder:", output_dir + "/") start_text.add_row("Workflow Log:", sm_log.replace(f"{output_dir}/", "") + "[dim].gz") - launch_snakemake(command, "simulate_linkedreads", start_text, output_dir, sm_log, quiet) + launch_snakemake(command, "simulate_linkedreads", start_text, output_dir, sm_log, quiet, "workflow/simulate.reads.summary") @click.command(no_args_is_help = True, context_settings=dict(allow_interspersed_args=False), epilog = "This workflow can be quite technical, please read the docs for more information: https://pdimens.github.io/harpy/workflows/simulate/simulate-variants") @click.option('-s', '--snp-vcf', type=click.Path(exists=True, dir_okay=False, readable=True), help = 'VCF file of known snps to simulate') @@ -259,7 +262,7 @@ def snpindel(genome, snp_vcf, indel_vcf, only_vcf, output_dir, prefix, snp_count command += f"--configfile {workflowdir}/config.yaml " if hpc: command += f"--workflow-profile {hpc} " - if snakemake is not None: + if snakemake: command += snakemake start_text = Table(show_header=False,pad_edge=False, show_edge=False, padding = (0,0), box=box.SIMPLE) start_text.add_column("detail", justify="left", style="light_steel_blue", no_wrap=True) @@ -295,46 +298,49 @@ def snpindel(genome, snp_vcf, indel_vcf, only_vcf, output_dir, prefix, snp_count fetch_script(workflowdir, "simuG.pl") os.makedirs(f"{output_dir}/logs/snakemake", exist_ok = True) sm_log = snakemake_log(output_dir, "simulate_snpindel") - - # setup the config file depending on inputs + configs = { + "workflow" : "simulate snpindel", + "snakemake_log" : sm_log, + "output_directory" : output_dir, + "prefix" : prefix, + **({"random_seed" : randomseed} if randomseed else {}), + "heterozygosity" : { + "ratio" : heterozygosity, + "only_vcf" : only_vcf, + }, + "snp" : { + **({"vcf" : Path(snp_vcf).resolve().as_posix()} if snp_vcf else {}), + **({'count': snp_count} if snp_count and not snp_vcf else {}), + **({"gene_constraints": 
snp_gene_constraints} if snp_gene_constraints and not snp_vcf else {}), + **({"titv_ratio" : titv_ratio} if titv_ratio and not snp_vcf else {}) + }, + "indel" : { + **({"vcf" : Path(indel_vcf).resolve().as_posix()} if indel_vcf else {}), + **({"count" : indel_count} if indel_count and not indel_vcf else {}), + **({"indel_ratio" : indel_ratio} if indel_ratio and not indel_vcf else {}), + **({"size_alpha" : indel_size_alpha} if indel_size_alpha and not indel_vcf else {}), + **({"size_constant" : indel_size_constant} if indel_size_constant and not indel_vcf else {}) + }, + "workflow_call" : command.rstrip(), + "inputs" : { + "genome" : Path(genome).resolve().as_posix(), + **({"centromeres" : Path(centromeres).resolve().as_posix()} if centromeres else {}), + **({"genes" : Path(genes).resolve().as_posix()} if genes else {}), + **({"excluded_chromosomes" : Path(exclude_chr).resolve().as_posix()} if exclude_chr else {}) + } + } with open(f"{workflowdir}/config.yaml", "w", encoding="utf-8") as config: - config.write("workflow: simulate snpindel\n") - config.write(f"snakemake_log: {sm_log}\n") - config.write(f"output_directory: {output_dir}\n") - config.write(f"prefix: {prefix}\n") - config.write("heterozygosity:\n") - config.write(f" value: {heterozygosity}\n") - config.write(f" only_vcf: {only_vcf}\n") - if not snp_vcf: - config.write(f"snp_count: {snp_count}\n") if snp_count else None - config.write(f"snp_gene_constraints: {snp_gene_constraints}\n") if snp_gene_constraints else None - config.write(f"titv_ratio: {titv_ratio}\n") if titv_ratio else None - if not indel_vcf: - config.write(f"indel_count: {indel_count}\n") if indel_count else None - config.write(f"indel_ratio: {indel_ratio}\n") if indel_ratio else None - config.write(f"indel_size_alpha: {indel_size_alpha}\n") if indel_size_alpha else None - config.write(f"indel_size_constant: {indel_size_constant}\n") if indel_size_constant else None - config.write(f"random_seed: {randomseed}\n") if randomseed else None - config.write(f"workflow_call: {command}\n") - config.write("inputs:\n") - config.write(f" genome: {Path(genome).resolve()}\n") - if snp_vcf: - config.write(f" snp_vcf: {Path(snp_vcf).resolve()}\n") - if indel_vcf: - config.write(f" indel_vcf: {Path(indel_vcf).resolve()}\n") - config.write(f" centromeres: {Path(centromeres).resolve()}\n") if centromeres else None - config.write(f" genes: {Path(genes).resolve()}\n") if genes else None - config.write(f" exclude_chr: {Path(exclude_chr).resolve()}\n") if exclude_chr else None - + yaml.dump(configs, config, default_flow_style= False, sort_keys=False) + create_conda_recipes() if setup_only: sys.exit(0) start_text.add_row("Output Folder:", output_dir + "/") start_text.add_row("Workflow Log:", sm_log.replace(f"{output_dir}/", "") + "[dim].gz") - launch_snakemake(command, "simulate_snpindel", start_text, output_dir, sm_log, quiet) + launch_snakemake(command, "simulate_snpindel", start_text, output_dir, sm_log, quiet, "workflow/simulate.snpindel.summary") -@click.command(no_args_is_help = True, context_settings=dict(allow_interspersed_args=False), epilog = "Please See the documentation for more information: https://pdimens.github.io/harpy/workflows/simulate/simulate-variants") +@click.command(no_args_is_help = True, context_settings=dict(allow_interspersed_args=False), epilog = "Documentation: https://pdimens.github.io/harpy/workflows/simulate/simulate-variants") @click.option('-v', '--vcf', type=click.Path(exists=True, dir_okay=False, readable=True), help = 'VCF file of known 
inversions to simulate') @click.option('-n', '--count', type = click.IntRange(min = 0), default=0, show_default=False, help = "Number of random inversions to simulate") @click.option('-m', '--min-size', type = click.IntRange(min = 1), default = 1000, show_default= True, help = "Minimum inversion size (bp)") @@ -375,7 +381,7 @@ def inversion(genome, vcf, only_vcf, prefix, output_dir, count, min_size, max_si command += f"--configfile {workflowdir}/config.yaml " if hpc: command += f"--workflow-profile {hpc} " - if snakemake is not None: + if snakemake: command += snakemake # instantiate workflow directory @@ -407,42 +413,43 @@ def inversion(genome, vcf, only_vcf, prefix, output_dir, count, min_size, max_si fetch_script(workflowdir, "simuG.pl") os.makedirs(f"{output_dir}/logs/snakemake", exist_ok = True) sm_log = snakemake_log(output_dir, "simulate_inversion") - - # setup the config file depending on inputs + configs = { + "workflow" : "simulate inversion", + "snakemake_log" : sm_log, + "output_directory" : output_dir, + "prefix" : prefix, + **({"random_seed" : randomseed} if randomseed else {}), + "heterozygosity" : { + "ratio" : heterozygosity, + "only_vcf" : only_vcf, + }, + "inversion" : { + **({"vcf" : Path(vcf).resolve().as_posix()} if vcf else {}), + **({'count': count} if count and not vcf else {}), + **({"min_size": min_size} if min_size and not vcf else {}), + **({"max_size" : max_size} if max_size and not vcf else {}) + }, + "workflow_call" : command.rstrip(), + "inputs" : { + "genome" : Path(genome).resolve().as_posix(), + **({"centromeres" : Path(centromeres).resolve().as_posix()} if centromeres else {}), + **({"genes" : Path(genes).resolve().as_posix()} if genes else {}), + **({"excluded_chromosomes" : Path(exclude_chr).resolve().as_posix()} if exclude_chr else {}) + } + } with open(f"{workflowdir}/config.yaml", "w", encoding="utf-8") as config: - config.write("workflow: simulate inversion\n") - config.write(f"snakemake_log: {sm_log}\n") - config.write(f"output_directory: {output_dir}\n") - config.write("variant_type: inversion\n") - config.write(f"prefix: {prefix}\n") - config.write("heterozygosity:\n") - config.write(f" value: {heterozygosity}\n") - config.write(f" only_vcf: {only_vcf}\n") - config.write(f"random_seed: {randomseed}\n") if randomseed else None - if not vcf: - config.write(f"count: {count}\n") - config.write(f"min_size: {min_size}\n") if min_size else None - config.write(f"max_size: {max_size}\n") if max_size else None - config.write(f"workflow_call: {command}\n") - config.write("inputs:\n") - config.write(f" genome: {Path(genome).resolve()}\n") - if vcf: - config.write(f" vcf: {Path(vcf).resolve()}\n") - else: - config.write(f" centromeres: {Path(centromeres).resolve()}\n") if centromeres else None - config.write(f" genes: {Path(genes).resolve()}\n") if genes else None - config.write(f" exclude_chr: {Path(exclude_chr).resolve()}\n") if exclude_chr else None - + yaml.dump(configs, config, default_flow_style= False, sort_keys=False) + create_conda_recipes() if setup_only: sys.exit(0) start_text.add_row("Output Folder:", output_dir + "/") start_text.add_row("Workflow Log:", sm_log.replace(f"{output_dir}/", "") + "[dim].gz") - launch_snakemake(command, "simulate_inversion", start_text, output_dir, sm_log, quiet) + launch_snakemake(command, "simulate_inversion", start_text, output_dir, sm_log, quiet, "workflow/simulate.inversion.summary") -@click.command(no_args_is_help = True, context_settings=dict(allow_interspersed_args=False), epilog = "Please See the 
documentation for more information: https://pdimens.github.io/harpy/workflows/simulate/simulate-variants") +@click.command(no_args_is_help = True, context_settings=dict(allow_interspersed_args=False), epilog = "Documentation: https://pdimens.github.io/harpy/workflows/simulate/simulate-variants") @click.option('-v', '--vcf', type=click.Path(exists=True, dir_okay=False, readable=True), help = 'VCF file of known copy number variants to simulate') @click.option('-n', '--count', type = click.IntRange(min = 0), default=0, show_default=False, help = "Number of random variants to simulate") @click.option('-m', '--min-size', type = click.IntRange(min = 1), default = 1000, show_default= True, help = "Minimum variant size (bp)") @@ -492,7 +499,7 @@ def cnv(genome, output_dir, vcf, only_vcf, prefix, count, min_size, max_size, du command += f"--configfile {workflowdir}/config.yaml " if hpc: command += f"--workflow-profile {hpc} " - if snakemake is not None: + if snakemake: command += snakemake # instantiate workflow directory @@ -524,44 +531,45 @@ def cnv(genome, output_dir, vcf, only_vcf, prefix, count, min_size, max_size, du fetch_script(workflowdir, "simuG.pl") os.makedirs(f"{output_dir}/logs/snakemake", exist_ok = True) sm_log = snakemake_log(output_dir, "simulate_cnv") - - # setup the config file depending on inputs + configs = { + "workflow" : "simulate cnv", + "snakemake_log" : sm_log, + "output_directory" : output_dir, + "prefix" : prefix, + **({"random_seed" : randomseed} if randomseed else {}), + "heterozygosity" : { + "ratio" : heterozygosity, + "only_vcf" : only_vcf, + }, + "cnv" : { + **({"vcf" : Path(vcf).resolve().as_posix()} if vcf else {}), + **({'count': count} if count and not vcf else {}), + **({"min_size": min_size} if min_size and not vcf else {}), + **({"max_size" : max_size} if max_size and not vcf else {}), + **({"duplication_ratio" : dup_ratio} if dup_ratio and not vcf else {}), + **({"max_copy" : max_copy} if max_copy and not vcf else {}), + **({"gain_ratio" : gain_ratio} if gain_ratio and not vcf else {}) + }, + "workflow_call" : command.rstrip(), + "inputs" : { + "genome" : Path(genome).resolve().as_posix(), + **({"centromeres" : Path(centromeres).resolve().as_posix()} if centromeres else {}), + **({"genes" : Path(genes).resolve().as_posix()} if genes else {}), + **({"excluded_chromosomes" : Path(exclude_chr).resolve().as_posix()} if exclude_chr else {}) + } + } with open(f"{workflowdir}/config.yaml", "w", encoding="utf-8") as config: - config.write("workflow: simulate cnv\n") - config.write(f"snakemake_log: {sm_log}\n") - config.write(f"output_directory: {output_dir}\n") - config.write("variant_type: cnv\n") - config.write(f"prefix: {prefix}\n") - config.write(f"random_seed: {randomseed}\n") if randomseed else None - config.write("heterozygosity:\n") - config.write(f" value: {heterozygosity}\n") - config.write(f" only_vcf: {only_vcf}\n") - if not vcf: - config.write(f"count: {count}\n") - config.write(f"min_size: {min_size}\n") if min_size else None - config.write(f"max_size: {max_size}\n") if max_size else None - config.write(f"dup_ratio: {dup_ratio}\n") if dup_ratio else None - config.write(f"cnv_max_copy: {max_copy}\n") if max_copy else None - config.write(f"gain_ratio: {gain_ratio}\n") if gain_ratio else None - config.write(f"workflow_call: {command}\n") - config.write("inputs:\n") - config.write(f" genome: {Path(genome).resolve()}\n") - if vcf: - config.write(f" vcf: {Path(vcf).resolve()}\n") - else: - config.write(f" centromeres: 
{Path(centromeres).resolve()}\n") if centromeres else None - config.write(f" genes: {Path(genes).resolve()}\n") if genes else None - config.write(f" exclude_chr: {Path(exclude_chr).resolve()}\n") if exclude_chr else None - + yaml.dump(configs, config, default_flow_style= False, sort_keys=False) + create_conda_recipes() if setup_only: sys.exit(0) start_text.add_row("Output Folder:", output_dir + "/") start_text.add_row("Workflow Log:", sm_log.replace(f"{output_dir}/", "") + "[dim].gz") - launch_snakemake(command, "simulate_cnv", start_text, output_dir, sm_log, quiet) + launch_snakemake(command, "simulate_cnv", start_text, output_dir, sm_log, quiet, "workflow/simulate.cnv.summary") -@click.command(no_args_is_help = True, context_settings=dict(allow_interspersed_args=False), epilog = "Please See the documentation for more information: https://pdimens.github.io/harpy/workflows/simulate/simulate-variants") +@click.command(no_args_is_help = True, context_settings=dict(allow_interspersed_args=False), epilog = "Documentation: https://pdimens.github.io/harpy/workflows/simulate/simulate-variants") @click.option('-v', '--vcf', type=click.Path(exists=True, dir_okay=False, readable=True), help = 'VCF file of known translocations to simulate') @click.option('-n', '--count', type = click.IntRange(min = 0), default=0, show_default=False, help = "Number of random translocations to simulate") @click.option('-c', '--centromeres', type = click.Path(exists=True, dir_okay=False, readable=True), help = "GFF3 file of centromeres to avoid") @@ -599,7 +607,7 @@ def translocation(genome, output_dir, prefix, vcf, only_vcf, count, centromeres, command += f"--configfile {workflowdir}/config.yaml " if hpc: command += f"--workflow-profile {hpc} " - if snakemake is not None: + if snakemake: command += snakemake # instantiate workflow directory @@ -631,37 +639,39 @@ def translocation(genome, output_dir, prefix, vcf, only_vcf, count, centromeres, fetch_script(workflowdir, "simuG.pl") os.makedirs(f"{output_dir}/logs/snakemake", exist_ok = True) sm_log = snakemake_log(output_dir, "simulate_translocation") - + configs = { + "workflow" : "simulate translocation", + "snakemake_log" : sm_log, + "output_directory" : output_dir, + "prefix" : prefix, + **({"random_seed" : randomseed} if randomseed else {}), + "heterozygosity" : { + "ratio" : heterozygosity, + "only_vcf" : only_vcf, + }, + "translocation" : { + **({"vcf" : Path(vcf).resolve().as_posix()} if vcf else {}), + **({'count': count} if count and not vcf else {}), + }, + "workflow_call" : command.rstrip(), + "inputs" : { + "genome" : Path(genome).resolve().as_posix(), + **({"centromeres" : Path(centromeres).resolve().as_posix()} if centromeres else {}), + **({"genes" : Path(genes).resolve().as_posix()} if genes else {}), + **({"excluded_chromosomes" : Path(exclude_chr).resolve().as_posix()} if exclude_chr else {}) + } + } # setup the config file depending on inputs with open(f"{workflowdir}/config.yaml", "w", encoding="utf-8") as config: - config.write("workflow: simulate translocation\n") - config.write(f"snakemake_log: {sm_log}\n") - config.write(f"output_directory: {output_dir}\n") - config.write("variant_type: translocation\n") - config.write(f"prefix: {prefix}\n") - config.write(f"random_seed: {randomseed}\n") if randomseed else None - if not vcf: - config.write(f"count: {count}\n") - config.write("heterozygosity:\n") - config.write(f" value: {heterozygosity}\n") - config.write(f" only_vcf: {only_vcf}\n") - config.write(f"workflow_call: {command}\n") - 
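# Note on the input paths in the configs above: each one is normalized with
# Path(...).resolve().as_posix() so the YAML always records an absolute path
# with forward slashes, on any platform. A tiny illustration of the idiom
# (the file name here is hypothetical):
from pathlib import Path

genome = "test/genome/genome.fasta"
# .resolve() yields an absolute path; .as_posix() renders it with "/" separators
print(Path(genome).resolve().as_posix())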
config.write("inputs:\n") - config.write(f" genome: {Path(genome).resolve()}\n") - if vcf: - config.write(f" vcf: {Path(vcf).resolve()}\n") - else: - config.write(f" centromeres: {Path(centromeres).resolve()}\n") if centromeres else None - config.write(f" genes: {Path(genes).resolve()}\n") if genes else None - config.write(f" exclude_chr: {Path(exclude_chr).resolve()}\n") if exclude_chr else None - + yaml.dump(configs, config, default_flow_style= False, sort_keys=False) + create_conda_recipes() if setup_only: sys.exit(0) start_text.add_row("Output Folder:", output_dir + "/") start_text.add_row("Workflow Log:", sm_log.replace(f"{output_dir}/", "") + "[dim].gz") - launch_snakemake(command, "simulate_translocation", start_text, output_dir, sm_log, quiet) + launch_snakemake(command, "simulate_translocation", start_text, output_dir, sm_log, quiet, "workflow/simulate.translocation.summary") simulate.add_command(linkedreads) simulate.add_command(snpindel) diff --git a/harpy/snakefiles/align_bwa.smk b/harpy/snakefiles/align_bwa.smk index f2ec94cf5..1eda8e5d3 100644 --- a/harpy/snakefiles/align_bwa.smk +++ b/harpy/snakefiles/align_bwa.smk @@ -14,7 +14,7 @@ wildcard_constraints: sample = "[a-zA-Z0-9._-]+" outdir = config["output_directory"] -envdir = os.getcwd() + "/.harpy_envs" +envdir = os.path.join(os.getcwd(), ".harpy_envs") genomefile = config["inputs"]["genome"] fqlist = config["inputs"]["fastq"] molecule_distance = config["molecule_distance"] @@ -173,6 +173,19 @@ rule barcode_stats: shell: "bx_stats.py -o {output} {input.bam}" +rule molecule_coverage: + input: + stats = outdir + "/reports/data/bxstats/{sample}.bxstats.gz", + fai = f"Genome/{bn}.fai" + output: + outdir + "/reports/data/coverage/{sample}.molcov.gz" + params: + windowsize + container: + None + shell: + "molecule_coverage.py -f {input.fai} {input.stats} | depth_windows.py {params} | gzip > {output}" + rule calculate_depth: input: bam = outdir + "/{sample}.bam", @@ -188,8 +201,9 @@ rule calculate_depth: rule sample_reports: input: - outdir + "/reports/data/bxstats/{sample}.bxstats.gz", - outdir + "/reports/data/coverage/{sample}.cov.gz" + bxstats = outdir + "/reports/data/bxstats/{sample}.bxstats.gz", + coverage = outdir + "/reports/data/coverage/{sample}.cov.gz", + molecule_coverage = outdir + "/reports/data/coverage/{sample}.molcov.gz" output: outdir + "/reports/{sample}.html" log: @@ -251,28 +265,24 @@ rule workflow_summary: agg_report = outdir + "/reports/bwa.stats.html" if not skip_reports else [], bx_report = outdir + "/reports/barcodes.summary.html" if (not skip_reports or len(samplenames) == 1) else [] params: - genomefile = genomefile, quality = config["alignment_quality"], unmapped = "" if keep_unmapped else "-F 4", extra = extra run: - summary_template = f""" -The harpy align bwa workflow ran using these parameters: - -The provided genome: {{params.genomefile}} - -Sequences were aligned with BWA using: - bwa mem -C -v 2 {{params.extra}} -R "@RG\\tID:SAMPLE\\tSM:SAMPLE" genome forward_reads reverse_reads | - samtools view -h {{params.unmapped}} -q {{params.quality}} - -Duplicates in the alignments were marked following: - samtools collate | - samtools fixmate | - samtools sort -T SAMPLE --reference {{params.genomefile}} -m 2000M | - samtools markdup -S --barcode-tag BX - -The Snakemake workflow was called via command line: - {config["workflow_call"]} -""" + summary = ["The harpy align bwa workflow ran using these parameters:"] + summary.append(f"The provided genome: {genomefile}") + align = "Sequences were aligned with 
BWA using:\n" + align += f"\tbwa mem -C -v 2 {params.extra} -R \"@RG\\tID:SAMPLE\\tSM:SAMPLE\" genome forward_reads reverse_reads |\n" + align += f"\tsamtools view -h {params.unmapped} -q {params.quality}" + summary.append(align) + duplicates = "Duplicates in the alignments were marked following:\n" + duplicates += "\tsamtools collate |\n" + duplicates += "\tsamtools fixmate |\n" + duplicates += f"\tsamtools sort -T SAMPLE --reference {genomefile} -m 2000M |\n" + duplicates += "\tsamtools markdup -S --barcode-tag BX" + summary.append(duplicates) + sm = "The Snakemake workflow was called via command line:\n" + sm += f"\t{config['workflow_call']}" + summary.append(sm) with open(outdir + "/workflow/align.bwa.summary", "w") as f: - f.write(summary_template.format(params=params)) \ No newline at end of file + f.write("\n\n".join(summary)) \ No newline at end of file diff --git a/harpy/snakefiles/align_ema.smk b/harpy/snakefiles/align_ema.smk index 9f4ab6a59..7d7da4596 100644 --- a/harpy/snakefiles/align_ema.smk +++ b/harpy/snakefiles/align_ema.smk @@ -24,7 +24,7 @@ extra = config.get("extra", "") bn = os.path.basename(genomefile) genome_zip = True if bn.lower().endswith(".gz") else False bn_idx = f"{bn}.gzi" if genome_zip else f"{bn}.fai" -envdir = os.getcwd() + "/.harpy_envs" +envdir = os.path.join(os.getcwd(), ".harpy_envs") windowsize = config["depth_windowsize"] keep_unmapped = config["keep_unmapped"] skip_reports = config["skip_reports"] @@ -147,7 +147,7 @@ rule align_ema: params: bxtype = f"-p {platform}", tmpdir = lambda wc: outdir + "/." + d[wc.sample], - quality = config["quality"], + quality = config["alignment_quality"], unmapped = "" if keep_unmapped else "-F 4", extra = extra threads: @@ -173,7 +173,7 @@ rule align_bwa: log: outdir + "/logs/align/{sample}.bwa.align.log" params: - quality = config["quality"], + quality = config["alignment_quality"], unmapped = "" if keep_unmapped else "-F 4" benchmark: ".Benchmark/Mapping/ema/bwaAlign.{sample}.txt" @@ -335,41 +335,37 @@ rule workflow_summary: beadtech = "-p" if platform == "haplotag" else f"-w {barcode_list}", unmapped = "" if keep_unmapped else "-F 4" run: - summary_template= f""" -The harpy align ema workflow ran using these parameters: - -The provided genome: {bn} - -Barcodes were counted and validated with EMA using: - seqtk mergepe forward.fq.gz reverse.fq.gz | ema count {{params.beadtech}} - -Barcoded sequences were binned with EMA using: - seqtk mergepe forward.fq.gz reverse.fq.gz | ema preproc {{params.beadtech}} -n {nbins} - -Barcoded bins were aligned with ema align using: - ema align " + extra + " -d -p " + platform + " -R \"@RG\\tID:SAMPLE\\tSM:SAMPLE\" | - samtools view -h {{params.unmapped}} -q " + str(config["quality"]) + " - | - samtools sort --reference genome -m 2000M - -Invalid/non barcoded sequences were aligned with BWA using: - bwa mem -C -v2 -R \"@RG\\tID:SAMPLE\\tSM:SAMPLE\" genome forward_reads reverse_reads - samtools view -h {{params.unmapped}} -q " + str(config["quality"]) + " - | - samtools sort --reference genome -m 2000M - -Duplicates in non-barcoded alignments were marked following: - samtools collate | - samtools fixmate | - samtools sort -m 2000M | - samtools markdup -S - -Alignments were merged using: - samtools cat barcode.bam nobarcode.bam > concat.bam - -Merged alignments were sorted using: - samtools sort -m 2000M concat.bam - -The Snakemake workflow was called via command line: - {config["workflow_call"]} -""" - with open(outdir + "/workflow/align.ema.summary", "w") as f: - 
f.write(summary_template.format(params=params)) + summary = ["The harpy align ema workflow ran using these parameters:"] + summary.append(f"The provided genome: {genomefile}") + counts = "Barcodes were counted and validated with EMA using:\n" + counts += f"\tseqtk mergepe forward.fq.gz reverse.fq.gz | ema count {params.beadtech}" + summary.append(counts) + bins = "Barcoded sequences were binned with EMA using:\n" + bins += f"\tseqtk mergepe forward.fq.gz reverse.fq.gz | ema preproc {params.beadtech} -n {nbins}" + summary.append(bins) + ema_align = "Barcoded bins were aligned with ema align using:\n" + ema_align += f"\tema align {extra} -d -p {platform} -R \"@RG\\tID:SAMPLE\\tSM:SAMPLE\" |\n" + ema_align += f"\tsamtools view -h {params.unmapped} -q {config['alignment_quality']} - |\n" + ema_align += "\tsamtools sort --reference genome" + summary.append(ema_align) + bwa_align = "Non-barcoded and invalid-barcoded sequences were aligned with BWA using:\n" + bwa_align += "\tbwa mem -C -v 2 -R \"@RG\\tID:SAMPLE\\tSM:SAMPLE\" genome forward_reads reverse_reads |\n" + bwa_align += f"\tsamtools view -h {params.unmapped} -q {config['alignment_quality']}" + summary.append(bwa_align) + duplicates = "Duplicates in non-barcoded alignments were marked following:\n" + duplicates += "\tsamtools collate |\n" + duplicates += "\tsamtools fixmate |\n" + duplicates += f"\tsamtools sort -T SAMPLE --reference {genomefile} |\n" + duplicates += "\tsamtools markdup -S" + summary.append(duplicates) + merged = "Alignments were merged using:\n" + merged += "\tsamtools cat barcode.bam nobarcode.bam > concat.bam" + summary.append(merged) + sorting = "Merged alignments were sorted using:\n" + sorting += "\tsamtools sort -m 2000M concat.bam" + summary.append(sorting) + sm = "The Snakemake workflow was called via command line:\n" + sm += f"\t{config['workflow_call']}" + summary.append(sm) + with open(outdir + "/workflow/align.ema.summary", "w") as f: + f.write("\n\n".join(summary)) diff --git a/harpy/snakefiles/align_strobealign.smk b/harpy/snakefiles/align_strobealign.smk index 91298e9df..f98e32f24 100644 --- a/harpy/snakefiles/align_strobealign.smk +++ b/harpy/snakefiles/align_strobealign.smk @@ -14,7 +14,7 @@ wildcard_constraints: sample = "[a-zA-Z0-9._-]+" outdir = config["output_directory"] -envdir = os.getcwd() + "/.harpy_envs" +envdir = os.path.join(os.getcwd(), ".harpy_envs") genomefile = config["inputs"]["genome"] fqlist = config["inputs"]["fastq"] extra = config.get("extra", "") @@ -86,7 +86,7 @@ rule align: params: samps = lambda wc: d[wc.get("sample")], readlen = "" if autolen else f"--use-index -r {readlen}", - quality = config["quality"], + quality = config["alignment_quality"], unmapped_strobe = "" if keep_unmapped else "-U", unmapped = "" if keep_unmapped else "-F 4", extra = extra @@ -244,35 +244,31 @@ rule workflow_summary: bx_report = outdir + "/reports/barcodes.summary.html" if (not skip_reports or len(samplenames) == 1) else [] params: readlen = readlen, - quality = config["quality"], + quality = config["alignment_quality"], unmapped_strobe = "" if keep_unmapped else "-U", unmapped = "" if keep_unmapped else "-F 4", extra = extra run: + summary = ["The harpy align strobe workflow ran using these parameters:"] + summary.append(f"The provided genome: {genomefile}") + align = "Sequences were aligned with strobealign using:\n" if autolen: - strobetext = f"Sequences were aligned with strobealign using:\n" - strobetext += f" strobealign -U -C --rg-id=SAMPLE --rg=SM:SAMPLE {params.extra} genome reads.F.fq 
reads.R.fq |\n" + align += f"\tstrobealign -U -C --rg-id=SAMPLE --rg=SM:SAMPLE {params.extra} genome reads.F.fq reads.R.fq |\n" else: - strobetext = "The genome index was created using:\n" - strobetext += f" strobealign --create-index -r {params.readlen} genome\n\n" - strobetext += "Sequences were aligned with strobealign using:\n" - strobetext += f" strobealign --use-index {params.unmapped_strobe} -N 2 -C --rg=SM:SAMPLE {params.extra} genome reads.F.fq reads.R.fq |\n" - summary_template = f""" -The harpy align strobealign workflow ran using these parameters: - -The provided genome: {bn} - -{strobetext} - samtools view -h {params.unmapped} -q {params.quality} - -Duplicates in the alignments were marked following: - samtools collate | - samtools fixmate | - samtools sort -m 2000M | - samtools markdup -S --barcode-tag BX - -The Snakemake workflow was called via command line: - {config["workflow_call"]} -""" + align = "The genome index was created using:\n" + align += f"\tstrobealign --create-index -r {params.readlen} genome\n\n" + align += "Sequences were aligned with strobealign using:\n" + align += f"\tstrobealign --use-index {params.unmapped_strobe} -N 2 -C --rg=SM:SAMPLE {params.extra} genome reads.F.fq reads.R.fq |\n" + align += f"\t\tsamtools view -h {params.unmapped} -q {params.quality}" + summary.append(align) + duplicates = "Duplicates in the alignments were marked following:\n" + duplicates += "\tsamtools collate |\n" + duplicates += "\tsamtools fixmate |\n" + duplicates += f"\tsamtools sort -T SAMPLE --reference {genomefile} -m 2000M |\n" + duplicates += "\tsamtools markdup -S --barcode-tag BX" + summary.append(duplicates) + sm = "The Snakemake workflow was called via command line:\n" + sm += f"\t{config['workflow_call']}" + summary.append(sm) with open(outdir + "/workflow/align.strobealign.summary", "w") as f: - f.write(summary_template) \ No newline at end of file + f.write("\n\n".join(summary)) \ No newline at end of file diff --git a/harpy/snakefiles/assembly.smk b/harpy/snakefiles/assembly.smk index c8010fe68..d4d344c37 100644 --- a/harpy/snakefiles/assembly.smk +++ b/harpy/snakefiles/assembly.smk @@ -1,3 +1,5 @@ +containerized: "docker://pdimens/harpy:latest" + import os import logging @@ -11,23 +13,35 @@ onerror: FQ1 = config["inputs"]["fastq_r1"] FQ2 = config["inputs"]["fastq_r2"] outdir = config["output_directory"] -envdir = os.getcwd() + "/.harpy_envs" -max_mem = config["spades"]["max_memory"] -k_param = config["spades"]["k"] -extra = config["spades"].get("extra", "") +envdir = os.path.join(os.getcwd(), ".harpy_envs") +# SPADES +max_mem = config["spades"]["max_memory"] +k_param = config["spades"]["k"] +spades_extra = config["spades"].get("extra", "") +# ARCS +mapq = config["tigmint"]["minimum_mapping_quality"] +mismatch = config["tigmint"]["mismatch"] +mol_dist = config["tigmint"]["molecule_distance"] +mol_len = config["tigmint"]["molecule_length"] +span = config["tigmint"]["span"] +min_align = config["arcs"]["minimum_aligned_reads"] +min_contig = config["arcs"]["minimum_contig_length"] +seq_id = config["arcs"]["minimum_sequence_identity"] +arcs_extra = config["arcs"].get("extra", "") +links = config["links"]["minimum_links"] rule cloudspades: input: FQ_R1 = FQ1, FQ_R2 = FQ2 output: - f"{outdir}/contigs.fasta", - f"{outdir}/scaffolds.fasta" + f"{outdir}/spades/contigs.fasta", + f"{outdir}/spades/scaffolds.fasta" params: - outdir = outdir, + outdir = f"{outdir}/spades", k = k_param, mem = max_mem // 1000, - extra = extra + extra = spades_extra log: outdir + 
"/logs/assembly.log" conda: @@ -39,23 +53,95 @@ rule cloudspades: shell: "spades.py -t {threads} -m {params.mem} -k {params.k} {params.extra} --gemcode1-1 {input.FQ_R1} --gemcode1-2 {input.FQ_R2} -o {params.outdir} --isolate > {log}" +rule interleave_fastq: + input: + FQ1, + FQ2 + output: + temp(f"{outdir}/scaffold/interleaved.fq.gz") + container: + None + shell: + "seqtk mergepe {input} | bgzip > {output}" + +rule link_assembly: + input: + f"{outdir}/spades/scaffolds.fasta", + output: + f"{outdir}/scaffold/spades.fa" + container: + None + shell: + "ln -sr {input} {output}" + +rule scaffolding: + input: + asm = f"{outdir}/scaffold/spades.fa", + reads = f"{outdir}/scaffold/interleaved.fq.gz" + output: + f"{outdir}/scaffolds.fasta" + log: + outdir + "/logs/scaffolding.log" + threads: + workflow.cores + params: + workdir = f"{outdir}/scaffold", + threads = f"-j {workflow.cores}", + draft_asm = f"draft=spades", + reads = f"reads=interleaved", + bwa_threads = f"t={workflow.cores}", + min_mapq = f"mapq={mapq}", + max_mismatch = f"nm={mismatch}", + moldist = f"dist={mol_dist}", + min_length = f"minsize={mol_len}", + span = f"span={span}", + min_perbarcod = f"c={min_align}", + min_contig = f"z={min_contig}", + min_seqid = f"s={seq_id}", + min_links = f"l={links}", + prefix = "base_name=scaffolds", + extra = arcs_extra + conda: + f"{envdir}/assembly.yaml" + shell: + """ + arcs-make arcs-tigmint -C {params} 2> {log} + mv {params.workdir}/spades.tigmint*.scaffolds.fa {output} + """ + rule workflow_summary: default_target: True input: - f"{outdir}/athena/athena.asm.fa" + f"{outdir}/scaffolds.fasta" params: k_param = k_param, max_mem = max_mem // 1000, - extra = extra + spades_extra = spades_extra, + workdir = f"-C {outdir}/scaffold", + threads = f"-j {workflow.cores}", + draft_asm = f"draft=spades", + reads = f"reads=interleaved", + bwa_threads = f"t={workflow.cores}", + min_mapq = f"mapq={mapq}", + max_mismatch = f"nm={mismatch}", + moldist = f"dist={mol_dist}", + min_length = f"minsize={mol_len}", + span = f"span={span}", + min_perbarcod = f"c={min_align}", + min_contig = f"z={min_contig}", + min_seqid = f"s={seq_id}", + min_links = f"l={links}", + arcs_extra = arcs_extra run: - summary_template = f""" -The harpy assemble workflow ran using these parameters: - -Reads were assembled using cloudspades: - spades.py -t THREADS -m {params.max_mem} --gemcode1-1 FQ1 --gemcode1-2 FQ2 --isolate -k {params.k_param} {params.extra} - -The Snakemake workflow was called via command line: - {config["workflow_call"]} -""" - with open(outdir + "/workflow/metassembly.summary", "w") as f: - f.write(summary_template) \ No newline at end of file + summary = ["The harpy assemble workflow ran using these parameters:"] + spades = "Reads were assembled using cloudspades:\n" + spades += f"\tspades.py -t THREADS -m {params.max_mem} --gemcode1-1 FQ1 --gemcode1-2 FQ2 --isolate -k {params.k_param} {params.spades_extra}" + summary.append(spades) + arcs = "The draft assembly was error corrected and scaffolded with Tigmint/ARCS/LINKS:\n" + arcs += f"\tarcs-make arcs-tigmint {" ".join(params[3:])}" + summary.append(arcs) + sm = "The Snakemake workflow was called via command line:\n" + sm += f"\t{config['workflow_call']}" + summary.append(sm) + with open(outdir + "/workflow/assembly.summary", "w") as f: + f.write("\n\n".join(summary)) \ No newline at end of file diff --git a/harpy/snakefiles/containerize.smk b/harpy/snakefiles/containerize.smk index 687affa80..f70ef2d92 100644 --- a/harpy/snakefiles/containerize.smk +++ 
diff --git a/harpy/snakefiles/containerize.smk b/harpy/snakefiles/containerize.smk
index 687affa80..f70ef2d92 100644
--- a/harpy/snakefiles/containerize.smk
+++ b/harpy/snakefiles/containerize.smk
@@ -6,7 +6,7 @@ onsuccess:
 rule all:
     input:
-        collect("{conda}.env", conda = ["align", "metassembly", "phase", "qc", "r", "simulations", "snp", "stitch", "sv"])
+        collect("{conda}.env", conda = ["align", "assembly", "metassembly", "phase", "qc", "r", "simulations", "snp", "stitch", "sv"])
 
 rule qc:
     output: "qc.env"
diff --git a/harpy/snakefiles/deconvolve.smk b/harpy/snakefiles/deconvolve.smk
index 6789613ab..d1a16e5e7 100644
--- a/harpy/snakefiles/deconvolve.smk
+++ b/harpy/snakefiles/deconvolve.smk
@@ -13,7 +13,7 @@ onerror:
 wildcard_constraints:
     sample = "[a-zA-Z0-9._-]+"
 
-envdir = os.getcwd() + "/.harpy_envs"
+envdir = os.path.join(os.getcwd(), ".harpy_envs")
 fqlist = config["inputs"]
 outdir = config["output_directory"]
 kmer_length = config["kmer_length"]
@@ -86,21 +86,19 @@ rule workflow_summary:
     input:
         collect(outdir + "/{sample}.{FR}.fq.gz", FR = ["R1", "R2"], sample = samplenames),
     run:
-        summary_template = f"""
-The harpy deconvolve workflow ran using these parameters:
-
-fastq files were interleaved with seqtk:
-    seqtk mergepe forward.fq reverse.fq
-
-Deconvolution occurred using QuickDeconvolution:
-    QuickDeconvolution -t threads -i infile.fq -o output.fq -k {kmer_length} -w {window_size} -d {density} -a {dropout}
-
-The interleaved output was split back into forward and reverse reads with seqtk:
-    seqtk -1 interleaved.fq | gzip > file.R1.fq.gz
-    seqtk -2 interleaved.fq | gzip > file.R2.fq.gz
-
-The Snakemake workflow was called via command line:
-    {config["workflow_call"]}
-"""
+        summary = ["The harpy deconvolve workflow ran using these parameters:"]
+        interleave = "fastq files were interleaved with seqtk:\n"
+        interleave += "\tseqtk mergepe forward.fq reverse.fq"
+        summary.append(interleave)
+        deconv = "Deconvolution occurred using QuickDeconvolution:\n"
+        deconv += f"\tQuickDeconvolution -t threads -i infile.fq -o output.fq -k {kmer_length} -w {window_size} -d {density} -a {dropout}"
+        summary.append(deconv)
+        recover = "The interleaved output was split back into forward and reverse reads with seqtk:\n"
+        recover += "\tseqtk -1 interleaved.fq | gzip > file.R1.fq.gz\n"
+        recover += "\tseqtk -2 interleaved.fq | gzip > file.R2.fq.gz"
+        summary.append(recover)
+        sm = "The Snakemake workflow was called via command line:\n"
+        sm += f"\t{config['workflow_call']}"
+        summary.append(sm)
         with open(outdir + "/workflow/deconvolve.summary", "w") as f:
-            f.write(summary_template)
\ No newline at end of file
+            f.write("\n\n".join(summary))
\ No newline at end of file
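The recurring envdir change swaps manual path concatenation for os.path.join. A quick illustration of the difference, using only the standard library:

    import os

    # old form: the separator is hard-coded into the string
    concatenated = os.getcwd() + "/.harpy_envs"

    # new form: os.path.join supplies the platform's separator itself
    joined = os.path.join(os.getcwd(), ".harpy_envs")

    # on POSIX the two are identical when the base has no trailing slash
    assert concatenated == joined

The joined form also survives refactors where the base path may or may not carry a trailing separator, since join never doubles one up.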
diff --git a/harpy/snakefiles/demultiplex_gen1.smk b/harpy/snakefiles/demultiplex_gen1.smk
index 93149ffdb..76c85f675 100644
--- a/harpy/snakefiles/demultiplex_gen1.smk
+++ b/harpy/snakefiles/demultiplex_gen1.smk
@@ -17,7 +17,7 @@ I2 = config["inputs"]["I2"]
 samplefile = config["inputs"]["demultiplex_schema"]
 skip_reports = config["skip_reports"]
 outdir = config["output_directory"]
-envdir = os.getcwd() + "/.harpy_envs"
+envdir = os.path.join(os.getcwd(), ".harpy_envs")
 
 ## the barcode log file ##
 def barcodedict(smpl):
@@ -86,11 +86,11 @@ rule demux_barcodes:
         mv demux*BC.log logs
         """
 
-rule demux_samples_R1:
+rule demux_read_1:
     input:
         f"{outdir}/demux_R1_001.fastq.gz"
     output:
-        outdir + "/{sample}.F.fq.gz"
+        outdir + "/{sample}.R1.fq.gz"
     params:
         c_barcode = lambda wc: samples[wc.get("sample")]
     container:
@@ -100,61 +100,37 @@
         ( zgrep -A3 "A..{params}B..D" {input} | grep -v "^--$" | gzip -q > {output} ) || touch {output}
         """
 
-use rule demux_samples_R1 as demux_samples_R2 with:
+use rule demux_read_1 as demux_read_2 with:
     input:
         f"{outdir}/demux_R2_001.fastq.gz"
     output:
-        outdir + "/{sample}.R.fq.gz"
+        outdir + "/{sample}.R2.fq.gz"
 
-rule fastqc_R1:
+rule quality_assessment:
     input:
-        outdir + "/{sample}.F.fq.gz"
-    output:
-        temp(outdir + "/logs/{sample}_F/fastqc_data.txt")
-    params:
-        lambda wc: f"{outdir}/logs/" + wc.get("sample") + "_F"
+        read1 = outdir + "/{sample}.R1.fq.gz",
+        read2 = outdir + "/{sample}.R2.fq.gz"
+    output:
+        temp(outdir + "/reports/data/{sample}.fastp.json")
+    log:
+        outdir + "/reports/{sample}.html"
     threads:
-        1
+        2
     conda:
         f"{envdir}/qc.yaml"
     shell:
-        """
-        mkdir -p {params}
-        if [ -z $(gzip -cd {input} | head -c1) ]; then
-            echo "##Falco 1.2.1" > {output}
-            echo ">>Basic Statistics fail" >> {output}
-            echo "#Measure Value" >> {output}
-            echo "Filename {wildcards.sample}.F.fq.gz" >> {output}
-            echo "File type Conventional base calls" >> {output}
-            echo "Encoding Sanger / Illumina 1.9" >> {output}
-            echo "Total Sequences 0" >> {output}
-            echo "Sequences flagged as poor quality 0" >> {output}
-            echo "Sequence length 0" >> {output}
-            echo "%GC 0" >> {output}
-            echo ">>END_MODULE" >> {output}
-        else
-            falco -q --threads {threads} -skip-report -skip-summary -o {params} {input}
-        fi
-        """
-
-use rule fastqc_R1 as fastqc_R2 with:
-    input:
-        outdir + "/{sample}.R.fq.gz"
-    output:
-        temp(outdir + "/logs/{sample}_R/fastqc_data.txt")
-    params:
-        lambda wc: f"{outdir}/logs/" + wc.get("sample") + "_R"
+        "fastp -pQLAG --stdout -w {threads} -i {input.read1} -I {input.read2} -j {output} --html {log} -R \"{wildcards.sample} demultiplex quality report\" > /dev/null"
 
 rule qc_report:
     input:
-        collect(outdir + "/logs/{sample}_{FR}/fastqc_data.txt", sample = samplenames, FR = ["F","R"])
+        collect(outdir + "/reports/data/{sample}.fastp.json", sample = samplenames)
     output:
         outdir + "/reports/demultiplex.QC.html"
     params:
-        logdir = outdir + "/logs/",
+        logdir = outdir + "/reports/data/",
         options = "--no-version-check --force --quiet --no-data-dir",
         title = "--title \"QC for Demultiplexed Samples\"",
-        comment = "--comment \"This report aggregates the QC results created by falco.\""
+        comment = "--comment \"This report aggregates the quality assessments from fastp. The data were NOT filtered; the report shows what the reads would look like after trimming/filtering with default parameters.\""
     conda:
         f"{envdir}/qc.yaml"
     shell:
@@ -163,31 +139,27 @@ rule qc_report:
 
 rule workflow_summary:
     default_target: True
     input:
-        fq = collect(outdir + "/{sample}.{FR}.fq.gz", sample = samplenames, FR = ["F", "R"]),
+        fq = collect(outdir + "/{sample}.R{FR}.fq.gz", sample = samplenames, FR = [1,2]),
        reports = outdir + "/reports/demultiplex.QC.html" if not skip_reports else []
     run:
        os.makedirs(f"{outdir}/workflow/", exist_ok= True)
-        summary_template = f"""
-The harpy demultiplex gen1 workflow ran using these parameters:
-
-Haplotag technology: Generation I
-
-The multiplexed input files:
-    - {R1}
-    - {R2}
-    - {I1}
-    - {I2}
-
-Barcodes were moved into the read headers using the command:
-    demuxGen1 DATA_ demux
-
-The delimited file associating CXX barcodes with samplenames: {samplefile}
-
-QC checks were performed on demultiplexed FASTQ files using:
-    falco -skip-report -skip-summary input.fq.gz
-
-The Snakemake workflow was called via command line:
-    {config["workflow_call"]}
-"""
+        summary = ["The harpy demultiplex workflow ran using these parameters:"]
+        summary.append("Haplotag technology: Generation I")
+        inputs = "The multiplexed input files:\n"
+        inputs += f"\tread 1: {R1}\n"
+        inputs += f"\tread 2: {R2}\n"
+        inputs += f"\tindex 1: {I1}\n"
+        inputs += f"\tindex 2: {I2}"
+        summary.append(inputs)
+        demux = "Barcodes were moved into the read headers using the command:\n"
+        demux += "\tdemuxGen1 DATA_ demux"
+        summary.append(demux)
+        summary.append(f"The delimited file associating CXX barcodes with samplenames: {samplefile}")
+        qc = "QC checks were performed on demultiplexed FASTQ files using:\n"
+        qc += "\tfastp -pQLAG --stdout -i R1.fq -I R2.fq > /dev/null"
+        summary.append(qc)
+        sm = "The Snakemake workflow was called via command line:\n"
+        sm += f"\t{config['workflow_call']}"
+        summary.append(sm)
         with open(outdir + "/workflow/demux.gen1.summary", "w") as f:
-            f.write(summary_template)
+            f.write("\n\n".join(summary))
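With falco replaced by fastp, the aggregation rule now consumes one JSON report per sample. If you need to spot-check a report outside MultiQC, the standard library suffices; the key layout used below (summary → before_filtering) matches fastp's JSON output, but treat it as an assumption to verify against your fastp version:

    import json

    # hypothetical path following the rule's output pattern
    with open("reports/data/sample1.fastp.json") as f:
        qc = json.load(f)

    # pre-filtering totals live under summary/before_filtering (assumed layout)
    before = qc["summary"]["before_filtering"]
    print(before["total_reads"], before["total_bases"])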
diff --git a/harpy/snakefiles/impute.smk b/harpy/snakefiles/impute.smk
index 65c612ba3..604a5261a 100644
--- a/harpy/snakefiles/impute.smk
+++ b/harpy/snakefiles/impute.smk
@@ -1,9 +1,6 @@
 containerized: "docker://pdimens/harpy:latest"
-
 import os
-import pandas as pd
 import logging
-from snakemake.utils import Paramspace
 
 onstart:
     logger.logger.addHandler(logging.FileHandler(config["snakemake_log"]))
@@ -12,19 +9,21 @@ onsuccess:
 onerror:
     os.remove(logger.logfile)
 wildcard_constraints:
-    sample = "[a-zA-Z0-9._-]+"
+    sample = "[a-zA-Z0-9._-]+",
+    paramset = "[^/]+",
+    contig = "[^/]+"
 
-bamlist = config["inputs"]["alignments"]
-bamdict = dict(zip(bamlist, bamlist))
-variantfile = config["inputs"]["variantfile"]
-paramfile = config["inputs"]["paramfile"]
-biallelic = config["inputs"]["biallelic_contigs"]
-outdir = config["output_directory"]
-envdir = os.getcwd() + "/.harpy_envs"
-skip_reports = config["skip_reports"]
-paramspace = Paramspace(pd.read_csv(paramfile, sep=r"\s+", skip_blank_lines=True).rename(columns=str.lower), param_sep = "", filename_params = ["k", "s", "ngen", "bxlimit"])
-with open(biallelic, "r") as f_open:
-    contigs = [i.rstrip() for i in f_open.readlines()]
+bamlist = config["inputs"]["alignments"]
+bamdict = dict(zip(bamlist, bamlist))
+variantfile = config["inputs"]["variantfile"]
+paramfile = config["inputs"]["paramfile"]
+biallelic = config["inputs"]["biallelic_contigs"]
+outdir = config["output_directory"]
+envdir = os.path.join(os.getcwd(), ".harpy_envs")
+skip_reports = config["skip_reports"]
+stitch_params = config["stitch_parameters"]
+with open(biallelic, "r") as f:
+    contigs = [line.rstrip() for line in f]
 
 rule sort_bcf:
     input:
@@ -52,7 +51,7 @@ rule index_alignments:
 rule alignment_list:
     input:
         bam = bamlist,
-        bailist = [f"{i}.bai" for i in bamlist]
+        bailist = collect("{bam}.bai", bam = bamlist)
     output:
         outdir + "/workflow/input/samples.list"
     run:
@@ -64,54 +63,53 @@ rule stitch_conversion:
         bcf = f"{outdir}/workflow/input/vcf/input.sorted.bcf",
         idx = f"{outdir}/workflow/input/vcf/input.sorted.bcf.csi"
     output:
-        outdir + "/workflow/input/stitch/{part}.stitch"
+        outdir + "/workflow/input/stitch/{contig}.stitch"
     threads:
         3
     container:
         None
     shell:
         """
-        bcftools view --types snps -M2 --regions {wildcards.part} {input.bcf} |
+        bcftools view --types snps -M2 --regions {wildcards.contig} {input.bcf} |
        bcftools query -i '(STRLEN(REF)==1) & (STRLEN(ALT[0])==1) & (REF!="N")' -f '%CHROM\\t%POS\\t%REF\\t%ALT\\n' > {output}
        """
 
 rule impute:
     input:
         bamlist = outdir + "/workflow/input/samples.list",
-        infile = outdir + "/workflow/input/stitch/{part}.stitch"
+        infile = outdir + "/workflow/input/stitch/{contig}.stitch",
+        bam = bamlist,
+        bailist = collect("{bam}.bai", bam = bamlist)
     output:
-        # format a wildcard pattern like "k{k}/s{s}/ngen{ngen}"
-        # into a file path, with k, s, ngen being the columns of the data frame
-        temp(f"{outdir}/{paramspace.wildcard_pattern}/contigs/" + "{part}/{part}.vcf.gz"),
-        temp(directory(f"{outdir}/{paramspace.wildcard_pattern}/contigs/" + "{part}/plots")),
-        temp(directory(f"{outdir}/{paramspace.wildcard_pattern}/contigs/" + "{part}/tmp")),
-        temp(directory(f"{outdir}/{paramspace.wildcard_pattern}/contigs/" + "{part}/RData")),
-        temp(directory(f"{outdir}/{paramspace.wildcard_pattern}/contigs/" + "{part}/input"))
+        temp(directory(outdir + "/{paramset}/contigs/{contig}/plots")),
+        temp(directory(outdir + "/{paramset}/contigs/{contig}/tmp")),
+        temp(directory(outdir + "/{paramset}/contigs/{contig}/RData")),
+        temp(directory(outdir + "/{paramset}/contigs/{contig}/input")),
+        vcf = temp(outdir + "/{paramset}/contigs/{contig}/{contig}.vcf.gz")
     log:
-        f"{outdir}/{paramspace.wildcard_pattern}/logs/" + "{part}.stitch.log"
+        logfile = outdir + "/{paramset}/logs/{contig}.stitch.log"
     params:
-        # automatically translate the wildcard values into an instance of the param space
-        # in the form of a dict (here: {"k": ..., "s": ..., "ngen": ...})
-        parameters = paramspace.instance,
-        extra = config.get("extra", "")
-    conda:
-        f"{envdir}/stitch.yaml"
-    benchmark:
-        f".Benchmark/{outdir}/stitch.{paramspace.wildcard_pattern}" + ".{part}.txt"
+        model = lambda wc: stitch_params[wc.paramset]["model"],
+        usebx = lambda wc: stitch_params[wc.paramset]["usebx"],
+        bxlimit = lambda wc: stitch_params[wc.paramset]["bxlimit"],
+        k = lambda wc: stitch_params[wc.paramset]["k"],
+        s = lambda wc: stitch_params[wc.paramset]["s"],
+        ngen = lambda wc: stitch_params[wc.paramset]["ngen"],
+        extra = config.get("stitch_extra", "")
     threads:
         workflow.cores - 1
+    conda:
+        f"{envdir}/stitch.yaml"
     script:
         "scripts/stitch_impute.R"
 
 rule index_vcf:
     input:
-        vcf = outdir + "/{stitchparams}/contigs/{part}/{part}.vcf.gz"
+        vcf = outdir + "/{paramset}/contigs/{contig}/{contig}.vcf.gz"
     output:
-        vcf = outdir + "/{stitchparams}/contigs/{part}.vcf.gz",
-        idx = outdir + "/{stitchparams}/contigs/{part}.vcf.gz.tbi",
-        stats = outdir + "/{stitchparams}/reports/data/contigs/{part}.stats"
-    wildcard_constraints:
-        part = "[^/]+"
+        vcf = outdir + "/{paramset}/contigs/{contig}.vcf.gz",
+        idx = outdir + "/{paramset}/contigs/{contig}.vcf.gz.tbi",
+        stats = outdir + "/{paramset}/reports/data/contigs/{contig}.stats"
     container:
         None
     shell:
@@ -121,14 +119,22 @@ rule index_vcf:
         bcftools stats -s "-" {input.vcf} > {output.stats}
         """
 
-rule stitch_reports:
+rule contig_report:
     input:
-        outdir + "/{stitchparams}/reports/data/contigs/{part}.stats",
-        outdir + "/{stitchparams}/contigs/{part}/plots"
+        statsfile = outdir + "/{paramset}/reports/data/contigs/{contig}.stats",
+        plotdir = outdir + "/{paramset}/contigs/{contig}/plots"
     output:
-        outdir + "/{stitchparams}/reports/{part}.stitch.html"
+        outdir + "/{paramset}/reports/{contig}.{paramset}.html"
     log:
-        logfile = outdir + "/{stitchparams}/logs/reports/{part}.stitch.log"
+        logfile = outdir + "/{paramset}/logs/reports/{contig}.stitch.log"
+    params:
+        model = lambda wc: stitch_params[wc.paramset]["model"],
+        usebx = lambda wc: stitch_params[wc.paramset]["usebx"],
+        bxlimit = lambda wc: stitch_params[wc.paramset]["bxlimit"],
+        k = lambda wc: stitch_params[wc.paramset]["k"],
+        s = lambda wc: stitch_params[wc.paramset]["s"],
+        ngen = lambda wc: stitch_params[wc.paramset]["ngen"],
+        extra = config.get("stitch_extra", "")
     conda:
         f"{envdir}/r.yaml"
     script:
@@ -136,19 +142,19 @@
 rule concat_list:
     input:
-        bcf = collect(outdir + "/{{stitchparams}}/contigs/{part}.vcf.gz", part = contigs)
+        bcf = collect(outdir + "/{{paramset}}/contigs/{contig}.vcf.gz", contig = contigs)
     output:
-        temp(outdir + "/{stitchparams}/bcf.files")
+        temp(outdir + "/{paramset}/bcf.files")
     run:
         with open(output[0], "w") as fout:
             _ = fout.write("\n".join(input.bcf))
 
 rule merge_vcf:
     input:
-        files = outdir + "/{stitchparams}/bcf.files",
-        idx = collect(outdir + "/{{stitchparams}}/contigs/{part}.vcf.gz.tbi", part = contigs)
+        files = outdir + "/{paramset}/bcf.files",
+        idx = collect(outdir + "/{{paramset}}/contigs/{contig}.vcf.gz.tbi", contig = contigs)
     output:
-        outdir + "/{stitchparams}/variants.imputed.bcf"
+        outdir + "/{paramset}/{paramset}.bcf"
     threads:
         workflow.cores
     container:
@@ -158,9 +164,9 @@ rule merge_vcf:
 
 rule index_merged:
     input:
-        outdir + "/{stitchparams}/variants.imputed.bcf"
+        outdir + "/{paramset}/{paramset}.bcf"
     output:
-        outdir + "/{stitchparams}/variants.imputed.bcf.csi"
+        outdir + "/{paramset}/{paramset}.bcf.csi"
     container:
         None
     shell:
@@ -168,26 +174,24 @@ rule index_merged:
 
 rule general_stats:
     input:
-        bcf = outdir + "/{stitchparams}/variants.imputed.bcf",
-        idx = outdir + "/{stitchparams}/variants.imputed.bcf.csi"
+        bcf = outdir + "/{paramset}/{paramset}.bcf",
+        idx = outdir + "/{paramset}/{paramset}.bcf.csi"
     output:
-        outdir + "/{stitchparams}/reports/data/impute.stats"
+        outdir + "/{paramset}/reports/data/impute.stats"
     container:
         None
     shell:
-        """
-        bcftools stats -s "-" {input.bcf} > {output}
-        """
+        "bcftools stats -s \"-\" {input.bcf} > {output}"
 
 rule compare_stats:
     input:
         orig = outdir + "/workflow/input/vcf/input.sorted.bcf",
         origidx = outdir + "/workflow/input/vcf/input.sorted.bcf.csi",
-        impute = outdir + "/{stitchparams}/variants.imputed.bcf",
-        idx = outdir + "/{stitchparams}/variants.imputed.bcf.csi"
+        impute = outdir + "/{paramset}/{paramset}.bcf",
+        idx = outdir + "/{paramset}/{paramset}.bcf.csi"
     output:
-        compare = outdir + "/{stitchparams}/reports/data/impute.compare.stats",
-        info_sc = temp(outdir + "/{stitchparams}/reports/data/impute.infoscore")
+        compare = outdir + "/{paramset}/reports/data/impute.compare.stats",
+        info_sc = temp(outdir + "/{paramset}/reports/data/impute.infoscore")
     container:
         None
     shell:
@@ -198,64 +202,58 @@ rule compare_stats:
 
 rule impute_reports:
     input:
-        outdir + "/{stitchparams}/reports/data/impute.compare.stats",
-        outdir + "/{stitchparams}/reports/data/impute.infoscore"
+        outdir + "/{paramset}/reports/data/impute.compare.stats",
+        outdir + "/{paramset}/reports/data/impute.infoscore"
     output:
-        outdir + "/{stitchparams}/reports/variants.imputed.html"
+        outdir + "/{paramset}/reports/{paramset}.html"
     log:
-        logfile = outdir + "/{stitchparams}/logs/reports/imputestats.log"
+        logfile = outdir + "/{paramset}/logs/reports/imputestats.log"
     params:
-        lambda wc: wc.get("stitchparams")
+        lambda wc: wc.get("paramset")
     conda:
         f"{envdir}/r.yaml"
     script:
         "report/impute.Rmd"
-
 rule workflow_summary:
     default_target: True
     input:
-        vcf = collect(outdir + "/{stitchparams}/variants.imputed.bcf", stitchparams=paramspace.instance_patterns),
-        agg_report = collect(outdir + "/{stitchparams}/reports/variants.imputed.html", stitchparams=paramspace.instance_patterns) if not skip_reports else [],
-        contig_report = collect(outdir + "/{stitchparams}/reports/{part}.stitch.html", stitchparams=paramspace.instance_patterns, part = contigs) if not skip_reports else [],
+        vcf = collect(outdir + "/{paramset}/{paramset}.bcf", paramset = list(stitch_params.keys())),
+        agg_report = collect(outdir + "/{paramset}/reports/{paramset}.html", paramset = stitch_params.keys()) if not skip_reports else [],
+        contig_report = collect(outdir + "/{paramset}/reports/{contig}.{paramset}.html", paramset = stitch_params.keys(), contig = contigs) if not skip_reports else [],
     run:
         paramfiletext = "\t".join(open(paramfile, "r").readlines())
-        summary_template = f"""
-The harpy impute workflow ran using these parameters:
-
-The provided variant file: {variantfile}
-
-Preprocessing was performed with:
-    bcftools view -M2 -v snps --regions CONTIG INFILE |
-    bcftools query -i '(STRLEN(REF)==1) & (STRLEN(ALT[0])==1) & (REF!="N")' -f '%CHROM\\t%POS\\t%REF\\t%ALT\\n'
-
-The STITCH parameter file: {paramfile}
-    {paramfiletext}
-
-Within R, STITCH was invoked with the following parameters:
-    STITCH(
-        method = model,
-        posfile = posfile,
-        bamlist = bamlist,
-        nCores = ncores,
-        nGen = ngen,
-        chr = chr,
-        K = k,
-        S = s,
-        use_bx_tag = usebX,
-        bxTagUpperLimit = bxlimit,
-        niterations = 40,
-        switchModelIteration = 39,
-        splitReadIterations = NA,
-        outputdir = outdir,
-        output_filename = outfile
-    )
-
-Additional STITCH parameters provided (overrides existing values above):\n")
-    {config.get("extra", "None provided")}
-
-The Snakemake workflow was called via command line:\n")
-    {config["workflow_call"]}
-"""
+        summary = ["The harpy impute workflow ran using these parameters:"]
+        summary.append(f"The provided variant file: {variantfile}")
+        preproc = "Preprocessing was performed with:\n"
+        preproc += "\tbcftools view -M2 -v snps --regions CONTIG INFILE |\n"
+        preproc += """\tbcftools query -i '(STRLEN(REF)==1) & (STRLEN(ALT[0])==1) & (REF!="N")' -f '%CHROM\\t%POS\\t%REF\\t%ALT\\n'"""
+        summary.append(preproc)
+        stitchparam = f"The STITCH parameter file: {paramfile}\n"
+        stitchparam += f"\t{paramfiletext}"
+        summary.append(stitchparam)
+        stitch = "Within R, STITCH was invoked with the following parameters:\n"
+        stitch += "\tSTITCH(\n"
+        stitch += "\t\tmethod = model,\n"
+        stitch += "\t\tposfile = posfile,\n"
+        stitch += "\t\tbamlist = bamlist,\n"
+        stitch += "\t\tnCores = ncores,\n"
+        stitch += "\t\tnGen = ngen,\n"
+        stitch += "\t\tchr = chr,\n"
+        stitch += "\t\tK = k,\n"
+        stitch += "\t\tS = s,\n"
+        stitch += "\t\tuse_bx_tag = usebx,\n"
+        stitch += "\t\tbxTagUpperLimit = bxlimit,\n"
+        stitch += "\t\tniterations = 40,\n"
+        stitch += "\t\tswitchModelIteration = 39,\n"
+        stitch += "\t\tsplitReadIterations = NA,\n"
+        stitch += "\t\toutputdir = outdir,\n"
+        stitch += "\t\toutput_filename = outfile\n\t)"
+        summary.append(stitch)
+        stitchextra = "Additional STITCH parameters provided (overrides existing values above):\n"
+        stitchextra += "\t" + config.get("stitch_extra", "None")
+        summary.append(stitchextra)
+        sm = "The Snakemake workflow was called via command line:\n"
+        sm += f"\t{config['workflow_call']}"
+        summary.append(sm)
         with open(outdir + "/workflow/impute.summary", "w") as f:
-            f.write(summary_template)
+            f.write("\n\n".join(summary))
"/logs/spades_assembly.log" + threads: + workflow.cores + resources: + mem_mb=max_mem + conda: + f"{envdir}/spades.yaml" + container: + None + shell: + "metaspades.py -t {threads} -m {params.mem} -k {params.k} {params.extra} -1 {input.fastq_R1C} -2 {input.fastq_R2C} -s {input.fastq_UNC} -o {params.outdir} --only-assembler > {log}" + +rule cloudspades_assembly: + input: + fastq_R1 = FQ1, + fastq_R2 = FQ2 + output: + f"{outdir}/cloudspades_assembly/contigs.fasta", + f"{outdir}/cloudspades_assembly/scaffolds.fasta" + params: + outdir = spadesdir, + k = k_param, + mem = max_mem // 1000, + extra = extra + log: + outdir + "/logs/assembly.log" conda: f"{envdir}/assembly.yaml" threads: @@ -87,13 +125,13 @@ rule spades_assembly: resources: mem_mb=max_mem shell: - "metaspades.py -t {threads} -m {params.mem} -k {params.k} {params.extra} -1 {input.FQ_R1C} -2 {input.FQ_R2C} -s {input.FQ_UNC} -o {params.outdir} --only-assembler > {log}" + "spades.py --meta -t {threads} -m {params.mem} -k {params.k} {params.extra} --gemcode1-1 {input.fastq_R1} --gemcode1-2 {input.fastq_R2} -o {params.outdir} > {log}" rule bwa_index: input: - f"{outdir}/metaspades_assembly/contigs.fasta" + f"{spadesdir}/contigs.fasta" output: - multiext(f"{outdir}/metaspades_assembly/contigs.fasta.", "ann", "bwt", "pac", "sa", "amb") + multiext(f"{spadesdir}/contigs.fasta.", "ann", "bwt", "pac", "sa", "amb") log: f"{outdir}/logs/bwa.index.log" conda: @@ -103,11 +141,11 @@ rule bwa_index: rule bwa_align: input: - multiext(f"{outdir}/metaspades_assembly/contigs.fasta.", "ann", "bwt", "pac", "sa", "amb"), + multiext(f"{spadesdir}/contigs.fasta.", "ann", "bwt", "pac", "sa", "amb"), fastq = collect(outdir + "/fastq_preproc/input.R{X}.fq.gz", X = [1,2]), - contigs = f"{outdir}/metaspades_assembly/contigs.fasta" + contigs = f"{spadesdir}/contigs.fasta" output: - f"{outdir}/reads-to-metaspades.bam" + f"{outdir}/reads-to-spades.bam" log: bwa = f"{outdir}/logs/align.bwa.log", samsort = f"{outdir}/logs/sort.alignments.log" @@ -120,11 +158,13 @@ rule bwa_align: rule index_alignment: input: - f"{outdir}/reads-to-metaspades.bam" + f"{outdir}/reads-to-spades.bam" output: - f"{outdir}/reads-to-metaspades.bam.bai" + f"{outdir}/reads-to-spades.bam.bai" log: f"{outdir}/logs/index.alignments.log" + container: + None shell: "samtools index {input} 2> {log}" @@ -133,15 +173,17 @@ rule interleave_fastq: collect(outdir + "/fastq_preproc/input.R{FR}.fq.gz", FR = [1,2]) output: f"{outdir}/fastq_preproc/interleaved.fq" + container: + None shell: "seqtk mergepe {input} > {output}" rule athena_config: input: - f"{outdir}/reads-to-metaspades.bam.bai", + f"{outdir}/reads-to-spades.bam.bai", fastq = f"{outdir}/fastq_preproc/interleaved.fq", - bam = f"{outdir}/reads-to-metaspades.bam", - contigs = f"{outdir}/metaspades_assembly/contigs.fasta" + bam = f"{outdir}/reads-to-spades.bam", + contigs = f"{spadesdir}/contigs.fasta" output: f"{outdir}/athena/athena.config" params: @@ -164,9 +206,9 @@ rule athena_config: rule athena: input: - multiext(f"{outdir}/reads-to-metaspades.", "bam", "bam.bai"), + multiext(f"{outdir}/reads-to-spades.", "bam", "bam.bai"), f"{outdir}/fastq_preproc/interleaved.fq", - f"{outdir}/metaspades_assembly/contigs.fasta", + f"{spadesdir}/contigs.fasta", config = f"{outdir}/athena/athena.config" output: temp(directory(collect(outdir + "/athena/{X}", X = ["results", "logs", "working"]))), @@ -192,37 +234,34 @@ rule workflow_summary: f"{outdir}/athena/athena.asm.fa" params: bx = config["barcode_tag"].upper(), - k_param = k_param, - max_mem = max_mem, 
extra = extra run: - summary_template = f""" -The harpy metassembly workflow ran using these parameters: - -FASTQ inputs were sorted by their linked-read barcodes: - samtools import -T "*" FQ1 FQ2 | - samtools sort -O SAM -t {{params.bx}} | - samtools fastq -T "*" -1 FQ_out1 -2 FQ_out2 - -Barcoded-sorted FASTQ files had "-1" appended to the barcode to make them Athena-compliant: - sed 's/{{params.bx}}:Z:[^[:space:]]*/&-1/g' FASTQ | bgzip > FASTQ_OUT - -Reads were assembled using metaspades: - k values: {{params.k_param}} - maximum memory: {{params.max_mem}} - extra parameters: {{params.extra}} - -Original input FASTQ files were aligned to the metagenome using BWA: - bwa mem -C -p metaspades.contigs FQ1 FQ2 | samtools sort -O bam - - -Barcode-sorted Athena-compliant sequences were interleaved with seqtk: - seqtk mergepe FQ1 FQ2 > INTERLEAVED.FQ - -Athena ran with the config file Harpy built from the files created from the previous steps: - athena-meta --config athena.config - -The Snakemake workflow was called via command line: - {config["workflow_call"]} -""" + summary = ["The harpy metassembly workflow ran using these parameters:"] + bxsort = "FASTQ inputs were sorted by their linked-read barcodes:\n" + bxsort += "\tsamtools import -T \"*\" FQ1 FQ2 |\n" + bxsort += f"\tsamtools sort -O SAM -t {params.bx} |\n" + bxsort += "\tsamtools fastq -T \"*\" -1 FQ_out1 -2 FQ_out2" + summary.append(bxsort) + bxappend = "Barcoded-sorted FASTQ files had \"-1\" appended to the barcode to make them Athena-compliant:\n" + bxappend += f"\tsed 's/{params.bx}:Z:[^[:space:]]*/&-1/g' FASTQ | bgzip > FASTQ_OUT" + summary.append(bxappend) + spades = f"Reads were assembled using {metassembly}:\n" + if cloudspades: + spades += f"\tspades.py -t THREADS -m {max_mem} --gemcode1-1 FQ1 --gemcode1-2 FQ2 --meta -k {k_param} {params.extra}" + else: + spades += f"\tmetaspades.py -t THREADS -m {max_mem} -k {k_param} {extra} -1 FQ_1 -2 FQ2 -o {spadesdir}" + summary.append(spades) + align = "Original input FASTQ files were aligned to the metagenome using BWA:\n" + align += "\tbwa mem -C -p spades.contigs FQ1 FQ2 | samtools sort -O bam -" + summary.append(align) + interleaved = "Barcode-sorted Athena-compliant sequences were interleaved with seqtk:\n" + interleaved += "\tseqtk mergepe FQ1 FQ2 > INTERLEAVED.FQ" + summary.append(interleaved) + athena = "Athena ran with the config file Harpy built from the files created from the previous steps:\n" + athena += "\tathena-meta --config athena.config" + summary.append(athena) + sm = "The Snakemake workflow was called via command line:\n" + sm += f"\t{config['workflow_call']}" + summary.append(sm) with open(outdir + "/workflow/metassembly.summary", "w") as f: - f.write(summary_template.format(params=params)) + f.write("\n\n".join(summary)) diff --git a/harpy/snakefiles/phase.smk b/harpy/snakefiles/phase.smk index 531be1e57..bdb3f4e55 100644 --- a/harpy/snakefiles/phase.smk +++ b/harpy/snakefiles/phase.smk @@ -21,7 +21,7 @@ pruning = config["prune"] molecule_distance = config["molecule_distance"] extra = config.get("extra", "") outdir = config["output_directory"] -envdir = os.getcwd() + "/.harpy_envs" +envdir = os.path.join(os.getcwd(), ".harpy_envs") skip_reports = config["skip_reports"] samples_from_vcf = config["samples_from_vcf"] variantfile = config["inputs"]["variantfile"] @@ -277,25 +277,22 @@ rule workflow_summary: prune = f"--threshold {pruning}" if pruning > 0 else "--no_prune 1", extra = extra run: - summary_template = f""" -The harpy phase workflow ran using these 
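metassembly.smk now picks its assembler once at parse time and routes every downstream path through spadesdir, so the bwa/athena rules do not care which assembler produced contigs.fasta. The dispatch, reduced to plain Python (the config dict here is a stand-in):

    config = {"spades": {"assembler": "cloudspades"}}  # stand-in config
    outdir = "Metassembly"

    cloudspades = config["spades"]["assembler"] == "cloudspades"
    spadesdir = f"{outdir}/cloudspades_assembly" if cloudspades else f"{outdir}/spades_assembly"

    # every downstream rule references only spadesdir
    print(f"{spadesdir}/contigs.fasta")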
diff --git a/harpy/snakefiles/phase.smk b/harpy/snakefiles/phase.smk
index 531be1e57..bdb3f4e55 100644
--- a/harpy/snakefiles/phase.smk
+++ b/harpy/snakefiles/phase.smk
@@ -21,7 +21,7 @@ pruning = config["prune"]
 molecule_distance = config["molecule_distance"]
 extra = config.get("extra", "")
 outdir = config["output_directory"]
-envdir = os.getcwd() + "/.harpy_envs"
+envdir = os.path.join(os.getcwd(), ".harpy_envs")
 skip_reports = config["skip_reports"]
 samples_from_vcf = config["samples_from_vcf"]
 variantfile = config["inputs"]["variantfile"]
@@ -277,25 +277,22 @@ rule workflow_summary:
         prune = f"--threshold {pruning}" if pruning > 0 else "--no_prune 1",
         extra = extra
     run:
-        summary_template = f"""
-The harpy phase workflow ran using these parameters:
-
-The provided variant file: {variantfile}
-
-The variant file was split by sample and filtered for heterozygous sites using:
-    bcftools view -s SAMPLE | bcftools view -i 'GT="het"'
-
-Phasing was performed using the components of HapCut2:
-    extractHAIRS {linkarg} --nf 1 --bam sample.bam --VCF sample.vcf --out sample.unlinked.frags
-    LinkFragments.py --bam sample.bam --VCF sample.vcf --fragments sample.unlinked.frags --out sample.linked.frags -d {molecule_distance}
-    HAPCUT2 --fragments sample.linked.frags --vcf sample.vcf --out sample.blocks --nf 1 --error_analysis_mode 1 --call_homozygous 1 --outvcf 1 {params.prune} {params.extra}
-
-Variant annotation was performed using:
-    bcftools annotate -a sample.phased.vcf -c CHROM,POS,FMT/GT,FMT/PS,FMT/PQ,FMT/PD -m +HAPCUT
-    bcftools merge --output-type b samples.annot.bcf
-
-The Snakemake workflow was called via command line:
-    {config["workflow_call"]}
-"""
+        summary = ["The harpy phase workflow ran using these parameters:"]
+        summary.append(f"The provided variant file: {variantfile}")
+        hetsplit = "The variant file was split by sample and filtered for heterozygous sites using:\n"
+        hetsplit += "\tbcftools view -s SAMPLE | bcftools view -i 'GT=\"het\"'"
+        summary.append(hetsplit)
+        phase = "Phasing was performed using the components of HapCut2:\n"
+        phase += f"\textractHAIRS {linkarg} --nf 1 --bam sample.bam --VCF sample.vcf --out sample.unlinked.frags\n"
+        phase += f"\tLinkFragments.py --bam sample.bam --VCF sample.vcf --fragments sample.unlinked.frags --out sample.linked.frags -d {molecule_distance}\n"
+        phase += f"\tHAPCUT2 --fragments sample.linked.frags --vcf sample.vcf --out sample.blocks --nf 1 --error_analysis_mode 1 --call_homozygous 1 --outvcf 1 {params.prune} {params.extra}"
+        summary.append(phase)
+        annot = "Variant annotation was performed using:\n"
+        annot += "\tbcftools annotate -a sample.phased.vcf -c CHROM,POS,FMT/GT,FMT/PS,FMT/PQ,FMT/PD -m +HAPCUT\n"
+        annot += "\tbcftools merge --output-type b samples.annot.bcf"
+        summary.append(annot)
+        sm = "The Snakemake workflow was called via command line:\n"
+        sm += f"\t{config['workflow_call']}"
+        summary.append(sm)
         with open(outdir + "/workflow/phase.summary", "w") as f:
-            f.write(summary_template)
+            f.write("\n\n".join(summary))
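These run blocks lean on += to grow each paragraph under its header; a bare = on the second line would silently discard the header. A two-case demonstration:

    header = "The Snakemake workflow was called via command line:\n"

    sm = header
    sm = "\tsnakemake --cores 4"            # reassignment: the header is lost
    assert sm == "\tsnakemake --cores 4"

    sm = header
    sm += "\tsnakemake --cores 4"           # accumulation: both lines kept
    assert sm == header + "\tsnakemake --cores 4"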
"/workflow/preflight.bam.summary", "w") as f: - f.write(summary_template) + f.write("\n\n".join(summary)) diff --git a/harpy/snakefiles/preflight_fastq.smk b/harpy/snakefiles/preflight_fastq.smk index c7b28cd86..40704a1c3 100644 --- a/harpy/snakefiles/preflight_fastq.smk +++ b/harpy/snakefiles/preflight_fastq.smk @@ -15,7 +15,7 @@ wildcard_constraints: fqlist = config["inputs"] outdir = config["output_directory"] -envdir = os.getcwd() + "/.harpy_envs" +envdir = os.path.join(os.getcwd(), ".harpy_envs") bn_r = r"([_\.][12]|[_\.][FR]|[_\.]R[12](?:\_00[0-9])*)?\.((fastq|fq)(\.gz)?)$" samplenames = {re.sub(bn_r, "", os.path.basename(i), flags = re.IGNORECASE) for i in fqlist} @@ -83,14 +83,12 @@ rule workflow_summary: outdir + "/filecheck.fastq.html" run: os.makedirs(f"{outdir}/workflow/", exist_ok= True) - summary_template = f""" -The harpy preflight fastq workflow ran using these parameters: - -Validations were performed with: - check_fastq.py sample.fastq > sample.txt - -The Snakemake workflow was called via command line: - {config["workflow_call"]} -""" + summary = ["The harpy preflight fastq workflow ran using these parameters:"] + valids = "Validations were performed with:\n" + valids += "\tcheck_fastq.py sample.fastq > sample.txt" + summary.append(valids) + sm = "The Snakemake workflow was called via command line:\n" + sm += f"\t{config['workflow_call']}" + summary.append(sm) with open(outdir + "/workflow/preflight.fastq.summary", "w") as f: - f.write(summary_template) + f.write("\n\n".join(summary)) diff --git a/harpy/snakefiles/qc.smk b/harpy/snakefiles/qc.smk index eff482a21..ed0613952 100644 --- a/harpy/snakefiles/qc.smk +++ b/harpy/snakefiles/qc.smk @@ -13,7 +13,7 @@ onerror: wildcard_constraints: sample = "[a-zA-Z0-9._-]+" -envdir = os.getcwd() + "/.harpy_envs" +envdir = os.path.join(os.getcwd(), ".harpy_envs") fqlist = config["inputs"] outdir = config["output_directory"] min_len = config["min_len"] @@ -181,24 +181,20 @@ rule workflow_summary: dedup = "-D" if dedup else "", extra = extra run: + summary = ["The harpy qc workflow ran using these parameters:"] + fastp = "fastp ran using:\n" + fastp += f"\tfastp --trim_poly_g --cut_right {params}" + summary.append(fastp) if deconvolve: - deconvolve_text = "Deconvolution occurred using QuickDeconvolution:\n" - deconvolve_text += f" QuickDeconvolution -t threads -i infile.fq -o output.fq -k {decon_k} -w {decon_w} -d {decon_d} -a {decon_a}\n" - deconvolve_text += "The interleaved output was split back into forward and reverse reads with seqtk:\n" - deconvolve_text += " seqtk -1 interleaved.fq | gzip > file.R1.fq.gz\n" - deconvolve_text += " seqtk -2 interleaved.fq | gzip > file.R2.fq.gz\n" - else: - deconvolve_text = "" - summary_template = f""" -The harpy qc workflow ran using these parameters: - -fastp ran using: - fastp --trim_poly_g --cut_right {params} - -{deconvolve_text} - -The Snakemake workflow was called via command line: - {config["workflow_call"]} -""" + deconv = "Deconvolution occurred using QuickDeconvolution:\n" + deconv += "\tQuickDeconvolution -t threads -i infile.fq -o output.fq -k {decon_k} -w {decon_w} -d {decon_d} -a {decon_a}" + summary.append(deconv) + interlv = "The interleaved output was split back into forward and reverse reads with seqtk:\n" + interlv += "\tseqtk -1 interleaved.fq | gzip > file.R1.fq.gz\n" + interlv += "\tseqtk -2 interleaved.fq | gzip > file.R2.fq.gz" + summary.append(interlv) + sm = "The Snakemake workflow was called via command line:\n" + sm += f"\t{config['workflow_call']}" + 
summary.append(sm) with open(outdir + "/workflow/qc.summary", "w") as f: - f.write(summary_template) + f.write("\n\n".join(summary)) diff --git a/harpy/snakefiles/simulate_linkedreads.smk b/harpy/snakefiles/simulate_linkedreads.smk index b3de9a10e..43adf3905 100644 --- a/harpy/snakefiles/simulate_linkedreads.smk +++ b/harpy/snakefiles/simulate_linkedreads.smk @@ -15,8 +15,8 @@ onerror: outdir = config["output_directory"] gen_hap1 = config["inputs"]["genome_hap1"] gen_hap2 = config["inputs"]["genome_hap2"] -envdir = os.getcwd() + "/.harpy_envs" -barcodes = config.get("barcodes", None) +barcodes = config["inputs"].get("barcodes", None) +envdir = os.path.join(os.getcwd(), ".harpy_envs") barcodefile = barcodes if barcodes else f"{outdir}/workflow/input/4M-with-alts-february-2016.txt" rule link_1st_geno: @@ -183,24 +183,21 @@ rule workflow_summary: dwgmutationrate = config["mutation_rate"], dwgprefix = outdir + "/dwgsim_simulated/dwgsim.hap.12" run: - summary_template = f""" -The harpy simulate linkedreas workflow ran using these parameters: - -Genome haplotype 1: {gen_hap1} -Genome haplotype 2: {gen_hap2} -Barcode file: {barcodefile} - -Reads were simulated from the provided genomes using: - dwgsim -N {{params.dwgreadpairs}} -e 0.0001,0.0016 -E 0.0001,0.0016 -d {{params.dwgouterdist}} -s {{params.dwgdistsd}} -1 135 -2 151 -H -y 0 -S 0 -c 0 -R 0 -r {{params.dwgmutationrate}} -F 0 -o 1 -m /dev/null GENO PREFIX - -LRSIM was started from step 3 (-u 3) with these parameters: - LRSIM_harpy.pl -g genome1,genome2 -p {{params.lrsproj_dir}}/lrsim/sim -b BARCODES -r {{params.lrsproj_dir}} -i {{params.lrsoutdist}} -s {{params.lrsdist_sd}} -x {{params.lrsn_pairs}} -f {{params.lrsmol_len}} -t {{params.lrsparts}} -m {{params.lrsmols_per}} -z THREADS {{params.lrsstatic}} - -10X style barcodes were converted in haplotag BX:Z tags using: - 10xtoHaplotag.py - -The Snakemake workflow was called via command line: - {config["workflow_call"]} -""" + summary = ["The harpy simulate linkedreas workflow ran using these parameters:"] + summary.append(f"Genome haplotype 1: {gen_hap1}") + summary.append(f"Genome haplotype 2: {gen_hap2}") + summary.append(f"Barcode file: {barcodefile}") + dwgsim = "Reads were simulated from the provided genomes using:\n" + dwgsim += f"\tdwgsim -N {params.dwgreadpairs} -e 0.0001,0.0016 -E 0.0001,0.0016 -d {params.dwgouterdist} -s {params.dwgdistsd} -1 135 -2 151 -H -y 0 -S 0 -c 0 -R 0 -r {params.dwgmutationrate} -F 0 -o 1 -m /dev/null GENO PREFIX" + summary.append(dwgsim) + lrsim = "LRSIM was started from step 3 (-u 3) with these parameters:\n" + lrsim += f"\tLRSIM_harpy.pl -g genome1,genome2 -p {params.lrsproj_dir}/lrsim/sim -b BARCODES -r {params.lrsproj_dir} -i {params.lrsoutdist} -s {params.lrsdist_sd} -x {params.lrsn_pairs} -f {params.lrsmol_len} -t {params.lrsparts} -m {params.lrsmols_per} -z THREADS {params.lrsstatic}" + summary.append(lrsim) + bxconvert = "10X style barcodes were converted in haplotag BX:Z tags using:\n" + bxconvert += "\t10xtoHaplotag.py" + summary.append(bxconvert) + sm = "The Snakemake workflow was called via command line:\n" + sm += f"\t{config['workflow_call']}" + summary.append(sm) with open(outdir + "/workflow/simulate.reads.summary", "w") as f: - f.write(summary_template.format(params = params)) + f.write("\n\n".join(summary)) diff --git a/harpy/snakefiles/simulate_snpindel.smk b/harpy/snakefiles/simulate_snpindel.smk index 76611dec2..f1891da3f 100644 --- a/harpy/snakefiles/simulate_snpindel.smk +++ b/harpy/snakefiles/simulate_snpindel.smk @@ -13,10 +13,10 @@ 
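Several of these paragraphs interpolate module-level values such as {decon_k} or {linkarg}; interpolation only happens when the literal carries the f prefix, otherwise the braces land in the written summary verbatim. A quick check:

    decon_k = 21

    plain = "\tQuickDeconvolution -k {decon_k}"
    interpolated = f"\tQuickDeconvolution -k {decon_k}"

    assert plain == "\tQuickDeconvolution -k {decon_k}"  # braces kept literally
    assert interpolated == "\tQuickDeconvolution -k 21"  # value interpolated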
diff --git a/harpy/snakefiles/simulate_snpindel.smk b/harpy/snakefiles/simulate_snpindel.smk
index 76611dec2..f1891da3f 100644
--- a/harpy/snakefiles/simulate_snpindel.smk
+++ b/harpy/snakefiles/simulate_snpindel.smk
@@ -13,10 +13,10 @@ onerror:
 
 outdir = config["output_directory"]
 genome = config["inputs"]["genome"]
-envdir = os.getcwd() + "/.harpy_envs"
-snp_vcf = config["inputs"].get("snp_vcf", None)
-indel_vcf = config["inputs"].get("indel_vcf", None)
-heterozygosity = float(config["heterozygosity"]["value"])
+envdir = os.path.join(os.getcwd(), ".harpy_envs")
+snp_vcf = config["snp"].get("vcf", None)
+indel_vcf = config["indel"].get("vcf", None)
+heterozygosity = float(config["heterozygosity"]["ratio"])
 only_vcf = config["heterozygosity"]["only_vcf"]
 outprefix = config["prefix"]
 randomseed = config.get("random_seed", None)
@@ -37,28 +37,31 @@ if snp_vcf or indel_vcf:
         variant_params += f" -indel_vcf {indel_vcf_correct}"
         in_vcfs.append(indel_vcf_correct)
 else:
-    snp_count = config.get("snp_count", None)
-    indel_count = config.get("indel_count", None)
+    snp_count = config["snp"].get("count", None)
+    indel_count = config["indel"].get("count", None)
     variant_params = ""
     if snp_count:
         snp = True
         variant_params += f" -snp_count {snp_count}"
-        snp_constraint = config.get("snp_gene_constraints", None)
+        snp_constraint = config["snp"].get("gene_constraints", None)
         variant_params += f" -coding_partition_for_snp_simulation {snp_constraint}" if snp_constraint else ""
-        ratio = config.get("titv_ratio", None)
+        ratio = config["snp"].get("titv_ratio", None)
         variant_params += f" -titv_ratio {ratio}" if ratio else ""
-
     if indel_count:
         indel = True
         variant_params += f" -indel_count {indel_count}"
-        ratio = config.get("indel_ratio", None)
+        ratio = config["indel"].get("indel_ratio", None)
         variant_params += f" -ins_del_ratio {ratio}" if ratio else ""
+        size_alpha = config["indel"].get("size_alpha", None)
+        variant_params += f" -indel_size_powerlaw_alpha {size_alpha}" if size_alpha else ""
+        size_constant = config["indel"].get("size_constant", None)
+        variant_params += f" -indel_size_powerlaw_constant {size_constant}" if size_constant else ""
     centromeres = config["inputs"].get("centromeres", None)
     variant_params += f" -centromere_gff {centromeres}" if centromeres else ""
     genes = config["inputs"].get("genes", None)
     variant_params += f" -gene_gff {genes}" if genes else ""
-    exclude = config["inputs"].get("exclude_chr", None)
+    exclude = config["inputs"].get("excluded_chromosomes", None)
     variant_params += f" -excluded_chr_list {exclude}" if exclude else ""
     variant_params += f" -seed {randomseed}" if randomseed else ""
 
@@ -163,3 +166,24 @@ rule workflow_summary:
         collect(f"{outdir}/{outprefix}" + ".{var}.vcf", var = variants),
         collect(f"{outdir}/diploid/{outprefix}" + ".hap{n}.fasta", n = [1,2]) if heterozygosity > 0 and not only_vcf else [],
         collect(f"{outdir}/diploid/{outprefix}" + ".{var}.hap{n}.vcf", n = [1,2], var = variants) if heterozygosity > 0 else []
+    params:
+        prefix = f"{outdir}/{outprefix}",
+        parameters = variant_params,
+        snp = f"-snp_vcf {outdir}/diploid/{outprefix}.snp.hapX.vcf" if snp else "",
+        indel = f"-indel_vcf {outdir}/diploid/{outprefix}.indel.hapX.vcf" if indel else ""
+    run:
+        summary = ["The harpy simulate snpindel workflow ran using these parameters:"]
+        summary.append(f"The provided genome: {genome}")
+        summary.append(f"Heterozygosity specified: {heterozygosity}")
+        haploid = "Haploid variants were simulated using simuG:\n"
+        haploid += f"\tsimuG -refseq {genome} -prefix {params.prefix} {params.parameters}"
+        summary.append(haploid)
+        if heterozygosity > 0 and not only_vcf:
+            diploid = "Diploid variants were simulated after splitting by the heterozygosity ratio:\n"
+            diploid += f"\tsimuG -refseq {genome} -prefix hapX {params.snp} {params.indel}"
+            summary.append(diploid)
+        sm = "The Snakemake workflow was called via command line:\n"
+        sm += f"\t{config['workflow_call']}"
+        summary.append(sm)
+        with open(f"{outdir}/workflow/simulate.snpindel.summary", "w") as f:
+            f.write("\n\n".join(summary))
\ No newline at end of file
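The simuG argument string is grown one flag at a time, appending only when the corresponding config entry is set. The core of the pattern, runnable on its own (the config dict is a stand-in):

    config = {"snp": {"count": 1000, "titv_ratio": None}}  # stand-in config

    variant_params = ""
    snp_count = config["snp"].get("count")
    if snp_count:
        variant_params += f" -snp_count {snp_count}"
    ratio = config["snp"].get("titv_ratio")
    variant_params += f" -titv_ratio {ratio}" if ratio else ""

    assert variant_params == " -snp_count 1000"

The conditional expression is the entire right-hand side of +=, so an unset value simply appends an empty string.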
diff --git a/harpy/snakefiles/simulate_variants.smk b/harpy/snakefiles/simulate_variants.smk
index c6c6ba8b1..fdb5dd89d 100644
--- a/harpy/snakefiles/simulate_variants.smk
+++ b/harpy/snakefiles/simulate_variants.smk
@@ -12,45 +12,34 @@ onerror:
     os.remove(logger.logfile)
 
 outdir = config["output_directory"]
-envdir = os.getcwd() + "/.harpy_envs"
-variant = config["variant_type"]
+envdir = os.path.join(os.getcwd(), ".harpy_envs")
+variant = config["workflow"].split()[1]
 outprefix = config["prefix"]
 genome = config["inputs"]["genome"]
-vcf = config["inputs"].get("vcf", None)
-heterozygosity = float(config["heterozygosity"]["value"])
+vcf = config[variant].get("vcf", None)
+heterozygosity = float(config["heterozygosity"]["ratio"])
 only_vcf = config["heterozygosity"]["only_vcf"]
 randomseed = config.get("random_seed", None)
-vcf_correct = "None"
 if vcf:
     vcf_correct = vcf[:-4] + ".vcf.gz" if vcf.lower().endswith("bcf") else vcf
     variant_params = f"-{variant}_vcf {vcf_correct}"
-
 else:
-    variant_params = f"-{variant}_count " + str(config["count"])
-    centromeres = config.get("centromeres", None)
+    variant_params = f"-{variant}_count " + str(config[variant]["count"])
+    centromeres = config["inputs"].get("centromeres", None)
     variant_params += f" -centromere_gff {centromeres}" if centromeres else ""
     genes = config["inputs"].get("genes", None)
     variant_params += f" -gene_gff {genes}" if genes else ""
-    exclude = config["inputs"].get("exclude_chr", None)
+    exclude = config["inputs"].get("excluded_chromosomes", None)
     variant_params += f" -excluded_chr_list {exclude}" if exclude else ""
     variant_params += f" -seed {randomseed}" if randomseed else ""
-    if variant == "inversion":
-        min_size = config.get("min_size", None)
-        variant_params += f" -{variant}_min_size {min_size}" if min_size else ""
-        max_size = config.get("max_size", None)
-        variant_params += f" -{variant}_max_size {max_size}" if max_size else ""
+    if variant in ["inversion", "cnv"]:
+        variant_params += f" -{variant}_min_size " + str(config[variant]["min_size"])
+        variant_params += f" -{variant}_max_size " + str(config[variant]["max_size"])
     if variant == "cnv":
-        min_size = config.get("min_size", None)
-        variant_params += f" -{variant}_min_size {min_size}" if min_size else ""
-        max_size = config.get("max_size", None)
-        variant_params += f" -{variant}_max_size {max_size}" if max_size else ""
-        ratio = config.get("ratio", None)
-        variant_params += f" -duplication_tandem_dispersed_ratio {ratio}" if ratio else ""
-        cnv_copy = config.get("cnv_max_copy", None)
-        variant_params += f" --cnv_max_copy_number {cnv_copy}" if cnv_copy else ""
-        cnv_ratio =config.get("cnv_ratio", None)
-        variant_params += f" --cnv_gain_loss_ratio {cnv_ratio}" if cnv_ratio else ""
+        variant_params += " -duplication_tandem_dispersed_ratio " + str(config[variant]["duplication_ratio"])
+        variant_params += " --cnv_max_copy_number " + str(config[variant]["max_copy"])
+        variant_params += " --cnv_gain_loss_ratio " + str(config[variant]["gain_ratio"])
 
 if vcf:
     rule convert_vcf:
@@ -128,3 +117,23 @@ rule workflow_summary:
         multiext(f"{outdir}/{outprefix}", ".vcf", ".bed", ".fasta"),
         collect(f"{outdir}/diploid/{outprefix}.hap" + "{n}.fasta", n = [1,2]) if heterozygosity > 0 and not only_vcf else [],
         collect(f"{outdir}/diploid/{outprefix}.{variant}.hap" + "{n}.vcf", n = [1,2]) if heterozygosity > 0 else []
+    params:
+        prefix = f"{outdir}/{outprefix}",
+        parameters = variant_params,
+        vcf_arg = f"-{variant}_vcf"
+    run:
+        summary = [f"The harpy simulate {variant} workflow ran using these parameters:"]
+        summary.append(f"The provided genome: {genome}")
+        summary.append(f"Heterozygosity specified: {heterozygosity}")
+        haploid = "Haploid variants were simulated using simuG:\n"
+        haploid += f"\tsimuG -refseq {genome} -prefix {params.prefix} {params.parameters}"
+        summary.append(haploid)
+        if heterozygosity > 0 and not only_vcf:
+            diploid = "Diploid variants were simulated after splitting by the heterozygosity ratio:\n"
+            diploid += f"\tsimuG -refseq {genome} -prefix HAP_PREFIX {params.vcf_arg} hapX.vcf"
+            summary.append(diploid)
+        sm = "The Snakemake workflow was called via command line:\n"
+        sm += f"\t{config['workflow_call']}"
+        summary.append(sm)
+        with open(f"{outdir}/workflow/simulate.{variant}.summary", "w") as f:
+            f.write("\n\n".join(summary))
\ No newline at end of file
diff --git a/harpy/snakefiles/snp_freebayes.smk b/harpy/snakefiles/snp_freebayes.smk
index cf7586382..783423449 100644
--- a/harpy/snakefiles/snp_freebayes.smk
+++ b/harpy/snakefiles/snp_freebayes.smk
@@ -13,10 +13,10 @@ onerror:
 wildcard_constraints:
     sample = "[a-zA-Z0-9._-]+"
 
-envdir = os.getcwd() + "/.harpy_envs"
+envdir = os.path.join(os.getcwd(), ".harpy_envs")
 ploidy = config["ploidy"]
 extra = config.get("extra", "")
-regiontype = config["regiontype"]
+regiontype = config["region_type"]
 windowsize = config.get("windowsize", None)
 outdir = config["output_directory"]
 skip_reports = config["skip_reports"]
@@ -91,7 +91,7 @@ rule index_alignments:
 rule bam_list:
     input:
         bam = bamlist,
-        bai = [f"{i}.bai" for i in bamlist]
+        bai = collect("{bam}.bai", bam = bamlist)
     output:
         outdir + "/workflow/samples.files"
     run:
@@ -102,7 +102,7 @@ rule bam_list:
 rule call_variants:
     input:
         bam = bamlist,
-        bai = [f"{i}.bai" for i in bamlist],
+        bai = collect("{bam}.bai", bam = bamlist),
         groupfile = outdir + "/workflow/sample.groups" if groupings else [],
         ref = f"Genome/{bn}",
         ref_idx = f"Genome/{bn}.fai",
@@ -217,30 +217,24 @@ rule workflow_summary:
         populations = f"--populations {groupings}" if groupings else '',
         extra = extra
     run:
+        summary = ["The harpy snp freebayes workflow ran using these parameters:"]
+        summary.append(f"The provided genome: {bn}")
         if windowsize:
-            windowtext = f"Size of intervals to split genome for variant calling: {windowsize}"
+            summary.append(f"Size of intervals to split genome for variant calling: {windowsize}")
         else:
-            windowtext = f"Genomic positions for which variants were called: {regioninput}"
-
-        summary_template = f"""
-The harpy snp freebayes workflow ran using these parameters:
-
-The provided genome: {bn}
-
-{windowtext}
-
-The freebayes parameters:
-    freebayes -f GENOME -L samples.list -r REGION {params} |
-    bcftools sort -
-
-The variants identified in the intervals were merged into the final variant file using:
-    bcftools concat -f bcf.files -a --remove-duplicates
-
-The variants were normalized using:
-    bcftools norm -m -both -d both
-
-The Snakemake workflow was called via command line:
-    {config["workflow_call"]}
-"""
+            summary.append(f"Genomic positions for which variants were called: {regioninput}")
+        varcall = "The freebayes parameters:\n"
+        varcall += f"\tfreebayes -f GENOME -L samples.list -r REGION {params} |\n"
+        varcall += "\tbcftools sort -"
+        summary.append(varcall)
+        merged = "The variants identified in the intervals were merged into the final variant file using:\n"
+        merged += "\tbcftools concat -f bcf.files -a --remove-duplicates"
+        summary.append(merged)
+        normalize = "The variants were normalized using:\n"
+        normalize += "\tbcftools norm -m -both -d both"
+        summary.append(normalize)
+        sm = "The Snakemake workflow was called via command line:\n"
+        sm += f"\t{config['workflow_call']}"
+        summary.append(sm)
         with open(outdir + "/workflow/snp.freebayes.summary", "w") as f:
-            f.write(summary_template)
+            f.write("\n\n".join(summary))
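The [f"{i}.bai" for i in bamlist] comprehensions become collect("{bam}.bai", bam = bamlist); collect is the newer Snakemake alias for expand, and the two forms produce the same list. A sketch using expand as a plain function (assuming a Snakemake version in which collect aliases expand):

    from snakemake.io import expand

    bamlist = ["sample1.bam", "sample2.bam"]

    comprehension = [f"{i}.bai" for i in bamlist]
    collected = expand("{bam}.bai", bam=bamlist)

    assert comprehension == collected == ["sample1.bam.bai", "sample2.bam.bai"]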
diff --git a/harpy/snakefiles/snp_mpileup.smk b/harpy/snakefiles/snp_mpileup.smk
index 01750ffe5..162950c15 100644
--- a/harpy/snakefiles/snp_mpileup.smk
+++ b/harpy/snakefiles/snp_mpileup.smk
@@ -13,10 +13,10 @@ onerror:
 wildcard_constraints:
     sample = "[a-zA-Z0-9._-]+"
 
-envdir = os.getcwd() + "/.harpy_envs"
+envdir = os.path.join(os.getcwd(), ".harpy_envs")
 ploidy = config["ploidy"]
 mp_extra = config.get("extra", "")
-regiontype = config["regiontype"]
+regiontype = config["region_type"]
 windowsize = config.get("windowsize", None)
 outdir = config["output_directory"]
 skip_reports = config["skip_reports"]
@@ -103,9 +103,9 @@ rule index_alignments:
 rule bam_list:
     input:
         bam = bamlist,
-        bai = [f"{i}.bai" for i in bamlist]
+        bai = collect("{bam}.bai", bam = bamlist)
     output:
-        outdir + "/logs/samples.files"
+        temp(outdir + "/logs/samples.files")
     run:
         with open(output[0], "w") as fout:
             for bamfile in input.bam:
@@ -114,6 +114,8 @@ rule bam_list:
 rule mpileup:
     input:
         bamlist = outdir + "/logs/samples.files",
+        bam = bamlist,
+        bai = collect("{bam}.bai", bam = bamlist),
         genome = f"Genome/{bn}",
         genome_fai = f"Genome/{bn}.fai"
     output:
@@ -250,32 +252,27 @@ rule workflow_summary:
         ploidy = f"--ploidy {ploidy}",
         populations = f"--populations {groupings}" if groupings else "--populations -"
     run:
+        summary = ["The harpy snp mpileup workflow ran using these parameters:"]
+        summary.append(f"The provided genome: {bn}")
         if windowsize:
-            windowtext = f"Size of intervals to split genome for variant calling: {windowsize}"
+            summary.append(f"Size of intervals to split genome for variant calling: {windowsize}")
         else:
-            windowtext = f"Genomic positions for which variants were called: {regioninput}"
-        summary_template = f"""
-The harpy snp mpileup workflow ran using these parameters:
-
-The provided genome: {bn}
-
-{windowtext}
-
-The mpileup parameters:
-    bcftools mpileup --fasta-ref GENOME --region REGION --bam-list BAMS --annotate AD --output-type b {mp_extra}
-
-The bcftools call parameters:
-    bcftools call --multiallelic-caller {params} --variants-only --output-type b |
-    bcftools sort -
-
-The variants identified in the intervals were merged into the final variant file using:
-    bcftools concat -f bcf.files -a --remove-duplicates
-
-The variants were normalized using:
-    bcftools norm -m -both -d both
-
-The Snakemake workflow was called via command line:
-    {config["workflow_call"]}
-"""
+            summary.append(f"Genomic positions for which variants were called: {regioninput}")
+        mpileup = "The mpileup parameters:\n"
+        mpileup += f"\tbcftools mpileup --fasta-ref GENOME --region REGION --bam-list BAMS --annotate AD --output-type b {mp_extra}"
+        summary.append(mpileup)
+        bcfcall = "The bcftools call parameters:\n"
+        bcfcall += f"\tbcftools call --multiallelic-caller {params} --variants-only --output-type b |\n"
+        bcfcall += "\tbcftools sort -"
+        summary.append(bcfcall)
+        merged = "The variants identified in the intervals were merged into the final variant file using:\n"
+        merged += "\tbcftools concat -f bcf.files -a --remove-duplicates"
+        summary.append(merged)
+        normalize = "The variants were normalized using:\n"
+        normalize += "\tbcftools norm -m -both -d both"
+        summary.append(normalize)
+        sm = "The Snakemake workflow was called via command line:\n"
+        sm += f"\t{config['workflow_call']}"
+        summary.append(sm)
         with open(outdir + "/workflow/snp.mpileup.summary", "w") as f:
-            f.write(summary_template)
+            f.write("\n\n".join(summary))
\ No newline at end of file
diff --git a/harpy/snakefiles/sv_leviathan.smk b/harpy/snakefiles/sv_leviathan.smk
index 1bfb0ba3c..2dad894be 100644
--- a/harpy/snakefiles/sv_leviathan.smk
+++ b/harpy/snakefiles/sv_leviathan.smk
@@ -14,7 +14,7 @@ onerror:
 wildcard_constraints:
     sample = "[a-zA-Z0-9._-]+"
 
-envdir = os.getcwd() + "/.harpy_envs"
+envdir = os.path.join(os.getcwd(), ".harpy_envs")
 genomefile = config["inputs"]["genome"]
 bamlist = config["inputs"]["alignments"]
 bamdict = dict(zip(bamlist, bamlist))
@@ -210,19 +210,16 @@ rule workflow_summary:
         iters = f"-B {iterations}",
         extra = extra
     run:
-        summary_template = f"""
-The harpy sv leviathan workflow ran using these parameters:
-
-The provided genome: {bn}
-
-The barcodes were indexed using:
-    LRez index bam -p -b INPUT
-
-Leviathan was called using:
-    LEVIATHAN -b INPUT -i INPUT.BCI -g GENOME {params}
-
-The Snakemake workflow was called via command line:
-    {config["workflow_call"]}
-"""
+        summary = ["The harpy sv leviathan workflow ran using these parameters:"]
+        summary.append(f"The provided genome: {bn}")
+        bc_idx = "The barcodes were indexed using:\n"
+        bc_idx += "\tLRez index bam -p -b INPUT"
+        summary.append(bc_idx)
+        svcall = "Leviathan was called using:\n"
+        svcall += f"\tLEVIATHAN -b INPUT -i INPUT.BCI -g GENOME {params}"
+        summary.append(svcall)
+        sm = "The Snakemake workflow was called via command line:\n"
+        sm += f"\t{config['workflow_call']}"
+        summary.append(sm)
         with open(outdir + "/workflow/sv.leviathan.summary", "w") as f:
-            f.write(summary_template)
+            f.write("\n\n".join(summary))
\ No newline at end of file
diff --git a/harpy/snakefiles/sv_leviathan_pop.smk b/harpy/snakefiles/sv_leviathan_pop.smk
index 86f5191ad..87bc8a1ee 100644
--- a/harpy/snakefiles/sv_leviathan_pop.smk
+++ b/harpy/snakefiles/sv_leviathan_pop.smk
@@ -14,7 +14,7 @@ wildcard_constraints:
     sample = "[a-zA-Z0-9._-]+",
     population = "[a-zA-Z0-9._-]+"
 
-envdir = os.getcwd() + "/.harpy_envs"
+envdir = os.path.join(os.getcwd(), ".harpy_envs")
 genomefile = config["inputs"]["genome"]
 bamlist = config["inputs"]["alignments"]
 groupfile = config["inputs"]["groupings"]
@@ -272,19 +272,19 @@ rule workflow_summary:
         iters = f"-B {iterations}",
         extra = extra
     run:
-        summary_template = f"""
-The harpy sv leviathan workflow ran using these parameters:
-
-The provided genome: {bn}
-
-The barcodes were indexed using:
-    LRez index bam -p -b INPUT
-
-Leviathan was called using:
-    LEVIATHAN -b INPUT -i INPUT.BCI -g GENOME {params}
-
-The Snakemake workflow was called via command line:
-    {config["workflow_call"]}
-"""
+        summary = ["The harpy sv leviathan workflow ran using these parameters:"]
+        summary.append(f"The provided genome: {bn}")
+        concat = "The alignments were concatenated using:\n"
+        concat += "\tconcatenate_bam.py -o groupname.bam -b samples.list"
+        summary.append(concat)
+        bc_idx = "The barcodes were indexed using:\n"
+        bc_idx += "\tLRez index bam -p -b INPUT"
+        summary.append(bc_idx)
diff --git a/harpy/snakefiles/sv_leviathan_pop.smk b/harpy/snakefiles/sv_leviathan_pop.smk
index 86f5191ad..87bc8a1ee 100644
--- a/harpy/snakefiles/sv_leviathan_pop.smk
+++ b/harpy/snakefiles/sv_leviathan_pop.smk
@@ -14,7 +14,7 @@ wildcard_constraints:
     sample = "[a-zA-Z0-9._-]+",
     population = "[a-zA-Z0-9._-]+"
 
-envdir = os.getcwd() + "/.harpy_envs"
+envdir = os.path.join(os.getcwd(), ".harpy_envs")
 genomefile = config["inputs"]["genome"]
 bamlist = config["inputs"]["alignments"]
 groupfile = config["inputs"]["groupings"]
@@ -272,19 +272,19 @@ rule workflow_summary:
         iters = f"-B {iterations}",
         extra = extra
     run:
-        summary_template = f"""
-The harpy sv leviathan workflow ran using these parameters:
-
-The provided genome: {bn}
-
-The barcodes were indexed using:
-    LRez index bam -p -b INPUT
-
-Leviathan was called using:
-    LEVIATHAN -b INPUT -i INPUT.BCI -g GENOME {params}
-
-The Snakemake workflow was called via command line:
-    {config["workflow_call"]}
-"""
+        summary = ["The harpy sv leviathan workflow ran using these parameters:"]
+        summary.append(f"The provided genome: {bn}")
+        concat = "The alignments were concatenated using:\n"
+        concat += "\tconcatenate_bam.py -o groupname.bam -b samples.list"
+        summary.append(concat)
+        bc_idx = "The barcodes were indexed using:\n"
+        bc_idx += "\tLRez index bam -p -b INPUT"
+        summary.append(bc_idx)
+        svcall = "Leviathan was called using:\n"
+        svcall += f"\tLEVIATHAN -b INPUT -i INPUT.BCI -g GENOME {params}"
+        summary.append(svcall)
+        sm = "The Snakemake workflow was called via command line:\n"
+        sm += f"\t{config['workflow_call']}"
+        summary.append(sm)
         with open(outdir + "/workflow/sv.leviathan.summary", "w") as f:
-            f.write(summary_template)
+            f.write("\n\n".join(summary))
\ No newline at end of file
diff --git a/harpy/snakefiles/sv_naibr.smk b/harpy/snakefiles/sv_naibr.smk
index eb3eb23c2..7bec14dfe 100644
--- a/harpy/snakefiles/sv_naibr.smk
+++ b/harpy/snakefiles/sv_naibr.smk
@@ -14,7 +14,7 @@ onerror:
 wildcard_constraints:
     sample = "[a-zA-Z0-9._-]+"
 
-envdir = os.getcwd() + "/.harpy_envs"
+envdir = os.path.join(os.getcwd(), ".harpy_envs")
 genomefile = config["inputs"]["genome"]
 bamlist = config["inputs"]["alignments"]
 bamdict = dict(zip(bamlist, bamlist))
@@ -217,20 +217,16 @@ rule workflow_summary:
         reports = collect(outdir + "/reports/{sample}.naibr.html", sample = samplenames) if not skip_reports else []
     run:
        os.system(f"rm -rf {outdir}/naibrlog")
-        argtext = [f"{k}={v}" for k,v in argdict.items()]
-        summary_template = f"""
-The harpy sv naibr workflow ran using these parameters:
-
-The provided genome: {bn}
-
-naibr variant calling ran using these configurations:
-    bam_file=BAMFILE
-    prefix=PREFIX
-    outdir=Variants/naibr/PREFIX
-    {"\n\t".join(argtext)}
-
-The Snakemake workflow was called via command line:
-    {config["workflow_call"]}
-"""
+        summary = ["The harpy sv naibr workflow ran using these parameters:"]
+        summary.append(f"The provided genome: {bn}")
+        naibr = "naibr variant calling ran using these configurations:\n"
+        naibr += "\tbam_file=BAMFILE\n"
+        naibr += "\tprefix=PREFIX\n"
+        naibr += "\toutdir=Variants/naibr/PREFIX\n"
+        naibr += "\t" + "\n\t".join([f"{k}={v}" for k,v in argdict.items()])
+        summary.append(naibr)
+        sm = "The Snakemake workflow was called via command line:\n"
+        sm += f"\t{config['workflow_call']}"
+        summary.append(sm)
         with open(outdir + "/workflow/sv.naibr.summary", "w") as f:
-            f.write(summary_template)
+            f.write("\n\n".join(summary))
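The NAIBR summary blocks render the parsed extra-parameter dict as tab-indented key=value lines appended after the fixed entries. A standalone sketch of just that rendering step, where argdict is a made-up stand-in for the workflow's parsed extras:

    # Sketch: rendering NAIBR-style settings as a tab-indented block.
    argdict = {"min_mapq": 40, "d": 10000}  # stand-in for process_args(extra)

    naibr = "naibr variant calling ran using these configurations:\n"
    naibr += "\tbam_file=BAMFILE\n"
    naibr += "\tprefix=PREFIX\n"
    naibr += "\toutdir=Variants/naibr/PREFIX\n"
    # the leading "\t" aligns the first key=value pair with the fixed entries;
    # the join alone would leave the first pair unindented
    naibr += "\t" + "\n\t".join(f"{k}={v}" for k, v in argdict.items())
    print(naibr)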
diff --git a/harpy/snakefiles/sv_naibr_phase.smk b/harpy/snakefiles/sv_naibr_phase.smk
index feca8d4dd..98824cd0e 100644
--- a/harpy/snakefiles/sv_naibr_phase.smk
+++ b/harpy/snakefiles/sv_naibr_phase.smk
@@ -14,7 +14,7 @@ onerror:
 wildcard_constraints:
     sample = "[a-zA-Z0-9._-]+"
 
-envdir = os.getcwd() + "/.harpy_envs"
+envdir = os.path.join(os.getcwd(), ".harpy_envs")
 genomefile = config["inputs"]["genome"]
 bamlist = config["inputs"]["alignments"]
 bamdict = dict(zip(bamlist, bamlist))
@@ -275,23 +275,19 @@ rule workflow_summary:
         reports = collect(outdir + "/reports/{sample}.naibr.html", sample = samplenames) if not skip_reports else []
     run:
         os.system(f"rm -rf {outdir}/naibrlog")
-        argtext = [f"{k}={v}" for k,v in argdict.items()]
-        summary_template = f"""
-The harpy sv naibr workflow ran using these parameters:
-
-The provided genome: {bn}
-
-The alignment files were phased using:
-    whatshap haplotag --reference genome.fasta --linked-read-distance-cutoff {mol_dist} --ignore-read-groups --tag-supplementary --sample sample_x file.vcf sample_x.bam
-
-naibr variant calling ran using these configurations:
-    bam_file=BAMFILE
-    prefix=PREFIX
-    outdir=Variants/naibr/PREFIX
-    {"\n\t".join(argtext)}
-
-The Snakemake workflow was called via command line:
-    {config["workflow_call"]}
-"""
+        summary = ["The harpy sv naibr workflow ran using these parameters:"]
+        summary.append(f"The provided genome: {bn}")
+        phase = "The alignment files were phased using:\n"
+        phase += f"\twhatshap haplotag --reference genome.fasta --linked-read-distance-cutoff {mol_dist} --ignore-read-groups --tag-supplementary --sample sample_x file.vcf sample_x.bam"
+        summary.append(phase)
+        naibr = "naibr variant calling ran using these configurations:\n"
+        naibr += "\tbam_file=BAMFILE\n"
+        naibr += "\tprefix=PREFIX\n"
+        naibr += "\toutdir=Variants/naibr/PREFIX\n"
+        naibr += "\t" + "\n\t".join([f"{k}={v}" for k,v in argdict.items()])
+        summary.append(naibr)
+        sm = "The Snakemake workflow was called via command line:\n"
+        sm += f"\t{config['workflow_call']}"
+        summary.append(sm)
         with open(outdir + "/workflow/sv.naibr.summary", "w") as f:
-            f.write(summary_template)
+            f.write("\n\n".join(summary))
\ No newline at end of file
diff --git a/harpy/snakefiles/sv_naibr_pop.smk b/harpy/snakefiles/sv_naibr_pop.smk
index 517951882..eb5dc4b14 100644
--- a/harpy/snakefiles/sv_naibr_pop.smk
+++ b/harpy/snakefiles/sv_naibr_pop.smk
@@ -15,7 +15,7 @@ wildcard_constraints:
     sample = "[a-zA-Z0-9._-]+",
     population = "[a-zA-Z0-9._-]+"
 
-envdir = os.getcwd() + "/.harpy_envs"
+envdir = os.path.join(os.getcwd(), ".harpy_envs")
 genomefile = config["inputs"]["genome"]
 bamlist = config["inputs"]["alignments"]
 groupfile = config["inputs"]["groupings"]
@@ -263,25 +263,21 @@ rule workflow_summary:
         agg_report = outdir + "/reports/naibr.pop.summary.html" if not skip_reports else []
     run:
         os.system(f"rm -rf {outdir}/naibrlog")
         argdict = process_args(extra)
-        argtext = [f"{k}={v}" for k,v in argdict.items()]
-        summary_template = f"""
-The harpy sv naibr workflow ran using these parameters:
-
-The provided genome: {bn}
-The sample grouping file: {groupfile}
-
-The alignments were concatenated using:
-    concatenate_bam.py -o groupname.bam -b samples.list
-
-naibr variant calling ran using these configurations:
-    bam_file=BAMFILE
-    prefix=PREFIX
-    outdir=Variants/naibr/PREFIX
-    {"\n\t".join(argtext)}
-
-The Snakemake workflow was called via command line:
-    {config["workflow_call"]}
-"""
+        summary = ["The harpy sv naibr workflow ran using these parameters:"]
+        summary.append(f"The provided genome: {bn}")
+        summary.append(f"The sample grouping file: {groupfile}")
+        concat = "The alignments were concatenated using:\n"
+        concat += "\tconcatenate_bam.py -o groupname.bam -b samples.list"
+        summary.append(concat)
+        naibr = "naibr variant calling ran using these configurations:\n"
+        naibr += "\tbam_file=BAMFILE\n"
+        naibr += "\tprefix=PREFIX\n"
+        naibr += "\toutdir=Variants/naibr/PREFIX\n"
+        naibr += "\t" + "\n\t".join([f"{k}={v}" for k,v in argdict.items()])
+        summary.append(naibr)
+        sm = "The Snakemake workflow was called via command line:\n"
+        sm += f"\t{config['workflow_call']}"
+        summary.append(sm)
         with open(outdir + "/workflow/sv.naibr.summary", "w") as f:
-            f.write(summary_template)
+            f.write("\n\n".join(summary))
\ No newline at end of file
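sv_naibr_pop.smk keeps argdict = process_args(extra) inside the run block because the rendered summary still iterates over it. For illustration only, here is a hypothetical sketch of what a parser like process_args might do; harpy's real implementation is defined elsewhere in the package, is not part of this diff, and may differ:

    # Hypothetical sketch of a 'k=v' extra-parameter parser; harpy's actual
    # process_args may behave differently.
    def process_args(extra: str) -> dict:
        args = {}
        for token in extra.split():
            if "=" in token:
                key, value = token.split("=", 1)
                args[key] = value
        return args

    print(process_args("min_mapq=40 d=10000"))  # {'min_mapq': '40', 'd': '10000'}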
diff --git a/harpy/snakefiles/sv_naibr_pop_phase.smk b/harpy/snakefiles/sv_naibr_pop_phase.smk
index 277bc62b6..86de2305e 100644
--- a/harpy/snakefiles/sv_naibr_pop_phase.smk
+++ b/harpy/snakefiles/sv_naibr_pop_phase.smk
@@ -15,7 +15,7 @@ wildcard_constraints:
     sample = "[a-zA-Z0-9._-]+",
     population = "[a-zA-Z0-9._-]+"
 
-envdir = os.getcwd() + "/.harpy_envs"
+envdir = os.path.join(os.getcwd(), ".harpy_envs")
 genomefile = config["inputs"]["genome"]
 bn = os.path.basename(genomefile)
 bamlist = config["inputs"]["alignments"]
@@ -347,27 +347,23 @@ rule workflow_summary:
         agg_report = outdir + "/reports/naibr.pop.summary.html" if not skip_reports else []
     run:
         os.system(f"rm -rf {outdir}/naibrlog")
-        argtext = [f"{k}={v}" for k,v in argdict.items()]
-        summary_template = f"""
-The harpy sv naibr workflow ran using these parameters:
-
-The provided genome: {bn}
-The sample grouping file: {groupfile}
-
-The alignment files were phased using:
-    whatshap haplotag --reference genome.fasta --linked-read-distance-cutoff {mol_dist} --ignore-read-groups --tag-supplementary --sample sample_x file.vcf sample_x.bam
-
-The phased alignments were concatenated using:
-    concatenate_bam.py -o groupname.bam -b samples.list
-
-naibr variant calling ran using these configurations:
-    bam_file=BAMFILE
-    prefix=PREFIX
-    outdir=Variants/naibr/PREFIX
-    {"\n\t".join(argtext)}
-
-The Snakemake workflow was called via command line:
-    {config["workflow_call"]}
-"""
+        summary = ["The harpy sv naibr workflow ran using these parameters:"]
+        summary.append(f"The provided genome: {bn}")
+        summary.append(f"The sample grouping file: {groupfile}")
+        phase = "The alignment files were phased using:\n"
+        phase += f"\twhatshap haplotag --reference genome.fasta --linked-read-distance-cutoff {mol_dist} --ignore-read-groups --tag-supplementary --sample sample_x file.vcf sample_x.bam"
+        summary.append(phase)
+        concat = "The phased alignments were concatenated using:\n"
+        concat += "\tconcatenate_bam.py -o groupname.bam -b samples.list"
+        summary.append(concat)
+        naibr = "naibr variant calling ran using these configurations:\n"
+        naibr += "\tbam_file=BAMFILE\n"
+        naibr += "\tprefix=PREFIX\n"
+        naibr += "\toutdir=Variants/naibr/PREFIX\n"
+        naibr += "\t" + "\n\t".join([f"{k}={v}" for k,v in argdict.items()])
+        summary.append(naibr)
+        sm = "The Snakemake workflow was called via command line:\n"
+        sm += f"\t{config['workflow_call']}"
+        summary.append(sm)
         with open(outdir + "/workflow/sv.naibr.summary", "w") as f:
-            f.write(summary_template)
+            f.write("\n\n".join(summary))
"ploidy" : ploidy, + "region_type" : regtype, + **({'windowsize': int(regions)} if regtype == "windows" else {}), + **({'extra': extra_params} if extra_params else {}), + "skip_reports" : skip_reports, + "workflow_call" : command.rstrip(), + "inputs" : { + "genome" : Path(genome).resolve().as_posix(), + "regions" : Path(region).resolve().as_posix() if regtype != "region" else region, + **({'groupings': Path(populations).resolve().as_posix()} if populations else {}), + "alignments" : [i.as_posix() for i in bamlist] + } + } with open(f"{workflowdir}/config.yaml", "w", encoding="utf-8") as config: - config.write("workflow: snp mpileup\n") - config.write(f"snakemake_log: {sm_log}\n") - config.write(f"output_directory: {output_dir}\n") - config.write(f"ploidy: {ploidy}\n") - config.write(f"regiontype: {regtype}\n") - if regtype == "windows": - config.write("windowsize: {regions}\n") - if extra_params is not None: - config.write(f"extra: {extra_params}\n") - config.write(f"skip_reports: {skip_reports}\n") - config.write(f"workflow_call: {command}\n") - config.write("inputs:\n") - config.write(f" genome: {Path(genome).resolve()}\n") - config.write(f" regions: {region}\n") - if populations: - validate_popfile(populations) - # check that samplenames and populations line up - validate_popsamples(bamlist, populations, quiet) - config.write(f" groupings: {populations}\n") - config.write(" alignments:\n") - for i in bamlist: - config.write(f" - {i}\n") + yaml.dump(configs, config, default_flow_style= False, sort_keys=False) create_conda_recipes() if setup_only: @@ -147,9 +147,9 @@ def mpileup(inputs, output_dir, regions, genome, threads, populations, ploidy, e start_text.add_row("Genome:", genome) start_text.add_row("Output Folder:", output_dir + "/") start_text.add_row("Workflow Log:", sm_log.replace(f"{output_dir}/", "") + "[dim].gz") - launch_snakemake(command, "snp_mpileup", start_text, output_dir, sm_log, quiet) + launch_snakemake(command, "snp_mpileup", start_text, output_dir, sm_log, quiet, "workflow/snp.mpileup.summary") -@click.command(no_args_is_help = True, context_settings=dict(allow_interspersed_args=False), epilog = "See the documentation for more information: https://pdimens.github.io/harpy/workflows/snp") +@click.command(no_args_is_help = True, context_settings=dict(allow_interspersed_args=False), epilog = "Documentation: https://pdimens.github.io/harpy/workflows/snp") @click.option('-x', '--extra-params', type = str, help = 'Additional variant caller parameters, in quotes') @click.option('-g', '--genome', type=click.Path(exists=True, dir_okay=False, readable=True), required = True, help = 'Genome assembly for variant calling') @click.option('-o', '--output-dir', type = click.Path(exists = False), default = "SNP/freebayes", show_default=True, help = 'Output directory name') @@ -190,7 +190,7 @@ def freebayes(inputs, output_dir, genome, threads, populations, ploidy, regions, command += f"--configfile {workflowdir}/config.yaml " if hpc: command += f"--workflow-profile {hpc} " - if snakemake is not None: + if snakemake: command += snakemake os.makedirs(f"{workflowdir}/", exist_ok= True) @@ -201,42 +201,43 @@ def freebayes(inputs, output_dir, genome, threads, populations, ploidy, regions, # setup regions checks regtype = validate_regions(regions, genome) if regtype == "windows": - region = Path(f"{workflowdir}/positions.bed").resolve() + region = Path(f"{workflowdir}/positions.bed").resolve().as_posix() os.system(f"make_windows.py -m 0 -i {genome} -o {region} -w {regions}") elif regtype == 
"region": region = regions else: - region = Path(f"{workflowdir}/positions.bed").resolve() + region = Path(f"{workflowdir}/positions.bed").resolve().as_posix() os.system(f"cp -f {regions} {region}") fetch_rule(workflowdir, "snp_freebayes.smk") fetch_report(workflowdir, "bcftools_stats.Rmd") os.makedirs(f"{output_dir}/logs/snakemake", exist_ok = True) sm_log = snakemake_log(output_dir, "snp_freebayes") + if populations: + # check for delimeter and formatting + validate_popfile(populations) + # check that samplenames and populations line up + validate_popsamples(bamlist, populations,quiet) + configs = { + "workflow" : "snp freebayes", + "snakemake_log" : sm_log, + "output_directory" : output_dir, + "ploidy" : ploidy, + "region_type" : regtype, + **({'windowsize': int(regions)} if regtype == "windows" else {}), + **({'extra': extra_params} if extra_params else {}), + "skip_reports" : skip_reports, + "workflow_call" : command.rstrip(), + "inputs" : { + "genome" : Path(genome).resolve().as_posix(), + "regions" : Path(region).resolve().as_posix() if regtype != "region" else region, + **({'groupings': Path(populations).resolve().as_posix()} if populations else {}), + "alignments" : [i.as_posix() for i in bamlist] + } + } with open(f"{workflowdir}/config.yaml", "w", encoding="utf-8") as config: - config.write("workflow: snp freebayes\n") - config.write(f"snakemake_log: {sm_log}\n") - config.write(f"output_directory: {output_dir}\n") - config.write(f"ploidy: {ploidy}\n") - config.write(f"regiontype: {regtype}\n") - if regtype == "windows": - config.write("windowsize: {regions}\n") - if extra_params is not None: - config.write(f"extra: {extra_params}\n") - config.write(f"skip_reports: {skip_reports}\n") - config.write(f"workflow_call: {command}\n") - config.write("inputs:\n") - config.write(f" genome: {Path(genome).resolve()}\n") - config.write(f" regions: {region}\n") - if populations is not None: - validate_popfile(populations) - # check that samplenames and populations line up - validate_popsamples(bamlist, populations, quiet) - config.write(f" groupings: {populations}\n") - config.write(" alignments:\n") - for i in bamlist: - config.write(f" - {i}\n") + yaml.dump(configs, config, default_flow_style= False, sort_keys=False) create_conda_recipes() if setup_only: @@ -251,7 +252,7 @@ def freebayes(inputs, output_dir, genome, threads, populations, ploidy, regions, start_text.add_row("Genome:", genome) start_text.add_row("Output Folder:", output_dir + "/") start_text.add_row("Workflow Log:", sm_log.replace(f"{output_dir}/", "") + "[dim].gz") - launch_snakemake(command, "snp_freebayes", start_text, output_dir, sm_log, quiet) + launch_snakemake(command, "snp_freebayes", start_text, output_dir, sm_log, quiet, "workflow/snp.freebayes.summary") snp.add_command(mpileup) snp.add_command(freebayes) diff --git a/harpy/stitchparams.py b/harpy/stitchparams.py index 784f0892f..17f5cfec7 100644 --- a/harpy/stitchparams.py +++ b/harpy/stitchparams.py @@ -5,7 +5,7 @@ import rich_click as click from ._printing import print_notice -@click.command(no_args_is_help = True, context_settings=dict(allow_interspersed_args=False), epilog = "See the documentation for more information: https://pdimens.github.io/harpy/workflows/impute/#parameter-file") +@click.command(no_args_is_help = True, context_settings=dict(allow_interspersed_args=False), epilog = "Documentation: https://pdimens.github.io/harpy/workflows/impute/#parameter-file") @click.option('-o', '--output', type=str, required = True, metavar = "Output file name", help = 
diff --git a/harpy/sv.py b/harpy/sv.py
index 649404a28..be6798671 100644
--- a/harpy/sv.py
+++ b/harpy/sv.py
@@ -2,6 +2,7 @@
 import os
 import sys
+import yaml
 from pathlib import Path
 from rich import box
 from rich.table import Table
@@ -49,7 +50,7 @@ def sv():
     ]
 }
 
-@click.command(no_args_is_help = True, epilog= "See the documentation for more information: https://pdimens.github.io/harpy/workflows/sv/leviathan/")
+@click.command(no_args_is_help = True, epilog= "Documentation: https://pdimens.github.io/harpy/workflows/sv/leviathan/")
 @click.option('-x', '--extra-params', type = str, help = 'Additional variant caller parameters, in quotes')
 @click.option('-g', '--genome', type=click.Path(exists=True, dir_okay=False, readable=True), required = True, help = 'Genome assembly for variant calling')
 @click.option('-i', '--iterations', show_default = True, default=50, type = click.IntRange(min = 10, max_open = True), help = 'Number of iterations to perform through index (reduces memory)')
@@ -85,42 +86,38 @@ def leviathan(inputs, output_dir, genome, min_sv, min_barcodes, iterations, thre
     command += f"--configfile {workflowdir}/config.yaml "
     if hpc:
         command += f"--workflow-profile {hpc} "
-    if snakemake is not None:
+    if snakemake:
         command += snakemake
 
     os.makedirs(f"{workflowdir}/", exist_ok= True)
     bamlist, n = parse_alignment_inputs(inputs)
     check_fasta(genome, quiet)
-    if populations is not None:
+    if populations:
+        validate_popfile(populations)
+        validate_popsamples(bamlist, populations, quiet)
         fetch_report(workflowdir, "leviathan_pop.Rmd")
     fetch_report(workflowdir, "leviathan.Rmd")
     fetch_rule(workflowdir, f"sv_{vcaller}.smk")
     os.makedirs(f"{output_dir}/logs/snakemake", exist_ok = True)
-    sm_log = snakemake_log(output_dir, "sv_naibr")
-
+    sm_log = snakemake_log(output_dir, "sv_leviathan")
+    configs = {
+        "workflow" : "sv leviathan",
+        "snakemake_log" : sm_log,
+        "output_directory" : output_dir,
+        "min_barcodes" : min_barcodes,
+        "min_sv" : min_sv,
+        "iterations" : iterations,
+        **({'extra': extra_params} if extra_params else {}),
+        "skip_reports" : skip_reports,
+        "workflow_call" : command.rstrip(),
+        "inputs" : {
+            "genome" : Path(genome).resolve().as_posix(),
+            **({'groupings': Path(populations).resolve().as_posix()} if populations else {}),
+            "alignments" : [i.as_posix() for i in bamlist]
+        }
+    }
     with open(f'{workflowdir}/config.yaml', "w", encoding="utf-8") as config:
-        config.write("workflow: sv leviathan\n")
-        config.write(f"snakemake_log: {sm_log}\n")
-        config.write(f"output_directory: {output_dir}\n")
-        config.write(f"min_barcodes: {min_barcodes}\n")
-        config.write(f"min_sv: {min_sv}\n")
-        config.write(f"iterations: {iterations}\n")
-        if extra_params is not None:
-            config.write(f"extra: {extra_params}\n")
-        config.write(f"skip_reports: {skip_reports}\n")
-        config.write(f"workflow_call: {command}\n")
-        config.write("inputs:\n")
-        popgroupings = ""
-        config.write(f"  genome: {Path(genome).resolve()}\n")
-        if populations is not None:
-            # check for delimeter and formatting
-            validate_popfile(populations)
-            # check that samplenames and populations line up
-            validate_popsamples(bamlist, populations,quiet)
-            config.write(f"  groupings: {Path(populations).resolve()}\n")
-        config.write("  alignments:\n")
-        for i in bamlist:
-            config.write(f"    - {i}\n")
+        yaml.dump(configs, config, default_flow_style= False, sort_keys=False)
     create_conda_recipes()
 
     if setup_only:
@@ -134,9 +131,9 @@ def leviathan(inputs, output_dir, genome, min_sv, min_barcodes, iterations, thre
     start_text.add_row("Sample Pooling:", populations if populations else "no")
     start_text.add_row("Output Folder:", output_dir + "/")
     start_text.add_row("Workflow Log:", sm_log.replace(f"{output_dir}/", "") + "[dim].gz")
-    launch_snakemake(command, "sv_leviathan", start_text, output_dir, sm_log, quiet)
+    launch_snakemake(command, "sv_leviathan", start_text, output_dir, sm_log, quiet, "workflow/sv.leviathan.summary")
 
-@click.command(no_args_is_help = True, context_settings=dict(allow_interspersed_args=False), epilog = "See the documentation for more information: https://pdimens.github.io/harpy/workflows/sv/naibr/")
+@click.command(no_args_is_help = True, context_settings=dict(allow_interspersed_args=False), epilog = "Documentation: https://pdimens.github.io/harpy/workflows/sv/naibr/")
 @click.option('-x', '--extra-params', type = str, help = 'Additional variant caller parameters, in quotes')
 @click.option('-g', '--genome', required = True, type=click.Path(exists=True, dir_okay=False, readable=True), help = 'Genome assembly')
 @click.option('-b', '--min-barcodes', show_default = True, default=2, type = click.IntRange(min = 1, max_open = True), help = 'Minimum number of barcode overlaps supporting candidate SV')
@@ -175,53 +172,48 @@ def naibr(inputs, output_dir, genome, vcf, min_sv, min_barcodes, min_quality, th
     workflowdir = f"{output_dir}/workflow"
     sdm = "conda" if conda else "conda apptainer"
     vcaller = "naibr" if populations is None else "naibr_pop"
-    vcaller += "_phase" if vcf is not None else ""
+    vcaller += "_phase" if vcf else ""
 
     command = f'snakemake --rerun-incomplete --show-failed-logs --rerun-triggers input mtime params --nolock --software-deployment-method {sdm} --conda-prefix ./.snakemake/conda --cores {threads} --directory . '
     command += f"--snakefile {workflowdir}/sv_{vcaller}.smk "
     command += f"--configfile {workflowdir}/config.yaml "
     if hpc:
         command += f"--workflow-profile {hpc} "
-    if snakemake is not None:
+    if snakemake:
         command += snakemake
 
     os.makedirs(f"{workflowdir}/", exist_ok= True)
     bamlist, n = parse_alignment_inputs(inputs)
     check_fasta(genome, quiet)
-    if populations is not None:
+    if populations:
+        validate_popfile(populations)
+        validate_popsamples(bamlist, populations, quiet)
         fetch_report(workflowdir, "naibr_pop.Rmd")
     fetch_report(workflowdir, "naibr.Rmd")
-    if vcf is not None:
+    if vcf:
         check_phase_vcf(vcf)
     fetch_rule(workflowdir, f"sv_{vcaller}.smk")
     os.makedirs(f"{output_dir}/logs/snakemake", exist_ok = True)
     sm_log = snakemake_log(output_dir, "sv_naibr")
-
+    configs = {
+        "workflow" : "sv naibr",
+        "snakemake_log" : sm_log,
+        "output_directory" : output_dir,
+        "min_barcodes" : min_barcodes,
+        "min_quality" : min_quality,
+        "min_sv" : min_sv,
+        "molecule_distance" : molecule_distance,
+        **({'extra': extra_params} if extra_params else {}),
+        "skip_reports" : skip_reports,
+        "workflow_call" : command.rstrip(),
+        "inputs" : {
+            **({'genome': Path(genome).resolve().as_posix()} if genome else {}),
+            **({'vcf': Path(vcf).resolve().as_posix()} if vcf else {}),
+            **({'groupings': Path(populations).resolve().as_posix()} if populations else {}),
+            "alignments" : [i.as_posix() for i in bamlist]
+        }
+    }
     with open(f'{workflowdir}/config.yaml', "w", encoding="utf-8") as config:
-        config.write("workflow: sv naibr\n")
-        config.write(f"snakemake_log: {sm_log}\n")
-        config.write(f"output_directory: {output_dir}\n")
-        config.write(f"min_barcodes: {min_barcodes}\n")
-        config.write(f"min_quality: {min_quality}\n")
-        config.write(f"min_sv: {min_sv}\n")
-        config.write(f"molecule_distance: {molecule_distance}\n")
-        if extra_params is not None:
-            config.write(f"extra: {extra_params}\n")
-        config.write(f"skip_reports: {skip_reports}\n")
-        config.write(f"workflow_call: {command}\n")
-        config.write("inputs:\n")
-        if vcf is not None:
-            config.write(f"  vcf: {Path(vcf).resolve()}\n")
-        if genome is not None:
-            config.write(f"  genome: {Path(genome).resolve()}\n")
-        if populations is not None:
-            # check for delimeter and formatting
-            validate_popfile(populations)
-            # check that samplenames and populations line up
-            validate_popsamples(bamlist, populations, quiet)
-            config.write(f"  groupings: {Path(populations).resolve()}\n")
-        config.write("  alignments:\n")
-        for i in bamlist:
-            config.write(f"    - {i}\n")
+        yaml.dump(configs, config, default_flow_style= False, sort_keys=False)
     create_conda_recipes()
     if setup_only:
@@ -236,7 +228,7 @@ def naibr(inputs, output_dir, genome, vcf, min_sv, min_barcodes, min_quality, th
     start_text.add_row("Perform Phasing:", "yes" if vcf else "no")
     start_text.add_row("Output Folder:", output_dir + "/")
     start_text.add_row("Workflow Log:", sm_log.replace(f"{output_dir}/", "") + "[dim].gz")
-    launch_snakemake(command, "sv_naibr", start_text, output_dir, sm_log, quiet)
+    launch_snakemake(command, "sv_naibr", start_text, output_dir, sm_log, quiet, "workflow/sv.naibr.summary")
 
 sv.add_command(leviathan)
 sv.add_command(naibr)
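Every launch_snakemake call in this diff gains a trailing argument naming the workflow's summary file (for example "workflow/sv.naibr.summary"). The real signature lives in harpy/_launch.py, which is not part of this diff; purely as a hypothetical sketch of how such an optional parameter could be threaded through:

    # Hypothetical sketch only: harpy's real launch_snakemake is defined in
    # harpy/_launch.py and is not shown here; summary_path is inferred from
    # the call sites above.
    def launch_snakemake(command, workflow, start_text, output_dir, sm_log,
                         quiet, summary_path=None):
        print(f"Running {workflow}: {command}")
        if summary_path:
            print(f"On success, see {output_dir}/{summary_path}")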
' command += f"--snakefile {workflowdir}/sv_{vcaller}.smk " command += f"--configfile {workflowdir}/config.yaml " if hpc: command += f"--workflow-profile {hpc} " - if snakemake is not None: + if snakemake: command += snakemake os.makedirs(f"{workflowdir}/", exist_ok= True) bamlist, n = parse_alignment_inputs(inputs) check_fasta(genome, quiet) - if populations is not None: + if populations: + validate_popfile(populations) + validate_popsamples(bamlist, populations, quiet) fetch_report(workflowdir, "naibr_pop.Rmd") fetch_report(workflowdir, "naibr.Rmd") - if vcf is not None: + if vcf: check_phase_vcf(vcf) fetch_rule(workflowdir, f"sv_{vcaller}.smk") os.makedirs(f"{output_dir}/logs/snakemake", exist_ok = True) sm_log = snakemake_log(output_dir, "sv_naibr") - + configs = { + "workflow" : "sv naibr", + "snakemake_log" : sm_log, + "output_directory" : output_dir, + "min_barcodes" : min_barcodes, + "min_quality" : min_quality, + "min_sv" : min_sv, + "molecule_distance" : molecule_distance, + **({'extra': extra_params} if extra_params else {}), + "skip_reports" : skip_reports, + "workflow_call" : command.rstrip(), + "inputs" : { + **({'genome': Path(genome).resolve().as_posix()} if genome else {}), + **({'vcf': Path(vcf).resolve().as_posix()} if vcf else {}), + **({'groupings': Path(populations).resolve().as_posix()} if populations else {}), + "alignments" : [i.as_posix() for i in bamlist] + } + } with open(f'{workflowdir}/config.yaml', "w", encoding="utf-8") as config: - config.write("workflow: sv naibr\n") - config.write(f"snakemake_log: {sm_log}\n") - config.write(f"output_directory: {output_dir}\n") - config.write(f"min_barcodes: {min_barcodes}\n") - config.write(f"min_quality: {min_quality}\n") - config.write(f"min_sv: {min_sv}\n") - config.write(f"molecule_distance: {molecule_distance}\n") - if extra_params is not None: - config.write(f"extra: {extra_params}\n") - config.write(f"skip_reports: {skip_reports}\n") - config.write(f"workflow_call: {command}\n") - config.write("inputs:\n") - if vcf is not None: - config.write(f" vcf: {Path(vcf).resolve()}\n") - if genome is not None: - config.write(f" genome: {Path(genome).resolve()}\n") - if populations is not None: - # check for delimeter and formatting - validate_popfile(populations) - # check that samplenames and populations line up - validate_popsamples(bamlist, populations, quiet) - config.write(f" groupings: {Path(populations).resolve()}\n") - config.write(" alignments:\n") - for i in bamlist: - config.write(f" - {i}\n") + yaml.dump(configs, config, default_flow_style= False, sort_keys=False) create_conda_recipes() if setup_only: @@ -236,7 +228,7 @@ def naibr(inputs, output_dir, genome, vcf, min_sv, min_barcodes, min_quality, th start_text.add_row("Perform Phasing:", "yes" if vcf else "no") start_text.add_row("Output Folder:", output_dir + "/") start_text.add_row("Workflow Log:", sm_log.replace(f"{output_dir}/", "") + "[dim].gz") - launch_snakemake(command, "sv_naibr", start_text, output_dir, sm_log, quiet) + launch_snakemake(command, "sv_naibr", start_text, output_dir, sm_log, quiet, "workflow/sv.naibr.summary") sv.add_command(leviathan) sv.add_command(naibr) diff --git a/resources/harpy.yaml b/resources/harpy.yaml index 76b0d8628..8dd249b61 100644 --- a/resources/harpy.yaml +++ b/resources/harpy.yaml @@ -6,7 +6,6 @@ dependencies: - apptainer - bcftools =1.20 - conda >24.7 - - pandas - pysam - python - rich-click diff --git a/resources/meta.yaml b/resources/meta.yaml index 36ebfc6d1..f7b81569d 100644 --- a/resources/meta.yaml +++ 
@@ -36,7 +36,6 @@ requirements:
   - apptainer
   - bcftools =1.20
   - conda >24.7
-  - pandas
   - pysam
   - python
   - rich-click
diff --git a/test/stitch.params b/test/stitch.params
index 712bb7e13..ce21b5e6e 100644
--- a/test/stitch.params
+++ b/test/stitch.params
@@ -1,3 +1,3 @@
-model usebx bxlimit k s ngen
-diploid TRUE 50000 3 2 10
-diploid TRUE 50000 3 1 5
\ No newline at end of file
+name model usebx bxlimit k s ngen
+k10_ng50 diploid TRUE 50000 10 1 50
+high_ngen diploid TRUE 50000 15 1 100
\ No newline at end of file
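With pandas dropped from both resources/harpy.yaml and resources/meta.yaml, the updated stitch.params layout (whitespace-delimited, now with a leading name column) can be read with the standard library alone. A sketch under that assumption; harpy's actual parameter-file reader is not shown in this diff and may differ:

    # Sketch: stdlib-only parsing of the new stitch.params layout.
    with open("test/stitch.params") as f:
        rows = [line.split() for line in f if line.strip()]

    header, records = rows[0], rows[1:]
    for rec in records:
        params = dict(zip(header, rec))  # e.g. {'name': 'k10_ng50', 'model': 'diploid', ...}
        print(params["name"], params["k"], params["ngen"])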