Skip to content

Commit

Permalink
Merge pull request #168 from pdimens/pop_out_simuG
Browse files Browse the repository at this point in the history
swap simuG to conda-based install
  • Loading branch information
pdimens authored Nov 26, 2024
2 parents 736e540 + caa8df3 commit 46f5c80
Show file tree
Hide file tree
Showing 49 changed files with 370 additions and 4,597 deletions.
8 changes: 4 additions & 4 deletions .github/workflows/tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -689,25 +689,25 @@ jobs:
shell: micromamba-shell {0}
run: |
harpy simulate snpindel --quiet --snp-count 10 --indel-count 10 -z 0.5 test/genome/genome.fasta.gz
harpy simulate snpindel --quiet --prefix Simulate/snpvcf --snp-vcf Simulate/snpindel/diploid/sim.snp.hap1.vcf --indel-vcf Simulate/snpindel/diploid/sim.indel.hap1.vcf test/genome/genome.fasta.gz
harpy simulate snpindel --quiet --prefix Simulate/snpvcf --snp-vcf Simulate/snpindel/haplotype_1/sim.hap1.snp.vcf --indel-vcf Simulate/snpindel/haplotype_1/sim.hap1.indel.vcf test/genome/genome.fasta.gz
- name: simulate inversions
shell: micromamba-shell {0}
if: always()
run: |
harpy simulate inversion --quiet --count 10 -z 0.5 test/genome/genome.fasta.gz
harpy simulate inversion --quiet --prefix Simulate/invvcf --vcf Simulate/inversion/diploid/sim.inversion.hap1.vcf test/genome/genome.fasta.gz
harpy simulate inversion --quiet --prefix Simulate/invvcf --vcf Simulate/inversion/haplotype_1/sim.hap1.inversion.vcf test/genome/genome.fasta.gz
- name: simulate cnv
shell: micromamba-shell {0}
if: always()
run: |
harpy simulate cnv --quiet --count 10 -z 0.5 test/genome/genome.fasta.gz
harpy simulate cnv --quiet --prefix Simulate/cnvvcf --vcf Simulate/cnv/diploid/sim.cnv.hap1.vcf test/genome/genome.fasta.gz
harpy simulate cnv --quiet --prefix Simulate/cnvvcf --vcf Simulate/cnv/haplotype_1/sim.hap1.cnv.vcf test/genome/genome.fasta.gz
- name: simulate translocations
shell: micromamba-shell {0}
if: always()
run: |
harpy simulate translocation --quiet --count 10 -z 0.5 test/genome/genome.fasta.gz
harpy simulate translocation --quiet --prefix Simulate/transvcf --vcf Simulate/translocation/diploid/sim.translocation.hap1.vcf test/genome/genome.fasta.gz
harpy simulate translocation --quiet --prefix Simulate/transvcf --vcf Simulate/translocation/haplotype_1/sim.hap1.translocation.vcf test/genome/genome.fasta.gz
simulate_linkedreads:
needs: [changes, pkgbuild]
Expand Down
1 change: 0 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
.snakemake/
.vscode/
.harpy_envs/
.condarc
.cache/
hpc/
Expand Down
1 change: 1 addition & 0 deletions harpy/_cli_types_generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ def convert(self, value, param, ctx):

class InputFile(click.ParamType):
"""A class for a click type that verifies that a file exists and that it has an expected extension"""
name = "input_file"
def __init__(self, filetype, gzip_ok):
super().__init__()
self.filetype = filetype
Expand Down
82 changes: 61 additions & 21 deletions harpy/_conda.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,17 @@
"""Creates environment recipes for all the Harpy workflows"""

import os
import sys
import yaml
from rich import box
from rich.table import Table
from rich import print as rprint
from ._printing import print_error, print_solution_with_culprits

def create_conda_recipes():
def create_conda_recipes(outdir: str, envs: list=None) -> None:
"""Create the YAML files of the workflow conda dependencies"""
condachannels = ["bioconda","conda-forge"]
environ = {
"align": [
"align" : [
"bioconda::bwa",
"bioconda::ema",
"bioconda::samtools=1.20",
Expand All @@ -17,7 +22,7 @@ def create_conda_recipes():
"conda-forge::libzlib",
"conda-forge::xz"
],
"assembly":[
"assembly" : [
"bioconda::arcs",
"bioconda::bwa",
"bioconda::cloudspades",
Expand Down Expand Up @@ -63,36 +68,71 @@ def create_conda_recipes():
"bioconda::perl-math-random",
"bioconda::perl-inline-c",
"bioconda::perl-parse-recdescent",
"bioconda::simug>1.0.0",
"conda-forge::numpy",
"conda-forge::perl"
],
"spades": [
"spades" : [
"conda-forge::python=3"
],
"stitch" : [
"bioconda::r-stitch=1.6.10"
],
"variants": [
"variants" : [
"bioconda::bcftools=1.20",
"bioconda::freebayes=1.3.6",
"bioconda::leviathan",
"bioconda::naibr-plus"
]
}

os.makedirs(".harpy_envs", exist_ok = True)
# overwrites existing
for env,deps in environ.items():
with open(f".harpy_envs/{env}.yaml", mode="w", encoding="utf-8") as yml:
yml.write(f"name: {env}\n")
yml.write("channels:\n - ")
yml.write("\n - ".join(condachannels))
yml.write("\ndependencies:\n - ")
yml.write("\n - ".join(deps) + "\n")
os.makedirs(f"{outdir}/workflow/envs", exist_ok = True)
# if none provided, use all
if not envs:
envs = environ.keys()

# post-deployment scripts
with open(".harpy_envs/spades.post-deploy.sh", "w", encoding="utf-8") as shellscript:
shellscript.write("wget -O .spades.tar.gz https://github.com/ablab/spades/releases/download/v4.0.0/SPAdes-4.0.0-Linux.tar.gz\n")
shellscript.write("tar -xvzf .spades.tar.gz && rm .spades.tar.gz\n")
shellscript.write("mv SPAdes-4.0.0-Linux/bin/* ${CONDA_PREFIX}/bin && mv SPAdes-4.0.0-Linux/share/* ${CONDA_PREFIX}/share\n")
shellscript.write("rm -r SPAdes-4.0.0-Linux\n")
for i in envs:
try:
env_dict = {
"name" : i,
"channels" : ["bioconda","conda-forge"],
"dependencies": environ[i]
}
except KeyError:
sys.stderr.write(f"Key '{i}' is not an available conda environment name. The options are: " + ", ".join(environ.keys()))
sys.exit(1)
with open(f"{outdir}/workflow/envs/{i}.yaml", "w", encoding="utf-8") as recipe:
yaml.dump(env_dict, recipe, default_flow_style= False, sort_keys=False, width=float('inf'), indent=2)

if "spades" in envs:
# post-deployment script
with open(f"{outdir}/workflow/envs/spades.post-deploy.sh", "w", encoding="utf-8") as shellscript:
shellscript.write("wget -O .spades.tar.gz https://github.com/ablab/spades/releases/download/v4.0.0/SPAdes-4.0.0-Linux.tar.gz\n")
shellscript.write("tar -xvzf .spades.tar.gz && rm .spades.tar.gz\n")
shellscript.write("mv SPAdes-4.0.0-Linux/bin/* ${CONDA_PREFIX}/bin && mv SPAdes-4.0.0-Linux/share/* ${CONDA_PREFIX}/share\n")
shellscript.write("rm -r SPAdes-4.0.0-Linux\n")

def check_environments(dirpath: str, envs: list) -> None:
    """Check that ``dirpath`` contains a conda recipe for every requested environment.

    For each name in ``envs`` the file ``<dirpath>/workflow/envs/<name>.yaml``
    is expected to exist. If the ``workflow/envs`` directory is absent, or at
    least one recipe is missing, an error is reported and the process exits
    with status 1. For missing recipes, a table of present/missing files is
    also written to stderr.

    Args:
        dirpath: directory expected to contain ``workflow/envs``.
        envs: conda environment names, without the ``.yaml`` suffix.
    """
    # Build the path once so the existence check, the listing and the error
    # message all refer to the same directory.
    envdir = f"{dirpath}/workflow/envs"
    if not os.path.exists(envdir):
        print_error("missing conda files", "This working directory does not contain the expected directory of conda environment definitions ([blue bold]workflow/envs/[/blue bold])\n  - use [green bold]--conda[/green bold] to recreate it")
        sys.exit(1)
    envlist = os.listdir(envdir)
    errcount = 0
    errtable = Table(show_footer=True, box=box.SIMPLE)
    errtable.add_column("File", justify="left", no_wrap=True)
    errtable.add_column("Status", justify="center")
    for i in envs:
        if f"{i}.yaml" in envlist:
            errtable.add_row(f"[dim]{i}.yaml", "[dim]present")
        else:
            errcount += 1
            errtable.add_row(f"[yellow bold]{i}.yaml", "[yellow bold]missing")
    if errcount > 0:
        # Report the directory that was actually inspected (workflow/envs,
        # not workflows/envs) so the user is pointed at a real path.
        print_error("Missing environment files", f"The directory [blue]{envdir}[/blue] is missing [yellow bold]{errcount}[/yellow bold] of the expected conda environment definition files.")
        print_solution_with_culprits(
            "Check that the names conform to Harpy's expectations, otherwise you can recreate this directory using the [green bold]--conda[/green bold] option.",
            "Expected environment files:"
        )
        rprint(errtable, file = sys.stderr)
        sys.exit(1)
2 changes: 1 addition & 1 deletion harpy/_printing.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ def print_setup_error(exitcode):
errortext = "Something is wrong with the Snakefile for this workflow. If you manually edited the Snakefile, see the error below for troubleshooting. If you didn't, it's probably a bug (oops!) and you should submit an issue on GitHub: [bold]https://github.com/pdimens/harpy/issues"
errortype = "Snakefile Error"
else:
errortext = "There was an issue creating the software environment(s) necessary to run this workflow. If you manually edited the conda dependencies in [blue].harpy_envs[/blue], see the error below for troubleshooting. If you didn't, it might be a bug or related to how your system is setup for Conda or Singularity environments and you should submit an issue on GitHub: [bold]https://github.com/pdimens/harpy/issues"
errortext = "There was an issue creating the software environment(s) necessary to run this workflow. If you manually edited the conda dependencies in [blue]/workflows/envs[/blue], see the error below for troubleshooting. If you didn't, it might be a bug or related to how your system is setup for Conda or Singularity environments and you should submit an issue on GitHub: [bold]https://github.com/pdimens/harpy/issues"
errortype = "Software Environment Error"

rprint(
Expand Down
26 changes: 0 additions & 26 deletions harpy/_validations.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,32 +32,6 @@ def is_plaintext(file_path):
except UnicodeDecodeError:
return False

def check_envdir(dirpath):
    """Check that ``dirpath`` exists and holds every expected conda recipe.

    The expected recipes are ``<name>.yaml`` for each name in the fixed list
    below. If the directory is absent, or any recipe is missing, an error is
    reported and the process exits with status 1. For missing recipes, a table
    marking each expected file as present or missing is also written to stderr.

    Args:
        dirpath: path of the directory holding the conda environment recipes.
    """
    if not os.path.exists(dirpath):
        print_error("missing conda files", "This working directory does not contain the expected directory of conda environment definitions ([blue bold].harpy_envs/[/blue bold])\n  - use [green bold]--conda[/green bold] to recreate it")
        sys.exit(1)
    envlist = os.listdir(dirpath)
    # every recipe is required, regardless of which workflow is being run
    envs = ["align", "metassembly", "phase", "qc", "r", "simulations", "stitch", "variants"]
    errcount = 0
    errtable = Table(show_footer=True, box=box.SIMPLE)
    errtable.add_column("File", justify="left", style="blue", no_wrap=True)
    errtable.add_column("Exists", justify="center")
    for i in envs:
        if f"{i}.yaml" in envlist:
            errtable.add_row(f"{i}.yaml", "[blue]✓")
        else:
            errcount += 1
            errtable.add_row(f"{i}.yaml", "[yellow]🗙")
    if errcount > 0:
        print_error("missing conda files", f"The conda environment definition directory ([blue bold]{dirpath}[/blue bold]) is missing [yellow bold]{errcount}[/yellow bold] of the expected definition files. All of the environment files are expected to be present, even if a particular workflow doesn't use it.")
        print_solution_with_culprits(
            "Check that the names conform to Harpy's expectations, otherwise you can recreate this directory using the [green bold]--conda[/green bold] option.",
            "Expected environment files:"
        )
        rprint(errtable, file = sys.stderr)
        sys.exit(1)

def check_impute_params(parameters):
"""Validate the STITCH parameter file for column names, order, types, missing values, etc."""
with open(parameters, "r", encoding="utf-8") as paramfile:
Expand Down
13 changes: 10 additions & 3 deletions harpy/align.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,7 @@ def bwa(inputs, output_dir, genome, depth_window, threads, keep_unmapped, extra_
fetch_report(workflowdir, "align_bxstats.Rmd")
os.makedirs(f"{output_dir}/logs/snakemake", exist_ok = True)
sm_log = snakemake_log(output_dir, "align_bwa")
conda_envs = ["align", "r", "qc"]
configs = {
"workflow" : "align bwa",
"snakemake_log" : sm_log,
Expand All @@ -123,6 +124,7 @@ def bwa(inputs, output_dir, genome, depth_window, threads, keep_unmapped, extra_
"depth_windowsize" : depth_window,
**({'extra': extra_params} if extra_params else {}),
"workflow_call" : command.rstrip(),
"conda_environments" : conda_envs,
"reports" : {
"skip": skip_reports,
**({'plot_contigs': contigs} if contigs else {'plot_contigs': "default"}),
Expand All @@ -135,7 +137,7 @@ def bwa(inputs, output_dir, genome, depth_window, threads, keep_unmapped, extra_
with open(os.path.join(workflowdir, 'config.yaml'), "w", encoding="utf-8") as config:
yaml.dump(configs, config, default_flow_style= False, sort_keys=False, width=float('inf'))

create_conda_recipes()
create_conda_recipes(output_dir, conda_envs)
if setup_only:
sys.exit(0)

Expand Down Expand Up @@ -216,6 +218,8 @@ def ema(inputs, output_dir, platform, barcode_list, fragment_density, genome, de
fetch_report(workflowdir, "align_bxstats.Rmd")
os.makedirs(f"{output_dir}/logs/snakemake", exist_ok = True)
sm_log = snakemake_log(output_dir, "align_ema")
conda_envs = ["align", "r", "qc"]

configs = {
"workflow" : "align ema",
"snakemake_log" : sm_log,
Expand All @@ -228,6 +232,7 @@ def ema(inputs, output_dir, platform, barcode_list, fragment_density, genome, de
"EMA_bins" : ema_bins,
**({'extra': extra_params} if extra_params else {}),
"workflow_call" : command.rstrip(),
"conda_environments" : conda_envs,
"reports" : {
"skip": skip_reports,
**({'plot_contigs': contigs} if contigs else {'plot_contigs': "default"}),
Expand All @@ -242,7 +247,7 @@ def ema(inputs, output_dir, platform, barcode_list, fragment_density, genome, de
with open(os.path.join(workflowdir, 'config.yaml'), "w", encoding="utf-8") as config:
yaml.dump(configs, config, default_flow_style= False, sort_keys=False, width=float('inf'))

create_conda_recipes()
create_conda_recipes(output_dir, conda_envs)
if setup_only:
sys.exit(0)

Expand Down Expand Up @@ -309,6 +314,7 @@ def strobe(inputs, output_dir, genome, read_length, keep_unmapped, depth_window,
fetch_report(workflowdir, "align_bxstats.Rmd")
os.makedirs(f"{output_dir}/logs/snakemake", exist_ok = True)
sm_log = snakemake_log(output_dir, "align_strobe")
conda_envs = ["align", "r", "qc"]
configs = {
"workflow" : "align strobe",
"snakemake_log" : sm_log,
Expand All @@ -320,6 +326,7 @@ def strobe(inputs, output_dir, genome, read_length, keep_unmapped, depth_window,
"depth_windowsize" : depth_window,
**({'extra': extra_params} if extra_params else {}),
"workflow_call" : command.rstrip(),
"conda_environments" : conda_envs,
"reports" : {
"skip": skip_reports,
**({'plot_contigs': contigs} if contigs else {'plot_contigs': "default"}),
Expand All @@ -332,7 +339,7 @@ def strobe(inputs, output_dir, genome, read_length, keep_unmapped, depth_window,
with open(os.path.join(workflowdir, 'config.yaml'), "w", encoding="utf-8") as config:
yaml.dump(configs, config, default_flow_style= False, sort_keys=False, width=float('inf'))

create_conda_recipes()
create_conda_recipes(output_dir, conda_envs)
if setup_only:
sys.exit(0)

Expand Down
4 changes: 3 additions & 1 deletion harpy/assembly.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,7 @@ def assembly(fastq_r1, fastq_r2, bx_tag, kmer_length, max_memory, output_dir, ex
fetch_rule(workflowdir, f"{asm}.smk")
os.makedirs(f"{output_dir}/logs/snakemake", exist_ok = True)
sm_log = snakemake_log(output_dir, asm)
conda_envs = ["assembly","qc"]
configs = {
"workflow" : asm,
"snakemake_log" : sm_log,
Expand Down Expand Up @@ -111,6 +112,7 @@ def assembly(fastq_r1, fastq_r2, bx_tag, kmer_length, max_memory, output_dir, ex
"minimum_links" : links
},
"workflow_call" : command.rstrip(),
"conda_environments" : conda_envs,
"reports" : {
"skip": skip_reports,
"organism_type": organism_type
Expand All @@ -123,7 +125,7 @@ def assembly(fastq_r1, fastq_r2, bx_tag, kmer_length, max_memory, output_dir, ex
with open(os.path.join(workflowdir, 'config.yaml'), "w", encoding="utf-8") as config:
yaml.dump(configs, config, default_flow_style= False, sort_keys=False, width=float('inf'))

create_conda_recipes()
create_conda_recipes(output_dir, conda_envs)
if setup_only:
sys.exit(0)

Expand Down
3 changes: 2 additions & 1 deletion harpy/container.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,8 @@ def containerize():
**INTERNAL USE ONLY**. Used to recreate all the conda environments required
by the workflows and build a dockerfile from that.
"""
create_conda_recipes()
#TODO MAKE THIS ALL OF THEM
create_conda_recipes("container")
fetch_rule(os.getcwd(), "containerize.smk")

with open("Dockerfile", "w", encoding = "utf-8") as dockerfile:
Expand Down
4 changes: 3 additions & 1 deletion harpy/deconvolve.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@ def deconvolve(inputs, output_dir, kmer_length, window_size, density, dropout, t
fetch_rule(workflowdir, "deconvolve.smk")
os.makedirs(f"{output_dir}/logs/snakemake", exist_ok = True)
sm_log = snakemake_log(output_dir, "deconvolve")
conda_envs = ["qc"]
configs = {
"workflow": "deconvolve",
"snakemake_log" : sm_log,
Expand All @@ -73,12 +74,13 @@ def deconvolve(inputs, output_dir, kmer_length, window_size, density, dropout, t
"density" : density,
"dropout" : dropout,
"workflow_call" : command.rstrip(),
"conda_environments" : conda_envs,
"inputs": [i.as_posix() for i in fqlist]
}
with open(os.path.join(workflowdir, 'config.yaml'), "w", encoding="utf-8") as config:
yaml.dump(configs, config, default_flow_style= False, sort_keys=False, width=float('inf'))

create_conda_recipes()
create_conda_recipes(output_dir, conda_envs)
if setup_only:
sys.exit(0)

Expand Down
8 changes: 5 additions & 3 deletions harpy/demultiplex.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ def demultiplex():
}

@click.command(no_args_is_help = True, context_settings=dict(allow_interspersed_args=False), epilog = "Documentation: https://pdimens.github.io/harpy/workflows/demultiplex/")
@click.option('-s', '--schema', required = True, type=click.Path(exists=True, dir_okay=False, readable=True), help = 'Tab-delimited file of sample\<tab\>barcode')
@click.option('-s', '--schema', required = True, type=click.Path(exists=True, dir_okay=False, readable=True), help = 'File of `sample`\\<TAB\\>`barcode`')
@click.option('-t', '--threads', default = 4, show_default = True, type = click.IntRange(min = 1, max_open = True), help = 'Number of threads to use')
@click.option('-o', '--output-dir', type = click.Path(exists = False), default = "Demultiplex", show_default=True, help = 'Output directory name')
@click.option('--conda', is_flag = True, default = False, help = 'Use conda/mamba instead of a container')
Expand All @@ -57,7 +57,7 @@ def gen1(r1_fq, r2_fq, i1_fq, i2_fq, output_dir, schema, threads, snakemake, ski
Demultiplex Generation I haplotagged FASTQ files
Use the R1, R2, I1, and I2 FASTQ files provided by the sequencing facility as inputs (in that exact order) provided after the options.
The `--schema` must be **tab** (or space) delimited, have **no header** (i.e. no column names), and be in the format of `sample`\<tab\>`barcode`,
The `--schema` must be **tab** (or space) delimited, have **no header** (i.e. no column names), and be in the format of `sample`\\<TAB\\>`barcode`,
where `barcode` is the C- beadtag assigned to the sample (.e.g. `C01`, `C02`, etc.)
"""
output_dir = output_dir.rstrip("/")
Expand All @@ -76,11 +76,13 @@ def gen1(r1_fq, r2_fq, i1_fq, i2_fq, output_dir, schema, threads, snakemake, ski
fetch_rule(workflowdir, "demultiplex_gen1.smk")
os.makedirs(f"{output_dir}/logs/snakemake", exist_ok = True)
sm_log = snakemake_log(output_dir, "demultiplex_gen1")
conda_envs = ["qc"]
configs = {
"workflow" : "demultiplex gen1",
"snakemake_log" : sm_log,
"output_directory" : output_dir,
"workflow_call" : command.rstrip(),
"conda_environments" : conda_envs,
"reports" : {
"skip": skip_reports
},
Expand All @@ -95,7 +97,7 @@ def gen1(r1_fq, r2_fq, i1_fq, i2_fq, output_dir, schema, threads, snakemake, ski
with open(os.path.join(workflowdir, 'config.yaml'), "w", encoding= "utf-8") as config:
yaml.dump(configs, config, default_flow_style= False, sort_keys=False, width=float('inf'))

create_conda_recipes()
create_conda_recipes(output_dir, conda_envs)
if setup_only:
sys.exit(0)

Expand Down
Loading

0 comments on commit 46f5c80

Please sign in to comment.