diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 6e5c8d329..0204e0c3d 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -689,25 +689,25 @@ jobs: shell: micromamba-shell {0} run: | harpy simulate snpindel --quiet --snp-count 10 --indel-count 10 -z 0.5 test/genome/genome.fasta.gz - harpy simulate snpindel --quiet --prefix Simulate/snpvcf --snp-vcf Simulate/snpindel/diploid/sim.snp.hap1.vcf --indel-vcf Simulate/snpindel/diploid/sim.indel.hap1.vcf test/genome/genome.fasta.gz + harpy simulate snpindel --quiet --prefix Simulate/snpvcf --snp-vcf Simulate/snpindel/haplotype_1/sim.hap1.snp.vcf --indel-vcf Simulate/snpindel/haplotype_1/sim.hap1.indel.vcf test/genome/genome.fasta.gz - name: simulate inversions shell: micromamba-shell {0} if: always() run: | harpy simulate inversion --quiet --count 10 -z 0.5 test/genome/genome.fasta.gz - harpy simulate inversion --quiet --prefix Simulate/invvcf --vcf Simulate/inversion/diploid/sim.inversion.hap1.vcf test/genome/genome.fasta.gz + harpy simulate inversion --quiet --prefix Simulate/invvcf --vcf Simulate/inversion/haplotype_1/sim.hap1.inversion.vcf test/genome/genome.fasta.gz - name: simulate cnv shell: micromamba-shell {0} if: always() run: | harpy simulate cnv --quiet --count 10 -z 0.5 test/genome/genome.fasta.gz - harpy simulate cnv --quiet --prefix Simulate/cnvvcf --vcf Simulate/cnv/diploid/sim.cnv.hap1.vcf test/genome/genome.fasta.gz + harpy simulate cnv --quiet --prefix Simulate/cnvvcf --vcf Simulate/cnv/haplotype_1/sim.hap1.cnv.vcf test/genome/genome.fasta.gz - name: simulate translocations shell: micromamba-shell {0} if: always() run: | harpy simulate translocation --quiet --count 10 -z 0.5 test/genome/genome.fasta.gz - harpy simulate translocation --quiet --prefix Simulate/transvcf --vcf Simulate/translocation/diploid/sim.translocation.hap1.vcf test/genome/genome.fasta.gz + harpy simulate translocation --quiet --prefix Simulate/transvcf --vcf Simulate/translocation/haplotype_1/sim.hap1.translocation.vcf test/genome/genome.fasta.gz simulate_linkedreads: needs: [changes, pkgbuild] diff --git a/.gitignore b/.gitignore index cc4204d04..3fee04828 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,5 @@ .snakemake/ .vscode/ -.harpy_envs/ .condarc .cache/ hpc/ diff --git a/harpy/_cli_types_generic.py b/harpy/_cli_types_generic.py index 77d01bf25..311a2a2dd 100644 --- a/harpy/_cli_types_generic.py +++ b/harpy/_cli_types_generic.py @@ -56,6 +56,7 @@ def convert(self, value, param, ctx): class InputFile(click.ParamType): """A class for a click type that verifies that a file exists and that it has an expected extension""" + name = "input_file" def __init__(self, filetype, gzip_ok): super().__init__() self.filetype = filetype diff --git a/harpy/_conda.py b/harpy/_conda.py index 08c0b93e2..e921ed6f9 100644 --- a/harpy/_conda.py +++ b/harpy/_conda.py @@ -1,12 +1,17 @@ """Creates environment recipes for all the Harpy workflows""" import os +import sys +import yaml +from rich import box +from rich.table import Table +from rich import print as rprint +from ._printing import print_error, print_solution_with_culprits -def create_conda_recipes(): +def create_conda_recipes(outdir: str, envs: list=None) -> None: """Create the YAML files of the workflow conda dependencies""" - condachannels = ["bioconda","conda-forge"] environ = { - "align": [ + "align" : [ "bioconda::bwa", "bioconda::ema", "bioconda::samtools=1.20", @@ -17,7 +22,7 @@ def create_conda_recipes(): "conda-forge::libzlib", "conda-forge::xz" ], - 
"assembly":[ + "assembly" : [ "bioconda::arcs", "bioconda::bwa", "bioconda::cloudspades", @@ -63,16 +68,17 @@ def create_conda_recipes(): "bioconda::perl-math-random", "bioconda::perl-inline-c", "bioconda::perl-parse-recdescent", + "bioconda::simug>1.0.0", "conda-forge::numpy", "conda-forge::perl" ], - "spades": [ + "spades" : [ "conda-forge::python=3" ], "stitch" : [ "bioconda::r-stitch=1.6.10" ], - "variants": [ + "variants" : [ "bioconda::bcftools=1.20", "bioconda::freebayes=1.3.6", "bioconda::leviathan", @@ -80,19 +86,53 @@ def create_conda_recipes(): ] } - os.makedirs(".harpy_envs", exist_ok = True) - # overwrites existing - for env,deps in environ.items(): - with open(f".harpy_envs/{env}.yaml", mode="w", encoding="utf-8") as yml: - yml.write(f"name: {env}\n") - yml.write("channels:\n - ") - yml.write("\n - ".join(condachannels)) - yml.write("\ndependencies:\n - ") - yml.write("\n - ".join(deps) + "\n") + os.makedirs(f"{outdir}/workflow/envs", exist_ok = True) + # if none provided, use all + if not envs: + envs = environ.keys() - # post-deployment scripts - with open(".harpy_envs/spades.post-deploy.sh", "w", encoding="utf-8") as shellscript: - shellscript.write("wget -O .spades.tar.gz https://github.com/ablab/spades/releases/download/v4.0.0/SPAdes-4.0.0-Linux.tar.gz\n") - shellscript.write("tar -xvzf .spades.tar.gz && rm .spades.tar.gz\n") - shellscript.write("mv SPAdes-4.0.0-Linux/bin/* ${CONDA_PREFIX}/bin && mv SPAdes-4.0.0-Linux/share/* ${CONDA_PREFIX}/share\n") - shellscript.write("rm -r SPAdes-4.0.0-Linux\n") \ No newline at end of file + for i in envs: + try: + env_dict = { + "name" : i, + "channels" : ["bioconda","conda-forge"], + "dependencies": environ[i] + } + except KeyError: + sys.stderr.write(f"Key '{i}' is not an available conda environment name. 
The options are: " + ", ".join(environ.keys()))
+            sys.exit(1)
+        with open(f"{outdir}/workflow/envs/{i}.yaml", "w", encoding="utf-8") as recipe:
+            yaml.dump(env_dict, recipe, default_flow_style= False, sort_keys=False, width=float('inf'), indent=2)
+
+    if "spades" in envs:
+        # post-deployment script
+        with open(f"{outdir}/workflow/envs/spades.post-deploy.sh", "w", encoding="utf-8") as shellscript:
+            shellscript.write("wget -O .spades.tar.gz https://github.com/ablab/spades/releases/download/v4.0.0/SPAdes-4.0.0-Linux.tar.gz\n")
+            shellscript.write("tar -xvzf .spades.tar.gz && rm .spades.tar.gz\n")
+            shellscript.write("mv SPAdes-4.0.0-Linux/bin/* ${CONDA_PREFIX}/bin && mv SPAdes-4.0.0-Linux/share/* ${CONDA_PREFIX}/share\n")
+            shellscript.write("rm -r SPAdes-4.0.0-Linux\n")
+
+def check_environments(dirpath: str, envs: list) -> None:
+    """Check that the provided dir exists and contains the necessary environment definitions"""
+    if not os.path.exists(f"{dirpath}/workflow/envs"):
+        print_error("missing conda files", "This working directory does not contain the expected directory of conda environment definitions ([blue bold]workflow/envs/[/blue bold])\n - use [green bold]--conda[/green bold] to recreate it")
+        sys.exit(1)
+    envlist = os.listdir(f"{dirpath}/workflow/envs")
+    errcount = 0
+    errtable = Table(show_footer=True, box=box.SIMPLE)
+    errtable.add_column("File", justify="left", no_wrap=True)
+    errtable.add_column("Status", justify="center")
+    for i in envs:
+        if f"{i}.yaml" in envlist:
+            errtable.add_row(f"[dim]{i}.yaml", "[dim]present")
+        else:
+            errcount += 1
+            errtable.add_row(f"[yellow bold]{i}.yaml", "[yellow bold]missing")
+    if errcount > 0:
+        print_error("Missing environment files", f"The directory [blue]{dirpath}/workflow/envs[/blue] is missing [yellow bold]{errcount}[/yellow bold] of the expected conda environment definition files.")
+        print_solution_with_culprits(
+            "Check that the names conform to Harpy's expectations, otherwise you can recreate this directory using the [green bold]--conda[/green bold] option.",
+            "Expected environment files:"
+        )
+        rprint(errtable, file = sys.stderr)
+        sys.exit(1)
\ No newline at end of file
diff --git a/harpy/_printing.py b/harpy/_printing.py
index f40b6c444..21d88d4c4 100644
--- a/harpy/_printing.py
+++ b/harpy/_printing.py
@@ -23,7 +23,7 @@ def print_setup_error(exitcode):
         errortext = "Something is wrong with the Snakefile for this workflow. If you manually edited the Snakefile, see the error below for troubleshooting. If you didn't, it's probably a bug (oops!) and you should submit an issue on GitHub: [bold]https://github.com/pdimens/harpy/issues"
         errortype = "Snakefile Error"
     else:
-        errortext = "There was an issue creating the software environment(s) necessary to run this workflow. If you manually edited the conda dependencies in [blue].harpy_envs[/blue], see the error below for troubleshooting. If you didn't, it might be a bug or related to how your system is setup for Conda or Singularity environments and you should submit an issue on GitHub: [bold]https://github.com/pdimens/harpy/issues"
+        errortext = "There was an issue creating the software environment(s) necessary to run this workflow. If you manually edited the conda dependencies in [blue]workflow/envs[/blue], see the error below for troubleshooting. 
If you didn't, it might be a bug or related to how your system is setup for Conda or Singularity environments and you should submit an issue on GitHub: [bold]https://github.com/pdimens/harpy/issues" errortype = "Software Environment Error" rprint( diff --git a/harpy/_validations.py b/harpy/_validations.py index f659dc6d3..5fb039a06 100644 --- a/harpy/_validations.py +++ b/harpy/_validations.py @@ -32,32 +32,6 @@ def is_plaintext(file_path): except UnicodeDecodeError: return False -def check_envdir(dirpath): - """Check that the provided dir exists and contains the necessary environment definitions""" - if not os.path.exists(dirpath): - print_error("missing conda files", "This working directory does not contain the expected directory of conda environment definitions ([blue bold].harpy_envs/[/blue bold])\n - use [green bold]--conda[/green bold] to recreate it") - sys.exit(1) - envlist = os.listdir(dirpath) - envs = ["align", "metassembly", "phase", "qc", "r", "simulations", "stitch", "variants"] - errcount = 0 - errtable = Table(show_footer=True, box=box.SIMPLE) - errtable.add_column("File", justify="left", style="blue", no_wrap=True) - errtable.add_column("Exists", justify="center") - for i in envs: - if f"{i}.yaml" in envlist: - errtable.add_row(f"{i}.yaml", "[blue]โœ“") - else: - errcount += 1 - errtable.add_row(f"{i}.yaml", "[yellow]๐Ÿ—™") - if errcount > 0: - print_error("missing conda files", f"The conda environment definition directory ([blue bold]{dirpath}[/blue bold]) is missing [yellow bold]{errcount}[/yellow bold] of the expected definition files. All of the environment files are expected to be present, even if a particular workflow doesn't use it.") - print_solution_with_culprits( - "Check that the names conform to Harpy's expectations, otheriwse you can recreate this directory using the [green bold]--conda[/green bold] option.", - "Expected environment files:" - ) - rprint(errtable, file = sys.stderr) - sys.exit(1) - def check_impute_params(parameters): """Validate the STITCH parameter file for column names, order, types, missing values, etc.""" with open(parameters, "r", encoding="utf-8") as paramfile: diff --git a/harpy/align.py b/harpy/align.py index d050a77b9..e8f81a5d3 100644 --- a/harpy/align.py +++ b/harpy/align.py @@ -113,6 +113,7 @@ def bwa(inputs, output_dir, genome, depth_window, threads, keep_unmapped, extra_ fetch_report(workflowdir, "align_bxstats.Rmd") os.makedirs(f"{output_dir}/logs/snakemake", exist_ok = True) sm_log = snakemake_log(output_dir, "align_bwa") + conda_envs = ["align", "r", "qc"] configs = { "workflow" : "align bwa", "snakemake_log" : sm_log, @@ -123,6 +124,7 @@ def bwa(inputs, output_dir, genome, depth_window, threads, keep_unmapped, extra_ "depth_windowsize" : depth_window, **({'extra': extra_params} if extra_params else {}), "workflow_call" : command.rstrip(), + "conda_environments" : conda_envs, "reports" : { "skip": skip_reports, **({'plot_contigs': contigs} if contigs else {'plot_contigs': "default"}), @@ -135,7 +137,7 @@ def bwa(inputs, output_dir, genome, depth_window, threads, keep_unmapped, extra_ with open(os.path.join(workflowdir, 'config.yaml'), "w", encoding="utf-8") as config: yaml.dump(configs, config, default_flow_style= False, sort_keys=False, width=float('inf')) - create_conda_recipes() + create_conda_recipes(output_dir, conda_envs) if setup_only: sys.exit(0) @@ -216,6 +218,8 @@ def ema(inputs, output_dir, platform, barcode_list, fragment_density, genome, de fetch_report(workflowdir, "align_bxstats.Rmd") 
os.makedirs(f"{output_dir}/logs/snakemake", exist_ok = True) sm_log = snakemake_log(output_dir, "align_ema") + conda_envs = ["align", "r", "qc"] + configs = { "workflow" : "align ema", "snakemake_log" : sm_log, @@ -228,6 +232,7 @@ def ema(inputs, output_dir, platform, barcode_list, fragment_density, genome, de "EMA_bins" : ema_bins, **({'extra': extra_params} if extra_params else {}), "workflow_call" : command.rstrip(), + "conda_environments" : conda_envs, "reports" : { "skip": skip_reports, **({'plot_contigs': contigs} if contigs else {'plot_contigs': "default"}), @@ -242,7 +247,7 @@ def ema(inputs, output_dir, platform, barcode_list, fragment_density, genome, de with open(os.path.join(workflowdir, 'config.yaml'), "w", encoding="utf-8") as config: yaml.dump(configs, config, default_flow_style= False, sort_keys=False, width=float('inf')) - create_conda_recipes() + create_conda_recipes(output_dir, conda_envs) if setup_only: sys.exit(0) @@ -309,6 +314,7 @@ def strobe(inputs, output_dir, genome, read_length, keep_unmapped, depth_window, fetch_report(workflowdir, "align_bxstats.Rmd") os.makedirs(f"{output_dir}/logs/snakemake", exist_ok = True) sm_log = snakemake_log(output_dir, "align_strobe") + conda_envs = ["align", "r", "qc"] configs = { "workflow" : "align strobe", "snakemake_log" : sm_log, @@ -320,6 +326,7 @@ def strobe(inputs, output_dir, genome, read_length, keep_unmapped, depth_window, "depth_windowsize" : depth_window, **({'extra': extra_params} if extra_params else {}), "workflow_call" : command.rstrip(), + "conda_environments" : conda_envs, "reports" : { "skip": skip_reports, **({'plot_contigs': contigs} if contigs else {'plot_contigs': "default"}), @@ -332,7 +339,7 @@ def strobe(inputs, output_dir, genome, read_length, keep_unmapped, depth_window, with open(os.path.join(workflowdir, 'config.yaml'), "w", encoding="utf-8") as config: yaml.dump(configs, config, default_flow_style= False, sort_keys=False, width=float('inf')) - create_conda_recipes() + create_conda_recipes(output_dir, conda_envs) if setup_only: sys.exit(0) diff --git a/harpy/assembly.py b/harpy/assembly.py index 3a678221c..3a3580707 100644 --- a/harpy/assembly.py +++ b/harpy/assembly.py @@ -84,6 +84,7 @@ def assembly(fastq_r1, fastq_r2, bx_tag, kmer_length, max_memory, output_dir, ex fetch_rule(workflowdir, f"{asm}.smk") os.makedirs(f"{output_dir}/logs/snakemake", exist_ok = True) sm_log = snakemake_log(output_dir, asm) + conda_envs = ["assembly","qc"] configs = { "workflow" : asm, "snakemake_log" : sm_log, @@ -111,6 +112,7 @@ def assembly(fastq_r1, fastq_r2, bx_tag, kmer_length, max_memory, output_dir, ex "minimum_links" : links }, "workflow_call" : command.rstrip(), + "conda_environments" : conda_envs, "reports" : { "skip": skip_reports, "organism_type": organism_type @@ -123,7 +125,7 @@ def assembly(fastq_r1, fastq_r2, bx_tag, kmer_length, max_memory, output_dir, ex with open(os.path.join(workflowdir, 'config.yaml'), "w", encoding="utf-8") as config: yaml.dump(configs, config, default_flow_style= False, sort_keys=False, width=float('inf')) - create_conda_recipes() + create_conda_recipes(output_dir, conda_envs) if setup_only: sys.exit(0) diff --git a/harpy/container.py b/harpy/container.py index c59fbedf1..93f65c016 100644 --- a/harpy/container.py +++ b/harpy/container.py @@ -15,7 +15,8 @@ def containerize(): **INTERNAL USE ONLY**. Used to recreate all the conda environments required by the workflows and build a dockerfile from that. 
""" - create_conda_recipes() + #TODO MAKE THIS ALL OF THEM + create_conda_recipes("container") fetch_rule(os.getcwd(), "containerize.smk") with open("Dockerfile", "w", encoding = "utf-8") as dockerfile: diff --git a/harpy/deconvolve.py b/harpy/deconvolve.py index b2fe3be49..5d77e4d62 100644 --- a/harpy/deconvolve.py +++ b/harpy/deconvolve.py @@ -64,6 +64,7 @@ def deconvolve(inputs, output_dir, kmer_length, window_size, density, dropout, t fetch_rule(workflowdir, "deconvolve.smk") os.makedirs(f"{output_dir}/logs/snakemake", exist_ok = True) sm_log = snakemake_log(output_dir, "deconvolve") + conda_envs = ["qc"] configs = { "workflow": "deconvolve", "snakemake_log" : sm_log, @@ -73,12 +74,13 @@ def deconvolve(inputs, output_dir, kmer_length, window_size, density, dropout, t "density" : density, "dropout" : dropout, "workflow_call" : command.rstrip(), + "conda_environments" : conda_envs, "inputs": [i.as_posix() for i in fqlist] } with open(os.path.join(workflowdir, 'config.yaml'), "w", encoding="utf-8") as config: yaml.dump(configs, config, default_flow_style= False, sort_keys=False, width=float('inf')) - create_conda_recipes() + create_conda_recipes(output_dir, conda_envs) if setup_only: sys.exit(0) diff --git a/harpy/demultiplex.py b/harpy/demultiplex.py index 7a0b82402..ff917c8db 100644 --- a/harpy/demultiplex.py +++ b/harpy/demultiplex.py @@ -39,7 +39,7 @@ def demultiplex(): } @click.command(no_args_is_help = True, context_settings=dict(allow_interspersed_args=False), epilog = "Documentation: https://pdimens.github.io/harpy/workflows/demultiplex/") -@click.option('-s', '--schema', required = True, type=click.Path(exists=True, dir_okay=False, readable=True), help = 'Tab-delimited file of sample\barcode') +@click.option('-s', '--schema', required = True, type=click.Path(exists=True, dir_okay=False, readable=True), help = 'File of `sample`\\`barcode`') @click.option('-t', '--threads', default = 4, show_default = True, type = click.IntRange(min = 1, max_open = True), help = 'Number of threads to use') @click.option('-o', '--output-dir', type = click.Path(exists = False), default = "Demultiplex", show_default=True, help = 'Output directory name') @click.option('--conda', is_flag = True, default = False, help = 'Use conda/mamba instead of a container') @@ -57,7 +57,7 @@ def gen1(r1_fq, r2_fq, i1_fq, i2_fq, output_dir, schema, threads, snakemake, ski Demultiplex Generation I haplotagged FASTQ files Use the R1, R2, I2, and I2 FASTQ files provided by the sequencing facility as inputs (in that exact order) provided after the options. - The `--schema` must be **tab** (or space) delimited, have **no header** (i.e. no column names), and be in the format of `sample`\`barcode`, + The `--schema` must be **tab** (or space) delimited, have **no header** (i.e. no column names), and be in the format of `sample`\\`barcode`, where `barcode` is the C- beadtag assigned to the sample (.e.g. `C01`, `C02`, etc.) 
""" output_dir = output_dir.rstrip("/") @@ -76,11 +76,13 @@ def gen1(r1_fq, r2_fq, i1_fq, i2_fq, output_dir, schema, threads, snakemake, ski fetch_rule(workflowdir, "demultiplex_gen1.smk") os.makedirs(f"{output_dir}/logs/snakemake", exist_ok = True) sm_log = snakemake_log(output_dir, "demultiplex_gen1") + conda_envs = ["qc"] configs = { "workflow" : "demultiplex gen1", "snakemake_log" : sm_log, "output_directory" : output_dir, "workflow_call" : command.rstrip(), + "conda_environments" : conda_envs, "reports" : { "skip": skip_reports }, @@ -95,7 +97,7 @@ def gen1(r1_fq, r2_fq, i1_fq, i2_fq, output_dir, schema, threads, snakemake, ski with open(os.path.join(workflowdir, 'config.yaml'), "w", encoding= "utf-8") as config: yaml.dump(configs, config, default_flow_style= False, sort_keys=False, width=float('inf')) - create_conda_recipes() + create_conda_recipes(output_dir, conda_envs) if setup_only: sys.exit(0) diff --git a/harpy/impute.py b/harpy/impute.py index 751d1f2f8..df9feef77 100644 --- a/harpy/impute.py +++ b/harpy/impute.py @@ -82,6 +82,7 @@ def impute(inputs, output_dir, parameters, threads, vcf, vcf_samples, extra_para fetch_report(workflowdir, "stitch_collate.Rmd") os.makedirs(f"{output_dir}/logs/snakemake", exist_ok = True) sm_log = snakemake_log(output_dir, "impute") + conda_envs = ["r", "stitch"] configs = { "workflow" : "impute", "snakemake_log" : sm_log, @@ -89,6 +90,7 @@ def impute(inputs, output_dir, parameters, threads, vcf, vcf_samples, extra_para "samples_from_vcf" : vcf_samples, **({'stitch_extra': extra_params} if extra_params else {}), "workflow_call" : command.rstrip(), + "conda_environments" : conda_envs, "reports" : {"skip": skip_reports}, "stitch_parameters" : params, "inputs" : { @@ -101,7 +103,7 @@ def impute(inputs, output_dir, parameters, threads, vcf, vcf_samples, extra_para with open(os.path.join(workflowdir, 'config.yaml'), "w", encoding="utf-8") as config: yaml.dump(configs, config, default_flow_style= False, sort_keys=False, width=float('inf')) - create_conda_recipes() + create_conda_recipes(output_dir, conda_envs) if setup_only: sys.exit(0) diff --git a/harpy/metassembly.py b/harpy/metassembly.py index 1fb8ddb81..84a8b7ad9 100644 --- a/harpy/metassembly.py +++ b/harpy/metassembly.py @@ -69,6 +69,7 @@ def metassembly(fastq_r1, fastq_r2, bx_tag, kmer_length, max_memory, ignore_bx, fetch_rule(workflowdir, f"metassembly.smk") os.makedirs(f"{output_dir}/logs/snakemake", exist_ok = True) sm_log = snakemake_log(output_dir, "metassembly") + conda_envs = ["align", "assembly", "metassembly", "qc", "spades"] configs = { "workflow" : "metassembly", "snakemake_log" : sm_log, @@ -81,6 +82,7 @@ def metassembly(fastq_r1, fastq_r2, bx_tag, kmer_length, max_memory, ignore_bx, **({'extra' : extra_params} if extra_params else {}) }, "workflow_call" : command.rstrip(), + "conda_environments" : conda_envs, "reports" : { "skip": skip_reports, "organism_type": organism_type @@ -93,7 +95,7 @@ def metassembly(fastq_r1, fastq_r2, bx_tag, kmer_length, max_memory, ignore_bx, with open(os.path.join(workflowdir, 'config.yaml'), "w", encoding="utf-8") as config: yaml.dump(configs, config, default_flow_style= False, sort_keys=False, width=float('inf')) - create_conda_recipes() + create_conda_recipes(output_dir, conda_envs) if setup_only: sys.exit(0) diff --git a/harpy/phase.py b/harpy/phase.py index 240f93e7f..56d4b8659 100644 --- a/harpy/phase.py +++ b/harpy/phase.py @@ -80,6 +80,7 @@ def phase(inputs, output_dir, vcf, threads, molecule_distance, prune_threshold, fetch_report(workflowdir, 
"hapcut.Rmd") os.makedirs(f"{output_dir}/logs/snakemake", exist_ok = True) sm_log = snakemake_log(output_dir, "phase") + conda_envs = ["phase", "r"] configs = { "workflow" : "phase", "snakemake_log" : sm_log, @@ -90,6 +91,7 @@ def phase(inputs, output_dir, vcf, threads, molecule_distance, prune_threshold, "samples_from_vcf" : vcf_samples, **({'extra': extra_params} if extra_params else {}), "workflow_call" : command.rstrip(), + "conda_environments" : conda_envs, "reports" : { "skip": skip_reports, **({'plot_contigs': contigs} if contigs else {'plot_contigs': "default"}), @@ -103,7 +105,7 @@ def phase(inputs, output_dir, vcf, threads, molecule_distance, prune_threshold, with open(os.path.join(workflowdir, 'config.yaml'), "w", encoding="utf-8") as config: yaml.dump(configs, config, default_flow_style= False, sort_keys=False, width=float('inf')) - create_conda_recipes() + create_conda_recipes(output_dir, conda_envs) if setup_only: sys.exit(0) diff --git a/harpy/popgroup.py b/harpy/popgroup.py index 0168c6e87..e5ec6d207 100644 --- a/harpy/popgroup.py +++ b/harpy/popgroup.py @@ -25,7 +25,7 @@ def popgroup(inputdir, output): """ try: samplenames = set() - re_ext = re.compile("\.(bam|sam)$", re.IGNORECASE) + re_ext = re.compile(r"\.(bam|sam)$", re.IGNORECASE) for i in os.listdir(inputdir): if i.lower().endswith(".bam") or i.lower().endswith(".sam"): samplenames.add(re_ext.sub("", os.path.basename(i))) @@ -33,7 +33,7 @@ def popgroup(inputdir, output): raise Exception except: full_flist = [i for i in glob.iglob(f"{inputdir}/*") if not os.path.isdir(i)] - r = re.compile(".*\.f(?:ast)?q(?:\.gz)?$", flags=re.IGNORECASE) + r = re.compile(r".*\.f(?:ast)?q(?:\.gz)?$", flags=re.IGNORECASE) full_fqlist = list(filter(r.match, full_flist)) fqlist = [os.path.basename(i) for i in full_fqlist] bn_r = r"[\.\_][RF](?:[12])?(?:\_00[1-9])*\.f(?:ast)?q(?:\.gz)?$" diff --git a/harpy/preflight.py b/harpy/preflight.py index 5880cad23..cd442ed6e 100755 --- a/harpy/preflight.py +++ b/harpy/preflight.py @@ -76,17 +76,19 @@ def fastq(inputs, output_dir, threads, snakemake, quiet, hpc, conda, setup_only) fetch_report(workflowdir, "preflight_fastq.Rmd") os.makedirs(f"{output_dir}/logs/snakemake", exist_ok = True) sm_log = snakemake_log(output_dir, "preflight_fastq") + conda_envs = ["r"] configs = { "workflow" : "preflight fastq", "snakemake_log" : sm_log, "output_directory" : output_dir, "workflow_call" : command.rstrip(), + "conda_environments" : conda_envs, "inputs" : [i.as_posix() for i in fqlist] } with open(os.path.join(workflowdir, 'config.yaml'), "w", encoding="utf-8") as config: yaml.dump(configs, config, default_flow_style= False, sort_keys=False, width=float('inf')) - create_conda_recipes() + create_conda_recipes(output_dir, conda_envs) if setup_only: sys.exit(0) @@ -136,17 +138,19 @@ def bam(inputs, output_dir, threads, snakemake, quiet, hpc, conda, setup_only): fetch_report(workflowdir, "preflight_bam.Rmd") os.makedirs(f"{output_dir}/logs/snakemake", exist_ok = True) sm_log = snakemake_log(output_dir, "preflight_bam") + conda_envs = ["r"] configs = { "workflow" : "preflight bam", "snakemake_log" : sm_log, "output_directory" : output_dir, "workflow_call" : command.rstrip(), + "conda_environments" : conda_envs, "inputs" : [i.as_posix() for i in bamlist] } with open(os.path.join(workflowdir, 'config.yaml'), "w", encoding="utf-8") as config: yaml.dump(configs, config, default_flow_style= False, sort_keys=False, width=float('inf')) - create_conda_recipes() + create_conda_recipes(output_dir, conda_envs) if setup_only: 
sys.exit(0) diff --git a/harpy/qc.py b/harpy/qc.py index 52ea8cd26..704feebe3 100644 --- a/harpy/qc.py +++ b/harpy/qc.py @@ -76,6 +76,7 @@ def qc(inputs, output_dir, min_length, max_length, trim_adapters, deduplicate, d fetch_report(workflowdir, "bx_count.Rmd") os.makedirs(f"{output_dir}/logs/snakemake", exist_ok = True) sm_log = snakemake_log(output_dir, "qc") + conda_envs = ["qc", "r"] k,w,d,a = deconvolve configs = { "workflow" : "qc", @@ -93,13 +94,14 @@ def qc(inputs, output_dir, min_length, max_length, trim_adapters, deduplicate, d "dropout" : a }} if sum(deconvolve) > 0 else {}), "workflow_call" : command.rstrip(), + "conda_environments" : conda_envs, "reports" : {"skip": skip_reports}, "inputs" : [i.as_posix() for i in fqlist] } with open(os.path.join(workflowdir, 'config.yaml'), "w", encoding="utf-8") as config: yaml.dump(configs, config, default_flow_style= False, sort_keys=False, width=float('inf')) - create_conda_recipes() + create_conda_recipes(output_dir, conda_envs) if setup_only: sys.exit(0) start_text = Table(show_header=False,pad_edge=False, show_edge=False, padding = (0,0), box=box.SIMPLE) diff --git a/harpy/reports/align_bxstats.Rmd b/harpy/reports/align_bxstats.Rmd index 9b5ec71d1..539fb988d 100644 --- a/harpy/reports/align_bxstats.Rmd +++ b/harpy/reports/align_bxstats.Rmd @@ -105,7 +105,7 @@ aggregate_df <- Reduce(rbind, Map(process_input, infiles)) if(nrow(aggregate_df) == 0){ print("All input files were empty") - knittr::knit_exit() + knitr::knit_exit() } ``` diff --git a/harpy/reports/align_stats.Rmd b/harpy/reports/align_stats.Rmd index e56b680b9..78b6a4653 100644 --- a/harpy/reports/align_stats.Rmd +++ b/harpy/reports/align_stats.Rmd @@ -42,17 +42,29 @@ using("flexdashboard","dplyr","highcharter","DT","BioCircos") #samplename <- "test sample" infile <- snakemake@input$bxstats samplename <- snakemake@params$samplename +#sm_moldist <- 50000 +sm_moldist <- snakemake@params$mol_dist bamfile <- paste0(samplename, ".bam") tb <- read.table(infile, header = T, sep = "\t") %>% select(-start, -end) if(nrow(tb) == 0){ - print(paste("Input data file",infle, "is empty")) - knittr::knit_exit() + print(paste("Input data file",infile, "is empty")) + knitr::knit_exit() } tb$valid <- tb$molecule != "invalidBX" ``` -```{r bxper, results = F} +```{r valids, results=F} valids <- tb[tb$valid, -ncol(tb)] +if(nrow(valids) == 0){ + VALID_PRESENT <- FALSE + SKIPTEXT <- TRUE +} else { + VALID_PRESENT <- TRUE + SKIPTEXT <- FALSE +} +``` + +```{r bxper, results = F} invalids <- tb[!(tb$valid), -ncol(tb)] valids$molecule <- as.integer(valids$molecule) nBX <- group_by(valids, contig) %>% @@ -96,8 +108,8 @@ valueBox(scales::comma(totuniqBX), caption = "Unique barcodes", color = "#8f4f76 ### mol-thresh ```{r valuebox_mol_thresh} -if(snakemake@params$mol_dist > 0){ - valueBox(scales::comma(snakemake@params$mol_dist), caption = "Molecule threshold", color = "#be96b9") +if(sm_moldist > 0){ + valueBox(scales::comma(sm_moldist), caption = "Molecule threshold", color = "#be96b9") } ``` @@ -118,18 +130,28 @@ valueBox(scales::comma(tot_invalid), caption = "Invalid BX Records", color = "wa ### singletons ```{r valuebox_singletons} -valueBox(round(sum(valids$reads <= 2)/nrow(valids), 2), caption = "% Singletons") +if (VALID_PRESENT){ + valueBox(round(sum(valids$reads <= 2)/nrow(valids), 2), caption = "% Singletons") +} else { + valueBox("NA", caption = "% Singletons") +} ``` ### glob-avg ```{r valuebox_mol_per_contig} -valueBox(scales::comma(avgBX), caption = "Avg molecules per contig", color = "#d4d4d4") 
+if (VALID_PRESENT){ + valueBox(scales::comma(avgBX), caption = "Avg molecules per contig", color = "#d4d4d4") +} else { + valueBox("NA", caption = "Avg molecules per contig", color = "#d4d4d4") +} ``` - ## N50 and N90 ### Molecule NXX Length Metrics -```{r NX_stats} +```{r validtext, eval = SKIPTEXT, results="asis"} +cat(paste0("**`",infile,"` has no valid haplotagging barcodes, skipping reporting on linked-read metrics.**")) +``` +```{r NX_stats, eval = VALID_PRESENT} summary_table <- valids %>% group_by(contig) %>% summarize(valid_records = sum(reads), molecules = length(molecule),n50 = NX(length_inferred, 50), n75 = NX(length_inferred, 75), n90 = NX(length_inferred, 90)) @@ -168,7 +190,7 @@ distribution is relative to the total number of non-singleton molecules. ## the plots ### reads per {.no-title} -```{r readsper, out.width = '100%'} +```{r readsper, out.width = '100%', eval = VALID_PRESENT} hs <- hist(valids$reads, breaks = min(valids$reads):max(valids$reads), plot = F) hs_mult <- hist(non_singletons$reads, breaks = hs$breaks, plot = F) hs$counts <- round(hs$counts / sum(hs$counts) * 100, 4) @@ -193,7 +215,7 @@ highchart() |> ``` ### bases per {.no-title} -```{r basesper, out.width="100%"} +```{r basesper, out.width="100%", eval = VALID_PRESENT} hs <- hist(round(valids$aligned_bp, -2), breaks = 50, plot = F) hs_mult <- hist(round(non_singletons$aligned_bp, -2), breaks = hs$breaks, plot = F) hs$counts <- round(hs$counts / sum(hs$counts)*100,4) @@ -230,7 +252,7 @@ relative to the total number of non-singleton molecules. ## inferred-plot ### inferredplot {.no-title} -```{r inferred, out.width = '100%'} +```{r inferred, out.width = '100%', eval = VALID_PRESENT} hs <- hist(round(valids$length_inferred / 1000,0),breaks = 25,plot = F) hs_mult <- hist(round(non_singletons$length_inferred / 1000,0),breaks = hs$breaks,plot = F) hs$counts <- round(hs$counts / sum(hs$counts)*100,2) @@ -276,7 +298,7 @@ molecule coverage as `300bp รท molecule_length`, whereas the plot on the right c ## breadthplot ### coverage by aligned bp {.no-title} -```{r inferred_cov_bp, out.width = '100%'} +```{r inferred_cov_bp, out.width = '100%', eval = VALID_PRESENT} hs <- hist(round(valids$coverage_bp, 0), breaks = seq(0, 1, by = 0.05), plot = F) hs_mult <- hist(round(non_singletons$coverage_bp, 0), breaks = hs$breaks, plot = F) hs$counts <- round(hs$counts / sum(hs$counts)*100, 4) @@ -301,7 +323,7 @@ highchart() |> ``` ### coverage by inserts {.no-title} -```{r inferred_cov_insert, out.width = '100%'} +```{r inferred_cov_insert, out.width = '100%', eval = VALID_PRESENT} hs <- hist(round(valids$coverage_inserts, 0), breaks = seq(0, 1, by = 0.05), plot = F) hs_mult <- hist(round(non_singletons$coverage_inserts, 0), breaks = hs$breaks, plot = F) hs$counts <- round(hs$counts / sum(hs$counts)*100,2) @@ -382,6 +404,7 @@ rm(hs_mult) rm(valids) rm(invalids) rm(non_singletons) +#knitr::knit_exit() ``` ```{r imports} covfile <- snakemake@input$coverage @@ -395,12 +418,12 @@ samplename <- snakemake@params$samplename tb <- read.table(covfile, header = F) if(nrow(tb) == 0){ print(paste("Input data file", covfile, "is empty")) - knittr::knit_exit() + knitr::knit_exit() } tbmol <- read.table(molcovfile, header = F) if(nrow(tbmol) == 0){ print(paste("Input data file", molcovfile, "is empty")) - knittr::knit_exit() + knitr::knit_exit() } ``` diff --git a/harpy/resume.py b/harpy/resume.py index 92fc43c12..4ad47e3ff 100644 --- a/harpy/resume.py +++ b/harpy/resume.py @@ -7,14 +7,14 @@ from rich import box from rich.table import Table 
 import rich_click as click
-from ._validations import check_envdir
+from ._conda import check_environments
 from ._printing import print_error
 from ._launch import launch_snakemake
 from ._misc import snakemake_log
 from ._conda import create_conda_recipes
 
 @click.command(no_args_is_help = True, context_settings=dict(allow_interspersed_args=False), epilog = "Documentation: https://pdimens.github.io/harpy/workflows/other")
-@click.option('-c', '--conda', is_flag = True, default = False, help = 'Recreate the conda environments into .harpy_envs/')
+@click.option('-c', '--conda', is_flag = True, default = False, help = 'Recreate the conda environments')
 @click.option('-t', '--threads', type = click.IntRange(min = 2, max_open = True), help = 'Change the number of threads (>1)')
 @click.option('--quiet', is_flag = True, default = False, help = 'Don\'t show output text while running')
 @click.argument('directory', required=True, type=click.Path(exists=True, file_okay=False, readable=True), nargs=1)
@@ -24,23 +24,25 @@ def resume(directory, conda, threads, quiet):
     In the event you need to run the Snakemake workflow present in a Harpy output
     directory (e.g. `Align/bwa`) without Harpy rewriting any of the configuration files, this command
-    bypasses all the initialization steps of Harpy workflows and executes the Snakemake command
-    present in `directory/workflow/config.yaml`. It will reuse an existing `.harpy_envs/` folder,
-    otherwise use `--conda` to create one.
+    bypasses all the preprocessing steps of Harpy workflows and executes the Snakemake command
+    present in `directory/workflow/config.yaml`. It will reuse an existing `workflow/envs/` folder
+    for conda environments, otherwise use `--conda` to create one.
 
-    The only requirements is:
+    The only requirements are:
     - the target directory has `workflow/config.yaml` present in it
+    - the target directory has `workflow/envs/*.yaml` present in it
     """
     directory = directory.rstrip("/")
     if not os.path.exists(f"{directory}/workflow/config.yaml"):
-        print_error("config.yaml missing", f"Target directory [blue bold]{directory}[/blue bold] does not contain [blue bold]workflow/config.yaml[/blue bold]")
+        print_error("Missing config file", f"Target directory [blue]{directory}[/blue] does not contain the file [bold]workflow/config.yaml[/bold]")
         sys.exit(1)
-    if conda:
-        create_conda_recipes()
-    else:
-        check_envdir(".harpy_envs")
     with open(f"{directory}/workflow/config.yaml", 'r', encoding="utf-8") as f:
         harpy_config = yaml.full_load(f)
+    conda_envs = harpy_config["conda_environments"]
+    if conda:
+        create_conda_recipes(directory, conda_envs)
+    else:
+        check_environments(directory, conda_envs)
     workflow = harpy_config["workflow"].replace(" ", "_")
     sm_log = snakemake_log(directory, workflow)
diff --git a/harpy/scripts/simuG.pl b/harpy/scripts/simuG.pl
deleted file mode 100755
index 2eda4dd21..000000000
--- a/harpy/scripts/simuG.pl
+++ /dev/null
@@ -1,4373 +0,0 @@
-use warnings FATAL => 'all';
-use strict;
-use Getopt::Long qw(GetOptions);
-use Pod::Usage qw(pod2usage);
-use List::Util qw(sum min max shuffle);
-# use Data::Dumper;
-
-##############################################################
-# script: simuG.pl
-# author: Jia-Xing Yue (GitHub ID: yjx1217)
-# version: 1.0.0+
-# fork date: 2022.03.25
-# original repository: https://github.com/yjx1217/simuG
-# description: simuG.pl can simulate genome sequences with pre-defined or random genomic variants of full spectrum (e.g. SNP, INDEL, CNV, inversions, and translocations).
-# modified by: Pavel Dimens for harpy -############################################################## - -################### -# input parameters -################### - -# General options -my $help; -my $man; -my $version; -# Options for the reference genome -my $refseq; -my $excluded_chr_list; -# Options for SNP variants simulation -my $snp_vcf; -my $snp_count; -my $snp_model; -my $titv_ratio = 0.5; -my $coding_partition_for_snp_simulation; -# Options for INDEL variants simulation -my $indel_vcf; -my $indel_count; -my $indel_model; -my $ins_del_ratio = 1.0; -my $indel_size_powerlaw_alpha = 2.0; -my $indel_size_powerlaw_constant = 0.5; -# Options for copy-number variants simulation -my $cnv_vcf; -my $cnv_count; -my $cnv_gain_loss_ratio = 1.0; -my $duplication_tandem_dispersed_ratio = 1.0; -my $cnv_max_copy_number = 10; -my $cnv_min_size = 100; -my $cnv_max_size = 10000; -# Options for inversions simulation -my $inversion_vcf; -my $inversion_count; -my $inversion_min_size = 1000; -my $inversion_max_size = 100000; -my $inversion_breakpoint_gff; -# Options for translocation simulation -my $translocation_vcf; -my $translocation_count; -my $translocation_breakpoint_gff; -# Option for defining centromere for CNV/inversion/translocation simulation -my $centromere_gff; -# Option for defining genes for SNP/INDEL/CNV/inversion/translocation simulation -my $gene_gff; - -# Options for setting random seed and output file prefix -my $seed; -my $prefix = "output_prefix"; - -GetOptions( - 'help|h|?' => \$help, - 'man|m' => \$man, - 'version|ver' => \$version, - 'refseq|r:s' => \$refseq, - 'excluded_chr_list:s' => \$excluded_chr_list, - 'snp_vcf:s' => \$snp_vcf, - 'snp_count:i' => \$snp_count, - 'snp_model:s' => \$snp_model, - 'titv_ratio:s' => \$titv_ratio, - 'coding_partition_for_snp_simulation:s' => \$coding_partition_for_snp_simulation, - 'indel_vcf:s' => \$indel_vcf, - 'indel_count:i' => \$indel_count, - 'indel_model:s' => \$indel_model, - 'ins_del_ratio:s' => \$ins_del_ratio, - 'indel_size_powerlaw_alpha:f' => \$indel_size_powerlaw_alpha, - 'indel_size_powerlaw_constant:f' => \$indel_size_powerlaw_constant, - 'cnv_vcf:s' => \$cnv_vcf, - 'cnv_count:i' => \$cnv_count, - 'cnv_gain_loss_ratio:s' => \$cnv_gain_loss_ratio, - 'duplication_tandem_dispersed_ratio:s' => \$duplication_tandem_dispersed_ratio, - 'cnv_max_copy_number:i' => \$cnv_max_copy_number, - 'cnv_min_size:i' => \$cnv_min_size, - 'cnv_max_size:i' => \$cnv_max_size, - 'inversion_vcf:s' => \$inversion_vcf, - 'inversion_count:i' => \$inversion_count, - 'inversion_min_size:i' => \$inversion_min_size, - 'inversion_max_size:i' => \$inversion_max_size, - 'inversion_breakpoint_gff:s' => \$inversion_breakpoint_gff, - 'translocation_vcf:s' => \$translocation_vcf, - 'translocation_count:i' => \$translocation_count, - 'translocation_breakpoint_gff:s' => \$translocation_breakpoint_gff, - 'centromere_gff:s' => \$centromere_gff, - 'gene_gff:s' => \$gene_gff, - 'seed|s:i' => \$seed, - 'prefix|p:s' => \$prefix, - ); - -## Option check and print out Usage if needed. -# If usage (or version number) was explicitly requested, print usage (or version number). - -if (defined $help) { - pod2usage(-verbose => 1); -} - -if (defined $man) { - pod2usage(-verbose => 2); -} - -if (defined $version) { - pod2usage(-verbose => 99, -sections => qw(NAME|DESCRIPTION|VERSION)); -} - -if (not defined $refseq) { - pod2usage(-message => "Mandatory argument '-refseq' is missing! Exit!", - -exitval => 1, - -verbose => 1); -} elsif (not -e $refseq) { - print "!!! Error! 
The defined input file $refseq does not exist!\n"; - print "!!! Exit!\n"; - die; -} - -## Main program -print "\n"; -my $local_time = localtime(); -print "[$local_time]\n"; -print "Starting simuG ..\n\n"; -$local_time = localtime(); -print "[$local_time]\n"; -print "Check specified options ..\n"; -# check specified options -if ((defined $snp_vcf) or (defined $snp_count) or (defined $indel_vcf) or (defined $indel_count)) { - print "Running simuG for SNP/INDEL simulation >>\n"; - print "Ignore all options for CNV/inversion/translocation simulation.\n"; - undef $centromere_gff; - undef $cnv_vcf; - undef $cnv_count; - undef $cnv_gain_loss_ratio; - undef $duplication_tandem_dispersed_ratio; - undef $cnv_max_copy_number; - undef $cnv_min_size; - undef $cnv_max_size; - undef $inversion_vcf; - undef $inversion_count; - undef $inversion_min_size; - undef $inversion_max_size; - undef $inversion_breakpoint_gff; - undef $translocation_vcf; - undef $translocation_count; - undef $translocation_breakpoint_gff; -} elsif ((defined $cnv_vcf) or (defined $cnv_count)) { - print "Running simuG for CNV simulation >>\n\n"; - print "Ignore all options for inversion/translocation simulation.\n"; - undef $coding_partition_for_snp_simulation; - undef $inversion_vcf; - undef $inversion_count; - undef $inversion_min_size; - undef $inversion_max_size; - undef $inversion_breakpoint_gff; - undef $translocation_vcf; - undef $translocation_count; - undef $translocation_breakpoint_gff; -} elsif ((defined $inversion_vcf) or (defined $inversion_count)) { - print "Running simuG for inversion simulation >>\n\n"; - print "Ignore all options for translocation simulation.\n"; - undef $coding_partition_for_snp_simulation; - undef $translocation_vcf; - undef $translocation_count; - undef $translocation_breakpoint_gff; -} elsif ((defined $translocation_vcf) or (defined $translocation_count)) { - undef $coding_partition_for_snp_simulation; - print "Running simuG for translocation simulation >>\n\n"; -} else { - print "\n"; - print "!!! There seems no task for simuG to run !!!\n"; - print "!!! 1) If you want to run simuG for SNP/INDEL simulation, at least one of the following options need to be specified:\n"; - print "!!! -snp_vcf, -snp_count, -indel_vcf, -indel_count\n"; - print "!!! 2) If you want to run simuG for CNV simulation, at least one of the following options need to be specified:\n"; - print "!!! -cnv_vcf, -cnv_count\n"; - print "!!! 3) If you want to run simuG for inversion simulation, at least one of the following options need to be specified:\n"; - print "!!! -inversion_vcf, -inversion_count\n"; - print "!!! 4) If you want to run simuG for translocation simulation, at least one of the following options need to be specified:\n"; - print "!!! -translocation_vcf, translocation_count\n"; - print "!!! 5) If you want to check all available options for simuG, type: perl simuG.pl -h\n"; - print "!!! 
Exit !!!\n\n"; - $local_time = localtime(); - print "[$local_time]\n"; - exit; -} - -# define excluded chromosome(s) if any -my %excluded_chr_list = (); -if (defined $excluded_chr_list) { - my $excluded_chr_list_fh = read_file($excluded_chr_list); - %excluded_chr_list = parse_list_file($excluded_chr_list_fh); - $local_time = localtime(); - print "\n[$local_time]\n"; - print "Check for excluded chromosome(s) defined in $excluded_chr_list ..\n"; - -} -my %excluded_refseq = (); - -# set up reference genome as the template -my @refseq = (); -my %refseq = (); -my $refseq_fh = read_file($refseq); -parse_fasta_file($refseq_fh, \%refseq, \@refseq); -close $refseq_fh; - -# remove excluded chromosomes if specified -foreach my $chr (@refseq) { - if (exists $excluded_chr_list{$chr}) { - print "Exclude chromosome: $chr\n"; - $excluded_refseq{$chr} = $refseq{$chr}; - delete $refseq{$chr}; - } -} - -# profile the base composition of the reference genome -# my %refseq_base_freq = profile_base_freq(\%refseq); - -my %refseq_base_freq = ( - 'A' => 0.25, - 'T' => 0.25, - 'G' => 0.25, - 'C' => 0.25 - ); - -my %centromere_by_chr = (); -if (defined $centromere_gff) { - my $centromere_gff_fh = read_file($centromere_gff); - my %input_gff = parse_gff_file($centromere_gff_fh); - foreach my $feature_id (sort keys %input_gff) { - my $feature_type = $input_gff{$feature_id}{'type'}; - if ($feature_type eq "centromere") { - my $chr = $input_gff{$feature_id}{'chr'}; - if (exists $refseq{$chr}) { - $centromere_by_chr{$chr} = \%{$input_gff{$feature_id}}; - } - } - } -} - -my %inversion_breakpoint_by_chr_by_type = (); -if (defined $inversion_breakpoint_gff) { - my $inversion_breakpoint_gff_fh = read_file($inversion_breakpoint_gff); - my %input_gff = parse_gff_file($inversion_breakpoint_gff_fh); - foreach my $feature_id (sort keys %input_gff) { - my $chr = $input_gff{$feature_id}{'chr'}; - my $type = $input_gff{$feature_id}{'type'}; - if (exists $refseq{$chr}) { - $inversion_breakpoint_by_chr_by_type{$chr}{$type}{$feature_id} = \%{$input_gff{$feature_id}}; - } - } -} - -my %translocation_breakpoint_by_chr_by_type = (); -if (defined $translocation_breakpoint_gff) { - my $translocation_breakpoint_gff_fh = read_file($translocation_breakpoint_gff); - my %input_gff = parse_gff_file($translocation_breakpoint_gff_fh); - foreach my $feature_id (sort keys %input_gff) { - my $chr = $input_gff{$feature_id}{'chr'}; - my $type = $input_gff{$feature_id}{'type'}; - if (exists $refseq{$chr}) { - $translocation_breakpoint_by_chr_by_type{$chr}{$type}{$feature_id} = \%{$input_gff{$feature_id}}; - } - } -} - -my %gene = (); -my %gene_by_chr = (); -if (defined $gene_gff) { - # check valid protein-coding genes - my $gene_gff_fh = read_file($gene_gff); - my %input_gff = parse_gff_file($gene_gff_fh); - foreach my $feature_id (sort keys %input_gff) { - my $feature_type = $input_gff{$feature_id}{'type'}; - if ($feature_type eq "gene") { - $gene{$feature_id} = \%{$input_gff{$feature_id}}; - } - } - foreach my $gene_id (sort keys %gene) { - my $gene_chr = $gene{$gene_id}{'chr'}; - if (exists $refseq{$gene_chr}) { - $gene_by_chr{$gene_chr}{$gene_id}{'start'} = $gene{$gene_id}{'start'}; - $gene_by_chr{$gene_chr}{$gene_id}{'end'} = $gene{$gene_id}{'end'}; - $gene_by_chr{$gene_chr}{$gene_id}{'strand'} = $gene{$gene_id}{'strand'}; - } - } - if ((defined $coding_partition_for_snp_simulation) and (defined $gene_gff)) { - # check overlapped genes - foreach my $chr (@refseq) { - if ((exists $refseq{$chr}) and (exists $gene_by_chr{$chr})) { - my @gene_by_chr = 
sort {$gene_by_chr{$chr}{$a}{'start'} <=> $gene_by_chr{$chr}{$b}{'start'} or $gene_by_chr{$chr}{$a}{'end'} <=> $gene_by_chr{$chr}{$b}{'end'}} keys %{$gene_by_chr{$chr}}; - if ((scalar @gene_by_chr) > 0) { - my $gene_index = 0; - my $previous_gene_id; - my $previous_gene_start; - my $previous_gene_end; - my $previous_gene_length; - foreach my $gene_id (@gene_by_chr) { - if (exists $gene{$gene_id}) { - $gene_index++; - my $gene_start = $gene{$gene_id}{'start'}; - my $gene_end = $gene{$gene_id}{'end'}; - if ($gene_index == 1) { - $previous_gene_id = $gene_id; - $previous_gene_start = $gene_start; - $previous_gene_end = $gene_end; - $previous_gene_length = $previous_gene_end - $previous_gene_start + 1; - } else { - if ($previous_gene_end >= $gene_start) { - print "\n"; - print "!!! Warning! Coordinate overlap detected between the gene $previous_gene_id and the gene $gene_id !!!\n"; - my $gene_length = $gene_end - $gene_start + 1; - if ($gene_length <= $previous_gene_length) { - print "!!! simuG will ignore the smaller gene $gene_id !!!\n"; - delete $gene{$gene_id}; - delete $gene_by_chr{$chr}{$gene_id}; - } else { - print "!!! simuG will ignore the smaller gene $previous_gene_id !!!\n"; - delete $gene{$previous_gene_id}; - delete $gene_by_chr{$chr}{$previous_gene_id}; - $previous_gene_id = $gene_id; - $previous_gene_end = $gene_end; - $previous_gene_length = $gene_end - $gene_start + 1; - } - # sleep(3); - } else { - $previous_gene_id = $gene_id; - $previous_gene_end = $gene_end; - $previous_gene_length = $gene_end - $gene_start + 1; - } - } - } - } - } - } - } - } -} - -# initialize the simulated genome -my %simseq = %refseq; -my %ref2sim_map = (); - -# initialize the seed for random number generator -if (not defined $seed) { - $seed = int(rand(2**31)); -} -print "\nThis simulation use the random seed: $seed\n\n"; -srand($seed); - -if (defined $snp_vcf) { - print "The option snp_vcf has been specified: snp_vcf = $snp_vcf\n"; - print "Ignore incompatible option: snp_count\n"; - undef $snp_count; - print "Ignore incompatible option: snp_model\n"; - undef $snp_model; - print "Ignore incompatible option: titv_ratio\n"; - undef $titv_ratio; -} elsif (defined $snp_count) { - print "The option snp_count has been specified: snp_count = $snp_count\n"; - if (defined $snp_model) { - print "The option snp_model has been specified: snp_model = $snp_model\n"; - print "Ignore incompatible option: titv_ratio\n"; - undef $titv_ratio; - } else { - print "The option titv_ratio has been specified: titv_ratio = $titv_ratio\n"; - } - if (defined $gene_gff) { - print "The option gene_gff has been specified: gene_gff = $gene_gff\n"; - } -} - -if (defined $indel_vcf) { - print "The option indel_vcf has been specified: indel_vcf = $indel_vcf\n"; - print "Ignore incompatible option: indel_count\n"; - undef $indel_count; - print "Ignore incompatible option: indel_model\n"; - undef $indel_model; - print "Ignore incompatible option: ins_del_ratio\n"; - undef $ins_del_ratio; - print "Ignore incompatible option: indel_size_powerlaw_alpha\n"; - undef $indel_size_powerlaw_alpha; - print "Ignore incompatible option: indel_size_powerlaw_constant\n"; - undef $indel_size_powerlaw_constant; -} elsif (defined $indel_count) { - print "The option indel_count has been specified: indel_count = $indel_count\n"; - if (defined $indel_model) { - print "The option indel_model has been specified: indel_model = $indel_model\n"; - print "Ignore incompatible option: ins_del_ratio\n"; - undef $ins_del_ratio; - print "Ignore incompatible option: 
indel_size_powerlaw_alpha\n"; - undef $indel_size_powerlaw_alpha; - print "Ignore incompatible option: indel_size_powerlaw_constant\n"; - undef $indel_size_powerlaw_constant; - } else { - print "The option ins_del_ratio has been specified: ins_del_ratio = $ins_del_ratio\n"; - print "The option indel_size_powerlaw_alpha has been specified: indel_size_powerlaw_alpha = $indel_size_powerlaw_alpha\n"; - print "The option indel_size_powerlaw_constant has been specified: indel_size_powerlaw_constant = $indel_size_powerlaw_constant\n"; - } - if (defined $gene_gff) { - print "The option gene_gff has been specified: gene_gff = $gene_gff\n"; - } -} - -if (defined $cnv_vcf) { - print "The option cnv_vcf has been specified: cnv_vcf = $cnv_vcf\n"; - print "Ignore incompatible option: cnv_count\n"; - undef $cnv_count; - print "Ignore incompatible option: cnv_gain_loss_ratio\n"; - undef $cnv_gain_loss_ratio; - print "Ignore incompatible option: duplication_tandem_dispersed_ratio\n"; - undef $duplication_tandem_dispersed_ratio; - print "Ignore incompatible option: cnv_max_copy_number\n"; - undef $cnv_max_copy_number; - print "Ignore incompatible option: cnv_min_size\n"; - undef $cnv_min_size; - print "Ignore incompatible option: cnv_max_size\n"; - undef $cnv_max_size; - print "Ignore incompatible option: centromere_gff\n"; - undef $centromere_gff; -} elsif (defined $cnv_count) { - print "The option cnv_count has been specified: cnv_count = $cnv_count\n"; - print "The option duplication_tandem_dispersed_ratio has been specified: duplication_tandem_dispersed_ratio = $duplication_tandem_dispersed_ratio\n"; - print "The option cnv_max_copy_number has been specified: cnv_max_copy_number = $cnv_max_copy_number\n"; - print "The option cnv_min_size has been specified: cnv_min_size = $cnv_min_size\n"; - print "The option cnv_min_size has been specified: cnv_max_size = $cnv_max_size\n"; - if (defined $centromere_gff) { - print "The option centromere_gff has been specified: centromere_gff = $centromere_gff\n"; - } - if (defined $gene_gff) { - print "The option gene_gff has been specified: gene_gff = $gene_gff\n"; - } -} - -if (defined $inversion_vcf) { - print "The option inversion_vcf has been specified: inversion_vcf = $inversion_vcf\n"; - print "Ignore incompatible option: inversion_count\n"; - undef $inversion_count; - print "Ignore incompatible option: inversion_min_size\n"; - undef $inversion_min_size; - print "Ignore incompatible option: inversion_max_size\n"; - undef $inversion_max_size; - print "Ignore incompatible option: inversion_breakpoint_gff\n"; - undef $inversion_breakpoint_gff; - print "Ignore incompatible option: centromere_gff\n"; - undef $centromere_gff; -} elsif (defined $inversion_count) { - print "The option inversion_count has been specified: inversion_count = $inversion_count\n"; - if (defined $inversion_breakpoint_gff) { - print "The option inversion_breakpoint_gff has been specified: inversion_breakpoint_gff = $inversion_breakpoint_gff\n"; - print "Ignore incompatible option: inversion_min_size\n"; - undef $inversion_min_size; - print "Ignore incompatible option: inversion_max_size\n"; - undef $inversion_max_size; - } else { - print "The option inversion_min_size has been specified: inversion_min_size = $inversion_min_size\n"; - print "The option inversion_max_size has been specified: inversion_max_size = $inversion_max_size\n"; - } - if (defined $centromere_gff) { - print "The option centromere_gff has been specified: centromere_gff = $centromere_gff\n"; - } - if (defined $gene_gff) { - 
print "The option gene_gff has been specified: gene_gff = $gene_gff\n"; - } -} - -if (defined $translocation_vcf) { - print "The option translocation_vcf has been specified: translocation_vcf = $translocation_vcf\n"; - print "Ignore incompatible option: translocation_count\n"; - undef $translocation_count; - print "Ignore incompatible option: translocation_breakpoint_gff\n"; - undef $translocation_breakpoint_gff; - print "Ignore incompatible option: centromere_gff\n"; - undef $centromere_gff; -} elsif (defined $translocation_count) { - print "The option translocation_count has been specified: translocation_count = $translocation_count\n"; - if (defined $translocation_breakpoint_gff) { - print "The option translocation_breakpoint_gff has been specified: translocation_breakpoint_gff = $translocation_breakpoint_gff\n"; - } - if (defined $centromere_gff) { - print "The option centromere_gff has been specified: centromere_gff = $centromere_gff\n"; - } - if (defined $gene_gff) { - print "The option gene_gff has been specified: gene_gff = $gene_gff\n"; - } -} -print "\n"; - -my %snp_vcf = (); -if (defined $snp_vcf) { - $local_time = localtime(); - print "[$local_time]\n"; - print "Parsing the input vcf file: $snp_vcf\n\n"; - my $snp_vcf_fh = read_file($snp_vcf); - %snp_vcf = parse_simple_vcf_file($snp_vcf_fh, 0, 'SNP'); - close $snp_vcf_fh; -} - -my %indel_vcf = (); -if (defined $indel_vcf) { - $local_time = localtime(); - print "[$local_time]\n"; - print "Parsing the input vcf file: $indel_vcf\n\n"; - my $indel_vcf_fh = read_file($indel_vcf); - %indel_vcf = parse_simple_vcf_file($indel_vcf_fh, 0, 'INDEL'); - close $indel_vcf_fh; -} - -my %vcf = (); -if ((defined $snp_vcf) and (defined $indel_vcf)) { - %vcf = merge_vcf(\%snp_vcf, \%indel_vcf); -} elsif (defined $snp_vcf) { - %vcf = %snp_vcf; -} elsif (defined $indel_vcf) { - %vcf = %indel_vcf; -} - -# introduce SNP/INDEL variants based on user-provided vcf(s) -if (%vcf) { - $local_time = localtime(); - print "[$local_time]\n"; - print "Introducing defined SNP/INDELs based on the input vcf file(s):\n"; - if (defined $snp_vcf) { - print "> snp_vcf = $snp_vcf\n"; - } - if (defined $indel_vcf) { - print "> indel_vcf = $indel_vcf\n"; - } - introduce_defined_snp_indel(\%vcf, \%refseq, \%simseq, \%ref2sim_map); - print "\n"; -} - -# introduce random SNP variants -if (defined $snp_count) { - $local_time = localtime(); - print "[$local_time] Introducing random SNPs based on the following parameters:\n"; - print "> snp_count = $snp_count\n"; - if (defined $snp_model) { - print "> snp_model = $snp_model\n"; - } else { - print "> titv_ratio = $titv_ratio\n"; - } - if (defined $coding_partition_for_snp_simulation) { - if (defined $gene_gff) { - print "> gene_gff = $gene_gff\n"; - print "> coding_partition_for_snp_simulation = $coding_partition_for_snp_simulation\n"; - introduce_random_snp_with_coding_partition($snp_count, $snp_model, $titv_ratio, $coding_partition_for_snp_simulation, \%gene, \%refseq, \%simseq, \%ref2sim_map); - } else { - print "!!! Warning! The option '-coding_partition_for_snp_simulation' need to be used together with '-gene_gff' !!!\n"; - print "!!! '-gene_gff' is undefined !!! \n"; - print "!!! Ignore the specified \'-coding_partition_for_snp_simulation $coding_partition_for_snp_simulation \' option. 
!!!\n"; - } - } else { - introduce_random_snp($snp_count, $snp_model, $titv_ratio, \%refseq, \%simseq, \%ref2sim_map); - } - print "\n"; -} - - -# introduce random INDEL variants -if (defined $indel_count) { - $local_time = localtime(); - print "[$local_time]\n"; - print "Introducing random INDELs based on the following parameters:\n"; - print "> indel_count = $indel_count\n"; - if (defined $indel_model) { - print "> indel_model = $indel_model\n"; - } else { - print "> ins_del_ratio = $ins_del_ratio\n"; - print "> indel_size_powerlaw_alpha = $indel_size_powerlaw_alpha\n"; - print "> indel_size_powerlaw_constant = $indel_size_powerlaw_constant\n"; - } - introduce_random_indel($indel_count, $indel_model, $ins_del_ratio, \%refseq_base_freq, \%refseq, \%simseq, \%ref2sim_map); - print "\n"; -} - -# introduce CNVs based on user-provided CNV VCF file. -if (defined $cnv_vcf) { - $local_time = localtime(); - print "[$local_time]\n"; - print "Introducing defined CNVs based on the input vcf file:\n"; - print "> cnv_vcf = $cnv_vcf\n"; - my $cnv_vcf_fh = read_file($cnv_vcf); - my %sv = parse_sv_vcf_file($cnv_vcf_fh); - my %cnv = extract_cnv_from_sv(\%sv); - introduce_defined_cnv(\%cnv, \%refseq, \%simseq, \%ref2sim_map); - print "\n"; -} - -# introduce random CNVs -if (defined $cnv_count) { - $local_time = localtime(); - print "[$local_time]\n"; - print "Introducing random CNVs with the following parameters:\n"; - print "> cnv_count = $cnv_count\n"; - print "> cnv_gain_loss_ratio = $cnv_gain_loss_ratio\n"; - print "> duplication_tandem_dispersed_ratio = $duplication_tandem_dispersed_ratio\n"; - print "> cnv_max_copy_number = $cnv_max_copy_number\n"; - print "> cnv_min_size = $cnv_min_size\n"; - print "> cnv_max_size = $cnv_max_size\n"; - if (defined $centromere_gff) { - print "> centromere_gff = $centromere_gff\n"; - } - if (defined $gene_gff) { - print "> gene_gff = $gene_gff\n"; - } - introduce_random_cnv($cnv_count, $cnv_gain_loss_ratio, $duplication_tandem_dispersed_ratio, $cnv_min_size, $cnv_max_size, $cnv_max_copy_number, \%centromere_by_chr, \%gene_by_chr, \%refseq, \%simseq, \%ref2sim_map); - print "\n"; -} - -# introduce inversions based on the input inversion vcf file -if (defined $inversion_vcf) { - $local_time = localtime(); - print "[$local_time]\n"; - print "Introducing defined Inversions based on the input vcf file:\n"; - print "> inversion_vcf = $inversion_vcf\n"; - my $inversion_vcf_fh = read_file($inversion_vcf); - my %sv = parse_sv_vcf_file($inversion_vcf_fh); - my %inversion = extract_inversion_from_sv(\%sv); - introduce_defined_inversion(\%inversion, \%refseq, \%simseq, \%ref2sim_map); - print "\n"; -} - -# introduce random inversions -if (defined $inversion_count) { - $local_time = localtime(); - print "[$local_time]\n"; - print "Introducing random Inversions based on the following parameters:\n"; - print "> inversion_count = $inversion_count\n"; - if (defined $centromere_gff) { - print "> centromere_gff = $centromere_gff\n"; - } - if (defined $gene_gff) { - print "> gene_gff = $gene_gff\n"; - } - if (defined $inversion_breakpoint_gff) { - print "> inversion_breakpoint_gff = $inversion_breakpoint_gff\n"; - } else { - print "> inversion_min_size = $inversion_min_size\n"; - print "> inversion_max_size = $inversion_max_size\n"; - } - introduce_random_inversion($inversion_count, $inversion_min_size, $inversion_max_size, \%centromere_by_chr, \%inversion_breakpoint_by_chr_by_type, \%gene_by_chr, \%refseq, \%simseq, \%ref2sim_map); - print "\n"; -} - -# introduce translocations based on 
the input translocation vcf file -if (defined $translocation_vcf) { - $local_time = localtime(); - print "[$local_time]\n"; - print "Introducing defined Translocations based on the input vcf file:\n"; - print "> translocation_vcf = $translocation_vcf\n"; - my $translocation_vcf_fh = read_file($translocation_vcf); - my %sv = parse_sv_vcf_file($translocation_vcf_fh); - my %translocation = extract_translocation_from_sv(\%sv); - introduce_defined_translocation(\%translocation, \%refseq, \%simseq, \%ref2sim_map); - print "\n"; -} - -# introduce random translocations -if (defined $translocation_count) { - $local_time = localtime(); - print "[$local_time]\n"; - print "Introducing random Translocations based on the following parameters:\n"; - print "> translocation_count = $translocation_count\n"; - if (defined $centromere_gff) { - print "> centromere_gff = $centromere_gff\n"; - } - if (defined $gene_gff) { - print "> gene_gff = $gene_gff\n"; - } - if (defined $translocation_breakpoint_gff) { - print "> translocation_breakpoint_gff = $translocation_breakpoint_gff\n"; - } - introduce_random_translocation($translocation_count, \%centromere_by_chr, \%translocation_breakpoint_by_chr_by_type, \%gene_by_chr, \%refseq, \%simseq, \%ref2sim_map); - print "\n"; -} - -# generate output files -$local_time = localtime(); -print "[$local_time]\n"; -print "Simulation completed! :) \n\n"; -$local_time = localtime(); -print "[$local_time]\n"; -print "Generating output files .. \n\n"; -generate_output_files($prefix, \@refseq, \%refseq, \%simseq, \%ref2sim_map, \%excluded_refseq); -$local_time = localtime(); -print "[$local_time]\n"; -print "Done! :) \n\n"; - -sub read_file { - my $file = shift @_; - my $fh; - if ($file =~ /\.gz$/) { - open($fh, "gunzip -c $file |") or die "can't open pipe to $file"; - } else { - open($fh, $file) or die "can't open $file"; - } - return $fh; -} - -sub write_file { - my $file = shift @_; - my $fh; - if ($file =~ /\.gz$/) { - open($fh, "| gzip -c >$file") or die "can't open pipe to $file\n"; - } else { - open($fh, ">$file") or die "can't open $file\n"; - } - return $fh; -} - -sub parse_list_file { - my $fh = shift @_; - my %list = (); - while (<$fh>) { - chomp; - /^\s*$/ and next; - /^#/ and next; - if (exists $list{$_}) { - $list{$_}++; - } else { - $list{$_} = 1; - } - } - return %list; -} - -sub parse_fasta_file { - my ($fh, $input_hashref, $input_arrayref) = @_; - my $seq_name = ""; - while (<$fh>) { - chomp; - if (/^\s*$/) { - next; - } elsif (/^\s*#/) { - next; - } elsif (/^>(\S+)/) { - $seq_name = $1; - push @$input_arrayref, $seq_name; - $$input_hashref{$seq_name} = ""; - } else { - my $seq_line = uc $_; - $$input_hashref{$seq_name} .= $seq_line; - } - } -} - -sub profile_base_freq { - my $genome_hashref = shift @_; - my @base = qw(A T G C); - my %base_count = (); - my %base_freq = (); - foreach my $chr (sort keys %$genome_hashref) { - my $seq = uc $$genome_hashref{$chr}; - foreach my $base (@base) { - my $bc = () = $seq =~ /$base/g; - if (exists $base_count{$base}) { - $base_count{$base} += $bc; - } else { - $base_count{$base} = $bc; - } - } - } - my $total_base_count = sum(values %base_count); - foreach my $base (@base) { - $base_freq{$base} = $base_count{$base}/$total_base_count; - } - return %base_freq; -} - -sub create_genome_space { - my $genome_hashref = shift @_; - my %genome_space = (); - my $offset = 0; - foreach my $chr (sort keys %$genome_hashref) { - my $chr_length = length $$genome_hashref{$chr}; - my $start = $offset + 1; - my $end = $start + $chr_length - 1; - # 
print "chr=$chr, chr_length=$chr_length, genome_space_start=$start, genome_space_end=$end\n"; - $genome_space{'chr-wide'}{$chr}{"start"} = $start; - $genome_space{'chr-wide'}{$chr}{"end"} = $end; - $genome_space{'chr-wide'}{$chr}{"length"} = $chr_length; - $offset = $end; - if (not exists $genome_space{'genome-wide'}) { - $genome_space{'genome-wide'}{"start"} = $start; - $genome_space{'genome-wide'}{"end"} = $end; - $genome_space{'genome-wide'}{"length"} = $chr_length; - } else { - $genome_space{'genome-wide'}{"length"} += $chr_length; - $genome_space{'genome-wide'}{"end"} = $genome_space{'genome-wide'}{"length"}; - } - } - return %genome_space; -} - -sub parse_simple_vcf_file { - my ($fh, $qual_cutoff, $query_type) = @_; - my %vcf = (); - while (<$fh>) { - chomp; - /^#/ and next; - /^\s*$/ and next; - my ($ref_chr, $ref_start, $variant_id, $ref_allele, $alt_allele, $variant_qual, $variant_filter, $variant_info) = split /\t/, $_; - if (($variant_qual eq ".") or ($variant_qual >= $qual_cutoff)) { - my $variant_type; - my $ref_allele_length = length $ref_allele; - my $alt_allele_length = length $alt_allele; - my $ref_end = $ref_start + $ref_allele_length - 1; - if ($alt_allele =~ /,/) { - print "!!! Warning! Multiple alternative variants found at the same site:\n"; - print "!!! $ref_chr:$ref_start $ref_allele=>$alt_allele QUAL=$variant_qual!\n"; - print "!!! Ignore all variants at this site.\n\n"; - } else { - if ($ref_allele_length ne $alt_allele_length) { - $variant_type = "INDEL"; - } else { - $variant_type = "SNP"; - } - if ((defined $query_type) and ($query_type ne $variant_type)) { - next; - } else { - my $check_overlap_flag = 0; - if (exists $vcf{$ref_chr}) { - if (exists $vcf{$ref_chr}{$ref_start}) { - $check_overlap_flag = 1; - print "!!! Warning! Multiple variants were defined within the same region: $ref_chr:$ref_start-$ref_end in the input vcf file!\n"; - print "!!! Only keep the first instance: $ref_chr:$ref_start $vcf{$ref_chr}{$ref_start}{'ref_allele'}=>$vcf{$ref_chr}{$ref_start}{'alt_allele'} QUAL=$vcf{$ref_chr}{$ref_start}{'variant_qual'}.\n"; - print "!!! Ignore the variant: $ref_chr:$ref_start $ref_allele => $alt_allele QUAL=$variant_qual.\n\n"; - } else { - foreach my $s (sort {$a <=> $b} keys %{$vcf{$ref_chr}}) { - if ($ref_end < $s) { - last; - } elsif ($ref_start <= $vcf{$ref_chr}{$s}{'ref_end'}) { - if (($variant_type eq "SNP") and ($vcf{$ref_chr}{$s}{'variant_type'} eq "SNP")) { - next; - } else { - $check_overlap_flag = check_overlap_region($ref_start, $ref_end, $vcf{$ref_chr}{$s}{'ref_start'}, $vcf{$ref_chr}{$s}{'ref_end'}); - if ($check_overlap_flag == 1) { - print "!!! Warning! Multiple variants were defined within the same region: $ref_chr:$ref_start-$ref_end in the input vcf file!\n"; - print "!!! Only keep the first instance: $ref_chr:$s $vcf{$ref_chr}{$s}{'ref_allele'}=>$vcf{$ref_chr}{$s}{'alt_allele'} QUAL=$vcf{$ref_chr}{$s}{'variant_qual'}.\n"; - print "!!! 
Ignore the variant: $ref_chr:$ref_start $ref_allele => $alt_allele QUAL=$variant_qual.\n\n"; - last; - } - } - } - } - } - } - if ($check_overlap_flag == 0) { - $vcf{$ref_chr}{$ref_start}{'ref_chr'} = $ref_chr; - $vcf{$ref_chr}{$ref_start}{'ref_start'} = $ref_start; - $vcf{$ref_chr}{$ref_start}{'ref_end'} = $ref_end; - $vcf{$ref_chr}{$ref_start}{'ref_allele'} = $ref_allele; - $vcf{$ref_chr}{$ref_start}{'alt_allele'} = $alt_allele; - $vcf{$ref_chr}{$ref_start}{'variant_type'} = $variant_type; - $vcf{$ref_chr}{$ref_start}{'variant_id'} = $variant_id; - $vcf{$ref_chr}{$ref_start}{'variant_qual'} = $variant_qual; - $vcf{$ref_chr}{$ref_start}{'variant_info'} = $variant_info; - } - } - } - } - } - return %vcf; -} - -sub merge_vcf { - my ($snp_vcf_hashref, $indel_vcf_hashref) = @_; - my %merged_vcf = %$snp_vcf_hashref; - foreach my $ref_chr (sort keys %$indel_vcf_hashref) { - foreach my $ref_start (sort {$a <=> $b} keys %{$$indel_vcf_hashref{$ref_chr}}) { - my $ref_start = $$indel_vcf_hashref{$ref_chr}{$ref_start}{'ref_start'}; - my $ref_end = $$indel_vcf_hashref{$ref_chr}{$ref_start}{'ref_end'}; - my $check_overlap_flag = 0; - if (exists $$snp_vcf_hashref{$ref_chr}) { - if (exists $$snp_vcf_hashref{$ref_chr}{$ref_start}) { - $check_overlap_flag = 1; - print "!!! Warning! Both SNP and INDEL variants were defined within the same region: $ref_chr:$ref_start-$ref_end in the input vcf files!\n"; - print "!!! Only keep the SNP variant: $$snp_vcf_hashref{$ref_chr}{$ref_start}{'ref_allele'}=>$$snp_vcf_hashref{$ref_chr}{$ref_start}{'alt_allele'}.\n"; - print "!!! Ignore the INDEL variant $$indel_vcf_hashref{$ref_chr}{$ref_start}{'ref_allele'}=>$$indel_vcf_hashref{$ref_chr}{$ref_start}{'alt_allele'} within this region.\n\n"; - } else { - foreach my $s (sort {$a <=> $b} keys %{$$snp_vcf_hashref{$ref_chr}}) { - if ($ref_end < $s) { - last; - } elsif ($ref_start <= $$snp_vcf_hashref{$ref_chr}{$s}{'ref_end'}) { - $check_overlap_flag = check_overlap_region($ref_start, $ref_end, $$snp_vcf_hashref{$ref_chr}{$s}{'ref_start'}, $$snp_vcf_hashref{$ref_chr}{$s}{'ref_end'}); - if ($check_overlap_flag == 1) { - print "!!! Warning! Both SNP and INDEL variants were defined within the same region: $ref_chr:$ref_start-$ref_end in the input vcf files!\n"; - print "!!! Only keep the SNP variant: $$snp_vcf_hashref{$ref_chr}{$s}{'ref_allele'}=>$$snp_vcf_hashref{$ref_chr}{$s}{'alt_allele'}.\n"; - print "!!! 
Ignore the INDEL variant $$indel_vcf_hashref{$ref_chr}{$ref_start}{'ref_allele'}=>$$indel_vcf_hashref{$ref_chr}{$ref_start}{'alt_allele'} within this region.\n\n"; - last; - } - } - } - } - } - if ($check_overlap_flag == 0) { - $merged_vcf{$ref_chr}{$ref_start} = \%{$$indel_vcf_hashref{$ref_chr}{$ref_start}}; - } - } - } - return %merged_vcf; -} - -sub parse_gff_file { - my $fh = shift @_; - my %gff = (); - while (<$fh>) { - chomp; - /^##FASTA/ and last; - /^#/ and next; - /^\s*$/ and next; - my ($chr, $source, $type, $start, $end, $score, $strand, $phase, $attributes) = split /\t/, $_; - if ($attributes =~ /ID=([^;]+)/) { - my ($feature_id) = $1; - $gff{$feature_id}{'id'} = $feature_id; - $gff{$feature_id}{'type'} = $type; - $gff{$feature_id}{'chr'} = $chr; - $gff{$feature_id}{'start'} = $start; - $gff{$feature_id}{'end'} = $end; - $gff{$feature_id}{'strand'} = $strand; - $gff{$feature_id}{'source'} = $source; - $gff{$feature_id}{'score'} = $score; - $gff{$feature_id}{'phase'} = $phase; - $gff{$feature_id}{'attributes'} = $attributes; - if ($type eq 'mRNA') { - my ($mRNA_id, $gene_id) = ($attributes =~ /ID=([^;]+);\S*Parent=([^;]+)/); - $gff{$mRNA_id}{'parent'} = $gene_id; - } - } - if ($type eq 'exon') { - my ($mRNA_id) = ($attributes =~ /Parent=([^;]+)/); - my $exon_index = $start; - $gff{$mRNA_id}{'exon'}{$exon_index}{'chr'} = $chr; - $gff{$mRNA_id}{'exon'}{$exon_index}{'start'} = $start; - $gff{$mRNA_id}{'exon'}{$exon_index}{'end'} = $end; - $gff{$mRNA_id}{'exon'}{$exon_index}{'strand'} = $strand; - $gff{$mRNA_id}{'exon'}{$exon_index}{'source'} = $source; - $gff{$mRNA_id}{'exon'}{$exon_index}{'score'} = $score; - $gff{$mRNA_id}{'exon'}{$exon_index}{'phase'} = $phase; - } elsif ($type eq 'CDS') { - my ($mRNA_id) = ($attributes =~ /Parent=([^;]+)/); - my $cds_index = $start; - $gff{$mRNA_id}{'cds'}{$cds_index}{'chr'} = $chr; - $gff{$mRNA_id}{'cds'}{$cds_index}{'start'} = $start; - $gff{$mRNA_id}{'cds'}{$cds_index}{'end'} = $end; - $gff{$mRNA_id}{'cds'}{$cds_index}{'strand'} = $strand; - $gff{$mRNA_id}{'cds'}{$cds_index}{'source'} = $source; - $gff{$mRNA_id}{'cds'}{$cds_index}{'score'} = $score; - $gff{$mRNA_id}{'cds'}{$cds_index}{'phase'} = $phase; - } - } - foreach my $feature_id (sort keys %gff) { - if ($gff{$feature_id}{'type'} eq "mRNA") { - my $mRNA_id = $feature_id; - my $gene_id = $gff{$mRNA_id}{'parent'}; - $gff{$gene_id}{'mRNA'}{$mRNA_id} = \%{$gff{$mRNA_id}}; - } - } - # print Dumper(%gff); - return %gff; -} - -sub analyze_coding_partition { - my ($refseq_hashref, $gene_gff_hashref) = @_; - my %coding_partition = (); - foreach my $gene_id (sort keys %$gene_gff_hashref) { - my $gene_chr = $$gene_gff_hashref{$gene_id}{'chr'}; - if (exists $$refseq_hashref{$gene_chr}) { - my $gene_start = $$gene_gff_hashref{$gene_id}{'start'}; - my $gene_end = $$gene_gff_hashref{$gene_id}{'end'}; - my $gene_strand = $$gene_gff_hashref{$gene_id}{'strand'}; - my %mRNA_length = (); - # print "gene_id=$gene_id\n"; - if (not exists $$gene_gff_hashref{$gene_id}{'mRNA'}) { - # skip unexpected cases such as ENSG00000188403 that lacks the mRNA track in Ensembl's GFF3 file - next; - } - foreach my $mRNA_id (sort keys %{$$gene_gff_hashref{$gene_id}{'mRNA'}}) { - my $mRNA_length = $$gene_gff_hashref{$gene_id}{'mRNA'}{$mRNA_id}{'end'} - $$gene_gff_hashref{$gene_id}{'mRNA'}{$mRNA_id}{'start'} + 1; - $mRNA_length{$mRNA_id} = $mRNA_length; - } - my @mRNA_length = sort {$mRNA_length{$b} <=> $mRNA_length{$a}} keys %mRNA_length; - my $primary_mRNA_id = $mRNA_length[0]; - my $primary_mRNA_strand = 
$$gene_gff_hashref{$gene_id}{'mRNA'}{$primary_mRNA_id}{'strand'}; - my @cds_index = sort {$a <=> $b} keys %{$$gene_gff_hashref{$gene_id}{'mRNA'}{$primary_mRNA_id}{'cds'}}; - my @coding_sites = (); - foreach my $cds_index (@cds_index) { - my $cds_start = $$gene_gff_hashref{$gene_id}{'mRNA'}{$primary_mRNA_id}{'cds'}{$cds_index}{'start'}; - my $cds_end = $$gene_gff_hashref{$gene_id}{'mRNA'}{$primary_mRNA_id}{'cds'}{$cds_index}{'end'}; - for (my $pos = $cds_start; $pos <= $cds_end; $pos++) { - $coding_partition{'coding'}{"$gene_chr:$pos"} = 1; - push @coding_sites, $pos; - } - } - my @ordered_coding_sites = sort {$a <=> $b} @coding_sites; - my $codon_num = (scalar @ordered_coding_sites)/3; - for (my $codon_index = 1; $codon_index <= $codon_num; $codon_index++) { - my $codon_pos1; - my $codon_pos2; - my $codon_pos3; - my $codon_seq; - if ($primary_mRNA_strand eq "+") { - ($codon_pos1, $codon_pos2, $codon_pos3) = splice @ordered_coding_sites, 0, 3; - my $codon_pos1_base = substr $$refseq_hashref{$gene_chr}, $codon_pos1 - 1, 1; - my $codon_pos2_base = substr $$refseq_hashref{$gene_chr}, $codon_pos2 - 1, 1; - my $codon_pos3_base = substr $$refseq_hashref{$gene_chr}, $codon_pos3 - 1, 1; - $codon_seq = $codon_pos1_base . $codon_pos2_base . $codon_pos3_base; - } else { - ($codon_pos3, $codon_pos2, $codon_pos1) = splice @ordered_coding_sites, 0, 3; - my $codon_pos1_base = substr $$refseq_hashref{$gene_chr}, $codon_pos1 - 1, 1; - my $codon_pos2_base = substr $$refseq_hashref{$gene_chr}, $codon_pos2 - 1, 1; - my $codon_pos3_base = substr $$refseq_hashref{$gene_chr}, $codon_pos3 - 1, 1; - $codon_seq = $codon_pos3_base . $codon_pos2_base . $codon_pos1_base; - $codon_seq = revcom($codon_seq); - } - my $codon_4d_site = codon_4d_site($codon_seq, $codon_pos1, $codon_pos2, $codon_pos3); - my $codon_2d_site = codon_2d_site($codon_seq, $codon_pos1, $codon_pos2, $codon_pos3); - if ($codon_4d_site ne "-1") { - $coding_partition{'4d'}{"$gene_chr:$codon_4d_site"} = 1; - } - if ($codon_2d_site ne "-1") { - if ($codon_2d_site =~ /;/) { - my @codon_2d_site = split /;/, $codon_2d_site; - $coding_partition{'2d'}{"$gene_chr:$codon_2d_site[0]"} = 1; - $coding_partition{'2d'}{"$gene_chr:$codon_2d_site[1]"} = 1; - } else { - $coding_partition{'2d'}{"$gene_chr:$codon_2d_site"} = 1; - } - } - } - } - } - return %coding_partition; -} - -sub codon_4d_site { - my ($codon_seq, $codon_pos1, $codon_pos2, $codon_pos3) = @_; - $codon_seq = uc $codon_seq; - if ($codon_seq =~ /GC[ATGC]/) { - return "$codon_pos3"; - } elsif ($codon_seq =~ /CG[ATGC]/) { - return "$codon_pos3"; - } elsif ($codon_seq =~ /GG[ATGC]/) { - return "$codon_pos3"; - } elsif ($codon_seq =~ /CT[ATGC]/) { - return "$codon_pos3"; - } elsif ($codon_seq =~ /CC[ATGC]/) { - return "$codon_pos3"; - } elsif ($codon_seq =~ /TC[ATGC]/) { - return "$codon_pos3"; - } elsif ($codon_seq =~ /AC[ATGC]/) { - return "$codon_pos3"; - } elsif ($codon_seq =~ /GT[ATGC]/) { - return "$codon_pos3"; - } else { - return "-1"; - } -} - -sub codon_2d_site { - my ($codon_seq, $codon_pos1, $codon_pos2, $codon_pos3) = @_; - $codon_seq = uc $codon_seq; - if ($codon_seq =~ /TT[TC]/) { - return "$codon_pos3"; - } elsif ($codon_seq =~ /TT[AG]/) { - return "$codon_pos1;$codon_pos3"; - } elsif ($codon_seq =~ /AT[ATC]/) { - return "$codon_pos3"; - } elsif ($codon_seq =~ /TA[TC]/) { - return "$codon_pos3"; - } elsif ($codon_seq =~ /TA[AG]/) { - return "$codon_pos3"; - } elsif ($codon_seq =~ /CA[TC]/) { - return "$codon_pos3"; - } elsif ($codon_seq =~ /CA[AG]/) { - return "$codon_pos3"; - } elsif 
($codon_seq =~ /AA[TC]/) { - return "$codon_pos3"; - } elsif ($codon_seq =~ /AA[AG]/) { - return "$codon_pos3"; - } elsif ($codon_seq =~ /GA[TC]/) { - return "$codon_pos3"; - } elsif ($codon_seq =~ /GA[AG]/) { - return "$codon_pos3"; - } elsif ($codon_seq =~ /TG[TC]/) { - return "$codon_pos3"; - } elsif ($codon_seq =~ /AG[TC]/) { - return "$codon_pos3"; - } elsif ($codon_seq =~ /AG[AG]/) { - return "$codon_pos1;$codon_pos3"; - } elsif ($codon_seq =~ /CT[AG]/) { - return "$codon_pos1"; - } elsif ($codon_seq =~ /CG[AG]/) { - return "$codon_pos1"; - } else { - return "-1"; - } -} - -sub revcom { - my $seq = shift @_; - my $seq_revcom = reverse $seq; - $seq_revcom =~ tr/ATGCatgc/TACGtacg/; - return $seq_revcom; -} - -sub introduce_defined_snp_indel { - my ($vcf_hashref, $refseq_hashref, $simseq_hashref, $ref2sim_map_hashref) = @_; - my $snp_count = 0; - my $indel_count = 0; - my %offset = (); - foreach my $ref_chr (sort keys %$vcf_hashref) { - $offset{$ref_chr} = 0; - } - foreach my $ref_chr (sort keys %$vcf_hashref) { - foreach my $ref_start (sort {$a <=> $b} keys %{$$vcf_hashref{$ref_chr}}) { - my $variant_id = $$vcf_hashref{$ref_chr}{$ref_start}{'variant_id'}; - my $variant_type = $$vcf_hashref{$ref_chr}{$ref_start}{'variant_type'}; - my $ref_end = $$vcf_hashref{$ref_chr}{$ref_start}{'ref_end'}; - my $ref_allele = $$vcf_hashref{$ref_chr}{$ref_start}{'ref_allele'}; - my $alt_allele = $$vcf_hashref{$ref_chr}{$ref_start}{'alt_allele'}; - my $ref_allele_length = length $ref_allele; - my $alt_allele_length = length $alt_allele; - # print "ref_chr=$ref_chr, ref_start=$ref_start, ref_end=$ref_end, offset=$offset{$ref_chr}\n"; - # check if there are pre-introduced SNP/INDEL variants overlapping at the same site already - my $check_overlap_flag = 0; - if (exists $$ref2sim_map_hashref{$ref_chr}) { - if (exists $$ref2sim_map_hashref{$ref_chr}{$ref_start}) { - if ($$ref2sim_map_hashref{$ref_chr}{$ref_start}{'variant_type'} =~ /(SNP|INDEL)/) { - $check_overlap_flag = 1; - } - } else { - foreach my $s (sort {$a <=> $b} keys %{$$ref2sim_map_hashref{$ref_chr}}) { - if ($ref_end < $s) { - last; - } elsif ($ref_start <= $$ref2sim_map_hashref{$ref_chr}{$s}{'ref_end'}) { - if ($$ref2sim_map_hashref{$ref_chr}{$s}{'variant_type'} =~ /(SNP|INDEL)/) { - if (($variant_type eq "SNP") and ($$ref2sim_map_hashref{$ref_chr}{$s}{'variant_type'} eq "SNP")) { - next; - } else { - $check_overlap_flag = check_overlap_region($ref_start, $ref_end, $$ref2sim_map_hashref{$ref_chr}{$s}{'ref_start'}, $$ref2sim_map_hashref{$ref_chr}{$s}{'ref_end'}); - if ($check_overlap_flag == 1) { - last; - } - } - } - } - } - } - } - if ($check_overlap_flag == 0) { - substr $$simseq_hashref{$ref_chr}, $ref_start - 1 + $offset{$ref_chr}, $ref_allele_length, $$vcf_hashref{$ref_chr}{$ref_start}{'alt_allele'}; - - $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'variant_type'} = $variant_type; - $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'variant_id'} = $variant_id; - - $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'ref_chr'} = $ref_chr; - $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'ref_start'} = $ref_start; - $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'ref_end'} = $ref_end; - $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'ref_allele'} = $ref_allele; - $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'ref_strand'} = "+"; - - $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'sim_chr'} = $ref_chr; - $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'sim_start'} = $ref_start + $offset{$ref_chr}; - $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'sim_end'} = $ref_start + 
$alt_allele_length - 1 + $offset{$ref_chr}; - $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'sim_allele'} = $alt_allele; - $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'sim_strand'} = "+"; - - if ($variant_type eq "SNP") { - $snp_count++; - } else { - # INDEL - my $indel_size = $alt_allele_length - $ref_allele_length; - $offset{$ref_chr} += $indel_size; - $indel_count++; - if ($indel_size > 0) { - $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'indel_type'} = 'INSERTION'; - $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'indel_size'} = $indel_size; - } else { - $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'indel_type'} = 'DELETION'; - $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'indel_size'} = -$indel_size; - } - } - } else { - print "!!! Warning! Multiple variants were defined within the same region: $ref_chr:$ref_start-$ref_end in the input vcf file(s)\n"; - print "!!! Only keep the first instance: $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'ref_allele'} => $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'alt_allele'}\n"; - print "!!! Ignore all the other variants defined within this region.\n"; - } - } - } - # print "> Introduced $snp_count SNP variants and $indel_count INDEL variants based on the input vcf file(s).\n"; -} - -sub check_overlap_region { - my ($region1_start, $region1_end, $region2_start, $region2_end) = @_; - # print "region1_start=$region1_start, region1_end=$region1_end, region2_start=$region2_start, region2_end=$region2_end\n"; - my $flag = 0; - if (($region1_start <= $region2_end) and ($region1_end >= $region2_start)) { - $flag = 1; - } - return $flag; -} - -sub adjust_variant_coordinates_in_simseq { - my ($ref_chr, $ref_start, $offset, $ref2sim_map_hashref) = @_; - if (exists $$ref2sim_map_hashref{$ref_chr}) { - foreach my $s (sort {$a <=> $b} keys %{$$ref2sim_map_hashref{$ref_chr}}) { - if ($s > $ref_start) { - # adjustment needed - $$ref2sim_map_hashref{$ref_chr}{$s}{'sim_start'} += $offset; - $$ref2sim_map_hashref{$ref_chr}{$s}{'sim_end'} += $offset; - } - } - } -} - -sub cal_prob_interval { - my $prob_hashref = shift @_; - my %prob_interval = (); - my $lower_bound = 0; - my $upper_bound = 0; - foreach my $key (sort {($$prob_hashref{$a} <=> $$prob_hashref{$b}) or ($a cmp $b)} keys %$prob_hashref){ - $upper_bound = $lower_bound + $$prob_hashref{$key}; - $prob_interval{$key} = "$lower_bound--$upper_bound"; - $lower_bound = $upper_bound; - } - return %prob_interval; -} - -sub sample_from_interval { - my $prob_interval_hashref = shift @_; - my $dice = rand(1); - # print "dice = $dice\n"; - foreach my $key (sort keys %$prob_interval_hashref){ - my ($lower, $upper) = split /--/, $$prob_interval_hashref{$key}; - if(($dice >= $lower) and ($dice < $upper)){ - return $key; - } - } -} - -sub sample_inserted_seq { - my ($indel_size, $refseq_base_freq_hashref) = @_; - my $inserted_seq = ""; - my %refseq_base_prob_interval = cal_prob_interval($refseq_base_freq_hashref); - # print Dumper(%refseq_base_prob_interval); - for(my $i = 1; $i <= $indel_size; $i++) { - $inserted_seq .= sample_from_interval(\%refseq_base_prob_interval); - } - return $inserted_seq; -} - -sub parse_snp_model { - my $fh = shift @_; - my %model = (); - while (<$fh>) { - chomp; - /^#/ and next; - /^\s*$/ and next; - if (/titv_ratio=(\S+)/) { - $model{'titv_ratio'} = $1; - if ($model{'titv_ratio'} eq "NA") { - print "\n!!! Error! The supplied SNP model is incomplete: titv_ratio = NA\n"; - print "!!! 
Exit!\n"; - die; - } - } else { - my ($original_base, $new_base, $freq) = ($_ =~ /(\w)\-\>(\w)\t(\S+)/); - # print "original_base=$original_base, new_base=$new_base, freq=$freq\n"; - $model{'substitution_freq'}{$original_base}{$new_base} = $freq; - if ($model{'substitution_freq'}{$original_base}{$new_base} eq "NA") { - print "\n!!! Error! The supplied SNP model is incomplete: ${original_base}\-\>${new_base} substitution frequency = NA\n"; - print "!!! Exit!\n"; - die; - } - if (exists $model{'total_substitution_freq'}{$original_base}) { - $model{'total_substitution_freq'}{$original_base} += $freq; - } else { - $model{'total_substitution_freq'}{$original_base} = $freq; - } - } - } - my @base = qw(A T G C); - foreach my $original_base (@base) { - foreach my $new_base (@base) { - if ($original_base ne $new_base) { - $model{'substitution_prob'}{$original_base}{$new_base} = $model{'substitution_freq'}{$original_base}{$new_base}/$model{'total_substitution_freq'}{$original_base}; - } - } - } - # print(Dumper(%model)); - return %model; -} - -sub pupy { - my $base = shift @_; - my $result; - if ($base =~ /(A|a)/) { - $result = "purine"; - } elsif ($base =~ /(G|g)/) { - $result = "purine"; - } elsif ($base =~ /(T|t)/) { - $result = "pyrimidine"; - } elsif ($base =~ /(C|c)/) { - $result = "pyrimidine"; - } else { - $result = "unknown"; - } - return $result; -} - -sub determine_substitution_probability_for_snp { - my ($titv_ratio, $snp_model) = @_; - my %substitution_prob = (); - my @base = qw(A T G C); - # set up titv_ratio and substitution_probability to prepare for the derived base sampling - if (defined $snp_model) { - # based on model - # print "> snp_model = $snp_model\n\n"; - my $snp_model_fh = read_file($snp_model); - my %snp_model = parse_snp_model($snp_model_fh); - close $snp_model_fh; - $titv_ratio = $snp_model{'titv_ratio'}; - %substitution_prob = %{$snp_model{'substitution_prob'}}; - } else { - # random - # print "> titv_ratio = $titv_ratio\n\n"; - foreach my $original_base (@base) { - foreach my $new_base (@base) { - if ($original_base ne $new_base) { - if ($titv_ratio eq "Inf") { - # transition only - if ((pupy($original_base) eq "transition") and (pupy($new_base) eq "transition")) { - $substitution_prob{$original_base}{$new_base} = 1; - } else { - $substitution_prob{$original_base}{$new_base} = 0; - } - } else { - if (pupy($original_base) eq pupy($new_base)) { - $substitution_prob{$original_base}{$new_base} = $titv_ratio/($titv_ratio + 1); - } else { - $substitution_prob{$original_base}{$new_base} = 1/(($titv_ratio + 1) * 2); - } - } - } - } - } - } - return %substitution_prob; -} - -sub sample_genome_space { - my $genome_space_hashref = shift @_; - my $sample = 1 + int rand($$genome_space_hashref{'genome-wide'}{'length'}); - # print "sub sample_genome_space >> sample = $sample\n"; - my ($chr_sampled, $start_sampled) = genome_space_translator($sample, $genome_space_hashref); - # print "sub sample_genome_space >> chr_sampled = $chr_sampled, start_sampled = $start_sampled\n"; - return ($chr_sampled, $start_sampled); -} - -sub genome_space_translator { - my ($sample, $genome_space_hashref) = @_; - my ($chr_translated, $pos_translated); - foreach my $chr (sort keys %{$$genome_space_hashref{'chr-wide'}}) { - my $chr_start = $$genome_space_hashref{'chr-wide'}{$chr}{'start'}; - my $chr_end = $$genome_space_hashref{'chr-wide'}{$chr}{'end'}; - if (($sample >= $chr_start) and ($sample <= $chr_end)) { - $chr_translated = $chr; - $pos_translated = $sample - $chr_start + 1; - last; - } - } - return 
($chr_translated, $pos_translated); -} - -sub sample_alt_allele { - my ($ref_allele, $substitution_prob_hashref) = @_; - # print "ref_allele = $ref_allele\n"; - # print "substitution_prob = \n"; - # print Dumper(%$substitution_prob_hashref); - my %conditional_substitution_prob = %{$$substitution_prob_hashref{$ref_allele}}; - # print "conditional_substitution_prob = \n"; - # print Dumper(%conditional_substitution_prob); - my %prob_interval = cal_prob_interval(\%conditional_substitution_prob); - # print "prob_interval = \n"; - # print Dumper(%prob_interval); - my $alt_allele = sample_from_interval(\%prob_interval); - # print "alt_allele = $alt_allele\n"; - return $alt_allele; -} - -sub introduce_random_snp_with_coding_partition { - my ($snp_count, $snp_model, $titv_ratio, $coding_partition_for_snp_simulation, $gene_hashref, $refseq_hashref, $simseq_hashref, $ref2sim_map_hashref) = @_; - my %coding_partition = analyze_coding_partition($refseq_hashref, $gene_hashref); - if ($coding_partition_for_snp_simulation =~ /noncoding/) { - foreach my $chr (sort keys %$refseq_hashref) { - my $chr_length = length $$refseq_hashref{$chr}; - for (my $i = 1; $i <= $chr_length; $i++) { - if (not exists $coding_partition{'coding'}{"$chr:$i"}) { - $coding_partition{'noncoding'}{"$chr:$i"} = 1; - } - } - } - } - my @sampling_space = sort keys %{$coding_partition{$coding_partition_for_snp_simulation}}; - my $sampling_space_size = scalar @sampling_space; - my %substitution_prob = determine_substitution_probability_for_snp($titv_ratio, $snp_model); - # print Dumper(%substitution_prob); - # sample and introduce SNP in the same time - if ($sampling_space_size < $snp_count) { - print "!!! Warning! Only $sampling_space_size $coding_partition_for_snp_simulation sites available in the genome whereas $snp_count SNPs needs to be simulated !!!\n"; - print "!!! 
Will only introduce $sampling_space_size SNPs in $coding_partition_for_snp_simulation sites !!!\n"; - $snp_count = $sampling_space_size; - } - my $indel_max_radius = 50; - for (my $i = 1; $i <= $snp_count; $i++) { - # sample partition space - SAMPLE_RANDOM_SNP: - my $sample = int rand($sampling_space_size); - my ($ref_chr, $ref_start) = split /:/, $sampling_space[$sample]; - # print "chr_sampled = $ref_chr, start_sampled = $ref_start\n"; - my $ref_end = $ref_start; - my $ref_allele = substr $$refseq_hashref{$ref_chr}, $ref_start - 1, 1; - my $alt_allele; - # check if the sampled position contains any ambiguous bases - if ($ref_allele =~ /(N|n)/) { - goto SAMPLE_RANDOM_SNP; - } else { - # check if there are pre-introduced SNP/INDEL variants overlapping at the same site already - my $check_overlap_flag = 0; - if (exists $$ref2sim_map_hashref{$ref_chr}) { - if (exists $$ref2sim_map_hashref{$ref_chr}{$ref_start}) { - if ($$ref2sim_map_hashref{$ref_chr}{$ref_start}{'variant_type'} =~ /(SNP|INDEL)/) { - $check_overlap_flag = 1; - goto SAMPLE_RANDOM_SNP; - } - } else { - if ((defined $indel_count) and ($indel_count > 0)) { - foreach my $s (sort {$a <=> $b} keys %{$$ref2sim_map_hashref{$ref_chr}}) { - my $e = $$ref2sim_map_hashref{$ref_chr}{$s}{'ref_end'}; - if ($ref_end < $s - $indel_max_radius - 1) { - last; - } elsif ($ref_start >= $e + $indel_max_radius + 1) { - next; - } else { - if ($$ref2sim_map_hashref{$ref_chr}{$s}{'variant_type'} =~ /INDEL/) { - $check_overlap_flag = check_overlap_region($ref_start, $ref_end, $$ref2sim_map_hashref{$ref_chr}{$s}{'ref_start'}, $$ref2sim_map_hashref{$ref_chr}{$s}{'ref_end'}); - if ($check_overlap_flag == 1) { - goto SAMPLE_RANDOM_SNP; - } - } - } - } - } - } - } - if ($check_overlap_flag == 0) { - # register this SNP - $alt_allele = sample_alt_allele($ref_allele, \%substitution_prob); - # print "ref_allele = $ref_allele, alt_allele = $alt_allele\n"; - substr $$simseq_hashref{$ref_chr}, $ref_start - 1, 1, $alt_allele; - $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'ref_chr'} = $ref_chr; - $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'ref_start'} = $ref_start; - $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'ref_end'} = $ref_end; - $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'ref_allele'} = $ref_allele; - $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'ref_strand'} = "+"; - $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'sim_chr'} = $ref_chr; - $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'sim_start'} = $ref_start; - $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'sim_end'} = $ref_end; - $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'sim_allele'} = $alt_allele; - $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'sim_strand'} = "+"; - $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'variant_id'} = "SNP_${i}"; - $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'variant_type'} = "SNP"; - } - } - } -} - -sub introduce_random_snp { - my ($snp_count, $snp_model, $titv_ratio, $refseq_hashref, $simseq_hashref, $ref2sim_map_hashref) = @_; - my %refseq_genome_space = create_genome_space($refseq_hashref); - my %substitution_prob = determine_substitution_probability_for_snp($titv_ratio, $snp_model); - # print Dumper(%substitution_prob); - # sample and introduce SNP in the same time - my $indel_max_radius = 50; - for (my $i = 1; $i <= $snp_count; $i++) { - # sample genome space - SAMPLE_RANDOM_SNP: - my ($ref_chr, $ref_start) = sample_genome_space(\%refseq_genome_space); - # print "chr_sampled = $ref_chr, start_sampled = $ref_start\n"; - my $ref_end = $ref_start; - my $ref_allele = substr 
$$refseq_hashref{$ref_chr}, $ref_start - 1, 1; - my $alt_allele; - # check if the sampled position contains any ambiguous bases - if ($ref_allele =~ /(N|n)/) { - goto SAMPLE_RANDOM_SNP; - } else { - # check if there are pre-introduced SNP/INDEL variants overlapping at the same site already - my $check_overlap_flag = 0; - if (exists $$ref2sim_map_hashref{$ref_chr}) { - if (exists $$ref2sim_map_hashref{$ref_chr}{$ref_start}) { - if ($$ref2sim_map_hashref{$ref_chr}{$ref_start}{'variant_type'} =~ /(SNP|INDEL)/) { - $check_overlap_flag = 1; - goto SAMPLE_RANDOM_SNP; - } - } else { - if ((defined $indel_count) and ($indel_count > 0)) { - foreach my $s (sort {$a <=> $b} keys %{$$ref2sim_map_hashref{$ref_chr}}) { - my $e = $$ref2sim_map_hashref{$ref_chr}{$s}{'ref_end'}; - if ($ref_end < $s - $indel_max_radius - 1) { - last; - } elsif ($ref_start >= $e + $indel_max_radius + 1) { - next; - } else { - if ($$ref2sim_map_hashref{$ref_chr}{$s}{'variant_type'} =~ /INDEL/) { - $check_overlap_flag = check_overlap_region($ref_start, $ref_end, $$ref2sim_map_hashref{$ref_chr}{$s}{'ref_start'}, $$ref2sim_map_hashref{$ref_chr}{$s}{'ref_end'}); - if ($check_overlap_flag == 1) { - goto SAMPLE_RANDOM_SNP; - } - } - } - } - } - } - } - if ($check_overlap_flag == 0) { - # register this SNP - $alt_allele = sample_alt_allele($ref_allele, \%substitution_prob); - # print "ref_allele = $ref_allele, alt_allele = $alt_allele\n"; - substr $$simseq_hashref{$ref_chr}, $ref_start - 1, 1, $alt_allele; - $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'ref_chr'} = $ref_chr; - $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'ref_start'} = $ref_start; - $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'ref_end'} = $ref_end; - $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'ref_allele'} = $ref_allele; - $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'ref_strand'} = "+"; - $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'sim_chr'} = $ref_chr; - $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'sim_start'} = $ref_start; - $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'sim_end'} = $ref_end; - $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'sim_allele'} = $alt_allele; - $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'sim_strand'} = "+"; - $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'variant_id'} = "SNP_${i}"; - $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'variant_type'} = "SNP"; - } - } - } -} - -sub parse_indel_model { - my $fh = shift @_; - my %model = (); - while (<$fh>) { - chomp; - /^#/ and next; - /^\s*$/ and next; - if (/ins_del_ratio=(\S+)/) { - $model{'ins_del_ratio'} = $1; - if ($model{'ins_del_ratio'} eq "NA") { - print "!!! Error! The supplied INDEL model is incomplete: ins_del_ratio = NA\n"; - print "!!! Exit!\n"; - die; - } - } else { - my ($indel_size, $freq) = ($_ =~ /(\d+)\t(\S+)/); - if ($freq eq "NA") { - print "!!! Error! The supplied INDEL model is incomplete: the frequency of ${indel_size}-bp INDEL is NA\n"; - print "!!! 
Exit!\n"; - die; - } - $model{'indel_size_prob'}{$indel_size} = $freq; - } - } - return %model; -} - -sub sample_indel_type { - my $ins_del_ratio = shift @_; - my $sample = rand(1); - if ($ins_del_ratio eq "Inf") { - $sample = "INSERTION"; - } elsif ($sample < $ins_del_ratio/(1 + $ins_del_ratio)) { - $sample = "INSERTION"; - } else { - $sample = "DELETION"; - } - return $sample; -} - -sub sample_indel_size { - my $indel_size_prob_hashref = shift @_; - my %indel_size_prob_interval = cal_prob_interval($indel_size_prob_hashref); - my $indel_size = sample_from_interval(\%indel_size_prob_interval); - return $indel_size; -} - -sub introduce_random_indel { - my ($indel_count, $indel_model, $ins_del_ratio, $refseq_base_freq_hashref, $refseq_hashref, $simseq_hashref, $ref2sim_map_hashref) = @_; - my %refseq_genome_space = create_genome_space($refseq_hashref); - my %indel_prob = (); - if (defined $indel_model) { - # print "> indel_model = $indel_model\n\n"; - my $indel_model_fh = read_file($indel_model); - %indel_prob = parse_indel_model($indel_model_fh); - } else { - # print "> ins_del_ratio = $ins_del_ratio\n\n"; - $indel_prob{'ins_del_ratio'} = $ins_del_ratio; - # initialize default INDEL size distribution - my $cumulative_prob = 0; - for(my $indel_size = 50; $indel_size > 0; $indel_size--) { - if ($indel_size > 1) { - $indel_prob{'indel_size_prob'}{$indel_size} = $indel_size_powerlaw_constant * $indel_size **(-$indel_size_powerlaw_alpha); - $cumulative_prob += $indel_prob{'indel_size_prob'}{$indel_size}; - } else { - $indel_prob{'indel_size_prob'}{$indel_size} = 1 - $cumulative_prob; - } - } - } - - # sample INDEL first - my %indel_samples = (); - my $indel_max_radius = 50; # maximal indel size: 50 bp - for (my $i = 1; $i <= $indel_count; $i++) { - # sample indel type and size - my $indel_type = sample_indel_type($indel_prob{'ins_del_ratio'}); - my $indel_size = sample_indel_size(\%{$indel_prob{'indel_size_prob'}}); - # print "indel_type = $indel_type, indel_size = $indel_size\n"; - my $ref_allele; - my $ref_allele_length; - my $alt_allele; - my $alt_allele_length; - # sample genome space - SAMPLE_RANDOM_INDEL: - my ($ref_chr, $ref_start) = sample_genome_space(\%refseq_genome_space); - my $ref_end; - # print "chr_sampled = $ref_chr, start_sampled = $ref_start\n"; - if ($indel_type eq "INSERTION") { - $ref_end = $ref_start; - $ref_allele = substr $$refseq_hashref{$ref_chr}, $ref_start - 1, 1; - $ref_allele_length = 1; - } else { - # Deletion - # actual deletion starts from ref_start + 1 - $ref_end = $ref_start + $indel_size; - # check if the deletion will go beyond the chromosome end - if ($ref_end > $refseq_genome_space{'chr-wide'}{$ref_chr}{'length'}) { - goto SAMPLE_RANDOM_INDEL; - } - $ref_allele = substr $$refseq_hashref{$ref_chr}, $ref_start - 1, $indel_size + 1; - $ref_allele_length = $indel_size + 1; - } - # check if the sampled INDEL contains any ambiguous bases - if ($ref_allele =~ /(N|n)/) { - goto SAMPLE_RANDOM_INDEL; - } - # check if there are pre-introduced SNP/INDEL variants overlapping at the same site already - my $check_overlap_flag = 0; - if (exists $$ref2sim_map_hashref{$ref_chr}) { - if (exists $$ref2sim_map_hashref{$ref_chr}{$ref_start}) { - if ($$ref2sim_map_hashref{$ref_chr}{$ref_start}{'variant_type'} =~ /(SNP|INDEL)/) { - $check_overlap_flag = 1; - goto SAMPLE_RANDOM_INDEL; - } - } else { - foreach my $s (sort {$a <=> $b} keys %{$$ref2sim_map_hashref{$ref_chr}}) { - my $e = $$ref2sim_map_hashref{$ref_chr}{$s}{'ref_end'}; - if ($ref_end < $s - $indel_max_radius - 1) { - 
last; - } elsif ($ref_start >= $e + $indel_max_radius + 1) { - next; - } else { - if ($$ref2sim_map_hashref{$ref_chr}{$s}{'variant_type'} =~ /(SNP|INDEL)/) { - $check_overlap_flag = check_overlap_region($ref_start, $ref_end, $$ref2sim_map_hashref{$ref_chr}{$s}{'ref_start'}, $$ref2sim_map_hashref{$ref_chr}{$s}{'ref_end'}); - if ($check_overlap_flag == 1) { - goto SAMPLE_RANDOM_INDEL; - } - } - } - } - } - } - # check if there are pre-sampled INDEL variants overlapping at the same site already - if (exists $indel_samples{$ref_chr}) { - if (exists $indel_samples{$ref_chr}{$ref_start}) { - $check_overlap_flag = 1; - goto SAMPLE_RANDOM_INDEL; - } - } else { - foreach my $s (sort {$a <=> $b} keys %{$indel_samples{$ref_chr}}) { - my $e = $indel_samples{$ref_chr}{$s}{'ref_end'}; - if ($ref_end < $s - $indel_max_radius - 1) { - last; - } elsif ($ref_start >= $e + $indel_max_radius + 1) { - next; - } else { - $check_overlap_flag = check_overlap_region($ref_start, $ref_end, $indel_samples{$ref_chr}{$s}{'ref_start'}, $indel_samples{$ref_chr}{$s}{'ref_end'}); - if ($check_overlap_flag == 1) { - goto SAMPLE_RANDOM_INDEL; - } - } - } - } - # all check passed, register this INDEL - if ($indel_type eq "INSERTION") { - $alt_allele = sample_inserted_seq($indel_size, $refseq_base_freq_hashref); - $alt_allele = $ref_allele . $alt_allele; - } else { - $alt_allele = substr $$refseq_hashref{$ref_chr}, $ref_start - 1, 1; - } - # print "ref_allele = $ref_allele, alt_allele = $alt_allele\n"; - # register this INDEL for later introduction. - $indel_samples{$ref_chr}{$ref_start}{'indel_id'} = "INDEL_${i}"; - $indel_samples{$ref_chr}{$ref_start}{'indel_type'} = $indel_type; - $indel_samples{$ref_chr}{$ref_start}{'indel_size'} = $indel_size; - $indel_samples{$ref_chr}{$ref_start}{'ref_chr'} = $ref_chr; - $indel_samples{$ref_chr}{$ref_start}{'ref_start'} = $ref_start; - $indel_samples{$ref_chr}{$ref_start}{'ref_end'} = $ref_end; - $indel_samples{$ref_chr}{$ref_start}{'ref_allele'} = $ref_allele; - $indel_samples{$ref_chr}{$ref_start}{'alt_allele'} = $alt_allele; - } - - # introduce all sampled INDELs - my %offset = (); - foreach my $ref_chr (sort keys %$refseq_hashref) { - $offset{$ref_chr} = 0; - } - for my $ref_chr (sort keys %indel_samples) { - for my $ref_start (sort {$a <=> $b} keys %{$indel_samples{$ref_chr}}) { - my $indel_id = $indel_samples{$ref_chr}{$ref_start}{'indel_id'}; - my $indel_type = $indel_samples{$ref_chr}{$ref_start}{'indel_type'}; - my $indel_size = $indel_samples{$ref_chr}{$ref_start}{'indel_size'}; - my $ref_end = $indel_samples{$ref_chr}{$ref_start}{'ref_end'}; - my $ref_allele = $indel_samples{$ref_chr}{$ref_start}{'ref_allele'}; - my $alt_allele = $indel_samples{$ref_chr}{$ref_start}{'alt_allele'}; - my $ref_allele_length = length $ref_allele; - my $alt_allele_length = length $alt_allele; - substr $$simseq_hashref{$ref_chr}, $ref_start + $offset{$ref_chr} - 1, $ref_allele_length, $alt_allele; - $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'ref_chr'} = $ref_chr; - $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'ref_start'} = $ref_start; - $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'ref_end'} = $ref_end; - $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'ref_allele'} = $ref_allele; - $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'ref_strand'} = "+"; - $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'sim_chr'} = $ref_chr; - $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'sim_start'} = $ref_start + $offset{$ref_chr}; - $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'sim_end'} = $ref_start + $alt_allele_length - 1 + 
$offset{$ref_chr}; - $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'sim_allele'} = $alt_allele; - $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'sim_strand'} = "+"; - $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'variant_type'} = "INDEL"; - $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'variant_id'} = $indel_id; - if ($indel_type eq "INSERTION") { - $offset{$ref_chr} += $indel_size; - adjust_variant_coordinates_in_simseq($ref_chr, $ref_start, $indel_size, $ref2sim_map_hashref); - } else { - $offset{$ref_chr} -= $indel_size; - adjust_variant_coordinates_in_simseq($ref_chr, $ref_start, -$indel_size, $ref2sim_map_hashref); - } - } - } -} - -sub parse_sv_vcf_file { - my $fh = shift @_; - my %sv = (); - while (<$fh>) { - chomp; - /^\s*$/ and next; - /^#/ and next; - my ($ref_chr, $ref_start, $id, $ref_allele, $alt_allele, $variant_qual, $variant_filter, $variant_info) = split /\t/, $_; - if (($variant_info !~ /SVTYPE=/) or ($variant_info !~ /EVENT=/)) { - print "!!! Error! The tags SVTYPE and EVENT are mandatory in the input vcf for defined CNV, inversion, or translocation variants!\n"; - print "!!! Exit!\n"; - die; - } else { - my ($sv_type) = ($variant_info =~ /SVTYPE=([^;]+)/); - my ($sv_event) = ($variant_info =~ /EVENT=([^;]+)/); - if ($sv_type eq "INV") { - my $ref_end; - if ($variant_info =~ /END=([^;]+)/) { - $ref_end = $1; - } else { - print "!!! Error! The mandatory tag 'END=' has not been specified in the input vcf for the defined INV:\n"; - print "!!! $_\n"; - print "!!! Exit!\n"; - die; - } - $sv{$sv_event}{'ref_chr'} = $ref_chr; - $sv{$sv_event}{'ref_start'} = $ref_start; - $sv{$sv_event}{'ref_end'} = $ref_end; - $sv{$sv_event}{'sv_type'} = $sv_type; - $sv{$sv_event}{'sv_event'} = $sv_event; - } elsif ($sv_type eq "DEL") { - my $ref_end; - if ($variant_info =~ /END=([^;]+)/) { - $ref_end = $1; - } else { - print "!!! Error! The mandatory tag 'END=' has not been specified in the input vcf for the defined DEL:\n"; - print "!!! $_\n"; - print "!!! Exit!\n"; - die; - } - $sv{$sv_event}{'ref_chr'} = $ref_chr; - $sv{$sv_event}{'ref_start'} = $ref_start; - $sv{$sv_event}{'ref_end'} = $ref_end; - $sv{$sv_event}{'sv_type'} = $sv_type; - $sv{$sv_event}{'sv_event'} = $sv_event; - } elsif ($sv_type eq "BND") { - $sv{$sv_event}{'sv_type'} = "to_be_classified"; - $sv{$sv_event}{'sv_event'} = $sv_event; - if ($variant_info =~ /duplication_type=tandem_duplication/) { - $sv{$sv_event}{'duplication_type'} = "tandem_duplication"; - } elsif ($variant_info =~ /duplication_type=dispersed_duplication/) { - $sv{$sv_event}{'duplication_type'} = "dispersed_duplication"; - } - if ($variant_info =~ /inserted_copy_number=(\d+)/) { - $sv{$sv_event}{'inserted_copy_number'} = $1; - } - if ($variant_info =~ /total_copy_number=(\d+)/) { - $sv{$sv_event}{'total_copy_number'} = $1; - } - # see the secion 5 of VCFv4.1 specification (https://samtools.github.io/hts-specs/VCFv4.1.pdf) for the detailed meaning of s, t, and p used below. 
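# A minimal, self-contained sketch of the four breakend (BND) ALT bracket forms that the
# block below distinguishes (t[p[, t]p], ]p]t, [p[t; see section 5 of the VCFv4.1 spec).
# The subroutine name classify_bnd_alt is illustrative only and not part of simuG; it just
# shows how the mate locus p, the anchor sequence t, and the relative strand/position of p
# are recovered from the ALT string, mirroring the regexes used below.
use strict;
use warnings;

sub classify_bnd_alt {
    my $alt = shift @_;
    my ($t, $p, $p_strand, $p_position);
    if ($alt =~ /^(\S+)\[(\S+)\[$/) {        # t[p[ : p placed after the ref allele, "+" strand
        ($t, $p, $p_strand, $p_position) = ($1, $2, "+", "after_ref_allele");
    } elsif ($alt =~ /^(\S+)\](\S+)\]$/) {   # t]p] : p placed after the ref allele, "-" strand
        ($t, $p, $p_strand, $p_position) = ($1, $2, "-", "after_ref_allele");
    } elsif ($alt =~ /^\](\S+)\](\S+)$/) {   # ]p]t : p placed before the ref allele, "+" strand
        ($p, $t, $p_strand, $p_position) = ($1, $2, "+", "before_ref_allele");
    } elsif ($alt =~ /^\[(\S+)\[(\S+)$/) {   # [p[t : p placed before the ref allele, "-" strand
        ($p, $t, $p_strand, $p_position) = ($1, $2, "-", "before_ref_allele");
    } else {
        die "unexpected BND ALT field: $alt\n";
    }
    my ($p_chr, $p_pos) = split /:/, $p;
    return ($t, $p_chr, $p_pos, $p_strand, $p_position);
}
# e.g. classify_bnd_alt("A[chrII:321682[") gives ("A", "chrII", 321682, "+", "after_ref_allele").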
- my $s = $ref_allele; - my $t; - my $p; - my $p_relative_strand; # the relative strand of p relative to its original sequence - my $p_relative_position; # the relative positon of p relative to t: "before_t" or "after_t" - # print "alt_allele = $alt_allele\n"; - if ($alt_allele =~ /\[$/) { - $p_relative_strand = "+"; - $p_relative_position = "after_ref_allele"; - ($t, $p) = ($alt_allele =~ /(\S+)\[(\S+)\[/); - } elsif ($alt_allele =~ /\]$/) { - $p_relative_strand = "-"; - $p_relative_position = "after_ref_allele"; - ($t, $p) = ($alt_allele =~ /(\S+)\](\S+)\]/); - } elsif ($alt_allele =~ /^\]/) { - $p_relative_strand = "+"; - $p_relative_position = "before_ref_allele"; - ($p, $t) = ($alt_allele =~ /\](\S+)\](\S+)/); - } elsif ($alt_allele =~ /^\[/) { - $p_relative_strand = "-"; - $p_relative_position = "before_ref_allele"; - ($p, $t) = ($alt_allele =~ /\[(\S+)\[(\S+)/); - } else { - print "Unexpected ALT field in the input vcf file for defined CNVs/inversions/translocation:\n"; - print "$_\n"; - print "Exit!\n"; - die; - } - # print "s=$s, t=$t, p=$p\n"; - if (not exists $sv{$sv_event}{'BND'}) { - @{$sv{$sv_event}{'BND'}} = (); - } - my %bnd = (); - $bnd{'ref_chr'} = $ref_chr; - $bnd{'ref_start'} = $ref_start; - $bnd{'s'} = $s; - $bnd{'t'} = $t; - $bnd{'p'} = $p; - $bnd{'p_relative_strand'} = $p_relative_strand; - $bnd{'p_relative_position'} = $p_relative_position; - push @{$sv{$sv_event}{'BND'}}, \%bnd; - } - } - } - return %sv; -} - -sub check_cnv_overlap { - my ($cnv_hashref, $chr, $start, $end, $check_donor) = @_; - my $flag = 0; - # print "chr=$chr, start=$start, end=$end, check_donor=$check_donor\n"; - foreach my $c (sort keys %$cnv_hashref) { - foreach my $s (sort {$a <=> $b} keys %{$$cnv_hashref{$c}}) { - if ($chr eq $c) { - if ($end < $s) { - last; - } elsif ($start <= $$cnv_hashref{$c}{$s}{'ref_end'}) { - $flag = check_overlap_region($start, $end, $$cnv_hashref{$c}{$s}{'ref_start'}, $$cnv_hashref{$c}{$s}{'ref_end'}); - if ($flag == 1) { - return $flag; - } - } - } - if ($check_donor eq "yes") { - if ($$cnv_hashref{$c}{$s}{'variant_type'} eq "DUP") { - if ($$cnv_hashref{$c}{$s}{'donor_chr_in_ref'} eq $chr) { - $flag = check_overlap_region($start, $end, $$cnv_hashref{$c}{$s}{'donor_start_in_ref'}, $$cnv_hashref{$c}{$s}{'donor_end_in_ref'}); - if ($flag == 1){ - return $flag; - } - } - } - } - } - } - return $flag; -} - -sub extract_cnv_from_sv { - my $sv_hashref = shift @_; - my %cnv = (); - foreach my $sv_event (sort keys %$sv_hashref) { - my $check_overlap_flag = 0; - if ($$sv_hashref{$sv_event}{'sv_type'} eq "DEL") { - my $ref_chr = $$sv_hashref{$sv_event}{'ref_chr'}; - my $ref_start = $$sv_hashref{$sv_event}{'ref_start'}; - my $ref_end = $$sv_hashref{$sv_event}{'ref_end'}; # ref_end - ref_start = deletion_size - # check overlap with pre-defined CNVs - my $check_overlap_flag = check_cnv_overlap(\%cnv, $ref_chr, $ref_start, $ref_end, "no"); - if ($check_overlap_flag == 1) { - print "!!! Warning! Multiple overlapped CNVs found within the region $ref_chr:$ref_start-$ref_end!\n"; - print "!!! 
Only keep the first instance and ignore the others\n"; - next; - } else { - # register this CNV - $cnv{$ref_chr}{$ref_start}{'cnv_id'} = $sv_event; - $cnv{$ref_chr}{$ref_start}{'ref_chr'} = $ref_chr; - $cnv{$ref_chr}{$ref_start}{'ref_start'} = $ref_start; - $cnv{$ref_chr}{$ref_start}{'ref_end'} = $ref_end; - $cnv{$ref_chr}{$ref_start}{'deletion_start'} = $ref_start + 1; - $cnv{$ref_chr}{$ref_start}{'deletion_end'} = $ref_end; - $cnv{$ref_chr}{$ref_start}{'deletion_size'} = $ref_end - $ref_start; - $cnv{$ref_chr}{$ref_start}{'variant_type'} = "DEL"; - $cnv{$ref_chr}{$ref_start}{'cnv_type'} = "copy_number_loss"; - } - } elsif ($$sv_hashref{$sv_event}{'sv_type'} eq "to_be_classified") { - my $bnd_count = scalar @{$$sv_hashref{$sv_event}{'BND'}}; - if ($bnd_count == 2) { - my %bnd = (); - foreach my $b_hashref (sort @{$$sv_hashref{$sv_event}{'BND'}}) { - my $s = $$b_hashref{'s'}; - my $p = $$b_hashref{'p'}; - my $t = $$b_hashref{'t'}; - my ($bnd_p_chr, $bnd_p_start) = split /:/, $p; - my $p_relative_strand = $$b_hashref{'p_relative_strand'}; - my $p_relative_position = $$b_hashref{'p_relative_position'}; - if ($p_relative_position eq "after_ref_allele") { - $bnd{'1'} = $b_hashref; - $bnd{'1'}{'p_chr'} = $bnd_p_chr; - $bnd{'1'}{'p_start'} = $bnd_p_start; - } else { - $bnd{'2'} = $b_hashref; - $bnd{'2'}{'p_chr'} = $bnd_p_chr; - $bnd{'2'}{'p_start'} = $bnd_p_start; - } - } - # print Dumper(%bnd); - # print "\n"; - # verify this is indeed a segmental duplication - if ($bnd{'1'}{'ref_chr'} eq $bnd{'2'}{'ref_chr'}) { - my $recipient_chr = $bnd{'1'}{'ref_chr'}; - my ($recipient_start, $recipient_end) = ($bnd{'1'}{'ref_start'}, $bnd{'2'}{'ref_start'}); - if ($recipient_end - $recipient_start == 1) { - if ($bnd{'1'}{'p_chr'} eq $bnd{'2'}{'p_chr'}) { - my $donor_chr_in_ref = $bnd{'1'}{'p_chr'}; - my ($donor_start_in_ref, $donor_end_in_ref) = sort {$a <=> $b} ($bnd{'1'}{'p_start'}, $bnd{'2'}{'p_start'}); - my $donor_strand_in_ref = $bnd{'1'}{'p_relative_strand'}; - # check overlap with pre-defined CNVs - my $check_overlap_flag1 = check_cnv_overlap(\%cnv, $recipient_chr, $recipient_start, $recipient_end, "yes"); - my $check_overlap_flag2 = check_cnv_overlap(\%cnv, $donor_chr_in_ref, $donor_start_in_ref, $donor_end_in_ref, "no"); - if (($check_overlap_flag1 == 0) and ($check_overlap_flag2 == 0)) { - $cnv{$recipient_chr}{$recipient_start}{'cnv_id'} = $$sv_hashref{$sv_event}{'sv_event'}; - $cnv{$recipient_chr}{$recipient_start}{'ref_chr'} = $recipient_chr; - $cnv{$recipient_chr}{$recipient_start}{'ref_start'} = $recipient_start; # - $cnv{$recipient_chr}{$recipient_start}{'ref_end'} = $recipient_start; # same as ref_start - $cnv{$recipient_chr}{$recipient_start}{'donor_chr_in_ref'} = $donor_chr_in_ref; - $cnv{$recipient_chr}{$recipient_start}{'donor_start_in_ref'} = $donor_start_in_ref; - $cnv{$recipient_chr}{$recipient_start}{'donor_end_in_ref'} = $donor_end_in_ref; - $cnv{$recipient_chr}{$recipient_start}{'donor_strand_in_ref'} = $donor_strand_in_ref; - $cnv{$recipient_chr}{$recipient_start}{'donor_size'} = $donor_end_in_ref - $donor_start_in_ref + 1; - $cnv{$recipient_chr}{$recipient_start}{'variant_type'} = "DUP"; - $cnv{$recipient_chr}{$recipient_start}{'cnv_type'} = "copy_number_gain"; - if ($$sv_hashref{$sv_event}{'duplication_type'} eq "dispersed_duplication") { - $cnv{$recipient_chr}{$recipient_start}{'duplication_type'} = "dispersed_duplication"; - } elsif ($$sv_hashref{$sv_event}{'duplication_type'} eq "tandem_duplication") { - $cnv{$recipient_chr}{$recipient_start}{'duplication_type'} = 
"tandem_duplication"; - } - if (defined $$sv_hashref{$sv_event}{'total_copy_number'}) { - $cnv{$recipient_chr}{$recipient_start}{'total_copy_number'} = $$sv_hashref{$sv_event}{'total_copy_number'}; - } else { - $cnv{$recipient_chr}{$recipient_start}{'total_copy_number'} = "?"; - } - if (defined $$sv_hashref{$sv_event}{'inserted_copy_number'}) { - $cnv{$recipient_chr}{$recipient_start}{'inserted_copy_number'} = $$sv_hashref{$sv_event}{'inserted_copy_number'}; - } else { - $cnv{$recipient_chr}{$recipient_start}{'inserted_copy_number'} = "?"; - } - if (($cnv{$recipient_chr}{$recipient_start}{'duplication_type'} eq "tandem_duplication") and ($cnv{$recipient_chr}{$recipient_start}{'inserted_copy_number'} eq "?")) { - print "!!! Error! The mandatory tag 'inserted_copy_number=' has not been specified in the input vcf for defined tandem duplication: $sv_event\n"; - print "!!! Exit!\n"; - die; - } - } - } - } - } - } - } - } - return %cnv; -} - -sub introduce_defined_cnv { - my ($cnv_hashref, $refseq_hashref, $simseq_hashref, $ref2sim_map_hashref) = @_; - my $cnv_count = 0; - my %offset = (); - foreach my $ref_chr (sort keys %$cnv_hashref) { - $offset{$ref_chr} = 0; - } - foreach my $ref_chr (sort keys %$cnv_hashref) { - foreach my $ref_start (sort {$a <=> $b} keys %{$$cnv_hashref{$ref_chr}}) { - my $cnv_id = $$cnv_hashref{$ref_chr}{$ref_start}{'cnv_id'}; - my $variant_type = $$cnv_hashref{$ref_chr}{$ref_start}{'variant_type'}; - my $cnv_type = $$cnv_hashref{$ref_chr}{$ref_start}{'cnv_type'}; - if ($variant_type eq "DEL") { - my $ref_start = $$cnv_hashref{$ref_chr}{$ref_start}{'ref_start'}; - my $ref_end = $$cnv_hashref{$ref_chr}{$ref_start}{'ref_end'}; - my $deletion_start = $$cnv_hashref{$ref_chr}{$ref_start}{'deletion_start'}; - my $deletion_end = $$cnv_hashref{$ref_chr}{$ref_start}{'deletion_end'}; - my $deletion_size = $$cnv_hashref{$ref_chr}{$ref_start}{'deletion_size'}; - # standard notation for the reference allele - # my $ref_allele = substr $$simseq_hashref{$ref_chr}, $ref_start + $offset{$ref_chr} - 1, $deletion_size + 1; - # short-handed notation for the reference allele - my $ref_allele = substr $$simseq_hashref{$ref_chr}, $ref_start + $offset{$ref_chr} - 1, 1; - # standard notation for the alternative allele - # my $alt_allele = substr $$simseq_hashref{$ref_chr}, $ref_start + $offset{$ref_chr} - 1, 1; - # short-handed notation for the alternative allele - my $alt_allele = ""; - substr $$simseq_hashref{$ref_chr}, $ref_start + $offset{$ref_chr}, $deletion_size, ""; - $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'ref_chr'} = $ref_chr; - $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'ref_start'} = $ref_start; - $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'ref_end'} = $ref_end; - $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'ref_allele'} = $ref_allele; - $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'ref_strand'} = "+"; - $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'sim_chr'} = $ref_chr; - $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'sim_start'} = $ref_start + $offset{$ref_chr}; - $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'sim_end'} = $ref_start + $offset{$ref_chr}; - $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'sim_allele'} = $alt_allele; - $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'sim_strand'} = "+"; - $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'variant_id'} = $cnv_id; - $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'variant_type'} = $variant_type; - $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'cnv_type'} = "copy_number_loss"; - $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'deletion_size'} 
= $deletion_size; - $offset{$ref_chr} -= $deletion_size; - } else { - # cnv_type eq "DUP" - my $donor_chr_in_ref = $$cnv_hashref{$ref_chr}{$ref_start}{'donor_chr_in_ref'}; - my $donor_start_in_ref = $$cnv_hashref{$ref_chr}{$ref_start}{'donor_start_in_ref'}; - my $donor_end_in_ref = $$cnv_hashref{$ref_chr}{$ref_start}{'donor_end_in_ref'}; - my $donor_strand_in_ref = $$cnv_hashref{$ref_chr}{$ref_start}{'donor_strand_in_ref'}; - my $donor_size = $$cnv_hashref{$ref_chr}{$ref_start}{'donor_size'}; - my $donor_seq = substr $$refseq_hashref{$donor_chr_in_ref}, $donor_start_in_ref - 1, $donor_size; - my $duplication_type = $$cnv_hashref{$ref_chr}{$ref_start}{'duplication_type'}; - my $total_copy_number; - my $inserted_copy_number; - if (defined $$cnv_hashref{$ref_chr}{$ref_start}{'total_copy_number'}) { - $total_copy_number = $$cnv_hashref{$ref_chr}{$ref_start}{'total_copy_number'}; - } - if (defined $$cnv_hashref{$ref_chr}{$ref_start}{'inserted_copy_number'}) { - $inserted_copy_number = $$cnv_hashref{$ref_chr}{$ref_start}{'inserted_copy_number'}; - } - # test the strand - if ($donor_strand_in_ref eq "-") { - $donor_seq = revcom($donor_seq); - } - # standard - # standard notation for the reference allele - # my $ref_allele = substr $$simseq_hashref{$ref_chr}, $ref_start + $offset{$ref_chr} - 1, 1; - # short-handed notation for the reference allele - my $ref_allele = substr $$simseq_hashref{$ref_chr}, $ref_start + $offset{$ref_chr} - 1, 1; - # standard notation for the alternative allele - # my $alt_allele = substr $$simseq_hashref{$ref_chr}, $ref_start + $offset{$ref_chr} - 1, 1; - # $alt_allele = $alt_allele . $donor_seq; - # short-handed notation for the alternative allele - my $alt_allele = ""; - if ($duplication_type eq "dispersed_duplication") { - substr $$simseq_hashref{$ref_chr}, $ref_start + $offset{$ref_chr}, 0, $donor_seq; - } else { - # tandem_duplication - substr $$simseq_hashref{$ref_chr}, $ref_start + $offset{$ref_chr}, 0, $donor_seq x $inserted_copy_number; - } - $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'ref_chr'} = $ref_chr; - $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'ref_start'} = $ref_start; - $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'ref_end'} = $ref_start; - $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'ref_allele'} = $ref_allele; - $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'ref_strand'} = "+"; - $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'sim_chr'} = $ref_chr; - $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'sim_start'} = $ref_start + $offset{$ref_chr}; - if ($duplication_type eq "dispersed_duplication") { - $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'sim_end'} = $ref_start + $donor_size + $offset{$ref_chr}; - } else { - # tandem_duplication - $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'sim_end'} = $ref_start + $donor_size * $inserted_copy_number + $offset{$ref_chr}; - } - $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'sim_allele'} = $alt_allele; - $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'sim_strand'} = "+"; - $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'variant_type'} = $variant_type; - $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'variant_id'} = $cnv_id; - $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'cnv_type'} = "copy_number_gain"; - $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'duplication_type'} = $duplication_type; - $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'donor_size'} = $donor_size; - $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'donor_chr_in_ref'} = $donor_chr_in_ref; - $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'donor_start_in_ref'} = 
$donor_start_in_ref; - $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'donor_end_in_ref'} = $donor_end_in_ref; - $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'donor_strand_in_ref'} = $donor_strand_in_ref; - $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'total_copy_number'} = $total_copy_number; - if ($duplication_type eq "dispersed_duplication") { - $offset{$ref_chr} += $donor_size; - $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'inserted_copy_number'} = 1; - } else { - $offset{$ref_chr} += $donor_size * $inserted_copy_number; - $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'inserted_copy_number'} = $inserted_copy_number; - } - } - $cnv_count++; - } - } -} - -sub sample_cnv_type { - my $cnv_gain_loss_ratio = shift @_; - my $sample = rand(1); - # print "sample=$sample\n"; - if ($cnv_gain_loss_ratio eq "Inf") { - $sample = "copy_number_gain"; - } elsif ($sample < $cnv_gain_loss_ratio/(1 + $cnv_gain_loss_ratio)) { - $sample = "copy_number_gain"; - } else { - $sample = "copy_number_loss"; - } - return $sample; -} - -sub sample_duplication_type { - my $duplication_tandem_dispersed_ratio = shift @_; - my $sample = rand(1); - if ($duplication_tandem_dispersed_ratio eq "Inf") { - $sample = "tandem_duplication"; - } elsif ($sample < $duplication_tandem_dispersed_ratio/(1 + $duplication_tandem_dispersed_ratio)) { - $sample = "tandem_duplication"; - } else { - $sample = "dispersed_duplication"; - } - return $sample; -} - -sub sample_cnv_size { - my ($cnv_min_size, $cnv_max_size) = @_; - my $cnv_size = $cnv_min_size + int rand($cnv_max_size - $cnv_min_size); - return $cnv_size; -} - -sub sample_cnv_copy_number { - my $cnv_max_copy_number = shift @_; - my $cnv_copy_number = 2 + int rand($cnv_max_copy_number - 1); - # print "cnv_max_copy_number = $cnv_max_copy_number; cnv_copy_number = $cnv_copy_number\n"; - return $cnv_copy_number; -} - -sub sample_strand { - my $forward2reverse_strand_ratio = shift @_; - my $sample = rand(1); - if ($forward2reverse_strand_ratio eq "Inf") { - $sample = "+"; - } elsif ($sample < $forward2reverse_strand_ratio/(1 + $forward2reverse_strand_ratio)) { - $sample = "+"; - } else { - $sample = "-"; - } - return $sample; -} - -sub introduce_random_cnv { - my ($cnv_count, $cnv_gain_loss_ratio, $duplication_tandem_dispersed_ratio, $cnv_min_size, $cnv_max_size, $cnv_max_copy_number, $centromere_by_chr_hashref, $gene_by_chr_hashref, $refseq_hashref, $simseq_hashref, $ref2sim_map_hashref) = @_; - my %refseq_genome_space = create_genome_space($refseq_hashref); - my $forward2reverse_strand_ratio = 1; # the ratio of forward vs. inverted inserted copies for DUP. Default = 1.0 (i.e. equal chance for the two possible inserted orientations). 
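The sampling helpers above (sample_cnv_type, sample_duplication_type, sample_strand) share one pattern: a user-supplied ratio r between two outcomes is turned into the probability r/(1+r) of drawing the first outcome, with the string "Inf" forcing the first outcome outright. A minimal standalone sketch of that pattern (the helper name pick_by_ratio is illustrative, not part of the original script):

    sub pick_by_ratio {
        # illustrative only: generic ratio-to-probability draw used by the samplers above
        my ($ratio, $first, $second) = @_;
        return $first if $ratio eq "Inf";    # "Inf" always yields the first outcome
        return (rand(1) < $ratio / (1 + $ratio)) ? $first : $second;
    }
    # e.g. a gain:loss ratio of 3 picks "copy_number_gain" with probability 3/4
    my $cnv_type = pick_by_ratio(3, "copy_number_gain", "copy_number_loss");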
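The CNV routines edit the simulated sequence in place with 4-argument substr, so a per-chromosome running offset records how far simulated coordinates have drifted from reference coordinates (negative after deletions, positive after insertions). A minimal sketch of that bookkeeping, using a toy sequence and coordinates rather than anything from the original data:

    # illustrative sketch only, mirroring the substr/offset pattern of the CNV routines
    my %sim    = (chr1 => "ACGTACGTACGT");
    my %offset = (chr1 => 0);
    # delete 3 bp starting after reference position 4
    substr $sim{chr1}, 4 + $offset{chr1}, 3, "";
    $offset{chr1} -= 3;    # downstream reference positions now land 3 bp earlier
    # insert "GGG" after reference position 8
    substr $sim{chr1}, 8 + $offset{chr1}, 0, "GGG";
    $offset{chr1} += 3;    # downstream reference positions now land 3 bp later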
- # sample CNV first - my %cnv_samples = (); - for (my $i = 1; $i <= $cnv_count; $i++) { - # sample CNV type - my $cnv_type = sample_cnv_type($cnv_gain_loss_ratio); - # print "i = $i, cnv_type = $cnv_type\n"; - if ($cnv_type eq "copy_number_loss") { - my $variant_type = "DEL"; - # sample deletion size - my $deletion_size = sample_cnv_size($cnv_min_size, $cnv_max_size); - # print "deletion_size = $deletion_size\n"; - # sample the deleted region from the genome space - my $chr_end_margin = 1000; - SAMPLE_RANDOM_DEL: - # sample the deleted region (for copy number loss) - my ($ref_chr, $ref_pos) = sample_genome_space(\%refseq_genome_space); - my $ref_start = $ref_pos - 1; # including the base immediately before the sampled region - if ($ref_start < $chr_end_margin) { - goto SAMPLE_RANDOM_DEL; - } - my $ref_end = $ref_start + $deletion_size; - # check if the sampled end position will go beyond the chromosome end - if ($ref_end > ($refseq_genome_space{'chr-wide'}{$ref_chr}{'length'} - $chr_end_margin)) { - goto SAMPLE_RANDOM_DEL; - } - # check if the sampled breakpoints overlapped with the defined genes - if (exists $$gene_by_chr_hashref{$ref_chr}) { - foreach my $gene_id (sort keys %{$$gene_by_chr_hashref{$ref_chr}}) { - my $gene_check_flag1 = check_overlap_region($ref_start, $ref_start, $$gene_by_chr_hashref{$ref_chr}{$gene_id}{'start'}, $$gene_by_chr_hashref{$ref_chr}{$gene_id}{'end'}); - my $gene_check_flag2 = check_overlap_region($ref_end, $ref_end, $$gene_by_chr_hashref{$ref_chr}{$gene_id}{'start'}, $$gene_by_chr_hashref{$ref_chr}{$gene_id}{'end'}); - if (($gene_check_flag1 == 1) or ($gene_check_flag2 == 1)) { - goto SAMPLE_RANDOM_DEL; - } - } - } - # check if the sampled region overlapped with the defined centromeres - if (exists $$centromere_by_chr_hashref{$ref_chr}) { - my $centromere_check_flag = check_overlap_region($ref_start, $ref_end, $$centromere_by_chr_hashref{$ref_chr}{'start'}, $$centromere_by_chr_hashref{$ref_chr}{'end'}); - if ($centromere_check_flag == 1) { - goto SAMPLE_RANDOM_DEL; - } - } - - my $ref_allele = substr $$refseq_hashref{$ref_chr}, $ref_start - 1, $deletion_size + 1; - # check if the sampled region hit the uncertain part of the reference genome - if ($ref_allele =~ /(N|n)/) { - goto SAMPLE_RANDOM_DEL; - } - # check if there are pre-sampled CNV variants overlapping at the same site already - my $check_overlap_flag = check_cnv_overlap(\%cnv_samples, $ref_chr, $ref_start, $ref_end, "yes"); - if ($check_overlap_flag == 1) { - goto SAMPLE_RANDOM_DEL; - } else { - # all check passed, register this DEL - # print "register sampled DEL at $ref_chr:$ref_start-$ref_end, deletion_size = $deletion_size\n"; - $cnv_samples{$ref_chr}{$ref_start}{'ref_chr'} = $ref_chr; - $cnv_samples{$ref_chr}{$ref_start}{'ref_start'} = $ref_start; - $cnv_samples{$ref_chr}{$ref_start}{'ref_end'} = $ref_end; - $cnv_samples{$ref_chr}{$ref_start}{'cnv_id'} = "CNV_${i}.1"; - $cnv_samples{$ref_chr}{$ref_start}{'variant_type'} = $variant_type; - $cnv_samples{$ref_chr}{$ref_start}{'cnv_type'} = $cnv_type; - $cnv_samples{$ref_chr}{$ref_start}{'deletion_size'} = $deletion_size; - } - } else { - my $variant_type = "DUP"; - my $donor_size = sample_cnv_size($cnv_min_size, $cnv_max_size); - # print "donor_size = $donor_size\n"; - # sample the donor region from the genome space - SAMPLE_RANDOM_DONOR: - my ($donor_chr_in_ref, $donor_start_in_ref) = sample_genome_space(\%refseq_genome_space); - my $donor_end_in_ref = $donor_start_in_ref + $donor_size - 1; - # check if the sampled end position will go beyond 
the chromosome end - if ($donor_end_in_ref > $refseq_genome_space{'chr-wide'}{$donor_chr_in_ref}{'length'}) { - goto SAMPLE_RANDOM_DONOR; - } - # check if the sampled breakpoints overlapped with the defined genes - if (exists $$gene_by_chr_hashref{$donor_chr_in_ref}) { - foreach my $gene_id (sort keys %{$$gene_by_chr_hashref{$donor_chr_in_ref}}) { - my $gene_check_flag1 = check_overlap_region($donor_start_in_ref, $donor_start_in_ref, $$gene_by_chr_hashref{$donor_chr_in_ref}{$gene_id}{'start'}, $$gene_by_chr_hashref{$donor_chr_in_ref}{$gene_id}{'end'}); - my $gene_check_flag2 = check_overlap_region($donor_end_in_ref, $donor_end_in_ref, $$gene_by_chr_hashref{$donor_chr_in_ref}{$gene_id}{'start'}, $$gene_by_chr_hashref{$donor_chr_in_ref}{$gene_id}{'end'}); - if (($gene_check_flag1 == 1) or ($gene_check_flag2 == 1)) { - goto SAMPLE_RANDOM_DONOR; - } - } - } - # check if the sampled region overlapped with the defined centromeres - if (exists $$centromere_by_chr_hashref{$donor_chr_in_ref}) { - my $centromere_check_flag = check_overlap_region($donor_start_in_ref, $donor_end_in_ref, $$centromere_by_chr_hashref{$donor_chr_in_ref}{'start'}, $$centromere_by_chr_hashref{$donor_chr_in_ref}{'end'}); - if ($centromere_check_flag == 1) { - goto SAMPLE_RANDOM_DONOR; - } - } - - my $donor_seq = substr $$refseq_hashref{$donor_chr_in_ref}, $donor_start_in_ref - 1, $donor_size + 1; - # check if the sampled region hit the uncertain part of the reference genome - if ($donor_seq =~ /(N|n)/) { - goto SAMPLE_RANDOM_DONOR; - } - # check if there are pre-sampled CNV variants overlapping at the same site already - my $check_overlap_flag1 = check_cnv_overlap(\%cnv_samples, $donor_chr_in_ref, $donor_start_in_ref, $donor_end_in_ref, "no"); - if ($check_overlap_flag1 == 1) { - goto SAMPLE_RANDOM_DONOR; - } else { - # donor sampling is completed - # print "sampled donor: donor_chr_in_ref = $donor_chr_in_ref, donor_start_in_ref = $donor_start_in_ref, donor_end_in_ref = $donor_end_in_ref\n"; - # now sample inserted copy number - my $cnv_extra_copy_number = sample_cnv_copy_number($cnv_max_copy_number) - 1; - my $duplication_type = sample_duplication_type($duplication_tandem_dispersed_ratio); - if ($duplication_type eq "tandem_duplication") { - my $inserted_chr_in_ref = $donor_chr_in_ref; - my $inserted_start_in_ref = $donor_end_in_ref; - my $inserted_end_in_ref = $donor_end_in_ref; - $cnv_samples{$inserted_chr_in_ref}{$inserted_start_in_ref}{'ref_chr'} = $inserted_chr_in_ref; - $cnv_samples{$inserted_chr_in_ref}{$inserted_start_in_ref}{'ref_start'} = $inserted_start_in_ref; - $cnv_samples{$inserted_chr_in_ref}{$inserted_start_in_ref}{'ref_end'} = $inserted_end_in_ref; - $cnv_samples{$inserted_chr_in_ref}{$inserted_start_in_ref}{'donor_chr_in_ref'} = $donor_chr_in_ref; - $cnv_samples{$inserted_chr_in_ref}{$inserted_start_in_ref}{'donor_start_in_ref'} = $donor_start_in_ref; - $cnv_samples{$inserted_chr_in_ref}{$inserted_start_in_ref}{'donor_end_in_ref'} = $donor_end_in_ref; - $cnv_samples{$inserted_chr_in_ref}{$inserted_start_in_ref}{'donor_strand_in_ref'} = "+"; - $cnv_samples{$inserted_chr_in_ref}{$inserted_start_in_ref}{'cnv_id'} = "CNV_"."$i".".1-"."$cnv_extra_copy_number"; - $cnv_samples{$inserted_chr_in_ref}{$inserted_start_in_ref}{'variant_type'} = $variant_type; - $cnv_samples{$inserted_chr_in_ref}{$inserted_start_in_ref}{'cnv_type'} = $cnv_type; - $cnv_samples{$inserted_chr_in_ref}{$inserted_start_in_ref}{'duplication_type'} = $duplication_type; - 
$cnv_samples{$inserted_chr_in_ref}{$inserted_start_in_ref}{'inserted_copy_number'} = $cnv_extra_copy_number; - $cnv_samples{$inserted_chr_in_ref}{$inserted_start_in_ref}{'total_copy_number'} = $cnv_extra_copy_number + 1; - $cnv_samples{$inserted_chr_in_ref}{$inserted_start_in_ref}{'donor_size'} = $donor_size; - } else { - # dispersed_duplication - for (my $j = 1; $j <= $cnv_extra_copy_number; $j++) { - # sample the inserted location for the donor sequence from the genome space - SAMPLE_RANDOM_DUP_INSERT: - my ($inserted_chr_in_ref, $inserted_start_in_ref) = sample_genome_space(\%refseq_genome_space); - my $inserted_end_in_ref = $inserted_start_in_ref; - # print "inserted_chr_in_ref = $inserted_chr_in_ref, inserted_start_in_ref = $inserted_start_in_ref\n"; - # check if the sampled breakpoints overlapped with the defined genes - if (exists $$gene_by_chr_hashref{$inserted_chr_in_ref}) { - foreach my $gene_id (sort keys %{$$gene_by_chr_hashref{$inserted_chr_in_ref}}) { - my $gene_check_flag = check_overlap_region($inserted_start_in_ref, $inserted_end_in_ref, $$gene_by_chr_hashref{$inserted_chr_in_ref}{$gene_id}{'start'}, $$gene_by_chr_hashref{$inserted_chr_in_ref}{$gene_id}{'end'}); - if ($gene_check_flag == 1) { - goto SAMPLE_RANDOM_DUP_INSERT; - } - } - } - # check if the sampled region overlapped with the defined centromere - if (exists $$centromere_by_chr_hashref{$inserted_chr_in_ref}) { - my $centromere_check_flag = check_overlap_region($inserted_start_in_ref, $inserted_end_in_ref, $$centromere_by_chr_hashref{$inserted_chr_in_ref}{'start'}, $$centromere_by_chr_hashref{$inserted_chr_in_ref}{'end'}); - if ($centromere_check_flag == 1) { - goto SAMPLE_RANDOM_DUP_INSERT; - } - } - - # check if there are pre-sampled CNV variants overlapping at the same site already - my $check_overlap_flag2 = check_cnv_overlap(\%cnv_samples, $inserted_chr_in_ref, $inserted_start_in_ref, $inserted_end_in_ref, "yes"); - if ($check_overlap_flag2 == 1) { - goto SAMPLE_RANDOM_DUP_INSERT; - } else { - $cnv_samples{$inserted_chr_in_ref}{$inserted_start_in_ref}{'ref_chr'} = $inserted_chr_in_ref; - $cnv_samples{$inserted_chr_in_ref}{$inserted_start_in_ref}{'ref_start'} = $inserted_start_in_ref; - $cnv_samples{$inserted_chr_in_ref}{$inserted_start_in_ref}{'ref_end'} = $inserted_end_in_ref; - $cnv_samples{$inserted_chr_in_ref}{$inserted_start_in_ref}{'donor_chr_in_ref'} = $donor_chr_in_ref; - $cnv_samples{$inserted_chr_in_ref}{$inserted_start_in_ref}{'donor_start_in_ref'} = $donor_start_in_ref; - $cnv_samples{$inserted_chr_in_ref}{$inserted_start_in_ref}{'donor_end_in_ref'} = $donor_end_in_ref; - $cnv_samples{$inserted_chr_in_ref}{$inserted_start_in_ref}{'donor_strand_in_ref'} = sample_strand($forward2reverse_strand_ratio); - $cnv_samples{$inserted_chr_in_ref}{$inserted_start_in_ref}{'cnv_id'} = "CNV_"."$i.$j"; - $cnv_samples{$inserted_chr_in_ref}{$inserted_start_in_ref}{'variant_type'} = $variant_type; - $cnv_samples{$inserted_chr_in_ref}{$inserted_start_in_ref}{'cnv_type'} = $cnv_type; - $cnv_samples{$inserted_chr_in_ref}{$inserted_start_in_ref}{'duplication_type'} = $duplication_type; - $cnv_samples{$inserted_chr_in_ref}{$inserted_start_in_ref}{'inserted_copy_number'} = 1; - $cnv_samples{$inserted_chr_in_ref}{$inserted_start_in_ref}{'total_copy_number'} = $cnv_extra_copy_number + 1; - $cnv_samples{$inserted_chr_in_ref}{$inserted_start_in_ref}{'donor_size'} = $donor_size; - } - } - } - } - } - } - # introduce all sampled CNVs - my %offset = (); - foreach my $ref_chr (sort keys %$refseq_hashref) { - $offset{$ref_chr} = 
0; - } - for my $ref_chr (sort keys %cnv_samples) { - for my $ref_start (sort {$a <=> $b} keys %{$cnv_samples{$ref_chr}}) { - my $cnv_id = $cnv_samples{$ref_chr}{$ref_start}{'cnv_id'}; - my $variant_type = $cnv_samples{$ref_chr}{$ref_start}{'variant_type'}; - my $cnv_type = $cnv_samples{$ref_chr}{$ref_start}{'cnv_type'}; - my $ref_end = $cnv_samples{$ref_chr}{$ref_start}{'ref_end'}; - if ($variant_type eq "DEL") { - my $deletion_start = $cnv_samples{$ref_chr}{$ref_start}{'deletion_start'}; - my $deletion_end = $cnv_samples{$ref_chr}{$ref_start}{'deletion_end'}; - my $deletion_size = $cnv_samples{$ref_chr}{$ref_start}{'deletion_size'}; - # standard notation for the reference allele - # my $ref_allele = substr $$simseq_hashref{$ref_chr}, $ref_start + $offset{$ref_chr} - 1, $deletion_size + 1; - # short-handed notation for the reference allele - my $ref_allele = substr $$simseq_hashref{$ref_chr}, $ref_start + $offset{$ref_chr} - 1, 1; - # standard notation for the alternative allele - # my $alt_allele = substr $$simseq_hashref{$ref_chr}, $ref_start + $offset{$ref_chr} - 1, 1; - # short-handed notation for the alternative allele - my $alt_allele = ""; - substr $$simseq_hashref{$ref_chr}, $ref_start + $offset{$ref_chr}, $deletion_size, ""; - $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'ref_chr'} = $ref_chr; - $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'ref_start'} = $ref_start; - $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'ref_end'} = $ref_end; - $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'ref_allele'} = $ref_allele; - $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'ref_strand'} = "+"; - $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'sim_chr'} = $ref_chr; - $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'sim_start'} = $ref_start + $offset{$ref_chr}; - $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'sim_end'} = $ref_start + $offset{$ref_chr}; - $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'sim_allele'} = $alt_allele; - $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'sim_strand'} = "+"; - $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'variant_id'} = $cnv_id; - $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'variant_type'} = $variant_type; - $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'cnv_type'} = $cnv_type; - $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'deletion_size'} = $deletion_size; - $offset{$ref_chr} -= $deletion_size; - } else { - # variant_type eq "DUP" - my $donor_chr_in_ref = $cnv_samples{$ref_chr}{$ref_start}{'donor_chr_in_ref'}; - my $donor_start_in_ref = $cnv_samples{$ref_chr}{$ref_start}{'donor_start_in_ref'}; - my $donor_end_in_ref = $cnv_samples{$ref_chr}{$ref_start}{'donor_end_in_ref'}; - my $donor_strand_in_ref = $cnv_samples{$ref_chr}{$ref_start}{'donor_strand_in_ref'}; - my $duplication_type = $cnv_samples{$ref_chr}{$ref_start}{'duplication_type'}; - my $inserted_copy_number = $cnv_samples{$ref_chr}{$ref_start}{'inserted_copy_number'}; - my $total_copy_number = $cnv_samples{$ref_chr}{$ref_start}{'total_copy_number'}; - my $donor_size = $cnv_samples{$ref_chr}{$ref_start}{'donor_size'}; - my $donor_seq = substr $$refseq_hashref{$donor_chr_in_ref}, $donor_start_in_ref - 1, $donor_size; - # test the orientation - if ($donor_strand_in_ref eq "-") { - $donor_seq = revcom($donor_seq); - } - # standard - # standard notation for the reference allele - # my $ref_allele = substr $$simseq_hashref{$ref_chr}, $ref_start + $offset{$ref_chr} - 1, 1; - # short-handed notation for the reference allele - my $ref_allele = substr $$simseq_hashref{$ref_chr}, $ref_start + $offset{$ref_chr} - 1, 1; - # standard 
notation for the alternative allele - # my $alt_allele = substr $$simseq_hashref{$ref_chr}, $ref_start + $offset{$ref_chr} - 1, 1; - # $alt_allele = $alt_allele . $donor_seq; - # short-handed notation for the alternative allele - my $alt_allele = ""; - if ($duplication_type eq "dispersed_duplication") { - substr $$simseq_hashref{$ref_chr}, $ref_start + $offset{$ref_chr}, 0, $donor_seq; - } else { - # tandem_duplication - substr $$simseq_hashref{$ref_chr}, $ref_start + $offset{$ref_chr}, 0, $donor_seq x ($total_copy_number - 1); - } - $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'ref_chr'} = $ref_chr; - $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'ref_start'} = $ref_start; - $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'ref_end'} = $ref_end; - $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'ref_allele'} = $ref_allele; - $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'ref_strand'} = "+"; - $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'sim_chr'} = $ref_chr; - $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'sim_start'} = $ref_start + $offset{$ref_chr}; - if ($duplication_type eq "dispersed_duplication") { - $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'sim_end'} = $ref_start + $donor_size + $offset{$ref_chr}; - } else { - # tandem_duplication - $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'sim_end'} = $ref_start + $donor_size * ($total_copy_number - 1) + $offset{$ref_chr}; - } - $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'sim_allele'} = $alt_allele; - $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'sim_strand'} = "+"; - $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'variant_type'} = $variant_type; - $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'variant_id'} = $cnv_id; - $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'cnv_type'} = "copy_number_gain"; - $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'donor_size'} = $donor_size; - $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'donor_chr_in_ref'} = $donor_chr_in_ref; - $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'donor_start_in_ref'} = $donor_start_in_ref; - $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'donor_end_in_ref'} = $donor_end_in_ref; - $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'donor_strand_in_ref'} = $donor_strand_in_ref; - $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'duplication_type'} = $duplication_type; - $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'inserted_copy_number'} = $inserted_copy_number; - $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'total_copy_number'} = $total_copy_number; - if ($duplication_type eq "dispersed_duplication") { - $offset{$ref_chr} += $donor_size; - } else { - # tandem_duplication - $offset{$ref_chr} += $donor_size * $inserted_copy_number; - } - } - } - } -} - -sub sample_inversion_size { - my ($inv_min_size, $inv_max_size) = @_; - my $inv_size = $inv_min_size + int rand($inv_max_size - $inv_min_size); - return $inv_size; -} - -sub extract_inversion_from_sv { - my $sv_hashref = shift @_; - my %inversion = (); - foreach my $sv_event (sort keys %$sv_hashref) { - my $check_overlap_flag = 0; - if ($$sv_hashref{$sv_event}{'sv_type'} eq "INV") { - my $ref_chr = $$sv_hashref{$sv_event}{'ref_chr'}; - my $ref_start = $$sv_hashref{$sv_event}{'ref_start'}; - my $ref_end = $$sv_hashref{$sv_event}{'ref_end'}; - my $inversion_id = $$sv_hashref{$sv_event}{'sv_event'}; - # check if there are pre-sampled SV variants overlapping at the same site already - my $check_overlap_flag = 0; - if (exists $inversion{$ref_chr}) { - if (exists $inversion{$ref_chr}{$ref_start}) { - $check_overlap_flag = 1; - } - } else { - foreach my $s (sort {$a <=> $b} keys 
%{$inversion{$ref_chr}}) { - if ($ref_end < $s) { - last; - } elsif ($ref_start <= $inversion{$ref_chr}{$s}{'ref_end'}) { - $check_overlap_flag = check_overlap_region($ref_start, $ref_end, $inversion{$ref_chr}{$s}{'ref_start'}, $inversion{$ref_chr}{$s}{'ref_end'}); - if ($check_overlap_flag == 1) { - last; - } - } - } - } - if ($check_overlap_flag == 1) { - print "!!! Warning! Multiple overlapped inversions found within $ref_chr:$ref_start-$ref_end!\n"; - print "!!! Only keep the first one and ignore the others\n"; - next; - } else { - # register this inversion - $inversion{$ref_chr}{$ref_start}{'ref_chr'} = $ref_chr; - $inversion{$ref_chr}{$ref_start}{'ref_start'} = $ref_start; - $inversion{$ref_chr}{$ref_start}{'ref_end'} = $ref_end; - $inversion{$ref_chr}{$ref_start}{'inversion_id'} = $inversion_id; - } - } - } - return %inversion; -} - -sub introduce_defined_inversion { - my ($inversion_hashref, $refseq_hashref, $simseq_hashref, $ref2sim_map_hashref) = @_; - # print Dumper(%$inversion_hashref); - foreach my $ref_chr (sort keys %$inversion_hashref) { - foreach my $ref_start (sort {$a <=> $b} keys %{$$inversion_hashref{$ref_chr}}) { - my $inversion_id = $$inversion_hashref{$ref_chr}{$ref_start}{'inversion_id'}; - my $ref_end = $$inversion_hashref{$ref_chr}{$ref_start}{'ref_end'}; - my $inversion_size = $ref_end - $ref_start + 1; - if ($ref_end > (length $$refseq_hashref{$ref_chr})) { - print "!!! Error! The end of the defined inversion $inversion_id goes beyond the end of the corresponding chromosome!\n"; - print "!!! Exit!\n"; - die; - } - my $ref_allele = substr $$simseq_hashref{$ref_chr}, $ref_start - 1, 1; - my $alt_allele = ""; - my $inversion_seq = substr $$simseq_hashref{$ref_chr}, $ref_start - 1, $inversion_size; - $inversion_seq = revcom($inversion_seq); - substr $$simseq_hashref{$ref_chr}, $ref_start - 1, $inversion_size, $inversion_seq; - $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'ref_chr'} = $ref_chr; - $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'ref_start'} = $ref_start; - $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'ref_end'} = $ref_end; - $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'ref_allele'} = $ref_allele; - $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'ref_strand'} = "+"; - $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'sim_chr'} = $ref_chr; - $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'sim_start'} = $ref_start; - $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'sim_end'} = $ref_end; - $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'sim_allele'} = $alt_allele; - $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'sim_strand'} = "-"; - $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'variant_type'} = "INV"; - $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'variant_id'} = $inversion_id; - $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'inversion_size'} = $inversion_size; - } - } -} - -sub examine_breakpoint_for_inversion { - my ($breakpoint_by_chr_by_type_hashref, $centromere_by_chr_hashref) = @_; - my %valid_breakpoint_pair = (); - foreach my $chr (sort keys %$breakpoint_by_chr_by_type_hashref) { - foreach my $type (sort keys %{$$breakpoint_by_chr_by_type_hashref{$chr}}) { - my @breakpoint_same_chr_same_type = sort keys %{$$breakpoint_by_chr_by_type_hashref{$chr}{$type}}; - if ((scalar @breakpoint_same_chr_same_type) >= 2) { - my @breakpoint_same_chr_same_type_positive_strand = (); - my @breakpoint_same_chr_same_type_negative_strand = (); - foreach my $b (@breakpoint_same_chr_same_type) { - if ($$breakpoint_by_chr_by_type_hashref{$chr}{$type}{$b}{'strand'} eq "+") { - push 
@breakpoint_same_chr_same_type_positive_strand, $b; - } elsif ($$breakpoint_by_chr_by_type_hashref{$chr}{$type}{$b}{'strand'} eq "-") { - push @breakpoint_same_chr_same_type_negative_strand, $b; - } else { - print "\n!!! Error! Strand was not defined for the breakpoint $b in the input inversion_breakpoint_gff\n"; - print "!!! Exit!\n"; - die; - } - } - if (((scalar @breakpoint_same_chr_same_type_positive_strand) > 0) and ((scalar @breakpoint_same_chr_same_type_negative_strand) > 0)) { - foreach my $b1 (@breakpoint_same_chr_same_type_positive_strand) { - foreach my $b2 (@breakpoint_same_chr_same_type_negative_strand) { - my $inv_start; - my $inv_end; - if ($$breakpoint_by_chr_by_type_hashref{$chr}{$type}{$b1}{'start'} > $$breakpoint_by_chr_by_type_hashref{$chr}{$type}{$b2}{'end'}) { - ($inv_start, $inv_end) = ($$breakpoint_by_chr_by_type_hashref{$chr}{$type}{$b2}{'start'}, $$breakpoint_by_chr_by_type_hashref{$chr}{$type}{$b1}{'end'}); - } else { - ($inv_start, $inv_end) = ($$breakpoint_by_chr_by_type_hashref{$chr}{$type}{$b1}{'start'}, $$breakpoint_by_chr_by_type_hashref{$chr}{$type}{$b2}{'end'}); - } - my $centromere_check_flag1 = check_overlap_region($inv_start, $inv_start, $$centromere_by_chr_hashref{$chr}{'start'}, $$centromere_by_chr_hashref{$chr}{'end'}); - my $centromere_check_flag2 = check_overlap_region($inv_end, $inv_end, $$centromere_by_chr_hashref{$chr}{'start'}, $$centromere_by_chr_hashref{$chr}{'end'}); - if (($centromere_check_flag1 == 0) and ($centromere_check_flag2 == 0)) { - $valid_breakpoint_pair{'+-'}{$b1}{$b2}{'chr'} = $chr; - $valid_breakpoint_pair{'+-'}{$b1}{$b2}{'type'} = $type; - $valid_breakpoint_pair{'-+'}{$b2}{$b1}{'chr'} = $chr; - $valid_breakpoint_pair{'-+'}{$b2}{$b1}{'type'} = $type; - } - } - } - } - } - } - } - if (not exists $valid_breakpoint_pair{'+-'}) { - print "\n!!! Error! None of the defined breakpoints is valid for triggering inversions!\n"; - print "!!! Valid breakpoints should satisfy the following creteria:\n"; - print "!!! 1) The two breakpoints should come from the same chromosome.\n"; - print "!!! 2) The two breakpoints should belong to the same feature type.\n"; - print "!!! 3) The two breakpoints should come from opposite strands.\n"; - print "!!! 4) If the centromere for the corresponding chromosome has been defined, the two breakpoints should not enclose this centromere.\n"; - print "!!! Exit!\n"; - die; - } else { - return %valid_breakpoint_pair; - } -} - -sub introduce_random_inversion { - my ($inversion_count, $inversion_min_size, $inversion_max_size, $centromere_by_chr_hashref, $inversion_breakpoint_by_chr_by_type_hashref, $gene_by_chr_hashref, $refseq_hashref, $simseq_hashref, $ref2sim_map_hashref) = @_; - my %inversion = (); - # print "sample inversion: i=$i\n"; - if (keys %$inversion_breakpoint_by_chr_by_type_hashref) { - my %valid_breakpoint_pair = examine_breakpoint_for_inversion($inversion_breakpoint_by_chr_by_type_hashref, $centromere_by_chr_hashref); - for (my $i = 1; $i <= $inversion_count; $i++) { - SAMPLE_RANDOM_INV1: - # sample inversion based on defined breakpoints - my $valid_breakpoint_pair_count = scalar (keys %{$valid_breakpoint_pair{'+-'}}); - if ($valid_breakpoint_pair_count < 1) { - my $j = $i - 1; - print "\n!!! Warning! No more valid breakpoint pairs can be found in the current simulation based on the defined breakpoint file: $inversion_breakpoint_gff\n"; - print "!!! 
Only $j inversions were introduced\n"; - last; - } else { - my @breakpoint1 = shuffle(sort keys %{$valid_breakpoint_pair{'+-'}}); - my $breakpoint1 = shift @breakpoint1; - my @breakpoint2 = shuffle(sort keys %{$valid_breakpoint_pair{'+-'}{$breakpoint1}}); - my $breakpoint2 = shift @breakpoint2; - # print "breakpoint1 = $breakpoint1\n"; - # print "breakpoint2 = $breakpoint2\n"; - my $ref_chr = $valid_breakpoint_pair{'+-'}{$breakpoint1}{$breakpoint2}{'chr'}; - my $type = $valid_breakpoint_pair{'+-'}{$breakpoint1}{$breakpoint2}{'type'}; - my $ref_start; - my $ref_end; - if ($$inversion_breakpoint_by_chr_by_type_hashref{$ref_chr}{$type}{$breakpoint1}{'start'} > $$inversion_breakpoint_by_chr_by_type_hashref{$ref_chr}{$type}{$breakpoint2}{'end'}) { - ($ref_start, $ref_end) = ($$inversion_breakpoint_by_chr_by_type_hashref{$ref_chr}{$type}{$breakpoint2}{'start'}, $$inversion_breakpoint_by_chr_by_type_hashref{$ref_chr}{$type}{$breakpoint1}{'end'}); - } else { - ($ref_start, $ref_end) = ($$inversion_breakpoint_by_chr_by_type_hashref{$ref_chr}{$type}{$breakpoint1}{'start'}, $$inversion_breakpoint_by_chr_by_type_hashref{$ref_chr}{$type}{$breakpoint2}{'end'}); - } - # check if there are pre-sampled inversions overlapping at the same site already - foreach my $inversion_id (sort keys %inversion) { - if ($inversion{$inversion_id}{'ref_chr'} eq $ref_chr) { - my $check_overlap_flag1 = check_overlap_region($$inversion_breakpoint_by_chr_by_type_hashref{$ref_chr}{$type}{$breakpoint1}{'start'}, $$inversion_breakpoint_by_chr_by_type_hashref{$ref_chr}{$type}{$breakpoint1}{'end'}, $inversion{$inversion_id}{'ref_start'}, $inversion{$inversion_id}{'ref_end'}); - if ($check_overlap_flag1 == 1) { - delete $valid_breakpoint_pair{'+-'}{$breakpoint1}; - foreach my $bp2 (sort keys %{$valid_breakpoint_pair{'-+'}}) { - if (exists $valid_breakpoint_pair{'-+'}{$bp2}{$breakpoint1}) { - delete $valid_breakpoint_pair{'-+'}{$bp2}{$breakpoint1}; - } - if (scalar (keys %{$valid_breakpoint_pair{'-+'}{$bp2}}) == 0) { - delete $valid_breakpoint_pair{'-+'}{$bp2}; - } - } - } - my $check_overlap_flag2 = check_overlap_region($$inversion_breakpoint_by_chr_by_type_hashref{$ref_chr}{$type}{$breakpoint2}{'start'}, $$inversion_breakpoint_by_chr_by_type_hashref{$ref_chr}{$type}{$breakpoint2}{'end'}, $inversion{$inversion_id}{'ref_start'}, $inversion{$inversion_id}{'ref_end'}); - if ($check_overlap_flag2 == 1) { - delete $valid_breakpoint_pair{'-+'}{$breakpoint2}; - foreach my $bp1 (sort keys %{$valid_breakpoint_pair{'+-'}}) { - if (exists $valid_breakpoint_pair{'+-'}{$bp1}{$breakpoint2}) { - delete $valid_breakpoint_pair{'+-'}{$bp1}{$breakpoint2}; - } - if (scalar (keys %{$valid_breakpoint_pair{'+-'}{$bp1}}) == 0) { - delete $valid_breakpoint_pair{'+-'}{$bp1}; - } - } - } - if (($check_overlap_flag1 == 1) or ($check_overlap_flag2 == 1)) { - goto SAMPLE_RANDOM_INV1; - } - } - } - my $inversion_id = "INV_${i}"; - $inversion{$inversion_id}{'ref_chr'} = $ref_chr; - $inversion{$inversion_id}{'ref_start'} = $ref_start; - $inversion{$inversion_id}{'ref_end'} = $ref_end; - $inversion{$inversion_id}{'breakpoint1'} = $breakpoint1; - $inversion{$inversion_id}{'breakpoint2'} = $breakpoint2; - # print "sampled inversion: ref_chr=$ref_chr, ref_start=$ref_start, ref_end=$ref_end, breakpoint1=$breakpoint1, breakpoint2=$breakpoint2\n"; - # delete used breakpoints - delete $valid_breakpoint_pair{'+-'}{$breakpoint1}; - foreach my $bp2 (sort keys %{$valid_breakpoint_pair{'-+'}}) { - if (exists $valid_breakpoint_pair{'-+'}{$bp2}{$breakpoint1}) { - delete 
$valid_breakpoint_pair{'-+'}{$bp2}{$breakpoint1}; - } - if (scalar (keys %{$valid_breakpoint_pair{'-+'}{$bp2}}) == 0) { - delete $valid_breakpoint_pair{'-+'}{$bp2}; - } - } - delete $valid_breakpoint_pair{'-+'}{$breakpoint2}; - foreach my $bp1 (sort keys %{$valid_breakpoint_pair{'+-'}}) { - if (exists $valid_breakpoint_pair{'+-'}{$bp1}{$breakpoint2}) { - delete $valid_breakpoint_pair{'+-'}{$bp1}{$breakpoint2}; - } - if (scalar (keys %{$valid_breakpoint_pair{'+-'}{$bp1}}) == 0) { - delete $valid_breakpoint_pair{'+-'}{$bp1}; - } - } - } - } - } else { - for (my $i = 1; $i <= $inversion_count; $i++) { - # random sampling across the genome - my %refseq_genome_space = create_genome_space($refseq_hashref); - # sample the inverted region from the genome space - my $chr_end_margin = 1000; - SAMPLE_RANDOM_INV2: - # sample the inverted region - my ($ref_chr, $ref_start) = sample_genome_space(\%refseq_genome_space); - # print "ref_chr = $ref_chr, ref_start = $ref_start\n"; - # check if the chromosome-end has been involved in the simulated inversion - if ($ref_start <= $chr_end_margin) { - goto SAMPLE_RANDOM_INV2; - } - # sample inversion size - my $inversion_size = sample_inversion_size($inversion_min_size, $inversion_max_size); - my $ref_end = $ref_start + $inversion_size - 1; - # print "ref_chr=$ref_chr, ref_start = $ref_start, inversion_size = $inversion_size\n"; - # check if the sampled end position will go beyond the chromosome end - if ($ref_end > ($refseq_genome_space{'chr-wide'}{$ref_chr}{'length'} - $chr_end_margin)) { - goto SAMPLE_RANDOM_INV2; - } - # check if the sampled breakpoints overlapped with the defined genes - if (exists $$gene_by_chr_hashref{$ref_chr}) { - foreach my $gene_id (sort keys %{$$gene_by_chr_hashref{$ref_chr}}) { - my $gene_check_flag1 = check_overlap_region($ref_start, $ref_start, $$gene_by_chr_hashref{$ref_chr}{$gene_id}{'start'}, $$gene_by_chr_hashref{$ref_chr}{$gene_id}{'end'}); - my $gene_check_flag2 = check_overlap_region($ref_end, $ref_end, $$gene_by_chr_hashref{$ref_chr}{$gene_id}{'start'}, $$gene_by_chr_hashref{$ref_chr}{$gene_id}{'end'}); - if (($gene_check_flag1 == 1) or ($gene_check_flag2 == 1)) { - goto SAMPLE_RANDOM_INV2; - } - } - } - # check if the sampled breakpoints overlaped with the defined centromeres - if (exists $$centromere_by_chr_hashref{$ref_chr}) { - my $centromere_check_flag1 = check_overlap_region($ref_start, $ref_start, $$centromere_by_chr_hashref{$ref_chr}{'start'}, $$centromere_by_chr_hashref{$ref_chr}{'end'}); - my $centromere_check_flag2 = check_overlap_region($ref_end, $ref_end, $$centromere_by_chr_hashref{$ref_chr}{'start'}, $$centromere_by_chr_hashref{$ref_chr}{'end'}); - if (($centromere_check_flag1 == 1) or ($centromere_check_flag2 == 1)) { - goto SAMPLE_RANDOM_INV2; - } - } - - # check if there are pre-sampled inversions overlapping at the same site already - my $check_overlap_flag = 0; - foreach my $id (sort keys %inversion) { - if ($inversion{$id}{'ref_chr'} eq $ref_chr) { - $check_overlap_flag = check_overlap_region($ref_start, $ref_end, $inversion{$id}{'ref_start'}, $inversion{$id}{'ref_end'}); - if ($check_overlap_flag == 1) { - goto SAMPLE_RANDOM_INV2; - } - } - } - # all check passed, register this INV - my $inversion_id = "INV_${i}"; - $inversion{$inversion_id}{'ref_chr'} = $ref_chr; - $inversion{$inversion_id}{'ref_start'} = $ref_start; - $inversion{$inversion_id}{'ref_end'} = $ref_end; - $inversion{$inversion_id}{'breakpoint1'} = "$ref_chr:$ref_start"; - $inversion{$inversion_id}{'breakpoint2'} = "$ref_chr:$ref_end"; - 
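The rejection checks in these sampling loops all go through check_overlap_region, whose definition lies outside this hunk; from its call sites it behaves as the usual closed-interval overlap test, returning 1 on overlap and 0 otherwise. An equivalent sketch under that assumption:

    # assumed behaviour of check_overlap_region, inferred from its call sites here
    sub check_overlap_region_sketch {
        my ($start1, $end1, $start2, $end2) = @_;
        # two closed intervals overlap iff each one starts no later than the other ends
        return (($start1 <= $end2) && ($start2 <= $end1)) ? 1 : 0;
    }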
# print "sampled inversion: ref_chr=$ref_chr, ref_start=$ref_start, ref_end=$ref_end\n"; - } - } - # print Dumper(%inversion); - # introduce all sampled inversions - foreach my $inversion_id (sort keys %inversion) { - my $ref_chr = $inversion{$inversion_id}{'ref_chr'}; - my $ref_start = $inversion{$inversion_id}{'ref_start'}; - my $ref_end = $inversion{$inversion_id}{'ref_end'}; - my $inversion_size = $ref_end - $ref_start + 1; - # print "inversion_id = $inversion_id, ref_chr = $ref_chr, ref_start = $ref_start\n\n"; - my $ref_allele = substr $$simseq_hashref{$ref_chr}, $ref_start - 1, 1; - my $alt_allele = ""; - my $inversion_seq = substr $$simseq_hashref{$ref_chr}, $ref_start - 1, $inversion_size; - $inversion_seq = revcom($inversion_seq); - substr $$simseq_hashref{$ref_chr}, $ref_start - 1, $inversion_size, $inversion_seq; - $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'ref_chr'} = $ref_chr; - $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'ref_start'} = $ref_start; - $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'ref_end'} = $ref_end; - $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'ref_allele'} = $ref_allele; - $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'ref_strand'} = "+"; - $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'sim_chr'} = $ref_chr; - $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'sim_start'} = $ref_start; - $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'sim_end'} = $ref_end; - $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'sim_allele'} = $alt_allele; - $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'sim_strand'} = "-"; - $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'variant_type'} = "INV"; - $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'variant_id'} = $inversion_id; - $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'inversion_size'} = $inversion_size; - } -} - -sub extract_translocation_from_sv { - my $sv_hashref = shift @_; - my %translocation = (); - foreach my $sv_event (sort keys %$sv_hashref) { - my $check_overlap_flag = 0; - if ($$sv_hashref{$sv_event}{'sv_type'} eq "to_be_classified") { - my $bnd_count = scalar @{$$sv_hashref{$sv_event}{'BND'}}; - # print "sv_event = $sv_event, bnd_count = $bnd_count\n"; - if ($bnd_count == 4) { - my %bnd = (); - foreach my $b_hashref (sort @{$$sv_hashref{$sv_event}{'BND'}}) { - my $ref_chr = $$b_hashref{'ref_chr'}; - my $ref_start = $$b_hashref{'ref_start'}; - my $s = $$b_hashref{'s'}; - my $p = $$b_hashref{'p'}; - my $t = $$b_hashref{'t'}; - my ($bnd_p_chr, $bnd_p_start) = split /:/, $p; - # print "p=$p, bnd_p_chr=$bnd_p_chr, bnd_p_start=$bnd_p_start\n"; - my $p_relative_strand = $$b_hashref{'p_relative_strand'}; - my $p_relative_position = $$b_hashref{'p_relative_position'}; - $bnd{$ref_chr}{$ref_start}{'s'} = $s; - $bnd{$ref_chr}{$ref_start}{'t'} = $t; - $bnd{$ref_chr}{$ref_start}{'p_chr'} = $bnd_p_chr; - $bnd{$ref_chr}{$ref_start}{'p_start'} = $bnd_p_start; - $bnd{$ref_chr}{$ref_start}{'p_relative_strand'} = $p_relative_strand; - $bnd{$ref_chr}{$ref_start}{'p_relative_position'} = $p_relative_position; - } - # verify this is indeed a translocation - my $translocation_test_flag = 0; - my $translocation_resolution = "NA"; - # chromosome check - my @bnd_ref_chr = sort keys %bnd; - if ((scalar @bnd_ref_chr) != 2) { - $translocation_test_flag = 1; - } else { - # breakpoint check - my $bnd_ref_chr1 = $bnd_ref_chr[0]; - my $bnd_ref_chr2 = $bnd_ref_chr[1]; - foreach my $ref_chr (@bnd_ref_chr) { - my @bnd_position = sort {$a <=> $b} keys %{$bnd{$ref_chr}}; - if ((scalar @bnd_position) != 2) { - print "\n!!! Error! 
There seems mistakes in the defined translocation(s) invovled with $ref_chr.\n"; - print "!!! Please check the input translocation_vcf file\n"; - print "!!! Exit!\n"; - $translocation_test_flag = 1; - die; - } elsif (($bnd_position[1] - $bnd_position[0]) != 1) { - print "\n!!! Error! There seems mistakes in the defined translocation(s) involved with $ref_chr.\n"; - print "!!! Please check the input translocation_vcf file\n"; - print "!!! Exit!\n"; - $translocation_test_flag = 1; - die; - } - } - # breakpoint and strand check - if ($translocation_test_flag == 0) { - my ($bnd_ref_chr1_bnd_position1, $bnd_ref_chr1_bnd_position2) = sort {$a <=> $b} keys %{$bnd{$bnd_ref_chr1}}; - my ($bnd_ref_chr2_bnd_position1, $bnd_ref_chr2_bnd_position2) = sort {$a <=> $b} keys %{$bnd{$bnd_ref_chr2}}; - - my $bnd_ref_chr1_bnd_position1_p_chr = $bnd{$bnd_ref_chr1}{$bnd_ref_chr1_bnd_position1}{'p_chr'}; - my $bnd_ref_chr1_bnd_position1_p_start = $bnd{$bnd_ref_chr1}{$bnd_ref_chr1_bnd_position1}{'p_start'}; - my $bnd_ref_chr1_bnd_position1_p_relative_strand = $bnd{$bnd_ref_chr1}{$bnd_ref_chr1_bnd_position1}{'p_relative_strand'}; - my $bnd_ref_chr1_bnd_position1_p_relative_position = $bnd{$bnd_ref_chr1}{$bnd_ref_chr1_bnd_position1}{'p_relative_position'}; - - my $bnd_ref_chr1_bnd_position2_p_chr = $bnd{$bnd_ref_chr1}{$bnd_ref_chr1_bnd_position2}{'p_chr'}; - my $bnd_ref_chr1_bnd_position2_p_start = $bnd{$bnd_ref_chr1}{$bnd_ref_chr1_bnd_position2}{'p_start'}; - my $bnd_ref_chr1_bnd_position2_p_relative_strand = $bnd{$bnd_ref_chr1}{$bnd_ref_chr1_bnd_position2}{'p_relative_strand'}; - my $bnd_ref_chr1_bnd_position2_p_relative_position = $bnd{$bnd_ref_chr1}{$bnd_ref_chr1_bnd_position2}{'p_relative_position'}; - - my $bnd_ref_chr2_bnd_position1_p_chr = $bnd{$bnd_ref_chr2}{$bnd_ref_chr2_bnd_position1}{'p_chr'}; - my $bnd_ref_chr2_bnd_position1_p_start = $bnd{$bnd_ref_chr2}{$bnd_ref_chr2_bnd_position1}{'p_start'}; - my $bnd_ref_chr2_bnd_position1_p_relative_strand = $bnd{$bnd_ref_chr2}{$bnd_ref_chr2_bnd_position1}{'p_relative_strand'}; - my $bnd_ref_chr2_bnd_position1_p_relative_position = $bnd{$bnd_ref_chr2}{$bnd_ref_chr2_bnd_position1}{'p_relative_position'}; - - my $bnd_ref_chr2_bnd_position2_p_chr = $bnd{$bnd_ref_chr2}{$bnd_ref_chr2_bnd_position2}{'p_chr'}; - my $bnd_ref_chr2_bnd_position2_p_start = $bnd{$bnd_ref_chr2}{$bnd_ref_chr2_bnd_position2}{'p_start'}; - my $bnd_ref_chr2_bnd_position2_p_relative_strand = $bnd{$bnd_ref_chr2}{$bnd_ref_chr2_bnd_position2}{'p_relative_strand'}; - my $bnd_ref_chr2_bnd_position2_p_relative_position = $bnd{$bnd_ref_chr2}{$bnd_ref_chr2_bnd_position2}{'p_relative_position'}; - # print "bnd_ref_chr1 = $bnd_ref_chr1\n"; - # print "bnd_ref_chr2 = $bnd_ref_chr2\n"; - # print "bnd_ref_chr1_bnd_position1_p_chr = $bnd_ref_chr1_bnd_position1_p_chr\n"; - # print "bnd_ref_chr1_bnd_position2_p_chr = $bnd_ref_chr1_bnd_position2_p_chr\n"; - # print "bnd_ref_chr2_bnd_position1_p_chr = $bnd_ref_chr2_bnd_position1_p_chr\n"; - # print "bnd_ref_chr2_bnd_position2_p_chr = $bnd_ref_chr2_bnd_position2_p_chr\n"; - # print "bnd_ref_chr1_bnd_position1_p_relative_strand = $bnd_ref_chr1_bnd_position1_p_relative_strand\n"; - # print "bnd_ref_chr1_bnd_position2_p_relative_strand = $bnd_ref_chr1_bnd_position2_p_relative_strand\n"; - # print "bnd_ref_chr2_bnd_position1_p_relative_strand = $bnd_ref_chr2_bnd_position1_p_relative_strand\n"; - # print "bnd_ref_chr2_bnd_position2_p_relative_strand = $bnd_ref_chr2_bnd_position2_p_relative_strand\n"; - # print "bnd_ref_chr1_bnd_position1_p_relative_position = 
$bnd_ref_chr1_bnd_position1_p_relative_position\n"; - # print "bnd_ref_chr1_bnd_position2_p_relative_position = $bnd_ref_chr1_bnd_position2_p_relative_position\n"; - # print "bnd_ref_chr2_bnd_position1_p_relative_position = $bnd_ref_chr2_bnd_position1_p_relative_position\n"; - # print "bnd_ref_chr2_bnd_position2_p_relative_position = $bnd_ref_chr2_bnd_position2_p_relative_position\n"; - if (($bnd_ref_chr1 ne $bnd_ref_chr2_bnd_position1_p_chr) or ($bnd_ref_chr1 ne $bnd_ref_chr2_bnd_position2_p_chr) or ($bnd_ref_chr2 ne $bnd_ref_chr1_bnd_position1_p_chr) or ($bnd_ref_chr2 ne $bnd_ref_chr1_bnd_position2_p_chr)) { - print "\n!!! Error! There seems mistakes in the defined translocation(s) involved with $bnd_ref_chr1 and $bnd_ref_chr2.\n"; - print "!!! Please check the input translocation_vcf file\n"; - print "!!! Exit!\n"; - $translocation_test_flag = 1; - die; - } else { - my $positive_strand_count = 0; - if ($bnd_ref_chr1_bnd_position1_p_relative_strand eq "+") { - $positive_strand_count++; - } - if ($bnd_ref_chr1_bnd_position2_p_relative_strand eq "+") { - $positive_strand_count++; - } - if ($bnd_ref_chr2_bnd_position1_p_relative_strand eq "+") { - $positive_strand_count++; - } - if ($bnd_ref_chr2_bnd_position2_p_relative_strand eq "+") { - $positive_strand_count++; - } - if ($positive_strand_count == 4) { - $translocation_resolution = "++++"; # strand choice for chr1_chr2_part1, chr1_chr2_part2, chr2_chr1_part1, chr2_chr1_part2; - } elsif ($positive_strand_count == 0) { - $translocation_resolution = "+--+"; - } else { - print "\n!!! Error! There seems mistakes in the defined translocation(s) involved with $bnd_ref_chr1 and $bnd_ref_chr2.\n"; - print "!!! Please check the input translocation_vcf file\n"; - print "!!! Exit!\n"; - $translocation_test_flag = 1; - die; - } - } - } - } - if ($translocation_test_flag == 0) { - # check if overlapped with pre-registed translocations - my ($bnd_ref_chr1, $bnd_ref_chr2) = sort @bnd_ref_chr; - my ($bnd_ref_chr1_bnd_position1, $bnd_ref_chr1_bnd_position2) = sort {$a <=> $b} keys %{$bnd{$bnd_ref_chr1}}; - my ($bnd_ref_chr2_bnd_position1, $bnd_ref_chr2_bnd_position2) = sort {$a <=> $b} keys %{$bnd{$bnd_ref_chr2}}; - - my $check_overlap_flag = check_translocation_overlap(\%translocation, $bnd_ref_chr1, $bnd_ref_chr2); - if ($check_overlap_flag == 1) { - print "\n!!! Warning! Multiple translocation defined for $bnd_ref_chr1 or $bnd_ref_chr2!\n"; - print "!!! 
Only keep the first one and ignore the others\n"; - } else { - # register this translocation - my $ref_chr1 = $bnd_ref_chr1; - my $ref_chr2 = $bnd_ref_chr2; - - my $ref_chr1_chr2_part1_chr; - my $ref_chr1_chr2_part1_start; - my $ref_chr1_chr2_part1_end; - my $ref_chr1_chr2_part1_strand; - - my $ref_chr1_chr2_part2_chr; - my $ref_chr1_chr2_part2_start; - my $ref_chr1_chr2_part2_end; - my $ref_chr1_chr2_part2_strand; - - my $ref_chr2_chr1_part1_chr; - my $ref_chr2_chr1_part1_start; - my $ref_chr2_chr1_part1_end; - my $ref_chr2_chr1_part1_strand; - - my $ref_chr2_chr1_part2_chr; - my $ref_chr2_chr1_part2_start; - my $ref_chr2_chr1_part2_end; - my $ref_chr2_chr1_part2_strand; - - if ($translocation_resolution eq "++++") { - $ref_chr1_chr2_part1_chr = $ref_chr1; - $ref_chr1_chr2_part1_start = 1; - $ref_chr1_chr2_part1_end = $bnd_ref_chr1_bnd_position1; - $ref_chr1_chr2_part1_strand = "+"; - - $ref_chr1_chr2_part2_chr = $ref_chr2; - $ref_chr1_chr2_part2_start = $bnd_ref_chr2_bnd_position2; - $ref_chr1_chr2_part2_end = "chr_end"; - $ref_chr1_chr2_part2_strand = "+"; - - $ref_chr2_chr1_part1_chr = $ref_chr2; - $ref_chr2_chr1_part1_start = 1; - $ref_chr2_chr1_part1_end = $bnd_ref_chr2_bnd_position1; - $ref_chr2_chr1_part1_strand = "+"; - - $ref_chr2_chr1_part2_chr = $ref_chr1; - $ref_chr2_chr1_part2_start = $bnd_ref_chr1_bnd_position2; - $ref_chr2_chr1_part2_end = "chr_end"; - $ref_chr2_chr1_part2_strand = "+"; - } else { - $ref_chr1_chr2_part1_chr = $ref_chr1; - $ref_chr1_chr2_part1_start = 1; - $ref_chr1_chr2_part1_end = $bnd_ref_chr1_bnd_position1; - $ref_chr1_chr2_part1_strand = "+"; - - $ref_chr1_chr2_part2_chr = $ref_chr2; - $ref_chr1_chr2_part2_start = 1; - $ref_chr1_chr2_part2_end = $bnd_ref_chr2_bnd_position1; - $ref_chr1_chr2_part2_strand = "-"; - - $ref_chr2_chr1_part1_chr = $ref_chr2; - $ref_chr2_chr1_part1_start = $bnd_ref_chr2_bnd_position2; - $ref_chr2_chr1_part1_end = "chr_end"; - $ref_chr2_chr1_part1_strand = "-"; - - $ref_chr2_chr1_part2_chr = $ref_chr1; - $ref_chr2_chr1_part2_start = $bnd_ref_chr1_bnd_position2; - $ref_chr2_chr1_part2_end = "chr_end"; - $ref_chr2_chr1_part2_strand = "+"; - } - - $translocation{$sv_event}{'ref_chr1'} = $ref_chr1; - $translocation{$sv_event}{'ref_chr2'} = $ref_chr2; - - $translocation{$sv_event}{'ref_chr1_chr2_part1_chr'} = $ref_chr1_chr2_part1_chr; - $translocation{$sv_event}{'ref_chr1_chr2_part1_start'} = $ref_chr1_chr2_part1_start; - $translocation{$sv_event}{'ref_chr1_chr2_part1_end'} = $ref_chr1_chr2_part1_end; - $translocation{$sv_event}{'ref_chr1_chr2_part1_strand'} = $ref_chr1_chr2_part1_strand; - - $translocation{$sv_event}{'ref_chr1_chr2_part2_chr'} = $ref_chr1_chr2_part2_chr; - $translocation{$sv_event}{'ref_chr1_chr2_part2_start'} = $ref_chr1_chr2_part2_start; - $translocation{$sv_event}{'ref_chr1_chr2_part2_end'} = $ref_chr1_chr2_part2_end; - $translocation{$sv_event}{'ref_chr1_chr2_part2_strand'} = $ref_chr1_chr2_part2_strand; - - $translocation{$sv_event}{'ref_chr2_chr1_part1_chr'} = $ref_chr2_chr1_part1_chr; - $translocation{$sv_event}{'ref_chr2_chr1_part1_start'} = $ref_chr2_chr1_part1_start; - $translocation{$sv_event}{'ref_chr2_chr1_part1_end'} = $ref_chr2_chr1_part1_end; - $translocation{$sv_event}{'ref_chr2_chr1_part1_strand'} = $ref_chr2_chr1_part1_strand; - - $translocation{$sv_event}{'ref_chr2_chr1_part2_chr'} = $ref_chr2_chr1_part2_chr; - $translocation{$sv_event}{'ref_chr2_chr1_part2_start'} = $ref_chr2_chr1_part2_start; - $translocation{$sv_event}{'ref_chr2_chr1_part2_end'} = $ref_chr2_chr1_part2_end; - 
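For orientation, the reciprocal-translocation bookkeeping above splits each product chromosome into two reference parts; under the "++++" resolution, chr1_chr2 is the head of chr1 up to its breakpoint followed by the tail of chr2, and chr2_chr1 is the reciprocal product, both on the forward strand. A toy sketch of that stitching (chromosome names, sequences, and cut points are illustrative only and use a single cut point per chromosome):

    # toy illustration of the "++++" stitching, not part of the original script
    my %ref = (chrA => "AAAAAAAAAA", chrB => "bbbbbbbbbb");   # two 10 bp toy chromosomes
    my ($cutA, $cutB) = (4, 6);                               # keep bases 1..cut of each head
    my $chrA_chrB = (substr $ref{chrA}, 0, $cutA) . (substr $ref{chrB}, $cutB);
    my $chrB_chrA = (substr $ref{chrB}, 0, $cutB) . (substr $ref{chrA}, $cutA);
    # chrA_chrB eq "AAAAbbbb"; chrB_chrA eq "bbbbbbAAAAAA"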
$translocation{$sv_event}{'ref_chr2_chr1_part2_strand'} = $ref_chr2_chr1_part2_strand; - } - } - } - } - } - return %translocation; -} - -sub check_translocation_overlap { - my ($translocation_hashref, $chr1, $chr2) = @_; - my $flag = 0; - foreach my $translocation_id (sort keys %$translocation_hashref) { - if ((exists $$translocation_hashref{$translocation_id}{$chr1}) or (exists $$translocation_hashref{$translocation_id}{$chr2})) { - $flag = 1; - last; - } - } - return $flag; -} - -sub introduce_defined_translocation { - my ($translocation_hashref, $refseq_hashref, $simseq_hashref, $ref2sim_map_hashref) = @_; - foreach my $translocation_id (sort keys %$translocation_hashref) { - my $ref_chr1 = $$translocation_hashref{$translocation_id}{'ref_chr1'}; - my $ref_chr2 = $$translocation_hashref{$translocation_id}{'ref_chr2'}; - - my $ref_chr1_chr2_part1_chr = $$translocation_hashref{$translocation_id}{'ref_chr1_chr2_part1_chr'}; - my $ref_chr1_chr2_part1_start = $$translocation_hashref{$translocation_id}{'ref_chr1_chr2_part1_start'}; - my $ref_chr1_chr2_part1_end = $$translocation_hashref{$translocation_id}{'ref_chr1_chr2_part1_end'}; - my $ref_chr1_chr2_part1_strand = $$translocation_hashref{$translocation_id}{'ref_chr1_chr2_part1_strand'}; - - my $ref_chr1_chr2_part2_chr = $$translocation_hashref{$translocation_id}{'ref_chr1_chr2_part2_chr'}; - my $ref_chr1_chr2_part2_start = $$translocation_hashref{$translocation_id}{'ref_chr1_chr2_part2_start'}; - my $ref_chr1_chr2_part2_end = $$translocation_hashref{$translocation_id}{'ref_chr1_chr2_part2_end'}; - my $ref_chr1_chr2_part2_strand = $$translocation_hashref{$translocation_id}{'ref_chr1_chr2_part2_strand'}; - - my $ref_chr2_chr1_part1_chr = $$translocation_hashref{$translocation_id}{'ref_chr2_chr1_part1_chr'}; - my $ref_chr2_chr1_part1_start = $$translocation_hashref{$translocation_id}{'ref_chr2_chr1_part1_start'}; - my $ref_chr2_chr1_part1_end = $$translocation_hashref{$translocation_id}{'ref_chr2_chr1_part1_end'}; - my $ref_chr2_chr1_part1_strand = $$translocation_hashref{$translocation_id}{'ref_chr2_chr1_part1_strand'}; - - my $ref_chr2_chr1_part2_chr = $$translocation_hashref{$translocation_id}{'ref_chr2_chr1_part2_chr'}; - my $ref_chr2_chr1_part2_start = $$translocation_hashref{$translocation_id}{'ref_chr2_chr1_part2_start'}; - my $ref_chr2_chr1_part2_end = $$translocation_hashref{$translocation_id}{'ref_chr2_chr1_part2_end'}; - my $ref_chr2_chr1_part2_strand = $$translocation_hashref{$translocation_id}{'ref_chr2_chr1_part2_strand'}; - - my $ref_chr1_chr2 = "${ref_chr1}_${ref_chr2}"; - my $ref_chr2_chr1 = "${ref_chr2}_${ref_chr1}"; - - if ($ref_chr1_chr2_part1_strand eq $ref_chr1_chr2_part2_strand) { - $ref_chr1_chr2_part2_end = length $$refseq_hashref{$ref_chr2}; - $ref_chr2_chr1_part2_end = length $$refseq_hashref{$ref_chr1}; - } else { - $ref_chr2_chr1_part1_end = length $$refseq_hashref{$ref_chr2}; - $ref_chr2_chr1_part2_end = length $$refseq_hashref{$ref_chr1}; - } - - my $ref_chr1_chr2_part1_seq = substr $$refseq_hashref{$ref_chr1}, $ref_chr1_chr2_part1_start - 1, $ref_chr1_chr2_part1_end - $ref_chr1_chr2_part1_start + 1; - my $ref_chr1_chr2_part2_seq = substr $$refseq_hashref{$ref_chr2}, $ref_chr1_chr2_part2_start - 1, $ref_chr1_chr2_part2_end - $ref_chr1_chr2_part2_start + 1; - my $ref_chr2_chr1_part1_seq = substr $$refseq_hashref{$ref_chr2}, $ref_chr2_chr1_part1_start - 1, $ref_chr2_chr1_part1_end - $ref_chr2_chr1_part1_start + 1; - my $ref_chr2_chr1_part2_seq = substr $$refseq_hashref{$ref_chr1}, $ref_chr2_chr1_part2_start - 1, 
$ref_chr2_chr1_part2_end - $ref_chr2_chr1_part2_start + 1; - - if ($ref_chr1_chr2_part1_strand ne $ref_chr1_chr2_part2_strand) { - $ref_chr1_chr2_part2_seq = revcom($ref_chr1_chr2_part2_seq); - $ref_chr2_chr1_part1_seq = revcom($ref_chr2_chr1_part1_seq); - } - - $$simseq_hashref{$ref_chr1_chr2} = $ref_chr1_chr2_part1_seq . $ref_chr1_chr2_part2_seq; - $$simseq_hashref{$ref_chr2_chr1} = $ref_chr2_chr1_part1_seq . $ref_chr2_chr1_part2_seq; - delete $$simseq_hashref{$ref_chr1}; - delete $$simseq_hashref{$ref_chr2}; - - if ($ref_chr1_chr2_part1_strand eq $ref_chr1_chr2_part2_strand) { - # translocation resolution: ++++ - # ref_chr1_chr2_part1 - $$ref2sim_map_hashref{$ref_chr1}{$ref_chr1_chr2_part1_end}{'ref_chr'} = $ref_chr1; - $$ref2sim_map_hashref{$ref_chr1}{$ref_chr1_chr2_part1_end}{'ref_start'} = $ref_chr1_chr2_part1_start; - $$ref2sim_map_hashref{$ref_chr1}{$ref_chr1_chr2_part1_end}{'ref_end'} = $ref_chr1_chr2_part1_end; - $$ref2sim_map_hashref{$ref_chr1}{$ref_chr1_chr2_part1_end}{'ref_strand'} = $ref_chr1_chr2_part1_strand; - my $ref_chr1_chr2_part1_end_ref_allele = substr $$refseq_hashref{$ref_chr1}, $ref_chr1_chr2_part1_end - 1, 1; - $$ref2sim_map_hashref{$ref_chr1}{$ref_chr1_chr2_part1_end}{'ref_allele'} = $ref_chr1_chr2_part1_end_ref_allele; - $$ref2sim_map_hashref{$ref_chr1}{$ref_chr1_chr2_part1_end}{'sim_chr'} = $ref_chr1_chr2; - $$ref2sim_map_hashref{$ref_chr1}{$ref_chr1_chr2_part1_end}{'sim_start'} = 1; - $$ref2sim_map_hashref{$ref_chr1}{$ref_chr1_chr2_part1_end}{'sim_end'} = $ref_chr1_chr2_part1_end; - $$ref2sim_map_hashref{$ref_chr1}{$ref_chr1_chr2_part1_end}{'sim_strand'} = $ref_chr1_chr2_part1_strand; - $$ref2sim_map_hashref{$ref_chr1}{$ref_chr1_chr2_part1_end}{'sim_allele'} = "$ref_chr1_chr2_part1_end_ref_allele\[$ref_chr2:$ref_chr1_chr2_part2_start\["; - $$ref2sim_map_hashref{$ref_chr1}{$ref_chr1_chr2_part1_end}{'variant_type'} = "TRA"; - $$ref2sim_map_hashref{$ref_chr1}{$ref_chr1_chr2_part1_end}{'variant_id'} = $translocation_id; - # ref_chr1_chr2_part2 - $$ref2sim_map_hashref{$ref_chr2}{$ref_chr1_chr2_part2_start}{'ref_chr'} = $ref_chr2; - $$ref2sim_map_hashref{$ref_chr2}{$ref_chr1_chr2_part2_start}{'ref_start'} = $ref_chr1_chr2_part2_start; - $$ref2sim_map_hashref{$ref_chr2}{$ref_chr1_chr2_part2_start}{'ref_end'} = $ref_chr1_chr2_part2_end; - $$ref2sim_map_hashref{$ref_chr2}{$ref_chr1_chr2_part2_start}{'ref_strand'} = $ref_chr1_chr2_part2_strand; - my $ref_chr1_chr2_part2_start_ref_allele = substr $$refseq_hashref{$ref_chr2}, $ref_chr1_chr2_part2_start - 1, 1; - $$ref2sim_map_hashref{$ref_chr2}{$ref_chr1_chr2_part2_start}{'ref_allele'} = $ref_chr1_chr2_part2_start_ref_allele; - $$ref2sim_map_hashref{$ref_chr2}{$ref_chr1_chr2_part2_start}{'sim_chr'} = $ref_chr1_chr2; - $$ref2sim_map_hashref{$ref_chr2}{$ref_chr1_chr2_part2_start}{'sim_start'} = $ref_chr1_chr2_part1_end + 1; - $$ref2sim_map_hashref{$ref_chr2}{$ref_chr1_chr2_part2_start}{'sim_end'} = $ref_chr1_chr2_part1_end + $ref_chr1_chr2_part2_end - $ref_chr1_chr2_part2_start + 1; - $$ref2sim_map_hashref{$ref_chr2}{$ref_chr1_chr2_part2_start}{'sim_strand'} = $ref_chr1_chr2_part2_strand; - $$ref2sim_map_hashref{$ref_chr2}{$ref_chr1_chr2_part2_start}{'sim_allele'} = "\]$ref_chr1:$ref_chr1_chr2_part1_end\]$ref_chr1_chr2_part2_start_ref_allele"; - $$ref2sim_map_hashref{$ref_chr2}{$ref_chr1_chr2_part2_start}{'variant_type'} = "TRA"; - $$ref2sim_map_hashref{$ref_chr2}{$ref_chr1_chr2_part2_start}{'variant_id'} = $translocation_id; - # ref_chr2_chr1_part1 - $$ref2sim_map_hashref{$ref_chr2}{$ref_chr2_chr1_part1_end}{'ref_chr'} = 
$ref_chr2; - $$ref2sim_map_hashref{$ref_chr2}{$ref_chr2_chr1_part1_end}{'ref_start'} = $ref_chr2_chr1_part1_start; - $$ref2sim_map_hashref{$ref_chr2}{$ref_chr2_chr1_part1_end}{'ref_end'} = $ref_chr2_chr1_part1_end; - $$ref2sim_map_hashref{$ref_chr2}{$ref_chr2_chr1_part1_end}{'ref_strand'} = $ref_chr2_chr1_part1_strand; - my $ref_chr2_chr1_part1_end_ref_allele = substr $$refseq_hashref{$ref_chr2}, $ref_chr2_chr1_part1_end - 1, 1; - $$ref2sim_map_hashref{$ref_chr2}{$ref_chr2_chr1_part1_end}{'ref_allele'} = $ref_chr2_chr1_part1_end_ref_allele; - $$ref2sim_map_hashref{$ref_chr2}{$ref_chr2_chr1_part1_end}{'sim_chr'} = $ref_chr2_chr1; - $$ref2sim_map_hashref{$ref_chr2}{$ref_chr2_chr1_part1_end}{'sim_start'} = 1; - $$ref2sim_map_hashref{$ref_chr2}{$ref_chr2_chr1_part1_end}{'sim_end'} = $ref_chr2_chr1_part1_end; - $$ref2sim_map_hashref{$ref_chr2}{$ref_chr2_chr1_part1_end}{'sim_strand'} = $ref_chr2_chr1_part1_strand; - $$ref2sim_map_hashref{$ref_chr2}{$ref_chr2_chr1_part1_end}{'sim_allele'} = "$ref_chr2_chr1_part1_end_ref_allele\[$ref_chr1:$ref_chr2_chr1_part2_start\["; - $$ref2sim_map_hashref{$ref_chr2}{$ref_chr2_chr1_part1_end}{'variant_type'} = "TRA"; - $$ref2sim_map_hashref{$ref_chr2}{$ref_chr2_chr1_part1_end}{'variant_id'} = $translocation_id; - # ref_chr2_chr1_part2 - $$ref2sim_map_hashref{$ref_chr1}{$ref_chr2_chr1_part2_start}{'ref_chr'} = $ref_chr1; - $$ref2sim_map_hashref{$ref_chr1}{$ref_chr2_chr1_part2_start}{'ref_start'} = $ref_chr2_chr1_part2_start; - $$ref2sim_map_hashref{$ref_chr1}{$ref_chr2_chr1_part2_start}{'ref_end'} = $ref_chr2_chr1_part2_end; - $$ref2sim_map_hashref{$ref_chr1}{$ref_chr2_chr1_part2_start}{'ref_strand'} = $ref_chr2_chr1_part2_strand; - my $ref_chr2_chr1_part2_start_ref_allele = substr $$refseq_hashref{$ref_chr1}, $ref_chr2_chr1_part2_start - 1, 1; - $$ref2sim_map_hashref{$ref_chr1}{$ref_chr2_chr1_part2_start}{'ref_allele'} = $ref_chr2_chr1_part2_start_ref_allele; - $$ref2sim_map_hashref{$ref_chr1}{$ref_chr2_chr1_part2_start}{'sim_chr'} = $ref_chr2_chr1; - $$ref2sim_map_hashref{$ref_chr1}{$ref_chr2_chr1_part2_start}{'sim_start'} = $ref_chr2_chr1_part1_end + 1; - $$ref2sim_map_hashref{$ref_chr1}{$ref_chr2_chr1_part2_start}{'sim_end'} = $ref_chr2_chr1_part1_end + $ref_chr2_chr1_part2_end - $ref_chr2_chr1_part2_start + 1; - $$ref2sim_map_hashref{$ref_chr1}{$ref_chr2_chr1_part2_start}{'sim_strand'} = $ref_chr2_chr1_part2_strand; - $$ref2sim_map_hashref{$ref_chr1}{$ref_chr2_chr1_part2_start}{'sim_allele'} = "\]$ref_chr2:$ref_chr2_chr1_part1_end\]$ref_chr2_chr1_part2_start_ref_allele"; - $$ref2sim_map_hashref{$ref_chr1}{$ref_chr2_chr1_part2_start}{'variant_type'} = "TRA"; - $$ref2sim_map_hashref{$ref_chr1}{$ref_chr2_chr1_part2_start}{'variant_id'} = $translocation_id; - } else { - # translocation resolution: +--+ - # ref_chr1_chr2_part1 - $$ref2sim_map_hashref{$ref_chr1}{$ref_chr1_chr2_part1_end}{'ref_chr'} = $ref_chr1; - $$ref2sim_map_hashref{$ref_chr1}{$ref_chr1_chr2_part1_end}{'ref_start'} = $ref_chr1_chr2_part1_start; - $$ref2sim_map_hashref{$ref_chr1}{$ref_chr1_chr2_part1_end}{'ref_end'} = $ref_chr1_chr2_part1_end; - $$ref2sim_map_hashref{$ref_chr1}{$ref_chr1_chr2_part1_end}{'ref_strand'} = $ref_chr1_chr2_part1_strand; - my $ref_chr1_chr2_part1_end_ref_allele = substr $$refseq_hashref{$ref_chr1}, $ref_chr1_chr2_part1_end - 1, 1; - $$ref2sim_map_hashref{$ref_chr1}{$ref_chr1_chr2_part1_end}{'ref_allele'} = $ref_chr1_chr2_part1_end_ref_allele; - $$ref2sim_map_hashref{$ref_chr1}{$ref_chr1_chr2_part1_end}{'sim_chr'} = $ref_chr1_chr2; - 
$$ref2sim_map_hashref{$ref_chr1}{$ref_chr1_chr2_part1_end}{'sim_start'} = 1; - $$ref2sim_map_hashref{$ref_chr1}{$ref_chr1_chr2_part1_end}{'sim_end'} = $ref_chr1_chr2_part1_end; - $$ref2sim_map_hashref{$ref_chr1}{$ref_chr1_chr2_part1_end}{'sim_strand'} = $ref_chr1_chr2_part1_strand; - $$ref2sim_map_hashref{$ref_chr1}{$ref_chr1_chr2_part1_end}{'sim_allele'} = "$ref_chr1_chr2_part1_end_ref_allele\]$ref_chr2:$ref_chr1_chr2_part2_end\]"; - $$ref2sim_map_hashref{$ref_chr1}{$ref_chr1_chr2_part1_end}{'variant_type'} = "TRA"; - $$ref2sim_map_hashref{$ref_chr1}{$ref_chr1_chr2_part1_end}{'variant_id'} = $translocation_id; - # ref_chr1_chr2_part2 - $$ref2sim_map_hashref{$ref_chr2}{$ref_chr1_chr2_part2_end}{'ref_chr'} = $ref_chr2; - $$ref2sim_map_hashref{$ref_chr2}{$ref_chr1_chr2_part2_end}{'ref_start'} = $ref_chr1_chr2_part2_start; - $$ref2sim_map_hashref{$ref_chr2}{$ref_chr1_chr2_part2_end}{'ref_end'} = $ref_chr1_chr2_part2_end; - $$ref2sim_map_hashref{$ref_chr2}{$ref_chr1_chr2_part2_end}{'ref_strand'} = $ref_chr1_chr2_part2_strand; - my $ref_chr1_chr2_part2_end_ref_allele = substr $$refseq_hashref{$ref_chr2}, $ref_chr1_chr2_part2_end - 1, 1; - $$ref2sim_map_hashref{$ref_chr2}{$ref_chr1_chr2_part2_end}{'ref_allele'} = $ref_chr1_chr2_part2_end_ref_allele; - $$ref2sim_map_hashref{$ref_chr2}{$ref_chr1_chr2_part2_end}{'sim_chr'} = $ref_chr1_chr2; - $$ref2sim_map_hashref{$ref_chr2}{$ref_chr1_chr2_part2_end}{'sim_start'} = $ref_chr1_chr2_part1_end + 1; - $$ref2sim_map_hashref{$ref_chr2}{$ref_chr1_chr2_part2_end}{'sim_end'} = $ref_chr1_chr2_part1_end + $ref_chr1_chr2_part2_end - $ref_chr1_chr2_part2_start + 1; - $$ref2sim_map_hashref{$ref_chr2}{$ref_chr1_chr2_part2_end}{'sim_strand'} = $ref_chr1_chr2_part2_strand; - $$ref2sim_map_hashref{$ref_chr2}{$ref_chr1_chr2_part2_end}{'sim_allele'} = "$ref_chr1_chr2_part2_end_ref_allele\]$ref_chr1:$ref_chr1_chr2_part1_end\]"; - $$ref2sim_map_hashref{$ref_chr2}{$ref_chr1_chr2_part2_end}{'variant_type'} = "TRA"; - $$ref2sim_map_hashref{$ref_chr2}{$ref_chr1_chr2_part2_end}{'variant_id'} = $translocation_id; - # ref_chr2_chr1_part1 - $$ref2sim_map_hashref{$ref_chr2}{$ref_chr2_chr1_part1_start}{'ref_chr'} = $ref_chr2; - $$ref2sim_map_hashref{$ref_chr2}{$ref_chr2_chr1_part1_start}{'ref_start'} = $ref_chr2_chr1_part1_start; - $$ref2sim_map_hashref{$ref_chr2}{$ref_chr2_chr1_part1_start}{'ref_end'} = $ref_chr2_chr1_part1_end; - $$ref2sim_map_hashref{$ref_chr2}{$ref_chr2_chr1_part1_start}{'ref_strand'} = $ref_chr2_chr1_part1_strand; - my $ref_chr2_chr1_part1_start_ref_allele = substr $$refseq_hashref{$ref_chr2}, $ref_chr2_chr1_part1_start - 1, 1; - $$ref2sim_map_hashref{$ref_chr2}{$ref_chr2_chr1_part1_start}{'ref_allele'} = $ref_chr2_chr1_part1_start_ref_allele; - $$ref2sim_map_hashref{$ref_chr2}{$ref_chr2_chr1_part1_start}{'sim_chr'} = $ref_chr2_chr1; - $$ref2sim_map_hashref{$ref_chr2}{$ref_chr2_chr1_part1_start}{'sim_start'} = 1; - $$ref2sim_map_hashref{$ref_chr2}{$ref_chr2_chr1_part1_start}{'sim_end'} = $ref_chr2_chr1_part1_end - $ref_chr2_chr1_part1_start + 1; - $$ref2sim_map_hashref{$ref_chr2}{$ref_chr2_chr1_part1_start}{'sim_strand'} = $ref_chr2_chr1_part1_strand; - $$ref2sim_map_hashref{$ref_chr2}{$ref_chr2_chr1_part1_start}{'sim_allele'} = "\[$ref_chr1:$ref_chr2_chr1_part2_start\[$ref_chr2_chr1_part1_start_ref_allele"; - $$ref2sim_map_hashref{$ref_chr2}{$ref_chr2_chr1_part1_start}{'variant_type'} = "TRA"; - $$ref2sim_map_hashref{$ref_chr2}{$ref_chr2_chr1_part1_start}{'variant_id'} = $translocation_id; - # ref_chr2_chr1_part2 - 
$$ref2sim_map_hashref{$ref_chr1}{$ref_chr2_chr1_part2_start}{'ref_chr'} = $ref_chr1; - $$ref2sim_map_hashref{$ref_chr1}{$ref_chr2_chr1_part2_start}{'ref_start'} = $ref_chr2_chr1_part2_start; - $$ref2sim_map_hashref{$ref_chr1}{$ref_chr2_chr1_part2_start}{'ref_end'} = $ref_chr2_chr1_part2_end; - $$ref2sim_map_hashref{$ref_chr1}{$ref_chr2_chr1_part2_start}{'ref_strand'} = $ref_chr2_chr1_part2_strand; - my $ref_chr2_chr1_part2_start_ref_allele = substr $$refseq_hashref{$ref_chr1}, $ref_chr2_chr1_part2_start - 1, 1; - $$ref2sim_map_hashref{$ref_chr1}{$ref_chr2_chr1_part2_start}{'ref_allele'} = $ref_chr2_chr1_part2_start_ref_allele; - $$ref2sim_map_hashref{$ref_chr1}{$ref_chr2_chr1_part2_start}{'sim_chr'} = $ref_chr2_chr1; - $$ref2sim_map_hashref{$ref_chr1}{$ref_chr2_chr1_part2_start}{'sim_start'} = $ref_chr2_chr1_part1_end - $ref_chr2_chr1_part1_start + 1 + 1; - $$ref2sim_map_hashref{$ref_chr1}{$ref_chr2_chr1_part2_start}{'sim_end'} = $ref_chr2_chr1_part1_end - $ref_chr2_chr1_part1_start + 1 + $ref_chr2_chr1_part2_end - $ref_chr2_chr1_part2_start + 1; - $$ref2sim_map_hashref{$ref_chr1}{$ref_chr2_chr1_part2_start}{'sim_strand'} = $ref_chr2_chr1_part2_strand; - $$ref2sim_map_hashref{$ref_chr1}{$ref_chr2_chr1_part2_start}{'sim_allele'} = "\[$ref_chr2:$ref_chr2_chr1_part1_start\[$ref_chr2_chr1_part2_start_ref_allele"; - $$ref2sim_map_hashref{$ref_chr1}{$ref_chr2_chr1_part2_start}{'variant_type'} = "TRA"; - $$ref2sim_map_hashref{$ref_chr1}{$ref_chr2_chr1_part2_start}{'variant_id'} = $translocation_id; - } - } -} - - -sub examine_breakpoint_for_translocation { - my ($breakpoint_by_chr_by_type_hashref, $refseq_hashref, $centromere_by_chr_hashref) = @_; - my %valid_breakpoint_pair = (); - foreach my $chr1 (sort keys %$breakpoint_by_chr_by_type_hashref) { - foreach my $chr2 (sort keys %$breakpoint_by_chr_by_type_hashref) { - if ($chr1 ne $chr2) { - foreach my $type (sort keys %{$$breakpoint_by_chr_by_type_hashref{$chr1}}) { - if (exists $$breakpoint_by_chr_by_type_hashref{$chr2}{$type}) { - foreach my $b1 (sort keys %{$$breakpoint_by_chr_by_type_hashref{$chr1}{$type}}) { - foreach my $b2 (sort keys %{$$breakpoint_by_chr_by_type_hashref{$chr2}{$type}}) { - my $b1_chr = $chr1; - my $b1_start = $$breakpoint_by_chr_by_type_hashref{$chr1}{$type}{$b1}{'start'}; - my $b1_end = $$breakpoint_by_chr_by_type_hashref{$chr1}{$type}{$b1}{'end'}; - my $b1_strand = $$breakpoint_by_chr_by_type_hashref{$chr1}{$type}{$b1}{'strand'}; - - my $b2_chr = $chr2; - my $b2_start = $$breakpoint_by_chr_by_type_hashref{$chr2}{$type}{$b2}{'start'}; - my $b2_end = $$breakpoint_by_chr_by_type_hashref{$chr2}{$type}{$b2}{'end'}; - my $b2_strand = $$breakpoint_by_chr_by_type_hashref{$chr2}{$type}{$b2}{'strand'}; - # check if the candidate breakpoints overlap with centromeres - if (keys %$centromere_by_chr_hashref) { - if ((exists $$centromere_by_chr_hashref{$b1_chr}) and (exists $$centromere_by_chr_hashref{$b2_chr})) { - my $centromere_check_flag1 = check_overlap_region($b1_start, $b1_end, $$centromere_by_chr_hashref{$b1_chr}{'start'}, $$centromere_by_chr_hashref{$b1_chr}{'end'}); - my $centromere_check_flag2 = check_overlap_region($b2_start, $b2_end, $$centromere_by_chr_hashref{$b2_chr}{'start'}, $$centromere_by_chr_hashref{$b2_chr}{'end'}); - # check for the case in which the translocated chromosomes will have zero or two centromeres - if (($centromere_check_flag1 == 0) and ($centromere_check_flag2 == 0)) { - # print "centromere test passed!\n"; - # try translocation resolution: ++++ - my $translocation_resolution = "++++"; - if 
($translocation_resolution eq "++++") { - my $ref_chr1_chr2_part1_chr = $b1_chr; - my $ref_chr1_chr2_part1_start = 1; - my $ref_chr1_chr2_part1_end = $b1_start - 1; - my $ref_chr1_chr2_part1_strand = "+"; - - my $ref_chr1_chr2_part2_chr = $b2_chr; - my $ref_chr1_chr2_part2_start = $b2_start; - my $ref_chr1_chr2_part2_end = length $$refseq_hashref{$b2_chr}; - my $ref_chr1_chr2_part2_strand = "+"; - - my $ref_chr2_chr1_part1_chr = $b2_chr; - my $ref_chr2_chr1_part1_start = 1; - my $ref_chr2_chr1_part1_end = $b2_start - 1; - my $ref_chr2_chr1_part1_strand = "+"; - - my $ref_chr2_chr1_part2_chr = $b1_chr; - my $ref_chr2_chr1_part2_start = $b1_start; - my $ref_chr2_chr1_part2_end = length $$refseq_hashref{$b1_chr}; - my $ref_chr2_chr1_part2_strand = "+"; - - my $ref_chr1_chr2_part1_centromere_count = check_overlap_region($ref_chr1_chr2_part1_start, $ref_chr1_chr2_part1_end, $$centromere_by_chr_hashref{$b1_chr}{'start'}, $$centromere_by_chr_hashref{$b1_chr}{'end'}); - my $ref_chr2_chr1_part2_centromere_count = check_overlap_region($ref_chr2_chr1_part2_start, $ref_chr2_chr1_part2_end, $$centromere_by_chr_hashref{$b1_chr}{'start'}, $$centromere_by_chr_hashref{$b1_chr}{'end'}); - my $ref_chr1_chr2_part2_centromere_count = check_overlap_region($ref_chr1_chr2_part2_start, $ref_chr1_chr2_part2_end, $$centromere_by_chr_hashref{$b2_chr}{'start'}, $$centromere_by_chr_hashref{$b2_chr}{'end'}); - my $ref_chr2_chr1_part1_centromere_count = check_overlap_region($ref_chr2_chr1_part1_start, $ref_chr2_chr1_part1_end, $$centromere_by_chr_hashref{$b2_chr}{'start'}, $$centromere_by_chr_hashref{$b2_chr}{'end'}); - - my $ref_chr1_chr2_centromere_count = $ref_chr1_chr2_part1_centromere_count + $ref_chr1_chr2_part2_centromere_count; - my $ref_chr2_chr1_centromere_count = $ref_chr2_chr1_part1_centromere_count + $ref_chr2_chr1_part2_centromere_count; - - if (($ref_chr1_chr2_centromere_count == 1) and ($ref_chr2_chr1_centromere_count == 1)) { - $valid_breakpoint_pair{$b1}{$b2}{'translocation_resolution'} = $translocation_resolution; - $valid_breakpoint_pair{$b2}{$b1}{'translocation_resolution'} = $translocation_resolution; - } else { - # translocation resolution: +--+ - $translocation_resolution = "+--+"; - $ref_chr1_chr2_part1_chr = $b1_chr; - $ref_chr1_chr2_part1_start = 1; - $ref_chr1_chr2_part1_end = $b1_start - 1; - $ref_chr1_chr2_part1_strand = "+"; - - $ref_chr1_chr2_part2_chr = $b2_chr; - $ref_chr1_chr2_part2_start = 1; - $ref_chr1_chr2_part2_end = $b2_end; - $ref_chr1_chr2_part2_strand = "-"; - - $ref_chr2_chr1_part1_chr = $b2_chr; - $ref_chr2_chr1_part1_start = $b2_end + 1; - $ref_chr2_chr1_part1_end = length $$refseq_hashref{$b2_chr}; - $ref_chr2_chr1_part1_strand = "-"; - - $ref_chr2_chr1_part2_chr = $b1_chr; - $ref_chr2_chr1_part2_start = $b1_start; - $ref_chr2_chr1_part2_end = length $$refseq_hashref{$b1_chr}; - $ref_chr2_chr1_part2_strand = "+"; - - $ref_chr1_chr2_part1_centromere_count = check_overlap_region($ref_chr1_chr2_part1_start, $ref_chr1_chr2_part1_end, $$centromere_by_chr_hashref{$b1_chr}{'start'}, $$centromere_by_chr_hashref{$b1_chr}{'end'}); - $ref_chr2_chr1_part2_centromere_count = check_overlap_region($ref_chr2_chr1_part2_start, $ref_chr2_chr1_part2_end, $$centromere_by_chr_hashref{$b1_chr}{'start'}, $$centromere_by_chr_hashref{$b1_chr}{'end'}); - $ref_chr1_chr2_part2_centromere_count = check_overlap_region($ref_chr1_chr2_part2_start, $ref_chr1_chr2_part2_end, $$centromere_by_chr_hashref{$b2_chr}{'start'}, $$centromere_by_chr_hashref{$b2_chr}{'end'}); - $ref_chr2_chr1_part1_centromere_count = 
check_overlap_region($ref_chr2_chr1_part1_start, $ref_chr2_chr1_part1_end, $$centromere_by_chr_hashref{$b2_chr}{'start'}, $$centromere_by_chr_hashref{$b2_chr}{'end'}); - - $ref_chr1_chr2_centromere_count = $ref_chr1_chr2_part1_centromere_count + $ref_chr1_chr2_part2_centromere_count; - $ref_chr2_chr1_centromere_count = $ref_chr2_chr1_part1_centromere_count + $ref_chr2_chr1_part2_centromere_count; - - if (($ref_chr1_chr2_centromere_count == 1) and ($ref_chr2_chr1_centromere_count == 1)) { - $valid_breakpoint_pair{$b1}{$b2}{'translocation_resolution'} = "+--+"; - $valid_breakpoint_pair{$b2}{$b1}{'translocation_resolution'} = "+--+"; - } else { - # no possible resolution available - next; - } - } - } - } - } else { - # when not both centromeres were defined - print "!!! b1_chr=$b1_chr, b2_chr=$b2_chr\n"; - print "!!! Warnings! Centromeres are defined but not for all the chromosomes! !!!\n"; - print "!!! Please add the missing centromere annotation in the centromere_gff file or use the -excluded_chr_list option to exclude chromosomes that lack the centromere annotation. !!!\n"; - print "!!! Exit! !!!\n"; - die; - } - } else { - $valid_breakpoint_pair{$b1}{$b2}{'translocation_resolution'} = "NA"; - $valid_breakpoint_pair{$b2}{$b1}{'translocation_resolution'} = "NA"; - } - } - } - } - } - } - } - } - if ((scalar (keys %valid_breakpoint_pair)) == 0) { - print "\n!!! Error! None of defined breakpoints are valid for triggering translocation.\n"; - print "!!! Valid breakpoints need to satisfy the following criteria:\n"; - print "!!! 1) Be annotated as the same feature type.\n"; - print "!!! 2) Come from at least two different chromosomes.\n"; - print "!!! 3) If centromeres have been defined, the breakpoints should not overlap with the defined centromeres.\n"; - print "!!! 4) If centromeres have been defined, the resulting rearranged chromosomes should still possess only one centromere per chromosome.\n"; - print "!!! 
Exit!\n"; - die; - } - return %valid_breakpoint_pair; -} - - -sub introduce_random_translocation { - my ($translocation_count, $centromere_by_chr_hashref, $translocation_breakpoint_by_chr_by_type_hashref, $gene_by_chr_hashref, $refseq_hashref, $simseq_hashref, $ref2sim_map_hashref) = @_; - my %translocation = (); - my %available_chr = %$refseq_hashref; - if (keys %$translocation_breakpoint_by_chr_by_type_hashref) { - my %translocation_breakpoint = (); - foreach my $chr (sort keys %$translocation_breakpoint_by_chr_by_type_hashref) { - foreach my $type (sort keys %{$$translocation_breakpoint_by_chr_by_type_hashref{$chr}}) { - foreach my $b (sort keys %{$$translocation_breakpoint_by_chr_by_type_hashref{$chr}{$type}}) { - $translocation_breakpoint{$b}{'chr'} = $chr; - $translocation_breakpoint{$b}{'type'} = $type; - $translocation_breakpoint{$b}{'start'} = $$translocation_breakpoint_by_chr_by_type_hashref{$chr}{$type}{$b}{'start'}; - $translocation_breakpoint{$b}{'end'} = $$translocation_breakpoint_by_chr_by_type_hashref{$chr}{$type}{$b}{'end'}; - $translocation_breakpoint{$b}{'strand'} = $$translocation_breakpoint_by_chr_by_type_hashref{$chr}{$type}{$b}{'strand'}; - # print "b=$b, chr=$chr, type=$type, start = $$translocation_breakpoint_by_chr_by_type_hashref{$chr}{$type}{$b}{'start'}\n"; - } - } - } - my %valid_breakpoint_pair = examine_breakpoint_for_translocation($translocation_breakpoint_by_chr_by_type_hashref, $refseq_hashref, $centromere_by_chr_hashref); - for (my $i = 1; $i <= $translocation_count; $i++) { - # print "sample translocation: i = $i, translocation_count = $translocation_count\n"; - # check if there are still available chromsomes - my $available_chr_count = scalar keys %available_chr; - # print "available_chr_count = $available_chr_count\n"; - SAMPLE_RANDOM_TRA1: - if ($available_chr_count < 2) { - my $j = $i - 1; - print "\n!!! Warning! No more available chromosomes in the current simulation.\n"; - print "!!! Only $j translocation were introduced.\n"; - last; - } - # check if there are still valid $breakpoints - my $valid_breakpoint_pair_count = scalar keys %valid_breakpoint_pair; - # print "valid_breakpoint_pair_count = $valid_breakpoint_pair_count\n"; - if ($valid_breakpoint_pair_count < 2) { - my $j = $i - 1; - print "\n!!! Warning! No more valid breakpoint pairs can be found in the current simulation based on the defined breakpoint file: $translocation_breakpoint_gff\n"; - print "!!! 
Only $j translocation were introduced.\n"; - last; - } - # sample inversion based on defined breakpoints - my @breakpoint1 = shuffle(sort keys %valid_breakpoint_pair); - my $breakpoint1 = shift @breakpoint1; - my @breakpoint2 = shuffle(sort keys %{$valid_breakpoint_pair{$breakpoint1}}); - my $breakpoint2 = shift @breakpoint2; - # print "breakpoint1 = $breakpoint1\n"; - # print "breakpoint2 = $breakpoint2\n"; - my $breakpoint1_chr = $translocation_breakpoint{$breakpoint1}{'chr'}; - my $breakpoint2_chr = $translocation_breakpoint{$breakpoint2}{'chr'}; - # print "breakpoint1_chr = $breakpoint1_chr\n"; - # print "breakpoint2_chr = $breakpoint2_chr\n"; - # check if the same chromosomes have been involved in previously simulated translocations - if (not exists $available_chr{$breakpoint1_chr}) { - # print "failed available chr check\n"; - delete $valid_breakpoint_pair{$breakpoint1}; - goto SAMPLE_RANDOM_TRA1; - } - if (not exists $available_chr{$breakpoint2_chr}) { - delete $valid_breakpoint_pair{$breakpoint2}; - goto SAMPLE_RANDOM_TRA1; - } - - if ($valid_breakpoint_pair{$breakpoint1}{$breakpoint2}{'translocation_resolution'} eq "NA") { - # random sampling of translocation resolution if it is unspecified - if (rand(1) < 0.5) { - $valid_breakpoint_pair{$breakpoint1}{$breakpoint2}{'translocation_resolution'} = "++++"; - $valid_breakpoint_pair{$breakpoint2}{$breakpoint1}{'translocation_resolution'} = "++++"; - } else { - $valid_breakpoint_pair{$breakpoint1}{$breakpoint2}{'translocation_resolution'} = "+--+"; - $valid_breakpoint_pair{$breakpoint2}{$breakpoint1}{'translocation_resolution'} = "+--+"; - } - } - - my $translocation_id = "TRA_${i}"; - my $ref_chr1; - my $ref_chr2; - my $ref_chr1_breakpoint_start; - my $ref_chr1_breakpoint_end; - my $ref_chr2_breakpoint_start; - my $ref_chr2_breakpoint_end; - - if ($breakpoint1_chr lt $breakpoint2_chr) { - $ref_chr1 = $breakpoint1_chr; - $ref_chr2 = $breakpoint2_chr; - $ref_chr1_breakpoint_start = $translocation_breakpoint{$breakpoint1}{'start'}; - $ref_chr1_breakpoint_end = $translocation_breakpoint{$breakpoint1}{'end'}; - $ref_chr2_breakpoint_start = $translocation_breakpoint{$breakpoint2}{'start'}; - $ref_chr2_breakpoint_end = $translocation_breakpoint{$breakpoint2}{'end'}; - } else { - $ref_chr1 = $breakpoint2_chr; - $ref_chr2 = $breakpoint1_chr; - $ref_chr1_breakpoint_start = $translocation_breakpoint{$breakpoint2}{'start'}; - $ref_chr1_breakpoint_end = $translocation_breakpoint{$breakpoint2}{'end'}; - $ref_chr2_breakpoint_start = $translocation_breakpoint{$breakpoint1}{'start'}; - $ref_chr2_breakpoint_end = $translocation_breakpoint{$breakpoint1}{'end'}; - } - - if ($valid_breakpoint_pair{$breakpoint1}{$breakpoint2}{'translocation_resolution'} eq "++++") { - # translocation_resolution: ++++ - $translocation{$translocation_id}{'ref_chr1'} = $ref_chr1; - $translocation{$translocation_id}{'ref_chr2'} = $ref_chr2; - - $translocation{$translocation_id}{'ref_chr1_chr2_part1_chr'} = $ref_chr1; - $translocation{$translocation_id}{'ref_chr1_chr2_part1_start'} = 1; - $translocation{$translocation_id}{'ref_chr1_chr2_part1_end'} = $ref_chr1_breakpoint_start - 1; - $translocation{$translocation_id}{'ref_chr1_chr2_part1_strand'} = "+"; - - $translocation{$translocation_id}{'ref_chr1_chr2_part2_chr'} = $ref_chr2; - $translocation{$translocation_id}{'ref_chr1_chr2_part2_start'} = $ref_chr2_breakpoint_start; - $translocation{$translocation_id}{'ref_chr1_chr2_part2_end'} = length $$refseq_hashref{$ref_chr2}; - 
$translocation{$translocation_id}{'ref_chr1_chr2_part2_strand'} = "+"; - - $translocation{$translocation_id}{'ref_chr2_chr1_part1_chr'} = $ref_chr2; - $translocation{$translocation_id}{'ref_chr2_chr1_part1_start'} = 1; - $translocation{$translocation_id}{'ref_chr2_chr1_part1_end'} = $ref_chr2_breakpoint_start - 1; - $translocation{$translocation_id}{'ref_chr2_chr1_part1_strand'} = "+"; - - $translocation{$translocation_id}{'ref_chr2_chr1_part2_chr'} = $ref_chr1; - $translocation{$translocation_id}{'ref_chr2_chr1_part2_start'} = $ref_chr1_breakpoint_start; - $translocation{$translocation_id}{'ref_chr2_chr1_part2_end'} = length $$refseq_hashref{$ref_chr1}; - $translocation{$translocation_id}{'ref_chr2_chr1_part2_strand'} = "+"; - } else { - # translocation_resolution: +--+ - $translocation{$translocation_id}{'ref_chr1'} = $ref_chr1; - $translocation{$translocation_id}{'ref_chr2'} = $ref_chr2; - - $translocation{$translocation_id}{'ref_chr1_chr2_part1_chr'} = $ref_chr1; - $translocation{$translocation_id}{'ref_chr1_chr2_part1_start'} = 1; - $translocation{$translocation_id}{'ref_chr1_chr2_part1_end'} = $ref_chr1_breakpoint_start - 1; - $translocation{$translocation_id}{'ref_chr1_chr2_part1_strand'} = "+"; - - $translocation{$translocation_id}{'ref_chr1_chr2_part2_chr'} = $ref_chr2; - $translocation{$translocation_id}{'ref_chr1_chr2_part2_start'} = 1; - $translocation{$translocation_id}{'ref_chr1_chr2_part2_end'} = $ref_chr2_breakpoint_end; - $translocation{$translocation_id}{'ref_chr1_chr2_part2_strand'} = "-"; - - $translocation{$translocation_id}{'ref_chr2_chr1_part1_chr'} = $ref_chr2; - $translocation{$translocation_id}{'ref_chr2_chr1_part1_start'} = $ref_chr2_breakpoint_end + 1; - $translocation{$translocation_id}{'ref_chr2_chr1_part1_end'} = length $$refseq_hashref{$ref_chr2}; - $translocation{$translocation_id}{'ref_chr2_chr1_part1_strand'} = "-"; - - $translocation{$translocation_id}{'ref_chr2_chr1_part2_chr'} = $ref_chr1; - $translocation{$translocation_id}{'ref_chr2_chr1_part2_start'} = $ref_chr1_breakpoint_start; - $translocation{$translocation_id}{'ref_chr2_chr1_part2_end'} = length $$refseq_hashref{$ref_chr1}; - $translocation{$translocation_id}{'ref_chr2_chr1_part2_strand'} = "+"; - } - # delete used breakpoints - delete $valid_breakpoint_pair{$breakpoint1}; - delete $valid_breakpoint_pair{$breakpoint2}; - # update avialble_chr - delete $available_chr{$ref_chr1}; - delete $available_chr{$ref_chr2}; - } - } else { - for (my $i = 1; $i <= $translocation_count; $i++) { - # print "sample translocation: i = $i, translocation_count = $translocation_count\n"; - # check if there are still available chromsomes - my $available_chr_count = scalar keys %available_chr; - # print "available_chr_count = $available_chr_count\n"; - if ($available_chr_count < 2) { - my $j = $i - 1; - print "\n!!! Warning! No more available chromosomes in the current simulation.\n"; - print "!!! 
Only $j translocation were introduced.\n"; - last; - } - # random sampling across the genome - my %refseq_genome_space = create_genome_space($refseq_hashref); - SAMPLE_RANDOM_TRA2: - my ($sample1_chr, $sample1_breakpoint_start) = sample_genome_space(\%refseq_genome_space); - my ($sample2_chr, $sample2_breakpoint_start) = sample_genome_space(\%refseq_genome_space); - - if ($sample1_chr eq $sample2_chr) { - goto SAMPLE_RANDOM_TRA2; - } elsif ((not exists $available_chr{$sample1_chr}) or (not exists $available_chr{$sample2_chr})) { - goto SAMPLE_RANDOM_TRA2; - } else { - my $sample1_breakpoint_end = $sample1_breakpoint_start; - my $sample2_breakpoint_end = $sample2_breakpoint_start; - - # check if the sampled breakpoints overlapped with the defined genes - my $sample1_gene_check_flag = 0; - my $sample2_gene_check_flag = 0; - if (exists $$gene_by_chr_hashref{$sample1_chr}) { - foreach my $gene_id (sort keys %{$$gene_by_chr_hashref{$sample1_chr}}) { - $sample1_gene_check_flag = check_overlap_region($sample1_breakpoint_start, $sample1_breakpoint_end, $$gene_by_chr_hashref{$sample1_chr}{$gene_id}{'start'}, $$gene_by_chr_hashref{$sample1_chr}{$gene_id}{'end'}); - if ($sample1_gene_check_flag == 1) { - goto SAMPLE_RANDOM_TRA2; - } - } - } - if (exists $$gene_by_chr_hashref{$sample2_chr}) { - foreach my $gene_id (sort keys %{$$gene_by_chr_hashref{$sample2_chr}}) { - $sample2_gene_check_flag = check_overlap_region($sample2_breakpoint_start, $sample2_breakpoint_end, $$gene_by_chr_hashref{$sample2_chr}{$gene_id}{'start'}, $$gene_by_chr_hashref{$sample2_chr}{$gene_id}{'end'}); - if ($sample2_gene_check_flag == 1) { - goto SAMPLE_RANDOM_TRA2; - } - } - } - - # check if the sampled region overlaped with the defined centromeres - if (keys %$centromere_by_chr_hashref) { - if ((exists $$centromere_by_chr_hashref{$sample1_chr}) and (exists $$centromere_by_chr_hashref{$sample2_chr})) { - my $sample1_centromere_check_flag = check_overlap_region($sample1_breakpoint_start, $sample1_breakpoint_end, $$centromere_by_chr_hashref{$sample1_chr}{'start'}, $$centromere_by_chr_hashref{$sample1_chr}{'end'}); - my $sample2_centromere_check_flag = check_overlap_region($sample2_breakpoint_start, $sample2_breakpoint_end, $$centromere_by_chr_hashref{$sample2_chr}{'start'}, $$centromere_by_chr_hashref{$sample2_chr}{'end'}); - if (($sample1_centromere_check_flag == 1) or ($sample2_centromere_check_flag == 1)) { - goto SAMPLE_RANDOM_TRA2; - } - } else { - # when not both centromeres were defined - print "!!! Warnings! Centromeres are defined but not for all the chromosomes! !!!\n"; - print "!!! Please add the missing centromere annotation in the centromere_gff file or use the -excluded_chr_list option to exclude chromosomes that lack the centromere annotation. !!!\n"; - print "!!! Exit! 
!!!\n"; - die; - } - } - - my $ref_chr1; - my $ref_chr2; - my $ref_chr1_breakpoint_start; - my $ref_chr1_breakpoint_end; - my $ref_chr2_breakpoint_start; - my $ref_chr2_breakpoint_end; - if ($sample1_chr lt $sample2_chr) { - $ref_chr1 = $sample1_chr; - $ref_chr1_breakpoint_start = $sample1_breakpoint_start; - $ref_chr1_breakpoint_end = $sample1_breakpoint_end; - $ref_chr2 = $sample2_chr; - $ref_chr2_breakpoint_start = $sample2_breakpoint_start; - $ref_chr2_breakpoint_end = $sample2_breakpoint_end; - } else { - $ref_chr1 = $sample2_chr; - $ref_chr1_breakpoint_start = $sample2_breakpoint_start; - $ref_chr1_breakpoint_end = $sample2_breakpoint_end; - $ref_chr2 = $sample1_chr; - $ref_chr2_breakpoint_start = $sample1_breakpoint_start; - $ref_chr2_breakpoint_end = $sample1_breakpoint_end; - } - - if (rand(1) < 0.5) { - # try translocation resolution: ++++ - my $ref_chr1_chr2_part1_chr = $ref_chr1; - my $ref_chr1_chr2_part1_start = 1; - my $ref_chr1_chr2_part1_end = $ref_chr1_breakpoint_start - 1; - my $ref_chr1_chr2_part1_strand = "+"; - - my $ref_chr1_chr2_part2_chr = $ref_chr2; - my $ref_chr1_chr2_part2_start = $ref_chr2_breakpoint_start; - my $ref_chr1_chr2_part2_end = length $$refseq_hashref{$ref_chr2}; - my $ref_chr1_chr2_part2_strand = "+"; - - my $ref_chr2_chr1_part1_chr = $ref_chr2; - my $ref_chr2_chr1_part1_start = 1; - my $ref_chr2_chr1_part1_end = $ref_chr2_breakpoint_start - 1; - my $ref_chr2_chr1_part1_strand = "+"; - - my $ref_chr2_chr1_part2_chr = $ref_chr1; - my $ref_chr2_chr1_part2_start = $ref_chr1_breakpoint_start; - my $ref_chr2_chr1_part2_end = length $$refseq_hashref{$ref_chr1}; - my $ref_chr2_chr1_part2_strand = "+"; - - # check for rearranged chromosome with zero or two centromeres - if ((exists $$centromere_by_chr_hashref{$ref_chr1}) and (exists $$centromere_by_chr_hashref{$ref_chr2})) { - my $ref_chr1_chr2_part1_centromere_count = check_overlap_region($ref_chr1_chr2_part1_start, $ref_chr1_chr2_part1_end, $$centromere_by_chr_hashref{$ref_chr1}{'start'}, $$centromere_by_chr_hashref{$ref_chr1}{'end'}); - my $ref_chr2_chr1_part2_centromere_count = check_overlap_region($ref_chr2_chr1_part2_start, $ref_chr2_chr1_part2_end, $$centromere_by_chr_hashref{$ref_chr1}{'start'}, $$centromere_by_chr_hashref{$ref_chr1}{'end'}); - my $ref_chr1_chr2_part2_centromere_count = check_overlap_region($ref_chr1_chr2_part2_start, $ref_chr1_chr2_part2_end, $$centromere_by_chr_hashref{$ref_chr2}{'start'}, $$centromere_by_chr_hashref{$ref_chr2}{'end'}); - my $ref_chr2_chr1_part1_centromere_count = check_overlap_region($ref_chr2_chr1_part1_start, $ref_chr2_chr1_part1_end, $$centromere_by_chr_hashref{$ref_chr2}{'start'}, $$centromere_by_chr_hashref{$ref_chr2}{'end'}); - - my $ref_chr1_chr2_centromere_count = $ref_chr1_chr2_part1_centromere_count + $ref_chr1_chr2_part2_centromere_count; - my $ref_chr2_chr1_centromere_count = $ref_chr2_chr1_part1_centromere_count + $ref_chr2_chr1_part2_centromere_count; - - if (($ref_chr1_chr2_centromere_count != 1) or ($ref_chr2_chr1_centromere_count != 1)) { - goto SAMPLE_RANDOM_TRA2; - } - } - - my $translocation_id = "TRA_${i}"; - $translocation{$translocation_id}{'ref_chr1'} = $ref_chr1; - $translocation{$translocation_id}{'ref_chr2'} = $ref_chr2; - - $translocation{$translocation_id}{'ref_chr1_chr2_part1_chr'} = $ref_chr1_chr2_part1_chr; - $translocation{$translocation_id}{'ref_chr1_chr2_part1_start'} = $ref_chr1_chr2_part1_start; - $translocation{$translocation_id}{'ref_chr1_chr2_part1_end'} = $ref_chr1_chr2_part1_end; - 
$translocation{$translocation_id}{'ref_chr1_chr2_part1_strand'} = $ref_chr1_chr2_part1_strand; - - $translocation{$translocation_id}{'ref_chr1_chr2_part2_chr'} = $ref_chr1_chr2_part2_chr; - $translocation{$translocation_id}{'ref_chr1_chr2_part2_start'} = $ref_chr1_chr2_part2_start; - $translocation{$translocation_id}{'ref_chr1_chr2_part2_end'} = $ref_chr1_chr2_part2_end; - $translocation{$translocation_id}{'ref_chr1_chr2_part2_strand'} = $ref_chr1_chr2_part2_strand; - - $translocation{$translocation_id}{'ref_chr2_chr1_part1_chr'} = $ref_chr2_chr1_part1_chr; - $translocation{$translocation_id}{'ref_chr2_chr1_part1_start'} = $ref_chr2_chr1_part1_start; - $translocation{$translocation_id}{'ref_chr2_chr1_part1_end'} = $ref_chr2_chr1_part1_end; - $translocation{$translocation_id}{'ref_chr2_chr1_part1_strand'} = $ref_chr2_chr1_part1_strand; - - $translocation{$translocation_id}{'ref_chr2_chr1_part2_chr'} = $ref_chr2_chr1_part2_chr; - $translocation{$translocation_id}{'ref_chr2_chr1_part2_start'} = $ref_chr2_chr1_part2_start; - $translocation{$translocation_id}{'ref_chr2_chr1_part2_end'} = $ref_chr2_chr1_part2_end; - $translocation{$translocation_id}{'ref_chr2_chr1_part2_strand'} = $ref_chr2_chr1_part2_strand; - # update avialble_chr - delete $available_chr{$ref_chr1}; - delete $available_chr{$ref_chr2}; - } else { - # translocation resolution: +--+ - my $ref_chr1_chr2_part1_chr = $ref_chr1; - my $ref_chr1_chr2_part1_start = 1; - my $ref_chr1_chr2_part1_end = $ref_chr1_breakpoint_start - 1; - my $ref_chr1_chr2_part1_strand = "+"; - - my $ref_chr1_chr2_part2_chr = $ref_chr2; - my $ref_chr1_chr2_part2_start = 1; - my $ref_chr1_chr2_part2_end = $ref_chr2_breakpoint_end; - my $ref_chr1_chr2_part2_strand = "-"; - - my $ref_chr2_chr1_part1_chr = $ref_chr2; - my $ref_chr2_chr1_part1_start = $ref_chr2_breakpoint_end + 1; - my $ref_chr2_chr1_part1_end = length $$refseq_hashref{$ref_chr2}; - my $ref_chr2_chr1_part1_strand = "-"; - - my $ref_chr2_chr1_part2_chr = $ref_chr1; - my $ref_chr2_chr1_part2_start = $ref_chr1_breakpoint_start; - my $ref_chr2_chr1_part2_end = length $$refseq_hashref{$ref_chr1}; - my $ref_chr2_chr1_part2_strand = "+"; - - # check for rearranged chromosome with zero or two centromeres - if ((exists $$centromere_by_chr_hashref{$ref_chr1}) and (exists $$centromere_by_chr_hashref{$ref_chr2})) { - my $ref_chr1_chr2_part1_centromere_count = check_overlap_region($ref_chr1_chr2_part1_start, $ref_chr1_chr2_part1_end, $$centromere_by_chr_hashref{$ref_chr1}{'start'}, $$centromere_by_chr_hashref{$ref_chr1}{'end'}); - my $ref_chr2_chr1_part2_centromere_count = check_overlap_region($ref_chr2_chr1_part2_start, $ref_chr2_chr1_part2_end, $$centromere_by_chr_hashref{$ref_chr1}{'start'}, $$centromere_by_chr_hashref{$ref_chr1}{'end'}); - my $ref_chr1_chr2_part2_centromere_count = check_overlap_region($ref_chr1_chr2_part2_start, $ref_chr1_chr2_part2_end, $$centromere_by_chr_hashref{$ref_chr2}{'start'}, $$centromere_by_chr_hashref{$ref_chr2}{'end'}); - my $ref_chr2_chr1_part1_centromere_count = check_overlap_region($ref_chr2_chr1_part1_start, $ref_chr2_chr1_part1_end, $$centromere_by_chr_hashref{$ref_chr2}{'start'}, $$centromere_by_chr_hashref{$ref_chr2}{'end'}); - - my $ref_chr1_chr2_centromere_count = $ref_chr1_chr2_part1_centromere_count + $ref_chr1_chr2_part2_centromere_count; - my $ref_chr2_chr1_centromere_count = $ref_chr2_chr1_part1_centromere_count + $ref_chr2_chr1_part2_centromere_count; - - if (($ref_chr1_chr2_centromere_count != 1) or ($ref_chr2_chr1_centromere_count != 1)) { - goto 
SAMPLE_RANDOM_TRA2; - } - } - - my $translocation_id = "TRA_${i}"; - $translocation{$translocation_id}{'ref_chr1'} = $ref_chr1; - $translocation{$translocation_id}{'ref_chr2'} = $ref_chr2; - - $translocation{$translocation_id}{'ref_chr1_chr2_part1_chr'} = $ref_chr1_chr2_part1_chr; - $translocation{$translocation_id}{'ref_chr1_chr2_part1_start'} = $ref_chr1_chr2_part1_start; - $translocation{$translocation_id}{'ref_chr1_chr2_part1_end'} = $ref_chr1_chr2_part1_end; - $translocation{$translocation_id}{'ref_chr1_chr2_part1_strand'} = $ref_chr1_chr2_part1_strand; - - $translocation{$translocation_id}{'ref_chr1_chr2_part2_chr'} = $ref_chr1_chr2_part2_chr; - $translocation{$translocation_id}{'ref_chr1_chr2_part2_start'} = $ref_chr1_chr2_part2_start; - $translocation{$translocation_id}{'ref_chr1_chr2_part2_end'} = $ref_chr1_chr2_part2_end; - $translocation{$translocation_id}{'ref_chr1_chr2_part2_strand'} = $ref_chr1_chr2_part2_strand; - - $translocation{$translocation_id}{'ref_chr2_chr1_part1_chr'} = $ref_chr2_chr1_part1_chr; - $translocation{$translocation_id}{'ref_chr2_chr1_part1_start'} = $ref_chr2_chr1_part1_start; - $translocation{$translocation_id}{'ref_chr2_chr1_part1_end'} = $ref_chr2_chr1_part1_end; - $translocation{$translocation_id}{'ref_chr2_chr1_part1_strand'} = $ref_chr2_chr1_part1_strand; - - $translocation{$translocation_id}{'ref_chr2_chr1_part2_chr'} = $ref_chr2_chr1_part2_chr; - $translocation{$translocation_id}{'ref_chr2_chr1_part2_start'} = $ref_chr2_chr1_part2_start; - $translocation{$translocation_id}{'ref_chr2_chr1_part2_end'} = $ref_chr2_chr1_part2_end; - $translocation{$translocation_id}{'ref_chr2_chr1_part2_strand'} = $ref_chr2_chr1_part2_strand; - - # update avialble_chr - delete $available_chr{$ref_chr1}; - delete $available_chr{$ref_chr2}; - } - } - } - } - - foreach my $translocation_id (sort keys %translocation) { - my $ref_chr1 = $translocation{$translocation_id}{'ref_chr1'}; - my $ref_chr2 = $translocation{$translocation_id}{'ref_chr2'}; - - my $ref_chr1_chr2_part1_chr = $translocation{$translocation_id}{'ref_chr1_chr2_part1_chr'}; - my $ref_chr1_chr2_part1_start = $translocation{$translocation_id}{'ref_chr1_chr2_part1_start'}; - my $ref_chr1_chr2_part1_end = $translocation{$translocation_id}{'ref_chr1_chr2_part1_end'}; - my $ref_chr1_chr2_part1_strand = $translocation{$translocation_id}{'ref_chr1_chr2_part1_strand'}; - - my $ref_chr1_chr2_part2_chr = $translocation{$translocation_id}{'ref_chr1_chr2_part2_chr'}; - my $ref_chr1_chr2_part2_start = $translocation{$translocation_id}{'ref_chr1_chr2_part2_start'}; - my $ref_chr1_chr2_part2_end = $translocation{$translocation_id}{'ref_chr1_chr2_part2_end'}; - my $ref_chr1_chr2_part2_strand = $translocation{$translocation_id}{'ref_chr1_chr2_part2_strand'}; - - my $ref_chr2_chr1_part1_chr = $translocation{$translocation_id}{'ref_chr2_chr1_part1_chr'}; - my $ref_chr2_chr1_part1_start = $translocation{$translocation_id}{'ref_chr2_chr1_part1_start'}; - my $ref_chr2_chr1_part1_end = $translocation{$translocation_id}{'ref_chr2_chr1_part1_end'}; - my $ref_chr2_chr1_part1_strand = $translocation{$translocation_id}{'ref_chr2_chr1_part1_strand'}; - - my $ref_chr2_chr1_part2_chr = $translocation{$translocation_id}{'ref_chr2_chr1_part2_chr'}; - my $ref_chr2_chr1_part2_start = $translocation{$translocation_id}{'ref_chr2_chr1_part2_start'}; - my $ref_chr2_chr1_part2_end = $translocation{$translocation_id}{'ref_chr2_chr1_part2_end'}; - my $ref_chr2_chr1_part2_strand = $translocation{$translocation_id}{'ref_chr2_chr1_part2_strand'}; - - 
my $ref_chr1_chr2 = "${ref_chr1}_${ref_chr2}"; - my $ref_chr2_chr1 = "${ref_chr2}_${ref_chr1}"; - - if ($ref_chr1_chr2_part1_strand eq $ref_chr1_chr2_part2_strand) { - $ref_chr1_chr2_part2_end = length $$refseq_hashref{$ref_chr2}; - $ref_chr2_chr1_part2_end = length $$refseq_hashref{$ref_chr1}; - } else { - $ref_chr2_chr1_part1_end = length $$refseq_hashref{$ref_chr2}; - $ref_chr2_chr1_part2_end = length $$refseq_hashref{$ref_chr1}; - } - - my $ref_chr1_chr2_part1_seq = substr $$refseq_hashref{$ref_chr1}, $ref_chr1_chr2_part1_start - 1, $ref_chr1_chr2_part1_end - $ref_chr1_chr2_part1_start + 1; - my $ref_chr1_chr2_part2_seq = substr $$refseq_hashref{$ref_chr2}, $ref_chr1_chr2_part2_start - 1, $ref_chr1_chr2_part2_end - $ref_chr1_chr2_part2_start + 1; - my $ref_chr2_chr1_part1_seq = substr $$refseq_hashref{$ref_chr2}, $ref_chr2_chr1_part1_start - 1, $ref_chr2_chr1_part1_end - $ref_chr2_chr1_part1_start + 1; - my $ref_chr2_chr1_part2_seq = substr $$refseq_hashref{$ref_chr1}, $ref_chr2_chr1_part2_start - 1, $ref_chr2_chr1_part2_end - $ref_chr2_chr1_part2_start + 1; - - if ($ref_chr1_chr2_part1_strand ne $ref_chr1_chr2_part2_strand) { - $ref_chr1_chr2_part2_seq = revcom($ref_chr1_chr2_part2_seq); - $ref_chr2_chr1_part1_seq = revcom($ref_chr2_chr1_part1_seq); - } - - $$simseq_hashref{$ref_chr1_chr2} = $ref_chr1_chr2_part1_seq . $ref_chr1_chr2_part2_seq; - $$simseq_hashref{$ref_chr2_chr1} = $ref_chr2_chr1_part1_seq . $ref_chr2_chr1_part2_seq; - delete $$simseq_hashref{$ref_chr1}; - delete $$simseq_hashref{$ref_chr2}; - - if ($ref_chr1_chr2_part1_strand eq $ref_chr1_chr2_part2_strand) { - # translocation resolution: ++++ - # ref_chr1_chr2_part1 - $$ref2sim_map_hashref{$ref_chr1}{$ref_chr1_chr2_part1_end}{'ref_chr'} = $ref_chr1; - $$ref2sim_map_hashref{$ref_chr1}{$ref_chr1_chr2_part1_end}{'ref_start'} = $ref_chr1_chr2_part1_start; - $$ref2sim_map_hashref{$ref_chr1}{$ref_chr1_chr2_part1_end}{'ref_end'} = $ref_chr1_chr2_part1_end; - $$ref2sim_map_hashref{$ref_chr1}{$ref_chr1_chr2_part1_end}{'ref_strand'} = $ref_chr1_chr2_part1_strand; - my $ref_chr1_chr2_part1_end_ref_allele = substr $$refseq_hashref{$ref_chr1}, $ref_chr1_chr2_part1_end - 1, 1; - $$ref2sim_map_hashref{$ref_chr1}{$ref_chr1_chr2_part1_end}{'ref_allele'} = $ref_chr1_chr2_part1_end_ref_allele; - $$ref2sim_map_hashref{$ref_chr1}{$ref_chr1_chr2_part1_end}{'sim_chr'} = $ref_chr1_chr2; - $$ref2sim_map_hashref{$ref_chr1}{$ref_chr1_chr2_part1_end}{'sim_start'} = 1; - $$ref2sim_map_hashref{$ref_chr1}{$ref_chr1_chr2_part1_end}{'sim_end'} = $ref_chr1_chr2_part1_end; - $$ref2sim_map_hashref{$ref_chr1}{$ref_chr1_chr2_part1_end}{'sim_strand'} = $ref_chr1_chr2_part1_strand; - $$ref2sim_map_hashref{$ref_chr1}{$ref_chr1_chr2_part1_end}{'sim_allele'} = "$ref_chr1_chr2_part1_end_ref_allele\[$ref_chr2:$ref_chr1_chr2_part2_start\["; - $$ref2sim_map_hashref{$ref_chr1}{$ref_chr1_chr2_part1_end}{'variant_type'} = "TRA"; - $$ref2sim_map_hashref{$ref_chr1}{$ref_chr1_chr2_part1_end}{'variant_id'} = $translocation_id; - # ref_chr1_chr2_part2 - $$ref2sim_map_hashref{$ref_chr2}{$ref_chr1_chr2_part2_start}{'ref_chr'} = $ref_chr2; - $$ref2sim_map_hashref{$ref_chr2}{$ref_chr1_chr2_part2_start}{'ref_start'} = $ref_chr1_chr2_part2_start; - $$ref2sim_map_hashref{$ref_chr2}{$ref_chr1_chr2_part2_start}{'ref_end'} = $ref_chr1_chr2_part2_end; - $$ref2sim_map_hashref{$ref_chr2}{$ref_chr1_chr2_part2_start}{'ref_strand'} = $ref_chr1_chr2_part2_strand; - my $ref_chr1_chr2_part2_start_ref_allele = substr $$refseq_hashref{$ref_chr2}, $ref_chr1_chr2_part2_start - 1, 1; - 
$$ref2sim_map_hashref{$ref_chr2}{$ref_chr1_chr2_part2_start}{'ref_allele'} = $ref_chr1_chr2_part2_start_ref_allele; - $$ref2sim_map_hashref{$ref_chr2}{$ref_chr1_chr2_part2_start}{'sim_chr'} = $ref_chr1_chr2; - $$ref2sim_map_hashref{$ref_chr2}{$ref_chr1_chr2_part2_start}{'sim_start'} = $ref_chr1_chr2_part1_end + 1; - $$ref2sim_map_hashref{$ref_chr2}{$ref_chr1_chr2_part2_start}{'sim_end'} = $ref_chr1_chr2_part1_end + $ref_chr1_chr2_part2_end - $ref_chr1_chr2_part2_start + 1; - $$ref2sim_map_hashref{$ref_chr2}{$ref_chr1_chr2_part2_start}{'sim_strand'} = $ref_chr1_chr2_part2_strand; - $$ref2sim_map_hashref{$ref_chr2}{$ref_chr1_chr2_part2_start}{'sim_allele'} = "\]$ref_chr1:$ref_chr1_chr2_part1_end\]$ref_chr1_chr2_part2_start_ref_allele"; - $$ref2sim_map_hashref{$ref_chr2}{$ref_chr1_chr2_part2_start}{'variant_type'} = "TRA"; - $$ref2sim_map_hashref{$ref_chr2}{$ref_chr1_chr2_part2_start}{'variant_id'} = $translocation_id; - # ref_chr2_chr1_part1 - $$ref2sim_map_hashref{$ref_chr2}{$ref_chr2_chr1_part1_end}{'ref_chr'} = $ref_chr2; - $$ref2sim_map_hashref{$ref_chr2}{$ref_chr2_chr1_part1_end}{'ref_start'} = $ref_chr2_chr1_part1_start; - $$ref2sim_map_hashref{$ref_chr2}{$ref_chr2_chr1_part1_end}{'ref_end'} = $ref_chr2_chr1_part1_end; - $$ref2sim_map_hashref{$ref_chr2}{$ref_chr2_chr1_part1_end}{'ref_strand'} = $ref_chr2_chr1_part1_strand; - my $ref_chr2_chr1_part1_end_ref_allele = substr $$refseq_hashref{$ref_chr2}, $ref_chr2_chr1_part1_end - 1, 1; - $$ref2sim_map_hashref{$ref_chr2}{$ref_chr2_chr1_part1_end}{'ref_allele'} = $ref_chr2_chr1_part1_end_ref_allele; - $$ref2sim_map_hashref{$ref_chr2}{$ref_chr2_chr1_part1_end}{'sim_chr'} = $ref_chr2_chr1; - $$ref2sim_map_hashref{$ref_chr2}{$ref_chr2_chr1_part1_end}{'sim_start'} = 1; - $$ref2sim_map_hashref{$ref_chr2}{$ref_chr2_chr1_part1_end}{'sim_end'} = $ref_chr2_chr1_part1_end; - $$ref2sim_map_hashref{$ref_chr2}{$ref_chr2_chr1_part1_end}{'sim_strand'} = $ref_chr2_chr1_part1_strand; - $$ref2sim_map_hashref{$ref_chr2}{$ref_chr2_chr1_part1_end}{'sim_allele'} = "$ref_chr2_chr1_part1_end_ref_allele\[$ref_chr1:$ref_chr2_chr1_part2_start\["; - $$ref2sim_map_hashref{$ref_chr2}{$ref_chr2_chr1_part1_end}{'variant_type'} = "TRA"; - $$ref2sim_map_hashref{$ref_chr2}{$ref_chr2_chr1_part1_end}{'variant_id'} = $translocation_id; - # ref_chr2_chr1_part2 - $$ref2sim_map_hashref{$ref_chr1}{$ref_chr2_chr1_part2_start}{'ref_chr'} = $ref_chr1; - $$ref2sim_map_hashref{$ref_chr1}{$ref_chr2_chr1_part2_start}{'ref_start'} = $ref_chr2_chr1_part2_start; - $$ref2sim_map_hashref{$ref_chr1}{$ref_chr2_chr1_part2_start}{'ref_end'} = $ref_chr2_chr1_part2_end; - $$ref2sim_map_hashref{$ref_chr1}{$ref_chr2_chr1_part2_start}{'ref_strand'} = $ref_chr2_chr1_part2_strand; - my $ref_chr2_chr1_part2_start_ref_allele = substr $$refseq_hashref{$ref_chr1}, $ref_chr2_chr1_part2_start - 1, 1; - $$ref2sim_map_hashref{$ref_chr1}{$ref_chr2_chr1_part2_start}{'ref_allele'} = $ref_chr2_chr1_part2_start_ref_allele; - $$ref2sim_map_hashref{$ref_chr1}{$ref_chr2_chr1_part2_start}{'sim_chr'} = $ref_chr2_chr1; - $$ref2sim_map_hashref{$ref_chr1}{$ref_chr2_chr1_part2_start}{'sim_start'} = $ref_chr2_chr1_part1_end + 1; - $$ref2sim_map_hashref{$ref_chr1}{$ref_chr2_chr1_part2_start}{'sim_end'} = $ref_chr2_chr1_part1_end + $ref_chr2_chr1_part2_end - $ref_chr2_chr1_part2_start + 1; - $$ref2sim_map_hashref{$ref_chr1}{$ref_chr2_chr1_part2_start}{'sim_strand'} = $ref_chr2_chr1_part2_strand; - $$ref2sim_map_hashref{$ref_chr1}{$ref_chr2_chr1_part2_start}{'sim_allele'} = 
"\]$ref_chr2:$ref_chr2_chr1_part1_end\]$ref_chr2_chr1_part2_start_ref_allele"; - $$ref2sim_map_hashref{$ref_chr1}{$ref_chr2_chr1_part2_start}{'variant_type'} = "TRA"; - $$ref2sim_map_hashref{$ref_chr1}{$ref_chr2_chr1_part2_start}{'variant_id'} = $translocation_id; - } else { - # translocation resolution: +--+ - # ref_chr1_chr2_part1 - $$ref2sim_map_hashref{$ref_chr1}{$ref_chr1_chr2_part1_end}{'ref_chr'} = $ref_chr1; - $$ref2sim_map_hashref{$ref_chr1}{$ref_chr1_chr2_part1_end}{'ref_start'} = $ref_chr1_chr2_part1_start; - $$ref2sim_map_hashref{$ref_chr1}{$ref_chr1_chr2_part1_end}{'ref_end'} = $ref_chr1_chr2_part1_end; - $$ref2sim_map_hashref{$ref_chr1}{$ref_chr1_chr2_part1_end}{'ref_strand'} = $ref_chr1_chr2_part1_strand; - my $ref_chr1_chr2_part1_end_ref_allele = substr $$refseq_hashref{$ref_chr1}, $ref_chr1_chr2_part1_end - 1, 1; - $$ref2sim_map_hashref{$ref_chr1}{$ref_chr1_chr2_part1_end}{'ref_allele'} = $ref_chr1_chr2_part1_end_ref_allele; - $$ref2sim_map_hashref{$ref_chr1}{$ref_chr1_chr2_part1_end}{'sim_chr'} = $ref_chr1_chr2; - $$ref2sim_map_hashref{$ref_chr1}{$ref_chr1_chr2_part1_end}{'sim_start'} = 1; - $$ref2sim_map_hashref{$ref_chr1}{$ref_chr1_chr2_part1_end}{'sim_end'} = $ref_chr1_chr2_part1_end; - $$ref2sim_map_hashref{$ref_chr1}{$ref_chr1_chr2_part1_end}{'sim_strand'} = $ref_chr1_chr2_part1_strand; - $$ref2sim_map_hashref{$ref_chr1}{$ref_chr1_chr2_part1_end}{'sim_allele'} = "$ref_chr1_chr2_part1_end_ref_allele\]$ref_chr2:$ref_chr1_chr2_part2_end\]"; - $$ref2sim_map_hashref{$ref_chr1}{$ref_chr1_chr2_part1_end}{'variant_type'} = "TRA"; - $$ref2sim_map_hashref{$ref_chr1}{$ref_chr1_chr2_part1_end}{'variant_id'} = $translocation_id; - # ref_chr1_chr2_part2 - $$ref2sim_map_hashref{$ref_chr2}{$ref_chr1_chr2_part2_end}{'ref_chr'} = $ref_chr2; - $$ref2sim_map_hashref{$ref_chr2}{$ref_chr1_chr2_part2_end}{'ref_start'} = $ref_chr1_chr2_part2_start; - $$ref2sim_map_hashref{$ref_chr2}{$ref_chr1_chr2_part2_end}{'ref_end'} = $ref_chr1_chr2_part2_end; - $$ref2sim_map_hashref{$ref_chr2}{$ref_chr1_chr2_part2_end}{'ref_strand'} = $ref_chr1_chr2_part2_strand; - my $ref_chr1_chr2_part2_end_ref_allele = substr $$refseq_hashref{$ref_chr2}, $ref_chr1_chr2_part2_end - 1, 1; - $$ref2sim_map_hashref{$ref_chr2}{$ref_chr1_chr2_part2_end}{'ref_allele'} = $ref_chr1_chr2_part2_end_ref_allele; - $$ref2sim_map_hashref{$ref_chr2}{$ref_chr1_chr2_part2_end}{'sim_chr'} = $ref_chr1_chr2; - $$ref2sim_map_hashref{$ref_chr2}{$ref_chr1_chr2_part2_end}{'sim_start'} = $ref_chr1_chr2_part1_end + 1; - $$ref2sim_map_hashref{$ref_chr2}{$ref_chr1_chr2_part2_end}{'sim_end'} = $ref_chr1_chr2_part1_end + $ref_chr1_chr2_part2_end - $ref_chr1_chr2_part2_start + 1; - $$ref2sim_map_hashref{$ref_chr2}{$ref_chr1_chr2_part2_end}{'sim_strand'} = $ref_chr1_chr2_part2_strand; - $$ref2sim_map_hashref{$ref_chr2}{$ref_chr1_chr2_part2_end}{'sim_allele'} = "$ref_chr1_chr2_part2_end_ref_allele\]$ref_chr1:$ref_chr1_chr2_part1_end\]"; - $$ref2sim_map_hashref{$ref_chr2}{$ref_chr1_chr2_part2_end}{'variant_type'} = "TRA"; - $$ref2sim_map_hashref{$ref_chr2}{$ref_chr1_chr2_part2_end}{'variant_id'} = $translocation_id; - # ref_chr2_chr1_part1 - $$ref2sim_map_hashref{$ref_chr2}{$ref_chr2_chr1_part1_start}{'ref_chr'} = $ref_chr2; - $$ref2sim_map_hashref{$ref_chr2}{$ref_chr2_chr1_part1_start}{'ref_start'} = $ref_chr2_chr1_part1_start; - $$ref2sim_map_hashref{$ref_chr2}{$ref_chr2_chr1_part1_start}{'ref_end'} = $ref_chr2_chr1_part1_end; - $$ref2sim_map_hashref{$ref_chr2}{$ref_chr2_chr1_part1_start}{'ref_strand'} = $ref_chr2_chr1_part1_strand; - my 
$ref_chr2_chr1_part1_start_ref_allele = substr $$refseq_hashref{$ref_chr2}, $ref_chr2_chr1_part1_start - 1, 1; - $$ref2sim_map_hashref{$ref_chr2}{$ref_chr2_chr1_part1_start}{'ref_allele'} = $ref_chr2_chr1_part1_start_ref_allele; - $$ref2sim_map_hashref{$ref_chr2}{$ref_chr2_chr1_part1_start}{'sim_chr'} = $ref_chr2_chr1; - $$ref2sim_map_hashref{$ref_chr2}{$ref_chr2_chr1_part1_start}{'sim_start'} = 1; - $$ref2sim_map_hashref{$ref_chr2}{$ref_chr2_chr1_part1_start}{'sim_end'} = $ref_chr2_chr1_part1_end - $ref_chr2_chr1_part1_start + 1; - $$ref2sim_map_hashref{$ref_chr2}{$ref_chr2_chr1_part1_start}{'sim_strand'} = $ref_chr2_chr1_part1_strand; - $$ref2sim_map_hashref{$ref_chr2}{$ref_chr2_chr1_part1_start}{'sim_allele'} = "\[$ref_chr1:$ref_chr2_chr1_part2_start\[$ref_chr2_chr1_part1_start_ref_allele"; - $$ref2sim_map_hashref{$ref_chr2}{$ref_chr2_chr1_part1_start}{'variant_type'} = "TRA"; - $$ref2sim_map_hashref{$ref_chr2}{$ref_chr2_chr1_part1_start}{'variant_id'} = $translocation_id; - # ref_chr2_chr1_part2 - $$ref2sim_map_hashref{$ref_chr1}{$ref_chr2_chr1_part2_start}{'ref_chr'} = $ref_chr1; - $$ref2sim_map_hashref{$ref_chr1}{$ref_chr2_chr1_part2_start}{'ref_start'} = $ref_chr2_chr1_part2_start; - $$ref2sim_map_hashref{$ref_chr1}{$ref_chr2_chr1_part2_start}{'ref_end'} = $ref_chr2_chr1_part2_end; - $$ref2sim_map_hashref{$ref_chr1}{$ref_chr2_chr1_part2_start}{'ref_strand'} = $ref_chr2_chr1_part2_strand; - my $ref_chr2_chr1_part2_start_ref_allele = substr $$refseq_hashref{$ref_chr1}, $ref_chr2_chr1_part2_start - 1, 1; - $$ref2sim_map_hashref{$ref_chr1}{$ref_chr2_chr1_part2_start}{'ref_allele'} = $ref_chr2_chr1_part2_start_ref_allele; - $$ref2sim_map_hashref{$ref_chr1}{$ref_chr2_chr1_part2_start}{'sim_chr'} = $ref_chr2_chr1; - $$ref2sim_map_hashref{$ref_chr1}{$ref_chr2_chr1_part2_start}{'sim_start'} = $ref_chr2_chr1_part1_end - $ref_chr2_chr1_part1_start + 1 + 1; - $$ref2sim_map_hashref{$ref_chr1}{$ref_chr2_chr1_part2_start}{'sim_end'} = $ref_chr2_chr1_part1_end - $ref_chr2_chr1_part1_start + 1 + $ref_chr2_chr1_part2_end - $ref_chr2_chr1_part2_start + 1; - $$ref2sim_map_hashref{$ref_chr1}{$ref_chr2_chr1_part2_start}{'sim_strand'} = $ref_chr2_chr1_part2_strand; - $$ref2sim_map_hashref{$ref_chr1}{$ref_chr2_chr1_part2_start}{'sim_allele'} = "\[$ref_chr2:$ref_chr2_chr1_part1_start\[$ref_chr2_chr1_part2_start_ref_allele"; - $$ref2sim_map_hashref{$ref_chr1}{$ref_chr2_chr1_part2_start}{'variant_type'} = "TRA"; - $$ref2sim_map_hashref{$ref_chr1}{$ref_chr2_chr1_part2_start}{'variant_id'} = $translocation_id; - } - } -} - - -sub generate_output_files { - my ($prefix, $refseq_arrayref, $refseq_hashref, $simseq_hashref, $ref2sim_map_hashref, $excluded_refseq_hashref) = @_; - # output fasta.gz file for the simulated genome - my $output_simseq_fasta = "$prefix.fasta"; - my $output_simseq_fasta_fh = write_file($output_simseq_fasta); - if ((defined $translocation_vcf) or (defined $translocation_count)) { - foreach my $chr (sort keys %$simseq_hashref) { - if (exists $$simseq_hashref{$chr}) { - print $output_simseq_fasta_fh ">$chr\n$$simseq_hashref{$chr}\n"; - } - } - } else { - foreach my $chr (@$refseq_arrayref) { - if (exists $$simseq_hashref{$chr}) { - print $output_simseq_fasta_fh ">$chr\n$$simseq_hashref{$chr}\n"; - } - } - } - - foreach my $chr (@$refseq_arrayref) { - if (exists $$excluded_refseq_hashref{$chr}) { - print $output_simseq_fasta_fh ">$chr\n$$excluded_refseq_hashref{$chr}\n"; - } - } - - # generate the correspondance map for genomic variants introduced during simulation - print "Generating the 
correspondance map for genomic variants introduced during simulation:\n"; - my $output_ref2sim_map = "$prefix.bed"; - my $output_ref2sim_map_fh = write_file($output_ref2sim_map); - print $output_ref2sim_map_fh "ref_chr\tref_start\tref_end\tref_strand\tref_allele\tsim_chr\tsim_start\tsim_end\tref_strand\tsim_allele\tvariant_type\tvariant_id\tdonor_chr_in_ref\tdonor_start_in_ref\tdonor_end_in_ref\tdonor_strand_in_ref\tduplication_type\tinserted_copy_number\ttotal_copy_number\n"; - - if ((defined $translocation_vcf) or (defined $translocation_count)) { - foreach my $ref_chr (@$refseq_arrayref) { - if (not exists $$simseq_hashref{$ref_chr}) { - foreach my $ref_start (sort {$a <=> $b} keys %{$$ref2sim_map_hashref{$ref_chr}}) { - print $output_ref2sim_map_fh "$ref_chr\t"; - print $output_ref2sim_map_fh "$$ref2sim_map_hashref{$ref_chr}{$ref_start}{'ref_start'}\t"; - print $output_ref2sim_map_fh "$$ref2sim_map_hashref{$ref_chr}{$ref_start}{'ref_end'}\t"; - print $output_ref2sim_map_fh "$$ref2sim_map_hashref{$ref_chr}{$ref_start}{'ref_strand'}\t"; - print $output_ref2sim_map_fh "$$ref2sim_map_hashref{$ref_chr}{$ref_start}{'ref_allele'}\t"; - print $output_ref2sim_map_fh "$$ref2sim_map_hashref{$ref_chr}{$ref_start}{'sim_chr'}\t"; - print $output_ref2sim_map_fh "$$ref2sim_map_hashref{$ref_chr}{$ref_start}{'sim_start'}\t"; - print $output_ref2sim_map_fh "$$ref2sim_map_hashref{$ref_chr}{$ref_start}{'sim_end'}\t"; - print $output_ref2sim_map_fh "$$ref2sim_map_hashref{$ref_chr}{$ref_start}{'sim_strand'}\t"; - print $output_ref2sim_map_fh "$$ref2sim_map_hashref{$ref_chr}{$ref_start}{'sim_allele'}\t"; - print $output_ref2sim_map_fh "$$ref2sim_map_hashref{$ref_chr}{$ref_start}{'variant_type'}\t"; - print $output_ref2sim_map_fh "$$ref2sim_map_hashref{$ref_chr}{$ref_start}{'variant_id'}\t"; - if ($$ref2sim_map_hashref{$ref_chr}{$ref_start}{'variant_type'} eq "DUP") { - print $output_ref2sim_map_fh "$$ref2sim_map_hashref{$ref_chr}{$ref_start}{'donor_chr_in_ref'}\t"; - print $output_ref2sim_map_fh "$$ref2sim_map_hashref{$ref_chr}{$ref_start}{'donor_start_in_ref'}\t"; - print $output_ref2sim_map_fh "$$ref2sim_map_hashref{$ref_chr}{$ref_start}{'donor_end_in_ref'}\t"; - print $output_ref2sim_map_fh "$$ref2sim_map_hashref{$ref_chr}{$ref_start}{'donor_strand_in_ref'}\t"; - print $output_ref2sim_map_fh "$$ref2sim_map_hashref{$ref_chr}{$ref_start}{'duplication_type'}\t"; - print $output_ref2sim_map_fh "$$ref2sim_map_hashref{$ref_chr}{$ref_start}{'inserted_copy_number'}\t"; - print $output_ref2sim_map_fh "$$ref2sim_map_hashref{$ref_chr}{$ref_start}{'total_copy_number'}\n"; - } else { - print $output_ref2sim_map_fh ".\t.\t.\t.\t.\t.\t.\n"; - } - } - } - } - } else { - foreach my $ref_chr (@$refseq_arrayref) { - if (exists $$simseq_hashref{$ref_chr}) { - foreach my $ref_start (sort {$a <=> $b} keys %{$$ref2sim_map_hashref{$ref_chr}}) { - print $output_ref2sim_map_fh "$ref_chr\t"; - print $output_ref2sim_map_fh "$$ref2sim_map_hashref{$ref_chr}{$ref_start}{'ref_start'}\t"; - print $output_ref2sim_map_fh "$$ref2sim_map_hashref{$ref_chr}{$ref_start}{'ref_end'}\t"; - print $output_ref2sim_map_fh "$$ref2sim_map_hashref{$ref_chr}{$ref_start}{'ref_strand'}\t"; - print $output_ref2sim_map_fh "$$ref2sim_map_hashref{$ref_chr}{$ref_start}{'ref_allele'}\t"; - print $output_ref2sim_map_fh "$$ref2sim_map_hashref{$ref_chr}{$ref_start}{'sim_chr'}\t"; - print $output_ref2sim_map_fh "$$ref2sim_map_hashref{$ref_chr}{$ref_start}{'sim_start'}\t"; - print $output_ref2sim_map_fh "$$ref2sim_map_hashref{$ref_chr}{$ref_start}{'sim_end'}\t"; - 
print $output_ref2sim_map_fh "$$ref2sim_map_hashref{$ref_chr}{$ref_start}{'sim_strand'}\t"; - print $output_ref2sim_map_fh "$$ref2sim_map_hashref{$ref_chr}{$ref_start}{'sim_allele'}\t"; - print $output_ref2sim_map_fh "$$ref2sim_map_hashref{$ref_chr}{$ref_start}{'variant_type'}\t"; - print $output_ref2sim_map_fh "$$ref2sim_map_hashref{$ref_chr}{$ref_start}{'variant_id'}\t"; - if ($$ref2sim_map_hashref{$ref_chr}{$ref_start}{'variant_type'} eq "DUP") { - print $output_ref2sim_map_fh "$$ref2sim_map_hashref{$ref_chr}{$ref_start}{'donor_chr_in_ref'}\t"; - print $output_ref2sim_map_fh "$$ref2sim_map_hashref{$ref_chr}{$ref_start}{'donor_start_in_ref'}\t"; - print $output_ref2sim_map_fh "$$ref2sim_map_hashref{$ref_chr}{$ref_start}{'donor_end_in_ref'}\t"; - print $output_ref2sim_map_fh "$$ref2sim_map_hashref{$ref_chr}{$ref_start}{'donor_strand_in_ref'}\t"; - print $output_ref2sim_map_fh "$$ref2sim_map_hashref{$ref_chr}{$ref_start}{'duplication_type'}\t"; - print $output_ref2sim_map_fh "$$ref2sim_map_hashref{$ref_chr}{$ref_start}{'inserted_copy_number'}\t"; - print $output_ref2sim_map_fh "$$ref2sim_map_hashref{$ref_chr}{$ref_start}{'total_copy_number'}\n"; - } else { - print $output_ref2sim_map_fh ".\t.\t.\t.\t.\t.\t.\n"; - } - } - } - } - } - - # generate reference-based vcf file for genomic variants introduced during simulation - print "\nGenerating reference-based vcf file for genomic variants introduced during simulation:\n"; - my $gmt_time = gmtime(); - if ((defined $snp_vcf) or (defined $snp_count)) { - my $output_ref2sim_snp_vcf = "$prefix.snp.vcf"; - my $output_ref2sim_snp_vcf_fh = write_file($output_ref2sim_snp_vcf); - print $output_ref2sim_snp_vcf_fh "##fileformat=VCFv4.1\n"; - print $output_ref2sim_snp_vcf_fh "##fileDate=$gmt_time (GMT time)\n"; - print $output_ref2sim_snp_vcf_fh "##source=simuG.pl\n"; - print $output_ref2sim_snp_vcf_fh "##INFO=\n"; - print $output_ref2sim_snp_vcf_fh "##INFO=\n"; - print $output_ref2sim_snp_vcf_fh "##INFO=\n"; - print $output_ref2sim_snp_vcf_fh "##INFO=\n"; - print $output_ref2sim_snp_vcf_fh "##INFO=\n"; - print $output_ref2sim_snp_vcf_fh "##INFO=\n"; - print $output_ref2sim_snp_vcf_fh "##INFO=\n"; - print $output_ref2sim_snp_vcf_fh "##INFO=\n"; - print $output_ref2sim_snp_vcf_fh "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n"; - foreach my $ref_chr (@$refseq_arrayref) { - if (exists $$simseq_hashref{$ref_chr}) { - foreach my $ref_start (sort {$a <=> $b} keys %{$$ref2sim_map_hashref{$ref_chr}}) { - if ($$ref2sim_map_hashref{$ref_chr}{$ref_start}{'variant_type'} eq "SNP") { - my $ref_end = $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'ref_end'}; - my $ref_allele = $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'ref_allele'}; - my $sim_allele = $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'sim_allele'}; - my $sim_chr = $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'sim_chr'}; - my $sim_start = $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'sim_start'}; - my $sim_end = $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'sim_end'}; - my $variant_type = $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'variant_type'}; - my $variant_id = $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'variant_id'}; - print $output_ref2sim_snp_vcf_fh "$ref_chr\t$ref_start\t.\t"; - print $output_ref2sim_snp_vcf_fh "$ref_allele\t$sim_allele\t.\t.\tvariant_type=$variant_type;ref_chr=$ref_chr;ref_start=$ref_start;ref_end=$ref_end;sim_chr=$sim_chr;sim_start=$sim_start;sim_end=$sim_end\n"; - } - } - } - } - } - if ((defined $indel_vcf) or (defined $indel_count)) { - my $output_ref2sim_indel_vcf = 
"$prefix.indel.vcf"; - my $output_ref2sim_indel_vcf_fh = write_file($output_ref2sim_indel_vcf); - print $output_ref2sim_indel_vcf_fh "##fileformat=VCFv4.1\n"; - print $output_ref2sim_indel_vcf_fh "##fileDate=$gmt_time (GMT time)\n"; - print $output_ref2sim_indel_vcf_fh "##source=simuG.pl\n"; - print $output_ref2sim_indel_vcf_fh "##INFO=\n"; - print $output_ref2sim_indel_vcf_fh "##INFO=\n"; - print $output_ref2sim_indel_vcf_fh "##INFO=\n"; - print $output_ref2sim_indel_vcf_fh "##INFO=\n"; - print $output_ref2sim_indel_vcf_fh "##INFO=\n"; - print $output_ref2sim_indel_vcf_fh "##INFO=\n"; - print $output_ref2sim_indel_vcf_fh "##INFO=\n"; - print $output_ref2sim_indel_vcf_fh "##INFO=\n"; - print $output_ref2sim_indel_vcf_fh "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n"; - foreach my $ref_chr (@$refseq_arrayref) { - if (exists $$simseq_hashref{$ref_chr}) { - foreach my $ref_start (sort {$a <=> $b} keys %{$$ref2sim_map_hashref{$ref_chr}}) { - if ($$ref2sim_map_hashref{$ref_chr}{$ref_start}{'variant_type'} eq "INDEL") { - my $ref_end = $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'ref_end'}; - my $ref_allele = $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'ref_allele'}; - my $sim_allele = $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'sim_allele'}; - my $sim_chr = $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'sim_chr'}; - my $sim_start = $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'sim_start'}; - my $sim_end = $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'sim_end'}; - my $variant_type = $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'variant_type'}; - my $variant_id = $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'variant_id'}; - print $output_ref2sim_indel_vcf_fh "$ref_chr\t$ref_start\t.\t"; - print $output_ref2sim_indel_vcf_fh "$ref_allele\t$sim_allele\t.\t.\tvariant_type=$variant_type;ref_chr=$ref_chr;ref_start=$ref_start;ref_end=$ref_end;sim_chr=$sim_chr;sim_start=$sim_start;sim_end=$sim_end\n"; - } - } - } - } - } - if ((defined $cnv_vcf) or (defined $cnv_count)) { - my $output_ref2sim_cnv_vcf = "$prefix.vcf"; - my $output_ref2sim_cnv_vcf_fh = write_file($output_ref2sim_cnv_vcf); - print $output_ref2sim_cnv_vcf_fh "##fileformat=VCFv4.1\n"; - print $output_ref2sim_cnv_vcf_fh "##fileDate=$gmt_time (GMT time)\n"; - print $output_ref2sim_cnv_vcf_fh "##source=simuG.pl\n"; - print $output_ref2sim_cnv_vcf_fh "##INFO=\n"; - print $output_ref2sim_cnv_vcf_fh "##INFO=\n"; - print $output_ref2sim_cnv_vcf_fh "##INFO=\n"; - print $output_ref2sim_cnv_vcf_fh "##INFO=\n"; - print $output_ref2sim_cnv_vcf_fh "##INFO=\n"; - print $output_ref2sim_cnv_vcf_fh "##INFO=\n"; - print $output_ref2sim_cnv_vcf_fh "##INFO=\n"; - print $output_ref2sim_cnv_vcf_fh "##INFO=\n"; - print $output_ref2sim_cnv_vcf_fh "##INFO=\n"; - print $output_ref2sim_cnv_vcf_fh "##INFO=\n"; - print $output_ref2sim_cnv_vcf_fh "##INFO=\n"; - print $output_ref2sim_cnv_vcf_fh "##INFO=\n"; - print $output_ref2sim_cnv_vcf_fh "##INFO=\n"; - print $output_ref2sim_cnv_vcf_fh "##INFO=\n"; - print $output_ref2sim_cnv_vcf_fh "##INFO=\n"; - print $output_ref2sim_cnv_vcf_fh "##INFO=\n"; - print $output_ref2sim_cnv_vcf_fh "##INFO=\n"; - print $output_ref2sim_cnv_vcf_fh "##INFO=\n"; - print $output_ref2sim_cnv_vcf_fh "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n"; - foreach my $ref_chr (@$refseq_arrayref) { - if (exists $$simseq_hashref{$ref_chr}) { - foreach my $ref_start (sort {$a <=> $b} keys %{$$ref2sim_map_hashref{$ref_chr}}) { - if ($$ref2sim_map_hashref{$ref_chr}{$ref_start}{'variant_type'} =~ /(DEL|DUP)/) { - my $ref_end = 
$$ref2sim_map_hashref{$ref_chr}{$ref_start}{'ref_end'}; - my $ref_allele = $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'ref_allele'}; - my $sim_chr = $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'sim_chr'}; - my $sim_start = $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'sim_start'}; - my $sim_end = $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'sim_end'}; - my $sim_allele = $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'sim_allele'}; - my $variant_type = $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'variant_type'}; - my $variant_id = $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'variant_id'}; - if ($variant_type eq "DEL") { - print $output_ref2sim_cnv_vcf_fh "$ref_chr\t$ref_start\t.\t"; - print $output_ref2sim_cnv_vcf_fh "$ref_allele\t$sim_allele\t.\t.\tSVTYPE=DEL;EVENT=$variant_id;END=$ref_end\n"; - } else { - my $donor_chr_in_ref = $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'donor_chr_in_ref'}; - my $donor_start_in_ref = $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'donor_start_in_ref'}; - my $donor_end_in_ref = $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'donor_end_in_ref'}; - my $donor_strand_in_ref = $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'donor_strand_in_ref'}; - my $duplication_type = $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'duplication_type'}; - my $total_copy_number; - if (exists $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'total_copy_number'}) { - $total_copy_number = $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'total_copy_number'}; - } - my $inserted_copy_number; - if ($duplication_type eq "dispersed_duplication") { - $inserted_copy_number = 1; - } else { - $inserted_copy_number = $total_copy_number - 1; - } - my $ref_start_after_duplication = $ref_start + 1; - my $ref_allele_after_duplication = substr $$refseq_hashref{$ref_chr}, $ref_end, 1; - if ($donor_strand_in_ref eq "+") { - print $output_ref2sim_cnv_vcf_fh "$ref_chr\t$ref_start\t.\t"; - print $output_ref2sim_cnv_vcf_fh "$ref_allele\t$ref_allele\[$donor_chr_in_ref:$donor_start_in_ref\[\t.\t.\tSVTYPE=BND;EVENT=$variant_id;duplication_type=$duplication_type;inserted_copy_number=$inserted_copy_number;total_copy_number=$total_copy_number\n"; - print $output_ref2sim_cnv_vcf_fh "$ref_chr\t$ref_start_after_duplication\t.\t"; - print $output_ref2sim_cnv_vcf_fh "$ref_allele_after_duplication\t\]$donor_chr_in_ref:$donor_end_in_ref\]$ref_allele_after_duplication\t.\t.\tSVTYPE=BND;EVENT=$variant_id;duplication_type=$duplication_type;inserted_copy_number=$inserted_copy_number;total_copy_number=$total_copy_number\n"; - } else { - print $output_ref2sim_cnv_vcf_fh "$ref_chr\t$ref_start\t.\t"; - print $output_ref2sim_cnv_vcf_fh "$ref_allele\t$ref_allele\]$donor_chr_in_ref:$donor_end_in_ref\]\t.\t.\tSVTYPE=BND;EVENT=$variant_id;duplication_type=$duplication_type;inserted_copy_number=$inserted_copy_number;total_copy_number=$total_copy_number\n"; - print $output_ref2sim_cnv_vcf_fh "$ref_chr\t$ref_start_after_duplication\t.\t"; - print $output_ref2sim_cnv_vcf_fh "$ref_allele_after_duplication\t\[$donor_chr_in_ref:$donor_start_in_ref\[$ref_allele_after_duplication\t.\t.\tSVTYPE=BND;EVENT=$variant_id;duplication_type=$duplication_type;inserted_copy_number=$inserted_copy_number;total_copy_number=$total_copy_number\n"; - } - } - } - } - } - } - } - if ((defined $inversion_vcf) or (defined $inversion_count)) { - my $output_ref2sim_inversion_vcf = "$prefix.vcf"; - my $output_ref2sim_inversion_vcf_fh = write_file($output_ref2sim_inversion_vcf); - print $output_ref2sim_inversion_vcf_fh "##fileformat=VCFv4.1\n"; - print 
$output_ref2sim_inversion_vcf_fh "##fileDate=$gmt_time (GMT time)\n"; - print $output_ref2sim_inversion_vcf_fh "##source=simuG.pl\n"; - print $output_ref2sim_inversion_vcf_fh "##INFO=\n"; - print $output_ref2sim_inversion_vcf_fh "##INFO=\n"; - print $output_ref2sim_inversion_vcf_fh "##INFO=\n"; - print $output_ref2sim_inversion_vcf_fh "##INFO=\n"; - print $output_ref2sim_inversion_vcf_fh "##INFO=\n"; - print $output_ref2sim_inversion_vcf_fh "##INFO=\n"; - print $output_ref2sim_inversion_vcf_fh "##INFO=\n"; - print $output_ref2sim_inversion_vcf_fh "##INFO=\n"; - print $output_ref2sim_inversion_vcf_fh "##INFO=\n"; - print $output_ref2sim_inversion_vcf_fh "##INFO=\n"; - print $output_ref2sim_inversion_vcf_fh "##INFO=\n"; - print $output_ref2sim_inversion_vcf_fh "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n"; - foreach my $ref_chr (@$refseq_arrayref) { - if (exists $$simseq_hashref{$ref_chr}) { - foreach my $ref_start (sort {$a <=> $b} keys %{$$ref2sim_map_hashref{$ref_chr}}) { - if ($$ref2sim_map_hashref{$ref_chr}{$ref_start}{'variant_type'} eq "INV") { - my $ref_end = $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'ref_end'}; - my $ref_allele = $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'ref_allele'}; - my $sim_chr = $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'sim_chr'}; - my $sim_start = $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'sim_start'}; - my $sim_end = $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'sim_end'}; - my $sim_allele = $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'sim_allele'}; - my $variant_type = $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'variant_type'}; - my $variant_id = $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'variant_id'}; - print $output_ref2sim_inversion_vcf_fh "$ref_chr\t$ref_start\t.\t"; - print $output_ref2sim_inversion_vcf_fh "$ref_allele\t$sim_allele\t.\t.\tvariant_type=$variant_type;variant_id=$variant_id;SVTYPE=INV;EVENT=$variant_id;END=$ref_end\n"; - } - } - } - } - } - if ((defined $translocation_vcf) or (defined $translocation_count)) { - my $output_ref2sim_translocation_vcf = "$prefix.vcf"; - my $output_ref2sim_translocation_vcf_fh = write_file($output_ref2sim_translocation_vcf); - print $output_ref2sim_translocation_vcf_fh "##fileformat=VCFv4.1\n"; - print $output_ref2sim_translocation_vcf_fh "##fileDate=$gmt_time (GMT time)\n"; - print $output_ref2sim_translocation_vcf_fh "##source=simuG.pl\n"; - print $output_ref2sim_translocation_vcf_fh "##INFO=\n"; - print $output_ref2sim_translocation_vcf_fh "##INFO=\n"; - print $output_ref2sim_translocation_vcf_fh "##INFO=\n"; - print $output_ref2sim_translocation_vcf_fh "##INFO=\n"; - print $output_ref2sim_translocation_vcf_fh "##INFO=\n"; - print $output_ref2sim_translocation_vcf_fh "##INFO=\n"; - print $output_ref2sim_translocation_vcf_fh "##INFO=\n"; - print $output_ref2sim_translocation_vcf_fh "##INFO=\n"; - print $output_ref2sim_translocation_vcf_fh "##INFO=\n"; - print $output_ref2sim_translocation_vcf_fh "##INFO=\n"; - print $output_ref2sim_translocation_vcf_fh "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n"; - foreach my $ref_chr (@$refseq_arrayref) { - if (not exists $$simseq_hashref{$ref_chr}) { - foreach my $ref_start (sort {$a <=> $b} keys %{$$ref2sim_map_hashref{$ref_chr}}) { - if ($$ref2sim_map_hashref{$ref_chr}{$ref_start}{'variant_type'} eq "TRA") { - my $ref_end = $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'ref_end'}; - my $ref_allele = $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'ref_allele'}; - my $sim_chr = $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'sim_chr'}; - my $sim_start = 
$$ref2sim_map_hashref{$ref_chr}{$ref_start}{'sim_start'}; - my $sim_end = $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'sim_end'}; - my $sim_allele = $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'sim_allele'}; - my $variant_type = $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'variant_type'}; - my $variant_id = $$ref2sim_map_hashref{$ref_chr}{$ref_start}{'variant_id'}; - print $output_ref2sim_translocation_vcf_fh "$ref_chr\t$ref_start\t.\t"; - print $output_ref2sim_translocation_vcf_fh "$ref_allele\t$sim_allele\t.\t.\tvariant_type=$variant_type;variant_id=$variant_id;SVTYPE=BND;EVENT=$variant_id\n"; - } - } - } - } - } -} - - -#----------------------------------------------------------------- -#---------------- Documentation / Usage / Help ------------------ - -=head1 NAME - -simuG.pl - simulate genome sequences with designated or random genomic variants of full spectrum (SNPs, INDELs, CNVs, inversions, translocations). - -=head1 SYNOPSIS - -perl simuG.pl [options] [file ...] - -=head1 OPTIONS - -=over 8 - -=item B<-help> or B<-h> - -Print help message. Example: -h. - -=item B<-man> or B<-m> - -Print more detailed help message. Example: -m. - -=item B<-version> or B<-v> - -Print version information. Example: -v. - -=item B<-refseq> or B<-r> - -Specify the reference genome to be used as the template (in fasta or fasta.gz format). This option is mandatory. Default = "". Example: -refseq ref.genome.fa(.gz). - -=item B<-snp_vcf> - -Specify the list of exact SNP variants to be introduced (in vcf or vcf.gz format). When specified, the options '-snp_count', '-snp_model', and '-titv_ratio' will be ignored. If there are also INDEL variants in the vcf file, they will be automatically ignored. Default = "". Example: -snp_vcf snp.vcf(.gz). - -=item B<-snp_count> - -Specify the number of SNP variants to be introduced. Default = "". Example: -snp_count 5000. - -=item B<-snp_model> - -Specify the SNP model file generated by the ancillary script vcf2model.pl. When specified, the option '-titv_ratio' will be ignored. Default = "". Example: -snp_model snp_model.txt. - -=item B<-titv_ratio> - -Specify the Ti/Tv ratio (transition/transversion ratio) used for simulate SNP variants. Default = 0.5. Example: -titv_ratio 2.0. For transition only, set '-titv_ratio Inf'. For transversion only, set '-titv_ratio 0'. - -=item B<-coding_partition_for_snp_simulation> - -Specify the coding partition (e.g. 'noncoding', 'coding', '2d' or '4d') used for constraining simulate SNP variants within the specified sites. This option needs to be used together with the option '-gene_gff'. Default = "". Example: -coding_partition_for_snp_simulation 4d for simulating SNP only in 4-fold degenerate (4d) sites. - -=item B<-indel_vcf> - -Specify the list of exact INDEL variants to be introduced (in vcf or vcf.gz format). When specified, the options '-indel_count', '-indel_model', '-ins_del_ratio', '-indel_size_powerlaw_alpha', and '-indel_size_powerlaw_constant' will be ignored. If there are also SNP variants in the vcf file, they will be automatically ignored. Default = "". Example: -indel_vcf indel.vcf(.gz). - -=item B<-indel_count> - -Specify the number of INDEL variants to be introduced. Default = "". Example: -indel_count 500. - -=item B<-indel_model> - -Specify the INDEL model file generated by the ancillary script vcf2model.pl. When specified, the options '-ins_del_ratio', '-indel_size_powerlaw_alpha', and '-indel_size_powerlaw_constant' will be ignored. Default = "". Example: -indel_model indel_model.txt. 
- -=item B<-ins_del_ratio> - -Specify the Insertion/Deletion ratio used for simulating INDEL variants. Default = 1.0. Example: -ins_del_ratio 1.0. For insertion only, set '-ins_del_ratio Inf'. For deletion only, set '-ins_del_ratio 0'. - -=item B<-indel_size_powerlaw_alpha> - -Specify the exponent factor alpha for power-law-fitted indel size distribution: p = C * (size) ** (-alpha) for size >= 1 & size <= 50. Default = 2.0. Example: -indel_size_powerlaw_alpha 2.0. - -=item B<-indel_size_powerlaw_constant> - -Specify the constant (C) for power-law-fitted indel size distribution: p = C * (size) ** (-alpha) for size >= 1 & size <= 50. Default = 0.5. Example: -indel_size_powerlaw_constant 0.5. (See the sketch after this option list for how this sampling can be realized.) - -=item B<-cnv_vcf> - -Specify the list of exact CNV variants to be introduced (in vcf or vcf.gz format). When specified, the options '-cnv_count', '-cnv_gain_loss_ratio', '-cnv_max_copy_number', '-cnv_min_size', and '-cnv_max_size' will be ignored. Default = "". Example: -cnv_vcf cnv.vcf. - -=item B<-cnv_count> - -Specify the number of CNV variants to be introduced. Default = "". Example: -cnv_count 50. - -=item B<-cnv_gain_loss_ratio> - -Specify the relative ratio of DNA gain over DNA loss. Default = 1.0. Example: -cnv_gain_loss_ratio 1.0. For copy number gain only, set '-cnv_gain_loss_ratio Inf'. For copy number loss only, set '-cnv_gain_loss_ratio 0'. - -=item B<-cnv_max_copy_number> - -Specify the maximal copy number for CNV. Default = 10. Example: -cnv_max_copy_number 10. - -=item B<-cnv_min_size> - -Specify the minimal size (in basepair) for CNV variants. Default = 100. Example: -cnv_min_size 100. - -=item B<-cnv_max_size> - -Specify the maximal size (in basepair) for CNV variants. Default = 100000. Example: -cnv_max_size 100000. - -=item B<-duplication_tandem_dispersed_ratio> - -Specify the expected frequency ratio between tandem duplication and dispersed duplication for CNV variants. Default = 1. Example: -duplication_tandem_dispersed_ratio 1. For simulating tandem duplication only, set '-duplication_tandem_dispersed_ratio Inf'. For simulating dispersed duplication only, set '-duplication_tandem_dispersed_ratio 0'. - -=item B<-inversion_vcf> - -Specify the list of exact inversions to be introduced (in vcf or vcf.gz format). When specified, the options '-inversion_count', '-inversion_min_size', '-inversion_max_size', and '-inversion_breakpoint_gff' will be ignored. Default = "". Example: -inversion_vcf inversion.vcf(.gz). - -=item B<-inversion_count> - -Specify the number of inversions to be introduced. Default = "". Example: -inversion_count 5. - -=item B<-inversion_min_size> - -Specify the minimal size (in basepair) for inversion. Default = 1000. Example: -inversion_min_size 1000. - -=item B<-inversion_max_size> - -Specify the maximal size (in basepair) for inversion. Default = 100000. Example: -inversion_max_size 100000. - -=item B<-inversion_breakpoint_gff> - -Specify the list of potential breakpoints for triggering inversions (in gff3 or gff3.gz format). Default = "". Example: -inversion_breakpoint_gff inversion_breakpoint.gff(.gz). - -=item B<-translocation_vcf> - -Specify the list of exact translocations to be introduced (in vcf or vcf.gz format). When specified, the options '-translocation_count' and '-sv_breakpoint_gff' will be ignored. Default = "". Example: -translocation_vcf translocation.vcf(.gz). - -=item B<-translocation_count> - -Specify the number of translocations to be introduced. Default = "". Example: -translocation_count 1. 
- -=item B<-translocation_breakpoint_gff> - -Specify the list of potential breakpoints for triggering translocations (in gff3 or gff3.gz format). Default = "". Example: -translocation_breakpoint_gff translocation_breakpoint.gff(.gz). - -=item B<-centromere_gff> - -Specify centromeres for constraining the random CNV, inversion, and translocation simulation (in gff3 or gff3.gz format). When this option is applied, those potential CNVs that will induce centromere deletion/duplication as well as the potential inversions and translocations with breakpoints overlapping defined centromeres will be disallowed. Also, potential translocations that will induce dicentric rearranged chromosomes will be disallowed as well. Default = "". Example: -centromere_gff centromere.gff(.gz). - -=item B<-gene_gff> - -Specify genes for constraining the random SNP, CNV, inversion, and translocation simulation (in gff3 or gff3.gz format). For random SNP simulation, this option needs to be used together with the option '-coding_partition_for_snp_simulation' to constrain SNP simulation only in noncoding regions, coding regions, 2-fold degenerate (2d) sites or 4-fold degenerate (4d) sites. For random CNV, inversion, and translocation simulation, applying this option will disallow simulated breakpoints falling on the defined genes. Default = "". Example: -gene_gff gene.gff(.gz). - -=item B<-excluded_chr_list> - -Specify the name of chromosome(s) to be excluded for introducing genomic variants (a single-column list file in txt format). Default = "". Example: -excluded_chr_list excluded_chr_list.txt. - -=item B<-seed> or B<-s> - -Specify an integer as the random seed for the simulation. It is recommended to set a very big integer for ideal randomness. Default = "". Example: -seed 201812201903. - -=item B<-prefix> or B<-p> - -Specify the prefix for output files. Default = "output_prefix". Example: -prefix sim. - -=back - -=head1 DESCRIPTION - -B can simulate genome sequences with designated or random genomic variants of full spectrum (e.g. SNPs, INDELs, CNVs, inversions, and translocations). 
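The random-variant options documented above (`-titv_ratio`, `-ins_del_ratio`, `-indel_size_powerlaw_alpha`, `-indel_size_powerlaw_constant`) amount to weighted random draws. The Python sketch below only illustrates that sampling logic under the documented defaults; it is not simuG's Perl implementation, and the helper names (`random_snp`, `random_indel_type`, `random_indel_size`) are invented for this example. Note that the constant C drops out once the per-size probabilities are normalized.

```python
# Minimal sketch of the weighted draws the simuG options describe; assumes
# uppercase A/C/G/T reference bases. Not the actual simuG code.
import math
import random

TRANSITIONS = {"A": "G", "G": "A", "C": "T", "T": "C"}
TRANSVERSIONS = {"A": "CT", "G": "CT", "C": "AG", "T": "AG"}

def random_snp(ref_base: str, titv_ratio: float, rng: random.Random) -> str:
    """Pick an alternate base so that transitions:transversions ~ titv_ratio."""
    if math.isinf(titv_ratio) or rng.random() < titv_ratio / (titv_ratio + 1):
        return TRANSITIONS[ref_base]          # transition
    return rng.choice(TRANSVERSIONS[ref_base])  # one of the two transversions

def random_indel_type(ins_del_ratio: float, rng: random.Random) -> str:
    """Pick insertion vs deletion according to the ins/del ratio."""
    if math.isinf(ins_del_ratio) or rng.random() < ins_del_ratio / (ins_del_ratio + 1):
        return "insertion"
    return "deletion"

def random_indel_size(alpha: float, rng: random.Random, max_size: int = 50) -> int:
    """Draw a size from p proportional to size**(-alpha) for 1 <= size <= max_size."""
    sizes = range(1, max_size + 1)
    weights = [s ** (-alpha) for s in sizes]   # C cancels after normalization
    return rng.choices(sizes, weights=weights, k=1)[0]

rng = random.Random(201812201903)              # seeded, as with -seed
print(random_snp("A", titv_ratio=0.5, rng=rng))
print(random_indel_type(1.0, rng), random_indel_size(2.0, rng))
```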
- -=head1 AUTHOR - -B (GitHub ID: yjx1217) - -=head1 VERSION - -B v1.0.0 - -=cut diff --git a/harpy/simulate.py b/harpy/simulate.py index 5c082a81e..ef1bfa49b 100644 --- a/harpy/simulate.py +++ b/harpy/simulate.py @@ -172,6 +172,7 @@ def linkedreads(genome_hap1, genome_hap2, output_dir, outer_distance, mutation_r fetch_script(workflowdir, "HaploSim.pl") os.makedirs(f"{output_dir}/logs/snakemake", exist_ok = True) sm_log = snakemake_log(output_dir, "simulate_linkedreads") + conda_envs = ["simulations"] configs = { "workflow" : "simulate linkedreads", "snakemake_log" : sm_log, @@ -184,6 +185,7 @@ def linkedreads(genome_hap1, genome_hap2, output_dir, outer_distance, mutation_r "partitions" : partitions, "molecules_per_partition" : molecules_per, "workflow_call" : command.rstrip(), + "conda_environments" : conda_envs, 'barcodes': { "file": Path(barcodes).resolve().as_posix() if barcodes else f"{workflowdir}/input/haplotag_barcodes.txt", "length": bc_len if barcodes else 24 @@ -196,7 +198,7 @@ def linkedreads(genome_hap1, genome_hap2, output_dir, outer_distance, mutation_r with open(os.path.join(workflowdir, 'config.yaml'), "w", encoding="utf-8") as config: yaml.dump(configs, config, default_flow_style= False, sort_keys=False, width=float('inf')) - create_conda_recipes() + create_conda_recipes(output_dir, conda_envs) if setup_only: sys.exit(0) @@ -295,9 +297,9 @@ def snpindel(genome, snp_vcf, indel_vcf, only_vcf, output_dir, prefix, snp_count start_text.add_row("Heterozygosity:", f"{heterozygosity}") fetch_rule(workflowdir, "simulate_snpindel.smk") - fetch_script(workflowdir, "simuG.pl") os.makedirs(f"{output_dir}/logs/snakemake", exist_ok = True) sm_log = snakemake_log(output_dir, "simulate_snpindel") + conda_envs = ["simulations"] configs = { "workflow" : "simulate snpindel", "snakemake_log" : sm_log, @@ -322,6 +324,7 @@ def snpindel(genome, snp_vcf, indel_vcf, only_vcf, output_dir, prefix, snp_count **({"size_constant" : indel_size_constant} if indel_size_constant and not indel_vcf else {}) }, "workflow_call" : command.rstrip(), + "conda_environments" : conda_envs, "inputs" : { "genome" : Path(genome).resolve().as_posix(), **({"centromeres" : Path(centromeres).resolve().as_posix()} if centromeres else {}), @@ -332,7 +335,7 @@ def snpindel(genome, snp_vcf, indel_vcf, only_vcf, output_dir, prefix, snp_count with open(os.path.join(workflowdir, 'config.yaml'), "w", encoding="utf-8") as config: yaml.dump(configs, config, default_flow_style= False, sort_keys=False, width=float('inf')) - create_conda_recipes() + create_conda_recipes(output_dir, conda_envs) if setup_only: sys.exit(0) @@ -407,9 +410,9 @@ def inversion(genome, vcf, only_vcf, prefix, output_dir, count, min_size, max_si start_text.add_row("Heterozygosity:", f"{heterozygosity}") fetch_rule(workflowdir, "simulate_variants.smk") - fetch_script(workflowdir, "simuG.pl") os.makedirs(f"{output_dir}/logs/snakemake", exist_ok = True) sm_log = snakemake_log(output_dir, "simulate_inversion") + conda_envs = ["simulations"] configs = { "workflow" : "simulate inversion", "snakemake_log" : sm_log, @@ -422,11 +425,12 @@ def inversion(genome, vcf, only_vcf, prefix, output_dir, count, min_size, max_si }, "inversion" : { **({"vcf" : Path(vcf).resolve().as_posix()} if vcf else {}), - **({'count': count} if count and not vcf else {}), - **({"min_size": min_size} if min_size and not vcf else {}), - **({"max_size" : max_size} if max_size and not vcf else {}) + **({'count': count} if not vcf else {}), + **({"min_size": min_size} if not vcf else {}), + 
**({"max_size" : max_size} if not vcf else {}) }, "workflow_call" : command.rstrip(), + "conda_environments" : conda_envs, "inputs" : { "genome" : Path(genome).resolve().as_posix(), **({"centromeres" : Path(centromeres).resolve().as_posix()} if centromeres else {}), @@ -437,7 +441,7 @@ def inversion(genome, vcf, only_vcf, prefix, output_dir, count, min_size, max_si with open(os.path.join(workflowdir, 'config.yaml'), "w", encoding="utf-8") as config: yaml.dump(configs, config, default_flow_style= False, sort_keys=False, width=float('inf')) - create_conda_recipes() + create_conda_recipes(output_dir, conda_envs) if setup_only: sys.exit(0) @@ -522,9 +526,9 @@ def cnv(genome, output_dir, vcf, only_vcf, prefix, count, min_size, max_size, du start_text.add_row("Heterozygosity:", f"{heterozygosity}") fetch_rule(workflowdir, "simulate_variants.smk") - fetch_script(workflowdir, "simuG.pl") os.makedirs(f"{output_dir}/logs/snakemake", exist_ok = True) sm_log = snakemake_log(output_dir, "simulate_cnv") + conda_envs = ["simulations"] configs = { "workflow" : "simulate cnv", "snakemake_log" : sm_log, @@ -537,14 +541,15 @@ def cnv(genome, output_dir, vcf, only_vcf, prefix, count, min_size, max_size, du }, "cnv" : { **({"vcf" : Path(vcf).resolve().as_posix()} if vcf else {}), - **({'count': count} if count and not vcf else {}), - **({"min_size": min_size} if min_size and not vcf else {}), - **({"max_size" : max_size} if max_size and not vcf else {}), - **({"duplication_ratio" : dup_ratio} if dup_ratio and not vcf else {}), - **({"max_copy" : max_copy} if max_copy and not vcf else {}), - **({"gain_ratio" : gain_ratio} if gain_ratio and not vcf else {}) + **({'count': count} if not vcf else {}), + **({"min_size": min_size} if not vcf else {}), + **({"max_size" : max_size} if not vcf else {}), + **({"duplication_ratio" : dup_ratio} if not vcf else {}), + **({"max_copy" : max_copy} if not vcf else {}), + **({"gain_ratio" : gain_ratio} if not vcf else {}) }, "workflow_call" : command.rstrip(), + "conda_environments" : conda_envs, "inputs" : { "genome" : Path(genome).resolve().as_posix(), **({"centromeres" : Path(centromeres).resolve().as_posix()} if centromeres else {}), @@ -555,7 +560,7 @@ def cnv(genome, output_dir, vcf, only_vcf, prefix, count, min_size, max_size, du with open(os.path.join(workflowdir, 'config.yaml'), "w", encoding="utf-8") as config: yaml.dump(configs, config, default_flow_style= False, sort_keys=False, width=float('inf')) - create_conda_recipes() + create_conda_recipes(output_dir, conda_envs) if setup_only: sys.exit(0) @@ -627,9 +632,9 @@ def translocation(genome, output_dir, prefix, vcf, only_vcf, count, centromeres, start_text.add_row("Heterozygosity:", f"{heterozygosity}") fetch_rule(workflowdir, "simulate_variants.smk") - fetch_script(workflowdir, "simuG.pl") os.makedirs(f"{output_dir}/logs/snakemake", exist_ok = True) sm_log = snakemake_log(output_dir, "simulate_translocation") + conda_envs = ["simulations"] configs = { "workflow" : "simulate translocation", "snakemake_log" : sm_log, @@ -638,13 +643,14 @@ def translocation(genome, output_dir, prefix, vcf, only_vcf, count, centromeres, **({"random_seed" : randomseed} if randomseed else {}), "heterozygosity" : { "ratio" : heterozygosity, - "only_vcf" : only_vcf, + "only_vcf" : only_vcf }, "translocation" : { **({"vcf" : Path(vcf).resolve().as_posix()} if vcf else {}), - **({'count': count} if count and not vcf else {}), + **({'count': count} if not vcf else {}) }, "workflow_call" : command.rstrip(), + "conda_environments" : conda_envs, 
"inputs" : { "genome" : Path(genome).resolve().as_posix(), **({"centromeres" : Path(centromeres).resolve().as_posix()} if centromeres else {}), @@ -656,7 +662,7 @@ def translocation(genome, output_dir, prefix, vcf, only_vcf, count, centromeres, with open(os.path.join(workflowdir, 'config.yaml'), "w", encoding="utf-8") as config: yaml.dump(configs, config, default_flow_style= False, sort_keys=False, width=float('inf')) - create_conda_recipes() + create_conda_recipes(output_dir, conda_envs) if setup_only: sys.exit(0) diff --git a/harpy/snakefiles/align_bwa.smk b/harpy/snakefiles/align_bwa.smk index d2baf8fb0..63e3f3861 100644 --- a/harpy/snakefiles/align_bwa.smk +++ b/harpy/snakefiles/align_bwa.smk @@ -14,7 +14,7 @@ wildcard_constraints: sample = "[a-zA-Z0-9._-]+" outdir = config["output_directory"] -envdir = os.path.join(os.getcwd(), ".harpy_envs") +envdir = os.path.join(os.getcwd(), outdir, "workflow", "envs") genomefile = config["inputs"]["genome"] fqlist = config["inputs"]["fastq"] molecule_distance = config["molecule_distance"] diff --git a/harpy/snakefiles/align_ema.smk b/harpy/snakefiles/align_ema.smk index 01a26891d..d04e16324 100644 --- a/harpy/snakefiles/align_ema.smk +++ b/harpy/snakefiles/align_ema.smk @@ -25,7 +25,7 @@ extra = config.get("extra", "") bn = os.path.basename(genomefile) genome_zip = True if bn.lower().endswith(".gz") else False bn_idx = f"{bn}.gzi" if genome_zip else f"{bn}.fai" -envdir = os.path.join(os.getcwd(), ".harpy_envs") +envdir = os.path.join(os.getcwd(), outdir, "workflow", "envs") windowsize = config["depth_windowsize"] keep_unmapped = config["keep_unmapped"] skip_reports = config["reports"]["skip"] diff --git a/harpy/snakefiles/align_strobealign.smk b/harpy/snakefiles/align_strobealign.smk index 23edbcdd8..717da1407 100644 --- a/harpy/snakefiles/align_strobealign.smk +++ b/harpy/snakefiles/align_strobealign.smk @@ -14,7 +14,7 @@ wildcard_constraints: sample = "[a-zA-Z0-9._-]+" outdir = config["output_directory"] -envdir = os.path.join(os.getcwd(), ".harpy_envs") +envdir = os.path.join(os.getcwd(), outdir, "workflow", "envs") genomefile = config["inputs"]["genome"] fqlist = config["inputs"]["fastq"] extra = config.get("extra", "") diff --git a/harpy/snakefiles/assembly.smk b/harpy/snakefiles/assembly.smk index eadb46d51..c079c6b92 100644 --- a/harpy/snakefiles/assembly.smk +++ b/harpy/snakefiles/assembly.smk @@ -13,7 +13,7 @@ onerror: FQ1 = config["inputs"]["fastq_r1"] FQ2 = config["inputs"]["fastq_r2"] outdir = config["output_directory"] -envdir = os.path.join(os.getcwd(), ".harpy_envs") +envdir = os.path.join(os.getcwd(), outdir, "workflow", "envs") skip_reports = config["reports"]["skip"] organism = config["reports"]["organism_type"] lineage_map = { diff --git a/harpy/snakefiles/containerize.smk b/harpy/snakefiles/containerize.smk index f70ef2d92..1e86e91f1 100644 --- a/harpy/snakefiles/containerize.smk +++ b/harpy/snakefiles/containerize.smk @@ -1,59 +1,53 @@ import os import shutil -onsuccess: - shutil.rmtree(f'.harpy_envs', ignore_errors=True) - +envdir = os.path.join(os.getcwd(), "container/workflow/envs") +# spades isn't added b/c it has a post-setup script rule all: input: - collect("{conda}.env", conda = ["align", "assembly", "metassembly", "phase", "qc", "r", "simulations", "snp", "stitch", "sv"]) + collect("{conda}.env", conda = ["align", "assembly", "metassembly", "phase", "qc", "r", "simulations", "stitch", "variants"]) rule qc: output: "qc.env" - conda: os.getcwd() + "/.harpy_envs/qc.yaml" + conda: f"{envdir}/qc.yaml" shell: "touch 
{output}" rule align: output: "align.env" - conda: os.getcwd() + "/.harpy_envs/align.yaml" - shell: "touch {output}" - -rule snp: - output: "snp.env" - conda: os.getcwd() + "/.harpy_envs/variants.yaml" + conda: f"{envdir}/align.yaml" shell: "touch {output}" -rule sv: - output: "sv.env" - conda: os.getcwd() + "/.harpy_envs/variants.yaml" +rule variants: + output: "variants.env" + conda: f"{envdir}/variants.yaml" shell: "touch {output}" rule phase: output: "phase.env" - conda: os.getcwd() + "/.harpy_envs/phase.yaml" + conda: f"{envdir}/phase.yaml" shell: "touch {output}" rule r: output: "r.env" - conda: os.getcwd() + "/.harpy_envs/r.yaml" + conda: f"{envdir}/r.yaml" shell: "touch {output}" rule stitch: output: "stitch.env" - conda: os.getcwd() + "/.harpy_envs/stitch.yaml" + conda: f"{envdir}/stitch.yaml" shell: "touch {output}" rule simulations: output: "simulations.env" - conda: os.getcwd() + "/.harpy_envs/simulations.yaml" + conda: f"{envdir}/simulations.yaml" shell: "touch {output}" rule assembly: output: "assembly.env" - conda: os.getcwd() + "/.harpy_envs/assembly.yaml" + conda: f"{envdir}/assembly.yaml" shell: "touch {output}" rule metassembly: output: "metassembly.env" - conda: os.getcwd() + "/.harpy_envs/metassembly.yaml" + conda: f"{envdir}/metassembly.yaml" shell: "touch {output}" \ No newline at end of file diff --git a/harpy/snakefiles/deconvolve.smk b/harpy/snakefiles/deconvolve.smk index d1a16e5e7..2bee9657d 100644 --- a/harpy/snakefiles/deconvolve.smk +++ b/harpy/snakefiles/deconvolve.smk @@ -13,9 +13,9 @@ onerror: wildcard_constraints: sample = "[a-zA-Z0-9._-]+" -envdir = os.path.join(os.getcwd(), ".harpy_envs") fqlist = config["inputs"] outdir = config["output_directory"] +envdir = os.path.join(os.getcwd(), outdir, "workflow", "envs") kmer_length = config["kmer_length"] window_size = config["window_size"] density = config["density"] diff --git a/harpy/snakefiles/demultiplex_gen1.smk b/harpy/snakefiles/demultiplex_gen1.smk index c6de765af..2f5daa919 100644 --- a/harpy/snakefiles/demultiplex_gen1.smk +++ b/harpy/snakefiles/demultiplex_gen1.smk @@ -10,7 +10,7 @@ I2 = config["inputs"]["I2"] samplefile = config["inputs"]["demultiplex_schema"] skip_reports = config["reports"]["skip"] outdir = config["output_directory"] -envdir = os.path.join(os.getcwd(), ".harpy_envs") +envdir = os.path.join(os.getcwd(), outdir, "workflow", "envs") onstart: logger.logger.addHandler(logging.FileHandler(config["snakemake_log"])) diff --git a/harpy/snakefiles/impute.smk b/harpy/snakefiles/impute.smk index ca74ec88c..a9f665472 100644 --- a/harpy/snakefiles/impute.smk +++ b/harpy/snakefiles/impute.smk @@ -19,7 +19,7 @@ variantfile = config["inputs"]["variantfile"] paramfile = config["inputs"]["paramfile"] biallelic = config["inputs"]["biallelic_contigs"] outdir = config["output_directory"] -envdir = os.path.join(os.getcwd(), ".harpy_envs") +envdir = os.path.join(os.getcwd(), outdir, "workflow", "envs") skip_reports = config["reports"]["skip"] stitch_params = config["stitch_parameters"] with open(biallelic, "r") as f: diff --git a/harpy/snakefiles/metassembly.smk b/harpy/snakefiles/metassembly.smk index 92b2ff18d..b08a67bf9 100644 --- a/harpy/snakefiles/metassembly.smk +++ b/harpy/snakefiles/metassembly.smk @@ -14,7 +14,7 @@ FQ1 = config["inputs"]["fastq_r1"] FQ2 = config["inputs"]["fastq_r2"] BX_TAG = config["barcode_tag"].upper() outdir = config["output_directory"] -envdir = os.path.join(os.getcwd(), ".harpy_envs") +envdir = os.path.join(os.getcwd(), outdir, "workflow", "envs") max_mem = 
config["spades"]["max_memory"] k_param = config["spades"]["k"] ignore_bx = config["spades"]["ignore_barcodes"] diff --git a/harpy/snakefiles/phase.smk b/harpy/snakefiles/phase.smk index 3682e4545..78d9cebf2 100644 --- a/harpy/snakefiles/phase.smk +++ b/harpy/snakefiles/phase.smk @@ -21,7 +21,7 @@ pruning = config["prune"] molecule_distance = config["molecule_distance"] extra = config.get("extra", "") outdir = config["output_directory"] -envdir = os.path.join(os.getcwd(), ".harpy_envs") +envdir = os.path.join(os.getcwd(), outdir, "workflow", "envs") samples_from_vcf = config["samples_from_vcf"] variantfile = config["inputs"]["variantfile"] skip_reports = config["reports"]["skip"] diff --git a/harpy/snakefiles/preflight_bam.smk b/harpy/snakefiles/preflight_bam.smk index 87ad954d8..0e0d18773 100644 --- a/harpy/snakefiles/preflight_bam.smk +++ b/harpy/snakefiles/preflight_bam.smk @@ -15,7 +15,7 @@ wildcard_constraints: sample = "[a-zA-Z0-9._-]+" outdir = config["output_directory"] -envdir = os.path.join(os.getcwd(), ".harpy_envs") +envdir = os.path.join(os.getcwd(), outdir, "workflow", "envs") bamlist = config["inputs"] bamdict = dict(zip(bamlist, bamlist)) samplenames = {Path(i).stem for i in bamlist} diff --git a/harpy/snakefiles/preflight_fastq.smk b/harpy/snakefiles/preflight_fastq.smk index 40704a1c3..497e771fb 100644 --- a/harpy/snakefiles/preflight_fastq.smk +++ b/harpy/snakefiles/preflight_fastq.smk @@ -15,7 +15,7 @@ wildcard_constraints: fqlist = config["inputs"] outdir = config["output_directory"] -envdir = os.path.join(os.getcwd(), ".harpy_envs") +envdir = os.path.join(os.getcwd(), outdir, "workflow", "envs") bn_r = r"([_\.][12]|[_\.][FR]|[_\.]R[12](?:\_00[0-9])*)?\.((fastq|fq)(\.gz)?)$" samplenames = {re.sub(bn_r, "", os.path.basename(i), flags = re.IGNORECASE) for i in fqlist} diff --git a/harpy/snakefiles/qc.smk b/harpy/snakefiles/qc.smk index 047417ae0..8752cfbd7 100644 --- a/harpy/snakefiles/qc.smk +++ b/harpy/snakefiles/qc.smk @@ -13,9 +13,9 @@ onerror: wildcard_constraints: sample = "[a-zA-Z0-9._-]+" -envdir = os.path.join(os.getcwd(), ".harpy_envs") fqlist = config["inputs"] outdir = config["output_directory"] +envdir = os.path.join(os.getcwd(), outdir, "workflow", "envs") min_len = config["min_len"] max_len = config["max_len"] extra = config.get("extra", "") diff --git a/harpy/snakefiles/simulate_linkedreads.smk b/harpy/snakefiles/simulate_linkedreads.smk index 9b8d5da4e..7770d2b80 100644 --- a/harpy/snakefiles/simulate_linkedreads.smk +++ b/harpy/snakefiles/simulate_linkedreads.smk @@ -17,11 +17,11 @@ wildcard_constraints: hap = "[01]" outdir = config["output_directory"] +envdir = os.path.join(os.getcwd(), outdir, "workflow", "envs") gen_hap1 = config["inputs"]["genome_hap1"] gen_hap2 = config["inputs"]["genome_hap2"] barcode_file = config["barcodes"]["file"] barcode_len = config["barcodes"]["length"] -envdir = os.path.join(os.getcwd(), ".harpy_envs") genodict = {"0": gen_hap1, "1": gen_hap2} rule barcode_keymap: diff --git a/harpy/snakefiles/simulate_snpindel.smk b/harpy/snakefiles/simulate_snpindel.smk index fe56e53b4..8cea5cbd2 100644 --- a/harpy/snakefiles/simulate_snpindel.smk +++ b/harpy/snakefiles/simulate_snpindel.smk @@ -12,8 +12,8 @@ onerror: os.remove(logger.logfile) outdir = config["output_directory"] +envdir = os.path.join(os.getcwd(), outdir, "workflow", "envs") genome = config["inputs"]["genome"] -envdir = os.path.join(os.getcwd(), ".harpy_envs") snp_vcf = config["snp"].get("vcf", None) indel_vcf = config["indel"].get("vcf", None) heterozygosity = 
float(config["heterozygosity"]["ratio"]) @@ -94,28 +94,47 @@ rule simulate_haploid: in_vcfs, geno = genome output: - multiext(f"{outdir}/{outprefix}", ".bed", ".fasta"), - collect(f"{outdir}/{outprefix}." + "{var}.vcf", var = variants) + f"{outdir}/{outprefix}.simseq.genome.fa", + f"{outdir}/{outprefix}.refseq2simseq.SNP.vcf" if snp else [], + f"{outdir}/{outprefix}.refseq2simseq.INDEL.vcf" if indel else [], + f"{outdir}/{outprefix}.refseq2simseq.map.txt" log: f"{outdir}/logs/{outprefix}.log" params: prefix = f"{outdir}/{outprefix}", - simuG = f"{outdir}/workflow/scripts/simuG.pl", parameters = variant_params conda: f"{envdir}/simulations.yaml" shell: - "perl {params.simuG} -refseq {input.geno} -prefix {params.prefix} {params.parameters} > {log}" + "simuG -refseq {input.geno} -prefix {params.prefix} {params.parameters} > {log}" + +rule rename_haploid: + input: + fasta = f"{outdir}/{outprefix}.simseq.genome.fa", + snpvcf = f"{outdir}/{outprefix}.refseq2simseq.SNP.vcf" if snp else [], + indelvcf = f"{outdir}/{outprefix}.refseq2simseq.INDEL.vcf" if indel else [], + mapfile = f"{outdir}/{outprefix}.refseq2simseq.map.txt" + output: + fasta = f"{outdir}/{outprefix}.fasta", + snpvcf = f"{outdir}/{outprefix}.snp.vcf" if snp else [], + indelvcf = f"{outdir}/{outprefix}.indel.vcf" if indel else [], + mapfile = f"{outdir}/{outprefix}.map" + run: + for i,j in zip(input, output): + if i: + os.rename(i,j) rule diploid_snps: input: f"{outdir}/{outprefix}.snp.vcf" output: - f"{outdir}/diploid/{outprefix}.snp.hap1.vcf", - f"{outdir}/diploid/{outprefix}.snp.hap2.vcf" + f"{outdir}/haplotype_1/{outprefix}.hap1.snp.vcf", + f"{outdir}/haplotype_2/{outprefix}.hap2.snp.vcf" params: het = heterozygosity run: + os.makedirs(f"{outdir}/haplotype_1", exist_ok = True) + os.makedirs(f"{outdir}/haplotype_2", exist_ok = True) rng = random.Random(randomseed) if randomseed else random.Random() with open(input[0], "r") as in_vcf, open(output[0], "w") as hap1, open(output[1], "w") as hap2: for line in in_vcf: @@ -132,42 +151,54 @@ use rule diploid_snps as diploid_indels with: input: f"{outdir}/{outprefix}.indel.vcf" output: - f"{outdir}/diploid/{outprefix}.indel.hap1.vcf", - f"{outdir}/diploid/{outprefix}.indel.hap2.vcf" + f"{outdir}/haplotype_1/{outprefix}.hap1.indel.vcf", + f"{outdir}/haplotype_2/{outprefix}.hap2.indel.vcf" rule simulate_diploid: input: - snp_hap = f"{outdir}/diploid/{outprefix}.snp.hap{{haplotype}}.vcf" if snp else [], - indel_hap = f"{outdir}/diploid/{outprefix}.indel.hap{{haplotype}}.vcf" if indel else [], + snp_hap = f"{outdir}/haplotype_{{haplotype}}/{outprefix}.hap{{haplotype}}.snp.vcf" if snp else [], + indel_hap = f"{outdir}/haplotype_{{haplotype}}/{outprefix}.hap{{haplotype}}.indel.vcf" if indel else [], geno = genome output: - f"{outdir}/diploid/{outprefix}.hap{{haplotype}}.fasta", - temp(f"{outdir}/diploid/{outprefix}.hap{{haplotype}}.indel.vcf") if indel else [], - temp(f"{outdir}/diploid/{outprefix}.hap{{haplotype}}.snp.vcf") if snp else [] + f"{outdir}/haplotype_{{haplotype}}/{outprefix}.hap{{haplotype}}.simseq.genome.fa", + f"{outdir}/haplotype_{{haplotype}}/{outprefix}.hap{{haplotype}}.refseq2simseq.map.txt", + temp(f"{outdir}/haplotype_{{haplotype}}/{outprefix}.hap{{haplotype}}.refseq2simseq.INDEL.vcf") if indel else [], + temp(f"{outdir}/haplotype_{{haplotype}}/{outprefix}.hap{{haplotype}}.refseq2simseq.SNP.vcf") if snp else [] log: f"{outdir}/logs/{outprefix}.hap{{haplotype}}.log" params: - prefix = f"{outdir}/diploid/{outprefix}.hap{{haplotype}}", - simuG = 
f"{outdir}/workflow/scripts/simuG.pl", - snp = f"-snp_vcf {outdir}/diploid/{outprefix}.snp.hap{{haplotype}}.vcf" if snp else "", - indel = f"-indel_vcf {outdir}/diploid/{outprefix}.indel.hap{{haplotype}}.vcf" if indel else "" + prefix = f"{outdir}/haplotype_{{haplotype}}/{outprefix}.hap{{haplotype}}", + snp = f"-snp_vcf {outdir}/haplotype_{{haplotype}}/{outprefix}.hap{{haplotype}}.snp.vcf" if snp else "", + indel = f"-indel_vcf {outdir}/haplotype_{{haplotype}}/{outprefix}.hap{{haplotype}}.indel.vcf" if indel else "" conda: f"{envdir}/simulations.yaml" shell: - "perl {params.simuG} -refseq {input.geno} -prefix {params.prefix} {params.snp} {params.indel} > {log}" + "simuG -refseq {input.geno} -prefix {params.prefix} {params.snp} {params.indel} > {log}" + +rule rename_diploid: + input: + fasta= f"{outdir}/haplotype_{{haplotype}}/{outprefix}.hap{{haplotype}}.simseq.genome.fa", + mapfile = f"{outdir}/haplotype_{{haplotype}}/{outprefix}.hap{{haplotype}}.refseq2simseq.map.txt", + output: + fasta = f"{outdir}/haplotype_{{haplotype}}/{outprefix}.hap{{haplotype}}.fasta", + mapfile = f"{outdir}/haplotype_{{haplotype}}/{outprefix}.hap{{haplotype}}.map" + run: + for i,j in zip(input, output): + os.rename(i,j) rule workflow_summary: default_target: True input: - multiext(f"{outdir}/{outprefix}", ".bed", ".fasta"), + f"{outdir}/{outprefix}.fasta", collect(f"{outdir}/{outprefix}" + ".{var}.vcf", var = variants), - collect(f"{outdir}/diploid/{outprefix}" + ".hap{n}.fasta", n = [1,2]) if heterozygosity > 0 and not only_vcf else [], - collect(f"{outdir}/diploid/{outprefix}" + ".{var}.hap{n}.vcf", n = [1,2], var = variants) if heterozygosity > 0 else [] + collect(f"{outdir}/haplotype_" + "{n}/" + outprefix + ".hap{n}.fasta", n = [1,2]) if heterozygosity > 0 and not only_vcf else [], + collect(f"{outdir}/haplotype_" + "{n}/" + outprefix + ".hap{n}" + ".{var}.vcf", n = [1,2], var = variants) if heterozygosity > 0 else [], + collect(f"{outdir}/haplotype_" + "{n}/" + outprefix + ".hap{n}" + ".map", n = [1,2]) if heterozygosity > 0 else [] params: prefix = f"{outdir}/{outprefix}", parameters = variant_params, - snp = f"-snp_vcf {outdir}/diploid/{outprefix}.snp.hapX.vcf" if snp else "", - indel = f"-indel_vcf {outdir}/diploid/{outprefix}.indel.hapX.vcf" if indel else "" + snp = f"-snp_vcf {outdir}/haplotype_X/{outprefix}.snp.hapX.vcf" if snp else "", + indel = f"-indel_vcf {outdir}/haplotype_X/{outprefix}.indel.hapX.vcf" if indel else "" run: summary = ["The harpy simulate snpindel workflow ran using these parameters:"] summary.append(f"The provided genome: {genome}") diff --git a/harpy/snakefiles/simulate_variants.smk b/harpy/snakefiles/simulate_variants.smk index 2e95bbb53..39fad0aa1 100644 --- a/harpy/snakefiles/simulate_variants.smk +++ b/harpy/snakefiles/simulate_variants.smk @@ -12,8 +12,9 @@ onerror: os.remove(logger.logfile) outdir = config["output_directory"] -envdir = os.path.join(os.getcwd(), ".harpy_envs") +envdir = os.path.join(os.getcwd(), outdir, "workflow", "envs") variant = config["workflow"].split()[1] +simuG_variant = variant.upper() if variant == "cnv" else variant outprefix = config["prefix"] genome = config["inputs"]["genome"] vcf = config[variant].get("vcf", None) @@ -57,27 +58,43 @@ rule simulate_haploid: vcf_correct if vcf else [], geno = genome output: - collect(f"{outdir}/{outprefix}" + "{ext}", ext = [".vcf", ".bed", ".fasta"]) + f"{outdir}/{outprefix}.simseq.genome.fa", + f"{outdir}/{outprefix}.refseq2simseq.{simuG_variant}.vcf", + f"{outdir}/{outprefix}.refseq2simseq.map.txt" log: 
f"{outdir}/logs/{outprefix}.log" params: prefix = f"{outdir}/{outprefix}", - simuG = f"{outdir}/workflow/scripts/simuG.pl", parameters = variant_params conda: f"{envdir}/simulations.yaml" shell: - "perl {params.simuG} -refseq {input.geno} -prefix {params.prefix} {params.parameters} > {log}" + "simuG -refseq {input.geno} -prefix {params.prefix} {params.parameters} > {log}" + +rule rename_haploid: + input: + fasta = f"{outdir}/{outprefix}.simseq.genome.fa", + vcf = f"{outdir}/{outprefix}.refseq2simseq.{simuG_variant}.vcf", + mapfile = f"{outdir}/{outprefix}.refseq2simseq.map.txt" + output: + fasta = f"{outdir}/{outprefix}.fasta", + vcf = f"{outdir}/{outprefix}.{variant}.vcf", + mapfile = f"{outdir}/{outprefix}.{variant}.map" + run: + for i,j in zip(input, output): + os.rename(i,j) rule diploid_variants: input: - f"{outdir}/{outprefix}.vcf" + f"{outdir}/{outprefix}.{variant}.vcf" output: - f"{outdir}/diploid/{outprefix}.{variant}.hap1.vcf", - f"{outdir}/diploid/{outprefix}.{variant}.hap2.vcf" + f"{outdir}/haplotype_1/{outprefix}.hap1.{variant}.vcf", + f"{outdir}/haplotype_2/{outprefix}.hap2.{variant}.vcf" params: het = heterozygosity run: + os.makedirs(f"{outdir}/haplotype_1", exist_ok = True) + os.makedirs(f"{outdir}/haplotype_2", exist_ok = True) rng = random.Random(randomseed) if randomseed else random.Random() with open(input[0], "r") as in_vcf, open(output[0], "w") as hap1, open(output[1], "w") as hap2: for line in in_vcf: @@ -92,28 +109,41 @@ rule diploid_variants: rule simulate_diploid: input: - hap = f"{outdir}/diploid/{outprefix}.{variant}.hap{{haplotype}}.vcf", + hap = f"{outdir}/haplotype_{{haplotype}}/{outprefix}.hap{{haplotype}}.{variant}.vcf", geno = genome output: - f"{outdir}/diploid/{outprefix}.hap{{haplotype}}.fasta", - temp(f"{outdir}/diploid/{outprefix}.hap{{haplotype}}.vcf") + f"{outdir}/haplotype_{{haplotype}}/{outprefix}.hap{{haplotype}}.simseq.genome.fa", + f"{outdir}/haplotype_{{haplotype}}/{outprefix}.hap{{haplotype}}.refseq2simseq.map.txt", + temp(f"{outdir}/haplotype_{{haplotype}}/{outprefix}.hap{{haplotype}}.refseq2simseq.{simuG_variant}.vcf") log: f"{outdir}/logs/{outprefix}.hap{{haplotype}}.log" params: - prefix = f"{outdir}/diploid/{outprefix}.hap{{haplotype}}", - simuG = f"{outdir}/workflow/scripts/simuG.pl", + prefix = f"{outdir}/haplotype_{{haplotype}}/{outprefix}.hap{{haplotype}}", vcf_arg = f"-{variant}_vcf" conda: f"{envdir}/simulations.yaml" shell: - "perl {params.simuG} -refseq {input.geno} -prefix {params.prefix} {params.vcf_arg} {input.hap} > {log}" + "simuG -refseq {input.geno} -prefix {params.prefix} {params.vcf_arg} {input.hap} > {log}" + +rule rename_diploid: + input: + fasta = f"{outdir}/haplotype_{{haplotype}}/{outprefix}.hap{{haplotype}}.simseq.genome.fa", + mapfile = f"{outdir}/haplotype_{{haplotype}}/{outprefix}.hap{{haplotype}}.refseq2simseq.map.txt" + output: + fasta = f"{outdir}/haplotype_{{haplotype}}/{outprefix}.hap{{haplotype}}.fasta", + mapfile = f"{outdir}/haplotype_{{haplotype}}/{outprefix}.hap{{haplotype}}.{variant}.map" + run: + for i,j in zip(input, output): + os.rename(i,j) rule workflow_summary: default_target: True input: - multiext(f"{outdir}/{outprefix}", ".vcf", ".bed", ".fasta"), - collect(f"{outdir}/diploid/{outprefix}.hap" + "{n}.fasta", n = [1,2]) if heterozygosity > 0 and not only_vcf else [], - collect(f"{outdir}/diploid/{outprefix}.{variant}.hap" + "{n}.vcf", n = [1,2]) if heterozygosity > 0 else [] + f"{outdir}/{outprefix}.fasta", + f"{outdir}/{outprefix}.{variant}.vcf", + collect(f"{outdir}/haplotype_" + "{n}" + 
f"/{outprefix}.hap" + "{n}.fasta", n = [1,2]) if heterozygosity > 0 and not only_vcf else [], + collect(f"{outdir}/haplotype_" + "{n}" + f"/{outprefix}.hap" + "{n}" + f".{variant}.vcf", n = [1,2]) if heterozygosity > 0 else [], + collect(f"{outdir}/haplotype_" + "{n}" + f"/{outprefix}.hap" + "{n}" + f".{variant}.map", n = [1,2]) if heterozygosity > 0 else [] params: prefix = f"{outdir}/{outprefix}", parameters = variant_params, diff --git a/harpy/snakefiles/snp_freebayes.smk b/harpy/snakefiles/snp_freebayes.smk index e315a53b6..299c8dea3 100644 --- a/harpy/snakefiles/snp_freebayes.smk +++ b/harpy/snakefiles/snp_freebayes.smk @@ -13,12 +13,12 @@ onerror: wildcard_constraints: sample = "[a-zA-Z0-9._-]+" -envdir = os.path.join(os.getcwd(), ".harpy_envs") +outdir = config["output_directory"] +envdir = os.path.join(os.getcwd(), outdir, "workflow", "envs") ploidy = config["ploidy"] extra = config.get("extra", "") regiontype = config["region_type"] windowsize = config.get("windowsize", None) -outdir = config["output_directory"] skip_reports = config["reports"]["skip"] bamlist = config["inputs"]["alignments"] bamdict = dict(zip(bamlist, bamlist)) diff --git a/harpy/snakefiles/snp_mpileup.smk b/harpy/snakefiles/snp_mpileup.smk index b6de042a5..dc02b2dd2 100644 --- a/harpy/snakefiles/snp_mpileup.smk +++ b/harpy/snakefiles/snp_mpileup.smk @@ -13,12 +13,12 @@ onerror: wildcard_constraints: sample = "[a-zA-Z0-9._-]+" -envdir = os.path.join(os.getcwd(), ".harpy_envs") +outdir = config["output_directory"] +envdir = os.path.join(os.getcwd(), outdir, "workflow", "envs") ploidy = config["ploidy"] mp_extra = config.get("extra", "") regiontype = config["region_type"] windowsize = config.get("windowsize", None) -outdir = config["output_directory"] skip_reports = config["reports"]["skip"] bamlist = config["inputs"]["alignments"] bamdict = dict(zip(bamlist, bamlist)) diff --git a/harpy/snakefiles/sv_leviathan.smk b/harpy/snakefiles/sv_leviathan.smk index b4c505a4b..4aa54074e 100644 --- a/harpy/snakefiles/sv_leviathan.smk +++ b/harpy/snakefiles/sv_leviathan.smk @@ -14,7 +14,8 @@ onerror: wildcard_constraints: sample = "[a-zA-Z0-9._-]+" -envdir = os.path.join(os.getcwd(), ".harpy_envs") +outdir = config["output_directory"] +envdir = os.path.join(os.getcwd(), outdir, "workflow", "envs") genomefile = config["inputs"]["genome"] bamlist = config["inputs"]["alignments"] bamdict = dict(zip(bamlist, bamlist)) @@ -23,7 +24,6 @@ min_sv = config["min_sv"] min_bc = config["min_barcodes"] iterations = config["iterations"] extra = config.get("extra", "") -outdir = config["output_directory"] skip_reports = config["reports"]["skip"] plot_contigs = config["reports"]["plot_contigs"] bn = os.path.basename(genomefile) diff --git a/harpy/snakefiles/sv_leviathan_pop.smk b/harpy/snakefiles/sv_leviathan_pop.smk index 3bb0af094..8680fa3fd 100644 --- a/harpy/snakefiles/sv_leviathan_pop.smk +++ b/harpy/snakefiles/sv_leviathan_pop.smk @@ -14,7 +14,8 @@ wildcard_constraints: sample = "[a-zA-Z0-9._-]+", population = "[a-zA-Z0-9._-]+" -envdir = os.path.join(os.getcwd(), ".harpy_envs") +outdir = config["output_directory"] +envdir = os.path.join(os.getcwd(), outdir, "workflow", "envs") genomefile = config["inputs"]["genome"] bamlist = config["inputs"]["alignments"] groupfile = config["inputs"]["groupings"] @@ -22,7 +23,6 @@ extra = config.get("extra", "") min_sv = config["min_sv"] min_bc = config["min_barcodes"] iterations = config["iterations"] -outdir = config["output_directory"] skip_reports = config["reports"]["skip"] plot_contigs = 
config["reports"]["plot_contigs"] bn = os.path.basename(genomefile) diff --git a/harpy/snakefiles/sv_naibr.smk b/harpy/snakefiles/sv_naibr.smk index 96a9896b5..e7177a6a6 100644 --- a/harpy/snakefiles/sv_naibr.smk +++ b/harpy/snakefiles/sv_naibr.smk @@ -14,7 +14,8 @@ onerror: wildcard_constraints: sample = "[a-zA-Z0-9._-]+" -envdir = os.path.join(os.getcwd(), ".harpy_envs") +outdir = config["output_directory"] +envdir = os.path.join(os.getcwd(), outdir, "workflow", "envs") genomefile = config["inputs"]["genome"] bamlist = config["inputs"]["alignments"] bamdict = dict(zip(bamlist, bamlist)) @@ -24,7 +25,6 @@ mol_dist = config["molecule_distance"] min_sv = config["min_sv"] min_barcodes = config["min_barcodes"] min_quality = config["min_quality"] -outdir = config["output_directory"] bn = os.path.basename(genomefile) genome_zip = True if bn.lower().endswith(".gz") else False bn_idx = f"{bn}.gzi" if genome_zip else f"{bn}.fai" diff --git a/harpy/snakefiles/sv_naibr_phase.smk b/harpy/snakefiles/sv_naibr_phase.smk index 510b9b813..4a983a6f8 100644 --- a/harpy/snakefiles/sv_naibr_phase.smk +++ b/harpy/snakefiles/sv_naibr_phase.smk @@ -14,7 +14,8 @@ onerror: wildcard_constraints: sample = "[a-zA-Z0-9._-]+" -envdir = os.path.join(os.getcwd(), ".harpy_envs") +outdir = config["output_directory"] +envdir = os.path.join(os.getcwd(), outdir, "workflow", "envs") genomefile = config["inputs"]["genome"] bamlist = config["inputs"]["alignments"] bamdict = dict(zip(bamlist, bamlist)) @@ -25,7 +26,6 @@ mol_dist = config["molecule_distance"] min_quality = config["min_quality"] min_sv = config["min_sv"] min_barcodes = config["min_barcodes"] -outdir = config["output_directory"] plot_contigs = config["reports"]["plot_contigs"] skip_reports = config["reports"]["skip"] bn = os.path.basename(genomefile) diff --git a/harpy/snakefiles/sv_naibr_pop.smk b/harpy/snakefiles/sv_naibr_pop.smk index b9486a376..1750ed69f 100644 --- a/harpy/snakefiles/sv_naibr_pop.smk +++ b/harpy/snakefiles/sv_naibr_pop.smk @@ -15,7 +15,8 @@ wildcard_constraints: sample = "[a-zA-Z0-9._-]+", population = "[a-zA-Z0-9._-]+" -envdir = os.path.join(os.getcwd(), ".harpy_envs") +outdir = config["output_directory"] +envdir = os.path.join(os.getcwd(), outdir, "workflow", "envs") genomefile = config["inputs"]["genome"] bamlist = config["inputs"]["alignments"] groupfile = config["inputs"]["groupings"] @@ -24,7 +25,6 @@ min_sv = config["min_sv"] min_barcodes = config["min_barcodes"] min_quality = config["min_quality"] mol_dist = config["molecule_distance"] -outdir = config["output_directory"] skip_reports = config["reports"]["skip"] plot_contigs = config["reports"]["plot_contigs"] bn = os.path.basename(genomefile) diff --git a/harpy/snakefiles/sv_naibr_pop_phase.smk b/harpy/snakefiles/sv_naibr_pop_phase.smk index e67ef76b4..70226ba46 100644 --- a/harpy/snakefiles/sv_naibr_pop_phase.smk +++ b/harpy/snakefiles/sv_naibr_pop_phase.smk @@ -15,7 +15,8 @@ wildcard_constraints: sample = "[a-zA-Z0-9._-]+", population = "[a-zA-Z0-9._-]+" -envdir = os.path.join(os.getcwd(), ".harpy_envs") +outdir = config["output_directory"] +envdir = os.path.join(os.getcwd(), outdir, "workflow", "envs") genomefile = config["inputs"]["genome"] bn = os.path.basename(genomefile) bamlist = config["inputs"]["alignments"] @@ -29,7 +30,6 @@ min_sv = config["min_sv"] min_quality = config["min_quality"] min_barcodes = config["min_barcodes"] mol_dist = config["molecule_distance"] -outdir = config["output_directory"] skip_reports = config["reports"]["skip"] plot_contigs = 
config["reports"]["plot_contigs"] if bn.lower().endswith(".gz"): diff --git a/harpy/snp.py b/harpy/snp.py index fe8672df2..f44eec257 100644 --- a/harpy/snp.py +++ b/harpy/snp.py @@ -53,7 +53,7 @@ def snp(): @click.option('-g', '--genome', type=InputFile("fasta", gzip_ok = True), required = True, help = 'Genome assembly for variant calling') @click.option('-o', '--output-dir', type = click.Path(exists = False), default = "SNP/mpileup", show_default=True, help = 'Output directory name') @click.option('-n', '--ploidy', default = 2, show_default = True, type=click.IntRange(min = 1, max = 2), help = 'Ploidy of samples') -@click.option('-p', '--populations', type=click.Path(exists = True, dir_okay=False, readable=True), help = "Tab-delimited file of sample\population") +@click.option('-p', '--populations', type=click.Path(exists = True, dir_okay=False, readable=True), help = 'File of `sample`\\`population`') @click.option('-r', '--regions', type=str, default=50000, show_default=True, help = "Regions where to call variants") @click.option('-t', '--threads', default = 4, show_default = True, type = click.IntRange(min = 4, max_open = True), help = 'Number of threads to use') @click.option('--hpc', type = HPCProfile(), help = 'Directory with HPC submission `config.yaml` file') @@ -111,6 +111,7 @@ def mpileup(inputs, output_dir, regions, genome, threads, populations, ploidy, e fetch_report(workflowdir, "bcftools_stats.Rmd") os.makedirs(f"{output_dir}/logs/snakemake", exist_ok = True) sm_log = snakemake_log(output_dir, "snp_mpileup") + conda_envs = ["r"] if populations: validate_popfile(populations) # check that samplenames and populations line up @@ -124,6 +125,7 @@ def mpileup(inputs, output_dir, regions, genome, threads, populations, ploidy, e **({'windowsize': int(regions)} if regtype == "windows" else {}), **({'extra': extra_params} if extra_params else {}), "workflow_call" : command.rstrip(), + "conda_environments" : conda_envs, "reports" : { "skip": skip_reports }, @@ -137,7 +139,7 @@ def mpileup(inputs, output_dir, regions, genome, threads, populations, ploidy, e with open(os.path.join(workflowdir, 'config.yaml'), "w", encoding="utf-8") as config: yaml.dump(configs, config, default_flow_style= False, sort_keys=False, width=float('inf')) - create_conda_recipes() + create_conda_recipes(output_dir, conda_envs) if setup_only: sys.exit(0) @@ -157,7 +159,7 @@ def mpileup(inputs, output_dir, regions, genome, threads, populations, ploidy, e @click.option('-g', '--genome', type=InputFile("fasta", gzip_ok = True), required = True, help = 'Genome assembly for variant calling') @click.option('-o', '--output-dir', type = click.Path(exists = False), default = "SNP/freebayes", show_default=True, help = 'Output directory name') @click.option('-n', '--ploidy', default = 2, show_default = True, type=click.IntRange(min=1, max_open=True), help = 'Ploidy of samples') -@click.option('-p', '--populations', type=click.Path(exists = True, dir_okay=False, readable=True), help = "Tab-delimited file of sample\population") +@click.option('-p', '--populations', type=click.Path(exists = True, dir_okay=False, readable=True), help = 'File of `sample`\\`population`') @click.option('-r', '--regions', type=str, default=50000, show_default=True, help = "Regions where to call variants") @click.option('-t', '--threads', default = 4, show_default = True, type = click.IntRange(min = 4, max_open = True), help = 'Number of threads to use') @click.option('--conda', is_flag = True, default = False, help = 'Use conda/mamba instead of a 
container')
@@ -215,6 +217,7 @@ def freebayes(inputs, output_dir, genome, threads, populations, ploidy, regions,
     fetch_report(workflowdir, "bcftools_stats.Rmd")
     os.makedirs(f"{output_dir}/logs/snakemake", exist_ok = True)
     sm_log = snakemake_log(output_dir, "snp_freebayes")
+    conda_envs = ["r", "variants"]
     if populations:
         # check for delimeter and formatting
         validate_popfile(populations)
@@ -230,6 +233,7 @@ def freebayes(inputs, output_dir, genome, threads, populations, ploidy, regions,
         **({'windowsize': int(regions)} if regtype == "windows" else {}),
         **({'extra': extra_params} if extra_params else {}),
         "workflow_call" : command.rstrip(),
+        "conda_environments" : conda_envs,
         "reports" : {"skip": skip_reports},
         "inputs" : {
             "genome" : Path(genome).resolve().as_posix(),
@@ -241,7 +245,7 @@ def freebayes(inputs, output_dir, genome, threads, populations, ploidy, regions,
     with open(os.path.join(workflowdir, 'config.yaml'), "w", encoding="utf-8") as config:
         yaml.dump(configs, config, default_flow_style= False, sort_keys=False, width=float('inf'))
 
-    create_conda_recipes()
+    create_conda_recipes(output_dir, conda_envs)
     if setup_only:
         sys.exit(0)
 
diff --git a/harpy/sv.py b/harpy/sv.py
index e8c25f809..1c59a784c 100644
--- a/harpy/sv.py
+++ b/harpy/sv.py
@@ -59,7 +59,7 @@ def sv():
 @click.option('-s', '--min-sv', type = click.IntRange(min = 10, max_open = True), default = 1000, show_default=True, help = 'Minimum size of SV to detect')
 @click.option('-b', '--min-barcodes', show_default = True, default=2, type = click.IntRange(min = 1, max_open = True), help = 'Minimum number of barcode overlaps supporting candidate SV')
 @click.option('-o', '--output-dir', type = click.Path(exists = False), default = "SV/leviathan", show_default=True, help = 'Output directory name')
-@click.option('-p', '--populations', type=click.Path(exists = True, dir_okay=False, readable=True), help = "Tab-delimited file of sample\population")
+@click.option('-p', '--populations', type=click.Path(exists = True, dir_okay=False, readable=True), help = 'File of `sample`\\`population`')
 @click.option('-t', '--threads', default = 4, show_default = True, type = click.IntRange(min = 4, max_open = True), help = 'Number of threads to use')
 @click.option('--conda', is_flag = True, default = False, help = 'Use conda/mamba instead of a container')
 @click.option('--contigs', type = ContigList(), help = 'File or list of contigs to plot')
@@ -105,6 +105,7 @@ def leviathan(inputs, output_dir, genome, min_sv, min_barcodes, iterations, thre
     fetch_rule(workflowdir, f"sv_{vcaller}.smk")
     os.makedirs(f"{output_dir}/logs/snakemake", exist_ok = True)
     sm_log = snakemake_log(output_dir, "sv_leviathan")
+    conda_envs = ["align", "r", "variants"]
     configs = {
         "workflow" : "sv leviathan",
         "snakemake_log" : sm_log,
@@ -114,6 +115,7 @@ def leviathan(inputs, output_dir, genome, min_sv, min_barcodes, iterations, thre
         "iterations" : iterations,
         **({'extra': extra_params} if extra_params else {}),
         "workflow_call" : command.rstrip(),
+        "conda_environments" : conda_envs,
         "reports" : {
             "skip": skip_reports,
             **({'plot_contigs': contigs} if contigs else {'plot_contigs': "default"}),
@@ -127,7 +129,7 @@ def leviathan(inputs, output_dir, genome, min_sv, min_barcodes, iterations, thre
     with open(os.path.join(workflowdir, 'config.yaml'), "w", encoding="utf-8") as config:
         yaml.dump(configs, config, default_flow_style= False, sort_keys=False, width=float('inf'))
 
-    create_conda_recipes()
+    create_conda_recipes(output_dir, conda_envs)
     if setup_only:
         sys.exit(0)
 
@@ -149,7 +151,7 @@ def leviathan(inputs, output_dir, genome, min_sv, min_barcodes, iterations, thre
 @click.option('-s', '--min-sv', type = click.IntRange(min = 10, max_open = True), default = 1000, show_default=True, help = 'Minimum size of SV to detect')
 @click.option('-d', '--molecule-distance', default = 100000, show_default = True, type = int, help = 'Base-pair distance delineating separate molecules')
 @click.option('-o', '--output-dir', type = click.Path(exists = False), default = "SV/naibr", show_default=True, help = 'Output directory name')
-@click.option('-p', '--populations', type=click.Path(exists = True, dir_okay=False, readable=True), help = "Tab-delimited file of sample\population")
+@click.option('-p', '--populations', type=click.Path(exists = True, dir_okay=False, readable=True), help = 'File of `sample`\\`population`')
 @click.option('-t', '--threads', default = 4, show_default = True, type = click.IntRange(min = 4, max_open = True), help = 'Number of threads to use')
 @click.option('-v', '--vcf', type=click.Path(exists=True, dir_okay=False, readable=True), help = 'Path to phased bcf/vcf file')
 @click.option('--conda', is_flag = True, default = False, help = 'Use conda/mamba instead of a container')
@@ -205,6 +207,7 @@ def naibr(inputs, output_dir, genome, vcf, min_sv, min_barcodes, min_quality, th
     fetch_rule(workflowdir, f"sv_{vcaller}.smk")
     os.makedirs(f"{output_dir}/logs/snakemake", exist_ok = True)
     sm_log = snakemake_log(output_dir, "sv_naibr")
+    conda_envs = ["phase", "r", "variants"]
     configs = {
         "workflow" : "sv naibr",
         "snakemake_log" : sm_log,
@@ -215,6 +218,7 @@ def naibr(inputs, output_dir, genome, vcf, min_sv, min_barcodes, min_quality, th
         "molecule_distance" : molecule_distance,
         **({'extra': extra_params} if extra_params else {}),
         "workflow_call" : command.rstrip(),
+        "conda_environments" : conda_envs,
         "reports" : {
             "skip": skip_reports,
             **({'plot_contigs': contigs} if contigs else {'plot_contigs': "default"}),
@@ -229,7 +233,7 @@ def naibr(inputs, output_dir, genome, vcf, min_sv, min_barcodes, min_quality, th
     with open(os.path.join(workflowdir, 'config.yaml'), "w", encoding="utf-8") as config:
         yaml.dump(configs, config, default_flow_style= False, sort_keys=False, width=float('inf'))
 
-    create_conda_recipes()
+    create_conda_recipes(output_dir, conda_envs)
     if setup_only:
         sys.exit(0)
 
diff --git a/harpy/view.py b/harpy/view.py
index 48b2f1c8e..7774f08f8 100644
--- a/harpy/view.py
+++ b/harpy/view.py
@@ -1,4 +1,4 @@
-"""View the latest log or snakefile of a workflow"""
+"""View the latest log, config, or snakefile of a workflow"""
 
 import os
 import sys
@@ -9,15 +9,16 @@ from ._validations import is_gzip
 
 
 @click.command(no_args_is_help = True, context_settings=dict(allow_interspersed_args=False))
-@click.option('-s', '--snakefile', is_flag = True, show_default = True, default = False, help = "View the snakefile, not the log file")
+@click.option('-s', '--snakefile', is_flag = True, show_default = True, default = False, help = "View the snakefile instead")
+@click.option('-c', '--config', is_flag = True, show_default = True, default = False, help = "View the workflow config file instead")
 @click.argument('directory', required=True, type=click.Path(exists=True, file_okay=False))
-def view(directory, snakefile):
+def view(directory, snakefile, config):
     """
-    View a workflow log file or snakefile
+    View a workflow log, config, or snakefile
 
     This convenience command lets you view the latest workflow log file
-    of a Harpy output directory. You can use `--snakefile` to view the workflow
-    snakefile instead. Output is printed to the screen via `less` and
+    of a Harpy output directory. Use `--snakefile` or `--config` to view the workflow
+    snakefile or config.yaml file instead, respectively. Output is printed to the screen via `less` and
     accepts the typical keyboard shortcuts to navigate the output, e.g.:
 
     | key | function |
@@ -28,19 +29,30 @@ def view(directory, snakefile):
     """
     # check if there is a workflow or log folder
     # and whether the expected files are in there
+    if snakefile and config:
+        print_error("Invalid options", "Please pick one of [bold]--snakefile[/bold] or [bold]--config[/bold]")
+        sys.exit(1)
     err = 0
     if snakefile:
         files = [i for i in glob.iglob(f"{directory}/workflow/*.smk")]
         err_dir = f"{directory}/workflow/"
-        err_file = "snakefiles"
+        err_file = "There are no snakefiles"
         if not os.path.exists(f"{directory}/workflow"):
            err = 1
         elif not files:
            err = 2
+    elif config:
+        files = [f"{directory}/workflow/config.yaml"]
+        err_dir = f"{directory}/workflow/"
+        err_file = "There is no [blue]config.yaml[/blue] file"
+        if not os.path.exists(f"{directory}/workflow"):
+            err = 1
+        elif not os.path.exists(f"{directory}/workflow/config.yaml"):
+            err = 2
     else:
         files = [i for i in glob.iglob(f"{directory}/logs/snakemake/*.log*")]
         err_dir = f"{directory}/logs/snakemake/"
-        err_file = "log files"
+        err_file = "There are no log files"
         if not os.path.exists(f"{directory}/logs/snakemake"):
            err = 1
         elif not files:
@@ -54,7 +66,7 @@ def view(directory, snakefile):
     elif err == 2:
         print_error(
             "File not found",
-            f"There are no {err_file} in [blue]{err_dir}[/blue]. Please check that this is the correct folder."
+            f"{err_file} in [blue]{err_dir}[/blue]. Please check that this is the correct folder."
         )
         sys.exit(1)
     # sort and pull only the most recent file (based on modification time)
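
For reviewers who want to try the new mutually-exclusive `--snakefile`/`--config` behavior without a full Harpy checkout, here is a minimal, self-contained sketch of the same click pattern. The command name `view_demo`, the plain `click.echo` error message, and the "would page:" output are illustrative stand-ins only, not Harpy code (Harpy uses its rich-based `print_error` helper and pages the selected file through `less`).

```python
# Minimal standalone sketch of the flag handling introduced in harpy/view.py above.
# Names and messages are illustrative, not Harpy's.
import sys

import click


@click.command(no_args_is_help=True)
@click.option("-s", "--snakefile", is_flag=True, default=False, help="View the snakefile instead")
@click.option("-c", "--config", is_flag=True, default=False, help="View the workflow config file instead")
@click.argument("directory", type=click.Path(exists=True, file_okay=False))
def view_demo(directory, snakefile, config):
    """Show which file of DIRECTORY would be paged."""
    # refuse the ambiguous combination up front, as the diff does
    if snakefile and config:
        click.echo("Please pick one of --snakefile or --config", err=True)
        sys.exit(1)
    if snakefile:
        target = f"{directory}/workflow/*.smk"
    elif config:
        target = f"{directory}/workflow/config.yaml"
    else:
        target = f"{directory}/logs/snakemake/*.log*"
    click.echo(f"would page: {target}")


if __name__ == "__main__":
    view_demo()
```

Saved as, say, `view_demo.py`, running it with both `-s` and `-c` should exit with status 1, while a single flag (or none) prints which file pattern would be paged.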