From 9bffc3d92d8be8f5c6e30a25280ed6eade46738c Mon Sep 17 00:00:00 2001 From: pdimens Date: Mon, 22 Jul 2024 13:31:43 -0400 Subject: [PATCH 01/11] add initial aggregation --- src/harpy/bin/infer_sv.py | 4 +-- src/harpy/snakefiles/sv_naibr.smk | 41 ++++++++++++++++++++++++++++--- 2 files changed, 39 insertions(+), 6 deletions(-) diff --git a/src/harpy/bin/infer_sv.py b/src/harpy/bin/infer_sv.py index 717794312..e6631869f 100755 --- a/src/harpy/bin/infer_sv.py +++ b/src/harpy/bin/infer_sv.py @@ -27,8 +27,7 @@ } if args.failfile is not None: - failout = open(args.failfile, "w", encoding="utf-8") - with open(args.bedfile, "r", encoding="utf-8") as f: + with open(args.bedfile, "r", encoding="utf-8") as f, open(args.failfile, "w", encoding="utf-8") as failout: # first line, the header line = f.readline().strip().split("\t") line_corrected = [i.title().replace(" ", "") for i in line] @@ -49,7 +48,6 @@ sys.stdout.write(NEWROW) else: failout.write(NEWROW) - failout.close() else: with open(args.bedfile, "r", encoding="utf-8") as f: # first line, the header diff --git a/src/harpy/snakefiles/sv_naibr.smk b/src/harpy/snakefiles/sv_naibr.smk index 8255960a5..41511a7c9 100644 --- a/src/harpy/snakefiles/sv_naibr.smk +++ b/src/harpy/snakefiles/sv_naibr.smk @@ -139,7 +139,7 @@ rule infer_sv: refmt = outdir + "/{sample}/{sample}.reformat.bedpe", vcf = outdir + "/{sample}/{sample}.vcf" output: - bedpe = outdir + "/{sample}.bedpe", + bedpe = outdir + "/bedpe/{sample}.bedpe", refmt = outdir + "/IGV/{sample}.reformat.bedpe", fail = outdir + "/filtered/{sample}.fail.bedpe", vcf = outdir + "/vcf/{sample}.vcf" @@ -157,6 +157,40 @@ rule infer_sv: rm -rf {params.outdir} """ +rule merge_variants: + input: + collect(outdir + "/bedpe/{sample}.bedpe", sample = samplenames) + output: + outdir + "/inversions.bedpe", + outdir + "/deletions.bedpe", + outdir + "/duplications.bedpe" + message: + "Aggregating the detected variants" + run: + from pathlib import Path + with open(output[0], "w") as inversions, open(output[1], "w") as deletions, open(output[2], "w") as duplications: + header = ["Sample","Chr1","Break1","Chr2","Break2","SplitMolecules","DiscordantReads","Orientation","Haplotype","Score","PassFilter","SV"] + _ = inversions.write("\t".join(header) + "\n") + _ = deletions.write("\t".join(header) + "\n") + _ = duplications.write("\t".join(header) + "\n") + for varfile in input: + samplename = Path(varfile).stem + with open(varfile, "r") as f_in: + # read the header to skip it + f_in.readline() + # read the rest of it + while True: + line = f_in.readline() + if not line: + break + record = line.rstrip().split("\t") + if record[-1] == "inversion": + _ = inversions.write(f"{samplename}\t{line}") + elif record[-1] == "deletion": + _ = deletions.write(f"{samplename}\t{line}") + elif record[-1] == "duplication": + _ = duplications.write(f"{samplename}\t{line}") + rule genome_link: input: genomefile @@ -202,7 +236,7 @@ rule genome_faidx: rule create_report: input: - bedpe = outdir + "/{sample}.bedpe", + bedpe = outdir + "/bedpe/{sample}.bedpe", fai = f"Genome/{bn}.fai" output: outdir + "/reports/{sample}.naibr.html" @@ -216,7 +250,8 @@ rule create_report: rule workflow_summary: default_target: True input: - bedpe = collect(outdir + "/{sample}.bedpe", sample = samplenames), + bedpe = collect(outdir + "/bedpe/{sample}.bedpe", sample = samplenames), + bedpe_agg = collect(outdir + "/{sv}.bedpe", sv = ["inversions", "deletions","duplications"]) reports = collect(outdir + "/reports/{sample}.naibr.html", sample = samplenames) if not skipreports else [] message: "Summarizing the workflow: {output}" From 4f2bb3ff4093a48f6f143e94c3dc3ec3dd7d1f76 Mon Sep 17 00:00:00 2001 From: pdimens Date: Mon, 22 Jul 2024 13:35:37 -0400 Subject: [PATCH 02/11] fixes --- src/harpy/snakefiles/sv_naibr.smk | 2 +- src/harpy/sv.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/harpy/snakefiles/sv_naibr.smk b/src/harpy/snakefiles/sv_naibr.smk index 41511a7c9..c384256a7 100644 --- a/src/harpy/snakefiles/sv_naibr.smk +++ b/src/harpy/snakefiles/sv_naibr.smk @@ -251,7 +251,7 @@ rule workflow_summary: default_target: True input: bedpe = collect(outdir + "/bedpe/{sample}.bedpe", sample = samplenames), - bedpe_agg = collect(outdir + "/{sv}.bedpe", sv = ["inversions", "deletions","duplications"]) + bedpe_agg = collect(outdir + "/{sv}.bedpe", sv = ["inversions", "deletions","duplications"]), reports = collect(outdir + "/reports/{sample}.naibr.html", sample = samplenames) if not skipreports else [] message: "Summarizing the workflow: {output}" diff --git a/src/harpy/sv.py b/src/harpy/sv.py index fc3c3fc9e..b3269cfd2 100644 --- a/src/harpy/sv.py +++ b/src/harpy/sv.py @@ -122,7 +122,7 @@ def leviathan(inputs, output_dir, genome, min_sv, min_barcodes, iterations, thre if config_only: sys.exit(0) - modetext = "pool-by-group" if populations else "single-sample" + modetext = "pool-by-group" if populations else "sample-by-sample" print_onstart( f"Samples: {n}{popgroupings}\nOutput Directory: {output_dir}/\nMode: {modetext}", "sv leviathan" @@ -222,7 +222,7 @@ def naibr(inputs, output_dir, genome, vcf, min_sv, min_barcodes, threads, popula if populations: modetext = "pool-by-group" else: - modetext = "single-sample" + modetext = "sample-by-sample" if vcf: modetext += " + will be phased" else: From 6c3ab24b33c8a7b062ad7b7f1c84bd8102100c07 Mon Sep 17 00:00:00 2001 From: pdimens Date: Mon, 22 Jul 2024 13:36:53 -0400 Subject: [PATCH 03/11] add naibrphase --- src/harpy/snakefiles/sv_naibr_phase.smk | 39 +++++++++++++++++++++++-- 1 file changed, 37 insertions(+), 2 deletions(-) diff --git a/src/harpy/snakefiles/sv_naibr_phase.smk b/src/harpy/snakefiles/sv_naibr_phase.smk index 866718bc8..cab2a87ea 100644 --- a/src/harpy/snakefiles/sv_naibr_phase.smk +++ b/src/harpy/snakefiles/sv_naibr_phase.smk @@ -259,7 +259,7 @@ rule infer_sv: refmt = outdir + "/{sample}/{sample}.reformat.bedpe", vcf = outdir + "/{sample}/{sample}.vcf" output: - bedpe = outdir + "/{sample}.bedpe", + bedpe = outdir + "/bedpe/{sample}.bedpe", refmt = outdir + "/IGV/{sample}.reformat.bedpe", fail = outdir + "/filtered/{sample}.fail.bedpe", vcf = outdir + "/vcf/{sample}.vcf" @@ -277,6 +277,40 @@ rule infer_sv: rm -rf {params.outdir} """ +rule merge_variants: + input: + collect(outdir + "/bedpe/{sample}.bedpe", sample = samplenames) + output: + outdir + "/inversions.bedpe", + outdir + "/deletions.bedpe", + outdir + "/duplications.bedpe" + message: + "Aggregating the detected variants" + run: + from pathlib import Path + with open(output[0], "w") as inversions, open(output[1], "w") as deletions, open(output[2], "w") as duplications: + header = ["Sample","Chr1","Break1","Chr2","Break2","SplitMolecules","DiscordantReads","Orientation","Haplotype","Score","PassFilter","SV"] + _ = inversions.write("\t".join(header) + "\n") + _ = deletions.write("\t".join(header) + "\n") + _ = duplications.write("\t".join(header) + "\n") + for varfile in input: + samplename = Path(varfile).stem + with open(varfile, "r") as f_in: + # read the header to skip it + f_in.readline() + # read the rest of it + while True: + line = f_in.readline() + if not line: + break + record = line.rstrip().split("\t") + if record[-1] == "inversion": + _ = inversions.write(f"{samplename}\t{line}") + elif record[-1] == "deletion": + _ = deletions.write(f"{samplename}\t{line}") + elif record[-1] == "duplication": + _ = duplications.write(f"{samplename}\t{line}") + rule create_report: input: bedpe = outdir + "/{sample}.bedpe", @@ -293,7 +327,8 @@ rule create_report: rule workflow_summary: default_target: True input: - bedpe = collect(outdir + "/{sample}.bedpe", sample = samplenames), + bedpe = collect(outdir + "/bedpe/{sample}.bedpe", sample = samplenames), + bedpe_agg = collect(outdir + "/{sv}.bedpe", sv = ["inversions", "deletions","duplications"]), phaselog = outdir + "/logs/whatshap-haplotag.log", reports = collect(outdir + "/reports/{sample}.naibr.html", sample = samplenames) if not skipreports else [] message: From 6c9303b6487528817c79678969791c8d1c68fff1 Mon Sep 17 00:00:00 2001 From: pdimens Date: Mon, 22 Jul 2024 13:41:53 -0400 Subject: [PATCH 04/11] update to aggregate --- src/harpy/snakefiles/sv_naibr.smk | 2 +- src/harpy/snakefiles/sv_naibr_phase.smk | 2 +- src/harpy/snakefiles/sv_naibr_pop.smk | 41 +++++++++++++++++++-- src/harpy/snakefiles/sv_naibr_pop_phase.smk | 41 +++++++++++++++++++-- 4 files changed, 78 insertions(+), 8 deletions(-) diff --git a/src/harpy/snakefiles/sv_naibr.smk b/src/harpy/snakefiles/sv_naibr.smk index c384256a7..0e2055854 100644 --- a/src/harpy/snakefiles/sv_naibr.smk +++ b/src/harpy/snakefiles/sv_naibr.smk @@ -141,7 +141,7 @@ rule infer_sv: output: bedpe = outdir + "/bedpe/{sample}.bedpe", refmt = outdir + "/IGV/{sample}.reformat.bedpe", - fail = outdir + "/filtered/{sample}.fail.bedpe", + fail = outdir + "/bedpe/qc_fail/{sample}.fail.bedpe", vcf = outdir + "/vcf/{sample}.vcf" params: outdir = lambda wc: outdir + "/" + wc.get("sample") diff --git a/src/harpy/snakefiles/sv_naibr_phase.smk b/src/harpy/snakefiles/sv_naibr_phase.smk index cab2a87ea..d99854662 100644 --- a/src/harpy/snakefiles/sv_naibr_phase.smk +++ b/src/harpy/snakefiles/sv_naibr_phase.smk @@ -261,7 +261,7 @@ rule infer_sv: output: bedpe = outdir + "/bedpe/{sample}.bedpe", refmt = outdir + "/IGV/{sample}.reformat.bedpe", - fail = outdir + "/filtered/{sample}.fail.bedpe", + fail = outdir + "/bedpe/qc_fail/{sample}.fail.bedpe", vcf = outdir + "/vcf/{sample}.vcf" params: outdir = lambda wc: outdir + "/" + wc.get("sample") diff --git a/src/harpy/snakefiles/sv_naibr_pop.smk b/src/harpy/snakefiles/sv_naibr_pop.smk index 7d4c699e9..edebfe5a9 100644 --- a/src/harpy/snakefiles/sv_naibr_pop.smk +++ b/src/harpy/snakefiles/sv_naibr_pop.smk @@ -169,9 +169,9 @@ rule infer_sv: refmt = outdir + "/{population}/{population}.reformat.bedpe", vcf = outdir + "/{population}/{population}.vcf" output: - bedpe = outdir + "/{population}.bedpe", + bedpe = outdir + "/bedpe/{population}.bedpe", refmt = outdir + "/IGV/{population}.reformat.bedpe", - fail = outdir + "/filtered/{population}.fail.bedpe", + fail = outdir + "/bedpe/qc_fail/{population}.fail.bedpe", vcf = outdir + "/vcf/{population}.vcf" params: outdir = lambda wc: outdir + "/" + wc.get("population") @@ -187,6 +187,40 @@ rule infer_sv: rm -rf {params.outdir} """ +rule merge_variants: + input: + collect(outdir + "/bedpe/{population}.bedpe", population = populations) + output: + outdir + "/inversions.bedpe", + outdir + "/deletions.bedpe", + outdir + "/duplications.bedpe" + message: + "Aggregating the detected variants" + run: + from pathlib import Path + with open(output[0], "w") as inversions, open(output[1], "w") as deletions, open(output[2], "w") as duplications: + header = ["Population","Chr1","Break1","Chr2","Break2","SplitMolecules","DiscordantReads","Orientation","Haplotype","Score","PassFilter","SV"] + _ = inversions.write("\t".join(header) + "\n") + _ = deletions.write("\t".join(header) + "\n") + _ = duplications.write("\t".join(header) + "\n") + for varfile in input: + samplename = Path(varfile).stem + with open(varfile, "r") as f_in: + # read the header to skip it + f_in.readline() + # read the rest of it + while True: + line = f_in.readline() + if not line: + break + record = line.rstrip().split("\t") + if record[-1] == "inversion": + _ = inversions.write(f"{samplename}\t{line}") + elif record[-1] == "deletion": + _ = deletions.write(f"{samplename}\t{line}") + elif record[-1] == "duplication": + _ = duplications.write(f"{samplename}\t{line}") + rule genome_link: input: genomefile @@ -252,7 +286,8 @@ rule sv_report_aggregate: rule workflow_summary: default_target: True input: - bedpe = collect(outdir + "/{pop}.bedpe", pop = populations), + bedpe = collect(outdir + "/bedpe/{pop}.bedpe", pop = populations), + bedpe_agg = collect(outdir + "/{sv}.bedpe", sv = ["inversions", "deletions","duplications"]), reports = collect(outdir + "/reports/{pop}.naibr.html", pop = populations) if not skipreports else [], agg_report = outdir + "/reports/naibr.pop.summary.html" if not skipreports else [] message: diff --git a/src/harpy/snakefiles/sv_naibr_pop_phase.smk b/src/harpy/snakefiles/sv_naibr_pop_phase.smk index 079ad5d89..40a50a6f6 100644 --- a/src/harpy/snakefiles/sv_naibr_pop_phase.smk +++ b/src/harpy/snakefiles/sv_naibr_pop_phase.smk @@ -302,9 +302,9 @@ rule infer_sv: refmt = outdir + "/{population}/{population}.reformat.bedpe", vcf = outdir + "/{population}/{population}.vcf" output: - bedpe = outdir + "/{population}.bedpe", + bedpe = outdir + "/bedpe/{population}.bedpe", refmt = outdir + "/IGV/{population}.reformat.bedpe", - fail = outdir + "/filtered/{population}.fail.bedpe", + fail = outdir + "/bedpe/qc_fail/{population}.fail.bedpe", vcf = outdir + "/vcf/{population}.vcf" params: outdir = lambda wc: outdir + "/" + wc.get("population") @@ -320,6 +320,40 @@ rule infer_sv: rm -rf {params.outdir} """ +rule merge_variants: + input: + collect(outdir + "/bedpe/{population}.bedpe", population = populations) + output: + outdir + "/inversions.bedpe", + outdir + "/deletions.bedpe", + outdir + "/duplications.bedpe" + message: + "Aggregating the detected variants" + run: + from pathlib import Path + with open(output[0], "w") as inversions, open(output[1], "w") as deletions, open(output[2], "w") as duplications: + header = ["Population","Chr1","Break1","Chr2","Break2","SplitMolecules","DiscordantReads","Orientation","Haplotype","Score","PassFilter","SV"] + _ = inversions.write("\t".join(header) + "\n") + _ = deletions.write("\t".join(header) + "\n") + _ = duplications.write("\t".join(header) + "\n") + for varfile in input: + samplename = Path(varfile).stem + with open(varfile, "r") as f_in: + # read the header to skip it + f_in.readline() + # read the rest of it + while True: + line = f_in.readline() + if not line: + break + record = line.rstrip().split("\t") + if record[-1] == "inversion": + _ = inversions.write(f"{samplename}\t{line}") + elif record[-1] == "deletion": + _ = deletions.write(f"{samplename}\t{line}") + elif record[-1] == "duplication": + _ = duplications.write(f"{samplename}\t{line}") + rule infer_sv_report: input: fai = f"Genome/{bn}.fai", @@ -349,7 +383,8 @@ rule sv_report_aggregate: rule workflow_summary: default_target: True input: - bedpe = collect(outdir + "/{pop}.bedpe", pop = populations), + bedpe = collect(outdir + "/bedpe/{pop}.bedpe", pop = populations), + bedpe_agg = collect(outdir + "/{sv}.bedpe", sv = ["inversions", "deletions","duplications"]), phaselog = outdir + "/logs/whatshap-haplotag.log", reports = collect(outdir + "/reports/{pop}.naibr.html", pop = populations) if not skipreports else [], agg_report = outdir + "/reports/naibr.pop.summary.html" if not skipreports else [] From 06f0468605ce83a0c7ad4b7fcf4176fb3e30ccd4 Mon Sep 17 00:00:00 2001 From: pdimens Date: Mon, 22 Jul 2024 13:59:45 -0400 Subject: [PATCH 05/11] include aggregation --- src/harpy/snakefiles/sv_leviathan.smk | 48 ++++++++++++++++++--- src/harpy/snakefiles/sv_leviathan_pop.smk | 52 +++++++++++++++++++---- 2 files changed, 86 insertions(+), 14 deletions(-) diff --git a/src/harpy/snakefiles/sv_leviathan.smk b/src/harpy/snakefiles/sv_leviathan.smk index 3c1d49421..7210c72c7 100644 --- a/src/harpy/snakefiles/sv_leviathan.smk +++ b/src/harpy/snakefiles/sv_leviathan.smk @@ -156,7 +156,7 @@ rule call_sv: genome = f"Genome/{bn}", genidx = multiext(f"Genome/{bn}", ".fai", ".ann", ".bwt", ".pac", ".sa", ".amb") output: - temp(outdir + "/{sample}.vcf") + temp(outdir + "/vcf/{sample}.vcf") log: runlog = outdir + "/logs/{sample}.leviathan.log", candidates = outdir + "/logs/{sample}.candidates" @@ -178,9 +178,9 @@ rule call_sv: rule sort_bcf: input: - outdir + "/{sample}.vcf" + outdir + "/vcf/{sample}.vcf" output: - outdir + "/{sample}.bcf" + outdir + "/vcf/{sample}.bcf" params: "{wildcards.sample}" container: @@ -192,9 +192,9 @@ rule sort_bcf: rule sv_stats: input: - outdir + "/{sample}.bcf" + outdir + "/vcf/{sample}.bcf" output: - outdir + "/reports/data/{sample}.sv.stats" + temp(outdir + "/reports/data/{sample}.sv.stats") container: None message: @@ -205,6 +205,41 @@ rule sv_stats: bcftools query -f '{wildcards.sample}\\t%CHROM\\t%POS\\t%END\\t%SVLEN\\t%SVTYPE\\t%BARCODES\\t%PAIRS\\n' {input} >> {output} """ +rule merge_variants: + input: + collect(outdir + "/reports/data/{sample}.sv.stats", sample = samplenames) + output: + outdir + "/inversions.bedpe", + outdir + "/deletions.bedpe", + outdir + "/duplications.bedpe", + outdir + "/breakends.bedpe" + message: + "Aggregating the detected variants" + run: + with open(output[0], "w") as inversions, open(output[1], "w") as deletions, open(output[2], "w") as duplications, open(output[3], "w") as breakends: + header = ["sample","contig","position_start","position_end","length","type","n_barcodes","n_pairs"] + _ = inversions.write("\t".join(header) + "\n") + _ = deletions.write("\t".join(header) + "\n") + _ = duplications.write("\t".join(header) + "\n") + _ = breakends.write("\t".join(header) + "\n") + for varfile in input: + with open(varfile, "r") as f_in: + # skip header + f_in.readline() + while True: + line = f_in.readline() + if not line: + break + record = line.rstrip().split("\t") + if record[5] == "INV": + _ = inversions.write(line) + elif record[5] == "DEL": + _ = deletions.write(line) + elif record[5] == "DUP": + _ = duplications.write(line) + elif record[5] == "BND": + _ = breakends.write(line) + rule sv_report: input: faidx = f"Genome/{bn}.fai", @@ -222,7 +257,8 @@ rule sv_report: rule workflow_summary: default_target: True input: - vcf = collect(outdir + "/{sample}.bcf", sample = samplenames), + vcf = collect(outdir + "/vcf/{sample}.bcf", sample = samplenames), + bedpe_agg = collect(outdir + "/{sv}.bedpe", sv = ["inversions", "deletions","duplications", "breakends"]), reports = collect(outdir + "/reports/{sample}.SV.html", sample = samplenames) if not skipreports else [] params: min_sv = f"-v {min_sv}", diff --git a/src/harpy/snakefiles/sv_leviathan_pop.smk b/src/harpy/snakefiles/sv_leviathan_pop.smk index cd5afd4a2..710b2fe4a 100644 --- a/src/harpy/snakefiles/sv_leviathan_pop.smk +++ b/src/harpy/snakefiles/sv_leviathan_pop.smk @@ -182,7 +182,7 @@ rule call_sv: genome = f"Genome/{bn}", genidx = multiext(f"Genome/{bn}", ".fai", ".ann", ".bwt", ".pac", ".sa", ".amb") output: - temp(outdir + "/{population}.vcf") + temp(outdir + "/vcf/{population}.vcf") log: runlog = outdir + "/logs/{population}.leviathan.log", candidates = outdir + "/logs/{population}.candidates" @@ -196,7 +196,7 @@ rule call_sv: conda: f"{envdir}/sv.yaml" message: - "Calling variants: Population {wildcards.population}" + "Calling variants: {wildcards.population}" benchmark: ".Benchmark/leviathan-pop/{population}.variantcall" shell: @@ -204,9 +204,9 @@ rule call_sv: rule sort_bcf: input: - outdir + "/{population}.vcf" + outdir + "/vcf/{population}.vcf" output: - outdir + "/{population}.bcf" + outdir + "/vcf/{population}.bcf" params: "{wildcards.population}" container: @@ -218,9 +218,9 @@ rule sort_bcf: rule sv_stats: input: - outdir + "/{population}.bcf" + outdir + "/vcf/{population}.bcf" output: - outdir + "/reports/data/{population}.sv.stats" + temp(outdir + "/reports/data/{population}.sv.stats") container: None message: @@ -231,10 +231,45 @@ rule sv_stats: bcftools query -f '{wildcards.population}\\t%CHROM\\t%POS\\t%END\\t%SVLEN\\t%SVTYPE\\t%BARCODES\\t%PAIRS\\n' {input} >> {output} """ +rule merge_variants: + input: + collect(outdir + "/reports/data/{population}.sv.stats", population = populations) + output: + outdir + "/inversions.bedpe", + outdir + "/deletions.bedpe", + outdir + "/duplications.bedpe", + outdir + "/breakends.bedpe" + message: + "Aggregating the detected variants" + run: + with open(output[0], "w") as inversions, open(output[1], "w") as deletions, open(output[2], "w") as duplications, open(output[3], "w") as breakends: + header = ["population","contig","position_start","position_end","length","type","n_barcodes","n_pairs"] + _ = inversions.write("\t".join(header) + "\n") + _ = deletions.write("\t".join(header) + "\n") + _ = duplications.write("\t".join(header) + "\n") + _ = breakends.write("\t".join(header) + "\n") + for varfile in input: + with open(varfile, "r") as f_in: + # skip header + f_in.readline() + while True: + line = f_in.readline() + if not line: + break + record = line.rstrip().split("\t") + if record[5] == "INV": + _ = inversions.write(line) + elif record[5] == "DEL": + _ = deletions.write(line) + elif record[5] == "DUP": + _ = duplications.write(line) + elif record[5] == "BND": + _ = breakends.write(line) + rule sv_report: input: statsfile = outdir + "/reports/data/{population}.sv.stats", - bcf = outdir + "/{population}.bcf", + bcf = outdir + "/vcf/{population}.bcf", faidx = f"Genome/{bn}.fai" output: outdir + "/reports/{population}.sv.html" @@ -261,7 +296,8 @@ rule sv_report_aggregate: rule workflow_summary: default_target: True input: - vcf = collect(outdir + "/{pop}.bcf", pop = populations), + vcf = collect(outdir + "/vcf/{pop}.bcf", pop = populations), + bedpe_agg = collect(outdir + "/{sv}.bedpe", sv = ["inversions", "deletions","duplications", "breakends"]), reports = collect(outdir + "/reports/{pop}.sv.html", pop = populations) if not skipreports else [], agg_report = outdir + "/reports/leviathan.summary.html" if not skipreports else [] message: From 567719f2d59b05ddb12cc4d558efc01b04162c62 Mon Sep 17 00:00:00 2001 From: pdimens Date: Mon, 22 Jul 2024 14:38:46 -0400 Subject: [PATCH 06/11] filename fixes --- src/harpy/snakefiles/sv_naibr_pop.smk | 4 ++-- src/harpy/snakefiles/sv_naibr_pop_phase.smk | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/harpy/snakefiles/sv_naibr_pop.smk b/src/harpy/snakefiles/sv_naibr_pop.smk index edebfe5a9..2a6e8054c 100644 --- a/src/harpy/snakefiles/sv_naibr_pop.smk +++ b/src/harpy/snakefiles/sv_naibr_pop.smk @@ -260,7 +260,7 @@ rule genome_faidx: rule sv_report: input: fai = f"Genome/{bn}.fai", - bedpe = outdir + "/{population}.bedpe" + bedpe = outdir + "/bedpe/{population}.bedpe" output: outdir + "/reports/{population}.naibr.html" conda: @@ -273,7 +273,7 @@ rule sv_report: rule sv_report_aggregate: input: fai = f"Genome/{bn}.fai", - bedpe = collect(outdir + "/{pop}.bedpe", pop = populations) + bedpe = collect(outdir + "/bedpe/{pop}.bedpe", pop = populations) output: outdir + "/reports/naibr.pop.summary.html" conda: diff --git a/src/harpy/snakefiles/sv_naibr_pop_phase.smk b/src/harpy/snakefiles/sv_naibr_pop_phase.smk index 40a50a6f6..037fe931e 100644 --- a/src/harpy/snakefiles/sv_naibr_pop_phase.smk +++ b/src/harpy/snakefiles/sv_naibr_pop_phase.smk @@ -357,7 +357,7 @@ rule merge_variants: rule infer_sv_report: input: fai = f"Genome/{bn}.fai", - bedpe = outdir + "/{population}.bedpe" + bedpe = outdir + "/bedpe/{population}.bedpe" output: outdir + "/reports/{population}.naibr.html" message: @@ -370,7 +370,7 @@ rule infer_sv_report: rule sv_report_aggregate: input: fai = f"Genome/{bn}.fai", - bedpe = collect(outdir + "/{pop}.bedpe", pop = populations) + bedpe = collect(outdir + "/bedpe/{pop}.bedpe", pop = populations) output: outdir + "/reports/naibr.pop.summary.html" message: From 2336594f2447f71f979b4fb559b7c8a72c68d66c Mon Sep 17 00:00:00 2001 From: pdimens Date: Mon, 22 Jul 2024 15:07:07 -0400 Subject: [PATCH 07/11] rm dep --- src/harpy/reports/naibr.Rmd | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/harpy/reports/naibr.Rmd b/src/harpy/reports/naibr.Rmd index 5a9354b8e..03cdaa9b5 100644 --- a/src/harpy/reports/naibr.Rmd +++ b/src/harpy/reports/naibr.Rmd @@ -108,8 +108,6 @@ summstats <- sv %>% group_by(Chr1, SV) %>% summarise(n = n()) -#svlen <- sv %>% group_by(SV) %>% summarise(mlen = mean(Length), sdlen = sd(Length)) - sumtable <- summstats %>% group_by(SV) %>% summarise(count = sum(n)) From 7a3b93cbeffd541aa95cdf5c9810762a7e2d3f43 Mon Sep 17 00:00:00 2001 From: pdimens Date: Mon, 22 Jul 2024 15:07:18 -0400 Subject: [PATCH 08/11] make sure not to plot chimeras --- src/harpy/reports/naibr_pop.Rmd | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/harpy/reports/naibr_pop.Rmd b/src/harpy/reports/naibr_pop.Rmd index 4553194e1..0aa9da103 100644 --- a/src/harpy/reports/naibr_pop.Rmd +++ b/src/harpy/reports/naibr_pop.Rmd @@ -279,10 +279,10 @@ fa.sizes <- fa.sizes[fa.sizes$contig %in% unique(sv$Chr1), 1:2] fa.sizes <- fa.sizes[1:min(nrow(fa.sizes), 30), ] # filter out variants that aren't on one of these contigs -sv <- sv[sv$Chr1 %in% fa.sizes$contig, ] -sv$Chr1 <- factor(sv$Chr1) +sv_clean <- sv_clean[sv_clean$Chr1 %in% fa.sizes$contig, ] +sv_clean$Chr1 <- factor(sv_clean$Chr1) -populations <- levels(sv$Population) +populations <- levels(sv_clean$Population) color.palette <- turbo(length(populations)) names(color.palette) <- populations @@ -322,7 +322,7 @@ tracks <- BioCircosTracklist() variant_type <- "inversion" # Add one track for each population for( pop in populations ){ - variant <- sv[(sv$SV == variant_type) & (sv$Population == pop),] + variant <- sv_clean[(sv_clean$SV == variant_type) & (sv_clean$Population == pop),] if(nrow(variant) > 0){ tracks <- tracks + BioCircosArcTrack( paste0(variant_type, pop), @@ -372,7 +372,7 @@ tracks <- BioCircosTracklist() variant_type <- "duplication" # Add one track for each population for( pop in populations ){ - variant <- sv[(sv$SV == variant_type) & (sv$Population == pop),] + variant <- sv_clean[(sv_clean$SV == variant_type) & (sv_clean$Population == pop),] if(nrow(variant) > 0){ tracks <- tracks + BioCircosArcTrack( paste0(variant_type, pop), @@ -420,7 +420,7 @@ tracks <- BioCircosTracklist() variant_type <- "deletion" # Add one track for each population for( pop in populations ){ - variant <- sv[(sv$SV == variant_type) & (sv$Population == pop),] + variant <- sv_clean[(sv_clean$SV == variant_type) & (sv_clean$Population == pop),] if(nrow(variant) > 0){ tracks <- tracks + BioCircosSNPTrack( paste0(variant_type, pop), From 3fb753c4a20f0c0b81c4113d0e9fbb0b96625245 Mon Sep 17 00:00:00 2001 From: pdimens Date: Mon, 22 Jul 2024 15:09:01 -0400 Subject: [PATCH 09/11] fix path --- src/harpy/snakefiles/sv_naibr_phase.smk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/harpy/snakefiles/sv_naibr_phase.smk b/src/harpy/snakefiles/sv_naibr_phase.smk index d99854662..8540ece7e 100644 --- a/src/harpy/snakefiles/sv_naibr_phase.smk +++ b/src/harpy/snakefiles/sv_naibr_phase.smk @@ -313,7 +313,7 @@ rule merge_variants: rule create_report: input: - bedpe = outdir + "/{sample}.bedpe", + bedpe = outdir + "/bedpe/{sample}.bedpe", fai = f"Genome/{validgenome}.fai" output: outdir + "/reports/{sample}.naibr.html" From b7668ae3508570439b553d2cee7e90ca6c9fdbcf Mon Sep 17 00:00:00 2001 From: pdimens Date: Mon, 22 Jul 2024 15:45:35 -0400 Subject: [PATCH 10/11] expand and fix text --- src/harpy/reports/impute.Rmd | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/src/harpy/reports/impute.Rmd b/src/harpy/reports/impute.Rmd index be9b4c387..4dd405c5a 100644 --- a/src/harpy/reports/impute.Rmd +++ b/src/harpy/reports/impute.Rmd @@ -236,12 +236,16 @@ the reference allele (from the reference genome), `Alt` refers to the alternativ allele, and `Missing` details how many SNPs remain missing (failed to impute) for that individual. As an example, the `Homozygous Ref` is the number of missing genotypes that were imputed into -a homozygous genotype for the reference allele. The table on the right is the -underlying data used to create the visualization. Clicking -on a plot will expand it to fill your browser window. Clicking it again will exit -out of the zoomed view. +a homozygous genotype for the reference allele. The table on the other tab is the +underlying data used to create the visualization. + +```{r} +n_samples <- length(unique(gcts_long$sample)) +plotheight <- 150 + (15 * n_samples) +figheight <- 1 + (0.2 * n_samples) +``` -## ind stats plt +## ind stats plt {.tabset data-height=plotheight} ### Individuals Plot {.no-title} ```{r preprocess fields, echo = FALSE, message = FALSE, warning = FALSE, out.width = "100%"} gcts_long$Conversion <- gsub("AltHom","Homozygous Alt", gcts_long$Conversion) @@ -249,7 +253,7 @@ gcts_long$Conversion <- gsub("Het","Heterozygote", gcts_long$Conversion) gcts_long$Conversion <- gsub("RefHom","Homozygous Ref", gcts_long$Conversion) gcts_long$Conversion <- gsub("missing","not imputed", gcts_long$Conversion) ``` -```{r percontig dist, echo = FALSE, message = FALSE, warning = FALSE, out.width = "100%"} +```{r per_indiv, echo = FALSE, message = FALSE, warning = FALSE,fig. height=figheight, out.width = "100%"} hchart(gcts_long, "bar", hcaes(x = sample, y = Count, group = Conversion), stacking = "normal", animation = F) |> hc_colors(c("#7cb3d4", "#b18ad1", "#ffd75f", "#6f6c70")) |> hc_title(text = "Missing Genotype Imputations") |> From 01b1f6c5bfd06962f5d2b63c88a60e1dae713779 Mon Sep 17 00:00:00 2001 From: pdimens Date: Mon, 22 Jul 2024 16:13:28 -0400 Subject: [PATCH 11/11] fix this silly typo --- src/harpy/reports/impute.Rmd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/harpy/reports/impute.Rmd b/src/harpy/reports/impute.Rmd index 4dd405c5a..b1f1ad087 100644 --- a/src/harpy/reports/impute.Rmd +++ b/src/harpy/reports/impute.Rmd @@ -253,7 +253,7 @@ gcts_long$Conversion <- gsub("Het","Heterozygote", gcts_long$Conversion) gcts_long$Conversion <- gsub("RefHom","Homozygous Ref", gcts_long$Conversion) gcts_long$Conversion <- gsub("missing","not imputed", gcts_long$Conversion) ``` -```{r per_indiv, echo = FALSE, message = FALSE, warning = FALSE,fig. height=figheight, out.width = "100%"} +```{r per_indiv, echo = FALSE, message = FALSE, warning = FALSE, fig.height=figheight, out.width = "100%"} hchart(gcts_long, "bar", hcaes(x = sample, y = Count, group = Conversion), stacking = "normal", animation = F) |> hc_colors(c("#7cb3d4", "#b18ad1", "#ffd75f", "#6f6c70")) |> hc_title(text = "Missing Genotype Imputations") |>