From 9bffc3d92d8be8f5c6e30a25280ed6eade46738c Mon Sep 17 00:00:00 2001
From: pdimens <pdimens@live.com>
Date: Mon, 22 Jul 2024 13:31:43 -0400
Subject: [PATCH 01/11] add initial aggregation

---
 src/harpy/bin/infer_sv.py         |  4 +--
 src/harpy/snakefiles/sv_naibr.smk | 41 ++++++++++++++++++++++++++++---
 2 files changed, 39 insertions(+), 6 deletions(-)

diff --git a/src/harpy/bin/infer_sv.py b/src/harpy/bin/infer_sv.py
index 717794312..e6631869f 100755
--- a/src/harpy/bin/infer_sv.py
+++ b/src/harpy/bin/infer_sv.py
@@ -27,8 +27,7 @@
     }
 
 if args.failfile is not None:
-    failout = open(args.failfile, "w", encoding="utf-8")
-    with open(args.bedfile, "r", encoding="utf-8") as f:
+    with open(args.bedfile, "r", encoding="utf-8") as f, open(args.failfile, "w", encoding="utf-8") as failout:
         # first line, the header
         line = f.readline().strip().split("\t")
         line_corrected = [i.title().replace(" ", "") for i in line]
@@ -49,7 +48,6 @@
                 sys.stdout.write(NEWROW)
             else:
                 failout.write(NEWROW)
-    failout.close()
 else:
     with open(args.bedfile, "r", encoding="utf-8") as f:
         # first line, the header
diff --git a/src/harpy/snakefiles/sv_naibr.smk b/src/harpy/snakefiles/sv_naibr.smk
index 8255960a5..41511a7c9 100644
--- a/src/harpy/snakefiles/sv_naibr.smk
+++ b/src/harpy/snakefiles/sv_naibr.smk
@@ -139,7 +139,7 @@ rule infer_sv:
         refmt = outdir + "/{sample}/{sample}.reformat.bedpe",
         vcf   = outdir + "/{sample}/{sample}.vcf"
     output:
-        bedpe = outdir + "/{sample}.bedpe",
+        bedpe = outdir + "/bedpe/{sample}.bedpe",
         refmt = outdir + "/IGV/{sample}.reformat.bedpe",
         fail  = outdir + "/filtered/{sample}.fail.bedpe",
         vcf   = outdir + "/vcf/{sample}.vcf" 
@@ -157,6 +157,40 @@ rule infer_sv:
         rm -rf {params.outdir}
         """
 
+rule merge_variants:
+    input:
+        collect(outdir + "/bedpe/{sample}.bedpe", sample = samplenames)
+    output:
+        outdir + "/inversions.bedpe",
+        outdir + "/deletions.bedpe",
+        outdir + "/duplications.bedpe"
+    message:
+        "Aggregating the detected variants"
+    run:
+        from pathlib import Path
+        with open(output[0], "w") as inversions, open(output[1], "w") as deletions, open(output[2], "w") as duplications:
+            header = ["Sample","Chr1","Break1","Chr2","Break2","SplitMolecules","DiscordantReads","Orientation","Haplotype","Score","PassFilter","SV"]
+            _ = inversions.write("\t".join(header) + "\n")
+            _ = deletions.write("\t".join(header) + "\n")
+            _ = duplications.write("\t".join(header) + "\n")
+            for varfile in input:
+                samplename = Path(varfile).stem
+                with open(varfile, "r") as f_in:
+                    # read the header to skip it
+                    f_in.readline()
+                    # read the rest of it
+                    while True:
+                        line = f_in.readline()
+                        if not line:
+                            break
+                        record = line.rstrip().split("\t")
+                        if record[-1] == "inversion":
+                            _ = inversions.write(f"{samplename}\t{line}")
+                        elif record[-1] == "deletion":
+                            _ = deletions.write(f"{samplename}\t{line}")
+                        elif record[-1] == "duplication":
+                            _ = duplications.write(f"{samplename}\t{line}")
+
 rule genome_link:
     input:
         genomefile
@@ -202,7 +236,7 @@ rule genome_faidx:
 
 rule create_report:
     input:
-        bedpe = outdir + "/{sample}.bedpe",
+        bedpe = outdir + "/bedpe/{sample}.bedpe",
         fai   = f"Genome/{bn}.fai"
     output:
         outdir + "/reports/{sample}.naibr.html"
@@ -216,7 +250,8 @@ rule create_report:
 rule workflow_summary:
     default_target: True
     input:
-        bedpe = collect(outdir + "/{sample}.bedpe", sample = samplenames),
+        bedpe = collect(outdir + "/bedpe/{sample}.bedpe", sample = samplenames),
+        bedpe_agg = collect(outdir + "/{sv}.bedpe", sv = ["inversions", "deletions","duplications"])
         reports =  collect(outdir + "/reports/{sample}.naibr.html", sample = samplenames) if not skipreports else []
     message:
         "Summarizing the workflow: {output}"

From 4f2bb3ff4093a48f6f143e94c3dc3ec3dd7d1f76 Mon Sep 17 00:00:00 2001
From: pdimens <pdimens@live.com>
Date: Mon, 22 Jul 2024 13:35:37 -0400
Subject: [PATCH 02/11] fixes

---
 src/harpy/snakefiles/sv_naibr.smk | 2 +-
 src/harpy/sv.py                   | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/harpy/snakefiles/sv_naibr.smk b/src/harpy/snakefiles/sv_naibr.smk
index 41511a7c9..c384256a7 100644
--- a/src/harpy/snakefiles/sv_naibr.smk
+++ b/src/harpy/snakefiles/sv_naibr.smk
@@ -251,7 +251,7 @@ rule workflow_summary:
     default_target: True
     input:
         bedpe = collect(outdir + "/bedpe/{sample}.bedpe", sample = samplenames),
-        bedpe_agg = collect(outdir + "/{sv}.bedpe", sv = ["inversions", "deletions","duplications"])
+        bedpe_agg = collect(outdir + "/{sv}.bedpe", sv = ["inversions", "deletions","duplications"]),
         reports =  collect(outdir + "/reports/{sample}.naibr.html", sample = samplenames) if not skipreports else []
     message:
         "Summarizing the workflow: {output}"
diff --git a/src/harpy/sv.py b/src/harpy/sv.py
index fc3c3fc9e..b3269cfd2 100644
--- a/src/harpy/sv.py
+++ b/src/harpy/sv.py
@@ -122,7 +122,7 @@ def leviathan(inputs, output_dir, genome, min_sv, min_barcodes, iterations, thre
     if config_only:
         sys.exit(0)
 
-    modetext = "pool-by-group" if populations else "single-sample"
+    modetext = "pool-by-group" if populations else "sample-by-sample"
     print_onstart(
         f"Samples: {n}{popgroupings}\nOutput Directory: {output_dir}/\nMode: {modetext}",
         "sv leviathan"
@@ -222,7 +222,7 @@ def naibr(inputs, output_dir, genome, vcf, min_sv, min_barcodes, threads, popula
     if populations:
         modetext = "pool-by-group"
     else:
-        modetext = "single-sample"
+        modetext = "sample-by-sample"
     if vcf:
         modetext += " + will be phased"
     else:

From 6c3ab24b33c8a7b062ad7b7f1c84bd8102100c07 Mon Sep 17 00:00:00 2001
From: pdimens <pdimens@live.com>
Date: Mon, 22 Jul 2024 13:36:53 -0400
Subject: [PATCH 03/11] add naibrphase

---
 src/harpy/snakefiles/sv_naibr_phase.smk | 39 +++++++++++++++++++++++--
 1 file changed, 37 insertions(+), 2 deletions(-)

diff --git a/src/harpy/snakefiles/sv_naibr_phase.smk b/src/harpy/snakefiles/sv_naibr_phase.smk
index 866718bc8..cab2a87ea 100644
--- a/src/harpy/snakefiles/sv_naibr_phase.smk
+++ b/src/harpy/snakefiles/sv_naibr_phase.smk
@@ -259,7 +259,7 @@ rule infer_sv:
         refmt = outdir + "/{sample}/{sample}.reformat.bedpe",
         vcf   = outdir + "/{sample}/{sample}.vcf"
     output:
-        bedpe = outdir + "/{sample}.bedpe",
+        bedpe = outdir + "/bedpe/{sample}.bedpe",
         refmt = outdir + "/IGV/{sample}.reformat.bedpe",
         fail  = outdir + "/filtered/{sample}.fail.bedpe",
         vcf   = outdir + "/vcf/{sample}.vcf" 
@@ -277,6 +277,40 @@ rule infer_sv:
         rm -rf {params.outdir}
         """
 
+rule merge_variants:
+    input:
+        collect(outdir + "/bedpe/{sample}.bedpe", sample = samplenames)
+    output:
+        outdir + "/inversions.bedpe",
+        outdir + "/deletions.bedpe",
+        outdir + "/duplications.bedpe"
+    message:
+        "Aggregating the detected variants"
+    run:
+        from pathlib import Path
+        with open(output[0], "w") as inversions, open(output[1], "w") as deletions, open(output[2], "w") as duplications:
+            header = ["Sample","Chr1","Break1","Chr2","Break2","SplitMolecules","DiscordantReads","Orientation","Haplotype","Score","PassFilter","SV"]
+            _ = inversions.write("\t".join(header) + "\n")
+            _ = deletions.write("\t".join(header) + "\n")
+            _ = duplications.write("\t".join(header) + "\n")
+            for varfile in input:
+                samplename = Path(varfile).stem
+                with open(varfile, "r") as f_in:
+                    # read the header to skip it
+                    f_in.readline()
+                    # read the rest of it
+                    while True:
+                        line = f_in.readline()
+                        if not line:
+                            break
+                        record = line.rstrip().split("\t")
+                        if record[-1] == "inversion":
+                            _ = inversions.write(f"{samplename}\t{line}")
+                        elif record[-1] == "deletion":
+                            _ = deletions.write(f"{samplename}\t{line}")
+                        elif record[-1] == "duplication":
+                            _ = duplications.write(f"{samplename}\t{line}")
+
 rule create_report:
     input:
         bedpe = outdir + "/{sample}.bedpe",
@@ -293,7 +327,8 @@ rule create_report:
 rule workflow_summary:
     default_target: True
     input:
-        bedpe = collect(outdir + "/{sample}.bedpe", sample = samplenames),
+        bedpe = collect(outdir + "/bedpe/{sample}.bedpe", sample = samplenames),
+        bedpe_agg = collect(outdir + "/{sv}.bedpe", sv = ["inversions", "deletions","duplications"]),
         phaselog = outdir + "/logs/whatshap-haplotag.log",
         reports =  collect(outdir + "/reports/{sample}.naibr.html", sample = samplenames) if not skipreports else []
     message:

From 6c9303b6487528817c79678969791c8d1c68fff1 Mon Sep 17 00:00:00 2001
From: pdimens <pdimens@live.com>
Date: Mon, 22 Jul 2024 13:41:53 -0400
Subject: [PATCH 04/11] update to aggregate

---
 src/harpy/snakefiles/sv_naibr.smk           |  2 +-
 src/harpy/snakefiles/sv_naibr_phase.smk     |  2 +-
 src/harpy/snakefiles/sv_naibr_pop.smk       | 41 +++++++++++++++++++--
 src/harpy/snakefiles/sv_naibr_pop_phase.smk | 41 +++++++++++++++++++--
 4 files changed, 78 insertions(+), 8 deletions(-)

diff --git a/src/harpy/snakefiles/sv_naibr.smk b/src/harpy/snakefiles/sv_naibr.smk
index c384256a7..0e2055854 100644
--- a/src/harpy/snakefiles/sv_naibr.smk
+++ b/src/harpy/snakefiles/sv_naibr.smk
@@ -141,7 +141,7 @@ rule infer_sv:
     output:
         bedpe = outdir + "/bedpe/{sample}.bedpe",
         refmt = outdir + "/IGV/{sample}.reformat.bedpe",
-        fail  = outdir + "/filtered/{sample}.fail.bedpe",
+        fail  = outdir + "/bedpe/qc_fail/{sample}.fail.bedpe",
         vcf   = outdir + "/vcf/{sample}.vcf" 
     params:
         outdir = lambda wc: outdir + "/" + wc.get("sample")
diff --git a/src/harpy/snakefiles/sv_naibr_phase.smk b/src/harpy/snakefiles/sv_naibr_phase.smk
index cab2a87ea..d99854662 100644
--- a/src/harpy/snakefiles/sv_naibr_phase.smk
+++ b/src/harpy/snakefiles/sv_naibr_phase.smk
@@ -261,7 +261,7 @@ rule infer_sv:
     output:
         bedpe = outdir + "/bedpe/{sample}.bedpe",
         refmt = outdir + "/IGV/{sample}.reformat.bedpe",
-        fail  = outdir + "/filtered/{sample}.fail.bedpe",
+        fail  = outdir + "/bedpe/qc_fail/{sample}.fail.bedpe",
         vcf   = outdir + "/vcf/{sample}.vcf" 
     params:
         outdir = lambda wc: outdir + "/" + wc.get("sample")
diff --git a/src/harpy/snakefiles/sv_naibr_pop.smk b/src/harpy/snakefiles/sv_naibr_pop.smk
index 7d4c699e9..edebfe5a9 100644
--- a/src/harpy/snakefiles/sv_naibr_pop.smk
+++ b/src/harpy/snakefiles/sv_naibr_pop.smk
@@ -169,9 +169,9 @@ rule infer_sv:
         refmt = outdir + "/{population}/{population}.reformat.bedpe",
         vcf   = outdir + "/{population}/{population}.vcf"
     output:
-        bedpe = outdir + "/{population}.bedpe",
+        bedpe = outdir + "/bedpe/{population}.bedpe",
         refmt = outdir + "/IGV/{population}.reformat.bedpe",
-        fail  = outdir + "/filtered/{population}.fail.bedpe",
+        fail  = outdir + "/bedpe/qc_fail/{population}.fail.bedpe",
         vcf   = outdir + "/vcf/{population}.vcf" 
     params:
         outdir = lambda wc: outdir + "/" + wc.get("population")
@@ -187,6 +187,40 @@ rule infer_sv:
         rm -rf {params.outdir}
         """
 
+rule merge_variants:
+    input:
+        collect(outdir + "/bedpe/{population}.bedpe", population = populations)
+    output:
+        outdir + "/inversions.bedpe",
+        outdir + "/deletions.bedpe",
+        outdir + "/duplications.bedpe"
+    message:
+        "Aggregating the detected variants"
+    run:
+        from pathlib import Path
+        with open(output[0], "w") as inversions, open(output[1], "w") as deletions, open(output[2], "w") as duplications:
+            header = ["Population","Chr1","Break1","Chr2","Break2","SplitMolecules","DiscordantReads","Orientation","Haplotype","Score","PassFilter","SV"]
+            _ = inversions.write("\t".join(header) + "\n")
+            _ = deletions.write("\t".join(header) + "\n")
+            _ = duplications.write("\t".join(header) + "\n")
+            for varfile in input:
+                samplename = Path(varfile).stem
+                with open(varfile, "r") as f_in:
+                    # read the header to skip it
+                    f_in.readline()
+                    # read the rest of it
+                    while True:
+                        line = f_in.readline()
+                        if not line:
+                            break
+                        record = line.rstrip().split("\t")
+                        if record[-1] == "inversion":
+                            _ = inversions.write(f"{samplename}\t{line}")
+                        elif record[-1] == "deletion":
+                            _ = deletions.write(f"{samplename}\t{line}")
+                        elif record[-1] == "duplication":
+                            _ = duplications.write(f"{samplename}\t{line}")
+
 rule genome_link:
     input:
         genomefile
@@ -252,7 +286,8 @@ rule sv_report_aggregate:
 rule workflow_summary:
     default_target: True
     input:
-        bedpe = collect(outdir + "/{pop}.bedpe", pop = populations),
+        bedpe = collect(outdir + "/bedpe/{pop}.bedpe", pop = populations),
+        bedpe_agg = collect(outdir + "/{sv}.bedpe", sv = ["inversions", "deletions","duplications"]),
         reports = collect(outdir + "/reports/{pop}.naibr.html", pop = populations) if not skipreports else [],
         agg_report = outdir + "/reports/naibr.pop.summary.html" if not skipreports else []
     message:
diff --git a/src/harpy/snakefiles/sv_naibr_pop_phase.smk b/src/harpy/snakefiles/sv_naibr_pop_phase.smk
index 079ad5d89..40a50a6f6 100644
--- a/src/harpy/snakefiles/sv_naibr_pop_phase.smk
+++ b/src/harpy/snakefiles/sv_naibr_pop_phase.smk
@@ -302,9 +302,9 @@ rule infer_sv:
         refmt = outdir + "/{population}/{population}.reformat.bedpe",
         vcf   = outdir + "/{population}/{population}.vcf"
     output:
-        bedpe = outdir + "/{population}.bedpe",
+        bedpe = outdir + "/bedpe/{population}.bedpe",
         refmt = outdir + "/IGV/{population}.reformat.bedpe",
-        fail  = outdir + "/filtered/{population}.fail.bedpe",
+        fail  = outdir + "/bedpe/qc_fail/{population}.fail.bedpe",
         vcf   = outdir + "/vcf/{population}.vcf" 
     params:
         outdir = lambda wc: outdir + "/" + wc.get("population")
@@ -320,6 +320,40 @@ rule infer_sv:
         rm -rf {params.outdir}
         """
 
+rule merge_variants:
+    input:
+        collect(outdir + "/bedpe/{population}.bedpe", population = populations)
+    output:
+        outdir + "/inversions.bedpe",
+        outdir + "/deletions.bedpe",
+        outdir + "/duplications.bedpe"
+    message:
+        "Aggregating the detected variants"
+    run:
+        from pathlib import Path
+        with open(output[0], "w") as inversions, open(output[1], "w") as deletions, open(output[2], "w") as duplications:
+            header = ["Population","Chr1","Break1","Chr2","Break2","SplitMolecules","DiscordantReads","Orientation","Haplotype","Score","PassFilter","SV"]
+            _ = inversions.write("\t".join(header) + "\n")
+            _ = deletions.write("\t".join(header) + "\n")
+            _ = duplications.write("\t".join(header) + "\n")
+            for varfile in input:
+                samplename = Path(varfile).stem
+                with open(varfile, "r") as f_in:
+                    # read the header to skip it
+                    f_in.readline()
+                    # read the rest of it
+                    while True:
+                        line = f_in.readline()
+                        if not line:
+                            break
+                        record = line.rstrip().split("\t")
+                        if record[-1] == "inversion":
+                            _ = inversions.write(f"{samplename}\t{line}")
+                        elif record[-1] == "deletion":
+                            _ = deletions.write(f"{samplename}\t{line}")
+                        elif record[-1] == "duplication":
+                            _ = duplications.write(f"{samplename}\t{line}")
+
 rule infer_sv_report:
     input:
         fai   = f"Genome/{bn}.fai",
@@ -349,7 +383,8 @@ rule sv_report_aggregate:
 rule workflow_summary:
     default_target: True
     input:
-        bedpe = collect(outdir + "/{pop}.bedpe", pop = populations),
+        bedpe = collect(outdir + "/bedpe/{pop}.bedpe", pop = populations),
+        bedpe_agg = collect(outdir + "/{sv}.bedpe", sv = ["inversions", "deletions","duplications"]),
         phaselog = outdir + "/logs/whatshap-haplotag.log",
         reports = collect(outdir + "/reports/{pop}.naibr.html", pop = populations) if not skipreports else [],
         agg_report = outdir + "/reports/naibr.pop.summary.html" if not skipreports else []

From 06f0468605ce83a0c7ad4b7fcf4176fb3e30ccd4 Mon Sep 17 00:00:00 2001
From: pdimens <pdimens@live.com>
Date: Mon, 22 Jul 2024 13:59:45 -0400
Subject: [PATCH 05/11] include aggregation

---
 src/harpy/snakefiles/sv_leviathan.smk     | 48 ++++++++++++++++++---
 src/harpy/snakefiles/sv_leviathan_pop.smk | 52 +++++++++++++++++++----
 2 files changed, 86 insertions(+), 14 deletions(-)

diff --git a/src/harpy/snakefiles/sv_leviathan.smk b/src/harpy/snakefiles/sv_leviathan.smk
index 3c1d49421..7210c72c7 100644
--- a/src/harpy/snakefiles/sv_leviathan.smk
+++ b/src/harpy/snakefiles/sv_leviathan.smk
@@ -156,7 +156,7 @@ rule call_sv:
         genome = f"Genome/{bn}",
         genidx = multiext(f"Genome/{bn}", ".fai", ".ann", ".bwt", ".pac", ".sa", ".amb")
     output:
-        temp(outdir + "/{sample}.vcf")
+        temp(outdir + "/vcf/{sample}.vcf")
     log:  
         runlog     = outdir + "/logs/{sample}.leviathan.log",
         candidates = outdir + "/logs/{sample}.candidates"
@@ -178,9 +178,9 @@ rule call_sv:
 
 rule sort_bcf:
     input:
-        outdir + "/{sample}.vcf"
+        outdir + "/vcf/{sample}.vcf"
     output:
-        outdir + "/{sample}.bcf"
+        outdir + "/vcf/{sample}.bcf"
     params:
         "{wildcards.sample}"
     container:
@@ -192,9 +192,9 @@ rule sort_bcf:
 
 rule sv_stats:
     input: 
-        outdir + "/{sample}.bcf"
+        outdir + "/vcf/{sample}.bcf"
     output: 
-        outdir + "/reports/data/{sample}.sv.stats"
+        temp(outdir + "/reports/data/{sample}.sv.stats")
     container:
         None
     message:
@@ -205,6 +205,41 @@ rule sv_stats:
         bcftools query -f '{wildcards.sample}\\t%CHROM\\t%POS\\t%END\\t%SVLEN\\t%SVTYPE\\t%BARCODES\\t%PAIRS\\n' {input} >> {output}
         """
 
+rule merge_variants:
+    input:
+        collect(outdir + "/reports/data/{sample}.sv.stats", sample = samplenames)
+    output:
+        outdir + "/inversions.bedpe",
+        outdir + "/deletions.bedpe",
+        outdir + "/duplications.bedpe",
+        outdir + "/breakends.bedpe"
+    message:
+        "Aggregating the detected variants"
+    run:
+        with open(output[0], "w") as inversions, open(output[1], "w") as deletions, open(output[2], "w") as duplications, open(output[3], "w") as breakends:
+            header = ["sample","contig","position_start","position_end","length","type","n_barcodes","n_pairs"]
+            _ = inversions.write("\t".join(header) + "\n")
+            _ = deletions.write("\t".join(header) + "\n")
+            _ = duplications.write("\t".join(header) + "\n")
+            _ = breakends.write("\t".join(header) + "\n")
+            for varfile in input:
+                with open(varfile, "r") as f_in:
+                    # skip header
+                    f_in.readline()
+                    while True:
+                        line = f_in.readline()
+                        if not line:
+                            break
+                        record = line.rstrip().split("\t")
+                        if record[5] == "INV":
+                            _ = inversions.write(line)
+                        elif record[5] == "DEL":
+                            _ = deletions.write(line)
+                        elif record[5] == "DUP":
+                            _ = duplications.write(line)
+                        elif record[5] == "BND":
+                            _ = breakends.write(line)
+
 rule sv_report:
     input:	
         faidx     = f"Genome/{bn}.fai",
@@ -222,7 +257,8 @@ rule sv_report:
 rule workflow_summary:
     default_target: True
     input: 
-        vcf = collect(outdir + "/{sample}.bcf", sample = samplenames),
+        vcf = collect(outdir + "/vcf/{sample}.bcf", sample = samplenames),
+        bedpe_agg = collect(outdir + "/{sv}.bedpe", sv = ["inversions", "deletions","duplications", "breakends"]),
         reports = collect(outdir + "/reports/{sample}.SV.html", sample = samplenames) if not skipreports else []
     params:
         min_sv = f"-v {min_sv}",
diff --git a/src/harpy/snakefiles/sv_leviathan_pop.smk b/src/harpy/snakefiles/sv_leviathan_pop.smk
index cd5afd4a2..710b2fe4a 100644
--- a/src/harpy/snakefiles/sv_leviathan_pop.smk
+++ b/src/harpy/snakefiles/sv_leviathan_pop.smk
@@ -182,7 +182,7 @@ rule call_sv:
         genome = f"Genome/{bn}",
         genidx = multiext(f"Genome/{bn}", ".fai", ".ann", ".bwt", ".pac", ".sa", ".amb")
     output:
-        temp(outdir + "/{population}.vcf")
+        temp(outdir + "/vcf/{population}.vcf")
     log:  
         runlog     = outdir + "/logs/{population}.leviathan.log",
         candidates = outdir + "/logs/{population}.candidates"
@@ -196,7 +196,7 @@ rule call_sv:
     conda:
         f"{envdir}/sv.yaml"
     message:
-        "Calling variants: Population {wildcards.population}"
+        "Calling variants: {wildcards.population}"
     benchmark:
         ".Benchmark/leviathan-pop/{population}.variantcall"
     shell:
@@ -204,9 +204,9 @@ rule call_sv:
 
 rule sort_bcf:
     input:
-        outdir + "/{population}.vcf"
+        outdir + "/vcf/{population}.vcf"
     output:
-        outdir + "/{population}.bcf"
+        outdir + "/vcf/{population}.bcf"
     params:
         "{wildcards.population}"
     container:
@@ -218,9 +218,9 @@ rule sort_bcf:
 
 rule sv_stats:
     input: 
-        outdir + "/{population}.bcf"
+        outdir + "/vcf/{population}.bcf"
     output:
-        outdir + "/reports/data/{population}.sv.stats"
+        temp(outdir + "/reports/data/{population}.sv.stats")
     container:
         None
     message:
@@ -231,10 +231,45 @@ rule sv_stats:
         bcftools query -f '{wildcards.population}\\t%CHROM\\t%POS\\t%END\\t%SVLEN\\t%SVTYPE\\t%BARCODES\\t%PAIRS\\n' {input} >> {output}
         """
 
+rule merge_variants:
+    input:
+        collect(outdir + "/reports/data/{population}.sv.stats", population = populations)
+    output:
+        outdir + "/inversions.bedpe",
+        outdir + "/deletions.bedpe",
+        outdir + "/duplications.bedpe",
+        outdir + "/breakends.bedpe"
+    message:
+        "Aggregating the detected variants"
+    run:
+        with open(output[0], "w") as inversions, open(output[1], "w") as deletions, open(output[2], "w") as duplications, open(output[3], "w") as breakends:
+            header = ["population","contig","position_start","position_end","length","type","n_barcodes","n_pairs"]
+            _ = inversions.write("\t".join(header) + "\n")
+            _ = deletions.write("\t".join(header) + "\n")
+            _ = duplications.write("\t".join(header) + "\n")
+            _ = breakends.write("\t".join(header) + "\n")
+            for varfile in input:
+                with open(varfile, "r") as f_in:
+                    # skip header
+                    f_in.readline()
+                    while True:
+                        line = f_in.readline()
+                        if not line:
+                            break
+                        record = line.rstrip().split("\t")
+                        if record[5] == "INV":
+                            _ = inversions.write(line)
+                        elif record[5] == "DEL":
+                            _ = deletions.write(line)
+                        elif record[5] == "DUP":
+                            _ = duplications.write(line)
+                        elif record[5] == "BND":
+                            _ = breakends.write(line)
+
 rule sv_report:
     input:	
         statsfile = outdir + "/reports/data/{population}.sv.stats",
-        bcf       = outdir + "/{population}.bcf",
+        bcf       = outdir + "/vcf/{population}.bcf",
         faidx     = f"Genome/{bn}.fai"
     output:
         outdir + "/reports/{population}.sv.html"
@@ -261,7 +296,8 @@ rule sv_report_aggregate:
 rule workflow_summary:
     default_target: True
     input:
-        vcf = collect(outdir + "/{pop}.bcf", pop = populations),
+        vcf = collect(outdir + "/vcf/{pop}.bcf", pop = populations),
+        bedpe_agg = collect(outdir + "/{sv}.bedpe", sv = ["inversions", "deletions","duplications", "breakends"]),
         reports = collect(outdir + "/reports/{pop}.sv.html", pop = populations) if not skipreports else [],
         agg_report = outdir + "/reports/leviathan.summary.html" if not skipreports else []
     message:

From 567719f2d59b05ddb12cc4d558efc01b04162c62 Mon Sep 17 00:00:00 2001
From: pdimens <pdimens@live.com>
Date: Mon, 22 Jul 2024 14:38:46 -0400
Subject: [PATCH 06/11] filename fixes

---
 src/harpy/snakefiles/sv_naibr_pop.smk       | 4 ++--
 src/harpy/snakefiles/sv_naibr_pop_phase.smk | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/harpy/snakefiles/sv_naibr_pop.smk b/src/harpy/snakefiles/sv_naibr_pop.smk
index edebfe5a9..2a6e8054c 100644
--- a/src/harpy/snakefiles/sv_naibr_pop.smk
+++ b/src/harpy/snakefiles/sv_naibr_pop.smk
@@ -260,7 +260,7 @@ rule genome_faidx:
 rule sv_report:
     input:
         fai   = f"Genome/{bn}.fai",
-        bedpe = outdir + "/{population}.bedpe"
+        bedpe = outdir + "/bedpe/{population}.bedpe"
     output:
         outdir + "/reports/{population}.naibr.html"
     conda:
@@ -273,7 +273,7 @@ rule sv_report:
 rule sv_report_aggregate:
     input:
         fai   = f"Genome/{bn}.fai",
-        bedpe = collect(outdir + "/{pop}.bedpe", pop = populations)
+        bedpe = collect(outdir + "/bedpe/{pop}.bedpe", pop = populations)
     output:
         outdir + "/reports/naibr.pop.summary.html"
     conda:
diff --git a/src/harpy/snakefiles/sv_naibr_pop_phase.smk b/src/harpy/snakefiles/sv_naibr_pop_phase.smk
index 40a50a6f6..037fe931e 100644
--- a/src/harpy/snakefiles/sv_naibr_pop_phase.smk
+++ b/src/harpy/snakefiles/sv_naibr_pop_phase.smk
@@ -357,7 +357,7 @@ rule merge_variants:
 rule infer_sv_report:
     input:
         fai   = f"Genome/{bn}.fai",
-        bedpe = outdir + "/{population}.bedpe"
+        bedpe = outdir + "/bedpe/{population}.bedpe"
     output:
         outdir + "/reports/{population}.naibr.html"
     message:
@@ -370,7 +370,7 @@ rule infer_sv_report:
 rule sv_report_aggregate:
     input:
         fai   = f"Genome/{bn}.fai",
-        bedpe = collect(outdir + "/{pop}.bedpe", pop = populations)
+        bedpe = collect(outdir + "/bedpe/{pop}.bedpe", pop = populations)
     output:
         outdir + "/reports/naibr.pop.summary.html"
     message:

From 2336594f2447f71f979b4fb559b7c8a72c68d66c Mon Sep 17 00:00:00 2001
From: pdimens <pdimens@live.com>
Date: Mon, 22 Jul 2024 15:07:07 -0400
Subject: [PATCH 07/11] rm dep

---
 src/harpy/reports/naibr.Rmd | 2 --
 1 file changed, 2 deletions(-)

diff --git a/src/harpy/reports/naibr.Rmd b/src/harpy/reports/naibr.Rmd
index 5a9354b8e..03cdaa9b5 100644
--- a/src/harpy/reports/naibr.Rmd
+++ b/src/harpy/reports/naibr.Rmd
@@ -108,8 +108,6 @@ summstats <- sv %>%
   group_by(Chr1, SV) %>%
   summarise(n = n())
 
-#svlen <- sv %>% group_by(SV) %>%  summarise(mlen = mean(Length), sdlen = sd(Length))
-
 sumtable <- summstats %>%
   group_by(SV) %>%
   summarise(count = sum(n))

From 7a3b93cbeffd541aa95cdf5c9810762a7e2d3f43 Mon Sep 17 00:00:00 2001
From: pdimens <pdimens@live.com>
Date: Mon, 22 Jul 2024 15:07:18 -0400
Subject: [PATCH 08/11] make sure not to plot chimeras

---
 src/harpy/reports/naibr_pop.Rmd | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/harpy/reports/naibr_pop.Rmd b/src/harpy/reports/naibr_pop.Rmd
index 4553194e1..0aa9da103 100644
--- a/src/harpy/reports/naibr_pop.Rmd
+++ b/src/harpy/reports/naibr_pop.Rmd
@@ -279,10 +279,10 @@ fa.sizes <- fa.sizes[fa.sizes$contig %in% unique(sv$Chr1), 1:2]
 fa.sizes <- fa.sizes[1:min(nrow(fa.sizes), 30), ]
 
 # filter out variants that aren't on one of these contigs
-sv <- sv[sv$Chr1 %in% fa.sizes$contig, ]
-sv$Chr1 <- factor(sv$Chr1)
+sv_clean <- sv_clean[sv_clean$Chr1 %in% fa.sizes$contig, ]
+sv_clean$Chr1 <- factor(sv_clean$Chr1)
 
-populations <- levels(sv$Population)
+populations <- levels(sv_clean$Population)
 color.palette <- turbo(length(populations))
 names(color.palette) <- populations
 
@@ -322,7 +322,7 @@ tracks <- BioCircosTracklist()
 variant_type <- "inversion"
 # Add one track for each population
 for( pop in populations ){
-  variant <- sv[(sv$SV == variant_type) & (sv$Population == pop),]
+  variant <- sv_clean[(sv_clean$SV == variant_type) & (sv_clean$Population == pop),]
   if(nrow(variant) > 0){
     tracks <- tracks + BioCircosArcTrack(
       paste0(variant_type, pop),
@@ -372,7 +372,7 @@ tracks <- BioCircosTracklist()
 variant_type <- "duplication"
 # Add one track for each population
 for( pop in populations ){
-  variant <- sv[(sv$SV == variant_type) & (sv$Population == pop),]
+  variant <- sv_clean[(sv_clean$SV == variant_type) & (sv_clean$Population == pop),]
   if(nrow(variant) > 0){
     tracks <- tracks + BioCircosArcTrack(
       paste0(variant_type, pop),
@@ -420,7 +420,7 @@ tracks <- BioCircosTracklist()
 variant_type <- "deletion"
 # Add one track for each population
 for( pop in populations ){
-  variant <- sv[(sv$SV == variant_type) & (sv$Population == pop),]
+  variant <- sv_clean[(sv_clean$SV == variant_type) & (sv_clean$Population == pop),]
   if(nrow(variant) > 0){
     tracks <- tracks + BioCircosSNPTrack(
       paste0(variant_type, pop),

From 3fb753c4a20f0c0b81c4113d0e9fbb0b96625245 Mon Sep 17 00:00:00 2001
From: pdimens <pdimens@live.com>
Date: Mon, 22 Jul 2024 15:09:01 -0400
Subject: [PATCH 09/11] fix path

---
 src/harpy/snakefiles/sv_naibr_phase.smk | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/harpy/snakefiles/sv_naibr_phase.smk b/src/harpy/snakefiles/sv_naibr_phase.smk
index d99854662..8540ece7e 100644
--- a/src/harpy/snakefiles/sv_naibr_phase.smk
+++ b/src/harpy/snakefiles/sv_naibr_phase.smk
@@ -313,7 +313,7 @@ rule merge_variants:
 
 rule create_report:
     input:
-        bedpe = outdir + "/{sample}.bedpe",
+        bedpe = outdir + "/bedpe/{sample}.bedpe",
         fai   = f"Genome/{validgenome}.fai"
     output:
         outdir + "/reports/{sample}.naibr.html"

From b7668ae3508570439b553d2cee7e90ca6c9fdbcf Mon Sep 17 00:00:00 2001
From: pdimens <pdimens@live.com>
Date: Mon, 22 Jul 2024 15:45:35 -0400
Subject: [PATCH 10/11] expand and fix text

---
 src/harpy/reports/impute.Rmd | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/src/harpy/reports/impute.Rmd b/src/harpy/reports/impute.Rmd
index be9b4c387..4dd405c5a 100644
--- a/src/harpy/reports/impute.Rmd
+++ b/src/harpy/reports/impute.Rmd
@@ -236,12 +236,16 @@ the reference allele (from the reference genome), `Alt` refers to the alternativ
 allele, and `Missing` details how many SNPs remain missing
 (failed to impute) for that individual. As an example, the `Homozygous Ref` is 
 the number of missing genotypes that were imputed into
-a homozygous genotype for the reference allele. The table on the right is the
-underlying data used to create the visualization. Clicking
-on a plot will expand it to fill your browser window. Clicking it again will exit
-out of the zoomed view.
+a homozygous genotype for the reference allele. The table on the other tab is the
+underlying data used to create the visualization.
+
+```{r}
+n_samples <- length(unique(gcts_long$sample))
+plotheight <- 150 + (15 * n_samples)
+figheight <- 1 + (0.2 * n_samples)
+```
 
-## ind stats plt
+## ind stats plt {.tabset data-height=plotheight}
 ### Individuals Plot {.no-title}
 ```{r preprocess fields, echo = FALSE, message = FALSE, warning = FALSE, out.width = "100%"}
 gcts_long$Conversion <- gsub("AltHom","Homozygous Alt", gcts_long$Conversion)
@@ -249,7 +253,7 @@ gcts_long$Conversion <- gsub("Het","Heterozygote", gcts_long$Conversion)
 gcts_long$Conversion <- gsub("RefHom","Homozygous Ref", gcts_long$Conversion)
 gcts_long$Conversion <- gsub("missing","not imputed", gcts_long$Conversion)
 ```
-```{r percontig dist, echo = FALSE, message = FALSE, warning = FALSE, out.width = "100%"}
+```{r per_indiv, echo = FALSE, message = FALSE, warning = FALSE,fig. height=figheight, out.width = "100%"}
 hchart(gcts_long, "bar", hcaes(x = sample, y = Count, group = Conversion), stacking = "normal", animation = F) |>
   hc_colors(c("#7cb3d4", "#b18ad1", "#ffd75f", "#6f6c70")) |>
   hc_title(text = "Missing Genotype Imputations") |>

From 01b1f6c5bfd06962f5d2b63c88a60e1dae713779 Mon Sep 17 00:00:00 2001
From: pdimens <pdimens@live.com>
Date: Mon, 22 Jul 2024 16:13:28 -0400
Subject: [PATCH 11/11] fix this silly typo

---
 src/harpy/reports/impute.Rmd | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/harpy/reports/impute.Rmd b/src/harpy/reports/impute.Rmd
index 4dd405c5a..b1f1ad087 100644
--- a/src/harpy/reports/impute.Rmd
+++ b/src/harpy/reports/impute.Rmd
@@ -253,7 +253,7 @@ gcts_long$Conversion <- gsub("Het","Heterozygote", gcts_long$Conversion)
 gcts_long$Conversion <- gsub("RefHom","Homozygous Ref", gcts_long$Conversion)
 gcts_long$Conversion <- gsub("missing","not imputed", gcts_long$Conversion)
 ```
-```{r per_indiv, echo = FALSE, message = FALSE, warning = FALSE,fig. height=figheight, out.width = "100%"}
+```{r per_indiv, echo = FALSE, message = FALSE, warning = FALSE, fig.height=figheight, out.width = "100%"}
 hchart(gcts_long, "bar", hcaes(x = sample, y = Count, group = Conversion), stacking = "normal", animation = F) |>
   hc_colors(c("#7cb3d4", "#b18ad1", "#ffd75f", "#6f6c70")) |>
   hc_title(text = "Missing Genotype Imputations") |>