Mergevariants #113

Merged: 11 commits merged on Jul 23, 2024
src/harpy/bin/infer_sv.py: 1 addition & 3 deletions
@@ -27,8 +27,7 @@
 }

 if args.failfile is not None:
-    failout = open(args.failfile, "w", encoding="utf-8")
-    with open(args.bedfile, "r", encoding="utf-8") as f:
+    with open(args.bedfile, "r", encoding="utf-8") as f, open(args.failfile, "w", encoding="utf-8") as failout:
         # first line, the header
         line = f.readline().strip().split("\t")
         line_corrected = [i.title().replace(" ", "") for i in line]
@@ -49,7 +48,6 @@
             sys.stdout.write(NEWROW)
         else:
             failout.write(NEWROW)
-    failout.close()
 else:
     with open(args.bedfile, "r", encoding="utf-8") as f:
         # first line, the header
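The infer_sv.py change above collapses a manually opened and closed failout handle into a single with statement that manages both files, so the failure file is closed even if an exception is raised mid-loop. A minimal sketch of that pattern (the filenames and the routing predicate here are illustrative placeholders, not Harpy's actual filtering logic):

import sys

# One `with` statement managing both files: the explicit failout.close()
# goes away because both handles are released automatically.
def split_records(bedfile, failfile):
    with open(bedfile, "r", encoding="utf-8") as f, open(failfile, "w", encoding="utf-8") as failout:
        for line in f:
            # toy criterion; the real script applies its own per-record filter
            if "FAIL" in line:
                failout.write(line)
            else:
                sys.stdout.write(line)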
src/harpy/reports/impute.Rmd: 10 additions & 6 deletions
@@ -236,20 +236,24 @@ the reference allele (from the reference genome), `Alt` refers to the alternative
 allele, and `Missing` details how many SNPs remain missing
 (failed to impute) for that individual. As an example, the `Homozygous Ref` is
 the number of missing genotypes that were imputed into
-a homozygous genotype for the reference allele. The table on the right is the
-underlying data used to create the visualization. Clicking
-on a plot will expand it to fill your browser window. Clicking it again will exit
-out of the zoomed view.
+a homozygous genotype for the reference allele. The table on the other tab is the
+underlying data used to create the visualization.

 ```{r}
 n_samples <- length(unique(gcts_long$sample))
 plotheight <- 150 + (15 * n_samples)
 figheight <- 1 + (0.2 * n_samples)
 ```

-## ind stats plt
+## ind stats plt {.tabset data-height=plotheight}
 ### Individuals Plot {.no-title}
+```{r preprocess fields, echo = FALSE, message = FALSE, warning = FALSE, out.width = "100%"}
+gcts_long$Conversion <- gsub("AltHom","Homozygous Alt", gcts_long$Conversion)
+gcts_long$Conversion <- gsub("Het","Heterozygote", gcts_long$Conversion)
+gcts_long$Conversion <- gsub("RefHom","Homozygous Ref", gcts_long$Conversion)
+gcts_long$Conversion <- gsub("missing","not imputed", gcts_long$Conversion)
+```
-```{r percontig dist, echo = FALSE, message = FALSE, warning = FALSE, out.width = "100%"}
+```{r per_indiv, echo = FALSE, message = FALSE, warning = FALSE, fig.height=figheight, out.width = "100%"}
 hchart(gcts_long, "bar", hcaes(x = sample, y = Count, group = Conversion), stacking = "normal", animation = F) |>
   hc_colors(c("#7cb3d4", "#b18ad1", "#ffd75f", "#6f6c70")) |>
   hc_title(text = "Missing Genotype Imputations") |>
src/harpy/reports/naibr.Rmd: 0 additions & 2 deletions
@@ -108,8 +108,6 @@ summstats <- sv %>%
   group_by(Chr1, SV) %>%
   summarise(n = n())

-#svlen <- sv %>% group_by(SV) %>% summarise(mlen = mean(Length), sdlen = sd(Length))
-
 sumtable <- summstats %>%
   group_by(SV) %>%
   summarise(count = sum(n))
src/harpy/reports/naibr_pop.Rmd: 6 additions & 6 deletions
@@ -279,10 +279,10 @@ fa.sizes <- fa.sizes[fa.sizes$contig %in% unique(sv$Chr1), 1:2]
 fa.sizes <- fa.sizes[1:min(nrow(fa.sizes), 30), ]

 # filter out variants that aren't on one of these contigs
-sv <- sv[sv$Chr1 %in% fa.sizes$contig, ]
-sv$Chr1 <- factor(sv$Chr1)
+sv_clean <- sv_clean[sv_clean$Chr1 %in% fa.sizes$contig, ]
+sv_clean$Chr1 <- factor(sv_clean$Chr1)

-populations <- levels(sv$Population)
+populations <- levels(sv_clean$Population)
 color.palette <- turbo(length(populations))
 names(color.palette) <- populations

@@ -322,7 +322,7 @@ tracks <- BioCircosTracklist()
 variant_type <- "inversion"
 # Add one track for each population
 for( pop in populations ){
-  variant <- sv[(sv$SV == variant_type) & (sv$Population == pop),]
+  variant <- sv_clean[(sv_clean$SV == variant_type) & (sv_clean$Population == pop),]
   if(nrow(variant) > 0){
     tracks <- tracks + BioCircosArcTrack(
       paste0(variant_type, pop),
@@ -372,7 +372,7 @@ tracks <- BioCircosTracklist()
 variant_type <- "duplication"
 # Add one track for each population
 for( pop in populations ){
-  variant <- sv[(sv$SV == variant_type) & (sv$Population == pop),]
+  variant <- sv_clean[(sv_clean$SV == variant_type) & (sv_clean$Population == pop),]
   if(nrow(variant) > 0){
     tracks <- tracks + BioCircosArcTrack(
       paste0(variant_type, pop),
@@ -420,7 +420,7 @@ tracks <- BioCircosTracklist()
 variant_type <- "deletion"
 # Add one track for each population
 for( pop in populations ){
-  variant <- sv[(sv$SV == variant_type) & (sv$Population == pop),]
+  variant <- sv_clean[(sv_clean$SV == variant_type) & (sv_clean$Population == pop),]
   if(nrow(variant) > 0){
     tracks <- tracks + BioCircosSNPTrack(
       paste0(variant_type, pop),
src/harpy/snakefiles/sv_leviathan.smk: 42 additions & 6 deletions
@@ -156,7 +156,7 @@ rule call_sv:
         genome = f"Genome/{bn}",
         genidx = multiext(f"Genome/{bn}", ".fai", ".ann", ".bwt", ".pac", ".sa", ".amb")
     output:
-        temp(outdir + "/{sample}.vcf")
+        temp(outdir + "/vcf/{sample}.vcf")
     log:
         runlog = outdir + "/logs/{sample}.leviathan.log",
         candidates = outdir + "/logs/{sample}.candidates"
@@ -178,9 +178,9 @@

 rule sort_bcf:
     input:
-        outdir + "/{sample}.vcf"
+        outdir + "/vcf/{sample}.vcf"
     output:
-        outdir + "/{sample}.bcf"
+        outdir + "/vcf/{sample}.bcf"
     params:
         "{wildcards.sample}"
     container:
@@ -192,9 +192,9 @@

 rule sv_stats:
     input:
-        outdir + "/{sample}.bcf"
+        outdir + "/vcf/{sample}.bcf"
     output:
-        outdir + "/reports/data/{sample}.sv.stats"
+        temp(outdir + "/reports/data/{sample}.sv.stats")
     container:
         None
     message:
@@ -205,6 +205,41 @@
         bcftools query -f '{wildcards.sample}\\t%CHROM\\t%POS\\t%END\\t%SVLEN\\t%SVTYPE\\t%BARCODES\\t%PAIRS\\n' {input} >> {output}
         """

+rule merge_variants:
+    input:
+        collect(outdir + "/reports/data/{sample}.sv.stats", sample = samplenames)
+    output:
+        outdir + "/inversions.bedpe",
+        outdir + "/deletions.bedpe",
+        outdir + "/duplications.bedpe",
+        outdir + "/breakends.bedpe"
+    message:
+        "Aggregating the detected variants"
+    run:
+        with open(output[0], "w") as inversions, open(output[1], "w") as deletions, open(output[2], "w") as duplications, open(output[3], "w") as breakends:
+            header = ["sample","contig","position_start","position_end","length","type","n_barcodes","n_pairs"]
+            _ = inversions.write("\t".join(header) + "\n")
+            _ = deletions.write("\t".join(header) + "\n")
+            _ = duplications.write("\t".join(header) + "\n")
+            _ = breakends.write("\t".join(header) + "\n")
+            for varfile in input:
+                with open(varfile, "r") as f_in:
+                    # skip header
+                    f_in.readline()
+                    while True:
+                        line = f_in.readline()
+                        if not line:
+                            break
+                        record = line.rstrip().split("\t")
+                        if record[5] == "INV":
+                            _ = inversions.write(line)
+                        elif record[5] == "DEL":
+                            _ = deletions.write(line)
+                        elif record[5] == "DUP":
+                            _ = duplications.write(line)
+                        elif record[5] == "BND":
+                            _ = breakends.write(line)
+
 rule sv_report:
     input:
         faidx = f"Genome/{bn}.fai",
@@ -222,7 +257,8 @@ rule sv_report:
 rule workflow_summary:
     default_target: True
     input:
-        vcf = collect(outdir + "/{sample}.bcf", sample = samplenames),
+        vcf = collect(outdir + "/vcf/{sample}.bcf", sample = samplenames),
+        bedpe_agg = collect(outdir + "/{sv}.bedpe", sv = ["inversions", "deletions","duplications", "breakends"]),
        reports = collect(outdir + "/reports/{sample}.SV.html", sample = samplenames) if not skipreports else []
     params:
         min_sv = f"-v {min_sv}",
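The new merge_variants rule reads each per-sample .sv.stats table produced by bcftools query (one header line, then tab-separated records with the SVTYPE code in the sixth column) and routes every record into one aggregate file per SV type. A standalone sketch of the same routing logic, with illustrative output paths (the rule itself writes into outdir):

# Sketch of the merge_variants run block, assuming each input table has one
# header line and tab-separated records with INV/DEL/DUP/BND at index 5.
SV_FILES = {"INV": "inversions.bedpe", "DEL": "deletions.bedpe",
            "DUP": "duplications.bedpe", "BND": "breakends.bedpe"}
HEADER = ["sample", "contig", "position_start", "position_end",
          "length", "type", "n_barcodes", "n_pairs"]

def merge_sv_stats(stats_files):
    # open one output handle per SV type, keyed by the SVTYPE code
    handles = {sv: open(path, "w", encoding="utf-8") for sv, path in SV_FILES.items()}
    try:
        for handle in handles.values():
            handle.write("\t".join(HEADER) + "\n")
        for varfile in stats_files:
            with open(varfile, "r", encoding="utf-8") as f_in:
                next(f_in, None)  # skip the per-sample header line
                for line in f_in:
                    record = line.rstrip("\n").split("\t")
                    if len(record) > 5 and record[5] in handles:
                        handles[record[5]].write(line)
    finally:
        for handle in handles.values():
            handle.close()

Keeping the output handles in a dict avoids the long chained with statement and makes adding another SV type a one-line change.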
src/harpy/snakefiles/sv_leviathan_pop.smk: 44 additions & 8 deletions
@@ -182,7 +182,7 @@ rule call_sv:
         genome = f"Genome/{bn}",
         genidx = multiext(f"Genome/{bn}", ".fai", ".ann", ".bwt", ".pac", ".sa", ".amb")
     output:
-        temp(outdir + "/{population}.vcf")
+        temp(outdir + "/vcf/{population}.vcf")
     log:
         runlog = outdir + "/logs/{population}.leviathan.log",
         candidates = outdir + "/logs/{population}.candidates"
@@ -196,17 +196,17 @@
     conda:
         f"{envdir}/sv.yaml"
     message:
-        "Calling variants: Population {wildcards.population}"
+        "Calling variants: {wildcards.population}"
     benchmark:
         ".Benchmark/leviathan-pop/{population}.variantcall"
     shell:
         "LEVIATHAN -b {input.bam} -i {input.bc_idx} {params} -g {input.genome} -o {output} -t {threads} --candidates {log.candidates} 2> {log.runlog}"

 rule sort_bcf:
     input:
-        outdir + "/{population}.vcf"
+        outdir + "/vcf/{population}.vcf"
     output:
-        outdir + "/{population}.bcf"
+        outdir + "/vcf/{population}.bcf"
     params:
         "{wildcards.population}"
     container:
@@ -218,9 +218,9 @@

 rule sv_stats:
     input:
-        outdir + "/{population}.bcf"
+        outdir + "/vcf/{population}.bcf"
     output:
-        outdir + "/reports/data/{population}.sv.stats"
+        temp(outdir + "/reports/data/{population}.sv.stats")
     container:
         None
     message:
@@ -231,10 +231,45 @@ rule sv_stats:
         bcftools query -f '{wildcards.population}\\t%CHROM\\t%POS\\t%END\\t%SVLEN\\t%SVTYPE\\t%BARCODES\\t%PAIRS\\n' {input} >> {output}
         """

+rule merge_variants:
+    input:
+        collect(outdir + "/reports/data/{population}.sv.stats", population = populations)
+    output:
+        outdir + "/inversions.bedpe",
+        outdir + "/deletions.bedpe",
+        outdir + "/duplications.bedpe",
+        outdir + "/breakends.bedpe"
+    message:
+        "Aggregating the detected variants"
+    run:
+        with open(output[0], "w") as inversions, open(output[1], "w") as deletions, open(output[2], "w") as duplications, open(output[3], "w") as breakends:
+            header = ["population","contig","position_start","position_end","length","type","n_barcodes","n_pairs"]
+            _ = inversions.write("\t".join(header) + "\n")
+            _ = deletions.write("\t".join(header) + "\n")
+            _ = duplications.write("\t".join(header) + "\n")
+            _ = breakends.write("\t".join(header) + "\n")
+            for varfile in input:
+                with open(varfile, "r") as f_in:
+                    # skip header
+                    f_in.readline()
+                    while True:
+                        line = f_in.readline()
+                        if not line:
+                            break
+                        record = line.rstrip().split("\t")
+                        if record[5] == "INV":
+                            _ = inversions.write(line)
+                        elif record[5] == "DEL":
+                            _ = deletions.write(line)
+                        elif record[5] == "DUP":
+                            _ = duplications.write(line)
+                        elif record[5] == "BND":
+                            _ = breakends.write(line)
+
 rule sv_report:
     input:
         statsfile = outdir + "/reports/data/{population}.sv.stats",
-        bcf = outdir + "/{population}.bcf",
+        bcf = outdir + "/vcf/{population}.bcf",
         faidx = f"Genome/{bn}.fai"
     output:
         outdir + "/reports/{population}.sv.html"
@@ -261,7 +296,8 @@ rule sv_report_aggregate:
 rule workflow_summary:
     default_target: True
     input:
-        vcf = collect(outdir + "/{pop}.bcf", pop = populations),
+        vcf = collect(outdir + "/vcf/{pop}.bcf", pop = populations),
+        bedpe_agg = collect(outdir + "/{sv}.bedpe", sv = ["inversions", "deletions","duplications", "breakends"]),
         reports = collect(outdir + "/reports/{pop}.sv.html", pop = populations) if not skipreports else [],
         agg_report = outdir + "/reports/leviathan.summary.html" if not skipreports else []
     message:
src/harpy/snakefiles/sv_naibr.smk: 39 additions & 4 deletions
@@ -139,9 +139,9 @@ rule infer_sv:
         refmt = outdir + "/{sample}/{sample}.reformat.bedpe",
         vcf = outdir + "/{sample}/{sample}.vcf"
     output:
-        bedpe = outdir + "/{sample}.bedpe",
+        bedpe = outdir + "/bedpe/{sample}.bedpe",
         refmt = outdir + "/IGV/{sample}.reformat.bedpe",
-        fail = outdir + "/filtered/{sample}.fail.bedpe",
+        fail = outdir + "/bedpe/qc_fail/{sample}.fail.bedpe",
         vcf = outdir + "/vcf/{sample}.vcf"
     params:
         outdir = lambda wc: outdir + "/" + wc.get("sample")
@@ -157,6 +157,40 @@
         rm -rf {params.outdir}
         """

+rule merge_variants:
+    input:
+        collect(outdir + "/bedpe/{sample}.bedpe", sample = samplenames)
+    output:
+        outdir + "/inversions.bedpe",
+        outdir + "/deletions.bedpe",
+        outdir + "/duplications.bedpe"
+    message:
+        "Aggregating the detected variants"
+    run:
+        from pathlib import Path
+        with open(output[0], "w") as inversions, open(output[1], "w") as deletions, open(output[2], "w") as duplications:
+            header = ["Sample","Chr1","Break1","Chr2","Break2","SplitMolecules","DiscordantReads","Orientation","Haplotype","Score","PassFilter","SV"]
+            _ = inversions.write("\t".join(header) + "\n")
+            _ = deletions.write("\t".join(header) + "\n")
+            _ = duplications.write("\t".join(header) + "\n")
+            for varfile in input:
+                samplename = Path(varfile).stem
+                with open(varfile, "r") as f_in:
+                    # read the header to skip it
+                    f_in.readline()
+                    # read the rest of it
+                    while True:
+                        line = f_in.readline()
+                        if not line:
+                            break
+                        record = line.rstrip().split("\t")
+                        if record[-1] == "inversion":
+                            _ = inversions.write(f"{samplename}\t{line}")
+                        elif record[-1] == "deletion":
+                            _ = deletions.write(f"{samplename}\t{line}")
+                        elif record[-1] == "duplication":
+                            _ = duplications.write(f"{samplename}\t{line}")
+
 rule genome_link:
     input:
         genomefile
@@ -202,7 +236,7 @@ rule genome_faidx:

 rule create_report:
     input:
-        bedpe = outdir + "/{sample}.bedpe",
+        bedpe = outdir + "/bedpe/{sample}.bedpe",
         fai = f"Genome/{bn}.fai"
     output:
         outdir + "/reports/{sample}.naibr.html"
@@ -216,7 +250,8 @@ rule create_report:
 rule workflow_summary:
     default_target: True
     input:
-        bedpe = collect(outdir + "/{sample}.bedpe", sample = samplenames),
+        bedpe = collect(outdir + "/bedpe/{sample}.bedpe", sample = samplenames),
+        bedpe_agg = collect(outdir + "/{sv}.bedpe", sv = ["inversions", "deletions","duplications"]),
         reports = collect(outdir + "/reports/{sample}.naibr.html", sample = samplenames) if not skipreports else []
     message:
         "Summarizing the workflow: {output}"
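The NAIBR version of merge_variants differs from the LEVIATHAN ones in two details: it keys on the last column (the SV type as a word rather than an SVTYPE code), and, because NAIBR's per-sample BEDPE carries no sample column, it derives the sample name from the file name with Path(varfile).stem and prepends it to every record. A sketch of just that tagging step (the path in the comment is a hypothetical example):

from pathlib import Path

# Derive the sample name from the BEDPE file name and prepend it to each
# record, matching the "Sample" column that the aggregate header declares.
def tag_records(varfile):
    samplename = Path(varfile).stem  # e.g. "sample_01.bedpe" -> "sample_01"
    with open(varfile, "r", encoding="utf-8") as f_in:
        next(f_in, None)  # NAIBR writes its own header line; skip it
        return [f"{samplename}\t{line}" for line in f_in]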