diff --git a/.github/filters.yml b/.github/filters.yml
index 63797d230..62163e977 100644
--- a/.github/filters.yml
+++ b/.github/filters.yml
@@ -109,12 +109,14 @@ leviathan: &leviathan
- 'src/harpy/sv.py'
- 'src/harpy/snakefiles/sv_leviathan**.smk'
- 'test/bam/**'
+ - 'src/harpy/bin/concatenate_bam.py'
- 'src/harpy/reports/Leviathan**.Rmd'
naibr: &naibr
- *common
- *container
- 'src/harpy/sv.py'
- 'src/harpy/snakefiles/sv_naibr**.smk'
+ - 'src/harpy/bin/concatenate_bam.py'
- 'test/bam/**'
- 'test/bam_phased/**'
- 'test/vcf/test.phased.bcf'
diff --git a/bamlist b/bamlist
new file mode 100644
index 000000000..31050d5a0
--- /dev/null
+++ b/bamlist
@@ -0,0 +1,2 @@
+test/bam/sample1.bam
+test/bam/sample2.bam
diff --git a/src/harpy/bin/assign_mi.py b/src/harpy/bin/assign_mi.py
index 016a5e350..f3cad54b2 100755
--- a/src/harpy/bin/assign_mi.py
+++ b/src/harpy/bin/assign_mi.py
@@ -110,8 +110,7 @@ def write_missingbx(bam, alnrecord):
if bam_input.lower().endswith(".bam"):
if not os.path.exists(bam_input + ".bai"):
- print(f"Error: {bam_input} requires a matching {bam_input}.bai index file, but one wasn\'t found.", file = sys.stderr)
- sys.exit(1)
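+        # build the missing .bai index instead of exiting with an error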
+ pysam.index(bam_input)
     # initialize input/output files
alnfile = pysam.AlignmentFile(bam_input)
diff --git a/src/harpy/bin/check_bam.py b/src/harpy/bin/check_bam.py
index 15ddaf301..cf636d355 100755
--- a/src/harpy/bin/check_bam.py
+++ b/src/harpy/bin/check_bam.py
@@ -19,7 +19,7 @@
exit_on_error = False
)
-parser.add_argument('input', help = "Input bam/sam file. If bam, a matching index file should be in the same directory.")
+parser.add_argument('input', help = "Input bam/sam file")
if len(sys.argv) == 1:
parser.print_help(sys.stderr)
@@ -28,6 +28,9 @@
args = parser.parse_args()
bam_in = args.input
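+# index the bam file on the fly if a .bai index doesn't already exist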
+if bam_in.lower().endswith(".bam"):
+ if not os.path.exists(bam_in + ".bai"):
+ pysam.index(bam_in)
# regex for EXACTLY AXXCXXBXXDXX
haplotag = re.compile('^A[0-9][0-9]C[0-9][0-9]B[0-9][0-9]D[0-9][0-9]')
diff --git a/src/harpy/bin/concatenate_bam.py b/src/harpy/bin/concatenate_bam.py
new file mode 100755
index 000000000..73366f675
--- /dev/null
+++ b/src/harpy/bin/concatenate_bam.py
@@ -0,0 +1,97 @@
+#! /usr/bin/env python
+"""Concatenate records from haplotagged BAM files"""
+import os
+import sys
+from pathlib import Path
+import argparse
+import pysam
+
+parser = argparse.ArgumentParser(
+ prog = 'concatenate_bam.py',
+ description =
+ """
+    Concatenate records from haplotagged SAM/BAM files while making sure MI:i tags remain unique across samples.
+    This accomplishes the same thing as \'samtools cat\', except all MI (Molecule Identifier) tags are updated so
+    samples don't have overlapping MI tags (which would confound the linked-read information). You can either list
+    all the files you want to concatenate as positional arguments, or provide a single file of filenames (one per line) with the \'-b\' option.
+ """,
+ usage = "concatenate_bam.py -o output.bam file_1.bam file_2.bam...file_N.bam",
+ exit_on_error = False
+ )
+parser.add_argument("alignments", nargs='*', help = "SAM or BAM files")
+parser.add_argument("-o", "--out", required = True, type = str, help = "Name of BAM output file")
+parser.add_argument("-b", "--bamlist", required = False, type = str, help = "List of SAM or BAM files to concatenate")
+if len(sys.argv) == 1:
+ parser.print_help(sys.stderr)
+ sys.exit(1)
+args = parser.parse_args()
+
+if (args.alignments and args.bamlist):
+    print("Error: input files were provided both as positional arguments and through \'--bamlist\' (-b). Please use only one of these two methods.\n", file = sys.stderr)
+    print("Either provide a single file to \'--bamlist\' (-b) listing all the files you want to concatenate (one per line):", file = sys.stderr)
+    print("[example]: concatenate_bam.py -o c_acronotus.bam -b alignments.txt\n", file = sys.stderr)
+
+    print("Or provide the files after \'-o output.bam\':", file = sys.stderr)
+    print("[example]: concatenate_bam.py -o c_acronotus.bam sample1.bam sample2.bam", file = sys.stderr)
+
+ sys.exit(1)
+
+if args.bamlist:
+ with open(args.bamlist, "r") as bl:
+        # read in, strip whitespace, and skip blank or commented lines
+        aln_list = [i.rstrip() for i in bl.readlines() if i.strip() and not i.startswith("#")]
+else:
+    aln_list = args.alignments
+
+if not aln_list:
+    print("Error: no input SAM/BAM files were provided.", file = sys.stderr)
+    sys.exit(1)
+
+# instantiate output file
+if aln_list[0].lower().endswith(".bam"):
+ if not os.path.exists(f"{aln_list[0]}.bai"):
+ pysam.index(aln_list[0])
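+# use the first file's header as the template for the output header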
+with pysam.AlignmentFile(aln_list[0]) as xam_in:
+ header = xam_in.header.to_dict()
+# Remove all @PG lines
+if 'PG' in header:
+ del header['PG']
+# Add a new @PG line
+sys.argv[0] = os.path.basename(sys.argv[0])
+new_pg_line = {'ID': 'concatenate', 'PN': 'harpy', 'VN': '1.x', 'CL': " ".join(sys.argv)}
+if 'PG' not in header:
+ header['PG'] = []
+header['PG'].append(new_pg_line)
+
+# update RG lines to match the output filename
+header['RG'][0]['ID'] = Path(args.out).stem
+header['RG'][0]['SM'] = Path(args.out).stem
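+# note: only the first @RG entry is updated, assuming each input BAM carries a single read group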
+
+with pysam.AlignmentFile(args.out, "wb", header = header) as bam_out:
+ # current available unused MI tag
+ MI_NEW = 1
+
+ # iterate through the bam files
+ for xam in aln_list:
+ # create MI dict for this sample
+ MI_LOCAL = {}
+ # create index if it doesn't exist
+ if xam.lower().endswith(".bam"):
+ if not os.path.exists(f"{xam}.bai"):
+ pysam.index(xam)
+ with pysam.AlignmentFile(xam) as xamfile:
+ for record in xamfile.fetch():
+ try:
+ mi = record.get_tag("MI")
+ # if previously converted for this sample, use that
+ if mi in MI_LOCAL:
+ record.set_tag("MI", MI_LOCAL[mi])
+ else:
+ record.set_tag("MI", MI_NEW)
+ # add to sample conversion dict
+ MI_LOCAL[mi] = MI_NEW
+ # increment to next unique MI
+ MI_NEW += 1
+                except KeyError:
+                    # the record has no MI tag; write it through unchanged
+                    pass
+ bam_out.write(record)
+
+pysam.index(args.out)
diff --git a/src/harpy/reports/impute.Rmd b/src/harpy/reports/impute.Rmd
index b1f1ad087..8a53ea5a1 100644
--- a/src/harpy/reports/impute.Rmd
+++ b/src/harpy/reports/impute.Rmd
@@ -29,6 +29,7 @@ using<-function(...) {
using("flexdashboard","dplyr","tidyr","DT","highcharter","scales")
modelparams <- paste(snakemake@params[[1]])
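+# split the parameter string; the pieces are expected in the order model, usebx, k, s, ngen, bxlimit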
+splitparams <- unlist(strsplit(modelparams, "[_/]"))
#modelparams <- "Filler Text"
```
```{r echo = FALSE, message = FALSE, warning = FALSE}
@@ -68,12 +69,40 @@ tryCatch(
### reportheader {.no-title}
`r comparefile`
-This report details the results of genotype imputation using STITCH with the model parameters:
-`r modelparams`.
-
This page displays various information and general statistics and clicking the `Individual Statistics` tab
in the navigation menu above will display imputation results per individual.
+## Param Information {data-height=100}
+### param_model
+```{r}
+valueBox(gsub("model", "", splitparams[1]), caption = "model", color = "#c0c0c0")
+```
+
+### param_usebx
+```{r}
+valueBox(gsub("usebx", "", splitparams[2]), caption = "usebX", color = "#c0c0c0")
+```
+### param_bxlimit
+```{r}
+valueBox(gsub("bxlimit", "", splitparams[6]), caption = "bxlimit", color = "#c0c0c0")
+```
+
+### param_k
+```{r}
+valueBox(gsub("k", "", splitparams[3]), caption = "k", color = "#c0c0c0")
+```
+
+### param_s
+```{r}
+valueBox(gsub("s", "", splitparams[4]), caption = "s", color = "#c0c0c0")
+```
+
+### param_ngen
+```{r}
+valueBox(gsub("ngen", "", splitparams[5]), caption = "ngen", color = "#c0c0c0")
+```
+
+
## General Information {data-height=100}
```{r}
gcts_minmax <- filter(gcts_long, Conversion != "missing") %>% summarise(min = min(Count), max = max(Count))
diff --git a/src/harpy/snakefiles/impute.smk b/src/harpy/snakefiles/impute.smk
index c58afec6c..f1c242cfe 100644
--- a/src/harpy/snakefiles/impute.smk
+++ b/src/harpy/snakefiles/impute.smk
@@ -288,7 +288,7 @@ rule imputation_results_reports:
conda:
f"{envdir}/r.yaml"
message:
- "Generating imputation success report: {output}"
+ "Assessing imputation results: {output}"
script:
"report/impute.Rmd"
diff --git a/src/harpy/snakefiles/sv_leviathan_pop.smk b/src/harpy/snakefiles/sv_leviathan_pop.smk
index 76a048de7..bbb944fa7 100644
--- a/src/harpy/snakefiles/sv_leviathan_pop.smk
+++ b/src/harpy/snakefiles/sv_leviathan_pop.smk
@@ -96,16 +96,34 @@ rule merge_populations:
bamlist = outdir + "/workflow/merge_samples/{population}.list",
bamfiles = lambda wc: collect("{sample}", sample = popdict[wc.population])
output:
- bam = temp(outdir + "/workflow/input/{population}.bam"),
- bai = temp(outdir + "/workflow/input/{population}.bam.bai")
+ bam = temp(outdir + "/workflow/input/{population}.unsort.bam"),
+ bai = temp(outdir + "/workflow/input/{population}.unsort.bam.bai")
threads:
- workflow.cores
+ 1
container:
None
message:
"Merging alignments: {wildcards.population}"
shell:
- "samtools merge -o {output.bam}##idx##{output.bai} --threads {threads} --write-index -b {input.bamlist}"
+ "concatenate_bam.py -o {output.bam} -b {input.bamlist}"
+
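+# the concatenated BAM from concatenate_bam.py is not coordinate-sorted, so sort and index it here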
+rule sort_alignments:
+ input:
+ bam = outdir + "/workflow/input/{population}.unsort.bam",
+ bai = outdir + "/workflow/input/{population}.unsort.bam.bai"
+ output:
+ bam = outdir + "/workflow/input/{population}.bam",
+ bai = outdir + "/workflow/input/{population}.bam.bai"
+ log:
+ outdir + "/logs/{population}.sort.log"
+ resources:
+ mem_mb = 2000
+ container:
+ None
+ message:
+ "Sorting alignments: {wildcards.population}"
+ shell:
+ "samtools sort -O bam -l 0 -m {resources.mem_mb}M --write-index -o {output.bam}##idx##{output.bai} {input.bam} 2> {log}"
rule index_barcode:
input:
diff --git a/src/harpy/snakefiles/sv_naibr_pop.smk b/src/harpy/snakefiles/sv_naibr_pop.smk
index a49a70e79..96fa6d2c2 100644
--- a/src/harpy/snakefiles/sv_naibr_pop.smk
+++ b/src/harpy/snakefiles/sv_naibr_pop.smk
@@ -112,16 +112,34 @@ rule merge_populations:
bamlist = outdir + "/workflow/merge_samples/{population}.list",
bamfiles = lambda wc: collect("{sample}", sample = popdict[wc.population])
output:
- bam = temp(outdir + "/workflow/input/{population}.bam"),
- bai = temp(outdir + "/workflow/input/{population}.bam.bai")
+ bam = temp(outdir + "/workflow/input/{population}.unsort.bam"),
+ bai = temp(outdir + "/workflow/input/{population}.unsort.bam.bai")
threads:
- workflow.cores
+ 1
container:
None
message:
"Merging alignments: {wildcards.population}"
shell:
- "samtools merge -o {output.bam}##idx##{output.bai} --threads {threads} --write-index -b {input.bamlist}"
+ "concatenate_bam.py -o {output.bam} -b {input.bamlist}"
+
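+# the concatenated BAM from concatenate_bam.py is not coordinate-sorted, so sort and index it here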
+rule sort_alignments:
+ input:
+ bam = outdir + "/workflow/input/{population}.unsort.bam",
+ bai = outdir + "/workflow/input/{population}.unsort.bam.bai"
+ output:
+ bam = outdir + "/workflow/input/{population}.bam",
+ bai = outdir + "/workflow/input/{population}.bam.bai"
+ log:
+ outdir + "/logs/{population}.sort.log"
+ resources:
+ mem_mb = 2000
+ container:
+ None
+ message:
+ "Sorting alignments: {wildcards.population}"
+ shell:
+ "samtools sort -O bam -l 0 -m {resources.mem_mb}M --write-index -o {output.bam}##idx##{output.bai} {input.bam} 2> {log}"
rule create_config:
input:
diff --git a/src/harpy/snakefiles/sv_naibr_pop_phase.smk b/src/harpy/snakefiles/sv_naibr_pop_phase.smk
index 64d02f6b5..c13ee5632 100644
--- a/src/harpy/snakefiles/sv_naibr_pop_phase.smk
+++ b/src/harpy/snakefiles/sv_naibr_pop_phase.smk
@@ -235,16 +235,34 @@ rule merge_populations:
bamlist = outdir + "/workflow/merge_samples/{population}.list",
bamfiles = lambda wc: collect("{sample}", sample = popdict[wc.population])
output:
- bam = temp(outdir + "/workflow/input/{population}.bam"),
- bai = temp(outdir + "/workflow/input/{population}.bam.bai")
+ bam = temp(outdir + "/workflow/input/{population}.unsort.bam"),
+ bai = temp(outdir + "/workflow/input/{population}.unsort.bam.bai")
threads:
- workflow.cores
+ 1
container:
None
message:
"Merging alignments: {wildcards.population}"
shell:
- "samtools merge -o {output.bam}##idx##{output.bai} --threads {threads} --write-index -b {input.bamlist}"
+ "concatenate_bam.py -o {output.bam} -b {input.bamlist}"
+
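+# the concatenated BAM from concatenate_bam.py is not coordinate-sorted, so sort and index it here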
+rule sort_alignments:
+ input:
+ bam = outdir + "/workflow/input/{population}.unsort.bam",
+ bai = outdir + "/workflow/input/{population}.unsort.bam.bai"
+ output:
+ bam = outdir + "/workflow/input/{population}.bam",
+ bai = outdir + "/workflow/input/{population}.bam.bai"
+ log:
+ outdir + "/logs/{population}.sort.log"
+ resources:
+ mem_mb = 2000
+ container:
+ None
+ message:
+ "Sorting alignments: {wildcards.population}"
+ shell:
+ "samtools sort -O bam -l 0 -m {resources.mem_mb}M --write-index -o {output.bam}##idx##{output.bai} {input.bam} 2> {log}"
rule create_config:
input: