Merge pull request #119 from pdimens/bamMergeFix

Bam merge fix
pdimens · Aug 1, 2024 · d9ffa6c · d9ffa6c
2 parents a2279db + 49fdd56
commit d9ffa6c
Show file tree

Hide file tree

Showing 10 changed files with 205 additions and 19 deletions.
diff --git a/.github/filters.yml b/.github/filters.yml
@@ -109,12 +109,14 @@ leviathan: &leviathan
   - 'src/harpy/sv.py'
   - 'src/harpy/snakefiles/sv_leviathan**.smk'
   - 'test/bam/**'
+  - 'src/harpy/bin/concatenate_bam.py'
   - 'src/harpy/reports/Leviathan**.Rmd'
 naibr: &naibr
   - *common
   - *container
   - 'src/harpy/sv.py'
   - 'src/harpy/snakefiles/sv_naibr**.smk'
+  - 'src/harpy/bin/concatenate_bam.py'
   - 'test/bam/**'
   - 'test/bam_phased/**'
   - 'test/vcf/test.phased.bcf'

diff --git a/bamlist b/bamlist
@@ -0,0 +1,2 @@
+test/bam/sample1.bam
+test/bam/sample2.bam
diff --git a/src/harpy/bin/assign_mi.py b/src/harpy/bin/assign_mi.py
@@ -110,8 +110,7 @@ def write_missingbx(bam, alnrecord):
 
 if bam_input.lower().endswith(".bam"):
     if not os.path.exists(bam_input + ".bai"):
-        print(f"Error: {bam_input} requires a matching {bam_input}.bai index file, but one wasn\'t found.", file = sys.stderr)
-        sys.exit(1)
+        pysam.index(bam_input)
 
 # iniitalize input/output files
 alnfile = pysam.AlignmentFile(bam_input)

diff --git a/src/harpy/bin/check_bam.py b/src/harpy/bin/check_bam.py
@@ -19,7 +19,7 @@
     exit_on_error = False
     )
 
-parser.add_argument('input', help = "Input bam/sam file. If bam, a matching index file should be in the same directory.")
+parser.add_argument('input', help = "Input bam/sam file")
 
 if len(sys.argv) == 1:
     parser.print_help(sys.stderr)
@@ -28,6 +28,9 @@
 args = parser.parse_args()
 
 bam_in = args.input
+if bam_in.lower().endswith(".bam"):
+    if not os.path.exists(bam_in + ".bai"):
+        pysam.index(bam_in)
 
 # regex for EXACTLY AXXCXXBXXDXX
 haplotag = re.compile('^A[0-9][0-9]C[0-9][0-9]B[0-9][0-9]D[0-9][0-9]')

diff --git a/src/harpy/bin/concatenate_bam.py b/src/harpy/bin/concatenate_bam.py
@@ -0,0 +1,97 @@
+#! /usr/bin/env python
+"""Concatenate records from haplotagged BAM files"""
+import os
+import sys
+from pathlib import Path
+import argparse
+import pysam
+
+parser = argparse.ArgumentParser(
+    prog = 'concatenate_bam.py',
+    description =
+    """
+    Concatenate records from haplotagged SAM/BAM files while making sure MI:i tags remain unique for every sample.
+    This is a means of accomplishing the same as \'samtools cat\', except all MI (Molecule Identifier) tags are updated
+    so individuals don't have overlapping MI tags (which would mess up all the linked-read data). You can either provide
+    all the files you want to concatenate, or a single file featuring filenames with the \'-b\' option.
+    """,
+    usage = "concatenate_bam.py -o output.bam file_1.bam file_2.bam...file_N.bam",
+    exit_on_error = False
+    )
+parser.add_argument("alignments", nargs='*', help = "SAM or BAM files")
+parser.add_argument("-o", "--out", required = True, type = str, help = "Name of BAM output file")
+parser.add_argument("-b", "--bamlist", required = False, type = str, help = "List of SAM or BAM files to concatenate")
+if len(sys.argv) == 1:
+    parser.print_help(sys.stderr)
+    sys.exit(1)
+args = parser.parse_args()
+
+if (args.alignments and args.bamlist):
+    print("Please provide a single file to \'--bamlist\' (-b) featuring all the files you want to concatenate (one per line):", file = sys.stderr)
+    print("[example]: concatenate_bam.py -o c_acronotus.bam -b alignments.txt\n", file = sys.stderr)
+
+    print("Alternatively, provide the files after \'-o output.bam\':", file = sys.stderr)
+    print("[example]: concatenate_bam.py -o c_acronotus.bam sample1.bam sample2.bam", file = sys.stderr)
+
+    sys.exit(1)
+
+if args.bamlist:
+    with open(args.bamlist, "r") as bl:
+        # read in and filter out commented lines
+        aln_list = [i.rstrip() for i in bl.readlines() if not i.startswith("#")]
+else:
+    if isinstance(args.alignments, str):
+        aln_list = [args.alignments]
+    else:
+        aln_list = args.alignments
+
+# instantiate output file
+if aln_list[0].lower().endswith(".bam"):
+    if not os.path.exists(f"{aln_list[0]}.bai"):
+        pysam.index(aln_list[0])
+with pysam.AlignmentFile(aln_list[0]) as xam_in:
+    header = xam_in.header.to_dict()
+# Remove all @PG lines
+if 'PG' in header:
+    del header['PG']
+# Add a new @PG line
+sys.argv[0] = os.path.basename(sys.argv[0])
+new_pg_line = {'ID': 'concatenate', 'PN': 'harpy', 'VN': '1.x', 'CL': " ".join(sys.argv)}
+if 'PG' not in header:
+    header['PG'] = []
+header['PG'].append(new_pg_line)
+
+# update RG lines to match output filename name
+header['RG'][0]['ID'] = Path(args.out).stem
+header['RG'][0]['SM'] = Path(args.out).stem
+
+with pysam.AlignmentFile(args.out, "wb", header = header) as bam_out:
+    # current available unused MI tag
+    MI_NEW = 1
+
+    # iterate through the bam files
+    for xam in aln_list:
+        # create MI dict for this sample
+        MI_LOCAL = {}
+        # create index if it doesn't exist
+        if xam.lower().endswith(".bam"):
+            if not os.path.exists(f"{xam}.bai"):
+                pysam.index(xam)
+        with pysam.AlignmentFile(xam) as xamfile:
+            for record in xamfile.fetch():
+                try:
+                    mi = record.get_tag("MI")
+                    # if previously converted for this sample, use that
+                    if mi in MI_LOCAL:
+                        record.set_tag("MI", MI_LOCAL[mi])
+                    else:
+                        record.set_tag("MI", MI_NEW)
+                        # add to sample conversion dict
+                        MI_LOCAL[mi] = MI_NEW
+                        # increment to next unique MI
+                        MI_NEW += 1
+                except:
+                    pass
+                bam_out.write(record)
+
+pysam.index(args.out)
diff --git a/src/harpy/reports/impute.Rmd b/src/harpy/reports/impute.Rmd
@@ -29,6 +29,7 @@ using<-function(...) {
 using("flexdashboard","dplyr","tidyr","DT","highcharter","scales")
 
 modelparams <- paste(snakemake@params[[1]])
+splitparams <- unlist(strsplit(modelparams, "[_/]"))
 #modelparams <- "Filler Text"
 ```
 ```{r echo = FALSE, message = FALSE, warning = FALSE}
@@ -68,12 +69,40 @@ tryCatch(
 
 ### reportheader {.no-title}
 <h2> `r comparefile` </h2>
-This report details the results of genotype imputation using STITCH with the model parameters:
-`r modelparams`.
-
 This page displays various information and general statistics and clicking the `Individual Statistics` tab 
 in the navigation menu above will display imputation results per individual.
 
+## Param Information {data-height=100}
+### param_model
+```{r}
+valueBox(gsub("model", "", splitparams[1]), caption = "model", color = "#c0c0c0")
+```
+
+### param_usebx
+```{r}
+valueBox(gsub("usebx", "", splitparams[2]), caption = "usebX", color = "#c0c0c0")
+```
+### param_bxlimit
+```{r}
+valueBox(gsub("bxlimit", "", splitparams[6]), caption = "bxlimit", color = "#c0c0c0")
+```
+
+### param_k
+```{r}
+valueBox(gsub("k", "", splitparams[3]), caption = "k", color = "#c0c0c0")
+```
+
+### param_s
+```{r}
+valueBox(gsub("s", "", splitparams[4]), caption = "s", color = "#c0c0c0")
+```
+
+### param_ngen
+```{r}
+valueBox(gsub("ngen", "", splitparams[5]), caption = "ngen", color = "#c0c0c0")
+```
+
+
 ## General Information {data-height=100}
 ```{r}
 gcts_minmax <- filter(gcts_long, Conversion != "missing") %>% summarise(min = min(Count), max = max(Count))

diff --git a/src/harpy/snakefiles/impute.smk b/src/harpy/snakefiles/impute.smk
@@ -288,7 +288,7 @@ rule imputation_results_reports:
     conda:
         f"{envdir}/r.yaml"
     message:
-        "Generating imputation success report: {output}"
+        "Assessing imputation results: {output}"
     script:
         "report/impute.Rmd"
 

diff --git a/src/harpy/snakefiles/sv_leviathan_pop.smk b/src/harpy/snakefiles/sv_leviathan_pop.smk
@@ -96,16 +96,34 @@ rule merge_populations:
         bamlist  = outdir + "/workflow/merge_samples/{population}.list",
         bamfiles = lambda wc: collect("{sample}", sample = popdict[wc.population]) 
     output:
-        bam = temp(outdir + "/workflow/input/{population}.bam"),
-        bai = temp(outdir + "/workflow/input/{population}.bam.bai")
+        bam = temp(outdir + "/workflow/input/{population}.unsort.bam"),
+        bai = temp(outdir + "/workflow/input/{population}.unsort.bam.bai")
     threads:
-        workflow.cores
+        1
     container:
         None
     message:
         "Merging alignments: {wildcards.population}"
     shell:
-        "samtools merge -o {output.bam}##idx##{output.bai} --threads {threads} --write-index -b {input.bamlist}"
+        "concatenate_bam.py -o {output.bam} -b {input.bamlist}"
+
+rule sort_alignments:
+    input:
+        bam = outdir + "/workflow/input/{population}.unsort.bam",
+        bai = outdir + "/workflow/input/{population}.unsort.bam.bai"
+    output:
+        bam = outdir + "/workflow/input/{population}.bam",
+        bai = outdir + "/workflow/input/{population}.bam.bai"
+    log:
+        outdir + "/logs/{population}.sort.log"
+    resources:
+        mem_mb = 2000
+    container:
+        None
+    message:
+        "Sorting alignments: {wildcards.population}"
+    shell:
+        "samtools sort -O bam -l 0 -m {resources.mem_mb}M --write-index -o {output.bam}##idx##{output.bai} {input.bam} 2> {log}"
 
 rule index_barcode:
     input: 

diff --git a/src/harpy/snakefiles/sv_naibr_pop.smk b/src/harpy/snakefiles/sv_naibr_pop.smk
@@ -112,16 +112,34 @@ rule merge_populations:
         bamlist  = outdir + "/workflow/merge_samples/{population}.list",
         bamfiles = lambda wc: collect("{sample}", sample = popdict[wc.population]) 
     output:
-        bam = temp(outdir + "/workflow/input/{population}.bam"),
-        bai = temp(outdir + "/workflow/input/{population}.bam.bai")
+        bam = temp(outdir + "/workflow/input/{population}.unsort.bam"),
+        bai = temp(outdir + "/workflow/input/{population}.unsort.bam.bai")
     threads:
-        workflow.cores
+        1
     container:
         None
     message:
         "Merging alignments: {wildcards.population}"
     shell:
-        "samtools merge -o {output.bam}##idx##{output.bai} --threads {threads} --write-index -b {input.bamlist}"
+        "concatenate_bam.py -o {output.bam} -b {input.bamlist}"
+
+rule sort_alignments:
+    input:
+        bam = outdir + "/workflow/input/{population}.unsort.bam",
+        bai = outdir + "/workflow/input/{population}.unsort.bam.bai"
+    output:
+        bam = outdir + "/workflow/input/{population}.bam",
+        bai = outdir + "/workflow/input/{population}.bam.bai"
+    log:
+        outdir + "/logs/{population}.sort.log"
+    resources:
+        mem_mb = 2000
+    container:
+        None
+    message:
+        "Sorting alignments: {wildcards.population}"
+    shell:
+        "samtools sort -O bam -l 0 -m {resources.mem_mb}M --write-index -o {output.bam}##idx##{output.bai} {input.bam} 2> {log}"
 
 rule create_config:
     input:

diff --git a/src/harpy/snakefiles/sv_naibr_pop_phase.smk b/src/harpy/snakefiles/sv_naibr_pop_phase.smk
@@ -235,16 +235,34 @@ rule merge_populations:
         bamlist  = outdir + "/workflow/merge_samples/{population}.list",
         bamfiles = lambda wc: collect("{sample}", sample = popdict[wc.population]) 
     output:
-        bam = temp(outdir + "/workflow/input/{population}.bam"),
-        bai = temp(outdir + "/workflow/input/{population}.bam.bai")
+        bam = temp(outdir + "/workflow/input/{population}.unsort.bam"),
+        bai = temp(outdir + "/workflow/input/{population}.unsort.bam.bai")
     threads:
-        workflow.cores
+        1
     container:
         None
     message:
         "Merging alignments: {wildcards.population}"
     shell:
-        "samtools merge -o {output.bam}##idx##{output.bai} --threads {threads} --write-index -b {input.bamlist}"
+        "concatenate_bam.py -o {output.bam} -b {input.bamlist}"
+
+rule sort_alignments:
+    input:
+        bam = outdir + "/workflow/input/{population}.unsort.bam",
+        bai = outdir + "/workflow/input/{population}.unsort.bam.bai"
+    output:
+        bam = outdir + "/workflow/input/{population}.bam",
+        bai = outdir + "/workflow/input/{population}.bam.bai"
+    log:
+        outdir + "/logs/{population}.sort.log"
+    resources:
+        mem_mb = 2000
+    container:
+        None
+    message:
+        "Sorting alignments: {wildcards.population}"
+    shell:
+        "samtools sort -O bam -l 0 -m {resources.mem_mb}M --write-index -o {output.bam}##idx##{output.bai} {input.bam} 2> {log}"
 
 rule create_config:
     input: