Skip to content

Commit

Permalink
Merge pull request #119 from pdimens/bamMergeFix
Browse files Browse the repository at this point in the history
Bam merge fix
  • Loading branch information
pdimens authored Aug 1, 2024
2 parents a2279db + 49fdd56 commit d9ffa6c
Show file tree
Hide file tree
Showing 10 changed files with 205 additions and 19 deletions.
2 changes: 2 additions & 0 deletions .github/filters.yml
Original file line number Diff line number Diff line change
Expand Up @@ -109,12 +109,14 @@ leviathan: &leviathan
- 'src/harpy/sv.py'
- 'src/harpy/snakefiles/sv_leviathan**.smk'
- 'test/bam/**'
- 'src/harpy/bin/concatenate_bam.py'
- 'src/harpy/reports/Leviathan**.Rmd'
naibr: &naibr
- *common
- *container
- 'src/harpy/sv.py'
- 'src/harpy/snakefiles/sv_naibr**.smk'
- 'src/harpy/bin/concatenate_bam.py'
- 'test/bam/**'
- 'test/bam_phased/**'
- 'test/vcf/test.phased.bcf'
Expand Down
2 changes: 2 additions & 0 deletions bamlist
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
test/bam/sample1.bam
test/bam/sample2.bam
3 changes: 1 addition & 2 deletions src/harpy/bin/assign_mi.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,8 +110,7 @@ def write_missingbx(bam, alnrecord):

if bam_input.lower().endswith(".bam"):
if not os.path.exists(bam_input + ".bai"):
print(f"Error: {bam_input} requires a matching {bam_input}.bai index file, but one wasn\'t found.", file = sys.stderr)
sys.exit(1)
pysam.index(bam_input)

# iniitalize input/output files
alnfile = pysam.AlignmentFile(bam_input)
Expand Down
5 changes: 4 additions & 1 deletion src/harpy/bin/check_bam.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
exit_on_error = False
)

parser.add_argument('input', help = "Input bam/sam file. If bam, a matching index file should be in the same directory.")
parser.add_argument('input', help = "Input bam/sam file")

if len(sys.argv) == 1:
parser.print_help(sys.stderr)
Expand All @@ -28,6 +28,9 @@
args = parser.parse_args()

bam_in = args.input
if bam_in.lower().endswith(".bam"):
if not os.path.exists(bam_in + ".bai"):
pysam.index(bam_in)

# regex for EXACTLY AXXCXXBXXDXX
haplotag = re.compile('^A[0-9][0-9]C[0-9][0-9]B[0-9][0-9]D[0-9][0-9]')
Expand Down
97 changes: 97 additions & 0 deletions src/harpy/bin/concatenate_bam.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
#! /usr/bin/env python
"""Concatenate records from haplotagged BAM files"""
import os
import sys
from pathlib import Path
import argparse
import pysam

parser = argparse.ArgumentParser(
prog = 'concatenate_bam.py',
description =
"""
Concatenate records from haplotagged SAM/BAM files while making sure MI:i tags remain unique for every sample.
This is a means of accomplishing the same as \'samtools cat\', except all MI (Molecule Identifier) tags are updated
so individuals don't have overlapping MI tags (which would mess up all the linked-read data). You can either provide
all the files you want to concatenate, or a single file featuring filenames with the \'-b\' option.
""",
usage = "concatenate_bam.py -o output.bam file_1.bam file_2.bam...file_N.bam",
exit_on_error = False
)
parser.add_argument("alignments", nargs='*', help = "SAM or BAM files")
parser.add_argument("-o", "--out", required = True, type = str, help = "Name of BAM output file")
parser.add_argument("-b", "--bamlist", required = False, type = str, help = "List of SAM or BAM files to concatenate")
if len(sys.argv) == 1:
parser.print_help(sys.stderr)
sys.exit(1)
args = parser.parse_args()

if (args.alignments and args.bamlist):
print("Please provide a single file to \'--bamlist\' (-b) featuring all the files you want to concatenate (one per line):", file = sys.stderr)
print("[example]: concatenate_bam.py -o c_acronotus.bam -b alignments.txt\n", file = sys.stderr)

print("Alternatively, provide the files after \'-o output.bam\':", file = sys.stderr)
print("[example]: concatenate_bam.py -o c_acronotus.bam sample1.bam sample2.bam", file = sys.stderr)

sys.exit(1)

if args.bamlist:
with open(args.bamlist, "r") as bl:
# read in and filter out commented lines
aln_list = [i.rstrip() for i in bl.readlines() if not i.startswith("#")]
else:
if isinstance(args.alignments, str):
aln_list = [args.alignments]
else:
aln_list = args.alignments

# instantiate output file
if aln_list[0].lower().endswith(".bam"):
if not os.path.exists(f"{aln_list[0]}.bai"):
pysam.index(aln_list[0])
with pysam.AlignmentFile(aln_list[0]) as xam_in:
header = xam_in.header.to_dict()
# Remove all @PG lines
if 'PG' in header:
del header['PG']
# Add a new @PG line
sys.argv[0] = os.path.basename(sys.argv[0])
new_pg_line = {'ID': 'concatenate', 'PN': 'harpy', 'VN': '1.x', 'CL': " ".join(sys.argv)}
if 'PG' not in header:
header['PG'] = []
header['PG'].append(new_pg_line)

# update RG lines to match output filename name
header['RG'][0]['ID'] = Path(args.out).stem
header['RG'][0]['SM'] = Path(args.out).stem

with pysam.AlignmentFile(args.out, "wb", header = header) as bam_out:
# current available unused MI tag
MI_NEW = 1

# iterate through the bam files
for xam in aln_list:
# create MI dict for this sample
MI_LOCAL = {}
# create index if it doesn't exist
if xam.lower().endswith(".bam"):
if not os.path.exists(f"{xam}.bai"):
pysam.index(xam)
with pysam.AlignmentFile(xam) as xamfile:
for record in xamfile.fetch():
try:
mi = record.get_tag("MI")
# if previously converted for this sample, use that
if mi in MI_LOCAL:
record.set_tag("MI", MI_LOCAL[mi])
else:
record.set_tag("MI", MI_NEW)
# add to sample conversion dict
MI_LOCAL[mi] = MI_NEW
# increment to next unique MI
MI_NEW += 1
except:
pass
bam_out.write(record)

pysam.index(args.out)
35 changes: 32 additions & 3 deletions src/harpy/reports/impute.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ using<-function(...) {
using("flexdashboard","dplyr","tidyr","DT","highcharter","scales")
modelparams <- paste(snakemake@params[[1]])
splitparams <- unlist(strsplit(modelparams, "[_/]"))
#modelparams <- "Filler Text"
```
```{r echo = FALSE, message = FALSE, warning = FALSE}
Expand Down Expand Up @@ -68,12 +69,40 @@ tryCatch(

### reportheader {.no-title}
<h2> `r comparefile` </h2>
This report details the results of genotype imputation using STITCH with the model parameters:
`r modelparams`.

This page displays various information and general statistics and clicking the `Individual Statistics` tab
in the navigation menu above will display imputation results per individual.

## Param Information {data-height=100}
### param_model
```{r}
valueBox(gsub("model", "", splitparams[1]), caption = "model", color = "#c0c0c0")
```

### param_usebx
```{r}
valueBox(gsub("usebx", "", splitparams[2]), caption = "usebX", color = "#c0c0c0")
```
### param_bxlimit
```{r}
valueBox(gsub("bxlimit", "", splitparams[6]), caption = "bxlimit", color = "#c0c0c0")
```

### param_k
```{r}
valueBox(gsub("k", "", splitparams[3]), caption = "k", color = "#c0c0c0")
```

### param_s
```{r}
valueBox(gsub("s", "", splitparams[4]), caption = "s", color = "#c0c0c0")
```

### param_ngen
```{r}
valueBox(gsub("ngen", "", splitparams[5]), caption = "ngen", color = "#c0c0c0")
```


## General Information {data-height=100}
```{r}
gcts_minmax <- filter(gcts_long, Conversion != "missing") %>% summarise(min = min(Count), max = max(Count))
Expand Down
2 changes: 1 addition & 1 deletion src/harpy/snakefiles/impute.smk
Original file line number Diff line number Diff line change
Expand Up @@ -288,7 +288,7 @@ rule imputation_results_reports:
conda:
f"{envdir}/r.yaml"
message:
"Generating imputation success report: {output}"
"Assessing imputation results: {output}"
script:
"report/impute.Rmd"

Expand Down
26 changes: 22 additions & 4 deletions src/harpy/snakefiles/sv_leviathan_pop.smk
Original file line number Diff line number Diff line change
Expand Up @@ -96,16 +96,34 @@ rule merge_populations:
bamlist = outdir + "/workflow/merge_samples/{population}.list",
bamfiles = lambda wc: collect("{sample}", sample = popdict[wc.population])
output:
bam = temp(outdir + "/workflow/input/{population}.bam"),
bai = temp(outdir + "/workflow/input/{population}.bam.bai")
bam = temp(outdir + "/workflow/input/{population}.unsort.bam"),
bai = temp(outdir + "/workflow/input/{population}.unsort.bam.bai")
threads:
workflow.cores
1
container:
None
message:
"Merging alignments: {wildcards.population}"
shell:
"samtools merge -o {output.bam}##idx##{output.bai} --threads {threads} --write-index -b {input.bamlist}"
"concatenate_bam.py -o {output.bam} -b {input.bamlist}"

rule sort_alignments:
input:
bam = outdir + "/workflow/input/{population}.unsort.bam",
bai = outdir + "/workflow/input/{population}.unsort.bam.bai"
output:
bam = outdir + "/workflow/input/{population}.bam",
bai = outdir + "/workflow/input/{population}.bam.bai"
log:
outdir + "/logs/{population}.sort.log"
resources:
mem_mb = 2000
container:
None
message:
"Sorting alignments: {wildcards.population}"
shell:
"samtools sort -O bam -l 0 -m {resources.mem_mb}M --write-index -o {output.bam}##idx##{output.bai} {input.bam} 2> {log}"

rule index_barcode:
input:
Expand Down
26 changes: 22 additions & 4 deletions src/harpy/snakefiles/sv_naibr_pop.smk
Original file line number Diff line number Diff line change
Expand Up @@ -112,16 +112,34 @@ rule merge_populations:
bamlist = outdir + "/workflow/merge_samples/{population}.list",
bamfiles = lambda wc: collect("{sample}", sample = popdict[wc.population])
output:
bam = temp(outdir + "/workflow/input/{population}.bam"),
bai = temp(outdir + "/workflow/input/{population}.bam.bai")
bam = temp(outdir + "/workflow/input/{population}.unsort.bam"),
bai = temp(outdir + "/workflow/input/{population}.unsort.bam.bai")
threads:
workflow.cores
1
container:
None
message:
"Merging alignments: {wildcards.population}"
shell:
"samtools merge -o {output.bam}##idx##{output.bai} --threads {threads} --write-index -b {input.bamlist}"
"concatenate_bam.py -o {output.bam} -b {input.bamlist}"

rule sort_alignments:
input:
bam = outdir + "/workflow/input/{population}.unsort.bam",
bai = outdir + "/workflow/input/{population}.unsort.bam.bai"
output:
bam = outdir + "/workflow/input/{population}.bam",
bai = outdir + "/workflow/input/{population}.bam.bai"
log:
outdir + "/logs/{population}.sort.log"
resources:
mem_mb = 2000
container:
None
message:
"Sorting alignments: {wildcards.population}"
shell:
"samtools sort -O bam -l 0 -m {resources.mem_mb}M --write-index -o {output.bam}##idx##{output.bai} {input.bam} 2> {log}"

rule create_config:
input:
Expand Down
26 changes: 22 additions & 4 deletions src/harpy/snakefiles/sv_naibr_pop_phase.smk
Original file line number Diff line number Diff line change
Expand Up @@ -235,16 +235,34 @@ rule merge_populations:
bamlist = outdir + "/workflow/merge_samples/{population}.list",
bamfiles = lambda wc: collect("{sample}", sample = popdict[wc.population])
output:
bam = temp(outdir + "/workflow/input/{population}.bam"),
bai = temp(outdir + "/workflow/input/{population}.bam.bai")
bam = temp(outdir + "/workflow/input/{population}.unsort.bam"),
bai = temp(outdir + "/workflow/input/{population}.unsort.bam.bai")
threads:
workflow.cores
1
container:
None
message:
"Merging alignments: {wildcards.population}"
shell:
"samtools merge -o {output.bam}##idx##{output.bai} --threads {threads} --write-index -b {input.bamlist}"
"concatenate_bam.py -o {output.bam} -b {input.bamlist}"

rule sort_alignments:
input:
bam = outdir + "/workflow/input/{population}.unsort.bam",
bai = outdir + "/workflow/input/{population}.unsort.bam.bai"
output:
bam = outdir + "/workflow/input/{population}.bam",
bai = outdir + "/workflow/input/{population}.bam.bai"
log:
outdir + "/logs/{population}.sort.log"
resources:
mem_mb = 2000
container:
None
message:
"Sorting alignments: {wildcards.population}"
shell:
"samtools sort -O bam -l 0 -m {resources.mem_mb}M --write-index -o {output.bam}##idx##{output.bai} {input.bam} 2> {log}"

rule create_config:
input:
Expand Down

0 comments on commit d9ffa6c

Please sign in to comment.