diff --git a/.github/filters.yml b/.github/filters.yml
index 63797d230..62163e977 100644
--- a/.github/filters.yml
+++ b/.github/filters.yml
@@ -109,12 +109,14 @@ leviathan: &leviathan
   - 'src/harpy/sv.py'
   - 'src/harpy/snakefiles/sv_leviathan**.smk'
   - 'test/bam/**'
+  - 'src/harpy/bin/concatenate_bam.py'
   - 'src/harpy/reports/Leviathan**.Rmd'
 naibr: &naibr
   - *common
   - *container
   - 'src/harpy/sv.py'
   - 'src/harpy/snakefiles/sv_naibr**.smk'
+  - 'src/harpy/bin/concatenate_bam.py'
   - 'test/bam/**'
   - 'test/bam_phased/**'
   - 'test/vcf/test.phased.bcf'
diff --git a/bamlist b/bamlist
new file mode 100644
index 000000000..31050d5a0
--- /dev/null
+++ b/bamlist
@@ -0,0 +1,2 @@
+test/bam/sample1.bam
+test/bam/sample2.bam
diff --git a/src/harpy/bin/assign_mi.py b/src/harpy/bin/assign_mi.py
index 016a5e350..f3cad54b2 100755
--- a/src/harpy/bin/assign_mi.py
+++ b/src/harpy/bin/assign_mi.py
@@ -110,8 +110,7 @@ def write_missingbx(bam, alnrecord):
 
 if bam_input.lower().endswith(".bam"):
     if not os.path.exists(bam_input + ".bai"):
-        print(f"Error: {bam_input} requires a matching {bam_input}.bai index file, but one wasn\'t found.", file = sys.stderr)
-        sys.exit(1)
+        pysam.index(bam_input)
 
 # iniitalize input/output files
 alnfile = pysam.AlignmentFile(bam_input)
diff --git a/src/harpy/bin/check_bam.py b/src/harpy/bin/check_bam.py
index 15ddaf301..cf636d355 100755
--- a/src/harpy/bin/check_bam.py
+++ b/src/harpy/bin/check_bam.py
@@ -19,7 +19,7 @@
     exit_on_error = False
     )
 
-parser.add_argument('input', help = "Input bam/sam file. If bam, a matching index file should be in the same directory.")
+parser.add_argument('input', help = "Input bam/sam file")
 
 if len(sys.argv) == 1:
     parser.print_help(sys.stderr)
@@ -28,6 +28,9 @@
 args = parser.parse_args()
 bam_in = args.input
 
+if bam_in.lower().endswith(".bam"):
+    if not os.path.exists(bam_in + ".bai"):
+        pysam.index(bam_in)
 
 # regex for EXACTLY AXXCXXBXXDXX
 haplotag = re.compile('^A[0-9][0-9]C[0-9][0-9]B[0-9][0-9]D[0-9][0-9]')
diff --git a/src/harpy/bin/concatenate_bam.py b/src/harpy/bin/concatenate_bam.py
new file mode 100755
index 000000000..73366f675
--- /dev/null
+++ b/src/harpy/bin/concatenate_bam.py
@@ -0,0 +1,97 @@
+#! /usr/bin/env python
+"""Concatenate records from haplotagged BAM files"""
+import os
+import sys
+from pathlib import Path
+import argparse
+import pysam
+
+parser = argparse.ArgumentParser(
+    prog = 'concatenate_bam.py',
+    description =
+    """
+    Concatenate records from haplotagged SAM/BAM files while making sure MI:i tags remain unique for every sample.
+    This is a means of accomplishing the same as \'samtools cat\', except all MI (Molecule Identifier) tags are updated
+    so individuals don't have overlapping MI tags (which would mess up all the linked-read data). You can either provide
+    all the files you want to concatenate, or a single file featuring filenames with the \'-b\' option.
+    """,
+    usage = "concatenate_bam.py -o output.bam file_1.bam file_2.bam...file_N.bam",
+    exit_on_error = False
+    )
+parser.add_argument("alignments", nargs='*', help = "SAM or BAM files")
+parser.add_argument("-o", "--out", required = True, type = str, help = "Name of BAM output file")
+parser.add_argument("-b", "--bamlist", required = False, type = str, help = "List of SAM or BAM files to concatenate")
+if len(sys.argv) == 1:
+    parser.print_help(sys.stderr)
+    sys.exit(1)
+args = parser.parse_args()
+
+if (args.alignments and args.bamlist):
+    print("Please provide a single file to \'--bamlist\' (-b) featuring all the files you want to concatenate (one per line):", file = sys.stderr)
+    print("[example]: concatenate_bam.py -o c_acronotus.bam -b alignments.txt\n", file = sys.stderr)
+
+    print("Alternatively, provide the files after \'-o output.bam\':", file = sys.stderr)
+    print("[example]: concatenate_bam.py -o c_acronotus.bam sample1.bam sample2.bam", file = sys.stderr)
+
+    sys.exit(1)
+
+if args.bamlist:
+    with open(args.bamlist, "r") as bl:
+        # read in and filter out commented lines
+        aln_list = [i.rstrip() for i in bl.readlines() if not i.startswith("#")]
+else:
+    if isinstance(args.alignments, str):
+        aln_list = [args.alignments]
+    else:
+        aln_list = args.alignments
+
+# instantiate output file
+if aln_list[0].lower().endswith(".bam"):
+    if not os.path.exists(f"{aln_list[0]}.bai"):
+        pysam.index(aln_list[0])
+with pysam.AlignmentFile(aln_list[0]) as xam_in:
+    header = xam_in.header.to_dict()
+# Remove all @PG lines
+if 'PG' in header:
+    del header['PG']
+# Add a new @PG line
+sys.argv[0] = os.path.basename(sys.argv[0])
+new_pg_line = {'ID': 'concatenate', 'PN': 'harpy', 'VN': '1.x', 'CL': " ".join(sys.argv)}
+if 'PG' not in header:
+    header['PG'] = []
+header['PG'].append(new_pg_line)
+
+# update RG lines to match the output filename
+header['RG'][0]['ID'] = Path(args.out).stem
+header['RG'][0]['SM'] = Path(args.out).stem
+
+with pysam.AlignmentFile(args.out, "wb", header = header) as bam_out:
+    # current available unused MI tag
+    MI_NEW = 1
+
+    # iterate through the bam files
+    for xam in aln_list:
+        # create MI dict for this sample
+        MI_LOCAL = {}
+        # create index if it doesn't exist
+        if xam.lower().endswith(".bam"):
+            if not os.path.exists(f"{xam}.bai"):
+                pysam.index(xam)
+        with pysam.AlignmentFile(xam) as xamfile:
+            for record in xamfile.fetch():
+                try:
+                    mi = record.get_tag("MI")
+                    # if previously converted for this sample, use that
+                    if mi in MI_LOCAL:
+                        record.set_tag("MI", MI_LOCAL[mi])
+                    else:
+                        record.set_tag("MI", MI_NEW)
+                        # add to sample conversion dict
+                        MI_LOCAL[mi] = MI_NEW
+                        # increment to next unique MI
+                        MI_NEW += 1
+                except KeyError:
+                    pass
+                bam_out.write(record)
+
+pysam.index(args.out)
diff --git a/src/harpy/reports/impute.Rmd b/src/harpy/reports/impute.Rmd
index b1f1ad087..8a53ea5a1 100644
--- a/src/harpy/reports/impute.Rmd
+++ b/src/harpy/reports/impute.Rmd
@@ -29,6 +29,7 @@ using<-function(...) {
 using("flexdashboard","dplyr","tidyr","DT","highcharter","scales")
 
 modelparams <- paste(snakemake@params[[1]])
+splitparams <- unlist(strsplit(modelparams, "[_/]"))
 #modelparams <- "Filler Text"
 ```
 ```{r echo = FALSE, message = FALSE, warning = FALSE}
@@ -68,12 +69,40 @@ tryCatch(
 ### reportheader {.no-title}

`r comparefile`

-This report details the results of genotype imputation using STITCH with the model parameters:
-`r modelparams`.
-
 This page displays various information and general statistics and clicking the `Individual Statistics` tab in the navigation menu above will display imputation results per individual.
 
+## Param Information {data-height=100}
+### param_model
+```{r}
+valueBox(gsub("model", "", splitparams[1]), caption = "model", color = "#c0c0c0")
+```
+
+### param_usebx
+```{r}
+valueBox(gsub("usebx", "", splitparams[2]), caption = "usebX", color = "#c0c0c0")
+```
+### param_bxlimit
+```{r}
+valueBox(gsub("bxlimit", "", splitparams[6]), caption = "bxlimit", color = "#c0c0c0")
+```
+
+### param_k
+```{r}
+valueBox(gsub("k", "", splitparams[3]), caption = "k", color = "#c0c0c0")
+```
+
+### param_s
+```{r}
+valueBox(gsub("s", "", splitparams[4]), caption = "s", color = "#c0c0c0")
+```
+
+### param_ngen
+```{r}
+valueBox(gsub("ngen", "", splitparams[5]), caption = "ngen", color = "#c0c0c0")
+```
+
+
 ## General Information {data-height=100}
 ```{r}
 gcts_minmax <- filter(gcts_long, Conversion != "missing") %>% summarise(min = min(Count), max = max(Count))
diff --git a/src/harpy/snakefiles/impute.smk b/src/harpy/snakefiles/impute.smk
index c58afec6c..f1c242cfe 100644
--- a/src/harpy/snakefiles/impute.smk
+++ b/src/harpy/snakefiles/impute.smk
@@ -288,7 +288,7 @@ rule imputation_results_reports:
     conda:
         f"{envdir}/r.yaml"
     message:
-        "Generating imputation success report: {output}"
+        "Assessing imputation results: {output}"
     script:
         "report/impute.Rmd"
 
diff --git a/src/harpy/snakefiles/sv_leviathan_pop.smk b/src/harpy/snakefiles/sv_leviathan_pop.smk
index 76a048de7..bbb944fa7 100644
--- a/src/harpy/snakefiles/sv_leviathan_pop.smk
+++ b/src/harpy/snakefiles/sv_leviathan_pop.smk
@@ -96,16 +96,34 @@ rule merge_populations:
         bamlist = outdir + "/workflow/merge_samples/{population}.list",
         bamfiles = lambda wc: collect("{sample}", sample = popdict[wc.population])
     output:
-        bam = temp(outdir + "/workflow/input/{population}.bam"),
-        bai = temp(outdir + "/workflow/input/{population}.bam.bai")
+        bam = temp(outdir + "/workflow/input/{population}.unsort.bam"),
+        bai = temp(outdir + "/workflow/input/{population}.unsort.bam.bai")
     threads:
-        workflow.cores
+        1
     container:
         None
     message:
         "Merging alignments: {wildcards.population}"
     shell:
-        "samtools merge -o {output.bam}##idx##{output.bai} --threads {threads} --write-index -b {input.bamlist}"
+        "concatenate_bam.py -o {output.bam} -b {input.bamlist}"
+
+rule sort_alignments:
+    input:
+        bam = outdir + "/workflow/input/{population}.unsort.bam",
+        bai = outdir + "/workflow/input/{population}.unsort.bam.bai"
+    output:
+        bam = outdir + "/workflow/input/{population}.bam",
+        bai = outdir + "/workflow/input/{population}.bam.bai"
+    log:
+        outdir + "/logs/{population}.sort.log"
+    resources:
+        mem_mb = 2000
+    container:
+        None
+    message:
+        "Sorting alignments: {wildcards.population}"
+    shell:
+        "samtools sort -O bam -l 0 -m {resources.mem_mb}M --write-index -o {output.bam}##idx##{output.bai} {input.bam} 2> {log}"
 
 rule index_barcode:
     input:
diff --git a/src/harpy/snakefiles/sv_naibr_pop.smk b/src/harpy/snakefiles/sv_naibr_pop.smk
index a49a70e79..96fa6d2c2 100644
--- a/src/harpy/snakefiles/sv_naibr_pop.smk
+++ b/src/harpy/snakefiles/sv_naibr_pop.smk
@@ -112,16 +112,34 @@ rule merge_populations:
        bamlist = outdir + "/workflow/merge_samples/{population}.list",
        bamfiles = lambda wc: collect("{sample}", sample = popdict[wc.population])
     output:
-        bam = temp(outdir + "/workflow/input/{population}.bam"),
-        bai = temp(outdir + "/workflow/input/{population}.bam.bai")
+        bam = temp(outdir + "/workflow/input/{population}.unsort.bam"),
+        bai = temp(outdir + "/workflow/input/{population}.unsort.bam.bai")
     threads:
-        workflow.cores
+        1
     container:
         None
     message:
         "Merging alignments: {wildcards.population}"
     shell:
-        "samtools merge -o {output.bam}##idx##{output.bai} --threads {threads} --write-index -b {input.bamlist}"
+        "concatenate_bam.py -o {output.bam} -b {input.bamlist}"
+
+rule sort_alignments:
+    input:
+        bam = outdir + "/workflow/input/{population}.unsort.bam",
+        bai = outdir + "/workflow/input/{population}.unsort.bam.bai"
+    output:
+        bam = outdir + "/workflow/input/{population}.bam",
+        bai = outdir + "/workflow/input/{population}.bam.bai"
+    log:
+        outdir + "/logs/{population}.sort.log"
+    resources:
+        mem_mb = 2000
+    container:
+        None
+    message:
+        "Sorting alignments: {wildcards.population}"
+    shell:
+        "samtools sort -O bam -l 0 -m {resources.mem_mb}M --write-index -o {output.bam}##idx##{output.bai} {input.bam} 2> {log}"
 
 rule create_config:
     input:
diff --git a/src/harpy/snakefiles/sv_naibr_pop_phase.smk b/src/harpy/snakefiles/sv_naibr_pop_phase.smk
index 64d02f6b5..c13ee5632 100644
--- a/src/harpy/snakefiles/sv_naibr_pop_phase.smk
+++ b/src/harpy/snakefiles/sv_naibr_pop_phase.smk
@@ -235,16 +235,34 @@ rule merge_populations:
        bamlist = outdir + "/workflow/merge_samples/{population}.list",
        bamfiles = lambda wc: collect("{sample}", sample = popdict[wc.population])
     output:
-        bam = temp(outdir + "/workflow/input/{population}.bam"),
-        bai = temp(outdir + "/workflow/input/{population}.bam.bai")
+        bam = temp(outdir + "/workflow/input/{population}.unsort.bam"),
+        bai = temp(outdir + "/workflow/input/{population}.unsort.bam.bai")
     threads:
-        workflow.cores
+        1
     container:
         None
     message:
         "Merging alignments: {wildcards.population}"
     shell:
-        "samtools merge -o {output.bam}##idx##{output.bai} --threads {threads} --write-index -b {input.bamlist}"
+        "concatenate_bam.py -o {output.bam} -b {input.bamlist}"
+
+rule sort_alignments:
+    input:
+        bam = outdir + "/workflow/input/{population}.unsort.bam",
+        bai = outdir + "/workflow/input/{population}.unsort.bam.bai"
+    output:
+        bam = outdir + "/workflow/input/{population}.bam",
+        bai = outdir + "/workflow/input/{population}.bam.bai"
+    log:
+        outdir + "/logs/{population}.sort.log"
+    resources:
+        mem_mb = 2000
+    container:
+        None
+    message:
+        "Sorting alignments: {wildcards.population}"
+    shell:
+        "samtools sort -O bam -l 0 -m {resources.mem_mb}M --write-index -o {output.bam}##idx##{output.bai} {input.bam} 2> {log}"
 
 rule create_config:
     input: