Skip to content

Commit

Permalink
Merge pull request #93 from pdimens/91-add-pysam-as-direct-dependency
Browse files Browse the repository at this point in the history
move pysam into main env
  • Loading branch information
pdimens authored Jun 29, 2024
2 parents 0cf2243 + 503825e commit 6153cca
Show file tree
Hide file tree
Showing 19 changed files with 214 additions and 120 deletions.
16 changes: 8 additions & 8 deletions .deprecated/align-minimap.smk
Original file line number Diff line number Diff line change
Expand Up @@ -234,12 +234,12 @@ rule assign_molecules:
bai = outdir + "/{sample}.bam.bai"
params:
molecule_distance
conda:
f"{envdir}/qc.yaml"
container:
None
message:
"Assigning barcodes to molecules: {wildcards.sample}"
script:
"scripts/assignMI.py"
shell:
"assignMI.py -o {output.bam} -c {params} {input.bam}"

rule alignment_bxstats:
input:
Expand All @@ -249,12 +249,12 @@ rule alignment_bxstats:
outdir + "/reports/data/bxstats/{sample}.bxstats.gz"
params:
sample = lambda wc: d[wc.sample]
conda:
f"{envdir}/qc.yaml"
container:
None
message:
"Calculating barcode alignment statistics: {wildcards.sample}"
script:
"scripts/bxStats.py"
shell:
"bxStats.py -o {output} {input.bam}"

rule alignment_coverage:
input:
Expand Down
19 changes: 10 additions & 9 deletions .github/filters.yml
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,8 @@ preflight: &preflight
- 'src/harpy/rules/preflight-bam.smk'
- 'test/fastq/**'
- 'test/bam/**'
- 'src/harpy/scripts/checkBAM.py'
- 'src/harpy/scripts/checkFASTQ.py'
- 'src/harpy/bin/checkBAM.py'
- 'src/harpy/bin/checkFASTQ.py'
- 'src/harpy/reports/PreflightFastq.Rmd'
- 'src/harpy/reports/PreflightBam.Rmd'
demux: &demux
Expand All @@ -43,8 +43,9 @@ bwa: &bwa
- 'src/harpy/rules/align-bwa.smk'
- 'src/harpy/reports/AlignStats.Rmd'
- 'src/harpy/reports/BxCount.Rmd'
- 'src/harpy/scripts/bxStats.py'
- 'src/harpy/scripts/countBX.py'
- 'src/harpy/bin/bxStats.py'
- 'src/harpy/bin/countBX.py'
- 'src/harpy/bin/assignMI.py'
- 'src/harpy/bin/makeWindows.py'
- 'test/fastq/**'
ema: &ema
Expand All @@ -53,10 +54,9 @@ ema: &ema
- 'src/harpy/align.py'
- 'src/harpy/rules/align-ema.smk'
- 'src/harpy/reports/AlignStats.Rmd'
- 'src/harpy/reports/EmaCount.Rmd'
- 'src/harpy/reports/BxCount.Rmd'
- 'src/harpy/scripts/bxStats.py'
- 'src/harpy/scripts/countBX.py'
- 'src/harpy/bin/bxStats.py'
- 'src/harpy/bin/countBX.py'
- 'src/harpy/bin/makewindows.py'
- 'test/fastq/**'
strobealign: &strobealign
Expand All @@ -66,8 +66,9 @@ strobealign: &strobealign
- 'src/harpy/rules/align-strobealign.smk'
- 'src/harpy/reports/AlignStats.Rmd'
- 'src/harpy/reports/BxCount.Rmd'
- 'src/harpy/scripts/bxStats.py'
- 'src/harpy/scripts/countBX.py'
- 'src/harpy/bin/assignMI.py'
- 'src/harpy/bin/bxStats.py'
- 'src/harpy/bin/countBX.py'
- 'src/harpy/bin/makewindows.py'
- 'test/fastq/**'
mpileup: &mpileup
Expand Down
1 change: 1 addition & 0 deletions resources/harpy.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ dependencies:
- bcftools =1.20
- mamba
- pandas
- pysam
- python
- rich-click
- snakemake-minimal >7
Expand Down
1 change: 1 addition & 0 deletions resources/meta.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ requirements:
- apptainer
- bcftools =1.20
- pandas
- pysam
- python >3.10
- rich-click
- snakemake-minimal >7
Expand Down
5 changes: 0 additions & 5 deletions src/harpy/align.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,8 +116,6 @@ def bwa(inputs, output_dir, genome, depth_window, threads, extra_params, quality
fqlist, sample_count = parse_fastq_inputs(inputs)
validate_input_by_ext(genome, "--genome", [".fasta", ".fa", ".fasta.gz", ".fa.gz"])
fetch_rule(workflowdir, "align-bwa.smk")
fetch_script(workflowdir, "assignMI.py")
fetch_script(workflowdir, "bxStats.py")
fetch_report(workflowdir, "AlignStats.Rmd")
fetch_report(workflowdir, "AlignBxStats.Rmd")

Expand Down Expand Up @@ -208,7 +206,6 @@ def ema(inputs, output_dir, platform, whitelist, genome, depth_window, threads,
fqlist, sample_count = parse_fastq_inputs(inputs)
validate_input_by_ext(genome, "--genome", [".fasta", ".fa", ".fasta.gz", ".fa.gz"])
fetch_rule(workflowdir, "align-ema.smk")
fetch_script(workflowdir, "bxStats.py")
fetch_report(workflowdir, "AlignStats.Rmd")
fetch_report(workflowdir, "AlignBxStats.Rmd")

Expand Down Expand Up @@ -289,8 +286,6 @@ def strobe(inputs, output_dir, genome, read_length, depth_window, threads, extra
fqlist, sample_count = parse_fastq_inputs(inputs)
validate_input_by_ext(genome, "--genome", [".fasta", ".fa", ".fasta.gz", ".fa.gz"])
fetch_rule(workflowdir, "align-strobealign.smk")
fetch_script(workflowdir, "assignMI.py")
fetch_script(workflowdir, "bxStats.py")
fetch_report(workflowdir, "AlignStats.Rmd")
fetch_report(workflowdir, "AlignBxStats.Rmd")

Expand Down
71 changes: 37 additions & 34 deletions src/harpy/scripts/assignMI.py → src/harpy/bin/assignMI.py
Original file line number Diff line number Diff line change
@@ -1,29 +1,34 @@
#! /usr/bin/env python

import re
import os
import sys
import argparse
import pysam
#import argparse

#parser = argparse.ArgumentParser(
# prog = 'assignMI.py',
# description =
# """
# Assign an MI:i: (Molecular Identifier) tag to each barcoded
# record based on a molecular distance cutoff. Unmapped records
# are discarded in the output. Records without a BX:Z: tag or
# with an invalid barcode (00 as one of its segments) are presevered
# but are not assigned an MI:i tag. Input file MUST BE COORDINATE SORTED.
# """,
# usage = "assignMI.py -c cutoff -i input.bam -o output.bam",
# exit_on_error = False
# )
#parser.add_argument('-c','--cutoff', type=int, default = 100000, help = "Distance in base pairs at which alignments with the same barcode should be considered different molecules. (default: 100000)")
#parser.add_argument('-i', '--input', help = "Input coordinate-sorted bam/sam file. If bam, a matching index file should be in the same directory.")
#parser.add_argument('-o', '--output', help = "Output bam file. Will also create an index file.")
#
#if len(sys.argv) == 1:
# parser.print_help(sys.stderr)
# sys.exit(1)

parser = argparse.ArgumentParser(
prog = 'assignMI.py',
description =
"""
Assign an MI:i: (Molecular Identifier) tag to each barcoded
record based on a molecular distance cutoff. Unmapped records
are discarded in the output. Records without a BX:Z: tag or
with an invalid barcode (00 as one of its segments) are presevered
but are not assigned an MI:i tag. Input file MUST BE COORDINATE SORTED.
""",
usage = "assignMI.py -c cutoff -o output.bam input.bam",
exit_on_error = False
)

parser.add_argument('-c','--cutoff', type=int, default = 100000, help = "Distance in base pairs at which alignments with the same barcode should be considered different molecules. (default: 100000)")
parser.add_argument('-o', '--output', help = "Output bam file. Will also create an index file.")
parser.add_argument('input', help = "Input coordinate-sorted bam/sam file. If bam, a matching index file should be in the same directory.")

if len(sys.argv) == 1:
parser.print_help(sys.stderr)
sys.exit(1)

args = parser.parse_args()

def write_validbx(bam, alnrecord, molID):
'''
Expand Down Expand Up @@ -97,7 +102,7 @@ def write_missingbx(bam, alnrecord):
bam.write(alnrecord)

#args = parser.parse_args()
bam_input = snakemake.input[0]
bam_input = args.input
# initialize the dict
d = dict()
# chromlast keeps track of the last chromosome so we can
Expand All @@ -117,17 +122,15 @@ def write_missingbx(bam, alnrecord):
alnfile = pysam.AlignmentFile(bam_input)

# iniitalize output file
#alnfile = pysam.AlignmentFile("/home/pdimens/Documents/harpy/test/bam/sample1.bam")
outfile = pysam.AlignmentFile(snakemake.output[0], "wb", template = alnfile)
#outfile = pysam.AlignmentFile("/home/pdimens/Documents/harpy/test/bam/test.bam", "w", template = alnfile)
outfile = pysam.AlignmentFile(args.output, "wb", template = alnfile)

for record in alnfile.fetch():
chrm = record.reference_name
bp = record.query_alignment_length
# check if the current chromosome is different from the previous one
# if so, empty the dict (a consideration for RAM usage)
if chromlast != False and chrm != chromlast:
d = dict()
if chromlast is not False and chrm != chromlast:
d = {}
if record.is_unmapped:
# skip, don't output
chromlast = chrm
Expand All @@ -146,7 +149,7 @@ def write_missingbx(bam, alnrecord):
write_missingbx(outfile, record)
chromlast = chrm
continue

aln = record.get_blocks()
if not aln:
# unaligned, skip and don't output
Expand All @@ -160,9 +163,9 @@ def write_missingbx(bam, alnrecord):
pos_end = aln[-1][1]

# create bx entry if it's not present
if bx not in d.keys():
if bx not in d:
# increment MI b/c it's a new molecule
MI += 1
MI += 1
d[bx] = {
"lastpos" : pos_end,
"current_suffix": 0,
Expand All @@ -184,9 +187,9 @@ def write_missingbx(bam, alnrecord):
# if the distance between alignments is > cutoff, it's a different molecule
# so we'll +1 the suffix of the original barcode and relabel this one as
# BX + suffix. Since it's a new entry, we initialize it and move on
if dist > snakemake.params[0]:
if dist > args.cutoff:
# increment MI b/c it's a new molecule
MI += 1
MI += 1
# increment original barcode's suffix
d[orig]["current_suffix"] += 1
bx = orig + "." + str(d[orig]["current_suffix"])
Expand Down Expand Up @@ -216,4 +219,4 @@ def write_missingbx(bam, alnrecord):
outfile.close()

# index the output file
pysam.index(snakemake.output[0])
pysam.index(args.output)
33 changes: 31 additions & 2 deletions src/harpy/scripts/bxStats.py → src/harpy/bin/bxStats.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,38 @@
#! /usr/bin/env python

import re
import sys
import gzip
import argparse
import pysam

alnfile = pysam.AlignmentFile(snakemake.input[0])
outfile = gzip.open(snakemake.output[0], "wb", 6)
parser = argparse.ArgumentParser(
prog = 'bxStats.py',
description =
"""
Calculates various linked-read molecule metrics from the input alignment file.
Metrics include (per molecule): number of reads, position start, position end,
length of molecule inferred from alignments, total aligned basepairs, total,
length of inferred inserts, molecule coverage (%) based on aligned bases, molecule
coverage (%) based on total inferred insert length.
Input file MUST BE COORDINATE SORTED.
""",
usage = "bxStats.py -o output.gz input.bam",
exit_on_error = False
)

parser.add_argument('-o', '--output', help = "Gzipped tab-delimited file of metrics.")
parser.add_argument('input', help = "Input coordinate-sorted bam/sam file. If bam, a matching index file should be in the same directory.")

if len(sys.argv) == 1:
parser.print_help(sys.stderr)
sys.exit(1)

args = parser.parse_args()


alnfile = pysam.AlignmentFile(args.input)
outfile = gzip.open(args.output, "wb", 6)
outfile.write(b"contig\tmolecule\treads\tstart\tend\tlength_inferred\taligned_bp\tinsert_len\tcoverage_bp\tcoverage_inserts\n")

d = {}
Expand Down
35 changes: 29 additions & 6 deletions src/harpy/scripts/checkBAM.py → src/harpy/bin/checkBAM.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,33 @@
import pysam
import sys
#! /usr/bin/env python

import re
import os.path
import sys
import os
import argparse
import pysam

parser = argparse.ArgumentParser(
prog = 'checkBAM.py',
description =
"""
Parses an aligment (sam/bam) file to check if the sample name
matched the RG tag, whether BX:Z: is the last tag in the record,
and the counts of: total alignments, alignments with an MI:i: tag,
alignments without BX:Z: tag, incorrect BX:Z: tag.
""",
usage = "checkBAM.py input.bam > output.txt",
exit_on_error = False
)

parser.add_argument('input', help = "Input bam/sam file. If bam, a matching index file should be in the same directory.")

if len(sys.argv) == 1:
parser.print_help(sys.stderr)
sys.exit(1)

args = parser.parse_args()

bam_in = snakemake.input[0]
bam_in = args.input

# regex for EXACTLY AXXCXXBXXDXX
haplotag = re.compile('^A[0-9][0-9]C[0-9][0-9]B[0-9][0-9]D[0-9][0-9]$')
Expand Down Expand Up @@ -46,5 +70,4 @@


values = [str(i) for i in [os.path.basename(bam_in), nameMismatch, n_reads, noMI, noBX, badBX, bxNotLast]]
with open(snakemake.output[0], "w") as fout:
print("\t".join(values), file = fout)
print("\t".join(values), file = sys.stdout)
34 changes: 28 additions & 6 deletions src/harpy/scripts/checkFASTQ.py → src/harpy/bin/checkFASTQ.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,32 @@
import pysam
import sys
#! /usr/bin/env python

import re
import os.path
import os
import sys
import argparse
import pysam

parser = argparse.ArgumentParser(
prog = 'checkFASTQ.py',
description =
"""
Parses a FASTQ file to check if any sequences don't conform to the SAM spec,
whether BX:Z: is the last tag in the record, and the counts of: total reads,
reads without BX:Z: tag, reads with incorrect BX:Z: tag.
""",
usage = "checkBAM.py input.bam > output.txt",
exit_on_error = False
)

parser.add_argument('input', help = "Input fastq file. Can be gzipped.")

if len(sys.argv) == 1:
parser.print_help(sys.stderr)
sys.exit(1)

args = parser.parse_args()

fq_in = snakemake.input[0]
fq_in = args.input

#bxz = re.compile('BX:Z:')
samspec = re.compile('[A-Z][A-Z]:[AifZHB]:')
Expand Down Expand Up @@ -38,5 +61,4 @@
noBX += 1

values = [str(i) for i in [os.path.basename(fq_in), n_reads, noBX, badBX, badSamSpec, bxNotLast]]
with open(snakemake.output[0], "w") as fout:
print("\t".join(values), file = fout)
print("\t".join(values), file = sys.stdout)
Loading

0 comments on commit 6153cca

Please sign in to comment.