Merge pull request #93 from pdimens/91-add-pysam-as-direct-dependency

move pysam into main env
pdimens · Jun 29, 2024 · 6153cca · 6153cca
2 parents 0cf2243 + 503825e
commit 6153cca
Show file tree

Hide file tree

Showing 19 changed files with 214 additions and 120 deletions.
diff --git a/.deprecated/align-minimap.smk b/.deprecated/align-minimap.smk
@@ -234,12 +234,12 @@ rule assign_molecules:
         bai = outdir + "/{sample}.bam.bai"
     params:
         molecule_distance
-    conda:
-        f"{envdir}/qc.yaml"
+    container:
+        None
     message:
         "Assigning barcodes to molecules: {wildcards.sample}"
-    script:
-        "scripts/assignMI.py"
+    shell:
+        "assignMI.py -o {output.bam} -c {params} {input.bam}"
 
 rule alignment_bxstats:
     input:
@@ -249,12 +249,12 @@ rule alignment_bxstats:
         outdir + "/reports/data/bxstats/{sample}.bxstats.gz"
     params:
         sample = lambda wc: d[wc.sample]
-    conda:
-        f"{envdir}/qc.yaml"
+    container:
+        None
     message:
         "Calculating barcode alignment statistics: {wildcards.sample}"
-    script:
-        "scripts/bxStats.py"
+    shell:
+        "bxStats.py -o {output} {input.bam}"
 
 rule alignment_coverage:
     input: 

diff --git a/.github/filters.yml b/.github/filters.yml
@@ -18,8 +18,8 @@ preflight: &preflight
   - 'src/harpy/rules/preflight-bam.smk'
   - 'test/fastq/**'
   - 'test/bam/**'
-  - 'src/harpy/scripts/checkBAM.py'
-  - 'src/harpy/scripts/checkFASTQ.py'
+  - 'src/harpy/bin/checkBAM.py'
+  - 'src/harpy/bin/checkFASTQ.py'
   - 'src/harpy/reports/PreflightFastq.Rmd'
   - 'src/harpy/reports/PreflightBam.Rmd'
 demux: &demux
@@ -43,8 +43,9 @@ bwa: &bwa
   - 'src/harpy/rules/align-bwa.smk'
   - 'src/harpy/reports/AlignStats.Rmd'
   - 'src/harpy/reports/BxCount.Rmd'
-  - 'src/harpy/scripts/bxStats.py'
-  - 'src/harpy/scripts/countBX.py'
+  - 'src/harpy/bin/bxStats.py'
+  - 'src/harpy/bin/countBX.py'
+  - 'src/harpy/bin/assignMI.py'
   - 'src/harpy/bin/makeWindows.py'
   - 'test/fastq/**'
 ema: &ema
@@ -53,10 +54,9 @@ ema: &ema
   - 'src/harpy/align.py'
   - 'src/harpy/rules/align-ema.smk'
   - 'src/harpy/reports/AlignStats.Rmd'
-  - 'src/harpy/reports/EmaCount.Rmd'
   - 'src/harpy/reports/BxCount.Rmd'
-  - 'src/harpy/scripts/bxStats.py'
-  - 'src/harpy/scripts/countBX.py'
+  - 'src/harpy/bin/bxStats.py'
+  - 'src/harpy/bin/countBX.py'
   - 'src/harpy/bin/makewindows.py'
   - 'test/fastq/**'
 strobealign: &strobealign
@@ -66,8 +66,9 @@ strobealign: &strobealign
   - 'src/harpy/rules/align-strobealign.smk'
   - 'src/harpy/reports/AlignStats.Rmd'
   - 'src/harpy/reports/BxCount.Rmd'
-  - 'src/harpy/scripts/bxStats.py'
-  - 'src/harpy/scripts/countBX.py'
+  - 'src/harpy/bin/assignMI.py'
+  - 'src/harpy/bin/bxStats.py'
+  - 'src/harpy/bin/countBX.py'
   - 'src/harpy/bin/makewindows.py'
   - 'test/fastq/**'
 mpileup: &mpileup

diff --git a/resources/harpy.yaml b/resources/harpy.yaml
@@ -7,6 +7,7 @@ dependencies:
   - bcftools =1.20
   - mamba
   - pandas
+  - pysam
   - python
   - rich-click
   - snakemake-minimal >7

diff --git a/resources/meta.yaml b/resources/meta.yaml
@@ -34,6 +34,7 @@ requirements:
     - apptainer
     - bcftools =1.20
     - pandas
+    - pysam
     - python >3.10
     - rich-click
     - snakemake-minimal >7

diff --git a/src/harpy/align.py b/src/harpy/align.py
@@ -116,8 +116,6 @@ def bwa(inputs, output_dir, genome, depth_window, threads, extra_params, quality
     fqlist, sample_count = parse_fastq_inputs(inputs)
     validate_input_by_ext(genome, "--genome", [".fasta", ".fa", ".fasta.gz", ".fa.gz"])
     fetch_rule(workflowdir, "align-bwa.smk")
-    fetch_script(workflowdir, "assignMI.py")
-    fetch_script(workflowdir, "bxStats.py")
     fetch_report(workflowdir, "AlignStats.Rmd")
     fetch_report(workflowdir, "AlignBxStats.Rmd")
 
@@ -208,7 +206,6 @@ def ema(inputs, output_dir, platform, whitelist, genome, depth_window, threads,
     fqlist, sample_count = parse_fastq_inputs(inputs)
     validate_input_by_ext(genome, "--genome", [".fasta", ".fa", ".fasta.gz", ".fa.gz"])
     fetch_rule(workflowdir, "align-ema.smk")
-    fetch_script(workflowdir, "bxStats.py")
     fetch_report(workflowdir, "AlignStats.Rmd")
     fetch_report(workflowdir, "AlignBxStats.Rmd")
 
@@ -289,8 +286,6 @@ def strobe(inputs, output_dir, genome, read_length, depth_window, threads, extra
     fqlist, sample_count = parse_fastq_inputs(inputs)
     validate_input_by_ext(genome, "--genome", [".fasta", ".fa", ".fasta.gz", ".fa.gz"])
     fetch_rule(workflowdir, "align-strobealign.smk")
-    fetch_script(workflowdir, "assignMI.py")
-    fetch_script(workflowdir, "bxStats.py")
     fetch_report(workflowdir, "AlignStats.Rmd")
     fetch_report(workflowdir, "AlignBxStats.Rmd")
 

diff --git a/src/harpy/scripts/assignMI.py → src/harpy/bin/assignMI.py b/src/harpy/scripts/assignMI.py → src/harpy/bin/assignMI.py
@@ -1,29 +1,34 @@
+#! /usr/bin/env python
+
 import re
 import os
 import sys
+import argparse
 import pysam
-#import argparse
-
-#parser = argparse.ArgumentParser(
-#    prog = 'assignMI.py',
-#    description = 
-#    """
-#    Assign an MI:i: (Molecular Identifier) tag to each barcoded 
-#    record based on a molecular distance cutoff. Unmapped records
-#    are discarded in the output. Records without a BX:Z: tag or
-#    with an invalid barcode (00 as one of its segments) are presevered
-#    but are not assigned an MI:i tag. Input file MUST BE COORDINATE SORTED.
-#    """,
-#    usage = "assignMI.py -c cutoff -i input.bam -o output.bam",
-#    exit_on_error = False
-#    )
-#parser.add_argument('-c','--cutoff', type=int, default = 100000, help = "Distance in base pairs at which alignments with the same barcode should be considered different molecules. (default: 100000)")
-#parser.add_argument('-i', '--input', help = "Input coordinate-sorted bam/sam file. If bam, a matching index file should be in the same directory.")
-#parser.add_argument('-o', '--output', help = "Output bam file. Will also create an index file.")
-#
-#if len(sys.argv) == 1:
-#    parser.print_help(sys.stderr)
-#    sys.exit(1)
+
+parser = argparse.ArgumentParser(
+    prog = 'assignMI.py',
+    description =
+    """
+    Assign an MI:i: (Molecular Identifier) tag to each barcoded
+    record based on a molecular distance cutoff. Unmapped records
+    are discarded in the output. Records without a BX:Z: tag or
+    with an invalid barcode (00 as one of its segments) are presevered
+    but are not assigned an MI:i tag. Input file MUST BE COORDINATE SORTED.
+    """,
+    usage = "assignMI.py -c cutoff -o output.bam input.bam",
+    exit_on_error = False
+    )
+
+parser.add_argument('-c','--cutoff', type=int, default = 100000, help = "Distance in base pairs at which alignments with the same barcode should be considered different molecules. (default: 100000)")
+parser.add_argument('-o', '--output', help = "Output bam file. Will also create an index file.")
+parser.add_argument('input', help = "Input coordinate-sorted bam/sam file. If bam, a matching index file should be in the same directory.")
+
+if len(sys.argv) == 1:
+    parser.print_help(sys.stderr)
+    sys.exit(1)
+
+args = parser.parse_args()
 
 def write_validbx(bam, alnrecord, molID):
     '''
@@ -97,7 +102,7 @@ def write_missingbx(bam, alnrecord):
     bam.write(alnrecord)
 
 #args = parser.parse_args()
-bam_input = snakemake.input[0]
+bam_input = args.input
 # initialize the dict
 d = dict()
 # chromlast keeps track of the last chromosome so we can
@@ -117,17 +122,15 @@ def write_missingbx(bam, alnrecord):
 alnfile = pysam.AlignmentFile(bam_input)
 
 # iniitalize output file
-#alnfile = pysam.AlignmentFile("/home/pdimens/Documents/harpy/test/bam/sample1.bam")
-outfile = pysam.AlignmentFile(snakemake.output[0], "wb", template = alnfile)
-#outfile = pysam.AlignmentFile("/home/pdimens/Documents/harpy/test/bam/test.bam", "w", template = alnfile)
+outfile = pysam.AlignmentFile(args.output, "wb", template = alnfile)
 
 for record in alnfile.fetch():
     chrm = record.reference_name
     bp   = record.query_alignment_length
     # check if the current chromosome is different from the previous one
     # if so, empty the dict (a consideration for RAM usage)
-    if chromlast != False and chrm != chromlast:
-        d = dict()
+    if chromlast is not False and chrm != chromlast:
+        d = {}
     if record.is_unmapped:
         # skip, don't output
         chromlast = chrm
@@ -146,7 +149,7 @@ def write_missingbx(bam, alnrecord):
         write_missingbx(outfile, record)
         chromlast = chrm
         continue
-    
+
     aln = record.get_blocks()
     if not aln:
         # unaligned, skip and don't output
@@ -160,9 +163,9 @@ def write_missingbx(bam, alnrecord):
     pos_end   = aln[-1][1]
 
     # create bx entry if it's not present
-    if bx not in d.keys():
+    if bx not in d:
         # increment MI b/c it's a new molecule
-        MI += 1 
+        MI += 1
         d[bx] = {
             "lastpos" : pos_end,
             "current_suffix": 0,
@@ -184,9 +187,9 @@ def write_missingbx(bam, alnrecord):
     # if the distance between alignments is > cutoff, it's a different molecule
     # so we'll +1 the suffix of the original barcode and relabel this one as 
     # BX + suffix. Since it's a new entry, we initialize it and move on
-    if dist > snakemake.params[0]:
+    if dist > args.cutoff:
         # increment MI b/c it's a new molecule
-        MI += 1 
+        MI += 1
         # increment original barcode's suffix
         d[orig]["current_suffix"] += 1
         bx = orig + "." + str(d[orig]["current_suffix"])
@@ -216,4 +219,4 @@ def write_missingbx(bam, alnrecord):
 outfile.close()
 
 # index the output file
-pysam.index(snakemake.output[0])
+pysam.index(args.output)
diff --git a/src/harpy/scripts/bxStats.py → src/harpy/bin/bxStats.py b/src/harpy/scripts/bxStats.py → src/harpy/bin/bxStats.py
@@ -1,9 +1,38 @@
+#! /usr/bin/env python
+
 import re
+import sys
 import gzip
+import argparse
 import pysam
 
-alnfile = pysam.AlignmentFile(snakemake.input[0])
-outfile = gzip.open(snakemake.output[0], "wb", 6)
+parser = argparse.ArgumentParser(
+    prog = 'bxStats.py',
+    description =
+    """
+    Calculates various linked-read molecule metrics from the input alignment file.
+    Metrics include (per molecule): number of reads, position start, position end,
+    length of molecule inferred from alignments, total aligned basepairs, total,
+    length of inferred inserts, molecule coverage (%) based on aligned bases, molecule
+    coverage (%) based on total inferred insert length.
+    Input file MUST BE COORDINATE SORTED.
+    """,
+    usage = "bxStats.py -o output.gz input.bam",
+    exit_on_error = False
+    )
+
+parser.add_argument('-o', '--output', help = "Gzipped tab-delimited file of metrics.")
+parser.add_argument('input', help = "Input coordinate-sorted bam/sam file. If bam, a matching index file should be in the same directory.")
+
+if len(sys.argv) == 1:
+    parser.print_help(sys.stderr)
+    sys.exit(1)
+
+args = parser.parse_args()
+
+
+alnfile = pysam.AlignmentFile(args.input)
+outfile = gzip.open(args.output, "wb", 6)
 outfile.write(b"contig\tmolecule\treads\tstart\tend\tlength_inferred\taligned_bp\tinsert_len\tcoverage_bp\tcoverage_inserts\n")
 
 d = {}

diff --git a/src/harpy/scripts/checkBAM.py → src/harpy/bin/checkBAM.py b/src/harpy/scripts/checkBAM.py → src/harpy/bin/checkBAM.py
@@ -1,9 +1,33 @@
-import pysam
-import sys
+#! /usr/bin/env python
+
 import re
-import os.path
+import sys
+import os
+import argparse
+import pysam
+
+parser = argparse.ArgumentParser(
+    prog = 'checkBAM.py',
+    description =
+    """
+    Parses an aligment (sam/bam) file to check if the sample name
+    matched the RG tag, whether BX:Z: is the last tag in the record,
+    and the counts of: total alignments, alignments with an MI:i: tag,
+    alignments without BX:Z: tag, incorrect BX:Z: tag.
+    """,
+    usage = "checkBAM.py input.bam > output.txt",
+    exit_on_error = False
+    )
+
+parser.add_argument('input', help = "Input bam/sam file. If bam, a matching index file should be in the same directory.")
+
+if len(sys.argv) == 1:
+    parser.print_help(sys.stderr)
+    sys.exit(1)
+
+args = parser.parse_args()
 
-bam_in = snakemake.input[0]
+bam_in = args.input
 
 # regex for EXACTLY AXXCXXBXXDXX
 haplotag = re.compile('^A[0-9][0-9]C[0-9][0-9]B[0-9][0-9]D[0-9][0-9]$')
@@ -46,5 +70,4 @@
 
 
 values = [str(i) for i in [os.path.basename(bam_in), nameMismatch, n_reads, noMI, noBX, badBX, bxNotLast]]
-with open(snakemake.output[0], "w") as fout:
-    print("\t".join(values), file = fout) 
+print("\t".join(values), file = sys.stdout)
diff --git a/src/harpy/scripts/checkFASTQ.py → src/harpy/bin/checkFASTQ.py b/src/harpy/scripts/checkFASTQ.py → src/harpy/bin/checkFASTQ.py
@@ -1,9 +1,32 @@
-import pysam
-import sys
+#! /usr/bin/env python
+
 import re
-import os.path
+import os
+import sys
+import argparse
+import pysam
+
+parser = argparse.ArgumentParser(
+    prog = 'checkFASTQ.py',
+    description =
+    """
+    Parses a FASTQ file to check if any sequences don't conform to the SAM spec,
+    whether BX:Z: is the last tag in the record, and the counts of: total reads,
+    reads without BX:Z: tag, reads with incorrect BX:Z: tag.
+    """,
+    usage = "checkBAM.py input.bam > output.txt",
+    exit_on_error = False
+    )
+
+parser.add_argument('input', help = "Input fastq file. Can be gzipped.")
+
+if len(sys.argv) == 1:
+    parser.print_help(sys.stderr)
+    sys.exit(1)
+
+args = parser.parse_args()
 
-fq_in = snakemake.input[0]
+fq_in = args.input
 
 #bxz = re.compile('BX:Z:')
 samspec = re.compile('[A-Z][A-Z]:[AifZHB]:')
@@ -38,5 +61,4 @@
             noBX += 1
 
 values = [str(i) for i in [os.path.basename(fq_in), n_reads, noBX, badBX, badSamSpec, bxNotLast]]
-with open(snakemake.output[0], "w") as fout:
-    print("\t".join(values), file = fout) 
+print("\t".join(values), file = sys.stdout)
-Original file line number
+Diff line change
@@ Expand Up / @@ -7,6 +7,7 @@ dependencies: @@
       - bcftools =1.20
       - mamba
       - pandas
+      - pysam
       - python
       - rich-click
       - snakemake-minimal >7
@@ Expand Down @@