Merge pull request #5 from BioWilko/main

Refactor to run with k8s executor and add paired read merging
artic-network · Jun 21, 2023 · 585b5f8 · 585b5f8
2 parents 44ffc6a + 0503fd0
commit 585b5f8
Show file tree

Hide file tree

Showing 23 changed files with 1,447 additions and 383 deletions.
diff --git a/bin/combine_kreports.py b/bin/combine_kreports.py
@@ -1,28 +1,36 @@
 #!/usr/bin/env python
 
 import sys
-import gzip
 from Bio import SeqIO
 import argparse
 from datetime import datetime
 
+
 def parse_depth(name):
     parse_name = name.split(" ")
     depth = 0
     for i in parse_name:
-        if i!="":
+        if i != "":
             break
         depth += 1
-    depth = int(depth/2)
+    depth = int(depth / 2)
     return depth
 
+
 def get_kraken_hierarchy(kraken_file, entries={}, total=0):
     with open(kraken_file, "r") as f:
         hierarchy = []
         for line in f:
             if line.startswith("% of Seqs"):
                 continue
-            percentage, num_clade_root, num_direct, raw_rank, ncbi, name = line.strip().split("\t")
+            (
+                percentage,
+                num_clade_root,
+                num_direct,
+                raw_rank,
+                ncbi,
+                name,
+            ) = line.strip().split("\t")
             percentage = float(percentage)
             num_clade_root = int(num_clade_root)
             num_direct = int(num_direct)
@@ -37,16 +45,27 @@ def get_kraken_hierarchy(kraken_file, entries={}, total=0):
                 entries[ncbi]["count"] += num_direct
                 entries[ncbi]["count_descendants"] += num_clade_root
             else:
-                entries[ncbi] = {"percentage": percentage, "count": num_direct, "count_descendants": num_clade_root,
-            "raw_rank": raw_rank, "rank": rank, "depth": depth, "ncbi": ncbi, "name": name, "parents":[], "children":set()}
+                entries[ncbi] = {
+                    "percentage": percentage,
+                    "count": num_direct,
+                    "count_descendants": num_clade_root,
+                    "raw_rank": raw_rank,
+                    "rank": rank,
+                    "depth": depth,
+                    "ncbi": ncbi,
+                    "name": name,
+                    "parents": [],
+                    "children": set(),
+                }
 
             if len(hierarchy) > 1:
                 parent = hierarchy[-2]
                 assert entries[parent]["depth"] < entries[ncbi]["depth"]
                 entries[ncbi]["parents"] = hierarchy[:-1]
                 entries[parent]["children"].add(ncbi)
 
-    return entries,total
+    return entries, total
+
 
 def combine_kraken_reports(kreports):
     entries = {}
@@ -55,12 +74,27 @@ def combine_kraken_reports(kreports):
         entries, total = get_kraken_hierarchy(kreport, entries, total)
         print(kreport, len(entries), total)
     for ncbi in entries:
-        entries[ncbi]["percentage"] = entries[ncbi]["count_descendants"] / float(total) *100
+        entries[ncbi]["percentage"] = (
+            entries[ncbi]["count_descendants"] / float(total) * 100
+        )
     return entries
 
+
 def write_entry(out_handle, entry):
     offset = entry["depth"]
-    out_handle.write("%f\t%i\t%i\t%s\t%s\t%s%s\n" %(entry["percentage"], entry["count"], entry["count_descendants"], entry["raw_rank"], entry["ncbi"], 2*offset*" ", entry["name"] ))
+    out_handle.write(
+        "%f\t%i\t%i\t%s\t%s\t%s%s\n"
+        % (
+            entry["percentage"],
+            entry["count"],
+            entry["count_descendants"],
+            entry["raw_rank"],
+            entry["ncbi"],
+            2 * offset * " ",
+            entry["name"],
+        )
+    )
+
 
 def choose_next(entries, taxa, ncbi="0"):
     if len(taxa) == 0:
@@ -86,38 +120,49 @@ def choose_next(entries, taxa, ncbi="0"):
 
     return next, taxa
 
+
 def write_kraken_report(outfile, entries):
     taxa = list(entries.keys())
     with open(outfile, "w") as out_report:
-        out_report.write("% of Seqs\tClades\tTaxonomies\tRank\tTaxonomy ID\tScientific Name\n")
+        out_report.write(
+            "% of Seqs\tClades\tTaxonomies\tRank\tTaxonomy ID\tScientific Name\n"
+        )
         current = "0"
         while len(taxa) > 0:
             write_entry(out_report, entries[current])
             current, taxa = choose_next(entries, taxa, ncbi=current)
 
+
 def main():
-    #Parse arguments
+    # Parse arguments
     parser = argparse.ArgumentParser()
-    parser.add_argument('-r', dest='kraken_report_files', required=True, nargs='+',
-        help='Kraken report files to parse')
-    parser.add_argument('-o', "--outfile",dest='outfile', required=True,
-        help='Output name')
-
-    args=parser.parse_args()
-
-    #Start Program
+    parser.add_argument(
+        "-r",
+        dest="kraken_report_files",
+        required=True,
+        nargs="+",
+        help="Kraken report files to parse",
+    )
+    parser.add_argument(
+        "-o", "--outfile", dest="outfile", required=True, help="Output name"
+    )
+
+    args = parser.parse_args()
+
+    # Start Program
     now = datetime.now()
     time = now.strftime("%m/%d/%Y, %H:%M:%S")
-    sys.stdout.write("PROGRAM START TIME: " + time + '\n')
+    sys.stdout.write("PROGRAM START TIME: " + time + "\n")
 
     entries = combine_kraken_reports(args.kraken_report_files)
     write_kraken_report(args.outfile, entries)
 
     now = datetime.now()
     time = now.strftime("%m/%d/%Y, %H:%M:%S")
-    sys.stdout.write("PROGRAM END TIME: " + time + '\n')
+    sys.stdout.write("PROGRAM END TIME: " + time + "\n")
 
     sys.exit(0)
 
+
 if __name__ == "__main__":
     main()
diff --git a/bin/concatenate_reads.py b/bin/concatenate_reads.py
@@ -0,0 +1,167 @@
+#!/usr/bin/env python
+"""
+No dependency Python script for joining two paired end FASTQ files.
+Supports concatenating reads with a separator ("NNNNN") or interleaving reads via the
+--interleave option. Auto-detects gzip'd files, offers header checking via a --strict flag,
+and supports output to STDOUT a gzip'd FASTQ or an uncompressed FASTQ (--uncompressed flag).
+"""
+import argparse
+from Bio import bgzf
+import sys
+
+
+def enforce_headers(f1_header, f2_header):
+    if f1_header[0] != "@" or f2_header[0] != "@":
+        raise Exception("Invalid input FASTQ files.")
+    if f1_header.strip().split(" ")[0] != f2_header.strip().split(" ")[0]:
+        raise Exception(
+            "Input FASTQ files do not share headers. " "Check and re-run w/o --strict."
+        )
+
+
+def join_w_separator(f1, f2, f_out, strict=False, sep="|"):
+    readlines = True
+    ix = 0
+
+    # Use 'G', which is in most Phred scores.
+    # http://en.wikipedia.org/wiki/FASTQ_format
+    phred_sep = "|" * len(sep)
+
+    while readlines:
+        f1_line = f1.readline()
+        f2_line = f2.readline()
+
+        if f1_line == "":
+            readlines = False
+            f1.close()
+            f2.close()
+            f_out.close()
+            if f2_line != "":
+                raise Exception("Input FASTQ files do not match in length.")
+            continue
+
+        if ix % 4 == 0:  # Header
+            if strict:  # Fail if they don't match up to the first whitespace
+                enforce_headers(f1_line, f2_line)
+
+            # Write the header out
+            f_out.write(f1_line)
+
+        elif (ix - 1) % 4 == 0:  # Sequence
+            f_out.write(f1_line.strip() + sep + f2_line)
+        elif (ix - 2) % 4 == 0:  # Separator
+            f_out.write("+\n")
+        else:
+            f_out.write(f1_line.strip() + phred_sep + f2_line)
+
+        ix += 1
+
+
+def join_interleaved(f1, f2, f_out, strict=False):
+    readlines = True
+    ix = 0
+    f1_lines = []
+    f2_lines = []
+
+    def flush_buffer(f_out, f1_lines, f2_lines):
+        if f1_lines and f2_lines:
+            assert len(f1_lines) == 4
+            assert len(f2_lines) == 4
+            f_out.write(f1_lines[0])
+            f_out.write(f1_lines[1])
+            f_out.write(f1_lines[2])
+            f_out.write(f1_lines[3])
+            f_out.write(f2_lines[0])
+            f_out.write(f2_lines[1])
+            f_out.write(f2_lines[2])
+            f_out.write(f2_lines[3])
+
+            f1_lines = []
+            f2_lines = []
+
+        return f1_lines, f2_lines
+
+    while readlines:
+        f1_line = f1.readline()
+        f2_line = f2.readline()
+
+        if f1_line == "":
+            readlines = False
+            if f2_line != "":
+                raise Exception("Input FASTQ files do not match in length.")
+            break
+
+        if ix % 4 == 0:  # Header #1
+            if strict:
+                enforce_headers(f1_line, f2_line)
+
+            f1_lines, f2_lines = flush_buffer(f_out, f1_lines, f2_lines)
+
+        # Fill buffer up to 4 lines
+        f1_lines.append(f1_line)
+        f2_lines.append(f2_line)
+        ix += 1
+
+    _, _ = flush_buffer(f_out, f1_lines, f2_lines)
+    f1.close()
+    f2.close()
+    f_out.close()
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description=(
+            "Join two FASTQ paired end read files. Defaults to interleaving reads. "
+            "Note: Requires input files to be sorted."
+        )
+    )
+    parser.add_argument("fastq1", help="First input FASTQ.")
+    parser.add_argument("fastq2", help="Second input FASTQ.")
+    parser.add_argument(
+        "output_fastq",
+        nargs="?",
+        help=("Output FASTQ file name (optional, " "streams to STDOUT if missing."),
+    )
+    parser.add_argument(
+        "--sep",
+        default="|",
+        help=("Optional separator to override default (|)."),
+    )
+    parser.add_argument(
+        "--no-interleave",
+        action="store_true",
+        help="Concatenate the two reads using a separate (--sep) rather than "
+        "interleaving them (default).",
+    )
+    parser.add_argument(
+        "--strict",
+        action="store_true",
+        help=(
+            "Enforce that the headers of the input "
+            "FASTQ files match until the first whitespace "
+            "delimiter (e.g., a space)."
+        ),
+    )
+    parser.add_argument("--gzip", action="store_true", help=("Gzip output_fastq."))
+    args = parser.parse_args()
+
+    if ".gz" in args.fastq1 or ".gzip" in args.fastq1:
+        f1 = bgzf.open(args.fastq1, mode="r")
+    else:
+        f1 = open(args.fastq1, mode="r")
+    if ".gz" in args.fastq2 or ".gzip" in args.fastq2:
+        f2 = bgzf.open(args.fastq2, mode="r")
+    else:
+        f2 = open(args.fastq2, mode="r")
+
+    if args.output_fastq is None:
+        f_out = sys.stdout
+    elif args.gzip:
+        f_out = bgzf.open(args.output_fastq, mode="w")
+    else:
+        f_out = open(args.output_fastq, mode="w")
+
+    if not args.no_interleave:
+        join_interleaved(f1, f2, f_out, strict=args.strict)
+    else:
+        join_w_separator(f1, f2, f_out, strict=args.strict, sep=args.sep)