Skip to content

Commit

Permalink
Merge pull request #5 from BioWilko/main
Browse files Browse the repository at this point in the history
Refactor to run with k8s executor and add paired read merging
  • Loading branch information
rmcolq authored Jun 21, 2023
2 parents 44ffc6a + 0503fd0 commit 585b5f8
Show file tree
Hide file tree
Showing 23 changed files with 1,447 additions and 383 deletions.
87 changes: 66 additions & 21 deletions bin/combine_kreports.py
Original file line number Diff line number Diff line change
@@ -1,28 +1,36 @@
#!/usr/bin/env python

import sys
import gzip
from Bio import SeqIO
import argparse
from datetime import datetime


def parse_depth(name):
parse_name = name.split(" ")
depth = 0
for i in parse_name:
if i!="":
if i != "":
break
depth += 1
depth = int(depth/2)
depth = int(depth / 2)
return depth


def get_kraken_hierarchy(kraken_file, entries={}, total=0):
with open(kraken_file, "r") as f:
hierarchy = []
for line in f:
if line.startswith("% of Seqs"):
continue
percentage, num_clade_root, num_direct, raw_rank, ncbi, name = line.strip().split("\t")
(
percentage,
num_clade_root,
num_direct,
raw_rank,
ncbi,
name,
) = line.strip().split("\t")
percentage = float(percentage)
num_clade_root = int(num_clade_root)
num_direct = int(num_direct)
Expand All @@ -37,16 +45,27 @@ def get_kraken_hierarchy(kraken_file, entries={}, total=0):
entries[ncbi]["count"] += num_direct
entries[ncbi]["count_descendants"] += num_clade_root
else:
entries[ncbi] = {"percentage": percentage, "count": num_direct, "count_descendants": num_clade_root,
"raw_rank": raw_rank, "rank": rank, "depth": depth, "ncbi": ncbi, "name": name, "parents":[], "children":set()}
entries[ncbi] = {
"percentage": percentage,
"count": num_direct,
"count_descendants": num_clade_root,
"raw_rank": raw_rank,
"rank": rank,
"depth": depth,
"ncbi": ncbi,
"name": name,
"parents": [],
"children": set(),
}

if len(hierarchy) > 1:
parent = hierarchy[-2]
assert entries[parent]["depth"] < entries[ncbi]["depth"]
entries[ncbi]["parents"] = hierarchy[:-1]
entries[parent]["children"].add(ncbi)

return entries,total
return entries, total


def combine_kraken_reports(kreports):
entries = {}
Expand All @@ -55,12 +74,27 @@ def combine_kraken_reports(kreports):
entries, total = get_kraken_hierarchy(kreport, entries, total)
print(kreport, len(entries), total)
for ncbi in entries:
entries[ncbi]["percentage"] = entries[ncbi]["count_descendants"] / float(total) *100
entries[ncbi]["percentage"] = (
entries[ncbi]["count_descendants"] / float(total) * 100
)
return entries


def write_entry(out_handle, entry):
offset = entry["depth"]
out_handle.write("%f\t%i\t%i\t%s\t%s\t%s%s\n" %(entry["percentage"], entry["count"], entry["count_descendants"], entry["raw_rank"], entry["ncbi"], 2*offset*" ", entry["name"] ))
out_handle.write(
"%f\t%i\t%i\t%s\t%s\t%s%s\n"
% (
entry["percentage"],
entry["count"],
entry["count_descendants"],
entry["raw_rank"],
entry["ncbi"],
2 * offset * " ",
entry["name"],
)
)


def choose_next(entries, taxa, ncbi="0"):
if len(taxa) == 0:
Expand All @@ -86,38 +120,49 @@ def choose_next(entries, taxa, ncbi="0"):

return next, taxa


def write_kraken_report(outfile, entries):
taxa = list(entries.keys())
with open(outfile, "w") as out_report:
out_report.write("% of Seqs\tClades\tTaxonomies\tRank\tTaxonomy ID\tScientific Name\n")
out_report.write(
"% of Seqs\tClades\tTaxonomies\tRank\tTaxonomy ID\tScientific Name\n"
)
current = "0"
while len(taxa) > 0:
write_entry(out_report, entries[current])
current, taxa = choose_next(entries, taxa, ncbi=current)


def main():
#Parse arguments
# Parse arguments
parser = argparse.ArgumentParser()
parser.add_argument('-r', dest='kraken_report_files', required=True, nargs='+',
help='Kraken report files to parse')
parser.add_argument('-o', "--outfile",dest='outfile', required=True,
help='Output name')

args=parser.parse_args()

#Start Program
parser.add_argument(
"-r",
dest="kraken_report_files",
required=True,
nargs="+",
help="Kraken report files to parse",
)
parser.add_argument(
"-o", "--outfile", dest="outfile", required=True, help="Output name"
)

args = parser.parse_args()

# Start Program
now = datetime.now()
time = now.strftime("%m/%d/%Y, %H:%M:%S")
sys.stdout.write("PROGRAM START TIME: " + time + '\n')
sys.stdout.write("PROGRAM START TIME: " + time + "\n")

entries = combine_kraken_reports(args.kraken_report_files)
write_kraken_report(args.outfile, entries)

now = datetime.now()
time = now.strftime("%m/%d/%Y, %H:%M:%S")
sys.stdout.write("PROGRAM END TIME: " + time + '\n')
sys.stdout.write("PROGRAM END TIME: " + time + "\n")

sys.exit(0)


if __name__ == "__main__":
main()
167 changes: 167 additions & 0 deletions bin/concatenate_reads.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,167 @@
#!/usr/bin/env python
"""
No dependency Python script for joining two paired end FASTQ files.
Supports concatenating reads with a separator ("NNNNN") or interleaving reads via the
--interleave option. Auto-detects gzip'd files, offers header checking via a --strict flag,
and supports output to STDOUT a gzip'd FASTQ or an uncompressed FASTQ (--uncompressed flag).
"""
import argparse
from Bio import bgzf
import sys


def enforce_headers(f1_header, f2_header):
if f1_header[0] != "@" or f2_header[0] != "@":
raise Exception("Invalid input FASTQ files.")
if f1_header.strip().split(" ")[0] != f2_header.strip().split(" ")[0]:
raise Exception(
"Input FASTQ files do not share headers. " "Check and re-run w/o --strict."
)


def join_w_separator(f1, f2, f_out, strict=False, sep="|"):
readlines = True
ix = 0

# Use 'G', which is in most Phred scores.
# http://en.wikipedia.org/wiki/FASTQ_format
phred_sep = "|" * len(sep)

while readlines:
f1_line = f1.readline()
f2_line = f2.readline()

if f1_line == "":
readlines = False
f1.close()
f2.close()
f_out.close()
if f2_line != "":
raise Exception("Input FASTQ files do not match in length.")
continue

if ix % 4 == 0: # Header
if strict: # Fail if they don't match up to the first whitespace
enforce_headers(f1_line, f2_line)

# Write the header out
f_out.write(f1_line)

elif (ix - 1) % 4 == 0: # Sequence
f_out.write(f1_line.strip() + sep + f2_line)
elif (ix - 2) % 4 == 0: # Separator
f_out.write("+\n")
else:
f_out.write(f1_line.strip() + phred_sep + f2_line)

ix += 1


def join_interleaved(f1, f2, f_out, strict=False):
readlines = True
ix = 0
f1_lines = []
f2_lines = []

def flush_buffer(f_out, f1_lines, f2_lines):
if f1_lines and f2_lines:
assert len(f1_lines) == 4
assert len(f2_lines) == 4
f_out.write(f1_lines[0])
f_out.write(f1_lines[1])
f_out.write(f1_lines[2])
f_out.write(f1_lines[3])
f_out.write(f2_lines[0])
f_out.write(f2_lines[1])
f_out.write(f2_lines[2])
f_out.write(f2_lines[3])

f1_lines = []
f2_lines = []

return f1_lines, f2_lines

while readlines:
f1_line = f1.readline()
f2_line = f2.readline()

if f1_line == "":
readlines = False
if f2_line != "":
raise Exception("Input FASTQ files do not match in length.")
break

if ix % 4 == 0: # Header #1
if strict:
enforce_headers(f1_line, f2_line)

f1_lines, f2_lines = flush_buffer(f_out, f1_lines, f2_lines)

# Fill buffer up to 4 lines
f1_lines.append(f1_line)
f2_lines.append(f2_line)
ix += 1

_, _ = flush_buffer(f_out, f1_lines, f2_lines)
f1.close()
f2.close()
f_out.close()


if __name__ == "__main__":
parser = argparse.ArgumentParser(
description=(
"Join two FASTQ paired end read files. Defaults to interleaving reads. "
"Note: Requires input files to be sorted."
)
)
parser.add_argument("fastq1", help="First input FASTQ.")
parser.add_argument("fastq2", help="Second input FASTQ.")
parser.add_argument(
"output_fastq",
nargs="?",
help=("Output FASTQ file name (optional, " "streams to STDOUT if missing."),
)
parser.add_argument(
"--sep",
default="|",
help=("Optional separator to override default (|)."),
)
parser.add_argument(
"--no-interleave",
action="store_true",
help="Concatenate the two reads using a separate (--sep) rather than "
"interleaving them (default).",
)
parser.add_argument(
"--strict",
action="store_true",
help=(
"Enforce that the headers of the input "
"FASTQ files match until the first whitespace "
"delimiter (e.g., a space)."
),
)
parser.add_argument("--gzip", action="store_true", help=("Gzip output_fastq."))
args = parser.parse_args()

if ".gz" in args.fastq1 or ".gzip" in args.fastq1:
f1 = bgzf.open(args.fastq1, mode="r")
else:
f1 = open(args.fastq1, mode="r")
if ".gz" in args.fastq2 or ".gzip" in args.fastq2:
f2 = bgzf.open(args.fastq2, mode="r")
else:
f2 = open(args.fastq2, mode="r")

if args.output_fastq is None:
f_out = sys.stdout
elif args.gzip:
f_out = bgzf.open(args.output_fastq, mode="w")
else:
f_out = open(args.output_fastq, mode="w")

if not args.no_interleave:
join_interleaved(f1, f2, f_out, strict=args.strict)
else:
join_w_separator(f1, f2, f_out, strict=args.strict, sep=args.sep)
Loading

0 comments on commit 585b5f8

Please sign in to comment.