Skip to content
This repository has been archived by the owner on Apr 13, 2024. It is now read-only.

Commit

Permalink
v0.0.2
Browse files Browse the repository at this point in the history
A profound update of algorithms, which corrects allele names when the
same consensus sequence is aligned to multiple reference alleles in
different strains. This problem caused split of allele calls when they
actually reveal the same allele. This update also enables PAMmaker to
take as input the clustering outcome when consensus sequences are
clustered at a nucleotide identity of <100%.
  • Loading branch information
wanyuac committed Aug 6, 2018
1 parent f5afcdf commit 2d2fdcf
Show file tree
Hide file tree
Showing 4 changed files with 376 additions and 136 deletions.
234 changes: 160 additions & 74 deletions clustering_allele_variants.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,94 +3,180 @@
It works as a downstream process after tabulate_cd-hit.py. Specifically, it takes as input from stdin and prints results to stdout.
In particular, this program assumes question marks have been removed from the compiled result using a proper method to solve ambiguity in allele calls.
In a few cases, there is no unique allele identifiers (such as intI1, not intI1_1). This may happen when the reference database is small.
Usage:
python tabulate_cdhit.py clusters.fna.clstr | python clustering_allele_variants.py srst2_compiledResults.txt > compiledResults_clusterd.txt
cat cluster_table.txt | python clustering_allele_variants.py srst2_compiledResults.txt > compiledResults_clusterd.txt # if you already have a tabulated cluster file
cat cluster_table.txt | python clustering_allele_variants.py srst2_compiledResults.txt 1 > compiledResults_clusterd.txt # when allele calls do not have unique allele identifiers
python tabulate_cdhit.py clusters.fna.clstr > cluster_table.txt
python clustering_allele_variants.py -i srst2_compiledResults.txt -c cluster_table.txt -r -o compiledResults_clusterd.txt -m allele_name_replacement.txt
python clustering_allele_variants.py -i srst2_compiledResults.txt -c cluster_table.txt -r -n -o compiledResults_clusterd.txt # when allele calls do not have unique allele identifiers
Input:
1. A compiled tab-deliminated file generated by SRST2 using the command: python srst2.py --prev_output *__genes*results.txt --output <prefix>
2. A cluster file created by either cd-hit or cd-hit-est.
1. A compiled tab-deliminated file generated by SRST2 using the command: python srst2.py --prev_output *__genes*results.txt --output <prefix>
2. A cluster file created by either cd-hit or cd-hit-est.
Output: A new file of compiled results, where each allele variant are assigned a unique cluster identifier.
Python version: 3.4.3
Author: Yu Wan ([email protected], GitHub: https://github.com/wanyuac)
Editions: 29 July 2015, 5 Sep 2016, 27 Jun 2017
This script is derived from clustering_allele_variants.py in my repository SRST2_toolkit (https://github.com/wanyuac/SRST2_toolkit).
License: Apache License, Version 2.0
Python 2 and 3 compatible
Copyright 2017 Yu Wan <[email protected]>
Licensed under the Apache License, Version 2.0
First and the latest editions: 29 July 2015, 6 August 2018
"""

from __future__ import print_function
import sys
import re
import collections

from collections import defaultdict, namedtuple
from argparse import ArgumentParser


def parse_arguments():
parser = ArgumentParser(description = "Assigning extended identifiers to allele names")
parser.add_argument("-i", "--input", dest = "input", type = str, required = True,
help = "Input gene-content table whose allele names are subject to changes.")
parser.add_argument("-c", "--clusters", dest = "clusters", type = str, required = True,
help = "A tab-delimited file containing clustering information from CD-HIT-EST")
parser.add_argument("-n", "--noext", dest = "noext", action = "store_true", required = False,
help = "Whether allele names do contain unique identifiers. Default: False.")
parser.add_argument("-r", "--rename", dest = "rename", action = "store_true", required = False,
help = "Whether to rename alleles when consensus sequences of different alleles belong to the same cluster")
parser.add_argument("-o", "--out", dest = "out", type = str, required = False, default = "modified_allele_matrix.txt",
help = "The output gene-content table with extended allele names")
parser.add_argument("-m", "--mapping", dest = "mapping", type = str, required = False, default = "allele_name_replacement.txt",
help = "The output gene-content table with extended allele names")

return parser.parse_args()


def extract_alleleID(id, ext):
"""
This function changes sequence names in the cluster file into allele names used in SRST2. For example,
if ext == True:
input = "66__FloR_Phe__FloR__1212", output = "FloR_1212"
"""
fields = id.split("__")[-2:] # only take the last two elements: allele name and sequence identifier
if ext:
new_id = "_".join(fields) # shorten the double underscores into a single one in order to match the format in SRST2 gene profiles.
else:
new_id = fields[0] # only take the first element in the ID list, eg. returns FloR for the example above

return new_id

def read_clusters(lines, id_ext = True):
"""
This function reads a tabulated cluster file produced by cd-hit-est and my script tabulate_cdhit.py, and then
returns a dictionary of clusters following the structure: {allele_name: {cluster_id : [sample1, sample2, ...]}},
where two keys are applied hierarchically.
"""
clusters = collections.defaultdict(dict)
for line in lines[1 : ]: # skip the header line
fields = line.split("\t")
cluster_id = fields[0] # an integer
allele_info = fields[2].split("|") # "0__OqxB_Flq__OqxB__49.variant|NTUH-K2044" -> ["0__OqxB_Flq__OqxB__49.variant", "NTUH-K2044"]

# Extract the sequence ID, such as 52__TetA_Tet__TetA__1545, with a regular expression; then convert it into an allele name
# For instance, the regular expression extracts "0__OqxB_Flq__OqxB__49" out of "0__OqxB_Flq__OqxB__49.consensus".
allele = extract_alleleID(re.findall("(.+?)\.", allele_info[0])[0], ext = id_ext)
sample = allele_info[1] # extract the sample ID from the line with a regular expression
clusters[sample][allele] = cluster_id # save this cluster_id in the dictionary {sample : {allele : cluster_id}}

return clusters

def assign_clusterIDs(gene_table, clusters):
"""
This function assigns an extended allele identifier to every allele call if it is a variant of its reference.
The additional identifier is ID of each cluster.
For example, "Aac6-Iaa_760*" becomes "aac6-Iaa_760.10" in the output, where 10 is the cluster where this allele belongs to in an output from cd-hit-est.
"""
f = open(gene_table, "rU")
content = f.read().splitlines()
print(content[0]) # print the header line first
for row in content[1:]: # for every row of this table
fields = row.split("\t")
sample = fields[0]
new_line = [sample]
for allele in fields[1:]:
if allele[-1] == "*": # This is a variant; otherwise, no change is needed.
allele = allele.replace("*", "." + clusters[sample][allele[:-1]]) # replace the asterisk with a cluster ID
new_line.append(allele) # non-variants remain intact
print("\t".join(new_line)) # print a new line to the stdout
f.close()

return
"""
This function changes sequence names in the cluster file into allele names used in SRST2. For example,
if ext == True:
input = "66__FloR_Phe__FloR__1212", output = "FloR_1212"
"""
fields = id.split("__")[-2:] # only take the last two elements: allele name and sequence identifier
if ext:
new_id = "_".join(fields) # shorten the double underscores into a single one in order to match the format in SRST2 gene profiles.
else:
new_id = fields[0] # only take the first element in the ID list, eg. returns FloR for the example above

return new_id


def read_clusters(cluster_file, id_ext = True):
"""
This function reads a tabulated cluster file produced by cd-hit-est and my script tabulate_cdhit.py, and then
returns a dictionary of clusters following the structure: {allele_name: {cluster_id : [sample1, sample2, ...]}},
where two keys are applied hierarchically.
"""
with open(cluster_file, "rU") as f:
lines = f.read().splitlines()

# Initialise the return value
clusters = defaultdict(dict)
Allele = namedtuple("Allele", ["cid", "rep"]) # cid: cluster index; rep: is this allele in the current sample is chosen as the representative sequence

# Retrieve allele and cluster information
for line in lines[1 : ]: # skip the header line
fields = line.split("\t")
cluster_id = fields[0]
is_rep = fields[5] == "Y"
allele_info = fields[2].split("|") # "0__OqxB_Flq__OqxB__49.variant|NTUH-K2044" -> ["0__OqxB_Flq__OqxB__49.variant", "NTUH-K2044"]

# Extract the sequence ID, such as 52__TetA_Tet__TetA__1545, with a regular expression; then convert it into an allele name
# For instance, the regular expression extracts "0__OqxB_Flq__OqxB__49" out of "0__OqxB_Flq__OqxB__49.consensus".
allele = extract_alleleID(re.findall("(.+?)\.", allele_info[0])[0], ext = id_ext)
sample = allele_info[1] # extract the sample ID from the line with a regular expression
clusters[sample][allele] = Allele(cid = cluster_id, rep = is_rep) # save this cluster_id (a string) in the dictionary {sample : {allele : cluster_id}}

return clusters


def get_rep_allele_per_cluster(clusters):
"""
Identify allele names in the same cluster and replace them with the representative allele name
Return a dictionary {cid : allele}
"""
reps = {} # representative allele of each cluster

for sample, alleles in clusters.items():
for allele, record in alleles.items():
if record.rep:
"""
Add this allele name into the dictionary cluster_rep. CD-HIT-EST ensures that there is
only a single representative allele per cluster. Therefore, we will not lose any
representative allele in this algorithm.
"""
reps[record.cid] = allele

return reps


def assign_clusterIDs(gene_table, clusters, reps, rename, out_tab, out_rep):
"""
This function assigns an extended allele identifier to every allele call if it is a variant of its reference.
The additional identifier is ID of each cluster.
For example, "Aac6-Iaa_760*" becomes "aac6-Iaa_760.10" in the output, where 10 is the cluster where this allele belongs to in an output from cd-hit-est.
"""
# Import the gene content table
with open(gene_table, "rU") as f:
content = f.read().splitlines()

# Establish output files
new_tab = open(out_tab, "w")
rep_out = open(out_rep, "w")

# Convert allele IDs in the gene content table
new_tab.write(content[0] + "\n") # print the header line first
rep_out.write("Sample\tPrev_allele\tVar\tNew_allele\n")
for row in content[1:]: # for every row of this table
fields = row.split("\t")
sample = fields[0]
new_line = [sample]

for allele in fields[1:]:
"""
Although alleles are different to each other in every reference database, exact hits to different alleles
may be clustered together when the threshold of nucleotide identity is sufficiently low. Therefore, an exact
hit may be renamed when the rename argument is True and multiple reference alleles are clustered together.
Evidently, for clusters having only a single allele, there is no difference between the output allele name
and the original name.
"""
if allele != "-":
is_var = allele[-1] == "*"
if is_var: # This is a variant of a reference allele.
allele = allele[:-1] # drop the asterisk
cid = clusters[sample][allele].cid
if rename:
rep_allele = reps[cid]
if allele != rep_allele:
rep_out.write("\t".join([sample, allele, str(int(is_var)), rep_allele]) + "\n")
if is_var:
allele = rep_allele + "." + cid # replace the asterisk with a cluster ID
else:
allele = rep_allele
elif is_var:
allele = allele + "." + cid
# else: do nothing
new_line.append(allele) # non-variants remain intact

new_tab.write("\t".join(new_line) + "\n") # print a new line to the stdout

new_tab.close()
rep_out.close()

return


def main():
argv_len = len(sys.argv) # number of arguments (>= 1)
if argv_len == 2: # Allele calls have unique allele identifiers, which is the most common case of results.
assign_clusterIDs(gene_table = sys.argv[1], clusters = read_clusters(sys.stdin.read().splitlines(), True)) # a table showing allele calls of every gene over samples
elif argv_len > 2: # In a few cases, there is no unique allele identifiers (such as intI1, not intI1_1). This may happen when the reference database is small.
assign_clusterIDs(gene_table = sys.argv[1], clusters = read_clusters(sys.stdin.read().splitlines(), False))
else:
exit("Error: a file showing allele calls of every gene over samples is required.")

args = parse_arguments()
clusters = read_clusters(args.clusters, not args.noext) # import cluster information: the tablated CD-HIT-EST output
reps = get_rep_allele_per_cluster(clusters) # one representative allele per cluster
assign_clusterIDs(gene_table = args.input, clusters = clusters, reps = reps, rename = args.rename, \
out_tab = args.out, out_rep = args.mapping) # a table showing allele calls of every gene over samples


# The main program
if __name__ == "__main__":
main()
Loading

0 comments on commit 2d2fdcf

Please sign in to comment.