Merge pull request #3 from wanyuac/dev
Add support for ARIBA outputs
Showing 4 changed files with 311 additions and 4 deletions.
New file (subdirectory README):
@@ -0,0 +1,3 @@
# Converting ARIBA outputs to an allelic PAM

This subdirectory offers scripts for converting ARIBA outputs to an allelic PAM and a genetic PAM. Please see the README under the main directory for a step-by-step guide.
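
For orientation, the command demonstrations inside the two scripts added below suggest a workflow along these lines (the clustering and tabulation step in the middle is an assumption inferred from the clusters2pam.py docstring, which expects the output TSV of utility/tabulate_cdhit.py):

python pool_seqs.py -i *_genes.fna -o alleles.fna -e '_genes.fna'
# Cluster alleles.fna (the name tabulate_cdhit.py suggests CD-HIT) and tabulate the clusters as alleles.clstr.tsv
python clusters2pam.py -i alleles.clstr.tsv -om alleles_pam.tsv -ot alleles_clstr_updated.tsv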
New file (clusters2pam.py):
@@ -0,0 +1,104 @@
#!/usr/bin/env python

"""
Converts the output TSV of utility/tabulate_cdhit.py to an allelic presence-absence matrix (PAM) and saves
it in TSV format.
Command demonstration:
python clusters2pam.py -i alleles.clstr.tsv -om alleles_pam.tsv -ot alleles_clstr_updated.tsv
Dependencies: Python v3 and the pandas module
Copyright (C) 2020 Yu Wan <[email protected]>
Licensed under the GNU General Public Licence version 3 (GPLv3) <https://www.gnu.org/licenses/>.
Publication: 11 Nov 2020; the latest modification: 12 Nov 2020
"""

import os
import sys
import pandas
from argparse import ArgumentParser

def parse_arguments():
    parser = ArgumentParser(description = "Creating an allelic presence-absence matrix from a table of sequence clusters",
                            epilog = "This is a helper script of R package GeneMates.")
    parser.add_argument("-i", dest = "i", type = str, required = True, help = "Input cluster table in TSV format (output of utility/tabulate_cdhit.py)")
    parser.add_argument("-om", dest = "om", type = str, required = False, default = "allelic_PAM.tsv", help = "Output presence-absence matrix in TSV format")
    parser.add_argument("-ot", dest = "ot", type = str, required = False, default = "clusters.tsv", help = "Output table about sequence clusters")
    return parser.parse_args()

def main():
    args = parse_arguments()

    # Import the input TSV file as a data frame
    input_tsv = args.i
    if os.path.exists(input_tsv):
        clusters = parse_seqid(pandas.read_csv(input_tsv, sep = "\t", index_col = None))  # Import the table as a data frame of six columns and parse column 'seqid'
    else:
        print("Error: the input file is not accessible.", file = sys.stderr)
        sys.exit(1)

    # Assign allele IDs and create an allelic PAM from data frame "clusters"
    pam = create_allelic_pam(clusters)
    pam.to_csv(args.om, header = True, index = False, sep = "\t", float_format = None)
    clusters.to_csv(args.ot, header = True, index = False, sep = "\t", float_format = None)
    return

def parse_seqid(df):
    """
    Parses column 'seqid' into two new columns and returns a data frame of eight columns.
    """
    seqids = df["seqid"].tolist()  # Convert the column into a list
    genes = list()
    samples = list()

    # Create two lists from one column
    for item in seqids:
        g, s = item.split("|")
        genes.append(g)
        samples.append(s)

    # Append the lists to the data frame as columns
    df["gene"] = genes
    df["sample"] = samples
    return df

def create_allelic_pam(df):
    """
    Assigns allele IDs and creates an allelic PAM based on the clustering results.
    """
    samples = get_unique_ids(df, "sample")
    cluster_ids = get_unique_ids(df, "cluster")  # 0, 1, 2, ...
    genes_visited = dict()  # Genes in processed clusters: {gene : number of additional clusters seen so far (0 for the first cluster of a gene)}
    pam = pandas.DataFrame(samples, columns = ["sample"])  # Initialise the output PAM

    # Create a list about presence-absence of each allele and append it to the output PAM as a column
    for c in cluster_ids:  # Type of elements: numpy.int64
        df_c = df[df["cluster"] == c]  # Select rows of the current cluster
        genes_c = df_c["gene"].tolist()  # All gene names should be the same when alleles are clustered under 100% nucleotide identity
        gene = genes_c[0]
        if gene in genes_visited.keys():
            genes_visited[gene] += 1
            allele = gene + "." + str(genes_visited[gene])  # Add a suffix to make an allele name, e.g. sul1.1, sul1.2 for the second and third clusters of sul1
        else:
            genes_visited[gene] = 0  # Record a newly encountered gene; the first cluster of a gene keeps the bare gene name
            allele = gene
        pa_vec = list()  # A binary vector about presence-absence of the current allele across samples
        samples_c = df_c["sample"].tolist()  # Samples in which the current allele is detected
        for s in samples:
            pa = 1 if s in samples_c else 0
            pa_vec.append(pa)
        pam[allele] = pa_vec
    return pam

def get_unique_ids(df, col_name):
    """
    A subordinate function of create_allelic_pam for getting a list of unique and sorted values from a
    given column of input data frame df.
    """
    ids = list(df[col_name].unique())
    ids.sort(reverse = False)
    return ids

if __name__ == "__main__":
    main()
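
For illustration, a minimal, self-contained sketch (toy data; only the column names come from the script above, everything else is assumed) of the transformation create_allelic_pam performs, using a pandas pivot in place of the script's explicit loop:

import pandas

# Toy cluster table with the 'seqid' and 'cluster' columns the script expects
clusters = pandas.DataFrame({
    "seqid":   ["sul1|S1", "sul1|S2", "sul1|S3"],
    "cluster": [0, 0, 1],
})
parts = clusters["seqid"].str.split("|", expand = True)  # Same split as parse_seqid
clusters["gene"] = parts[0]
clusters["sample"] = parts[1]

# One row per sample, one column per cluster; 1 = allele present, 0 = absent.
# clusters2pam.py labels these columns sul1 and sul1.1 instead of the raw cluster indices.
pam = (clusters.assign(present = 1)
               .pivot_table(index = "sample", columns = "cluster",
                            values = "present", fill_value = 0))
print(pam)  # S1 and S2 carry the cluster-0 allele of sul1; S3 carries the cluster-1 allele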
New file (pool_seqs.py):
@@ -0,0 +1,51 @@
#!/usr/bin/env python

"""
Pools FASTA files of ARIBA's output allele sequences into one file and appends sample names to sequence IDs (in
the same format as that used for SRST2's output consensus sequences).
Command demonstration:
python pool_seqs.py -i *_genes.fna -o alleles.fna -e '_genes.fna'
Copyright (C) 2020 Yu Wan <[email protected]>
Licensed under the GNU General Public Licence version 3 (GPLv3) <https://www.gnu.org/licenses/>.
Publication: 11 Nov 2020; the latest modification: 11 Nov 2020
"""

import os
from argparse import ArgumentParser

def parse_arguments():
    parser = ArgumentParser(description = "Pool ARIBA's output allele sequences into one FASTA file and append sample names to sequence IDs",
                            epilog = "This is a helper script of R package GeneMates.")
    parser.add_argument("-i", nargs = "+", dest = "i", type = str, required = True, help = "Input FASTA files")
    parser.add_argument("-o", dest = "o", type = str, required = False, default = "alleles.fna", help = "Output FASTA file of pooled allele sequences")
    parser.add_argument("-e", dest = "e", type = str, required = False, default = "_genes.fna", help = "Filename extension to be removed for sample names")
    return parser.parse_args()

def main():
    args = parse_arguments()
    fasta_out = open(args.o, "w")
    sample_count = 0
    for fasta_in in args.i:
        f_in = open(fasta_in, "r")
        fasta_in = os.path.basename(fasta_in)
        sample = fasta_in.replace(args.e, "")  # No change is made if args.e is not found in the filename
        line = f_in.readline()
        while line:
            if line.startswith(">"):  # A header is encountered
                fields = line.split(".")  # ARIBA uses full stops as delimiters in the header line
                new_id = fields[0] + "|" + sample  # Example value: ">cluster_1|sample_1"
                seq_descr = ".".join(fields[1 : ])
                fasta_out.write(new_id + " " + seq_descr)  # The trailing newline is kept in seq_descr because readline does not strip it
            else:
                fasta_out.write(line)
            line = f_in.readline()  # Returns an empty string at the end of the file, which terminates the loop
        f_in.close()
        sample_count += 1
    fasta_out.close()
    print("%d samples have been processed." % sample_count)
    return

if __name__ == "__main__":
    main()
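
As a quick illustration of the header rewriting performed above, a minimal sketch with a made-up ARIBA-style header (the exact header content is assumed; only the rule of keeping the first dot-separated field and appending "|sample" comes from the script):

sample = "sample_1"                        # Normally derived from the input filename
line = ">cluster_1.some.description.1\n"   # Hypothetical ARIBA-style header line
fields = line.split(".")
new_id = fields[0] + "|" + sample          # ">cluster_1|sample_1"
seq_descr = ".".join(fields[1:])           # "some.description.1\n" (newline preserved)
print(new_id + " " + seq_descr, end = "")  # Prints ">cluster_1|sample_1 some.description.1"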