Skip to content

Commit

Permalink
Blacklist statistics filter added
Browse files Browse the repository at this point in the history
Changes to be committed:
	new file:   blackstats.py
	modified:   filter_nt.py
  • Loading branch information
HadleyKing committed Aug 9, 2023
1 parent ac44c3d commit 0335d28
Show file tree
Hide file tree
Showing 2 changed files with 120 additions and 32 deletions.
98 changes: 98 additions & 0 deletions blackstats.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
#!/usr/bin/env python3
"""Blacklist stats
"""

import sys
import csv
from datetime import datetime
from argparse import ArgumentParser, SUPPRESS

__version__ = "7.0"
__status__ = "Dev"

def usr_args():
    """User Arguments

    Build the command-line parser for ``blackstats.py`` and parse the
    arguments supplied by the user.

    When the script is started without any argument the help text is
    printed and the program exits, exactly as if ``--help`` had been
    given.

    Returns
    -------
    argparse.Namespace
        Parsed options with the attributes ``blackstats`` (input file)
        and ``output`` (file to create).

    Raises
    ------
    SystemExit
        Raised by argparse after printing help or version information,
        or when the arguments are invalid.
    """

    parser = ArgumentParser(
        add_help=False,
        prog='blackstats.py',
        description="This script counts the items from a black list of "
        "unwanted taxonomy names (scientific names). "
        "names: ['unclassified','unidentified','uncultured','unspecified',"
        "'unknown','phage','vector'] and ['environmental sample',"
        "'artificial sequence','other sequence'].")

    required = parser.add_argument_group('required arguments')
    optional = parser.add_argument_group('optional arguments')

    # NOTE: listed under "required arguments" for display purposes only;
    # the option carries a default, so argparse does not enforce it.
    required.add_argument('-b', '--blackstats',
        help="Input file to use. The `blackstats.tsv` is "
            "used to generate the filtering statistics. "
            "Default is `./output_data/blackstats.tsv` ",
        default='./output_data/blackstats.tsv')

    optional.add_argument('-o', '--output',
        help="Output file to create. "
            "Default is `./output_data/filter_stats.tsv` ",
        default='./output_data/filter_stats.tsv')

    optional.add_argument('-v', '--version',
        action='version',
        version='%(prog)s ' + __version__)
    optional.add_argument('-h', '--help',
        action='help',
        default=SUPPRESS,
        help='show this help message and exit')

    # Fall back to showing the help text when no argument was given,
    # without modifying the process-wide ``sys.argv`` list.
    cli_args = sys.argv[1:] or ['--help']

    return parser.parse_args(cli_args)

def read_blackstats(blackstats: str) -> dict:
    """Read Blacklist statistics

    Aggregate a tab-separated blacklist report into per-key totals.
    The first row of the file is treated as a header and skipped, and
    completely blank lines are ignored. Every remaining row must have
    at least three columns; the third column (``row[2]``) is used as
    the grouping key and the second column (``row[1]``) is the value
    collected for that key.

    Parameters
    ----------
    blackstats : str
        Path of the tab-separated file to read.

    Returns
    -------
    dict
        Maps each distinct ``row[2]`` value to a two-item list
        ``[values, count]`` where ``values`` is the list of distinct
        ``row[1]`` strings in first-seen order and ``count`` is the
        number of rows that carried the key. An empty or header-only
        file yields an empty dict.

    Raises
    ------
    IndexError
        If a non-blank data row has fewer than three columns.
    """
    # Imported locally so the block does not depend on a change to the
    # file-level ``from datetime import datetime`` line.
    from datetime import timezone

    black_dict = {}
    # Per-key set of values already collected, so the duplicate check
    # is a constant-time lookup rather than a scan of the list.
    seen = {}
    with open(blackstats, 'r', encoding='utf-8', newline='') as handle:
        reader = csv.reader(handle, delimiter='\t')
        # Skip the header row; the default keeps an empty file from
        # raising StopIteration.
        next(reader, None)
        for row in reader:
            if not row:
                # csv.reader yields [] for blank lines.
                continue
            value, key = row[1], row[2]
            entry = black_dict.get(key)
            if entry is None:
                black_dict[key] = [[value], 1]
                seen[key] = {value}
                continue
            entry[1] += 1
            if value not in seen[key]:
                seen[key].add(value)
                entry[0].append(value)
    # Same naive-UTC timestamp that ``datetime.utcnow()`` printed, obtained
    # without the deprecated call.
    loaded_at = datetime.now(timezone.utc).replace(tzinfo=None)
    print('Blacklist loaded: ', loaded_at)
    return black_dict

def write_filter_stats(black_dict: dict, filter_stats: str):
    """Write Filter Stats

    Write one tab-separated summary row per key of ``black_dict``,
    preceded by a header row.

    Parameters
    ----------
    black_dict : dict
        Maps a key to a sequence whose first item is a collection of
        values and whose second item is a count (the structure
        ``[values, count]``).
    filter_stats : str
        Path of the file to create; an existing file is overwritten.

    Each data row holds the key, the number of collected values
    (``len(values)``) and the count, in dictionary iteration order.
    """
    # newline='' lets the csv module control the row terminator itself;
    # without it, platforms that translate '\n' would emit '\r\r\n'.
    with open(filter_stats, 'w', encoding='utf-8', newline='') as stat_file:
        writer = csv.writer(stat_file, delimiter='\t')
        writer.writerow(['blackListTaxonomyName','taxid', 'removed sequences'])
        for name, entry in black_dict.items():
            writer.writerow([name, len(entry[0]), entry[1]])

def main():
    """Main Function

    Parse the command-line options, aggregate the blacklist report named
    by ``--blackstats`` and write the summary table to ``--output``.
    """
    args = usr_args()
    summary = read_blackstats(args.blackstats)
    write_filter_stats(summary, args.output)

if __name__ == '__main__':
main()
54 changes: 22 additions & 32 deletions filter_nt.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,6 @@ def create_connection(db_file):
"""

# protein_conn, dead_conn, taxonomy_conn = None, None, None
print(db_file)

try:
protein_conn = sqlite3.connect(f'{db_file}/protein_taxonomy.db')
Expand Down Expand Up @@ -168,41 +167,32 @@ def get_taxonomy(conn, accession):

return row

def filter_nt(protein_conn, dead_conn, taxonomy_conn, nt, black_dict, outfile):
def filter_nt(protein_conn, dead_conn, taxonomy_conn, nt, black_dict, outfile, stats):
"""Filter NT"""

black_stats = {}
with open(outfile, 'w', encoding='utf-8') as filtered:
count = 0
for record in SeqIO.parse(nt, 'fasta'):
count += 1
prot_result, tax_result, dead_result = '', '', ''
accession = record.id.split('.')[0]
result = get_taxonomy(taxonomy_conn, accession)
# import pdb; pdb.set_trace()
if result == 'not found':
result = get_taxonomy(protein_conn, accession)
with open(stats, 'w', encoding='utf-8')as stat_file:
writer = csv.writer(stat_file, delimiter='\t')
writer.writerow(['accession','taxid', 'node', 'name'])
count = 0
for record in SeqIO.parse(nt, 'fasta'):
count += 1
accession = record.id.split('.')[0]
result = get_taxonomy(taxonomy_conn, accession)
# import pdb; pdb.set_trace()
if result == 'not found':
result = get_taxonomy(dead_conn, accession)
result = get_taxonomy(protein_conn, accession)
if result == 'not found':
raise f'{accession} Not Found'
if str(result[1]) in black_dict.keys():
black_stats[accession] = result[1], black_dict[str(result[1])][0], black_dict[str(result[1])][1]
else:
filtered.write(">%s\n%s\n" % (record.id, record.seq))

# if len(black_stats) == 10000:
# return black_stats

def write_blackstats(blackstats: dict, stats:str):
"""Write Blacklist stats
"""

with open(stats, 'w', encoding='utf-8')as stat_file:
writer = csv.writer(stat_file, delimiter='\t')
writer.writerow(['accession','taxid', 'node', 'name'])
for key, value in blackstats.items():
writer.writerow([key, value[0], value[1], value[2]])
result = get_taxonomy(dead_conn, accession)
if result == 'not found':
raise f'{accession} Not Found'
if str(result[1]) in black_dict.keys():
# print(accession, '\t', result[1] , black_dict[str(result[1])][0], black_dict[str(result[1])][1] )
# black_stats[accession] = result[1], black_dict[str(result[1])][0], black_dict[str(result[1])][1]
writer.writerow([accession, result[1], black_dict[str(result[1])][0], black_dict[str(result[1])][1]])
else:
filtered.write(">%s\n%s\n" % (record.id, record.seq))

def main():
"""Main Function"""
Expand All @@ -211,8 +201,8 @@ def main():
options = usr_args()
protein_conn, dead_conn, taxonomy_conn = create_connection(options.database)
black_dict = read_blacklist(options.blacklist)
blackstats = filter_nt(protein_conn, dead_conn, taxonomy_conn, options.nt, black_dict, options.output)
write_blackstats(blackstats, options.stats)
blackstats = filter_nt(protein_conn, dead_conn, taxonomy_conn, options.nt, black_dict, options.output, options.stats)
# write_blackstats(blackstats, options.stats)

if __name__ == '__main__':
main()

0 comments on commit 0335d28

Please sign in to comment.