Commit 885965e
Removed usage of blastdb_aliastool to convert text file with list of IDs for BLAST v5 databases. Switched BLAST databases to v4 to improve performance. Improved messages about filename checks.
rfm-targa committed Jul 11, 2024
1 parent 91715f9 commit 885965e
Showing 12 changed files with 116 additions and 210 deletions.
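
Context for the change: with BLAST v5 databases (the default in recent BLAST+ releases), the plain-text seqid lists written by chewBBACA had to be converted to a binary format with blastdb_aliastool before they could be passed to -seqidlist; building v4 databases removes that extra step. The sketch below illustrates the idea with direct subprocess calls and hypothetical file names; it is not chewBBACA's blast_wrapper API.

    # Illustrative sketch only; executable paths and file names are placeholders.
    import subprocess

    def make_v4_protein_db(makeblastdb_path, fasta_file, db_path):
        # Build a version 4 database so a plain-text seqid list works with -seqidlist.
        subprocess.run([makeblastdb_path, '-in', fasta_file, '-out', db_path,
                        '-dbtype', 'prot', '-parse_seqids',
                        '-blastdb_version', '4'], check=True)

    def blastp_subset(blastp_path, db_path, query_file, seqids_file, output_file):
        # Restrict the search to the seqids listed in the plain-text file.
        subprocess.run([blastp_path, '-db', db_path, '-query', query_file,
                        '-seqidlist', seqids_file, '-out', output_file,
                        '-outfmt', '6 qseqid sseqid score'], check=True)

    # Flow removed by this commit (only needed for v5 databases):
    #   blastdb_aliastool -seqid_file_in ids.txt -seqid_file_out ids.txt.bin
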
36 changes: 4 additions & 32 deletions CHEWBBACA/AlleleCall/allele_call.py
@@ -30,7 +30,6 @@
sequence_manipulation as sm,
iterables_manipulation as im,
multiprocessing_operations as mo)
- from utils.parameters_validation import get_blast_version
except ModuleNotFoundError:
from CHEWBBACA.utils import (constants as ct,
blast_wrapper as bw,
@@ -42,7 +41,6 @@
sequence_manipulation as sm,
iterables_manipulation as im,
multiprocessing_operations as mo)
- from CHEWBBACA.utils.parameters_validation import get_blast_version


def compute_loci_modes(loci_files, output_file):
@@ -1751,8 +1749,7 @@ def create_missing_fasta(class_files, fasta_file, input_map, dna_hashtable,

def select_representatives(representative_candidates, locus, fasta_file,
iteration, output_directory, blastp_path,
- blast_db, blast_score_ratio, threads,
- blastdb_aliastool_path):
+ blast_db, blast_score_ratio, threads):
"""Select new representative alleles for a locus.
Parameters
@@ -1777,10 +1774,6 @@ def select_representatives(representative_candidates, locus, fasta_file,
BLAST Score Ratio value.
threads : int
Number of threads passed to BLAST.
- blastdb_aliastool_path : str or None
-     Path to the `blastdb_aliastool` executable if it is
-     necessary to convert the list of seqids to binary file
-     format, None otherwise.
Returns
-------
@@ -1795,13 +1788,6 @@ def select_representatives(representative_candidates, locus, fasta_file,
ids_file = fo.join_paths(output_directory,
['{0}_candidates_ids_{1}.fasta'.format(locus, iteration)])
fo.write_lines(list(representative_candidates.keys()), ids_file)
- # Convert to binary format if BLAST>=2.10
- if blastdb_aliastool_path is not None:
-     binary_file = f'{ids_file}.bin'
-     blastp_std = bw.run_blastdb_aliastool(blastdb_aliastool_path,
-                                           ids_file,
-                                           binary_file)
-     ids_file = binary_file

# BLASTp to compare all candidates
blast_output = fo.join_paths(output_directory,
@@ -2287,12 +2273,6 @@ def allele_calling(fasta_files, schema_directory, temp_directory,
# Define BLASTp and makeblastdb paths
blastp_path = fo.join_paths(config['BLAST path'], [ct.BLASTP_ALIAS])
makeblastdb_path = fo.join_paths(config['BLAST path'], [ct.MAKEBLASTDB_ALIAS])
- blast_version = get_blast_version(config['BLAST path'])
- # Determine if it is necessary to run blastdb_aliastool to convert
- # accession list to binary format based on BLAST version being >2.9
- blastdb_aliastool_path = None
- if blast_version['MINOR'] > ct.BLAST_MINOR:
-     blastdb_aliastool_path = fo.join_paths(config['BLAST path'], [ct.BLASTDB_ALIASTOOL_ALIAS])

# Concatenate representative FASTA files
concat_reps = fo.join_paths(reps_protein_dir, ['loci_to_call_translated_representatives.fasta'])
@@ -2313,8 +2293,7 @@ def allele_calling(fasta_files, schema_directory, temp_directory,
self_scores = cf.determine_self_scores(concat_full_reps, self_score_dir,
makeblastdb_path, blastp_path,
'prot',
- config['CPU cores'],
- blastdb_aliastool_path)
+ config['CPU cores'])
fo.pickle_dumper(self_scores, self_score_file)
else:
self_scores = fo.pickle_loader(self_score_file)
@@ -2368,7 +2347,6 @@ def allele_calling(fasta_files, schema_directory, temp_directory,
clustering_dir, blastp_path,
makeblastdb_path,
config['CPU cores'],
- blastdb_aliastool_path,
True)

blast_files = im.flatten_list(blast_results)
@@ -2499,12 +2477,6 @@ def allele_calling(fasta_files, schema_directory, temp_directory,
# Create text file with unclassified seqids
remaining_seqids_file = fo.join_paths(iteration_directory, ['unclassified_seqids_{0}.txt'.format(iteration)])
fo.write_lines(unclassified_seqids, remaining_seqids_file)
- if blastdb_aliastool_path is not None:
-     binary_file = f'{remaining_seqids_file}.bin'
-     blastp_std = bw.run_blastdb_aliastool(blastdb_aliastool_path,
-                                           remaining_seqids_file,
-                                           binary_file)
-     remaining_seqids_file = binary_file

# BLAST representatives against unclassified sequences
# Iterative process until no more sequences are classified
@@ -2660,7 +2632,7 @@ def allele_calling(fasta_files, schema_directory, temp_directory,
if len(v) > 1:
representative_inputs.append([current_candidates, k, fasta_file,
iteration, blast_selection_dir, blastp_path,
- blast_db, config['BLAST Score Ratio'], 1, blastdb_aliastool_path,
+ blast_db, config['BLAST Score Ratio'], 1,
select_representatives])
# Single candidate
else:
@@ -2715,7 +2687,7 @@ def allele_calling(fasta_files, schema_directory, temp_directory,
fo.create_directory(candidates_blast_dir)
new_self_scores = cf.determine_self_scores(concat_repy, candidates_blast_dir,
makeblastdb_path, blastp_path,
- 'prot', config['CPU cores'], blastdb_aliastool_path)
+ 'prot', config['CPU cores'])

# This includes self-score for candidates that are not added
# (e.g. classification changes due to multiple matches)
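
With the conversion step gone, callers in this module drop the trailing blastdb_aliastool_path argument. A hedged usage sketch of the updated select_representatives signature shown above; all values are placeholders, the keyword names simply mirror the parameters in the diff, and the import assumes an installed chewBBACA package.

    # Hypothetical call after this commit; paths and values are placeholders.
    from CHEWBBACA.AlleleCall.allele_call import select_representatives

    candidates = {'seqid_1': 'genome_1', 'seqid_2': 'genome_2'}  # placeholder mapping; real structure may differ
    selected = select_representatives(
        representative_candidates=candidates,
        locus='locus-00001',
        fasta_file='/tmp/locus-00001_proteins.fasta',
        iteration=1,
        output_directory='/tmp/blast_selection',
        blastp_path='/usr/bin/blastp',
        blast_db='/tmp/blast_db/unclassified_proteins',
        blast_score_ratio=0.6,
        threads=1)
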
2 changes: 1 addition & 1 deletion CHEWBBACA/CHEWBBACA_NS/download_schema.py
@@ -508,7 +508,7 @@ def main(species_id, schema_id, download_folder, cpu_cores,
fo.create_directory(schema_path_short)

loci_list = fo.join_paths(schema_path, [ct.LOCI_LIST])
- loci_list = pv.check_input_type(download_folder, loci_list)
+ loci_list, total_loci = pv.check_input_type(download_folder, loci_list)

# Determine representatives and create schema
# Do not apply minimum length and size threshold values
2 changes: 1 addition & 1 deletion CHEWBBACA/CHEWBBACA_NS/synchronize_schema.py
@@ -990,7 +990,7 @@ def main(schema_directory, cpu_cores, nomenclature_server,
schema_short_directory = fo.join_paths(schema_directory, ['short'])
if attributed > 0 or count > 0:
loci_list = fo.join_paths(schema_directory, [ct.LOCI_LIST])
- loci_list = pv.check_input_type(temp_dir, loci_list)
+ loci_list, total_loci = pv.check_input_type(temp_dir, loci_list)
adapt_schema.main(loci_list, [schema_directory, schema_short_directory],
cpu_cores, float(schema_params['bsr'][0]),
0, 11, None, blast_path)
33 changes: 2 additions & 31 deletions CHEWBBACA/CreateSchema/create_schema.py
@@ -28,7 +28,6 @@
sequence_manipulation as sm,
iterables_manipulation as im,
multiprocessing_operations as mo)
- from utils.parameters_validation import get_blast_version
except ModuleNotFoundError:
from CHEWBBACA.utils import (constants as ct,
blast_wrapper as bw,
@@ -38,7 +37,6 @@
sequence_manipulation as sm,
iterables_manipulation as im,
multiprocessing_operations as mo)
- from CHEWBBACA.utils.parameters_validation import get_blast_version


def create_schema_structure(schema_seed_fasta, output_directory, schema_name):
@@ -306,21 +304,14 @@ def create_schema_seed(fasta_files, output_directory, schema_name, ptf_path,
# Define BLASTp and makeblastdb paths
blastp_path = fo.join_paths(blast_path, [ct.BLASTP_ALIAS])
makeblastdb_path = fo.join_paths(blast_path, [ct.MAKEBLASTDB_ALIAS])
- blast_version = get_blast_version(blast_path)
- # Determine if it is necessary to run blastdb_aliastool to convert
- # accession list to binary format based on BLAST version being >2.9
- blastdb_aliastool_path = None
- if blast_version['MINOR'] > ct.BLAST_MINOR:
-     blastdb_aliastool_path = fo.join_paths(blast_path, [ct.BLASTDB_ALIASTOOL_ALIAS])

# All-vs-all BLASTp per cluster
if len(clusters) > 0:
print(f'Clusters to BLAST: {len(clusters)}')
print('Performing all-vs-all BLASTp per cluster...')
blast_results, blast_results_dir = cf.blast_clusters(clusters, representative_pfasta,
clustering_dir, blastp_path,
- makeblastdb_path, cpu_cores,
- blastdb_aliastool_path)
+ makeblastdb_path, cpu_cores)

blast_files = im.flatten_list(blast_results)

@@ -481,25 +472,6 @@ def main(input_files, output_directory, schema_name, ptf_path,
If provided, intermediate files generated during process
execution are not removed at the end.
"""
- print(f'Prodigal training file: {ptf_path}')
- print(f'Prodigal mode: {prodigal_mode}')
- print(f'CPU cores: {cpu_cores}')
- print(f'BLAST Score Ratio: {blast_score_ratio}')
- print(f'Translation table: {translation_table}')
- print(f'Minimum sequence length: {minimum_length}')
- print(f'Size threshold: {size_threshold}')
- print(f'Word size: {word_size}')
- print(f'Window size: {window_size}')
- print(f'Clustering similarity: {clustering_sim}')
- print(f'Representative filter: {representative_filter}')
- print(f'Intra-cluster filter: {intra_filter}')
-
- if prodigal_mode == 'meta' and ptf_path is not None:
-     print('Prodigal mode is set to "meta". Will add training file to '
-           'the schema, but will not use it for gene prediction during '
-           'schema creation.')
-     ptf_path = None

# Read file with paths to input files
input_files = fo.read_lines(input_files, strip=True)

@@ -517,5 +489,4 @@
if no_cleanup is False:
exists = fo.delete_directory(results[1])

- # Print message about schema that was created
- print('Created schema seed with {0} loci.'.format(len(results[0])))
+ return len(results[0])
25 changes: 2 additions & 23 deletions CHEWBBACA/PrepExternalSchema/adapt_schema.py
@@ -27,7 +27,6 @@
sequence_manipulation as sm,
iterables_manipulation as im,
multiprocessing_operations as mo)
- from utils.parameters_validation import get_blast_version
except ModuleNotFoundError:
from CHEWBBACA.utils import (constants as ct,
blast_wrapper as bw,
@@ -36,7 +35,6 @@
sequence_manipulation as sm,
iterables_manipulation as im,
multiprocessing_operations as mo)
- from CHEWBBACA.utils.parameters_validation import get_blast_version


def bsr_categorizer(blast_results, representatives,
@@ -169,8 +167,7 @@ def select_candidate(candidates, proteins, seqids,


def adapt_loci(loci, schema_path, schema_short_path, bsr, min_len,
- table_id, size_threshold, blastp_path, makeblastdb_path,
- blastdb_aliastool_path):
+ table_id, size_threshold, blastp_path, makeblastdb_path):
"""Adapts a set of loci from an external schema.
Adapts an external schema for usage with chewBBACA. Removes invalid
@@ -288,12 +285,6 @@ def adapt_loci(loci, schema_path, schema_short_path, bsr, min_len,
# Create file with representative seqid to only compare against self
id_file = fo.join_paths(locus_temp_dir, [f'{seqid}_ids.txt'])
fo.write_lines([seqid], id_file)
- if blastdb_aliastool_path is not None:
-     binary_file = f'{id_file}.bin'
-     blast_std = bw.run_blastdb_aliastool(blastdb_aliastool_path,
-                                          id_file,
-                                          binary_file)
-     id_file = binary_file

rep_blastout = fo.join_paths(locus_temp_dir, [f'{seqid}_blastout.tsv'])
# Cannot get self-alignemnt for some sequences if composition-based stats is enabled
@@ -311,12 +302,6 @@ def adapt_loci(loci, schema_path, schema_short_path, bsr, min_len,
ids_str = im.join_list([str(i) for i in ids_to_blast if i not in representatives], '\n')
ids_file = fo.join_paths(locus_temp_dir, [f'{locus_id}_ids.txt'])
fo.write_to_file(ids_str, ids_file, 'w', '')
- if blastdb_aliastool_path is not None:
-     binary_file = f'{ids_file}.bin'
-     blast_std = bw.run_blastdb_aliastool(blastdb_aliastool_path,
-                                          ids_file,
-                                          binary_file)
-     ids_file = binary_file

# BLAST representatives against non-represented
blast_output = fo.join_paths(locus_temp_dir,
@@ -468,16 +453,10 @@ def main(input_files, output_directories, cpu_cores, blast_score_ratio,
# Add common arguments
blastp_path = os.path.join(blast_path, ct.BLASTP_ALIAS)
makeblastdb_path = os.path.join(blast_path, ct.MAKEBLASTDB_ALIAS)
- blast_version = get_blast_version(blast_path)
- # Determine if it is necessary to run blastdb_aliastool to convert
- # accession list to binary format based on BLAST version being >2.9
- blastdb_aliastool_path = None
- if blast_version['MINOR'] > ct.BLAST_MINOR:
-     blastdb_aliastool_path = fo.join_paths(blast_path, [ct.BLASTDB_ALIASTOOL_ALIAS])
even_loci_groups = [[i, schema_path, schema_short_path,
blast_score_ratio, minimum_length,
translation_table, size_threshold,
- blastp_path, makeblastdb_path, blastdb_aliastool_path,
+ blastp_path, makeblastdb_path,
adapt_loci] for i in even_loci_groups]

print('Adapting {0} loci...\n'.format(len(genes_list)))
13 changes: 3 additions & 10 deletions CHEWBBACA/UniprotFinder/annotate_schema.py
@@ -200,21 +200,14 @@ def proteome_annotations(schema_directory, temp_directory, taxa,
print('\nDetermining self-score of representatives...', end='')
blastp_path = os.path.join(blast_path, ct.BLASTP_ALIAS)
makeblastdb_path = os.path.join(blast_path, ct.MAKEBLASTDB_ALIAS)
- blast_version = pv.get_blast_version(blast_path)
- # Determine if it is necessary to run blastdb_aliastool to convert
- # accession list to binary format based on BLAST version being >2.9
- blastdb_aliastool_path = None
- if blast_version['MINOR'] > ct.BLAST_MINOR:
-     blastdb_aliastool_path = fo.join_paths(blast_path, [ct.BLASTDB_ALIASTOOL_ALIAS])

self_scores = cf.determine_self_scores(reps_concat, temp_directory,
- makeblastdb_path, blastp_path, 'prot', cpu_cores, blastdb_aliastool_path)
+ makeblastdb_path, blastp_path, 'prot', cpu_cores)
print('done.')

# create BLASTdb with proteome sequences
proteome_blastdb = fo.join_paths(proteomes_directory,
['proteomes_db'])
- db_std = bw.make_blast_db('makeblastdb', proteomes_concat, proteome_blastdb, 'prot')
+ db_std = bw.make_blast_db(makeblastdb_path, proteomes_concat, proteome_blastdb, 'prot')

# BLASTp to determine annotations
blast_inputs = [[blastp_path, proteome_blastdb, file, file+'_blastout.tsv',
@@ -382,7 +375,7 @@ def main(schema_directory, output_directory, genes_list, protein_table,
loci_list = pv.validate_loci_list(genes_list, loci_list, schema_directory)
# Working with the whole schema
else:
- loci_list = pv.check_input_type(schema_directory, loci_list)
+ loci_list, total_loci = pv.check_input_type(schema_directory, loci_list)

loci_paths = fo.read_lines(loci_list)
loci_basenames = [fo.file_basename(locus, False) for locus in loci_paths]
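
One hunk above also replaces the hard-coded 'makeblastdb' string with the resolved makeblastdb_path, so the proteome database is built with the executable from the configured BLAST directory rather than whatever is first on PATH. A minimal sketch of a wrapper with that shape, assuming subprocess-based execution; this is illustrative only, not the actual blast_wrapper.make_blast_db implementation.

    # Illustrative sketch; the real blast_wrapper module may differ.
    import subprocess

    def make_blast_db(makeblastdb_path, input_fasta, output_path, db_type):
        # Run the explicitly resolved makeblastdb executable instead of relying on PATH.
        command = [makeblastdb_path, '-in', input_fasta, '-out', output_path,
                   '-dbtype', db_type, '-parse_seqids', '-blastdb_version', '4']
        result = subprocess.run(command, capture_output=True, text=True)
        # Return stdout and stderr so callers (e.g. the db_std assignment above)
        # can inspect makeblastdb's messages.
        return [result.stdout, result.stderr]
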