Commit 885965e
Removed usage of blastdb_aliastool to convert text file with list of IDs for BLAST v5 databases. Switched BLAST databases to v4 to improve performance. Improved messages about filename checks.
rfm-targa committed Jul 11, 2024
1 parent 91715f9 commit 885965e
Showing 12 changed files with 116 additions and 210 deletions.
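
Context for the change: with BLAST v5 databases (the default in recent BLAST+ releases), the plain-text seqid lists written by chewBBACA had to be converted to a binary format with blastdb_aliastool before they could be passed to -seqidlist; building v4 databases removes that extra step. The sketch below illustrates the idea with direct subprocess calls and hypothetical file names; it is not chewBBACA's blast_wrapper API.

    # Illustrative sketch only; executable paths and file names are placeholders.
    import subprocess

    def make_v4_protein_db(makeblastdb_path, fasta_file, db_path):
        # Build a version 4 database so a plain-text seqid list works with -seqidlist.
        subprocess.run([makeblastdb_path, '-in', fasta_file, '-out', db_path,
                        '-dbtype', 'prot', '-parse_seqids',
                        '-blastdb_version', '4'], check=True)

    def blastp_subset(blastp_path, db_path, query_file, seqids_file, output_file):
        # Restrict the search to the seqids listed in the plain-text file.
        subprocess.run([blastp_path, '-db', db_path, '-query', query_file,
                        '-seqidlist', seqids_file, '-out', output_file,
                        '-outfmt', '6 qseqid sseqid score'], check=True)

    # Flow removed by this commit (only needed for v5 databases):
    #   blastdb_aliastool -seqid_file_in ids.txt -seqid_file_out ids.txt.bin
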
36 changes: 4 additions & 32 deletions CHEWBBACA/AlleleCall/allele_call.py
@@ -30,7 +30,6 @@
sequence_manipulation as sm,
iterables_manipulation as im,
multiprocessing_operations as mo)
- from utils.parameters_validation import get_blast_version
except ModuleNotFoundError:
from CHEWBBACA.utils import (constants as ct,
blast_wrapper as bw,
@@ -42,7 +41,6 @@
sequence_manipulation as sm,
iterables_manipulation as im,
multiprocessing_operations as mo)
- from CHEWBBACA.utils.parameters_validation import get_blast_version


def compute_loci_modes(loci_files, output_file):
@@ -1751,8 +1749,7 @@ def create_missing_fasta(class_files, fasta_file, input_map, dna_hashtable,

def select_representatives(representative_candidates, locus, fasta_file,
iteration, output_directory, blastp_path,
- blast_db, blast_score_ratio, threads,
- blastdb_aliastool_path):
+ blast_db, blast_score_ratio, threads):
"""Select new representative alleles for a locus.
Parameters
@@ -1777,10 +1774,6 @@ def select_representatives(representative_candidates, locus, fasta_file,
BLAST Score Ratio value.
threads : int
Number of threads passed to BLAST.
- blastdb_aliastool_path : str or None
-     Path to the `blastdb_aliastool` executable if it is
-     necessary to convert the list of seqids to binary file
-     format, None otherwise.
Returns
-------
@@ -1795,13 +1788,6 @@ def select_representatives(representative_candidates, locus, fasta_file,
ids_file = fo.join_paths(output_directory,
['{0}_candidates_ids_{1}.fasta'.format(locus, iteration)])
fo.write_lines(list(representative_candidates.keys()), ids_file)
- # Convert to binary format if BLAST>=2.10
- if blastdb_aliastool_path is not None:
-     binary_file = f'{ids_file}.bin'
-     blastp_std = bw.run_blastdb_aliastool(blastdb_aliastool_path,
-                                           ids_file,
-                                           binary_file)
-     ids_file = binary_file

# BLASTp to compare all candidates
blast_output = fo.join_paths(output_directory,
@@ -2287,12 +2273,6 @@ def allele_calling(fasta_files, schema_directory, temp_directory,
# Define BLASTp and makeblastdb paths
blastp_path = fo.join_paths(config['BLAST path'], [ct.BLASTP_ALIAS])
makeblastdb_path = fo.join_paths(config['BLAST path'], [ct.MAKEBLASTDB_ALIAS])
- blast_version = get_blast_version(config['BLAST path'])
- # Determine if it is necessary to run blastdb_aliastool to convert
- # accession list to binary format based on BLAST version being >2.9
- blastdb_aliastool_path = None
- if blast_version['MINOR'] > ct.BLAST_MINOR:
-     blastdb_aliastool_path = fo.join_paths(config['BLAST path'], [ct.BLASTDB_ALIASTOOL_ALIAS])

# Concatenate representative FASTA files
concat_reps = fo.join_paths(reps_protein_dir, ['loci_to_call_translated_representatives.fasta'])
@@ -2313,8 +2293,7 @@ def allele_calling(fasta_files, schema_directory, temp_directory,
self_scores = cf.determine_self_scores(concat_full_reps, self_score_dir,
makeblastdb_path, blastp_path,
'prot',
- config['CPU cores'],
- blastdb_aliastool_path)
+ config['CPU cores'])
fo.pickle_dumper(self_scores, self_score_file)
else:
self_scores = fo.pickle_loader(self_score_file)
@@ -2368,7 +2347,6 @@ def allele_calling(fasta_files, schema_directory, temp_directory,
clustering_dir, blastp_path,
makeblastdb_path,
config['CPU cores'],
- blastdb_aliastool_path,
True)

blast_files = im.flatten_list(blast_results)
@@ -2499,12 +2477,6 @@ def allele_calling(fasta_files, schema_directory, temp_directory,
# Create text file with unclassified seqids
remaining_seqids_file = fo.join_paths(iteration_directory, ['unclassified_seqids_{0}.txt'.format(iteration)])
fo.write_lines(unclassified_seqids, remaining_seqids_file)
- if blastdb_aliastool_path is not None:
-     binary_file = f'{remaining_seqids_file}.bin'
-     blastp_std = bw.run_blastdb_aliastool(blastdb_aliastool_path,
-                                           remaining_seqids_file,
-                                           binary_file)
-     remaining_seqids_file = binary_file

# BLAST representatives against unclassified sequences
# Iterative process until no more sequences are classified
@@ -2660,7 +2632,7 @@ def allele_calling(fasta_files, schema_directory, temp_directory,
if len(v) > 1:
representative_inputs.append([current_candidates, k, fasta_file,
iteration, blast_selection_dir, blastp_path,
- blast_db, config['BLAST Score Ratio'], 1, blastdb_aliastool_path,
+ blast_db, config['BLAST Score Ratio'], 1,
select_representatives])
# Single candidate
else:
@@ -2715,7 +2687,7 @@ def allele_calling(fasta_files, schema_directory, temp_directory,
fo.create_directory(candidates_blast_dir)
new_self_scores = cf.determine_self_scores(concat_repy, candidates_blast_dir,
makeblastdb_path, blastp_path,
- 'prot', config['CPU cores'], blastdb_aliastool_path)
+ 'prot', config['CPU cores'])

# This includes self-score for candidates that are not added
# (e.g. classification changes due to multiple matches)
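
With the conversion step gone, callers in this module drop the trailing blastdb_aliastool_path argument. A hedged usage sketch of the updated select_representatives signature shown above; all values are placeholders, the keyword names simply mirror the parameters in the diff, and the import assumes an installed chewBBACA package.

    # Hypothetical call after this commit; paths and values are placeholders.
    from CHEWBBACA.AlleleCall.allele_call import select_representatives

    candidates = {'seqid_1': 'genome_1', 'seqid_2': 'genome_2'}  # placeholder mapping; real structure may differ
    selected = select_representatives(
        representative_candidates=candidates,
        locus='locus-00001',
        fasta_file='/tmp/locus-00001_proteins.fasta',
        iteration=1,
        output_directory='/tmp/blast_selection',
        blastp_path='/usr/bin/blastp',
        blast_db='/tmp/blast_db/unclassified_proteins',
        blast_score_ratio=0.6,
        threads=1)
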
2 changes: 1 addition & 1 deletion CHEWBBACA/CHEWBBACA_NS/download_schema.py
@@ -508,7 +508,7 @@ def main(species_id, schema_id, download_folder, cpu_cores,
fo.create_directory(schema_path_short)

loci_list = fo.join_paths(schema_path, [ct.LOCI_LIST])
- loci_list = pv.check_input_type(download_folder, loci_list)
+ loci_list, total_loci = pv.check_input_type(download_folder, loci_list)

# Determine representatives and create schema
# Do not apply minimum length and size threshold values
2 changes: 1 addition & 1 deletion CHEWBBACA/CHEWBBACA_NS/synchronize_schema.py
@@ -990,7 +990,7 @@ def main(schema_directory, cpu_cores, nomenclature_server,
schema_short_directory = fo.join_paths(schema_directory, ['short'])
if attributed > 0 or count > 0:
loci_list = fo.join_paths(schema_directory, [ct.LOCI_LIST])
- loci_list = pv.check_input_type(temp_dir, loci_list)
+ loci_list, total_loci = pv.check_input_type(temp_dir, loci_list)
adapt_schema.main(loci_list, [schema_directory, schema_short_directory],
cpu_cores, float(schema_params['bsr'][0]),
0, 11, None, blast_path)
33 changes: 2 additions & 31 deletions CHEWBBACA/CreateSchema/create_schema.py
@@ -28,7 +28,6 @@
sequence_manipulation as sm,
iterables_manipulation as im,
multiprocessing_operations as mo)
- from utils.parameters_validation import get_blast_version
except ModuleNotFoundError:
from CHEWBBACA.utils import (constants as ct,
blast_wrapper as bw,
@@ -38,7 +37,6 @@
sequence_manipulation as sm,
iterables_manipulation as im,
multiprocessing_operations as mo)
- from CHEWBBACA.utils.parameters_validation import get_blast_version


def create_schema_structure(schema_seed_fasta, output_directory, schema_name):
@@ -306,21 +304,14 @@ def create_schema_seed(fasta_files, output_directory, schema_name, ptf_path,
# Define BLASTp and makeblastdb paths
blastp_path = fo.join_paths(blast_path, [ct.BLASTP_ALIAS])
makeblastdb_path = fo.join_paths(blast_path, [ct.MAKEBLASTDB_ALIAS])
- blast_version = get_blast_version(blast_path)
- # Determine if it is necessary to run blastdb_aliastool to convert
- # accession list to binary format based on BLAST version being >2.9
- blastdb_aliastool_path = None
- if blast_version['MINOR'] > ct.BLAST_MINOR:
-     blastdb_aliastool_path = fo.join_paths(blast_path, [ct.BLASTDB_ALIASTOOL_ALIAS])

# All-vs-all BLASTp per cluster
if len(clusters) > 0:
print(f'Clusters to BLAST: {len(clusters)}')
print('Performing all-vs-all BLASTp per cluster...')
blast_results, blast_results_dir = cf.blast_clusters(clusters, representative_pfasta,
clustering_dir, blastp_path,
- makeblastdb_path, cpu_cores,
- blastdb_aliastool_path)
+ makeblastdb_path, cpu_cores)

blast_files = im.flatten_list(blast_results)

@@ -481,25 +472,6 @@ def main(input_files, output_directory, schema_name, ptf_path,
If provided, intermediate files generated during process
execution are not removed at the end.
"""
- print(f'Prodigal training file: {ptf_path}')
- print(f'Prodigal mode: {prodigal_mode}')
- print(f'CPU cores: {cpu_cores}')
- print(f'BLAST Score Ratio: {blast_score_ratio}')
- print(f'Translation table: {translation_table}')
- print(f'Minimum sequence length: {minimum_length}')
- print(f'Size threshold: {size_threshold}')
- print(f'Word size: {word_size}')
- print(f'Window size: {window_size}')
- print(f'Clustering similarity: {clustering_sim}')
- print(f'Representative filter: {representative_filter}')
- print(f'Intra-cluster filter: {intra_filter}')
-
- if prodigal_mode == 'meta' and ptf_path is not None:
-     print('Prodigal mode is set to "meta". Will add training file to '
-           'the schema, but will not use it for gene prediction during '
-           'schema creation.')
-     ptf_path = None

# Read file with paths to input files
input_files = fo.read_lines(input_files, strip=True)

@@ -517,5 +489,4 @@
if no_cleanup is False:
exists = fo.delete_directory(results[1])

- # Print message about schema that was created
- print('Created schema seed with {0} loci.'.format(len(results[0])))
+ return len(results[0])
25 changes: 2 additions & 23 deletions CHEWBBACA/PrepExternalSchema/adapt_schema.py
@@ -27,7 +27,6 @@
sequence_manipulation as sm,
iterables_manipulation as im,
multiprocessing_operations as mo)
- from utils.parameters_validation import get_blast_version
except ModuleNotFoundError:
from CHEWBBACA.utils import (constants as ct,
blast_wrapper as bw,
@@ -36,7 +35,6 @@
sequence_manipulation as sm,
iterables_manipulation as im,
multiprocessing_operations as mo)
- from CHEWBBACA.utils.parameters_validation import get_blast_version


def bsr_categorizer(blast_results, representatives,
@@ -169,8 +167,7 @@ def select_candidate(candidates, proteins, seqids,


def adapt_loci(loci, schema_path, schema_short_path, bsr, min_len,
- table_id, size_threshold, blastp_path, makeblastdb_path,
- blastdb_aliastool_path):
+ table_id, size_threshold, blastp_path, makeblastdb_path):
"""Adapts a set of loci from an external schema.
Adapts an external schema for usage with chewBBACA. Removes invalid
@@ -288,12 +285,6 @@ def adapt_loci(loci, schema_path, schema_short_path, bsr, min_len,
# Create file with representative seqid to only compare against self
id_file = fo.join_paths(locus_temp_dir, [f'{seqid}_ids.txt'])
fo.write_lines([seqid], id_file)
- if blastdb_aliastool_path is not None:
-     binary_file = f'{id_file}.bin'
-     blast_std = bw.run_blastdb_aliastool(blastdb_aliastool_path,
-                                          id_file,
-                                          binary_file)
-     id_file = binary_file

rep_blastout = fo.join_paths(locus_temp_dir, [f'{seqid}_blastout.tsv'])
# Cannot get self-alignemnt for some sequences if composition-based stats is enabled
@@ -311,12 +302,6 @@ def adapt_loci(loci, schema_path, schema_short_path, bsr, min_len,
ids_str = im.join_list([str(i) for i in ids_to_blast if i not in representatives], '\n')
ids_file = fo.join_paths(locus_temp_dir, [f'{locus_id}_ids.txt'])
fo.write_to_file(ids_str, ids_file, 'w', '')
- if blastdb_aliastool_path is not None:
-     binary_file = f'{ids_file}.bin'
-     blast_std = bw.run_blastdb_aliastool(blastdb_aliastool_path,
-                                          ids_file,
-                                          binary_file)
-     ids_file = binary_file

# BLAST representatives against non-represented
blast_output = fo.join_paths(locus_temp_dir,
@@ -468,16 +453,10 @@ def main(input_files, output_directories, cpu_cores, blast_score_ratio,
# Add common arguments
blastp_path = os.path.join(blast_path, ct.BLASTP_ALIAS)
makeblastdb_path = os.path.join(blast_path, ct.MAKEBLASTDB_ALIAS)
- blast_version = get_blast_version(blast_path)
- # Determine if it is necessary to run blastdb_aliastool to convert
- # accession list to binary format based on BLAST version being >2.9
- blastdb_aliastool_path = None
- if blast_version['MINOR'] > ct.BLAST_MINOR:
-     blastdb_aliastool_path = fo.join_paths(blast_path, [ct.BLASTDB_ALIASTOOL_ALIAS])
even_loci_groups = [[i, schema_path, schema_short_path,
blast_score_ratio, minimum_length,
translation_table, size_threshold,
- blastp_path, makeblastdb_path, blastdb_aliastool_path,
+ blastp_path, makeblastdb_path,
adapt_loci] for i in even_loci_groups]

print('Adapting {0} loci...\n'.format(len(genes_list)))
13 changes: 3 additions & 10 deletions CHEWBBACA/UniprotFinder/annotate_schema.py
@@ -200,21 +200,14 @@ def proteome_annotations(schema_directory, temp_directory, taxa,
print('\nDetermining self-score of representatives...', end='')
blastp_path = os.path.join(blast_path, ct.BLASTP_ALIAS)
makeblastdb_path = os.path.join(blast_path, ct.MAKEBLASTDB_ALIAS)
- blast_version = pv.get_blast_version(blast_path)
- # Determine if it is necessary to run blastdb_aliastool to convert
- # accession list to binary format based on BLAST version being >2.9
- blastdb_aliastool_path = None
- if blast_version['MINOR'] > ct.BLAST_MINOR:
-     blastdb_aliastool_path = fo.join_paths(blast_path, [ct.BLASTDB_ALIASTOOL_ALIAS])

self_scores = cf.determine_self_scores(reps_concat, temp_directory,
- makeblastdb_path, blastp_path, 'prot', cpu_cores, blastdb_aliastool_path)
+ makeblastdb_path, blastp_path, 'prot', cpu_cores)
print('done.')

# create BLASTdb with proteome sequences
proteome_blastdb = fo.join_paths(proteomes_directory,
['proteomes_db'])
- db_std = bw.make_blast_db('makeblastdb', proteomes_concat, proteome_blastdb, 'prot')
+ db_std = bw.make_blast_db(makeblastdb_path, proteomes_concat, proteome_blastdb, 'prot')

# BLASTp to determine annotations
blast_inputs = [[blastp_path, proteome_blastdb, file, file+'_blastout.tsv',
@@ -382,7 +375,7 @@ def main(schema_directory, output_directory, genes_list, protein_table,
loci_list = pv.validate_loci_list(genes_list, loci_list, schema_directory)
# Working with the whole schema
else:
- loci_list = pv.check_input_type(schema_directory, loci_list)
+ loci_list, total_loci = pv.check_input_type(schema_directory, loci_list)

loci_paths = fo.read_lines(loci_list)
loci_basenames = [fo.file_basename(locus, False) for locus in loci_paths]
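
One hunk above also replaces the hard-coded 'makeblastdb' string with the resolved makeblastdb_path, so the proteome database is built with the executable from the configured BLAST directory rather than whatever is first on PATH. A minimal sketch of a wrapper with that shape, assuming subprocess-based execution; this is illustrative only, not the actual blast_wrapper.make_blast_db implementation.

    # Illustrative sketch; the real blast_wrapper module may differ.
    import subprocess

    def make_blast_db(makeblastdb_path, input_fasta, output_path, db_type):
        # Run the explicitly resolved makeblastdb executable instead of relying on PATH.
        command = [makeblastdb_path, '-in', input_fasta, '-out', output_path,
                   '-dbtype', db_type, '-parse_seqids', '-blastdb_version', '4']
        result = subprocess.run(command, capture_output=True, text=True)
        # Return stdout and stderr so callers (e.g. the db_std assignment above)
        # can inspect makeblastdb's messages.
        return [result.stdout, result.stderr]
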