diff --git a/CHEWBBACA/AlleleCall/allele_call.py b/CHEWBBACA/AlleleCall/allele_call.py index 48b7a01c..0ca5e05a 100644 --- a/CHEWBBACA/AlleleCall/allele_call.py +++ b/CHEWBBACA/AlleleCall/allele_call.py @@ -30,7 +30,6 @@ sequence_manipulation as sm, iterables_manipulation as im, multiprocessing_operations as mo) - from utils.parameters_validation import get_blast_version except ModuleNotFoundError: from CHEWBBACA.utils import (constants as ct, blast_wrapper as bw, @@ -42,7 +41,6 @@ sequence_manipulation as sm, iterables_manipulation as im, multiprocessing_operations as mo) - from CHEWBBACA.utils.parameters_validation import get_blast_version def compute_loci_modes(loci_files, output_file): @@ -1751,8 +1749,7 @@ def create_missing_fasta(class_files, fasta_file, input_map, dna_hashtable, def select_representatives(representative_candidates, locus, fasta_file, iteration, output_directory, blastp_path, - blast_db, blast_score_ratio, threads, - blastdb_aliastool_path): + blast_db, blast_score_ratio, threads): """Select new representative alleles for a locus. Parameters @@ -1777,10 +1774,6 @@ def select_representatives(representative_candidates, locus, fasta_file, BLAST Score Ratio value. threads : int Number of threads passed to BLAST. - blastdb_aliastool_path : str or None - Path to the `blastdb_aliastool` executable if it is - necessary to convert the list of seqids to binary file - format, None otherwise. Returns ------- @@ -1795,13 +1788,6 @@ def select_representatives(representative_candidates, locus, fasta_file, ids_file = fo.join_paths(output_directory, ['{0}_candidates_ids_{1}.fasta'.format(locus, iteration)]) fo.write_lines(list(representative_candidates.keys()), ids_file) - # Convert to binary format if BLAST>=2.10 - if blastdb_aliastool_path is not None: - binary_file = f'{ids_file}.bin' - blastp_std = bw.run_blastdb_aliastool(blastdb_aliastool_path, - ids_file, - binary_file) - ids_file = binary_file # BLASTp to compare all candidates blast_output = fo.join_paths(output_directory, @@ -2287,12 +2273,6 @@ def allele_calling(fasta_files, schema_directory, temp_directory, # Define BLASTp and makeblastdb paths blastp_path = fo.join_paths(config['BLAST path'], [ct.BLASTP_ALIAS]) makeblastdb_path = fo.join_paths(config['BLAST path'], [ct.MAKEBLASTDB_ALIAS]) - blast_version = get_blast_version(config['BLAST path']) - # Determine if it is necessary to run blastdb_aliastool to convert - # accession list to binary format based on BLAST version being >2.9 - blastdb_aliastool_path = None - if blast_version['MINOR'] > ct.BLAST_MINOR: - blastdb_aliastool_path = fo.join_paths(config['BLAST path'], [ct.BLASTDB_ALIASTOOL_ALIAS]) # Concatenate representative FASTA files concat_reps = fo.join_paths(reps_protein_dir, ['loci_to_call_translated_representatives.fasta']) @@ -2313,8 +2293,7 @@ def allele_calling(fasta_files, schema_directory, temp_directory, self_scores = cf.determine_self_scores(concat_full_reps, self_score_dir, makeblastdb_path, blastp_path, 'prot', - config['CPU cores'], - blastdb_aliastool_path) + config['CPU cores']) fo.pickle_dumper(self_scores, self_score_file) else: self_scores = fo.pickle_loader(self_score_file) @@ -2368,7 +2347,6 @@ def allele_calling(fasta_files, schema_directory, temp_directory, clustering_dir, blastp_path, makeblastdb_path, config['CPU cores'], - blastdb_aliastool_path, True) blast_files = im.flatten_list(blast_results) @@ -2499,12 +2477,6 @@ def allele_calling(fasta_files, schema_directory, temp_directory, # Create text file with unclassified seqids remaining_seqids_file = fo.join_paths(iteration_directory, ['unclassified_seqids_{0}.txt'.format(iteration)]) fo.write_lines(unclassified_seqids, remaining_seqids_file) - if blastdb_aliastool_path is not None: - binary_file = f'{remaining_seqids_file}.bin' - blastp_std = bw.run_blastdb_aliastool(blastdb_aliastool_path, - remaining_seqids_file, - binary_file) - remaining_seqids_file = binary_file # BLAST representatives against unclassified sequences # Iterative process until no more sequences are classified @@ -2660,7 +2632,7 @@ def allele_calling(fasta_files, schema_directory, temp_directory, if len(v) > 1: representative_inputs.append([current_candidates, k, fasta_file, iteration, blast_selection_dir, blastp_path, - blast_db, config['BLAST Score Ratio'], 1, blastdb_aliastool_path, + blast_db, config['BLAST Score Ratio'], 1, select_representatives]) # Single candidate else: @@ -2715,7 +2687,7 @@ def allele_calling(fasta_files, schema_directory, temp_directory, fo.create_directory(candidates_blast_dir) new_self_scores = cf.determine_self_scores(concat_repy, candidates_blast_dir, makeblastdb_path, blastp_path, - 'prot', config['CPU cores'], blastdb_aliastool_path) + 'prot', config['CPU cores']) # This includes self-score for candidates that are not added # (e.g. classification changes due to multiple matches) diff --git a/CHEWBBACA/CHEWBBACA_NS/download_schema.py b/CHEWBBACA/CHEWBBACA_NS/download_schema.py index 8f2240c3..9e0edcde 100755 --- a/CHEWBBACA/CHEWBBACA_NS/download_schema.py +++ b/CHEWBBACA/CHEWBBACA_NS/download_schema.py @@ -508,7 +508,7 @@ def main(species_id, schema_id, download_folder, cpu_cores, fo.create_directory(schema_path_short) loci_list = fo.join_paths(schema_path, [ct.LOCI_LIST]) - loci_list = pv.check_input_type(download_folder, loci_list) + loci_list, total_loci = pv.check_input_type(download_folder, loci_list) # Determine representatives and create schema # Do not apply minimum length and size threshold values diff --git a/CHEWBBACA/CHEWBBACA_NS/synchronize_schema.py b/CHEWBBACA/CHEWBBACA_NS/synchronize_schema.py index 99bfd011..9478cf53 100755 --- a/CHEWBBACA/CHEWBBACA_NS/synchronize_schema.py +++ b/CHEWBBACA/CHEWBBACA_NS/synchronize_schema.py @@ -990,7 +990,7 @@ def main(schema_directory, cpu_cores, nomenclature_server, schema_short_directory = fo.join_paths(schema_directory, ['short']) if attributed > 0 or count > 0: loci_list = fo.join_paths(schema_directory, [ct.LOCI_LIST]) - loci_list = pv.check_input_type(temp_dir, loci_list) + loci_list, total_loci = pv.check_input_type(temp_dir, loci_list) adapt_schema.main(loci_list, [schema_directory, schema_short_directory], cpu_cores, float(schema_params['bsr'][0]), 0, 11, None, blast_path) diff --git a/CHEWBBACA/CreateSchema/create_schema.py b/CHEWBBACA/CreateSchema/create_schema.py index 56903c0b..73678a98 100644 --- a/CHEWBBACA/CreateSchema/create_schema.py +++ b/CHEWBBACA/CreateSchema/create_schema.py @@ -28,7 +28,6 @@ sequence_manipulation as sm, iterables_manipulation as im, multiprocessing_operations as mo) - from utils.parameters_validation import get_blast_version except ModuleNotFoundError: from CHEWBBACA.utils import (constants as ct, blast_wrapper as bw, @@ -38,7 +37,6 @@ sequence_manipulation as sm, iterables_manipulation as im, multiprocessing_operations as mo) - from CHEWBBACA.utils.parameters_validation import get_blast_version def create_schema_structure(schema_seed_fasta, output_directory, schema_name): @@ -306,12 +304,6 @@ def create_schema_seed(fasta_files, output_directory, schema_name, ptf_path, # Define BLASTp and makeblastdb paths blastp_path = fo.join_paths(blast_path, [ct.BLASTP_ALIAS]) makeblastdb_path = fo.join_paths(blast_path, [ct.MAKEBLASTDB_ALIAS]) - blast_version = get_blast_version(blast_path) - # Determine if it is necessary to run blastdb_aliastool to convert - # accession list to binary format based on BLAST version being >2.9 - blastdb_aliastool_path = None - if blast_version['MINOR'] > ct.BLAST_MINOR: - blastdb_aliastool_path = fo.join_paths(blast_path, [ct.BLASTDB_ALIASTOOL_ALIAS]) # All-vs-all BLASTp per cluster if len(clusters) > 0: @@ -319,8 +311,7 @@ def create_schema_seed(fasta_files, output_directory, schema_name, ptf_path, print('Performing all-vs-all BLASTp per cluster...') blast_results, blast_results_dir = cf.blast_clusters(clusters, representative_pfasta, clustering_dir, blastp_path, - makeblastdb_path, cpu_cores, - blastdb_aliastool_path) + makeblastdb_path, cpu_cores) blast_files = im.flatten_list(blast_results) @@ -481,25 +472,6 @@ def main(input_files, output_directory, schema_name, ptf_path, If provided, intermediate files generated during process execution are not removed at the end. """ - print(f'Prodigal training file: {ptf_path}') - print(f'Prodigal mode: {prodigal_mode}') - print(f'CPU cores: {cpu_cores}') - print(f'BLAST Score Ratio: {blast_score_ratio}') - print(f'Translation table: {translation_table}') - print(f'Minimum sequence length: {minimum_length}') - print(f'Size threshold: {size_threshold}') - print(f'Word size: {word_size}') - print(f'Window size: {window_size}') - print(f'Clustering similarity: {clustering_sim}') - print(f'Representative filter: {representative_filter}') - print(f'Intra-cluster filter: {intra_filter}') - - if prodigal_mode == 'meta' and ptf_path is not None: - print('Prodigal mode is set to "meta". Will add training file to ' - 'the schema, but will not use it for gene prediction during ' - 'schema creation.') - ptf_path = None - # Read file with paths to input files input_files = fo.read_lines(input_files, strip=True) @@ -517,5 +489,4 @@ def main(input_files, output_directory, schema_name, ptf_path, if no_cleanup is False: exists = fo.delete_directory(results[1]) - # Print message about schema that was created - print('Created schema seed with {0} loci.'.format(len(results[0]))) + return len(results[0]) diff --git a/CHEWBBACA/PrepExternalSchema/adapt_schema.py b/CHEWBBACA/PrepExternalSchema/adapt_schema.py index e490d0a8..07578b6c 100644 --- a/CHEWBBACA/PrepExternalSchema/adapt_schema.py +++ b/CHEWBBACA/PrepExternalSchema/adapt_schema.py @@ -27,7 +27,6 @@ sequence_manipulation as sm, iterables_manipulation as im, multiprocessing_operations as mo) - from utils.parameters_validation import get_blast_version except ModuleNotFoundError: from CHEWBBACA.utils import (constants as ct, blast_wrapper as bw, @@ -36,7 +35,6 @@ sequence_manipulation as sm, iterables_manipulation as im, multiprocessing_operations as mo) - from CHEWBBACA.utils.parameters_validation import get_blast_version def bsr_categorizer(blast_results, representatives, @@ -169,8 +167,7 @@ def select_candidate(candidates, proteins, seqids, def adapt_loci(loci, schema_path, schema_short_path, bsr, min_len, - table_id, size_threshold, blastp_path, makeblastdb_path, - blastdb_aliastool_path): + table_id, size_threshold, blastp_path, makeblastdb_path): """Adapts a set of loci from an external schema. Adapts an external schema for usage with chewBBACA. Removes invalid @@ -288,12 +285,6 @@ def adapt_loci(loci, schema_path, schema_short_path, bsr, min_len, # Create file with representative seqid to only compare against self id_file = fo.join_paths(locus_temp_dir, [f'{seqid}_ids.txt']) fo.write_lines([seqid], id_file) - if blastdb_aliastool_path is not None: - binary_file = f'{id_file}.bin' - blast_std = bw.run_blastdb_aliastool(blastdb_aliastool_path, - id_file, - binary_file) - id_file = binary_file rep_blastout = fo.join_paths(locus_temp_dir, [f'{seqid}_blastout.tsv']) # Cannot get self-alignemnt for some sequences if composition-based stats is enabled @@ -311,12 +302,6 @@ def adapt_loci(loci, schema_path, schema_short_path, bsr, min_len, ids_str = im.join_list([str(i) for i in ids_to_blast if i not in representatives], '\n') ids_file = fo.join_paths(locus_temp_dir, [f'{locus_id}_ids.txt']) fo.write_to_file(ids_str, ids_file, 'w', '') - if blastdb_aliastool_path is not None: - binary_file = f'{ids_file}.bin' - blast_std = bw.run_blastdb_aliastool(blastdb_aliastool_path, - ids_file, - binary_file) - ids_file = binary_file # BLAST representatives against non-represented blast_output = fo.join_paths(locus_temp_dir, @@ -468,16 +453,10 @@ def main(input_files, output_directories, cpu_cores, blast_score_ratio, # Add common arguments blastp_path = os.path.join(blast_path, ct.BLASTP_ALIAS) makeblastdb_path = os.path.join(blast_path, ct.MAKEBLASTDB_ALIAS) - blast_version = get_blast_version(blast_path) - # Determine if it is necessary to run blastdb_aliastool to convert - # accession list to binary format based on BLAST version being >2.9 - blastdb_aliastool_path = None - if blast_version['MINOR'] > ct.BLAST_MINOR: - blastdb_aliastool_path = fo.join_paths(blast_path, [ct.BLASTDB_ALIASTOOL_ALIAS]) even_loci_groups = [[i, schema_path, schema_short_path, blast_score_ratio, minimum_length, translation_table, size_threshold, - blastp_path, makeblastdb_path, blastdb_aliastool_path, + blastp_path, makeblastdb_path, adapt_loci] for i in even_loci_groups] print('Adapting {0} loci...\n'.format(len(genes_list))) diff --git a/CHEWBBACA/UniprotFinder/annotate_schema.py b/CHEWBBACA/UniprotFinder/annotate_schema.py index 60162d27..4049f618 100755 --- a/CHEWBBACA/UniprotFinder/annotate_schema.py +++ b/CHEWBBACA/UniprotFinder/annotate_schema.py @@ -200,21 +200,14 @@ def proteome_annotations(schema_directory, temp_directory, taxa, print('\nDetermining self-score of representatives...', end='') blastp_path = os.path.join(blast_path, ct.BLASTP_ALIAS) makeblastdb_path = os.path.join(blast_path, ct.MAKEBLASTDB_ALIAS) - blast_version = pv.get_blast_version(blast_path) - # Determine if it is necessary to run blastdb_aliastool to convert - # accession list to binary format based on BLAST version being >2.9 - blastdb_aliastool_path = None - if blast_version['MINOR'] > ct.BLAST_MINOR: - blastdb_aliastool_path = fo.join_paths(blast_path, [ct.BLASTDB_ALIASTOOL_ALIAS]) - self_scores = cf.determine_self_scores(reps_concat, temp_directory, - makeblastdb_path, blastp_path, 'prot', cpu_cores, blastdb_aliastool_path) + makeblastdb_path, blastp_path, 'prot', cpu_cores) print('done.') # create BLASTdb with proteome sequences proteome_blastdb = fo.join_paths(proteomes_directory, ['proteomes_db']) - db_std = bw.make_blast_db('makeblastdb', proteomes_concat, proteome_blastdb, 'prot') + db_std = bw.make_blast_db(makeblastdb_path, proteomes_concat, proteome_blastdb, 'prot') # BLASTp to determine annotations blast_inputs = [[blastp_path, proteome_blastdb, file, file+'_blastout.tsv', @@ -382,7 +375,7 @@ def main(schema_directory, output_directory, genes_list, protein_table, loci_list = pv.validate_loci_list(genes_list, loci_list, schema_directory) # Working with the whole schema else: - loci_list = pv.check_input_type(schema_directory, loci_list) + loci_list, total_loci = pv.check_input_type(schema_directory, loci_list) loci_paths = fo.read_lines(loci_list) loci_basenames = [fo.file_basename(locus, False) for locus in loci_paths] diff --git a/CHEWBBACA/chewBBACA.py b/CHEWBBACA/chewBBACA.py index 034b634f..f6991621 100755 --- a/CHEWBBACA/chewBBACA.py +++ b/CHEWBBACA/chewBBACA.py @@ -62,9 +62,6 @@ synchronize_schema, stats_requests) -version = __version__ - - @pdt.process_timer def run_create_schema(): """Run the CreateSchema module to create a schema seed.""" @@ -183,26 +180,44 @@ def msg(name=None): # Get translation table used to create training file ptf_table = gp.read_training_file(args.ptf_path).translation_table args.translation_table = ptf_table + print('Provided training file. Using translation table used to create training file.') else: if not args.translation_table: args.translation_table = ct.GENETIC_CODES_DEFAULT + print(f'Did not provide training file and translation table. Using default translation table ({ct.GENETIC_CODES_DEFAULT})') + + print(f'Prodigal training file: {args.ptf_path}') + print(f'Prodigal mode: {args.prodigal_mode}') + if args.prodigal_mode == 'meta' and args.ptf_path is not None: + print('Prodigal mode is set to "meta". Will add training file to ' + 'the schema, but will not use it for gene prediction during ' + 'schema creation.') + args.ptf_path = None # Check if translation table is supported pv.translation_table_type([args.translation_table]) + print(f'Translation table: {args.translation_table}') + + print(f'CPU cores: {args.cpu_cores}') + print(f'BLAST Score Ratio: {args.blast_score_ratio}') + print(f'Minimum sequence length: {args.minimum_length}') + print(f'Size threshold: {args.size_threshold}') # Create output directory created = fo.create_directory(args.output_directory) if created is False: sys.exit(ct.OUTPUT_DIRECTORY_EXISTS) + print(f'Output directory: {args.output_directory}') genome_list = fo.join_paths(args.output_directory, [ct.GENOME_LIST]) - args.input_files = pv.check_input_type(args.input_files, genome_list) + args.input_files, total_inputs = pv.check_input_type(args.input_files, genome_list) # Detect if some inputs share the same unique prefix repeated_prefixes = pv.check_unique_prefixes(args.input_files) # Detect if filenames include blank spaces blank_spaces = pv.check_blanks(args.input_files) # Check if any input file has an unique prefix >= 50 characters long_prefixes = pv.check_prefix_length(args.input_files) + print(f'Number of inputs: {total_inputs}') # Add clustering parameters args.word_size = ct.WORD_SIZE_DEFAULT @@ -212,7 +227,8 @@ def msg(name=None): args.intra_filter = ct.INTRA_CLUSTER_DEFAULT # Run the CreateSchema process - create_schema.main(**vars(args)) + nloci = create_schema.main(**vars(args)) + print(f'Created schema seed with {nloci} loci.') schema_dir = os.path.join(args.output_directory, args.schema_name) # Copy Prodigal Training File (PTF) to schema directory @@ -221,14 +237,17 @@ def msg(name=None): shutil.copy(args.ptf_path, schema_dir) # Determine PTF checksum ptf_hash = fo.hash_file(args.ptf_path, hashlib.blake2b()) + print(f'Copied Prodigal training file to schema seed directory.') # Write schema config file args.minimum_length = ct.MSL_MIN args.ptf_path = ptf_hash - schema_config = pv.write_schema_config(vars(args), version, schema_dir) + schema_config = pv.write_schema_config(vars(args), __version__, schema_dir) + print(f'Wrote schema config file to {schema_config}') # Create the file with the list of genes/loci pv.write_gene_list(schema_dir) + print(f'Wrote list of loci to {os.path.join(schema_dir, ct.LOCI_LIST)}') # Remove temporary file with paths to input genomes fo.remove_files([genome_list]) @@ -469,10 +488,10 @@ def msg(name=None): args.schema_directory) # Working with the whole schema else: - loci_list = pv.check_input_type(args.schema_directory, loci_list) + loci_list, total_loci = pv.check_input_type(args.schema_directory, loci_list) genome_list = fo.join_paths(args.output_directory, [ct.GENOME_LIST]) - genome_list = pv.check_input_type(args.input_files, genome_list) + genome_list, total_inputs = pv.check_input_type(args.input_files, genome_list) # Detect if some inputs share the same unique prefix repeated_prefixes = pv.check_unique_prefixes(genome_list) # Detect if filenames include blank spaces @@ -625,7 +644,7 @@ def msg(name=None): args.schema_directory) # Working with the whole schema else: - loci_list = pv.check_input_type(args.schema_directory, loci_list) + loci_list, total_loci = pv.check_input_type(args.schema_directory, loci_list) args.genes_list = loci_list @@ -998,7 +1017,7 @@ def msg(name=None): args.schema_directory) # Working with the whole schema else: - loci_list = pv.check_input_type(args.schema_directory, loci_list) + loci_list, total_loci = pv.check_input_type(args.schema_directory, loci_list) print(f'Number of cores: {args.cpu_cores}') print(f'BLAST Score Ratio: {args.blast_score_ratio}') @@ -1040,7 +1059,7 @@ def msg(name=None): args.clustering_sim = ct.CLUSTERING_SIMILARITY_DEFAULT args.representative_filter = ct.REPRESENTATIVE_FILTER_DEFAULT args.intra_filter = ct.INTRA_CLUSTER_DEFAULT - schema_config = pv.write_schema_config(vars(args), version, schema_path) + schema_config = pv.write_schema_config(vars(args), __version__, schema_path) # Create hidden file with list of loci genes_list_file = pv.write_gene_list(schema_path) @@ -1468,17 +1487,16 @@ def main(): 'and schemas in Chewie-NS.', run_stats_requests]} + print(f'chewBBACA version: {__version__}') version_triggers = ['-v', '--v', '-version', '--version'] if len(sys.argv) > 1 and sys.argv[1] in version_triggers: - # Print version and exit - print('chewBBACA version: {0}'.format(version)) + # Exit after printing version sys.exit(0) - print('chewBBACA version: {0}'.format(version)) - print('Authors: {0}'.format(ct.authors)) - print('Github: {0}'.format(ct.repository)) - print('Documentation: {0}'.format(ct.documentation)) - print('Contacts: {0}\n'.format(ct.contacts)) + print(f'Authors: {ct.AUTHORS}') + print(f'Github: {ct.REPOSITORY}') + print(f'Documentation: {ct.DOCUMENTATION}') + print(f'Contacts: {ct.CONTACTS}\n') # Display help message if selected process is not valid help_triggers = ['-h', '--h', '-help', '--help'] diff --git a/CHEWBBACA/utils/blast_wrapper.py b/CHEWBBACA/utils/blast_wrapper.py index d88bdc21..48205d05 100644 --- a/CHEWBBACA/utils/blast_wrapper.py +++ b/CHEWBBACA/utils/blast_wrapper.py @@ -45,10 +45,11 @@ def make_blast_db(makeblastdb_path, input_fasta, output_path, db_type): stderr : bytes or str BLAST stderr. """ - # use '-parse-seqids' to be able to retrieve/align sequences by identifier + # Use '-parse-seqids' to be able to specify sequences to align against + # Use v4 databases for performance reasons (no need to convert list of sequence IDs with blastdb_aliastool) makedb_cmd = [makeblastdb_path, '-in', input_fasta, '-out', output_path, '-parse_seqids', - '-dbtype', db_type] + '-dbtype', db_type, '-blastdb_version', '4'] makedb_process = subprocess.Popen(makedb_cmd, stdout=subprocess.PIPE, @@ -178,38 +179,38 @@ def run_blast(blast_path, blast_db, fasta_file, blast_output, return [stdout, stderr] -def run_blastdb_aliastool(blastdb_aliastool_path, seqid_infile, seqid_outfile): - """Convert list of sequence identifiers into binary format. - - Parameters - ---------- - blastdb_aliastool_path : str - Path to the blastdb_aliastool executable. - seqid_infile : str - Path to the file that contains the list of sequence identifiers. - seqid_outfile : str - Path to the output file in binary format to pass to the -seqidlist - parameter of BLAST>=2.10. - - Returns - ------- - stdout : bytes - BLAST stdout. - stderr : bytes or str - BLAST stderr. - """ - blastdb_aliastool_args = [blastdb_aliastool_path, '-seqid_file_in', - seqid_infile, '-seqid_file_out', seqid_outfile] - - blastdb_aliastool_process = subprocess.Popen(blastdb_aliastool_args, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE) - - stdout, stderr = blastdb_aliastool_process.communicate() - - # Exit if it is not possible to create BLAST db - if len(stderr) > 0: - sys.exit(f'Could not convert {seqid_infile} to binary format.\n' - f'{blastdb_aliastool_path} returned the following error:\n{stderr}') - - return [stdout, stderr] +# def run_blastdb_aliastool(blastdb_aliastool_path, seqid_infile, seqid_outfile): +# """Convert list of sequence identifiers into binary format. + +# Parameters +# ---------- +# blastdb_aliastool_path : str +# Path to the blastdb_aliastool executable. +# seqid_infile : str +# Path to the file that contains the list of sequence identifiers. +# seqid_outfile : str +# Path to the output file in binary format to pass to the -seqidlist +# parameter of BLAST>=2.10. + +# Returns +# ------- +# stdout : bytes +# BLAST stdout. +# stderr : bytes or str +# BLAST stderr. +# """ +# blastdb_aliastool_args = [blastdb_aliastool_path, '-seqid_file_in', +# seqid_infile, '-seqid_file_out', seqid_outfile] + +# blastdb_aliastool_process = subprocess.Popen(blastdb_aliastool_args, +# stdout=subprocess.PIPE, +# stderr=subprocess.PIPE) + +# stdout, stderr = blastdb_aliastool_process.communicate() + +# # Exit if it is not possible to create BLAST db +# if len(stderr) > 0: +# sys.exit(f'Could not convert {seqid_infile} to binary format.\n' +# f'{blastdb_aliastool_path} returned the following error:\n{stderr}') + +# return [stdout, stderr] diff --git a/CHEWBBACA/utils/constants.py b/CHEWBBACA/utils/constants.py index b5568305..08648b00 100755 --- a/CHEWBBACA/utils/constants.py +++ b/CHEWBBACA/utils/constants.py @@ -130,13 +130,13 @@ 'local': 'http://127.0.0.1:5000/NS/api/'} # Authors, GitHub repository, documentation, tutorial and contacts -authors = 'Rafael Mamede, Pedro Cerqueira, Mickael Silva, João Carriço, Mário Ramirez' -repository = 'https://github.com/B-UMMI/chewBBACA' -documentation = 'https://chewbbaca.readthedocs.io/en/latest/index.html' -contacts = 'imm-bioinfo@medicina.ulisboa.pt' +AUTHORS = 'Rafael Mamede, Pedro Cerqueira, Mickael Silva, João Carriço, Mário Ramirez' +REPOSITORY = 'https://github.com/B-UMMI/chewBBACA' +DOCUMENTATION = 'https://chewbbaca.readthedocs.io/en/latest/index.html' +CONTACTS = 'imm-bioinfo@medicina.ulisboa.pt' # Timeout, in seconds, to wait for user input -prompt_timeout = 30 +PROMPT_TIMEOUT = 30 # Minimum MAJOR and MINOR BLAST versions BLAST_MAJOR = 2 @@ -238,7 +238,7 @@ BLAST_TASK_THRESHOLD = {'blastn': 50, 'blastp': 30} # BLAST outfmt -BLAST_DEFAULT_OUTFMT = '6 qseqid qstart qend qlen sseqid slen score' +BLAST_DEFAULT_OUTFMT = '6 qaccver qstart qend qlen saccver slen score' # Input file prefix maximum length PREFIX_MAXLEN = 30 @@ -610,15 +610,23 @@ INPUTS_SHARE_PREFIX = ('The following input files share the same filename prefix ' '(substring before the first "." in the filename):\n{0}\n' 'Please ensure that every input file has a unique ' - 'filename prefix.') + 'filename prefix. This is necessary to unambiguously ' + 'identify each input during intermediate steps and in ' + 'output files.') INPUTS_INCLUDE_BLANKS = ('The following input files include blank spaces ' 'in the filename:\n{0}\nPlease ensure that filenames ' 'do not include blank spaces or special characters ' - '(e.g. !@#?$^*()+)') + '(e.g. !@#?$^*()+). This is necessarry to avoid issues ' + 'related to how these characters are recognized by ' + 'chewBBACA and its dependencies.') INPUTS_LONG_PREFIX = ('The following input files have a prefix longer than ' '30 characters:\n{0}\nPlease make sure that input ' - 'files have a shorter and unique prefix.') + 'files have a shorter and unique prefix (substring before ' + 'the first "." in the filename). The prefixes are used as ' + 'unique identifiers and long prefixes might lead to issues ' + '(e.g. BLAST does not accept sequence IDs longer than 50 ' + 'characters when creating a database).') MISSING_INPUT_ARG = ('Path to input files does not exist. Please provide a valid path.') diff --git a/CHEWBBACA/utils/core_functions.py b/CHEWBBACA/utils/core_functions.py index 0f0d97a6..b4e69738 100644 --- a/CHEWBBACA/utils/core_functions.py +++ b/CHEWBBACA/utils/core_functions.py @@ -719,7 +719,7 @@ def cluster_intra_filter(clusters, sequences, word_size, def blast_clusters(clusters, sequences, output_directory, blastp_path, makeblastdb_path, cpu_cores, - blastdb_aliastool_path, only_rep=False): + only_rep=False): """Use BLAST to align sequences in the same clusters. Parameters @@ -743,9 +743,6 @@ def blast_clusters(clusters, sequences, output_directory, Path to the `makeblastdb` executable. cpu_cores : int Number of BLASTp processes to run in parallel. - blastdb_aliastool_path : str or NoneType - Path to the blastalias_tool executable if it is necessary to - convert seqid files to binary format, NoneType otherwise only_rep Returns @@ -771,7 +768,7 @@ def blast_clusters(clusters, sequences, output_directory, process_num = 20 if cpu_cores <= 20 else cpu_cores splitted_seqids = mo.distribute_loci(seqids_to_blast, process_num, 'seqcount') common_args = [sequences, blastp_results_dir, blastp_path, - blast_db, blastdb_aliastool_path, only_rep, sc.cluster_blaster] + blast_db, only_rep, sc.cluster_blaster] splitted_seqids = [[s, *common_args] for s in splitted_seqids] # BLAST sequences in a cluster against every sequence in that cluster @@ -805,7 +802,7 @@ def compute_bsr(subject_score, query_score): def determine_self_scores(fasta_file, output_directory, makeblastdb_path, - blast_path, db_type, blast_threads, blastdb_aliastool_path): + blast_path, db_type, blast_threads): """Compute the self-alignment raw score for sequences in a FASTA file. Parameters @@ -823,9 +820,6 @@ def determine_self_scores(fasta_file, output_directory, makeblastdb_path, protein (prot). blast_threads : int Number of threads/cores used to run BLAST. - blastdb_aliastool_path : str or NoneType - Path to the blastalias_tool executable if it is necessary to - convert seqid files to binary format, NoneType otherwise Returns ------- @@ -863,16 +857,6 @@ def determine_self_scores(fasta_file, output_directory, makeblastdb_path, seqids_file = fo.join_paths(above_outdir, [fo.file_basename(f[0], False)]) fo.write_lines(seqids, seqids_file) seqids_files.append(seqids_file) - - if blastdb_aliastool_path is not None: - binary_seqid_files = [] - for file in seqids_files: - binary_file = f'{file}.bin' - blast_std = bw.run_blastdb_aliastool(blastdb_aliastool_path, - file, - binary_file) - binary_seqid_files.append(binary_file) - seqids_files = binary_seqid_files # This should not happen or be very rare, but just in case else: split_fastas = [] @@ -896,12 +880,6 @@ def determine_self_scores(fasta_file, output_directory, makeblastdb_path, seqids = below_outfile[1] seqids_file = fo.join_paths(output_directory, [fo.file_basename(below_outfile[2], False)]) fo.write_lines(seqids, seqids_file) - if blastdb_aliastool_path is not None: - binary_file = f'{seqids_file}.bin' - blast_std = bw.run_blastdb_aliastool(blastdb_aliastool_path, - seqids_file, - binary_file) - seqids_file = binary_file below_blastout = '{0}/{1}_blastout.tsv'.format(final_blastp_dir, fo.file_basename(below_outfile[2], False)) @@ -947,13 +925,6 @@ def determine_self_scores(fasta_file, output_directory, makeblastdb_path, # Create file with representative seqid to only compare against self id_file = fo.join_paths(output_directory, [f'{current_rep.id}_ids.txt']) fo.write_lines([current_rep.id], id_file) - if blastdb_aliastool_path is not None: - binary_file = f'{id_file}.bin' - blast_std = bw.run_blastdb_aliastool(blastdb_aliastool_path, - id_file, - binary_file) - id_file = binary_file - rep_blastout = fo.join_paths(output_directory, [f'{current_rep.id}_blastout.tsv']) # Cannot get self-alignemnt for some sequences if composition-based stats is enabled blast_std = bw.run_blast(blast_path, blast_db, rep_file, diff --git a/CHEWBBACA/utils/parameters_validation.py b/CHEWBBACA/utils/parameters_validation.py index 9b45b770..f28e964d 100644 --- a/CHEWBBACA/utils/parameters_validation.py +++ b/CHEWBBACA/utils/parameters_validation.py @@ -752,9 +752,9 @@ def validate_ptf_hash(ptf_hash, schema_ptfs, force_continue): ptf_num = len(schema_ptfs) if force_continue is False: if ptf_num == 1: - ptf_answer = fo.input_timeout(ct.DIFFERENT_PTF_PROMPT, ct.prompt_timeout) + ptf_answer = fo.input_timeout(ct.DIFFERENT_PTF_PROMPT, ct.PROMPT_TIMEOUT) if ptf_num > 1: - ptf_answer = fo.input_timeout(ct.MULTIPLE_PTF_PROMPT.format(ptf_num), ct.prompt_timeout) + ptf_answer = fo.input_timeout(ct.MULTIPLE_PTF_PROMPT.format(ptf_num), ct.PROMPT_TIMEOUT) else: ptf_answer = 'yes' @@ -880,7 +880,7 @@ def solve_conflicting_arguments(schema_params, ptf_path, blast_score_ratio, params_diffs_text += ['{:^20} {:^20} {:^10}'.format(p[0], p[1], p[2]) for p in params_diffs] print('\n'.join(params_diffs_text)) if force_continue is False: - params_answer = fo.input_timeout(ct.ARGS_DIFFER_PROMPT, ct.prompt_timeout) + params_answer = fo.input_timeout(ct.ARGS_DIFFER_PROMPT, ct.PROMPT_TIMEOUT) else: params_answer = 'yes' @@ -1051,14 +1051,14 @@ def check_input_type(input_path, output_file): """ # Input path is for a file if os.path.isfile(input_path): - output_file = validate_input_file(input_path, output_file) + output_file, total_inputs = validate_input_file(input_path, output_file) # Input path is for a directory elif os.path.isdir(input_path): - output_file = validate_input_dir(input_path, output_file) + output_file, total_inputs = validate_input_dir(input_path, output_file) else: sys.exit(ct.INVALID_INPUT_PATH) - return output_file + return output_file, total_inputs def validate_input_file(input_path, output_file): @@ -1120,7 +1120,7 @@ def validate_input_file(input_path, output_file): else: fo.write_lines(files, output_file) - return output_file + return output_file, len(files) def validate_input_dir(input_path, output_file): @@ -1163,7 +1163,7 @@ def validate_input_dir(input_path, output_file): else: sys.exit(ct.MISSING_FASTAS_EXCEPTION) - return output_file + return output_file, len(files) def validate_loci_list(input_path, output_file, parent_dir=None): diff --git a/CHEWBBACA/utils/sequence_clustering.py b/CHEWBBACA/utils/sequence_clustering.py index 30333a43..de641dd0 100644 --- a/CHEWBBACA/utils/sequence_clustering.py +++ b/CHEWBBACA/utils/sequence_clustering.py @@ -427,7 +427,7 @@ def representative_pruner(clusters, sim_cutoff): def cluster_blaster(seqids, sequences, output_directory, - blast_path, blastdb_path, blastdb_aliastool_path, only_rep=False): + blast_path, blastdb_path, only_rep=False): """Align sequences in the same cluster with BLAST. Parameters @@ -474,13 +474,6 @@ def cluster_blaster(seqids, sequences, output_directory, blast_output = os.path.join(output_directory, '{0}_blastout.tsv'.format(cluster_id)) - if blastdb_aliastool_path is not None: - binary_file = f'{ids_file}.bin' - blast_std = bw.run_blastdb_aliastool(blastdb_aliastool_path, - ids_file, - binary_file) - ids_file = binary_file - # Use subprocess to capture errors and warnings blast_std = bw.run_blast(blast_path, blastdb_path, fasta_file, blast_output, 1, 1,