diff --git a/README.md b/README.md index 3c3c0a1..4cb5c84 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ # Make Lastz Chains [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black) -![version](https://img.shields.io/badge/version-2.0.5-blue) +![version](https://img.shields.io/badge/version-2.0.6-blue) [![made-with-Nextflow](https://img.shields.io/badge/Made%20with-Nextflow-23aa62.svg)](https://www.nextflow.io/) Portable Hillerlab solution for generating pairwise genome alignment chains. @@ -83,7 +83,10 @@ The script to be called is `make_chains.py`. A quick test sample: ```bash -./make_chains.py target query test_data/test_reference.fa test_data/test_query.fa --pd test_out -f +# fasta input +./make_chains.py target query test_data/test_reference.fa test_data/test_query.fa --pd test_out -f --chaining_memory 16 +# 2bit file input - pls create 2bit files from fasta using faToTwoBit before +./make_chains.py target query test_data/test_reference.2bit test_data/test_query.2bit --pd test_out -f --chaining_memory 16 ``` ### Full list of the pipeline CLI parameters diff --git a/make_chains.py b/make_chains.py index e6a252a..2b6cd44 100755 --- a/make_chains.py +++ b/make_chains.py @@ -173,17 +173,23 @@ def save_final_chain(parameters: PipelineParameters, project_paths: ProjectPaths shutil.move(last_chain_file, project_paths.final_chain) to_log(f"Saved final chains file to {project_paths.final_chain}") +def _del_file_and_log(path): + os.remove(path) + to_log(f"x {path}") + def cleanup(parameters: PipelineParameters, project_paths: ProjectPaths): """Perform the cleanup.""" if parameters.keep_temp: + to_log("Temp files are not deleted because of the --keep_temp flag") return # cleanup is not necessary dirs_to_del = [ project_paths.chain_run_dir, project_paths.cat_out_dirname, project_paths.lastz_output_dir, project_paths.lastz_working_dir, - project_paths.fill_chain_run_dir + project_paths.fill_chain_run_dir, + project_paths.kent_temp_dir ] to_log("Cleaning up the following directories") for dirname in dirs_to_del: @@ -191,12 +197,13 @@ def cleanup(parameters: PipelineParameters, project_paths: ProjectPaths): shutil.rmtree(dirname) # remove individual temp files - os.remove(project_paths.reference_genome) - os.remove(project_paths.query_genome) - os.remove(project_paths.reference_partitions) - os.remove(project_paths.query_partitions) - os.remove(project_paths.ref_chrom_sizes) - os.remove(project_paths.query_chrom_sizes) + to_log("And the following files:") + _del_file_and_log(project_paths.reference_genome) + _del_file_and_log(project_paths.query_genome) + _del_file_and_log(project_paths.reference_partitions) + _del_file_and_log(project_paths.query_partitions) + _del_file_and_log(project_paths.ref_chrom_sizes) + _del_file_and_log(project_paths.query_chrom_sizes) def run_pipeline(args): @@ -220,12 +227,14 @@ def run_pipeline(args): args.target_name, Constants.TARGET_LABEL, project_paths, - step_executables) + step_executables, + parameters) setup_genome_sequences(args.query_genome, args.query_name, Constants.QUERY_LABEL, project_paths, - step_executables) + step_executables, + parameters) # now execute steps step_manager.execute_steps(parameters, step_executables, project_paths) diff --git a/modules/project_setup_procedures.py b/modules/project_setup_procedures.py index 48fa828..0a4fefd 100644 --- a/modules/project_setup_procedures.py +++ b/modules/project_setup_procedures.py @@ -9,6 +9,7 @@ from modules.project_paths import ProjectPaths from modules.step_executables import StepExecutables from modules.make_chains_logging import to_log +from modules.parameters import PipelineParameters def check_if_twobit(genome_seq_file): @@ -63,14 +64,14 @@ def check_chrom_names_in_fasta(fa_path): return old_to_new_chrom_name -def call_two_bit_to_fa_subprocess(cmd, genome_seq_file): +def call_convert_format_subprocess(cmd, input_path): try: - subprocess.call(cmd, shell=True) + subprocess.call(cmd) # if failed: then it's likely not a fasta except subprocess.CalledProcessError: err_msg = ( f"Error! Could not execute {cmd}\n" - f"Please check whether {genome_seq_file} is " + f"Please check whether {input_path} is " f"a valid fasta or 2bit file.\n" f"Also, make sure twoBitToFa is callable.\n" ) @@ -115,7 +116,8 @@ def setup_genome_sequences(genome_seq_file: str, genome_id: str, label: str, project_paths: ProjectPaths, - executables: StepExecutables): + executables: StepExecutables, + params: PipelineParameters): """Setup genome sequence input. DoBlastzChainNet procedure requires the 2bit-formatted sequence @@ -129,6 +131,12 @@ def setup_genome_sequences(genome_seq_file: str, # https://github.com/hillerlab/make_lastz_chains/issues/3 is_two_bit = check_if_twobit(genome_seq_file) chrom_rename_table_path = None + seq_dir = params.seq_1_dir if label == Constants.TARGET_LABEL else params.seq_2_dir + to_log(f"* Setting up genome sequences for {label}\n" + f"genomeID: {genome_id}\n" + f"input sequence file: {genome_seq_file}\n" + f"is 2bit: {is_two_bit}\n" + f"planned genome dir location: {seq_dir}") # genome seq path -> final destination of 2bit used for further pipeline steps if is_two_bit: @@ -144,22 +152,25 @@ def setup_genome_sequences(genome_seq_file: str, if len(invalid_chrom_names) > 0: # there are invalid chrom names, that need to be renamed # (1) create intermediate fasta and rename chromosomes there + to_log(f"Detected {len(invalid_chrom_names)} invalid chrom names in the input 2bit file") fasta_dump_path = os.path.join(project_paths.project_dir, f"TEMP_{genome_id}_genome_dump.fa") - two_bit_to_fa_cmd = f"{executables.two_bit_to_fa} {genome_seq_file} {fasta_dump_path}" - call_two_bit_to_fa_subprocess(two_bit_to_fa_cmd, genome_seq_file) - genome_seq_file, chrom_rename_table_path = rename_chrom_names_fasta( + two_bit_to_fa_cmd = [executables.two_bit_to_fa, genome_seq_file, fasta_dump_path] + call_convert_format_subprocess(two_bit_to_fa_cmd, genome_seq_file) + to_log(f"Saving intermediate fasta to {fasta_dump_path}") + fixed_fasta_file, chrom_rename_table_path = rename_chrom_names_fasta( fasta_dump_path, project_paths.project_dir, genome_id, invalid_chrom_names ) - genome_seq_path = os.path.abspath( - os.path.join(project_paths.project_dir, f"{label}.2bit") - ) os.remove(fasta_dump_path) # (2) create 2bit with renamed sequences - fa_to_two_bit_cmd = f"{executables.fa_to_two_bit} {genome_seq_file} {genome_seq_path}" - call_two_bit_to_fa_subprocess(fa_to_two_bit_cmd, genome_seq_file) + fa_to_two_bit_cmd = [executables.fa_to_two_bit, fixed_fasta_file, seq_dir] + call_convert_format_subprocess(fa_to_two_bit_cmd, fixed_fasta_file) + to_log(f"Saving fixed fasta file {fixed_fasta_file} to {seq_dir}") else: # no invalid chrom names, use 2bit as is - genome_seq_path = os.path.abspath(genome_seq_file) + arg_input_2bit = os.path.abspath(genome_seq_file) + # but create a symlink + os.symlink(arg_input_2bit, seq_dir) + to_log(f"Created symlink from {arg_input_2bit} to {seq_dir}") else: # fasta, need to convert fasta to 2bit invalid_chrom_names = check_chrom_names_in_fasta(genome_seq_file) @@ -172,16 +183,16 @@ def setup_genome_sequences(genome_seq_file: str, genome_seq_file, project_paths.project_dir, genome_id, invalid_chrom_names ) - genome_seq_path = os.path.abspath(os.path.join(project_paths.project_dir, f"{label}.2bit")) - cmd = f"{executables.fa_to_two_bit} {genome_seq_file} {genome_seq_path}" - call_two_bit_to_fa_subprocess(cmd, genome_seq_file) + two_bit_to_fa_cmd = [executables.fa_to_two_bit, genome_seq_file, seq_dir] + call_convert_format_subprocess(two_bit_to_fa_cmd, genome_seq_file) + to_log(f"Initial fasta file {genome_seq_file} saved to {seq_dir}") # now need to create chrom.sizes file chrom_sizes_filename = f"{label}.chrom.sizes" chrom_sizes_path = os.path.join(project_paths.project_dir, chrom_sizes_filename) # must be without errors now - two_bit_reader = TwoBitFile(genome_seq_path) + two_bit_reader = TwoBitFile(seq_dir) twobit_seq_to_size = two_bit_reader.sequence_sizes() f = open(chrom_sizes_path, "w") @@ -189,7 +200,7 @@ def setup_genome_sequences(genome_seq_file: str, f.write(f"{k}\t{v}\n") f.close() - to_log(f"For {genome_id} ({label}) sequence file: {genome_seq_path}; chrom sizes saved to: {chrom_sizes_path}") + to_log(f"For {genome_id} ({label}) sequence file: {seq_dir}; chrom sizes saved to: {chrom_sizes_path}") if len(invalid_chrom_names) > 0: to_log(f"Warning! Genome sequence file {genome_seq_file}") diff --git a/steps_implementations/partition.py b/steps_implementations/partition.py index c8985a6..f9eace2 100644 --- a/steps_implementations/partition.py +++ b/steps_implementations/partition.py @@ -8,6 +8,7 @@ from modules.project_paths import ProjectPaths from modules.step_executables import StepExecutables from modules.common import GenomicRegion +from modules.error_classes import PipelineFileNotFoundError def create_partition(chrom_sizes, chunk_size, overlap): @@ -60,6 +61,9 @@ def do_partition_for_genome(genome_label: str, else: partition_file_path = project_paths.query_partitions + if not os.path.isfile(seq_dir): + err_msg = f"Error! Could not find genome sequences file {seq_dir}. Please contact developers." + raise PipelineFileNotFoundError(err_msg) # Read chromosome sizes chrom_sizes = read_chrom_sizes(seq_len_file) diff --git a/version.py b/version.py index 3b26fc8..9e712ea 100755 --- a/version.py +++ b/version.py @@ -33,7 +33,7 @@ def to_string(self): return self.version_repr -__version__ = Version(2, 0, 5) +__version__ = Version(2, 0, 6) if __name__ == "__main__": print(f"Make Lastz Chains Version: {__version__}")