Skip to content

Commit

Permalink
Fixed 2bit input handling
Browse files Browse the repository at this point in the history
  • Loading branch information
kirilenkobm committed Sep 21, 2023
1 parent 160f572 commit 9f16d8b
Show file tree
Hide file tree
Showing 5 changed files with 57 additions and 30 deletions.
7 changes: 5 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# Make Lastz Chains

[![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
![version](https://img.shields.io/badge/version-2.0.5-blue)
![version](https://img.shields.io/badge/version-2.0.6-blue)
[![made-with-Nextflow](https://img.shields.io/badge/Made%20with-Nextflow-23aa62.svg)](https://www.nextflow.io/)

Portable Hillerlab solution for generating pairwise genome alignment chains.
Expand Down Expand Up @@ -83,7 +83,10 @@ The script to be called is `make_chains.py`.
A quick test sample:

```bash
./make_chains.py target query test_data/test_reference.fa test_data/test_query.fa --pd test_out -f
# fasta input
./make_chains.py target query test_data/test_reference.fa test_data/test_query.fa --pd test_out -f --chaining_memory 16
# 2bit file input - pls create 2bit files from fasta using faToTwoBit before
./make_chains.py target query test_data/test_reference.2bit test_data/test_query.2bit --pd test_out -f --chaining_memory 16
```

### Full list of the pipeline CLI parameters
Expand Down
27 changes: 18 additions & 9 deletions make_chains.py
Original file line number Diff line number Diff line change
Expand Up @@ -173,30 +173,37 @@ def save_final_chain(parameters: PipelineParameters, project_paths: ProjectPaths
shutil.move(last_chain_file, project_paths.final_chain)
to_log(f"Saved final chains file to {project_paths.final_chain}")

def _del_file_and_log(path):
os.remove(path)
to_log(f"x {path}")


def cleanup(parameters: PipelineParameters, project_paths: ProjectPaths):
"""Perform the cleanup."""
if parameters.keep_temp:
to_log("Temp files are not deleted because of the --keep_temp flag")
return # cleanup is not necessary
dirs_to_del = [
project_paths.chain_run_dir,
project_paths.cat_out_dirname,
project_paths.lastz_output_dir,
project_paths.lastz_working_dir,
project_paths.fill_chain_run_dir
project_paths.fill_chain_run_dir,
project_paths.kent_temp_dir
]
to_log("Cleaning up the following directories")
for dirname in dirs_to_del:
to_log(f"x {dirname}")
shutil.rmtree(dirname)

# remove individual temp files
os.remove(project_paths.reference_genome)
os.remove(project_paths.query_genome)
os.remove(project_paths.reference_partitions)
os.remove(project_paths.query_partitions)
os.remove(project_paths.ref_chrom_sizes)
os.remove(project_paths.query_chrom_sizes)
to_log("And the following files:")
_del_file_and_log(project_paths.reference_genome)
_del_file_and_log(project_paths.query_genome)
_del_file_and_log(project_paths.reference_partitions)
_del_file_and_log(project_paths.query_partitions)
_del_file_and_log(project_paths.ref_chrom_sizes)
_del_file_and_log(project_paths.query_chrom_sizes)


def run_pipeline(args):
Expand All @@ -220,12 +227,14 @@ def run_pipeline(args):
args.target_name,
Constants.TARGET_LABEL,
project_paths,
step_executables)
step_executables,
parameters)
setup_genome_sequences(args.query_genome,
args.query_name,
Constants.QUERY_LABEL,
project_paths,
step_executables)
step_executables,
parameters)

# now execute steps
step_manager.execute_steps(parameters, step_executables, project_paths)
Expand Down
47 changes: 29 additions & 18 deletions modules/project_setup_procedures.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from modules.project_paths import ProjectPaths
from modules.step_executables import StepExecutables
from modules.make_chains_logging import to_log
from modules.parameters import PipelineParameters


def check_if_twobit(genome_seq_file):
Expand Down Expand Up @@ -63,14 +64,14 @@ def check_chrom_names_in_fasta(fa_path):
return old_to_new_chrom_name


def call_two_bit_to_fa_subprocess(cmd, genome_seq_file):
def call_convert_format_subprocess(cmd, input_path):
try:
subprocess.call(cmd, shell=True)
subprocess.call(cmd)
# if failed: then it's likely not a fasta
except subprocess.CalledProcessError:
err_msg = (
f"Error! Could not execute {cmd}\n"
f"Please check whether {genome_seq_file} is "
f"Please check whether {input_path} is "
f"a valid fasta or 2bit file.\n"
f"Also, make sure twoBitToFa is callable.\n"
)
Expand Down Expand Up @@ -115,7 +116,8 @@ def setup_genome_sequences(genome_seq_file: str,
genome_id: str,
label: str,
project_paths: ProjectPaths,
executables: StepExecutables):
executables: StepExecutables,
params: PipelineParameters):
"""Setup genome sequence input.
DoBlastzChainNet procedure requires the 2bit-formatted sequence
Expand All @@ -129,6 +131,12 @@ def setup_genome_sequences(genome_seq_file: str,
# https://github.com/hillerlab/make_lastz_chains/issues/3
is_two_bit = check_if_twobit(genome_seq_file)
chrom_rename_table_path = None
seq_dir = params.seq_1_dir if label == Constants.TARGET_LABEL else params.seq_2_dir
to_log(f"* Setting up genome sequences for {label}\n"
f"genomeID: {genome_id}\n"
f"input sequence file: {genome_seq_file}\n"
f"is 2bit: {is_two_bit}\n"
f"planned genome dir location: {seq_dir}")

# genome seq path -> final destination of 2bit used for further pipeline steps
if is_two_bit:
Expand All @@ -144,22 +152,25 @@ def setup_genome_sequences(genome_seq_file: str,
if len(invalid_chrom_names) > 0:
# there are invalid chrom names, that need to be renamed
# (1) create intermediate fasta and rename chromosomes there
to_log(f"Detected {len(invalid_chrom_names)} invalid chrom names in the input 2bit file")
fasta_dump_path = os.path.join(project_paths.project_dir, f"TEMP_{genome_id}_genome_dump.fa")
two_bit_to_fa_cmd = f"{executables.two_bit_to_fa} {genome_seq_file} {fasta_dump_path}"
call_two_bit_to_fa_subprocess(two_bit_to_fa_cmd, genome_seq_file)
genome_seq_file, chrom_rename_table_path = rename_chrom_names_fasta(
two_bit_to_fa_cmd = [executables.two_bit_to_fa, genome_seq_file, fasta_dump_path]
call_convert_format_subprocess(two_bit_to_fa_cmd, genome_seq_file)
to_log(f"Saving intermediate fasta to {fasta_dump_path}")
fixed_fasta_file, chrom_rename_table_path = rename_chrom_names_fasta(
fasta_dump_path, project_paths.project_dir, genome_id, invalid_chrom_names
)
genome_seq_path = os.path.abspath(
os.path.join(project_paths.project_dir, f"{label}.2bit")
)
os.remove(fasta_dump_path)
# (2) create 2bit with renamed sequences
fa_to_two_bit_cmd = f"{executables.fa_to_two_bit} {genome_seq_file} {genome_seq_path}"
call_two_bit_to_fa_subprocess(fa_to_two_bit_cmd, genome_seq_file)
fa_to_two_bit_cmd = [executables.fa_to_two_bit, fixed_fasta_file, seq_dir]
call_convert_format_subprocess(fa_to_two_bit_cmd, fixed_fasta_file)
to_log(f"Saving fixed fasta file {fixed_fasta_file} to {seq_dir}")
else:
# no invalid chrom names, use 2bit as is
genome_seq_path = os.path.abspath(genome_seq_file)
arg_input_2bit = os.path.abspath(genome_seq_file)
# but create a symlink
os.symlink(arg_input_2bit, seq_dir)
to_log(f"Created symlink from {arg_input_2bit} to {seq_dir}")
else:
# fasta, need to convert fasta to 2bit
invalid_chrom_names = check_chrom_names_in_fasta(genome_seq_file)
Expand All @@ -172,24 +183,24 @@ def setup_genome_sequences(genome_seq_file: str,
genome_seq_file, project_paths.project_dir, genome_id, invalid_chrom_names
)

genome_seq_path = os.path.abspath(os.path.join(project_paths.project_dir, f"{label}.2bit"))
cmd = f"{executables.fa_to_two_bit} {genome_seq_file} {genome_seq_path}"
call_two_bit_to_fa_subprocess(cmd, genome_seq_file)
two_bit_to_fa_cmd = [executables.fa_to_two_bit, genome_seq_file, seq_dir]
call_convert_format_subprocess(two_bit_to_fa_cmd, genome_seq_file)
to_log(f"Initial fasta file {genome_seq_file} saved to {seq_dir}")

# now need to create chrom.sizes file
chrom_sizes_filename = f"{label}.chrom.sizes"
chrom_sizes_path = os.path.join(project_paths.project_dir, chrom_sizes_filename)

# must be without errors now
two_bit_reader = TwoBitFile(genome_seq_path)
two_bit_reader = TwoBitFile(seq_dir)
twobit_seq_to_size = two_bit_reader.sequence_sizes()

f = open(chrom_sizes_path, "w")
for k, v in twobit_seq_to_size.items():
f.write(f"{k}\t{v}\n")
f.close()

to_log(f"For {genome_id} ({label}) sequence file: {genome_seq_path}; chrom sizes saved to: {chrom_sizes_path}")
to_log(f"For {genome_id} ({label}) sequence file: {seq_dir}; chrom sizes saved to: {chrom_sizes_path}")

if len(invalid_chrom_names) > 0:
to_log(f"Warning! Genome sequence file {genome_seq_file}")
Expand Down
4 changes: 4 additions & 0 deletions steps_implementations/partition.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from modules.project_paths import ProjectPaths
from modules.step_executables import StepExecutables
from modules.common import GenomicRegion
from modules.error_classes import PipelineFileNotFoundError


def create_partition(chrom_sizes, chunk_size, overlap):
Expand Down Expand Up @@ -60,6 +61,9 @@ def do_partition_for_genome(genome_label: str,
else:
partition_file_path = project_paths.query_partitions

if not os.path.isfile(seq_dir):
err_msg = f"Error! Could not find genome sequences file {seq_dir}. Please contact developers."
raise PipelineFileNotFoundError(err_msg)
# Read chromosome sizes
chrom_sizes = read_chrom_sizes(seq_len_file)

Expand Down
2 changes: 1 addition & 1 deletion version.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ def to_string(self):
return self.version_repr


__version__ = Version(2, 0, 5)
__version__ = Version(2, 0, 6)

if __name__ == "__main__":
print(f"Make Lastz Chains Version: {__version__}")
Expand Down

0 comments on commit 9f16d8b

Please sign in to comment.