From 43927f143d4920862b7fe275a2fe485e97c4ab72 Mon Sep 17 00:00:00 2001 From: Bogdan Kirilenko Date: Thu, 28 Sep 2023 01:21:14 +0200 Subject: [PATCH] Little rearrangement -> not using separate dir for ChainNet and NetFilterNonNested.perl --- .gitignore | 14 +++++++----- .../NetFilterNonNested.perl | 0 HL_kent_binaries/readme.txt | 4 ++++ TODO.md | 6 ++--- chain_clean_micro_env/readme.txt | 7 ------ install_dependencies.py | 7 +----- make_chains.py | 1 + modules/step_executables.py | 22 ++++--------------- steps_implementations/clean_chain_step.py | 4 ++-- 9 files changed, 24 insertions(+), 41 deletions(-) rename {chain_clean_micro_env => HL_kent_binaries}/NetFilterNonNested.perl (100%) delete mode 100644 chain_clean_micro_env/readme.txt diff --git a/.gitignore b/.gitignore index ade9a56..57951f5 100644 --- a/.gitignore +++ b/.gitignore @@ -17,14 +17,18 @@ modules/__pycache__/* */__pycache__/ # Kent binaries -HL_kent_binaries/pslSortAcc +HL_kent_binaries/axtChain +HL_kent_binaries/axtToPsl HL_kent_binaries/chainAntiRepeat -HL_kent_binaries/chainMergeSort HL_kent_binaries/chainCleaner -HL_kent_binaries/chainSort +HL_kent_binaries/chainFilter +HL_kent_binaries/chainMergeSort +HL_kent_binaries/chainNet HL_kent_binaries/chainScore -chain_clean_micro_env/chainNet -/HL_kent_binaries/chainFilter +HL_kent_binaries/chainSort +HL_kent_binaries/faToTwoBit +HL_kent_binaries/pslSortAcc +HL_kent_binaries/twoBitToFa # to be continued # test data diff --git a/chain_clean_micro_env/NetFilterNonNested.perl b/HL_kent_binaries/NetFilterNonNested.perl similarity index 100% rename from chain_clean_micro_env/NetFilterNonNested.perl rename to HL_kent_binaries/NetFilterNonNested.perl diff --git a/HL_kent_binaries/readme.txt b/HL_kent_binaries/readme.txt index 4e4f7f7..752c805 100755 --- a/HL_kent_binaries/readme.txt +++ b/HL_kent_binaries/readme.txt @@ -1,3 +1,7 @@ Directory to store Kent binaries necessary to run the pipeline, which are not included in the $PATH for some reason, and were 
downloaded using install_dependencies.py script. + +Although NetFilterNonNested.perl is not actually a binary, its only purpose +is to serve as a dependency to chainCleaner, as well as ChainNet, which is not +used directly by the pipeline. diff --git a/TODO.md b/TODO.md index 4fff0f6..f21e4ac 100644 --- a/TODO.md +++ b/TODO.md @@ -17,13 +17,13 @@ ## Nice to do -- Refactor HL kent dependencies -> maybe it was not necessary to split into 2 dirs? +- ~~Refactor HL kent dependencies -> maybe it was not necessary to split into 2 dirs?~~ -> not split anymore - QC module or something - detailed statistics per each step - Explanation for each pipeline parameter in the parse_args +- Document masking, etc. - nuances that affect the pipeline performance. - ~~Refactor chain gap filler: get rid of chainExtractID dependency -> not needed~~ - ~~read parameters from config file~~ -- Document masking, etc. - nuances that affect the pipeline performance. -- https://github.com/hillerlab/make_lastz_chains/issues/20 - temp files location +- ~~https://github.com/hillerlab/make_lastz_chains/issues/20 - temp files location~~ ## Minor things diff --git a/chain_clean_micro_env/readme.txt b/chain_clean_micro_env/readme.txt deleted file mode 100644 index b28ac74..0000000 --- a/chain_clean_micro_env/readme.txt +++ /dev/null @@ -1,7 +0,0 @@ -Files present here are required to be in the $PATH to run chainCleaner: - -x chainNet -x NetFilterNonNested.perl - -Please make sure you placed chainNet binary in this directory. -Yes, it is just a workaround to make chainCleaner work. 
diff --git a/install_dependencies.py b/install_dependencies.py index b6294d9..767f552 100755 --- a/install_dependencies.py +++ b/install_dependencies.py @@ -11,7 +11,6 @@ SCRIPT_LOCATION = os.path.abspath(os.path.dirname(__file__)) DESTINATION_DIR = os.path.join(SCRIPT_LOCATION, "HL_kent_binaries") -CHAIN_NET_DIR = os.path.join(SCRIPT_LOCATION, "chain_clean_micro_env") HG_DOWNLOAD_LINK = "https://hgdownload.cse.ucsc.edu/admin/exe/" # OS related @@ -51,11 +50,7 @@ def process_tool(tool_name): # not found, need to acquire download_link = f"{HG_DOWNLOAD_LINK}/{HG_DOWNLOAD_DIRNAME}/{tool_name}" # destination dir for all binaries necessary to run the pipeline is HL_kent_binaries - # chainNet is only necessary for chainCleaner, and is saved to chain_clean_micro_env - # a directory that serves as temporary extension of the $PATH - # only to run chainCleaner - destination_dir = DESTINATION_DIR if tool_name != "chainNet" else CHAIN_NET_DIR - destination = os.path.join(destination_dir, tool_name) + destination = os.path.join(DESTINATION_DIR, tool_name) if os.path.isfile(destination): # if already in destination directory: just skip it diff --git a/make_chains.py b/make_chains.py index 2b6cd44..1c56481 100755 --- a/make_chains.py +++ b/make_chains.py @@ -173,6 +173,7 @@ def save_final_chain(parameters: PipelineParameters, project_paths: ProjectPaths shutil.move(last_chain_file, project_paths.final_chain) to_log(f"Saved final chains file to {project_paths.final_chain}") + def _del_file_and_log(path): os.remove(path) to_log(f"x {path}") diff --git a/modules/step_executables.py b/modules/step_executables.py index e8a8e4d..621e628 100644 --- a/modules/step_executables.py +++ b/modules/step_executables.py @@ -27,11 +27,9 @@ def __init__(self, root_dir): self.chain_cleaner = self.__find_binary(Constants.ToolNames.CHAIN_CLEANER) self.chain_sort = self.__find_binary(Constants.ToolNames.CHAIN_SORT) self.chain_score = self.__find_binary(Constants.ToolNames.CHAIN_SCORE) - self.lastz = 
self.__find_binary(Constants.ToolNames.LASTZ) + self.chain_net = self.__find_binary(Constants.ToolNames.CHAIN_NET) self.chain_filter = self.__find_binary(Constants.ToolNames.CHAIN_FILTER) - - # ChainNet is special for now - self.chain_net = self.__locate_chain_net(Constants.ToolNames.CHAIN_NET) + self.lastz = self.__find_binary(Constants.ToolNames.LASTZ) self.__check_completeness() @@ -57,16 +55,6 @@ def __find_binary(self, binary_name): to_log(f"* found {binary_name} at {binary_path}") return binary_path - def __locate_chain_net(self, chain_net): - if shutil.which(chain_net): - to_log(f"found {chain_net} in $PATH") - return True - elif os.path.isfile(os.path.join(self.chain_clean_env_dir, chain_net)): - to_log(f"found {chain_net} in {self.chain_clean_env_dir}") - return True - self.not_found.append(chain_net) - return None - def __check_completeness(self): if len(self.not_found) == 0: to_log("All necessary executables found.") @@ -74,10 +62,8 @@ def __check_completeness(self): not_found_bins = "\n".join([f"* {x}" for x in self.not_found]) err_msg = ( f"Error! The following tools not found neither in $PATH nor " - f"in the download dir:\n{not_found_bins}\nPlease note that " - f"chainNet should be placed either in $PATH or in the " - f"{self.chain_clean_env_dir} directory. Other tools are " - f"expected to be either in $PATH or {self.hl_kent_binaries_path}\n" + f"in the download dir:\n{not_found_bins}\n" + f"The tools are expected to be either in $PATH or {self.hl_kent_binaries_path}\n" f"Please use install_dependencies.py to automate the process." 
) raise ExecutableNotFoundError(err_msg) diff --git a/steps_implementations/clean_chain_step.py b/steps_implementations/clean_chain_step.py index 5d9ad95..e4996cf 100644 --- a/steps_implementations/clean_chain_step.py +++ b/steps_implementations/clean_chain_step.py @@ -32,9 +32,9 @@ def do_chains_clean(params: PipelineParameters, _intermediate_chain = f"{_output_chain}__temp" _clean_chain_args = params.clean_chain_parameters.split() - # dirty hack to override chainNet not found error + # some Kent binaries and NetFilterNonNested.perl are necessary to run chainCleaner _temp_env = os.environ.copy() - _temp_env["PATH"] = f"{project_paths.chain_clean_micro_env}:" + _temp_env["PATH"] + _temp_env["PATH"] = f"{project_paths.hl_kent_binaries}:" + _temp_env["PATH"] chain_cleaner_cmd = [ executables.chain_cleaner,