From eb1c27a736ce98c43f14e85169398c097e2eaf53 Mon Sep 17 00:00:00 2001
From: Chunyu Ma
Date: Thu, 19 Oct 2023 03:02:05 -0400
Subject: [PATCH] make genome and sample separately have their intermediate folders

---
 make_training_data_from_sketches.py |  2 +-
 run_YACHT.py                        |  4 +--
 srcs/hypothesis_recovery_src.py     | 43 +++++++++++++++++------------
 3 files changed, 28 insertions(+), 21 deletions(-)

diff --git a/make_training_data_from_sketches.py b/make_training_data_from_sketches.py
index a77c068..190b021 100644
--- a/make_training_data_from_sketches.py
+++ b/make_training_data_from_sketches.py
@@ -95,7 +95,7 @@
     json_file_path = os.path.join(outdir, f'{prefix}_config.json')
     json.dump({'manifest_file_path': manifest_file_path,
                'rep_remove_df_path': rep_remove_df_path,
-               'pathogen_detection_intermediate_files_dir': path_to_temp_dir,
+               'intermediate_files_dir': path_to_temp_dir,
                'scale': scale,
                'ksize': ksize,
                'ani_thresh': ani_thresh}, open(json_file_path, 'w'), indent=4)
diff --git a/run_YACHT.py b/run_YACHT.py
index f109171..b37c86b 100644
--- a/run_YACHT.py
+++ b/run_YACHT.py
@@ -52,7 +52,7 @@
     # load the config file, ksize, and ani_thresh
     config = json.load(open(json_file_path, 'r'))
     manifest_file_path = config['manifest_file_path']
-    path_to_temp_dir = config['pathogen_detection_intermediate_files_dir']
+    path_to_genome_temp_dir = config['intermediate_files_dir']
     scale = config['scale']
     ksize = config['ksize']
     ani_thresh = config['ani_thresh']
@@ -100,7 +100,7 @@
     else:
         has_raw = True

-    manifest_list = hr.hypothesis_recovery(manifest, sample_info_set, path_to_temp_dir, min_coverage_list, scale, ksize, significance, ani_thresh, num_threads)
+    manifest_list = hr.hypothesis_recovery(manifest, sample_info_set, path_to_genome_temp_dir, min_coverage_list, scale, ksize, significance, ani_thresh, num_threads)

     # remove unnecessary columns
     remove_cols = ['md5sum', 'sample_scale_factor']
diff --git a/srcs/hypothesis_recovery_src.py b/srcs/hypothesis_recovery_src.py
index 77e7df0..2ef2bcb 100644
--- a/srcs/hypothesis_recovery_src.py
+++ b/srcs/hypothesis_recovery_src.py
@@ -16,7 +16,7 @@
 logger.remove()
 logger.add(sys.stdout, format="{time:YYYY-MM-DD HH:mm:ss} - {level} - {message}", level="INFO")

-def get_organisms_with_nonzero_overlap(manifest: pd.DataFrame, sample_file: str, scale: int, ksize: int, num_threads: int, path_to_temp_dir: str) -> List[str]:
+def get_organisms_with_nonzero_overlap(manifest: pd.DataFrame, sample_file: str, scale: int, ksize: int, num_threads: int, path_to_genome_temp_dir: str, path_to_sample_temp_dir: str) -> List[str]:
     """
     This function runs the sourmash multisearch to find the organisms that have non-zero overlap with the sample.
     :param manifest:
@@ -33,39 +33,39 @@ def get_organisms_with_nonzero_overlap(manifest: pd.DataFrame, sample_file: str,
     :param scale: int (scale factor)
     :param ksize: int (size of kmer)
     :param num_threads: int (number of threads to use for parallelization)
-    :param path_to_temp_dir: string (path to the temporary directory generated by the training step)
+    :param path_to_genome_temp_dir: string (path to the genome temporary directory generated by the training step)
+    :param path_to_sample_temp_dir: string (path to the sample temporary directory)
     :return: a list of organism names that have non-zero overlap with the sample
     """
     # run the sourmash multisearch
     # prepare the input files for the sourmash multisearch
     # unzip the sourmash signature file to the temporary directory
     logger.info("Unzipping the sample signature zip file")
-    sample_dir = os.path.dirname(sample_file)
     with zipfile.ZipFile(sample_file, 'r') as sample_zip_file:
-        sample_zip_file.extractall(sample_dir)
+        sample_zip_file.extractall(path_to_sample_temp_dir)

-    sample_sig_file = pd.DataFrame([os.path.join(sample_dir, 'signatures', sig_file) for sig_file in os.listdir(os.path.join(sample_dir, 'signatures'))])
-    sample_sig_file_path = os.path.join(path_to_temp_dir, 'sample_sig_file.txt')
+    sample_sig_file = pd.DataFrame([os.path.join(path_to_sample_temp_dir, 'signatures', sig_file) for sig_file in os.listdir(os.path.join(path_to_sample_temp_dir, 'signatures'))])
+    sample_sig_file_path = os.path.join(path_to_sample_temp_dir, 'sample_sig_file.txt')
     sample_sig_file.to_csv(sample_sig_file_path, header=False, index=False)

-    organism_sig_file = pd.DataFrame([os.path.join(path_to_temp_dir, 'signatures', md5sum+'.sig.gz') for md5sum in manifest['md5sum']])
-    organism_sig_file_path = os.path.join(path_to_temp_dir, 'organism_sig_file.txt')
+    organism_sig_file = pd.DataFrame([os.path.join(path_to_genome_temp_dir, 'signatures', md5sum+'.sig.gz') for md5sum in manifest['md5sum']])
+    organism_sig_file_path = os.path.join(path_to_sample_temp_dir, 'organism_sig_file.txt')
     organism_sig_file.to_csv(organism_sig_file_path, header=False, index=False)

     # run the sourmash multisearch
-    cmd = f"sourmash scripts multisearch {sample_sig_file_path} {organism_sig_file_path} -s {scale} -k {ksize} -c {num_threads} -t 0 -o {os.path.join(path_to_temp_dir, 'sample_multisearch_result.csv')}"
+    cmd = f"sourmash scripts multisearch {sample_sig_file_path} {organism_sig_file_path} -s {scale} -k {ksize} -c {num_threads} -t 0 -o {os.path.join(path_to_sample_temp_dir, 'sample_multisearch_result.csv')}"
     logger.info(f"Running sourmash multisearch with command: {cmd}")
     exit_code = os.system(cmd)
     if exit_code != 0:
         raise ValueError(f"Error running sourmash multisearch with command: {cmd}")

     # read the multisearch result
-    multisearch_result = pd.read_csv(os.path.join(path_to_temp_dir, 'sample_multisearch_result.csv'), sep=',', header=0)
+    multisearch_result = pd.read_csv(os.path.join(path_to_sample_temp_dir, 'sample_multisearch_result.csv'), sep=',', header=0)
     multisearch_result = multisearch_result.drop_duplicates().reset_index(drop=True)

     return multisearch_result['match_name'].to_list()

-def get_exclusive_hashes(manifest: pd.DataFrame, nontrivial_organism_names: List[str], sample_sig: sourmash.SourmashSignature, ksize: int, path_to_temp_dir: str) -> Tuple[List[Tuple[int, int]], pd.DataFrame]:
+def get_exclusive_hashes(manifest: pd.DataFrame, nontrivial_organism_names: List[str], sample_sig: sourmash.SourmashSignature, ksize: int, path_to_genome_temp_dir: str) -> Tuple[List[Tuple[int, int]], pd.DataFrame]:
     """
     This function gets the unique hashes exclusive to each of the organisms that have non-zero overlap with the sample, and then finds how many are in the sample.
@@ -83,7 +83,7 @@ def get_exclusive_hashes(manifest: pd.DataFrame, nontrivial_organism_names: List
     :param sample_sig: the sample signature
     :param ksize: int (size of kmer)
     :param num_threads: int (number of threads to use for parallelization)
-    :param path_to_temp_dir: string (path to the temporary directory generated by the training step)
+    :param path_to_genome_temp_dir: string (path to the genome temporary directory generated by the training step)
     :return: a list of tuples, each tuple contains the following information:
              1. the number of unique hashes exclusive to the organism under consideration
@@ -103,7 +103,7 @@ def __find_exclusive_hashes(md5sum, path_to_temp_dir, ksize, single_occurrence_h
     single_occurrence_hashes = set()
     multiple_occurrence_hashes = set()
     for md5sum in tqdm(organism_md5sum_list):
-        sig = load_signature_with_ksize(os.path.join(path_to_temp_dir, 'signatures', md5sum+'.sig.gz'), ksize)
+        sig = load_signature_with_ksize(os.path.join(path_to_genome_temp_dir, 'signatures', md5sum+'.sig.gz'), ksize)
         for hash in sig.minhash.hashes:
             if hash in multiple_occurrence_hashes:
                 continue
@@ -118,7 +118,7 @@ def __find_exclusive_hashes(md5sum, path_to_temp_dir, ksize, single_occurrence_h
     logger.info("Finding hashes that are unique to each organism")
     exclusive_hashes_org = []
     for md5sum in tqdm(organism_md5sum_list, desc='Finding exclusive hashes'):
-        exclusive_hashes_org.append(__find_exclusive_hashes(md5sum, path_to_temp_dir, ksize, single_occurrence_hashes))
+        exclusive_hashes_org.append(__find_exclusive_hashes(md5sum, path_to_genome_temp_dir, ksize, single_occurrence_hashes))
     del single_occurrence_hashes  # free up memory

     # Get sample hashes
@@ -214,7 +214,7 @@ def single_hyp_test(
 def hypothesis_recovery(
     manifest: pd.DataFrame,
     sample_info_set: Tuple[str, sourmash.SourmashSignature],
-    path_to_temp_dir: str,
+    path_to_genome_temp_dir: str,
     min_coverage_list: List[float],
     scale: int,
     ksize: int,
@@ -237,7 +237,7 @@ def hypothesis_recovery(
                      'sample_scale_factor', 'min_coverage'
     :param sample_info_set: a set of information about the sample, including the sample signature location
                             and the sample signature object
-    :param path_to_temp_dir: path to the temporary directory generated by the training step
+    :param path_to_genome_temp_dir: path to the genome temporary directory generated by the training step
     :param min_coverage_list: a list of minimum coverage values
     :param scale: scale factor
     :param ksize: k-mer size
@@ -250,11 +250,18 @@ def hypothesis_recovery(
     # unpack the sample info set
     sample_file, sample_sig = sample_info_set

+    # create a temporary directory for the sample
+    sample_dir = os.path.dirname(sample_file)
+    sample_name = os.path.basename(sample_file).replace('.sig.zip', '')
+    path_to_sample_temp_dir = os.path.join(sample_dir, f'sample_{sample_name}_intermediate_files')
+    if not os.path.exists(path_to_sample_temp_dir):
+        os.makedirs(path_to_sample_temp_dir)
+
     # Find the organisms that have non-zero overlap with the sample
-    nontrivial_organism_names = get_organisms_with_nonzero_overlap(manifest, sample_file, scale, ksize, num_threads, path_to_temp_dir)
+    nontrivial_organism_names = get_organisms_with_nonzero_overlap(manifest, sample_file, scale, ksize, num_threads, path_to_genome_temp_dir, path_to_sample_temp_dir)

     # Get the unique hashes exclusive to each of the organisms that have non-zero overlap with the sample
-    exclusive_hashes_info, manifest = get_exclusive_hashes(manifest, nontrivial_organism_names, sample_sig, ksize, path_to_temp_dir)
+    exclusive_hashes_info, manifest = get_exclusive_hashes(manifest, nontrivial_organism_names, sample_sig, ksize, path_to_genome_temp_dir)

     # Set up the results dataframe columns
     given_columns = [
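
A note on the config change in make_training_data_from_sketches.py: the training step now stores the genome intermediate directory under the generic key 'intermediate_files_dir', and run_YACHT.py reads it back into path_to_genome_temp_dir. A minimal sketch of that round trip (the outdir and prefix values here are illustrative, not the trainer's actual defaults):

    import json
    import os

    # Illustrative training output location; the real values come from CLI arguments.
    outdir = "training_output"
    prefix = "gtdb"
    path_to_temp_dir = os.path.join(outdir, f"{prefix}_intermediate_files")
    os.makedirs(outdir, exist_ok=True)

    # Training side: write the config with the renamed key.
    json_file_path = os.path.join(outdir, f"{prefix}_config.json")
    with open(json_file_path, "w") as fh:
        json.dump({"intermediate_files_dir": path_to_temp_dir}, fh, indent=4)

    # Inference side (run_YACHT.py): the same value now lands in a genome-specific name.
    with open(json_file_path) as fh:
        path_to_genome_temp_dir = json.load(fh)["intermediate_files_dir"]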
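
In get_organisms_with_nonzero_overlap, everything sample-specific (the unzipped signatures, both path-list files, and sample_multisearch_result.csv) now lands in path_to_sample_temp_dir, while the genome signatures stay under the training directory; only the multisearch command bridges the two. The patch shells out via os.system; purely as an alternative sketch (not what the patch itself does), the same invocation through subprocess.run avoids shell-quoting issues and surfaces stderr on failure:

    import subprocess

    def run_multisearch(sample_sig_file_path: str, organism_sig_file_path: str,
                        scale: int, ksize: int, num_threads: int, out_csv: str) -> None:
        # Same flags as the patch: -s scale, -k ksize, -c threads, -t 0 (keep any non-zero overlap).
        cmd = ["sourmash", "scripts", "multisearch",
               sample_sig_file_path, organism_sig_file_path,
               "-s", str(scale), "-k", str(ksize), "-c", str(num_threads),
               "-t", "0", "-o", out_csv]
        result = subprocess.run(cmd, capture_output=True, text=True)
        if result.returncode != 0:
            raise ValueError(f"sourmash multisearch failed: {result.stderr.strip()}")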
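
The exclusive-hash pass in get_exclusive_hashes (only partly visible in the hunks above) classifies every hash across all candidate organisms as seen-once or seen-more-than-once, then keeps for each organism only its seen-once hashes. A self-contained sketch of that idea on plain integer sets, with the signature loading stripped out (the real code reads signatures via load_signature_with_ksize):

    from typing import Dict, Set

    def exclusive_hashes(org_hashes: Dict[str, Set[int]]) -> Dict[str, Set[int]]:
        single_occurrence: Set[int] = set()
        multiple_occurrence: Set[int] = set()
        # Pass 1: classify each hash; a hash seen a second time moves to the multiple set.
        for hashes in org_hashes.values():
            for h in hashes:
                if h in multiple_occurrence:
                    continue
                if h in single_occurrence:
                    single_occurrence.remove(h)
                    multiple_occurrence.add(h)
                else:
                    single_occurrence.add(h)
        # Pass 2: an organism's exclusive hashes are those still marked seen-once.
        return {org: hashes & single_occurrence for org, hashes in org_hashes.items()}

    # Hash 1 is shared between both organisms, so only 2 and 3 survive:
    assert exclusive_hashes({"orgA": {1, 2}, "orgB": {1, 3}}) == {"orgA": {2}, "orgB": {3}}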
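
Finally, hypothesis_recovery now derives a per-sample intermediate directory next to the sample's .sig.zip, so sample artifacts no longer mix with the genome training artifacts. The naming logic from the @@ -250 hunk, isolated into a small helper (the helper name and the example path are mine):

    import os

    def make_sample_temp_dir(sample_file: str) -> str:
        # sample_<name>_intermediate_files, placed alongside the input .sig.zip
        sample_dir = os.path.dirname(sample_file)
        sample_name = os.path.basename(sample_file).replace('.sig.zip', '')
        path = os.path.join(sample_dir, f'sample_{sample_name}_intermediate_files')
        os.makedirs(path, exist_ok=True)  # collapses the patch's exists-check + makedirs
        return path

    print(make_sample_temp_dir('reads/SRR123.sig.zip'))
    # -> reads/sample_SRR123_intermediate_files

Because the directory is keyed to the sample file's basename, rerunning the same sample reuses its own intermediates instead of colliding with the outputs of other samples.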