Commit eb1c27a
make genome and sample separately have their intermediate folders
chunyuma committed Oct 19, 2023
1 parent 773107f commit eb1c27a
Showing 3 changed files with 28 additions and 21 deletions.
2 changes: 1 addition & 1 deletion make_training_data_from_sketches.py
@@ -95,7 +95,7 @@
 json_file_path = os.path.join(outdir, f'{prefix}_config.json')
 json.dump({'manifest_file_path': manifest_file_path,
 'rep_remove_df_path': rep_remove_df_path,
-'pathogen_detection_intermediate_files_dir': path_to_temp_dir,
+'intermediate_files_dir': path_to_temp_dir,
 'scale': scale,
 'ksize': ksize,
 'ani_thresh': ani_thresh}, open(json_file_path, 'w'), indent=4)
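
For context, only the key name changes here; the value still points at the intermediate directory produced by the training step. A minimal sketch (not part of the commit) of what a generated <prefix>_config.json would then contain; every path and parameter value below is made up for illustration:

    # Sketch of the config written by make_training_data_from_sketches.py
    # after this change; all values are hypothetical.
    import json

    config = {
        "manifest_file_path": "outdir/demo_manifest.tsv",                   # hypothetical
        "rep_remove_df_path": "outdir/demo_rep_to_corr_orgas_mapping.tsv",  # hypothetical
        "intermediate_files_dir": "outdir/demo_intermediate_files",         # renamed key
        "scale": 1000,
        "ksize": 31,
        "ani_thresh": 0.95,
    }
    print(json.dumps(config, indent=4))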
4 changes: 2 additions & 2 deletions run_YACHT.py
@@ -52,7 +52,7 @@
 # load the config file, ksize, and ani_thresh
 config = json.load(open(json_file_path, 'r'))
 manifest_file_path = config['manifest_file_path']
-path_to_temp_dir = config['pathogen_detection_intermediate_files_dir']
+path_to_genome_temp_dir = config['intermediate_files_dir']
 scale = config['scale']
 ksize = config['ksize']
 ani_thresh = config['ani_thresh']
@@ -100,7 +100,7 @@
 else:
     has_raw = True

-manifest_list = hr.hypothesis_recovery(manifest, sample_info_set, path_to_temp_dir, min_coverage_list, scale, ksize, significance, ani_thresh, num_threads)
+manifest_list = hr.hypothesis_recovery(manifest, sample_info_set, path_to_genome_temp_dir, min_coverage_list, scale, ksize, significance, ani_thresh, num_threads)

 # remove unnecessary columns
 remove_cols = ['md5sum', 'sample_scale_factor']
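A compatibility note, sketched below: run_YACHT.py reads the renamed key directly, so a config file written by a training run from before this commit (which still carries 'pathogen_detection_intermediate_files_dir') would fail with a KeyError and needs to be regenerated. The config path in the sketch is hypothetical.

    # Consuming side after this change (simplified; hypothetical path).
    import json

    config = json.load(open("outdir/demo_config.json", "r"))
    # KeyError for configs written before this commit, which used the key
    # 'pathogen_detection_intermediate_files_dir':
    path_to_genome_temp_dir = config["intermediate_files_dir"]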
43 changes: 25 additions & 18 deletions srcs/hypothesis_recovery_src.py
@@ -16,7 +16,7 @@
 logger.remove()
 logger.add(sys.stdout, format="{time:YYYY-MM-DD HH:mm:ss} - {level} - {message}", level="INFO")

-def get_organisms_with_nonzero_overlap(manifest: pd.DataFrame, sample_file: str, scale: int, ksize: int, num_threads: int, path_to_temp_dir: str) -> List[str]:
+def get_organisms_with_nonzero_overlap(manifest: pd.DataFrame, sample_file: str, scale: int, ksize: int, num_threads: int, path_to_genome_temp_dir: str, path_to_sample_temp_dir: str) -> List[str]:
 """
 This function runs the sourmash multisearch to find the organisms that have non-zero overlap with the sample.
 :param manifest: a dataframe with the following columns:
@@ -33,39 +33,39 @@ def get_organisms_with_nonzero_overlap(manifest: pd.DataFrame, sample_file: str,
 :param scale: int (scale factor)
 :param ksize: string (size of kmer)
 :param num_threads: int (number of threads to use for parallelization)
-:param path_to_temp_dir: string (path to the temporary directory generated by the training step)
+:param path_to_genome_temp_dir: string (path to the genome temporary directory generated by the training step)
+:param path_to_sample_temp_dir: string (path to the sample temporary directory)
 :return: a list of organism names that have non-zero overlap with the sample
 """
 # run the sourmash multisearch
 # prepare the input files for the sourmash multisearch
 # unzip the sourmash signature file to the temporary directory
 logger.info("Unzipping the sample signature zip file")
-sample_dir = os.path.dirname(sample_file)
 with zipfile.ZipFile(sample_file, 'r') as sample_zip_file:
-    sample_zip_file.extractall(sample_dir)
+    sample_zip_file.extractall(path_to_sample_temp_dir)

-sample_sig_file = pd.DataFrame([os.path.join(sample_dir, 'signatures', sig_file) for sig_file in os.listdir(os.path.join(sample_dir, 'signatures'))])
-sample_sig_file_path = os.path.join(path_to_temp_dir, 'sample_sig_file.txt')
+sample_sig_file = pd.DataFrame([os.path.join(path_to_sample_temp_dir, 'signatures', sig_file) for sig_file in os.listdir(os.path.join(path_to_sample_temp_dir, 'signatures'))])
+sample_sig_file_path = os.path.join(path_to_sample_temp_dir, 'sample_sig_file.txt')
 sample_sig_file.to_csv(sample_sig_file_path, header=False, index=False)

-organism_sig_file = pd.DataFrame([os.path.join(path_to_temp_dir, 'signatures', md5sum+'.sig.gz') for md5sum in manifest['md5sum']])
-organism_sig_file_path = os.path.join(path_to_temp_dir, 'organism_sig_file.txt')
+organism_sig_file = pd.DataFrame([os.path.join(path_to_genome_temp_dir, 'signatures', md5sum+'.sig.gz') for md5sum in manifest['md5sum']])
+organism_sig_file_path = os.path.join(path_to_sample_temp_dir, 'organism_sig_file.txt')
 organism_sig_file.to_csv(organism_sig_file_path, header=False, index=False)

 # run the sourmash multisearch
-cmd = f"sourmash scripts multisearch {sample_sig_file_path} {organism_sig_file_path} -s {scale} -k {ksize} -c {num_threads} -t 0 -o {os.path.join(path_to_temp_dir, 'sample_multisearch_result.csv')}"
+cmd = f"sourmash scripts multisearch {sample_sig_file_path} {organism_sig_file_path} -s {scale} -k {ksize} -c {num_threads} -t 0 -o {os.path.join(path_to_sample_temp_dir, 'sample_multisearch_result.csv')}"
 logger.info(f"Running sourmash multisearch with command: {cmd}")
 exit_code = os.system(cmd)
 if exit_code != 0:
     raise ValueError(f"Error running sourmash multisearch with command: {cmd}")

 # read the multisearch result
-multisearch_result = pd.read_csv(os.path.join(path_to_temp_dir, 'sample_multisearch_result.csv'), sep=',', header=0)
+multisearch_result = pd.read_csv(os.path.join(path_to_sample_temp_dir, 'sample_multisearch_result.csv'), sep=',', header=0)
 multisearch_result = multisearch_result.drop_duplicates().reset_index(drop=True)

 return multisearch_result['match_name'].to_list()
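
To make the new division of labor concrete, here is roughly how this function's inputs and outputs land on disk for a hypothetical sample: only the genome signature list points into the training directory, while everything the search itself reads from a list or writes goes to the per-sample directory. All paths and parameter values below are illustrative, and the command is just the f-string above rendered with those values.

    # Illustrative layout only; every path below is made up.
    #
    # genome side (created once by the training step):
    #   outdir/demo_intermediate_files/signatures/<md5sum>.sig.gz
    #
    # sample side (created per run_YACHT.py invocation):
    #   data/sample_sample1_intermediate_files/sample_sig_file.txt
    #   data/sample_sample1_intermediate_files/organism_sig_file.txt
    #   data/sample_sample1_intermediate_files/sample_multisearch_result.csv
    #
    # with scale=1000, ksize=31, num_threads=32 the rendered command would be roughly:
    #   sourmash scripts multisearch \
    #       data/sample_sample1_intermediate_files/sample_sig_file.txt \
    #       data/sample_sample1_intermediate_files/organism_sig_file.txt \
    #       -s 1000 -k 31 -c 32 -t 0 \
    #       -o data/sample_sample1_intermediate_files/sample_multisearch_result.csv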

-def get_exclusive_hashes(manifest: pd.DataFrame, nontrivial_organism_names: List[str], sample_sig: sourmash.SourmashSignature, ksize: int, path_to_temp_dir: str) -> Tuple[List[Tuple[int, int]], pd.DataFrame]:
+def get_exclusive_hashes(manifest: pd.DataFrame, nontrivial_organism_names: List[str], sample_sig: sourmash.SourmashSignature, ksize: int, path_to_genome_temp_dir: str) -> Tuple[List[Tuple[int, int]], pd.DataFrame]:
 """
 This function gets the unique hashes exclusive to each of the organisms that have non-zero overlap with the sample, and
 then finds how many are in the sample.
@@ -83,7 +83,7 @@ def get_exclusive_hashes(manifest: pd.DataFrame, nontrivial_organism_names: List
 :param sample_sig: the sample signature
 :param ksize: int (size of kmer)
 :param num_threads: int (number of threads to use for parallelization)
-:param path_to_temp_dir: string (path to the temporary directory generated by the training step)
+:param path_to_genome_temp_dir: string (path to the genome temporary directory generated by the training step)
 :return:
     a list of tuples, each tuple contains the following information:
     1. the number of unique hashes exclusive to the organism under consideration
@@ -103,7 +103,7 @@ def __find_exclusive_hashes(md5sum, path_to_temp_dir, ksize, single_occurrence_h
 single_occurrence_hashes = set()
 multiple_occurrence_hashes = set()
 for md5sum in tqdm(organism_md5sum_list):
-    sig = load_signature_with_ksize(os.path.join(path_to_temp_dir, 'signatures', md5sum+'.sig.gz'), ksize)
+    sig = load_signature_with_ksize(os.path.join(path_to_genome_temp_dir, 'signatures', md5sum+'.sig.gz'), ksize)
     for hash in sig.minhash.hashes:
         if hash in multiple_occurrence_hashes:
             continue
@@ -118,7 +118,7 @@ def __find_exclusive_hashes(md5sum, path_to_temp_dir, ksize, single_occurrence_h
 logger.info("Finding hashes that are unique to each organism")
 exclusive_hashes_org = []
 for md5sum in tqdm(organism_md5sum_list, desc='Finding exclusive hashes'):
-    exclusive_hashes_org.append(__find_exclusive_hashes(md5sum, path_to_temp_dir, ksize, single_occurrence_hashes))
+    exclusive_hashes_org.append(__find_exclusive_hashes(md5sum, path_to_genome_temp_dir, ksize, single_occurrence_hashes))
 del single_occurrence_hashes  # free up memory

 # Get sample hashes
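
The single/multiple-occurrence bookkeeping in this hunk is the heart of the exclusivity computation, and only its source directory changes here. A self-contained sketch of the same pattern on toy hash sets (the real code loads each organism's hashes from <genome temp dir>/signatures/<md5sum>.sig.gz, so the dictionary below is made up):

    # Toy reproduction of the exclusive-hash bookkeeping; org_hashes is fake data.
    org_hashes = {
        "org_a": {1, 2, 3},
        "org_b": {3, 4},
        "org_c": {5},
    }

    single, multiple = set(), set()
    for hashes in org_hashes.values():
        for h in hashes:
            if h in multiple:
                continue          # already known to be shared
            if h in single:
                single.remove(h)  # second sighting: demote to shared
                multiple.add(h)
            else:
                single.add(h)     # first sighting

    # hashes unique to exactly one organism
    exclusive = {name: hashes & single for name, hashes in org_hashes.items()}
    # exclusive == {'org_a': {1, 2}, 'org_b': {4}, 'org_c': {5}}

A hash survives in single only if exactly one organism contains it, so intersecting each organism's hash set with single yields that organism's exclusive hashes.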
@@ -214,7 +214,7 @@ def single_hyp_test(
 def hypothesis_recovery(
     manifest: pd.DataFrame,
     sample_info_set: Tuple[str, sourmash.SourmashSignature],
-    path_to_temp_dir: str,
+    path_to_genome_temp_dir: str,
     min_coverage_list: List[float],
     scale: int,
     ksize: int,
@@ -237,7 +237,7 @@ def hypothesis_recovery(
     'sample_scale_factor',
     'min_coverage'
 :param sample_info_set: a set of information about the sample, including the sample signature location and the sample signature object
-:param path_to_temp_dir: path to the temporary directory generated by the training step
+:param path_to_genome_temp_dir: path to the genome temporary directory generated by the training step
 :param min_coverage_list: a list of minimum coverage values
 :param scale: scale factor
 :param ksize: k-mer size
@@ -250,11 +250,18 @@ def hypothesis_recovery(
 # unpack the sample info set
 sample_file, sample_sig = sample_info_set

+# create a temporary directory for the sample
+sample_dir = os.path.dirname(sample_file)
+sample_name = os.path.basename(sample_file).replace('.sig.zip', '')
+path_to_sample_temp_dir = os.path.join(sample_dir, f'sample_{sample_name}_intermediate_files')
+if not os.path.exists(path_to_sample_temp_dir):
+    os.makedirs(path_to_sample_temp_dir)
+
 # Find the organisms that have non-zero overlap with the sample
-nontrivial_organism_names = get_organisms_with_nonzero_overlap(manifest, sample_file, scale, ksize, num_threads, path_to_temp_dir)
+nontrivial_organism_names = get_organisms_with_nonzero_overlap(manifest, sample_file, scale, ksize, num_threads, path_to_genome_temp_dir, path_to_sample_temp_dir)

 # Get the unique hashes exclusive to each of the organisms that have non-zero overlap with the sample
-exclusive_hashes_info, manifest = get_exclusive_hashes(manifest, nontrivial_organism_names, sample_sig, ksize, path_to_temp_dir)
+exclusive_hashes_info, manifest = get_exclusive_hashes(manifest, nontrivial_organism_names, sample_sig, ksize, path_to_genome_temp_dir)

 # Set up the results dataframe columns
 given_columns = [
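The per-sample directory is derived from the sample signature's own location, so runs against the same training data no longer write their multisearch scratch files into one shared genome directory. A sketch of the naming scheme introduced in hypothesis_recovery(), using a hypothetical sample path:

    # Sketch of the per-sample directory naming; the sample path is made up.
    import os

    sample_file = "data/runs/patient42.sig.zip"  # hypothetical
    sample_dir = os.path.dirname(sample_file)
    sample_name = os.path.basename(sample_file).replace(".sig.zip", "")
    path_to_sample_temp_dir = os.path.join(sample_dir, f"sample_{sample_name}_intermediate_files")
    print(path_to_sample_temp_dir)  # -> data/runs/sample_patient42_intermediate_files
    os.makedirs(path_to_sample_temp_dir, exist_ok=True)  # exist_ok mirrors the os.path.exists guard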
