Merge branch 'main' into YAC-46-tests

KoslickiLab · Oct 23, 2023 · 5ce4847 · 5ce4847
2 parents 6a05aeb + fa4762d
commit 5ce4847
Show file tree

Hide file tree

Showing 2 changed files with 18 additions and 11 deletions.
diff --git a/srcs/hypothesis_recovery_src.py b/srcs/hypothesis_recovery_src.py
@@ -254,7 +254,11 @@ def hypothesis_recovery(
     sample_dir = os.path.dirname(sample_file)
     sample_name = os.path.basename(sample_file).replace('.sig.zip', '')
     path_to_sample_temp_dir = os.path.join(sample_dir, f'sample_{sample_name}_intermediate_files')
-    if not os.path.exists(path_to_sample_temp_dir):
+    if os.path.exists(path_to_sample_temp_dir):
+        # if exists, remove it
+        logger.info(f"Removing existing temporary directory: {path_to_sample_temp_dir}")
+        os.system(f'rm -rf {path_to_sample_temp_dir}')
+    else:
         os.makedirs(path_to_sample_temp_dir)
 
     # Find the organisms that have non-zero overlap with the sample

diff --git a/srcs/utils.py b/srcs/utils.py
@@ -85,9 +85,8 @@ def run_multisearch(num_threads: int, ani_thresh: float, ksize: int, scale: int,
     :param ksize: int (size of kmer)
     :param scale: int (scale factor)
     :param path_to_temp_dir: string (path to the folder to store the intermediate files)
-    :return: a dictionary mapping signature name to a list of its close related genomes (ANI > ani_thresh)
+    :return: a dataframe with symmetric pairwise multisearch result (query_name, match_name)
     """
-    results = {}
 
     # run the sourmash multisearch
     # save signature files to a text file
@@ -96,7 +95,7 @@ def run_multisearch(num_threads: int, ani_thresh: float, ksize: int, scale: int,
     sig_files.to_csv(sig_files_path, header=False, index=False)
 
     # convert ani threshold to containment threshold
-    containment_thresh = 0.9*(ani_thresh ** ksize)
+    containment_thresh = (ani_thresh ** ksize)
     cmd = f"sourmash scripts multisearch {sig_files_path} {sig_files_path} -k {ksize} -s {scale} -c {num_threads} -t {containment_thresh} -o {os.path.join(path_to_temp_dir, 'training_multisearch_result.csv')}"
     logger.info(f"Running sourmash multisearch with command: {cmd}")
     exit_code = os.system(cmd)
@@ -105,16 +104,20 @@ def run_multisearch(num_threads: int, ani_thresh: float, ksize: int, scale: int,
 
     # read the multisearch result
     multisearch_result = pd.read_csv(os.path.join(path_to_temp_dir, 'training_multisearch_result.csv'), sep=',', header=0)
-    multisearch_result = multisearch_result.drop_duplicates().reset_index(drop=True)
     multisearch_result = multisearch_result.query('query_name != match_name').reset_index(drop=True)
+
+    # because the multisearch result is not symmetric, that is
+    # we have: A B score but not B A score
+    # we need to make it symmetric
+    A_TO_B = multisearch_result[['query_name','match_name']].drop_duplicates().reset_index(drop=True)
+    B_TO_A = A_TO_B[['match_name','query_name']].rename(columns={'match_name':'query_name','query_name':'match_name'})
+    multisearch_result = pd.concat([A_TO_B, B_TO_A]).drop_duplicates().reset_index(drop=True)
 
-    for query_name, match_name in tqdm(multisearch_result[['query_name', 'match_name']].to_numpy()):
-        if str(query_name) not in results:
-            results[str(query_name)] = [str(match_name)]
-        else:
-            results[str(query_name)].append(str(match_name))
+    # change column type to string
+    multisearch_result['query_name'] = multisearch_result['query_name'].astype(str)
+    multisearch_result['match_name'] = multisearch_result['match_name'].astype(str)
 
-    return results
+    return multisearch_result
 
 def remove_corr_organisms_from_ref(sig_info_dict: Dict[str, Tuple[str, float, int, int]], sig_same_genoms_dict: Dict[str, List[str]]) -> Tuple[Dict[str, List[str]], pd.DataFrame]:
     """