Using multiple REAL ID prefixes

swansonk14 · Sep 8, 2022 · 2d60445
1 parent 4e7cefe
commit 2d60445
Show file tree

Hide file tree

Showing 2 changed files with 28 additions and 27 deletions.
diff --git a/README.md b/README.md
@@ -89,8 +89,6 @@ python map_real_reactions_to_reagents.py \
 
 Total number of molecules = 31,507,987,117
 
-TODO: in the future, could also try to determine valid reaction sites
-
 
 ### Count REAL Reactions
 
@@ -543,16 +541,13 @@ done
 
 Map generated molecules to REAL IDs in the format expected by Enamine.
 
-TODO: How to select between "M" and "S" type for ID.
-
 ```bash
 #!/bin/bash
 
 for NAME in mcts_AB_combined_RF_rdkit mcts_AB_combined_chemprop mcts_AB_combined_chemprop_rdkit random
 do
 python map_generated_molecules_to_real_ids.py \
     --data_path generations/${NAME}/molecules_train_sim_below_0.5_chembl_sim_below_0.5_top_20_percent_selected_50.csv \
-    --smiles_save_path generations/${NAME}/molecules_train_sim_below_0.5_chembl_sim_below_0.5_top_20_percent_selected_50_real_ids.csv \
-    --sdf_save_path generations/${NAME}/molecules_train_sim_below_0.5_chembl_sim_below_0.5_top_20_percent_selected_50_real_ids.sdf
+    --save_dir generations/${NAME}/molecules_train_sim_below_0.5_chembl_sim_below_0.5_top_20_percent_selected_50_real_ids
 done
 ```
diff --git a/map_generated_molecules_to_real_ids.py b/map_generated_molecules_to_real_ids.py
@@ -9,12 +9,10 @@
 
 class Args(Tap):
     data_path: Path  # Path to CSV file containing generated molecules with reaction and reagent IDs.
-    smiles_save_path: Path  # Path to CSV file where molecule SMILES and REAL IDs will be saved.
-    sdf_save_path: Path  # Path to SDF file where molecules and REAL IDs will be saved.
+    save_dir: Path  # Path to directory where CSV and SDF files with REAL IDs will be saved.
 
     def process_args(self) -> None:
-        self.smiles_save_path.parent.mkdir(parents=True, exist_ok=True)
-        self.sdf_save_path.parent.mkdir(parents=True, exist_ok=True)
+        self.save_dir.mkdir(parents=True, exist_ok=True)
 
 
 def map_generated_molecules_to_real_ids(args: Args) -> None:
@@ -32,23 +30,31 @@ def map_generated_molecules_to_real_ids(args: Args) -> None:
         column for column in data.columns if column.startswith('reagent_1_') and column.endswith('_id')
     )
 
-    # Create new DataFrame with SMILES and REAL IDs
-    real_data = pd.DataFrame(data=[
-        {
-            'real_id': f'm_{row["reaction_1_id"]}____'
-                       f'{"____".join(str(int(reagent_id)) for reagent_id in row[reagent_columns].dropna())}',
-            'smiles': row['smiles'],
-            'mol': Chem.MolFromSmiles(row['smiles'])
-        } for _, row in data.iterrows()
-    ])
-
-    # Save data as SDF
-    with open(args.sdf_save_path, 'w') as f:
-        WriteSDF(real_data, f, molColName='mol', idName='real_id')
-
-    # Save data as CSV
-    del real_data['mol']
-    real_data.to_csv(args.smiles_save_path, index=False)
+    # Compute REAL IDs without prefixes
+    real_ids = [
+        f'{row["reaction_1_id"]}____{"____".join(str(int(reagent_id)) for reagent_id in row[reagent_columns].dropna())}'
+        for _, row in data.iterrows()
+    ]
+
+    # Compute mols
+    mols = [Chem.MolFromSmiles(smiles) for smiles in data['smiles']]
+
+    # Loop through prefixes
+    for prefix in ['m', 's']:
+        # Create new DataFrame with molecules and REAL IDs
+        real_data = pd.DataFrame(data={
+            'real_id': [f'{prefix}_{real_id}' for real_id in real_ids],
+            'smiles': data['smiles'],
+            'mol': mols
+        })
+
+        # Save data as SDF
+        with open(args.save_dir / f'type_{prefix}.sdf', 'w') as f:
+            WriteSDF(real_data, f, molColName='mol', idName='real_id')
+
+        # Save data as CSV
+        del real_data['mol']
+        real_data.to_csv(args.save_dir / f'type_{prefix}.csv', index=False)
 
 
 if __name__ == '__main__':