Skip to content

Commit

Permalink
Using multiple REAL ID prefixes
Browse files Browse the repository at this point in the history
  • Loading branch information
swansonk14 committed Sep 8, 2022
1 parent 4e7cefe commit 2d60445
Show file tree
Hide file tree
Showing 2 changed files with 28 additions and 27 deletions.
7 changes: 1 addition & 6 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -89,8 +89,6 @@ python map_real_reactions_to_reagents.py \

Total number of molecules = 31,507,987,117

TODO: In the future, we could also try to determine valid reaction sites.


### Count REAL Reactions

Expand Down Expand Up @@ -543,16 +541,13 @@ done

Map generated molecules to REAL IDs in the format expected by Enamine.

TODO: Decide how to choose between the "M" and "S" prefix types for the REAL ID.

```bash
#!/bin/bash

for NAME in mcts_AB_combined_RF_rdkit mcts_AB_combined_chemprop mcts_AB_combined_chemprop_rdkit random
do
python map_generated_molecules_to_real_ids.py \
--data_path generations/${NAME}/molecules_train_sim_below_0.5_chembl_sim_below_0.5_top_20_percent_selected_50.csv \
--smiles_save_path generations/${NAME}/molecules_train_sim_below_0.5_chembl_sim_below_0.5_top_20_percent_selected_50_real_ids.csv \
--sdf_save_path generations/${NAME}/molecules_train_sim_below_0.5_chembl_sim_below_0.5_top_20_percent_selected_50_real_ids.sdf
--save_dir generations/${NAME}/molecules_train_sim_below_0.5_chembl_sim_below_0.5_top_20_percent_selected_50_real_ids
done
```
48 changes: 27 additions & 21 deletions map_generated_molecules_to_real_ids.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,10 @@

class Args(Tap):
data_path: Path # Path to CSV file containing generated molecules with reaction and reagent IDs.
smiles_save_path: Path # Path to CSV file where molecule SMILES and REAL IDs will be saved.
sdf_save_path: Path # Path to SDF file where molecules and REAL IDs will be saved.
save_dir: Path # Path to directory where CSV and SDF files with REAL IDs will be saved.

def process_args(self) -> None:
self.smiles_save_path.parent.mkdir(parents=True, exist_ok=True)
self.sdf_save_path.parent.mkdir(parents=True, exist_ok=True)
self.save_dir.mkdir(parents=True, exist_ok=True)


def map_generated_molecules_to_real_ids(args: Args) -> None:
Expand All @@ -32,23 +30,31 @@ def map_generated_molecules_to_real_ids(args: Args) -> None:
column for column in data.columns if column.startswith('reagent_1_') and column.endswith('_id')
)

# Create new DataFrame with SMILES and REAL IDs
real_data = pd.DataFrame(data=[
{
'real_id': f'm_{row["reaction_1_id"]}____'
f'{"____".join(str(int(reagent_id)) for reagent_id in row[reagent_columns].dropna())}',
'smiles': row['smiles'],
'mol': Chem.MolFromSmiles(row['smiles'])
} for _, row in data.iterrows()
])

# Save data as SDF
with open(args.sdf_save_path, 'w') as f:
WriteSDF(real_data, f, molColName='mol', idName='real_id')

# Save data as CSV
del real_data['mol']
real_data.to_csv(args.smiles_save_path, index=False)
# Compute REAL IDs without prefixes
real_ids = [
f'{row["reaction_1_id"]}____{"____".join(str(int(reagent_id)) for reagent_id in row[reagent_columns].dropna())}'
for _, row in data.iterrows()
]

# Compute mols
mols = [Chem.MolFromSmiles(smiles) for smiles in data['smiles']]

# Loop through prefixes
for prefix in ['m', 's']:
# Create new DataFrame with molecules and REAL IDs
real_data = pd.DataFrame(data={
'real_id': [f'{prefix}_{real_id}' for real_id in real_ids],
'smiles': data['smiles'],
'mol': mols
})

# Save data as SDF
with open(args.save_dir / f'type_{prefix}.sdf', 'w') as f:
WriteSDF(real_data, f, molColName='mol', idName='real_id')

# Save data as CSV
del real_data['mol']
real_data.to_csv(args.save_dir / f'type_{prefix}.csv', index=False)


if __name__ == '__main__':
Expand Down

0 comments on commit 2d60445

Please sign in to comment.