Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merging tmqm branch #33

Merged
merged 11 commits into from
Feb 9, 2024
13 changes: 6 additions & 7 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,16 +2,15 @@

Open Quantum Data Commons

## Setup Datasets

Use the scripts in `setup/` to download the datasets. For more information, see the [README](setup/README.md) in the `setup/` directory.

# Install the library in dev mode
### Installing openQDC
```bash
git clone git@github.com:OpenDrugDiscovery/openQDC.git
cd openQDC
# use mamba/conda
mamba env create -n openqdc -f env.yml
pip install -e .
```

## Development lifecycle

### Tests

You can run tests locally with:
Expand Down
19 changes: 0 additions & 19 deletions env.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,39 +8,22 @@ dependencies:
- tqdm
- loguru
- fsspec
- s3fs
- gcsfs
- joblib
- prettytable
- pyrootutils

# Scientific
- pandas
- numpy
- scipy
- sympy

# Chem
- ipdb
- datamol #==0.9.0
- rdkit #-pypi #==2022.9.3
- ase

# ML
#- einops =0.6.0
- pytorch
- dscribe

# other stuffs
- h5py >=3.8.0
- gdown #==4.6.4

# Viz
- matplotlib
- seaborn
- ipywidgets
- nglview

# Dev
- pytest >=6.0
- pytest-cov
Expand All @@ -49,8 +32,6 @@ dependencies:
- jupyterlab
- pre-commit
- ruff
- ipykernel
- isort

# Doc
- mkdocs
Expand Down
42 changes: 33 additions & 9 deletions src/openqdc/datasets/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@

import numpy as np
import pandas as pd
import torch
from ase.io.extxyz import write_extxyz
from loguru import logger
from sklearn.utils import Bunch
Expand Down Expand Up @@ -82,7 +81,7 @@ def read_qc_archive_h5(
return samples


class BaseDataset(torch.utils.data.Dataset):
class BaseDataset:
__energy_methods__ = []
__force_methods__ = []
energy_target_names = []
Expand Down Expand Up @@ -143,10 +142,18 @@ def _precompute_statistics(self, overwrite_local_cache: bool = False):
logger.info("Loaded precomputed statistics")
else:
logger.info("Precomputing relevant statistics")
(formation_E_mean, formation_E_std, total_E_mean, total_E_std) = self._precompute_E()
(
inter_E_mean,
inter_E_std,
formation_E_mean,
formation_E_std,
total_E_mean,
total_E_std,
) = self._precompute_E()
forces_dict = self._precompute_F()
stats = {
"formation": {"energy": {"mean": formation_E_mean, "std": formation_E_std}, "forces": forces_dict},
"inter": {"energy": {"mean": inter_E_mean, "std": inter_E_std}, "forces": forces_dict},
"total": {"energy": {"mean": total_E_mean, "std": total_E_std}, "forces": forces_dict},
}
with open(local_path, "wb") as f:
Expand All @@ -162,20 +169,24 @@ def _precompute_E(self):
s = np.array(self.data["atomic_inputs"][:, :2], dtype=int)
s[:, 1] += IsolatedAtomEnergyFactory.max_charge
matrixs = [matrix[s[:, 0], s[:, 1]] for matrix in self.__isolated_atom_energies__]
converted_energy_data = self.convert_energy(self.data["energies"])
converted_energy_data = self.data["energies"]
# calculation per molecule formation energy statistics
E = []
for i, matrix in enumerate(matrixs):
c = np.cumsum(np.append([0], matrix))[splits_idx]
c[1:] = c[1:] - c[:-1]
E.append(converted_energy_data[:, i] - c)
E = np.array(E).T
inter_E_mean = np.nanmean(E / self.data["n_atoms"][:, None], axis=0)
inter_E_std = np.nanstd(E / self.data["n_atoms"][:, None], axis=0)
formation_E_mean = np.nanmean(E, axis=0)
formation_E_std = np.nanstd(E, axis=0)
total_E_mean = np.nanmean(converted_energy_data, axis=0)
total_E_std = np.nanstd(converted_energy_data, axis=0)

return (
np.atleast_2d(inter_E_mean),
np.atleast_2d(inter_E_std),
np.atleast_2d(formation_E_mean),
np.atleast_2d(formation_E_std),
np.atleast_2d(total_E_mean),
Expand Down Expand Up @@ -253,12 +264,25 @@ def data_shapes(self):
"forces": (-1, 3, len(self.force_target_names)),
}

@property
def atoms_per_molecules(self):
    """Return the per-molecule atom counts, or ``None`` if unavailable.

    On first access the value is read from ``self.data["n_atoms"]`` and
    cached on the instance as ``_n_atoms``; later accesses (or a
    pre-assigned ``_n_atoms``) return the cached object without touching
    ``self.data`` again.

    Returns:
        The object stored under ``self.data["n_atoms"]`` (or the existing
        ``_n_atoms`` attribute), or ``None`` when the lookup fails.
    """
    try:
        if not hasattr(self, "_n_atoms"):
            self._n_atoms = self.data["n_atoms"]
        return self._n_atoms
    except Exception:
        # Best-effort accessor: a missing ``data`` attribute or a missing
        # "n_atoms" entry means "not available", not an error. Catching
        # ``Exception`` (rather than a bare ``except``) keeps that contract
        # while letting KeyboardInterrupt / SystemExit propagate.
        return None

def _set_units(self, en, ds):
old_en, old_ds = self.energy_unit, self.distance_unit
if en is not None:
self.set_energy_unit(en)
if ds is not None:
self.set_distance_unit(ds)
en = en if en is not None else old_en
ds = ds if ds is not None else old_ds

# if en is None:
self.set_energy_unit(en)
# if ds is not None:
self.set_distance_unit(ds)
if self.__force_methods__:
self.__forces_unit__ = self.energy_unit + "/" + self.distance_unit
self.__class__.__fn_forces__ = get_conversion(old_en + "/" + old_ds, self.__forces_unit__)
Expand Down Expand Up @@ -556,7 +580,7 @@ def get_statistics(self, normalization: str = "formation", return_none: bool = T
"""
Get the statistics of the dataset.
normalization : str, optional
Type of energy, by default "formation", must be one of ["formation", "total"]
Type of energy, by default "formation", must be one of ["formation", "total", "inter"]
return_none : bool, optional
Whether to return None if the statistics for the forces are not available, by default True
Otherwise, the statistics for the forces are set to 0.0
Expand Down
17 changes: 11 additions & 6 deletions src/openqdc/datasets/dummy.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,22 +23,22 @@ class Dummy(BaseDataset):

force_target_names = [f"forces{i}" for i in range(len(__force_methods__))]
__isolated_atom_energies__ = []
__average_n_atoms__ = 20
__average_n_atoms__ = None

@property
def _stats(self):
return {
"formation": {
"energy": {
"mean": array([-12.94348027, -9.83037297]),
"std": array([4.39971409, 3.3574188]),
"mean": array([[-12.94348027, -9.83037297]]),
"std": array([[4.39971409, 3.3574188]]),
},
"forces": NOT_DEFINED,
},
"total": {
"energy": {
"mean": array([-89.44242, -1740.5336]),
"std": array([29.599571, 791.48663]),
"mean": array([[-89.44242, -1740.5336]]),
"std": array([[29.599571, 791.48663]]),
},
"forces": NOT_DEFINED,
},
Expand All @@ -51,6 +51,11 @@ def __init__(self, energy_unit=None, distance_unit=None, cache_dir=None) -> None
except: # noqa
pass
self._set_isolated_atom_energies()
self.setup_dummy()

def setup_dummy(self):
    """Draw a random atom count for every entry of the dummy dataset.

    Populates ``self._n_atoms`` with one integer in ``[1, 100)`` per
    dataset entry (``len(self)`` values) and stores their mean in
    ``self.__average_nb_atoms__``.
    """
    # One vectorised draw instead of a Python-level loop of scalar draws.
    self._n_atoms = np.random.randint(1, 100, size=len(self))
    # NOTE(review): this writes ``__average_nb_atoms__`` while the class
    # attribute declared on the dummy dataset is ``__average_n_atoms__`` --
    # confirm which name the consumers actually read.
    self.__average_nb_atoms__ = self._n_atoms.mean()

def is_preprocessed(self):
return True
Expand All @@ -63,7 +68,7 @@ def __len__(self):

def __getitem__(self, idx: int):
shift = IsolatedAtomEnergyFactory.max_charge
size = np.random.randint(1, 100)
size = self._n_atoms[idx]
z = np.random.randint(1, 100, size)
c = np.random.randint(-1, 2, size)
return Bunch(
Expand Down
7 changes: 0 additions & 7 deletions src/openqdc/datasets/pcqm.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,9 +73,6 @@ class PCQM_PM6(BaseDataset):
__force_methods__ = []
force_target_names = []

def __init__(self, energy_unit=None, distance_unit=None) -> None:
super().__init__(energy_unit=energy_unit, distance_unit=distance_unit)

@property
def root(self):
return p_join(get_local_cache(), "pubchemqc")
Expand Down Expand Up @@ -161,8 +158,4 @@ def collate_and_save_list(self, list_entries):
class PCQM_B3LYP(PCQM_PM6):
__name__ = "pubchemqc_b3lyp"
__energy_methods__ = ["b3lyp"]

energy_target_names = ["b3lyp"]

def __init__(self, energy_unit=None, distance_unit=None) -> None:
super().__init__(energy_unit=energy_unit, distance_unit=distance_unit)
2 changes: 1 addition & 1 deletion src/openqdc/datasets/spice.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ def read_record(r):
smiles = r["smiles"].asstr()[0]
subset = r["subset"][0].decode("utf-8")
n_confs = r["conformations"].shape[0]
x = get_atomic_number_and_charge(dm.to_mol(smiles, add_hs=True))
x = get_atomic_number_and_charge(dm.to_mol(smiles, remove_hs=False, ordered=True))
positions = r["conformations"][:]

res = dict(
Expand Down
4 changes: 4 additions & 0 deletions src/openqdc/datasets/transition1x.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,10 @@ class Transition1X(BaseDataset):
"wB97x_6-31G(d).forces",
]

__energy_unit__ = "hartree"
__distance_unit__ = "ang"
__forces_unit__ = "hartree/ang"

def read_raw_entries(self):
raw_path = p_join(self.root, "Transition1x.h5")
f = load_hdf5_file(raw_path)["data"]
Expand Down
2 changes: 0 additions & 2 deletions src/openqdc/utils/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
load_hdf5_file,
load_json,
load_pkl,
load_torch,
makedirs,
save_pkl,
set_cache_dir,
Expand All @@ -19,7 +18,6 @@
"makedirs",
"load_hdf5_file",
"load_json",
"load_torch",
"create_hdf5_file",
"check_file",
"set_cache_dir",
Expand Down
Loading
Loading