Skip to content

Commit

Permalink
Merge pull request #34 from OpenDrugDiscovery/minor_fixes
Browse files Browse the repository at this point in the history
Improvements to Dummy Dataset
  • Loading branch information
FNTwin authored Mar 8, 2024
2 parents 817bbed + 8ca966a commit 088d457
Show file tree
Hide file tree
Showing 4 changed files with 89 additions and 76 deletions.
82 changes: 41 additions & 41 deletions openqdc/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,28 +10,28 @@
# Public attribute name -> dotted path of the module that defines it.
# Each name is listed exactly once: a repeated key in a dict literal silently
# keeps only the last value, which hides stale module paths.
# Dataset classes live under ``openqdc.datasets.potential``, except ``DES``,
# which lives under ``openqdc.datasets.interaction``.
# NOTE(review): presumably consumed by the package's lazy-import hook, which
# is not visible in this excerpt -- confirm before relying on key order.
_lazy_imports_obj = {
    "__version__": "openqdc._version",
    "BaseDataset": "openqdc.datasets.base",
    "ANI1": "openqdc.datasets.potential.ani",
    "ANI1CCX": "openqdc.datasets.potential.ani",
    "ANI1X": "openqdc.datasets.potential.ani",
    "Spice": "openqdc.datasets.potential.spice",
    "GEOM": "openqdc.datasets.potential.geom",
    "QMugs": "openqdc.datasets.potential.qmugs",
    "ISO17": "openqdc.datasets.potential.iso_17",
    "COMP6": "openqdc.datasets.potential.comp6",
    "GDML": "openqdc.datasets.potential.gdml",
    "Molecule3D": "openqdc.datasets.potential.molecule3d",
    "OrbnetDenali": "openqdc.datasets.potential.orbnet_denali",
    "SN2RXN": "openqdc.datasets.potential.sn2_rxn",
    "QM7X": "openqdc.datasets.potential.qm7x",
    "DES": "openqdc.datasets.interaction.des",
    "NablaDFT": "openqdc.datasets.potential.nabladft",
    "SolvatedPeptides": "openqdc.datasets.potential.solvated_peptides",
    "WaterClusters": "openqdc.datasets.potential.waterclusters3_30",
    "TMQM": "openqdc.datasets.potential.tmqm",
    "Dummy": "openqdc.datasets.potential.dummy",
    "PCQM_B3LYP": "openqdc.datasets.potential.pcqm",
    "PCQM_PM6": "openqdc.datasets.potential.pcqm",
    "Transition1X": "openqdc.datasets.potential.transition1x",
    "AVAILABLE_DATASETS": "openqdc.datasets",
}

Expand Down Expand Up @@ -66,23 +66,23 @@ def __dir__():
# checkers what they are.
from ._version import __version__ # noqa
from .datasets import AVAILABLE_DATASETS # noqa
from .datasets.ani import ANI1, ANI1CCX, ANI1X # noqa
from .datasets.base import BaseDataset # noqa
from .datasets.comp6 import COMP6 # noqa
from .datasets.des import DES # noqa
from .datasets.dummy import Dummy # noqa
from .datasets.gdml import GDML # noqa
from .datasets.geom import GEOM # noqa
from .datasets.iso_17 import ISO17 # noqa
from .datasets.molecule3d import Molecule3D # noqa
from .datasets.nabladft import NablaDFT # noqa
from .datasets.orbnet_denali import OrbnetDenali # noqa
from .datasets.pcqm import PCQM_B3LYP, PCQM_PM6 # noqa
from .datasets.qm7x import QM7X # noqa
from .datasets.qmugs import QMugs # noqa
from .datasets.sn2_rxn import SN2RXN # noqa
from .datasets.solvated_peptides import SolvatedPeptides # noqa
from .datasets.spice import Spice # noqa
from .datasets.tmqm import TMQM # noqa
from .datasets.transition1x import Transition1X # noqa
from .datasets.waterclusters3_30 import WaterClusters # noqa
from .datasets.interaction.des import DES # noqa
from .datasets.potential.ani import ANI1, ANI1CCX, ANI1X # noqa
from .datasets.potential.comp6 import COMP6 # noqa
from .datasets.potential.dummy import Dummy # noqa
from .datasets.potential.gdml import GDML # noqa
from .datasets.potential.geom import GEOM # noqa
from .datasets.potential.iso_17 import ISO17 # noqa
from .datasets.potential.molecule3d import Molecule3D # noqa
from .datasets.potential.nabladft import NablaDFT # noqa
from .datasets.potential.orbnet_denali import OrbnetDenali # noqa
from .datasets.potential.pcqm import PCQM_B3LYP, PCQM_PM6 # noqa
from .datasets.potential.qm7x import QM7X # noqa
from .datasets.potential.qmugs import QMugs # noqa
from .datasets.potential.sn2_rxn import SN2RXN # noqa
from .datasets.potential.solvated_peptides import SolvatedPeptides # noqa
from .datasets.potential.spice import Spice # noqa
from .datasets.potential.tmqm import TMQM # noqa
from .datasets.potential.transition1x import Transition1X # noqa
from .datasets.potential.waterclusters3_30 import WaterClusters # noqa
8 changes: 3 additions & 5 deletions openqdc/datasets/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -443,16 +443,14 @@ def preprocess(self, overwrite=False):
res = self.collate_list(entries)
self.save_preprocess(res)

def save_xyz(self, idx: int, path: Optional[str] = None, ext=True):
    """
    Save the entry at index idx as an extxyz file.

    The file is always named ``mol_{idx}.xyz`` and is written inside
    ``path``.

    Parameters
    ----------
    idx : int
        Index of the entry to export.
    path : str, optional
        Directory that receives the file. Defaults to the current working
        directory when omitted.
    ext : bool
        Forwarded unchanged to ``get_ase_atoms``.
        NOTE(review): presumably toggles the extra (extended-xyz) metadata
        on the returned atoms object -- confirm against ``get_ase_atoms``.
    """
    if path is None:
        path = os.getcwd()
    # Honour the caller's ``ext`` choice instead of hard-coding ``True``.
    at = self.get_ase_atoms(idx, ext=ext)
    write_extxyz(p_join(path, f"mol_{idx}.xyz"), at)

def get_ase_atoms(self, idx: int, ext=True):
"""
Expand Down
71 changes: 43 additions & 28 deletions openqdc/datasets/potential/dummy.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,12 @@
import numpy as np # noqa
from numpy import array
from sklearn.utils import Bunch
import numpy as np

from openqdc.datasets.base import BaseDataset
from openqdc.utils.atomization_energies import IsolatedAtomEnergyFactory
from openqdc.utils.constants import NOT_DEFINED


class Dummy(BaseDataset):
"""
Dummy dataset
Dummy dataset for testing.
"""

__name__ = "dummy"
Expand All @@ -30,21 +27,26 @@ def _stats(self):
return {
"formation": {
"energy": {
"mean": array([[-12.94348027, -9.83037297]]),
"std": array([[4.39971409, 3.3574188]]),
"mean": np.array([[-12.94348027, -9.83037297]]),
"std": np.array([[4.39971409, 3.3574188]]),
},
"forces": NOT_DEFINED,
},
"total": {
"energy": {
"mean": array([[-89.44242, -1740.5336]]),
"std": array([[29.599571, 791.48663]]),
"mean": np.array([[-89.44242, -1740.5336]]),
"std": np.array([[29.599571, 791.48663]]),
},
"forces": NOT_DEFINED,
},
}

def __init__(self, energy_unit=None, distance_unit=None, cache_dir=None) -> None:
def __init__(
self,
energy_unit=None,
distance_unit=None,
cache_dir=None,
) -> None:
try:
super().__init__(energy_unit=energy_unit, distance_unit=distance_unit, cache_dir=cache_dir)

Expand All @@ -54,8 +56,37 @@ def __init__(self, energy_unit=None, distance_unit=None, cache_dir=None) -> None
self.setup_dummy()

def setup_dummy(self):
    """
    Fill ``self.data`` with randomly generated entries.

    One record is created per index in ``range(len(self))``. The resulting
    ``self.data`` dictionary holds:

    - ``n_atoms``: number of atoms per entry, drawn from ``[1, 100)``.
    - ``position_idx_range``: one ``[start, end)`` row per entry, indexing
      into the flat per-atom arrays.
    - ``atomic_inputs``: array of shape ``(sum(n_atoms), 5)`` with columns
      atomic number, charge, x, y, z.
    - ``name`` / ``subset``: per-entry labels (``dummy_<i>`` and ``dummy``).
    - ``energies``: shape ``(len(self), len(self.__energy_methods__))``.
    - ``forces``: shape ``(sum(n_atoms), 3, len(self.__force_methods__))``.

    ``self.__average_nb_atoms__`` is set to the mean of ``n_atoms``.
    """
    n_entries = len(self)
    n_energy_methods = len(self.__energy_methods__)
    n_force_methods = len(self.__force_methods__)

    # One vectorised draw instead of a Python-level loop of scalar draws.
    n_atoms = np.random.randint(1, 100, size=n_entries)
    # ``_n_atoms`` is still read by index-based accessors; point it at the
    # same array so it can never disagree with ``data["n_atoms"]``.
    self._n_atoms = n_atoms

    # Row i is [start, end) of entry i inside the flat per-atom arrays.
    ends = np.cumsum(n_atoms)
    starts = ends - n_atoms
    position_idx_range = np.stack([starts, ends], axis=1)

    total_atoms = int(n_atoms.sum())
    atomic_inputs = np.concatenate(
        [
            # columns: atomic number, charge, x, y, z
            np.random.randint(1, 100, size=(total_atoms, 1)),
            np.random.randint(-1, 2, size=(total_atoms, 1)),
            np.random.randn(total_atoms, 3),
        ],
        axis=1,
    )  # (sum(n_atoms), 5)

    name = [f"dummy_{i}" for i in range(n_entries)]
    subset = ["dummy"] * n_entries
    energies = np.random.rand(n_entries, n_energy_methods)
    forces = np.random.randn(total_atoms, 3, n_force_methods) * 100

    self.data = dict(
        n_atoms=n_atoms,
        position_idx_range=position_idx_range,
        name=name,
        atomic_inputs=atomic_inputs,
        subset=subset,
        energies=energies,
        forces=forces,
    )
    self.__average_nb_atoms__ = self.data["n_atoms"].mean()

def is_preprocessed(self):
    """Always report the dataset as already preprocessed."""
    return True
Expand All @@ -65,19 +96,3 @@ def read_raw_entries(self):

def __len__(self):
    """Return the fixed number of entries (9999) the dummy dataset exposes."""
    return 9999

def __getitem__(self, idx: int):
    """
    Build a freshly randomised entry for index idx.

    The number of atoms comes from ``self._n_atoms[idx]``; every other
    field is drawn at random on each call, so repeated access to the same
    index yields different values. The result is a ``Bunch`` with
    ``positions``, ``atomic_numbers``, ``charges``, ``e0``, ``energies``,
    ``name``, ``subset`` and ``forces``.
    """
    charge_offset = IsolatedAtomEnergyFactory.max_charge
    n_at = self._n_atoms[idx]

    # Draw order is kept (numbers, charges, positions, energies, forces) so
    # a seeded generator produces the same entry as before.
    atomic_numbers = np.random.randint(1, 100, n_at)
    charges = np.random.randint(-1, 2, n_at)
    positions = np.random.rand(n_at, 3) * 10
    # Charges are offset by ``max_charge`` before indexing the energy table.
    e0 = self.__isolated_atom_energies__[..., atomic_numbers, charges + charge_offset].T
    energies = np.random.randn(len(self.__energy_methods__))
    forces = np.random.randn(n_at, 3, len(self.__force_methods__)) * 100

    return Bunch(
        positions=positions,
        atomic_numbers=atomic_numbers,
        charges=charges,
        e0=e0,
        energies=energies,
        name=f"dummy_{idx}",
        subset="dummy",
        forces=forces,
    )
4 changes: 2 additions & 2 deletions openqdc/datasets/potential/transition1x.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,9 +71,9 @@ class Transition1X(BaseDataset):
"wB97x_6-31G(d).forces",
]

__energy_unit__ = "hartree"
__energy_unit__ = "ev"
__distance_unit__ = "ang"
__forces_unit__ = "hartree/ang"
__forces_unit__ = "ev/ang"

def read_raw_entries(self):
raw_path = p_join(self.root, "Transition1x.h5")
Expand Down

0 comments on commit 088d457

Please sign in to comment.