From cb6dbaf7a4b5aa94a36d7256da4682e988d7a966 Mon Sep 17 00:00:00 2001 From: FNTwin Date: Wed, 1 May 2024 13:29:44 -0400 Subject: [PATCH 1/7] Conversion of en fixes + datastructure for e0s_dict and retriaval of covs --- openqdc/datasets/base.py | 53 +++------------ openqdc/datasets/energies.py | 124 +++++++++++++++++++++++++++++++---- openqdc/methods/enums.py | 11 +++- openqdc/utils/regressor.py | 2 + 4 files changed, 134 insertions(+), 56 deletions(-) diff --git a/openqdc/datasets/base.py b/openqdc/datasets/base.py index cb38393..49a9b5c 100644 --- a/openqdc/datasets/base.py +++ b/openqdc/datasets/base.py @@ -129,6 +129,7 @@ def __init__( set_cache_dir(cache_dir) # self._init_lambda_fn() self.data = None + self._original_unit = self.__energy_unit__ self.recompute_statistics = recompute_statistics self.regressor_kwargs = regressor_kwargs self.transform = transform @@ -268,6 +269,10 @@ def pkl_data_keys(self): def pkl_data_types(self): return {"name": str, "subset": str, "n_atoms": np.int32} + @property + def atom_energies(self): + return self._e0s_dispatcher + @property def data_types(self): return { @@ -299,7 +304,11 @@ def _set_units(self, en, ds): def _set_isolated_atom_energies(self): if self.__energy_methods__ is None: logger.error("No energy methods defined for this dataset.") - f = get_conversion("hartree", self.__energy_unit__) + if self.energy_type == "formation": + f = get_conversion("hartree", self.__energy_unit__) + else: + # regression are calculated on the original unit of the dataset + f = get_conversion(self._original_unit, self.__energy_unit__) self.__isolated_atom_energies__ = f(self.e0s_dispatcher.e0s_matrix) def convert_energy(self, x): @@ -558,48 +567,6 @@ def wrapper(idx): datum["idxs"] = idxs return datum - @classmethod - def as_dataloader( - cls, - batch_size: int = 8, - energy_unit: Optional[str] = None, - distance_unit: Optional[str] = None, - array_format: str = "torch", - energy_type: str = "formation", - overwrite_local_cache: bool = False, - cache_dir: Optional[str] = None, - recompute_statistics: bool = False, - transform: Optional[Callable] = None, - ): - """ - Return the dataset as a dataloader. - - Parameters - ---------- - batch_size : int, optional - Batch size, by default 8 - For other parameters, see the __init__ method. - """ - if not has_package("torch_geometric"): - raise ImportError("torch_geometric is required to use this method.") - assert array_format in ["torch", "jax"], f"Format {array_format} must be torch or jax." - from torch_geometric.data import Data - from torch_geometric.loader import DataLoader - - return DataLoader( - cls( - energy_unit=energy_unit, - distance_unit=distance_unit, - array_format=array_format, - energy_type=energy_type, - overwrite_local_cache=overwrite_local_cache, - cache_dir=cache_dir, - recompute_statistics=recompute_statistics, - transform=lambda x: Data(**x) if transform is None else transform, - ), - batch_size=batch_size, - ) - def as_iter(self, atoms: bool = False, energy_method: int = 0): """ Return the dataset as an iterator. diff --git a/openqdc/datasets/energies.py b/openqdc/datasets/energies.py index 60af6d5..c08dcb2 100644 --- a/openqdc/datasets/energies.py +++ b/openqdc/datasets/energies.py @@ -1,10 +1,13 @@ from abc import ABC, abstractmethod +from dataclasses import dataclass, field from os.path import join as p_join +from typing import Union import numpy as np from loguru import logger from openqdc.methods.enums import PotentialMethod +from openqdc.utils.constants import ATOM_SYMBOLS, ATOMIC_NUMBERS from openqdc.utils.io import load_pkl, save_pkl from openqdc.utils.regressor import Regressor @@ -41,6 +44,40 @@ def dispatch_factory(data, **kwargs) -> "IsolatedEnergyInterface": return NullEnergy(data, **kwargs) +@dataclass(frozen=False, eq=True) +class AtomSpec: + symbol: Union[str, int] + charge: int = 0 + + def __post_init__(self): + if not isinstance(self.symbol, str): + self.symbol = ATOM_SYMBOLS[self.symbol] + self.number = ATOMIC_NUMBERS[self.symbol] + + def __hash__(self): + return hash((self.symbol, self.charge)) + + def __eq__(self, other): + if not isinstance(other, AtomSpec): + symbol, charge = other[0], other[1] + other = AtomSpec(symbol=symbol, charge=charge) + return (self.number, self.charge) == (other.number, other.charge) + + +@dataclass +class AtomEnergy: + mean: np.array + std: np.array = field(default_factory=lambda: np.array([1], dtype=np.float32)) + + def __post_init__(self): + if not isinstance(self.mean, np.ndarray): + self.mean = np.array([self.mean], dtype=np.float32) + + def append(self, other): + self.mean = np.append(self.mean, other.mean) + self.std = np.append(self.std, other.std) + + class AtomEnergies: """ Manager class for interface with the isolated atom energies classes @@ -71,6 +108,32 @@ def e0s_matrix(self) -> np.ndarray: """ return self.factory.e0_matrix + @property + def e0s_dict(self): + """ + Return the isolated atom energies dictionary + """ + return self.factory.e0_dict + + def __str__(self): + return f"Atoms: { list(set(map(lambda x : x.symbol, self.e0s_dict.keys())))}" + + def __repr__(self): + return str(self) + + def __getitem__(self, item): + try: + atom, charge = item[0], item[1] + except TypeError: + atom = item + charge = 0 + except IndexError: + atom = item[0] + charge = 0 + if not isinstance(atom, str): + atom = ATOM_SYMBOLS[atom] + return self.e0s_dict[(atom, charge)] + class IsolatedEnergyInterface(ABC): """ @@ -78,8 +141,6 @@ class IsolatedEnergyInterface(ABC): different implementation of an isolated atom energy value """ - _e0_matrixs = [] - def __init__(self, data, **kwargs): """ Parameters @@ -93,7 +154,8 @@ def __init__(self, data, **kwargs): selected energy class. Mostly used for regression to pass the regressor_kwargs. """ - + self._e0_matrixs = [] + self._e0_dict = None self.kwargs = kwargs self.data = data self._post_init() @@ -120,27 +182,61 @@ def e0_matrix(self) -> np.ndarray: """ return np.array(self._e0_matrixs) + @property + def e0_dict(self) -> np.ndarray: + """ + Return the isolated atom energies matrixes + """ + + return self._e0s_dict + def __str__(self) -> str: return self.__class__.__name__.lower() -class NullEnergy(IsolatedEnergyInterface): +class PhysicalEnergy(IsolatedEnergyInterface): """ - Class that returns a null (zeros) matrix for the isolated atom energies in case - of no energies are available. + Class that returns a physical (SE,DFT,etc) isolated atom energies. """ + def _assembly_e0_dict(self): + datum = {} + for method in self.data.__energy_methods__: + for key, values in method.atom_energies_dict.items(): + atm = AtomSpec(*key) + ens = AtomEnergy(values) + if atm not in datum: + datum[atm] = ens + else: + datum[atm].append(ens) + self._e0s_dict = datum + def _post_init(self): - self._e0_matrixs = [PotentialMethod.NONE.atom_energies_matrix for _ in range(len(self.data.energy_methods))] + self._e0_matrixs = [energy_method.atom_energies_matrix for energy_method in self.data.__energy_methods__] + self._assembly_e0_dict() -class PhysicalEnergy(IsolatedEnergyInterface): +class NullEnergy(IsolatedEnergyInterface): """ - Class that returns a physical (SE,DFT,etc) isolated atom energies. + Class that returns a null (zeros) matrix for the isolated atom energies in case + of no energies are available. """ + def _assembly_e0_dict(self): + datum = {} + for _ in self.data.__energy_methods__: + for key, values in PotentialMethod.NONE.atom_energies_dict.items(): + atm = AtomSpec(*key) + ens = AtomEnergy(values) + if atm not in datum: + datum[atm] = ens + else: + datum[atm].append(ens) + self._e0s_dict = datum + def _post_init(self): - self._e0_matrixs = [energy_method.atom_energies_matrix for energy_method in self.data.__energy_methods__] + self._e0_matrixs = [PotentialMethod.NONE.atom_energies_matrix for _ in range(len(self.data.energy_methods))] + self._assembly_e0_dict() class RegressionEnergy(IsolatedEnergyInterface): @@ -175,7 +271,9 @@ def _set_lin_atom_species_dict(self, E0s, covs) -> None: """ atomic_energies_dict = {} for i, z in enumerate(self.regressor.numbers): - atomic_energies_dict[z] = E0s[i] + for charge in range(-10, 11): + atomic_energies_dict[AtomSpec(z, charge)] = AtomEnergy(E0s[i], 1 if covs is None else covs[i]) + # atomic_energies_dict[z] = E0s[i] self._e0s_dict = atomic_energies_dict self.save_e0s() @@ -187,7 +285,9 @@ def _set_linear_e0s(self) -> None: new_e0s = [np.zeros((max(self.data.numbers) + 1, MAX_CHARGE_NUMBER)) for _ in range(len(self))] for z, e0 in self._e0s_dict.items(): for i in range(len(self)): - new_e0s[i][z, :] = e0[i] + # new_e0s[i][z, :] = e0[i] + new_e0s[i][z.number, z.charge] = e0.mean[i] + # for atom_sp, values in self._e0_matrixs = new_e0s def save_e0s(self) -> None: diff --git a/openqdc/methods/enums.py b/openqdc/methods/enums.py index 50a7f1d..3cf5104 100644 --- a/openqdc/methods/enums.py +++ b/openqdc/methods/enums.py @@ -1,8 +1,10 @@ from enum import Enum from loguru import logger +from numpy import array, float32 from openqdc.methods.atom_energies import atom_energy_collection, to_e_matrix +from openqdc.utils.constants import ATOM_SYMBOLS class StrEnum(str, Enum): @@ -472,6 +474,13 @@ class PotentialMethod(QmMethod): # SPLIT FOR INTERACTIO ENERGIES AND FIX MD1 XLYP_TZP = Functional.XLYP, BasisSet.TZP NONE = Functional.NONE, BasisSet.NONE + def _build_default_dict(self): + e0_dict = {} + for SYMBOL in ATOM_SYMBOLS: + for CHARGE in range(-10, 11): + e0_dict[(SYMBOL, CHARGE)] = array([0], dtype=float32) + return e0_dict + @property def atom_energies_dict(self): """Get the atomization energy dictionary""" @@ -483,7 +492,7 @@ def atom_energies_dict(self): raise except: # noqa logger.info(f"No available atomization energy for the QM method {key}. All values are set to 0.") - + energies = self._build_default_dict() return energies diff --git a/openqdc/utils/regressor.py b/openqdc/utils/regressor.py index c230ce7..1d3e50a 100644 --- a/openqdc/utils/regressor.py +++ b/openqdc/utils/regressor.py @@ -146,6 +146,8 @@ def solve(self): else: X, y = self.X, self.y[:, energy_idx] E0s, cov = self.solver(X, y) + if cov is None: + cov = np.zeros_like(E0s) + 1.0 E0_list.append(E0s) cov_list.append(cov) return np.vstack(E0_list).T, np.vstack(cov_list).T From da767efc409c1f965258c3aa4075517828eb5de7 Mon Sep 17 00:00:00 2001 From: FNTwin Date: Wed, 1 May 2024 14:38:13 -0400 Subject: [PATCH 2/7] Tests --- tests/test_energies.py | 44 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) create mode 100644 tests/test_energies.py diff --git a/tests/test_energies.py b/tests/test_energies.py new file mode 100644 index 0000000..47a8e7b --- /dev/null +++ b/tests/test_energies.py @@ -0,0 +1,44 @@ +import numpy as np +import pytest + +from openqdc.datasets.energies import AtomEnergies, AtomEnergy +from openqdc.methods import PotentialMethod + + +class Container: + __name__ = "container" + __energy_methods__ = [PotentialMethod.WB97M_D3BJ_DEF2_TZVPPD] + energy_methods = [str(PotentialMethod.WB97M_D3BJ_DEF2_TZVPPD)] + refit_e0s = True + + def __init__(self, energy_type="formation"): + self.energy_type = energy_type + + +@pytest.fixture +def physical_energies(): + dummy = Container() + return AtomEnergies(dummy) + + +def test_atom_energies_object(physical_energies): + assert isinstance(physical_energies, AtomEnergies) + + +def test_indexing(physical_energies): + assert isinstance(physical_energies[6], AtomEnergy) + assert isinstance(physical_energies[(6, 1)], AtomEnergy) + assert isinstance(physical_energies[6, 1], AtomEnergy) + assert isinstance(physical_energies[("C", 1)], AtomEnergy) + assert isinstance(physical_energies["C", 1], AtomEnergy) + assert physical_energies[("C", 1)] == physical_energies[(6, 1)] + assert not physical_energies[("Cl", -2)] == physical_energies[(6, 1)] + with pytest.raises(KeyError): + physical_energies[("Cl", -6)] + + +def test_matrix(physical_energies): + matrix = physical_energies.e0s_matrix + assert len(matrix) == 1 + assert isinstance(matrix, np.ndarray) + assert np.any(matrix) From b8791d646fc7f1781dfdec76b318d7ba03af70dd Mon Sep 17 00:00:00 2001 From: FNTwin Date: Wed, 1 May 2024 14:49:34 -0400 Subject: [PATCH 3/7] Added _original_unit to xyz --- openqdc/datasets/io.py | 1 + 1 file changed, 1 insertion(+) diff --git a/openqdc/datasets/io.py b/openqdc/datasets/io.py index bf90ea5..cd8bfdb 100644 --- a/openqdc/datasets/io.py +++ b/openqdc/datasets/io.py @@ -47,6 +47,7 @@ def __init__( self.recompute_statistics = True self.refit_e0s = True self.energy_type = energy_type + self._original_unit = energy_unit self.__energy_unit__ = energy_unit self.__distance_unit__ = distance_unit self.__energy_methods__ = [PotentialMethod.NONE if not level_of_theory else level_of_theory] From 6abe893213f165560c4ac569e3453a48c11bedd3 Mon Sep 17 00:00:00 2001 From: FNTwin Date: Wed, 1 May 2024 18:54:55 +0000 Subject: [PATCH 4/7] Disable mkdocs --- mkdocs.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mkdocs.yml b/mkdocs.yml index c1be218..0bdc741 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -13,7 +13,7 @@ nav: - Overview: index.md - Available Datasets: datasets.md - Tutorials: - - Really hard example: tutorials/usage.ipynb + #- Really hard example: tutorials/usage.ipynb - API: - Datasets: API/available_datasets.md - Isolated Atoms Energies: API/isolated_atom_energies.md From 8d21e15acde1fe3c50dda99c8d4b73fe90bc8091 Mon Sep 17 00:00:00 2001 From: FNTwin Date: Wed, 1 May 2024 18:57:27 +0000 Subject: [PATCH 5/7] I hate mkdocs errors --- mkdocs.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mkdocs.yml b/mkdocs.yml index 0bdc741..6db3178 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -12,8 +12,8 @@ docs_dir: "docs" nav: - Overview: index.md - Available Datasets: datasets.md - - Tutorials: - #- Really hard example: tutorials/usage.ipynb + #- Tutorials: + # #- Really hard example: tutorials/usage.ipynb - API: - Datasets: API/available_datasets.md - Isolated Atoms Energies: API/isolated_atom_energies.md From 46bc652af870f494701379ef2f012a616c137afd Mon Sep 17 00:00:00 2001 From: FNTwin Date: Wed, 1 May 2024 18:59:29 +0000 Subject: [PATCH 6/7] mkdocs action --- .github/workflows/test.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 2c34132..1f04d36 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -53,5 +53,5 @@ jobs: - name: Run tests run: python -m pytest - - name: Test building the doc - run: mkdocs build + #- name: Test building the doc + # run: mkdocs build From 2f4b692d03d3c3155efdd082eb1b4ad208cd5124 Mon Sep 17 00:00:00 2001 From: FNTwin Date: Wed, 1 May 2024 20:28:01 -0400 Subject: [PATCH 7/7] Docstrings + naming --- openqdc/datasets/energies.py | 50 +++++++++++++++++++++++++++--------- 1 file changed, 38 insertions(+), 12 deletions(-) diff --git a/openqdc/datasets/energies.py b/openqdc/datasets/energies.py index c08dcb2..3a19233 100644 --- a/openqdc/datasets/energies.py +++ b/openqdc/datasets/energies.py @@ -1,7 +1,7 @@ from abc import ABC, abstractmethod from dataclasses import dataclass, field from os.path import join as p_join -from typing import Union +from typing import Dict, Union import numpy as np from loguru import logger @@ -45,7 +45,13 @@ def dispatch_factory(data, **kwargs) -> "IsolatedEnergyInterface": @dataclass(frozen=False, eq=True) -class AtomSpec: +class AtomSpecies: + """ + Structure that defines a tuple of chemical specie and charge + and provide hash and automatic conversion from atom number to + checmical symbol + """ + symbol: Union[str, int] charge: int = 0 @@ -58,14 +64,21 @@ def __hash__(self): return hash((self.symbol, self.charge)) def __eq__(self, other): - if not isinstance(other, AtomSpec): + if not isinstance(other, AtomSpecies): symbol, charge = other[0], other[1] - other = AtomSpec(symbol=symbol, charge=charge) + other = AtomSpecies(symbol=symbol, charge=charge) return (self.number, self.charge) == (other.number, other.charge) @dataclass class AtomEnergy: + """ + Datastructure to store isolated atom energies + and the std deviation associated to the value. + By default the std will be 1 if no value was calculated + or not available (formation energy case) + """ + mean: np.array std: np.array = field(default_factory=lambda: np.array([1], dtype=np.float32)) @@ -73,7 +86,10 @@ def __post_init__(self): if not isinstance(self.mean, np.ndarray): self.mean = np.array([self.mean], dtype=np.float32) - def append(self, other): + def append(self, other: "AtomEnergy"): + """ + Append the mean and std of another atom energy + """ self.mean = np.append(self.mean, other.mean) self.std = np.append(self.std, other.std) @@ -109,7 +125,7 @@ def e0s_matrix(self) -> np.ndarray: return self.factory.e0_matrix @property - def e0s_dict(self): + def e0s_dict(self) -> Dict[AtomSpecies, AtomEnergy]: """ Return the isolated atom energies dictionary """ @@ -121,7 +137,17 @@ def __str__(self): def __repr__(self): return str(self) - def __getitem__(self, item): + def __getitem__(self, item: AtomSpecies) -> AtomEnergy: + """ + Retrieve a key from the isolated atom dictionary. + Item can be written as tuple(Symbol, charge), + tuple(Chemical number, charge). If no charge is passed, + it will be automatically set to 0. + Examples: + AtomEnergies[6], AtomEnergies[6,1], + AtomEnergies["C",1], AtomEnergies[(6,1)] + AtomEnergies[("C,1)] + """ try: atom, charge = item[0], item[1] except TypeError: @@ -183,9 +209,9 @@ def e0_matrix(self) -> np.ndarray: return np.array(self._e0_matrixs) @property - def e0_dict(self) -> np.ndarray: + def e0_dict(self) -> Dict: """ - Return the isolated atom energies matrixes + Return the isolated atom energies dict """ return self._e0s_dict @@ -203,7 +229,7 @@ def _assembly_e0_dict(self): datum = {} for method in self.data.__energy_methods__: for key, values in method.atom_energies_dict.items(): - atm = AtomSpec(*key) + atm = AtomSpecies(*key) ens = AtomEnergy(values) if atm not in datum: datum[atm] = ens @@ -226,7 +252,7 @@ def _assembly_e0_dict(self): datum = {} for _ in self.data.__energy_methods__: for key, values in PotentialMethod.NONE.atom_energies_dict.items(): - atm = AtomSpec(*key) + atm = AtomSpecies(*key) ens = AtomEnergy(values) if atm not in datum: datum[atm] = ens @@ -272,7 +298,7 @@ def _set_lin_atom_species_dict(self, E0s, covs) -> None: atomic_energies_dict = {} for i, z in enumerate(self.regressor.numbers): for charge in range(-10, 11): - atomic_energies_dict[AtomSpec(z, charge)] = AtomEnergy(E0s[i], 1 if covs is None else covs[i]) + atomic_energies_dict[AtomSpecies(z, charge)] = AtomEnergy(E0s[i], 1 if covs is None else covs[i]) # atomic_energies_dict[z] = E0s[i] self._e0s_dict = atomic_energies_dict self.save_e0s()