Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactoring by p #72

Merged
merged 20 commits into from
Apr 3, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -47,8 +47,11 @@ jobs:
- name: Install library
run: python -m pip install --no-deps .

- name: Check directory
run: ls

- name: Run tests
run: pytest
run: python -m pytest

- name: Test building the doc
run: mkdocs build
5 changes: 5 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -90,3 +90,8 @@ We also provide support for the following publicly available QM Noncovalent Inte
| [Splinter](https://www.nature.com/articles/s41597-023-02443-1) |
| [X40](https://pubs.acs.org/doi/10.1021/ct300647k) |
| [L7](https://pubs.acs.org/doi/10.1021/ct400036b) |

# How to cite
All data presented in OpenQDC have already been published in scientific journals; the full reference to the respective paper is attached to each dataset class. When citing data obtained from OpenQDC, you should cite both the original paper(s) the data come from and our paper on OpenQDC itself. The reference is:

ADD REF HERE LATER
5 changes: 0 additions & 5 deletions docs/API/isolated_atom_energies.md

This file was deleted.

3 changes: 3 additions & 0 deletions docs/API/methods.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# QM Methods

::: openqdc.methods
11 changes: 6 additions & 5 deletions docs/tutorials/usage.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -657,7 +657,7 @@
"\n",
"$U(A_1, A_2, ...) = \\sum_{i_1}^N e_0(A_i) + e(A_1, A_2, ...)$\n",
"\n",
"The isolated atoms energies are automatically used inside the datasets for the correct level of theory, but you can also use them directly by accessing the IsolatedAtomEnergyFactor class."
"The isolated atoms energies are automatically associated with the correct level of theory, and you can get access as follow"
]
},
{
Expand Down Expand Up @@ -715,10 +715,11 @@
}
],
"source": [
"from openqdc.utils.atomization_energies import IsolatedAtomEnergyFactory\n",
"from openqdc.methods import QmMethod\n",
"\n",
"# Get the hasmap of isolated atom energies for the b3lyp/6-31g* method\n",
"IsolatedAtomEnergyFactory.get(\"b3lyp/6-31g*\")"
"# Get the b3lyp/6-31g* method\n",
"method = QmMethod.B3LYP_6_31G_D\n",
"method.atom_energies_dict"
]
},
{
Expand All @@ -745,7 +746,7 @@
],
"source": [
"# Get the matrix of atomization energies for the b3lyp/6-31g* method\n",
"IsolatedAtomEnergyFactory.get_matrix(\"b3lyp/6-31g*\")"
"method.atom_energies_matrix"
]
},
{
Expand Down
7 changes: 4 additions & 3 deletions openqdc/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,11 @@ def exist_dataset(dataset):


def format_entry(empty_dataset):
if len(empty_dataset.__energy_methods__) > 10:
entry = ",".join(empty_dataset.__energy_methods__[:10]) + "..."
energy_methods = [str(x) for x in empty_dataset.__energy_methods__]
if len(energy_methods) > 10:
entry = ",".join(energy_methods[:10]) + "..."
else:
entry = ",".join(empty_dataset.__energy_methods__[:10])
entry = ",".join(energy_methods[:10])
return entry


Expand Down
20 changes: 9 additions & 11 deletions openqdc/datasets/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,9 @@
from sklearn.utils import Bunch
from tqdm import tqdm

from openqdc.utils.atomization_energies import (
IsolatedAtomEnergyFactory,
chemical_symbols,
)
from openqdc.utils.constants import (
ATOM_SYMBOLS,
MAX_CHARGE,
NB_ATOMIC_FEATURES,
NOT_DEFINED,
POSSIBLE_NORMALIZATION,
Expand Down Expand Up @@ -135,7 +133,7 @@ def __force_methods__(self):

@property
def energy_methods(self):
return self.__energy_methods__
return [str(i) for i in self.__energy_methods__]

@property
def force_methods(self):
Expand Down Expand Up @@ -205,7 +203,7 @@ def _set_linear_e0s(self):
def _precompute_E(self):
splits_idx = self.data["position_idx_range"][:, 1]
s = np.array(self.data["atomic_inputs"][:, :2], dtype=int)
s[:, 1] += IsolatedAtomEnergyFactory.max_charge
s[:, 1] += MAX_CHARGE
matrixs = [matrix[s[:, 0], s[:, 1]] for matrix in self.__isolated_atom_energies__]
REGRESSOR_SUCCESS = False
try:
Expand Down Expand Up @@ -279,7 +277,7 @@ def numbers(self):

@property
def chemical_species(self):
return np.array(chemical_symbols)[self.numbers]
return np.array(ATOM_SYMBOLS)[self.numbers]

@property
def energy_unit(self):
Expand Down Expand Up @@ -347,7 +345,7 @@ def _set_new_e0s_unit(self, en):
@property
def force_mask(self):
if len(self.__class__.__force_mask__) == 0:
self.__class__.__force_mask__ = [False] * len(self.energy_methods)
self.__class__.__force_mask__ = [False] * len(self.__energy_methods__)
return self.__class__.__force_mask__

def _set_units(self, en, ds):
Expand All @@ -364,11 +362,11 @@ def _set_units(self, en, ds):
self.__class__.__fn_forces__ = get_conversion(old_en + "/" + old_ds, self.__forces_unit__)

def _set_isolated_atom_energies(self):
if self.energy_methods is None:
if self.__energy_methods__ is None:
logger.error("No energy methods defined for this dataset.")
f = get_conversion("hartree", self.__energy_unit__)
self.__isolated_atom_energies__ = f(
np.array([IsolatedAtomEnergyFactory.get_matrix(energy_method) for energy_method in self.energy_methods])
np.array([en_method.atom_energies_matrix for en_method in self.__energy_methods__])
)

def convert_energy(self, x):
Expand Down Expand Up @@ -703,7 +701,7 @@ def __smiles_converter__(self, x):
return x

def __getitem__(self, idx: int):
shift = IsolatedAtomEnergyFactory.max_charge
shift = MAX_CHARGE
p_start, p_end = self.data["position_idx_range"][idx]
input = self.data["atomic_inputs"][p_start:p_end]
z, c, positions, energies = (
Expand Down
23 changes: 13 additions & 10 deletions openqdc/datasets/interaction/L7.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,8 @@
from loguru import logger

from openqdc.datasets.interaction.base import BaseInteractionDataset
from openqdc.utils.molecule import atom_table
from openqdc.methods import InteractionMethod, InterEnergyType
from openqdc.utils.constants import ATOM_TABLE


class DataItemYAMLObj:
Expand Down Expand Up @@ -66,16 +67,18 @@ class L7(BaseInteractionDataset):
__distance_unit__ = "ang"
__forces_unit__ = "hartree/ang"
__energy_methods__ = [
"CSD(T) | QCISD(T)",
"DLPNO-CCSD(T)",
"MP2/CBS",
"MP2C/CBS",
"fixed",
"DLPNO-CCSD(T0)",
"LNO-CCSD(T)",
"FN-DMC",
InteractionMethod.QCISDT_CBS, # "QCISD(T)/CBS",
InteractionMethod.DLPNO_CCSDT, # "DLPNO-CCSD(T)",
InteractionMethod.MP2_CBS, # "MP2/CBS",
InteractionMethod.MP2C_CBS, # "MP2C/CBS",
InteractionMethod.FIXED, # "fixed", TODO: we should remove this level of theory because unless we have a pro
InteractionMethod.DLPNO_CCSDT0, # "DLPNO-CCSD(T0)",
InteractionMethod.LNO_CCSDT, # "LNO-CCSD(T)",
InteractionMethod.FN_DMC, # "FN-DMC",
]

__energy_type__ = [InterEnergyType.TOTAL] * 8

energy_target_names = []

def read_raw_entries(self) -> List[Dict]:
Expand All @@ -102,7 +105,7 @@ def read_raw_entries(self) -> List[Dict]:
energies = np.array([energies], dtype=np.float32)
pos = np.array(lines[1:])[:, 1:].astype(np.float32)
elems = np.array(lines[1:])[:, 0]
atomic_nums = np.expand_dims(np.array([atom_table.GetAtomicNumber(x) for x in elems]), axis=1)
atomic_nums = np.expand_dims(np.array([ATOM_TABLE.GetAtomicNumber(x) for x in elems]), axis=1)
natoms0 = n_atoms_first[0]
natoms1 = n_atoms[0] - natoms0
charges = np.expand_dims(np.array([charge0] * natoms0 + [charge1] * natoms1), axis=1)
Expand Down
18 changes: 11 additions & 7 deletions openqdc/datasets/interaction/X40.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,8 @@

from openqdc.datasets.interaction.base import BaseInteractionDataset
from openqdc.datasets.interaction.L7 import get_loader
from openqdc.utils.molecule import atom_table
from openqdc.methods import InteractionMethod, InterEnergyType
from openqdc.utils.constants import ATOM_TABLE


class X40(BaseInteractionDataset):
Expand All @@ -29,12 +30,15 @@ class X40(BaseInteractionDataset):
__distance_unit__ = "ang"
__forces_unit__ = "hartree/ang"
__energy_methods__ = [
"CCSD(T)/CBS",
"MP2/CBS",
"dCCSD(T)/haDZ",
"dCCSD(T)/haTZ",
"MP2.5/CBS(aDZ)",
InteractionMethod.CCSD_T_CBS, # "CCSD(T)/CBS",
InteractionMethod.MP2_CBS, # "MP2/CBS",
InteractionMethod.DCCSDT_HA_DZ, # "dCCSD(T)/haDZ",
InteractionMethod.DCCSDT_HA_TZ, # "dCCSD(T)/haTZ",
InteractionMethod.MP2_5_CBS_ADZ, # "MP2.5/CBS(aDZ)",
]
__energy_type__ = [
InterEnergyType.TOTAL,
] * 5

energy_target_names = []

Expand Down Expand Up @@ -62,7 +66,7 @@ def read_raw_entries(self) -> List[Dict]:
energies = np.array([energies], dtype=np.float32)
pos = np.array(lines[1:])[:, 1:].astype(np.float32)
elems = np.array(lines[1:])[:, 0]
atomic_nums = np.expand_dims(np.array([atom_table.GetAtomicNumber(x) for x in elems]), axis=1)
atomic_nums = np.expand_dims(np.array([ATOM_TABLE.GetAtomicNumber(x) for x in elems]), axis=1)
natoms0 = n_atoms_first[0]
natoms1 = n_atoms[0] - natoms0
charges = np.expand_dims(np.array([charge0] * natoms0 + [charge1] * natoms1), axis=1)
Expand Down
19 changes: 3 additions & 16 deletions openqdc/datasets/interaction/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,25 +9,12 @@
from sklearn.utils import Bunch

from openqdc.datasets.base import BaseDataset
from openqdc.utils.atomization_energies import IsolatedAtomEnergyFactory
from openqdc.utils.constants import NB_ATOMIC_FEATURES
from openqdc.utils.constants import MAX_CHARGE, NB_ATOMIC_FEATURES
from openqdc.utils.io import pull_locally, push_remote, to_atoms


class BaseInteractionDataset(BaseDataset):
def __init__(
self,
energy_unit: Optional[str] = None,
distance_unit: Optional[str] = None,
overwrite_local_cache: bool = False,
cache_dir: Optional[str] = None,
) -> None:
super().__init__(
energy_unit=energy_unit,
distance_unit=distance_unit,
overwrite_local_cache=overwrite_local_cache,
cache_dir=cache_dir,
)
__energy_type__ = []

def collate_list(self, list_entries: List[Dict]):
# concatenate entries
Expand Down Expand Up @@ -63,7 +50,7 @@ def data_types(self):
}

def __getitem__(self, idx: int):
shift = IsolatedAtomEnergyFactory.max_charge
shift = MAX_CHARGE
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For the interaction datasets we don't really need the isolated atom energies, so we could rework the `__getitem__` method if needed.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The code just gives zero isolated atom energies for all the interaction methods.

p_start, p_end = self.data["position_idx_range"][idx]
input = self.data["atomic_inputs"][p_start:p_end]
z, c, positions, energies = (
Expand Down
60 changes: 41 additions & 19 deletions openqdc/datasets/interaction/des370k.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,10 @@
from tqdm import tqdm

from openqdc.datasets.interaction.base import BaseInteractionDataset
from openqdc.methods import InteractionMethod, InterEnergyType
from openqdc.utils.constants import ATOM_TABLE
from openqdc.utils.io import get_local_cache
from openqdc.utils.molecule import atom_table, molecule_groups
from openqdc.utils.molecule import molecule_groups


class DES370K(BaseInteractionDataset):
Expand All @@ -27,23 +29,43 @@ class DES370K(BaseInteractionDataset):
__distance_unit__ = "ang"
__forces_unit__ = "hartree/ang"
__energy_methods__ = [
"mp2/cc-pvdz",
"mp2/cc-pvqz",
"mp2/cc-pvtz",
"mp2/cbs",
"ccsd(t)/cc-pvdz",
"ccsd(t)/cbs", # cbs
"ccsd(t)/nn", # nn
"sapt0/aug-cc-pwcvxz",
"sapt0/aug-cc-pwcvxz_es",
"sapt0/aug-cc-pwcvxz_ex",
"sapt0/aug-cc-pwcvxz_exs2",
"sapt0/aug-cc-pwcvxz_ind",
"sapt0/aug-cc-pwcvxz_exind",
"sapt0/aug-cc-pwcvxz_disp",
"sapt0/aug-cc-pwcvxz_exdisp_os",
"sapt0/aug-cc-pwcvxz_exdisp_ss",
"sapt0/aug-cc-pwcvxz_delta_HF",
InteractionMethod.MP2_CC_PVDZ,
InteractionMethod.MP2_CC_PVQZ,
InteractionMethod.MP2_CC_PVTZ,
InteractionMethod.MP2_CBS,
InteractionMethod.CCSD_T_CC_PVDZ,
InteractionMethod.CCSD_T_CBS,
InteractionMethod.CCSD_T_NN,
InteractionMethod.SAPT0_AUG_CC_PWCVXZ,
InteractionMethod.SAPT0_AUG_CC_PWCVXZ,
InteractionMethod.SAPT0_AUG_CC_PWCVXZ,
InteractionMethod.SAPT0_AUG_CC_PWCVXZ,
InteractionMethod.SAPT0_AUG_CC_PWCVXZ,
InteractionMethod.SAPT0_AUG_CC_PWCVXZ,
InteractionMethod.SAPT0_AUG_CC_PWCVXZ,
InteractionMethod.SAPT0_AUG_CC_PWCVXZ,
InteractionMethod.SAPT0_AUG_CC_PWCVXZ,
InteractionMethod.SAPT0_AUG_CC_PWCVXZ,
]

__energy_type__ = [
InterEnergyType.TOTAL,
InterEnergyType.TOTAL,
InterEnergyType.TOTAL,
InterEnergyType.TOTAL,
InterEnergyType.TOTAL,
InterEnergyType.TOTAL,
InterEnergyType.TOTAL,
InterEnergyType.TOTAL,
InterEnergyType.ES,
InterEnergyType.EX,
InterEnergyType.EX_S2,
InterEnergyType.IND,
InterEnergyType.EX_IND,
InterEnergyType.DISP,
InterEnergyType.EX_DISP_OS,
InterEnergyType.EX_DISP_SS,
InterEnergyType.DELTA_HF,
]

energy_target_names = [
Expand Down Expand Up @@ -87,7 +109,7 @@ def _read_raw_entries(cls) -> List[Dict]:

elements = row["elements"].split()

atomic_nums = np.expand_dims(np.array([atom_table.GetAtomicNumber(x) for x in elements]), axis=1)
atomic_nums = np.expand_dims(np.array([ATOM_TABLE.GetAtomicNumber(x) for x in elements]), axis=1)

charges = np.expand_dims(np.array([charge0] * natoms0 + [charge1] * natoms1), axis=1)

Expand Down
Loading
Loading