diff --git a/docs/src/examples/linear-model.py b/docs/src/examples/linear-model.py index 0ef7d24..0d4e133 100644 --- a/docs/src/examples/linear-model.py +++ b/docs/src/examples/linear-model.py @@ -16,13 +16,22 @@ # # Dataset # ------- -# +# # As data set we use the SHIFTML set. You can obtain the dataset used in this # example from our :download:`website<../../static/dataset.xyz>`. # We read the first 20 structures of the data set using -# `ASE `. +# `ASE `. import ase.io +import numpy as np +from equistore import Labels +from equistore.operations import slice, sum_over_samples +from rascaline import SoapPowerSpectrum + +from equisolve.numpy.models.linear_model import Ridge +from equisolve.utils import dictionary_to_tensormap +from equisolve.utils.convert import ase_to_tensormap + frames = ase.io.read("dataset.xyz", ":20") @@ -39,7 +48,6 @@ # We construct the descriptor training data with a SOAP powerspectrum using # rascaline. We first define the hyper parameters for the calculation -from rascaline import SoapPowerSpectrum HYPER_PARAMETERS = { "cutoff": 5.0, @@ -67,11 +75,11 @@ # # For more details on how the descriptor works see the documentation of # rascaline. -# -# We now move all keys into properties. TODO: Explain why we have to do this. +# +# We now move all keys into properties to access them for our model. -descriptor = descriptor.keys_to_properties( - ["species_center", "species_neighbor_1", "species_neighbor_2"]) +descriptor = descriptor.keys_to_samples(["species_center"]) +descriptor = descriptor.keys_to_properties(["species_neighbor_1", "species_neighbor_2"]) # %% # @@ -79,9 +87,7 @@ # structure. However, our energies as target data is per structure only. # Therefore, we sum the features of each center atom per structure. 
-from equistore.operations import sum_over_samples - -X = sum_over_samples(descriptor, ["center"]) +X = sum_over_samples(descriptor, ["center", "species_center"]) # %% # @@ -105,7 +111,6 @@ # We construct the target data by converting energies and forces into a # :class:`equisolve.TensorMap`. -from equisolve.utils.convert import ase_to_tensormap y = ase_to_tensormap(frames, energy="energy", forces="forces") @@ -120,53 +125,54 @@ # Construct the model # ------------------- # -# Before we fit the model we have to define our regression values. +# Before we fit the model we have to define our regression values. # -# For this we create a TensorMap containing with a single sample and the -# same number of features as ``X``. +# For this we create a TensorMap containing the desired regularizer. + + +alpha_dict = {"values": 1e-5} +alpha = dictionary_to_tensormap(alpha_dict, X) + +# %% +# +# So far ``alpha`` contains the same number of samples as ``X``. However, +# the regularizer only needs one sample, because all samples will be +# regularized in the same way in a linear model. +# +# We remove all samples except the 0th one by using +# :func:`equistore.operations.slice`. -import numpy as np -from equistore import Labels -from equistore.operations import slice samples = Labels( names=["structure"], values=np.array([(0,)]), ) -alpha = slice(X, samples=samples) -n_features = len(alpha.block().values[:]) - -alpha.block().values[:] = 1e-5 +alpha = slice(alpha, samples=samples) # %% # -# In our example we use the same alpha value for all features. However, -# :class:`equisolve.numpy.models.linear_model.Ridge` also allows for different -# regularization of each feature. You can apply a feature wise regularization by -# roviding setting ``alpha.block().values`` with an 1d array of the same length as the -# number of features in your training data. +# In our regularizer we use the same values for all features.
However, +# :class:`equisolve.numpy.models.linear_model.Ridge` can also handle different +# regularization for each feature. You can apply a feature-wise regularization by +# setting ``"values"`` of ``alpha_dict`` to a 1d array of the same length as the +# number of features in the training data X (here 7200). # # With a valid regulerizer object we now initilize the Ridge object. # ``parameter_keys`` determines with respect to which parameters the regression is -# performed. Here, we choose a regression wrt. to ``"values"`` (energies) and +# performed. Here, we choose a regression wrt. ``"values"`` (energies) and # ``"positions"`` (forces). -from equisolve.numpy.models.linear_model import Ridge -clf = Ridge(parameter_keys=["values", "positions"], - alpha=alpha) +clf = Ridge(parameter_keys=["values", "positions"], alpha=alpha) # %% # # Next we create a sample weighting :class:`equistiore.TensorMap` that weights energies # five times more then the forces. -from equisolve.utils import dictionary_to_tensormap - sw_dict = {"values": 5, "positions": 1} sw = dictionary_to_tensormap(sw_dict, y) - # %% # # The function `equisolve.utils.dictionary_to_tensormap` create a diff --git a/examples/linear-model.py b/examples/linear-model.py index 0ef7d24..5c08129 100644 --- a/examples/linear-model.py +++ b/examples/linear-model.py @@ -10,19 +10,31 @@ For constructing a linear Model we need the atomic descriptor as training data ``X`` as well as the energies and forces as target data ``y``. + +We first import all necessary packages. """ +import ase.io +import numpy as np +from equistore import Labels +from equistore.operations import slice, sum_over_samples +from rascaline import SoapPowerSpectrum + +from equisolve.numpy.models.linear_model import Ridge +from equisolve.utils import dictionary_to_tensormap +from equisolve.utils.convert import ase_to_tensormap + + # %% # # Dataset # ------- # -# +# # As data set we use the SHIFTML set.
You can obtain the dataset used in this # example from our :download:`website<../../static/dataset.xyz>`. # We read the first 20 structures of the data set using -# `ASE `. +# `ASE `. -import ase.io frames = ase.io.read("dataset.xyz", ":20") @@ -39,7 +51,6 @@ # We construct the descriptor training data with a SOAP powerspectrum using # rascaline. We first define the hyper parameters for the calculation -from rascaline import SoapPowerSpectrum HYPER_PARAMETERS = { "cutoff": 5.0, @@ -67,11 +78,11 @@ # # For more details on how the descriptor works see the documentation of # rascaline. -# -# We now move all keys into properties. TODO: Explain why we have to do this. +# +# We now move all keys into properties to access them for our model. -descriptor = descriptor.keys_to_properties( - ["species_center", "species_neighbor_1", "species_neighbor_2"]) +descriptor = descriptor.keys_to_samples(["species_center"]) +descriptor = descriptor.keys_to_properties(["species_neighbor_1", "species_neighbor_2"]) # %% # @@ -79,9 +90,7 @@ # structure. However, our energies as target data is per structure only. # Therefore, we sum the features of each center atom per structure. -from equistore.operations import sum_over_samples - -X = sum_over_samples(descriptor, ["center"]) +X = sum_over_samples(descriptor, ["center", "species_center"]) # %% # @@ -105,7 +114,6 @@ # We construct the target data by converting energies and forces into a # :class:`equisolve.TensorMap`. -from equisolve.utils.convert import ase_to_tensormap y = ase_to_tensormap(frames, energy="energy", forces="forces") @@ -120,53 +128,54 @@ # Construct the model # ------------------- # -# Before we fit the model we have to define our regression values. +# Before we fit the model we have to define our regression values. # -# For this we create a TensorMap containing with a single sample and the -# same number of features as ``X``. 
+# For this we create a TensorMap containing the desired regularizer. + + +alpha_dict = {"values": 1e-5} +alpha = dictionary_to_tensormap(alpha_dict, X) + +# %% +# +# So far ``alpha`` contains the same number of samples as ``X``. However, +# the regularizer only needs one sample, because all samples will be +# regularized in the same way in a linear model. +# +# We remove all samples except the 0th one by using +# :func:`equistore.operations.slice`. -import numpy as np -from equistore import Labels -from equistore.operations import slice samples = Labels( names=["structure"], values=np.array([(0,)]), ) -alpha = slice(X, samples=samples) -n_features = len(alpha.block().values[:]) - -alpha.block().values[:] = 1e-5 +alpha = slice(alpha, samples=samples) # %% # -# In our example we use the same alpha value for all features. However, -# :class:`equisolve.numpy.models.linear_model.Ridge` also allows for different -# regularization of each feature. You can apply a feature wise regularization by -# roviding setting ``alpha.block().values`` with an 1d array of the same length as the -# number of features in your training data. +# In our regularizer we use the same values for all features. However, +# :class:`equisolve.numpy.models.linear_model.Ridge` can also handle different +# regularization for each feature. You can apply a feature-wise regularization by +# setting ``"values"`` of ``alpha_dict`` to a 1d array of the same length as the +# number of features in the training data X (here 7200). # # With a valid regulerizer object we now initilize the Ridge object. # ``parameter_keys`` determines with respect to which parameters the regression is -# performed. Here, we choose a regression wrt. to ``"values"`` (energies) and +# performed. Here, we choose a regression wrt. ``"values"`` (energies) and # ``"positions"`` (forces).
-from equisolve.numpy.models.linear_model import Ridge -clf = Ridge(parameter_keys=["values", "positions"], - alpha=alpha) +clf = Ridge(parameter_keys=["values", "positions"], alpha=alpha) # %% # # Next we create a sample weighting :class:`equistiore.TensorMap` that weights energies # five times more then the forces. -from equisolve.utils import dictionary_to_tensormap - sw_dict = {"values": 5, "positions": 1} sw = dictionary_to_tensormap(sw_dict, y) - # %% # # The function `equisolve.utils.dictionary_to_tensormap` create a diff --git a/src/equisolve/numpy/models/linear_model.py b/src/equisolve/numpy/models/linear_model.py index e194444..314155e 100644 --- a/src/equisolve/numpy/models/linear_model.py +++ b/src/equisolve/numpy/models/linear_model.py @@ -12,7 +12,6 @@ from equistore import Labels, TensorBlock, TensorMap from scipy.linalg import solve -from ...utils.metrics import rmse from ..utils import block_to_array @@ -49,7 +48,7 @@ def __init__(self, parameter_keys: Union[List[str], str], alpha: TensorMap) -> N self.alpha = alpha self.coef_ = [] - def _validate_data(self, X: TensorMap, y: TensorMap = None) -> None: + def _validate_data(self, X: TensorMap, y: Optional[TensorMap] = None) -> None: """Validates :class:`equistore.TensorBlock`'s for the usage in models. :param X: training data to check @@ -84,9 +83,11 @@ def _validate_params(self, X: TensorBlock) -> None: for i_block, X_block in enumerate(X.blocks()): alpha_block = self.alpha.block(i_block) if len(alpha_block.samples) != 1: - raise ValueError("Only one sample is allowed for regularization. " - f"Given alpha contains {len(alpha_block.samples)} " - "samples.") + raise ValueError( + "Only one sample is allowed for regularization. " + f"Given alpha contains {len(alpha_block.samples)} " + "samples." 
+ ) if len(X_block.properties) != len(alpha_block.properties): raise ValueError("X and y must have the same number of features") @@ -180,6 +181,6 @@ def score(self, X: TensorMap, y: TensorMap) -> float: :returns score: :math:`RMSE` of ``self.predict(X)`` wrt. `y` """ - y_pred = self.predict(X) + # y_pred = self.predict(X) # We need a tensormap implementation of rmse :-) # return rmse(y, y_pred) diff --git a/tox.ini b/tox.ini index 22853b4..f3b016d 100644 --- a/tox.ini +++ b/tox.ini @@ -11,6 +11,8 @@ envlist = docs format +lint_folders = {toxinidir}/examples {toxinidir}/src {toxinidir}/tests {toxinidir}/setup.py + [testenv:lint] # lint the Python code with flake8 (code linter), black (code formatter), and # isort (sorting of imports) @@ -20,9 +22,9 @@ deps = black isort commands = - flake8 {toxinidir}/src {toxinidir}/tests {toxinidir}/setup.py - black --check --diff {toxinidir}/src {toxinidir}/tests {toxinidir}/setup.py - isort --check-only --diff {toxinidir}/src {toxinidir}/tests {toxinidir}/setup.py + flake8 {[tox]lint_folders} + black --check --diff {[tox]lint_folders} + isort --check-only --diff {[tox]lint_folders} [testenv] # configures which environments run with each python version @@ -82,8 +84,8 @@ deps = black isort commands = - black {toxinidir}/src {toxinidir}/tests {toxinidir}/setup.py - isort {toxinidir}/src {toxinidir}/tests {toxinidir}/setup.py + black {[tox]lint_folders} + isort {[tox]lint_folders} [flake8] # https://flake8.pycqa.org/en/latest/#