From 268591cf8e03a8d23256efe711e5b3c3a4b71ce6 Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Fri, 8 Mar 2024 03:24:49 -0500 Subject: [PATCH 1/8] pt: make get_data non-blocking (#3422) `to(DEVICE)` is cpu-blocking but `to(DEVICE, non_blocking=True)` is not blocking. This improves performance by at least 0.1s/100 steps. Before, `get_data` is blocking: ![1709698811097](https://github.com/deepmodeling/deepmd-kit/assets/9496702/b86b3928-41e7-46d3-8692-ca96b3a6475a) ![1709698811150](https://github.com/deepmodeling/deepmd-kit/assets/9496702/c4365203-3f3d-4de8-aae6-d8587f0e95a0) After, `get_data` is not blocking: ![1709698811122](https://github.com/deepmodeling/deepmd-kit/assets/9496702/d991c8f0-35c8-4b5d-822e-77af961e9b6e) ![1709698811169](https://github.com/deepmodeling/deepmd-kit/assets/9496702/a56160c2-78c7-4a44-aa96-1df0b520a60a) The subsequent blocking is `phys2inter` (via `torch.linalg.inv`). Signed-off-by: Jinzhe Zeng --- deepmd/pt/train/training.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/deepmd/pt/train/training.py b/deepmd/pt/train/training.py index 6938db9b3c..93afc38575 100644 --- a/deepmd/pt/train/training.py +++ b/deepmd/pt/train/training.py @@ -973,9 +973,11 @@ def get_data(self, is_train=True, task_key="Default"): continue elif not isinstance(batch_data[key], list): if batch_data[key] is not None: - batch_data[key] = batch_data[key].to(DEVICE) + batch_data[key] = batch_data[key].to(DEVICE, non_blocking=True) else: - batch_data[key] = [item.to(DEVICE) for item in batch_data[key]] + batch_data[key] = [ + item.to(DEVICE, non_blocking=True) for item in batch_data[key] + ] # we may need a better way to classify which are inputs and which are labels # now wrapper only supports the following inputs: input_keys = [ From fefc0e6cf64e3dcd6d78e9ce8707f4fc8c2a3b17 Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Fri, 8 Mar 2024 03:25:01 -0500 Subject: [PATCH 2/8] pt: fix print_on_training when there is no validation data (#3423) #3405 
changed results from `None` to `{}` but `print_on_training` wasn't revised. --- deepmd/pt/train/training.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/deepmd/pt/train/training.py b/deepmd/pt/train/training.py index 93afc38575..62bc5a4c97 100644 --- a/deepmd/pt/train/training.py +++ b/deepmd/pt/train/training.py @@ -1037,7 +1037,7 @@ def print_on_training(self, fout, step_id, cur_lr, train_results, valid_results) print_str = "" print_str += "%7d" % step_id if not self.multi_task: - if valid_results is not None: + if valid_results: prop_fmt = " %11.2e %11.2e" for k in train_keys: print_str += prop_fmt % (valid_results[k], train_results[k]) @@ -1047,7 +1047,7 @@ def print_on_training(self, fout, step_id, cur_lr, train_results, valid_results) print_str += prop_fmt % (train_results[k]) else: for model_key in self.model_keys: - if valid_results[model_key] is not None: + if valid_results[model_key]: prop_fmt = " %11.2e %11.2e" for k in sorted(valid_results[model_key].keys()): print_str += prop_fmt % ( From dabbd35cfcc0c75eff7e567c69f6c8b228e14cbe Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Fri, 8 Mar 2024 03:25:17 -0500 Subject: [PATCH 3/8] Consistent activation functions between backends (#3431) 1. add relu, gelu, gelu_tf, relu6, softplus, sigmoid, and linear to dpmodel; 2. add gelu_tf, relu6, softplus, and sigmoid to pt; 3. change gelu in pt from non-approximate to approximate. If one still wants to use the non-approximate version, we may consider adding a new key; 4. add linear to tf; 5. none in tf now returns `lambda x: x` instead of `None` to be type consistent; 6. support uppercase in all backends; 7. add consistent tests. 
Signed-off-by: Jinzhe Zeng --- deepmd/common.py | 10 +++- deepmd/dpmodel/utils/network.py | 59 +++++++++++++++++--- deepmd/pt/utils/utils.py | 20 +++++-- deepmd/tf/common.py | 13 ++--- source/tests/consistent/test_activation.py | 63 ++++++++++++++++++++++ 5 files changed, 146 insertions(+), 19 deletions(-) create mode 100644 source/tests/consistent/test_activation.py diff --git a/deepmd/common.py b/deepmd/common.py index 29d32111a8..c776975591 100644 --- a/deepmd/common.py +++ b/deepmd/common.py @@ -52,7 +52,15 @@ _DICT_VAL = TypeVar("_DICT_VAL") _PRECISION = Literal["default", "float16", "float32", "float64"] _ACTIVATION = Literal[ - "relu", "relu6", "softplus", "sigmoid", "tanh", "gelu", "gelu_tf" + "relu", + "relu6", + "softplus", + "sigmoid", + "tanh", + "gelu", + "gelu_tf", + "none", + "linear", ] __all__.extend( [ diff --git a/deepmd/dpmodel/utils/network.py b/deepmd/dpmodel/utils/network.py index feb3355e77..6206367b1b 100644 --- a/deepmd/dpmodel/utils/network.py +++ b/deepmd/dpmodel/utils/network.py @@ -10,6 +10,7 @@ datetime, ) from typing import ( + Callable, ClassVar, Dict, List, @@ -309,14 +310,7 @@ def call(self, x: np.ndarray) -> np.ndarray: """ if self.w is None or self.activation_function is None: raise ValueError("w, b, and activation_function must be set") - if self.activation_function == "tanh": - fn = np.tanh - elif self.activation_function.lower() == "none": - - def fn(x): - return x - else: - raise NotImplementedError(self.activation_function) + fn = get_activation_fn(self.activation_function) y = ( np.matmul(x, self.w) + self.b if self.b is not None @@ -332,6 +326,55 @@ def fn(x): return y +def get_activation_fn(activation_function: str) -> Callable[[np.ndarray], np.ndarray]: + activation_function = activation_function.lower() + if activation_function == "tanh": + return np.tanh + elif activation_function == "relu": + + def fn(x): + # https://stackoverflow.com/a/47936476/9567349 + return x * (x > 0) + + return fn + elif activation_function in 
("gelu", "gelu_tf"): + + def fn(x): + # generated by GitHub Copilot + return 0.5 * x * (1 + np.tanh(np.sqrt(2 / np.pi) * (x + 0.044715 * x**3))) + + return fn + elif activation_function == "relu6": + + def fn(x): + # generated by GitHub Copilot + return np.minimum(np.maximum(x, 0), 6) + + return fn + elif activation_function == "softplus": + + def fn(x): + # generated by GitHub Copilot + return np.log(1 + np.exp(x)) + + return fn + elif activation_function == "sigmoid": + + def fn(x): + # generated by GitHub Copilot + return 1 / (1 + np.exp(-x)) + + return fn + elif activation_function.lower() in ("none", "linear"): + + def fn(x): + return x + + return fn + else: + raise NotImplementedError(activation_function) + + def make_multilayer_network(T_NetworkLayer, ModuleBase): class NN(ModuleBase): """Native representation of a neural network. diff --git a/deepmd/pt/utils/utils.py b/deepmd/pt/utils/utils.py index f5a4cd84b6..10dcadadac 100644 --- a/deepmd/pt/utils/utils.py +++ b/deepmd/pt/utils/utils.py @@ -21,10 +21,16 @@ def get_activation_fn(activation: str) -> Callable: """Returns the activation function corresponding to `activation`.""" if activation.lower() == "relu": return F.relu - elif activation.lower() == "gelu": - return F.gelu + elif activation.lower() == "gelu" or activation.lower() == "gelu_tf": + return lambda x: F.gelu(x, approximate="tanh") elif activation.lower() == "tanh": return torch.tanh + elif activation.lower() == "relu6": + return F.relu6 + elif activation.lower() == "softplus": + return F.softplus + elif activation.lower() == "sigmoid": + return torch.sigmoid elif activation.lower() == "linear" or activation.lower() == "none": return lambda x: x else: @@ -42,10 +48,16 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: if self.activation.lower() == "relu": return F.relu(x) - elif self.activation.lower() == "gelu": - return F.gelu(x) + elif self.activation.lower() == "gelu" or self.activation.lower() == "gelu_tf": + return F.gelu(x, 
approximate="tanh") elif self.activation.lower() == "tanh": return torch.tanh(x) + elif self.activation.lower() == "relu6": + return F.relu6(x) + elif self.activation.lower() == "softplus": + return F.softplus(x) + elif self.activation.lower() == "sigmoid": + return torch.sigmoid(x) elif self.activation.lower() == "linear" or self.activation.lower() == "none": return x else: diff --git a/deepmd/tf/common.py b/deepmd/tf/common.py index b1872e72ed..0d59990a29 100644 --- a/deepmd/tf/common.py +++ b/deepmd/tf/common.py @@ -135,14 +135,14 @@ def gelu_wrapper(x): "tanh": tf.nn.tanh, "gelu": gelu, "gelu_tf": gelu_tf, - "None": None, - "none": None, + "linear": lambda x: x, + "none": lambda x: x, } def get_activation_func( activation_fn: Union["_ACTIVATION", None], -) -> Union[Callable[[tf.Tensor], tf.Tensor], None]: +) -> Callable[[tf.Tensor], tf.Tensor]: """Get activation function callable based on string name. Parameters @@ -161,10 +161,11 @@ def get_activation_func( if unknown activation function is specified """ if activation_fn is None: - return None - if activation_fn not in ACTIVATION_FN_DICT: + activation_fn = "none" + assert activation_fn is not None + if activation_fn.lower() not in ACTIVATION_FN_DICT: raise RuntimeError(f"{activation_fn} is not a valid activation function") - return ACTIVATION_FN_DICT[activation_fn] + return ACTIVATION_FN_DICT[activation_fn.lower()] def get_precision(precision: "_PRECISION") -> Any: diff --git a/source/tests/consistent/test_activation.py b/source/tests/consistent/test_activation.py new file mode 100644 index 0000000000..bb06df9082 --- /dev/null +++ b/source/tests/consistent/test_activation.py @@ -0,0 +1,63 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later +import unittest + +import numpy as np + +from deepmd.dpmodel.utils.network import get_activation_fn as get_activation_fn_dp + +from .common import ( + INSTALLED_PT, + INSTALLED_TF, + parameterized, +) + +if INSTALLED_PT: + from deepmd.pt.utils.utils import get_activation_fn as 
get_activation_fn_pt + from deepmd.pt.utils.utils import ( + to_numpy_array, + to_torch_tensor, + ) +if INSTALLED_TF: + from deepmd.tf.common import get_activation_func as get_activation_fn_tf + from deepmd.tf.env import ( + tf, + ) + + +@parameterized( + ( + "Relu", + "Relu6", + "Softplus", + "Sigmoid", + "Tanh", + "Gelu", + "Gelu_tf", + "Linear", + "None", + ), +) +class TestActivationFunctionConsistent(unittest.TestCase): + def setUp(self): + (self.activation,) = self.param + self.random_input = np.random.default_rng().normal(scale=10, size=(10, 10)) + self.ref = get_activation_fn_dp(self.activation)(self.random_input) + + @unittest.skipUnless(INSTALLED_TF, "TensorFlow is not installed") + def test_tf_consistent_with_ref(self): + if INSTALLED_TF: + place_holder = tf.placeholder(tf.float64, self.random_input.shape) + t_test = get_activation_fn_tf(self.activation)(place_holder) + with tf.Session() as sess: + test = sess.run(t_test, feed_dict={place_holder: self.random_input}) + np.testing.assert_allclose(self.ref, test, atol=1e-10) + + @unittest.skipUnless(INSTALLED_PT, "PyTorch is not installed") + def test_pt_consistent_with_ref(self): + if INSTALLED_PT: + test = to_numpy_array( + get_activation_fn_pt(self.activation)( + to_torch_tensor(self.random_input) + ) + ) + np.testing.assert_allclose(self.ref, test, atol=1e-10) From 66edd1f991254bd0c5dbbd3ff04774ac7c34c76d Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Fri, 8 Mar 2024 03:26:51 -0500 Subject: [PATCH 4/8] fix errors when `dp` is executed without any subcommands (#3437) Signed-off-by: Jinzhe Zeng --- deepmd/main.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/deepmd/main.py b/deepmd/main.py index 870a04a088..09457419e8 100644 --- a/deepmd/main.py +++ b/deepmd/main.py @@ -798,7 +798,8 @@ def main(): ): deepmd_main = BACKENDS[args.backend]().entry_point_hook elif args.command is None: - pass + # help message has been printed in parse_args + return else: raise RuntimeError(f"unknown 
command {args.command}") From d3dd6044fed98a23053634e0ae4d1a21e2a0a3c6 Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Fri, 8 Mar 2024 04:32:11 -0500 Subject: [PATCH 5/8] pt: avoid torch.tensor(constant) during forward (#3421) `torch.tensor(constant)` copies memory from the CPU to the GPU, so it is host blocking and should be avoided in the `forward` method. Before, the CPU waited for the GPU using `cudaStreamSynchronize`, blocking the CPU from doing the following things, where the CPU memory needs to be copied to the GPU, a.k.a. host-to-device (H2D). ![1709693858444](https://github.com/deepmodeling/deepmd-kit/assets/9496702/e6fb6281-245f-4620-82bd-dbcd02121e32) After this PR, all ops in the energy loss are asynchronous, as no H2D happens. ![1709694622120](https://github.com/deepmodeling/deepmd-kit/assets/9496702/172e1601-1a9c-4236-a1e2-a749edc25c50) --------- Signed-off-by: Jinzhe Zeng Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- deepmd/pt/loss/denoise.py | 14 +++++++------- deepmd/pt/loss/ener.py | 2 +- deepmd/pt/loss/tensor.py | 2 +- .../pt/model/atomic_model/linear_atomic_model.py | 14 ++++++++------ 4 files changed, 17 insertions(+), 15 deletions(-) diff --git a/deepmd/pt/loss/denoise.py b/deepmd/pt/loss/denoise.py index cd12e70bb1..57691558cb 100644 --- a/deepmd/pt/loss/denoise.py +++ b/deepmd/pt/loss/denoise.py @@ -52,7 +52,7 @@ def forward(self, model_pred, label, natoms, learning_rate, mae=False): coord_mask = label["coord_mask"] type_mask = label["type_mask"] - loss = torch.tensor(0.0, dtype=env.GLOBAL_PT_FLOAT_PRECISION, device=env.DEVICE) + loss = torch.zeros(1, dtype=env.GLOBAL_PT_FLOAT_PRECISION, device=env.DEVICE)[0] more_loss = {} if self.has_coord: if self.mask_loss_coord: @@ -66,9 +66,9 @@ def forward(self, model_pred, label, natoms, learning_rate, mae=False): beta=self.beta, ) else: - coord_loss = torch.tensor( - 0.0, dtype=env.GLOBAL_PT_FLOAT_PRECISION, device=env.DEVICE - ) + coord_loss = 
torch.zeros( + 1, dtype=env.GLOBAL_PT_FLOAT_PRECISION, device=env.DEVICE + )[0] else: coord_loss = F.smooth_l1_loss( updated_coord.view(-1, 3), @@ -89,9 +89,9 @@ def forward(self, model_pred, label, natoms, learning_rate, mae=False): reduction="mean", ) else: - token_loss = torch.tensor( - 0.0, dtype=env.GLOBAL_PT_FLOAT_PRECISION, device=env.DEVICE - ) + token_loss = torch.zeros( + 1, dtype=env.GLOBAL_PT_FLOAT_PRECISION, device=env.DEVICE + )[0] else: token_loss = F.nll_loss( F.log_softmax(logits.view(-1, self.ntypes - 1), dim=-1), diff --git a/deepmd/pt/loss/ener.py b/deepmd/pt/loss/ener.py index 2834733112..1d70528e88 100644 --- a/deepmd/pt/loss/ener.py +++ b/deepmd/pt/loss/ener.py @@ -108,7 +108,7 @@ def forward(self, model_pred, label, natoms, learning_rate, mae=False): pref_e = self.limit_pref_e + (self.start_pref_e - self.limit_pref_e) * coef pref_f = self.limit_pref_f + (self.start_pref_f - self.limit_pref_f) * coef pref_v = self.limit_pref_v + (self.start_pref_v - self.limit_pref_v) * coef - loss = torch.tensor(0.0, dtype=env.GLOBAL_PT_FLOAT_PRECISION, device=env.DEVICE) + loss = torch.zeros(1, dtype=env.GLOBAL_PT_FLOAT_PRECISION, device=env.DEVICE)[0] more_loss = {} # more_loss['log_keys'] = [] # showed when validation on the fly # more_loss['test_keys'] = [] # showed when doing dp test diff --git a/deepmd/pt/loss/tensor.py b/deepmd/pt/loss/tensor.py index ee42536557..5ac0a6e37b 100644 --- a/deepmd/pt/loss/tensor.py +++ b/deepmd/pt/loss/tensor.py @@ -83,7 +83,7 @@ def forward(self, model_pred, label, natoms, learning_rate=0.0, mae=False): Other losses for display. 
""" del learning_rate, mae - loss = torch.tensor(0.0, dtype=env.GLOBAL_PT_FLOAT_PRECISION, device=env.DEVICE) + loss = torch.zeros(1, dtype=env.GLOBAL_PT_FLOAT_PRECISION, device=env.DEVICE)[0] more_loss = {} if ( self.has_local_weight diff --git a/deepmd/pt/model/atomic_model/linear_atomic_model.py b/deepmd/pt/model/atomic_model/linear_atomic_model.py index 5e1a80087e..68705049ae 100644 --- a/deepmd/pt/model/atomic_model/linear_atomic_model.py +++ b/deepmd/pt/model/atomic_model/linear_atomic_model.py @@ -78,6 +78,10 @@ def __init__( self.atomic_bias = None self.mixed_types_list = [model.mixed_types() for model in self.models] + self.rcuts = torch.tensor( + self.get_model_rcuts(), dtype=torch.float64, device=env.DEVICE + ) + self.nsels = torch.tensor(self.get_model_nsels(), device=env.DEVICE) BaseAtomicModel.__init__(self, **kwargs) def mixed_types(self) -> bool: @@ -117,14 +121,12 @@ def get_model_sels(self) -> List[List[int]]: """Get the sels for each individual models.""" return [model.get_sel() for model in self.models] - def _sort_rcuts_sels(self, device: torch.device) -> Tuple[List[float], List[int]]: + def _sort_rcuts_sels(self) -> Tuple[List[float], List[int]]: # sort the pair of rcut and sels in ascending order, first based on sel, then on rcut. 
- rcuts = torch.tensor(self.get_model_rcuts(), dtype=torch.float64, device=device) - nsels = torch.tensor(self.get_model_nsels(), device=device) zipped = torch.stack( [ - rcuts, - nsels, + self.rcuts, + self.nsels, ], dim=0, ).T @@ -171,7 +173,7 @@ def forward_atomic( if self.do_grad_r() or self.do_grad_c(): extended_coord.requires_grad_(True) extended_coord = extended_coord.view(nframes, -1, 3) - sorted_rcuts, sorted_sels = self._sort_rcuts_sels(device=extended_coord.device) + sorted_rcuts, sorted_sels = self._sort_rcuts_sels() nlists = build_multiple_neighbor_list( extended_coord, nlist, From a9bcf4153847ddb3773f46b0cf7e011eaecacf43 Mon Sep 17 00:00:00 2001 From: Lysithea <52808607+CaRoLZhangxy@users.noreply.github.com> Date: Sat, 9 Mar 2024 01:51:52 +0800 Subject: [PATCH 6/8] clean up the init interface of pt.dataloader (#3434) https://github.com/deepmodeling/deepmd-kit/issues/3427 --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- deepmd/pt/entrypoints/main.py | 6 +++--- deepmd/pt/utils/dataloader.py | 21 +++++++++++++++---- deepmd/pt/utils/dataset.py | 13 +++--------- source/tests/pt/model/test_model.py | 13 +----------- source/tests/pt/model/test_saveload_dpa1.py | 13 +----------- .../tests/pt/model/test_saveload_se_e2_a.py | 13 +----------- source/tests/pt/test_sampler.py | 9 +------- source/tests/pt/test_stat.py | 9 +------- 8 files changed, 28 insertions(+), 69 deletions(-) diff --git a/deepmd/pt/entrypoints/main.py b/deepmd/pt/entrypoints/main.py index 0e5767cb4e..76796f6197 100644 --- a/deepmd/pt/entrypoints/main.py +++ b/deepmd/pt/entrypoints/main.py @@ -134,7 +134,7 @@ def prepare_trainer_input_single( DpLoaderSet( validation_systems, validation_dataset_params["batch_size"], - model_params_single, + model_params_single["type_map"], ) if validation_systems else None @@ -143,13 +143,13 @@ def prepare_trainer_input_single( train_data_single = DpLoaderSet( training_systems, 
training_dataset_params["batch_size"], - model_params_single, + model_params_single["type_map"], ) else: train_data_single = DpLoaderSet( training_systems, training_dataset_params["batch_size"], - model_params_single, + model_params_single["type_map"], ) return ( train_data_single, diff --git a/deepmd/pt/utils/dataloader.py b/deepmd/pt/utils/dataloader.py index 2715bced52..0359071d71 100644 --- a/deepmd/pt/utils/dataloader.py +++ b/deepmd/pt/utils/dataloader.py @@ -55,13 +55,27 @@ def setup_seed(seed): class DpLoaderSet(Dataset): - """A dataset for storing DataLoaders to multiple Systems.""" + """A dataset for storing DataLoaders to multiple Systems. + + Parameters + ---------- + sys_path + Path to the data system + batch_size + Max frame count in a batch. + type_map + Gives the name of different atom types + seed + Random seed for dataloader + shuffle + If the data are shuffled (Only effective in serial mode. Always shuffle in distributed data parallelism) + """ def __init__( self, systems, batch_size, - model_params, + type_map, seed=10, shuffle=True, ): @@ -77,8 +91,7 @@ def __init__( def construct_dataset(system): return DeepmdDataSetForLoader( system=system, - type_map=model_params["type_map"], - shuffle=shuffle, + type_map=type_map, ) with Pool( diff --git a/deepmd/pt/utils/dataset.py b/deepmd/pt/utils/dataset.py index 77297d980c..dbe4d92a0f 100644 --- a/deepmd/pt/utils/dataset.py +++ b/deepmd/pt/utils/dataset.py @@ -3,6 +3,7 @@ from typing import ( List, + Optional, ) from torch.utils.data import ( @@ -16,24 +17,16 @@ class DeepmdDataSetForLoader(Dataset): - def __init__( - self, - system: str, - type_map: str, - shuffle=True, - ): + def __init__(self, system: str, type_map: Optional[List[str]] = None): """Construct DeePMD-style dataset containing frames cross different systems. Args: - systems: Paths to systems. - - batch_size: Max frame count in a batch. - type_map: Atom types. 
""" self.system = system self._type_map = type_map - self._data_system = DeepmdData( - sys_path=system, shuffle_test=shuffle, type_map=self._type_map - ) + self._data_system = DeepmdData(sys_path=system, type_map=self._type_map) self.mixed_type = self._data_system.mixed_type self._ntypes = self._data_system.get_ntypes() self._natoms = self._data_system.get_natoms() diff --git a/source/tests/pt/model/test_model.py b/source/tests/pt/model/test_model.py index 69ec88f5d7..f42c11aa4c 100644 --- a/source/tests/pt/model/test_model.py +++ b/source/tests/pt/model/test_model.py @@ -273,18 +273,7 @@ def test_consistency(self): self.wanted_step ) # Build DeePMD graph - my_ds = DpLoaderSet( - self.systems, - self.batch_size, - model_params={ - "descriptor": { - "type": "se_e2_a", - "sel": self.sel, - "rcut": self.rcut, - }, - "type_map": self.type_map, - }, - ) + my_ds = DpLoaderSet(self.systems, self.batch_size, self.type_map) my_ds.add_data_requirement(energy_data_requirement) my_model = get_model( model_params={ diff --git a/source/tests/pt/model/test_saveload_dpa1.py b/source/tests/pt/model/test_saveload_dpa1.py index 408afbef43..712b44485e 100644 --- a/source/tests/pt/model/test_saveload_dpa1.py +++ b/source/tests/pt/model/test_saveload_dpa1.py @@ -46,18 +46,7 @@ def get_dataset(config): batch_size = config["training"]["training_data"]["batch_size"] type_map = model_config["type_map"] - dataset = DpLoaderSet( - systems, - batch_size, - model_params={ - "descriptor": { - "type": "dpa1", - "sel": sel, - "rcut": rcut, - }, - "type_map": type_map, - }, - ) + dataset = DpLoaderSet(systems, batch_size, type_map) data_stat_nbatch = model_config.get("data_stat_nbatch", 10) sampled = make_stat_input(dataset.systems, dataset.dataloaders, data_stat_nbatch) return dataset, sampled diff --git a/source/tests/pt/model/test_saveload_se_e2_a.py b/source/tests/pt/model/test_saveload_se_e2_a.py index 382f119c30..56ea3283d9 100644 --- a/source/tests/pt/model/test_saveload_se_e2_a.py +++ 
b/source/tests/pt/model/test_saveload_se_e2_a.py @@ -46,18 +46,7 @@ def get_dataset(config): batch_size = config["training"]["training_data"]["batch_size"] type_map = model_config["type_map"] - dataset = DpLoaderSet( - systems, - batch_size, - model_params={ - "descriptor": { - "type": "se_e2_a", - "sel": sel, - "rcut": rcut, - }, - "type_map": type_map, - }, - ) + dataset = DpLoaderSet(systems, batch_size, type_map) data_stat_nbatch = model_config.get("data_stat_nbatch", 10) sampled = make_stat_input(dataset.systems, dataset.dataloaders, data_stat_nbatch) return dataset, sampled diff --git a/source/tests/pt/test_sampler.py b/source/tests/pt/test_sampler.py index 25980cc144..4f1091c936 100644 --- a/source/tests/pt/test_sampler.py +++ b/source/tests/pt/test_sampler.py @@ -46,14 +46,7 @@ def setUp(self): self.my_dataset = DpLoaderSet( self.systems, self.batch_size, - model_params={ - "descriptor": { - "type": "se_e2_a", - "sel": self.sel, - "rcut": self.rcut, - }, - "type_map": model_config["type_map"], - }, + model_config["type_map"], seed=10, shuffle=False, ) diff --git a/source/tests/pt/test_stat.py b/source/tests/pt/test_stat.py index e69caad502..51ca903bc2 100644 --- a/source/tests/pt/test_stat.py +++ b/source/tests/pt/test_stat.py @@ -137,14 +137,7 @@ def setUp(self): self.my_dataset = DpLoaderSet( self.systems, self.batch_size, - model_params={ - "descriptor": { - "type": "se_e2_a", - "sel": self.sel, - "rcut": self.rcut, - }, - "type_map": model_config["type_map"], - }, + model_config["type_map"], seed=10, ) self.filter_neuron = model_config["descriptor"]["neuron"] From fd82f0484e9d2d6625937f889b3b379b9cc5af23 Mon Sep 17 00:00:00 2001 From: Duo <50307526+iProzd@users.noreply.github.com> Date: Sun, 10 Mar 2024 02:02:57 +0800 Subject: [PATCH 7/8] Add `max_ckpt_keep` for trainer (#3441) Signed-off-by: Duo <50307526+iProzd@users.noreply.github.com> Co-authored-by: Jinzhe Zeng --- deepmd/pt/train/training.py | 10 ++++++++++ deepmd/tf/train/trainer.py | 5 ++++- 
deepmd/utils/argcheck.py | 6 ++++++ 3 files changed, 20 insertions(+), 1 deletion(-) diff --git a/deepmd/pt/train/training.py b/deepmd/pt/train/training.py index 62bc5a4c97..fb28f0c4f2 100644 --- a/deepmd/pt/train/training.py +++ b/deepmd/pt/train/training.py @@ -132,6 +132,7 @@ def __init__( self.disp_freq = training_params.get("disp_freq", 1000) self.save_ckpt = training_params.get("save_ckpt", "model.ckpt") self.save_freq = training_params.get("save_freq", 1000) + self.max_ckpt_keep = training_params.get("max_ckpt_keep", 5) self.lcurve_should_print_header = True def get_opt_param(params): @@ -924,6 +925,15 @@ def save_model(self, save_path, lr=0.0, step=0): {"model": module.state_dict(), "optimizer": self.optimizer.state_dict()}, save_path, ) + checkpoint_dir = save_path.parent + checkpoint_files = [ + f + for f in checkpoint_dir.glob("*.pt") + if not f.is_symlink() and f.name.startswith(self.save_ckpt) + ] + if len(checkpoint_files) > self.max_ckpt_keep: + checkpoint_files.sort(key=lambda x: x.stat().st_mtime) + checkpoint_files[0].unlink() def get_data(self, is_train=True, task_key="Default"): if not self.multi_task: diff --git a/deepmd/tf/train/trainer.py b/deepmd/tf/train/trainer.py index 1dd31fd0bb..27478abaa1 100644 --- a/deepmd/tf/train/trainer.py +++ b/deepmd/tf/train/trainer.py @@ -164,6 +164,7 @@ def get_lr_and_coef(lr_param): self.disp_freq = tr_data.get("disp_freq", 1000) self.save_freq = tr_data.get("save_freq", 1000) self.save_ckpt = tr_data.get("save_ckpt", "model.ckpt") + self.max_ckpt_keep = tr_data.get("max_ckpt_keep", 5) self.display_in_training = tr_data.get("disp_training", True) self.timing_in_training = tr_data.get("time_training", True) self.profiling = self.run_opt.is_chief and tr_data.get("profiling", False) @@ -498,7 +499,9 @@ def _init_session(self): # Initializes or restore global variables init_op = tf.global_variables_initializer() if self.run_opt.is_chief: - self.saver = tf.train.Saver(save_relative_paths=True) + self.saver = 
tf.train.Saver( + save_relative_paths=True, max_to_keep=self.max_ckpt_keep + ) if self.run_opt.init_mode == "init_from_scratch": log.info("initialize model from scratch") run_sess(self.sess, init_op) diff --git a/deepmd/utils/argcheck.py b/deepmd/utils/argcheck.py index 5e8db431f8..e822e18d50 100644 --- a/deepmd/utils/argcheck.py +++ b/deepmd/utils/argcheck.py @@ -2134,6 +2134,11 @@ def training_args(): # ! modified by Ziyao: data configuration isolated. doc_disp_freq = "The frequency of printing learning curve." doc_save_freq = "The frequency of saving check point." doc_save_ckpt = "The path prefix of saving check point files." + doc_max_ckpt_keep = ( + "The maximum number of checkpoints to keep. " + "The oldest checkpoints will be deleted once the number of checkpoints exceeds max_ckpt_keep. " + "Defaults to 5." + ) doc_disp_training = "Displaying verbose information during training." doc_time_training = "Timing durining training." doc_profiling = "Profiling during training." @@ -2192,6 +2197,7 @@ def training_args(): # ! modified by Ziyao: data configuration isolated. 
Argument( "save_ckpt", str, optional=True, default="model.ckpt", doc=doc_save_ckpt ), + Argument("max_ckpt_keep", int, optional=True, default=5, doc=doc_max_ckpt_keep), Argument( "disp_training", bool, optional=True, default=True, doc=doc_disp_training ), From a286bd498ca4ea30c952b6b19587db4757a3fa55 Mon Sep 17 00:00:00 2001 From: Duo <50307526+iProzd@users.noreply.github.com> Date: Sun, 10 Mar 2024 15:49:51 +0800 Subject: [PATCH 8/8] pt: cleanup tester (#3442) --- deepmd/pt/entrypoints/main.py | 4 +- deepmd/pt/infer/inference.py | 350 ------------------------------ source/tests/pt/model/test_jit.py | 2 +- 3 files changed, 2 insertions(+), 354 deletions(-) diff --git a/deepmd/pt/entrypoints/main.py b/deepmd/pt/entrypoints/main.py index 76796f6197..46d284a395 100644 --- a/deepmd/pt/entrypoints/main.py +++ b/deepmd/pt/entrypoints/main.py @@ -281,9 +281,7 @@ def train(FLAGS): def freeze(FLAGS): - model = torch.jit.script( - inference.Tester(FLAGS.model, numb_test=1, head=FLAGS.head).model - ) + model = torch.jit.script(inference.Tester(FLAGS.model, head=FLAGS.head).model) torch.jit.save( model, FLAGS.output, diff --git a/deepmd/pt/infer/inference.py b/deepmd/pt/infer/inference.py index e97623dd24..6c13b363bc 100644 --- a/deepmd/pt/infer/inference.py +++ b/deepmd/pt/infer/inference.py @@ -1,41 +1,20 @@ # SPDX-License-Identifier: LGPL-3.0-or-later -import json import logging -import math from copy import ( deepcopy, ) -from pathlib import ( - Path, -) -import numpy as np import torch -from torch.utils.data import ( - DataLoader, - RandomSampler, -) -from deepmd.common import ( - expand_sys_str, -) -from deepmd.pt.loss import ( - DenoiseLoss, - EnergyStdLoss, -) from deepmd.pt.model.model import ( get_model, ) from deepmd.pt.train.wrapper import ( ModelWrapper, ) -from deepmd.pt.utils.dataloader import ( - DpLoaderSet, -) from deepmd.pt.utils.env import ( DEVICE, JIT, - NUM_WORKERS, ) if torch.__version__.startswith("2"): @@ -47,12 +26,6 @@ class Tester: def __init__( 
self, model_ckpt, - input_script=None, - system=None, - datafile=None, - numb_test=100, - detail_file=None, - shuffle_test=False, head=None, ): """Construct a DeePMD tester. @@ -60,9 +33,6 @@ def __init__( Args: - config: The Dict-like configuration with training options. """ - self.numb_test = numb_test - self.detail_file = detail_file - self.shuffle_test = shuffle_test # Model state_dict = torch.load(model_ckpt, map_location=DEVICE) if "model" in state_dict: @@ -85,54 +55,6 @@ def __init__( ] = state_dict[item].clone() state_dict = state_dict_head - # Data - if input_script is not None: - with open(input_script) as fin: - self.input_script = json.load(fin) - training_params = self.input_script["training"] - if not self.multi_task: - assert ( - "validation_data" in training_params - ), f"Validation systems not found in {input_script}!" - self.systems = training_params["validation_data"]["systems"] - self.batchsize = training_params["validation_data"]["batch_size"] - log.info(f"Testing validation systems in input script: {input_script}") - else: - assert ( - "data_dict" in training_params - ), f"Input script {input_script} is not in multi-task mode!" - assert head in training_params["data_dict"], ( - f"Specified head {head} not found in input script {input_script}! " - f"Available ones are {list(training_params['data_dict'].keys())}." - ) - assert ( - "validation_data" in training_params["data_dict"][head] - ), f"Validation systems not found in head {head} of {input_script}!" 
- self.systems = training_params["data_dict"][head]["validation_data"][ - "systems" - ] - self.batchsize = training_params["data_dict"][head]["validation_data"][ - "batch_size" - ] - log.info( - f"Testing validation systems in head {head} of input script: {input_script}" - ) - elif system is not None: - self.systems = expand_sys_str(system) - self.batchsize = "auto" - log.info("Testing systems in path: %s", system) - elif datafile is not None: - with open(datafile) as fin: - self.systems = fin.read().splitlines() - self.batchsize = "auto" - log.info("Testing systems in file: %s", datafile) - else: - self.systems = None - self.batchsize = None - - self.type_split = False - if model_params["descriptor"]["type"] in ["se_e2_a"]: - self.type_split = True self.model_params = deepcopy(model_params) model_params["resuming"] = True self.model = get_model(model_params).to(DEVICE) @@ -142,275 +64,3 @@ def __init__( if JIT: self.wrapper = torch.jit.script(self.wrapper) self.wrapper.load_state_dict(state_dict) - - # Loss - if "fitting_net" not in model_params: - assert ( - input_script is not None - ), "Denoise model must use --input-script mode!" - loss_params = self.input_script["loss"] - loss_type = loss_params.pop("type", "ener") - assert ( - loss_type == "denoise" - ), "Models without fitting_net only support denoise test!" 
- self.noise_settings = { - "noise_type": loss_params.pop("noise_type", "uniform"), - "noise": loss_params.pop("noise", 1.0), - "noise_mode": loss_params.pop("noise_mode", "fix_num"), - "mask_num": loss_params.pop("mask_num", 8), - "same_mask": loss_params.pop("same_mask", False), - "mask_coord": loss_params.pop("mask_coord", False), - "mask_type": loss_params.pop("mask_type", False), - "mask_type_idx": len(model_params["type_map"]) - 1, - } - loss_params["ntypes"] = len(model_params["type_map"]) - self.loss = DenoiseLoss(**loss_params) - else: - self.noise_settings = None - self.loss = EnergyStdLoss(inference=True) - - @staticmethod - def get_data(data): - with torch.device("cpu"): - batch_data = next(iter(data)) - for key in batch_data.keys(): - if key == "sid" or key == "fid": - continue - elif not isinstance(batch_data[key], list): - if batch_data[key] is not None: - batch_data[key] = batch_data[key].to(DEVICE) - else: - batch_data[key] = [item.to(DEVICE) for item in batch_data[key]] - input_dict = {} - for item in [ - "coord", - "atype", - "box", - ]: - if item in batch_data: - input_dict[item] = batch_data[item] - else: - input_dict[item] = None - label_dict = {} - for item in [ - "energy", - "force", - "virial", - "clean_coord", - "clean_type", - "coord_mask", - "type_mask", - ]: - if item in batch_data: - label_dict[item] = batch_data[item] - return input_dict, label_dict - - def run(self): - systems = self.systems - system_results = {} - global_sum_natoms = 0 - for cc, system in enumerate(systems): - log.info("# ---------------output of dp test--------------- ") - log.info(f"# testing system : {system}") - system_pred = [] - system_label = [] - dataset = DpLoaderSet( - [system], - self.batchsize, - self.model_params, - shuffle=self.shuffle_test, - ) - sampler = RandomSampler( - dataset, replacement=True, num_samples=dataset.total_batch - ) - if sampler is None: - log.warning( - "Sampler not specified!" 
- ) # None sampler will lead to a premature stop iteration. Replacement should be True in attribute of the sampler to produce expected number of items in one iteration. - dataloader = DataLoader( - dataset, - sampler=sampler, - batch_size=None, - num_workers=min( - NUM_WORKERS, 1 - ), # setting to 0 diverges the behavior of its iterator; should be >=1 - drop_last=False, - ) - with torch.device("cpu"): - data = iter(dataloader) - - single_results = {} - sum_natoms = 0 - sys_natoms = None - for ii in range(self.numb_test): - try: - input_dict, label_dict = self.get_data(data) - except StopIteration: - if ( - ii < dataset.total_batch - ): # Unexpected stop iteration.(test step < total batch) - raise StopIteration - else: - break - model_pred, _, _ = self.wrapper(**input_dict) - system_pred.append( - { - item: model_pred[item].detach().cpu().numpy() - for item in model_pred - } - ) - system_label.append( - { - item: label_dict[item].detach().cpu().numpy() - for item in label_dict - } - ) - natoms = int(input_dict["atype"].shape[-1]) - _, more_loss = self.loss( - model_pred, label_dict, natoms, 1.0, mae=True - ) # TODO: lr here is useless - if sys_natoms is None: - sys_natoms = natoms - else: - assert ( - sys_natoms == natoms - ), "Frames in one system must be the same!" 
- sum_natoms += natoms - for k, v in more_loss.items(): - if "mae" in k: - single_results[k] = single_results.get(k, 0.0) + v * natoms - else: - single_results[k] = single_results.get(k, 0.0) + v**2 * natoms - if self.detail_file is not None: - save_detail_file( - Path(self.detail_file), - system_pred, - system_label, - sys_natoms, - system_name=system, - append=(cc != 0), - ) - results = { - k: v / sum_natoms if "mae" in k else math.sqrt(v / sum_natoms) - for k, v in single_results.items() - } - for item in sorted(results.keys()): - log.info(f"{item}: {results[item]:.4f}") - log.info("# ----------------------------------------------- ") - for k, v in single_results.items(): - system_results[k] = system_results.get(k, 0.0) + v - global_sum_natoms += sum_natoms - - global_results = { - k: v / global_sum_natoms if "mae" in k else math.sqrt(v / global_sum_natoms) - for k, v in system_results.items() - } - log.info("# ----------weighted average of errors----------- ") - if not self.multi_task: - log.info(f"# number of systems : {len(systems)}") - else: - log.info(f"# number of systems for {self.head}: {len(systems)}") - for item in sorted(global_results.keys()): - log.info(f"{item}: {global_results[item]:.4f}") - log.info("# ----------------------------------------------- ") - return global_results - - -def save_txt_file( - fname: Path, data: np.ndarray, header: str = "", append: bool = False -): - """Save numpy array to test file. 
- - Parameters - ---------- - fname : str - filename - data : np.ndarray - data to save to disk - header : str, optional - header string to use in file, by default "" - append : bool, optional - if true file will be appended insted of overwriting, by default False - """ - flags = "ab" if append else "w" - with fname.open(flags) as fp: - np.savetxt(fp, data, header=header) - - -def save_detail_file( - detail_path, system_pred, system_label, natoms, system_name, append=False -): - ntest = len(system_pred) - data_e = np.concatenate([item["energy"] for item in system_label]).reshape([-1, 1]) - pred_e = np.concatenate([item["energy"] for item in system_pred]).reshape([-1, 1]) - pe = np.concatenate( - ( - data_e, - pred_e, - ), - axis=1, - ) - save_txt_file( - detail_path.with_suffix(".e.out"), - pe, - header="%s: data_e pred_e" % system_name, - append=append, - ) - pe_atom = pe / natoms - save_txt_file( - detail_path.with_suffix(".e_peratom.out"), - pe_atom, - header="%s: data_e pred_e" % system_name, - append=append, - ) - if "force" in system_pred[0]: - data_f = np.concatenate([item["force"] for item in system_label]).reshape( - [-1, 3] - ) - pred_f = np.concatenate([item["force"] for item in system_pred]).reshape( - [-1, 3] - ) - pf = np.concatenate( - ( - data_f, - pred_f, - ), - axis=1, - ) - save_txt_file( - detail_path.with_suffix(".f.out"), - pf, - header="%s: data_fx data_fy data_fz pred_fx pred_fy pred_fz" % system_name, - append=append, - ) - if "virial" in system_pred[0]: - data_v = np.concatenate([item["virial"] for item in system_label]).reshape( - [-1, 9] - ) - pred_v = np.concatenate([item["virial"] for item in system_pred]).reshape( - [-1, 9] - ) - pv = np.concatenate( - ( - data_v, - pred_v, - ), - axis=1, - ) - save_txt_file( - detail_path.with_suffix(".v.out"), - pv, - header=f"{system_name}: data_vxx data_vxy data_vxz data_vyx data_vyy " - "data_vyz data_vzx data_vzy data_vzz pred_vxx pred_vxy pred_vxz pred_vyx " - "pred_vyy pred_vyz pred_vzx 
pred_vzy pred_vzz", - append=append, - ) - pv_atom = pv / natoms - save_txt_file( - detail_path.with_suffix(".v_peratom.out"), - pv_atom, - header=f"{system_name}: data_vxx data_vxy data_vxz data_vyx data_vyy " - "data_vyz data_vzx data_vzy data_vzz pred_vxx pred_vxy pred_vxz pred_vyx " - "pred_vyy pred_vyz pred_vzx pred_vzy pred_vzz", - append=append, - ) diff --git a/source/tests/pt/model/test_jit.py b/source/tests/pt/model/test_jit.py index fc07267b88..81ea49a68e 100644 --- a/source/tests/pt/model/test_jit.py +++ b/source/tests/pt/model/test_jit.py @@ -31,7 +31,7 @@ class JITTest: def test_jit(self): trainer = get_trainer(deepcopy(self.config)) trainer.run() - model = torch.jit.script(inference.Tester("./model.pt", numb_test=1).model) + model = torch.jit.script(inference.Tester("./model.pt").model) torch.jit.save(model, "./frozen_model.pth", {}) def tearDown(self):