From 11771fa420b038d9ab5c2622a158bacef78149ac Mon Sep 17 00:00:00 2001
From: Jinzhe Zeng
Date: Thu, 16 Nov 2023 18:03:42 -0500
Subject: [PATCH 1/4] fix restarting from compressed training with type embedding

Signed-off-by: Jinzhe Zeng
---
 deepmd/model/dos.py           |  4 ++-
 deepmd/model/ener.py          |  4 ++-
 deepmd/model/model.py         | 55 +++++++++++++++++++++++++++++++++++
 deepmd/model/multi.py         |  4 ++-
 deepmd/model/pairwise_dprc.py |  4 ++-
 deepmd/model/tensor.py        |  4 ++-
 deepmd/utils/type_embed.py    |  8 +----
 7 files changed, 71 insertions(+), 12 deletions(-)

diff --git a/deepmd/model/dos.py b/deepmd/model/dos.py
index 697fad9a9e..22e291a0f0 100644
--- a/deepmd/model/dos.py
+++ b/deepmd/model/dos.py
@@ -155,10 +155,12 @@ def build(
 
         # type embedding if any
         if self.typeebd is not None:
-            type_embedding = self.typeebd.build(
+            type_embedding = self.build_type_embedding(
                 self.ntypes,
                 reuse=reuse,
                 suffix=suffix,
+                frz_model=frz_model,
+                ckpt_meta=ckpt_meta,
             )
             input_dict["type_embedding"] = type_embedding
             input_dict["atype"] = atype_
diff --git a/deepmd/model/ener.py b/deepmd/model/ener.py
index 1976c1ad51..0d8d66b305 100644
--- a/deepmd/model/ener.py
+++ b/deepmd/model/ener.py
@@ -203,10 +203,12 @@ def build(
 
         # type embedding if any
         if self.typeebd is not None and "type_embedding" not in input_dict:
-            type_embedding = self.typeebd.build(
+            type_embedding = self.build_type_embedding(
                 self.ntypes,
                 reuse=reuse,
                 suffix=suffix,
+                ckpt_meta=ckpt_meta,
+                frz_model=frz_model,
             )
             input_dict["type_embedding"] = type_embedding
         # spin if any
diff --git a/deepmd/model/model.py b/deepmd/model/model.py
index 3f24e42aec..4a242904de 100644
--- a/deepmd/model/model.py
+++ b/deepmd/model/model.py
@@ -331,6 +331,61 @@ def build_descrpt(
             self.descrpt.pass_tensors_from_frz_model(*imported_tensors[:-1])
         return dout
 
+    def build_type_embedding(
+        self,
+        ntypes: int,
+        frz_model: Optional[str] = None,
+        ckpt_meta: Optional[str] = None,
+        suffix: str = "",
+        reuse: Optional[Union[bool, Enum]] = None,
+    ) -> tf.Tensor:
+        """Build the type embedding part of the model.
+
+        Parameters
+        ----------
+        ntypes : int
+            The number of types
+        frz_model : str, optional
+            The path to the frozen model
+        ckpt_meta : str, optional
+            The path prefix of the checkpoint and meta files
+        suffix : str, optional
+            The suffix of the scope
+        reuse : bool or tf.AUTO_REUSE, optional
+            Whether to reuse the variables
+
+        Returns
+        -------
+        tf.Tensor
+            The type embedding tensor
+        """
+        assert self.typeebd is not None
+        if frz_model is None and ckpt_meta is None:
+            dout = self.typeebd.build(
+                ntypes,
+                reuse=reuse,
+                suffix=suffix,
+            )
+        else:
+            # nothing input
+            feed_dict = {}
+            return_elements = [
+                f"t_typeebd{suffix}:0",
+            ]
+            if frz_model is not None:
+                imported_tensors = self._import_graph_def_from_frz_model(
+                    frz_model, feed_dict, return_elements
+                )
+            elif ckpt_meta is not None:
+                imported_tensors = self._import_graph_def_from_ckpt_meta(
+                    ckpt_meta, feed_dict, return_elements
+                )
+            else:
+                raise RuntimeError("should not reach here")  # pragma: no cover
+            dout = imported_tensors[-1]
+        self.typeebd.type_embedding_from_graph = dout
+        return dout
+
     def _import_graph_def_from_frz_model(
         self, frz_model: str, feed_dict: dict, return_elements: List[str]
     ):
diff --git a/deepmd/model/multi.py b/deepmd/model/multi.py
index bfc67b9792..83b231c0e8 100644
--- a/deepmd/model/multi.py
+++ b/deepmd/model/multi.py
@@ -317,10 +317,12 @@ def build(
 
         # type embedding if any
         if self.typeebd is not None:
-            type_embedding = self.typeebd.build(
+            type_embedding = self.build_type_embedding(
                 self.ntypes,
                 reuse=reuse,
                 suffix=suffix,
+                frz_model=frz_model,
+                ckpt_meta=ckpt_meta,
             )
             input_dict["type_embedding"] = type_embedding
             input_dict["atype"] = atype_
diff --git a/deepmd/model/pairwise_dprc.py b/deepmd/model/pairwise_dprc.py
index 6983a31cfd..f74571febb 100644
--- a/deepmd/model/pairwise_dprc.py
+++ b/deepmd/model/pairwise_dprc.py
@@ -173,10 +173,12 @@ def build(
         atype_qmmm = gather_placeholder(atype_qmmm, forward_qmmm_map, placeholder=-1)
         box_qm = box
 
-        type_embedding = self.typeebd.build(
+        type_embedding = self.build_type_embedding(
             self.ntypes,
             reuse=reuse,
             suffix=suffix,
+            frz_model=frz_model,
+            ckpt_meta=ckpt_meta,
         )
         input_dict_qm["type_embedding"] = type_embedding
         input_dict_qmmm["type_embedding"] = type_embedding
diff --git a/deepmd/model/tensor.py b/deepmd/model/tensor.py
index 9099b753a4..6a21e085f3 100644
--- a/deepmd/model/tensor.py
+++ b/deepmd/model/tensor.py
@@ -135,10 +135,12 @@ def build(
 
         # type embedding if any
         if self.typeebd is not None:
-            type_embedding = self.typeebd.build(
+            type_embedding = self.build_type_embedding(
                 self.ntypes,
                 reuse=reuse,
                 suffix=suffix,
+                ckpt_meta=ckpt_meta,
+                frz_model=frz_model,
             )
             input_dict["type_embedding"] = type_embedding
             input_dict["atype"] = atype_
diff --git a/deepmd/utils/type_embed.py b/deepmd/utils/type_embed.py
index aadbb3c6e0..c8ab01f7f5 100644
--- a/deepmd/utils/type_embed.py
+++ b/deepmd/utils/type_embed.py
@@ -16,7 +16,6 @@
     nvnmd_cfg,
 )
 from deepmd.utils.graph import (
-    get_tensor_by_name_from_graph,
     get_type_embedding_net_variables_from_graph_def,
 )
 from deepmd.utils.network import (
@@ -109,7 +108,6 @@ def __init__(
         self.trainable = trainable
         self.uniform_seed = uniform_seed
         self.type_embedding_net_variables = None
-        self.type_embedding_from_graph = None
         self.padding = padding
         self.model_type = None
 
@@ -135,8 +133,6 @@ def build(
         embedded_types
             The computational graph for embedded types
         """
-        if self.model_type is not None and self.model_type == "compressed_model":
-            return self.type_embedding_from_graph
         types = tf.convert_to_tensor(list(range(ntypes)), dtype=tf.int32)
         ebd_type = tf.cast(
             tf.one_hot(tf.cast(types, dtype=tf.int32), int(ntypes)),
@@ -166,7 +162,7 @@ def build(
         if self.padding:
             last_type = tf.cast(tf.zeros([1, self.neuron[-1]]), self.filter_precision)
             ebd_type = tf.concat([ebd_type, last_type], 0)  # (ntypes + 1) * neuron[-1]
-        self.ebd_type = tf.identity(ebd_type, name="t_typeebd")
+        self.ebd_type = tf.identity(ebd_type, name="t_typeebd" + suffix)
         return self.ebd_type
 
     def init_variables(
@@ -193,5 +189,3 @@ def init_variables(
         self.type_embedding_net_variables = (
             get_type_embedding_net_variables_from_graph_def(graph_def, suffix=suffix)
         )
-        type_embedding = get_tensor_by_name_from_graph(graph, "t_typeebd")
-        self.type_embedding_from_graph = tf.convert_to_tensor(type_embedding)

From d9dd12daa01686585f5859ef70cdae0e8f6d118d Mon Sep 17 00:00:00 2001
From: Jinzhe Zeng
Date: Thu, 16 Nov 2023 18:06:54 -0500
Subject: [PATCH 2/4] remove useless variable

Signed-off-by: Jinzhe Zeng
---
 deepmd/model/model.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/deepmd/model/model.py b/deepmd/model/model.py
index 4a242904de..dd439056b4 100644
--- a/deepmd/model/model.py
+++ b/deepmd/model/model.py
@@ -383,7 +383,6 @@ def build_type_embedding(
             else:
                 raise RuntimeError("should not reach here")  # pragma: no cover
             dout = imported_tensors[-1]
-        self.typeebd.type_embedding_from_graph = dout
         return dout
 
     def _import_graph_def_from_frz_model(

From 6692300637d2a995dec10baf1b9851b391ae226b Mon Sep 17 00:00:00 2001
From: Jinzhe Zeng
Date: Thu, 16 Nov 2023 13:24:17 -0500
Subject: [PATCH 3/4] add tests for compressed training and restart

Signed-off-by: Jinzhe Zeng
---
 source/tests/test_compressed_training.py | 60 ++++++++++++++++++++++++
 1 file changed, 60 insertions(+)
 create mode 100644 source/tests/test_compressed_training.py

diff --git a/source/tests/test_compressed_training.py b/source/tests/test_compressed_training.py
new file mode 100644
index 0000000000..a9420a5bd0
--- /dev/null
+++ b/source/tests/test_compressed_training.py
@@ -0,0 +1,60 @@
+# SPDX-License-Identifier: LGPL-3.0-or-later
+import json
+import os
+import unittest
+
+# from deepmd.entrypoints.compress import compress
+from common import (
+    j_loader,
+    run_dp,
+    tests_path,
+)
+from packaging.version import parse as parse_version
+
+from deepmd.env import (
+    tf,
+)
+
+
+@unittest.skipIf(
+    parse_version(tf.__version__) < parse_version("2"),
+    f"The current tf version {tf.__version__} is too low to run the new testing model.",
+)
+class TestCompressedTrainingSeAtten(unittest.TestCase):
+    def setUp(self) -> None:
+        data_file = str(tests_path / os.path.join("model_compression", "data"))
+        self.input_file = str(tests_path / "input.json")
+        self.frozen_model = str(tests_path / "dp-compress-training-original.pb")
+        self.compressed_model = str(tests_path / "dp-compress-training-compressed.pb")
+        self.frozen_compress_training_model = str(
+            tests_path / "dp-compress-training-compress-training.pb"
+        )
+        self.ckpt_file = str(tests_path / "dp-compress-training.ckpt")
+        jdata = j_loader(
+            str(tests_path / os.path.join("model_compression", "input.json"))
+        )
+        jdata["model"]["descriptor"] = {}
+        jdata["model"]["descriptor"]["type"] = "se_atten_v2"
+        jdata["model"]["descriptor"]["sel"] = 20
+        jdata["model"]["descriptor"]["attn_layer"] = 0
+        jdata["training"]["training_data"]["systems"] = data_file
+        jdata["training"]["validation_data"]["systems"] = data_file
+        jdata["training"]["save_ckpt"] = self.ckpt_file
+        with open(self.input_file, "w") as fp:
+            json.dump(jdata, fp, indent=4)
+
+    def test_compressed_training(self):
+        run_dp(f"dp train {self.input_file}")
+        run_dp(f"dp freeze -o {self.frozen_model}")
+        run_dp(f"dp compress -i {self.frozen_model} -o {self.compressed_model}")
+        # compress training
+        run_dp(f"dp train {self.input_file} -f {self.compressed_model}")
+        # restart compress training
+        run_dp(f"dp train {self.input_file} -r {self.ckpt_file}")
+        # freeze compress training
+        run_dp(f"dp freeze -o {self.frozen_compress_training_model}")
+        # it should not be able to compress again
+        with self.assertRaises(RuntimeError):
+            run_dp(
+                f"dp compress -i {self.frozen_compress_training_model} -o {self.compressed_model}"
+            )

From 8a73399d2d62568ef0bac27f9a73be3efc6ed114 Mon Sep 17 00:00:00 2001
From: Jinzhe Zeng
Date: Fri, 17 Nov 2023 22:00:14 -0500
Subject: [PATCH 4/4] assign `-c` for `dp freeze`

Signed-off-by: Jinzhe Zeng
---
 source/tests/test_compressed_training.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/source/tests/test_compressed_training.py b/source/tests/test_compressed_training.py
index a9420a5bd0..0a0bbeaadf 100644
--- a/source/tests/test_compressed_training.py
+++ b/source/tests/test_compressed_training.py
@@ -30,6 +30,7 @@ def setUp(self) -> None:
             tests_path / "dp-compress-training-compress-training.pb"
         )
         self.ckpt_file = str(tests_path / "dp-compress-training.ckpt")
+        self.checkpoint_dir = str(tests_path)
         jdata = j_loader(
            str(tests_path / os.path.join("model_compression", "input.json"))
         )
@@ -45,14 +46,16 @@
 
     def test_compressed_training(self):
         run_dp(f"dp train {self.input_file}")
-        run_dp(f"dp freeze -o {self.frozen_model}")
+        run_dp(f"dp freeze -c {self.checkpoint_dir} -o {self.frozen_model}")
         run_dp(f"dp compress -i {self.frozen_model} -o {self.compressed_model}")
         # compress training
         run_dp(f"dp train {self.input_file} -f {self.compressed_model}")
         # restart compress training
         run_dp(f"dp train {self.input_file} -r {self.ckpt_file}")
         # freeze compress training
-        run_dp(f"dp freeze -o {self.frozen_compress_training_model}")
+        run_dp(
+            f"dp freeze -c {self.checkpoint_dir} -o {self.frozen_compress_training_model}"
+        )
         # it should not be able to compress again
         with self.assertRaises(RuntimeError):
             run_dp(