From 8394101c60f8418ee3727c512d051816c259c690 Mon Sep 17 00:00:00 2001 From: Futaki Haduki <812556867@qq.com> Date: Thu, 1 Aug 2024 03:33:19 +0800 Subject: [PATCH] fix(pt): invalid type_map when multitask training (#4031) It seems that in 3.0.0b3, running multitask training or a finetune task raises a `RuntimeError` reporting an inconsistent type map. The error log is shown below. However, the `type_map` in multitask should be a shared dict. Diving into the source code, we can see that `type_map` is read [here](https://github.com/deepmodeling/deepmd-kit/blob/0e0fc1a63e478d3e56285b520b34a9c58488d659/deepmd/pt/entrypoints/main.py#L300). This results in an empty `type_map` in multitask training because no `type_map` is found there. After applying the modification in this PR, everything seems to work well. ``` Traceback (most recent call last): File "/public/home/ypliucat/.conda/envs/deepmd-kit-3.0.0b3/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 346, in wrapper return f(*args, **kwargs) File "/public/home/ypliucat/.conda/envs/deepmd-kit-3.0.0b3/lib/python3.10/site-packages/deepmd/pt/entrypoints/main.py", line 562, in main train(FLAGS) File "/public/home/ypliucat/.conda/envs/deepmd-kit-3.0.0b3/lib/python3.10/site-packages/deepmd/pt/entrypoints/main.py", line 311, in train train_data = get_data( File "/public/home/ypliucat/.conda/envs/deepmd-kit-3.0.0b3/lib/python3.10/site-packages/deepmd/utils/data_system.py", line 802, in get_data data = DeepmdDataSystem( File "/public/home/ypliucat/.conda/envs/deepmd-kit-3.0.0b3/lib/python3.10/site-packages/deepmd/utils/data_system.py", line 184, in __init__ self.type_map = self._check_type_map_consistency(type_map_list) File "/public/home/ypliucat/.conda/envs/deepmd-kit-3.0.0b3/lib/python3.10/site-packages/deepmd/utils/data_system.py", line 616, in _check_type_map_consistency raise RuntimeError(f"inconsistent type map: {ret!s} {ii!s}") RuntimeError: inconsistent type map: ['Ag', 'Cu'] ['Ag', 'Ni'] 
``` ## Summary by CodeRabbit - **New Features** - Enhanced the training process to ensure consistent handling of model type configurations, improving clarity and availability based on multi-task settings. Signed-off-by: Futaki Haduki <812556867@qq.com> --- deepmd/pt/entrypoints/main.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/deepmd/pt/entrypoints/main.py b/deepmd/pt/entrypoints/main.py index 3ea2965fa7..7ba5f0b63a 100644 --- a/deepmd/pt/entrypoints/main.py +++ b/deepmd/pt/entrypoints/main.py @@ -297,8 +297,8 @@ def train(FLAGS): "Calculate neighbor statistics... (add --skip-neighbor-stat to skip this step)" ) - type_map = config["model"].get("type_map") if not multi_task: + type_map = config["model"].get("type_map") train_data = get_data( config["training"]["training_data"], 0, type_map, None ) @@ -308,6 +308,7 @@ def train(FLAGS): else: min_nbor_dist = {} for model_item in config["model"]["model_dict"]: + type_map = config["model"]["model_dict"][model_item].get("type_map") train_data = get_data( config["training"]["data_dict"][model_item]["training_data"], 0,