From a9bcf4153847ddb3773f46b0cf7e011eaecacf43 Mon Sep 17 00:00:00 2001 From: Lysithea <52808607+CaRoLZhangxy@users.noreply.github.com> Date: Sat, 9 Mar 2024 01:51:52 +0800 Subject: [PATCH] clean up the init interface of pt.dataloader (#3434) https://github.com/deepmodeling/deepmd-kit/issues/3427 --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- deepmd/pt/entrypoints/main.py | 6 +++--- deepmd/pt/utils/dataloader.py | 21 +++++++++++++++---- deepmd/pt/utils/dataset.py | 13 +++--------- source/tests/pt/model/test_model.py | 13 +----------- source/tests/pt/model/test_saveload_dpa1.py | 13 +----------- .../tests/pt/model/test_saveload_se_e2_a.py | 13 +----------- source/tests/pt/test_sampler.py | 9 +------- source/tests/pt/test_stat.py | 9 +------- 8 files changed, 28 insertions(+), 69 deletions(-) diff --git a/deepmd/pt/entrypoints/main.py b/deepmd/pt/entrypoints/main.py index 0e5767cb4e..76796f6197 100644 --- a/deepmd/pt/entrypoints/main.py +++ b/deepmd/pt/entrypoints/main.py @@ -134,7 +134,7 @@ def prepare_trainer_input_single( DpLoaderSet( validation_systems, validation_dataset_params["batch_size"], - model_params_single, + model_params_single["type_map"], ) if validation_systems else None @@ -143,13 +143,13 @@ def prepare_trainer_input_single( train_data_single = DpLoaderSet( training_systems, training_dataset_params["batch_size"], - model_params_single, + model_params_single["type_map"], ) else: train_data_single = DpLoaderSet( training_systems, training_dataset_params["batch_size"], - model_params_single, + model_params_single["type_map"], ) return ( train_data_single, diff --git a/deepmd/pt/utils/dataloader.py b/deepmd/pt/utils/dataloader.py index 2715bced52..0359071d71 100644 --- a/deepmd/pt/utils/dataloader.py +++ b/deepmd/pt/utils/dataloader.py @@ -55,13 +55,27 @@ def setup_seed(seed): class DpLoaderSet(Dataset): - """A dataset for storing DataLoaders to multiple Systems.""" + """A dataset for storing 
DataLoaders to multiple Systems. + + Parameters + ---------- + systems + Paths to the data systems + batch_size + Max frame count in a batch. + type_map + Gives the name of different atom types + seed + Random seed for dataloader + shuffle + If the data are shuffled (Only effective in serial mode. Always shuffle in distributed data parallelism) + """ def __init__( self, systems, batch_size, - model_params, + type_map, seed=10, shuffle=True, ): @@ -77,8 +91,7 @@ def __init__( def construct_dataset(system): return DeepmdDataSetForLoader( system=system, - type_map=model_params["type_map"], - shuffle=shuffle, + type_map=type_map, ) with Pool( diff --git a/deepmd/pt/utils/dataset.py b/deepmd/pt/utils/dataset.py index 77297d980c..dbe4d92a0f 100644 --- a/deepmd/pt/utils/dataset.py +++ b/deepmd/pt/utils/dataset.py @@ -3,6 +3,7 @@ from typing import ( List, + Optional, ) from torch.utils.data import ( @@ -16,24 +17,16 @@ class DeepmdDataSetForLoader(Dataset): - def __init__( - self, - system: str, - type_map: str, - shuffle=True, - ): + def __init__(self, system: str, type_map: Optional[List[str]] = None): """Construct DeePMD-style dataset containing frames cross different systems. Args: - systems: Paths to systems. - - batch_size: Max frame count in a batch. - type_map: Atom types. 
""" self.system = system self._type_map = type_map - self._data_system = DeepmdData( - sys_path=system, shuffle_test=shuffle, type_map=self._type_map - ) + self._data_system = DeepmdData(sys_path=system, type_map=self._type_map) self.mixed_type = self._data_system.mixed_type self._ntypes = self._data_system.get_ntypes() self._natoms = self._data_system.get_natoms() diff --git a/source/tests/pt/model/test_model.py b/source/tests/pt/model/test_model.py index 69ec88f5d7..f42c11aa4c 100644 --- a/source/tests/pt/model/test_model.py +++ b/source/tests/pt/model/test_model.py @@ -273,18 +273,7 @@ def test_consistency(self): self.wanted_step ) # Build DeePMD graph - my_ds = DpLoaderSet( - self.systems, - self.batch_size, - model_params={ - "descriptor": { - "type": "se_e2_a", - "sel": self.sel, - "rcut": self.rcut, - }, - "type_map": self.type_map, - }, - ) + my_ds = DpLoaderSet(self.systems, self.batch_size, self.type_map) my_ds.add_data_requirement(energy_data_requirement) my_model = get_model( model_params={ diff --git a/source/tests/pt/model/test_saveload_dpa1.py b/source/tests/pt/model/test_saveload_dpa1.py index 408afbef43..712b44485e 100644 --- a/source/tests/pt/model/test_saveload_dpa1.py +++ b/source/tests/pt/model/test_saveload_dpa1.py @@ -46,18 +46,7 @@ def get_dataset(config): batch_size = config["training"]["training_data"]["batch_size"] type_map = model_config["type_map"] - dataset = DpLoaderSet( - systems, - batch_size, - model_params={ - "descriptor": { - "type": "dpa1", - "sel": sel, - "rcut": rcut, - }, - "type_map": type_map, - }, - ) + dataset = DpLoaderSet(systems, batch_size, type_map) data_stat_nbatch = model_config.get("data_stat_nbatch", 10) sampled = make_stat_input(dataset.systems, dataset.dataloaders, data_stat_nbatch) return dataset, sampled diff --git a/source/tests/pt/model/test_saveload_se_e2_a.py b/source/tests/pt/model/test_saveload_se_e2_a.py index 382f119c30..56ea3283d9 100644 --- a/source/tests/pt/model/test_saveload_se_e2_a.py +++ 
b/source/tests/pt/model/test_saveload_se_e2_a.py @@ -46,18 +46,7 @@ def get_dataset(config): batch_size = config["training"]["training_data"]["batch_size"] type_map = model_config["type_map"] - dataset = DpLoaderSet( - systems, - batch_size, - model_params={ - "descriptor": { - "type": "se_e2_a", - "sel": sel, - "rcut": rcut, - }, - "type_map": type_map, - }, - ) + dataset = DpLoaderSet(systems, batch_size, type_map) data_stat_nbatch = model_config.get("data_stat_nbatch", 10) sampled = make_stat_input(dataset.systems, dataset.dataloaders, data_stat_nbatch) return dataset, sampled diff --git a/source/tests/pt/test_sampler.py b/source/tests/pt/test_sampler.py index 25980cc144..4f1091c936 100644 --- a/source/tests/pt/test_sampler.py +++ b/source/tests/pt/test_sampler.py @@ -46,14 +46,7 @@ def setUp(self): self.my_dataset = DpLoaderSet( self.systems, self.batch_size, - model_params={ - "descriptor": { - "type": "se_e2_a", - "sel": self.sel, - "rcut": self.rcut, - }, - "type_map": model_config["type_map"], - }, + model_config["type_map"], seed=10, shuffle=False, ) diff --git a/source/tests/pt/test_stat.py b/source/tests/pt/test_stat.py index e69caad502..51ca903bc2 100644 --- a/source/tests/pt/test_stat.py +++ b/source/tests/pt/test_stat.py @@ -137,14 +137,7 @@ def setUp(self): self.my_dataset = DpLoaderSet( self.systems, self.batch_size, - model_params={ - "descriptor": { - "type": "se_e2_a", - "sel": self.sel, - "rcut": self.rcut, - }, - "type_map": model_config["type_map"], - }, + model_config["type_map"], seed=10, ) self.filter_neuron = model_config["descriptor"]["neuron"]