Skip to content

Commit

Permalink
Merge pull request #502 from amcadmus/devel
Browse files Browse the repository at this point in the history
training on all sets in the training data systems. strict argcheck.
  • Loading branch information
amcadmus authored Apr 13, 2021
2 parents dd7dc60 + 008f10b commit 809e898
Show file tree
Hide file tree
Showing 7 changed files with 141 additions and 73 deletions.
7 changes: 4 additions & 3 deletions deepmd/entrypoints/train.py
Original file line number Diff line number Diff line change
Expand Up @@ -253,9 +253,9 @@ def _do_work(jdata: Dict[str, Any], run_opt: RunOptions):
# get batch sizes
batch_size = j_must_have(jdata["training"], "batch_size")
test_size = j_must_have(jdata["training"], "numb_test")
stop_batch = j_must_have(jdata["training"], "stop_batch")
stop_batch = j_must_have(jdata["training"], "numb_steps")
sys_probs = jdata["training"].get("sys_probs")
auto_prob_style = jdata["training"].get("auto_prob_style", "prob_sys_size")
auto_prob = jdata["training"].get("auto_prob", "prob_sys_size")

# setup data modifier
modifier: Optional[DipoleChargeModifier]
Expand Down Expand Up @@ -283,8 +283,9 @@ def _do_work(jdata: Dict[str, Any], run_opt: RunOptions):
set_prefix=set_pfx,
type_map=ipt_type_map,
modifier=modifier,
trn_all_set = True
)
data.print_summary(run_opt, sys_probs=sys_probs, auto_prob_style=auto_prob_style)
data.print_summary(run_opt, sys_probs=sys_probs, auto_prob_style=auto_prob)
data.add_dict(data_requirement)

# build the model with stats from the first system
Expand Down
4 changes: 2 additions & 2 deletions deepmd/train/trainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -251,7 +251,7 @@ def _init_param(self, jdata):
.add('tensorboard', bool, default = False)\
.add('tensorboard_log_dir',str, default = 'log')\
.add('sys_probs', list )\
.add('auto_prob_style', str, default = "prob_sys_size")
.add('auto_prob', str, default = "prob_sys_size")
tr_data = tr_args.parse(training_param)
# not needed
# self.numb_test = tr_data['numb_test']
Expand All @@ -266,7 +266,7 @@ def _init_param(self, jdata):
self.tensorboard = tr_data['tensorboard']
self.tensorboard_log_dir = tr_data['tensorboard_log_dir']
self.sys_probs = tr_data['sys_probs']
self.auto_prob_style = tr_data['auto_prob_style']
self.auto_prob_style = tr_data['auto_prob']
self.useBN = False
if fitting_type == 'ener' and self.fitting.get_numb_fparam() > 0 :
self.numb_fparam = self.fitting.get_numb_fparam()
Expand Down
57 changes: 38 additions & 19 deletions deepmd/utils/argcheck.py
Original file line number Diff line number Diff line change
Expand Up @@ -299,7 +299,7 @@ def model_args ():
return ca


def learning_rate_args():
def learning_rate_exp():
doc_start_lr = 'The learning rate the start of the training.'
doc_stop_lr = 'The desired learning rate at the end of the training.'
doc_decay_steps = 'The learning rate is decaying every this number of training steps.'
Expand All @@ -309,9 +309,24 @@ def learning_rate_args():
Argument("stop_lr", float, optional = True, default = 1e-8, doc = doc_stop_lr),
Argument("decay_steps", int, optional = True, default = 5000, doc = doc_decay_steps)
]
return args


def learning_rate_variant_type_args():
    """Build the ``Variant`` that dispatches the learning-rate section on its ``type`` key.

    Returns
    -------
    Variant
        A dargs ``Variant`` keyed on ``"type"``.  Only the exponential
        schedule (``"exp"``) is available today, and it doubles as the
        default when the key is omitted.
    """
    doc_lr = 'The type of the learning rate. Current type `exp`, the exponentially decaying learning rate is supported.'

    # Single choice for now; listed as a Variant so new schedules can be
    # registered here without touching callers.
    schedule_choices = [Argument("exp", dict, learning_rate_exp())]
    return Variant(
        "type",
        schedule_choices,
        optional=True,
        default_tag='exp',
        doc=doc_lr,
    )

doc_lr = "The learning rate options"
return Argument("learning_rate", dict, args, [], doc = doc_lr)

def learning_rate_args():
    """Build the ``Argument`` describing the ``"learning_rate"`` input section.

    The section carries no fixed keys of its own; its content is selected
    by the ``"type"`` sub-key through :func:`learning_rate_variant_type_args`
    (currently only the ``"exp"`` schedule exists).

    Returns
    -------
    Argument
        A dargs ``Argument`` named ``"learning_rate"`` of dict type.
    """
    # Fixed typo in the user-facing doc string: "definitio" -> "definition".
    doc_lr = "The definition of learning rate"
    return Argument(
        "learning_rate",
        dict,
        [],  # no plain sub-arguments; everything comes from the Variant
        [learning_rate_variant_type_args()],
        doc=doc_lr,
    )


def start_pref(item):
Expand Down Expand Up @@ -361,15 +376,16 @@ def loss_args():
return ca

def training_args():
link_sys = make_link("systems", "training/systems")
doc_systems = 'The data systems. This key can be provided with a listthat specifies the systems, or be provided with a string by which the prefix of all systems are given and the list of the systems is automatically generated.'
doc_set_prefix = 'The prefix of the sets in the systems.'
doc_set_prefix = f'The prefix of the sets in the {link_sys}.'
doc_stop_batch = 'Number of training batch. Each training uses one batch of data.'
doc_batch_size = 'This key can be \n\n\
- list: the length of which is the same as the `systems`. The batch size of each system is given by the elements of the list.\n\n\
- int: all `systems` uses the same batch size.\n\n\
- string "auto": automatically determines the batch size os that the batch_size times the number of atoms in the system is no less than 32.\n\n\
- string "auto:N": automatically determines the batch size os that the batch_size times the number of atoms in the system is no less than N.'
doc_seed = 'The random seed for training.'
doc_batch_size = f'This key can be \n\n\
- list: the length of which is the same as the {link_sys}. The batch size of each system is given by the elements of the list.\n\n\
- int: all {link_sys} use the same batch size.\n\n\
- string "auto": automatically determines the batch size so that the batch_size times the number of atoms in the system is no less than 32.\n\n\
- string "auto:N": automatically determines the batch size so that the batch_size times the number of atoms in the system is no less than N.'
doc_seed = 'The random seed for getting frames from the training data set.'
doc_disp_file = 'The file for printing learning curve.'
doc_disp_freq = 'The frequency of printing learning curve.'
doc_numb_test = 'Number of frames used for the test during training.'
Expand All @@ -382,17 +398,18 @@ def training_args():
doc_train_auto_prob_style = 'Determine the probability of systems automatically. The method is assigned by this key and can be\n\n\
- "prob_uniform" : the probability all the systems are equal, namely 1.0/self.get_nsystems()\n\n\
- "prob_sys_size" : the probability of a system is proportional to the number of batches in the system\n\n\
- "prob_sys_size;stt_idx:end_idx:weight;stt_idx:end_idx:weight;..." : \n\n\
the list of systems is devided into blocks. A block is specified by `stt_idx:end_idx:weight`, where `stt_idx` is the starting index of the system, `end_idx` is then ending (not including) index of the system, the probabilities of the systems in this block sums up to `weight`, and the relatively probabilities within this block is proportional to the number of batches in the system.'
- "prob_sys_size;stt_idx:end_idx:weight;stt_idx:end_idx:weight;..." : the list of systems is devided into blocks. A block is specified by `stt_idx:end_idx:weight`, where `stt_idx` is the starting index of the system, `end_idx` is then ending (not including) index of the system, the probabilities of the systems in this block sums up to `weight`, and the relatively probabilities within this block is proportional to the number of batches in the system.'
doc_train_sys_probs = "A list of float, should be of the same length as `train_systems`, specifying the probability of each system."
doc_tensorboard = 'Enable tensorboard'
doc_tensorboard_log_dir = 'The log directory of tensorboard outputs'

args = [
Argument("systems", [list,str], optional = False, doc = doc_systems, alias = ["train_systems"]),
Argument("auto_prob_style", str, optional = True, default = "prob_sys_size", doc = doc_train_auto_prob_style, alias = ["train_auto_prob_style"]),
Argument("sys_probs", list, optional = True, default = None, doc = doc_train_sys_probs, alias = ["train_sys_probs"]),
Argument("systems", [list,str], optional = False, doc = doc_systems, alias = ["trn_systems"]),
Argument("set_prefix", str, optional = True, default = 'set', doc = doc_set_prefix),
Argument("stop_batch", int, optional = False, doc = doc_stop_batch),
Argument("batch_size", [list,int,str], optional = True, default = 'auto', doc = doc_batch_size),
Argument("auto_prob", str, optional = True, default = "prob_sys_size", doc = doc_train_auto_prob_style, alias = ["trn_auto_prob", "auto_prob_style"]),
Argument("sys_probs", list, optional = True, default = None, doc = doc_train_sys_probs, alias = ["trn_sys_probs"]),
Argument("batch_size", [list,int,str], optional = True, default = 'auto', doc = doc_batch_size, alias = ["trn_batch_size"]),
Argument("numb_steps", int, optional = False, doc = doc_stop_batch, alias = ["stop_batch"]),
Argument("seed", [int,None], optional = True, doc = doc_seed),
Argument("disp_file", str, optional = True, default = 'lcueve.out', doc = doc_disp_file),
Argument("disp_freq", int, optional = True, default = 1000, doc = doc_disp_freq),
Expand All @@ -402,7 +419,9 @@ def training_args():
Argument("disp_training", bool, optional = True, default = True, doc = doc_disp_training),
Argument("time_training", bool, optional = True, default = True, doc = doc_time_training),
Argument("profiling", bool, optional = True, default = False, doc = doc_profiling),
Argument("profiling_file", str, optional = True, default = 'timeline.json', doc = doc_profiling_file)
Argument("profiling_file", str, optional = True, default = 'timeline.json', doc = doc_profiling_file),
Argument("tensorboard", bool, optional = True, default = False, doc = doc_tensorboard),
Argument("tensorboard_log_dir", str, optional = True, default = 'log', doc = doc_tensorboard_log_dir),
]

doc_training = 'The training options'
Expand Down Expand Up @@ -443,7 +462,7 @@ def normalize(data):

base = Argument("base", dict, [ma, lra, la, ta])
data = base.normalize_value(data, trim_pattern = "_*")
base.check_value(data)
base.check_value(data, strict = True)

return data

Expand Down
14 changes: 10 additions & 4 deletions deepmd/utils/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,8 @@ def __init__ (self,
set_prefix : str = 'set',
shuffle_test : bool = True,
type_map : List[str] = None,
modifier = None) :
modifier = None,
trn_all_set : bool = False) :
"""
Constructor
Expand All @@ -35,6 +36,8 @@ def __init__ (self,
Gives the name of different atom types
modifier
Data modifier that has the method `modify_data`
trn_all_set
Use all sets as training dataset. Otherwise, if the number of sets is more than 1, the last set is left for test.
"""
self.dirs = glob.glob (os.path.join(sys_path, set_prefix + ".*"))
self.dirs.sort()
Expand All @@ -57,10 +60,13 @@ def __init__ (self,
self.idx_map = self._make_idx_map(self.atom_type)
# train dirs
self.test_dir = self.dirs[-1]
if len(self.dirs) == 1 :
if trn_all_set:
self.train_dirs = self.dirs
else :
self.train_dirs = self.dirs[:-1]
else:
if len(self.dirs) == 1 :
self.train_dirs = self.dirs
else :
self.train_dirs = self.dirs[:-1]
self.data_dict = {}
# add box and coord
self.add('box', 9, must = True)
Expand Down
22 changes: 15 additions & 7 deletions deepmd/utils/data_system.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,8 @@ def __init__ (self,
set_prefix : str = 'set',
shuffle_test : bool = True,
type_map : List[str] = None,
modifier = None) :
modifier = None,
trn_all_set = False) :
"""
Constructor
Expand All @@ -48,18 +49,24 @@ def __init__ (self,
Gives the name of different atom types
modifier
Data modifier that has the method `modify_data`
trn_all_set
Use all sets as training dataset. Otherwise, if the number of sets is more than 1, the last set is left for test.
"""
# init data
self.rcut = rcut
self.system_dirs = systems
self.nsystems = len(self.system_dirs)
self.data_systems = []
for ii in self.system_dirs :
self.data_systems.append(DeepmdData(ii,
set_prefix=set_prefix,
shuffle_test=shuffle_test,
type_map = type_map,
modifier = modifier))
self.data_systems.append(
DeepmdData(
ii,
set_prefix=set_prefix,
shuffle_test=shuffle_test,
type_map = type_map,
modifier = modifier,
trn_all_set = trn_all_set
))
# batch size
self.batch_size = batch_size
if isinstance(self.batch_size, int) :
Expand Down Expand Up @@ -260,7 +267,8 @@ def _get_sys_probs(self,
auto_prob_style) :
if sys_probs is None :
if auto_prob_style == "prob_uniform" :
prob = None
prob_v = 1./float(self.nsystems)
prob = [prob_v for ii in range(self.nsystems)]
elif auto_prob_style == "prob_sys_size" :
prob = self.prob_nbatches
elif auto_prob_style[:14] == "prob_sys_size;" :
Expand Down
Loading

0 comments on commit 809e898

Please sign in to comment.