train only readouts with lbfgs
ttompa committed Dec 19, 2024
1 parent 5eae9d7 · commit 9cb5d65
Showing 2 changed files with 7 additions and 8 deletions.
mace/cli/run_train.py (1 addition, 1 deletion)

@@ -612,7 +612,7 @@ def run(args: argparse.Namespace) -> None:
         batch_mode = args.lbfgs_config.get("batch_mode", False)

         logging.info("Switching optimizer to LBFGS")
-        optimizer = LBFGSNew(model.parameters(),
+        optimizer = LBFGSNew(model.readouts.parameters(),
                              tolerance_grad=1e-6,
                              history_size=history_size,
                              max_iter=max_iter,
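The change above hands only the readout-head parameters to the L-BFGS optimizer, so the rest of the model stays fixed during this phase. Below is a minimal, self-contained sketch of that pattern. It uses torch.optim.LBFGS as a stand-in for LBFGSNew (whose import path and batch_mode option belong to the repository, not to PyTorch), and TinyModel, history_size=10, and max_iter=20 are placeholder values for illustration only.

import torch

class TinyModel(torch.nn.Module):
    """Hypothetical model: a backbone plus a readouts head, mirroring model.readouts."""
    def __init__(self):
        super().__init__()
        self.backbone = torch.nn.Linear(8, 8)
        self.readouts = torch.nn.Linear(8, 1)

    def forward(self, x):
        return self.readouts(self.backbone(x))

model = TinyModel()

# Only model.readouts.parameters() is passed, so the optimizer never updates the backbone.
optimizer = torch.optim.LBFGS(
    model.readouts.parameters(),
    tolerance_grad=1e-6,   # as in the diff
    history_size=10,       # placeholder for the configured history_size
    max_iter=20,           # placeholder for the configured max_iter
)

Optionally, requires_grad can also be set to False on the backbone parameters so the backward pass skips gradients the optimizer will never use; the diff itself only restricts the parameter group.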
mace/tools/train.py (6 additions, 7 deletions)

@@ -234,6 +234,12 @@ def train(

         # Validate
         if epoch % eval_interval == 0:
+            logging.info("GPU Memory Report:")
+            logging.info(f"Allocated: {torch.cuda.memory_allocated() / 1024**2:.2f} MB")
+            logging.info(f"Cached: {torch.cuda.memory_reserved() / 1024**2:.2f} MB")
+            logging.info(f"Total: {torch.cuda.get_device_properties(0).total_memory / 1024**2:.2f} MB")
+            logging.info(f"Max Allocated: {torch.cuda.max_memory_allocated() / 1024**2:.2f} MB")
+            logging.info(f"Free memory: {torch.cuda.mem_get_info()[0] / (1024**2):.2f} MB")
             model_to_evaluate = (
                 model if distributed_model is None else distributed_model
             )
@@ -365,13 +371,6 @@ def take_step(
     start_time = time.time()
     batch = batch.to(device)
     batch_dict = batch.to_dict()
-
-    logging.info("GPU Memory Report:")
-    logging.info(f"Allocated: {torch.cuda.memory_allocated() / 1024**2:.2f} MB")
-    logging.info(f"Cached: {torch.cuda.memory_reserved() / 1024**2:.2f} MB")
-    logging.info(f"Total: {torch.cuda.get_device_properties(0).total_memory / 1024**2:.2f} MB")
-    logging.info(f"Max Allocated: {torch.cuda.max_memory_allocated() / 1024**2:.2f} MB")
-    logging.info(f"Free memory: {torch.cuda.mem_get_info()[0] / (1024**2):.2f} MB")

     def closure():
         optimizer.zero_grad(set_to_none=True)
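The closure kept in take_step is what L-BFGS-style optimizers require: they may re-evaluate the loss and gradients several times within a single step. A minimal sketch of the pattern follows; model, x, y, and the MSE loss are placeholders standing in for the MACE model, batch_dict, and loss function used in the repository.

import torch

model = torch.nn.Linear(8, 1)
optimizer = torch.optim.LBFGS(model.parameters(), max_iter=20, tolerance_grad=1e-6)
x, y = torch.randn(32, 8), torch.randn(32, 1)

def closure():
    # Re-evaluate the loss and its gradients; LBFGS may call this several times per step.
    optimizer.zero_grad(set_to_none=True)
    loss = torch.nn.functional.mse_loss(model(x), y)
    loss.backward()
    return loss

final_loss = optimizer.step(closure)  # step() invokes the closure and returns its loss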

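Since the same six logging calls are now emitted from the validation branch, one tidy option (not part of this commit) is to wrap them in a small helper and call it once per eval_interval. A hypothetical sketch, assuming a single-GPU setup as implied by get_device_properties(0):

import logging
import torch

def log_gpu_memory() -> None:
    """Hypothetical helper wrapping the torch.cuda calls used in the diff."""
    if not torch.cuda.is_available():
        return  # nothing to report on CPU-only runs
    mb = 1024 ** 2
    logging.info("GPU Memory Report:")
    logging.info(f"Allocated: {torch.cuda.memory_allocated() / mb:.2f} MB")
    logging.info(f"Cached: {torch.cuda.memory_reserved() / mb:.2f} MB")
    logging.info(f"Total: {torch.cuda.get_device_properties(0).total_memory / mb:.2f} MB")
    logging.info(f"Max Allocated: {torch.cuda.max_memory_allocated() / mb:.2f} MB")
    logging.info(f"Free memory: {torch.cuda.mem_get_info()[0] / mb:.2f} MB")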