From 3aab6f970830262b2dd768172fbbcd1e49c9b8c7 Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Tue, 4 Jun 2024 18:41:33 -0400 Subject: [PATCH] fix(pt): improve out-of-memory capture I just received another error message that reports an out-of-memory condition (`CUDA driver error: out of memory`), so it is now recognized as well. It's a bad design of PyTorch that all of these errors are raised as a generic `RuntimeError`, so they can only be told apart by matching the message text. Signed-off-by: Jinzhe Zeng --- deepmd/pt/utils/auto_batch_size.py | 1 + 1 file changed, 1 insertion(+) diff --git a/deepmd/pt/utils/auto_batch_size.py b/deepmd/pt/utils/auto_batch_size.py index 0af7cdcc47..6dfb80067a 100644 --- a/deepmd/pt/utils/auto_batch_size.py +++ b/deepmd/pt/utils/auto_batch_size.py @@ -57,6 +57,7 @@ def is_oom_error(self, e: Exception) -> bool: # (the meaningless error message should be considered as a bug in cusolver) if isinstance(e, RuntimeError) and ( "CUDA out of memory." in e.args[0] + or "CUDA driver error: out of memory" in e.args[0] or "cusolver error: CUSOLVER_STATUS_INTERNAL_ERROR" in e.args[0] ): # Release all unoccupied cached memory