From d39bb94cd45abfce128b2b7ad722873c7e7e4d09 Mon Sep 17 00:00:00 2001
From: Duo <50307526+iProzd@users.noreply.github.com>
Date: Wed, 5 Jun 2024 00:25:23 +0800
Subject: [PATCH] fix(Q1): merge #3836 in 2024Q1 (#3852)

---
 deepmd/pt/utils/auto_batch_size.py | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/deepmd/pt/utils/auto_batch_size.py b/deepmd/pt/utils/auto_batch_size.py
index 181d56f2f4..252b98cc8e 100644
--- a/deepmd/pt/utils/auto_batch_size.py
+++ b/deepmd/pt/utils/auto_batch_size.py
@@ -30,7 +30,17 @@ def is_oom_error(self, e: Exception) -> bool:
         e : Exception
             Exception
         """
-        return isinstance(e, RuntimeError) and "CUDA out of memory." in e.args[0]
+        # several sources think CUSOLVER_STATUS_INTERNAL_ERROR is another out-of-memory error,
+        # such as https://github.com/JuliaGPU/CUDA.jl/issues/1924
+        # (the meaningless error message should be considered as a bug in cusolver)
+        if isinstance(e, RuntimeError) and (
+            "CUDA out of memory." in e.args[0]
+            or "cusolver error: CUSOLVER_STATUS_INTERNAL_ERROR" in e.args[0]
+        ):
+            # Release all unoccupied cached memory
+            torch.cuda.empty_cache()
+            return True
+        return False
 
     def execute_all(
         self, callable: Callable, total_size: int, natoms: int, *args, **kwargs
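
The hunk above widens is_oom_error so that cusolver's CUSOLVER_STATUS_INTERNAL_ERROR is treated as a recoverable out-of-memory condition, and it now releases the allocator's unoccupied cached memory before reporting the OOM to the auto-batch-size machinery. Below is a minimal, torch-free sketch of just the new classification logic so it can be exercised without a GPU; looks_like_oom is a hypothetical helper name for illustration, not part of deepmd, and the real method additionally calls torch.cuda.empty_cache() before returning True.

    # Sketch only: mirrors the string-matching logic from the patch above.
    def looks_like_oom(e: Exception) -> bool:  # hypothetical name, not in deepmd
        return isinstance(e, RuntimeError) and (
            "CUDA out of memory." in e.args[0]
            or "cusolver error: CUSOLVER_STATUS_INTERNAL_ERROR" in e.args[0]
        )

    # Both messages are now classified as out-of-memory; unrelated errors are not.
    print(looks_like_oom(RuntimeError("CUDA out of memory. Tried to allocate ...")))       # True
    print(looks_like_oom(RuntimeError("cusolver error: CUSOLVER_STATUS_INTERNAL_ERROR")))  # True
    print(looks_like_oom(ValueError("unrelated failure")))                                 # False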