From 057dc116628c708e77708caf0965b7bd71c3ae78 Mon Sep 17 00:00:00 2001
From: Jinzhe Zeng <jinzhe.zeng@rutgers.edu>
Date: Fri, 7 Jun 2024 18:12:49 -0400
Subject: [PATCH] fix(pt): improve out-of-memory capture (#3857)

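I just received another error message that reports out of memory
(`CUDA driver error: out of memory`). It's a design flaw in PyTorch that
all errors are raised as a general `RuntimeError`, so out-of-memory
conditions can only be detected by matching the message text.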

<!-- This is an auto-generated comment: release notes by coderabbit.ai
-->

## Summary by CodeRabbit

- **Bug Fixes**
  - Improved out-of-memory error detection for CUDA driver issues.

<!-- end of auto-generated comment: release notes by coderabbit.ai -->

Signed-off-by: Jinzhe Zeng <jinzhe.zeng@rutgers.edu>
---
 deepmd/pt/utils/auto_batch_size.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/deepmd/pt/utils/auto_batch_size.py b/deepmd/pt/utils/auto_batch_size.py
index 0af7cdcc47..6dfb80067a 100644
--- a/deepmd/pt/utils/auto_batch_size.py
+++ b/deepmd/pt/utils/auto_batch_size.py
@@ -57,6 +57,7 @@ def is_oom_error(self, e: Exception) -> bool:
         # (the meaningless error message should be considered as a bug in cusolver)
         if isinstance(e, RuntimeError) and (
             "CUDA out of memory." in e.args[0]
+            or "CUDA driver error: out of memory" in e.args[0]
             or "cusolver error: CUSOLVER_STATUS_INTERNAL_ERROR" in e.args[0]
         ):
             # Release all unoccupied cached memory