forward GPU error message (#2878)

Forward detailed GPU error messages to DeePMD-kit (and from there to TensorFlow) instead of printing them directly to standard error.

---------

Signed-off-by: Jinzhe Zeng <[email protected]>
Co-authored-by: Han Wang <[email protected]>
deepmodeling · Sep 30, 2023 · cf61140 · cf61140
1 parent c4d8318
commit cf61140
Show file tree

Hide file tree

Showing 2 changed files with 53 additions and 41 deletions.
diff --git a/source/lib/include/gpu_cuda.h b/source/lib/include/gpu_cuda.h
@@ -4,6 +4,7 @@
 #include <cuda_runtime.h>
 #include <stdio.h>
 
+#include <string>
 #include <vector>
 
 #include "errors.h"
@@ -24,27 +25,31 @@ inline void DPAssert(cudaError_t code,
                      int line,
                      bool abort = true) {
   if (code != cudaSuccess) {
-    fprintf(stderr, "cuda assert: %s %s %d\n", cudaGetErrorString(code), file,
-            line);
+    std::string error_msg = "CUDA Runtime library throws an error: " +
+                            std::string(cudaGetErrorString(code)) +
+                            ", in file " + std::string(file) + ": " +
+                            std::to_string(line);
     if (code == 2) {
       // out of memory
-      fprintf(stderr,
-              "Your memory is not enough, thus an error has been raised "
-              "above. You need to take the following actions:\n"
-              "1. Check if the network size of the model is too large.\n"
-              "2. Check if the batch size of training or testing is too large. "
-              "You can set the training batch size to `auto`.\n"
-              "3. Check if the number of atoms is too large.\n"
-              "4. Check if another program is using the same GPU by execuating "
-              "`nvidia-smi`. "
-              "The usage of GPUs is controlled by `CUDA_VISIBLE_DEVICES` "
-              "environment variable.\n");
+      error_msg +=
+          "\nYour memory is not enough, thus an error has been raised "
+          "above. You need to take the following actions:\n"
+          "1. Check if the network size of the model is too large.\n"
+          "2. Check if the batch size of training or testing is too large. "
+          "You can set the training batch size to `auto`.\n"
+          "3. Check if the number of atoms is too large.\n"
+          "4. Check if another program is using the same GPU by execuating "
+          "`nvidia-smi`. "
+          "The usage of GPUs is controlled by `CUDA_VISIBLE_DEVICES` "
+          "environment variable.";
       if (abort) {
-        throw deepmd::deepmd_exception_oom("CUDA Assert");
+        throw deepmd::deepmd_exception_oom(error_msg);
       }
     }
     if (abort) {
-      throw deepmd::deepmd_exception("CUDA Assert");
+      throw deepmd::deepmd_exception(error_msg);
+    } else {
+      fprintf(stderr, "%s\n", error_msg.c_str());
     }
   }
 }
@@ -56,27 +61,23 @@ inline void nborAssert(cudaError_t code,
                        int line,
                        bool abort = true) {
   if (code != cudaSuccess) {
-    fprintf(stderr, "cuda assert: %s %s %d\n",
-            "DeePMD-kit:\tillegal nbor list sorting", file, line);
-    if (code == 2) {
-      // out of memory
-      fprintf(stderr,
-              "Your memory is not enough, thus an error has been raised "
-              "above. You need to take the following actions:\n"
-              "1. Check if the network size of the model is too large.\n"
-              "2. Check if the batch size of training or testing is too large. "
-              "You can set the training batch size to `auto`.\n"
-              "3. Check if the number of atoms is too large.\n"
-              "4. Check if another program is using the same GPU by execuating "
-              "`nvidia-smi`. "
-              "The usage of GPUs is controlled by `CUDA_VISIBLE_DEVICES` "
-              "environment variable.\n");
+    std::string error_msg = "DeePMD-kit: Illegal nbor list sorting: ";
+    try {
+      DPAssert(code, file, line, true);
+    } catch (deepmd::deepmd_exception_oom &e) {
+      error_msg += e.what();
       if (abort) {
-        throw deepmd::deepmd_exception_oom("CUDA Assert");
+        throw deepmd::deepmd_exception_oom(error_msg);
+      } else {
+        fprintf(stderr, "%s\n", error_msg.c_str());
+      }
+    } catch (deepmd::deepmd_exception &e) {
+      error_msg += e.what();
+      if (abort) {
+        throw deepmd::deepmd_exception(error_msg);
+      } else {
+        fprintf(stderr, "%s\n", error_msg.c_str());
       }
-    }
-    if (abort) {
-      throw deepmd::deepmd_exception("CUDA Assert");
     }
   }
 }

diff --git a/source/lib/include/gpu_rocm.h b/source/lib/include/gpu_rocm.h
@@ -4,6 +4,7 @@
 #include <hip/hip_runtime.h>
 #include <stdio.h>
 
+#include <string>
 #include <vector>
 // #include<rocprim/rocprim.hpp>
 // #include <hipcub/hipcub.hpp>
@@ -26,10 +27,14 @@ inline void DPAssert(hipError_t code,
                      int line,
                      bool abort = true) {
   if (code != hipSuccess) {
-    fprintf(stderr, "hip assert: %s %s %d\n", hipGetErrorString(code), file,
-            line);
+    std::string error_msg = "HIP runtime library throws an error: " +
+                            std::string(hipGetErrorString(code)) +
+                            ", in file " + std::string(file) + ": " +
+                            std::to_string(line);
     if (abort) {
-      throw deepmd::deepmd_exception("HIP Assert");
+      throw deepmd::deepmd_exception(error_msg);
+    } else {
+      fprintf(stderr, "%s\n", error_msg.c_str());
     }
   }
 }
@@ -41,10 +46,16 @@ inline void nborAssert(hipError_t code,
                        int line,
                        bool abort = true) {
   if (code != hipSuccess) {
-    fprintf(stderr, "hip assert: %s %s %d\n",
-            "DeePMD-kit:\tillegal nbor list sorting", file, line);
-    if (abort) {
-      throw deepmd::deepmd_exception("HIP Assert: illegal nbor list sorting");
+    std::string error_msg = "DeePMD-kit: Illegal nbor list sorting: ";
+    try {
+      DPAssert(code, file, line, true);
+    } catch (deepmd::deepmd_exception &e) {
+      error_msg += e.what();
+      if (abort) {
+        throw deepmd::deepmd_exception(error_msg);
+      } else {
+        fprintf(stderr, "%s\n", error_msg.c_str());
+      }
     }
   }
 }