Skip to content

Commit

Permalink
forward GPU error message (#2878)
Browse files Browse the repository at this point in the history
Forward detailed GPU error messages to DeePMD-kit (and then to TensorFlow)
instead of printing them directly to standard error.

---------

Signed-off-by: Jinzhe Zeng <[email protected]>
Co-authored-by: Han Wang <[email protected]>
  • Loading branch information
njzjz and wanghan-iapcm authored Sep 30, 2023
1 parent c4d8318 commit cf61140
Show file tree
Hide file tree
Showing 2 changed files with 53 additions and 41 deletions.
69 changes: 35 additions & 34 deletions source/lib/include/gpu_cuda.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
#include <cuda_runtime.h>
#include <stdio.h>

#include <string>
#include <vector>

#include "errors.h"
Expand All @@ -24,27 +25,31 @@ inline void DPAssert(cudaError_t code,
int line,
bool abort = true) {
if (code != cudaSuccess) {
fprintf(stderr, "cuda assert: %s %s %d\n", cudaGetErrorString(code), file,
line);
std::string error_msg = "CUDA Runtime library throws an error: " +
std::string(cudaGetErrorString(code)) +
", in file " + std::string(file) + ": " +
std::to_string(line);
if (code == 2) {
// out of memory
fprintf(stderr,
"Your memory is not enough, thus an error has been raised "
"above. You need to take the following actions:\n"
"1. Check if the network size of the model is too large.\n"
"2. Check if the batch size of training or testing is too large. "
"You can set the training batch size to `auto`.\n"
"3. Check if the number of atoms is too large.\n"
"4. Check if another program is using the same GPU by execuating "
"`nvidia-smi`. "
"The usage of GPUs is controlled by `CUDA_VISIBLE_DEVICES` "
"environment variable.\n");
error_msg +=
"\nYour memory is not enough, thus an error has been raised "
"above. You need to take the following actions:\n"
"1. Check if the network size of the model is too large.\n"
"2. Check if the batch size of training or testing is too large. "
"You can set the training batch size to `auto`.\n"
"3. Check if the number of atoms is too large.\n"
"4. Check if another program is using the same GPU by execuating "
"`nvidia-smi`. "
"The usage of GPUs is controlled by `CUDA_VISIBLE_DEVICES` "
"environment variable.";
if (abort) {
throw deepmd::deepmd_exception_oom("CUDA Assert");
throw deepmd::deepmd_exception_oom(error_msg);
}
}
if (abort) {
throw deepmd::deepmd_exception("CUDA Assert");
throw deepmd::deepmd_exception(error_msg);
} else {
fprintf(stderr, "%s\n", error_msg.c_str());
}
}
}
Expand All @@ -56,27 +61,23 @@ inline void nborAssert(cudaError_t code,
int line,
bool abort = true) {
if (code != cudaSuccess) {
fprintf(stderr, "cuda assert: %s %s %d\n",
"DeePMD-kit:\tillegal nbor list sorting", file, line);
if (code == 2) {
// out of memory
fprintf(stderr,
"Your memory is not enough, thus an error has been raised "
"above. You need to take the following actions:\n"
"1. Check if the network size of the model is too large.\n"
"2. Check if the batch size of training or testing is too large. "
"You can set the training batch size to `auto`.\n"
"3. Check if the number of atoms is too large.\n"
"4. Check if another program is using the same GPU by execuating "
"`nvidia-smi`. "
"The usage of GPUs is controlled by `CUDA_VISIBLE_DEVICES` "
"environment variable.\n");
std::string error_msg = "DeePMD-kit: Illegal nbor list sorting: ";
try {
DPAssert(code, file, line, true);
} catch (deepmd::deepmd_exception_oom &e) {
error_msg += e.what();
if (abort) {
throw deepmd::deepmd_exception_oom("CUDA Assert");
throw deepmd::deepmd_exception_oom(error_msg);
} else {
fprintf(stderr, "%s\n", error_msg.c_str());
}
} catch (deepmd::deepmd_exception &e) {
error_msg += e.what();
if (abort) {
throw deepmd::deepmd_exception(error_msg);
} else {
fprintf(stderr, "%s\n", error_msg.c_str());
}
}
if (abort) {
throw deepmd::deepmd_exception("CUDA Assert");
}
}
}
Expand Down
25 changes: 18 additions & 7 deletions source/lib/include/gpu_rocm.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
#include <hip/hip_runtime.h>
#include <stdio.h>

#include <string>
#include <vector>
// #include<rocprim/rocprim.hpp>
// #include <hipcub/hipcub.hpp>
Expand All @@ -26,10 +27,14 @@ inline void DPAssert(hipError_t code,
int line,
bool abort = true) {
if (code != hipSuccess) {
fprintf(stderr, "hip assert: %s %s %d\n", hipGetErrorString(code), file,
line);
std::string error_msg = "HIP runtime library throws an error: " +
std::string(hipGetErrorString(code)) +
", in file " + std::string(file) + ": " +
std::to_string(line);
if (abort) {
throw deepmd::deepmd_exception("HIP Assert");
throw deepmd::deepmd_exception(error_msg);
} else {
fprintf(stderr, "%s\n", error_msg.c_str());
}
}
}
Expand All @@ -41,10 +46,16 @@ inline void nborAssert(hipError_t code,
int line,
bool abort = true) {
if (code != hipSuccess) {
fprintf(stderr, "hip assert: %s %s %d\n",
"DeePMD-kit:\tillegal nbor list sorting", file, line);
if (abort) {
throw deepmd::deepmd_exception("HIP Assert: illegal nbor list sorting");
std::string error_msg = "DeePMD-kit: Illegal nbor list sorting: ";
try {
DPAssert(code, file, line, true);
} catch (deepmd::deepmd_exception &e) {
error_msg += e.what();
if (abort) {
throw deepmd::deepmd_exception(error_msg);
} else {
fprintf(stderr, "%s\n", error_msg.c_str());
}
}
}
}
Expand Down

0 comments on commit cf61140

Please sign in to comment.