From 544875ed570cbbf7fe9cc545565f062417666fe9 Mon Sep 17 00:00:00 2001
From: Jinzhe Zeng
Date: Thu, 21 Sep 2023 00:57:06 -0400
Subject: [PATCH] add test cuda workflow (#2848)

Signed-off-by: Jinzhe Zeng
---
Review notes (this area is not part of the commit message; git am ignores
everything between the three-dash separator and the first file diff):

What the patch does
  * Adds a "Test CUDA" workflow that runs the Python, C++, LAMMPS and i-PI
    tests on the self-hosted `nvidia` runner when a PR receives the
    `Test CUDA` label or when the workflow is dispatched manually.
  * Adds a companion workflow that removes the `Test CUDA` label again so the
    test workflow can be re-triggered by re-applying the label.
  * Lets test_cc.sh / test_cc_local.sh pass -DUSE_CUDA_TOOLKIT=TRUE or
    -DUSE_ROCM_TOOLKIT=TRUE to cmake depending on $DP_VARIANT.
  * Sets MPI_EXECUTABLE_SUFFIX to ".mpich" before find_package(MPI) in the
    LAMMPS plugin build when the environment has DP_USE_MPICH2=1.
  * Documents the workflow in doc/development/cicd.md and lists the developer
    guide pages explicitly in doc/index.rst instead of globbing.

Changes made during review
  * test_cuda.yml: the job condition is now parenthesised. `&&` binds tighter
    than `||`, so the previous expression let a manual workflow_dispatch
    bypass the repository-owner check.
  * test_cc.sh, test_cc_local.sh: CUDA_ARGS is assigned on every path (case
    statement with a default branch), so a CUDA_ARGS value inherited from the
    caller's environment can no longer leak onto the cmake command line when
    DP_VARIANT is neither "cuda" nor "rocm". The number of added lines is
    unchanged, so hunk headers and the diffstat below still hold.
  * doc/development/cicd.md: wording fixes only.
  * The blob ids on the "index" lines are those of the original patch and
    were not recomputed for the edited content.

 .github/workflows/remove_test_cuda_label.yml | 18 ++++++
 .github/workflows/test_cuda.yml              | 60 ++++++++++++++++++++
 doc/development/cicd.md                      | 15 +++++
 doc/index.rst                                |  8 ++-
 source/install/test_cc.sh                    |  8 ++-
 source/install/test_cc_local.sh              |  8 ++-
 source/lmp/plugin/CMakeLists.txt             |  5 ++
 7 files changed, 117 insertions(+), 5 deletions(-)
 create mode 100644 .github/workflows/remove_test_cuda_label.yml
 create mode 100644 .github/workflows/test_cuda.yml
 create mode 100644 doc/development/cicd.md

diff --git a/.github/workflows/remove_test_cuda_label.yml b/.github/workflows/remove_test_cuda_label.yml
new file mode 100644
index 0000000000..4702814f7e
--- /dev/null
+++ b/.github/workflows/remove_test_cuda_label.yml
@@ -0,0 +1,18 @@
+on:
+  pull_request_target:
+    types:
+      - "labeled"
+name: Test CUDA
+jobs:
+  remove_label:
+    permissions:
+      contents: read
+      pull-requests: write
+    # so one can re-trigger the workflow without manually removing the label
+    runs-on: ubuntu-latest
+    if: github.repository_owner == 'deepmodeling' && github.event.label.name == 'Test CUDA'
+    steps:
+      - uses: actions-ecosystem/action-remove-labels@v1
+        with:
+          labels: Test CUDA
+          number: ${{ github.event.pull_request.number }}
diff --git a/.github/workflows/test_cuda.yml b/.github/workflows/test_cuda.yml
new file mode 100644
index 0000000000..adc20c27a9
--- /dev/null
+++ b/.github/workflows/test_cuda.yml
@@ -0,0 +1,60 @@
+on:
+  # manually trigger
+  workflow_dispatch:
+  pull_request:
+    types:
+      - "labeled"
+name: Test CUDA
+jobs:
+  test_cuda:
+    name: Test Python and C++ on CUDA
+    runs-on: nvidia
+    if: github.repository_owner == 'deepmodeling' && (github.event.label.name == 'Test CUDA' || github.event_name == 'workflow_dispatch')
+    steps:
+    - uses: actions/checkout@v4
+    - uses: actions/setup-python@v4
+      with:
+        python-version: '3.11'
+        cache: 'pip'
+    - name: Setup MPI
+      uses: mpi4py/setup-mpi@v1
+      with:
+        mpi: mpich
+    - uses: lukka/get-cmake@latest
+    - run: |
+        wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.0-1_all.deb \
+        && sudo dpkg -i cuda-keyring_1.0-1_all.deb \
+        && sudo apt-get update \
+        && sudo apt-get -y install cuda-11-8 libcudnn8=8.9.5.*-1+cuda11.8
+    - run: python -m pip install -U "pip>=21.3.1,!=23.0.0"
+    - run: pip install -v -e .[gpu,test,lmp,cu11] "ase @ https://github.com/rosswhitfield/ase/archive/edd03571aff6944b77b4a4b055239f3c3e4eeb66.zip"
+      env:
+        DP_BUILD_TESTING: 1
+        DP_VARIANT: cuda
+        CUDA_PATH: /usr/local/cuda-11.8
+    - run: dp --version
+    - run: pytest -s --cov=deepmd --cov=deepmd_cli source/tests --durations=0
+    - run: source/install/test_cc_local.sh
+      env:
+        OMP_NUM_THREADS: 1
+        TF_INTRA_OP_PARALLELISM_THREADS: 1
+        TF_INTER_OP_PARALLELISM_THREADS: 1
+        LMP_CXX11_ABI_0: 1
+        CMAKE_GENERATOR: Ninja
+        DP_VARIANT: cuda
+        DP_USE_MPICH2: 1
+        CUDA_PATH: /usr/local/cuda-11.8
+    - run: |
+        export LD_LIBRARY_PATH=${{ github.workspace }}/dp_test/lib:$CUDA_PATH/lib64:$LD_LIBRARY_PATH
+        export PATH=${{ github.workspace }}/dp_test/bin:$PATH
+        pytest -s --cov=deepmd source/lmp/tests
+        pytest -s --cov=deepmd source/ipi/tests
+      env:
+        OMP_NUM_THREADS: 1
+        TF_INTRA_OP_PARALLELISM_THREADS: 1
+        TF_INTER_OP_PARALLELISM_THREADS: 1
+        LAMMPS_PLUGIN_PATH: ${{ github.workspace }}/dp_test/lib/deepmd_lmp
+        CUDA_PATH: /usr/local/cuda-11.8
+    - uses: codecov/codecov-action@v3
+      with:
+        gcov: true
diff --git a/doc/development/cicd.md b/doc/development/cicd.md
new file mode 100644
index 0000000000..b323a62385
--- /dev/null
+++ b/doc/development/cicd.md
@@ -0,0 +1,15 @@
+# CI/CD
+
+
+
+## CI
+
+
+
+### Test CUDA
+
+The `Test CUDA` action runs tests on a self-hosted runner with an NVIDIA GPU. It is not triggered by every PR. A developer who has permission to manage labels can apply the `Test CUDA` label to a PR to trigger this action.
+
+
+
+
diff --git a/doc/index.rst b/doc/index.rst
index 0924328b26..b60430b566 100644
--- a/doc/index.rst
+++ b/doc/index.rst
@@ -52,7 +52,6 @@ DeePMD-kit is a package written in Python/C++, designed to minimize the effort r
 .. toctree::
    :maxdepth: 2
    :caption: Tutorial
-   :glob:
 
    Tutorials
    Publications
@@ -62,9 +61,12 @@ DeePMD-kit is a package written in Python/C++, designed to minimize the effort r
 .. toctree::
    :maxdepth: 5
    :caption: Developer Guide
-   :glob:
 
-   development/*
+   development/cmake
+   development/create-a-model
+   development/type-embedding
+   development/coding-conventions
+   development/cicd
    api_py/api_py
    api_op
    API_CC/api_cc
diff --git a/source/install/test_cc.sh b/source/install/test_cc.sh
index eeff8c47bc..c874e3bf6c 100755
--- a/source/install/test_cc.sh
+++ b/source/install/test_cc.sh
@@ -1,5 +1,11 @@
 set -e
 
+case "$DP_VARIANT" in
+  cuda) CUDA_ARGS="-DUSE_CUDA_TOOLKIT=TRUE" ;;
+  rocm) CUDA_ARGS="-DUSE_ROCM_TOOLKIT=TRUE" ;;
+  *) CUDA_ARGS="" ;;
+esac
+
 #------------------
 
 SCRIPT_PATH=$(dirname $(realpath -s $0))
@@ -11,7 +17,7 @@ INSTALL_PREFIX=${SCRIPT_PATH}/../../dp_test
 BUILD_TMP_DIR=${SCRIPT_PATH}/../build_tests
 mkdir -p ${BUILD_TMP_DIR}
 cd ${BUILD_TMP_DIR}
-cmake -DINSTALL_TENSORFLOW=TRUE -DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX} -DTENSORFLOW_ROOT=${INSTALL_PREFIX} -DBUILD_TESTING:BOOL=TRUE -DLAMMPS_VERSION=stable_2Aug2023 ..
+cmake -DINSTALL_TENSORFLOW=TRUE -DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX} -DTENSORFLOW_ROOT=${INSTALL_PREFIX} -DBUILD_TESTING:BOOL=TRUE -DLAMMPS_VERSION=stable_2Aug2023 ${CUDA_ARGS} ..
 cmake --build . -j${NPROC}
 cmake --install .
 ctest --output-on-failure
diff --git a/source/install/test_cc_local.sh b/source/install/test_cc_local.sh
index 14f86a6646..49f221825b 100755
--- a/source/install/test_cc_local.sh
+++ b/source/install/test_cc_local.sh
@@ -1,5 +1,11 @@
 set -e
 
+case "$DP_VARIANT" in
+  cuda) CUDA_ARGS="-DUSE_CUDA_TOOLKIT=TRUE" ;;
+  rocm) CUDA_ARGS="-DUSE_ROCM_TOOLKIT=TRUE" ;;
+  *) CUDA_ARGS="" ;;
+esac
+
 #------------------
 
 SCRIPT_PATH=$(dirname $(realpath -s $0))
@@ -12,7 +18,7 @@ INSTALL_PREFIX=${SCRIPT_PATH}/../../dp_test
 BUILD_TMP_DIR=${SCRIPT_PATH}/../build_tests
 mkdir -p ${BUILD_TMP_DIR}
 cd ${BUILD_TMP_DIR}
-cmake -DINSTALL_TENSORFLOW=FALSE -DUSE_TF_PYTHON_LIBS=TRUE -DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX} -DBUILD_TESTING:BOOL=TRUE -DLAMMPS_VERSION=stable_2Aug2023 ..
+cmake -DINSTALL_TENSORFLOW=FALSE -DUSE_TF_PYTHON_LIBS=TRUE -DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX} -DBUILD_TESTING:BOOL=TRUE -DLAMMPS_VERSION=stable_2Aug2023 ${CUDA_ARGS} ..
 cmake --build . -j${NPROC}
 cmake --install .
 ctest --output-on-failure
diff --git a/source/lmp/plugin/CMakeLists.txt b/source/lmp/plugin/CMakeLists.txt
index 86b99fe7b5..9b5f68b574 100644
--- a/source/lmp/plugin/CMakeLists.txt
+++ b/source/lmp/plugin/CMakeLists.txt
@@ -19,6 +19,11 @@ if(DEFINED LAMMPS_SOURCE_ROOT OR DEFINED LAMMPS_VERSION)
   target_include_directories(lammps_interface
                              INTERFACE ${LAMMPS_HEADER_DIR})
 
+  if("$ENV{DP_USE_MPICH2}" STREQUAL "1")
+    # See https://stackoverflow.com/a/47976518/9567349
+    set(MPI_EXECUTABLE_SUFFIX ".mpich")
+  endif()
+
   find_package(MPI)
   if(MPI_FOUND)
     set(LAMMPS_MPI_INCLUDE_DIRS ${MPI_CXX_INCLUDE_DIRS})