Skip to content

fix GPU test OOM problem #127

fix GPU test OOM problem

fix GPU test OOM problem #127

Workflow file for this run

on:
# manually trigger
workflow_dispatch:
pull_request:
types:
- "labeled"
name: Test CUDA
jobs:
test_cuda:
name: Test Python and C++ on CUDA
runs-on: nvidia
# https://github.com/deepmodeling/deepmd-kit/pull/2884#issuecomment-1744216845
container:
image: nvidia/cuda:12.2.0-devel-ubuntu22.04
options: --gpus all
if: github.repository_owner == 'deepmodeling' && github.event.label.name == 'Test CUDA' || github.event_name == 'workflow_dispatch'
steps:
- name: Make sudo and git work
run: apt-get update && apt-get install -y sudo git
- uses: actions/checkout@v4
- uses: actions/setup-python@v5
with:
python-version: '3.11'
# cache: 'pip'
- name: Setup MPI
uses: mpi4py/setup-mpi@v1
with:
mpi: mpich
- uses: lukka/get-cmake@latest
- run: |
wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.0-1_all.deb \
&& sudo dpkg -i cuda-keyring_1.0-1_all.deb \
&& sudo apt-get update \
&& sudo apt-get -y install cuda-12-2 libcudnn8=8.9.5.*-1+cuda12.2
if: false # skip as we use nvidia image
- name: Set PyPI mirror for Aliyun cloud machine
run: python -m pip config --user set global.index-url https://mirrors.aliyun.com/pypi/simple/
- run: python -m pip install -U "pip>=21.3.1,!=23.0.0"
- run: python -m pip install "tensorflow>=2.15.0rc0"
- run: python -m pip install -v -e .[gpu,test,lmp,cu12,torch] "ase @ https://gitlab.com/ase/ase/-/archive/8c5aa5fd6448c5cfb517a014dccf2b214a9dfa8f/ase-8c5aa5fd6448c5cfb517a014dccf2b214a9dfa8f.tar.gz"
env:
DP_BUILD_TESTING: 1
DP_VARIANT: cuda
CUDA_PATH: /usr/local/cuda-12.2
NUM_WORKERS: 0
- run: dp --version
- run: python -m pytest -s --cov=deepmd source/tests --durations=0
- run: source/install/test_cc_local.sh
env:
OMP_NUM_THREADS: 1
TF_INTRA_OP_PARALLELISM_THREADS: 1
TF_INTER_OP_PARALLELISM_THREADS: 1
LMP_CXX11_ABI_0: 1
CMAKE_GENERATOR: Ninja
DP_VARIANT: cuda
DP_USE_MPICH2: 1
CUDA_PATH: /usr/local/cuda-12.2
- run: |
export LD_LIBRARY_PATH=$GITHUB_WORKSPACE/dp_test/lib:$CUDA_PATH/lib64:$LD_LIBRARY_PATH
export PATH=$GITHUB_WORKSPACE/dp_test/bin:$PATH
python -m pytest -s --cov=deepmd source/lmp/tests
python -m pytest -s --cov=deepmd source/ipi/tests
env:
OMP_NUM_THREADS: 1
TF_INTRA_OP_PARALLELISM_THREADS: 1
TF_INTER_OP_PARALLELISM_THREADS: 1
LAMMPS_PLUGIN_PATH: ${{ github.workspace }}/dp_test/lib/deepmd_lmp
CUDA_PATH: /usr/local/cuda-12.2
- uses: codecov/codecov-action@v3
with:
gcov: true