From 55d678dcf34f78ab2f1eacbc39d1f4ae78b9c316 Mon Sep 17 00:00:00 2001
From: Jinzhe Zeng <jinzhe.zeng@rutgers.edu>
Date: Thu, 16 Nov 2023 00:30:55 -0500
Subject: [PATCH] bump CUDA version to 12.2 for pre-built packages (#2960)

TensorFlow 2.15 is built against CUDA 12.2, so the pre-built packages are
bumped from CUDA 11.8 to CUDA 12.2 to match. See
https://github.com/tensorflow/tensorflow/commit/3de44168950a5972ba4cfa7e3c6cbf4cffa67fe6.

---------

Signed-off-by: Jinzhe Zeng <jinzhe.zeng@rutgers.edu>
---
 .github/workflows/build_cc.yml        |  2 +-
 .github/workflows/test_cuda.yml       | 13 +++++++------
 backend/find_tensorflow.py            |  9 +++++++++
 doc/install/easy-install-dev.md       |  4 ++--
 doc/install/easy-install.md           |  8 ++++----
 doc/install/install-from-c-library.md |  2 +-
 pyproject.toml                        |  9 ++++-----
 source/install/docker/Dockerfile      |  2 +-
 source/install/docker_package_c.sh    |  2 +-
 9 files changed, 30 insertions(+), 21 deletions(-)

diff --git a/.github/workflows/build_cc.yml b/.github/workflows/build_cc.yml
index 964a11ce37..e6377f4fab 100644
--- a/.github/workflows/build_cc.yml
+++ b/.github/workflows/build_cc.yml
@@ -37,7 +37,7 @@ jobs:
          wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.0-1_all.deb \
          && sudo dpkg -i cuda-keyring_1.0-1_all.deb \
          && sudo apt-get update \
-         && sudo apt-get -y install cuda-cudart-dev-12-0 cuda-nvcc-12-0
+         && sudo apt-get -y install cuda-cudart-dev-12-2 cuda-nvcc-12-2
       if: matrix.variant == 'cuda120'
       env:
         DEBIAN_FRONTEND: noninteractive
diff --git a/.github/workflows/test_cuda.yml b/.github/workflows/test_cuda.yml
index 5e754226ae..d8eddaa44f 100644
--- a/.github/workflows/test_cuda.yml
+++ b/.github/workflows/test_cuda.yml
@@ -11,7 +11,7 @@ jobs:
     runs-on: nvidia
     # https://github.com/deepmodeling/deepmd-kit/pull/2884#issuecomment-1744216845
     container:
-      image: nvidia/cuda:11.8.0-cudnn8-devel-ubuntu22.04
+      image: nvidia/cuda:12.2.0-devel-ubuntu22.04
       options: --gpus all
     if: github.repository_owner == 'deepmodeling' && github.event.label.name == 'Test CUDA' || github.event_name == 'workflow_dispatch'
     steps:
@@ -31,16 +31,17 @@ jobs:
          wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.0-1_all.deb \
          && sudo dpkg -i cuda-keyring_1.0-1_all.deb \
          && sudo apt-get update \
-         && sudo apt-get -y install cuda-11-8 libcudnn8=8.9.5.*-1+cuda11.8
+         && sudo apt-get -y install cuda-12-2 libcudnn8=8.9.5.*-1+cuda12.2
       if: false  # skip as we use nvidia image
     - name: Set PyPI mirror for Aliyun cloud machine
       run: python -m pip config --user set global.index-url https://mirrors.aliyun.com/pypi/simple/
     - run: python -m pip install -U "pip>=21.3.1,!=23.0.0"
-    - run: python -m pip install -v -e .[gpu,test,lmp,cu11] "ase @ https://gitlab.com/ase/ase/-/archive/8c5aa5fd6448c5cfb517a014dccf2b214a9dfa8f/ase-8c5aa5fd6448c5cfb517a014dccf2b214a9dfa8f.tar.gz"
+    - run: python -m pip install "tensorflow>=2.15.0rc0"
+    - run: python -m pip install -v -e .[gpu,test,lmp,cu12] "ase @ https://gitlab.com/ase/ase/-/archive/8c5aa5fd6448c5cfb517a014dccf2b214a9dfa8f/ase-8c5aa5fd6448c5cfb517a014dccf2b214a9dfa8f.tar.gz"
       env:
         DP_BUILD_TESTING: 1
         DP_VARIANT: cuda
-        CUDA_PATH: /usr/local/cuda-11.8
+        CUDA_PATH: /usr/local/cuda-12.2
     - run: dp --version
     - run: python -m pytest -s --cov=deepmd --cov=deepmd_utils source/tests --durations=0
     - run: source/install/test_cc_local.sh
@@ -52,7 +53,7 @@ jobs:
         CMAKE_GENERATOR: Ninja
         DP_VARIANT: cuda
         DP_USE_MPICH2: 1
-        CUDA_PATH: /usr/local/cuda-11.8
+        CUDA_PATH: /usr/local/cuda-12.2
     - run: |
         export LD_LIBRARY_PATH=$GITHUB_WORKSPACE/dp_test/lib:$CUDA_PATH/lib64:$LD_LIBRARY_PATH
         export PATH=$GITHUB_WORKSPACE/dp_test/bin:$PATH
@@ -63,7 +64,7 @@ jobs:
         TF_INTRA_OP_PARALLELISM_THREADS: 1
         TF_INTER_OP_PARALLELISM_THREADS: 1
         LAMMPS_PLUGIN_PATH: ${{ github.workspace }}/dp_test/lib/deepmd_lmp
-        CUDA_PATH: /usr/local/cuda-11.8
+        CUDA_PATH: /usr/local/cuda-12.2
     - uses: codecov/codecov-action@v3
       with:
         gcov: true
diff --git a/backend/find_tensorflow.py b/backend/find_tensorflow.py
index 6d7ce5087d..fbbe0e56c0 100644
--- a/backend/find_tensorflow.py
+++ b/backend/find_tensorflow.py
@@ -87,6 +87,13 @@ def find_tensorflow() -> Tuple[Optional[str], List[str]]:
         # TypeError if submodule_search_locations are None
         # IndexError if submodule_search_locations is an empty list
     except (AttributeError, TypeError, IndexError):
+        if os.environ.get("CIBUILDWHEEL", "0") == "1":
+            # pre-built wheels target CUDA 12.2, which TensorFlow adopted in 2.15
+            requires.extend(
+                [
+                    "tensorflow-cpu>=2.15.0rc0; platform_machine=='x86_64' and platform_system == 'Linux'",
+                ]
+            )
         requires.extend(get_tf_requirement()["cpu"])
         # setuptools will re-find tensorflow after installing setup_requires
         tf_install_dir = None
@@ -129,6 +136,8 @@ def get_tf_requirement(tf_version: str = "") -> dict:
             "cpu": [
                 "tensorflow-cpu; platform_machine!='aarch64' and (platform_machine!='arm64' or platform_system != 'Darwin')",
                 "tensorflow; platform_machine=='aarch64' or (platform_machine=='arm64' and platform_system == 'Darwin')",
+                # https://github.com/tensorflow/tensorflow/issues/61830
+                "tensorflow-cpu<2.15; platform_system=='Windows'",
                 *extra_requires,
             ],
             "gpu": [
diff --git a/doc/install/easy-install-dev.md b/doc/install/easy-install-dev.md
index dd943c37af..f3d4fa1a32 100644
--- a/doc/install/easy-install-dev.md
+++ b/doc/install/easy-install-dev.md
@@ -17,10 +17,10 @@ docker pull ghcr.io/deepmodeling/deepmd-kit:devel
 Below is an one-line shell command to download the [artifact](https://nightly.link/deepmodeling/deepmd-kit/workflows/build_wheel/devel/artifact.zip) containing wheels and install it with `pip`:
 
 ```sh
-pip install -U --pre deepmd-kit[gpu,cu11,lmp] --extra-index-url https://deepmodeling.github.io/deepmd-kit/simple
+pip install -U --pre deepmd-kit[gpu,cu12,lmp] --extra-index-url https://deepmodeling.github.io/deepmd-kit/simple
 ```
 
-`cu11` and `lmp` are optional, which is the same as the stable version.
+`cu12` and `lmp` are optional, just as they are for the stable version.
 
 ## Download pre-compiled C Library
 
diff --git a/doc/install/easy-install.md b/doc/install/easy-install.md
index f033310f8f..7bd632694b 100644
--- a/doc/install/easy-install.md
+++ b/doc/install/easy-install.md
@@ -84,13 +84,13 @@ docker pull deepmodeling/dpmdkit-rocm:dp2.0.3-rocm4.5.2-tf2.6-lmp29Sep2021
 
 ## Install Python interface with pip
 
-If you have no existing TensorFlow installed, you can use `pip` to install the pre-built package of the Python interface with CUDA 11 supported:
+If you have no existing TensorFlow installed, you can use `pip` to install the pre-built package of the Python interface with CUDA 12 support:
 
 ```bash
-pip install deepmd-kit[gpu,cu11]
+pip install deepmd-kit[gpu,cu12]
 ```
 
-`cu11` is required only when CUDA Toolkit and cuDNN were not installed.
+`cu12` is required only when the CUDA Toolkit and cuDNN are not already installed.
 
 Or install the CPU version without CUDA supported:
 ```bash
@@ -99,7 +99,7 @@ pip install deepmd-kit[cpu]
 
 [The LAMMPS module](../third-party/lammps-command.md) and [the i-Pi driver](../third-party/ipi.md) are only provided on Linux and macOS. To install LAMMPS and/or i-Pi, add `lmp` and/or `ipi` to extras:
 ```bash
-pip install deepmd-kit[gpu,cu11,lmp,ipi]
+pip install deepmd-kit[gpu,cu12,lmp,ipi]
 ```
 MPICH is required for parallel running. (The macOS arm64 package doesn't support MPI yet.)
 
diff --git a/doc/install/install-from-c-library.md b/doc/install/install-from-c-library.md
index 343446888c..04b71234db 100644
--- a/doc/install/install-from-c-library.md
+++ b/doc/install/install-from-c-library.md
@@ -2,7 +2,7 @@
 
 DeePMD-kit provides pre-compiled C library package (`libdeepmd_c.tar.gz`) in each [release](https://github.com/deepmodeling/deepmd-kit/releases). It can be used to build the [LAMMPS plugin](./install-lammps.md) and [GROMACS patch](./install-gromacs.md), as well as many [third-party software packages](../third-party/out-of-deepmd-kit.md), without building TensorFlow and DeePMD-kit on one's own.
 
-The library is built in Linux (GLIBC 2.17) with CUDA 11.8. It's noted that this package does not contain CUDA Toolkit and cuDNN, so one needs to download them from the NVIDIA website.
+The library is built on Linux (GLIBC 2.17) with CUDA 12.2. Note that this package does not contain the CUDA Toolkit or cuDNN, so one needs to download them from the NVIDIA website.
 
 ## Use Pre-compiled C Library to build the LAMMPS plugin and GROMACS patch
 
diff --git a/pyproject.toml b/pyproject.toml
index 4ba3bb81e1..e9ee563960 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -25,7 +25,7 @@ classifiers = [
     "Programming Language :: C",
     "Programming Language :: C++",
     "Programming Language :: Python :: 3 :: Only",
-    "Environment :: GPU :: NVIDIA CUDA :: 11.8",
+    "Environment :: GPU :: NVIDIA CUDA :: 12 :: 12.2",
     "Intended Audience :: Science/Research",
     "Programming Language :: Python :: 3.7",
     "License :: OSI Approved :: GNU Lesser General Public License v3 (LGPLv3)",
@@ -133,9 +133,8 @@ test-command = [
 test-extras = ["cpu", "test", "lmp", "ipi"]
 build = ["cp310-*"]
 skip = ["*-win32", "*-manylinux_i686", "*-musllinux*"]
-# TODO: bump to "latest" tag when CUDA supports GCC 12
-manylinux-x86_64-image = "quay.io/pypa/manylinux_2_28_x86_64:2022-11-19-1b19e81"
-manylinux-aarch64-image = "quay.io/pypa/manylinux_2_28_aarch64:2022-11-19-1b19e81"
+manylinux-x86_64-image = "manylinux_2_28"
+manylinux-aarch64-image = "manylinux_2_28"
 
 [tool.cibuildwheel.macos]
 environment = { PIP_PREFER_BINARY="1", DP_LAMMPS_VERSION="stable_2Aug2023_update1", DP_ENABLE_IPI="1" }
@@ -152,7 +151,7 @@ repair-wheel-command = "auditwheel repair --exclude libtensorflow_framework.so.2
 environment-pass = ["CIBW_BUILD", "DP_VARIANT"]
 environment = { PIP_PREFER_BINARY="1", DP_LAMMPS_VERSION="stable_2Aug2023_update1", DP_ENABLE_IPI="1", MPI_HOME="/usr/lib64/mpich", PATH="/usr/lib64/mpich/bin:$PATH" }
 before-all = [
-    """{ if [ "$(uname -m)" = "x86_64" ] ; then yum config-manager --add-repo http://developer.download.nvidia.com/compute/cuda/repos/rhel8/x86_64/cuda-rhel8.repo && yum install -y cuda-nvcc-11-8 cuda-cudart-devel-11-8; fi }""",
+    """{ if [ "$(uname -m)" = "x86_64" ] ; then yum config-manager --add-repo http://developer.download.nvidia.com/compute/cuda/repos/rhel8/x86_64/cuda-rhel8.repo && yum install -y cuda-nvcc-12-2 cuda-cudart-devel-12-2; fi }""",
     "yum install -y mpich-devel",
 ]
 
diff --git a/source/install/docker/Dockerfile b/source/install/docker/Dockerfile
index c5fa878e2a..9ac905dcd0 100644
--- a/source/install/docker/Dockerfile
+++ b/source/install/docker/Dockerfile
@@ -4,7 +4,7 @@ RUN python -m venv /opt/deepmd-kit
 ENV PATH="/opt/deepmd-kit/bin:$PATH"
 # Install package
 COPY dist /dist
-RUN pip install "$(ls /dist/deepmd_kit-*manylinux*_x86_64.whl)[gpu,cu11,lmp,ipi]" \
+RUN pip install "$(ls /dist/deepmd_kit-*manylinux*_x86_64.whl)[gpu,cu12,lmp,ipi]" \
     && dp -h \
     && lmp -h \
     && dp_ipi \
diff --git a/source/install/docker_package_c.sh b/source/install/docker_package_c.sh
index d6fb269acd..75f2d1138b 100755
--- a/source/install/docker_package_c.sh
+++ b/source/install/docker_package_c.sh
@@ -3,7 +3,7 @@ set -e
 SCRIPT_PATH=$(dirname $(realpath -s $0))
 
 docker run --rm -v ${SCRIPT_PATH}/../..:/root/deepmd-kit -w /root/deepmd-kit \
-	tensorflow/build:2.13-python3.11 \
+	tensorflow/build:2.15-python3.11 \
 	/bin/sh -c "pip install tensorflow cmake \
             && cd /root/deepmd-kit/source/install \
             && CC=/dt9/usr/bin/gcc \