Commit 86d1b7a: convert models at runtime

Signed-off-by: Jinzhe Zeng <[email protected]>
njzjz committed Nov 12, 2024
Parent: 1234489

Showing 20 changed files with 9,514 additions and 28 deletions.
14 changes: 7 additions & 7 deletions .github/workflows/test_cc.yml
@@ -27,7 +27,13 @@ jobs:
           mpi: mpich
     - uses: lukka/get-cmake@latest
     - run: python -m pip install uv
-    - run: source/install/uv_with_retry.sh pip install --system tensorflow
+    - name: Install Python dependencies
+      run: |
+        source/install/uv_with_retry.sh pip install --system tensorflow-cpu
+        export TENSORFLOW_ROOT=$(python -c 'import importlib,pathlib;print(pathlib.Path(importlib.util.find_spec("tensorflow").origin).parent)')
+        source/install/uv_with_retry.sh pip install --system -e .[cpu,test,lmp,jax] mpi4py
+    - name: Convert models
+      run: source/tests/infer/convert-models.sh
     - name: Download libtorch
       run: |
         wget https://download.pytorch.org/libtorch/cpu/libtorch-cxx11-abi-shared-with-deps-2.1.2%2Bcpu.zip -O libtorch.zip
@@ -47,12 +53,6 @@ jobs:
         CMAKE_GENERATOR: Ninja
         CXXFLAGS: ${{ matrix.check_memleak && '-fsanitize=leak' || '' }}
     # test lammps
-    - run: |
-        export TENSORFLOW_ROOT=$(python -c 'import importlib,pathlib;print(pathlib.Path(importlib.util.find_spec("tensorflow").origin).parent)')
-        source/install/uv_with_retry.sh pip install --system -e .[cpu,test,lmp] mpi4py
-      env:
-        DP_BUILD_TESTING: 1
-      if: ${{ !matrix.check_memleak }}
     - run: pytest --cov=deepmd source/lmp/tests
       env:
         OMP_NUM_THREADS: 1
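For reference, the `python -c` one-liner in the install step above resolves the directory of the installed TensorFlow package so that CMake can locate its headers and libraries through TENSORFLOW_ROOT. Unrolled, it is roughly this sketch (the guard against a missing package is added here for clarity):

    import importlib.util
    import pathlib

    # find_spec locates the package without importing it, which keeps this
    # cheap even for a heavy library such as TensorFlow.
    spec = importlib.util.find_spec("tensorflow")
    if spec is None:
        raise SystemExit("tensorflow is not installed")
    # spec.origin points at .../site-packages/tensorflow/__init__.py;
    # its parent directory is the package root exported as TENSORFLOW_ROOT.
    print(pathlib.Path(spec.origin).parent)
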
2 changes: 2 additions & 0 deletions .github/workflows/test_cuda.yml
@@ -63,6 +63,8 @@ jobs:
         CUDA_VISIBLE_DEVICES: 0
         # See https://jax.readthedocs.io/en/latest/gpu_memory_allocation.html
         XLA_PYTHON_CLIENT_PREALLOCATE: false
+    - name: Convert models
+      run: source/tests/infer/convert-models.sh
     - name: Download libtorch
       run: |
         wget https://download.pytorch.org/libtorch/cu124/libtorch-cxx11-abi-shared-with-deps-2.5.0%2Bcu124.zip -O libtorch.zip
4 changes: 1 addition & 3 deletions .pre-commit-config.yaml
@@ -15,9 +15,7 @@ repos:
       exclude: |
         (?x)^(
             source/tests/infer/dipolecharge_e.pbtxt|
-            source/tests/infer/deeppolar_new.pbtxt|
-            source/tests/infer/deeppot_dpa.savedmodel/saved_model.pb|
-            source/tests/infer/deeppot_sea.savedmodel/saved_model.pb
+            source/tests/infer/deeppolar_new.pbtxt
         )$
   - id: check-merge-conflict
   - id: check-symlinks
2 changes: 1 addition & 1 deletion deepmd/pt/infer/inference.py
@@ -62,4 +62,4 @@ def __init__(
         self.wrapper = ModelWrapper(self.model)  # inference only
         if JIT:
             self.wrapper = torch.jit.script(self.wrapper)
-        self.wrapper.load_state_dict(state_dict)
+        self.wrapper.load_state_dict(state_dict, strict=False)
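The `strict=False` switch makes `load_state_dict` tolerate keys that exist on only one side: matching tensors are loaded, and the mismatches are reported instead of raising a RuntimeError. A minimal sketch of the behavior (the `Tiny` module and its extra buffer are hypothetical, for illustration only):

    import torch

    class Tiny(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.linear = torch.nn.Linear(3, 1)
            # A buffer that an older checkpoint would not contain.
            self.register_buffer("extra_stat", torch.zeros(1))

        def forward(self, x):
            return self.linear(x)

    model = Tiny()
    old_checkpoint = {
        "linear.weight": torch.ones(1, 3),
        "linear.bias": torch.zeros(1),
    }

    # strict=False loads the matching keys and returns the mismatches
    # instead of raising a RuntimeError.
    result = model.load_state_dict(old_checkpoint, strict=False)
    print(result.missing_keys)     # ['extra_stat']
    print(result.unexpected_keys)  # []
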
2 changes: 1 addition & 1 deletion deepmd/pt/utils/serialization.py
@@ -34,7 +34,7 @@ def serialize_from_file(model_file: str) -> dict:
         saved_model = torch.jit.load(model_file, map_location="cpu")
         model_def_script = json.loads(saved_model.model_def_script)
         model = get_model(model_def_script)
-        model.load_state_dict(saved_model.state_dict())
+        model.load_state_dict(saved_model.state_dict(), strict=False)
     elif model_file.endswith(".pt"):
         state_dict = torch.load(model_file, map_location="cpu", weights_only=True)
         if "model" in state_dict:
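`serialize_from_file` transplants the weights of a TorchScript archive into a freshly constructed eager module, and this change relaxes that transplant in the same way. A self-contained sketch of the round trip, with a hypothetical `Tiny` module standing in for the model that `get_model` would build:

    import torch

    class Tiny(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.linear = torch.nn.Linear(3, 1)

        def forward(self, x):
            return self.linear(x)

    # Save a scripted model, roughly what a frozen .pth archive contains.
    torch.jit.save(torch.jit.script(Tiny()), "tiny.pth")

    # Load the archive on CPU and rebuild an eager module from its weights;
    # strict=False tolerates keys present on only one side.
    scripted = torch.jit.load("tiny.pth", map_location="cpu")
    fresh = Tiny()
    fresh.load_state_dict(scripted.state_dict(), strict=False)
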
Binary file added examples/water/dpa2/frozen_model.pth
3 changes: 1 addition & 2 deletions source/api_cc/tests/test_deeppot_jax.cc
@@ -71,8 +71,7 @@ class TestInferDeepPotAJAX : public ::testing::Test {
   void SetUp() override {
     std::string file_name = "../../tests/infer/deeppot_sea.savedmodel";

-    // the model is generated for the CPU, so always use the CPU
-    dp.init(file_name, -1);
+    dp.init(file_name);

     natoms = expected_e.size();
     EXPECT_EQ(natoms * 3, expected_f.size());
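The C++ test now lets the library pick the device instead of pinning the model to the CPU, since the SavedModel is generated at test time. For comparison, loading the same converted model through the Python inference API looks roughly like this (the coordinates, cell, and atom types are toy values, and the path assumes the conversion script has already run):

    import numpy as np
    from deepmd.infer import DeepPot

    # Load the runtime-converted SavedModel; the backend is inferred from
    # the file suffix and the device is selected internally.
    dp = DeepPot("source/tests/infer/deeppot_sea.savedmodel")

    coords = np.array([[0.0, 0.0, 0.0, 0.0, 0.0, 1.5]])  # 1 frame, 2 atoms
    cells = None  # non-periodic system
    atom_types = [0, 1]
    e, f, v = dp.eval(coords, cells, atom_types)  # energy, force, virial
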
6 changes: 0 additions & 6 deletions source/lmp/tests/test_lammps_dpa_jax.py
@@ -19,12 +19,6 @@
     write_lmp_data,
 )

-if os.environ.get("CUDA_VISIBLE_DEVICES", "") != "":
-    pytest.skip(
-        reason="The model is generated with CPU",
-        allow_module_level=True,
-    )
-
 pbtxt_file2 = (
     Path(__file__).parent.parent.parent / "tests" / "infer" / "deeppot-1.pbtxt"
 )
6 changes: 0 additions & 6 deletions source/lmp/tests/test_lammps_jax.py
@@ -19,12 +19,6 @@
     write_lmp_data,
 )

-if os.environ.get("CUDA_VISIBLE_DEVICES", "") != "":
-    pytest.skip(
-        reason="The model is generated with CPU",
-        allow_module_level=True,
-    )
-
 pbtxt_file2 = (
     Path(__file__).parent.parent.parent / "tests" / "infer" / "deeppot-1.pbtxt"
 )
6 changes: 6 additions & 0 deletions source/tests/infer/convert-models.sh
@@ -0,0 +1,6 @@
+#!/bin/bash
+
+set -ev
+
+dp convert-backend deeppot_sea.yaml deeppot_sea.savedmodel
+dp convert-backend deeppot_dpa.yaml deeppot_dpa.savedmodel
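This script is the heart of the change: rather than committing generated SavedModel artifacts, CI regenerates them from the `.yaml` model definitions right before the tests run. For local debugging, the same conversion can be driven from Python; a minimal sketch, assuming `dp` is on PATH and that the `.yaml` files live in source/tests/infer:

    import subprocess
    from pathlib import Path

    INFER_DIR = Path("source/tests/infer")

    for name in ("deeppot_sea", "deeppot_dpa"):
        src = INFER_DIR / f"{name}.yaml"
        dst = INFER_DIR / f"{name}.savedmodel"
        # Skip the conversion when a previous run already produced the model.
        if dst.exists():
            continue
        subprocess.run(["dp", "convert-backend", str(src), str(dst)], check=True)
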
1 change: 0 additions & 1 deletion source/tests/infer/deeppot_dpa.savedmodel/fingerprint.pb

This file was deleted.

The remaining changed files are binary model artifacts and are not shown.
