From 94fa534c46dc0d70f8f8c410ebd9da994be23230 Mon Sep 17 00:00:00 2001
From: Jinzhe Zeng <jinzhe.zeng@rutgers.edu>
Date: Sat, 2 Sep 2023 10:34:48 -0400
Subject: [PATCH 01/63] lmp: support `units real` (#2775)

Fix #1262.

---------

Signed-off-by: Jinzhe Zeng <jinzhe.zeng@rutgers.edu>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
---
 source/lmp/pair_deepmd.cpp      |  75 +++++++-----
 source/lmp/pair_deepmd.h        |   1 +
 source/lmp/tests/test_lammps.py | 203 +++++++++++++++++++++++++++++++-
 3 files changed, 246 insertions(+), 33 deletions(-)

diff --git a/source/lmp/pair_deepmd.cpp b/source/lmp/pair_deepmd.cpp
index f0e0f23096..f285bed8ea 100644
--- a/source/lmp/pair_deepmd.cpp
+++ b/source/lmp/pair_deepmd.cpp
@@ -344,11 +344,18 @@ PairDeepMD::PairDeepMD(LAMMPS *lmp)
   if (lmp->citeme) {
     lmp->citeme->add(cite_user_deepmd_package);
   }
-  if (strcmp(update->unit_style, "metal") != 0) {
-    error->all(
-        FLERR,
-        "Pair deepmd requires metal unit, please set it by \"units metal\"");
+  int unit_convert;
+  if (strcmp(update->unit_style, "metal") == 0) {
+    unit_convert = utils::NOCONVERT;
+  } else if (strcmp(update->unit_style, "real") == 0) {
+    unit_convert = utils::METAL2REAL;
+  } else {
+    error->all(FLERR,
+               "Pair deepmd requires metal or real unit, please set it by "
+               "\"units metal\" or \"units real\"");
   }
+  ener_unit_cvt_factor =
+      utils::get_conversion_factor(utils::ENERGY, unit_convert);
   restartinfo = 1;
 #if LAMMPS_VERSION_NUMBER >= 20201130
   centroidstressflag =
@@ -361,6 +368,8 @@ PairDeepMD::PairDeepMD(LAMMPS *lmp)
   pppmflag = 1;
   respa_enable = 0;
   writedata = 0;
+  unit_convert_flag = utils::get_supported_conversions(utils::ENERGY);
+
   cutoff = 0.;
   numb_types = 0;
   numb_types_spin = 0;
@@ -576,7 +585,7 @@ void PairDeepMD::compute(int eflag, int vflag) {
         }
         if (eflag_atom) {
           for (int ii = 0; ii < nlocal; ++ii) {
-            eatom[ii] += deatom[ii];
+            eatom[ii] += scale[1][1] * deatom[ii];
           }
         }
         // Added by Davide Tisi 2020
@@ -590,15 +599,15 @@ void PairDeepMD::compute(int eflag, int vflag) {
             // vatom[ii][3] += 1.0 * dvatom[9*ii+3];
             // vatom[ii][4] += 1.0 * dvatom[9*ii+6];
             // vatom[ii][5] += 1.0 * dvatom[9*ii+7];
-            cvatom[ii][0] += 1.0 * dvatom[9 * ii + 0];  // xx
-            cvatom[ii][1] += 1.0 * dvatom[9 * ii + 4];  // yy
-            cvatom[ii][2] += 1.0 * dvatom[9 * ii + 8];  // zz
-            cvatom[ii][3] += 1.0 * dvatom[9 * ii + 3];  // xy
-            cvatom[ii][4] += 1.0 * dvatom[9 * ii + 6];  // xz
-            cvatom[ii][5] += 1.0 * dvatom[9 * ii + 7];  // yz
-            cvatom[ii][6] += 1.0 * dvatom[9 * ii + 1];  // yx
-            cvatom[ii][7] += 1.0 * dvatom[9 * ii + 2];  // zx
-            cvatom[ii][8] += 1.0 * dvatom[9 * ii + 5];  // zy
+            cvatom[ii][0] += scale[1][1] * dvatom[9 * ii + 0];  // xx
+            cvatom[ii][1] += scale[1][1] * dvatom[9 * ii + 4];  // yy
+            cvatom[ii][2] += scale[1][1] * dvatom[9 * ii + 8];  // zz
+            cvatom[ii][3] += scale[1][1] * dvatom[9 * ii + 3];  // xy
+            cvatom[ii][4] += scale[1][1] * dvatom[9 * ii + 6];  // xz
+            cvatom[ii][5] += scale[1][1] * dvatom[9 * ii + 7];  // yz
+            cvatom[ii][6] += scale[1][1] * dvatom[9 * ii + 1];  // yx
+            cvatom[ii][7] += scale[1][1] * dvatom[9 * ii + 2];  // zx
+            cvatom[ii][8] += scale[1][1] * dvatom[9 * ii + 5];  // zy
           }
         }
       }
@@ -628,7 +637,7 @@ void PairDeepMD::compute(int eflag, int vflag) {
       dvatom = all_atom_virial[0];
       if (eflag_atom) {
         for (int ii = 0; ii < nlocal; ++ii) {
-          eatom[ii] += deatom[ii];
+          eatom[ii] += scale[1][1] * deatom[ii];
         }
       }
       // Added by Davide Tisi 2020
@@ -642,15 +651,15 @@ void PairDeepMD::compute(int eflag, int vflag) {
           // vatom[ii][3] += 1.0 * dvatom[9*ii+3];
           // vatom[ii][4] += 1.0 * dvatom[9*ii+6];
           // vatom[ii][5] += 1.0 * dvatom[9*ii+7];
-          cvatom[ii][0] += 1.0 * dvatom[9 * ii + 0];  // xx
-          cvatom[ii][1] += 1.0 * dvatom[9 * ii + 4];  // yy
-          cvatom[ii][2] += 1.0 * dvatom[9 * ii + 8];  // zz
-          cvatom[ii][3] += 1.0 * dvatom[9 * ii + 3];  // xy
-          cvatom[ii][4] += 1.0 * dvatom[9 * ii + 6];  // xz
-          cvatom[ii][5] += 1.0 * dvatom[9 * ii + 7];  // yz
-          cvatom[ii][6] += 1.0 * dvatom[9 * ii + 1];  // yx
-          cvatom[ii][7] += 1.0 * dvatom[9 * ii + 2];  // zx
-          cvatom[ii][8] += 1.0 * dvatom[9 * ii + 5];  // zy
+          cvatom[ii][0] += scale[1][1] * dvatom[9 * ii + 0];  // xx
+          cvatom[ii][1] += scale[1][1] * dvatom[9 * ii + 4];  // yy
+          cvatom[ii][2] += scale[1][1] * dvatom[9 * ii + 8];  // zz
+          cvatom[ii][3] += scale[1][1] * dvatom[9 * ii + 3];  // xy
+          cvatom[ii][4] += scale[1][1] * dvatom[9 * ii + 6];  // xz
+          cvatom[ii][5] += scale[1][1] * dvatom[9 * ii + 7];  // yz
+          cvatom[ii][6] += scale[1][1] * dvatom[9 * ii + 1];  // yx
+          cvatom[ii][7] += scale[1][1] * dvatom[9 * ii + 2];  // zx
+          cvatom[ii][8] += scale[1][1] * dvatom[9 * ii + 5];  // zy
         }
       }
       if (out_freq > 0 && update->ntimestep % out_freq == 0) {
@@ -719,6 +728,12 @@ void PairDeepMD::compute(int eflag, int vflag) {
           all_v_avg = sqrt(all_v_avg / 9);
         }
         if (rank == 0) {
+          all_v_max *= scale[1][1];
+          all_v_min *= scale[1][1];
+          all_v_avg *= scale[1][1];
+          all_f_max *= scale[1][1];
+          all_f_min *= scale[1][1];
+          all_f_avg *= scale[1][1];
           fp << setw(12) << update->ntimestep << " " << setw(18) << all_v_max
              << " " << setw(18) << all_v_min << " " << setw(18) << all_v_avg
              << " " << setw(18) << all_f_max << " " << setw(18) << all_f_min
@@ -744,7 +759,7 @@ void PairDeepMD::compute(int eflag, int vflag) {
                       displacements, MPI_DOUBLE, 0, world);
           if (rank == 0) {
             for (int dd = 0; dd < all_nlocal; ++dd) {
-              std_f_all[tagrecv[dd] - 1] = stdfrecv[dd];
+              std_f_all[tagrecv[dd] - 1] = stdfrecv[dd] * scale[1][1];
             }
             for (int dd = 0; dd < all_nlocal; ++dd) {
               fp << " " << setw(18) << std_f_all[dd];
@@ -838,7 +853,7 @@ void PairDeepMD::allocate() {
         continue;
       }
       setflag[i][j] = 1;
-      scale[i][j] = 1;
+      scale[i][j] = 1.0 * ener_unit_cvt_factor;
     }
   }
 }
@@ -998,11 +1013,11 @@ void PairDeepMD::settings(int narg, char **arg) {
       iarg += 1;
     } else if (string(arg[iarg]) == string("relative")) {
       out_rel = 1;
-      eps = atof(arg[iarg + 1]);
+      eps = atof(arg[iarg + 1]) / ener_unit_cvt_factor;
       iarg += 2;
     } else if (string(arg[iarg]) == string("relative_v")) {
       out_rel_v = 1;
-      eps_v = atof(arg[iarg + 1]);
+      eps_v = atof(arg[iarg + 1]) / ener_unit_cvt_factor;
       iarg += 2;
     } else if (string(arg[iarg]) == string("virtual_len")) {
       virtual_len.resize(numb_types_spin);
@@ -1174,7 +1189,7 @@ void PairDeepMD::coeff(int narg, char **arg) {
   for (int i = ilo; i <= ihi; i++) {
     for (int j = MAX(jlo, i); j <= jhi; j++) {
       setflag[i][j] = 1;
-      scale[i][j] = 1.0;
+      scale[i][j] = 1.0 * ener_unit_cvt_factor;
       if (i > numb_types || j > numb_types) {
         char warning_msg[1024];
         sprintf(warning_msg,
@@ -1221,7 +1236,7 @@ double PairDeepMD::init_one(int i, int j) {
   }
 
   if (setflag[i][j] == 0) {
-    scale[i][j] = 1.0;
+    scale[i][j] = 1.0 * ener_unit_cvt_factor;
   }
   scale[j][i] = scale[i][j];
 
diff --git a/source/lmp/pair_deepmd.h b/source/lmp/pair_deepmd.h
index feff28b9a4..e811bc99b9 100644
--- a/source/lmp/pair_deepmd.h
+++ b/source/lmp/pair_deepmd.h
@@ -131,6 +131,7 @@ class PairDeepMD : public Pair {
   tagint *tagsend, *tagrecv;
   double *stdfsend, *stdfrecv;
   std::vector<int> type_idx_map;
+  double ener_unit_cvt_factor;
 };
 
 }  // namespace LAMMPS_NS
diff --git a/source/lmp/tests/test_lammps.py b/source/lmp/tests/test_lammps.py
index 2920615b8e..78eaf7ea4e 100644
--- a/source/lmp/tests/test_lammps.py
+++ b/source/lmp/tests/test_lammps.py
@@ -217,6 +217,8 @@
 
 # https://github.com/lammps/lammps/blob/1e1311cf401c5fc2614b5d6d0ff3230642b76597/src/update.cpp#L193
 nktv2p = 1.6021765e6
+nktv2p_real = 68568.415
+metal2real = 23.060549
 
 sp.check_output(
     "{} -m deepmd convert-from pbtxt -i {} -o {}".format(
@@ -244,9 +246,9 @@ def teardown_module():
     os.remove(data_type_map_file)
 
 
-def _lammps(data_file) -> PyLammps:
+def _lammps(data_file, units="metal") -> PyLammps:
     lammps = PyLammps()
-    lammps.units("metal")
+    lammps.units(units)
     lammps.boundary("p p p")
     lammps.atom_style("atomic")
     lammps.neighbor("2.0 bin")
@@ -254,7 +256,12 @@ def _lammps(data_file) -> PyLammps:
     lammps.read_data(data_file.resolve())
     lammps.mass("1 16")
     lammps.mass("2 2")
-    lammps.timestep(0.0005)
+    if units == "metal":
+        lammps.timestep(0.0005)
+    elif units == "real":
+        lammps.timestep(0.5)
+    else:
+        raise ValueError("units should be metal or real")
     lammps.fix("1 all nve")
     return lammps
 
@@ -273,6 +280,13 @@ def lammps_type_map():
     lmp.close()
 
 
+@pytest.fixture
+def lammps_real():
+    lmp = _lammps(data_file=data_file, units="real")
+    yield lmp
+    lmp.close()
+
+
 def test_pair_deepmd(lammps):
     lammps.pair_style(f"deepmd {pb_file.resolve()}")
     lammps.pair_coeff("* *")
@@ -452,3 +466,186 @@ def test_pair_deepmd_type_map(lammps_type_map):
             expected_f[lammps_type_map.atoms[ii].id - 1]
         )
     lammps_type_map.run(1)
+
+
+def test_pair_deepmd_real(lammps_real):
+    lammps_real.pair_style(f"deepmd {pb_file.resolve()}")
+    lammps_real.pair_coeff("* *")
+    lammps_real.run(0)
+    assert lammps_real.eval("pe") == pytest.approx(expected_e * metal2real)
+    for ii in range(6):
+        assert lammps_real.atoms[ii].force == pytest.approx(
+            expected_f[lammps_real.atoms[ii].id - 1] * metal2real
+        )
+    lammps_real.run(1)
+
+
+def test_pair_deepmd_virial_real(lammps_real):
+    lammps_real.pair_style(f"deepmd {pb_file.resolve()}")
+    lammps_real.pair_coeff("* *")
+    lammps_real.compute("virial all centroid/stress/atom NULL pair")
+    for ii in range(9):
+        jj = [0, 4, 8, 3, 6, 7, 1, 2, 5][ii]
+        lammps_real.variable(f"virial{jj} atom c_virial[{ii+1}]")
+    lammps_real.dump(
+        "1 all custom 1 dump id " + " ".join([f"v_virial{ii}" for ii in range(9)])
+    )
+    lammps_real.run(0)
+    assert lammps_real.eval("pe") == pytest.approx(expected_e * metal2real)
+    for ii in range(6):
+        assert lammps_real.atoms[ii].force == pytest.approx(
+            expected_f[lammps_real.atoms[ii].id - 1] * metal2real
+        )
+    idx_map = lammps_real.lmp.numpy.extract_atom("id") - 1
+    for ii in range(9):
+        assert np.array(
+            lammps_real.variables[f"virial{ii}"].value
+        ) / nktv2p_real == pytest.approx(expected_v[idx_map, ii] * metal2real)
+
+
+def test_pair_deepmd_model_devi_real(lammps_real):
+    lammps_real.pair_style(
+        "deepmd {} {} out_file {} out_freq 1 atomic".format(
+            pb_file.resolve(), pb_file2.resolve(), md_file.resolve()
+        )
+    )
+    lammps_real.pair_coeff("* *")
+    lammps_real.run(0)
+    assert lammps_real.eval("pe") == pytest.approx(expected_e * metal2real)
+    for ii in range(6):
+        assert lammps_real.atoms[ii].force == pytest.approx(
+            expected_f[lammps_real.atoms[ii].id - 1] * metal2real
+        )
+    # load model devi
+    md = np.loadtxt(md_file.resolve())
+    expected_md_f = np.linalg.norm(np.std([expected_f, expected_f2], axis=0), axis=1)
+    assert md[7:] == pytest.approx(expected_md_f * metal2real)
+    assert md[4] == pytest.approx(np.max(expected_md_f) * metal2real)
+    assert md[5] == pytest.approx(np.min(expected_md_f) * metal2real)
+    assert md[6] == pytest.approx(np.mean(expected_md_f) * metal2real)
+    expected_md_v = (
+        np.std([np.sum(expected_v, axis=0), np.sum(expected_v2, axis=0)], axis=0) / 6
+    )
+    assert md[1] == pytest.approx(np.max(expected_md_v) * metal2real)
+    assert md[2] == pytest.approx(np.min(expected_md_v) * metal2real)
+    assert md[3] == pytest.approx(
+        np.sqrt(np.mean(np.square(expected_md_v))) * metal2real
+    )
+
+
+def test_pair_deepmd_model_devi_virial_real(lammps_real):
+    lammps_real.pair_style(
+        "deepmd {} {} out_file {} out_freq 1 atomic".format(
+            pb_file.resolve(), pb_file2.resolve(), md_file.resolve()
+        )
+    )
+    lammps_real.pair_coeff("* *")
+    lammps_real.compute("virial all centroid/stress/atom NULL pair")
+    for ii in range(9):
+        jj = [0, 4, 8, 3, 6, 7, 1, 2, 5][ii]
+        lammps_real.variable(f"virial{jj} atom c_virial[{ii+1}]")
+    lammps_real.dump(
+        "1 all custom 1 dump id " + " ".join([f"v_virial{ii}" for ii in range(9)])
+    )
+    lammps_real.run(0)
+    assert lammps_real.eval("pe") == pytest.approx(expected_e * metal2real)
+    for ii in range(6):
+        assert lammps_real.atoms[ii].force == pytest.approx(
+            expected_f[lammps_real.atoms[ii].id - 1] * metal2real
+        )
+    idx_map = lammps_real.lmp.numpy.extract_atom("id") - 1
+    for ii in range(9):
+        assert np.array(
+            lammps_real.variables[f"virial{ii}"].value
+        ) / nktv2p_real == pytest.approx(expected_v[idx_map, ii] * metal2real)
+    # load model devi
+    md = np.loadtxt(md_file.resolve())
+    expected_md_f = np.linalg.norm(np.std([expected_f, expected_f2], axis=0), axis=1)
+    assert md[7:] == pytest.approx(expected_md_f * metal2real)
+    assert md[4] == pytest.approx(np.max(expected_md_f) * metal2real)
+    assert md[5] == pytest.approx(np.min(expected_md_f) * metal2real)
+    assert md[6] == pytest.approx(np.mean(expected_md_f) * metal2real)
+    expected_md_v = (
+        np.std([np.sum(expected_v, axis=0), np.sum(expected_v2, axis=0)], axis=0) / 6
+    )
+    assert md[1] == pytest.approx(np.max(expected_md_v) * metal2real)
+    assert md[2] == pytest.approx(np.min(expected_md_v) * metal2real)
+    assert md[3] == pytest.approx(
+        np.sqrt(np.mean(np.square(expected_md_v))) * metal2real
+    )
+
+
+def test_pair_deepmd_model_devi_atomic_relative_real(lammps_real):
+    relative = 1.0
+    lammps_real.pair_style(
+        "deepmd {} {} out_file {} out_freq 1 atomic relative {}".format(
+            pb_file.resolve(),
+            pb_file2.resolve(),
+            md_file.resolve(),
+            relative * metal2real,
+        )
+    )
+    lammps_real.pair_coeff("* *")
+    lammps_real.run(0)
+    assert lammps_real.eval("pe") == pytest.approx(expected_e * metal2real)
+    for ii in range(6):
+        assert lammps_real.atoms[ii].force == pytest.approx(
+            expected_f[lammps_real.atoms[ii].id - 1] * metal2real
+        )
+    # load model devi
+    md = np.loadtxt(md_file.resolve())
+    norm = np.linalg.norm(np.mean([expected_f, expected_f2], axis=0), axis=1)
+    expected_md_f = np.linalg.norm(np.std([expected_f, expected_f2], axis=0), axis=1)
+    expected_md_f /= norm + relative
+    assert md[7:] == pytest.approx(expected_md_f * metal2real)
+    assert md[4] == pytest.approx(np.max(expected_md_f) * metal2real)
+    assert md[5] == pytest.approx(np.min(expected_md_f) * metal2real)
+    assert md[6] == pytest.approx(np.mean(expected_md_f) * metal2real)
+    expected_md_v = (
+        np.std([np.sum(expected_v, axis=0), np.sum(expected_v2, axis=0)], axis=0) / 6
+    )
+    assert md[1] == pytest.approx(np.max(expected_md_v) * metal2real)
+    assert md[2] == pytest.approx(np.min(expected_md_v) * metal2real)
+    assert md[3] == pytest.approx(
+        np.sqrt(np.mean(np.square(expected_md_v))) * metal2real
+    )
+
+
+def test_pair_deepmd_model_devi_atomic_relative_v_real(lammps_real):
+    relative = 1.0
+    lammps_real.pair_style(
+        "deepmd {} {} out_file {} out_freq 1 atomic relative_v {}".format(
+            pb_file.resolve(),
+            pb_file2.resolve(),
+            md_file.resolve(),
+            relative * metal2real,
+        )
+    )
+    lammps_real.pair_coeff("* *")
+    lammps_real.run(0)
+    assert lammps_real.eval("pe") == pytest.approx(expected_e * metal2real)
+    for ii in range(6):
+        assert lammps_real.atoms[ii].force == pytest.approx(
+            expected_f[lammps_real.atoms[ii].id - 1] * metal2real
+        )
+    md = np.loadtxt(md_file.resolve())
+    expected_md_f = np.linalg.norm(np.std([expected_f, expected_f2], axis=0), axis=1)
+    assert md[7:] == pytest.approx(expected_md_f * metal2real)
+    assert md[4] == pytest.approx(np.max(expected_md_f) * metal2real)
+    assert md[5] == pytest.approx(np.min(expected_md_f) * metal2real)
+    assert md[6] == pytest.approx(np.mean(expected_md_f) * metal2real)
+    expected_md_v = (
+        np.std([np.sum(expected_v, axis=0), np.sum(expected_v2, axis=0)], axis=0) / 6
+    )
+    norm = (
+        np.abs(
+            np.mean([np.sum(expected_v, axis=0), np.sum(expected_v2, axis=0)], axis=0)
+        )
+        / 6
+    )
+    expected_md_v /= norm + relative
+    assert md[1] == pytest.approx(np.max(expected_md_v) * metal2real)
+    assert md[2] == pytest.approx(np.min(expected_md_v) * metal2real)
+    assert md[3] == pytest.approx(
+        np.sqrt(np.mean(np.square(expected_md_v))) * metal2real
+    )

From d68c873fa623d2ec04d9b36b5d627fb708f1cfe2 Mon Sep 17 00:00:00 2001
From: Jinzhe Zeng <jinzhe.zeng@rutgers.edu>
Date: Sun, 3 Sep 2023 20:29:55 -0400
Subject: [PATCH 02/63] lmp: throw error for traditional installation if
 dependent packages are not installed (#2777)

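Use the same dependency check that LAMMPS applies to its own packages:
abort the installation with a "Must install ..." message when a package
that USER-DEEPMD depends on (KSPACE, EXTRA-FIX) is not installed.
For the LAMMPS examples this logic is copied from, see
https://github.com/search?q=repo%3Alammps%2Flammps+%22Must+install%22&type=code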

---------

Signed-off-by: Jinzhe Zeng <jinzhe.zeng@rutgers.edu>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
---
 source/lmp/Install.sh | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/source/lmp/Install.sh b/source/lmp/Install.sh
index d2a728226e..3120a923ab 100644
--- a/source/lmp/Install.sh
+++ b/source/lmp/Install.sh
@@ -30,6 +30,17 @@ action() {
 	fi
 }
 
+if (test $1 = 1); then
+	if (test ! -e ../pppm.cpp); then
+		echo "Must install KSPACE package with USER-DEEPMD package"
+		exit 1
+	fi
+	if (test ! -e ../fix_ttm.cpp); then
+		echo "Must install EXTRA-FIX package with USER-DEEPMD package"
+		exit 1
+	fi
+fi
+
 # all package files with no dependencies
 
 for file in *.cpp *.h; do

From eb21aadcbdbb4abc60f88e8e01701df10924205d Mon Sep 17 00:00:00 2001
From: Jinzhe Zeng <jinzhe.zeng@rutgers.edu>
Date: Sun, 3 Sep 2023 20:56:14 -0400
Subject: [PATCH 03/63] add linear models (#2781)

Fix #2724. Fix #2780.

Add the linear energy model, where the energy is a linear combination
of the energies of several sub-models.
Add the frozen model, which loads an external DP model into the graph.

---------

Signed-off-by: Jinzhe Zeng <jinzhe.zeng@rutgers.edu>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
---
 README.md                             |   1 +
 deepmd/entrypoints/train.py           |   2 +
 deepmd/infer/__init__.py              |  49 ++++-
 deepmd/infer/deep_dipole.py           |   5 +
 deepmd/infer/deep_dos.py              |   4 +
 deepmd/infer/deep_eval.py             |  11 +-
 deepmd/infer/deep_polar.py            |   4 +
 deepmd/infer/deep_pot.py              |   4 +
 deepmd/infer/deep_tensor.py           |   9 +-
 deepmd/infer/deep_wfc.py              |   5 +
 deepmd/model/frozen.py                | 195 ++++++++++++++++++++
 deepmd/model/linear.py                | 250 ++++++++++++++++++++++++++
 deepmd/model/model.py                 |  14 +-
 deepmd/train/trainer.py               |   8 +
 deepmd/utils/argcheck.py              |  42 +++++
 doc/getting-started/quick_start.ipynb |   2 +-
 doc/model/index.md                    |   1 +
 doc/model/index.rst                   |   1 +
 doc/model/linear.md                   |  24 +++
 examples/water/linear/input.json      |  56 ++++++
 source/tests/test_examples.py         |   1 +
 source/tests/test_linear_model.py     | 125 +++++++++++++
 22 files changed, 800 insertions(+), 13 deletions(-)
 create mode 100644 deepmd/model/frozen.py
 create mode 100644 deepmd/model/linear.py
 create mode 100644 doc/model/linear.md
 create mode 100644 examples/water/linear/input.json
 create mode 100644 source/tests/test_linear_model.py

diff --git a/README.md b/README.md
index 76f5c9d3bb..9b9d0ff27d 100644
--- a/README.md
+++ b/README.md
@@ -112,6 +112,7 @@ A full [document](doc/train/train-input-auto.rst) on options in the training inp
     - [Train a Deep Potential model using `type embedding` approach](doc/model/train-se-e2-a-tebd.md)
     - [Deep potential long-range](doc/model/dplr.md)
     - [Deep Potential - Range Correction (DPRc)](doc/model/dprc.md)
+    - [Linear model](doc/model/linear.md)
 - [Training](doc/train/index.md)
     - [Training a model](doc/train/training.md)
     - [Advanced options](doc/train/training-advanced.md)
diff --git a/deepmd/entrypoints/train.py b/deepmd/entrypoints/train.py
index fa3a82bbdf..1a0d4b9c6d 100755
--- a/deepmd/entrypoints/train.py
+++ b/deepmd/entrypoints/train.py
@@ -520,6 +520,8 @@ def update_sel(jdata):
         rcut = get_rcut(jdata)
         get_min_nbor_dist(jdata, rcut)
         return jdata
+    elif jdata["model"].get("type") in ("linear_ener", "frozen"):
+        return jdata
     descrpt_data = jdata["model"]["descriptor"]
     if descrpt_data["type"] == "hybrid":
         for ii in range(len(descrpt_data["list"])):
diff --git a/deepmd/infer/__init__.py b/deepmd/infer/__init__.py
index 5055ca9cd9..14d75d0c44 100644
--- a/deepmd/infer/__init__.py
+++ b/deepmd/infer/__init__.py
@@ -5,6 +5,7 @@
     Path,
 )
 from typing import (
+    Optional,
     Union,
 )
 
@@ -56,6 +57,7 @@ def DeepPotential(
     model_file: Union[str, Path],
     load_prefix: str = "load",
     default_tf_graph: bool = False,
+    input_map: Optional[dict] = None,
 ) -> Union[DeepDipole, DeepGlobalPolar, DeepPolar, DeepPot, DeepDOS, DeepWFC]:
     """Factory function that will inialize appropriate potential read from `model_file`.
 
@@ -67,6 +69,8 @@ def DeepPotential(
         The prefix in the load computational graph
     default_tf_graph : bool
         If uses the default tf graph, otherwise build a new tf graph for evaluation
+    input_map : dict, optional
+        The input map for tf.import_graph_def. Only work with default tf graph
 
     Returns
     -------
@@ -81,23 +85,54 @@ def DeepPotential(
     mf = Path(model_file)
 
     model_type = DeepEval(
-        mf, load_prefix=load_prefix, default_tf_graph=default_tf_graph
+        mf,
+        load_prefix=load_prefix,
+        default_tf_graph=default_tf_graph,
+        input_map=input_map,
     ).model_type
 
     if model_type == "ener":
-        dp = DeepPot(mf, load_prefix=load_prefix, default_tf_graph=default_tf_graph)
+        dp = DeepPot(
+            mf,
+            load_prefix=load_prefix,
+            default_tf_graph=default_tf_graph,
+            input_map=input_map,
+        )
     elif model_type == "dos":
-        dp = DeepDOS(mf, load_prefix=load_prefix, default_tf_graph=default_tf_graph)
+        dp = DeepDOS(
+            mf,
+            load_prefix=load_prefix,
+            default_tf_graph=default_tf_graph,
+            input_map=input_map,
+        )
     elif model_type == "dipole":
-        dp = DeepDipole(mf, load_prefix=load_prefix, default_tf_graph=default_tf_graph)
+        dp = DeepDipole(
+            mf,
+            load_prefix=load_prefix,
+            default_tf_graph=default_tf_graph,
+            input_map=input_map,
+        )
     elif model_type == "polar":
-        dp = DeepPolar(mf, load_prefix=load_prefix, default_tf_graph=default_tf_graph)
+        dp = DeepPolar(
+            mf,
+            load_prefix=load_prefix,
+            default_tf_graph=default_tf_graph,
+            input_map=input_map,
+        )
     elif model_type == "global_polar":
         dp = DeepGlobalPolar(
-            mf, load_prefix=load_prefix, default_tf_graph=default_tf_graph
+            mf,
+            load_prefix=load_prefix,
+            default_tf_graph=default_tf_graph,
+            input_map=input_map,
         )
     elif model_type == "wfc":
-        dp = DeepWFC(mf, load_prefix=load_prefix, default_tf_graph=default_tf_graph)
+        dp = DeepWFC(
+            mf,
+            load_prefix=load_prefix,
+            default_tf_graph=default_tf_graph,
+            input_map=input_map,
+        )
     else:
         raise RuntimeError(f"unknown model type {model_type}")
 
diff --git a/deepmd/infer/deep_dipole.py b/deepmd/infer/deep_dipole.py
index 0464a6c33f..6020118135 100644
--- a/deepmd/infer/deep_dipole.py
+++ b/deepmd/infer/deep_dipole.py
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: LGPL-3.0-or-later
 from typing import (
     TYPE_CHECKING,
+    Optional,
 )
 
 from deepmd.infer.deep_tensor import (
@@ -24,6 +25,8 @@ class DeepDipole(DeepTensor):
         The prefix in the load computational graph
     default_tf_graph : bool
         If uses the default tf graph, otherwise build a new tf graph for evaluation
+    input_map : dict, optional
+        The input map for tf.import_graph_def. Only work with default tf graph
 
     Warnings
     --------
@@ -37,6 +40,7 @@ def __init__(
         model_file: "Path",
         load_prefix: str = "load",
         default_tf_graph: bool = False,
+        input_map: Optional[dict] = None,
     ) -> None:
         # use this in favor of dict update to move attribute from class to
         # instance namespace
@@ -53,6 +57,7 @@ def __init__(
             model_file,
             load_prefix=load_prefix,
             default_tf_graph=default_tf_graph,
+            input_map=input_map,
         )
 
     def get_dim_fparam(self) -> int:
diff --git a/deepmd/infer/deep_dos.py b/deepmd/infer/deep_dos.py
index 52ef056b1a..5f181bd336 100644
--- a/deepmd/infer/deep_dos.py
+++ b/deepmd/infer/deep_dos.py
@@ -46,6 +46,8 @@ class DeepDOS(DeepEval):
     auto_batch_size : bool or int or AutomaticBatchSize, default: True
         If True, automatic batch size will be used. If int, it will be used
         as the initial batch size.
+    input_map : dict, optional
+        The input map for tf.import_graph_def. Only work with default tf graph
 
     Warnings
     --------
@@ -60,6 +62,7 @@ def __init__(
         load_prefix: str = "load",
         default_tf_graph: bool = False,
         auto_batch_size: Union[bool, int, AutoBatchSize] = True,
+        input_map: Optional[dict] = None,
     ) -> None:
         # add these tensors on top of what is defined by DeepTensor Class
         # use this in favor of dict update to move attribute from class to
@@ -91,6 +94,7 @@ def __init__(
             load_prefix=load_prefix,
             default_tf_graph=default_tf_graph,
             auto_batch_size=auto_batch_size,
+            input_map=input_map,
         )
 
         # load optional tensors
diff --git a/deepmd/infer/deep_eval.py b/deepmd/infer/deep_eval.py
index 899c8c9acf..3f5dede1ad 100644
--- a/deepmd/infer/deep_eval.py
+++ b/deepmd/infer/deep_eval.py
@@ -43,6 +43,8 @@ class DeepEval:
     auto_batch_size : bool or int or AutomaticBatchSize, default: False
         If True, automatic batch size will be used. If int, it will be used
         as the initial batch size.
+    input_map : dict, optional
+        The input map for tf.import_graph_def. Only work with default tf graph
     """
 
     load_prefix: str  # set by subclass
@@ -53,9 +55,13 @@ def __init__(
         load_prefix: str = "load",
         default_tf_graph: bool = False,
         auto_batch_size: Union[bool, int, AutoBatchSize] = False,
+        input_map: Optional[dict] = None,
     ):
         self.graph = self._load_graph(
-            model_file, prefix=load_prefix, default_tf_graph=default_tf_graph
+            model_file,
+            prefix=load_prefix,
+            default_tf_graph=default_tf_graph,
+            input_map=input_map,
         )
         self.load_prefix = load_prefix
 
@@ -168,6 +174,7 @@ def _load_graph(
         frozen_graph_filename: "Path",
         prefix: str = "load",
         default_tf_graph: bool = False,
+        input_map: Optional[dict] = None,
     ):
         # We load the protobuf file from the disk and parse it to retrieve the
         # unserialized graph_def
@@ -178,7 +185,7 @@ def _load_graph(
             if default_tf_graph:
                 tf.import_graph_def(
                     graph_def,
-                    input_map=None,
+                    input_map=input_map,
                     return_elements=None,
                     name=prefix,
                     producer_op_list=None,
diff --git a/deepmd/infer/deep_polar.py b/deepmd/infer/deep_polar.py
index 6ecbf8aae6..118f8c98a7 100644
--- a/deepmd/infer/deep_polar.py
+++ b/deepmd/infer/deep_polar.py
@@ -28,6 +28,8 @@ class DeepPolar(DeepTensor):
         The prefix in the load computational graph
     default_tf_graph : bool
         If uses the default tf graph, otherwise build a new tf graph for evaluation
+    input_map : dict, optional
+        The input map for tf.import_graph_def. Only work with default tf graph
 
     Warnings
     --------
@@ -41,6 +43,7 @@ def __init__(
         model_file: "Path",
         load_prefix: str = "load",
         default_tf_graph: bool = False,
+        input_map: Optional[dict] = None,
     ) -> None:
         # use this in favor of dict update to move attribute from class to
         # instance namespace
@@ -57,6 +60,7 @@ def __init__(
             model_file,
             load_prefix=load_prefix,
             default_tf_graph=default_tf_graph,
+            input_map=input_map,
         )
 
     def get_dim_fparam(self) -> int:
diff --git a/deepmd/infer/deep_pot.py b/deepmd/infer/deep_pot.py
index b3e9be1e67..031c5de1bc 100644
--- a/deepmd/infer/deep_pot.py
+++ b/deepmd/infer/deep_pot.py
@@ -49,6 +49,8 @@ class DeepPot(DeepEval):
     auto_batch_size : bool or int or AutomaticBatchSize, default: True
         If True, automatic batch size will be used. If int, it will be used
         as the initial batch size.
+    input_map : dict, optional
+        The input map for tf.import_graph_def. Only work with default tf graph
 
     Examples
     --------
@@ -75,6 +77,7 @@ def __init__(
         load_prefix: str = "load",
         default_tf_graph: bool = False,
         auto_batch_size: Union[bool, int, AutoBatchSize] = True,
+        input_map: Optional[dict] = None,
     ) -> None:
         # add these tensors on top of what is defined by DeepTensor Class
         # use this in favor of dict update to move attribute from class to
@@ -108,6 +111,7 @@ def __init__(
             load_prefix=load_prefix,
             default_tf_graph=default_tf_graph,
             auto_batch_size=auto_batch_size,
+            input_map=input_map,
         )
 
         # load optional tensors
diff --git a/deepmd/infer/deep_tensor.py b/deepmd/infer/deep_tensor.py
index 30b6fcfea5..367a8ab5e7 100644
--- a/deepmd/infer/deep_tensor.py
+++ b/deepmd/infer/deep_tensor.py
@@ -35,6 +35,8 @@ class DeepTensor(DeepEval):
         The prefix in the load computational graph
     default_tf_graph : bool
         If uses the default tf graph, otherwise build a new tf graph for evaluation
+    input_map : dict, optional
+        The input map for tf.import_graph_def. Only work with default tf graph
     """
 
     tensors = {
@@ -58,10 +60,15 @@ def __init__(
         model_file: "Path",
         load_prefix: str = "load",
         default_tf_graph: bool = False,
+        input_map: Optional[dict] = None,
     ) -> None:
         """Constructor."""
         DeepEval.__init__(
-            self, model_file, load_prefix=load_prefix, default_tf_graph=default_tf_graph
+            self,
+            model_file,
+            load_prefix=load_prefix,
+            default_tf_graph=default_tf_graph,
+            input_map=input_map,
         )
         # check model type
         model_type = self.tensors["t_tensor"][2:-2]
diff --git a/deepmd/infer/deep_wfc.py b/deepmd/infer/deep_wfc.py
index 00b10bc543..ed682f642b 100644
--- a/deepmd/infer/deep_wfc.py
+++ b/deepmd/infer/deep_wfc.py
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: LGPL-3.0-or-later
 from typing import (
     TYPE_CHECKING,
+    Optional,
 )
 
 from deepmd.infer.deep_tensor import (
@@ -24,6 +25,8 @@ class DeepWFC(DeepTensor):
         The prefix in the load computational graph
     default_tf_graph : bool
         If uses the default tf graph, otherwise build a new tf graph for evaluation
+    input_map : dict, optional
+        The input map for tf.import_graph_def. Only works with the default tf graph.
 
     Warnings
     --------
@@ -37,6 +40,7 @@ def __init__(
         model_file: "Path",
         load_prefix: str = "load",
         default_tf_graph: bool = False,
+        input_map: Optional[dict] = None,
     ) -> None:
         # use this in favor of dict update to move attribute from class to
         # instance namespace
@@ -52,6 +56,7 @@ def __init__(
             model_file,
             load_prefix=load_prefix,
             default_tf_graph=default_tf_graph,
+            input_map=input_map,
         )
 
     def get_dim_fparam(self) -> int:
diff --git a/deepmd/model/frozen.py b/deepmd/model/frozen.py
new file mode 100644
index 0000000000..c1b7d0286e
--- /dev/null
+++ b/deepmd/model/frozen.py
@@ -0,0 +1,195 @@
+# SPDX-License-Identifier: LGPL-3.0-or-later
+from enum import (
+    Enum,
+)
+from typing import (
+    Optional,
+    Union,
+)
+
+from deepmd.env import (
+    GLOBAL_TF_FLOAT_PRECISION,
+    MODEL_VERSION,
+    tf,
+)
+from deepmd.fit.fitting import (
+    Fitting,
+)
+from deepmd.infer import (
+    DeepPotential,
+)
+from deepmd.loss.loss import (
+    Loss,
+)
+
+from .model import (
+    Model,
+)
+
+
+class FrozenModel(Model):
+    """Load model from a frozen model, which cannot be trained.
+
+    Parameters
+    ----------
+    model_file : str
+        The path to the frozen model
+    """
+
+    def __init__(self, model_file: str, **kwargs):
+        super().__init__(**kwargs)
+        self.model_file = model_file
+        self.model = DeepPotential(model_file)
+        self.model_type = self.model.model_type
+
+    def build(
+        self,
+        coord_: tf.Tensor,
+        atype_: tf.Tensor,
+        natoms: tf.Tensor,
+        box: tf.Tensor,
+        mesh: tf.Tensor,
+        input_dict: dict,
+        frz_model: Optional[str] = None,
+        ckpt_meta: Optional[str] = None,
+        suffix: str = "",
+        reuse: Optional[Union[bool, Enum]] = None,
+    ) -> dict:
+        """Build the model.
+
+        Parameters
+        ----------
+        coord_ : tf.Tensor
+            The coordinates of atoms
+        atype_ : tf.Tensor
+            The atom types of atoms
+        natoms : tf.Tensor
+            The number of atoms
+        box : tf.Tensor
+            The box vectors
+        mesh : tf.Tensor
+            The mesh vectors
+        input_dict : dict
+            The input dict
+        frz_model : str, optional
+            The path to the frozen model
+        ckpt_meta : str, optional
+            The path to the checkpoint and meta file
+        suffix : str, optional
+            The suffix of the scope
+        reuse : bool or tf.AUTO_REUSE, optional
+            Whether to reuse the variables
+
+        Returns
+        -------
+        dict
+            The output dict
+        """
+        # reset the model to import to the correct graph
+        extra_feed_dict = {}
+        if input_dict is not None:
+            if "fparam" in input_dict:
+                extra_feed_dict["fparam"] = input_dict["fparam"]
+            if "aparam" in input_dict:
+                extra_feed_dict["aparam"] = input_dict["aparam"]
+        input_map = self.get_feed_dict(
+            coord_, atype_, natoms, box, mesh, **extra_feed_dict
+        )
+        self.model = DeepPotential(
+            self.model_file,
+            default_tf_graph=True,
+            load_prefix="load" + suffix,
+            input_map=input_map,
+        )
+
+        with tf.variable_scope("model_attr" + suffix, reuse=reuse):
+            t_tmap = tf.constant(
+                " ".join(self.get_type_map()), name="tmap", dtype=tf.string
+            )
+            t_mt = tf.constant(self.model_type, name="model_type", dtype=tf.string)
+            t_ver = tf.constant(MODEL_VERSION, name="model_version", dtype=tf.string)
+        with tf.variable_scope("descrpt_attr" + suffix, reuse=reuse):
+            t_ntypes = tf.constant(self.get_ntypes(), name="ntypes", dtype=tf.int32)
+            t_rcut = tf.constant(
+                self.get_rcut(), name="rcut", dtype=GLOBAL_TF_FLOAT_PRECISION
+            )
+        with tf.variable_scope("fitting_attr" + suffix, reuse=reuse):
+            t_dfparam = tf.constant(
+                self.model.get_dim_fparam(), name="dfparam", dtype=tf.int32
+            )
+            t_daparam = tf.constant(
+                self.model.get_dim_aparam(), name="daparam", dtype=tf.int32
+            )
+        if self.model_type == "ener":
+            return {
+                "energy": tf.identity(self.model.t_energy, name="o_energy" + suffix),
+                "force": tf.identity(self.model.t_force, name="o_force" + suffix),
+                "virial": tf.identity(self.model.t_virial, name="o_virial" + suffix),
+                "atom_ener": tf.identity(
+                    self.model.t_ae, name="o_atom_energy" + suffix
+                ),
+                "atom_virial": tf.identity(
+                    self.model.t_av, name="o_atom_virial" + suffix
+                ),
+                "coord": coord_,
+                "atype": atype_,
+            }
+        else:
+            raise NotImplementedError(
+                f"Model type {self.model_type} has not been implemented. "
+                "Contribution is welcome!"
+            )
+
+    def get_fitting(self) -> Union[Fitting, dict]:
+        """Get the fitting(s)."""
+        return {}
+
+    def get_loss(self, loss: dict, lr) -> Optional[Union[Loss, dict]]:
+        """Get the loss function(s)."""
+        # a frozen model cannot be trained, so it provides no loss (returns None)
+        return
+
+    def get_rcut(self):
+        return self.model.get_rcut()
+
+    def get_ntypes(self) -> int:
+        return self.model.get_ntypes()
+
+    def data_stat(self, data):
+        pass
+
+    def init_variables(
+        self,
+        graph: tf.Graph,
+        graph_def: tf.GraphDef,
+        model_type: str = "original_model",
+        suffix: str = "",
+    ) -> None:
+        """Init the embedding net variables with the given frozen model.
+
+        Parameters
+        ----------
+        graph : tf.Graph
+            The input frozen model graph
+        graph_def : tf.GraphDef
+            The input frozen model graph_def
+        model_type : str
+            the type of the model
+        suffix : str
+            suffix to name scope
+        """
+        pass
+
+    def enable_compression(self, suffix: str = "") -> None:
+        """Enable compression.
+
+        Parameters
+        ----------
+        suffix : str
+            suffix to name scope
+        """
+        pass
+
+    def get_type_map(self) -> list:
+        """Get the type map."""
+        return self.model.get_type_map()
diff --git a/deepmd/model/linear.py b/deepmd/model/linear.py
new file mode 100644
index 0000000000..6399766662
--- /dev/null
+++ b/deepmd/model/linear.py
@@ -0,0 +1,250 @@
+# SPDX-License-Identifier: LGPL-3.0-or-later
+from enum import (
+    Enum,
+)
+from functools import (
+    lru_cache,
+)
+from typing import (
+    List,
+    Optional,
+    Union,
+)
+
+from deepmd.env import (
+    GLOBAL_TF_FLOAT_PRECISION,
+    MODEL_VERSION,
+    tf,
+)
+from deepmd.fit.fitting import (
+    Fitting,
+)
+from deepmd.loss.loss import (
+    Loss,
+)
+
+from .model import (
+    Model,
+)
+
+
+class LinearModel(Model):
+    """Linear model makes linear combinations of several existing models.
+
+    Parameters
+    ----------
+    models : list[dict]
+        A list of models to be combined.
+    weights : list[float] or str
+        If the type is list[float], a list of weights for each model.
+        If "mean", the weights are set to be 1 / len(models).
+        If "sum", the weights are set to be 1.
+    """
+
+    def __init__(self, models: List[dict], weights: Union[List[float], str], **kwargs):
+        super().__init__(**kwargs)
+        self.models = [Model(**model) for model in models]
+        if isinstance(weights, list):
+            if len(weights) != len(models):
+                raise ValueError(
+                    "The length of weights is not equal to the number of models"
+                )
+            self.weights = weights
+        elif weights == "mean":
+            self.weights = [1 / len(models) for _ in range(len(models))]
+        elif weights == "sum":
+            self.weights = [1 for _ in range(len(models))]
+        # TODO: add more weights, for example, so-called committee models
+        else:
+            raise ValueError(f"Invalid weights {weights}")
+
+    def get_fitting(self) -> Union[Fitting, dict]:
+        """Get the fitting(s)."""
+        return {
+            f"model{ii}": model.get_fitting() for ii, model in enumerate(self.models)
+        }
+
+    def get_loss(self, loss: dict, lr) -> Optional[Union[Loss, dict]]:
+        """Get the loss function(s): the first non-None loss among the sub-models."""
+        # do not rebind `loss`: every sub-model must receive the original loss config
+        for model in self.models:
+            model_loss = model.get_loss(loss, lr)
+            if model_loss is not None:
+                return model_loss
+        return None
+
+    def get_rcut(self):
+        return max([model.get_rcut() for model in self.models])
+
+    @lru_cache(maxsize=1)
+    def get_ntypes(self) -> int:
+        # check if all models have the same ntypes
+        for model in self.models:
+            if model.get_ntypes() != self.models[0].get_ntypes():
+                raise ValueError("Models have different ntypes")
+        return self.models[0].get_ntypes()
+
+    def data_stat(self, data):
+        for model in self.models:
+            model.data_stat(data)
+
+    def init_variables(
+        self,
+        graph: tf.Graph,
+        graph_def: tf.GraphDef,
+        model_type: str = "original_model",
+        suffix: str = "",
+    ) -> None:
+        """Init the embedding net variables with the given frozen model.
+
+        Parameters
+        ----------
+        graph : tf.Graph
+            The input frozen model graph
+        graph_def : tf.GraphDef
+            The input frozen model graph_def
+        model_type : str
+            the type of the model
+        suffix : str
+            suffix to name scope
+        """
+        for ii, model in enumerate(self.models):
+            model.init_variables(
+                graph, graph_def, model_type, suffix=f"_model{ii}{suffix}"
+            )
+
+    def enable_compression(self, suffix: str = "") -> None:
+        """Enable compression.
+
+        Parameters
+        ----------
+        suffix : str
+            suffix to name scope
+        """
+        for ii, model in enumerate(self.models):
+            model.enable_compression(suffix=f"_model{ii}{suffix}")
+
+    def get_type_map(self) -> list:
+        """Get the type map."""
+        return self.models[0].get_type_map()
+
+
+class LinearEnergyModel(LinearModel):
+    """Linear energy model makes linear combinations of several existing energy models."""
+
+    model_type = "ener"
+
+    def build(
+        self,
+        coord_: tf.Tensor,
+        atype_: tf.Tensor,
+        natoms: tf.Tensor,
+        box: tf.Tensor,
+        mesh: tf.Tensor,
+        input_dict: dict,
+        frz_model: Optional[str] = None,
+        ckpt_meta: Optional[str] = None,
+        suffix: str = "",
+        reuse: Optional[Union[bool, Enum]] = None,
+    ) -> dict:
+        """Build the model.
+
+        Parameters
+        ----------
+        coord_ : tf.Tensor
+            The coordinates of atoms
+        atype_ : tf.Tensor
+            The atom types of atoms
+        natoms : tf.Tensor
+            The number of atoms
+        box : tf.Tensor
+            The box vectors
+        mesh : tf.Tensor
+            The mesh vectors
+        input_dict : dict
+            The input dict
+        frz_model : str, optional
+            The path to the frozen model
+        ckpt_meta : str, optional
+            The path to the checkpoint and meta file
+        suffix : str, optional
+            The suffix of the scope
+        reuse : bool or tf.AUTO_REUSE, optional
+            Whether to reuse the variables
+
+        Returns
+        -------
+        dict
+            The output dict
+        """
+        with tf.variable_scope("model_attr" + suffix, reuse=reuse):
+            t_tmap = tf.constant(
+                " ".join(self.get_type_map()), name="tmap", dtype=tf.string
+            )
+            t_mt = tf.constant(self.model_type, name="model_type", dtype=tf.string)
+            t_ver = tf.constant(MODEL_VERSION, name="model_version", dtype=tf.string)
+        with tf.variable_scope("fitting_attr" + suffix, reuse=reuse):
+            # non zero not supported
+            t_dfparam = tf.constant(0, name="dfparam", dtype=tf.int32)
+            t_daparam = tf.constant(0, name="daparam", dtype=tf.int32)
+        with tf.variable_scope("descrpt_attr" + suffix, reuse=reuse):
+            t_ntypes = tf.constant(self.get_ntypes(), name="ntypes", dtype=tf.int32)
+            t_rcut = tf.constant(
+                self.get_rcut(), name="rcut", dtype=GLOBAL_TF_FLOAT_PRECISION
+            )
+
+        subdicts = []
+        for ii, model in enumerate(self.models):
+            subdict = model.build(
+                coord_,
+                atype_,
+                natoms,
+                box,
+                mesh,
+                input_dict,
+                frz_model=frz_model,
+                ckpt_meta=ckpt_meta,
+                suffix=f"_model{ii}{suffix}",
+                reuse=reuse,
+            )
+            subdicts.append(subdict)
+        t_weight = tf.constant(self.weights, dtype=GLOBAL_TF_FLOAT_PRECISION)
+
+        model_dict = {}
+        # energy shape is (n_batch,), other shapes are (n_batch, -1)
+        energy = tf.reduce_sum(
+            tf.stack([mm["energy"] for mm in subdicts], axis=0) * t_weight[:, None],
+            axis=0,
+        )
+        force = tf.reduce_sum(
+            tf.stack([mm["force"] for mm in subdicts], axis=0)
+            * t_weight[:, None, None],
+            axis=0,
+        )
+        virial = tf.reduce_sum(
+            tf.stack([mm["virial"] for mm in subdicts], axis=0)
+            * t_weight[:, None, None],
+            axis=0,
+        )
+        atom_ener = tf.reduce_sum(
+            tf.stack([mm["atom_ener"] for mm in subdicts], axis=0)
+            * t_weight[:, None, None],
+            axis=0,
+        )
+        atom_virial = tf.reduce_sum(
+            tf.stack([mm["atom_virial"] for mm in subdicts], axis=0)
+            * t_weight[:, None, None],
+            axis=0,
+        )
+
+        model_dict["energy"] = tf.identity(energy, name="o_energy" + suffix)
+        model_dict["force"] = tf.identity(force, name="o_force" + suffix)
+        model_dict["virial"] = tf.identity(virial, name="o_virial" + suffix)
+        model_dict["atom_ener"] = tf.identity(atom_ener, name="o_atom_energy" + suffix)
+        model_dict["atom_virial"] = tf.identity(
+            atom_virial, name="o_atom_virial" + suffix
+        )
+
+        model_dict["coord"] = coord_
+        model_dict["atype"] = atype_
+        return model_dict
diff --git a/deepmd/model/model.py b/deepmd/model/model.py
index 26d5a6fbb1..a06a3141c0 100644
--- a/deepmd/model/model.py
+++ b/deepmd/model/model.py
@@ -82,6 +82,12 @@ def __new__(cls, *args, **kwargs):
         if cls is Model:
             # init model
             # infer model type by fitting_type
+            from deepmd.model.frozen import (
+                FrozenModel,
+            )
+            from deepmd.model.linear import (
+                LinearEnergyModel,
+            )
             from deepmd.model.multi import (
                 MultiModel,
             )
@@ -96,6 +102,10 @@ def __new__(cls, *args, **kwargs):
                 cls = MultiModel
             elif model_type == "pairwise_dprc":
                 cls = PairwiseDPRc
+            elif model_type == "frozen":
+                cls = FrozenModel
+            elif model_type == "linear_ener":
+                cls = LinearEnergyModel
             else:
                 raise ValueError(f"unknown model type: {model_type}")
             return cls.__new__(cls, *args, **kwargs)
@@ -393,11 +403,11 @@ def get_numb_dos(self) -> Union[int, dict]:
         return 0
 
     @abstractmethod
-    def get_fitting(self) -> Union[str, dict]:
+    def get_fitting(self) -> Union[Fitting, dict]:
         """Get the fitting(s)."""
 
     @abstractmethod
-    def get_loss(self, loss: dict, lr) -> Union[Loss, dict]:
+    def get_loss(self, loss: dict, lr) -> Optional[Union[Loss, dict]]:
         """Get the loss function(s)."""
 
     @abstractmethod
diff --git a/deepmd/train/trainer.py b/deepmd/train/trainer.py
index a6ac96dab4..b322336b39 100644
--- a/deepmd/train/trainer.py
+++ b/deepmd/train/trainer.py
@@ -325,6 +325,9 @@ def _build_lr(self):
         log.info("built lr")
 
     def _build_loss(self):
+        if self.stop_batch == 0:
+            # l2 is not used if stop_batch is zero
+            return None, None
         if not self.multi_task_mode:
             l2_l, l2_more = self.loss.build(
                 self.learning_rate,
@@ -449,6 +452,11 @@ def _build_optimizer(self, fitting_key=None):
         return optimizer
 
     def _build_training(self):
+        if self.stop_batch == 0:
+            # self.train_op is not used if stop_batch is zero
+            self.train_op = None
+            return
+
         trainable_variables = tf.trainable_variables()
 
         if not self.multi_task_mode:
diff --git a/deepmd/utils/argcheck.py b/deepmd/utils/argcheck.py
index b67722bd89..b38f8c8063 100644
--- a/deepmd/utils/argcheck.py
+++ b/deepmd/utils/argcheck.py
@@ -799,6 +799,7 @@ def model_args(exclude_hybrid=False):
         hybrid_models.extend(
             [
                 pairwise_dprc(),
+                linear_ener_model_args(),
             ]
         )
     return Argument(
@@ -871,6 +872,7 @@ def model_args(exclude_hybrid=False):
                 [
                     standard_model_args(),
                     multi_model_args(),
+                    frozen_model_args(),
                     *hybrid_models,
                 ],
                 optional=True,
@@ -945,6 +947,46 @@ def pairwise_dprc() -> Argument:
     return ca
 
 
+def frozen_model_args() -> Argument:
+    doc_model_file = "Path to the frozen model file."
+    ca = Argument(
+        "frozen",
+        dict,
+        [
+            Argument("model_file", str, optional=False, doc=doc_model_file),
+        ],
+    )
+    return ca
+
+
+def linear_ener_model_args() -> Argument:
+    doc_weights = (
+        "If the type is list of float, a list of weights for each model. "
+        'If "mean", the weights are set to be 1 / len(models). '
+        'If "sum", the weights are set to be 1.'
+    )
+    models_args = model_args(exclude_hybrid=True)
+    models_args.name = "models"
+    models_args.fold_subdoc = True
+    models_args.set_dtype(list)
+    models_args.set_repeat(True)
+    models_args.doc = "The sub-models."
+    ca = Argument(
+        "linear_ener",
+        dict,
+        [
+            models_args,
+            Argument(
+                "weights",
+                [list, str],
+                optional=False,
+                doc=doc_weights,
+            ),
+        ],
+    )
+    return ca
+
+
 #  --- Learning rate configurations: --- #
 def learning_rate_exp():
     doc_start_lr = "The learning rate at the start of the training."
diff --git a/doc/getting-started/quick_start.ipynb b/doc/getting-started/quick_start.ipynb
index e743b5cf5c..31209ae381 100644
--- a/doc/getting-started/quick_start.ipynb
+++ b/doc/getting-started/quick_start.ipynb
@@ -1,5 +1,5 @@
 {
- "cells": [
+  "cells": [
   {
    "attachments": {},
    "cell_type": "markdown",
diff --git a/doc/model/index.md b/doc/model/index.md
index d649df1442..4ef508ec1b 100644
--- a/doc/model/index.md
+++ b/doc/model/index.md
@@ -16,3 +16,4 @@
 - [Train a Deep Potential model using `type embedding` approach](train-se-e2-a-tebd.md)
 - [Deep potential long-range](dplr.md)
 - [Deep Potential - Range Correction (DPRc)](dprc.md)
+- [Linear model](linear.md)
diff --git a/doc/model/index.rst b/doc/model/index.rst
index 6a01a3b015..6597ce1d21 100644
--- a/doc/model/index.rst
+++ b/doc/model/index.rst
@@ -19,3 +19,4 @@ Model
    train-se-a-mask
    dplr
    dprc
+   linear
diff --git a/doc/model/linear.md b/doc/model/linear.md
new file mode 100644
index 0000000000..b5e7c5c76a
--- /dev/null
+++ b/doc/model/linear.md
@@ -0,0 +1,24 @@
+## Linear model
+
+One can linearly combine existing models with arbitrary coefficients:
+
+```json
+"model": {
+    "type": "linear_ener",
+    "models": [
+    {
+        "type": "frozen",
+        "model_file": "model0.pb"
+    },
+    {
+        "type": "frozen",
+        "model_file": "model1.pb"
+    }
+    ],
+    "weights": [0.5, 0.5]
+},
+```
+
+{ref}`weights <model[linear_ener]/weights>` can be a list of floats, `mean`, or `sum`.
+
+To obtain the model, one needs to execute `dp train` to do a zero-step training with {ref}`numb_steps <training/numb_steps>` set to `0`, and then freeze the model with `dp freeze`.
diff --git a/examples/water/linear/input.json b/examples/water/linear/input.json
new file mode 100644
index 0000000000..e6d0e267f4
--- /dev/null
+++ b/examples/water/linear/input.json
@@ -0,0 +1,56 @@
+{
+  "model": {
+    "type": "linear_ener",
+    "models": [
+      {
+        "type": "frozen",
+        "model_file": "model0.pb"
+      },
+      {
+        "type": "frozen",
+        "model_file": "model1.pb"
+      }
+    ],
+    "weights": "mean",
+    "_comment1": "that's all"
+  },
+
+  "learning_rate": {
+    "type": "exp",
+    "decay_steps": 5000,
+    "start_lr": 0.001,
+    "stop_lr": 3.51e-8,
+    "_comment2": "that's all"
+  },
+
+  "loss": {
+    "type": "ener",
+    "start_pref_e": 0.02,
+    "limit_pref_e": 1,
+    "start_pref_f": 1000,
+    "limit_pref_f": 1,
+    "start_pref_v": 0,
+    "limit_pref_v": 0,
+    "_comment3": " that's all"
+  },
+
+  "training": {
+    "training_data": {
+      "_comment4": "Currently there must be systems",
+      "_comment5": "TODO: support empty systems",
+      "systems": [
+        "../data/data_0"
+      ],
+      "batch_size": "auto",
+      "_comment6": "that's all"
+    },
+    "numb_steps": 0,
+    "seed": 10,
+    "disp_file": "lcurve.out",
+    "disp_freq": 100,
+    "save_freq": 1000,
+    "_comment7": "that's all"
+  },
+
+  "_comment8": "that's all"
+}
diff --git a/source/tests/test_examples.py b/source/tests/test_examples.py
index c9f71e4a81..d50ca5fee1 100644
--- a/source/tests/test_examples.py
+++ b/source/tests/test_examples.py
@@ -29,6 +29,7 @@
     p_examples / "water" / "hybrid" / "input.json",
     p_examples / "water" / "dplr" / "train" / "dw.json",
     p_examples / "water" / "dplr" / "train" / "ener.json",
+    p_examples / "water" / "linear" / "input.json",
     p_examples / "nopbc" / "train" / "input.json",
     p_examples / "water_tensor" / "dipole" / "dipole_input.json",
     p_examples / "water_tensor" / "polar" / "polar_input.json",
diff --git a/source/tests/test_linear_model.py b/source/tests/test_linear_model.py
new file mode 100644
index 0000000000..13a2bc4850
--- /dev/null
+++ b/source/tests/test_linear_model.py
@@ -0,0 +1,125 @@
+# SPDX-License-Identifier: LGPL-3.0-or-later
+import os
+import sys
+
+import numpy as np
+
+from deepmd.env import (
+    GLOBAL_ENER_FLOAT_PRECISION,
+    GLOBAL_TF_FLOAT_PRECISION,
+    tf,
+)
+from deepmd.infer import (
+    DeepPotential,
+)
+from deepmd.model.linear import (
+    LinearEnergyModel,
+)
+from deepmd.utils.convert import (
+    convert_pbtxt_to_pb,
+)
+
+sys.path.insert(0, os.path.abspath(os.path.dirname(__file__)))
+from common import (
+    DataSystem,
+    del_data,
+    gen_data,
+    tests_path,
+)
+
+
+class TestLinearModel(tf.test.TestCase):
+    def setUp(self):
+        gen_data()
+        self.data_dir = "system"
+        with open(os.path.join(self.data_dir, "type_map.raw"), "w") as f:
+            f.write("O\nH")
+        self.pbtxts = [
+            os.path.join(tests_path, "infer/deeppot.pbtxt"),
+            os.path.join(tests_path, "infer/deeppot-1.pbtxt"),
+        ]
+        self.graph_dirs = [pbtxt.replace("pbtxt", "pb") for pbtxt in self.pbtxts]
+        for pbtxt, pb in zip(self.pbtxts, self.graph_dirs):
+            convert_pbtxt_to_pb(pbtxt, pb)
+        self.graphs = [DeepPotential(pb) for pb in self.graph_dirs]
+
+    def test_linear_ener_model(self):
+        numb_test = 1
+        data = DataSystem([self.data_dir], "set", 1, 1, 6, run_opt=None)
+        test_data = data.get_test()
+
+        model = LinearEnergyModel(
+            models=[
+                {
+                    "type": "frozen",
+                    "model_file": model_file,
+                }
+                for model_file in self.graph_dirs
+            ],
+            weights="mean",
+        )
+
+        t_prop_c = tf.placeholder(tf.float32, [5], name="t_prop_c")
+        t_energy = tf.placeholder(GLOBAL_ENER_FLOAT_PRECISION, [None], name="t_energy")
+        t_coord = tf.placeholder(GLOBAL_TF_FLOAT_PRECISION, [None], name="i_coord")
+        t_type = tf.placeholder(tf.int32, [None], name="i_type")
+        t_natoms = tf.placeholder(tf.int32, [model.get_ntypes() + 2], name="i_natoms")
+        t_box = tf.placeholder(GLOBAL_TF_FLOAT_PRECISION, [None, 9], name="i_box")
+        t_mesh = tf.placeholder(tf.int32, [None], name="i_mesh")
+        is_training = tf.placeholder(tf.bool)
+        t_fparam = None
+
+        model_pred = model.build(
+            t_coord,
+            t_type,
+            t_natoms,
+            t_box,
+            t_mesh,
+            t_fparam,
+            suffix="_linear_energy",
+            reuse=False,
+        )
+
+        energy = model_pred["energy"]
+        force = model_pred["force"]
+        virial = model_pred["virial"]
+
+        feed_dict_test = {
+            t_prop_c: test_data["prop_c"],
+            t_energy: test_data["energy"][:numb_test],
+            t_coord: np.reshape(test_data["coord"][:numb_test, :], [-1]),
+            t_box: test_data["box"][:numb_test, :],
+            t_type: np.reshape(test_data["type"], [-1]),
+            t_natoms: test_data["natoms_vec"],
+            t_mesh: test_data["default_mesh"],
+            is_training: False,
+        }
+        sess = self.test_session().__enter__()
+        sess.run(tf.global_variables_initializer())
+        [e, f, v] = sess.run([energy, force, virial], feed_dict=feed_dict_test)
+        e = np.reshape(e, [1, -1])
+        f = np.reshape(f, [1, -1, 3])
+        v = np.reshape(v, [1, 9])
+
+        es = []
+        fs = []
+        vs = []
+
+        for ii, graph in enumerate(self.graphs):
+            ei, fi, vi = graph.eval(
+                test_data["coord"][:numb_test, :],
+                test_data["box"][:numb_test, :],
+                np.reshape(test_data["type"], [-1]),
+            )
+            es.append(ei)
+            fs.append(fi)
+            vs.append(vi)
+
+        np.testing.assert_allclose(e, np.mean(es, axis=0), rtol=1e-5, atol=1e-5)
+        np.testing.assert_allclose(f, np.mean(fs, axis=0), rtol=1e-5, atol=1e-5)
+        np.testing.assert_allclose(v, np.mean(vs, axis=0), rtol=1e-5, atol=1e-5)
+
+    def tearDown(self):
+        for pb in self.graph_dirs:
+            os.remove(pb)
+        del_data()

From 785f4d7f50d30e4348379393b52d1f510fd9f7dc Mon Sep 17 00:00:00 2001
From: Jinzhe Zeng <jinzhe.zeng@rutgers.edu>
Date: Sun, 3 Sep 2023 22:57:28 -0400
Subject: [PATCH 04/63] lmp: add the header for atomic model deviation (#2778)

Previously, the header line did not have columns for atomic model
deviation.

---------

Signed-off-by: Jinzhe Zeng <jinzhe.zeng@rutgers.edu>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
---
 source/lmp/pair_deepmd.cpp | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/source/lmp/pair_deepmd.cpp b/source/lmp/pair_deepmd.cpp
index f285bed8ea..5e60213f08 100644
--- a/source/lmp/pair_deepmd.cpp
+++ b/source/lmp/pair_deepmd.cpp
@@ -1054,7 +1054,12 @@ void PairDeepMD::settings(int narg, char **arg) {
         fp << "#" << setw(12 - 1) << "step" << setw(18 + 1) << "max_devi_v"
            << setw(18 + 1) << "min_devi_v" << setw(18 + 1) << "avg_devi_v"
            << setw(18 + 1) << "max_devi_f" << setw(18 + 1) << "min_devi_f"
-           << setw(18 + 1) << "avg_devi_f" << endl;
+           << setw(18 + 1) << "avg_devi_f";
+        if (out_each) {
+          // at this time, we don't know how many atoms
+          fp << setw(18 + 1) << "atm_devi_f(N)";
+        }
+        fp << endl;
       } else {
         fp.open(out_file, std::ofstream::out | std::ofstream::app);
         fp << scientific;

From b38a8fcc890f01549c4fe6d411f82b5da4962940 Mon Sep 17 00:00:00 2001
From: Jinzhe Zeng <jinzhe.zeng@rutgers.edu>
Date: Sun, 3 Sep 2023 23:07:22 -0400
Subject: [PATCH 05/63] check status of allocate_temp (#2782)

Fix #2513.

---------

Signed-off-by: Jinzhe Zeng <jinzhe.zeng@rutgers.edu>
---
 source/op/prod_env_mat_multi_device.cc | 43 ++++++++++++++++++++------
 1 file changed, 33 insertions(+), 10 deletions(-)

diff --git a/source/op/prod_env_mat_multi_device.cc b/source/op/prod_env_mat_multi_device.cc
index a3acf8ea66..d423e8a108 100644
--- a/source/op/prod_env_mat_multi_device.cc
+++ b/source/op/prod_env_mat_multi_device.cc
@@ -1501,8 +1501,11 @@ static int _norm_copy_coord_gpu(OpKernelContext* context,
   // Tensor FPTYPE_temp;
   TensorShape FPTYPE_shape;
   FPTYPE_shape.AddDim(nall * 3);
-  context->allocate_temp(DataTypeToEnum<FPTYPE>::value, FPTYPE_shape,
-                         tensor_list);
+  tensorflow::Status status = context->allocate_temp(
+      DataTypeToEnum<FPTYPE>::value, FPTYPE_shape, tensor_list);
+  if (!status.ok()) {
+    return false;
+  }
   FPTYPE* tmp_coord = (*tensor_list).flat<FPTYPE>().data();
   DPErrcheck(cudaMemcpy(tmp_coord, coord, sizeof(FPTYPE) * nall * 3,
                         cudaMemcpyDeviceToDevice));
@@ -1519,8 +1522,11 @@ static int _norm_copy_coord_gpu(OpKernelContext* context,
   // Tensor double_temp;
   TensorShape double_shape;
   double_shape.AddDim(18);
-  context->allocate_temp(DataTypeToEnum<FPTYPE>::value, double_shape,
-                         tensor_list + 1);
+  status = context->allocate_temp(DataTypeToEnum<FPTYPE>::value, double_shape,
+                                  tensor_list + 1);
+  if (!status.ok()) {
+    return false;
+  }
   // Tensor int_temp;
   TensorShape int_shape;
   int_shape.AddDim(23 + nloc * 3 + loc_cellnum + total_cellnum * 3 +
@@ -1583,7 +1589,11 @@ static int _build_nlist_gpu(OpKernelContext* context,
   // Tensor nlist_temp;
   TensorShape nlist_shape;
   nlist_shape.AddDim(nloc * 2);
-  context->allocate_temp(DT_INT32, nlist_shape, tensor_list);
+  tensorflow::Status status =
+      context->allocate_temp(DT_INT32, nlist_shape, tensor_list);
+  if (!status.ok()) {
+    return false;
+  }
   ilist = (*tensor_list).flat<int>().data();
   numneigh = ilist + nloc;
   // Tensor jlist_temp;
@@ -1594,7 +1604,10 @@ static int _build_nlist_gpu(OpKernelContext* context,
   for (tt = 0; tt < max_nnei_trial; ++tt) {
     TensorShape jlist_shape;
     jlist_shape.AddDim(3 * int_64(nloc) * mem_nnei);
-    context->allocate_temp(DT_INT32, jlist_shape, tensor_list + 1);
+    status = context->allocate_temp(DT_INT32, jlist_shape, tensor_list + 1);
+    if (!status.ok()) {
+      return false;
+    }
     jlist = (*(tensor_list + 1)).flat<int>().data();
     ind_data = jlist + nloc * mem_nnei;
     for (int_64 ii = 0; ii < nloc; ++ii) {
@@ -1742,8 +1755,11 @@ static int _norm_copy_coord_gpu_rocm(OpKernelContext* context,
   // Tensor double_temp;
   TensorShape double_shape;
   double_shape.AddDim(18);
-  context->allocate_temp(DataTypeToEnum<FPTYPE>::value, double_shape,
-                         tensor_list + 1);
+  tensorflow::Status status = context->allocate_temp(
+      DataTypeToEnum<FPTYPE>::value, double_shape, tensor_list + 1);
+  if (!status.ok()) {
+    return false;
+  }
   // Tensor int_temp;
   TensorShape int_shape;
   int_shape.AddDim(23 + nloc * 3 + loc_cellnum + total_cellnum * 3 +
@@ -1806,7 +1822,11 @@ static int _build_nlist_gpu_rocm(OpKernelContext* context,
   // Tensor nlist_temp;
   TensorShape nlist_shape;
   nlist_shape.AddDim(nloc * 2);
-  context->allocate_temp(DT_INT32, nlist_shape, tensor_list);
+  tensorflow::Status status =
+      context->allocate_temp(DT_INT32, nlist_shape, tensor_list);
+  if (!status.ok()) {
+    return false;
+  }
   ilist = (*tensor_list).flat<int>().data();
   numneigh = ilist + nloc;
   // Tensor jlist_temp;
@@ -1817,7 +1837,10 @@ static int _build_nlist_gpu_rocm(OpKernelContext* context,
   for (tt = 0; tt < max_nnei_trial; ++tt) {
     TensorShape jlist_shape;
     jlist_shape.AddDim(3 * int_64(nloc) * mem_nnei);
-    context->allocate_temp(DT_INT32, jlist_shape, tensor_list + 1);
+    status = context->allocate_temp(DT_INT32, jlist_shape, tensor_list + 1);
+    if (!status.ok()) {
+      return false;
+    }
     jlist = (*(tensor_list + 1)).flat<int>().data();
     ind_data = jlist + nloc * mem_nnei;
     for (int_64 ii = 0; ii < nloc; ++ii) {

From 8fbcaae84a31f57087beae035525366e4c37171d Mon Sep 17 00:00:00 2001
From: Jinzhe Zeng <jinzhe.zeng@rutgers.edu>
Date: Mon, 4 Sep 2023 21:18:30 -0400
Subject: [PATCH 06/63] docs: improve checkpoint description (#2784)

Fix #2767.
---
 deepmd/entrypoints/train.py    | 4 ++--
 deepmd/model/frozen.py         | 2 +-
 deepmd/model/linear.py         | 2 +-
 deepmd/model/model.py          | 4 ++--
 deepmd/utils/argcheck.py       | 2 +-
 deepmd_cli/main.py             | 6 +++---
 doc/nvnmd/nvnmd.md             | 4 ++--
 doc/train/training-advanced.md | 2 +-
 8 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/deepmd/entrypoints/train.py b/deepmd/entrypoints/train.py
index 1a0d4b9c6d..716ff482a3 100755
--- a/deepmd/entrypoints/train.py
+++ b/deepmd/entrypoints/train.py
@@ -86,9 +86,9 @@ def train(
     INPUT : str
         json/yaml control file
     init_model : Optional[str]
-        path to checkpoint folder or None
+        path prefix of checkpoint files or None
     restart : Optional[str]
-        path to checkpoint folder or None
+        path prefix of checkpoint files or None
     output : str
         path for dump file with arguments
     init_frz_model : str
diff --git a/deepmd/model/frozen.py b/deepmd/model/frozen.py
index c1b7d0286e..972acb9185 100644
--- a/deepmd/model/frozen.py
+++ b/deepmd/model/frozen.py
@@ -74,7 +74,7 @@ def build(
         frz_model : str, optional
             The path to the frozen model
         ckpt_meta : str, optional
-            The path to the checkpoint and meta file
+            The path prefix of the checkpoint and meta files
         suffix : str, optional
             The suffix of the scope
         reuse : bool or tf.AUTO_REUSE, optional
diff --git a/deepmd/model/linear.py b/deepmd/model/linear.py
index 6399766662..799642ce33 100644
--- a/deepmd/model/linear.py
+++ b/deepmd/model/linear.py
@@ -166,7 +166,7 @@ def build(
         frz_model : str, optional
             The path to the frozen model
         ckpt_meta : str, optional
-            The path to the checkpoint and meta file
+            The path prefix of the checkpoint and meta files
         suffix : str, optional
             The suffix of the scope
         reuse : bool or tf.AUTO_REUSE, optional
diff --git a/deepmd/model/model.py b/deepmd/model/model.py
index a06a3141c0..9ae5eacf4f 100644
--- a/deepmd/model/model.py
+++ b/deepmd/model/model.py
@@ -191,7 +191,7 @@ def build(
         frz_model : str, optional
             The path to the frozen model
         ckpt_meta : str, optional
-            The path to the checkpoint and meta file
+            The path prefix of the checkpoint and meta files
         suffix : str, optional
             The suffix of the scope
         reuse : bool or tf.AUTO_REUSE, optional
@@ -259,7 +259,7 @@ def build_descrpt(
         frz_model : str, optional
             The path to the frozen model
         ckpt_meta : str, optional
-            The path to the checkpoint and meta file
+            The path prefix of the checkpoint and meta files
         suffix : str, optional
             The suffix of the scope
         reuse : bool or tf.AUTO_REUSE, optional
diff --git a/deepmd/utils/argcheck.py b/deepmd/utils/argcheck.py
index b38f8c8063..153824cb0d 100644
--- a/deepmd/utils/argcheck.py
+++ b/deepmd/utils/argcheck.py
@@ -1601,7 +1601,7 @@ def training_args():  # ! modified by Ziyao: data configuration isolated.
     doc_disp_file = "The file for printing learning curve."
     doc_disp_freq = "The frequency of printing learning curve."
     doc_save_freq = "The frequency of saving check point."
-    doc_save_ckpt = "The file name of saving check point."
+    doc_save_ckpt = "The path prefix of saving check point files."
     doc_disp_training = "Displaying verbose information during training."
     doc_time_training = "Timing durining training."
     doc_profiling = "Profiling during training."
diff --git a/deepmd_cli/main.py b/deepmd_cli/main.py
index a6b293020b..e3213d8b00 100644
--- a/deepmd_cli/main.py
+++ b/deepmd_cli/main.py
@@ -150,14 +150,14 @@ def main_parser() -> argparse.ArgumentParser:
         "--init-model",
         type=str,
         default=None,
-        help="Initialize the model by the provided checkpoint.",
+        help="Initialize the model by the provided path prefix of checkpoint files.",
     )
     parser_train_subgroup.add_argument(
         "-r",
         "--restart",
         type=str,
         default=None,
-        help="Restart the training from the provided checkpoint.",
+        help="Restart the training from the provided path prefix of checkpoint files.",
     )
     parser_train_subgroup.add_argument(
         "-f",
@@ -549,7 +549,7 @@ def main_parser() -> argparse.ArgumentParser:
         "--restart",
         type=str,
         default=None,
-        help="Restart the training from the provided checkpoint.",
+        help="Restart the training from the provided prefix of checkpoint files.",
     )
     parser_train_nvnmd.add_argument(
         "-s",
diff --git a/doc/nvnmd/nvnmd.md b/doc/nvnmd/nvnmd.md
index 8cad297391..0596ba5dc8 100644
--- a/doc/nvnmd/nvnmd.md
+++ b/doc/nvnmd/nvnmd.md
@@ -162,7 +162,7 @@ where items are defined as:
 | numb_test  | the accuracy is test by using {numb_test} sample    | a positive integer |
 | disp_file  | the log file where the training message display     | a string           |
 | disp_freq  | display frequency                                   | a positive integer |
-| save_ckpt  | check point file                                    | a string           |
+| save_ckpt  | path prefix of check point files                    | a string           |
 | save_freq  | save frequency                                      | a positive integer |
 | systems    | a list of data directory which contains the dataset | string list        |
 | set_prefix | the prefix of dataset                               | a string           |
@@ -181,7 +181,7 @@ dp train-nvnmd train_qnn.json -s s2
 
 After the training process, you will get two folders: `nvnmd_cnn` and `nvnmd_qnn`. The `nvnmd_cnn` contains the model after continuous neural network (CNN) training. The `nvnmd_qnn` contains the model after quantized neural network (QNN) training. The binary file `nvnmd_qnn/model.pb` is the model file that is used to perform NVNMD in the server [http://nvnmd.picp.vip].
 
-You can also restart the CNN training from the checkpoint (`nvnmd_cnn/model.ckpt`) by
+You can also restart the CNN training from the checkpoint files with the path prefix `nvnmd_cnn/model.ckpt` by
 
 ``` bash
 dp train-nvnmd train_cnn.json -r nvnmd_cnn/model.ckpt -s s1
diff --git a/doc/train/training-advanced.md b/doc/train/training-advanced.md
index 39cf87d8b3..b0194e3471 100644
--- a/doc/train/training-advanced.md
+++ b/doc/train/training-advanced.md
@@ -121,7 +121,7 @@ optional arguments:
   --skip-neighbor-stat  Skip calculating neighbor statistics. Sel checking, automatic sel, and model compression will be disabled. (default: False)
 ```
 
-**`--init-model model.ckpt`**, initializes the model training with an existing model that is stored in the checkpoint `model.ckpt`, the network architectures should match.
+**`--init-model model.ckpt`**, initializes the model training with an existing model that is stored in the checkpoint files with the path prefix `model.ckpt`; the network architectures should match.
 
 **`--restart model.ckpt`**, continues the training from the checkpoint `model.ckpt`.
 

From 211052c2b2e2b364acc1724daaedadf39e7f577b Mon Sep 17 00:00:00 2001
From: Jinzhe Zeng <jinzhe.zeng@rutgers.edu>
Date: Tue, 5 Sep 2023 21:06:31 -0400
Subject: [PATCH 07/63] fix dp test atomic polar; add UTs for dp test (#2785)

Signed-off-by: Jinzhe Zeng <jinzhe.zeng@rutgers.edu>
Co-authored-by: Han Wang <92130845+wanghan-iapcm@users.noreply.github.com>
---
 deepmd/entrypoints/test.py   |  66 +++++--
 source/tests/test_dp_test.py | 340 +++++++++++++++++++++++++++++++++++
 2 files changed, 396 insertions(+), 10 deletions(-)
 create mode 100644 source/tests/test_dp_test.py

diff --git a/deepmd/entrypoints/test.py b/deepmd/entrypoints/test.py
index e348318f41..b2fb30dfc0 100644
--- a/deepmd/entrypoints/test.py
+++ b/deepmd/entrypoints/test.py
@@ -848,19 +848,65 @@ def test_polar(
     if detail_file is not None:
         detail_path = Path(detail_file)
 
-        pe = np.concatenate(
-            (
-                np.reshape(test_data["polarizability"][:numb_test], [-1, 9]),
-                np.reshape(polar, [-1, 9]),
-            ),
-            axis=1,
-        )
+        if not atomic:
+            pe = np.concatenate(
+                (
+                    np.reshape(test_data["polarizability"][:numb_test], [-1, 9]),
+                    np.reshape(polar, [-1, 9]),
+                ),
+                axis=1,
+            )
+            header_text = (
+                "data_pxx data_pxy data_pxz data_pyx data_pyy data_pyz data_pzx "
+                "data_pzy data_pzz pred_pxx pred_pxy pred_pxz pred_pyx pred_pyy "
+                "pred_pyz pred_pzx pred_pzy pred_pzz"
+            )
+        else:
+            pe = np.concatenate(
+                (
+                    np.reshape(
+                        test_data["atomic_polarizability"][:numb_test],
+                        [-1, 9 * sel_natoms],
+                    ),
+                    np.reshape(polar, [-1, 9 * sel_natoms]),
+                ),
+                axis=1,
+            )
+            header_text = [
+                f"{letter}{number}"
+                for number in range(1, sel_natoms + 1)
+                for letter in [
+                    "data_pxx",
+                    "data_pxy",
+                    "data_pxz",
+                    "data_pyx",
+                    "data_pyy",
+                    "data_pyz",
+                    "data_pzx",
+                    "data_pzy",
+                    "data_pzz",
+                ]
+            ] + [
+                f"{letter}{number}"
+                for number in range(1, sel_natoms + 1)
+                for letter in [
+                    "pred_pxx",
+                    "pred_pxy",
+                    "pred_pxz",
+                    "pred_pyx",
+                    "pred_pyy",
+                    "pred_pyz",
+                    "pred_pzx",
+                    "pred_pzy",
+                    "pred_pzz",
+                ]
+            ]
+            header_text = " ".join(header_text)
+
         np.savetxt(
             detail_path.with_suffix(".out"),
             pe,
-            header="data_pxx data_pxy data_pxz data_pyx data_pyy data_pyz data_pzx "
-            "data_pzy data_pzz pred_pxx pred_pxy pred_pxz pred_pyx pred_pyy pred_pyz "
-            "pred_pzx pred_pzy pred_pzz",
+            header=header_text,
         )
     return {"rmse": (rmse_f, polar.size)}
 
diff --git a/source/tests/test_dp_test.py b/source/tests/test_dp_test.py
new file mode 100644
index 0000000000..df1b51db0d
--- /dev/null
+++ b/source/tests/test_dp_test.py
@@ -0,0 +1,340 @@
+# SPDX-License-Identifier: LGPL-3.0-or-later
+import os
+import shutil
+import unittest
+from pathlib import (
+    Path,
+)
+
+import dpdata
+import numpy as np
+from common import (
+    tests_path,
+)
+
+from deepmd.entrypoints.test import test as dp_test
+from deepmd.utils.convert import (
+    convert_pbtxt_to_pb,
+)
+
+default_places = 6
+
+
+class TestDPTest:
+    def setUp(self):
+        self.coords = np.array(
+            [
+                12.83,
+                2.56,
+                2.18,
+                12.09,
+                2.87,
+                2.74,
+                00.25,
+                3.32,
+                1.68,
+                3.36,
+                3.00,
+                1.81,
+                3.51,
+                2.51,
+                2.60,
+                4.27,
+                3.22,
+                1.56,
+            ]
+        )
+        self.atype = [0, 1, 1, 0, 1, 1]
+        self.box = np.array([13.0, 0.0, 0.0, 0.0, 13.0, 0.0, 0.0, 0.0, 13.0])
+        self.test_data = "test_dp_test"
+        dpdata.System(
+            data={
+                "orig": np.zeros(3),
+                "atom_names": ["O", "H"],
+                "atom_numbs": [2, 4],
+                "atom_types": np.array(self.atype),
+                "cells": self.box.reshape(1, 3, 3),
+                "coords": self.coords.reshape(1, 6, 3),
+            }
+        ).to_deepmd_npy(self.test_data)
+
+    def tearDown(self):
+        shutil.rmtree(self.test_data, ignore_errors=True)
+
+    @classmethod
+    def tearDownClass(cls):
+        os.remove(cls.model_name)
+
+
+class TestDPTestEner(unittest.TestCase, TestDPTest):
+    @classmethod
+    def setUpClass(cls):
+        cls.model_name = "deeppot.pb"
+        convert_pbtxt_to_pb(
+            str(tests_path / os.path.join("infer", "deeppot.pbtxt")), cls.model_name
+        )
+
+    def setUp(self):
+        TestDPTest.setUp(self)
+        self.expected_e = np.array(
+            [
+                -9.275780747115504710e01,
+                -1.863501786584258468e02,
+                -1.863392472863538103e02,
+                -9.279281325486221021e01,
+                -1.863671545232153903e02,
+                -1.863619822847602165e02,
+            ]
+        )
+        self.expected_f = np.array(
+            [
+                -3.034045420701179663e-01,
+                8.405844663871177014e-01,
+                7.696947487118485642e-02,
+                7.662001266663505117e-01,
+                -1.880601391333554251e-01,
+                -6.183333871091722944e-01,
+                -5.036172391059643427e-01,
+                -6.529525836149027151e-01,
+                5.432962643022043459e-01,
+                6.382357912332115024e-01,
+                -1.748518296794561167e-01,
+                3.457363524891907125e-01,
+                1.286482986991941552e-03,
+                3.757251165286925043e-01,
+                -5.972588700887541124e-01,
+                -5.987006197104716154e-01,
+                -2.004450304880958100e-01,
+                2.495901655353461868e-01,
+            ]
+        )
+        self.expected_v = np.array(
+            [
+                -2.912234126853306959e-01,
+                -3.800610846612756388e-02,
+                2.776624987489437202e-01,
+                -5.053761003913598976e-02,
+                -3.152373041953385746e-01,
+                1.060894290092162379e-01,
+                2.826389131596073745e-01,
+                1.039129970665329250e-01,
+                -2.584378792325942586e-01,
+                -3.121722367954994914e-01,
+                8.483275876786681990e-02,
+                2.524662342344257682e-01,
+                4.142176771106586414e-02,
+                -3.820285230785245428e-02,
+                -2.727311173065460545e-02,
+                2.668859789777112135e-01,
+                -6.448243569420382404e-02,
+                -2.121731470426218846e-01,
+                -8.624335220278558922e-02,
+                -1.809695356746038597e-01,
+                1.529875294531883312e-01,
+                -1.283658185172031341e-01,
+                -1.992682279795223999e-01,
+                1.409924999632362341e-01,
+                1.398322735274434292e-01,
+                1.804318474574856390e-01,
+                -1.470309318999652726e-01,
+                -2.593983661598450730e-01,
+                -4.236536279233147489e-02,
+                3.386387920184946720e-02,
+                -4.174017537818433543e-02,
+                -1.003500282164128260e-01,
+                1.525690815194478966e-01,
+                3.398976109910181037e-02,
+                1.522253908435125536e-01,
+                -2.349125581341701963e-01,
+                9.515545977581392825e-04,
+                -1.643218849228543846e-02,
+                1.993234765412972564e-02,
+                6.027265332209678569e-04,
+                -9.563256398907417355e-02,
+                1.510815124001868293e-01,
+                -7.738094816888557714e-03,
+                1.502832772532304295e-01,
+                -2.380965783745832010e-01,
+                -2.309456719810296654e-01,
+                -6.666961081213038098e-02,
+                7.955566551234216632e-02,
+                -8.099093777937517447e-02,
+                -3.386641099800401927e-02,
+                4.447884755740908608e-02,
+                1.008593228579038742e-01,
+                4.556718179228393811e-02,
+                -6.078081273849572641e-02,
+            ]
+        )
+
+    def test_1frame(self):
+        detail_file = "test_dp_test_ener_detail"
+        dp_test(
+            model=self.model_name,
+            system=self.test_data,
+            datafile=None,
+            set_prefix="set",
+            numb_test=0,
+            rand_seed=None,
+            shuffle_test=False,
+            detail_file=detail_file,
+            atomic=False,
+        )
+        # TODO: see #2721
+        idx_map = np.lexsort((np.arange(len(self.atype)), self.atype))
+        pred_e = np.loadtxt(detail_file + ".e.out", ndmin=2)[0, 1]
+        pred_f = np.loadtxt(detail_file + ".f.out", ndmin=2)[:, 3:6]
+        pred_v = np.loadtxt(detail_file + ".v.out", ndmin=2)[:, 9:18]
+        pred_e_peratom = np.loadtxt(detail_file + ".e_peratom.out", ndmin=2)[0, 1]
+        pred_v_peratom = np.loadtxt(detail_file + ".v_peratom.out", ndmin=2)[:, 9:18]
+        self.assertAlmostEqual(pred_e, np.sum(self.expected_e), places=default_places)
+        np.testing.assert_almost_equal(
+            pred_f, self.expected_f.reshape(-1, 3)[idx_map], decimal=default_places
+        )
+        np.testing.assert_almost_equal(
+            pred_v,
+            np.sum(self.expected_v.reshape(1, -1, 9), axis=1),
+            decimal=default_places,
+        )
+        np.testing.assert_almost_equal(
+            pred_e_peratom, pred_e / len(self.atype), decimal=default_places
+        )
+        np.testing.assert_almost_equal(
+            pred_v_peratom, pred_v / len(self.atype), decimal=default_places
+        )
+
+
+class TestDPTestDipole(unittest.TestCase, TestDPTest):
+    @classmethod
+    def setUpClass(cls):
+        cls.model_name = "deepdipole.pb"
+        convert_pbtxt_to_pb(
+            str(tests_path / os.path.join("infer", "deepdipole.pbtxt")), cls.model_name
+        )
+
+    def setUp(self):
+        TestDPTest.setUp(self)
+        self.expected_d = np.array(
+            [
+                -9.274180565967479195e-01,
+                2.698028341272042496e00,
+                2.521268387140979117e-01,
+                2.927260638453461628e00,
+                -8.571926301526779923e-01,
+                1.667785136187720063e00,
+            ]
+        )
+        self.expected_global_d = np.sum(self.expected_d.reshape(1, -1, 3), axis=1)
+        np.save(Path(self.test_data) / "set.000" / "atomic_dipole.npy", self.expected_d)
+        np.save(Path(self.test_data) / "set.000" / "dipole.npy", self.expected_global_d)
+
+    def test_1frame(self):
+        detail_file = "test_dp_test_dipole_detail"
+        dp_test(
+            model=self.model_name,
+            system=self.test_data,
+            datafile=None,
+            set_prefix="set",
+            numb_test=0,
+            rand_seed=None,
+            shuffle_test=False,
+            detail_file=detail_file,
+            atomic=True,
+        )
+        dipole = np.loadtxt(detail_file + ".out", ndmin=2)[0, 6:12]
+        np.testing.assert_almost_equal(dipole, self.expected_d, decimal=default_places)
+
+    def test_1frame_global(self):
+        detail_file = "test_dp_test_global_dipole_detail"
+        dp_test(
+            model=self.model_name,
+            system=self.test_data,
+            datafile=None,
+            set_prefix="set",
+            numb_test=0,
+            rand_seed=None,
+            shuffle_test=False,
+            detail_file=detail_file,
+            atomic=False,
+        )
+        dipole = np.loadtxt(detail_file + ".out", ndmin=2)[:, 3:6]
+        np.testing.assert_almost_equal(
+            dipole, self.expected_global_d, decimal=default_places
+        )
+
+
+class TestDPTestPolar(unittest.TestCase, TestDPTest):
+    @classmethod
+    def setUpClass(cls):
+        cls.model_name = "deeppolar.pb"
+        convert_pbtxt_to_pb(
+            str(tests_path / os.path.join("infer", "deeppolar.pbtxt")), cls.model_name
+        )
+
+    def setUp(self):
+        TestDPTest.setUp(self)
+        self.expected_d = np.array(
+            [
+                1.061407927405987051e-01,
+                -3.569013342133873778e-01,
+                -2.862108976089940138e-02,
+                -3.569013342133875444e-01,
+                1.304367268874677244e00,
+                1.037647501453442256e-01,
+                -2.862108976089940138e-02,
+                1.037647501453441284e-01,
+                8.100521520762453409e-03,
+                1.236797829492216616e00,
+                -3.717307430531632262e-01,
+                7.371515676976750919e-01,
+                -3.717307430531630041e-01,
+                1.127222682121889058e-01,
+                -2.239181552775717510e-01,
+                7.371515676976746478e-01,
+                -2.239181552775717787e-01,
+                4.448255365635306879e-01,
+            ]
+        )
+        self.expected_global_d = np.sum(self.expected_d.reshape(1, -1, 9), axis=1)
+        np.save(
+            Path(self.test_data) / "set.000" / "atomic_polarizability.npy",
+            self.expected_d,
+        )
+        np.save(
+            Path(self.test_data) / "set.000" / "polarizability.npy",
+            self.expected_global_d,
+        )
+
+    def test_1frame(self):
+        detail_file = "test_dp_test_polar_detail"
+        dp_test(
+            model=self.model_name,
+            system=self.test_data,
+            datafile=None,
+            set_prefix="set",
+            numb_test=0,
+            rand_seed=None,
+            shuffle_test=False,
+            detail_file=detail_file,
+            atomic=True,
+        )
+        polar = np.loadtxt(detail_file + ".out", ndmin=2)[0, 18:36]
+        np.testing.assert_almost_equal(polar, self.expected_d, decimal=default_places)
+
+    def test_1frame_global(self):
+        detail_file = "test_dp_test_global_polar_detail"
+        dp_test(
+            model=self.model_name,
+            system=self.test_data,
+            datafile=None,
+            set_prefix="set",
+            numb_test=0,
+            rand_seed=None,
+            shuffle_test=False,
+            detail_file=detail_file,
+            atomic=False,
+        )
+        polar = np.loadtxt(detail_file + ".out", ndmin=2)[:, 9:18]
+        np.testing.assert_almost_equal(
+            polar, self.expected_global_d, decimal=default_places
+        )

From b1d87dbfcc21b61d8b301d999960ac7ea048c4e1 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Wed, 6 Sep 2023 09:09:25 +0800
Subject: [PATCH 08/63] [pre-commit.ci] pre-commit autoupdate (#2787)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

<!--pre-commit.ci start-->
updates:
- [github.com/astral-sh/ruff-pre-commit: v0.0.286 →
v0.0.287](https://github.com/astral-sh/ruff-pre-commit/compare/v0.0.286...v0.0.287)
<!--pre-commit.ci end-->

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
---
 .pre-commit-config.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 169ac19885..ba11bbcf50 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -33,7 +33,7 @@ repos:
       files: \.py$
 -   repo: https://github.com/astral-sh/ruff-pre-commit
     # Ruff version.
-    rev: v0.0.286
+    rev: v0.0.287
     hooks:
     - id: ruff
       args: ["--fix"]

From 9712c13ec271d11b6c0ce57241321b52f880dd6c Mon Sep 17 00:00:00 2001
From: Jinzhe Zeng <jinzhe.zeng@rutgers.edu>
Date: Tue, 5 Sep 2023 21:15:55 -0400
Subject: [PATCH 09/63] lmp: add unit_style requirement for compute
 deeptensor/atom (#2790)

The input of the model should be in Angstrom, so the unit style can
only be metal or real.

---------

Signed-off-by: Jinzhe Zeng <jinzhe.zeng@rutgers.edu>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
---
 source/lmp/compute_deeptensor_atom.cpp | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/source/lmp/compute_deeptensor_atom.cpp b/source/lmp/compute_deeptensor_atom.cpp
index 6af57f0b3c..0de523e1bf 100644
--- a/source/lmp/compute_deeptensor_atom.cpp
+++ b/source/lmp/compute_deeptensor_atom.cpp
@@ -26,6 +26,14 @@ using namespace LAMMPS_NS;
 
 ComputeDeeptensorAtom::ComputeDeeptensorAtom(LAMMPS *lmp, int narg, char **arg)
     : Compute(lmp, narg, arg), dp(lmp), tensor(nullptr) {
+  if (!(strcmp(update->unit_style, "metal") == 0 ||
+        strcmp(update->unit_style, "real") == 0)) {
+    error->all(
+        FLERR,
+        "Compute deeptensor/atom requires metal or real unit; please set it by "
+        "\"units metal\" or \"units real\"");
+  }
+
   if (narg < 4) {
     error->all(FLERR, "Illegal compute deeptensor/atom command");
   }

From 956adcb280b08a89fa4acce5717d1652b1db96ff Mon Sep 17 00:00:00 2001
From: Jinzhe Zeng <jinzhe.zeng@rutgers.edu>
Date: Thu, 7 Sep 2023 21:18:00 -0400
Subject: [PATCH 10/63] do not sort atoms in dp test (#2794)

Fix #2721.

---------

Signed-off-by: Jinzhe Zeng <jinzhe.zeng@rutgers.edu>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
---
 deepmd/entrypoints/test.py   |  8 +++++++-
 deepmd/utils/data.py         | 10 +++++++++-
 deepmd/utils/data_system.py  |  7 ++++++-
 source/tests/test_dp_test.py |  4 +---
 4 files changed, 23 insertions(+), 6 deletions(-)

diff --git a/deepmd/entrypoints/test.py b/deepmd/entrypoints/test.py
index b2fb30dfc0..4658b16e7c 100644
--- a/deepmd/entrypoints/test.py
+++ b/deepmd/entrypoints/test.py
@@ -116,7 +116,13 @@ def test(
 
         # create data class
         tmap = dp.get_type_map() if dp.model_type == "ener" else None
-        data = DeepmdData(system, set_prefix, shuffle_test=shuffle_test, type_map=tmap)
+        data = DeepmdData(
+            system,
+            set_prefix,
+            shuffle_test=shuffle_test,
+            type_map=tmap,
+            sort_atoms=False,
+        )
 
         if dp.model_type == "ener":
             err = test_ener(
diff --git a/deepmd/utils/data.py b/deepmd/utils/data.py
index 8442f84156..485079b08d 100644
--- a/deepmd/utils/data.py
+++ b/deepmd/utils/data.py
@@ -42,6 +42,9 @@ class DeepmdData:
             Data modifier that has the method `modify_data`
     trn_all_set
             Use all sets as training dataset. Otherwise, if the number of sets is more than 1, the last set is left for test.
+    sort_atoms : bool
+            Sort atoms by atom types. Required to enable when the data is directly fed to
+            descriptors except mixed types.
     """
 
     def __init__(
@@ -53,6 +56,7 @@ def __init__(
         optional_type_map: bool = True,
         modifier=None,
         trn_all_set: bool = False,
+        sort_atoms: bool = True,
     ):
         """Constructor."""
         root = DPPath(sys_path)
@@ -102,6 +106,7 @@ def __init__(
         if type_map is None and self.type_map is None and self.mixed_type:
             raise RuntimeError("mixed_type format must have type_map!")
         # make idx map
+        self.sort_atoms = sort_atoms
         self.idx_map = self._make_idx_map(self.atom_type)
         # train dirs
         self.test_dir = self.dirs[-1]
@@ -586,7 +591,10 @@ def _load_type_mix(self, set_name: DPPath):
     def _make_idx_map(self, atom_type):
         natoms = atom_type.shape[0]
         idx = np.arange(natoms)
-        idx_map = np.lexsort((idx, atom_type))
+        if self.sort_atoms:
+            idx_map = np.lexsort((idx, atom_type))
+        else:
+            idx_map = idx
         return idx_map
 
     def _load_type_map(self, sys_path: DPPath):
diff --git a/deepmd/utils/data_system.py b/deepmd/utils/data_system.py
index 6d65165456..0bfe6b7c70 100644
--- a/deepmd/utils/data_system.py
+++ b/deepmd/utils/data_system.py
@@ -46,6 +46,7 @@ def __init__(
         trn_all_set=False,
         sys_probs=None,
         auto_prob_style="prob_sys_size",
+        sort_atoms: bool = True,
     ):
         """Constructor.
 
@@ -84,7 +85,10 @@ def __init__(
                                 the list of systems is devided into blocks. A block is specified by `stt_idx:end_idx:weight`,
                                 where `stt_idx` is the starting index of the system, `end_idx` is then ending (not including) index of the system,
                                 the probabilities of the systems in this block sums up to `weight`, and the relatively probabilities within this block is proportional
-        to the number of batches in the system.
+                to the number of batches in the system.
+        sort_atoms : bool
+            Sort atoms by atom types. Required to enable when the data is directly fed to
+            descriptors except mixed types.
         """
         # init data
         self.rcut = rcut
@@ -101,6 +105,7 @@ def __init__(
                     optional_type_map=optional_type_map,
                     modifier=modifier,
                     trn_all_set=trn_all_set,
+                    sort_atoms=sort_atoms,
                 )
             )
         # check mix_type format
diff --git a/source/tests/test_dp_test.py b/source/tests/test_dp_test.py
index df1b51db0d..a07706acfe 100644
--- a/source/tests/test_dp_test.py
+++ b/source/tests/test_dp_test.py
@@ -180,8 +180,6 @@ def test_1frame(self):
             detail_file=detail_file,
             atomic=False,
         )
-        # TODO: see #2721
-        idx_map = np.lexsort((np.arange(len(self.atype)), self.atype))
         pred_e = np.loadtxt(detail_file + ".e.out", ndmin=2)[0, 1]
         pred_f = np.loadtxt(detail_file + ".f.out", ndmin=2)[:, 3:6]
         pred_v = np.loadtxt(detail_file + ".v.out", ndmin=2)[:, 9:18]
@@ -189,7 +187,7 @@ def test_1frame(self):
         pred_v_peratom = np.loadtxt(detail_file + ".v_peratom.out", ndmin=2)[:, 9:18]
         self.assertAlmostEqual(pred_e, np.sum(self.expected_e), places=default_places)
         np.testing.assert_almost_equal(
-            pred_f, self.expected_f.reshape(-1, 3)[idx_map], decimal=default_places
+            pred_f, self.expected_f.reshape(-1, 3), decimal=default_places
         )
         np.testing.assert_almost_equal(
             pred_v,

From c6dea0c264fc3a8d7253c712fd00f3cc82b5976b Mon Sep 17 00:00:00 2001
From: Jinzhe Zeng <jinzhe.zeng@rutgers.edu>
Date: Thu, 7 Sep 2023 21:22:27 -0400
Subject: [PATCH 11/63] lmp: `fix_dplr` use the same `type_map` from
 `pair_deepmd` (#2776)

When `pair_coeff` sets a LAMMPS `type_map` (e.g. `pair_coeff * * O H`),
`fix_dplr` now uses the same LAMMPS `type_map` (i.e. `O H` in this case) to
map LAMMPS types to the DP model types, because LAMMPS types should
always be consistent across different commands.

---------

Signed-off-by: Jinzhe Zeng <jinzhe.zeng@rutgers.edu>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
---
 doc/model/dplr.md                  |  4 +-
 source/api_c/include/c_api.h       |  7 +++
 source/api_c/include/deepmd.hpp    |  9 ++++
 source/api_c/src/c_api.cc          |  6 +++
 source/api_cc/include/DeepTensor.h |  6 ++-
 source/api_cc/src/DeepTensor.cc    |  4 ++
 source/lmp/fix_dplr.cpp            | 78 ++++++++++++++++++++++++------
 source/lmp/fix_dplr.h              |  1 +
 source/lmp/pair_deepmd.cpp         |  2 +
 source/lmp/pair_deepmd.h           |  1 +
 source/lmp/tests/test_dplr.py      | 46 +++++++++++++++++-
 11 files changed, 144 insertions(+), 20 deletions(-)

diff --git a/doc/model/dplr.md b/doc/model/dplr.md
index ecd9aa8c95..035c27ee14 100644
--- a/doc/model/dplr.md
+++ b/doc/model/dplr.md
@@ -171,8 +171,8 @@ fix_modify	0 virial yes
 ```
 
 The fix command `dplr` calculates the position of WCs by the DW model and back-propagates the long-range interaction on virtual atoms to real toms.
-At this time, the training parameter {ref}`type_map <model/type_map>` will be mapped to LAMMPS atom types.
-
+The atom names specified in [pair_style `deepmd`](../third-party/lammps-command.md#pair_style-deepmd) will be used to determine elements.
+If it is not set, the training parameter {ref}`type_map <model/type_map>` will be mapped to LAMMPS atom types.
 
 To use a time-dependent electric field, LAMMPS's `variable` feature can be utilized:
 ```lammps
diff --git a/source/api_c/include/c_api.h b/source/api_c/include/c_api.h
index ba14ea0e50..6aa1268123 100644
--- a/source/api_c/include/c_api.h
+++ b/source/api_c/include/c_api.h
@@ -1033,6 +1033,13 @@ int* DP_DeepTensorGetSelTypes(DP_DeepTensor* dt);
  */
 int DP_DeepTensorGetNumbSelTypes(DP_DeepTensor* dt);
 
+/**
+ * @brief Get the type map of a Deep Tensor.
+ * @param[in] dt The Deep Tensor to use.
+ * @return The type map of the Deep Tensor.
+ */
+const char* DP_DeepTensorGetTypeMap(DP_DeepTensor* dt);
+
 /**
  * @brief Check if there is any exceptions throw.
  *
diff --git a/source/api_c/include/deepmd.hpp b/source/api_c/include/deepmd.hpp
index 129645e7f7..532e01e805 100644
--- a/source/api_c/include/deepmd.hpp
+++ b/source/api_c/include/deepmd.hpp
@@ -1837,6 +1837,15 @@ class DeepTensor {
   void print_summary(const std::string &pre) const {
     DP_PrintSummary(pre.c_str());
   }
+  /**
+   * @brief Get the type map (element name of the atom types) of this model.
+   * @param[out] type_map The type map of this model.
+   **/
+  void get_type_map(std::string &type_map) {
+    const char *type_map_c = DP_DeepTensorGetTypeMap(dt);
+    type_map.assign(type_map_c);
+    delete[] type_map_c;
+  };
 
  private:
   DP_DeepTensor *dt;
diff --git a/source/api_c/src/c_api.cc b/source/api_c/src/c_api.cc
index bd62ef2ddf..1e2ee47b8b 100644
--- a/source/api_c/src/c_api.cc
+++ b/source/api_c/src/c_api.cc
@@ -1267,6 +1267,12 @@ int DP_DeepTensorGetNumbSelTypes(DP_DeepTensor* dt) {
   return dt->dt.sel_types().size();
 }
 
+const char* DP_DeepTensorGetTypeMap(DP_DeepTensor* dt) {
+  std::string type_map;
+  dt->dt.get_type_map(type_map);
+  return string_to_char(type_map);
+}
+
 const char* DP_DeepTensorCheckOK(DP_DeepTensor* dt) {
   return string_to_char(dt->exception);
 }
diff --git a/source/api_cc/include/DeepTensor.h b/source/api_cc/include/DeepTensor.h
index ebcac96e10..af535cc9de 100644
--- a/source/api_cc/include/DeepTensor.h
+++ b/source/api_cc/include/DeepTensor.h
@@ -39,7 +39,6 @@ class DeepTensor {
    **/
   void print_summary(const std::string& pre) const;
 
- public:
   /**
    * @brief Evaluate the value by using this model.
    * @param[out] value The value to evalute, usually would be the atomic tensor.
@@ -198,6 +197,11 @@ class DeepTensor {
     assert(inited);
     return sel_type;
   };
+  /**
+   * @brief Get the type map (element name of the atom types) of this model.
+   * @param[out] type_map The type map of this model.
+   **/
+  void get_type_map(std::string& type_map);
 
  private:
   tensorflow::Session* session;
diff --git a/source/api_cc/src/DeepTensor.cc b/source/api_cc/src/DeepTensor.cc
index 09a9802e19..a4b7ddb90f 100644
--- a/source/api_cc/src/DeepTensor.cc
+++ b/source/api_cc/src/DeepTensor.cc
@@ -792,3 +792,7 @@ template void DeepTensor::compute_inner<float>(
     const std::vector<float> &dbox,
     const int nghost,
     const InputNlist &nlist_);
+
+void DeepTensor::get_type_map(std::string &type_map) {
+  type_map = get_scalar<STRINGTYPE>("model_attr/tmap");
+}
diff --git a/source/lmp/fix_dplr.cpp b/source/lmp/fix_dplr.cpp
index 51029055cf..a0df7efd24 100644
--- a/source/lmp/fix_dplr.cpp
+++ b/source/lmp/fix_dplr.cpp
@@ -4,6 +4,7 @@
 #include <iomanip>
 #include <iostream>
 #include <limits>
+#include <sstream>
 
 #include "atom.h"
 #include "comm.h"
@@ -63,7 +64,7 @@ FixDPLR::FixDPLR(LAMMPS *lmp, int narg, char **arg)
   if (strcmp(update->unit_style, "metal") != 0) {
     error->all(
         FLERR,
-        "Pair deepmd requires metal unit, please set it by \"units metal\"");
+        "Fix dplr requires metal unit, please set it by \"units metal\"");
   }
 
   int iarg = 3;
@@ -71,8 +72,7 @@ FixDPLR::FixDPLR(LAMMPS *lmp, int narg, char **arg)
   bond_type.clear();
   while (iarg < narg) {
     if (!is_key(arg[iarg])) {
-      error->all(FLERR,
-                 "Illegal pair_style command\nwrong number of parameters\n");
+      error->all(FLERR, "Illegal fix command\nwrong number of parameters\n");
     }
     if (string(arg[iarg]) == string("model")) {
       if (iarg + 1 > narg) {
@@ -128,10 +128,6 @@ FixDPLR::FixDPLR(LAMMPS *lmp, int narg, char **arg)
   }
   assert(map_vec.size() % 2 == 0),
       "number of ints provided by type_associate should be even";
-  for (int ii = 0; ii < map_vec.size() / 2; ++ii) {
-    type_asso[map_vec[ii * 2 + 0]] = map_vec[ii * 2 + 1];
-    bk_type_asso[map_vec[ii * 2 + 1]] = map_vec[ii * 2 + 0];
-  }
 
   // dpt.init(model);
   // dtm.init("frozen_model.pb");
@@ -142,6 +138,63 @@ FixDPLR::FixDPLR(LAMMPS *lmp, int narg, char **arg)
     error->one(FLERR, e.what());
   }
 
+  pair_deepmd = (PairDeepMD *)force->pair_match("deepmd", 1);
+  if (!pair_deepmd) {
+    error->all(FLERR, "pair_style deepmd should be set before this fix\n");
+  }
+
+  int n = atom->ntypes;
+  std::vector<std::string> type_names = pair_deepmd->type_names;
+  std::vector<std::string> type_map;
+  std::string type_map_str;
+  dpt.get_type_map(type_map_str);
+  // convert the string to a vector of strings
+  std::istringstream iss(type_map_str);
+  std::string type_name;
+  while (iss >> type_name) {
+    type_map.push_back(type_name);
+  }
+  if (type_names.size() == 0 || type_map.size() == 0) {
+    type_idx_map.resize(n);
+    for (int ii = 0; ii < n; ++ii) {
+      type_idx_map[ii] = ii;
+    }
+  } else {
+    type_idx_map.clear();
+    for (std::string type_name : type_names) {
+      bool found_element = false;
+      for (int ii = 0; ii < type_map.size(); ++ii) {
+        if (type_map[ii] == type_name) {
+          type_idx_map.push_back(ii);
+          found_element = true;
+          break;
+        }
+      }
+      if (!found_element && "NULL" == type_name) {
+        type_idx_map.push_back(type_map.size());  // ghost type
+        found_element = true;
+      }
+      if (!found_element) {
+        error->all(FLERR, "pair_coeff: element " + type_name +
+                              " not found in the DPLR model");
+      }
+    }
+    int numb_types = type_idx_map.size();
+    if (numb_types < n) {
+      type_idx_map.resize(n);
+      for (int ii = numb_types; ii < n; ++ii) {
+        type_idx_map[ii] = ii;
+      }
+    }
+  }
+
+  for (int ii = 0; ii < map_vec.size() / 2; ++ii) {
+    type_asso[type_idx_map[map_vec[ii * 2 + 0]]] =
+        type_idx_map[map_vec[ii * 2 + 1]];
+    bk_type_asso[type_idx_map[map_vec[ii * 2 + 1]]] =
+        type_idx_map[map_vec[ii * 2 + 0]];
+  }
+
   sel_type = dpt.sel_types();
   sort(sel_type.begin(), sel_type.end());
   dpl_type.clear();
@@ -149,11 +202,6 @@ FixDPLR::FixDPLR(LAMMPS *lmp, int narg, char **arg)
     dpl_type.push_back(type_asso[sel_type[ii]]);
   }
 
-  pair_deepmd = (PairDeepMD *)force->pair_match("deepmd", 1);
-  if (!pair_deepmd) {
-    error->all(FLERR, "pair_style deepmd should be set before this fix\n");
-  }
-
   // set comm size needed by this fix
   comm_reverse = 3;
 }
@@ -284,7 +332,7 @@ void FixDPLR::get_valid_pairs(vector<pair<int, int> > &pairs) {
   // get type
   int *type = atom->type;
   for (int ii = 0; ii < nall; ++ii) {
-    dtype[ii] = type[ii] - 1;
+    dtype[ii] = type_idx_map[type[ii] - 1];
   }
 
   int **bondlist = neighbor->bondlist;
@@ -394,7 +442,7 @@ void FixDPLR::pre_force(int vflag) {
   vector<FLOAT_PREC> dcoord(nall * 3, 0.);
   // get type
   for (int ii = 0; ii < nall; ++ii) {
-    dtype[ii] = type[ii] - 1;
+    dtype[ii] = type_idx_map[type[ii] - 1];
   }
   // get box
   dbox[0] = domain->h[0];  // xx
@@ -518,7 +566,7 @@ void FixDPLR::post_force(int vflag) {
   {
     int *type = atom->type;
     for (int ii = 0; ii < nall; ++ii) {
-      dtype[ii] = type[ii] - 1;
+      dtype[ii] = type_idx_map[type[ii] - 1];
     }
     dbox[0] = domain->h[0];  // xx
     dbox[4] = domain->h[1];  // yy
diff --git a/source/lmp/fix_dplr.h b/source/lmp/fix_dplr.h
index cf1a4b16e1..23ae1c818d 100644
--- a/source/lmp/fix_dplr.h
+++ b/source/lmp/fix_dplr.h
@@ -78,6 +78,7 @@ class FixDPLR : public Fix {
   double qe2f;
   void update_efield_variables();
   enum { NONE, CONSTANT, EQUAL };
+  std::vector<int> type_idx_map;
 };
 }  // namespace LAMMPS_NS
 
diff --git a/source/lmp/pair_deepmd.cpp b/source/lmp/pair_deepmd.cpp
index 5e60213f08..ec53a1dc99 100644
--- a/source/lmp/pair_deepmd.cpp
+++ b/source/lmp/pair_deepmd.cpp
@@ -1163,8 +1163,10 @@ void PairDeepMD::coeff(int narg, char **arg) {
     }
 
     type_idx_map.clear();
+    type_names.clear();
     while (iarg < narg) {
       std::string type_name = arg[iarg];
+      type_names.push_back(type_name);
       bool found_element = false;
       for (int ii = 0; ii < type_map.size(); ++ii) {
         if (type_map[ii] == type_name) {
diff --git a/source/lmp/pair_deepmd.h b/source/lmp/pair_deepmd.h
index e811bc99b9..d04ea97916 100644
--- a/source/lmp/pair_deepmd.h
+++ b/source/lmp/pair_deepmd.h
@@ -75,6 +75,7 @@ class PairDeepMD : public Pair {
   std::string get_file_content(const std::string &model);
   std::vector<std::string> get_file_content(
       const std::vector<std::string> &models);
+  std::vector<std::string> type_names;
 
  protected:
   virtual void allocate();
diff --git a/source/lmp/tests/test_dplr.py b/source/lmp/tests/test_dplr.py
index d9322702a8..ceebb71310 100644
--- a/source/lmp/tests/test_dplr.py
+++ b/source/lmp/tests/test_dplr.py
@@ -20,6 +20,7 @@
 dipole_pbtxt_file = Path(__file__).parent / "lrdipole.pbtxt"
 dipole_pb_file = Path(__file__).parent / "lrdipole.pb"
 data_file = Path(__file__).parent / "data.lmp"
+data_type_map_file = Path(__file__).parent / "data_type_map.lmp"
 
 # this is as the same as python and c++ tests, test_deeppot_a.py
 expected_e_sr = -40.56538550
@@ -252,6 +253,7 @@
 )
 mol_list = np.array([1, 2, 1, 1, 2, 2, 1, 2])
 type_OH = np.array([1, 1, 2, 2, 2, 2, 3, 3])
+type_HO = np.array([2, 2, 1, 1, 1, 1, 3, 3])
 charge = np.array([6, 6, 1, 1, 1, 1, -8, -8])
 bond_list = (((1, 7), (2, 8)),)
 mass_list = np.array([15.99940, 1.00794, 15.99940])
@@ -275,19 +277,22 @@ def setup_module():
     write_lmp_data_full(
         box, coord, mol_list, type_OH, charge, data_file, bond_list, mass_list
     )
+    write_lmp_data_full(
+        box, coord, mol_list, type_HO, charge, data_type_map_file, bond_list, mass_list
+    )
 
 
 def teardown_module():
     os.remove(data_file)
 
 
-def _lammps(data_file) -> PyLammps:
+def _lammps(data_file, exclude_type="1 3") -> PyLammps:
     lammps = PyLammps()
     lammps.units("metal")
     lammps.boundary("p p p")
     lammps.atom_style("full")
     lammps.neighbor("0.2 bin")
-    lammps.neigh_modify("every 1 delay 0 check no exclude type 1 3")
+    lammps.neigh_modify("every 1 delay 0 check no exclude type " + exclude_type)
     lammps.read_data(data_file.resolve())
     lammps.timestep(0.0005)
     lammps.fix("1 all nve")
@@ -301,6 +306,13 @@ def lammps():
     lmp.close()
 
 
+@pytest.fixture
+def lammps_type_map():
+    lmp = _lammps(data_file=data_type_map_file, exclude_type="2 3")
+    yield lmp
+    lmp.close()
+
+
 def test_pair_deepmd_sr(lammps):
     lammps.pair_style(f"deepmd {pb_file.resolve()}")
     lammps.pair_coeff("* *")
@@ -460,3 +472,33 @@ def test_min_dplr(lammps):
         assert lammps.atoms[ii].force == pytest.approx(
             expected_f_min_step1[lammps.atoms[ii].id - 1]
         )
+
+
+def test_pair_deepmd_lr_type_map(lammps_type_map):
+    lammps_type_map.pair_style(f"deepmd {pb_file.resolve()}")
+    lammps_type_map.pair_coeff("* * H O")
+    lammps_type_map.bond_style("zero")
+    lammps_type_map.bond_coeff("*")
+    lammps_type_map.special_bonds("lj/coul 1 1 1 angle no")
+    lammps_type_map.kspace_style("pppm/dplr 1e-5")
+    lammps_type_map.kspace_modify(
+        f"gewald {beta:.2f} diff ik mesh {mesh:d} {mesh:d} {mesh:d}"
+    )
+    lammps_type_map.fix(
+        f"0 all dplr model {pb_file.resolve()} type_associate 2 3 bond_type 1"
+    )
+    lammps_type_map.fix_modify("0 virial yes")
+    lammps_type_map.run(0)
+    for ii in range(8):
+        if lammps_type_map.atoms[ii].id > 6:
+            assert lammps_type_map.atoms[ii].position == pytest.approx(
+                expected_WC[lammps_type_map.atoms[ii].id - 7]
+            )
+    assert lammps_type_map.eval("elong") == pytest.approx(expected_e_kspace)
+    assert lammps_type_map.eval("pe") == pytest.approx(expected_e_lr)
+    for ii in range(8):
+        if lammps_type_map.atoms[ii].id <= 6:
+            assert lammps_type_map.atoms[ii].force == pytest.approx(
+                expected_f_lr[lammps_type_map.atoms[ii].id - 1]
+            )
+    lammps_type_map.run(1)

From c6829bcd619f179510f7d1c66be91b292ba6b0fb Mon Sep 17 00:00:00 2001
From: Jinzhe Zeng <jinzhe.zeng@rutgers.edu>
Date: Thu, 7 Sep 2023 21:23:15 -0400
Subject: [PATCH 12/63] check status of allocate_temp (#2797)

It's the same as #2782, but one line was missing in that PR.

Signed-off-by: Jinzhe Zeng <jinzhe.zeng@rutgers.edu>
---
 source/op/prod_env_mat_multi_device.cc | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/source/op/prod_env_mat_multi_device.cc b/source/op/prod_env_mat_multi_device.cc
index d423e8a108..a8882fb5f4 100644
--- a/source/op/prod_env_mat_multi_device.cc
+++ b/source/op/prod_env_mat_multi_device.cc
@@ -1549,8 +1549,11 @@ static int _norm_copy_coord_gpu(OpKernelContext* context,
     // Tensor cpy_temp;
     TensorShape cpy_shape;
     cpy_shape.AddDim(mem_cpy * 3);
-    context->allocate_temp(DataTypeToEnum<FPTYPE>::value, cpy_shape,
-                           tensor_list + 3);
+    status = context->allocate_temp(DataTypeToEnum<FPTYPE>::value, cpy_shape,
+                                    tensor_list + 3);
+    if (!status.ok()) {
+      return false;
+    }
     // Tensor t_temp;
     TensorShape t_shape;
     t_shape.AddDim(mem_cpy * 2);

From 6e306fa6eb8fb2841bf31e8a53344f384bb8f577 Mon Sep 17 00:00:00 2001
From: Jinzhe Zeng <jinzhe.zeng@rutgers.edu>
Date: Sun, 10 Sep 2023 20:04:18 -0400
Subject: [PATCH 13/63] migrate Python build backend to scikit-build-core
 (#2798)

This PR migrates the Python build backend from scikit-build to
[scikit-build-core](https://github.com/scikit-build/scikit-build-core),
which is being actively developed.

There are no breaking changes for users; all tests should pass
without modification.

The behavior of editable installation (`pip install -e .`) has changed.
Previously, the compiled binary libraries (e.g. `libdeepmd_op.so`) were
installed into the source directory. With this PR, the library files are
installed into `site-packages` separately, while the Python files
remain in the source tree and can be modified for development.

---------

Signed-off-by: Jinzhe Zeng <jinzhe.zeng@rutgers.edu>
---
 .gitignore                                    |   2 +-
 MANIFEST.in                                   |   9 -
 backend/__init__.py                           |   1 +
 backend/dp_backend.py                         |  22 ++-
 backend/dynamic_metadata.py                   |  87 +++++++++
 backend/find_tensorflow.py                    |   6 +
 backend/read_env.py                           | 109 +++++++++++
 deepmd/.gitignore                             |   2 -
 deepmd/env.py                                 |  14 +-
 deepmd/lmp.py                                 |   2 +-
 {source => deepmd}/op/_add_flt_nvnmd_grad.py  |   0
 {source => deepmd}/op/_copy_flt_nvnmd_grad.py |   0
 .../op/_dotmul_flt_nvnmd_grad.py              |   0
 {source => deepmd}/op/_flt_nvnmd_grad.py      |   0
 {source => deepmd}/op/_gelu.py                |   0
 {source => deepmd}/op/_map_flt_nvnmd_grad.py  |   0
 .../op/_matmul_fitnet_nvnmd_grad.py           |   0
 .../op/_matmul_flt2fix_nvnmd.py               |   0
 .../op/_matmul_flt_nvnmd_grad.py              |   0
 {source => deepmd}/op/_mul_flt_nvnmd_grad.py  |   0
 {source => deepmd}/op/_prod_force_grad.py     |   0
 .../op/_prod_force_se_a_grad.py               |   0
 .../op/_prod_force_se_a_mask_grad.py          |   0
 .../op/_prod_force_se_r_grad.py               |   0
 {source => deepmd}/op/_prod_virial_grad.py    |   0
 .../op/_prod_virial_se_a_grad.py              |   0
 .../op/_prod_virial_se_r_grad.py              |   0
 {source => deepmd}/op/_quantize_nvnmd_grad.py |   0
 {source => deepmd}/op/_soft_min_force_grad.py |   0
 .../op/_soft_min_virial_grad.py               |   0
 {source => deepmd}/op/_tabulate_grad.py       |   0
 .../op/_tanh4_flt_nvnmd_grad.py               |   0
 deepmd_cli/main.py                            |  18 +-
 pyproject.toml                                |  56 +++++-
 setup.py                                      | 174 ------------------
 source/api_c/CMakeLists.txt                   |   2 +-
 source/api_cc/CMakeLists.txt                  |   2 +-
 source/config/CMakeLists.txt                  |   3 +-
 source/config/__init__.py                     |   2 +
 source/ipi/CMakeLists.txt                     |  10 +-
 source/lib/CMakeLists.txt                     |   2 +-
 source/lib/src/cuda/CMakeLists.txt            |   2 +-
 source/lib/src/cuda/cudart/CMakeLists.txt     |   2 +-
 source/lib/src/rocm/CMakeLists.txt            |   2 +-
 source/lmp/plugin/CMakeLists.txt              |   2 +-
 source/op/CMakeLists.txt                      |   6 +-
 46 files changed, 300 insertions(+), 237 deletions(-)
 delete mode 100644 MANIFEST.in
 create mode 100644 backend/__init__.py
 create mode 100644 backend/dynamic_metadata.py
 create mode 100644 backend/read_env.py
 rename {source => deepmd}/op/_add_flt_nvnmd_grad.py (100%)
 rename {source => deepmd}/op/_copy_flt_nvnmd_grad.py (100%)
 rename {source => deepmd}/op/_dotmul_flt_nvnmd_grad.py (100%)
 rename {source => deepmd}/op/_flt_nvnmd_grad.py (100%)
 rename {source => deepmd}/op/_gelu.py (100%)
 rename {source => deepmd}/op/_map_flt_nvnmd_grad.py (100%)
 rename {source => deepmd}/op/_matmul_fitnet_nvnmd_grad.py (100%)
 rename {source => deepmd}/op/_matmul_flt2fix_nvnmd.py (100%)
 rename {source => deepmd}/op/_matmul_flt_nvnmd_grad.py (100%)
 rename {source => deepmd}/op/_mul_flt_nvnmd_grad.py (100%)
 rename {source => deepmd}/op/_prod_force_grad.py (100%)
 rename {source => deepmd}/op/_prod_force_se_a_grad.py (100%)
 rename {source => deepmd}/op/_prod_force_se_a_mask_grad.py (100%)
 rename {source => deepmd}/op/_prod_force_se_r_grad.py (100%)
 rename {source => deepmd}/op/_prod_virial_grad.py (100%)
 rename {source => deepmd}/op/_prod_virial_se_a_grad.py (100%)
 rename {source => deepmd}/op/_prod_virial_se_r_grad.py (100%)
 rename {source => deepmd}/op/_quantize_nvnmd_grad.py (100%)
 rename {source => deepmd}/op/_soft_min_force_grad.py (100%)
 rename {source => deepmd}/op/_soft_min_virial_grad.py (100%)
 rename {source => deepmd}/op/_tabulate_grad.py (100%)
 rename {source => deepmd}/op/_tanh4_flt_nvnmd_grad.py (100%)
 delete mode 100644 setup.py
 create mode 100644 source/config/__init__.py

diff --git a/.gitignore b/.gitignore
index 7401566afd..82d3e4a7da 100644
--- a/.gitignore
+++ b/.gitignore
@@ -22,7 +22,7 @@ _skbuild
 deepmd_kit.egg-info/
 dist
 .eggs
-_version.py
+/deepmd/_version.py
 venv*
 .vscode/**
 _build
diff --git a/MANIFEST.in b/MANIFEST.in
deleted file mode 100644
index 007e125125..0000000000
--- a/MANIFEST.in
+++ /dev/null
@@ -1,9 +0,0 @@
-prune source/tests
-prune source/api_c/tests
-prune source/api_cc/tests
-prune source/lib/tests
-prune source/lmp/tests
-prune doc
-prune examples
-prune data
-prune .github
diff --git a/backend/__init__.py b/backend/__init__.py
new file mode 100644
index 0000000000..6ceb116d85
--- /dev/null
+++ b/backend/__init__.py
@@ -0,0 +1 @@
+# SPDX-License-Identifier: LGPL-3.0-or-later
diff --git a/backend/dp_backend.py b/backend/dp_backend.py
index 9e5932be74..d28afdb239 100644
--- a/backend/dp_backend.py
+++ b/backend/dp_backend.py
@@ -4,12 +4,14 @@
     List,
 )
 
-from find_tensorflow import (
+from scikit_build_core import build as _orig
+
+from .find_tensorflow import (
     find_tensorflow,
 )
-
-# TODO: switch to scikit_build_core after it is available
-from setuptools import build_meta as _orig
+from .read_env import (
+    set_scikit_build_env,
+)
 
 __all__ = [
     "build_sdist",
@@ -24,10 +26,14 @@ def __dir__() -> List[str]:
     return __all__
 
 
+set_scikit_build_env()
+
 prepare_metadata_for_build_wheel = _orig.prepare_metadata_for_build_wheel
 build_wheel = _orig.build_wheel
 build_sdist = _orig.build_sdist
 get_requires_for_build_sdist = _orig.get_requires_for_build_sdist
+prepare_metadata_for_build_editable = _orig.prepare_metadata_for_build_editable
+build_editable = _orig.build_editable
 
 
 def get_requires_for_build_wheel(
@@ -36,7 +42,7 @@ def get_requires_for_build_wheel(
     return _orig.get_requires_for_build_wheel(config_settings) + find_tensorflow()[1]
 
 
-# TODO: export get_requires_for_build_editable, prepare_metadata_for_build_editable, build_editable
-# after scikit-build is ready
-# See https://github.com/scikit-build/scikit-build/issues/740
-# Now we use the legacy-editable mode
+def get_requires_for_build_editable(
+    config_settings: dict,
+) -> List[str]:
+    return _orig.get_requires_for_build_editable(config_settings) + find_tensorflow()[1]
diff --git a/backend/dynamic_metadata.py b/backend/dynamic_metadata.py
new file mode 100644
index 0000000000..1270b6031e
--- /dev/null
+++ b/backend/dynamic_metadata.py
@@ -0,0 +1,87 @@
+# SPDX-License-Identifier: LGPL-3.0-or-later
+from typing import (
+    Dict,
+    List,
+    Optional,
+)
+
+from .find_tensorflow import (
+    get_tf_requirement,
+)
+from .read_env import (
+    get_argument_from_env,
+)
+
+__all__ = ["dynamic_metadata"]
+
+
+def __dir__() -> List[str]:
+    return __all__
+
+
+def dynamic_metadata(
+    field: str,
+    settings: Optional[Dict[str, object]] = None,
+) -> str:
+    assert field in ["optional-dependencies", "entry-points", "scripts"]
+    _, _, find_libpython_requires, extra_scripts, tf_version = get_argument_from_env()
+    if field == "scripts":
+        return {
+            "dp": "deepmd_cli.main:main",
+            **extra_scripts,
+        }
+    elif field == "optional-dependencies":
+        return {
+            "test": [
+                "dpdata>=0.1.9",
+                "ase",
+                "pytest",
+                "pytest-cov",
+                "pytest-sugar",
+            ],
+            "docs": [
+                "sphinx>=3.1.1",
+                "sphinx_rtd_theme>=1.0.0rc1",
+                "sphinx_markdown_tables",
+                "myst-nb",
+                "breathe",
+                "exhale",
+                "numpydoc",
+                "ase",
+                "deepmodeling-sphinx>=0.1.0",
+                "dargs>=0.3.4",
+                "sphinx-argparse",
+                "pygments-lammps",
+                "sphinxcontrib-bibtex",
+            ],
+            "lmp": [
+                "lammps~=2023.8.2.0.0; platform_system=='Linux'",
+                "lammps~=2023.8.2.0.0; platform_system!='Linux'",
+                *find_libpython_requires,
+            ],
+            "ipi": [
+                "i-PI",
+                *find_libpython_requires,
+            ],
+            **get_tf_requirement(tf_version),
+            "cu11": [
+                "nvidia-cuda-runtime-cu11",
+                "nvidia-cublas-cu11",
+                "nvidia-cufft-cu11",
+                "nvidia-curand-cu11",
+                "nvidia-cusolver-cu11",
+                "nvidia-cusparse-cu11",
+                "nvidia-cudnn-cu11",
+                "nvidia-cuda-nvcc-cu11",
+            ],
+            "cu12": [
+                "nvidia-cuda-runtime-cu12",
+                "nvidia-cublas-cu12",
+                "nvidia-cufft-cu12",
+                "nvidia-curand-cu12",
+                "nvidia-cusolver-cu12",
+                "nvidia-cusparse-cu12",
+                "nvidia-cudnn-cu12",
+                "nvidia-cuda-nvcc-cu12",
+            ],
+        }
diff --git a/backend/find_tensorflow.py b/backend/find_tensorflow.py
index 567d09b75e..8fe3cedb63 100644
--- a/backend/find_tensorflow.py
+++ b/backend/find_tensorflow.py
@@ -1,6 +1,9 @@
 # SPDX-License-Identifier: LGPL-3.0-or-later
 import os
 import site
+from functools import (
+    lru_cache,
+)
 from importlib.machinery import (
     FileFinder,
 )
@@ -25,6 +28,7 @@
 )
 
 
+@lru_cache()
 def find_tensorflow() -> Tuple[Optional[str], List[str]]:
     """Find TensorFlow library.
 
@@ -89,6 +93,7 @@ def find_tensorflow() -> Tuple[Optional[str], List[str]]:
     return tf_install_dir, requires
 
 
+@lru_cache()
 def get_tf_requirement(tf_version: str = "") -> dict:
     """Get TensorFlow requirement (CPU) when TF is not installed.
 
@@ -143,6 +148,7 @@ def get_tf_requirement(tf_version: str = "") -> dict:
         }
 
 
+@lru_cache()
 def get_tf_version(tf_path: Union[str, Path]) -> str:
     """Get TF version from a TF Python library path.
 
diff --git a/backend/read_env.py b/backend/read_env.py
new file mode 100644
index 0000000000..575c1a57de
--- /dev/null
+++ b/backend/read_env.py
@@ -0,0 +1,109 @@
+# SPDX-License-Identifier: LGPL-3.0-or-later
+"""Read environment variables to configure the build."""
+
+import os
+from functools import (
+    lru_cache,
+)
+from typing import (
+    Tuple,
+)
+
+from packaging.version import (
+    Version,
+)
+
+from .find_tensorflow import (
+    find_tensorflow,
+    get_tf_version,
+)
+
+
+@lru_cache()
+def get_argument_from_env() -> Tuple[str, list, list, dict, str]:
+    """Get the arguments from environment variables.
+
+    The environment variables are assumed to be not changed during the build.
+
+    Returns
+    -------
+    str
+        The minimum required CMake version.
+    list of str
+        The CMake arguments.
+    list of str
+        The requirements for the build.
+    dict
+        The extra scripts to be installed.
+    str
+        The TensorFlow version.
+    """
+    cmake_args = []
+    extra_scripts = {}
+    # get variant option from the environment variables, available: cpu, cuda, rocm
+    dp_variant = os.environ.get("DP_VARIANT", "cpu").lower()
+    if dp_variant == "cpu" or dp_variant == "":
+        cmake_minimum_required_version = "3.16"
+    elif dp_variant == "cuda":
+        cmake_minimum_required_version = "3.23"
+        cmake_args.append("-DUSE_CUDA_TOOLKIT:BOOL=TRUE")
+        cuda_root = os.environ.get("CUDAToolkit_ROOT")
+        if cuda_root:
+            cmake_args.append(f"-DCUDAToolkit_ROOT:STRING={cuda_root}")
+    elif dp_variant == "rocm":
+        cmake_minimum_required_version = "3.21"
+        cmake_args.append("-DUSE_ROCM_TOOLKIT:BOOL=TRUE")
+        rocm_root = os.environ.get("ROCM_ROOT")
+        if rocm_root:
+            cmake_args.append(f"-DCMAKE_HIP_COMPILER_ROCM_ROOT:STRING={rocm_root}")
+        hipcc_flags = os.environ.get("HIP_HIPCC_FLAGS")
+        if hipcc_flags:
+            cmake_args.append(f"-DHIP_HIPCC_FLAGS:STRING={hipcc_flags}")
+    else:
+        raise RuntimeError("Unsupported DP_VARIANT option: %s" % dp_variant)
+
+    if os.environ.get("DP_BUILD_TESTING", "0") == "1":
+        cmake_args.append("-DBUILD_TESTING:BOOL=TRUE")
+    if os.environ.get("DP_ENABLE_NATIVE_OPTIMIZATION", "0") == "1":
+        cmake_args.append("-DENABLE_NATIVE_OPTIMIZATION:BOOL=TRUE")
+    dp_lammps_version = os.environ.get("DP_LAMMPS_VERSION", "")
+    dp_ipi = os.environ.get("DP_ENABLE_IPI", "0")
+    if dp_lammps_version != "" or dp_ipi == "1":
+        cmake_args.append("-DBUILD_CPP_IF:BOOL=TRUE")
+        cmake_args.append("-DUSE_TF_PYTHON_LIBS:BOOL=TRUE")
+    else:
+        cmake_args.append("-DBUILD_CPP_IF:BOOL=FALSE")
+
+    if dp_lammps_version != "":
+        cmake_args.append(f"-DLAMMPS_VERSION={dp_lammps_version}")
+    if dp_ipi == "1":
+        cmake_args.append("-DENABLE_IPI:BOOL=TRUE")
+        extra_scripts["dp_ipi"] = "deepmd.entrypoints.ipi:dp_ipi"
+
+    tf_install_dir, _ = find_tensorflow()
+    tf_version = get_tf_version(tf_install_dir)
+    if tf_version == "" or Version(tf_version) >= Version("2.12"):
+        find_libpython_requires = []
+    else:
+        find_libpython_requires = ["find_libpython"]
+    cmake_args.append(f"-DTENSORFLOW_VERSION={tf_version}")
+
+    cmake_args = [
+        f"-DTENSORFLOW_ROOT:PATH={tf_install_dir}",
+        "-DBUILD_PY_IF:BOOL=TRUE",
+        *cmake_args,
+    ]
+    return (
+        cmake_minimum_required_version,
+        cmake_args,
+        find_libpython_requires,
+        extra_scripts,
+        tf_version,
+    )
+
+
+def set_scikit_build_env():
+    """Set scikit-build environment variables before executing scikit-build."""
+    cmake_minimum_required_version, cmake_args, _, _, _ = get_argument_from_env()
+    os.environ["SKBUILD_CMAKE_MINIMUM_VERSION"] = cmake_minimum_required_version
+    os.environ["SKBUILD_CMAKE_ARGS"] = ";".join(cmake_args)
diff --git a/deepmd/.gitignore b/deepmd/.gitignore
index b2b9057ea2..b2d7614637 100644
--- a/deepmd/.gitignore
+++ b/deepmd/.gitignore
@@ -1,4 +1,2 @@
-op/_*.py
 pkg_config
 run_config.ini
-!op/__init__.py
diff --git a/deepmd/env.py b/deepmd/env.py
index 615e89f3ac..d8875cabd2 100644
--- a/deepmd/env.py
+++ b/deepmd/env.py
@@ -27,6 +27,8 @@
     Version,
 )
 
+import deepmd.lib
+
 if TYPE_CHECKING:
     from types import (
         ModuleType,
@@ -101,7 +103,9 @@ def dlopen_library(module: str, filename: str):
     "TF_VERSION",
 ]
 
-SHARED_LIB_MODULE = "op"
+SHARED_LIB_MODULE = "lib"
+SHARED_LIB_DIR = Path(deepmd.lib.__path__[0])
+CONFIG_FILE = SHARED_LIB_DIR / "run_config.ini"
 
 # Python library version
 try:
@@ -361,11 +365,7 @@ def get_module(module_name: str) -> "ModuleType":
         ext = ".so"
         prefix = "lib"
 
-    module_file = (
-        (Path(__file__).parent / SHARED_LIB_MODULE / (prefix + module_name))
-        .with_suffix(ext)
-        .resolve()
-    )
+    module_file = (SHARED_LIB_DIR / (prefix + module_name)).with_suffix(ext).resolve()
 
     if not module_file.is_file():
         raise FileNotFoundError(f"module {module_name} does not exist")
@@ -433,7 +433,7 @@ def get_module(module_name: str) -> "ModuleType":
 
 
 def _get_package_constants(
-    config_file: Path = Path(__file__).parent / "run_config.ini",
+    config_file: Path = CONFIG_FILE,
 ) -> Dict[str, str]:
     """Read package constants set at compile time by CMake to dictionary.
 
diff --git a/deepmd/lmp.py b/deepmd/lmp.py
index e5a3d4904f..a955844758 100644
--- a/deepmd/lmp.py
+++ b/deepmd/lmp.py
@@ -64,7 +64,7 @@ def get_library_path(module: str) -> List[str]:
     raise RuntimeError("Unsupported platform")
 
 tf_dir = tf.sysconfig.get_lib()
-op_dir = str((Path(__file__).parent / "op").absolute())
+op_dir = str((Path(__file__).parent / "lib").absolute())
 
 
 cuda_library_paths = []
diff --git a/source/op/_add_flt_nvnmd_grad.py b/deepmd/op/_add_flt_nvnmd_grad.py
similarity index 100%
rename from source/op/_add_flt_nvnmd_grad.py
rename to deepmd/op/_add_flt_nvnmd_grad.py
diff --git a/source/op/_copy_flt_nvnmd_grad.py b/deepmd/op/_copy_flt_nvnmd_grad.py
similarity index 100%
rename from source/op/_copy_flt_nvnmd_grad.py
rename to deepmd/op/_copy_flt_nvnmd_grad.py
diff --git a/source/op/_dotmul_flt_nvnmd_grad.py b/deepmd/op/_dotmul_flt_nvnmd_grad.py
similarity index 100%
rename from source/op/_dotmul_flt_nvnmd_grad.py
rename to deepmd/op/_dotmul_flt_nvnmd_grad.py
diff --git a/source/op/_flt_nvnmd_grad.py b/deepmd/op/_flt_nvnmd_grad.py
similarity index 100%
rename from source/op/_flt_nvnmd_grad.py
rename to deepmd/op/_flt_nvnmd_grad.py
diff --git a/source/op/_gelu.py b/deepmd/op/_gelu.py
similarity index 100%
rename from source/op/_gelu.py
rename to deepmd/op/_gelu.py
diff --git a/source/op/_map_flt_nvnmd_grad.py b/deepmd/op/_map_flt_nvnmd_grad.py
similarity index 100%
rename from source/op/_map_flt_nvnmd_grad.py
rename to deepmd/op/_map_flt_nvnmd_grad.py
diff --git a/source/op/_matmul_fitnet_nvnmd_grad.py b/deepmd/op/_matmul_fitnet_nvnmd_grad.py
similarity index 100%
rename from source/op/_matmul_fitnet_nvnmd_grad.py
rename to deepmd/op/_matmul_fitnet_nvnmd_grad.py
diff --git a/source/op/_matmul_flt2fix_nvnmd.py b/deepmd/op/_matmul_flt2fix_nvnmd.py
similarity index 100%
rename from source/op/_matmul_flt2fix_nvnmd.py
rename to deepmd/op/_matmul_flt2fix_nvnmd.py
diff --git a/source/op/_matmul_flt_nvnmd_grad.py b/deepmd/op/_matmul_flt_nvnmd_grad.py
similarity index 100%
rename from source/op/_matmul_flt_nvnmd_grad.py
rename to deepmd/op/_matmul_flt_nvnmd_grad.py
diff --git a/source/op/_mul_flt_nvnmd_grad.py b/deepmd/op/_mul_flt_nvnmd_grad.py
similarity index 100%
rename from source/op/_mul_flt_nvnmd_grad.py
rename to deepmd/op/_mul_flt_nvnmd_grad.py
diff --git a/source/op/_prod_force_grad.py b/deepmd/op/_prod_force_grad.py
similarity index 100%
rename from source/op/_prod_force_grad.py
rename to deepmd/op/_prod_force_grad.py
diff --git a/source/op/_prod_force_se_a_grad.py b/deepmd/op/_prod_force_se_a_grad.py
similarity index 100%
rename from source/op/_prod_force_se_a_grad.py
rename to deepmd/op/_prod_force_se_a_grad.py
diff --git a/source/op/_prod_force_se_a_mask_grad.py b/deepmd/op/_prod_force_se_a_mask_grad.py
similarity index 100%
rename from source/op/_prod_force_se_a_mask_grad.py
rename to deepmd/op/_prod_force_se_a_mask_grad.py
diff --git a/source/op/_prod_force_se_r_grad.py b/deepmd/op/_prod_force_se_r_grad.py
similarity index 100%
rename from source/op/_prod_force_se_r_grad.py
rename to deepmd/op/_prod_force_se_r_grad.py
diff --git a/source/op/_prod_virial_grad.py b/deepmd/op/_prod_virial_grad.py
similarity index 100%
rename from source/op/_prod_virial_grad.py
rename to deepmd/op/_prod_virial_grad.py
diff --git a/source/op/_prod_virial_se_a_grad.py b/deepmd/op/_prod_virial_se_a_grad.py
similarity index 100%
rename from source/op/_prod_virial_se_a_grad.py
rename to deepmd/op/_prod_virial_se_a_grad.py
diff --git a/source/op/_prod_virial_se_r_grad.py b/deepmd/op/_prod_virial_se_r_grad.py
similarity index 100%
rename from source/op/_prod_virial_se_r_grad.py
rename to deepmd/op/_prod_virial_se_r_grad.py
diff --git a/source/op/_quantize_nvnmd_grad.py b/deepmd/op/_quantize_nvnmd_grad.py
similarity index 100%
rename from source/op/_quantize_nvnmd_grad.py
rename to deepmd/op/_quantize_nvnmd_grad.py
diff --git a/source/op/_soft_min_force_grad.py b/deepmd/op/_soft_min_force_grad.py
similarity index 100%
rename from source/op/_soft_min_force_grad.py
rename to deepmd/op/_soft_min_force_grad.py
diff --git a/source/op/_soft_min_virial_grad.py b/deepmd/op/_soft_min_virial_grad.py
similarity index 100%
rename from source/op/_soft_min_virial_grad.py
rename to deepmd/op/_soft_min_virial_grad.py
diff --git a/source/op/_tabulate_grad.py b/deepmd/op/_tabulate_grad.py
similarity index 100%
rename from source/op/_tabulate_grad.py
rename to deepmd/op/_tabulate_grad.py
diff --git a/source/op/_tanh4_flt_nvnmd_grad.py b/deepmd/op/_tanh4_flt_nvnmd_grad.py
similarity index 100%
rename from source/op/_tanh4_flt_nvnmd_grad.py
rename to deepmd/op/_tanh4_flt_nvnmd_grad.py
diff --git a/deepmd_cli/main.py b/deepmd_cli/main.py
index e3213d8b00..f707bf7589 100644
--- a/deepmd_cli/main.py
+++ b/deepmd_cli/main.py
@@ -1,7 +1,9 @@
 # SPDX-License-Identifier: LGPL-3.0-or-later
 import argparse
-import imp
+import importlib.util
 import logging
+import os
+import sys
 import textwrap
 from typing import (
     List,
@@ -12,11 +14,15 @@
 def load_child_module(name):
     """Load a child module without loading its parent module."""
     names = name.split(".")
-    path = None
-    for name in names:
-        f, path, info = imp.find_module(name, path)
-        path = [path]
-    return imp.load_module(name, f, path[0], info)
+    parent_spec = importlib.util.find_spec(names[0])
+    paths = os.path.join(*names[1:]) + ".py"
+    spec = importlib.util.spec_from_file_location(
+        name, os.path.join(parent_spec.submodule_search_locations[0], paths)
+    )
+    module = importlib.util.module_from_spec(spec)
+    sys.modules[name] = module
+    spec.loader.exec_module(module)
+    return module
 
 
 __version__ = load_child_module("deepmd._version").__version__
diff --git a/pyproject.toml b/pyproject.toml
index 687e0284cc..7b8f55d562 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,19 +1,16 @@
 [build-system]
 requires = [
-    "setuptools>=61",
-    "setuptools_scm[toml]>=6.2",
-    "wheel",
-    "scikit-build",
-    "cmake",
-    # see https://github.com/scikit-build/scikit-build/releases/tag/0.13.1
-    "ninja; platform_system!='Windows'",
+    # dynamic metadata API is still unstable
+    # TODO: unpin the upper bound when it is stable
+    "scikit-build-core>=0.5,<0.6",
+    "packaging",
 ]
-build-backend = "dp_backend"
-backend-path = ["backend"]
+build-backend = "backend.dp_backend"
+backend-path = ["."]
 
 [project]
 name = "deepmd-kit"
-dynamic = ["version", "optional-dependencies", "entry-points"]
+dynamic = ["version", "optional-dependencies", "scripts"]
 description = "A deep learning package for many-body potential energy representation and molecular dynamics"
 authors = [
   {name = "DeepModeling"},
@@ -53,6 +50,9 @@ requires-python = ">=3.7"
 readme = "README.md"
 keywords = ["deepmd"]
 
+[project.entry-points."lammps.plugins"]
+deepmd = "deepmd.lmp:get_op_dir"
+
 [project.urls]
 Homepage = "https://github.com/deepmodeling/deepmd-kit"
 documentation = "https://docs.deepmodeling.com/projects/deepmd"
@@ -61,6 +61,42 @@ repository = "https://github.com/deepmodeling/deepmd-kit"
 [tool.setuptools_scm]
 write_to = "deepmd/_version.py"
 
+[tool.scikit-build]
+experimental = true
+minimum-version = "0.5"
+cmake.source-dir = "source"
+sdist.include = [
+    "/deepmd/_version.py",
+]
+sdist.exclude = [
+    "/source/tests",
+    "/source/api_c/tests",
+    "/source/api_cc/tests",
+    "/source/lib/tests",
+    "/source/lmp/tests",
+    "/doc",
+    "/examples",
+    "/data",
+    "/.github",
+]
+wheel.packages = [
+    "deepmd",
+    "deepmd_cli",
+]
+wheel.py-api = "py37"
+build-dir = "build/{wheel_tag}"
+
+[tool.scikit-build.metadata.version]
+provider = "scikit_build_core.metadata.setuptools_scm"
+
+[tool.scikit-build.metadata.optional-dependencies]
+provider = "backend.dynamic_metadata"
+provider-path = "backend"
+
+[tool.scikit-build.metadata.scripts]
+provider = "backend.dynamic_metadata"
+provider-path = "backend"
+
 [tool.cibuildwheel]
 test-command = [
     "python -m deepmd -h",
diff --git a/setup.py b/setup.py
deleted file mode 100644
index 7b69c14c40..0000000000
--- a/setup.py
+++ /dev/null
@@ -1,174 +0,0 @@
-# SPDX-License-Identifier: LGPL-3.0-or-later
-"""Setup script for DeePMD-kit package."""
-
-import os
-import sys
-
-from packaging.version import (
-    Version,
-)
-from skbuild import (
-    setup,
-)
-from wheel.bdist_wheel import (
-    bdist_wheel,
-)
-
-topdir = os.path.abspath(os.path.dirname(__file__))
-sys.path.insert(0, os.path.join(topdir, "backend"))
-
-from find_tensorflow import (
-    find_tensorflow,
-    get_tf_requirement,
-    get_tf_version,
-)
-
-cmake_args = []
-extra_scripts = []
-# get variant option from the environment varibles, available: cpu, cuda, rocm
-dp_variant = os.environ.get("DP_VARIANT", "cpu").lower()
-if dp_variant == "cpu" or dp_variant == "":
-    cmake_minimum_required_version = "3.16"
-elif dp_variant == "cuda":
-    cmake_minimum_required_version = "3.23"
-    cmake_args.append("-DUSE_CUDA_TOOLKIT:BOOL=TRUE")
-    cuda_root = os.environ.get("CUDAToolkit_ROOT")
-    if cuda_root:
-        cmake_args.append(f"-DCUDAToolkit_ROOT:STRING={cuda_root}")
-elif dp_variant == "rocm":
-    cmake_minimum_required_version = "3.21"
-    cmake_args.append("-DUSE_ROCM_TOOLKIT:BOOL=TRUE")
-    rocm_root = os.environ.get("ROCM_ROOT")
-    if rocm_root:
-        cmake_args.append(f"-DCMAKE_HIP_COMPILER_ROCM_ROOT:STRING={rocm_root}")
-    hipcc_flags = os.environ.get("HIP_HIPCC_FLAGS")
-    if hipcc_flags:
-        cmake_args.append(f"-DHIP_HIPCC_FLAGS:STRING={hipcc_flags}")
-else:
-    raise RuntimeError("Unsupported DP_VARIANT option: %s" % dp_variant)
-
-if os.environ.get("DP_BUILD_TESTING", "0") == "1":
-    cmake_args.append("-DBUILD_TESTING:BOOL=TRUE")
-if os.environ.get("DP_ENABLE_NATIVE_OPTIMIZATION", "0") == "1":
-    cmake_args.append("-DENABLE_NATIVE_OPTIMIZATION:BOOL=TRUE")
-dp_lammps_version = os.environ.get("DP_LAMMPS_VERSION", "")
-dp_ipi = os.environ.get("DP_ENABLE_IPI", "0")
-if dp_lammps_version != "" or dp_ipi == "1":
-    cmake_args.append("-DBUILD_CPP_IF:BOOL=TRUE")
-    cmake_args.append("-DUSE_TF_PYTHON_LIBS:BOOL=TRUE")
-else:
-    cmake_args.append("-DBUILD_CPP_IF:BOOL=FALSE")
-
-if dp_lammps_version != "":
-    cmake_args.append(f"-DLAMMPS_VERSION={dp_lammps_version}")
-if dp_ipi == "1":
-    cmake_args.append("-DENABLE_IPI:BOOL=TRUE")
-    extra_scripts.append("dp_ipi = deepmd.entrypoints.ipi:dp_ipi")
-
-
-tf_install_dir, _ = find_tensorflow()
-tf_version = get_tf_version(tf_install_dir)
-if tf_version == "" or Version(tf_version) >= Version("2.12"):
-    find_libpython_requires = []
-else:
-    find_libpython_requires = ["find_libpython"]
-cmake_args.append(f"-DTENSORFLOW_VERSION={tf_version}")
-
-
-class bdist_wheel_abi3(bdist_wheel):
-    def get_tag(self):
-        python, abi, plat = super().get_tag()
-        if python.startswith("cp"):
-            if tf_version == "" or Version(tf_version) >= Version("2.12"):
-                return "py38", "none", plat
-            return "py37", "none", plat
-        return python, abi, plat
-
-
-# TODO: migrate packages and entry_points to pyproject.toml after scikit-build supports it
-# See also https://scikit-build.readthedocs.io/en/latest/usage.html#setuptools-options
-setup(
-    packages=[
-        "deepmd",
-        "deepmd/descriptor",
-        "deepmd/fit",
-        "deepmd/infer",
-        "deepmd/loss",
-        "deepmd/utils",
-        "deepmd/loggers",
-        "deepmd/cluster",
-        "deepmd/entrypoints",
-        "deepmd/op",
-        "deepmd/model",
-        "deepmd/train",
-        "deepmd/nvnmd",
-        "deepmd/nvnmd/data",
-        "deepmd/nvnmd/descriptor",
-        "deepmd/nvnmd/entrypoints",
-        "deepmd/nvnmd/fit",
-        "deepmd/nvnmd/utils",
-        "deepmd_cli",
-    ],
-    cmake_args=[
-        f"-DTENSORFLOW_ROOT:PATH={tf_install_dir}",
-        "-DBUILD_PY_IF:BOOL=TRUE",
-        *cmake_args,
-    ],
-    cmake_source_dir="source",
-    cmake_minimum_required_version=cmake_minimum_required_version,
-    extras_require={
-        "test": ["dpdata>=0.1.9", "ase", "pytest", "pytest-cov", "pytest-sugar"],
-        "docs": [
-            "sphinx>=3.1.1",
-            "sphinx_rtd_theme>=1.0.0rc1",
-            "sphinx_markdown_tables",
-            "myst-nb",
-            "breathe",
-            "exhale",
-            "numpydoc",
-            "ase",
-            "deepmodeling-sphinx>=0.1.0",
-            "dargs>=0.3.4",
-            "sphinx-argparse",
-            "pygments-lammps",
-            "sphinxcontrib-bibtex",
-        ],
-        "lmp": [
-            "lammps~=2023.8.2.0.0; platform_system=='Linux'",
-            "lammps~=2023.8.2.0.0; platform_system!='Linux'",
-            *find_libpython_requires,
-        ],
-        "ipi": [
-            "i-PI",
-            *find_libpython_requires,
-        ],
-        **get_tf_requirement(tf_version),
-        "cu11": [
-            "nvidia-cuda-runtime-cu11",
-            "nvidia-cublas-cu11",
-            "nvidia-cufft-cu11",
-            "nvidia-curand-cu11",
-            "nvidia-cusolver-cu11",
-            "nvidia-cusparse-cu11",
-            "nvidia-cudnn-cu11",
-            "nvidia-cuda-nvcc-cu11",
-        ],
-        "cu12": [
-            "nvidia-cuda-runtime-cu12",
-            "nvidia-cublas-cu12",
-            "nvidia-cufft-cu12",
-            "nvidia-curand-cu12",
-            "nvidia-cusolver-cu12",
-            "nvidia-cusparse-cu12",
-            "nvidia-cudnn-cu12",
-            "nvidia-cuda-nvcc-cu12",
-        ],
-    },
-    entry_points={
-        "console_scripts": ["dp = deepmd_cli.main:main", *extra_scripts],
-        "lammps.plugins": ["deepmd = deepmd.lmp:get_op_dir"],
-    },
-    cmdclass={
-        "bdist_wheel": bdist_wheel_abi3,
-    },
-)
diff --git a/source/api_c/CMakeLists.txt b/source/api_c/CMakeLists.txt
index 93b170f59c..f6e741105a 100644
--- a/source/api_c/CMakeLists.txt
+++ b/source/api_c/CMakeLists.txt
@@ -19,7 +19,7 @@ if(CMAKE_TESTING_ENABLED)
 endif()
 
 if(BUILD_PY_IF)
-  install(TARGETS ${libname} DESTINATION deepmd/op/)
+  install(TARGETS ${libname} DESTINATION deepmd/lib/)
 else(BUILD_PY_IF)
   install(
     TARGETS ${libname}
diff --git a/source/api_cc/CMakeLists.txt b/source/api_cc/CMakeLists.txt
index 7dc836a873..bdcb51a498 100644
--- a/source/api_cc/CMakeLists.txt
+++ b/source/api_cc/CMakeLists.txt
@@ -33,7 +33,7 @@ endif()
 target_compile_features(${libname} PUBLIC cxx_std_11)
 
 if(BUILD_PY_IF)
-  install(TARGETS ${libname} DESTINATION deepmd/op/)
+  install(TARGETS ${libname} DESTINATION deepmd/lib/)
 else(BUILD_PY_IF)
   install(
     TARGETS ${libname}
diff --git a/source/config/CMakeLists.txt b/source/config/CMakeLists.txt
index eb0bbc8bf4..5473b91f29 100644
--- a/source/config/CMakeLists.txt
+++ b/source/config/CMakeLists.txt
@@ -3,4 +3,5 @@
 configure_file("run_config.ini" "${CMAKE_CURRENT_BINARY_DIR}/run_config.ini"
                @ONLY)
 
-install(FILES ${CMAKE_CURRENT_BINARY_DIR}/run_config.ini DESTINATION deepmd)
+install(FILES ${CMAKE_CURRENT_BINARY_DIR}/run_config.ini __init__.py
+        DESTINATION deepmd/lib)
diff --git a/source/config/__init__.py b/source/config/__init__.py
new file mode 100644
index 0000000000..d8ba46b41f
--- /dev/null
+++ b/source/config/__init__.py
@@ -0,0 +1,2 @@
+# SPDX-License-Identifier: LGPL-3.0-or-later
+# empty file so that the module can be found
diff --git a/source/ipi/CMakeLists.txt b/source/ipi/CMakeLists.txt
index adf69f723f..158f98aea5 100644
--- a/source/ipi/CMakeLists.txt
+++ b/source/ipi/CMakeLists.txt
@@ -24,11 +24,7 @@ endif()
 target_link_libraries(${ipiname} PRIVATE ${libipiname})
 target_include_directories(${ipiname}
                            PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../3rdparty/)
-if(BUILD_PY_IF)
-  set(LIB_DIR op)
-else(BUILD_PY_IF)
-  set(LIB_DIR lib)
-endif(BUILD_PY_IF)
+set(LIB_DIR lib)
 
 if(BUILD_PY_IF AND TENSORFLOW_LINK_LIBPYTHON)
   # ignore undefined reference for libpython
@@ -63,8 +59,8 @@ if(CMAKE_TESTING_ENABLED)
 endif()
 
 if(BUILD_PY_IF)
-  install(TARGETS ${libipiname} DESTINATION deepmd/op/)
-  install(TARGETS ${ipiname} DESTINATION deepmd/op/)
+  install(TARGETS ${libipiname} DESTINATION deepmd/lib/)
+  install(TARGETS ${ipiname} DESTINATION deepmd/lib/)
 else(BUILD_PY_IF)
   install(TARGETS ${libipiname} DESTINATION lib/)
   install(TARGETS ${ipiname} DESTINATION bin/)
diff --git a/source/lib/CMakeLists.txt b/source/lib/CMakeLists.txt
index bd6b585644..5f5528de3e 100644
--- a/source/lib/CMakeLists.txt
+++ b/source/lib/CMakeLists.txt
@@ -38,7 +38,7 @@ if(CMAKE_TESTING_ENABLED)
 endif()
 
 if(BUILD_PY_IF)
-  install(TARGETS ${libname} DESTINATION deepmd/op/)
+  install(TARGETS ${libname} DESTINATION deepmd/lib/)
 else(BUILD_PY_IF)
   install(
     TARGETS ${libname}
diff --git a/source/lib/src/cuda/CMakeLists.txt b/source/lib/src/cuda/CMakeLists.txt
index bfdf9cd466..1d5ae690e1 100644
--- a/source/lib/src/cuda/CMakeLists.txt
+++ b/source/lib/src/cuda/CMakeLists.txt
@@ -56,5 +56,5 @@ if(BUILD_CPP_IF AND NOT BUILD_PY_IF)
     DESTINATION lib/)
 endif(BUILD_CPP_IF AND NOT BUILD_PY_IF)
 if(BUILD_PY_IF)
-  install(TARGETS deepmd_op_cuda DESTINATION deepmd/op/)
+  install(TARGETS deepmd_op_cuda DESTINATION deepmd/lib/)
 endif(BUILD_PY_IF)
diff --git a/source/lib/src/cuda/cudart/CMakeLists.txt b/source/lib/src/cuda/cudart/CMakeLists.txt
index 7562c2ea1b..e612ad63ed 100644
--- a/source/lib/src/cuda/cudart/CMakeLists.txt
+++ b/source/lib/src/cuda/cudart/CMakeLists.txt
@@ -9,5 +9,5 @@ if(BUILD_CPP_IF AND NOT BUILD_PY_IF)
     DESTINATION lib/)
 endif(BUILD_CPP_IF AND NOT BUILD_PY_IF)
 if(BUILD_PY_IF)
-  install(TARGETS deepmd_dyn_cudart DESTINATION deepmd/op/)
+  install(TARGETS deepmd_dyn_cudart DESTINATION deepmd/lib/)
 endif(BUILD_PY_IF)
diff --git a/source/lib/src/rocm/CMakeLists.txt b/source/lib/src/rocm/CMakeLists.txt
index f659973897..1b093977b6 100644
--- a/source/lib/src/rocm/CMakeLists.txt
+++ b/source/lib/src/rocm/CMakeLists.txt
@@ -35,5 +35,5 @@ if(BUILD_CPP_IF)
     DESTINATION lib/)
 endif(BUILD_CPP_IF)
 if(BUILD_PY_IF)
-  install(TARGETS deepmd_op_rocm DESTINATION deepmd/op/)
+  install(TARGETS deepmd_op_rocm DESTINATION deepmd/lib/)
 endif(BUILD_PY_IF)
diff --git a/source/lmp/plugin/CMakeLists.txt b/source/lmp/plugin/CMakeLists.txt
index ce879764ee..86b99fe7b5 100644
--- a/source/lmp/plugin/CMakeLists.txt
+++ b/source/lmp/plugin/CMakeLists.txt
@@ -100,7 +100,7 @@ if(DEFINED LAMMPS_SOURCE_ROOT OR DEFINED LAMMPS_VERSION)
   endif()
 
   if(BUILD_PY_IF)
-    install(TARGETS ${libname} DESTINATION deepmd/op/)
+    install(TARGETS ${libname} DESTINATION deepmd/lib/)
   else(BUILD_PY_IF)
     install(TARGETS ${libname} DESTINATION lib/)
 
diff --git a/source/op/CMakeLists.txt b/source/op/CMakeLists.txt
index 167c6c5396..7a92e259e0 100644
--- a/source/op/CMakeLists.txt
+++ b/source/op/CMakeLists.txt
@@ -50,7 +50,6 @@ file(
   prod_virial_grad_multi_device.cc
   soft_min_force_grad.cc
   soft_min_virial_grad.cc)
-file(GLOB OP_PY *.py)
 file(GLOB OP_REMAPPER_SRC optimizer/parallel.cc)
 
 add_library(${LIB_DEEPMD_OP} MODULE ${OP_SRC} ${OP_REMAPPER_SRC})
@@ -91,9 +90,8 @@ if(BUILD_PY_IF)
 endif(BUILD_PY_IF)
 
 if(BUILD_PY_IF)
-  install(TARGETS ${LIB_DEEPMD_OP} DESTINATION deepmd/op/)
-  install(TARGETS op_grads DESTINATION deepmd/op/)
-  install(FILES ${OP_PY} DESTINATION deepmd/op/)
+  install(TARGETS ${LIB_DEEPMD_OP} DESTINATION deepmd/lib/)
+  install(TARGETS op_grads DESTINATION deepmd/lib/)
 else(BUILD_PY_IF)
   install(TARGETS ${LIB_DEEPMD_OP} DESTINATION lib/)
 endif(BUILD_PY_IF)

From c8580300343ca92a31fa58f73831eaabf60971ff Mon Sep 17 00:00:00 2001
From: Jinzhe Zeng <jinzhe.zeng@rutgers.edu>
Date: Mon, 11 Sep 2023 22:55:39 -0400
Subject: [PATCH 14/63] fix np.loadtxt DeprecationWarning (#2802)

```
DeprecationWarning: loadtxt(): Parsing an integer via a float is deprecated.  To avoid this warning, you can:
    * make sure the original data is stored as integers.
    * use the `converters=` keyword argument.  If you only use
      NumPy 1.23 or later, `converters=float` will normally work.
    * Use `np.loadtxt(...).astype(np.int64)` parsing the file as
      floating point and then convert it.  (On all NumPy versions.)
  (Deprecated NumPy 1.23)
  return np.loadtxt(str(self.path), **kwargs)
```

Signed-off-by: Jinzhe Zeng <jinzhe.zeng@rutgers.edu>
---
 deepmd/utils/data.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/deepmd/utils/data.py b/deepmd/utils/data.py
index 485079b08d..57bea00fac 100644
--- a/deepmd/utils/data.py
+++ b/deepmd/utils/data.py
@@ -580,7 +580,7 @@ def _load_data(
             return np.float32(0.0), data
 
     def _load_type(self, sys_path: DPPath):
-        atom_type = (sys_path / "type.raw").load_txt(dtype=np.int32, ndmin=1)
+        atom_type = (sys_path / "type.raw").load_txt(ndmin=1).astype(np.int32)
         return atom_type
 
     def _load_type_mix(self, set_name: DPPath):

From b6ff8aaddd7dbd74f00dff97ecedaa84a5f76e75 Mon Sep 17 00:00:00 2001
From: Jinzhe Zeng <jinzhe.zeng@rutgers.edu>
Date: Mon, 11 Sep 2023 23:15:57 -0400
Subject: [PATCH 15/63] support atomic/relative model deviation in CLI (#2801)

Fix #2017.

---------

Signed-off-by: Jinzhe Zeng <jinzhe.zeng@rutgers.edu>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
---
 deepmd/infer/model_devi.py        | 144 +++++++++++++++++++++++++++---
 deepmd_cli/main.py                |  16 ++++
 doc/test/model-deviation.md       |  11 +++
 doc/third-party/lammps-command.md |   2 +-
 pyproject.toml                    |   2 +-
 source/tests/test_model_devi.py   |  89 ++++++++++++++++++
 6 files changed, 248 insertions(+), 16 deletions(-)

diff --git a/deepmd/infer/model_devi.py b/deepmd/infer/model_devi.py
index e9950f9d5e..8c329a0845 100644
--- a/deepmd/infer/model_devi.py
+++ b/deepmd/infer/model_devi.py
@@ -2,6 +2,7 @@
 from typing import (
     Optional,
     Tuple,
+    overload,
 )
 
 import numpy as np
@@ -20,10 +21,39 @@
     DeepPot,
 )
 
+try:
+    from typing import Literal  # python >=3.8
+except ImportError:
+    from typing_extensions import Literal  # type: ignore
+
+
+@overload
+def calc_model_devi_f(
+    fs: np.ndarray,
+    real_f: Optional[np.ndarray] = None,
+    relative: Optional[float] = None,
+    atomic: Literal[False] = False,
+) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
+    ...
+
+
+@overload
+def calc_model_devi_f(
+    fs: np.ndarray,
+    real_f: Optional[np.ndarray] = None,
+    relative: Optional[float] = None,
+    *,
+    atomic: Literal[True],
+) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
+    ...
+
 
 def calc_model_devi_f(
-    fs: np.ndarray, real_f: Optional[np.ndarray] = None
-) -> Tuple[np.ndarray]:
+    fs: np.ndarray,
+    real_f: Optional[np.ndarray] = None,
+    relative: Optional[float] = None,
+    atomic: bool = False,
+) -> Tuple[np.ndarray, ...]:
     """Calculate model deviation of force.
 
     Parameters
@@ -33,6 +63,12 @@ def calc_model_devi_f(
     real_f : numpy.ndarray or None
         real force, size of `n_frames x n_atoms x 3`. If given,
         the RMS real error is calculated instead.
+    relative : float, default: None
+        If given, calculate the relative model deviation of force. The
+        value is the level parameter for computing the relative model
+        deviation of the force.
+    atomic : bool, default: False
+        Whether to return the deviation of force in all atoms
 
     Returns
     -------
@@ -42,6 +78,8 @@ def calc_model_devi_f(
         minimum deviation of force in all atoms
     avg_devi_f : numpy.ndarray
         average deviation of force in all atoms
+    fs_devi : numpy.ndarray
+        deviation of force in all atoms, returned if atomic=True
     """
     if real_f is None:
         fs_devi = np.linalg.norm(np.std(fs, axis=0), axis=-1)
@@ -49,9 +87,21 @@ def calc_model_devi_f(
         fs_devi = np.linalg.norm(
             np.sqrt(np.mean(np.square(fs - real_f), axis=0)), axis=-1
         )
+    if relative is not None:
+        if real_f is None:
+            # if real force is not given, the magnitude is calculated from mean value of four models
+            # See DeepPotModelDevi::compute_relative_std_f
+            # See also Eq. 71 in DeePMD-kit v2 paper
+            magnitude = np.linalg.norm(np.mean(fs, axis=0), axis=-1)
+        else:
+            # otherwise, the magnitude is calculated from the real force
+            magnitude = np.linalg.norm(real_f, axis=-1)
+        fs_devi /= magnitude + relative
     max_devi_f = np.max(fs_devi, axis=-1)
     min_devi_f = np.min(fs_devi, axis=-1)
     avg_devi_f = np.mean(fs_devi, axis=-1)
+    if atomic:
+        return max_devi_f, min_devi_f, avg_devi_f, fs_devi
     return max_devi_f, min_devi_f, avg_devi_f
 
 
@@ -86,8 +136,10 @@ def calc_model_devi_e(
 
 
 def calc_model_devi_v(
-    vs: np.ndarray, real_v: Optional[np.ndarray] = None
-) -> Tuple[np.ndarray]:
+    vs: np.ndarray,
+    real_v: Optional[np.ndarray] = None,
+    relative: Optional[float] = None,
+) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
     """Calculate model deviation of virial.
 
     Parameters
@@ -97,6 +149,10 @@ def calc_model_devi_v(
     real_v : numpy.ndarray
         real virial, size of `n_frames x 9`. If given,
         the RMS real error is calculated instead.
+    relative : float, default: None
+        If given, calculate the relative model deviation of virial. The
+        value is the level parameter for computing the relative model
+        deviation of the virial.
 
     Returns
     -------
@@ -111,13 +167,25 @@ def calc_model_devi_v(
         vs_devi = np.std(vs, axis=0)
     else:
         vs_devi = np.sqrt(np.mean(np.square(vs - real_v), axis=0))
+    if relative is not None:
+        if real_v is None:
+            # if real virial is not given, the magnitude is calculated from mean value of four models
+            # See DeepPotModelDevi::compute_relative_std_v
+            # See also Eq. 72 in DeePMD-kit v2 paper
+            magnitude = np.linalg.norm(np.mean(vs, axis=0), axis=-1)
+        else:
+            # otherwise, the magnitude is calculated from the real virial
+            magnitude = np.linalg.norm(real_v, axis=-1)
+        vs_devi /= magnitude + relative
     max_devi_v = np.max(vs_devi, axis=-1)
     min_devi_v = np.min(vs_devi, axis=-1)
     avg_devi_v = np.linalg.norm(vs_devi, axis=-1) / 3
     return max_devi_v, min_devi_v, avg_devi_v
 
 
-def write_model_devi_out(devi: np.ndarray, fname: str, header: str = ""):
+def write_model_devi_out(
+    devi: np.ndarray, fname: str, header: str = "", atomic: bool = False
+):
     """Write output of model deviation.
 
     Parameters
@@ -128,8 +196,13 @@ def write_model_devi_out(devi: np.ndarray, fname: str, header: str = ""):
         the file name to dump
     header : str, default=""
         the header to dump
+    atomic : bool, default: False
+        whether atomic model deviation is printed
     """
-    assert devi.shape[1] == 8
+    if not atomic:
+        assert devi.shape[1] == 8
+    else:
+        assert devi.shape[1] > 8
     header = "%s\n%10s" % (header, "step")
     for item in "vf":
         header += "%19s%19s%19s" % (
@@ -138,11 +211,13 @@ def write_model_devi_out(devi: np.ndarray, fname: str, header: str = ""):
             f"avg_devi_{item}",
         )
     header += "%19s" % "devi_e"
+    if atomic:
+        header += "%19s" % "atm_devi_f(N)"
     with open(fname, "ab") as fp:
         np.savetxt(
             fp,
             devi,
-            fmt=["%12d"] + ["%19.6e" for _ in range(7)],
+            fmt=["%12d"] + ["%19.6e" for _ in range(devi.shape[1] - 1)],
             delimiter="",
             header=header,
         )
@@ -175,6 +250,9 @@ def calc_model_devi(
     fparam: Optional[np.ndarray] = None,
     aparam: Optional[np.ndarray] = None,
     real_data: Optional[dict] = None,
+    atomic: bool = False,
+    relative: Optional[float] = None,
+    relative_v: Optional[float] = None,
 ):
     """Python interface to calculate model deviation.
 
@@ -200,6 +278,16 @@ def calc_model_devi(
         atomic specific parameters
     real_data : dict, optional
         real data to calculate RMS real error
+    atomic : bool, default: False
+        If True, calculate the force model deviation of each atom.
+    relative : float, default: None
+        If given, calculate the relative model deviation of force. The
+        value is the level parameter for computing the relative model
+        deviation of the force.
+    relative_v : float, default: None
+        If given, calculate the relative model deviation of virial. The
+        value is the level parameter for computing the relative model
+        deviation of the virial.
 
     Returns
     -------
@@ -241,16 +329,26 @@ def calc_model_devi(
 
     devi = [np.arange(coord.shape[0]) * frequency]
     if real_data is None:
-        devi += list(calc_model_devi_v(virials))
-        devi += list(calc_model_devi_f(forces))
+        devi += list(calc_model_devi_v(virials, relative=relative_v))
+        devi_f = list(calc_model_devi_f(forces, relative=relative, atomic=atomic))
+        devi += devi_f[:3]
         devi.append(calc_model_devi_e(energies))
     else:
-        devi += list(calc_model_devi_v(virials, real_data["virial"]))
-        devi += list(calc_model_devi_f(forces, real_data["force"]))
+        devi += list(
+            calc_model_devi_v(virials, real_data["virial"], relative=relative_v)
+        )
+        devi_f = list(
+            calc_model_devi_f(
+                forces, real_data["force"], relative=relative, atomic=atomic
+            )
+        )
+        devi += devi_f[:3]
         devi.append(calc_model_devi_e(energies, real_data["energy"]))
     devi = np.vstack(devi).T
+    if atomic:
+        devi = np.concatenate([devi, devi_f[3]], axis=1)
     if fname:
-        write_model_devi_out(devi, fname)
+        write_model_devi_out(devi, fname, atomic=atomic)
     return devi
 
 
@@ -262,6 +360,9 @@ def make_model_devi(
     output: str,
     frequency: int,
     real_error: bool = False,
+    atomic: bool = False,
+    relative: Optional[float] = None,
+    relative_v: Optional[float] = None,
     **kwargs,
 ):
     """Make model deviation calculation.
@@ -282,6 +383,16 @@ def make_model_devi(
         This paramter is used to determine the index in the output file.
     real_error : bool, default: False
         If True, calculate the RMS real error instead of model deviation.
+    atomic : bool, default: False
+        If True, calculate the force model deviation of each atom.
+    relative : float, default: None
+        If given, calculate the relative model deviation of force. The
+        value is the level parameter for computing the relative model
+        deviation of the force.
+    relative_v : float, default: None
+        If given, calculate the relative model deviation of virial. The
+        value is the level parameter for computing the relative model
+        deviation of the virial.
     **kwargs
         Arbitrary keyword arguments.
     """
@@ -305,7 +416,9 @@ def make_model_devi(
 
     for system in all_sys:
         # create data-system
-        dp_data = DeepmdData(system, set_prefix, shuffle_test=False, type_map=tmap)
+        dp_data = DeepmdData(
+            system, set_prefix, shuffle_test=False, type_map=tmap, sort_atoms=False
+        )
         if first_dp.get_dim_fparam() > 0:
             dp_data.add(
                 "fparam",
@@ -385,11 +498,14 @@ def make_model_devi(
                 fparam=fparam,
                 aparam=aparam,
                 real_data=real_data,
+                atomic=atomic,
+                relative=relative,
+                relative_v=relative_v,
             )
             nframes_tot += coord.shape[0]
             devis.append(devi)
         devis = np.vstack(devis)
         devis[:, 0] = np.arange(nframes_tot) * frequency
-        write_model_devi_out(devis, output, header=system)
+        write_model_devi_out(devis, output, header=system, atomic=atomic)
         devis_coll.append(devis)
     return devis_coll
diff --git a/deepmd_cli/main.py b/deepmd_cli/main.py
index f707bf7589..94ceb9888d 100644
--- a/deepmd_cli/main.py
+++ b/deepmd_cli/main.py
@@ -454,6 +454,22 @@ def main_parser() -> argparse.ArgumentParser:
         default=False,
         help="Calculate the RMS real error of the model. The real data should be given in the systems.",
     )
+    parser_model_devi.add_argument(
+        "--atomic",
+        action="store_true",
+        default=False,
+        help="Print the force model deviation of each atom.",
+    )
+    parser_model_devi.add_argument(
+        "--relative",
+        type=float,
+        help="Calculate the relative model deviation of force. The level parameter for computing the relative model deviation of the force should be given.",
+    )
+    parser_model_devi.add_argument(
+        "--relative_v",
+        type=float,
+        help="Calculate the relative model deviation of virial. The level parameter for computing the relative model deviation of the virial should be given.",
+    )
 
     # * convert models
     parser_transform = subparsers.add_parser(
diff --git a/doc/test/model-deviation.md b/doc/test/model-deviation.md
index 41cda9ddb7..6a89d7c2f4 100644
--- a/doc/test/model-deviation.md
+++ b/doc/test/model-deviation.md
@@ -36,3 +36,14 @@ optional arguments:
 ```
 
 For more details concerning the definition of model deviation and its application, please refer to [Yuzhi Zhang, Haidi Wang, Weijie Chen, Jinzhe Zeng, Linfeng Zhang, Han Wang, and Weinan E, DP-GEN: A concurrent learning platform for the generation of reliable deep learning based potential energy models, Computer Physics Communications, 2020, 253, 107206.](https://doi.org/10.1016/j.cpc.2020.107206)
+
+## Relative model deviation
+
+By default, the model deviation is output in absolute value. If the argument `--relative` is passed, then the relative model deviation of the force will be output, including values output by the argument `--atomic`. The relative model deviation of the force on atom $i$ is defined by
+
+$$E_{f_i}=\frac{\left|D_{f_i}\right|}{\left|f_i\right|+l}$$
+
+where $D_{f_i}$ is the absolute model deviation of the force on atom $i$, $\left|f_i\right|$ is the norm of the force on atom $i$, and $l$ is the level parameter given as the value of the argument `--relative`.
+If the argument `--relative_v` is set, then the relative model deviation of the virial will be output instead of the absolute value. It is defined in the same way as that of the force:
+
+$$E_{v_i}=\frac{\left|D_{v_i}\right|}{\left|v_i\right|+l}$$
diff --git a/doc/third-party/lammps-command.md b/doc/third-party/lammps-command.md
index 15acb2e497..e1d482381f 100644
--- a/doc/third-party/lammps-command.md
+++ b/doc/third-party/lammps-command.md
@@ -40,7 +40,7 @@ and the model deviation will be computed among all models every `out_freq` times
     <i>fparam_from_compute</i> value = id
         id = compute id used to update the frame parameter.
     <i>atomic</i> = no value is required.
-        If this keyword is set, the model deviation of each atom will be output.
+        If this keyword is set, the force model deviation of each atom will be output.
     <i>relative</i> value = level
         level = The level parameter for computing the relative model deviation of the force
     <i>relative_v</i> value = level
diff --git a/pyproject.toml b/pyproject.toml
index 7b8f55d562..b169a3b0eb 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -40,7 +40,7 @@ dependencies = [
     'pyyaml',
     'dargs >= 0.3.5',
     'python-hostlist >= 1.21',
-    'typing_extensions; python_version < "3.7"',
+    'typing_extensions; python_version < "3.8"',
     'importlib_metadata>=1.4; python_version < "3.8"',
     'h5py',
     'wcmatch',
diff --git a/source/tests/test_model_devi.py b/source/tests/test_model_devi.py
index 91c95af46c..c7d050cd76 100644
--- a/source/tests/test_model_devi.py
+++ b/source/tests/test_model_devi.py
@@ -113,6 +113,95 @@ def test_make_model_devi_real_erorr(self):
             6,
         )
 
+    def test_make_model_devi_atomic_relative(self):
+        _, expected_f, expected_v = self.graphs[0].eval(
+            self.coord[0], self.box[0], self.atype
+        )
+        _, expected_f2, expected_v2 = self.graphs[1].eval(
+            self.coord[0], self.box[0], self.atype
+        )
+        expected_f = expected_f.reshape((-1, 3))
+        expected_f2 = expected_f2.reshape((-1, 3))
+        expected_v = expected_v.reshape((-1, 3))
+        expected_v2 = expected_v2.reshape((-1, 3))
+        relative = 1.0
+        make_model_devi(
+            models=self.graph_dirs,
+            system=self.data_dir,
+            set_prefix="set",
+            output=self.output,
+            frequency=self.freq,
+            atomic=True,
+            relative=relative,
+        )
+        md = np.loadtxt(self.output)
+        # copy from lammps test
+        norm = np.linalg.norm(np.mean([expected_f, expected_f2], axis=0), axis=1)
+        expected_md_f = np.linalg.norm(
+            np.std([expected_f, expected_f2], axis=0), axis=1
+        )
+        expected_md_f /= norm + relative
+        np.testing.assert_allclose(md[8:], expected_md_f, 6)
+        np.testing.assert_allclose(md[7], self.expect[7], 6)
+        np.testing.assert_allclose(md[4], np.max(expected_md_f), 6)
+        np.testing.assert_allclose(md[5], np.min(expected_md_f), 6)
+        np.testing.assert_allclose(md[6], np.mean(expected_md_f), 6)
+        expected_md_v = (
+            np.std([np.sum(expected_v, axis=0), np.sum(expected_v2, axis=0)], axis=0)
+            / 6
+        )
+        np.testing.assert_allclose(md[1], np.max(expected_md_v), 6)
+        np.testing.assert_allclose(md[2], np.min(expected_md_v), 6)
+        np.testing.assert_allclose(md[3], np.sqrt(np.mean(np.square(expected_md_v))), 6)
+
+    def test_make_model_devi_atomic_relative_v(self):
+        _, expected_f, expected_v = self.graphs[0].eval(
+            self.coord[0], self.box[0], self.atype
+        )
+        _, expected_f2, expected_v2 = self.graphs[1].eval(
+            self.coord[0], self.box[0], self.atype
+        )
+        expected_f = expected_f.reshape((-1, 3))
+        expected_f2 = expected_f2.reshape((-1, 3))
+        expected_v = expected_v.reshape((-1, 3))
+        expected_v2 = expected_v2.reshape((-1, 3))
+        relative = 1.0
+        make_model_devi(
+            models=self.graph_dirs,
+            system=self.data_dir,
+            set_prefix="set",
+            output=self.output,
+            frequency=self.freq,
+            atomic=True,
+            relative_v=relative,
+        )
+        md = np.loadtxt(self.output)
+        # copy from lammps test
+        expected_md_f = np.linalg.norm(
+            np.std([expected_f, expected_f2], axis=0), axis=1
+        )
+        np.testing.assert_allclose(md[8:], expected_md_f, 6)
+        np.testing.assert_allclose(md[7], self.expect[7], 6)
+        np.testing.assert_allclose(md[4], np.max(expected_md_f), 6)
+        np.testing.assert_allclose(md[5], np.min(expected_md_f), 6)
+        np.testing.assert_allclose(md[6], np.mean(expected_md_f), 6)
+        expected_md_v = (
+            np.std([np.sum(expected_v, axis=0), np.sum(expected_v2, axis=0)], axis=0)
+            / 6
+        )
+        norm = (
+            np.abs(
+                np.mean(
+                    [np.sum(expected_v, axis=0), np.sum(expected_v2, axis=0)], axis=0
+                )
+            )
+            / 6
+        )
+        expected_md_v /= norm + relative
+        np.testing.assert_allclose(md[1], np.max(expected_md_v), 6)
+        np.testing.assert_allclose(md[2], np.min(expected_md_v), 6)
+        np.testing.assert_allclose(md[3], np.sqrt(np.mean(np.square(expected_md_v))), 6)
+
     def tearDown(self):
         for pb in self.graph_dirs:
             os.remove(pb)

From 89d0278bc6608a50442cb52760594a51c554962a Mon Sep 17 00:00:00 2001
From: Jinzhe Zeng <jinzhe.zeng@rutgers.edu>
Date: Mon, 11 Sep 2023 23:19:21 -0400
Subject: [PATCH 16/63] docs: add doc to install cmake (#2805)

Signed-off-by: Jinzhe Zeng <jinzhe.zeng@rutgers.edu>
---
 doc/install/install-from-source.md | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/doc/install/install-from-source.md b/doc/install/install-from-source.md
index dd28d86ae5..4d75d484ec 100644
--- a/doc/install/install-from-source.md
+++ b/doc/install/install-from-source.md
@@ -166,6 +166,13 @@ cd $deepmd_source_dir/source
 mkdir build
 cd build
 ```
+
+The installation requires CMake 3.16 or later for the CPU version, CMake 3.23 or later for the CUDA support, and CMake 3.21 or later for the ROCM support. One can install CMake via `pip` if it is not installed or the installed version does not satisfy the requirement:
+
+```sh
+pip install -U cmake
+```
+
 I assume you have activated the TensorFlow Python environment and want to install DeePMD-kit into path `$deepmd_root`, then execute CMake
 ```bash
 cmake -DUSE_TF_PYTHON_LIBS=TRUE -DCMAKE_INSTALL_PREFIX=$deepmd_root ..

From 13d49bd97c060525957449e9e214c47e3154b6b2 Mon Sep 17 00:00:00 2001
From: Jinzhe Zeng <jinzhe.zeng@rutgers.edu>
Date: Mon, 11 Sep 2023 23:44:44 -0400
Subject: [PATCH 17/63] docs: add docs for additional CMake arguments via pip
 (#2806)

Fix #2432.

Signed-off-by: Jinzhe Zeng <jinzhe.zeng@rutgers.edu>
---
 doc/install/install-from-source.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/doc/install/install-from-source.md b/doc/install/install-from-source.md
index 4d75d484ec..e6a4b1a7cb 100644
--- a/doc/install/install-from-source.md
+++ b/doc/install/install-from-source.md
@@ -78,6 +78,7 @@ One may set the following environment variables before executing `pip`:
 | ROCM_ROOT             | Path                   | Detected automatically | The path to the ROCM toolkit directory. |
 | TENSORFLOW_ROOT       | Path                   | Detected automatically | The path to TensorFlow Python library. By default the installer only finds TensorFlow under user site-package directory (`site.getusersitepackages()`) or system site-package directory (`sysconfig.get_path("purelib")`) due to limitation of [PEP-517](https://peps.python.org/pep-0517/). If not found, the latest TensorFlow (or the environment variable `TENSORFLOW_VERSION` if given) from PyPI will be built against.|
 | DP_ENABLE_NATIVE_OPTIMIZATION | 0, 1           | 0             | Enable compilation optimization for the native machine's CPU type. Do not enable it if generated code will run on different CPUs. |
+| CMAKE_ARGS             | str                   | -             | Additional CMake arguments |
 
 To test the installation, one should first jump out of the source directory
 ```

From 8cd822e2932bd7fd3a3e5fabff7e97fd67ac4f92 Mon Sep 17 00:00:00 2001
From: Jinzhe Zeng <jinzhe.zeng@rutgers.edu>
Date: Mon, 11 Sep 2023 23:47:31 -0400
Subject: [PATCH 18/63] ignore drdq when generalized force loss is not set
 (#2807)

This PR fixes an error for the example in the `examples` directory.
---
 deepmd/loss/ener.py | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/deepmd/loss/ener.py b/deepmd/loss/ener.py
index 4cc7619d50..7895fadbf3 100644
--- a/deepmd/loss/ener.py
+++ b/deepmd/loss/ener.py
@@ -121,13 +121,14 @@ def __init__(
         )
         # drdq: the partial derivative of atomic coordinates w.r.t. generalized coordinates
         # TODO: could numb_generalized_coord decided from the training data?
-        add_data_requirement(
-            "drdq",
-            self.numb_generalized_coord * 3,
-            atomic=True,
-            must=False,
-            high_prec=False,
-        )
+        if self.has_gf > 0:
+            add_data_requirement(
+                "drdq",
+                self.numb_generalized_coord * 3,
+                atomic=True,
+                must=False,
+                high_prec=False,
+            )
         if self.enable_atom_ener_coeff:
             add_data_requirement(
                 "atom_ener_coeff",

From 445ec23aee974fe5b69f5752a49b2de9b55fd743 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Tue, 12 Sep 2023 11:52:56 +0800
Subject: [PATCH 19/63] Bump docker/build-push-action from 4.1.1 to 4.2.1
 (#2804)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Bumps
[docker/build-push-action](https://github.com/docker/build-push-action)
from 4.1.1 to 4.2.1.
<details>
<summary>Release notes</summary>
<p><em>Sourced from <a
href="https://github.com/docker/build-push-action/releases">docker/build-push-action's
releases</a>.</em></p>
<blockquote>
<h2>v4.2.1</h2>
<blockquote>
<p><strong>Note</strong></p>
<p>Buildx v0.10 enables support for a minimal <a
href="https://slsa.dev/provenance/">SLSA Provenance</a> attestation,
which requires support for <a
href="https://github.com/opencontainers/image-spec">OCI-compliant</a>
multi-platform images. This may introduce issues with registry and
runtime support (e.g. <a
href="https://redirect.github.com/docker/buildx/issues/1533">Google
Cloud Run and AWS Lambda</a>). You can optionally disable the default
provenance attestation functionality using <code>provenance:
false</code>.</p>
</blockquote>
<ul>
<li>warn if docker config can't be parsed by <a
href="https://github.com/crazy-max"><code>@​crazy-max</code></a> in <a
href="https://redirect.github.com/docker/build-push-action/pull/957">docker/build-push-action#957</a></li>
</ul>
<p><strong>Full Changelog</strong>: <a
href="https://github.com/docker/build-push-action/compare/v4.2.0...v4.2.1">https://github.com/docker/build-push-action/compare/v4.2.0...v4.2.1</a></p>
<h2>v4.2.0</h2>
<blockquote>
<p><strong>Note</strong></p>
<p>Buildx v0.10 enables support for a minimal <a
href="https://slsa.dev/provenance/">SLSA Provenance</a> attestation,
which requires support for <a
href="https://github.com/opencontainers/image-spec">OCI-compliant</a>
multi-platform images. This may introduce issues with registry and
runtime support (e.g. <a
href="https://redirect.github.com/docker/buildx/issues/1533">Google
Cloud Run and AWS Lambda</a>). You can optionally disable the default
provenance attestation functionality using <code>provenance:
false</code>.</p>
</blockquote>
<ul>
<li>display proxy configuration by <a
href="https://github.com/crazy-max"><code>@​crazy-max</code></a> in <a
href="https://redirect.github.com/docker/build-push-action/pull/872">docker/build-push-action#872</a></li>
<li>chore(deps): Bump <code>@​docker/actions-toolkit</code> from 0.6.0
to 0.8.0 in <a
href="https://redirect.github.com/docker/build-push-action/pull/930">docker/build-push-action#930</a></li>
<li>chore(deps): Bump word-wrap from 1.2.3 to 1.2.5 in <a
href="https://redirect.github.com/docker/build-push-action/pull/925">docker/build-push-action#925</a></li>
<li>chore(deps): Bump semver from 6.3.0 to 6.3.1 in <a
href="https://redirect.github.com/docker/build-push-action/pull/902">docker/build-push-action#902</a></li>
</ul>
<p><strong>Full Changelog</strong>: <a
href="https://github.com/docker/build-push-action/compare/v4.1.1...v4.2.0">https://github.com/docker/build-push-action/compare/v4.1.1...v4.2.0</a></p>
</blockquote>
</details>
<details>
<summary>Commits</summary>
<ul>
<li><a
href="https://github.com/docker/build-push-action/commit/0a97817b6ade9f46837855d676c4cca3a2471fc9"><code>0a97817</code></a>
Merge pull request <a
href="https://redirect.github.com/docker/build-push-action/issues/957">#957</a>
from crazy-max/warn-docker-config</li>
<li><a
href="https://github.com/docker/build-push-action/commit/ec39ef320c442d2af669fd8555e7f3b7ad7026bd"><code>ec39ef3</code></a>
chore: update generated content</li>
<li><a
href="https://github.com/docker/build-push-action/commit/f46044b799d766d2bb3e644bd48da06b6c30a978"><code>f46044b</code></a>
warn if docker config can't be parsed</li>
<li><a
href="https://github.com/docker/build-push-action/commit/4e4ee680f69700760d5620a94c020aef883043aa"><code>4e4ee68</code></a>
Merge pull request <a
href="https://redirect.github.com/docker/build-push-action/issues/951">#951</a>
from crazy-max/ci-concurrency</li>
<li><a
href="https://github.com/docker/build-push-action/commit/e86cf554b69aa5dfd1179937926b6d0cf6550be7"><code>e86cf55</code></a>
ci: missing concurrency checks</li>
<li><a
href="https://github.com/docker/build-push-action/commit/daa0106f78eed71cdc2a900489ce515acedeef64"><code>daa0106</code></a>
Merge pull request <a
href="https://redirect.github.com/docker/build-push-action/issues/949">#949</a>
from docker/dependabot/github_actions/actions/checkout-4</li>
<li><a
href="https://github.com/docker/build-push-action/commit/ce51e905a661fc760fece3d76496f92e48beb8df"><code>ce51e90</code></a>
chore(deps): Bump actions/checkout from 3 to 4</li>
<li><a
href="https://github.com/docker/build-push-action/commit/1fde16337d494a70a6a0d6c3f24486b201a6a873"><code>1fde163</code></a>
Merge pull request <a
href="https://redirect.github.com/docker/build-push-action/issues/950">#950</a>
from crazy-max/fix-ci</li>
<li><a
href="https://github.com/docker/build-push-action/commit/ae311c520f6e620595e7fcb93d4f61d6a5573b5e"><code>ae311c5</code></a>
ci: fix workflow</li>
<li><a
href="https://github.com/docker/build-push-action/commit/9311bf5263ae5b36f3ec67aff768790c6e2344ad"><code>9311bf5</code></a>
Merge pull request <a
href="https://redirect.github.com/docker/build-push-action/issues/932">#932</a>
from crazy-max/form-templates</li>
<li>Additional commits viewable in <a
href="https://github.com/docker/build-push-action/compare/2eb1c1961a95fc15694676618e422e8ba1d63825...0a97817b6ade9f46837855d676c4cca3a2471fc9">compare
view</a></li>
</ul>
</details>
<br />


[![Dependabot compatibility
score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=docker/build-push-action&package-manager=github_actions&previous-version=4.1.1&new-version=4.2.1)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores)

Dependabot will resolve any conflicts with this PR as long as you don't
alter it yourself. You can also trigger a rebase manually by commenting
`@dependabot rebase`.

[//]: # (dependabot-automerge-start)
[//]: # (dependabot-automerge-end)

---

<details>
<summary>Dependabot commands and options</summary>
<br />

You can trigger Dependabot actions by commenting on this PR:
- `@dependabot rebase` will rebase this PR
- `@dependabot recreate` will recreate this PR, overwriting any edits
that have been made to it
- `@dependabot merge` will merge this PR after your CI passes on it
- `@dependabot squash and merge` will squash and merge this PR after
your CI passes on it
- `@dependabot cancel merge` will cancel a previously requested merge
and block automerging
- `@dependabot reopen` will reopen this PR if it is closed
- `@dependabot close` will close this PR and stop Dependabot recreating
it. You can achieve the same result by closing it manually
- `@dependabot show <dependency name> ignore conditions` will show all
of the ignore conditions of the specified dependency
- `@dependabot ignore this major version` will close this PR and stop
Dependabot creating any more for this major version (unless you reopen
the PR or upgrade to it yourself)
- `@dependabot ignore this minor version` will close this PR and stop
Dependabot creating any more for this minor version (unless you reopen
the PR or upgrade to it yourself)
- `@dependabot ignore this dependency` will close this PR and stop
Dependabot creating any more for this dependency (unless you reopen the
PR or upgrade to it yourself)


</details>

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
---
 .github/workflows/build_wheel.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/build_wheel.yml b/.github/workflows/build_wheel.yml
index 5f10a85269..e47d753f1c 100644
--- a/.github/workflows/build_wheel.yml
+++ b/.github/workflows/build_wheel.yml
@@ -113,7 +113,7 @@ jobs:
           images: ghcr.io/deepmodeling/deepmd-kit
 
       - name: Build and push Docker image
-        uses: docker/build-push-action@2eb1c1961a95fc15694676618e422e8ba1d63825
+        uses: docker/build-push-action@0a97817b6ade9f46837855d676c4cca3a2471fc9
         with:
           context: source/install/docker
           push: ${{ github.repository_owner == 'deepmodeling' && github.event_name == 'push' }}

From 70ccd41b49397b00f8030e43d361c0fc37e65fdf Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Wed, 13 Sep 2023 09:46:25 +0800
Subject: [PATCH 20/63] [pre-commit.ci] pre-commit autoupdate (#2808)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

<!--pre-commit.ci start-->
updates:
- [github.com/psf/black: 23.7.0 →
23.9.1](https://github.com/psf/black/compare/23.7.0...23.9.1)
- [github.com/astral-sh/ruff-pre-commit: v0.0.287 →
v0.0.288](https://github.com/astral-sh/ruff-pre-commit/compare/v0.0.287...v0.0.288)
<!--pre-commit.ci end-->

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
---
 .pre-commit-config.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index ba11bbcf50..d39f5ec127 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -23,7 +23,7 @@ repos:
     -   id: check-toml
 # Python
 -   repo: https://github.com/psf/black
-    rev: 23.7.0
+    rev: 23.9.1
     hooks:
     -   id: black-jupyter
 -   repo: https://github.com/PyCQA/isort
@@ -33,7 +33,7 @@ repos:
       files: \.py$
 -   repo: https://github.com/astral-sh/ruff-pre-commit
     # Ruff version.
-    rev: v0.0.287
+    rev: v0.0.288
     hooks:
     - id: ruff
       args: ["--fix"]

From 959c1299be968bcf209f7245e4bcdfe28eff185f Mon Sep 17 00:00:00 2001
From: Jinzhe Zeng <jinzhe.zeng@rutgers.edu>
Date: Tue, 12 Sep 2023 21:58:08 -0400
Subject: [PATCH 21/63] drop old GCC versions in test (#2812)

Signed-off-by: Jinzhe Zeng <jinzhe.zeng@rutgers.edu>
---
 .github/workflows/test_python.yml | 35 +++++++++++++------------------
 backend/find_tensorflow.py        | 21 +++++++++++++++++++
 2 files changed, 36 insertions(+), 20 deletions(-)

diff --git a/.github/workflows/test_python.yml b/.github/workflows/test_python.yml
index 25dded26aa..0ac29a7d9b 100644
--- a/.github/workflows/test_python.yml
+++ b/.github/workflows/test_python.yml
@@ -5,48 +5,43 @@ name: Test Python
 jobs:
   testpython:
     name: Test Python
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-22.04
     strategy:
       matrix:
         include:
           - python: 3.7
-            gcc: 5
-            tf: 1.14
-          - python: 3.7
-            gcc: 6
-            tf: 1.14
-          - python: 3.7
-            gcc: 7
-            tf: 1.14
-          - python: 3.7
-            gcc: 8
             tf: 1.14
           - python: 3.8
-            gcc: 8
             tf:
           - python: "3.11"
-            gcc: 8
             tf:
 
-    container: ghcr.io/deepmodeling/deepmd-kit-test-environment:py${{ matrix.python }}-gcc${{ matrix.gcc }}-tf${{ matrix.tf }}
     steps:
-    - name: work around permission issue
-      run: git config --global --add safe.directory /__w/deepmd-kit/deepmd-kit
-    - uses: actions/checkout@v3
+    - uses: actions/checkout@v4
+    - uses: actions/setup-python@v4
+      with:
+        python-version: ${{ matrix.python }}
+        cache: 'pip'
+    - uses: mpi4py/setup-mpi@v1
+      if: ${{ matrix.tf == '' }}
+      with:
+        mpi: openmpi
     # https://github.com/pypa/pip/issues/11770
     - run: python -m pip install -U "pip>=21.3.1,!=23.0.0"
     - run: pip install -e .[cpu,test]
       env:
-        CC: gcc-${{ matrix.gcc }}
-        CXX: g++-${{ matrix.gcc }}
         TENSORFLOW_VERSION: ${{ matrix.tf }}
         DP_BUILD_TESTING: 1
+    - run: pip install horovod mpi4py
+      if: ${{ matrix.tf == '' }}
+      env:
+        HOROVOD_WITH_TENSORFLOW: 1
+        HOROVOD_WITHOUT_GLOO: 1
     - run: dp --version
     - run: pytest --cov=deepmd --cov=deepmd_cli source/tests --durations=0
     - uses: codecov/codecov-action@v3
       with:
         gcov: true
-        gcov_executable: gcov-${{ matrix.gcc }}
   pass:
     name: Pass testing Python
     needs: [testpython]
diff --git a/backend/find_tensorflow.py b/backend/find_tensorflow.py
index 8fe3cedb63..aa75d5ecb4 100644
--- a/backend/find_tensorflow.py
+++ b/backend/find_tensorflow.py
@@ -112,16 +112,31 @@ def get_tf_requirement(tf_version: str = "") -> dict:
     if tf_version == "":
         tf_version = os.environ.get("TENSORFLOW_VERSION", "")
 
+    extra_requires = []
+    extra_select = {}
+    if not (tf_version == "" or tf_version in SpecifierSet(">=2.12")):
+        extra_requires.append("protobuf<3.20")
+    if tf_version == "" or tf_version in SpecifierSet(">=1.15"):
+        extra_select["mpi"] = [
+            "horovod",
+            "mpi4py",
+        ]
+    else:
+        extra_select["mpi"] = []
+
     if tf_version == "":
         return {
             "cpu": [
                 "tensorflow-cpu; platform_machine!='aarch64' and (platform_machine!='arm64' or platform_system != 'Darwin')",
                 "tensorflow; platform_machine=='aarch64' or (platform_machine=='arm64' and platform_system == 'Darwin')",
+                *extra_requires,
             ],
             "gpu": [
                 "tensorflow",
                 "tensorflow-metal; platform_machine=='arm64' and platform_system == 'Darwin'",
+                *extra_requires,
             ],
+            **extra_select,
         }
     elif tf_version in SpecifierSet("<1.15") or tf_version in SpecifierSet(
         ">=2.0,<2.1"
@@ -129,22 +144,28 @@ def get_tf_requirement(tf_version: str = "") -> dict:
         return {
             "cpu": [
                 f"tensorflow=={tf_version}",
+                *extra_requires,
             ],
             "gpu": [
                 f"tensorflow-gpu=={tf_version}; platform_machine!='aarch64'",
                 f"tensorflow=={tf_version}; platform_machine=='aarch64'",
+                *extra_requires,
             ],
+            **extra_select,
         }
     else:
         return {
             "cpu": [
                 f"tensorflow-cpu=={tf_version}; platform_machine!='aarch64' and (platform_machine!='arm64' or platform_system != 'Darwin')",
                 f"tensorflow=={tf_version}; platform_machine=='aarch64'  or (platform_machine=='arm64' and platform_system == 'Darwin')",
+                *extra_requires,
             ],
             "gpu": [
                 f"tensorflow=={tf_version}",
                 "tensorflow-metal; platform_machine=='arm64' and platform_system == 'Darwin'",
+                *extra_requires,
             ],
+            **extra_select,
         }
 
 
From 3a98751d0e7735eac2b0b6e8176e9c490db2d516 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Wed, 13 Sep 2023 20:05:37 +0000
Subject: [PATCH 22/63] Bump actions/checkout from 3 to 4 (#2803)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Bumps [actions/checkout](https://github.com/actions/checkout) from 3 to
4.
<details>
<summary>Release notes</summary>
<p><em>Sourced from <a
href="https://github.com/actions/checkout/releases">actions/checkout's
releases</a>.</em></p>
<blockquote>
<h2>v4.0.0</h2>
<h2>What's Changed</h2>
<ul>
<li>Update default runtime to node20 by <a
href="https://github.com/takost"><code>@​takost</code></a> in <a
href="https://redirect.github.com/actions/checkout/pull/1436">actions/checkout#1436</a></li>
<li>Support fetching without the --progress option by <a
href="https://github.com/simonbaird"><code>@​simonbaird</code></a> in <a
href="https://redirect.github.com/actions/checkout/pull/1067">actions/checkout#1067</a></li>
<li>Release 4.0.0 by <a
href="https://github.com/takost"><code>@​takost</code></a> in <a
href="https://redirect.github.com/actions/checkout/pull/1447">actions/checkout#1447</a></li>
</ul>
<h2>New Contributors</h2>
<ul>
<li><a href="https://github.com/takost"><code>@​takost</code></a> made
their first contribution in <a
href="https://redirect.github.com/actions/checkout/pull/1436">actions/checkout#1436</a></li>
<li><a
href="https://github.com/simonbaird"><code>@​simonbaird</code></a> made
their first contribution in <a
href="https://redirect.github.com/actions/checkout/pull/1067">actions/checkout#1067</a></li>
</ul>
<p><strong>Full Changelog</strong>: <a
href="https://github.com/actions/checkout/compare/v3...v4.0.0">https://github.com/actions/checkout/compare/v3...v4.0.0</a></p>
<h2>v3.6.0</h2>
<h2>What's Changed</h2>
<ul>
<li>Mark test scripts with Bash'isms to be run via Bash by <a
href="https://github.com/dscho"><code>@​dscho</code></a> in <a
href="https://redirect.github.com/actions/checkout/pull/1377">actions/checkout#1377</a></li>
<li>Add option to fetch tags even if fetch-depth &gt; 0 by <a
href="https://github.com/RobertWieczoreck"><code>@​RobertWieczoreck</code></a>
in <a
href="https://redirect.github.com/actions/checkout/pull/579">actions/checkout#579</a></li>
<li>Release 3.6.0 by <a
href="https://github.com/luketomlinson"><code>@​luketomlinson</code></a>
in <a
href="https://redirect.github.com/actions/checkout/pull/1437">actions/checkout#1437</a></li>
</ul>
<h2>New Contributors</h2>
<ul>
<li><a
href="https://github.com/RobertWieczoreck"><code>@​RobertWieczoreck</code></a>
made their first contribution in <a
href="https://redirect.github.com/actions/checkout/pull/579">actions/checkout#579</a></li>
<li><a
href="https://github.com/luketomlinson"><code>@​luketomlinson</code></a>
made their first contribution in <a
href="https://redirect.github.com/actions/checkout/pull/1437">actions/checkout#1437</a></li>
</ul>
<p><strong>Full Changelog</strong>: <a
href="https://github.com/actions/checkout/compare/v3.5.3...v3.6.0">https://github.com/actions/checkout/compare/v3.5.3...v3.6.0</a></p>
<h2>v3.5.3</h2>
<h2>What's Changed</h2>
<ul>
<li>Fix: Checkout Issue in self hosted runner due to faulty submodule
check-ins by <a
href="https://github.com/megamanics"><code>@​megamanics</code></a> in <a
href="https://redirect.github.com/actions/checkout/pull/1196">actions/checkout#1196</a></li>
<li>Fix typos found by codespell by <a
href="https://github.com/DimitriPapadopoulos"><code>@​DimitriPapadopoulos</code></a>
in <a
href="https://redirect.github.com/actions/checkout/pull/1287">actions/checkout#1287</a></li>
<li>Add support for sparse checkouts by <a
href="https://github.com/dscho"><code>@​dscho</code></a> and <a
href="https://github.com/dfdez"><code>@​dfdez</code></a> in <a
href="https://redirect.github.com/actions/checkout/pull/1369">actions/checkout#1369</a></li>
<li>Release v3.5.3 by <a
href="https://github.com/TingluoHuang"><code>@​TingluoHuang</code></a>
in <a
href="https://redirect.github.com/actions/checkout/pull/1376">actions/checkout#1376</a></li>
</ul>
<h2>New Contributors</h2>
<ul>
<li><a
href="https://github.com/megamanics"><code>@​megamanics</code></a> made
their first contribution in <a
href="https://redirect.github.com/actions/checkout/pull/1196">actions/checkout#1196</a></li>
<li><a
href="https://github.com/DimitriPapadopoulos"><code>@​DimitriPapadopoulos</code></a>
made their first contribution in <a
href="https://redirect.github.com/actions/checkout/pull/1287">actions/checkout#1287</a></li>
<li><a href="https://github.com/dfdez"><code>@​dfdez</code></a> made
their first contribution in <a
href="https://redirect.github.com/actions/checkout/pull/1369">actions/checkout#1369</a></li>
</ul>
<p><strong>Full Changelog</strong>: <a
href="https://github.com/actions/checkout/compare/v3...v3.5.3">https://github.com/actions/checkout/compare/v3...v3.5.3</a></p>
<h2>v3.5.2</h2>
<h2>What's Changed</h2>
<ul>
<li>Fix: Use correct API url / endpoint in GHES by <a
href="https://github.com/fhammerl"><code>@​fhammerl</code></a> in <a
href="https://redirect.github.com/actions/checkout/pull/1289">actions/checkout#1289</a>
based on <a
href="https://redirect.github.com/actions/checkout/issues/1286">#1286</a>
by <a href="https://github.com/1newsr"><code>@​1newsr</code></a></li>
</ul>
<p><strong>Full Changelog</strong>: <a
href="https://github.com/actions/checkout/compare/v3.5.1...v3.5.2">https://github.com/actions/checkout/compare/v3.5.1...v3.5.2</a></p>
<h2>v3.5.1</h2>
<h2>What's Changed</h2>
<ul>
<li>Improve checkout performance on Windows runners by upgrading
<code>@​actions/github</code> dependency by <a
href="https://github.com/BrettDong"><code>@​BrettDong</code></a> in <a
href="https://redirect.github.com/actions/checkout/pull/1246">actions/checkout#1246</a></li>
</ul>
<h2>New Contributors</h2>
<ul>
<li><a href="https://github.com/BrettDong"><code>@​BrettDong</code></a>
made their first contribution in <a
href="https://redirect.github.com/actions/checkout/pull/1246">actions/checkout#1246</a></li>
</ul>
<!-- raw HTML omitted -->
</blockquote>
<p>... (truncated)</p>
</details>
<details>
<summary>Changelog</summary>
<p><em>Sourced from <a
href="https://github.com/actions/checkout/blob/main/CHANGELOG.md">actions/checkout's
changelog</a>.</em></p>
<blockquote>
<h1>Changelog</h1>
<h2>v4.0.0</h2>
<ul>
<li><a
href="https://redirect.github.com/actions/checkout/pull/1067">Support
fetching without the --progress option</a></li>
<li><a
href="https://redirect.github.com/actions/checkout/pull/1436">Update to
node20</a></li>
</ul>
<h2>v3.6.0</h2>
<ul>
<li><a
href="https://redirect.github.com/actions/checkout/pull/1377">Fix: Mark
test scripts with Bash'isms to be run via Bash</a></li>
<li><a href="https://redirect.github.com/actions/checkout/pull/579">Add
option to fetch tags even if fetch-depth &gt; 0</a></li>
</ul>
<h2>v3.5.3</h2>
<ul>
<li><a
href="https://redirect.github.com/actions/checkout/pull/1196">Fix:
Checkout fail in self-hosted runners when faulty submodule are
checked-in</a></li>
<li><a href="https://redirect.github.com/actions/checkout/pull/1287">Fix
typos found by codespell</a></li>
<li><a href="https://redirect.github.com/actions/checkout/pull/1369">Add
support for sparse checkouts</a></li>
</ul>
<h2>v3.5.2</h2>
<ul>
<li><a href="https://redirect.github.com/actions/checkout/pull/1289">Fix
api endpoint for GHES</a></li>
</ul>
<h2>v3.5.1</h2>
<ul>
<li><a href="https://redirect.github.com/actions/checkout/pull/1246">Fix
slow checkout on Windows</a></li>
</ul>
<h2>v3.5.0</h2>
<ul>
<li><a href="https://redirect.github.com/actions/checkout/pull/1237">Add
new public key for known_hosts</a></li>
</ul>
<h2>v3.4.0</h2>
<ul>
<li><a
href="https://redirect.github.com/actions/checkout/pull/1209">Upgrade
codeql actions to v2</a></li>
<li><a
href="https://redirect.github.com/actions/checkout/pull/1210">Upgrade
dependencies</a></li>
<li><a
href="https://redirect.github.com/actions/checkout/pull/1225">Upgrade
<code>@​actions/io</code></a></li>
</ul>
<h2>v3.3.0</h2>
<ul>
<li><a
href="https://redirect.github.com/actions/checkout/pull/1045">Implement
branch list using callbacks from exec function</a></li>
<li><a href="https://redirect.github.com/actions/checkout/pull/1050">Add
in explicit reference to private checkout options</a></li>
<li>[Fix comment typos (that got added in <a
href="https://redirect.github.com/actions/checkout/issues/770">#770</a>)](<a
href="https://redirect.github.com/actions/checkout/pull/1057">actions/checkout#1057</a>)</li>
</ul>
<h2>v3.2.0</h2>
<ul>
<li><a href="https://redirect.github.com/actions/checkout/pull/942">Add
GitHub Action to perform release</a></li>
<li><a href="https://redirect.github.com/actions/checkout/pull/967">Fix
status badge</a></li>
<li><a
href="https://redirect.github.com/actions/checkout/pull/1002">Replace
datadog/squid with ubuntu/squid Docker image</a></li>
<li><a href="https://redirect.github.com/actions/checkout/pull/964">Wrap
pipeline commands for submoduleForeach in quotes</a></li>
<li><a
href="https://redirect.github.com/actions/checkout/pull/1029">Update
<code>@​actions/io</code> to 1.1.2</a></li>
<li><a
href="https://redirect.github.com/actions/checkout/pull/1039">Upgrading
version to 3.2.0</a></li>
</ul>
<h2>v3.1.0</h2>
<ul>
<li><a href="https://redirect.github.com/actions/checkout/pull/939">Use
<code>@​actions/core</code> <code>saveState</code> and
<code>getState</code></a></li>
<li><a href="https://redirect.github.com/actions/checkout/pull/922">Add
<code>github-server-url</code> input</a></li>
</ul>
<h2>v3.0.2</h2>
<ul>
<li><a href="https://redirect.github.com/actions/checkout/pull/770">Add
input <code>set-safe-directory</code></a></li>
</ul>
<h2>v3.0.1</h2>
<!-- raw HTML omitted -->
</blockquote>
<p>... (truncated)</p>
</details>
<details>
<summary>Commits</summary>
<ul>
<li><a
href="https://github.com/actions/checkout/commit/3df4ab11eba7bda6032a0b82a6bb43b11571feac"><code>3df4ab1</code></a>
Release 4.0.0 (<a
href="https://redirect.github.com/actions/checkout/issues/1447">#1447</a>)</li>
<li><a
href="https://github.com/actions/checkout/commit/8b5e8b768746b50394015010d25e690bfab9dfbc"><code>8b5e8b7</code></a>
Support fetching without the --progress option (<a
href="https://redirect.github.com/actions/checkout/issues/1067">#1067</a>)</li>
<li><a
href="https://github.com/actions/checkout/commit/97a652b80035363df47baee5031ec8670b8878ac"><code>97a652b</code></a>
Update default runtime to node20 (<a
href="https://redirect.github.com/actions/checkout/issues/1436">#1436</a>)</li>
<li>See full diff in <a
href="https://github.com/actions/checkout/compare/v3...v4">compare
view</a></li>
</ul>
</details>
<br />


[![Dependabot compatibility
score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=actions/checkout&package-manager=github_actions&previous-version=3&new-version=4)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores)

Dependabot will resolve any conflicts with this PR as long as you don't
alter it yourself. You can also trigger a rebase manually by commenting
`@dependabot rebase`.

[//]: # (dependabot-automerge-start)
[//]: # (dependabot-automerge-end)

---

<details>
<summary>Dependabot commands and options</summary>
<br />

You can trigger Dependabot actions by commenting on this PR:
- `@dependabot rebase` will rebase this PR
- `@dependabot recreate` will recreate this PR, overwriting any edits
that have been made to it
- `@dependabot merge` will merge this PR after your CI passes on it
- `@dependabot squash and merge` will squash and merge this PR after
your CI passes on it
- `@dependabot cancel merge` will cancel a previously requested merge
and block automerging
- `@dependabot reopen` will reopen this PR if it is closed
- `@dependabot close` will close this PR and stop Dependabot recreating
it. You can achieve the same result by closing it manually
- `@dependabot show <dependency name> ignore conditions` will show all
of the ignore conditions of the specified dependency
- `@dependabot ignore this major version` will close this PR and stop
Dependabot creating any more for this major version (unless you reopen
the PR or upgrade to it yourself)
- `@dependabot ignore this minor version` will close this PR and stop
Dependabot creating any more for this minor version (unless you reopen
the PR or upgrade to it yourself)
- `@dependabot ignore this dependency` will close this PR and stop
Dependabot creating any more for this dependency (unless you reopen the
PR or upgrade to it yourself)


</details>

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
---
 .github/workflows/build_cc.yml    | 2 +-
 .github/workflows/build_wheel.yml | 6 +++---
 .github/workflows/package_c.yml   | 4 ++--
 .github/workflows/test_cc.yml     | 2 +-
 4 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/.github/workflows/build_cc.yml b/.github/workflows/build_cc.yml
index c2d2613464..500b305ba9 100644
--- a/.github/workflows/build_cc.yml
+++ b/.github/workflows/build_cc.yml
@@ -20,7 +20,7 @@ jobs:
         - variant: clang
           dp_variant: clang
     steps:
-    - uses: actions/checkout@v3
+    - uses: actions/checkout@v4
       with:
         submodules: true
     - uses: actions/setup-python@v4
diff --git a/.github/workflows/build_wheel.yml b/.github/workflows/build_wheel.yml
index e47d753f1c..3e3a265159 100644
--- a/.github/workflows/build_wheel.yml
+++ b/.github/workflows/build_wheel.yml
@@ -38,7 +38,7 @@ jobs:
             platform_id: manylinux_aarch64
             dp_variant: cpu
     steps:
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
         with:
           submodules: true
           # https://github.com/pypa/setuptools_scm/issues/480
@@ -60,7 +60,7 @@ jobs:
     name: Build source distribution
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
         with:
           submodules: true
       - uses: actions/setup-python@v4
@@ -94,7 +94,7 @@ jobs:
     needs: [build_wheels]
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
       - uses: actions/download-artifact@v3
         with:
           name: artifact
diff --git a/.github/workflows/package_c.yml b/.github/workflows/package_c.yml
index d3be4c9469..ada205be00 100644
--- a/.github/workflows/package_c.yml
+++ b/.github/workflows/package_c.yml
@@ -9,7 +9,7 @@ jobs:
     name: Build C library
     runs-on: ubuntu-22.04
     steps:
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
       - name: Package C library
         run: ./source/install/docker_package_c.sh
       # for download and debug
@@ -30,7 +30,7 @@ jobs:
     needs: [build_c]
     runs-on: ubuntu-22.04
     steps:
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
       - name: Download artifact
         uses: actions/download-artifact@v3
         with:
diff --git a/.github/workflows/test_cc.yml b/.github/workflows/test_cc.yml
index c53921bf60..9404f12937 100644
--- a/.github/workflows/test_cc.yml
+++ b/.github/workflows/test_cc.yml
@@ -7,7 +7,7 @@ jobs:
     name: Test C++
     runs-on: ubuntu-latest
     steps:
-    - uses: actions/checkout@v3
+    - uses: actions/checkout@v4
     - uses: actions/setup-python@v4
       with:
         python-version: '3.11'

From a8b31237732ed04841ba48e7304f0edfaf3fd518 Mon Sep 17 00:00:00 2001
From: Jinzhe Zeng <jinzhe.zeng@rutgers.edu>
Date: Wed, 13 Sep 2023 21:45:08 -0400
Subject: [PATCH 23/63] `ndarray.tostring` -> `ndarray.tobytes` (#2814)

`tostring` has been deprecated. See
https://numpy.org/devdocs/reference/generated/numpy.ndarray.tostring.html

Signed-off-by: Jinzhe Zeng <jinzhe.zeng@rutgers.edu>
---
 deepmd/entrypoints/transfer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/deepmd/entrypoints/transfer.py b/deepmd/entrypoints/transfer.py
index dc580fbe0a..535b32ec09 100644
--- a/deepmd/entrypoints/transfer.py
+++ b/deepmd/entrypoints/transfer.py
@@ -196,7 +196,7 @@ def from_array(
         )
 
     def from_str(self, tensor: np.ndarray):
-        self.node.attr["value"].tensor.tensor_content = tensor.tostring()
+        self.node.attr["value"].tensor.tensor_content = tensor.tobytes()
 
 
 def load_tensor(node: tf.Tensor, dtype_old: type, dtype_new: type) -> np.ndarray:

From 58dd3e2449dba7719d5d6921ddd779bfbfc6005e Mon Sep 17 00:00:00 2001
From: Jinzhe Zeng <jinzhe.zeng@rutgers.edu>
Date: Wed, 13 Sep 2023 21:45:37 -0400
Subject: [PATCH 24/63] `tf.accumulate_n` -> `tf.add_n` (#2815)

`tf.accumulate_n` has been deprecated. According to the TensorFlow source
code, it has been just a wrapper around `tf.add_n` since TF 2.1
(https://github.com/tensorflow/tensorflow/commit/292d3094313136b77bb5f444561bc3ffc529b246).

See:
https://www.tensorflow.org/api_docs/python/tf/math/accumulate_n
https://www.tensorflow.org/api_docs/python/tf/math/add_n

Signed-off-by: Jinzhe Zeng <jinzhe.zeng@rutgers.edu>
---
 deepmd/descriptor/se_a.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/deepmd/descriptor/se_a.py b/deepmd/descriptor/se_a.py
index 1349f61464..82df8cc1a3 100644
--- a/deepmd/descriptor/se_a.py
+++ b/deepmd/descriptor/se_a.py
@@ -1054,8 +1054,8 @@ def _filter(
                         # add zero is meaningless; skip
                         rets.append(ret)
                     start_index += self.sel_a[type_i]
-                # faster to use accumulate_n than multiple add
-                xyz_scatter_1 = tf.accumulate_n(rets)
+                # faster to use add_n than multiple add
+                xyz_scatter_1 = tf.add_n(rets)
             else:
                 xyz_scatter_1 = self._filter_lower(
                     type_i,

From 7da9aaf075e8dea1eca9a08f99f3917235b55e3b Mon Sep 17 00:00:00 2001
From: Jinzhe Zeng <jinzhe.zeng@rutgers.edu>
Date: Wed, 13 Sep 2023 21:46:43 -0400
Subject: [PATCH 25/63] `tf.test.TestCase.test_session` ->
 `tf.test.TestCase.cached_session` (#2816)

`tf.test.TestCase.test_session` has been deprecated since TF 1.11. We kept
using it while we still tested against TF 1.8; it is now safe to replace it.
---
 source/tests/test_activation_fn_gelu.py    |  2 +-
 source/tests/test_data_large_batch.py      |  6 +++---
 source/tests/test_data_modifier.py         |  2 +-
 source/tests/test_data_modifier_shuffle.py |  2 +-
 source/tests/test_descrpt_hybrid.py        |  2 +-
 source/tests/test_descrpt_nonsmth.py       | 10 +++++-----
 source/tests/test_descrpt_se_a_mask.py     |  2 +-
 source/tests/test_descrpt_se_a_type.py     |  4 ++--
 source/tests/test_descrpt_se_atten.py      |  8 ++++----
 source/tests/test_descrpt_se_r.py          | 10 +++++-----
 source/tests/test_descrpt_sea_ef.py        |  2 +-
 source/tests/test_descrpt_sea_ef_para.py   |  2 +-
 source/tests/test_descrpt_sea_ef_rot.py    |  2 +-
 source/tests/test_descrpt_sea_ef_vert.py   |  2 +-
 source/tests/test_descrpt_smooth.py        | 10 +++++-----
 source/tests/test_dipole_se_a.py           |  2 +-
 source/tests/test_dipole_se_a_tebd.py      |  2 +-
 source/tests/test_embedding_net.py         |  2 +-
 source/tests/test_ewald.py                 |  6 +++---
 source/tests/test_fitting_dos.py           |  2 +-
 source/tests/test_fitting_ener_type.py     |  2 +-
 source/tests/test_layer_name.py            |  2 +-
 source/tests/test_linear_model.py          |  2 +-
 source/tests/test_model_dos.py             |  2 +-
 source/tests/test_model_loc_frame.py       |  2 +-
 source/tests/test_model_multi.py           |  2 +-
 source/tests/test_model_se_a.py            |  6 +++---
 source/tests/test_model_se_a_aparam.py     |  2 +-
 source/tests/test_model_se_a_ebd.py        |  2 +-
 source/tests/test_model_se_a_fparam.py     |  2 +-
 source/tests/test_model_se_a_srtab.py      |  2 +-
 source/tests/test_model_se_a_type.py       |  2 +-
 source/tests/test_model_se_atten.py        | 12 ++++++------
 source/tests/test_model_se_r.py            |  2 +-
 source/tests/test_model_se_t.py            |  2 +-
 source/tests/test_model_spin.py            |  2 +-
 source/tests/test_nvnmd_entrypoints.py     |  6 +++---
 source/tests/test_nvnmd_op.py              | 20 ++++++++++----------
 source/tests/test_pairwise_dprc.py         |  2 +-
 source/tests/test_polar_se_a.py            |  2 +-
 source/tests/test_polar_se_a_tebd.py       |  2 +-
 source/tests/test_prod_env_mat.py          |  2 +-
 source/tests/test_prod_force.py            |  2 +-
 source/tests/test_prod_force_grad.py       |  2 +-
 source/tests/test_prod_virial.py           |  2 +-
 source/tests/test_prod_virial_grad.py      |  2 +-
 source/tests/test_tab_nonsmth.py           |  2 +-
 source/tests/test_tab_smooth.py            |  2 +-
 source/tests/test_type_embed.py            |  4 ++--
 source/tests/test_type_one_side.py         |  4 ++--
 50 files changed, 90 insertions(+), 90 deletions(-)

diff --git a/source/tests/test_activation_fn_gelu.py b/source/tests/test_activation_fn_gelu.py
index 6ecbd0154f..b1c30eeefc 100644
--- a/source/tests/test_activation_fn_gelu.py
+++ b/source/tests/test_activation_fn_gelu.py
@@ -17,7 +17,7 @@
 class TestGelu(tf.test.TestCase):
     def setUp(self):
         self.places = 6
-        self.sess = self.test_session().__enter__()
+        self.sess = self.cached_session().__enter__()
         self.inputs = tf.reshape(
             tf.constant([0.0, 1.0, 2.0, 3.0], dtype=tf.float64), [-1, 1]
         )
diff --git a/source/tests/test_data_large_batch.py b/source/tests/test_data_large_batch.py
index 3ae46e8cb9..5750f956f8 100644
--- a/source/tests/test_data_large_batch.py
+++ b/source/tests/test_data_large_batch.py
@@ -180,7 +180,7 @@ def test_data_mixed_type(self):
             is_training: False,
         }
 
-        sess = self.test_session().__enter__()
+        sess = self.cached_session().__enter__()
         sess.run(tf.global_variables_initializer())
         [e, f, v] = sess.run([energy, force, virial], feed_dict=feed_dict_test)
         # print(sess.run(model.type_embedding))
@@ -376,7 +376,7 @@ def test_stripped_data_mixed_type(self):
             is_training: False,
         }
 
-        sess = self.test_session().__enter__()
+        sess = self.cached_session().__enter__()
         sess.run(tf.global_variables_initializer())
         [e, f, v] = sess.run([energy, force, virial], feed_dict=feed_dict_test)
         # print(sess.run(model.type_embedding))
@@ -572,7 +572,7 @@ def test_compressible_data_mixed_type(self):
             is_training: False,
         }
 
-        sess = self.test_session().__enter__()
+        sess = self.cached_session().__enter__()
         sess.run(tf.global_variables_initializer())
         [e, f, v] = sess.run([energy, force, virial], feed_dict=feed_dict_test)
         # print(sess.run(model.type_embedding))
diff --git a/source/tests/test_data_modifier.py b/source/tests/test_data_modifier.py
index dfc602fd92..368a60d68a 100644
--- a/source/tests/test_data_modifier.py
+++ b/source/tests/test_data_modifier.py
@@ -80,7 +80,7 @@ def _setUp(self):
         model.build(data)
 
         # freeze the graph
-        with self.test_session() as sess:
+        with self.cached_session() as sess:
             init_op = tf.global_variables_initializer()
             sess.run(init_op)
             graph = tf.get_default_graph()
diff --git a/source/tests/test_data_modifier_shuffle.py b/source/tests/test_data_modifier_shuffle.py
index 151caa9e16..9ddbb8ee29 100644
--- a/source/tests/test_data_modifier_shuffle.py
+++ b/source/tests/test_data_modifier_shuffle.py
@@ -81,7 +81,7 @@ def _setUp(self):
         model.build(data)
 
         # freeze the graph
-        with self.test_session() as sess:
+        with self.cached_session() as sess:
             init_op = tf.global_variables_initializer()
             sess.run(init_op)
             graph = tf.get_default_graph()
diff --git a/source/tests/test_descrpt_hybrid.py b/source/tests/test_descrpt_hybrid.py
index ed39c04307..317f6ea5a0 100644
--- a/source/tests/test_descrpt_hybrid.py
+++ b/source/tests/test_descrpt_hybrid.py
@@ -115,7 +115,7 @@ def test_descriptor_hybrid(self):
             is_training: False,
         }
 
-        sess = self.test_session().__enter__()
+        sess = self.cached_session().__enter__()
         sess.run(tf.global_variables_initializer())
         [model_dout] = sess.run([dout], feed_dict=feed_dict_test)
 
diff --git a/source/tests/test_descrpt_nonsmth.py b/source/tests/test_descrpt_nonsmth.py
index 1d503e6c8c..fd3bb0b2f7 100644
--- a/source/tests/test_descrpt_nonsmth.py
+++ b/source/tests/test_descrpt_nonsmth.py
@@ -160,7 +160,7 @@ class TestNonSmooth(Inter, tf.test.TestCase):
     def setUp(self):
         self.places = 5
         data = Data()
-        Inter.setUp(self, data, sess=self.test_session().__enter__())
+        Inter.setUp(self, data, sess=self.cached_session().__enter__())
 
     def test_force(self):
         force_test(self, self, suffix="_se")
@@ -180,8 +180,8 @@ def test_pbc(self):
         data = Data()
         inter0 = Inter()
         inter1 = Inter()
-        inter0.setUp(data, pbc=True, sess=self.test_session().__enter__())
-        inter1.setUp(data, pbc=False, sess=self.test_session().__enter__())
+        inter0.setUp(data, pbc=True, sess=self.cached_session().__enter__())
+        inter1.setUp(data, pbc=False, sess=self.cached_session().__enter__())
         inter0.net_w_i = np.copy(np.ones(inter0.ndescrpt))
         inter1.net_w_i = np.copy(np.ones(inter1.ndescrpt))
 
@@ -233,8 +233,8 @@ def test_pbc_small_box(self):
         data1 = Data(box_scale=2)
         inter0 = Inter()
         inter1 = Inter()
-        inter0.setUp(data0, pbc=True, sess=self.test_session().__enter__())
-        inter1.setUp(data1, pbc=False, sess=self.test_session().__enter__())
+        inter0.setUp(data0, pbc=True, sess=self.cached_session().__enter__())
+        inter1.setUp(data1, pbc=False, sess=self.cached_session().__enter__())
         inter0.net_w_i = np.copy(np.ones(inter0.ndescrpt))
         inter1.net_w_i = np.copy(np.ones(inter1.ndescrpt))
 
diff --git a/source/tests/test_descrpt_se_a_mask.py b/source/tests/test_descrpt_se_a_mask.py
index 30c514a2cc..85cd1cc2a1 100644
--- a/source/tests/test_descrpt_se_a_mask.py
+++ b/source/tests/test_descrpt_se_a_mask.py
@@ -277,7 +277,7 @@ def test_descriptor_se_a_mask(self):
             t_aparam: test_data["aparam"][:numb_test, :],
             is_training: False,
         }
-        sess = self.test_session().__enter__()
+        sess = self.cached_session().__enter__()
         sess.run(tf.global_variables_initializer())
         [op_dout] = sess.run([dout], feed_dict=feed_dict_test)
         op_dout = op_dout.reshape([-1])
diff --git a/source/tests/test_descrpt_se_a_type.py b/source/tests/test_descrpt_se_a_type.py
index b10920b1d4..aeab18f149 100644
--- a/source/tests/test_descrpt_se_a_type.py
+++ b/source/tests/test_descrpt_se_a_type.py
@@ -120,7 +120,7 @@ def test_descriptor_two_sides(self):
             is_training: False,
         }
 
-        sess = self.test_session().__enter__()
+        sess = self.cached_session().__enter__()
         sess.run(tf.global_variables_initializer())
         [model_dout] = sess.run([dout], feed_dict=feed_dict_test)
         model_dout = model_dout.reshape([-1])
@@ -284,7 +284,7 @@ def test_descriptor_one_side(self):
             is_training: False,
         }
 
-        sess = self.test_session().__enter__()
+        sess = self.cached_session().__enter__()
         sess.run(tf.global_variables_initializer())
         [model_dout] = sess.run([dout], feed_dict=feed_dict_test)
         model_dout = model_dout.reshape([-1])
diff --git a/source/tests/test_descrpt_se_atten.py b/source/tests/test_descrpt_se_atten.py
index e49e6ab3e2..76df651a46 100644
--- a/source/tests/test_descrpt_se_atten.py
+++ b/source/tests/test_descrpt_se_atten.py
@@ -141,7 +141,7 @@ def test_descriptor_two_sides(self):
             is_training: False,
         }
 
-        sess = self.test_session().__enter__()
+        sess = self.cached_session().__enter__()
         sess.run(tf.global_variables_initializer())
         [model_dout] = sess.run([dout], feed_dict=feed_dict_test)
         model_dout = model_dout.reshape([-1])
@@ -318,7 +318,7 @@ def test_descriptor_one_side(self):
             is_training: False,
         }
 
-        sess = self.test_session().__enter__()
+        sess = self.cached_session().__enter__()
         sess.run(tf.global_variables_initializer())
         [model_dout] = sess.run([dout], feed_dict=feed_dict_test)
         model_dout = model_dout.reshape([-1])
@@ -488,7 +488,7 @@ def test_stripped_type_embedding_descriptor_two_sides(self):
             is_training: False,
         }
 
-        sess = self.test_session().__enter__()
+        sess = self.cached_session().__enter__()
         sess.run(tf.global_variables_initializer())
         [model_dout] = sess.run([dout], feed_dict=feed_dict_test)
         model_dout = model_dout.reshape([-1])
@@ -666,7 +666,7 @@ def test_compressible_descriptor_two_sides(self):
             is_training: False,
         }
 
-        sess = self.test_session().__enter__()
+        sess = self.cached_session().__enter__()
         sess.run(tf.global_variables_initializer())
         [model_dout] = sess.run([dout], feed_dict=feed_dict_test)
         model_dout = model_dout.reshape([-1])
diff --git a/source/tests/test_descrpt_se_r.py b/source/tests/test_descrpt_se_r.py
index c20515a5fa..779954a545 100644
--- a/source/tests/test_descrpt_se_r.py
+++ b/source/tests/test_descrpt_se_r.py
@@ -135,7 +135,7 @@ class TestSmooth(Inter, tf.test.TestCase):
     def setUp(self):
         self.places = 5
         data = Data()
-        Inter.setUp(self, data, sess=self.test_session().__enter__())
+        Inter.setUp(self, data, sess=self.cached_session().__enter__())
 
     def test_force(self):
         force_test(self, self, suffix="_se_r")
@@ -155,8 +155,8 @@ def test_pbc(self):
         data = Data()
         inter0 = Inter()
         inter1 = Inter()
-        inter0.setUp(data, pbc=True, sess=self.test_session().__enter__())
-        inter1.setUp(data, pbc=False, sess=self.test_session().__enter__())
+        inter0.setUp(data, pbc=True, sess=self.cached_session().__enter__())
+        inter1.setUp(data, pbc=False, sess=self.cached_session().__enter__())
         inter0.net_w_i = np.copy(np.ones(inter0.ndescrpt))
         inter1.net_w_i = np.copy(np.ones(inter1.ndescrpt))
 
@@ -208,8 +208,8 @@ def test_pbc_small_box(self):
         data1 = Data(box_scale=2)
         inter0 = Inter()
         inter1 = Inter()
-        inter0.setUp(data0, pbc=True, sess=self.test_session().__enter__())
-        inter1.setUp(data1, pbc=False, sess=self.test_session().__enter__())
+        inter0.setUp(data0, pbc=True, sess=self.cached_session().__enter__())
+        inter1.setUp(data1, pbc=False, sess=self.cached_session().__enter__())
         inter0.net_w_i = np.copy(np.ones(inter0.ndescrpt))
         inter1.net_w_i = np.copy(np.ones(inter1.ndescrpt))
 
diff --git a/source/tests/test_descrpt_sea_ef.py b/source/tests/test_descrpt_sea_ef.py
index e39afec97e..efd86854c7 100644
--- a/source/tests/test_descrpt_sea_ef.py
+++ b/source/tests/test_descrpt_sea_ef.py
@@ -154,7 +154,7 @@ class TestSmooth(Inter, tf.test.TestCase):
     def setUp(self):
         self.places = 5
         data = Data()
-        Inter.setUp(self, data, sess=self.test_session().__enter__())
+        Inter.setUp(self, data, sess=self.cached_session().__enter__())
 
     def test_force(self):
         force_test(self, self, suffix="_sea_ef")
diff --git a/source/tests/test_descrpt_sea_ef_para.py b/source/tests/test_descrpt_sea_ef_para.py
index 1ddcc4e196..1a109013cb 100644
--- a/source/tests/test_descrpt_sea_ef_para.py
+++ b/source/tests/test_descrpt_sea_ef_para.py
@@ -154,7 +154,7 @@ class TestSmooth(Inter, tf.test.TestCase):
     def setUp(self):
         self.places = 5
         data = Data()
-        Inter.setUp(self, data, sess=self.test_session().__enter__())
+        Inter.setUp(self, data, sess=self.cached_session().__enter__())
 
     def test_force(self):
         force_test(self, self, suffix="_sea_ef_para")
diff --git a/source/tests/test_descrpt_sea_ef_rot.py b/source/tests/test_descrpt_sea_ef_rot.py
index 10553b878d..d94565af96 100644
--- a/source/tests/test_descrpt_sea_ef_rot.py
+++ b/source/tests/test_descrpt_sea_ef_rot.py
@@ -17,7 +17,7 @@
 
 class TestEfRot(tf.test.TestCase):
     def setUp(self):
-        self.sess = self.test_session().__enter__()
+        self.sess = self.cached_session().__enter__()
         self.natoms = [5, 5, 2, 3]
         self.ntypes = 2
         self.sel_a = [12, 24]
diff --git a/source/tests/test_descrpt_sea_ef_vert.py b/source/tests/test_descrpt_sea_ef_vert.py
index dcbc418720..77ffb3150c 100644
--- a/source/tests/test_descrpt_sea_ef_vert.py
+++ b/source/tests/test_descrpt_sea_ef_vert.py
@@ -154,7 +154,7 @@ class TestSmooth(Inter, tf.test.TestCase):
     def setUp(self):
         self.places = 5
         data = Data()
-        Inter.setUp(self, data, sess=self.test_session().__enter__())
+        Inter.setUp(self, data, sess=self.cached_session().__enter__())
 
     def test_force(self):
         force_test(self, self, suffix="_sea_ef_vert")
diff --git a/source/tests/test_descrpt_smooth.py b/source/tests/test_descrpt_smooth.py
index aa0730cdea..59076e366e 100644
--- a/source/tests/test_descrpt_smooth.py
+++ b/source/tests/test_descrpt_smooth.py
@@ -153,7 +153,7 @@ class TestSmooth(Inter, tf.test.TestCase):
     def setUp(self):
         self.places = 5
         data = Data()
-        Inter.setUp(self, data, sess=self.test_session().__enter__())
+        Inter.setUp(self, data, sess=self.cached_session().__enter__())
 
     def test_force(self):
         force_test(self, self, suffix="_smth")
@@ -173,8 +173,8 @@ def test_pbc(self):
         data = Data()
         inter0 = Inter()
         inter1 = Inter()
-        inter0.setUp(data, pbc=True, sess=self.test_session().__enter__())
-        inter1.setUp(data, pbc=False, sess=self.test_session().__enter__())
+        inter0.setUp(data, pbc=True, sess=self.cached_session().__enter__())
+        inter1.setUp(data, pbc=False, sess=self.cached_session().__enter__())
         inter0.net_w_i = np.copy(np.ones(inter0.ndescrpt))
         inter1.net_w_i = np.copy(np.ones(inter1.ndescrpt))
 
@@ -226,8 +226,8 @@ def test_pbc_small_box(self):
         data1 = Data(box_scale=2)
         inter0 = Inter()
         inter1 = Inter()
-        inter0.setUp(data0, pbc=True, sess=self.test_session().__enter__())
-        inter1.setUp(data1, pbc=False, sess=self.test_session().__enter__())
+        inter0.setUp(data0, pbc=True, sess=self.cached_session().__enter__())
+        inter1.setUp(data1, pbc=False, sess=self.cached_session().__enter__())
         inter0.net_w_i = np.copy(np.ones(inter0.ndescrpt))
         inter1.net_w_i = np.copy(np.ones(inter1.ndescrpt))
 
diff --git a/source/tests/test_dipole_se_a.py b/source/tests/test_dipole_se_a.py
index 4e2fa9b30d..687e68c2be 100644
--- a/source/tests/test_dipole_se_a.py
+++ b/source/tests/test_dipole_se_a.py
@@ -111,7 +111,7 @@ def test_model(self):
             is_training: False,
         }
 
-        sess = self.test_session().__enter__()
+        sess = self.cached_session().__enter__()
         sess.run(tf.global_variables_initializer())
         [p, gp] = sess.run([dipole, gdipole], feed_dict=feed_dict_test)
 
diff --git a/source/tests/test_dipole_se_a_tebd.py b/source/tests/test_dipole_se_a_tebd.py
index f848526735..4b2e6d0688 100644
--- a/source/tests/test_dipole_se_a_tebd.py
+++ b/source/tests/test_dipole_se_a_tebd.py
@@ -129,7 +129,7 @@ def test_model(self):
             is_training: False,
         }
 
-        sess = self.test_session().__enter__()
+        sess = self.cached_session().__enter__()
         sess.run(tf.global_variables_initializer())
         [p, gp] = sess.run([dipole, gdipole], feed_dict=feed_dict_test)
 
diff --git a/source/tests/test_embedding_net.py b/source/tests/test_embedding_net.py
index f09ef74948..1b8c68c089 100644
--- a/source/tests/test_embedding_net.py
+++ b/source/tests/test_embedding_net.py
@@ -13,7 +13,7 @@
 
 class Inter(tf.test.TestCase):
     def setUp(self):
-        self.sess = self.test_session().__enter__()
+        self.sess = self.cached_session().__enter__()
         self.inputs = tf.constant([0.0, 1.0, 2.0], dtype=tf.float64)
         self.ndata = 3
         self.inputs = tf.reshape(self.inputs, [-1, 1])
diff --git a/source/tests/test_ewald.py b/source/tests/test_ewald.py
index b6b925f801..ef2ace39a4 100644
--- a/source/tests/test_ewald.py
+++ b/source/tests/test_ewald.py
@@ -64,7 +64,7 @@ def setUp(self):
     def test_py_interface(self):
         hh = 1e-4
         places = 4
-        sess = self.test_session().__enter__()
+        sess = self.cached_session().__enter__()
         t_energy, t_force, t_virial = op_module.ewald_recp(
             self.coord,
             self.charge,
@@ -91,7 +91,7 @@ def test_py_interface(self):
     def test_force(self):
         hh = 1e-4
         places = 6
-        sess = self.test_session().__enter__()
+        sess = self.cached_session().__enter__()
         t_energy, t_force, t_virial = op_module.ewald_recp(
             self.coord,
             self.charge,
@@ -144,7 +144,7 @@ def test_force(self):
     def test_virial(self):
         hh = 1e-4
         places = 6
-        sess = self.test_session().__enter__()
+        sess = self.cached_session().__enter__()
         t_energy, t_force, t_virial = op_module.ewald_recp(
             self.coord,
             self.charge,
diff --git a/source/tests/test_fitting_dos.py b/source/tests/test_fitting_dos.py
index 95de81c32c..60a0ee4158 100644
--- a/source/tests/test_fitting_dos.py
+++ b/source/tests/test_fitting_dos.py
@@ -180,7 +180,7 @@ def test_fitting(self):
             is_training: False,
         }
 
-        sess = self.test_session().__enter__()
+        sess = self.cached_session().__enter__()
         sess.run(tf.global_variables_initializer())
         [pred_atom_dos] = sess.run([atom_dos], feed_dict=feed_dict_test)
 
diff --git a/source/tests/test_fitting_ener_type.py b/source/tests/test_fitting_ener_type.py
index 54621b634a..42190ef557 100644
--- a/source/tests/test_fitting_ener_type.py
+++ b/source/tests/test_fitting_ener_type.py
@@ -188,7 +188,7 @@ def test_fitting(self):
             is_training: False,
         }
 
-        sess = self.test_session().__enter__()
+        sess = self.cached_session().__enter__()
         sess.run(tf.global_variables_initializer())
         [pred_atom_ener] = sess.run([atom_ener], feed_dict=feed_dict_test)
 
diff --git a/source/tests/test_layer_name.py b/source/tests/test_layer_name.py
index 6de4a09736..c6a2f0b09c 100644
--- a/source/tests/test_layer_name.py
+++ b/source/tests/test_layer_name.py
@@ -137,7 +137,7 @@ def test_model(self):
             is_training: False,
         }
 
-        with self.test_session() as sess:
+        with self.cached_session() as sess:
             sess.run(tf.global_variables_initializer())
             [e1, f1, v1, e2, f2, v2] = sess.run(
                 [e_energy1, e_force1, e_virial1, e_energy2, e_force2, e_virial2],
diff --git a/source/tests/test_linear_model.py b/source/tests/test_linear_model.py
index 13a2bc4850..21f0f6efc8 100644
--- a/source/tests/test_linear_model.py
+++ b/source/tests/test_linear_model.py
@@ -94,7 +94,7 @@ def test_linear_ener_model(self):
             t_mesh: test_data["default_mesh"],
             is_training: False,
         }
-        sess = self.test_session().__enter__()
+        sess = self.cached_session().__enter__()
         sess.run(tf.global_variables_initializer())
         [e, f, v] = sess.run([energy, force, virial], feed_dict=feed_dict_test)
         e = np.reshape(e, [1, -1])
diff --git a/source/tests/test_model_dos.py b/source/tests/test_model_dos.py
index 3562a5b9f9..c7160d4dda 100644
--- a/source/tests/test_model_dos.py
+++ b/source/tests/test_model_dos.py
@@ -116,7 +116,7 @@ def test_model(self):
             is_training: False,
         }
 
-        sess = self.test_session().__enter__()
+        sess = self.cached_session().__enter__()
         sess.run(tf.global_variables_initializer())
         [pred_dos, pred_atom_dos] = sess.run([dos, atom_dos], feed_dict=feed_dict_test)
 
diff --git a/source/tests/test_model_loc_frame.py b/source/tests/test_model_loc_frame.py
index ed0fc3815a..c493013316 100644
--- a/source/tests/test_model_loc_frame.py
+++ b/source/tests/test_model_loc_frame.py
@@ -114,7 +114,7 @@ def test_model(self):
             is_training: False,
         }
 
-        sess = self.test_session().__enter__()
+        sess = self.cached_session().__enter__()
         sess.run(tf.global_variables_initializer())
         [e, f, v] = sess.run([energy, force, virial], feed_dict=feed_dict_test)
 
diff --git a/source/tests/test_model_multi.py b/source/tests/test_model_multi.py
index 384f1e0553..9017da22e7 100644
--- a/source/tests/test_model_multi.py
+++ b/source/tests/test_model_multi.py
@@ -141,7 +141,7 @@ def test_model(self):
             t_mesh: test_data["default_mesh"],
             is_training: False,
         }
-        sess = self.test_session().__enter__()
+        sess = self.cached_session().__enter__()
 
         # test water energy
         sess.run(tf.global_variables_initializer())
diff --git a/source/tests/test_model_se_a.py b/source/tests/test_model_se_a.py
index 65e42f43a0..d3b4323f0d 100644
--- a/source/tests/test_model_se_a.py
+++ b/source/tests/test_model_se_a.py
@@ -123,7 +123,7 @@ def test_model_atom_ener(self):
             t_mesh: test_data["default_mesh"],
             is_training: False,
         }
-        sess = self.test_session().__enter__()
+        sess = self.cached_session().__enter__()
         sess.run(tf.global_variables_initializer())
         [e, f, v] = sess.run([energy, force, virial], feed_dict=feed_dict_test)
         self.assertAlmostEqual(e[0], set_atom_ener[0], places=10)
@@ -212,7 +212,7 @@ def test_model(self):
             is_training: False,
         }
 
-        sess = self.test_session().__enter__()
+        sess = self.cached_session().__enter__()
         sess.run(tf.global_variables_initializer())
         [e, f, v] = sess.run([energy, force, virial], feed_dict=feed_dict_test)
 
@@ -347,7 +347,7 @@ def test_model_atom_ener_type_embedding(self):
             t_mesh: test_data["default_mesh"],
             is_training: False,
         }
-        sess = self.test_session().__enter__()
+        sess = self.cached_session().__enter__()
         sess.run(tf.global_variables_initializer())
         [e, f, v] = sess.run([energy, force, virial], feed_dict=feed_dict_test)
         self.assertAlmostEqual(e[0], set_atom_ener[0], places=10)
diff --git a/source/tests/test_model_se_a_aparam.py b/source/tests/test_model_se_a_aparam.py
index b236320d24..41111c57ee 100644
--- a/source/tests/test_model_se_a_aparam.py
+++ b/source/tests/test_model_se_a_aparam.py
@@ -115,7 +115,7 @@ def test_model(self):
             is_training: False,
         }
 
-        sess = self.test_session().__enter__()
+        sess = self.cached_session().__enter__()
         sess.run(tf.global_variables_initializer())
         [e, f, v] = sess.run([energy, force, virial], feed_dict=feed_dict_test)
 
diff --git a/source/tests/test_model_se_a_ebd.py b/source/tests/test_model_se_a_ebd.py
index 96de277d2f..bf856b7bc5 100644
--- a/source/tests/test_model_se_a_ebd.py
+++ b/source/tests/test_model_se_a_ebd.py
@@ -115,7 +115,7 @@ def test_model(self):
             is_training: False,
         }
 
-        sess = self.test_session().__enter__()
+        sess = self.cached_session().__enter__()
         sess.run(tf.global_variables_initializer())
         [e, f, v] = sess.run([energy, force, virial], feed_dict=feed_dict_test)
 
diff --git a/source/tests/test_model_se_a_fparam.py b/source/tests/test_model_se_a_fparam.py
index fad41947e2..cdb85157a4 100644
--- a/source/tests/test_model_se_a_fparam.py
+++ b/source/tests/test_model_se_a_fparam.py
@@ -116,7 +116,7 @@ def test_model(self):
             is_training: False,
         }
 
-        sess = self.test_session().__enter__()
+        sess = self.cached_session().__enter__()
         sess.run(tf.global_variables_initializer())
         [e, f, v] = sess.run([energy, force, virial], feed_dict=feed_dict_test)
 
diff --git a/source/tests/test_model_se_a_srtab.py b/source/tests/test_model_se_a_srtab.py
index ff91af619b..98cab9e073 100644
--- a/source/tests/test_model_se_a_srtab.py
+++ b/source/tests/test_model_se_a_srtab.py
@@ -140,7 +140,7 @@ def test_model(self):
             is_training: False,
         }
 
-        sess = self.test_session().__enter__()
+        sess = self.cached_session().__enter__()
         sess.run(tf.global_variables_initializer())
         [e, f, v] = sess.run([energy, force, virial], feed_dict=feed_dict_test)
 
diff --git a/source/tests/test_model_se_a_type.py b/source/tests/test_model_se_a_type.py
index 63d0ae279c..85e4a2916d 100644
--- a/source/tests/test_model_se_a_type.py
+++ b/source/tests/test_model_se_a_type.py
@@ -121,7 +121,7 @@ def test_model(self):
             is_training: False,
         }
 
-        sess = self.test_session().__enter__()
+        sess = self.cached_session().__enter__()
         sess.run(tf.global_variables_initializer())
         [e, f, v] = sess.run([energy, force, virial], feed_dict=feed_dict_test)
         # print(sess.run(model.type_embedding))
diff --git a/source/tests/test_model_se_atten.py b/source/tests/test_model_se_atten.py
index 6e6e9928a6..445959ceb2 100644
--- a/source/tests/test_model_se_atten.py
+++ b/source/tests/test_model_se_atten.py
@@ -132,7 +132,7 @@ def test_model(self):
             t_mesh: test_data["default_mesh"],
             is_training: False,
         }
-        sess = self.test_session().__enter__()
+        sess = self.cached_session().__enter__()
         sess.run(tf.global_variables_initializer())
         [e, f, v] = sess.run([energy, force, virial], feed_dict=feed_dict_test)
         # print(sess.run(model.type_embedding))
@@ -258,7 +258,7 @@ def test_exclude_types(self):
             is_training: False,
         }
 
-        with self.test_session() as sess:
+        with self.cached_session() as sess:
             sess.run(tf.global_variables_initializer())
             [des] = sess.run([dout], feed_dict=feed_dict_test1)
 
@@ -357,7 +357,7 @@ def test_compressible_model(self):
             t_mesh: test_data["default_mesh"],
             is_training: False,
         }
-        sess = self.test_session().__enter__()
+        sess = self.cached_session().__enter__()
         sess.run(tf.global_variables_initializer())
         [e, f, v] = sess.run([energy, force, virial], feed_dict=feed_dict_test)
         # print(sess.run(model.type_embedding))
@@ -485,7 +485,7 @@ def test_compressible_exclude_types(self):
             is_training: False,
         }
 
-        with self.test_session() as sess:
+        with self.cached_session() as sess:
             sess.run(tf.global_variables_initializer())
             [des] = sess.run([dout], feed_dict=feed_dict_test1)
 
@@ -587,7 +587,7 @@ def test_stripped_type_embedding_model(self):
             t_mesh: test_data["default_mesh"],
             is_training: False,
         }
-        sess = self.test_session().__enter__()
+        sess = self.cached_session().__enter__()
         sess.run(tf.global_variables_initializer())
         [e, f, v] = sess.run([energy, force, virial], feed_dict=feed_dict_test)
         # print(sess.run(model.type_embedding))
@@ -719,7 +719,7 @@ def test_stripped_type_embedding_exclude_types(self):
             is_training: False,
         }
 
-        with self.test_session() as sess:
+        with self.cached_session() as sess:
             sess.run(tf.global_variables_initializer())
             [des] = sess.run([dout], feed_dict=feed_dict_test1)
 
diff --git a/source/tests/test_model_se_r.py b/source/tests/test_model_se_r.py
index 01151d8c30..94812308c6 100644
--- a/source/tests/test_model_se_r.py
+++ b/source/tests/test_model_se_r.py
@@ -111,7 +111,7 @@ def test_model(self):
             is_training: False,
         }
 
-        sess = self.test_session().__enter__()
+        sess = self.cached_session().__enter__()
         sess.run(tf.global_variables_initializer())
         [e, f, v] = sess.run([energy, force, virial], feed_dict=feed_dict_test)
 
diff --git a/source/tests/test_model_se_t.py b/source/tests/test_model_se_t.py
index 300ad46a0a..1d67e852c7 100644
--- a/source/tests/test_model_se_t.py
+++ b/source/tests/test_model_se_t.py
@@ -109,7 +109,7 @@ def test_model(self):
             is_training: False,
         }
 
-        sess = self.test_session().__enter__()
+        sess = self.cached_session().__enter__()
         sess.run(tf.global_variables_initializer())
         [e, f, v] = sess.run([energy, force, virial], feed_dict=feed_dict_test)
 
diff --git a/source/tests/test_model_spin.py b/source/tests/test_model_spin.py
index a264f38616..9bdf1d780a 100644
--- a/source/tests/test_model_spin.py
+++ b/source/tests/test_model_spin.py
@@ -122,7 +122,7 @@ def test_model_spin(self):
             is_training: False,
         }
 
-        sess = self.test_session().__enter__()
+        sess = self.cached_session().__enter__()
         sess.run(tf.global_variables_initializer())
         [out_ener, out_force, out_virial] = sess.run(
             [energy, force, virial], feed_dict=feed_dict_test
diff --git a/source/tests/test_nvnmd_entrypoints.py b/source/tests/test_nvnmd_entrypoints.py
index af0cd48146..3e721516f1 100644
--- a/source/tests/test_nvnmd_entrypoints.py
+++ b/source/tests/test_nvnmd_entrypoints.py
@@ -454,7 +454,7 @@ def test_model_qnn_v0(self):
             dic_ph["default_mesh"]: mesh_dat,
         }
         #
-        sess = self.test_session().__enter__()
+        sess = self.cached_session().__enter__()
         sess.run(tf.global_variables_initializer())
         # get tensordic
         keys = "o_descriptor,o_rmat,o_energy".split(",")
@@ -762,7 +762,7 @@ def test_model_qnn_v1(self):
             dic_ph["default_mesh"]: mesh_dat,
         }
         #
-        sess = self.test_session().__enter__()
+        sess = self.cached_session().__enter__()
         sess.run(tf.global_variables_initializer())
         # get tensordic
         keys = "o_descriptor,o_rmat,o_energy".split(",")
@@ -818,7 +818,7 @@ def test_model_qnn_v1(self):
         ref_dout = 60.73941362
         np.testing.assert_almost_equal(pred, ref_dout, 8)
         # test freeze
-        sess = self.test_session().__enter__()
+        sess = self.cached_session().__enter__()
         weight_file1 = str(tests_path / "nvnmd" / "ref" / "weight_v1_cnn.npy")
         weight_file2 = str(tests_path / "nvnmd" / "out" / "weight_v1_qnn.npy")
         save_weight(sess, weight_file2)
diff --git a/source/tests/test_nvnmd_op.py b/source/tests/test_nvnmd_op.py
index 2b59b9ef94..3419b375e4 100644
--- a/source/tests/test_nvnmd_op.py
+++ b/source/tests/test_nvnmd_op.py
@@ -17,7 +17,7 @@ def setUp(self):
             config.graph_options.rewrite_options.custom_optimizers.add().name = (
                 "dpparallel"
             )
-        self.sess = self.test_session(config=config).__enter__()
+        self.sess = self.cached_session(config=config).__enter__()
 
     def test_op(self):
         # graph
@@ -110,7 +110,7 @@ def setUp(self):
             config.graph_options.rewrite_options.custom_optimizers.add().name = (
                 "dpparallel"
             )
-        self.sess = self.test_session(config=config).__enter__()
+        self.sess = self.cached_session(config=config).__enter__()
 
     def test_op(self):
         # graph
@@ -140,7 +140,7 @@ def setUp(self):
             config.graph_options.rewrite_options.custom_optimizers.add().name = (
                 "dpparallel"
             )
-        self.sess = self.test_session(config=config).__enter__()
+        self.sess = self.cached_session(config=config).__enter__()
 
     def test_op(self):
         # graph
@@ -166,7 +166,7 @@ def setUp(self):
             config.graph_options.rewrite_options.custom_optimizers.add().name = (
                 "dpparallel"
             )
-        self.sess = self.test_session(config=config).__enter__()
+        self.sess = self.cached_session(config=config).__enter__()
 
     def test_op(self):
         # graph
@@ -192,7 +192,7 @@ def setUp(self):
             config.graph_options.rewrite_options.custom_optimizers.add().name = (
                 "dpparallel"
             )
-        self.sess = self.test_session(config=config).__enter__()
+        self.sess = self.cached_session(config=config).__enter__()
 
     def test_op(self):
         # graph
@@ -238,7 +238,7 @@ def setUp(self):
             config.graph_options.rewrite_options.custom_optimizers.add().name = (
                 "dpparallel"
             )
-        self.sess = self.test_session(config=config).__enter__()
+        self.sess = self.cached_session(config=config).__enter__()
 
     def test_op(self):
         # graph
@@ -284,7 +284,7 @@ def setUp(self):
             config.graph_options.rewrite_options.custom_optimizers.add().name = (
                 "dpparallel"
             )
-        self.sess = self.test_session(config=config).__enter__()
+        self.sess = self.cached_session(config=config).__enter__()
 
     def test_op(self):
         # graph
@@ -330,7 +330,7 @@ def setUp(self):
             config.graph_options.rewrite_options.custom_optimizers.add().name = (
                 "dpparallel"
             )
-        self.sess = self.test_session(config=config).__enter__()
+        self.sess = self.cached_session(config=config).__enter__()
 
     def test_op(self):
         # graph
@@ -376,7 +376,7 @@ def setUp(self):
             config.graph_options.rewrite_options.custom_optimizers.add().name = (
                 "dpparallel"
             )
-        self.sess = self.test_session(config=config).__enter__()
+        self.sess = self.cached_session(config=config).__enter__()
 
     def test_op(self):
         # graph
@@ -402,7 +402,7 @@ def setUp(self):
             config.graph_options.rewrite_options.custom_optimizers.add().name = (
                 "dpparallel"
             )
-        self.sess = self.test_session(config=config).__enter__()
+        self.sess = self.cached_session(config=config).__enter__()
 
     def test_op(self):
         # graph
diff --git a/source/tests/test_pairwise_dprc.py b/source/tests/test_pairwise_dprc.py
index 2ea5888b60..04aaa237b1 100644
--- a/source/tests/test_pairwise_dprc.py
+++ b/source/tests/test_pairwise_dprc.py
@@ -349,7 +349,7 @@ def test_model_ener(self):
             t_aparam: np.reshape(np.tile(test_data["aparam"], 5), [-1]),
             is_training: False,
         }
-        sess = self.test_session().__enter__()
+        sess = self.cached_session().__enter__()
         sess.run(tf.global_variables_initializer())
         [e, f, v] = sess.run([energy, force, virial], feed_dict=feed_dict_test)
 
diff --git a/source/tests/test_polar_se_a.py b/source/tests/test_polar_se_a.py
index 1933816488..2564dc0656 100644
--- a/source/tests/test_polar_se_a.py
+++ b/source/tests/test_polar_se_a.py
@@ -110,7 +110,7 @@ def test_model(self):
             is_training: False,
         }
 
-        sess = self.test_session().__enter__()
+        sess = self.cached_session().__enter__()
         sess.run(tf.global_variables_initializer())
         [p, gp] = sess.run([polar, gpolar], feed_dict=feed_dict_test)
 
diff --git a/source/tests/test_polar_se_a_tebd.py b/source/tests/test_polar_se_a_tebd.py
index 284cb46498..570c4261d9 100644
--- a/source/tests/test_polar_se_a_tebd.py
+++ b/source/tests/test_polar_se_a_tebd.py
@@ -128,7 +128,7 @@ def test_model(self):
             is_training: False,
         }
 
-        sess = self.test_session().__enter__()
+        sess = self.cached_session().__enter__()
         sess.run(tf.global_variables_initializer())
         [p, gp] = sess.run([polar, gpolar], feed_dict=feed_dict_test)
 
diff --git a/source/tests/test_prod_env_mat.py b/source/tests/test_prod_env_mat.py
index cf0b9e9296..663b991831 100644
--- a/source/tests/test_prod_env_mat.py
+++ b/source/tests/test_prod_env_mat.py
@@ -11,7 +11,7 @@
 
 class TestProdEnvMat(tf.test.TestCase):
     def setUp(self):
-        self.sess = self.test_session().__enter__()
+        self.sess = self.cached_session().__enter__()
         self.nframes = 2
         self.dcoord = [
             12.83,
diff --git a/source/tests/test_prod_force.py b/source/tests/test_prod_force.py
index e0497d0b7e..83a44c0be9 100644
--- a/source/tests/test_prod_force.py
+++ b/source/tests/test_prod_force.py
@@ -18,7 +18,7 @@ def setUp(self):
             config.graph_options.rewrite_options.custom_optimizers.add().name = (
                 "dpparallel"
             )
-        self.sess = self.test_session(config=config).__enter__()
+        self.sess = self.cached_session(config=config).__enter__()
         self.nframes = 2
         self.dcoord = [
             12.83,
diff --git a/source/tests/test_prod_force_grad.py b/source/tests/test_prod_force_grad.py
index a7eaeb7511..012def217f 100644
--- a/source/tests/test_prod_force_grad.py
+++ b/source/tests/test_prod_force_grad.py
@@ -10,7 +10,7 @@
 
 class TestProdForceGrad(tf.test.TestCase):
     def setUp(self):
-        self.sess = self.test_session().__enter__()
+        self.sess = self.cached_session().__enter__()
         self.nframes = 2
         self.dcoord = [
             12.83,
diff --git a/source/tests/test_prod_virial.py b/source/tests/test_prod_virial.py
index 29f71daf68..2abcfcb1bf 100644
--- a/source/tests/test_prod_virial.py
+++ b/source/tests/test_prod_virial.py
@@ -10,7 +10,7 @@
 
 class TestProdVirial(tf.test.TestCase):
     def setUp(self):
-        self.sess = self.test_session().__enter__()
+        self.sess = self.cached_session().__enter__()
         self.nframes = 2
         self.dcoord = [
             12.83,
diff --git a/source/tests/test_prod_virial_grad.py b/source/tests/test_prod_virial_grad.py
index f7d6cfe92d..548b63a54b 100644
--- a/source/tests/test_prod_virial_grad.py
+++ b/source/tests/test_prod_virial_grad.py
@@ -10,7 +10,7 @@
 
 class TestProdVirialGrad(tf.test.TestCase):
     def setUp(self):
-        self.sess = self.test_session().__enter__()
+        self.sess = self.cached_session().__enter__()
         self.nframes = 2
         self.dcoord = [
             12.83,
diff --git a/source/tests/test_tab_nonsmth.py b/source/tests/test_tab_nonsmth.py
index d6df226478..9e3f9ff640 100644
--- a/source/tests/test_tab_nonsmth.py
+++ b/source/tests/test_tab_nonsmth.py
@@ -178,7 +178,7 @@ class TestTabNonSmooth(IntplInter, tf.test.TestCase):
     def setUp(self):
         self.places = 5
         data = Data()
-        IntplInter.setUp(self, data, sess=self.test_session().__enter__())
+        IntplInter.setUp(self, data, sess=self.cached_session().__enter__())
 
     def test_force(self):
         force_test(self, self, places=5, suffix="_tab")
diff --git a/source/tests/test_tab_smooth.py b/source/tests/test_tab_smooth.py
index 220ba4e3f3..49b18e14f3 100644
--- a/source/tests/test_tab_smooth.py
+++ b/source/tests/test_tab_smooth.py
@@ -175,7 +175,7 @@ class TestTabSmooth(IntplInter, tf.test.TestCase):
     def setUp(self):
         self.places = 5
         data = Data()
-        IntplInter.setUp(self, data, sess=self.test_session().__enter__())
+        IntplInter.setUp(self, data, sess=self.cached_session().__enter__())
 
     def test_force(self):
         force_test(self, self, places=5, suffix="_tab_smth")
diff --git a/source/tests/test_type_embed.py b/source/tests/test_type_embed.py
index 47de16cbdc..3e79bad70b 100644
--- a/source/tests/test_type_embed.py
+++ b/source/tests/test_type_embed.py
@@ -23,14 +23,14 @@ def test_embed_atom_type(self):
         )
         expected_out = [[1, 2, 3], [1, 2, 3], [1, 2, 3], [7, 7, 7], [7, 7, 7]]
         atom_embed = embed_atom_type(ntypes, natoms, type_embedding)
-        sess = self.test_session().__enter__()
+        sess = self.cached_session().__enter__()
         atom_embed = sess.run(atom_embed)
         np.testing.assert_almost_equal(atom_embed, expected_out, 10)
 
     def test_type_embed_net(self):
         ten = TypeEmbedNet([2, 4, 8], seed=1, uniform_seed=True)
         type_embedding = ten.build(2)
-        sess = self.test_session().__enter__()
+        sess = self.cached_session().__enter__()
         sess.run(tf.global_variables_initializer())
         type_embedding = sess.run(type_embedding)
 
diff --git a/source/tests/test_type_one_side.py b/source/tests/test_type_one_side.py
index e16ecd2b12..8e7c173912 100644
--- a/source/tests/test_type_one_side.py
+++ b/source/tests/test_type_one_side.py
@@ -125,7 +125,7 @@ def test_descriptor_one_side_exclude_types(self):
         feed_dict_test2[t_type] = np.reshape(new_type2[:numb_test, :], [-1])
         feed_dict_test2[t_natoms] = new_natoms2
 
-        with self.test_session() as sess:
+        with self.cached_session() as sess:
             sess.run(tf.global_variables_initializer())
             [model_dout1] = sess.run([dout], feed_dict=feed_dict_test1)
             [model_dout2] = sess.run([dout], feed_dict=feed_dict_test2)
@@ -231,7 +231,7 @@ def test_se_r_one_side_exclude_types(self):
         feed_dict_test2[t_type] = np.reshape(new_type2[:numb_test, :], [-1])
         feed_dict_test2[t_natoms] = new_natoms2
 
-        with self.test_session() as sess:
+        with self.cached_session() as sess:
             sess.run(tf.global_variables_initializer())
             [model_dout1] = sess.run([dout], feed_dict=feed_dict_test1)
             [model_dout2] = sess.run([dout], feed_dict=feed_dict_test2)

From 1a4a7ca4b65932e7952c0a7b0d26f7f6c815e52f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Yifan=20Li=E6=9D=8E=E4=B8=80=E5=B8=86?=
 <yifanl0716@gmail.com>
Date: Thu, 14 Sep 2023 22:55:50 -0500
Subject: [PATCH 26/63] lmp: let fparam_do_compute not execute by default
 (#2819)

The variable do_compute in pair_deepmd.cpp should be set to false by
default so that fparam can be used correctly. Otherwise, the current
version unexpectedly triggers the error at
https://github.com/deepmodeling/deepmd-kit/blob/7da9aaf075e8dea1eca9a08f99f3917235b55e3b/source/lmp/pair_deepmd.cpp#L1044-L1046
even when it should not be raised.
---
 source/lmp/pair_deepmd.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/source/lmp/pair_deepmd.cpp b/source/lmp/pair_deepmd.cpp
index ec53a1dc99..489c31ff19 100644
--- a/source/lmp/pair_deepmd.cpp
+++ b/source/lmp/pair_deepmd.cpp
@@ -382,6 +382,7 @@ PairDeepMD::PairDeepMD(LAMMPS *lmp)
   eps_v = 0.;
   scale = NULL;
   do_ttm = false;
+  do_compute = false;
   single_model = false;
   multi_models_mod_devi = false;
   multi_models_no_mod_devi = false;

From 20a41d04bdbf6058d6af9a8ccf3070b4aff2f882 Mon Sep 17 00:00:00 2001
From: Jinzhe Zeng <jinzhe.zeng@rutgers.edu>
Date: Fri, 15 Sep 2023 00:19:56 -0400
Subject: [PATCH 27/63] speed up GitHub Actions (#2822)

This PR speeds up multiple GitHub Actions in the following way:

- only install `cuda-nvcc` and `cuda-cudart-dev` instead of the whole
cudatoolkit
- skip installing clang as it's already shipped with the GitHub Action
image
- enable cache for all `setup-python`
- use Ninja instead of Make as the CMake generator

---------

Signed-off-by: Jinzhe Zeng <jinzhe.zeng@rutgers.edu>
---
 .github/workflows/build_cc.yml    | 14 ++++++++++----
 .github/workflows/build_wheel.yml |  1 +
 .github/workflows/test_cc.yml     |  3 +++
 source/install/build_cc.sh        |  4 ++--
 source/install/build_from_c.sh    |  6 +++---
 source/install/package_c.sh       |  4 ++--
 source/install/test_cc.sh         |  4 ++--
 source/install/test_cc_local.sh   |  4 ++--
 8 files changed, 25 insertions(+), 15 deletions(-)

diff --git a/.github/workflows/build_cc.yml b/.github/workflows/build_cc.yml
index 500b305ba9..55a5a5c4d8 100644
--- a/.github/workflows/build_cc.yml
+++ b/.github/workflows/build_cc.yml
@@ -26,14 +26,20 @@ jobs:
     - uses: actions/setup-python@v4
       with:
         python-version: '3.11'
+        cache: 'pip'
+    - uses: lukka/get-cmake@latest
     - run: python -m pip install tensorflow
-    - run: sudo apt-get update && sudo apt-get install -y nvidia-cuda-toolkit
+    - run: |
+         wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.0-1_all.deb \
+         && sudo dpkg -i cuda-keyring_1.0-1_all.deb \
+         && sudo apt-get update \
+         && sudo apt-get -y install cuda-cudart-dev-11-8 cuda-nvcc-11-8
       if: matrix.variant == 'cuda'
     - run: |
          wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.0-1_all.deb \
          && sudo dpkg -i cuda-keyring_1.0-1_all.deb \
          && sudo apt-get update \
-         && sudo apt-get -y install cuda-12-0
+         && sudo apt-get -y install cuda-cudart-dev-12-0 cuda-nvcc-12-0
       if: matrix.variant == 'cuda120'
       env:
         DEBIAN_FRONTEND: noninteractive
@@ -44,12 +50,11 @@ jobs:
          && sudo apt-get update \
          && sudo apt-get install -y rocm-dev hipcub-dev
       if: matrix.variant == 'rocm'
-    - run: sudo apt-get update && sudo apt-get install -y clang
-      if: matrix.variant == 'clang'
     - run: source/install/build_cc.sh
       env:
         DP_VARIANT: ${{ matrix.dp_variant }}
         DOWNLOAD_TENSORFLOW: "FALSE"
+        CMAKE_GENERATOR: Ninja
       if: matrix.variant != 'clang'
     - run: source/install/build_cc.sh
       env:
@@ -57,6 +62,7 @@ jobs:
         DOWNLOAD_TENSORFLOW: "FALSE"
         CC: clang
         CXX: clang++
+        CMAKE_GENERATOR: Ninja
       if: matrix.variant == 'clang'
     - name: Test files exist
       run: |
diff --git a/.github/workflows/build_wheel.yml b/.github/workflows/build_wheel.yml
index 3e3a265159..85b2d6b884 100644
--- a/.github/workflows/build_wheel.yml
+++ b/.github/workflows/build_wheel.yml
@@ -67,6 +67,7 @@ jobs:
         name: Install Python
         with:
           python-version: '3.11'
+          cache: 'pip'
       - run: python -m pip install build
       - name: Build sdist
         run: python -m build --sdist
diff --git a/.github/workflows/test_cc.yml b/.github/workflows/test_cc.yml
index 9404f12937..fa37009730 100644
--- a/.github/workflows/test_cc.yml
+++ b/.github/workflows/test_cc.yml
@@ -11,10 +11,12 @@ jobs:
     - uses: actions/setup-python@v4
       with:
         python-version: '3.11'
+        cache: 'pip'
     - name: Setup MPI
       uses: mpi4py/setup-mpi@v1
       with:
         mpi: mpich
+    - uses: lukka/get-cmake@latest
     - run: python -m pip install tensorflow
     - run: source/install/test_cc_local.sh
       env:
@@ -22,6 +24,7 @@ jobs:
         TF_INTRA_OP_PARALLELISM_THREADS: 1
         TF_INTER_OP_PARALLELISM_THREADS: 1
         LMP_CXX11_ABI_0: 1
+        CMAKE_GENERATOR: Ninja
     # test lammps
     # ASE issue: https://gitlab.com/ase/ase/-/merge_requests/2843
     # TODO: remove ase version when ase has new release
diff --git a/source/install/build_cc.sh b/source/install/build_cc.sh
index 7cb9ca38ad..bfa3cd1ce4 100755
--- a/source/install/build_cc.sh
+++ b/source/install/build_cc.sh
@@ -21,8 +21,8 @@ BUILD_TMP_DIR=${SCRIPT_PATH}/../build
 mkdir -p ${BUILD_TMP_DIR}
 cd ${BUILD_TMP_DIR}
 cmake -DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX} -DUSE_TF_PYTHON_LIBS=TRUE ${CUDA_ARGS} -DLAMMPS_VERSION=stable_2Aug2023 ..
-make -j${NPROC}
-make install
+cmake --build . -j${NPROC}
+cmake --install .
 
 #------------------
 echo "Congratulations! DeePMD-kit has been installed at ${INSTALL_PREFIX}"
diff --git a/source/install/build_from_c.sh b/source/install/build_from_c.sh
index b64a62eaff..3a48d3d46c 100755
--- a/source/install/build_from_c.sh
+++ b/source/install/build_from_c.sh
@@ -14,9 +14,9 @@ BUILD_TMP_DIR=${SCRIPT_PATH}/../build
 mkdir -p ${BUILD_TMP_DIR}
 cd ${BUILD_TMP_DIR}
 cmake -DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX} -DDEEPMD_C_ROOT=${DEEPMD_C_ROOT} -DLAMMPS_VERSION=stable_2Aug2023 ..
-make -j${NPROC}
-make install
-make lammps
+cmake --build . -j${NPROC}
+cmake --install .
+cmake --build . --target=lammps
 
 #------------------
 echo "Congratulations! DeePMD-kit has been installed at ${INSTALL_PREFIX}"
diff --git a/source/install/package_c.sh b/source/install/package_c.sh
index c250956e19..0c145b22af 100755
--- a/source/install/package_c.sh
+++ b/source/install/package_c.sh
@@ -20,8 +20,8 @@ cmake -DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX} \
 	-DPACKAGE_C=TRUE \
 	-DUSE_TF_PYTHON_LIBS=TRUE \
 	..
-make -j${NPROC}
-make install
+cmake --build . -j${NPROC}
+cmake --install .
 
 #------------------
 
diff --git a/source/install/test_cc.sh b/source/install/test_cc.sh
index 6da5962899..55fe03bad8 100755
--- a/source/install/test_cc.sh
+++ b/source/install/test_cc.sh
@@ -12,8 +12,8 @@ BUILD_TMP_DIR=${SCRIPT_PATH}/../build_tests
 mkdir -p ${BUILD_TMP_DIR}
 cd ${BUILD_TMP_DIR}
 cmake -DINSTALL_TENSORFLOW=TRUE -DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX} -DTENSORFLOW_ROOT=${INSTALL_PREFIX} -DBUILD_TESTING:BOOL=TRUE -DLAMMPS_VERSION=stable_2Aug2023 ..
-make -j${NPROC}
-make install
+cmake --build . -j${NPROC}
+cmake --install .
 
 #------------------
 # go to a subdirectory...
diff --git a/source/install/test_cc_local.sh b/source/install/test_cc_local.sh
index 5a400a15c2..ec1bfadd69 100755
--- a/source/install/test_cc_local.sh
+++ b/source/install/test_cc_local.sh
@@ -13,8 +13,8 @@ BUILD_TMP_DIR=${SCRIPT_PATH}/../build_tests
 mkdir -p ${BUILD_TMP_DIR}
 cd ${BUILD_TMP_DIR}
 cmake -DINSTALL_TENSORFLOW=FALSE -DUSE_TF_PYTHON_LIBS=TRUE -DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX} -DBUILD_TESTING:BOOL=TRUE -DLAMMPS_VERSION=stable_2Aug2023 ..
-make -j${NPROC}
-make install
+cmake --build . -j${NPROC}
+cmake --install .
 
 #------------------
 # go to a subdirectory...

From 5591ed154e3fe1aa0ccb288f2e8312ec51817c27 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Yifan=20Li=E6=9D=8E=E4=B8=80=E5=B8=86?=
 <yifanl0716@gmail.com>
Date: Fri, 15 Sep 2023 00:09:49 -0500
Subject: [PATCH 28/63] fix grammatical errors (#2796)

Fix grammatical errors in the document.
---
 doc/development/type-embedding.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/development/type-embedding.md b/doc/development/type-embedding.md
index a027ebdf26..5919d6c944 100644
--- a/doc/development/type-embedding.md
+++ b/doc/development/type-embedding.md
@@ -1,6 +1,6 @@
 # Atom Type Embedding
 ## Overview
-Here is an overview of the DeePMD-kit algorithm. Given a specific centric atom, we can obtain the matrix describing its local environment, named $\mathcal R$. It is consist of the distance between the centric atom and its neighbors, as well as a direction vector. We can embed each distance into a vector of $M_1$ dimension by an `embedding net`, so the environment matrix $\mathcal R$ can be embedded into matrix $\mathcal G$. We can thus extract a descriptor vector (of $M_1 \times M_2$ dim) of the centric atom from the $\mathcal G$ by some matrix multiplication, and put the descriptor into `fitting net` to get predicted energy $E$. The vanilla version of DeePMD-kit builds `embedding net` and `fitting net` relying on the atom type, resulting in $O(N)$ memory usage. After applying atom type embedding, in DeePMD-kit v2.0, we can share one `embedding net` and one `fitting net` in total, which decline training complexity largely.
+Here is an overview of the DeePMD-kit algorithm. Given a specific centric atom, we can obtain the matrix describing its local environment, named $\mathcal R$. It consists of the distance between the centric atom and its neighbors, as well as a direction vector. We can embed each distance into a vector of $M_1$ dimension by an `embedding net`, so the environment matrix $\mathcal R$ can be embedded into matrix $\mathcal G$. We can thus extract a descriptor vector (of $M_1 \times M_2$ dim) of the centric atom from the $\mathcal G$ by some matrix multiplication, and put the descriptor into `fitting net` to get the predicted energy $E$. The vanilla version of DeePMD-kit builds `embedding net` and `fitting net` relying on the atom type, resulting in $O(N)$ memory usage. After applying atom type embedding, in DeePMD-kit v2.0, we can share one `embedding net` and one `fitting net` in total, which reduces training complexity largely.
 
 ## Preliminary
 In the following chart, you can find the meaning of symbols used to clarify the atom-type embedding algorithm.

From 21db464245d950f6cacb83d46111e287833bfa32 Mon Sep 17 00:00:00 2001
From: Jinzhe Zeng <jinzhe.zeng@rutgers.edu>
Date: Fri, 15 Sep 2023 01:10:37 -0400
Subject: [PATCH 29/63] improve configurations of Python lint tools (#2823)

1. use `black-pre-commit-mirror`, which is faster, instead of `black`;
2. run ruff first and then black;
3. remove `tool.ruff.target-version`, which can be detected
automatically;
4. add `RUF` and `NPY` rules to `tool.ruff.select`;
5. set `tool.ruff.pydocstyle.convention` to `numpy`, which
automatically adds several rules to `ignore`.

---------

Signed-off-by: Jinzhe Zeng <jinzhe.zeng@rutgers.edu>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
---
 .pre-commit-config.yaml                 |  8 ++++----
 data/raw/shuffle_raw.py                 |  3 ++-
 deepmd/calculator.py                    | 11 +++++++++--
 deepmd/descriptor/se_a.py               |  6 +++---
 deepmd/descriptor/se_a_ebd.py           |  2 +-
 deepmd/descriptor/se_a_mask.py          |  2 +-
 deepmd/descriptor/se_atten.py           |  4 ++--
 deepmd/descriptor/se_r.py               |  2 +-
 deepmd/descriptor/se_t.py               |  2 +-
 deepmd/entrypoints/ipi.py               |  2 +-
 deepmd/fit/dos.py                       |  8 ++++----
 deepmd/infer/deep_tensor.py             |  4 +++-
 deepmd/loss/ener.py                     |  4 ++--
 deepmd/model/model_stat.py              |  2 +-
 deepmd/nvnmd/entrypoints/wrap.py        |  2 +-
 deepmd/train/run_options.py             |  4 ++--
 deepmd/train/trainer.py                 |  8 ++++----
 deepmd/utils/argcheck.py                | 15 +++++++++------
 deepmd/utils/data_system.py             |  4 +---
 deepmd/utils/finetune.py                |  6 ++----
 deepmd/utils/multi_init.py              |  4 ++--
 deepmd/utils/network.py                 |  4 ++--
 deepmd/utils/path.py                    |  2 +-
 deepmd/utils/spin.py                    |  7 ++++---
 deepmd_cli/main.py                      |  2 +-
 pyproject.toml                          | 11 +++++------
 source/install/build_tf.py              |  2 +-
 source/tests/common.py                  |  4 +---
 source/tests/test_argument_parser.py    |  2 +-
 source/tests/test_descrpt_sea_ef_rot.py |  8 ++++----
 source/tests/test_fitting_stat.py       |  4 ++--
 31 files changed, 78 insertions(+), 71 deletions(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index d39f5ec127..19c29c0322 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -22,10 +22,6 @@ repos:
     -   id: check-symlinks
     -   id: check-toml
 # Python
--   repo: https://github.com/psf/black
-    rev: 23.9.1
-    hooks:
-    -   id: black-jupyter
 -   repo: https://github.com/PyCQA/isort
     rev: 5.12.0
     hooks:
@@ -37,6 +33,10 @@ repos:
     hooks:
     - id: ruff
       args: ["--fix"]
+-   repo: https://github.com/psf/black-pre-commit-mirror
+    rev: 23.9.1
+    hooks:
+    -   id: black-jupyter
 # numpydoc
 -   repo: https://github.com/Carreau/velin
     rev: 0.0.12
diff --git a/data/raw/shuffle_raw.py b/data/raw/shuffle_raw.py
index f8c689e3f2..51bb7466c9 100755
--- a/data/raw/shuffle_raw.py
+++ b/data/raw/shuffle_raw.py
@@ -69,7 +69,8 @@ def _main():
     tmp = np.reshape(tmp, [nframe, -1])
     nframe = tmp.shape[0]
     idx = np.arange(nframe)
-    np.random.shuffle(idx)
+    rng = np.random.default_rng()
+    rng.shuffle(idx)
 
     for ii in raws:
         data = np.loadtxt(inpath + "/" + ii)
diff --git a/deepmd/calculator.py b/deepmd/calculator.py
index acef657e2c..8636ff30d2 100644
--- a/deepmd/calculator.py
+++ b/deepmd/calculator.py
@@ -6,6 +6,7 @@
 )
 from typing import (
     TYPE_CHECKING,
+    ClassVar,
     Dict,
     List,
     Optional,
@@ -69,13 +70,19 @@ class DP(Calculator):
     """
 
     name = "DP"
-    implemented_properties = ["energy", "free_energy", "forces", "virial", "stress"]
+    implemented_properties: ClassVar[List[str]] = [
+        "energy",
+        "free_energy",
+        "forces",
+        "virial",
+        "stress",
+    ]
 
     def __init__(
         self,
         model: Union[str, "Path"],
         label: str = "DP",
-        type_dict: Dict[str, int] = None,
+        type_dict: Optional[Dict[str, int]] = None,
         **kwargs,
     ) -> None:
         Calculator.__init__(self, label=label, **kwargs)
diff --git a/deepmd/descriptor/se_a.py b/deepmd/descriptor/se_a.py
index 82df8cc1a3..cceb72d4fb 100644
--- a/deepmd/descriptor/se_a.py
+++ b/deepmd/descriptor/se_a.py
@@ -144,7 +144,7 @@ class DescrptSeA(DescrptSe):
     .. [1] Linfeng Zhang, Jiequn Han, Han Wang, Wissam A. Saidi, Roberto Car, and E. Weinan. 2018.
        End-to-end symmetry preserving inter-atomic potential energy model for finite and extended
        systems. In Proceedings of the 32nd International Conference on Neural Information Processing
-       Systems (NIPS'18). Curran Associates Inc., Red Hook, NY, USA, 4441–4451.
+       Systems (NIPS'18). Curran Associates Inc., Red Hook, NY, USA, 4441-4451.
     """
 
     def __init__(
@@ -890,7 +890,7 @@ def _filter_lower(
         suffix="",
     ):
         """Input env matrix, returns R.G."""
-        outputs_size = [1] + self.filter_neuron
+        outputs_size = [1, *self.filter_neuron]
         # cut-out inputs
         # with natom x (nei_type_i x 4)
         inputs_i = tf.slice(inputs, [0, start_index * 4], [-1, incrs_index * 4])
@@ -1006,7 +1006,7 @@ def _filter(
         nframes = tf.shape(tf.reshape(inputs, [-1, natoms[0], self.ndescrpt]))[0]
         # natom x (nei x 4)
         shape = inputs.get_shape().as_list()
-        outputs_size = [1] + self.filter_neuron
+        outputs_size = [1, *self.filter_neuron]
         outputs_size_2 = self.n_axis_neuron
         all_excluded = all(
             (type_input, type_i) in self.exclude_types for type_i in range(self.ntypes)
diff --git a/deepmd/descriptor/se_a_ebd.py b/deepmd/descriptor/se_a_ebd.py
index f46444169e..4816ec1569 100644
--- a/deepmd/descriptor/se_a_ebd.py
+++ b/deepmd/descriptor/se_a_ebd.py
@@ -230,7 +230,7 @@ def _embedding_net(
         # natom x (nei x 4)
         inputs = tf.reshape(inputs, [-1, self.ndescrpt])
         shape = inputs.get_shape().as_list()
-        outputs_size = [1] + filter_neuron
+        outputs_size = [1, *filter_neuron]
         with tf.variable_scope(name, reuse=reuse):
             xyz_scatter_total = []
             # with natom x (nei x 4)
diff --git a/deepmd/descriptor/se_a_mask.py b/deepmd/descriptor/se_a_mask.py
index cdec33e292..e4625922cc 100644
--- a/deepmd/descriptor/se_a_mask.py
+++ b/deepmd/descriptor/se_a_mask.py
@@ -112,7 +112,7 @@ class DescrptSeAMask(DescrptSeA):
     .. [1] Linfeng Zhang, Jiequn Han, Han Wang, Wissam A. Saidi, Roberto Car, and E. Weinan. 2018.
        End-to-end symmetry preserving inter-atomic potential energy model for finite and extended
        systems. In Proceedings of the 32nd International Conference on Neural Information Processing
-       Systems (NIPS'18). Curran Associates Inc., Red Hook, NY, USA, 4441–4451.
+       Systems (NIPS'18). Curran Associates Inc., Red Hook, NY, USA, 4441-4451.
     """
 
     def __init__(
diff --git a/deepmd/descriptor/se_atten.py b/deepmd/descriptor/se_atten.py
index 12558c45c4..c962952ec0 100644
--- a/deepmd/descriptor/se_atten.py
+++ b/deepmd/descriptor/se_atten.py
@@ -1057,7 +1057,7 @@ def _filter_lower(
         reuse=None,
     ):
         """Input env matrix, returns R.G."""
-        outputs_size = [1] + self.filter_neuron
+        outputs_size = [1, *self.filter_neuron]
         # cut-out inputs
         # with natom x (nei_type_i x 4)
         inputs_i = tf.slice(inputs, [0, start_index * 4], [-1, incrs_index * 4])
@@ -1260,7 +1260,7 @@ def _filter(
         nframes = tf.shape(tf.reshape(inputs, [-1, natoms[0], self.ndescrpt]))[0]
         # natom x (nei x 4)
         shape = inputs.get_shape().as_list()
-        outputs_size = [1] + self.filter_neuron
+        outputs_size = [1, *self.filter_neuron]
         outputs_size_2 = self.n_axis_neuron
 
         start_index = 0
diff --git a/deepmd/descriptor/se_r.py b/deepmd/descriptor/se_r.py
index ad9fda2238..fbc54a651f 100644
--- a/deepmd/descriptor/se_r.py
+++ b/deepmd/descriptor/se_r.py
@@ -638,7 +638,7 @@ def _filter_r(
         trainable=True,
     ):
         # natom x nei
-        outputs_size = [1] + self.filter_neuron
+        outputs_size = [1, *self.filter_neuron]
         with tf.variable_scope(name, reuse=reuse):
             start_index = 0
             xyz_scatter_total = []
diff --git a/deepmd/descriptor/se_t.py b/deepmd/descriptor/se_t.py
index 34af8a90a2..671dbd4e15 100644
--- a/deepmd/descriptor/se_t.py
+++ b/deepmd/descriptor/se_t.py
@@ -633,7 +633,7 @@ def _filter(
     ):
         # natom x (nei x 4)
         shape = inputs.get_shape().as_list()
-        outputs_size = [1] + self.filter_neuron
+        outputs_size = [1, *self.filter_neuron]
         with tf.variable_scope(name, reuse=reuse):
             start_index_i = 0
             result = None
diff --git a/deepmd/entrypoints/ipi.py b/deepmd/entrypoints/ipi.py
index b14b369e40..da287ff3de 100644
--- a/deepmd/entrypoints/ipi.py
+++ b/deepmd/entrypoints/ipi.py
@@ -24,7 +24,7 @@ def _program(name: str, args: List[str]):
     args : list of str
         list of arguments
     """
-    return subprocess.call([os.path.join(ROOT_DIR, name)] + args, close_fds=False)
+    return subprocess.call([os.path.join(ROOT_DIR, name), *args], close_fds=False)
 
 
 def dp_ipi():
diff --git a/deepmd/fit/dos.py b/deepmd/fit/dos.py
index 82018ea520..9a7cb734e5 100644
--- a/deepmd/fit/dos.py
+++ b/deepmd/fit/dos.py
@@ -98,8 +98,8 @@ def __init__(
         numb_aparam: int = 0,
         numb_dos: int = 300,
         rcond: Optional[float] = None,
-        trainable: List[bool] = None,
-        seed: int = None,
+        trainable: Optional[List[bool]] = None,
+        seed: Optional[int] = None,
         activation_function: str = "tanh",
         precision: str = "default",
         uniform_seed: bool = False,
@@ -380,8 +380,8 @@ def build(
         self,
         inputs: tf.Tensor,
         natoms: tf.Tensor,
-        input_dict: dict = None,
-        reuse: bool = None,
+        input_dict: Optional[dict] = None,
+        reuse: Optional[bool] = None,
         suffix: str = "",
     ) -> tf.Tensor:
         """Build the computational graph for fitting net.
diff --git a/deepmd/infer/deep_tensor.py b/deepmd/infer/deep_tensor.py
index 367a8ab5e7..268523e959 100644
--- a/deepmd/infer/deep_tensor.py
+++ b/deepmd/infer/deep_tensor.py
@@ -1,6 +1,8 @@
 # SPDX-License-Identifier: LGPL-3.0-or-later
 from typing import (
     TYPE_CHECKING,
+    ClassVar,
+    Dict,
     List,
     Optional,
     Tuple,
@@ -39,7 +41,7 @@ class DeepTensor(DeepEval):
         The input map for tf.import_graph_def. Only work with default tf graph
     """
 
-    tensors = {
+    tensors: ClassVar[Dict[str, str]] = {
         # descriptor attrs
         "t_ntypes": "descrpt_attr/ntypes:0",
         "t_rcut": "descrpt_attr/rcut:0",
diff --git a/deepmd/loss/ener.py b/deepmd/loss/ener.py
index 7895fadbf3..95997bad10 100644
--- a/deepmd/loss/ener.py
+++ b/deepmd/loss/ener.py
@@ -388,9 +388,9 @@ def __init__(
         limit_pref_ae: float = 0.0,
         start_pref_pf: float = 0.0,
         limit_pref_pf: float = 0.0,
-        relative_f: float = None,
+        relative_f: Optional[float] = None,
         enable_atom_ener_coeff: bool = False,
-        use_spin: list = None,
+        use_spin: Optional[list] = None,
     ) -> None:
         self.starter_learning_rate = starter_learning_rate
         self.start_pref_e = start_pref_e
diff --git a/deepmd/model/model_stat.py b/deepmd/model/model_stat.py
index 08bc162632..d2cc918b64 100644
--- a/deepmd/model/model_stat.py
+++ b/deepmd/model/model_stat.py
@@ -58,7 +58,7 @@ def make_stat_input(data, nbatches, merge_sys=True):
 
 
 def merge_sys_stat(all_stat):
-    first_key = list(all_stat.keys())[0]
+    first_key = next(iter(all_stat.keys()))
     nsys = len(all_stat[first_key])
     ret = defaultdict(list)
     for ii in range(nsys):
diff --git a/deepmd/nvnmd/entrypoints/wrap.py b/deepmd/nvnmd/entrypoints/wrap.py
index 896e1e0342..455dd999df 100644
--- a/deepmd/nvnmd/entrypoints/wrap.py
+++ b/deepmd/nvnmd/entrypoints/wrap.py
@@ -145,7 +145,7 @@ def wrap(self):
         nvnmd_cfg.save(nvnmd_cfg.config_file)
         head = self.wrap_head(nhs, nws)
         # output model
-        hs = [] + head
+        hs = [*head]
         for d in datas:
             hs.extend(d)
 
diff --git a/deepmd/train/run_options.py b/deepmd/train/run_options.py
index ad1774908b..451632949e 100644
--- a/deepmd/train/run_options.py
+++ b/deepmd/train/run_options.py
@@ -45,7 +45,7 @@
 
 
 # http://patorjk.com/software/taag. Font:Big"
-WELCOME = (  # noqa
+WELCOME = (
     r" _____               _____   __  __  _____           _     _  _   ",
     r"|  __ \             |  __ \ |  \/  ||  __ \         | |   (_)| |  ",
     r"| |  | |  ___   ___ | |__) || \  / || |  | | ______ | | __ _ | |_ ",
@@ -71,7 +71,7 @@
     f"build float prec:     {global_float_prec}",
     f"build variant:        {GLOBAL_CONFIG['dp_variant']}",
     f"build with tf inc:    {GLOBAL_CONFIG['tf_include_dir']}",
-    f"build with tf lib:    {GLOBAL_CONFIG['tf_libs'].replace(';', _sep)}",  # noqa
+    f"build with tf lib:    {GLOBAL_CONFIG['tf_libs'].replace(';', _sep)}",
 )
 
 
diff --git a/deepmd/train/trainer.py b/deepmd/train/trainer.py
index b322336b39..1f7b78045b 100644
--- a/deepmd/train/trainer.py
+++ b/deepmd/train/trainer.py
@@ -250,7 +250,7 @@ def build(self, data=None, stop_batch=0, origin_type_map=None, suffix=""):
             if not self.multi_task_mode:
                 single_data = data
             else:
-                single_data = data[list(data.keys())[0]]
+                single_data = data[next(iter(data.keys()))]
             if self.ntypes < single_data.get_ntypes():
                 raise ValueError(
                     "The number of types of the training data is %d, but that of the "
@@ -373,7 +373,7 @@ def _build_network(self, data, suffix=""):
             if not self.multi_task_mode:
                 self._get_place_horders(data.get_data_dict())
             else:
-                self._get_place_horders(data[list(data.keys())[0]].get_data_dict())
+                self._get_place_horders(data[next(iter(data.keys()))].get_data_dict())
 
         self.place_holders["type"] = tf.placeholder(tf.int32, [None], name="t_type")
         self.place_holders["natoms_vec"] = tf.placeholder(
@@ -467,7 +467,7 @@ def _build_training(self):
                 var_list=trainable_variables,
                 name="train_step",
             )
-            train_ops = [apply_op] + self._extra_train_ops
+            train_ops = [apply_op, *self._extra_train_ops]
             self.train_op = tf.group(*train_ops)
         else:
             self.train_op = {}
@@ -479,7 +479,7 @@ def _build_training(self):
                     var_list=trainable_variables,
                     name=f"train_step_{fitting_key}",
                 )
-                train_ops = [apply_op] + self._extra_train_ops
+                train_ops = [apply_op, *self._extra_train_ops]
                 self.train_op[fitting_key] = tf.group(*train_ops)
         log.info("built training")
 
diff --git a/deepmd/utils/argcheck.py b/deepmd/utils/argcheck.py
index 153824cb0d..f670feb578 100644
--- a/deepmd/utils/argcheck.py
+++ b/deepmd/utils/argcheck.py
@@ -177,7 +177,7 @@ def descrpt_se_a_args():
     doc_axis_neuron = "Size of the submatrix of G (embedding matrix)."
     doc_activation_function = f'The activation function in the embedding net. Supported activation functions are {list_to_doc(ACTIVATION_FN_DICT.keys())} Note that "gelu" denotes the custom operator version, and "gelu_tf" denotes the TF standard version. If you set "None" or "none" here, no activation function will be used.'
     doc_resnet_dt = 'Whether to use a "Timestep" in the skip connection'
-    doc_type_one_side = r"If true, the embedding network parameters vary by types of neighbor atoms only, so there will be $N_\text{types}$ sets of embedding network parameters. Otherwise, the embedding network parameters vary by types of centric atoms and types of neighbor atoms, so there will be $N_\text{types}^2$ sets of embedding network parameters."
+    doc_type_one_side = r"If true, the embedding network parameters vary by types of neighbor atoms only, so there will be $N_\text{types}$ sets of embedding network parameters. Otherwise, the embedding network parameters vary by types of centric atoms and types of neighbor atoms, so there will be $N_\text{types}^2$ sets of embedding network parameters."
     doc_precision = f"The precision of the embedding net parameters, supported options are {list_to_doc(PRECISION_DICT.keys())} Default follows the interface precision."
     doc_trainable = "If the parameters in the embedding net is trainable"
     doc_seed = "Random seed for parameter initialization"
@@ -263,7 +263,8 @@ def descrpt_se_a_tpe_args():
     doc_type_nlayer = "number of hidden layers of type embedding net"
     doc_numb_aparam = "dimension of atomic parameter. if set to a value > 0, the atomic parameters are embedded."
 
-    return descrpt_se_a_args() + [
+    return [
+        *descrpt_se_a_args(),
         Argument("type_nchanl", int, optional=True, default=4, doc=doc_type_nchanl),
         Argument("type_nlayer", int, optional=True, default=2, doc=doc_type_nlayer),
         Argument("numb_aparam", int, optional=True, default=0, doc=doc_numb_aparam),
@@ -280,7 +281,7 @@ def descrpt_se_r_args():
     doc_neuron = "Number of neurons in each hidden layers of the embedding net. When two layers are of the same size or one layer is twice as large as the previous layer, a skip connection is built."
     doc_activation_function = f'The activation function in the embedding net. Supported activation functions are {list_to_doc(ACTIVATION_FN_DICT.keys())} Note that "gelu" denotes the custom operator version, and "gelu_tf" denotes the TF standard version. If you set "None" or "none" here, no activation function will be used.'
     doc_resnet_dt = 'Whether to use a "Timestep" in the skip connection'
-    doc_type_one_side = r"If true, the embedding network parameters vary by types of neighbor atoms only, so there will be $N_\text{types}$ sets of embedding network parameters. Otherwise, the embedding network parameters vary by types of centric atoms and types of neighbor atoms, so there will be $N_\text{types}^2$ sets of embedding network parameters."
+    doc_type_one_side = r"If true, the embedding network parameters vary by types of neighbor atoms only, so there will be $N_\text{types}$ sets of embedding network parameters. Otherwise, the embedding network parameters vary by types of centric atoms and types of neighbor atoms, so there will be $N_\text{types}^2$ sets of embedding network parameters."
     doc_precision = f"The precision of the embedding net parameters, supported options are {list_to_doc(PRECISION_DICT.keys())} Default follows the interface precision."
     doc_trainable = "If the parameters in the embedding net are trainable"
     doc_seed = "Random seed for parameter initialization"
@@ -344,7 +345,7 @@ def descrpt_se_atten_common_args():
     doc_axis_neuron = "Size of the submatrix of G (embedding matrix)."
     doc_activation_function = f'The activation function in the embedding net. Supported activation functions are {list_to_doc(ACTIVATION_FN_DICT.keys())} Note that "gelu" denotes the custom operator version, and "gelu_tf" denotes the TF standard version. If you set "None" or "none" here, no activation function will be used.'
     doc_resnet_dt = 'Whether to use a "Timestep" in the skip connection'
-    doc_type_one_side = r"If true, the embedding network parameters vary by types of neighbor atoms only, so there will be $N_\text{types}$ sets of embedding network parameters. Otherwise, the embedding network parameters vary by types of centric atoms and types of neighbor atoms, so there will be $N_\text{types}^2$ sets of embedding network parameters."
+    doc_type_one_side = r"If true, the embedding network parameters vary by types of neighbor atoms only, so there will be $N_\text{types}$ sets of embedding network parameters. Otherwise, the embedding network parameters vary by types of centric atoms and types of neighbor atoms, so there will be $N_\text{types}^2$ sets of embedding network parameters."
     doc_precision = f"The precision of the embedding net parameters, supported options are {list_to_doc(PRECISION_DICT.keys())} Default follows the interface precision."
     doc_trainable = "If the parameters in the embedding net is trainable"
     doc_seed = "Random seed for parameter initialization"
@@ -397,7 +398,8 @@ def descrpt_se_atten_args():
     doc_smooth_type_embdding = "When using stripped type embedding, whether to dot smooth factor on the network output of type embedding to keep the network smooth, instead of setting `set_davg_zero` to be True."
     doc_set_davg_zero = "Set the normalization average to zero. This option should be set when `se_atten` descriptor or `atom_ener` in the energy fitting is used"
 
-    return descrpt_se_atten_common_args() + [
+    return [
+        *descrpt_se_atten_common_args(),
         Argument(
             "stripped_type_embedding",
             bool,
@@ -422,7 +424,8 @@ def descrpt_se_atten_args():
 def descrpt_se_atten_v2_args():
     doc_set_davg_zero = "Set the normalization average to zero. This option should be set when `se_atten` descriptor or `atom_ener` in the energy fitting is used"
 
-    return descrpt_se_atten_common_args() + [
+    return [
+        *descrpt_se_atten_common_args(),
         Argument(
             "set_davg_zero", bool, optional=True, default=False, doc=doc_set_davg_zero
         ),
diff --git a/deepmd/utils/data_system.py b/deepmd/utils/data_system.py
index 0bfe6b7c70..0071da755c 100644
--- a/deepmd/utils/data_system.py
+++ b/deepmd/utils/data_system.py
@@ -618,9 +618,7 @@ def _check_type_map_consistency(self, type_map_list):
                 min_len = min([len(ii), len(ret)])
                 for idx in range(min_len):
                     if ii[idx] != ret[idx]:
-                        raise RuntimeError(
-                            f"inconsistent type map: {str(ret)} {str(ii)}"
-                        )
+                        raise RuntimeError(f"inconsistent type map: {ret!s} {ii!s}")
                 if len(ii) > len(ret):
                     ret = ii
         return ret
diff --git a/deepmd/utils/finetune.py b/deepmd/utils/finetune.py
index b641a6beca..4e597b1e05 100644
--- a/deepmd/utils/finetune.py
+++ b/deepmd/utils/finetune.py
@@ -56,7 +56,7 @@ def replace_model_params_with_pretrained_model(
         if i not in pretrained_type_map:
             out_line_type.append(i)
     assert not out_line_type, (
-        f"{str(out_line_type)} type(s) not contained in the pretrained model! "
+        f"{out_line_type!s} type(s) not contained in the pretrained model! "
         "Please choose another suitable one."
     )
     if cur_type_map != pretrained_type_map:
@@ -103,9 +103,7 @@ def replace_model_params_with_pretrained_model(
             # keep some params that are irrelevant to model structures (need to discuss) TODO
             if "trainable" in cur_para.keys():
                 target_para["trainable"] = cur_para["trainable"]
-            log.info(
-                f"Change the '{config_key}' from {str(cur_para)} to {str(target_para)}."
-            )
+            log.info(f"Change the '{config_key}' from {cur_para!s} to {target_para!s}.")
             jdata["model"][config_key] = target_para
 
     return jdata, cur_type_map
diff --git a/deepmd/utils/multi_init.py b/deepmd/utils/multi_init.py
index fd56f715c5..6c070dc67e 100644
--- a/deepmd/utils/multi_init.py
+++ b/deepmd/utils/multi_init.py
@@ -54,7 +54,7 @@ def replace_model_params_with_frz_multi_model(
         if i not in pretrained_type_map:
             out_line_type.append(i)
     assert not out_line_type, (
-        f"{str(out_line_type)} type(s) not contained in the pretrained model! "
+        f"{out_line_type!s} type(s) not contained in the pretrained model! "
         "Please choose another suitable one."
     )
     if cur_type_map != pretrained_type_map:
@@ -169,5 +169,5 @@ def _change_sub_config(jdata: Dict[str, Any], src_jdata: Dict[str, Any], sub_key
     # keep some params that are irrelevant to model structures (need to discuss) TODO
     if "trainable" in cur_para.keys():
         target_para["trainable"] = cur_para["trainable"]
-    log.info(f"Change the '{sub_key}' from {str(cur_para)} to {str(target_para)}.")
+    log.info(f"Change the '{sub_key}' from {cur_para!s} to {target_para!s}.")
     jdata[sub_key] = target_para
diff --git a/deepmd/utils/network.py b/deepmd/utils/network.py
index a2fd81b85c..36d8c42f82 100644
--- a/deepmd/utils/network.py
+++ b/deepmd/utils/network.py
@@ -183,11 +183,11 @@ def embedding_net(
     References
     ----------
     .. [1] Kaiming  He,  Xiangyu  Zhang,  Shaoqing  Ren,  and  Jian  Sun. Identitymappings
-       in deep residual networks. InComputer Vision – ECCV 2016,pages 630–645. Springer
+       in deep residual networks. InComputer Vision - ECCV 2016,pages 630-645. Springer
        International Publishing, 2016.
     """
     input_shape = xx.get_shape().as_list()
-    outputs_size = [input_shape[1]] + network_size
+    outputs_size = [input_shape[1], *network_size]
 
     for ii in range(1, len(outputs_size)):
         w_initializer = tf.random_normal_initializer(
diff --git a/deepmd/utils/path.py b/deepmd/utils/path.py
index 5206f44089..a8e4bc329f 100644
--- a/deepmd/utils/path.py
+++ b/deepmd/utils/path.py
@@ -114,7 +114,7 @@ def __str__(self) -> str:
         """Represent string."""
 
     def __repr__(self) -> str:
-        return f"{type(self)} ({str(self)})"
+        return f"{type(self)} ({self!s})"
 
     def __eq__(self, other) -> bool:
         return str(self) == str(other)
diff --git a/deepmd/utils/spin.py b/deepmd/utils/spin.py
index c969a8062a..7820627649 100644
--- a/deepmd/utils/spin.py
+++ b/deepmd/utils/spin.py
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: LGPL-3.0-or-later
 from typing import (
     List,
+    Optional,
 )
 
 from deepmd.env import (
@@ -24,9 +25,9 @@ class Spin:
 
     def __init__(
         self,
-        use_spin: List[bool] = None,
-        spin_norm: List[float] = None,
-        virtual_len: List[float] = None,
+        use_spin: Optional[List[bool]] = None,
+        spin_norm: Optional[List[float]] = None,
+        virtual_len: Optional[List[float]] = None,
     ) -> None:
         """Constructor."""
         self.use_spin = use_spin
diff --git a/deepmd_cli/main.py b/deepmd_cli/main.py
index 94ceb9888d..fceca239ea 100644
--- a/deepmd_cli/main.py
+++ b/deepmd_cli/main.py
@@ -312,7 +312,7 @@ def main_parser() -> argparse.ArgumentParser:
     # The table is composed of fifth-order polynomial coefficients and is assembled
     # from two sub-tables. The first table takes the step(parameter) as it's uniform
     # step, while the second table takes 10 * step as it\s uniform step
-    #  The range of the first table is automatically detected by deepmd-kit, while the
+    #  The range of the first table is automatically detected by deepmd-kit, while the
     # second table ranges from the first table's upper boundary(upper) to the
     # extrapolate(parameter) * upper.
     parser_compress = subparsers.add_parser(
diff --git a/pyproject.toml b/pyproject.toml
index b169a3b0eb..0ab9390efb 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -166,24 +166,20 @@ profile = "black"
 force_grid_wrap = 1
 
 [tool.ruff]
-target-version = "py37"
 select = [
     "E", # errors
     "F", # pyflakes
     "D", # pydocstyle
     "UP", # pyupgrade
     "C4", # flake8-comprehensions
+    "RUF", # ruff
+    "NPY", # numpy
 ]
 ignore = [
     "E501", # line too long
     "F841", # local variable is assigned to but never used
     "E741", # ambiguous variable name
     "E402", # module level import not at top of file
-    "D413", # missing blank line after last section
-    "D416", # section name should end with a colon
-    "D203", # 1 blank line required before class docstring
-    "D107", # missing docstring in __init__
-    "D213", # multi-line docstring summary should start at the second line
     "D100", # TODO: missing docstring in public module
     "D101", # TODO: missing docstring in public class
     "D102", # TODO: missing docstring in public method
@@ -195,3 +191,6 @@ ignore = [
     "D404", # TODO: first word of the docstring should not be This
 ]
 ignore-init-module-imports = true
+
+[tool.ruff.pydocstyle]
+convention = "numpy"
diff --git a/source/install/build_tf.py b/source/install/build_tf.py
index 043c4c6c81..15847d2c21 100755
--- a/source/install/build_tf.py
+++ b/source/install/build_tf.py
@@ -151,7 +151,7 @@ def __call__(self):
             if not self.exists:
                 raise RuntimeError(
                     f"Download {self.filename} from {self.url} failed! "
-                    f"You can manually download it to {str(self.path)} and "
+                    f"You can manually download it to {self.path!s} and "
                     "retry the script."
                 )
         self.post_process()
diff --git a/source/tests/common.py b/source/tests/common.py
index e5dd1281f3..f8ed23df03 100644
--- a/source/tests/common.py
+++ b/source/tests/common.py
@@ -919,9 +919,7 @@ def check_type_map_consistency(self, type_map_list):
                 min_len = min([len(ii), len(ret)])
                 for idx in range(min_len):
                     if ii[idx] != ret[idx]:
-                        raise RuntimeError(
-                            f"inconsistent type map: {str(ret)} {str(ii)}"
-                        )
+                        raise RuntimeError(f"inconsistent type map: {ret!s} {ii!s}")
                 if len(ii) > len(ret):
                     ret = ii
         return ret
diff --git a/source/tests/test_argument_parser.py b/source/tests/test_argument_parser.py
index 524499935c..bb8dd9ed62 100644
--- a/source/tests/test_argument_parser.py
+++ b/source/tests/test_argument_parser.py
@@ -184,7 +184,7 @@ def run_test(self, *, command: str, mapping: "TEST_DICT"):
                     )
 
         # test default values
-        cmd_args = [command] + required
+        cmd_args = [command, *required]
         buffer = StringIO()
         try:
             with redirect_stderr(buffer):
diff --git a/source/tests/test_descrpt_sea_ef_rot.py b/source/tests/test_descrpt_sea_ef_rot.py
index d94565af96..56cdb357b0 100644
--- a/source/tests/test_descrpt_sea_ef_rot.py
+++ b/source/tests/test_descrpt_sea_ef_rot.py
@@ -108,7 +108,7 @@ def make_test_data(self, nframes):
         one_type = []
         for ii in range(2, 2 + self.ntypes):
             one_type = one_type + [ii - 2 for jj in range(self.natoms[ii])]
-        np.random.shuffle(one_type)
+        np.random.shuffle(one_type)  # noqa: NPY002
         one_type = np.array(one_type, dtype=int).reshape([1, -1])
         dtype = np.tile(one_type, [nframes, 1])
         defield = np.random.random(dcoord.shape)
@@ -162,7 +162,7 @@ def test_rot_axis(self, suffix=""):
         )
         self.sess.run(tf.global_variables_initializer())
 
-        np.random.seed(0)
+        np.random.seed(0)  # noqa: NPY002
         # make test data
         nframes = 2
         dcoord, dbox, dtype, defield = self.make_test_data(nframes)
@@ -308,7 +308,7 @@ def test_rot_diff_axis(self, suffix=""):
         )
         self.sess.run(tf.global_variables_initializer())
 
-        np.random.seed(0)
+        np.random.seed(0)  # noqa: NPY002
         # make test data
         nframes = 2
         dcoord, dbox, dtype, defield = self.make_test_data(nframes)
@@ -423,7 +423,7 @@ def test_rot_field_corot(self, suffix=""):
         )
         self.sess.run(tf.global_variables_initializer())
 
-        np.random.seed(0)
+        np.random.seed(0)  # noqa: NPY002
         # make test data
         nframes = 2
         dcoord, dbox, dtype, defield = self.make_test_data(nframes)
diff --git a/source/tests/test_fitting_stat.py b/source/tests/test_fitting_stat.py
index 045348440e..ad62c89f2a 100644
--- a/source/tests/test_fitting_stat.py
+++ b/source/tests/test_fitting_stat.py
@@ -28,12 +28,12 @@ def _make_fake_data(sys_natoms, sys_nframes, avgs, stds):
         tmp_data_a = []
         for jj in range(ndof):
             tmp_data_f.append(
-                np.random.normal(
+                np.random.normal(  # noqa: NPY002
                     loc=avgs[jj], scale=stds[jj], size=(sys_nframes[ii], 1)
                 )
             )
             tmp_data_a.append(
-                np.random.normal(
+                np.random.normal(  # noqa: NPY002
                     loc=avgs[jj], scale=stds[jj], size=(sys_nframes[ii], sys_natoms[ii])
                 )
             )

From ab357f842a592f5484ed8075c22e588ca8ed622c Mon Sep 17 00:00:00 2001
From: Jinzhe Zeng <jinzhe.zeng@rutgers.edu>
Date: Fri, 15 Sep 2023 01:21:16 -0400
Subject: [PATCH 30/63] add citation for fparam (#2821)

It seems missing...
---
 CITATIONS.bib   | 13 +++++++++++++
 doc/credits.rst |  7 +++++++
 2 files changed, 20 insertions(+)

diff --git a/CITATIONS.bib b/CITATIONS.bib
index 09f0f14acb..930b4fc2a5 100644
--- a/CITATIONS.bib
+++ b/CITATIONS.bib
@@ -105,6 +105,19 @@ @misc{Zhang_2022_DPA1
     doi = {10.48550/arXiv.2208.08236},
 }
 
+@article{Zhang_PhysPlasmas_2020_v27_p122704,
+    annote = {frame-specific parameters (e.g. electronic temperature)},
+    author = {Zhang, Yuzhi and Gao, Chang and Liu, Qianrui and Zhang, Linfeng and Wang, Han and Chen, Mohan},
+    title = {{Warm dense matter simulation via electron temperature dependent deep potential molecular dynamics}},
+    journal = {Phys. Plasmas},
+    volume = {27},
+    number = {12},
+    pages = {122704},
+    year = {2020},
+    month = {12},
+    doi = {10.1063/5.0023265},
+}
+
 @article{Zhang_PhysRevB_2020_v102_p41121,
     annote = {fit dipole},
     title={{Deep neural network for the dielectric response of insulators}},
diff --git a/doc/credits.rst b/doc/credits.rst
index fad06e63ba..3612b8ace8 100644
--- a/doc/credits.rst
+++ b/doc/credits.rst
@@ -49,6 +49,13 @@ Cite DeePMD-kit and methods
 
    Zhang_2022_DPA1
 
+- If frame-specific parameters (`fparam`, e.g. electronic temperature) are used,
+
+.. bibliography::
+   :filter: False
+
+   Zhang_PhysPlasmas_2020_v27_p122704
+
 - If fitting dipole,
 
 .. bibliography::

From 0d5737f658592ef11dea091beaabb9524c74fb63 Mon Sep 17 00:00:00 2001
From: Jinzhe Zeng <jinzhe.zeng@rutgers.edu>
Date: Fri, 15 Sep 2023 01:59:54 -0400
Subject: [PATCH 31/63] make pairwise_dprc model work with MPI (#2818)

- make `aparam` accept `nall` instead of `nloc`. A variable
`fitting_attr/aparam_nall` (dtype=bool) controls the behavior.
  - enable this behavior for se_a_mask, by the way
- fix the shape of atomic energy, which is `nloc` instead of `nall`
- set the minimum `nloc` to 1, because when nloc=0, many OPs (such as
prod_force) throw a floating-point exception
- fix backward map when the shape of `nloc` is padded

---------

Signed-off-by: Jinzhe Zeng <jinzhe.zeng@rutgers.edu>
---
 deepmd/descriptor/se_a_mask.py                |     4 +-
 deepmd/entrypoints/freeze.py                  |     1 +
 deepmd/model/pairwise_dprc.py                 |     7 +-
 source/api_c/include/c_api.h                  |    23 +
 source/api_c/include/c_api_internal.h         |     2 +
 source/api_c/include/deepmd.hpp               |    32 +-
 source/api_c/src/c_api.cc                     |    17 +-
 .../tests/test_deeppot_a_fparam_aparam.cc     |    54 +
 source/api_cc/include/DeepPot.h               |    21 +
 source/api_cc/include/common.h                |    18 +-
 source/api_cc/src/DeepPot.cc                  |   113 +-
 source/api_cc/src/common.cc                   |    83 +-
 source/lib/src/pairwise.cc                    |     2 +-
 source/op/pairwise.cc                         |    10 +-
 source/tests/infer/pairwise_dprc.pbtxt        | 44536 ++++++++++++++++
 source/tests/test_pairwise_dprc.py            |   181 +
 16 files changed, 45014 insertions(+), 90 deletions(-)
 create mode 100644 source/tests/infer/pairwise_dprc.pbtxt

diff --git a/deepmd/descriptor/se_a_mask.py b/deepmd/descriptor/se_a_mask.py
index e4625922cc..b9181fd6b0 100644
--- a/deepmd/descriptor/se_a_mask.py
+++ b/deepmd/descriptor/se_a_mask.py
@@ -301,10 +301,12 @@ def build(
         dstd = self.dstd
 
         """
-        ``aparam'' shape is [nframes, natoms]
+        ``aparam'' shape is [nframes, nall]
         aparam[:, :] is the real/virtual sign for each atom.
         """
         aparam = input_dict["aparam"]
+        with tf.variable_scope("fitting_attr" + suffix, reuse=reuse):
+            t_aparam_nall = tf.constant(True, name="aparam_nall", dtype=tf.bool)
         self.mask = tf.cast(aparam, tf.int32)
         self.mask = tf.reshape(self.mask, [-1, natoms[1]])
 
diff --git a/deepmd/entrypoints/freeze.py b/deepmd/entrypoints/freeze.py
index c39dd4ad61..11e0d55645 100755
--- a/deepmd/entrypoints/freeze.py
+++ b/deepmd/entrypoints/freeze.py
@@ -224,6 +224,7 @@ def _make_node_names(
             "spin_attr/ntypes_spin",
             "fitting_attr/dfparam",
             "fitting_attr/daparam",
+            "fitting_attr/aparam_nall",
         ]
     elif model_type == "dos":
         nodes += [
diff --git a/deepmd/model/pairwise_dprc.py b/deepmd/model/pairwise_dprc.py
index bf158434b0..a9e154096a 100644
--- a/deepmd/model/pairwise_dprc.py
+++ b/deepmd/model/pairwise_dprc.py
@@ -125,6 +125,7 @@ def build(
         with tf.variable_scope("fitting_attr" + suffix, reuse=reuse):
             t_dfparam = tf.constant(0, name="dfparam", dtype=tf.int32)
             t_daparam = tf.constant(1, name="daparam", dtype=tf.int32)
+            t_aparam_nall = tf.constant(True, name="aparam_nall", dtype=tf.bool)
         with tf.variable_scope("descrpt_attr" + suffix, reuse=reuse):
             t_ntypes = tf.constant(self.ntypes, name="ntypes", dtype=tf.int32)
             t_rcut = tf.constant(
@@ -222,12 +223,14 @@ def build(
         virial = virial_qm + virial_qmmm
         virial = tf.identity(virial, name="o_virial" + suffix)
 
+        backward_qm_map_nloc = tf.slice(backward_qm_map, [0, 0], [-1, natoms[0]])
+        backward_qmmm_map_nloc = tf.slice(backward_qmmm_map, [0, 0], [-1, natoms[0]])
         atom_ener_qm = gather_placeholder(
-            qm_dict["atom_ener"], backward_qm_map, placeholder=0.0
+            qm_dict["atom_ener"], backward_qm_map_nloc, placeholder=0.0
         )
         atom_ener_qmmm = tf.math.segment_sum(
             gather_placeholder(
-                qmmm_dict["atom_ener"], backward_qmmm_map, placeholder=0.0
+                qmmm_dict["atom_ener"], backward_qmmm_map_nloc, placeholder=0.0
             ),
             qmmm_frame_idx,
         )
diff --git a/source/api_c/include/c_api.h b/source/api_c/include/c_api.h
index 6aa1268123..b0c030962a 100644
--- a/source/api_c/include/c_api.h
+++ b/source/api_c/include/c_api.h
@@ -2,6 +2,9 @@
 #pragma once
 #ifdef __cplusplus
 extern "C" {
+#else
+// for C99
+#include <stdbool.h>
 #endif
 
 /**
@@ -717,6 +720,16 @@ int DP_DeepPotGetDimFParam(DP_DeepPot* dp);
  */
 int DP_DeepPotGetDimAParam(DP_DeepPot* dp);
 
+/**
+ * @brief Check whether the atomic dimension of atomic parameters is nall
+ * instead of nloc.
+ *
+ * @param[in] dp The DP to use.
+ * @return true the atomic dimension of atomic parameters is nall
+ * @return false the atomic dimension of atomic parameters is nloc
+ */
+bool DP_DeepPotIsAParamNAll(DP_DeepPot* dp);
+
 /**
  * @brief Get the type map of a DP.
  * @param[in] dp The DP to use.
@@ -737,6 +750,16 @@ int DP_DeepPotModelDeviGetDimFParam(DP_DeepPotModelDevi* dp);
  */
 int DP_DeepPotModelDeviGetDimAParam(DP_DeepPotModelDevi* dp);
 
+/**
+ * @brief Check whether the atomic dimension of atomic parameters is nall
+ * instead of nloc.
+ *
+ * @param[in] dp The DP Model Deviation to use.
+ * @return true the atomic dimension of atomic parameters is nall
+ * @return false the atomic dimension of atomic parameters is nloc
+ */
+bool DP_DeepPotModelDeviIsAParamNAll(DP_DeepPotModelDevi* dp);
+
 /**
  * @brief The deep tensor.
  **/
diff --git a/source/api_c/include/c_api_internal.h b/source/api_c/include/c_api_internal.h
index 44bce2c696..85e1d2f421 100644
--- a/source/api_c/include/c_api_internal.h
+++ b/source/api_c/include/c_api_internal.h
@@ -41,6 +41,7 @@ struct DP_DeepPot {
   std::string exception;
   int dfparam;
   int daparam;
+  bool aparam_nall;
 };
 
 struct DP_DeepPotModelDevi {
@@ -51,6 +52,7 @@ struct DP_DeepPotModelDevi {
   std::string exception;
   int dfparam;
   int daparam;
+  bool aparam_nall;
 };
 
 struct DP_DeepTensor {
diff --git a/source/api_c/include/deepmd.hpp b/source/api_c/include/deepmd.hpp
index 532e01e805..71ff5b3dcc 100644
--- a/source/api_c/include/deepmd.hpp
+++ b/source/api_c/include/deepmd.hpp
@@ -597,6 +597,7 @@ class DeepPot {
     DP_CHECK_OK(DP_DeepPotCheckOK, dp);
     dfparam = DP_DeepPotGetDimFParam(dp);
     daparam = DP_DeepPotGetDimAParam(dp);
+    aparam_nall = DP_DeepPotIsAParamNAll(dp);
   };
 
   /**
@@ -771,9 +772,12 @@ class DeepPot {
     VALUETYPE *force_ = &force[0];
     VALUETYPE *virial_ = &virial[0];
     std::vector<VALUETYPE> fparam_, aparam_;
-    validate_fparam_aparam(nframes, natoms - nghost, fparam, aparam);
+    validate_fparam_aparam(nframes, (aparam_nall ? natoms : (natoms - nghost)),
+                           fparam, aparam);
     tile_fparam_aparam(fparam_, nframes, dfparam, fparam);
-    tile_fparam_aparam(aparam_, nframes, (natoms - nghost) * daparam, aparam);
+    tile_fparam_aparam(aparam_, nframes,
+                       (aparam_nall ? natoms : (natoms - nghost)) * daparam,
+                       aparam);
     const VALUETYPE *fparam__ = !fparam_.empty() ? &fparam_[0] : nullptr;
     const VALUETYPE *aparam__ = !aparam_.empty() ? &aparam_[0] : nullptr;
 
@@ -842,9 +846,12 @@ class DeepPot {
     VALUETYPE *atomic_ener_ = &atom_energy[0];
     VALUETYPE *atomic_virial_ = &atom_virial[0];
     std::vector<VALUETYPE> fparam_, aparam_;
-    validate_fparam_aparam(nframes, natoms - nghost, fparam, aparam);
+    validate_fparam_aparam(nframes, (aparam_nall ? natoms : (natoms - nghost)),
+                           fparam, aparam);
     tile_fparam_aparam(fparam_, nframes, dfparam, fparam);
-    tile_fparam_aparam(aparam_, nframes, (natoms - nghost) * daparam, aparam);
+    tile_fparam_aparam(aparam_, nframes,
+                       (aparam_nall ? natoms : (natoms - nghost)) * daparam,
+                       aparam);
     const VALUETYPE *fparam__ = !fparam_.empty() ? &fparam_[0] : nullptr;
     const VALUETYPE *aparam__ = !aparam_.empty() ? &aparam_[0] : nullptr;
 
@@ -1039,6 +1046,7 @@ class DeepPot {
   DP_DeepPot *dp;
   int dfparam;
   int daparam;
+  bool aparam_nall;
   template <typename VALUETYPE>
   void validate_fparam_aparam(const int &nframes,
                               const int &nloc,
@@ -1128,6 +1136,7 @@ class DeepPotModelDevi {
     numb_models = models.size();
     dfparam = DP_DeepPotModelDeviGetDimFParam(dp);
     daparam = DP_DeepPotModelDeviGetDimAParam(dp);
+    aparam_nall = DP_DeepPotModelDeviIsAParamNAll(dp);
   };
 
   /**
@@ -1173,9 +1182,12 @@ class DeepPotModelDevi {
     VALUETYPE *force_ = &force_flat[0];
     VALUETYPE *virial_ = &virial_flat[0];
     std::vector<VALUETYPE> fparam_, aparam_;
-    validate_fparam_aparam(nframes, natoms - nghost, fparam, aparam);
+    validate_fparam_aparam(nframes, (aparam_nall ? natoms : (natoms - nghost)),
+                           fparam, aparam);
     tile_fparam_aparam(fparam_, nframes, dfparam, fparam);
-    tile_fparam_aparam(aparam_, nframes, (natoms - nghost) * daparam, aparam);
+    tile_fparam_aparam(aparam_, nframes,
+                       (aparam_nall ? natoms : (natoms - nghost)) * daparam,
+                       aparam);
     const VALUETYPE *fparam__ = !fparam_.empty() ? &fparam_[0] : nullptr;
     const VALUETYPE *aparam__ = !aparam_.empty() ? &aparam_[0] : nullptr;
 
@@ -1250,9 +1262,12 @@ class DeepPotModelDevi {
     VALUETYPE *atomic_ener_ = &atom_energy_flat[0];
     VALUETYPE *atomic_virial_ = &atom_virial_flat[0];
     std::vector<VALUETYPE> fparam_, aparam_;
-    validate_fparam_aparam(nframes, natoms - nghost, fparam, aparam);
+    validate_fparam_aparam(nframes, (aparam_nall ? natoms : (natoms - nghost)),
+                           fparam, aparam);
     tile_fparam_aparam(fparam_, nframes, dfparam, fparam);
-    tile_fparam_aparam(aparam_, nframes, (natoms - nghost) * daparam, aparam);
+    tile_fparam_aparam(aparam_, nframes,
+                       (aparam_nall ? natoms : (natoms - nghost)) * daparam,
+                       aparam);
     const VALUETYPE *fparam__ = !fparam_.empty() ? &fparam_[0] : nullptr;
     const VALUETYPE *aparam__ = !aparam_.empty() ? &aparam_[0] : nullptr;
 
@@ -1448,6 +1463,7 @@ class DeepPotModelDevi {
   int numb_models;
   int dfparam;
   int daparam;
+  bool aparam_nall;
   template <typename VALUETYPE>
   void validate_fparam_aparam(const int &nframes,
                               const int &nloc,
diff --git a/source/api_c/src/c_api.cc b/source/api_c/src/c_api.cc
index 1e2ee47b8b..9d1ed7d323 100644
--- a/source/api_c/src/c_api.cc
+++ b/source/api_c/src/c_api.cc
@@ -29,6 +29,7 @@ DP_DeepPot::DP_DeepPot() {}
 DP_DeepPot::DP_DeepPot(deepmd::DeepPot& dp) : dp(dp) {
   dfparam = dp.dim_fparam();
   daparam = dp.dim_aparam();
+  aparam_nall = dp.is_aparam_nall();
 }
 
 DP_DeepPot* DP_NewDeepPot(const char* c_model) {
@@ -65,6 +66,7 @@ DP_DeepPotModelDevi::DP_DeepPotModelDevi(deepmd::DeepPotModelDevi& dp)
     : dp(dp) {
   dfparam = dp.dim_fparam();
   daparam = dp.dim_aparam();
+  aparam_nall = dp.is_aparam_nall();
 }
 
 DP_DeepPotModelDevi* DP_NewDeepPotModelDevi(const char** c_models,
@@ -249,7 +251,10 @@ inline void DP_DeepPotComputeNList_variant(DP_DeepPot* dp,
   }
   std::vector<VALUETYPE> aparam_;
   if (aparam) {
-    aparam_.assign(aparam, aparam + nframes * (natoms - nghost) * dp->daparam);
+    aparam_.assign(aparam,
+                   aparam + nframes *
+                                (dp->aparam_nall ? natoms : (natoms - nghost)) *
+                                dp->daparam);
   }
   std::vector<double> e;
   std::vector<VALUETYPE> f, v, ae, av;
@@ -433,7 +438,9 @@ void DP_DeepPotModelDeviComputeNList_variant(DP_DeepPotModelDevi* dp,
   }
   std::vector<VALUETYPE> aparam_;
   if (aparam) {
-    aparam_.assign(aparam, aparam + (natoms - nghost) * dp->daparam);
+    aparam_.assign(
+        aparam,
+        aparam + (dp->aparam_nall ? natoms : (natoms - nghost)) * dp->daparam);
   }
   // different from DeepPot
   std::vector<double> e;
@@ -1031,6 +1038,8 @@ int DP_DeepPotGetDimFParam(DP_DeepPot* dp) { return dp->dfparam; }
 
 int DP_DeepPotGetDimAParam(DP_DeepPot* dp) { return dp->daparam; }
 
+bool DP_DeepPotIsAParamNAll(DP_DeepPot* dp) { return dp->aparam_nall; }
+
 const char* DP_DeepPotCheckOK(DP_DeepPot* dp) {
   return string_to_char(dp->exception);
 }
@@ -1133,6 +1142,10 @@ int DP_DeepPotModelDeviGetDimAParam(DP_DeepPotModelDevi* dp) {
   return dp->daparam;
 }
 
+bool DP_DeepPotModelDeviIsAParamNAll(DP_DeepPotModelDevi* dp) {
+  return dp->aparam_nall;
+}
+
 const char* DP_DeepPotModelDeviCheckOK(DP_DeepPotModelDevi* dp) {
   return string_to_char(dp->exception);
 }
diff --git a/source/api_c/tests/test_deeppot_a_fparam_aparam.cc b/source/api_c/tests/test_deeppot_a_fparam_aparam.cc
index f4cdc42e72..a728ede22d 100644
--- a/source/api_c/tests/test_deeppot_a_fparam_aparam.cc
+++ b/source/api_c/tests/test_deeppot_a_fparam_aparam.cc
@@ -380,3 +380,57 @@ TYPED_TEST(TestInferDeepPotAFParamAParam, cpu_lmp_nlist_2rc) {
     EXPECT_LT(fabs(virial[ii] - expected_tot_v[ii]), EPSILON);
   }
 }
+
+template <class VALUETYPE>
+class TestInferAParamNAll : public ::testing::Test {
+ protected:
+  std::vector<VALUETYPE> coord = {12.83, 2.56, 2.18, 12.09, 2.87, 2.74,
+                                  00.25, 3.32, 1.68, 3.36,  3.00, 1.81,
+                                  3.51,  2.51, 2.60, 4.27,  3.22, 1.56};
+  std::vector<int> atype = {0, 0, 0, 0, 0, 0};
+  std::vector<VALUETYPE> box = {13., 0., 0., 0., 13., 0., 0., 0., 13.};
+  int natoms = 6;
+
+  deepmd::hpp::DeepPot dp;
+
+  void SetUp() override {
+    std::string file_name = "../../tests/infer/pairwise_dprc.pbtxt";
+    deepmd::hpp::convert_pbtxt_to_pb(file_name, "pairwise_dprc.pb");
+    dp.init("pairwise_dprc.pb");
+  };
+
+  void TearDown() override { remove("fparam_aparam.pb"); };
+};
+
+TYPED_TEST_SUITE(TestInferAParamNAll, ValueTypes);
+
+TYPED_TEST(TestInferAParamNAll, cpu_lmp_nlist) {
+  using VALUETYPE = TypeParam;
+  std::vector<VALUETYPE>& coord = this->coord;
+  std::vector<int>& atype = this->atype;
+  std::vector<VALUETYPE>& box = this->box;
+  int& natoms = this->natoms;
+  deepmd::hpp::DeepPot& dp = this->dp;
+  float rc = dp.cutoff();
+  int nloc = coord.size() / 3;
+  std::vector<VALUETYPE> coord_cpy;
+  std::vector<int> atype_cpy, mapping;
+  std::vector<std::vector<int> > nlist_data;
+  _build_nlist<VALUETYPE>(nlist_data, coord_cpy, atype_cpy, mapping, coord,
+                          atype, box, rc);
+  int nall = coord_cpy.size() / 3;
+  // nall aparam
+  std::vector<VALUETYPE> aparam_cpy(nall, 0);
+  // for some reason all QM atoms do not work
+  aparam_cpy[0] = 1;
+  std::vector<int> ilist(nloc), numneigh(nloc);
+  std::vector<int*> firstneigh(nloc);
+  deepmd::hpp::InputNlist inlist(nloc, &ilist[0], &numneigh[0], &firstneigh[0]);
+  convert_nlist(inlist, nlist_data);
+
+  double ener;
+  std::vector<VALUETYPE> force_, virial;
+  dp.compute(ener, force_, virial, coord_cpy, atype_cpy, box, nall - nloc,
+             inlist, 0, std::vector<VALUETYPE>(), aparam_cpy);
+  // just check if the interface accepts nall aparam; no interest with results
+}
diff --git a/source/api_cc/include/DeepPot.h b/source/api_cc/include/DeepPot.h
index 4db012ea4f..7c4a0afe10 100644
--- a/source/api_cc/include/DeepPot.h
+++ b/source/api_cc/include/DeepPot.h
@@ -291,6 +291,16 @@ class DeepPot {
    **/
   void get_type_map(std::string& type_map);
 
+  /**
+   * @brief Get whether the atom dimension of aparam is nall instead of nloc.
+   * @return true if the atom dimension of aparam is nall, false if it is
+   * nloc.
+   **/
+  bool is_aparam_nall() const {
+    assert(inited);
+    return aparam_nall;
+  };
+
  private:
   tensorflow::Session* session;
   int num_intra_nthreads, num_inter_nthreads;
@@ -309,6 +319,7 @@ class DeepPot {
   int ntypes_spin;
   int dfparam;
   int daparam;
+  bool aparam_nall;
   /**
    * @brief Validate the size of frame and atomic parameters.
    * @param[in] nframes The number of frames.
@@ -572,6 +583,15 @@ class DeepPotModelDevi {
   void compute_relative_std_f(std::vector<VALUETYPE>& std,
                               const std::vector<VALUETYPE>& avg,
                               const VALUETYPE eps);
+  /**
+   * @brief Get whether the atom dimension of aparam is nall instead of nloc.
+   * @return true if the atom dimension of aparam is nall, false if it is
+   * nloc.
+   **/
+  bool is_aparam_nall() const {
+    assert(inited);
+    return aparam_nall;
+  };
 
  private:
   unsigned numb_models;
@@ -592,6 +612,7 @@ class DeepPotModelDevi {
   int ntypes_spin;
   int dfparam;
   int daparam;
+  bool aparam_nall;
   template <typename VALUETYPE>
   void validate_fparam_aparam(const int& nloc,
                               const std::vector<VALUETYPE>& fparam,
diff --git a/source/api_cc/include/common.h b/source/api_cc/include/common.h
index 2bcb3cc77f..481e09cc89 100644
--- a/source/api_cc/include/common.h
+++ b/source/api_cc/include/common.h
@@ -90,7 +90,8 @@ void select_real_atoms_coord(std::vector<VALUETYPE>& dcoord,
                              const int& ntypes,
                              const int& nframes,
                              const int& daparam,
-                             const int& nall);
+                             const int& nall,
+                             const bool aparam_nall = false);
 
 /**
  * @brief Apply the given map to a vector.
@@ -228,6 +229,8 @@ int session_get_dtype(tensorflow::Session* session,
  * @param[in] aparam_ Atom parameters.
  * @param[in] atommap Atom map.
  * @param[in] scope The scope of the tensors.
+ * @param[in] aparam_nall Whether the atomic dimension of atomic parameters is
+ * nall.
  */
 template <typename MODELTYPE, typename VALUETYPE>
 int session_input_tensors(
@@ -240,7 +243,8 @@ int session_input_tensors(
     const std::vector<VALUETYPE>& fparam_,
     const std::vector<VALUETYPE>& aparam_,
     const deepmd::AtomMap& atommap,
-    const std::string scope = "");
+    const std::string scope = "",
+    const bool aparam_nall = false);
 
 /**
  * @brief Get input tensors.
@@ -255,6 +259,8 @@ int session_input_tensors(
  * @param[in] nghost Number of ghost atoms.
  * @param[in] ago Update the internal neighbour list if ago is 0.
  * @param[in] scope The scope of the tensors.
+ * @param[in] aparam_nall Whether the atomic dimension of atomic parameters is
+ * nall.
  */
 template <typename MODELTYPE, typename VALUETYPE>
 int session_input_tensors(
@@ -269,7 +275,8 @@ int session_input_tensors(
     const deepmd::AtomMap& atommap,
     const int nghost,
     const int ago,
-    const std::string scope = "");
+    const std::string scope = "",
+    const bool aparam_nall = false);
 
 /**
  * @brief Get input tensors for mixed type.
@@ -285,6 +292,8 @@ int session_input_tensors(
  * @param[in] nghost Number of ghost atoms.
  * @param[in] ago Update the internal neighbour list if ago is 0.
  * @param[in] scope The scope of the tensors.
+ * @param[in] aparam_nall Whether the atomic dimension of atomic parameters is
+ * nall.
  */
 template <typename MODELTYPE, typename VALUETYPE>
 int session_input_tensors_mixed_type(
@@ -298,7 +307,8 @@ int session_input_tensors_mixed_type(
     const std::vector<VALUETYPE>& fparam_,
     const std::vector<VALUETYPE>& aparam_,
     const deepmd::AtomMap& atommap,
-    const std::string scope = "");
+    const std::string scope = "",
+    const bool aparam_nall = false);
 
 /**
  * @brief Read model file to a string.
diff --git a/source/api_cc/src/DeepPot.cc b/source/api_cc/src/DeepPot.cc
index e20989eb9d..785ed00cb8 100644
--- a/source/api_cc/src/DeepPot.cc
+++ b/source/api_cc/src/DeepPot.cc
@@ -479,6 +479,15 @@ void DeepPot::init(const std::string& model,
   if (daparam < 0) {
     daparam = 0;
   }
+  if (daparam > 0) {
+    try {
+      aparam_nall = get_scalar<bool>("fitting_attr/aparam_nall");
+    } catch (deepmd::deepmd_exception) {
+      aparam_nall = false;
+    }
+  } else {
+    aparam_nall = false;
+  }
   model_type = get_scalar<STRINGTYPE>("model_attr/model_type");
   inited = true;
 
@@ -571,23 +580,25 @@ void DeepPot::compute(ENERGYVTYPE& dener,
   assert(nloc == atommap.get_type().size());
   std::vector<VALUETYPE> fparam;
   std::vector<VALUETYPE> aparam;
-  validate_fparam_aparam(nframes, nloc, fparam_, aparam_);
+  validate_fparam_aparam(nframes, (aparam_nall ? nall : nloc), fparam_,
+                         aparam_);
   tile_fparam_aparam(fparam, nframes, dfparam, fparam_);
-  tile_fparam_aparam(aparam, nframes, nloc * daparam, aparam_);
+  tile_fparam_aparam(aparam, nframes, (aparam_nall ? nall : nloc) * daparam,
+                     aparam_);
 
   std::vector<std::pair<std::string, Tensor>> input_tensors;
 
   if (dtype == tensorflow::DT_DOUBLE) {
-    int ret =
-        session_input_tensors<double>(input_tensors, dcoord_, ntypes, datype_,
-                                      dbox, cell_size, fparam, aparam, atommap);
+    int ret = session_input_tensors<double>(input_tensors, dcoord_, ntypes,
+                                            datype_, dbox, cell_size, fparam,
+                                            aparam, atommap, "", aparam_nall);
     assert(ret == nloc);
     run_model<double>(dener, dforce_, dvirial, session, input_tensors, atommap,
                       nframes);
   } else {
-    int ret =
-        session_input_tensors<float>(input_tensors, dcoord_, ntypes, datype_,
-                                     dbox, cell_size, fparam, aparam, atommap);
+    int ret = session_input_tensors<float>(input_tensors, dcoord_, ntypes,
+                                           datype_, dbox, cell_size, fparam,
+                                           aparam, atommap, "", aparam_nall);
     assert(ret == nloc);
     run_model<float>(dener, dforce_, dvirial, session, input_tensors, atommap,
                      nframes);
@@ -650,9 +661,12 @@ void DeepPot::compute(ENERGYVTYPE& dener,
   int nframes = dcoord_.size() / nall / 3;
   std::vector<VALUETYPE> fparam;
   std::vector<VALUETYPE> aparam_;
-  validate_fparam_aparam(nframes, nall - nghost, fparam_, aparam__);
+  validate_fparam_aparam(nframes, (aparam_nall ? nall : (nall - nghost)),
+                         fparam_, aparam__);
   tile_fparam_aparam(fparam, nframes, dfparam, fparam_);
-  tile_fparam_aparam(aparam_, nframes, (nall - nghost) * daparam, aparam__);
+  tile_fparam_aparam(aparam_, nframes,
+                     (aparam_nall ? nall : (nall - nghost)) * daparam,
+                     aparam__);
 
   // select real atoms
   std::vector<VALUETYPE> dcoord, dforce, aparam;
@@ -660,7 +674,7 @@ void DeepPot::compute(ENERGYVTYPE& dener,
   int nghost_real, nall_real, nloc_real;
   select_real_atoms_coord(dcoord, datype, aparam, nghost_real, fwd_map, bkw_map,
                           nall_real, nloc_real, dcoord_, datype_, aparam_,
-                          nghost, ntypes, nframes, daparam, nall);
+                          nghost, ntypes, nframes, daparam, nall, aparam_nall);
 
   // internal nlist
   if (ago == 0) {
@@ -752,16 +766,16 @@ void DeepPot::compute_inner(ENERGYVTYPE& dener,
     nlist_data.make_inlist(nlist);
   }
   if (dtype == tensorflow::DT_DOUBLE) {
-    int ret = session_input_tensors<double>(input_tensors, dcoord_, ntypes,
-                                            datype_, dbox, nlist, fparam,
-                                            aparam, atommap, nghost, ago);
+    int ret = session_input_tensors<double>(
+        input_tensors, dcoord_, ntypes, datype_, dbox, nlist, fparam, aparam,
+        atommap, nghost, ago, "", aparam_nall);
     assert(nloc == ret);
     run_model<double>(dener, dforce_, dvirial, session, input_tensors, atommap,
                       nframes, nghost);
   } else {
-    int ret = session_input_tensors<float>(input_tensors, dcoord_, ntypes,
-                                           datype_, dbox, nlist, fparam, aparam,
-                                           atommap, nghost, ago);
+    int ret = session_input_tensors<float>(
+        input_tensors, dcoord_, ntypes, datype_, dbox, nlist, fparam, aparam,
+        atommap, nghost, ago, "", aparam_nall);
     assert(nloc == ret);
     run_model<float>(dener, dforce_, dvirial, session, input_tensors, atommap,
                      nframes, nghost);
@@ -839,15 +853,15 @@ void DeepPot::compute(ENERGYVTYPE& dener,
   std::vector<std::pair<std::string, Tensor>> input_tensors;
 
   if (dtype == tensorflow::DT_DOUBLE) {
-    int nloc =
-        session_input_tensors<double>(input_tensors, dcoord_, ntypes, datype_,
-                                      dbox, cell_size, fparam, aparam, atommap);
+    int nloc = session_input_tensors<double>(input_tensors, dcoord_, ntypes,
+                                             datype_, dbox, cell_size, fparam,
+                                             aparam, atommap, "", aparam_nall);
     run_model<double>(dener, dforce_, dvirial, datom_energy_, datom_virial_,
                       session, input_tensors, atommap, nframes);
   } else {
-    int nloc =
-        session_input_tensors<float>(input_tensors, dcoord_, ntypes, datype_,
-                                     dbox, cell_size, fparam, aparam, atommap);
+    int nloc = session_input_tensors<float>(input_tensors, dcoord_, ntypes,
+                                            datype_, dbox, cell_size, fparam,
+                                            aparam, atommap, "", aparam_nall);
     run_model<float>(dener, dforce_, dvirial, datom_energy_, datom_virial_,
                      session, input_tensors, atommap, nframes);
   }
@@ -920,9 +934,11 @@ void DeepPot::compute(ENERGYVTYPE& dener,
   int nloc = nall - nghost;
   std::vector<VALUETYPE> fparam;
   std::vector<VALUETYPE> aparam_;
-  validate_fparam_aparam(nframes, nloc, fparam_, aparam__);
+  validate_fparam_aparam(nframes, (aparam_nall ? nall : nloc), fparam_,
+                         aparam__);
   tile_fparam_aparam(fparam, nframes, dfparam, fparam_);
-  tile_fparam_aparam(aparam_, nframes, nloc * daparam, aparam__);
+  tile_fparam_aparam(aparam_, nframes, (aparam_nall ? nall : nloc) * daparam,
+                     aparam__);
   std::vector<std::pair<std::string, Tensor>> input_tensors;
   // select real atoms
   std::vector<VALUETYPE> dcoord, dforce, aparam, datom_energy, datom_virial;
@@ -930,7 +946,7 @@ void DeepPot::compute(ENERGYVTYPE& dener,
   int nghost_real, nall_real, nloc_real;
   select_real_atoms_coord(dcoord, datype, aparam, nghost_real, fwd_map, bkw_map,
                           nall_real, nloc_real, dcoord_, datype_, aparam_,
-                          nghost, ntypes, nframes, daparam, nall);
+                          nghost, ntypes, nframes, daparam, nall, aparam_nall);
 
   if (ago == 0) {
     atommap = deepmd::AtomMap(datype.begin(), datype.begin() + nloc_real);
@@ -943,16 +959,16 @@ void DeepPot::compute(ENERGYVTYPE& dener,
   }
 
   if (dtype == tensorflow::DT_DOUBLE) {
-    int ret = session_input_tensors<double>(input_tensors, dcoord, ntypes,
-                                            datype, dbox, nlist, fparam, aparam,
-                                            atommap, nghost_real, ago);
+    int ret = session_input_tensors<double>(
+        input_tensors, dcoord, ntypes, datype, dbox, nlist, fparam, aparam,
+        atommap, nghost_real, ago, "", aparam_nall);
     assert(nloc_real == ret);
     run_model<double>(dener, dforce, dvirial, datom_energy, datom_virial,
                       session, input_tensors, atommap, nframes, nghost_real);
   } else {
-    int ret = session_input_tensors<float>(input_tensors, dcoord, ntypes,
-                                           datype, dbox, nlist, fparam, aparam,
-                                           atommap, nghost_real, ago);
+    int ret = session_input_tensors<float>(
+        input_tensors, dcoord, ntypes, datype, dbox, nlist, fparam, aparam,
+        atommap, nghost_real, ago, "", aparam_nall);
     assert(nloc_real == ret);
     run_model<float>(dener, dforce, dvirial, datom_energy, datom_virial,
                      session, input_tensors, atommap, nframes, nghost_real);
@@ -1055,14 +1071,14 @@ void DeepPot::compute_mixed_type(ENERGYVTYPE& dener,
   if (dtype == tensorflow::DT_DOUBLE) {
     int ret = session_input_tensors_mixed_type<double>(
         input_tensors, nframes, dcoord_, ntypes, datype_, dbox, cell_size,
-        fparam, aparam, atommap);
+        fparam, aparam, atommap, "", aparam_nall);
     assert(ret == nloc);
     run_model<double>(dener, dforce_, dvirial, session, input_tensors, atommap,
                       nframes);
   } else {
     int ret = session_input_tensors_mixed_type<float>(
         input_tensors, nframes, dcoord_, ntypes, datype_, dbox, cell_size,
-        fparam, aparam, atommap);
+        fparam, aparam, atommap, "", aparam_nall);
     assert(ret == nloc);
     run_model<float>(dener, dforce_, dvirial, session, input_tensors, atommap,
                      nframes);
@@ -1139,13 +1155,13 @@ void DeepPot::compute_mixed_type(ENERGYVTYPE& dener,
   if (dtype == tensorflow::DT_DOUBLE) {
     int nloc = session_input_tensors_mixed_type<double>(
         input_tensors, nframes, dcoord_, ntypes, datype_, dbox, cell_size,
-        fparam, aparam, atommap);
+        fparam, aparam, atommap, "", aparam_nall);
     run_model<double>(dener, dforce_, dvirial, datom_energy_, datom_virial_,
                       session, input_tensors, atommap, nframes);
   } else {
     int nloc = session_input_tensors_mixed_type<float>(
         input_tensors, nframes, dcoord_, ntypes, datype_, dbox, cell_size,
-        fparam, aparam, atommap);
+        fparam, aparam, atommap, "", aparam_nall);
     run_model<float>(dener, dforce_, dvirial, datom_energy_, datom_virial_,
                      session, input_tensors, atommap, nframes);
   }
@@ -1307,6 +1323,15 @@ void DeepPotModelDevi::init(const std::vector<std::string>& models,
   if (daparam < 0) {
     daparam = 0;
   }
+  if (daparam > 0) {
+    try {
+      aparam_nall = get_scalar<bool>("fitting_attr/aparam_nall");
+    } catch (deepmd::deepmd_exception) {
+      aparam_nall = false;
+    }
+  } else {
+    aparam_nall = false;
+  }
   model_type = get_scalar<STRINGTYPE>("model_attr/model_type");
   // rcut = get_rcut();
   // cell_size = rcut;
@@ -1425,7 +1450,7 @@ void DeepPotModelDevi::compute(std::vector<ENERGYTYPE>& all_energy,
   int nall = dcoord_.size() / 3;
   int nframes = 1;
   int nloc = nall - nghost;
-  validate_fparam_aparam(nloc, fparam, aparam_);
+  validate_fparam_aparam((aparam_nall ? nall : nloc), fparam, aparam_);
   std::vector<std::pair<std::string, Tensor>> input_tensors;
 
   // select real atoms
@@ -1434,7 +1459,7 @@ void DeepPotModelDevi::compute(std::vector<ENERGYTYPE>& all_energy,
   int nghost_real, nall_real, nloc_real;
   select_real_atoms_coord(dcoord, datype, aparam, nghost_real, fwd_map, bkw_map,
                           nall_real, nloc_real, dcoord_, datype_, aparam_,
-                          nghost, ntypes, nframes, daparam, nall);
+                          nghost, ntypes, nframes, daparam, nall, aparam_nall);
 
   // agp == 0 means that the LAMMPS nbor list has been updated
   if (ago == 0) {
@@ -1450,11 +1475,11 @@ void DeepPotModelDevi::compute(std::vector<ENERGYTYPE>& all_energy,
   if (dtype == tensorflow::DT_DOUBLE) {
     ret = session_input_tensors<double>(input_tensors, dcoord, ntypes, datype,
                                         dbox, nlist, fparam, aparam, atommap,
-                                        nghost_real, ago);
+                                        nghost_real, ago, "", aparam_nall);
   } else {
     ret = session_input_tensors<float>(input_tensors, dcoord, ntypes, datype,
                                        dbox, nlist, fparam, aparam, atommap,
-                                       nghost_real, ago);
+                                       nghost_real, ago, "", aparam_nall);
   }
   all_energy.resize(numb_models);
   all_force.resize(numb_models);
@@ -1523,7 +1548,7 @@ void DeepPotModelDevi::compute(
   int nframes = 1;
   int nall = dcoord_.size() / 3;
   int nloc = nall - nghost;
-  validate_fparam_aparam(nloc, fparam, aparam_);
+  validate_fparam_aparam((aparam_nall ? nall : nloc), fparam, aparam_);
   std::vector<std::pair<std::string, Tensor>> input_tensors;
 
   // select real atoms
@@ -1532,7 +1557,7 @@ void DeepPotModelDevi::compute(
   int nghost_real, nall_real, nloc_real;
   select_real_atoms_coord(dcoord, datype, aparam, nghost_real, fwd_map, bkw_map,
                           nall_real, nloc_real, dcoord_, datype_, aparam_,
-                          nghost, ntypes, nframes, daparam, nall);
+                          nghost, ntypes, nframes, daparam, nall, aparam_nall);
   // agp == 0 means that the LAMMPS nbor list has been updated
 
   if (ago == 0) {
@@ -1548,11 +1573,11 @@ void DeepPotModelDevi::compute(
   if (dtype == tensorflow::DT_DOUBLE) {
     ret = session_input_tensors<double>(input_tensors, dcoord, ntypes, datype,
                                         dbox, nlist, fparam, aparam, atommap,
-                                        nghost_real, ago);
+                                        nghost_real, ago, "", aparam_nall);
   } else {
     ret = session_input_tensors<float>(input_tensors, dcoord, ntypes, datype,
                                        dbox, nlist, fparam, aparam, atommap,
-                                       nghost_real, ago);
+                                       nghost_real, ago, "", aparam_nall);
   }
 
   all_energy.resize(numb_models);
diff --git a/source/api_cc/src/common.cc b/source/api_cc/src/common.cc
index 43412c4c43..0e2526414d 100644
--- a/source/api_cc/src/common.cc
+++ b/source/api_cc/src/common.cc
@@ -164,7 +164,8 @@ void deepmd::select_real_atoms_coord(std::vector<VALUETYPE>& dcoord,
                                      const int& ntypes,
                                      const int& nframes,
                                      const int& daparam,
-                                     const int& nall) {
+                                     const int& nall,
+                                     const bool aparam_nall) {
   select_real_atoms(fwd_map, bkw_map, nghost_real, dcoord_, datype_, nghost,
                     ntypes);
   // resize to nall_real
@@ -177,9 +178,10 @@ void deepmd::select_real_atoms_coord(std::vector<VALUETYPE>& dcoord,
   select_map<int>(datype, datype_, fwd_map, 1);
   // aparam
   if (daparam > 0) {
-    aparam.resize(nframes * nloc_real);
-    select_map<VALUETYPE>(aparam, aparam_, fwd_map, daparam, nframes, nloc_real,
-                          nall - nghost);
+    aparam.resize(nframes * (aparam_nall ? nall_real : nloc_real));
+    select_map<VALUETYPE>(aparam, aparam_, fwd_map, daparam, nframes,
+                          (aparam_nall ? nall_real : nloc_real),
+                          (aparam_nall ? nall : (nall - nghost)));
   }
 }
 
@@ -199,7 +201,8 @@ template void deepmd::select_real_atoms_coord<double>(
     const int& ntypes,
     const int& nframes,
     const int& daparam,
-    const int& nall);
+    const int& nall,
+    const bool aparam_nall);
 
 template void deepmd::select_real_atoms_coord<float>(
     std::vector<float>& dcoord,
@@ -217,7 +220,8 @@ template void deepmd::select_real_atoms_coord<float>(
     const int& ntypes,
     const int& nframes,
     const int& daparam,
-    const int& nall);
+    const int& nall,
+    const bool aparam_nall);
 
 void deepmd::NeighborListData::copy_from_nlist(const InputNlist& inlist) {
   int inum = inlist.inum;
@@ -374,7 +378,8 @@ int deepmd::session_input_tensors(
     const std::vector<VALUETYPE>& fparam_,
     const std::vector<VALUETYPE>& aparam__,
     const deepmd::AtomMap& atommap,
-    const std::string scope) {
+    const std::string scope,
+    const bool aparam_nall) {
   int nframes = dcoord_.size() / 3 / datype_.size();
   int nall = datype_.size();
   int nloc = nall;
@@ -440,8 +445,10 @@ int deepmd::session_input_tensors(
   std::vector<VALUETYPE> dcoord(dcoord_);
   atommap.forward<VALUETYPE>(dcoord.begin(), dcoord_.begin(), 3, nframes, nall);
   std::vector<VALUETYPE> aparam_(aparam__);
-  atommap.forward<VALUETYPE>(aparam_.begin(), aparam__.begin(),
-                             aparam__.size() / nframes / nloc, nframes, nloc);
+  atommap.forward<VALUETYPE>(
+      aparam_.begin(), aparam__.begin(),
+      aparam__.size() / nframes / (aparam_nall ? nall : nloc), nframes,
+      (aparam_nall ? nall : nloc));
 
   for (int ii = 0; ii < nframes; ++ii) {
     for (int jj = 0; jj < nall * 3; ++jj) {
@@ -511,7 +518,8 @@ int deepmd::session_input_tensors(
     const deepmd::AtomMap& atommap,
     const int nghost,
     const int ago,
-    const std::string scope) {
+    const std::string scope,
+    const bool aparam_nall) {
   int nframes = dcoord_.size() / 3 / datype_.size();
   int nall = datype_.size();
   int nloc = nall - nghost;
@@ -573,8 +581,10 @@ int deepmd::session_input_tensors(
   std::vector<VALUETYPE> dcoord(dcoord_);
   atommap.forward<VALUETYPE>(dcoord.begin(), dcoord_.begin(), 3, nframes, nall);
   std::vector<VALUETYPE> aparam_(aparam__);
-  atommap.forward<VALUETYPE>(aparam_.begin(), aparam__.begin(),
-                             aparam__.size() / nframes / nloc, nframes, nloc);
+  atommap.forward<VALUETYPE>(
+      aparam_.begin(), aparam__.begin(),
+      aparam__.size() / nframes / (aparam_nall ? nall : nloc), nframes,
+      (aparam_nall ? nall : nloc));
 
   for (int ii = 0; ii < nframes; ++ii) {
     for (int jj = 0; jj < nall * 3; ++jj) {
@@ -645,7 +655,8 @@ int deepmd::session_input_tensors_mixed_type(
     const std::vector<VALUETYPE>& fparam_,
     const std::vector<VALUETYPE>& aparam__,
     const deepmd::AtomMap& atommap,
-    const std::string scope) {
+    const std::string scope,
+    const bool aparam_nall) {
   int nall = datype_.size() / nframes;
   int nloc = nall;
   assert(nall * 3 * nframes == dcoord_.size());
@@ -706,8 +717,10 @@ int deepmd::session_input_tensors_mixed_type(
   std::vector<VALUETYPE> dcoord(dcoord_);
   atommap.forward<VALUETYPE>(dcoord.begin(), dcoord_.begin(), 3, nframes, nall);
   std::vector<VALUETYPE> aparam_(aparam__);
-  atommap.forward<VALUETYPE>(aparam_.begin(), aparam__.begin(),
-                             aparam__.size() / nframes / nloc, nframes, nloc);
+  atommap.forward<VALUETYPE>(
+      aparam_.begin(), aparam__.begin(),
+      aparam__.size() / nframes / (aparam_nall ? nall : nloc), nframes,
+      (aparam_nall ? nall : nloc));
 
   for (int ii = 0; ii < nframes; ++ii) {
     for (int jj = 0; jj < nall * 3; ++jj) {
@@ -919,6 +932,10 @@ template int deepmd::session_get_scalar<int>(Session*,
                                              const std::string,
                                              const std::string);
 
+template bool deepmd::session_get_scalar<bool>(Session*,
+                                               const std::string,
+                                               const std::string);
+
 template void deepmd::session_get_vector<int>(std::vector<int>&,
                                               Session*,
                                               const std::string,
@@ -1092,7 +1109,8 @@ template int deepmd::session_input_tensors<double, double>(
     const std::vector<double>& fparam_,
     const std::vector<double>& aparam_,
     const deepmd::AtomMap& atommap,
-    const std::string scope);
+    const std::string scope,
+    const bool aparam_nall);
 template int deepmd::session_input_tensors<float, double>(
     std::vector<std::pair<std::string, tensorflow::Tensor>>& input_tensors,
     const std::vector<double>& dcoord_,
@@ -1103,7 +1121,8 @@ template int deepmd::session_input_tensors<float, double>(
     const std::vector<double>& fparam_,
     const std::vector<double>& aparam_,
     const deepmd::AtomMap& atommap,
-    const std::string scope);
+    const std::string scope,
+    const bool aparam_nall);
 
 template int deepmd::session_input_tensors<double, float>(
     std::vector<std::pair<std::string, tensorflow::Tensor>>& input_tensors,
@@ -1115,7 +1134,8 @@ template int deepmd::session_input_tensors<double, float>(
     const std::vector<float>& fparam_,
     const std::vector<float>& aparam_,
     const deepmd::AtomMap& atommap,
-    const std::string scope);
+    const std::string scope,
+    const bool aparam_nall);
 template int deepmd::session_input_tensors<float, float>(
     std::vector<std::pair<std::string, tensorflow::Tensor>>& input_tensors,
     const std::vector<float>& dcoord_,
@@ -1126,7 +1146,8 @@ template int deepmd::session_input_tensors<float, float>(
     const std::vector<float>& fparam_,
     const std::vector<float>& aparam_,
     const deepmd::AtomMap& atommap,
-    const std::string scope);
+    const std::string scope,
+    const bool aparam_nall);
 
 template int deepmd::session_input_tensors<double, double>(
     std::vector<std::pair<std::string, tensorflow::Tensor>>& input_tensors,
@@ -1140,7 +1161,8 @@ template int deepmd::session_input_tensors<double, double>(
     const deepmd::AtomMap& atommap,
     const int nghost,
     const int ago,
-    const std::string scope);
+    const std::string scope,
+    const bool aparam_nall);
 template int deepmd::session_input_tensors<float, double>(
     std::vector<std::pair<std::string, tensorflow::Tensor>>& input_tensors,
     const std::vector<double>& dcoord_,
@@ -1153,7 +1175,8 @@ template int deepmd::session_input_tensors<float, double>(
     const deepmd::AtomMap& atommap,
     const int nghost,
     const int ago,
-    const std::string scope);
+    const std::string scope,
+    const bool aparam_nall);
 
 template int deepmd::session_input_tensors<double, float>(
     std::vector<std::pair<std::string, tensorflow::Tensor>>& input_tensors,
@@ -1167,7 +1190,8 @@ template int deepmd::session_input_tensors<double, float>(
     const deepmd::AtomMap& atommap,
     const int nghost,
     const int ago,
-    const std::string scope);
+    const std::string scope,
+    const bool aparam_nall);
 template int deepmd::session_input_tensors<float, float>(
     std::vector<std::pair<std::string, tensorflow::Tensor>>& input_tensors,
     const std::vector<float>& dcoord_,
@@ -1180,7 +1204,8 @@ template int deepmd::session_input_tensors<float, float>(
     const deepmd::AtomMap& atommap,
     const int nghost,
     const int ago,
-    const std::string scope);
+    const std::string scope,
+    const bool aparam_nall);
 
 template int deepmd::session_input_tensors_mixed_type<double, double>(
     std::vector<std::pair<std::string, tensorflow::Tensor>>& input_tensors,
@@ -1193,7 +1218,8 @@ template int deepmd::session_input_tensors_mixed_type<double, double>(
     const std::vector<double>& fparam_,
     const std::vector<double>& aparam_,
     const deepmd::AtomMap& atommap,
-    const std::string scope);
+    const std::string scope,
+    const bool aparam_nall);
 template int deepmd::session_input_tensors_mixed_type<float, double>(
     std::vector<std::pair<std::string, tensorflow::Tensor>>& input_tensors,
     const int& nframes,
@@ -1205,7 +1231,8 @@ template int deepmd::session_input_tensors_mixed_type<float, double>(
     const std::vector<double>& fparam_,
     const std::vector<double>& aparam_,
     const deepmd::AtomMap& atommap,
-    const std::string scope);
+    const std::string scope,
+    const bool aparam_nall);
 
 template int deepmd::session_input_tensors_mixed_type<double, float>(
     std::vector<std::pair<std::string, tensorflow::Tensor>>& input_tensors,
@@ -1218,7 +1245,8 @@ template int deepmd::session_input_tensors_mixed_type<double, float>(
     const std::vector<float>& fparam_,
     const std::vector<float>& aparam_,
     const deepmd::AtomMap& atommap,
-    const std::string scope);
+    const std::string scope,
+    const bool aparam_nall);
 template int deepmd::session_input_tensors_mixed_type<float, float>(
     std::vector<std::pair<std::string, tensorflow::Tensor>>& input_tensors,
     const int& nframes,
@@ -1230,7 +1258,8 @@ template int deepmd::session_input_tensors_mixed_type<float, float>(
     const std::vector<float>& fparam_,
     const std::vector<float>& aparam_,
     const deepmd::AtomMap& atommap,
-    const std::string scope);
+    const std::string scope,
+    const bool aparam_nall);
 
 void deepmd::print_summary(const std::string& pre) {
   int num_intra_nthreads, num_inter_nthreads;
diff --git a/source/lib/src/pairwise.cc b/source/lib/src/pairwise.cc
index 428e92baa4..3fea27bd71 100644
--- a/source/lib/src/pairwise.cc
+++ b/source/lib/src/pairwise.cc
@@ -95,7 +95,7 @@ void deepmd::dprc_pairwise_map_cpu(
   // (3, 4, 8, 9, -1, 10, -1)
   forward_qmmm_map.resize((nfragments - 1) * map_size);
   std::fill(forward_qmmm_map.begin(), forward_qmmm_map.end(), -1);
-  int nqm_real;
+  int nqm_real = nloc;  // init for nfragments = 1
   for (int ii = 0; ii < nfragments - 1; ++ii) {
     // real
     for (int jj = 0, kk = 0; jj < nqm; ++jj) {
diff --git a/source/op/pairwise.cc b/source/op/pairwise.cc
index dfcfce6736..ee55c3dff3 100644
--- a/source/op/pairwise.cc
+++ b/source/op/pairwise.cc
@@ -78,7 +78,7 @@ class PairwiseIdxOp : public OpKernel {
       nghost_qmmm.push_back(nghost_qmmm_ii);
       nframes_qmmm.push_back(backward_qmmm_map.size() / nall);
     }
-    int max_nloc_qm = 0, max_nloc_qmmm = 0, max_nghost_qm = 0,
+    int max_nloc_qm = 1, max_nloc_qmmm = 1, max_nghost_qm = 0,
         max_nghost_qmmm = 0;
     for (int ii = 0; ii < nframes; ++ii) {
       max_nloc_qm = std::max(max_nloc_qm, nloc_qm[ii]);
@@ -160,6 +160,10 @@ class PairwiseIdxOp : public OpKernel {
       }
       for (int jj = 0; jj < nall; ++jj) {
         m_backward_qm_map(ii, jj) = backward_qm_maps[ii][jj];
+        // the ghost index should add the padding indexes
+        if (m_backward_qm_map(ii, jj) >= nloc_qm[ii]) {
+          m_backward_qm_map(ii, jj) += max_nloc_qm - nloc_qm[ii];
+        }
       }
       for (int kk = 0; kk < nframes_qmmm[ii]; ++kk) {
         for (int jj = 0; jj < max_nloc_qmmm + max_nghost_qmmm; ++jj) {
@@ -180,6 +184,10 @@ class PairwiseIdxOp : public OpKernel {
         for (int jj = 0; jj < nall; ++jj) {
           // max_nloc_qmmm + max_nghost_qmmm
           m_backward_qmmm_map(nn, jj) = backward_qmmm_maps[ii][kk * nall + jj];
+          // the ghost index should add the padding indexes
+          if (m_backward_qmmm_map(nn, jj) >= nloc_qmmm[ii]) {
+            m_backward_qmmm_map(nn, jj) += max_nloc_qmmm - nloc_qmmm[ii];
+          }
         }
         m_qmmm_frame_idx(nn) = ii;
         nn++;
diff --git a/source/tests/infer/pairwise_dprc.pbtxt b/source/tests/infer/pairwise_dprc.pbtxt
new file mode 100644
index 0000000000..1469bda72f
--- /dev/null
+++ b/source/tests/infer/pairwise_dprc.pbtxt
@@ -0,0 +1,44536 @@
+node {
+  name: "train_attr/min_nbor_dist"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_DOUBLE
+        tensor_shape {
+        }
+        double_val: 0.9571980274118028
+      }
+    }
+  }
+}
+node {
+  name: "train_attr/training_script"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_STRING
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_STRING
+        tensor_shape {
+        }
+        string_val: "{\"model\":{\"type\":\"pairwise_dprc\",\"type_map\":[\"C\",\"P\",\"O\",\"H\",\"OW\",\"HW\"],\"type_embedding\":{\"neuron\":[8],\"precision\":\"float32\",\"activation_function\":\"tanh\",\"resnet_dt\":false,\"trainable\":true,\"seed\":null},\"qm_model\":{\"descriptor\":{\"type\":\"se_atten_v2\",\"sel\":24,\"rcut_smth\":0.5,\"rcut\":9.0,\"attn_layer\":0,\"neuron\":[2,4,8],\"resnet_dt\":false,\"axis_neuron\":4,\"precision\":\"float32\",\"seed\":1,\"activation_function\":\"tanh\",\"type_one_side\":false,\"trainable\":true,\"exclude_types\":[],\"attn\":128,\"attn_dotr\":true,\"attn_mask\":false,\"set_davg_zero\":false},\"fitting_net\":{\"type\":\"ener\",\"neuron\":[2,4,8],\"resnet_dt\":true,\"precision\":\"float32\",\"atom_ener\":[null,null,null,null,0.0,0.0],\"seed\":1,\"numb_fparam\":0,\"numb_aparam\":0,\"activation_function\":\"tanh\",\"trainable\":true,\"rcond\":null,\"use_aparam_as_mask\":false},\"data_stat_nbatch\":10,\"data_stat_protect\":0.01,\"data_bias_nsample\":10,\"srtab_add_bias\":true,\"type\":\"standard\"},\"qmmm_model\":{\"descriptor\":{\"type\":\"se_atten_v2\",\"sel\":27,\"rcut_smth\":0.5,\"rcut\":6.0,\"attn_layer\":0,\"neuron\":[2,4,8],\"resnet_dt\":false,\"axis_neuron\":4,\"set_davg_zero\":true,\"exclude_types\":[[0,0],[0,1],[0,2],[0,3],[1,1],[1,2],[1,3],[2,2],[2,3],[3,3],[4,4],[4,5],[5,5]],\"precision\":\"float32\",\"seed\":1,\"activation_function\":\"tanh\",\"type_one_side\":false,\"trainable\":true,\"attn\":128,\"attn_dotr\":true,\"attn_mask\":false},\"fitting_net\":{\"type\":\"ener\",\"neuron\":[2,2,2],\"resnet_dt\":true,\"seed\":1,\"precision\":\"float32\",\"atom_ener\":[0.0,0.0,0.0,0.0,0.0,0.0],\"numb_fparam\":0,\"numb_aparam\":0,\"activation_function\":\"tanh\",\"trainable\":true,\"rcond\":null,\"use_aparam_as_mask\":false},\"data_stat_nbatch\":10,\"data_stat_protect\":0.01,\"data_bias_nsample\":10,\"srtab_add_bias\":true,\"type\":\"standard\"},\"data_stat_nbatch\":10,\"data_stat_protect\":0.01,\"data_bias_nsample\":10,\"sr
tab_add_bias\":true},\"learning_rate\":{\"type\":\"exp\",\"decay_steps\":5000,\"start_lr\":0.001,\"stop_lr\":3.51e-08,\"scale_by_worker\":\"linear\"},\"loss\":{\"type\":\"ener\",\"start_pref_e\":0.02,\"limit_pref_e\":1,\"start_pref_f\":1000,\"limit_pref_f\":1,\"start_pref_v\":0,\"limit_pref_v\":0,\"start_pref_ae\":0.0,\"limit_pref_ae\":0.0,\"start_pref_pf\":0.0,\"limit_pref_pf\":0.0,\"enable_atom_ener_coeff\":false,\"start_pref_gf\":0.0,\"limit_pref_gf\":0.0,\"numb_generalized_coord\":0},\"training\":{\"training_data\":{\"systems\":[\"../data\"],\"batch_size\":\"auto\",\"set_prefix\":\"set\",\"auto_prob\":\"prob_sys_size\",\"sys_probs\":null},\"numb_steps\":0,\"seed\":10,\"disp_file\":\"lcurve.out\",\"disp_freq\":100,\"save_freq\":1000,\"validation_data\":null,\"save_ckpt\":\"model.ckpt\",\"disp_training\":true,\"time_training\":true,\"profiling\":false,\"profiling_file\":\"timeline.json\",\"enable_profiler\":false,\"tensorboard\":false,\"tensorboard_log_dir\":\"log\",\"tensorboard_freq\":1}}"
+      }
+    }
+  }
+}
+node {
+  name: "model_type"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_STRING
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_STRING
+        tensor_shape {
+        }
+        string_val: "original_model"
+      }
+    }
+  }
+}
+node {
+  name: "t_box"
+  op: "Placeholder"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: -1
+        }
+      }
+    }
+  }
+}
+node {
+  name: "t_coord"
+  op: "Placeholder"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: -1
+        }
+      }
+    }
+  }
+}
+node {
+  name: "t_aparam"
+  op: "Placeholder"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: -1
+        }
+      }
+    }
+  }
+}
+node {
+  name: "t_type"
+  op: "Placeholder"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: -1
+        }
+      }
+    }
+  }
+}
+node {
+  name: "t_natoms"
+  op: "Placeholder"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 8
+        }
+      }
+    }
+  }
+}
+node {
+  name: "t_mesh"
+  op: "Placeholder"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: -1
+        }
+      }
+    }
+  }
+}
+node {
+  name: "model_attr/tmap"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_STRING
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_STRING
+        tensor_shape {
+        }
+        string_val: "C P O H OW HW"
+      }
+    }
+  }
+}
+node {
+  name: "model_attr/model_type"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_STRING
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_STRING
+        tensor_shape {
+        }
+        string_val: "ener"
+      }
+    }
+  }
+}
+node {
+  name: "model_attr/model_version"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_STRING
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_STRING
+        tensor_shape {
+        }
+        string_val: "1.1"
+      }
+    }
+  }
+}
+node {
+  name: "fitting_attr/dfparam"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "fitting_attr/daparam"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "fitting_attr/aparam_nall"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_BOOL
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_BOOL
+        tensor_shape {
+        }
+        bool_val: true
+      }
+    }
+  }
+}
+node {
+  name: "descrpt_attr/ntypes"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 6
+      }
+    }
+  }
+}
+node {
+  name: "descrpt_attr/rcut"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_DOUBLE
+        tensor_shape {
+        }
+        double_val: 9.0
+      }
+    }
+  }
+}
+node {
+  name: "Reshape/shape"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\377\377\377\377\t\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "Reshape"
+  op: "Reshape"
+  input: "t_box"
+  input: "Reshape/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "Shape"
+  op: "Shape"
+  input: "Reshape"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+  experimental_type {
+    type_id: TFT_PRODUCT
+    args {
+      type_id: TFT_SHAPE_TENSOR
+      args {
+        type_id: TFT_INT32
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice/stack"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice/stack_1"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice/stack_2"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice"
+  op: "StridedSlice"
+  input: "Shape"
+  input: "strided_slice/stack"
+  input: "strided_slice/stack_1"
+  input: "strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "Cast"
+  op: "Cast"
+  input: "t_aparam"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "Truncate"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "strided_slice_1/stack"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_1/stack_1"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_1/stack_2"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_1"
+  op: "StridedSlice"
+  input: "t_natoms"
+  input: "strided_slice_1/stack"
+  input: "strided_slice_1/stack_1"
+  input: "strided_slice_1/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "Reshape_1/shape"
+  op: "Pack"
+  input: "strided_slice"
+  input: "strided_slice_1"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "Reshape_1"
+  op: "Reshape"
+  input: "Cast"
+  input: "Reshape_1/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "DprcPairwiseIdx"
+  op: "DprcPairwiseIdx"
+  input: "Reshape_1"
+  input: "t_natoms"
+}
+node {
+  name: "strided_slice_2/stack"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_2/stack_1"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_2/stack_2"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_2"
+  op: "StridedSlice"
+  input: "t_natoms"
+  input: "strided_slice_2/stack"
+  input: "strided_slice_2/stack_1"
+  input: "strided_slice_2/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "Reshape_2/shape/2"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 3
+      }
+    }
+  }
+}
+node {
+  name: "Reshape_2/shape"
+  op: "Pack"
+  input: "strided_slice"
+  input: "strided_slice_2"
+  input: "Reshape_2/shape/2"
+  attr {
+    key: "N"
+    value {
+      i: 3
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "Reshape_2"
+  op: "Reshape"
+  input: "t_coord"
+  input: "Reshape_2/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "strided_slice_3/stack"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_3/stack_1"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_3/stack_2"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_3"
+  op: "StridedSlice"
+  input: "t_natoms"
+  input: "strided_slice_3/stack"
+  input: "strided_slice_3/stack_1"
+  input: "strided_slice_3/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "Reshape_3/shape/2"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "Reshape_3/shape"
+  op: "Pack"
+  input: "strided_slice"
+  input: "strided_slice_3"
+  input: "Reshape_3/shape/2"
+  attr {
+    key: "N"
+    value {
+      i: 3
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "Reshape_3"
+  op: "Reshape"
+  input: "t_type"
+  input: "Reshape_3/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "Shape_1"
+  op: "Shape"
+  input: "DprcPairwiseIdx:6"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+  experimental_type {
+    type_id: TFT_PRODUCT
+    args {
+      type_id: TFT_SHAPE_TENSOR
+      args {
+        type_id: TFT_INT32
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_4/stack"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_4/stack_1"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_4/stack_2"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_4"
+  op: "StridedSlice"
+  input: "Shape_1"
+  input: "strided_slice_4/stack"
+  input: "strided_slice_4/stack_1"
+  input: "strided_slice_4/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "Shape_2"
+  op: "Shape"
+  input: "Reshape_2"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+  experimental_type {
+    type_id: TFT_PRODUCT
+    args {
+      type_id: TFT_SHAPE_TENSOR
+      args {
+        type_id: TFT_INT32
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_5/stack"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_5/stack_1"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_5/stack_2"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_5"
+  op: "StridedSlice"
+  input: "Shape_2"
+  input: "strided_slice_5/stack"
+  input: "strided_slice_5/stack_1"
+  input: "strided_slice_5/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "Shape_3"
+  op: "Shape"
+  input: "Reshape_2"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+  experimental_type {
+    type_id: TFT_PRODUCT
+    args {
+      type_id: TFT_SHAPE_TENSOR
+      args {
+        type_id: TFT_INT32
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_6/stack"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_6/stack_1"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_6/stack_2"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_6"
+  op: "StridedSlice"
+  input: "Shape_3"
+  input: "strided_slice_6/stack"
+  input: "strided_slice_6/stack_1"
+  input: "strided_slice_6/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "concat/values_0/1"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "concat/values_0"
+  op: "Pack"
+  input: "strided_slice_5"
+  input: "concat/values_0/1"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "concat/axis"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "concat"
+  op: "ConcatV2"
+  input: "concat/values_0"
+  input: "strided_slice_6"
+  input: "concat/axis"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "Fill/value"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "Fill"
+  op: "Fill"
+  input: "concat"
+  input: "Fill/value"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "index_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "Cast_1"
+  op: "Cast"
+  input: "Fill"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Truncate"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "concat_1/axis"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "concat_1"
+  op: "ConcatV2"
+  input: "Cast_1"
+  input: "Reshape_2"
+  input: "concat_1/axis"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "add/y"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "add"
+  op: "AddV2"
+  input: "DprcPairwiseIdx"
+  input: "add/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "GatherV2/axis"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "GatherV2"
+  op: "GatherV2"
+  input: "concat_1"
+  input: "add"
+  input: "GatherV2/axis"
+  attr {
+    key: "Taxis"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tindices"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tparams"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "batch_dims"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "Shape_4"
+  op: "Shape"
+  input: "Reshape_3"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+  experimental_type {
+    type_id: TFT_PRODUCT
+    args {
+      type_id: TFT_SHAPE_TENSOR
+      args {
+        type_id: TFT_INT32
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_7/stack"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_7/stack_1"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_7/stack_2"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_7"
+  op: "StridedSlice"
+  input: "Shape_4"
+  input: "strided_slice_7/stack"
+  input: "strided_slice_7/stack_1"
+  input: "strided_slice_7/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "Shape_5"
+  op: "Shape"
+  input: "Reshape_3"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+  experimental_type {
+    type_id: TFT_PRODUCT
+    args {
+      type_id: TFT_SHAPE_TENSOR
+      args {
+        type_id: TFT_INT32
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_8/stack"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_8/stack_1"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_8/stack_2"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_8"
+  op: "StridedSlice"
+  input: "Shape_5"
+  input: "strided_slice_8/stack"
+  input: "strided_slice_8/stack_1"
+  input: "strided_slice_8/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "concat_2/values_0/1"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "concat_2/values_0"
+  op: "Pack"
+  input: "strided_slice_7"
+  input: "concat_2/values_0/1"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "concat_2/axis"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "concat_2"
+  op: "ConcatV2"
+  input: "concat_2/values_0"
+  input: "strided_slice_8"
+  input: "concat_2/axis"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "Fill_1/value"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "Fill_1"
+  op: "Fill"
+  input: "concat_2"
+  input: "Fill_1/value"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "index_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "concat_3/axis"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "concat_3"
+  op: "ConcatV2"
+  input: "Fill_1"
+  input: "Reshape_3"
+  input: "concat_3/axis"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "add_1/y"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "add_1"
+  op: "AddV2"
+  input: "DprcPairwiseIdx"
+  input: "add_1/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "GatherV2_1/axis"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "GatherV2_1"
+  op: "GatherV2"
+  input: "concat_3"
+  input: "add_1"
+  input: "GatherV2_1/axis"
+  attr {
+    key: "Taxis"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tindices"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tparams"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "batch_dims"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "GatherV2_2/axis"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "GatherV2_2"
+  op: "GatherV2"
+  input: "Reshape_2"
+  input: "DprcPairwiseIdx:6"
+  input: "GatherV2_2/axis"
+  attr {
+    key: "Taxis"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tindices"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tparams"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "batch_dims"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "Shape_6"
+  op: "Shape"
+  input: "GatherV2_2"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+  experimental_type {
+    type_id: TFT_PRODUCT
+    args {
+      type_id: TFT_SHAPE_TENSOR
+      args {
+        type_id: TFT_INT32
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_9/stack"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_9/stack_1"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_9/stack_2"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_9"
+  op: "StridedSlice"
+  input: "Shape_6"
+  input: "strided_slice_9/stack"
+  input: "strided_slice_9/stack_1"
+  input: "strided_slice_9/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "Shape_7"
+  op: "Shape"
+  input: "GatherV2_2"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+  experimental_type {
+    type_id: TFT_PRODUCT
+    args {
+      type_id: TFT_SHAPE_TENSOR
+      args {
+        type_id: TFT_INT32
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_10/stack"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_10/stack_1"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_10/stack_2"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_10"
+  op: "StridedSlice"
+  input: "Shape_7"
+  input: "strided_slice_10/stack"
+  input: "strided_slice_10/stack_1"
+  input: "strided_slice_10/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "concat_4/values_0/1"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "concat_4/values_0"
+  op: "Pack"
+  input: "strided_slice_9"
+  input: "concat_4/values_0/1"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "concat_4/axis"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "concat_4"
+  op: "ConcatV2"
+  input: "concat_4/values_0"
+  input: "strided_slice_10"
+  input: "concat_4/axis"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "Fill_2/value"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "Fill_2"
+  op: "Fill"
+  input: "concat_4"
+  input: "Fill_2/value"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "index_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "Cast_2"
+  op: "Cast"
+  input: "Fill_2"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Truncate"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "concat_5/axis"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "concat_5"
+  op: "ConcatV2"
+  input: "Cast_2"
+  input: "GatherV2_2"
+  input: "concat_5/axis"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "add_2/y"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "add_2"
+  op: "AddV2"
+  input: "DprcPairwiseIdx:2"
+  input: "add_2/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "GatherV2_3/axis"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "GatherV2_3"
+  op: "GatherV2"
+  input: "concat_5"
+  input: "add_2"
+  input: "GatherV2_3/axis"
+  attr {
+    key: "Taxis"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tindices"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tparams"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "batch_dims"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "GatherV2_4/axis"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "GatherV2_4"
+  op: "GatherV2"
+  input: "Reshape_3"
+  input: "DprcPairwiseIdx:6"
+  input: "GatherV2_4/axis"
+  attr {
+    key: "Taxis"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tindices"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tparams"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "batch_dims"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "Shape_8"
+  op: "Shape"
+  input: "GatherV2_4"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+  experimental_type {
+    type_id: TFT_PRODUCT
+    args {
+      type_id: TFT_SHAPE_TENSOR
+      args {
+        type_id: TFT_INT32
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_11/stack"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_11/stack_1"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_11/stack_2"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_11"
+  op: "StridedSlice"
+  input: "Shape_8"
+  input: "strided_slice_11/stack"
+  input: "strided_slice_11/stack_1"
+  input: "strided_slice_11/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "Shape_9"
+  op: "Shape"
+  input: "GatherV2_4"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+  experimental_type {
+    type_id: TFT_PRODUCT
+    args {
+      type_id: TFT_SHAPE_TENSOR
+      args {
+        type_id: TFT_INT32
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_12/stack"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_12/stack_1"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_12/stack_2"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_12"
+  op: "StridedSlice"
+  input: "Shape_9"
+  input: "strided_slice_12/stack"
+  input: "strided_slice_12/stack_1"
+  input: "strided_slice_12/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "concat_6/values_0/1"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "concat_6/values_0"
+  op: "Pack"
+  input: "strided_slice_11"
+  input: "concat_6/values_0/1"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "concat_6/axis"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "concat_6"
+  op: "ConcatV2"
+  input: "concat_6/values_0"
+  input: "strided_slice_12"
+  input: "concat_6/axis"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "Fill_3/value"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "Fill_3"
+  op: "Fill"
+  input: "concat_6"
+  input: "Fill_3/value"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "index_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "concat_7/axis"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "concat_7"
+  op: "ConcatV2"
+  input: "Fill_3"
+  input: "GatherV2_4"
+  input: "concat_7/axis"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "add_3/y"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "add_3"
+  op: "AddV2"
+  input: "DprcPairwiseIdx:2"
+  input: "add_3/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "GatherV2_5/axis"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "GatherV2_5"
+  op: "GatherV2"
+  input: "concat_7"
+  input: "add_3"
+  input: "GatherV2_5/axis"
+  attr {
+    key: "Taxis"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tindices"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tparams"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "batch_dims"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "GatherV2_6/axis"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "GatherV2_6"
+  op: "GatherV2"
+  input: "Reshape"
+  input: "DprcPairwiseIdx:6"
+  input: "GatherV2_6/axis"
+  attr {
+    key: "Taxis"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tindices"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tparams"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "batch_dims"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "Const"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 6
+          }
+        }
+        tensor_content: "\000\000\000\000\001\000\000\000\002\000\000\000\003\000\000\000\004\000\000\000\005\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "one_hot/on_value"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 1.0
+      }
+    }
+  }
+}
+node {
+  name: "one_hot/off_value"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "one_hot/depth"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 6
+      }
+    }
+  }
+}
+node {
+  name: "one_hot"
+  op: "OneHot"
+  input: "Const"
+  input: "one_hot/depth"
+  input: "one_hot/on_value"
+  input: "one_hot/off_value"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "TI"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: -1
+    }
+  }
+}
+node {
+  name: "Reshape_4/shape"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\377\377\377\377\006\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "Reshape_4"
+  op: "Reshape"
+  input: "one_hot"
+  input: "Reshape_4/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "type_embed_net/matrix_1"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 6
+          }
+          dim {
+            size: 8
+          }
+        }
+        tensor_content: "\354T\211>\035\322\362>P\373h\276+\345\265\274r\357\330>\367\240\230\276\2658\364<\325\216\013>U\000\330=Qk_<B\234\020\275Y0\263\275\033t:\276\246\367\216\276\234\021\260\275z=\276>JU\016\276\225\341\\=\305\r\213\276\"\002\177<\241\341\310>\027\205$>>>v\276\333.\031\276\377\206\356\276\322\r\330\275{Ph>\030Y\254\276D\032\367\275S\305\312\276\033\230,>)\217P\276\320k\371=\330\353\363\276\247[e\276o\224\246=\322\240>\274qa\251>\340\333?=M\266B>\024\345\361\276\356\231\216\275\341[\'>=\263\245\275w\322\275>\370A\r\277j1\225>\223\205\262\276"
+      }
+    }
+  }
+}
+node {
+  name: "type_embed_net/matrix_1/read"
+  op: "Identity"
+  input: "type_embed_net/matrix_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@type_embed_net/matrix_1"
+      }
+    }
+  }
+}
+node {
+  name: "type_embed_net/bias_1"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 8
+          }
+        }
+        tensor_content: "\213\262\020?;I\021\277\207Pb?:\334W>\025\300\005\300\310!\354>E\013\355?\246\232\342?"
+      }
+    }
+  }
+}
+node {
+  name: "type_embed_net/bias_1/read"
+  op: "Identity"
+  input: "type_embed_net/bias_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@type_embed_net/bias_1"
+      }
+    }
+  }
+}
+node {
+  name: "type_embed_net/MatMul"
+  op: "MatMul"
+  input: "Reshape_4"
+  input: "type_embed_net/matrix_1/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "type_embed_net/BiasAdd"
+  op: "BiasAdd"
+  input: "type_embed_net/MatMul"
+  input: "type_embed_net/bias_1/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+}
+node {
+  name: "type_embed_net/Tanh"
+  op: "Tanh"
+  input: "type_embed_net/BiasAdd"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+}
+node {
+  name: "type_embed_net/Reshape/shape"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\377\377\377\377\010\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "type_embed_net/Reshape"
+  op: "Reshape"
+  input: "type_embed_net/Tanh"
+  input: "type_embed_net/Reshape/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "Reshape_5/shape"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\377\377\377\377\010\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "Reshape_5"
+  op: "Reshape"
+  input: "type_embed_net/Reshape"
+  input: "Reshape_5/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "zeros"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 8
+          }
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "concat_8/axis"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "concat_8"
+  op: "ConcatV2"
+  input: "Reshape_5"
+  input: "zeros"
+  input: "concat_8/axis"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "t_typeebd"
+  op: "Identity"
+  input: "concat_8"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+}
+node {
+  name: "strided_slice_13/stack"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_13/stack_1"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_13/stack_2"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_13"
+  op: "StridedSlice"
+  input: "DprcPairwiseIdx:4"
+  input: "strided_slice_13/stack"
+  input: "strided_slice_13/stack_1"
+  input: "strided_slice_13/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "mul/y"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 3
+      }
+    }
+  }
+}
+node {
+  name: "mul"
+  op: "Mul"
+  input: "strided_slice_13"
+  input: "mul/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "Reshape_6/shape/0"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "Reshape_6/shape"
+  op: "Pack"
+  input: "Reshape_6/shape/0"
+  input: "mul"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "Reshape_6"
+  op: "Reshape"
+  input: "GatherV2"
+  input: "Reshape_6/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "strided_slice_14/stack"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_14/stack_1"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_14/stack_2"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_14"
+  op: "StridedSlice"
+  input: "DprcPairwiseIdx:4"
+  input: "strided_slice_14/stack"
+  input: "strided_slice_14/stack_1"
+  input: "strided_slice_14/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "Reshape_7/shape/0"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "Reshape_7/shape"
+  op: "Pack"
+  input: "Reshape_7/shape/0"
+  input: "strided_slice_14"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "Reshape_7"
+  op: "Reshape"
+  input: "GatherV2_1"
+  input: "Reshape_7/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "Shape_10"
+  op: "Shape"
+  input: "Reshape_6"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+  experimental_type {
+    type_id: TFT_PRODUCT
+    args {
+      type_id: TFT_SHAPE_TENSOR
+      args {
+        type_id: TFT_INT32
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_15/stack"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_15/stack_1"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_15/stack_2"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_15"
+  op: "StridedSlice"
+  input: "Shape_10"
+  input: "strided_slice_15/stack"
+  input: "strided_slice_15/stack_1"
+  input: "strided_slice_15/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "descrpt_attr_qm/sel"
+  op: "Const"
+  attr {
+    key: "_has_manual_control_dependencies"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 24
+      }
+    }
+  }
+}
+node {
+  name: "descrpt_attr_qm/original_sel"
+  op: "Const"
+  attr {
+    key: "_has_manual_control_dependencies"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 24
+      }
+    }
+  }
+}
+node {
+  name: "descrpt_attr_qm/t_avg"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_DOUBLE
+        tensor_shape {
+          dim {
+            size: 6
+          }
+          dim {
+            size: 96
+          }
+        }
+        tensor_content: "\r*\264e\205\336\327?\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\r*\264e\205\336\327?\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\r*\264e\205\336\327?\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\r*\264e\205\336\327?\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\r*\264e\205\336\327?\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\r*\264e\205\336\327?\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\r*\264e\205\336\327?\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\r*\264e\205\336\327?\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\r*\264e\205\336\327?\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\r*\264e\205\336\327?\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\r*\264e\205\336\327?\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\r*\264e\205\336\327?\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\r*\264e\205\336\327?\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\r*\264e\205\336\327?\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\r*\264e\205\336\327?\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\r*\264e\205\336\327?\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\r*\264e\205\336\327?\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\
000\000\000\000\r*\264e\205\336\327?\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\r*\264e\205\336\327?\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\r*\264e\205\336\327?\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\r*\264e\205\336\327?\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\r*\264e\205\336\327?\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\r*\264e\205\336\327?\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\r*\264e\205\336\327?\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000V\322\345F;\274\325?\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000V\322\345F;\274\325?\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000V\322\345F;\274\325?\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000V\322\345F;\274\325?\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000V\322\345F;\274\325?\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000V\322\345F;\274\325?\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000V\322\345F;\274\325?\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000V\322\345F;\274\325?\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000V\322\345F;\274\325?\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000V\322\345F;\274\325?\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000V\322\
345F;\274\325?\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000V\322\345F;\274\325?\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000V\322\345F;\274\325?\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000V\322\345F;\274\325?\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000V\322\345F;\274\325?\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000V\322\345F;\274\325?\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000V\322\345F;\274\325?\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000V\322\345F;\274\325?\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000V\322\345F;\274\325?\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000V\322\345F;\274\325?\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000V\322\345F;\274\325?\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000V\322\345F;\274\325?\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000V\322\345F;\274\325?\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000V\322\345F;\274\325?\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\277\211\260/\206/\326?\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\277\211\260/\206/\326?\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\277\211\260/\206/\326?\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\277\211\260/\206/\326?\0
00\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\277\211\260/\206/\326?\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\277\211\260/\206/\326?\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\277\211\260/\206/\326?\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\277\211\260/\206/\326?\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\277\211\260/\206/\326?\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\277\211\260/\206/\326?\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\277\211\260/\206/\326?\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\277\211\260/\206/\326?\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\277\211\260/\206/\326?\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\277\211\260/\206/\326?\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\277\211\260/\206/\326?\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\277\211\260/\206/\326?\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\277\211\260/\206/\326?\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\277\211\260/\206/\326?\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\277\211\260/\206/\326?\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\277\211\260/\206/\326?\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\2
77\211\260/\206/\326?\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\277\211\260/\206/\326?\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\277\211\260/\206/\326?\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\277\211\260/\206/\326?\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000(Z]\014\212\333\325?\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000(Z]\014\212\333\325?\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000(Z]\014\212\333\325?\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000(Z]\014\212\333\325?\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000(Z]\014\212\333\325?\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000(Z]\014\212\333\325?\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000(Z]\014\212\333\325?\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000(Z]\014\212\333\325?\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000(Z]\014\212\333\325?\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000(Z]\014\212\333\325?\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000(Z]\014\212\333\325?\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000(Z]\014\212\333\325?\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000(Z]\014\212\333\325?\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000(Z]\014\212\333\32
5?\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000(Z]\014\212\333\325?\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000(Z]\014\212\333\325?\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000(Z]\014\212\333\325?\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000(Z]\014\212\333\325?\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000(Z]\014\212\333\325?\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000(Z]\014\212\333\325?\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000(Z]\014\212\333\325?\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000(Z]\014\212\333\325?\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000(Z]\014\212\333\325?\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000(Z]\014\212\333\325?\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\230K\"9\363\016\323?\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\230K\"9\363\016\323?\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\230K\"9\363\016\323?\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\230K\"9\363\016\323?\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\230K\"9\363\016\323?\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\230K\"9\363\016\323?\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\230K\"9\363\016\323?\000\000\000\000\00
0\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\230K\"9\363\016\323?\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\230K\"9\363\016\323?\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\230K\"9\363\016\323?\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\230K\"9\363\016\323?\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\230K\"9\363\016\323?\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\230K\"9\363\016\323?\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\230K\"9\363\016\323?\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\230K\"9\363\016\323?\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\230K\"9\363\016\323?\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\230K\"9\363\016\323?\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\230K\"9\363\016\323?\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\230K\"9\363\016\323?\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\230K\"9\363\016\323?\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\230K\"9\363\016\323?\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\230K\"9\363\016\323?\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\230K\"9\363\016\323?\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\230K\"9\363\016\323?\000\000\000\000\000\000\000\0
00\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\t\004\343\307\226_\322?\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\t\004\343\307\226_\322?\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\t\004\343\307\226_\322?\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\t\004\343\307\226_\322?\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\t\004\343\307\226_\322?\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\t\004\343\307\226_\322?\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\t\004\343\307\226_\322?\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\t\004\343\307\226_\322?\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\t\004\343\307\226_\322?\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\t\004\343\307\226_\322?\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\t\004\343\307\226_\322?\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\t\004\343\307\226_\322?\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\t\004\343\307\226_\322?\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\t\004\343\307\226_\322?\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\t\004\343\307\226_\322?\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\t\004\343\307\226_\322?\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\t\004\343\307
\226_\322?\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\t\004\343\307\226_\322?\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\t\004\343\307\226_\322?\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\t\004\343\307\226_\322?\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\t\004\343\307\226_\322?\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\t\004\343\307\226_\322?\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\t\004\343\307\226_\322?\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\t\004\343\307\226_\322?\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "descrpt_attr_qm/t_avg/read"
+  op: "Identity"
+  input: "descrpt_attr_qm/t_avg"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@descrpt_attr_qm/t_avg"
+      }
+    }
+  }
+}
+node {
+  name: "descrpt_attr_qm/t_std"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_DOUBLE
+        tensor_shape {
+          dim {
+            size: 6
+          }
+          dim {
+            size: 96
+          }
+        }
+        tensor_content: "2\253`h.D\311?\306\262a\322\227.\317?\306\262a\322\227.\317?\306\262a\322\227.\317?2\253`h.D\311?\306\262a\322\227.\317?\306\262a\322\227.\317?\306\262a\322\227.\317?2\253`h.D\311?\306\262a\322\227.\317?\306\262a\322\227.\317?\306\262a\322\227.\317?2\253`h.D\311?\306\262a\322\227.\317?\306\262a\322\227.\317?\306\262a\322\227.\317?2\253`h.D\311?\306\262a\322\227.\317?\306\262a\322\227.\317?\306\262a\322\227.\317?2\253`h.D\311?\306\262a\322\227.\317?\306\262a\322\227.\317?\306\262a\322\227.\317?2\253`h.D\311?\306\262a\322\227.\317?\306\262a\322\227.\317?\306\262a\322\227.\317?2\253`h.D\311?\306\262a\322\227.\317?\306\262a\322\227.\317?\306\262a\322\227.\317?2\253`h.D\311?\306\262a\322\227.\317?\306\262a\322\227.\317?\306\262a\322\227.\317?2\253`h.D\311?\306\262a\322\227.\317?\306\262a\322\227.\317?\306\262a\322\227.\317?2\253`h.D\311?\306\262a\322\227.\317?\306\262a\322\227.\317?\306\262a\322\227.\317?2\253`h.D\311?\306\262a\322\227.\317?\306\262a\322\227.\317?\306\262a\322\227.\317?2\253`h.D\311?\306\262a\322\227.\317?\306\262a\322\227.\317?\306\262a\322\227.\317?2\253`h.D\311?\306\262a\322\227.\317?\306\262a\322\227.\317?\306\262a\322\227.\317?2\253`h.D\311?\306\262a\322\227.\317?\306\262a\322\227.\317?\306\262a\322\227.\317?2\253`h.D\311?\306\262a\322\227.\317?\306\262a\322\227.\317?\306\262a\322\227.\317?2\253`h.D\311?\306\262a\322\227.\317?\306\262a\322\227.\317?\306\262a\322\227.\317?2\253`h.D\311?\306\262a\322\227.\317?\306\262a\322\227.\317?\306\262a\322\227.\317?2\253`h.D\311?\306\262a\322\227.\317?\306\262a\322\227.\317?\306\262a\322\227.\317?2\253`h.D\311?\306\262a\322\227.\317?\306\262a\322\227.\317?\306\262a\322\227.\317?2\253`h.D\311?\306\262a\322\227.\317?\306\262a\322\227.\317?\306\262a\322\227.\317?2\253`h.D\311?\306\262a\322\227.\317?\306\262a\322\227.\317?\306\262a\322\227.\317?2\253`h.D\311?\306\262a\322\227.\317?\306\262a\322\227.\317?\306\262a\322\227.\317?2\253`h.D\311?\306\262a\322\227.\317?\306\262a\322\227.\317?\306\
262a\322\227.\317?\020R\020\347Vn\300?\361\366$\240\350\321\312?\361\366$\240\350\321\312?\361\366$\240\350\321\312?\020R\020\347Vn\300?\361\366$\240\350\321\312?\361\366$\240\350\321\312?\361\366$\240\350\321\312?\020R\020\347Vn\300?\361\366$\240\350\321\312?\361\366$\240\350\321\312?\361\366$\240\350\321\312?\020R\020\347Vn\300?\361\366$\240\350\321\312?\361\366$\240\350\321\312?\361\366$\240\350\321\312?\020R\020\347Vn\300?\361\366$\240\350\321\312?\361\366$\240\350\321\312?\361\366$\240\350\321\312?\020R\020\347Vn\300?\361\366$\240\350\321\312?\361\366$\240\350\321\312?\361\366$\240\350\321\312?\020R\020\347Vn\300?\361\366$\240\350\321\312?\361\366$\240\350\321\312?\361\366$\240\350\321\312?\020R\020\347Vn\300?\361\366$\240\350\321\312?\361\366$\240\350\321\312?\361\366$\240\350\321\312?\020R\020\347Vn\300?\361\366$\240\350\321\312?\361\366$\240\350\321\312?\361\366$\240\350\321\312?\020R\020\347Vn\300?\361\366$\240\350\321\312?\361\366$\240\350\321\312?\361\366$\240\350\321\312?\020R\020\347Vn\300?\361\366$\240\350\321\312?\361\366$\240\350\321\312?\361\366$\240\350\321\312?\020R\020\347Vn\300?\361\366$\240\350\321\312?\361\366$\240\350\321\312?\361\366$\240\350\321\312?\020R\020\347Vn\300?\361\366$\240\350\321\312?\361\366$\240\350\321\312?\361\366$\240\350\321\312?\020R\020\347Vn\300?\361\366$\240\350\321\312?\361\366$\240\350\321\312?\361\366$\240\350\321\312?\020R\020\347Vn\300?\361\366$\240\350\321\312?\361\366$\240\350\321\312?\361\366$\240\350\321\312?\020R\020\347Vn\300?\361\366$\240\350\321\312?\361\366$\240\350\321\312?\361\366$\240\350\321\312?\020R\020\347Vn\300?\361\366$\240\350\321\312?\361\366$\240\350\321\312?\361\366$\240\350\321\312?\020R\020\347Vn\300?\361\366$\240\350\321\312?\361\366$\240\350\321\312?\361\366$\240\350\321\312?\020R\020\347Vn\300?\361\366$\240\350\321\312?\361\366$\240\350\321\312?\361\366$\240\350\321\312?\020R\020\347Vn\300?\361\366$\240\350\321\312?\361\366$\240\350\321\312?\361\366$\240\350\321\312?\020R\020\347Vn\300?\3
61\366$\240\350\321\312?\361\366$\240\350\321\312?\361\366$\240\350\321\312?\020R\020\347Vn\300?\361\366$\240\350\321\312?\361\366$\240\350\321\312?\361\366$\240\350\321\312?\020R\020\347Vn\300?\361\366$\240\350\321\312?\361\366$\240\350\321\312?\361\366$\240\350\321\312?\020R\020\347Vn\300?\361\366$\240\350\321\312?\361\366$\240\350\321\312?\361\366$\240\350\321\312?y\346\232h\306\345\277?\\\250\007\271\3318\313?\\\250\007\271\3318\313?\\\250\007\271\3318\313?y\346\232h\306\345\277?\\\250\007\271\3318\313?\\\250\007\271\3318\313?\\\250\007\271\3318\313?y\346\232h\306\345\277?\\\250\007\271\3318\313?\\\250\007\271\3318\313?\\\250\007\271\3318\313?y\346\232h\306\345\277?\\\250\007\271\3318\313?\\\250\007\271\3318\313?\\\250\007\271\3318\313?y\346\232h\306\345\277?\\\250\007\271\3318\313?\\\250\007\271\3318\313?\\\250\007\271\3318\313?y\346\232h\306\345\277?\\\250\007\271\3318\313?\\\250\007\271\3318\313?\\\250\007\271\3318\313?y\346\232h\306\345\277?\\\250\007\271\3318\313?\\\250\007\271\3318\313?\\\250\007\271\3318\313?y\346\232h\306\345\277?\\\250\007\271\3318\313?\\\250\007\271\3318\313?\\\250\007\271\3318\313?y\346\232h\306\345\277?\\\250\007\271\3318\313?\\\250\007\271\3318\313?\\\250\007\271\3318\313?y\346\232h\306\345\277?\\\250\007\271\3318\313?\\\250\007\271\3318\313?\\\250\007\271\3318\313?y\346\232h\306\345\277?\\\250\007\271\3318\313?\\\250\007\271\3318\313?\\\250\007\271\3318\313?y\346\232h\306\345\277?\\\250\007\271\3318\313?\\\250\007\271\3318\313?\\\250\007\271\3318\313?y\346\232h\306\345\277?\\\250\007\271\3318\313?\\\250\007\271\3318\313?\\\250\007\271\3318\313?y\346\232h\306\345\277?\\\250\007\271\3318\313?\\\250\007\271\3318\313?\\\250\007\271\3318\313?y\346\232h\306\345\277?\\\250\007\271\3318\313?\\\250\007\271\3318\313?\\\250\007\271\3318\313?y\346\232h\306\345\277?\\\250\007\271\3318\313?\\\250\007\271\3318\313?\\\250\007\271\3318\313?y\346\232h\306\345\277?\\\250\007\271\3318\313?\\\250\007\271\3318\313?\\\250\007\271\3318\313?y\346\232h\306\
345\277?\\\250\007\271\3318\313?\\\250\007\271\3318\313?\\\250\007\271\3318\313?y\346\232h\306\345\277?\\\250\007\271\3318\313?\\\250\007\271\3318\313?\\\250\007\271\3318\313?y\346\232h\306\345\277?\\\250\007\271\3318\313?\\\250\007\271\3318\313?\\\250\007\271\3318\313?y\346\232h\306\345\277?\\\250\007\271\3318\313?\\\250\007\271\3318\313?\\\250\007\271\3318\313?y\346\232h\306\345\277?\\\250\007\271\3318\313?\\\250\007\271\3318\313?\\\250\007\271\3318\313?y\346\232h\306\345\277?\\\250\007\271\3318\313?\\\250\007\271\3318\313?\\\250\007\271\3318\313?y\346\232h\306\345\277?\\\250\007\271\3318\313?\\\250\007\271\3318\313?\\\250\007\271\3318\313?HX\315\326\374b\303?`\372\204\226\235\232\313?`\372\204\226\235\232\313?`\372\204\226\235\232\313?HX\315\326\374b\303?`\372\204\226\235\232\313?`\372\204\226\235\232\313?`\372\204\226\235\232\313?HX\315\326\374b\303?`\372\204\226\235\232\313?`\372\204\226\235\232\313?`\372\204\226\235\232\313?HX\315\326\374b\303?`\372\204\226\235\232\313?`\372\204\226\235\232\313?`\372\204\226\235\232\313?HX\315\326\374b\303?`\372\204\226\235\232\313?`\372\204\226\235\232\313?`\372\204\226\235\232\313?HX\315\326\374b\303?`\372\204\226\235\232\313?`\372\204\226\235\232\313?`\372\204\226\235\232\313?HX\315\326\374b\303?`\372\204\226\235\232\313?`\372\204\226\235\232\313?`\372\204\226\235\232\313?HX\315\326\374b\303?`\372\204\226\235\232\313?`\372\204\226\235\232\313?`\372\204\226\235\232\313?HX\315\326\374b\303?`\372\204\226\235\232\313?`\372\204\226\235\232\313?`\372\204\226\235\232\313?HX\315\326\374b\303?`\372\204\226\235\232\313?`\372\204\226\235\232\313?`\372\204\226\235\232\313?HX\315\326\374b\303?`\372\204\226\235\232\313?`\372\204\226\235\232\313?`\372\204\226\235\232\313?HX\315\326\374b\303?`\372\204\226\235\232\313?`\372\204\226\235\232\313?`\372\204\226\235\232\313?HX\315\326\374b\303?`\372\204\226\235\232\313?`\372\204\226\235\232\313?`\372\204\226\235\232\313?HX\315\326\374b\303?`\372\204\226\235\232\313?`\372\204\226\235\232\313?`\37
2\204\226\235\232\313?HX\315\326\374b\303?`\372\204\226\235\232\313?`\372\204\226\235\232\313?`\372\204\226\235\232\313?HX\315\326\374b\303?`\372\204\226\235\232\313?`\372\204\226\235\232\313?`\372\204\226\235\232\313?HX\315\326\374b\303?`\372\204\226\235\232\313?`\372\204\226\235\232\313?`\372\204\226\235\232\313?HX\315\326\374b\303?`\372\204\226\235\232\313?`\372\204\226\235\232\313?`\372\204\226\235\232\313?HX\315\326\374b\303?`\372\204\226\235\232\313?`\372\204\226\235\232\313?`\372\204\226\235\232\313?HX\315\326\374b\303?`\372\204\226\235\232\313?`\372\204\226\235\232\313?`\372\204\226\235\232\313?HX\315\326\374b\303?`\372\204\226\235\232\313?`\372\204\226\235\232\313?`\372\204\226\235\232\313?HX\315\326\374b\303?`\372\204\226\235\232\313?`\372\204\226\235\232\313?`\372\204\226\235\232\313?HX\315\326\374b\303?`\372\204\226\235\232\313?`\372\204\226\235\232\313?`\372\204\226\235\232\313?HX\315\326\374b\303?`\372\204\226\235\232\313?`\372\204\226\235\232\313?`\372\204\226\235\232\313?\327\213]P\320\014\316?xs\325\377\356\005\314?xs\325\377\356\005\314?xs\325\377\356\005\314?\327\213]P\320\014\316?xs\325\377\356\005\314?xs\325\377\356\005\314?xs\325\377\356\005\314?\327\213]P\320\014\316?xs\325\377\356\005\314?xs\325\377\356\005\314?xs\325\377\356\005\314?\327\213]P\320\014\316?xs\325\377\356\005\314?xs\325\377\356\005\314?xs\325\377\356\005\314?\327\213]P\320\014\316?xs\325\377\356\005\314?xs\325\377\356\005\314?xs\325\377\356\005\314?\327\213]P\320\014\316?xs\325\377\356\005\314?xs\325\377\356\005\314?xs\325\377\356\005\314?\327\213]P\320\014\316?xs\325\377\356\005\314?xs\325\377\356\005\314?xs\325\377\356\005\314?\327\213]P\320\014\316?xs\325\377\356\005\314?xs\325\377\356\005\314?xs\325\377\356\005\314?\327\213]P\320\014\316?xs\325\377\356\005\314?xs\325\377\356\005\314?xs\325\377\356\005\314?\327\213]P\320\014\316?xs\325\377\356\005\314?xs\325\377\356\005\314?xs\325\377\356\005\314?\327\213]P\320\014\316?xs\325\377\356\005\314?xs\325\377\356\005\314?xs\325\37
7\356\005\314?\327\213]P\320\014\316?xs\325\377\356\005\314?xs\325\377\356\005\314?xs\325\377\356\005\314?\327\213]P\320\014\316?xs\325\377\356\005\314?xs\325\377\356\005\314?xs\325\377\356\005\314?\327\213]P\320\014\316?xs\325\377\356\005\314?xs\325\377\356\005\314?xs\325\377\356\005\314?\327\213]P\320\014\316?xs\325\377\356\005\314?xs\325\377\356\005\314?xs\325\377\356\005\314?\327\213]P\320\014\316?xs\325\377\356\005\314?xs\325\377\356\005\314?xs\325\377\356\005\314?\327\213]P\320\014\316?xs\325\377\356\005\314?xs\325\377\356\005\314?xs\325\377\356\005\314?\327\213]P\320\014\316?xs\325\377\356\005\314?xs\325\377\356\005\314?xs\325\377\356\005\314?\327\213]P\320\014\316?xs\325\377\356\005\314?xs\325\377\356\005\314?xs\325\377\356\005\314?\327\213]P\320\014\316?xs\325\377\356\005\314?xs\325\377\356\005\314?xs\325\377\356\005\314?\327\213]P\320\014\316?xs\325\377\356\005\314?xs\325\377\356\005\314?xs\325\377\356\005\314?\327\213]P\320\014\316?xs\325\377\356\005\314?xs\325\377\356\005\314?xs\325\377\356\005\314?\327\213]P\320\014\316?xs\325\377\356\005\314?xs\325\377\356\005\314?xs\325\377\356\005\314?\327\213]P\320\014\316?xs\325\377\356\005\314?xs\325\377\356\005\314?xs\325\377\356\005\314?\252E\005Z\027\021\311?M\021hN\213\256\311?M\021hN\213\256\311?M\021hN\213\256\311?\252E\005Z\027\021\311?M\021hN\213\256\311?M\021hN\213\256\311?M\021hN\213\256\311?\252E\005Z\027\021\311?M\021hN\213\256\311?M\021hN\213\256\311?M\021hN\213\256\311?\252E\005Z\027\021\311?M\021hN\213\256\311?M\021hN\213\256\311?M\021hN\213\256\311?\252E\005Z\027\021\311?M\021hN\213\256\311?M\021hN\213\256\311?M\021hN\213\256\311?\252E\005Z\027\021\311?M\021hN\213\256\311?M\021hN\213\256\311?M\021hN\213\256\311?\252E\005Z\027\021\311?M\021hN\213\256\311?M\021hN\213\256\311?M\021hN\213\256\311?\252E\005Z\027\021\311?M\021hN\213\256\311?M\021hN\213\256\311?M\021hN\213\256\311?\252E\005Z\027\021\311?M\021hN\213\256\311?M\021hN\213\256\311?M\021hN\213\256\311?\252E\005Z\027\021\311?M\021hN\213\256\311?
M\021hN\213\256\311?M\021hN\213\256\311?\252E\005Z\027\021\311?M\021hN\213\256\311?M\021hN\213\256\311?M\021hN\213\256\311?\252E\005Z\027\021\311?M\021hN\213\256\311?M\021hN\213\256\311?M\021hN\213\256\311?\252E\005Z\027\021\311?M\021hN\213\256\311?M\021hN\213\256\311?M\021hN\213\256\311?\252E\005Z\027\021\311?M\021hN\213\256\311?M\021hN\213\256\311?M\021hN\213\256\311?\252E\005Z\027\021\311?M\021hN\213\256\311?M\021hN\213\256\311?M\021hN\213\256\311?\252E\005Z\027\021\311?M\021hN\213\256\311?M\021hN\213\256\311?M\021hN\213\256\311?\252E\005Z\027\021\311?M\021hN\213\256\311?M\021hN\213\256\311?M\021hN\213\256\311?\252E\005Z\027\021\311?M\021hN\213\256\311?M\021hN\213\256\311?M\021hN\213\256\311?\252E\005Z\027\021\311?M\021hN\213\256\311?M\021hN\213\256\311?M\021hN\213\256\311?\252E\005Z\027\021\311?M\021hN\213\256\311?M\021hN\213\256\311?M\021hN\213\256\311?\252E\005Z\027\021\311?M\021hN\213\256\311?M\021hN\213\256\311?M\021hN\213\256\311?\252E\005Z\027\021\311?M\021hN\213\256\311?M\021hN\213\256\311?M\021hN\213\256\311?\252E\005Z\027\021\311?M\021hN\213\256\311?M\021hN\213\256\311?M\021hN\213\256\311?\252E\005Z\027\021\311?M\021hN\213\256\311?M\021hN\213\256\311?M\021hN\213\256\311?"
+      }
+    }
+  }
+}
+node {
+  name: "descrpt_attr_qm/t_std/read"
+  op: "Identity"
+  input: "descrpt_attr_qm/t_std"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@descrpt_attr_qm/t_std"
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_16/stack"
+  op: "Const"
+  input: "^descrpt_attr_qm/original_sel"
+  input: "^descrpt_attr_qm/sel"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_16/stack_1"
+  op: "Const"
+  input: "^descrpt_attr_qm/original_sel"
+  input: "^descrpt_attr_qm/sel"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_16/stack_2"
+  op: "Const"
+  input: "^descrpt_attr_qm/original_sel"
+  input: "^descrpt_attr_qm/sel"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_16"
+  op: "StridedSlice"
+  input: "DprcPairwiseIdx:4"
+  input: "strided_slice_16/stack"
+  input: "strided_slice_16/stack_1"
+  input: "strided_slice_16/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "mul_1/y"
+  op: "Const"
+  input: "^descrpt_attr_qm/original_sel"
+  input: "^descrpt_attr_qm/sel"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 3
+      }
+    }
+  }
+}
+node {
+  name: "mul_1"
+  op: "Mul"
+  input: "strided_slice_16"
+  input: "mul_1/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "Reshape_8/shape/0"
+  op: "Const"
+  input: "^descrpt_attr_qm/original_sel"
+  input: "^descrpt_attr_qm/sel"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "Reshape_8/shape"
+  op: "Pack"
+  input: "Reshape_8/shape/0"
+  input: "mul_1"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "Reshape_8"
+  op: "Reshape"
+  input: "Reshape_6"
+  input: "Reshape_8/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "Reshape_9/shape"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\377\377\377\377\t\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "Reshape_9"
+  op: "Reshape"
+  input: "Reshape"
+  input: "Reshape_9/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "strided_slice_17/stack"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_17/stack_1"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_17/stack_2"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_17"
+  op: "StridedSlice"
+  input: "DprcPairwiseIdx:4"
+  input: "strided_slice_17/stack"
+  input: "strided_slice_17/stack_1"
+  input: "strided_slice_17/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "Reshape_10/shape/0"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "Reshape_10/shape"
+  op: "Pack"
+  input: "Reshape_10/shape/0"
+  input: "strided_slice_17"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "Reshape_10"
+  op: "Reshape"
+  input: "Reshape_7"
+  input: "Reshape_10/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "ProdEnvMatAMix/mesh"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "ProdEnvMatAMix"
+  op: "ProdEnvMatAMix"
+  input: "Reshape_8"
+  input: "Reshape_10"
+  input: "DprcPairwiseIdx:4"
+  input: "Reshape_9"
+  input: "ProdEnvMatAMix/mesh"
+  input: "descrpt_attr_qm/t_avg/read"
+  input: "descrpt_attr_qm/t_std/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "rcut_a"
+    value {
+      f: -1.0
+    }
+  }
+  attr {
+    key: "rcut_r"
+    value {
+      f: 9.0
+    }
+  }
+  attr {
+    key: "rcut_r_smth"
+    value {
+      f: 0.5
+    }
+  }
+  attr {
+    key: "sel_a"
+    value {
+      list {
+        i: 24
+      }
+    }
+  }
+  attr {
+    key: "sel_r"
+    value {
+      list {
+        i: 0
+      }
+    }
+  }
+}
+node {
+  name: "Reshape_11/shape"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "Reshape_11"
+  op: "Reshape"
+  input: "ProdEnvMatAMix:4"
+  input: "Reshape_11/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "Reshape_13/shape"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\377\377\377\377`\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "Reshape_13"
+  op: "Reshape"
+  input: "ProdEnvMatAMix"
+  input: "Reshape_13/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "clip_by_value/Minimum/y"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 5
+      }
+    }
+  }
+}
+node {
+  name: "clip_by_value/Minimum"
+  op: "Minimum"
+  input: "Reshape_10"
+  input: "clip_by_value/Minimum/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "clip_by_value/y"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "clip_by_value"
+  op: "Maximum"
+  input: "clip_by_value/Minimum"
+  input: "clip_by_value/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "strided_slice_18/stack"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_18/stack_1"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_18/stack_2"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_18"
+  op: "StridedSlice"
+  input: "DprcPairwiseIdx:4"
+  input: "strided_slice_18/stack"
+  input: "strided_slice_18/stack_1"
+  input: "strided_slice_18/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "Slice/begin"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\000\000\000\000\000\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "Slice/size/0"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "Slice/size"
+  op: "Pack"
+  input: "Slice/size/0"
+  input: "strided_slice_18"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "Slice"
+  op: "Slice"
+  input: "clip_by_value"
+  input: "Slice/begin"
+  input: "Slice/size"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "Reshape_14/shape"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "Reshape_14"
+  op: "Reshape"
+  input: "Slice"
+  input: "Reshape_14/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "o_rmat_qm"
+  op: "Identity"
+  input: "Reshape_13"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+}
+node {
+  name: "o_rmat_deriv_qm"
+  op: "Identity"
+  input: "ProdEnvMatAMix:1"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+}
+node {
+  name: "o_rij_qm"
+  op: "Identity"
+  input: "ProdEnvMatAMix:2"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+}
+node {
+  name: "o_nlist_qm"
+  op: "Identity"
+  input: "ProdEnvMatAMix:3"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "Reshape_15/shape"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 3
+          }
+        }
+        tensor_content: "\006\000\000\000\377\377\377\377\004\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "Reshape_15"
+  op: "Reshape"
+  input: "descrpt_attr_qm/t_avg/read"
+  input: "Reshape_15/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "Slice_1/begin"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 3
+          }
+        }
+        tensor_content: "\000\000\000\000\000\000\000\000\000\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "Slice_1/size"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 3
+          }
+        }
+        tensor_content: "\377\377\377\377\001\000\000\000\001\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "Slice_1"
+  op: "Slice"
+  input: "Reshape_15"
+  input: "Slice_1/begin"
+  input: "Slice_1/size"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+}
+node {
+  name: "Reshape_16/shape"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\006\000\000\000\001\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "Reshape_16"
+  op: "Reshape"
+  input: "Slice_1"
+  input: "Reshape_16/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "Reshape_17/shape"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 3
+          }
+        }
+        tensor_content: "\006\000\000\000\377\377\377\377\004\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "Reshape_17"
+  op: "Reshape"
+  input: "descrpt_attr_qm/t_std/read"
+  input: "Reshape_17/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "Slice_2/begin"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 3
+          }
+        }
+        tensor_content: "\000\000\000\000\000\000\000\000\000\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "Slice_2/size"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 3
+          }
+        }
+        tensor_content: "\377\377\377\377\001\000\000\000\001\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "Slice_2"
+  op: "Slice"
+  input: "Reshape_17"
+  input: "Slice_2/begin"
+  input: "Slice_2/size"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+}
+node {
+  name: "Reshape_18/shape"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\006\000\000\000\001\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "Reshape_18"
+  op: "Reshape"
+  input: "Slice_2"
+  input: "Reshape_18/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "embedding_lookup/axis"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@Reshape_16"
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "embedding_lookup"
+  op: "GatherV2"
+  input: "Reshape_16"
+  input: "Reshape_14"
+  input: "embedding_lookup/axis"
+  attr {
+    key: "Taxis"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tindices"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tparams"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@Reshape_16"
+      }
+    }
+  }
+  attr {
+    key: "batch_dims"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "embedding_lookup/Identity"
+  op: "Identity"
+  input: "embedding_lookup"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+}
+node {
+  name: "strided_slice_19/stack"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_19/stack_1"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_19/stack_2"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_19"
+  op: "StridedSlice"
+  input: "DprcPairwiseIdx:4"
+  input: "strided_slice_19/stack"
+  input: "strided_slice_19/stack_1"
+  input: "strided_slice_19/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "Reshape_19/shape/0"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "Reshape_19/shape/2"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "Reshape_19/shape"
+  op: "Pack"
+  input: "Reshape_19/shape/0"
+  input: "strided_slice_19"
+  input: "Reshape_19/shape/2"
+  attr {
+    key: "N"
+    value {
+      i: 3
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "Reshape_19"
+  op: "Reshape"
+  input: "embedding_lookup/Identity"
+  input: "Reshape_19/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "embedding_lookup_1/axis"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@Reshape_18"
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "embedding_lookup_1"
+  op: "GatherV2"
+  input: "Reshape_18"
+  input: "Reshape_14"
+  input: "embedding_lookup_1/axis"
+  attr {
+    key: "Taxis"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tindices"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tparams"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@Reshape_18"
+      }
+    }
+  }
+  attr {
+    key: "batch_dims"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "embedding_lookup_1/Identity"
+  op: "Identity"
+  input: "embedding_lookup_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+}
+node {
+  name: "strided_slice_20/stack"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_20/stack_1"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_20/stack_2"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_20"
+  op: "StridedSlice"
+  input: "DprcPairwiseIdx:4"
+  input: "strided_slice_20/stack"
+  input: "strided_slice_20/stack_1"
+  input: "strided_slice_20/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "Reshape_20/shape/0"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "Reshape_20/shape/2"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "Reshape_20/shape"
+  op: "Pack"
+  input: "Reshape_20/shape/0"
+  input: "strided_slice_20"
+  input: "Reshape_20/shape/2"
+  attr {
+    key: "N"
+    value {
+      i: 3
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "Reshape_20"
+  op: "Reshape"
+  input: "embedding_lookup_1/Identity"
+  input: "Reshape_20/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "Reshape_21/shape"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\377\377\377\377\004\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "Reshape_21"
+  op: "Reshape"
+  input: "ProdEnvMatAMix"
+  input: "Reshape_21/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "Slice_3/begin"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\000\000\000\000\000\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "Slice_3/size"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\377\377\377\377\001\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "Slice_3"
+  op: "Slice"
+  input: "Reshape_21"
+  input: "Slice_3/begin"
+  input: "Slice_3/size"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+}
+node {
+  name: "strided_slice_21/stack"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_21/stack_1"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_21/stack_2"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_21"
+  op: "StridedSlice"
+  input: "DprcPairwiseIdx:4"
+  input: "strided_slice_21/stack"
+  input: "strided_slice_21/stack_1"
+  input: "strided_slice_21/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "Reshape_22/shape/0"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "Reshape_22/shape/2"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 24
+      }
+    }
+  }
+}
+node {
+  name: "Reshape_22/shape"
+  op: "Pack"
+  input: "Reshape_22/shape/0"
+  input: "strided_slice_21"
+  input: "Reshape_22/shape/2"
+  attr {
+    key: "N"
+    value {
+      i: 3
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "Reshape_22"
+  op: "Reshape"
+  input: "Slice_3"
+  input: "Reshape_22/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "mul_3"
+  op: "Mul"
+  input: "Reshape_22"
+  input: "Reshape_20"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+}
+node {
+  name: "add_4"
+  op: "AddV2"
+  input: "mul_3"
+  input: "Reshape_19"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+}
+node {
+  name: "mul_4/x"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_DOUBLE
+        tensor_shape {
+        }
+        double_val: 0.5
+      }
+    }
+  }
+}
+node {
+  name: "mul_4"
+  op: "Mul"
+  input: "mul_4/x"
+  input: "add_4"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+}
+node {
+  name: "sub_1/x"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_DOUBLE
+        tensor_shape {
+        }
+        double_val: 1.0
+      }
+    }
+  }
+}
+node {
+  name: "sub_1"
+  op: "Sub"
+  input: "sub_1/x"
+  input: "mul_4"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+}
+node {
+  name: "Neg"
+  op: "Neg"
+  input: "sub_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+}
+node {
+  name: "mul_5"
+  op: "Mul"
+  input: "Neg"
+  input: "sub_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+}
+node {
+  name: "mul_6"
+  op: "Mul"
+  input: "mul_5"
+  input: "sub_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+}
+node {
+  name: "add_5/y"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_DOUBLE
+        tensor_shape {
+        }
+        double_val: 1.0
+      }
+    }
+  }
+}
+node {
+  name: "add_5"
+  op: "AddV2"
+  input: "mul_6"
+  input: "add_5/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+}
+node {
+  name: "clip_by_value_1/Minimum/y"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_DOUBLE
+        tensor_shape {
+        }
+        double_val: 1.0
+      }
+    }
+  }
+}
+node {
+  name: "clip_by_value_1/Minimum"
+  op: "Minimum"
+  input: "add_5"
+  input: "clip_by_value_1/Minimum/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+}
+node {
+  name: "clip_by_value_1/y"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_DOUBLE
+        tensor_shape {
+        }
+        double_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "clip_by_value_1"
+  op: "Maximum"
+  input: "clip_by_value_1/Minimum"
+  input: "clip_by_value_1/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+}
+node {
+  name: "Cast_4"
+  op: "Cast"
+  input: "clip_by_value_1"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "Truncate"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "strided_slice_22/stack"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_22/stack_1"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_22/stack_2"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_22"
+  op: "StridedSlice"
+  input: "DprcPairwiseIdx:4"
+  input: "strided_slice_22/stack"
+  input: "strided_slice_22/stack_1"
+  input: "strided_slice_22/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "Reshape_23/shape/0"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "Reshape_23/shape/2"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 96
+      }
+    }
+  }
+}
+node {
+  name: "Reshape_23/shape"
+  op: "Pack"
+  input: "Reshape_23/shape/0"
+  input: "strided_slice_22"
+  input: "Reshape_23/shape/2"
+  attr {
+    key: "N"
+    value {
+      i: 3
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "Reshape_23"
+  op: "Reshape"
+  input: "o_rmat_qm"
+  input: "Reshape_23/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "Reshape_24/shape"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\377\377\377\377`\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "Reshape_24"
+  op: "Reshape"
+  input: "Reshape_23"
+  input: "Reshape_24/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "Cast_5"
+  op: "Cast"
+  input: "Reshape_24"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "Truncate"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "Slice_4/begin"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\000\000\000\000\000\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "Slice_4/size"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\377\377\377\377`\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "Slice_4"
+  op: "Slice"
+  input: "Cast_5"
+  input: "Slice_4/begin"
+  input: "Slice_4/size"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+}
+node {
+  name: "Shape_12"
+  op: "Shape"
+  input: "Slice_4"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+  experimental_type {
+    type_id: TFT_PRODUCT
+    args {
+      type_id: TFT_SHAPE_TENSOR
+      args {
+        type_id: TFT_INT32
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_25/stack"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_25/stack_1"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_25/stack_2"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_25"
+  op: "StridedSlice"
+  input: "Shape_12"
+  input: "strided_slice_25/stack"
+  input: "strided_slice_25/stack_1"
+  input: "strided_slice_25/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "Reshape_26/shape"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\377\377\377\377\004\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "Reshape_26"
+  op: "Reshape"
+  input: "Slice_4"
+  input: "Reshape_26/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "Slice_5/begin"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\000\000\000\000\000\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "Slice_5/size"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\377\377\377\377\001\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "Slice_5"
+  op: "Slice"
+  input: "Reshape_26"
+  input: "Slice_5/begin"
+  input: "Slice_5/size"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+}
+node {
+  name: "Reshape_27/shape"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\377\377\377\377\001\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "Reshape_27"
+  op: "Reshape"
+  input: "Slice_5"
+  input: "Reshape_27/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "filter_type_all_qm/matrix_1"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\025,/>\337\222\224\274"
+      }
+    }
+  }
+}
+node {
+  name: "filter_type_all_qm/matrix_1/read"
+  op: "Identity"
+  input: "filter_type_all_qm/matrix_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@filter_type_all_qm/matrix_1"
+      }
+    }
+  }
+}
+node {
+  name: "filter_type_all_qm/bias_1"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "k\313\226\277\270\2765\277"
+      }
+    }
+  }
+}
+node {
+  name: "filter_type_all_qm/bias_1/read"
+  op: "Identity"
+  input: "filter_type_all_qm/bias_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@filter_type_all_qm/bias_1"
+      }
+    }
+  }
+}
+node {
+  name: "filter_type_all_qm/MatMul"
+  op: "MatMul"
+  input: "Reshape_27"
+  input: "filter_type_all_qm/matrix_1/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "filter_type_all_qm/BiasAdd"
+  op: "BiasAdd"
+  input: "filter_type_all_qm/MatMul"
+  input: "filter_type_all_qm/bias_1/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+}
+node {
+  name: "filter_type_all_qm/Tanh"
+  op: "Tanh"
+  input: "filter_type_all_qm/BiasAdd"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+}
+node {
+  name: "filter_type_all_qm/Reshape/shape"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\377\377\377\377\002\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "filter_type_all_qm/Reshape"
+  op: "Reshape"
+  input: "filter_type_all_qm/Tanh"
+  input: "filter_type_all_qm/Reshape/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "filter_type_all_qm/concat/axis"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "filter_type_all_qm/concat"
+  op: "ConcatV2"
+  input: "Reshape_27"
+  input: "Reshape_27"
+  input: "filter_type_all_qm/concat/axis"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "filter_type_all_qm/add"
+  op: "AddV2"
+  input: "filter_type_all_qm/concat"
+  input: "filter_type_all_qm/Reshape"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+}
+node {
+  name: "filter_type_all_qm/matrix_2"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 2
+          }
+          dim {
+            size: 4
+          }
+        }
+        tensor_content: "\364\000\235\274\271\244\373\276C\345\363>\317\257\364>\336\315\005\276\263&\237\275\305\332\024\277p7\310>"
+      }
+    }
+  }
+}
+node {
+  name: "filter_type_all_qm/matrix_2/read"
+  op: "Identity"
+  input: "filter_type_all_qm/matrix_2"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@filter_type_all_qm/matrix_2"
+      }
+    }
+  }
+}
+node {
+  name: "filter_type_all_qm/bias_2"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 4
+          }
+        }
+        tensor_content: "J\223\376\276n\333\375>\306A\010?\347\234\223\276"
+      }
+    }
+  }
+}
+node {
+  name: "filter_type_all_qm/bias_2/read"
+  op: "Identity"
+  input: "filter_type_all_qm/bias_2"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@filter_type_all_qm/bias_2"
+      }
+    }
+  }
+}
+node {
+  name: "filter_type_all_qm/MatMul_1"
+  op: "MatMul"
+  input: "filter_type_all_qm/add"
+  input: "filter_type_all_qm/matrix_2/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "filter_type_all_qm/BiasAdd_1"
+  op: "BiasAdd"
+  input: "filter_type_all_qm/MatMul_1"
+  input: "filter_type_all_qm/bias_2/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+}
+node {
+  name: "filter_type_all_qm/Tanh_1"
+  op: "Tanh"
+  input: "filter_type_all_qm/BiasAdd_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+}
+node {
+  name: "filter_type_all_qm/Reshape_1/shape"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\377\377\377\377\004\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "filter_type_all_qm/Reshape_1"
+  op: "Reshape"
+  input: "filter_type_all_qm/Tanh_1"
+  input: "filter_type_all_qm/Reshape_1/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "filter_type_all_qm/concat_1/axis"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "filter_type_all_qm/concat_1"
+  op: "ConcatV2"
+  input: "filter_type_all_qm/add"
+  input: "filter_type_all_qm/add"
+  input: "filter_type_all_qm/concat_1/axis"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "filter_type_all_qm/add_1"
+  op: "AddV2"
+  input: "filter_type_all_qm/concat_1"
+  input: "filter_type_all_qm/Reshape_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+}
+node {
+  name: "filter_type_all_qm/matrix_3"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 4
+          }
+          dim {
+            size: 8
+          }
+        }
+        tensor_content: "\037\356\361>k\024\224>\322\335\271>\220\016\005>\306\231\244>\014\352\257\276\340G\335\275\225\242\202\276rC\"\274\204a/\276\010P\260\275F<\023\277D\025\365=`\217\302>3\000+\276\251\307\005?C\235\030\276\376v\365=\330\226\331\276\261\003\230>\206\356H>\324\306\340\274\361L\224\276W2B?&\214K\276v\251A\276\350a\213>\006\202\177>o\222U\276$m\230\276\313\335\300>h\255\243="
+      }
+    }
+  }
+}
+node {
+  name: "filter_type_all_qm/matrix_3/read"
+  op: "Identity"
+  input: "filter_type_all_qm/matrix_3"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@filter_type_all_qm/matrix_3"
+      }
+    }
+  }
+}
+node {
+  name: "filter_type_all_qm/bias_3"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 8
+          }
+        }
+        tensor_content: "g\264\263?\272E\330\277\244iX\276`8\223?\302\227\301?Q]\333>\000\341,\300m\030\204="
+      }
+    }
+  }
+}
+node {
+  name: "filter_type_all_qm/bias_3/read"
+  op: "Identity"
+  input: "filter_type_all_qm/bias_3"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@filter_type_all_qm/bias_3"
+      }
+    }
+  }
+}
+node {
+  name: "filter_type_all_qm/MatMul_2"
+  op: "MatMul"
+  input: "filter_type_all_qm/add_1"
+  input: "filter_type_all_qm/matrix_3/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "filter_type_all_qm/BiasAdd_2"
+  op: "BiasAdd"
+  input: "filter_type_all_qm/MatMul_2"
+  input: "filter_type_all_qm/bias_3/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+}
+node {
+  name: "filter_type_all_qm/Tanh_2"
+  op: "Tanh"
+  input: "filter_type_all_qm/BiasAdd_2"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+}
+node {
+  name: "filter_type_all_qm/Reshape_2/shape"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\377\377\377\377\010\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "filter_type_all_qm/Reshape_2"
+  op: "Reshape"
+  input: "filter_type_all_qm/Tanh_2"
+  input: "filter_type_all_qm/Reshape_2/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "filter_type_all_qm/concat_2/axis"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "filter_type_all_qm/concat_2"
+  op: "ConcatV2"
+  input: "filter_type_all_qm/add_1"
+  input: "filter_type_all_qm/add_1"
+  input: "filter_type_all_qm/concat_2/axis"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "filter_type_all_qm/add_2"
+  op: "AddV2"
+  input: "filter_type_all_qm/concat_2"
+  input: "filter_type_all_qm/Reshape_2"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+}
+node {
+  name: "filter_type_all_qm/Reshape_3/shape"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\377\377\377\377\001\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "filter_type_all_qm/Reshape_3"
+  op: "Reshape"
+  input: "Reshape_14"
+  input: "filter_type_all_qm/Reshape_3/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "filter_type_all_qm/mul/y"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 7
+      }
+    }
+  }
+}
+node {
+  name: "filter_type_all_qm/mul"
+  op: "Mul"
+  input: "filter_type_all_qm/Reshape_3"
+  input: "filter_type_all_qm/mul/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "filter_type_all_qm/Tile/multiples"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\001\000\000\000\030\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "filter_type_all_qm/Tile"
+  op: "Tile"
+  input: "filter_type_all_qm/mul"
+  input: "filter_type_all_qm/Tile/multiples"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tmultiples"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "filter_type_all_qm/Reshape_4/shape"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\377\377\377\377\030\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "filter_type_all_qm/Reshape_4"
+  op: "Reshape"
+  input: "Reshape_11"
+  input: "filter_type_all_qm/Reshape_4/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "filter_type_all_qm/add_3"
+  op: "AddV2"
+  input: "filter_type_all_qm/Tile"
+  input: "filter_type_all_qm/Reshape_4"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "filter_type_all_qm/Reshape_5/shape"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "filter_type_all_qm/Reshape_5"
+  op: "Reshape"
+  input: "filter_type_all_qm/add_3"
+  input: "filter_type_all_qm/Reshape_5/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "filter_type_all_qm/Reshape_6/shape"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 3
+          }
+        }
+        tensor_content: "\001\000\000\000\007\000\000\000\377\377\377\377"
+      }
+    }
+  }
+}
+node {
+  name: "filter_type_all_qm/Reshape_6"
+  op: "Reshape"
+  input: "t_typeebd"
+  input: "filter_type_all_qm/Reshape_6/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "filter_type_all_qm/Tile_1/multiples"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 3
+          }
+        }
+        tensor_content: "\007\000\000\000\001\000\000\000\001\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "filter_type_all_qm/Tile_1"
+  op: "Tile"
+  input: "filter_type_all_qm/Reshape_6"
+  input: "filter_type_all_qm/Tile_1/multiples"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tmultiples"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "filter_type_all_qm/Reshape_7/shape"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 3
+          }
+        }
+        tensor_content: "\007\000\000\000\001\000\000\000\377\377\377\377"
+      }
+    }
+  }
+}
+node {
+  name: "filter_type_all_qm/Reshape_7"
+  op: "Reshape"
+  input: "t_typeebd"
+  input: "filter_type_all_qm/Reshape_7/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "filter_type_all_qm/Tile_2/multiples"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 3
+          }
+        }
+        tensor_content: "\001\000\000\000\007\000\000\000\001\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "filter_type_all_qm/Tile_2"
+  op: "Tile"
+  input: "filter_type_all_qm/Reshape_7"
+  input: "filter_type_all_qm/Tile_2/multiples"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tmultiples"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "filter_type_all_qm/concat_3/axis"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "filter_type_all_qm/concat_3"
+  op: "ConcatV2"
+  input: "filter_type_all_qm/Tile_1"
+  input: "filter_type_all_qm/Tile_2"
+  input: "filter_type_all_qm/concat_3/axis"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "filter_type_all_qm/Reshape_8/shape"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\377\377\377\377\020\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "filter_type_all_qm/Reshape_8"
+  op: "Reshape"
+  input: "filter_type_all_qm/concat_3"
+  input: "filter_type_all_qm/Reshape_8/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "filter_type_all_qm/matrix_1_two_side_ebd"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 16
+          }
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\n\007\217=\257\236\362\273IGS\274\rS\021<,\305\032?\260\300\034\276J7%\276\352\343\322=\303\311\263>\302\243g\275+a@<\000\207\311=\365\234\337<@\356S>C\221\206=\343D\277\276\217\003\212>1e\200>\251l\210>\210\337\237>\361\327\376>\230lP\276\023D\336\276\361@\345=U\003o\276\'\017\037\276\326\303\000\276\002\374F>\255\256\211>\364D\014?\346\232%<\375\251\204;"
+      }
+    }
+  }
+}
+node {
+  name: "filter_type_all_qm/matrix_1_two_side_ebd/read"
+  op: "Identity"
+  input: "filter_type_all_qm/matrix_1_two_side_ebd"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@filter_type_all_qm/matrix_1_two_side_ebd"
+      }
+    }
+  }
+}
+node {
+  name: "filter_type_all_qm/bias_1_two_side_ebd"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "k\313\226\277\270\2765\277"
+      }
+    }
+  }
+}
+node {
+  name: "filter_type_all_qm/bias_1_two_side_ebd/read"
+  op: "Identity"
+  input: "filter_type_all_qm/bias_1_two_side_ebd"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@filter_type_all_qm/bias_1_two_side_ebd"
+      }
+    }
+  }
+}
+node {
+  name: "filter_type_all_qm/MatMul_3"
+  op: "MatMul"
+  input: "filter_type_all_qm/Reshape_8"
+  input: "filter_type_all_qm/matrix_1_two_side_ebd/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "filter_type_all_qm/BiasAdd_3"
+  op: "BiasAdd"
+  input: "filter_type_all_qm/MatMul_3"
+  input: "filter_type_all_qm/bias_1_two_side_ebd/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+}
+node {
+  name: "filter_type_all_qm/Tanh_3"
+  op: "Tanh"
+  input: "filter_type_all_qm/BiasAdd_3"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+}
+node {
+  name: "filter_type_all_qm/Reshape_9/shape"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\377\377\377\377\002\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "filter_type_all_qm/Reshape_9"
+  op: "Reshape"
+  input: "filter_type_all_qm/Tanh_3"
+  input: "filter_type_all_qm/Reshape_9/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "filter_type_all_qm/matrix_2_two_side_ebd"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 2
+          }
+          dim {
+            size: 4
+          }
+        }
+        tensor_content: "\364\000\235\274\271\244\373\276C\345\363>\317\257\364>\336\315\005\276\263&\237\275\305\332\024\277p7\310>"
+      }
+    }
+  }
+}
+node {
+  name: "filter_type_all_qm/matrix_2_two_side_ebd/read"
+  op: "Identity"
+  input: "filter_type_all_qm/matrix_2_two_side_ebd"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@filter_type_all_qm/matrix_2_two_side_ebd"
+      }
+    }
+  }
+}
+node {
+  name: "filter_type_all_qm/bias_2_two_side_ebd"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 4
+          }
+        }
+        tensor_content: "J\223\376\276n\333\375>\306A\010?\347\234\223\276"
+      }
+    }
+  }
+}
+node {
+  name: "filter_type_all_qm/bias_2_two_side_ebd/read"
+  op: "Identity"
+  input: "filter_type_all_qm/bias_2_two_side_ebd"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@filter_type_all_qm/bias_2_two_side_ebd"
+      }
+    }
+  }
+}
+node {
+  name: "filter_type_all_qm/MatMul_4"
+  op: "MatMul"
+  input: "filter_type_all_qm/Reshape_9"
+  input: "filter_type_all_qm/matrix_2_two_side_ebd/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "filter_type_all_qm/BiasAdd_4"
+  op: "BiasAdd"
+  input: "filter_type_all_qm/MatMul_4"
+  input: "filter_type_all_qm/bias_2_two_side_ebd/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+}
+node {
+  name: "filter_type_all_qm/Tanh_4"
+  op: "Tanh"
+  input: "filter_type_all_qm/BiasAdd_4"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+}
+node {
+  name: "filter_type_all_qm/Reshape_10/shape"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\377\377\377\377\004\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "filter_type_all_qm/Reshape_10"
+  op: "Reshape"
+  input: "filter_type_all_qm/Tanh_4"
+  input: "filter_type_all_qm/Reshape_10/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "filter_type_all_qm/concat_4/axis"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "filter_type_all_qm/concat_4"
+  op: "ConcatV2"
+  input: "filter_type_all_qm/Reshape_9"
+  input: "filter_type_all_qm/Reshape_9"
+  input: "filter_type_all_qm/concat_4/axis"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "filter_type_all_qm/add_4"
+  op: "AddV2"
+  input: "filter_type_all_qm/concat_4"
+  input: "filter_type_all_qm/Reshape_10"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+}
+node {
+  name: "filter_type_all_qm/matrix_3_two_side_ebd"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 4
+          }
+          dim {
+            size: 8
+          }
+        }
+        tensor_content: "\037\356\361>k\024\224>\322\335\271>\220\016\005>\306\231\244>\014\352\257\276\340G\335\275\225\242\202\276rC\"\274\204a/\276\010P\260\275F<\023\277D\025\365=`\217\302>3\000+\276\251\307\005?C\235\030\276\376v\365=\330\226\331\276\261\003\230>\206\356H>\324\306\340\274\361L\224\276W2B?&\214K\276v\251A\276\350a\213>\006\202\177>o\222U\276$m\230\276\313\335\300>h\255\243="
+      }
+    }
+  }
+}
+node {
+  name: "filter_type_all_qm/matrix_3_two_side_ebd/read"
+  op: "Identity"
+  input: "filter_type_all_qm/matrix_3_two_side_ebd"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@filter_type_all_qm/matrix_3_two_side_ebd"
+      }
+    }
+  }
+}
+node {
+  name: "filter_type_all_qm/bias_3_two_side_ebd"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 8
+          }
+        }
+        tensor_content: "g\264\263?\272E\330\277\244iX\276`8\223?\302\227\301?Q]\333>\000\341,\300m\030\204="
+      }
+    }
+  }
+}
+node {
+  name: "filter_type_all_qm/bias_3_two_side_ebd/read"
+  op: "Identity"
+  input: "filter_type_all_qm/bias_3_two_side_ebd"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@filter_type_all_qm/bias_3_two_side_ebd"
+      }
+    }
+  }
+}
+node {
+  name: "filter_type_all_qm/MatMul_5"
+  op: "MatMul"
+  input: "filter_type_all_qm/add_4"
+  input: "filter_type_all_qm/matrix_3_two_side_ebd/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "filter_type_all_qm/BiasAdd_5"
+  op: "BiasAdd"
+  input: "filter_type_all_qm/MatMul_5"
+  input: "filter_type_all_qm/bias_3_two_side_ebd/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+}
+node {
+  name: "filter_type_all_qm/Tanh_5"
+  op: "Tanh"
+  input: "filter_type_all_qm/BiasAdd_5"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+}
+node {
+  name: "filter_type_all_qm/Reshape_11/shape"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\377\377\377\377\010\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "filter_type_all_qm/Reshape_11"
+  op: "Reshape"
+  input: "filter_type_all_qm/Tanh_5"
+  input: "filter_type_all_qm/Reshape_11/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "filter_type_all_qm/concat_5/axis"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "filter_type_all_qm/concat_5"
+  op: "ConcatV2"
+  input: "filter_type_all_qm/add_4"
+  input: "filter_type_all_qm/add_4"
+  input: "filter_type_all_qm/concat_5/axis"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "filter_type_all_qm/add_5"
+  op: "AddV2"
+  input: "filter_type_all_qm/concat_5"
+  input: "filter_type_all_qm/Reshape_11"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+}
+node {
+  name: "filter_type_all_qm/embedding_lookup/axis"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@filter_type_all_qm/add_5"
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "filter_type_all_qm/embedding_lookup"
+  op: "GatherV2"
+  input: "filter_type_all_qm/add_5"
+  input: "filter_type_all_qm/Reshape_5"
+  input: "filter_type_all_qm/embedding_lookup/axis"
+  attr {
+    key: "Taxis"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tindices"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tparams"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@filter_type_all_qm/add_5"
+      }
+    }
+  }
+  attr {
+    key: "batch_dims"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "filter_type_all_qm/embedding_lookup/Identity"
+  op: "Identity"
+  input: "filter_type_all_qm/embedding_lookup"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+}
+node {
+  name: "filter_type_all_qm/Reshape_12/shape"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\377\377\377\377\001\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "filter_type_all_qm/Reshape_12"
+  op: "Reshape"
+  input: "Cast_4"
+  input: "filter_type_all_qm/Reshape_12/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "filter_type_all_qm/mul_1"
+  op: "Mul"
+  input: "filter_type_all_qm/embedding_lookup/Identity"
+  input: "filter_type_all_qm/Reshape_12"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+}
+node {
+  name: "filter_type_all_qm/mul_2"
+  op: "Mul"
+  input: "filter_type_all_qm/add_2"
+  input: "filter_type_all_qm/mul_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+}
+node {
+  name: "filter_type_all_qm/add_6"
+  op: "AddV2"
+  input: "filter_type_all_qm/mul_2"
+  input: "filter_type_all_qm/add_2"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+}
+node {
+  name: "Reshape_29/shape"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 3
+          }
+        }
+        tensor_content: "\377\377\377\377\030\000\000\000\010\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "Reshape_29"
+  op: "Reshape"
+  input: "filter_type_all_qm/add_6"
+  input: "Reshape_29/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "Reshape_30/shape/1"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 24
+      }
+    }
+  }
+}
+node {
+  name: "Reshape_30/shape/2"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 4
+      }
+    }
+  }
+}
+node {
+  name: "Reshape_30/shape"
+  op: "Pack"
+  input: "strided_slice_25"
+  input: "Reshape_30/shape/1"
+  input: "Reshape_30/shape/2"
+  attr {
+    key: "N"
+    value {
+      i: 3
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "Reshape_30"
+  op: "Reshape"
+  input: "Slice_4"
+  input: "Reshape_30/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "MatMul"
+  op: "BatchMatMulV2"
+  input: "Reshape_30"
+  input: "Reshape_29"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "adj_x"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "adj_y"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "truediv/y"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 24.0
+      }
+    }
+  }
+}
+node {
+  name: "truediv"
+  op: "RealDiv"
+  input: "MatMul"
+  input: "truediv/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+}
+node {
+  name: "Slice_7/begin"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 3
+          }
+        }
+        tensor_content: "\000\000\000\000\000\000\000\000\000\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "Slice_7/size"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 3
+          }
+        }
+        tensor_content: "\377\377\377\377\377\377\377\377\004\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "Slice_7"
+  op: "Slice"
+  input: "truediv"
+  input: "Slice_7/begin"
+  input: "Slice_7/size"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+}
+node {
+  name: "MatMul_1"
+  op: "BatchMatMulV2"
+  input: "truediv"
+  input: "Slice_7"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "adj_x"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "adj_y"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "Reshape_31/shape"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\377\377\377\377 \000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "Reshape_31"
+  op: "Reshape"
+  input: "MatMul_1"
+  input: "Reshape_31/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "Cast_7"
+  op: "Cast"
+  input: "Reshape_31"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Truncate"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "Shape_13"
+  op: "Shape"
+  input: "Reshape_23"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+  experimental_type {
+    type_id: TFT_PRODUCT
+    args {
+      type_id: TFT_SHAPE_TENSOR
+      args {
+        type_id: TFT_INT32
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_26/stack"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_26/stack_1"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_26/stack_2"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_26"
+  op: "StridedSlice"
+  input: "Shape_13"
+  input: "strided_slice_26/stack"
+  input: "strided_slice_26/stack_1"
+  input: "strided_slice_26/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "strided_slice_27/stack"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_27/stack_1"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_27/stack_2"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_27"
+  op: "StridedSlice"
+  input: "DprcPairwiseIdx:4"
+  input: "strided_slice_27/stack"
+  input: "strided_slice_27/stack_1"
+  input: "strided_slice_27/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "Reshape_32/shape/2"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 32
+      }
+    }
+  }
+}
+node {
+  name: "Reshape_32/shape"
+  op: "Pack"
+  input: "strided_slice_26"
+  input: "strided_slice_27"
+  input: "Reshape_32/shape/2"
+  attr {
+    key: "N"
+    value {
+      i: 3
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "Reshape_32"
+  op: "Reshape"
+  input: "Cast_7"
+  input: "Reshape_32/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "concat_9/concat"
+  op: "Identity"
+  input: "Reshape_32"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+}
+node {
+  name: "o_descriptor_qm"
+  op: "Identity"
+  input: "concat_9/concat"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+}
+node {
+  name: "fitting_attr_qm/t_bias_atom_e"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_DOUBLE
+        tensor_shape {
+          dim {
+            size: 6
+          }
+        }
+        tensor_content: "\024w(\231N\030\n@bOp\273\211e\341?\024w(\231N\030\n@(m\272a\235\353\027@\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "fitting_attr_qm/t_bias_atom_e/read"
+  op: "Identity"
+  input: "fitting_attr_qm/t_bias_atom_e"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@fitting_attr_qm/t_bias_atom_e"
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_30/stack"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_30/stack_1"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_30/stack_2"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_30"
+  op: "StridedSlice"
+  input: "DprcPairwiseIdx:4"
+  input: "strided_slice_30/stack"
+  input: "strided_slice_30/stack_1"
+  input: "strided_slice_30/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "Reshape_34/shape/0"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "Reshape_34/shape/2"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 32
+      }
+    }
+  }
+}
+node {
+  name: "Reshape_34/shape"
+  op: "Pack"
+  input: "Reshape_34/shape/0"
+  input: "strided_slice_30"
+  input: "Reshape_34/shape/2"
+  attr {
+    key: "N"
+    value {
+      i: 3
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "Reshape_34"
+  op: "Reshape"
+  input: "o_descriptor_qm"
+  input: "Reshape_34/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "strided_slice_31/stack"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_31/stack_1"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_31/stack_2"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_31"
+  op: "StridedSlice"
+  input: "DprcPairwiseIdx:4"
+  input: "strided_slice_31/stack"
+  input: "strided_slice_31/stack_1"
+  input: "strided_slice_31/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "zeros_1/packed/2"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 32
+      }
+    }
+  }
+}
+node {
+  name: "zeros_1/packed"
+  op: "Pack"
+  input: "strided_slice_15"
+  input: "strided_slice_31"
+  input: "zeros_1/packed/2"
+  attr {
+    key: "N"
+    value {
+      i: 3
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "zeros_1/Const"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_DOUBLE
+        tensor_shape {
+        }
+        double_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "zeros_1"
+  op: "Fill"
+  input: "zeros_1/packed"
+  input: "zeros_1/Const"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "index_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "strided_slice_32/stack"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_32/stack_1"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_32/stack_2"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_32"
+  op: "StridedSlice"
+  input: "DprcPairwiseIdx:4"
+  input: "strided_slice_32/stack"
+  input: "strided_slice_32/stack_1"
+  input: "strided_slice_32/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "Reshape_35/shape/0"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "Reshape_35/shape"
+  op: "Pack"
+  input: "Reshape_35/shape/0"
+  input: "strided_slice_32"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "Reshape_35"
+  op: "Reshape"
+  input: "GatherV2_1"
+  input: "Reshape_35/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "strided_slice_33/stack"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_33/stack_1"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_33/stack_2"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_33"
+  op: "StridedSlice"
+  input: "DprcPairwiseIdx:4"
+  input: "strided_slice_33/stack"
+  input: "strided_slice_33/stack_1"
+  input: "strided_slice_33/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "Slice_9/begin"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\000\000\000\000\000\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "Slice_9/size/0"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "Slice_9/size"
+  op: "Pack"
+  input: "Slice_9/size/0"
+  input: "strided_slice_33"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "Slice_9"
+  op: "Slice"
+  input: "Reshape_35"
+  input: "Slice_9/begin"
+  input: "Slice_9/size"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "GreaterEqual/y"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "GreaterEqual"
+  op: "GreaterEqual"
+  input: "Slice_9"
+  input: "GreaterEqual/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "Cast_9"
+  op: "Cast"
+  input: "GreaterEqual"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_BOOL
+    }
+  }
+  attr {
+    key: "Truncate"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "Reshape_36/shape"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "Reshape_36"
+  op: "Reshape"
+  input: "Slice_9"
+  input: "Reshape_36/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "clip_by_value_2/Minimum/y"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 5
+      }
+    }
+  }
+}
+node {
+  name: "clip_by_value_2/Minimum"
+  op: "Minimum"
+  input: "Reshape_36"
+  input: "clip_by_value_2/Minimum/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "clip_by_value_2/y"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "clip_by_value_2"
+  op: "Maximum"
+  input: "clip_by_value_2/Minimum"
+  input: "clip_by_value_2/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "embedding_lookup_2/axis"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@t_typeebd"
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "embedding_lookup_2"
+  op: "GatherV2"
+  input: "t_typeebd"
+  input: "clip_by_value_2"
+  input: "embedding_lookup_2/axis"
+  attr {
+    key: "Taxis"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tindices"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tparams"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@t_typeebd"
+      }
+    }
+  }
+  attr {
+    key: "batch_dims"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "embedding_lookup_2/Identity"
+  op: "Identity"
+  input: "embedding_lookup_2"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+}
+node {
+  name: "Cast_10"
+  op: "Cast"
+  input: "embedding_lookup_2/Identity"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Truncate"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "Reshape_37/shape"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\377\377\377\377 \000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "Reshape_37"
+  op: "Reshape"
+  input: "Reshape_34"
+  input: "Reshape_37/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "concat_11/axis"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "concat_11"
+  op: "ConcatV2"
+  input: "Reshape_37"
+  input: "Cast_10"
+  input: "concat_11/axis"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "strided_slice_34/stack"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_34/stack_1"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_34/stack_2"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_34"
+  op: "StridedSlice"
+  input: "DprcPairwiseIdx:4"
+  input: "strided_slice_34/stack"
+  input: "strided_slice_34/stack_1"
+  input: "strided_slice_34/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "Reshape_38/shape/0"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "Reshape_38/shape/2"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 40
+      }
+    }
+  }
+}
+node {
+  name: "Reshape_38/shape"
+  op: "Pack"
+  input: "Reshape_38/shape/0"
+  input: "strided_slice_34"
+  input: "Reshape_38/shape/2"
+  attr {
+    key: "N"
+    value {
+      i: 3
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "Reshape_38"
+  op: "Reshape"
+  input: "concat_11"
+  input: "Reshape_38/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "strided_slice_35/stack"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_35/stack_1"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_35/stack_2"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_35"
+  op: "StridedSlice"
+  input: "DprcPairwiseIdx:4"
+  input: "strided_slice_35/stack"
+  input: "strided_slice_35/stack_1"
+  input: "strided_slice_35/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "Cast_11"
+  op: "Cast"
+  input: "Reshape_38"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "Truncate"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "Slice_10/begin"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 3
+          }
+        }
+        tensor_content: "\000\000\000\000\000\000\000\000\000\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "Slice_10/size/0"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "Slice_10/size/2"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "Slice_10/size"
+  op: "Pack"
+  input: "Slice_10/size/0"
+  input: "strided_slice_35"
+  input: "Slice_10/size/2"
+  attr {
+    key: "N"
+    value {
+      i: 3
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "Slice_10"
+  op: "Slice"
+  input: "Cast_11"
+  input: "Slice_10/begin"
+  input: "Slice_10/size"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+}
+node {
+  name: "Reshape_39/shape"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\377\377\377\377(\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "Reshape_39"
+  op: "Reshape"
+  input: "Slice_10"
+  input: "Reshape_39/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "layer_0_qm/matrix"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 40
+          }
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\2241\000\276\224\223j>\333(%<S\373\300\276\027\351z<\261\325\272=\006W\273=\356\267\247\276\363q\344\275\352A\016\274c^\313=Q\017\'\275!\256\222>\rr\263=c\366q\275\261\261j\276C^G>\343\347\177\273\304@\'\275\3331r>s\263\211\276Fa\212\275\336\335\262\275\347\253J=\254\3762>h\375\263\2764\223\230=\254jO\276\364\211a=\271\262\210\276\272\030\313\273\016\'\016>j$Z\276\265\024j>\303(\033\275\267\206\347\275\236\237\367\274\023I5<\306\007\310=1Bx\276\022\rR>Xf9\276\005\177\237;B\301\226=X\024\212=M;H\275nq\216\275h\257\216\2769\341\235<v\211\363\275~\322\353\275\26232\275\277\300\232\276\001aU\2756\307\">\245 o>NN\016=\310\2134\275\242\347)=\333\364\211>N\372\021>\373\254\242\276\315\262F\275\321=?>&\221\333=-](\276?\357\t;\247\264(\276>\256\350\274g@\256\276s\200\220>\253nD>7\266T\275\034K\312\273\370DT\275\340PB\275&\\\362\275\310u5\275\221B:\275\357\360\377\275"
+      }
+    }
+  }
+}
+node {
+  name: "layer_0_qm/matrix/read"
+  op: "Identity"
+  input: "layer_0_qm/matrix"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@layer_0_qm/matrix"
+      }
+    }
+  }
+}
+node {
+  name: "layer_0_qm/bias"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\'\255[\277\211WI\276"
+      }
+    }
+  }
+}
+node {
+  name: "layer_0_qm/bias/read"
+  op: "Identity"
+  input: "layer_0_qm/bias"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@layer_0_qm/bias"
+      }
+    }
+  }
+}
+node {
+  name: "layer_0_qm/MatMul"
+  op: "MatMul"
+  input: "Reshape_39"
+  input: "layer_0_qm/matrix/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "layer_0_qm/BiasAdd"
+  op: "BiasAdd"
+  input: "layer_0_qm/MatMul"
+  input: "layer_0_qm/bias/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+}
+node {
+  name: "layer_0_qm/Tanh"
+  op: "Tanh"
+  input: "layer_0_qm/BiasAdd"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+}
+node {
+  name: "layer_0_qm/Reshape/shape"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\377\377\377\377\002\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "layer_0_qm/Reshape"
+  op: "Reshape"
+  input: "layer_0_qm/Tanh"
+  input: "layer_0_qm/Reshape/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "layer_1_qm/matrix"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 2
+          }
+          dim {
+            size: 4
+          }
+        }
+        tensor_content: "\034\273\367=n\035R\274\367\370\266\274\223\265{<\361\010\206?v\300\207\276\314\024\217\276\351\2426>"
+      }
+    }
+  }
+}
+node {
+  name: "layer_1_qm/matrix/read"
+  op: "Identity"
+  input: "layer_1_qm/matrix"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@layer_1_qm/matrix"
+      }
+    }
+  }
+}
+node {
+  name: "layer_1_qm/bias"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 4
+          }
+        }
+        tensor_content: "k\313\226\277\270\2765\277\244\303\337\277o\241\375\276"
+      }
+    }
+  }
+}
+node {
+  name: "layer_1_qm/bias/read"
+  op: "Identity"
+  input: "layer_1_qm/bias"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@layer_1_qm/bias"
+      }
+    }
+  }
+}
+node {
+  name: "layer_1_qm/MatMul"
+  op: "MatMul"
+  input: "layer_0_qm/Reshape"
+  input: "layer_1_qm/matrix/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "layer_1_qm/BiasAdd"
+  op: "BiasAdd"
+  input: "layer_1_qm/MatMul"
+  input: "layer_1_qm/bias/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+}
+node {
+  name: "layer_1_qm/Tanh"
+  op: "Tanh"
+  input: "layer_1_qm/BiasAdd"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+}
+node {
+  name: "layer_1_qm/Reshape/shape"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\377\377\377\377\004\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "layer_1_qm/Reshape"
+  op: "Reshape"
+  input: "layer_1_qm/Tanh"
+  input: "layer_1_qm/Reshape/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "layer_2_qm/matrix"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 4
+          }
+          dim {
+            size: 8
+          }
+        }
+        tensor_content: "k\t^\274T\360\261\276\317u\254>\010\005\255>H:\275\275\340\022a\275\034\203\322\276\021\223\215>\256@\207\275\315=K\276\006\300\207=,\271\245=@E\035>\020\016\267=\331w\007?\274_\034\276;!\245>\026\0003\275\255UR=\377\340\n>3nI\276\3762\234=\2629\235\276\230s\361\276\235\ni>p\373\251>d\2230\276$\271X>y\2133=}\027&\276\036\020\210>H\215n>"
+      }
+    }
+  }
+}
+node {
+  name: "layer_2_qm/matrix/read"
+  op: "Identity"
+  input: "layer_2_qm/matrix"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@layer_2_qm/matrix"
+      }
+    }
+  }
+}
+node {
+  name: "layer_2_qm/bias"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 8
+          }
+        }
+        tensor_content: "J\223\376\276n\333\375>\306A\010?\347\234\223\2767\000\265?%\034\345<{\002\023?c\350o?"
+      }
+    }
+  }
+}
+node {
+  name: "layer_2_qm/bias/read"
+  op: "Identity"
+  input: "layer_2_qm/bias"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@layer_2_qm/bias"
+      }
+    }
+  }
+}
+node {
+  name: "layer_2_qm/MatMul"
+  op: "MatMul"
+  input: "layer_1_qm/Reshape"
+  input: "layer_2_qm/matrix/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "layer_2_qm/BiasAdd"
+  op: "BiasAdd"
+  input: "layer_2_qm/MatMul"
+  input: "layer_2_qm/bias/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+}
+node {
+  name: "layer_2_qm/Tanh"
+  op: "Tanh"
+  input: "layer_2_qm/BiasAdd"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+}
+node {
+  name: "layer_2_qm/Reshape/shape"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\377\377\377\377\010\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "layer_2_qm/Reshape"
+  op: "Reshape"
+  input: "layer_2_qm/Tanh"
+  input: "layer_2_qm/Reshape/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "final_layer_qm/matrix"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 8
+          }
+          dim {
+            size: 1
+          }
+        }
+        tensor_content: "\257\255\013?\337\374\252>\302\236\326>\020\244\031>\203\020\276>\327 \313\276V\203\377\275+\330\226\276"
+      }
+    }
+  }
+}
+node {
+  name: "final_layer_qm/matrix/read"
+  op: "Identity"
+  input: "final_layer_qm/matrix"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@final_layer_qm/matrix"
+      }
+    }
+  }
+}
+node {
+  name: "final_layer_qm/bias"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        float_val: 1.4039429426193237
+      }
+    }
+  }
+}
+node {
+  name: "final_layer_qm/bias/read"
+  op: "Identity"
+  input: "final_layer_qm/bias"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@final_layer_qm/bias"
+      }
+    }
+  }
+}
+node {
+  name: "final_layer_qm/MatMul"
+  op: "MatMul"
+  input: "layer_2_qm/Reshape"
+  input: "final_layer_qm/matrix/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "final_layer_qm/BiasAdd"
+  op: "BiasAdd"
+  input: "final_layer_qm/MatMul"
+  input: "final_layer_qm/bias/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+}
+node {
+  name: "Cast_12"
+  op: "Cast"
+  input: "final_layer_qm/BiasAdd"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Truncate"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "Reshape_40/shape"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\377\377\377\377 \000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "Reshape_40"
+  op: "Reshape"
+  input: "zeros_1"
+  input: "Reshape_40/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "concat_12/axis"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "concat_12"
+  op: "ConcatV2"
+  input: "Reshape_40"
+  input: "Cast_10"
+  input: "concat_12/axis"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "strided_slice_36/stack"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_36/stack_1"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_36/stack_2"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_36"
+  op: "StridedSlice"
+  input: "DprcPairwiseIdx:4"
+  input: "strided_slice_36/stack"
+  input: "strided_slice_36/stack_1"
+  input: "strided_slice_36/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "Reshape_41/shape/0"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "Reshape_41/shape/2"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 40
+      }
+    }
+  }
+}
+node {
+  name: "Reshape_41/shape"
+  op: "Pack"
+  input: "Reshape_41/shape/0"
+  input: "strided_slice_36"
+  input: "Reshape_41/shape/2"
+  attr {
+    key: "N"
+    value {
+      i: 3
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "Reshape_41"
+  op: "Reshape"
+  input: "concat_12"
+  input: "Reshape_41/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "strided_slice_37/stack"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_37/stack_1"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_37/stack_2"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_37"
+  op: "StridedSlice"
+  input: "DprcPairwiseIdx:4"
+  input: "strided_slice_37/stack"
+  input: "strided_slice_37/stack_1"
+  input: "strided_slice_37/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "Cast_13"
+  op: "Cast"
+  input: "Reshape_41"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "Truncate"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "Slice_11/begin"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 3
+          }
+        }
+        tensor_content: "\000\000\000\000\000\000\000\000\000\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "Slice_11/size/0"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "Slice_11/size/2"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "Slice_11/size"
+  op: "Pack"
+  input: "Slice_11/size/0"
+  input: "strided_slice_37"
+  input: "Slice_11/size/2"
+  attr {
+    key: "N"
+    value {
+      i: 3
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "Slice_11"
+  op: "Slice"
+  input: "Cast_13"
+  input: "Slice_11/begin"
+  input: "Slice_11/size"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+}
+node {
+  name: "Reshape_42/shape"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\377\377\377\377(\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "Reshape_42"
+  op: "Reshape"
+  input: "Slice_11"
+  input: "Reshape_42/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "layer_0_qm_1/MatMul"
+  op: "MatMul"
+  input: "Reshape_42"
+  input: "layer_0_qm/matrix/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "layer_0_qm_1/BiasAdd"
+  op: "BiasAdd"
+  input: "layer_0_qm_1/MatMul"
+  input: "layer_0_qm/bias/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+}
+node {
+  name: "layer_0_qm_1/Tanh"
+  op: "Tanh"
+  input: "layer_0_qm_1/BiasAdd"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+}
+node {
+  name: "layer_0_qm_1/Reshape/shape"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\377\377\377\377\002\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "layer_0_qm_1/Reshape"
+  op: "Reshape"
+  input: "layer_0_qm_1/Tanh"
+  input: "layer_0_qm_1/Reshape/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "layer_1_qm_1/MatMul"
+  op: "MatMul"
+  input: "layer_0_qm_1/Reshape"
+  input: "layer_1_qm/matrix/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "layer_1_qm_1/BiasAdd"
+  op: "BiasAdd"
+  input: "layer_1_qm_1/MatMul"
+  input: "layer_1_qm/bias/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+}
+node {
+  name: "layer_1_qm_1/Tanh"
+  op: "Tanh"
+  input: "layer_1_qm_1/BiasAdd"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+}
+node {
+  name: "layer_1_qm_1/Reshape/shape"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\377\377\377\377\004\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "layer_1_qm_1/Reshape"
+  op: "Reshape"
+  input: "layer_1_qm_1/Tanh"
+  input: "layer_1_qm_1/Reshape/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "layer_2_qm_1/MatMul"
+  op: "MatMul"
+  input: "layer_1_qm_1/Reshape"
+  input: "layer_2_qm/matrix/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "layer_2_qm_1/BiasAdd"
+  op: "BiasAdd"
+  input: "layer_2_qm_1/MatMul"
+  input: "layer_2_qm/bias/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+}
+node {
+  name: "layer_2_qm_1/Tanh"
+  op: "Tanh"
+  input: "layer_2_qm_1/BiasAdd"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+}
+node {
+  name: "layer_2_qm_1/Reshape/shape"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\377\377\377\377\010\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "layer_2_qm_1/Reshape"
+  op: "Reshape"
+  input: "layer_2_qm_1/Tanh"
+  input: "layer_2_qm_1/Reshape/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "final_layer_qm_1/MatMul"
+  op: "MatMul"
+  input: "layer_2_qm_1/Reshape"
+  input: "final_layer_qm/matrix/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "final_layer_qm_1/BiasAdd"
+  op: "BiasAdd"
+  input: "final_layer_qm_1/MatMul"
+  input: "final_layer_qm/bias/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+}
+node {
+  name: "Cast_14"
+  op: "Cast"
+  input: "final_layer_qm_1/BiasAdd"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Truncate"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "sub_2"
+  op: "Sub"
+  input: "Cast_12"
+  input: "Cast_14"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+}
+node {
+  name: "Shape_15"
+  op: "Shape"
+  input: "Reshape_38"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+  experimental_type {
+    type_id: TFT_PRODUCT
+    args {
+      type_id: TFT_SHAPE_TENSOR
+      args {
+        type_id: TFT_INT32
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_38/stack"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_38/stack_1"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_38/stack_2"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_38"
+  op: "StridedSlice"
+  input: "Shape_15"
+  input: "strided_slice_38/stack"
+  input: "strided_slice_38/stack_1"
+  input: "strided_slice_38/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "strided_slice_39/stack"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_39/stack_1"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_39/stack_2"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_39"
+  op: "StridedSlice"
+  input: "DprcPairwiseIdx:4"
+  input: "strided_slice_39/stack"
+  input: "strided_slice_39/stack_1"
+  input: "strided_slice_39/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "Reshape_43/shape"
+  op: "Pack"
+  input: "strided_slice_38"
+  input: "strided_slice_39"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "Reshape_43"
+  op: "Reshape"
+  input: "sub_2"
+  input: "Reshape_43/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "embedding_lookup_3/axis"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@fitting_attr_qm/t_bias_atom_e"
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "embedding_lookup_3"
+  op: "GatherV2"
+  input: "fitting_attr_qm/t_bias_atom_e/read"
+  input: "clip_by_value_2"
+  input: "embedding_lookup_3/axis"
+  attr {
+    key: "Taxis"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tindices"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tparams"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@fitting_attr_qm/t_bias_atom_e"
+      }
+    }
+  }
+  attr {
+    key: "batch_dims"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "embedding_lookup_3/Identity"
+  op: "Identity"
+  input: "embedding_lookup_3"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+}
+node {
+  name: "Shape_16"
+  op: "Shape"
+  input: "Reshape_38"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+  experimental_type {
+    type_id: TFT_PRODUCT
+    args {
+      type_id: TFT_SHAPE_TENSOR
+      args {
+        type_id: TFT_INT32
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_40/stack"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_40/stack_1"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_40/stack_2"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_40"
+  op: "StridedSlice"
+  input: "Shape_16"
+  input: "strided_slice_40/stack"
+  input: "strided_slice_40/stack_1"
+  input: "strided_slice_40/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "strided_slice_41/stack"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_41/stack_1"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 8
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_41/stack_2"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_41"
+  op: "StridedSlice"
+  input: "DprcPairwiseIdx:4"
+  input: "strided_slice_41/stack"
+  input: "strided_slice_41/stack_1"
+  input: "strided_slice_41/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "Rank"
+  op: "Rank"
+  input: "strided_slice_41"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "range/start"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "range/delta"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "range"
+  op: "Range"
+  input: "range/start"
+  input: "Rank"
+  input: "range/delta"
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "Sum"
+  op: "Sum"
+  input: "strided_slice_41"
+  input: "range"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "Reshape_44/shape"
+  op: "Pack"
+  input: "strided_slice_40"
+  input: "Sum"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "Reshape_44"
+  op: "Reshape"
+  input: "embedding_lookup_3/Identity"
+  input: "Reshape_44/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "add_6"
+  op: "AddV2"
+  input: "Reshape_43"
+  input: "Reshape_44"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+}
+node {
+  name: "mul_8"
+  op: "Mul"
+  input: "add_6"
+  input: "Cast_9"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+}
+node {
+  name: "Reshape_45/shape"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "Reshape_45"
+  op: "Reshape"
+  input: "mul_8"
+  input: "Reshape_45/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "strided_slice_42/stack"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_42/stack_1"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_42/stack_2"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_42"
+  op: "StridedSlice"
+  input: "DprcPairwiseIdx:4"
+  input: "strided_slice_42/stack"
+  input: "strided_slice_42/stack_1"
+  input: "strided_slice_42/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "o_atom_energy_qm/shape/0"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "o_atom_energy_qm/shape"
+  op: "Pack"
+  input: "o_atom_energy_qm/shape/0"
+  input: "strided_slice_42"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "o_atom_energy_qm"
+  op: "Reshape"
+  input: "Reshape_45"
+  input: "o_atom_energy_qm/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "o_energy_qm/reduction_indices"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "o_energy_qm"
+  op: "Sum"
+  input: "o_atom_energy_qm"
+  input: "o_energy_qm/reduction_indices"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/Shape"
+  op: "Shape"
+  input: "Reshape_45"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+  experimental_type {
+    type_id: TFT_PRODUCT
+    args {
+      type_id: TFT_SHAPE_TENSOR
+      args {
+        type_id: TFT_INT32
+      }
+    }
+  }
+}
+node {
+  name: "gradients/grad_ys_0/Const"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_DOUBLE
+        tensor_shape {
+        }
+        double_val: 1.0
+      }
+    }
+  }
+}
+node {
+  name: "gradients/grad_ys_0"
+  op: "Fill"
+  input: "gradients/Shape"
+  input: "gradients/grad_ys_0/Const"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "index_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "gradients/Reshape_45_grad/Shape"
+  op: "Shape"
+  input: "mul_8"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+  experimental_type {
+    type_id: TFT_PRODUCT
+    args {
+      type_id: TFT_SHAPE_TENSOR
+      args {
+        type_id: TFT_INT32
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Reshape_45_grad/Reshape"
+  op: "Reshape"
+  input: "gradients/grad_ys_0"
+  input: "gradients/Reshape_45_grad/Shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "gradients/mul_8_grad/Shape"
+  op: "Shape"
+  input: "add_6"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+  experimental_type {
+    type_id: TFT_PRODUCT
+    args {
+      type_id: TFT_SHAPE_TENSOR
+      args {
+        type_id: TFT_INT32
+      }
+    }
+  }
+}
+node {
+  name: "gradients/mul_8_grad/Shape_1"
+  op: "Shape"
+  input: "Cast_9"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+  experimental_type {
+    type_id: TFT_PRODUCT
+    args {
+      type_id: TFT_SHAPE_TENSOR
+      args {
+        type_id: TFT_INT32
+      }
+    }
+  }
+}
+node {
+  name: "gradients/mul_8_grad/BroadcastGradientArgs"
+  op: "BroadcastGradientArgs"
+  input: "gradients/mul_8_grad/Shape"
+  input: "gradients/mul_8_grad/Shape_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "gradients/mul_8_grad/Mul"
+  op: "Mul"
+  input: "gradients/Reshape_45_grad/Reshape"
+  input: "Cast_9"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+}
+node {
+  name: "gradients/mul_8_grad/Sum"
+  op: "Sum"
+  input: "gradients/mul_8_grad/Mul"
+  input: "gradients/mul_8_grad/BroadcastGradientArgs"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/mul_8_grad/Reshape"
+  op: "Reshape"
+  input: "gradients/mul_8_grad/Sum"
+  input: "gradients/mul_8_grad/Shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "gradients/add_6_grad/Shape"
+  op: "Shape"
+  input: "Reshape_43"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+  experimental_type {
+    type_id: TFT_PRODUCT
+    args {
+      type_id: TFT_SHAPE_TENSOR
+      args {
+        type_id: TFT_INT32
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_6_grad/Shape_1"
+  op: "Shape"
+  input: "Reshape_44"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+  experimental_type {
+    type_id: TFT_PRODUCT
+    args {
+      type_id: TFT_SHAPE_TENSOR
+      args {
+        type_id: TFT_INT32
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_6_grad/BroadcastGradientArgs"
+  op: "BroadcastGradientArgs"
+  input: "gradients/add_6_grad/Shape"
+  input: "gradients/add_6_grad/Shape_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "gradients/add_6_grad/Sum"
+  op: "Sum"
+  input: "gradients/mul_8_grad/Reshape"
+  input: "gradients/add_6_grad/BroadcastGradientArgs"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/add_6_grad/Reshape"
+  op: "Reshape"
+  input: "gradients/add_6_grad/Sum"
+  input: "gradients/add_6_grad/Shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "gradients/Reshape_43_grad/Shape"
+  op: "Shape"
+  input: "sub_2"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+  experimental_type {
+    type_id: TFT_PRODUCT
+    args {
+      type_id: TFT_SHAPE_TENSOR
+      args {
+        type_id: TFT_INT32
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Reshape_43_grad/Reshape"
+  op: "Reshape"
+  input: "gradients/add_6_grad/Reshape"
+  input: "gradients/Reshape_43_grad/Shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "gradients/sub_2_grad/Shape"
+  op: "Shape"
+  input: "Cast_12"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+  experimental_type {
+    type_id: TFT_PRODUCT
+    args {
+      type_id: TFT_SHAPE_TENSOR
+      args {
+        type_id: TFT_INT32
+      }
+    }
+  }
+}
+node {
+  name: "gradients/sub_2_grad/Shape_1"
+  op: "Shape"
+  input: "Cast_14"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+  experimental_type {
+    type_id: TFT_PRODUCT
+    args {
+      type_id: TFT_SHAPE_TENSOR
+      args {
+        type_id: TFT_INT32
+      }
+    }
+  }
+}
+node {
+  name: "gradients/sub_2_grad/BroadcastGradientArgs"
+  op: "BroadcastGradientArgs"
+  input: "gradients/sub_2_grad/Shape"
+  input: "gradients/sub_2_grad/Shape_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "gradients/sub_2_grad/Sum"
+  op: "Sum"
+  input: "gradients/Reshape_43_grad/Reshape"
+  input: "gradients/sub_2_grad/BroadcastGradientArgs"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/sub_2_grad/Reshape"
+  op: "Reshape"
+  input: "gradients/sub_2_grad/Sum"
+  input: "gradients/sub_2_grad/Shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "gradients/Cast_12_grad/Cast"
+  op: "Cast"
+  input: "gradients/sub_2_grad/Reshape"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "Truncate"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/final_layer_qm/MatMul_grad/MatMul"
+  op: "MatMul"
+  input: "gradients/Cast_12_grad/Cast"
+  input: "final_layer_qm/matrix/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "gradients/layer_2_qm/Reshape_grad/Shape"
+  op: "Shape"
+  input: "layer_2_qm/Tanh"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+  experimental_type {
+    type_id: TFT_PRODUCT
+    args {
+      type_id: TFT_SHAPE_TENSOR
+      args {
+        type_id: TFT_INT32
+      }
+    }
+  }
+}
+node {
+  name: "gradients/layer_2_qm/Reshape_grad/Reshape"
+  op: "Reshape"
+  input: "gradients/final_layer_qm/MatMul_grad/MatMul"
+  input: "gradients/layer_2_qm/Reshape_grad/Shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_has_manual_control_dependencies"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "gradients/layer_2_qm/Tanh_grad/TanhGrad"
+  op: "TanhGrad"
+  input: "layer_2_qm/Tanh"
+  input: "gradients/layer_2_qm/Reshape_grad/Reshape"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+}
+node {
+  name: "gradients/layer_2_qm/MatMul_grad/MatMul"
+  op: "MatMul"
+  input: "gradients/layer_2_qm/Tanh_grad/TanhGrad"
+  input: "layer_2_qm/matrix/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "gradients/layer_1_qm/Reshape_grad/Shape"
+  op: "Shape"
+  input: "layer_1_qm/Tanh"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+  experimental_type {
+    type_id: TFT_PRODUCT
+    args {
+      type_id: TFT_SHAPE_TENSOR
+      args {
+        type_id: TFT_INT32
+      }
+    }
+  }
+}
+node {
+  name: "gradients/layer_1_qm/Reshape_grad/Reshape"
+  op: "Reshape"
+  input: "gradients/layer_2_qm/MatMul_grad/MatMul"
+  input: "gradients/layer_1_qm/Reshape_grad/Shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_has_manual_control_dependencies"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "gradients/layer_1_qm/Tanh_grad/TanhGrad"
+  op: "TanhGrad"
+  input: "layer_1_qm/Tanh"
+  input: "gradients/layer_1_qm/Reshape_grad/Reshape"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+}
+node {
+  name: "gradients/layer_1_qm/MatMul_grad/MatMul"
+  op: "MatMul"
+  input: "gradients/layer_1_qm/Tanh_grad/TanhGrad"
+  input: "layer_1_qm/matrix/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "gradients/layer_0_qm/Reshape_grad/Shape"
+  op: "Shape"
+  input: "layer_0_qm/Tanh"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+  experimental_type {
+    type_id: TFT_PRODUCT
+    args {
+      type_id: TFT_SHAPE_TENSOR
+      args {
+        type_id: TFT_INT32
+      }
+    }
+  }
+}
+node {
+  name: "gradients/layer_0_qm/Reshape_grad/Reshape"
+  op: "Reshape"
+  input: "gradients/layer_1_qm/MatMul_grad/MatMul"
+  input: "gradients/layer_0_qm/Reshape_grad/Shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_has_manual_control_dependencies"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "gradients/layer_0_qm/Tanh_grad/TanhGrad"
+  op: "TanhGrad"
+  input: "layer_0_qm/Tanh"
+  input: "gradients/layer_0_qm/Reshape_grad/Reshape"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+}
+node {
+  name: "gradients/layer_0_qm/MatMul_grad/MatMul"
+  op: "MatMul"
+  input: "gradients/layer_0_qm/Tanh_grad/TanhGrad"
+  input: "layer_0_qm/matrix/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "gradients/Reshape_39_grad/Shape"
+  op: "Shape"
+  input: "Slice_10"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+  experimental_type {
+    type_id: TFT_PRODUCT
+    args {
+      type_id: TFT_SHAPE_TENSOR
+      args {
+        type_id: TFT_INT32
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Reshape_39_grad/Reshape"
+  op: "Reshape"
+  input: "gradients/layer_0_qm/MatMul_grad/MatMul"
+  input: "gradients/Reshape_39_grad/Shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "gradients/Slice_10_grad/Rank"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 3
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Slice_10_grad/Shape"
+  op: "Shape"
+  input: "Slice_10"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+  experimental_type {
+    type_id: TFT_PRODUCT
+    args {
+      type_id: TFT_SHAPE_TENSOR
+      args {
+        type_id: TFT_INT32
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Slice_10_grad/stack/1"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Slice_10_grad/stack"
+  op: "Pack"
+  input: "gradients/Slice_10_grad/Rank"
+  input: "gradients/Slice_10_grad/stack/1"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "gradients/Slice_10_grad/Reshape"
+  op: "Reshape"
+  input: "Slice_10/begin"
+  input: "gradients/Slice_10_grad/stack"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "gradients/Slice_10_grad/Shape_1"
+  op: "Shape"
+  input: "Cast_11"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+  experimental_type {
+    type_id: TFT_PRODUCT
+    args {
+      type_id: TFT_SHAPE_TENSOR
+      args {
+        type_id: TFT_INT32
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Slice_10_grad/sub"
+  op: "Sub"
+  input: "gradients/Slice_10_grad/Shape_1"
+  input: "gradients/Slice_10_grad/Shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "gradients/Slice_10_grad/sub_1"
+  op: "Sub"
+  input: "gradients/Slice_10_grad/sub"
+  input: "Slice_10/begin"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "gradients/Slice_10_grad/Reshape_1"
+  op: "Reshape"
+  input: "gradients/Slice_10_grad/sub_1"
+  input: "gradients/Slice_10_grad/stack"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "gradients/Slice_10_grad/concat/axis"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Slice_10_grad/concat"
+  op: "ConcatV2"
+  input: "gradients/Slice_10_grad/Reshape"
+  input: "gradients/Slice_10_grad/Reshape_1"
+  input: "gradients/Slice_10_grad/concat/axis"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "gradients/Slice_10_grad/Pad"
+  op: "Pad"
+  input: "gradients/Reshape_39_grad/Reshape"
+  input: "gradients/Slice_10_grad/concat"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tpaddings"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "gradients/Cast_11_grad/Cast"
+  op: "Cast"
+  input: "gradients/Slice_10_grad/Pad"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Truncate"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/Reshape_38_grad/Shape"
+  op: "Shape"
+  input: "concat_11"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+  experimental_type {
+    type_id: TFT_PRODUCT
+    args {
+      type_id: TFT_SHAPE_TENSOR
+      args {
+        type_id: TFT_INT32
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Reshape_38_grad/Reshape"
+  op: "Reshape"
+  input: "gradients/Cast_11_grad/Cast"
+  input: "gradients/Reshape_38_grad/Shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "gradients/concat_11_grad/Rank"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "gradients/concat_11_grad/mod"
+  op: "FloorMod"
+  input: "concat_11/axis"
+  input: "gradients/concat_11_grad/Rank"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "gradients/concat_11_grad/ShapeN"
+  op: "ShapeN"
+  input: "Reshape_37"
+  input: "Cast_10"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "gradients/concat_11_grad/ConcatOffset"
+  op: "ConcatOffset"
+  input: "gradients/concat_11_grad/mod"
+  input: "gradients/concat_11_grad/ShapeN"
+  input: "gradients/concat_11_grad/ShapeN:1"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+}
+node {
+  name: "gradients/concat_11_grad/Slice"
+  op: "Slice"
+  input: "gradients/Reshape_38_grad/Reshape"
+  input: "gradients/concat_11_grad/ConcatOffset"
+  input: "gradients/concat_11_grad/ShapeN"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+}
+node {
+  name: "gradients/Reshape_37_grad/Shape"
+  op: "Shape"
+  input: "Reshape_34"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+  experimental_type {
+    type_id: TFT_PRODUCT
+    args {
+      type_id: TFT_SHAPE_TENSOR
+      args {
+        type_id: TFT_INT32
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Reshape_37_grad/Reshape"
+  op: "Reshape"
+  input: "gradients/concat_11_grad/Slice"
+  input: "gradients/Reshape_37_grad/Shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "gradients/Reshape_34_grad/Shape"
+  op: "Shape"
+  input: "o_descriptor_qm"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+  experimental_type {
+    type_id: TFT_PRODUCT
+    args {
+      type_id: TFT_SHAPE_TENSOR
+      args {
+        type_id: TFT_INT32
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Reshape_34_grad/Reshape"
+  op: "Reshape"
+  input: "gradients/Reshape_37_grad/Reshape"
+  input: "gradients/Reshape_34_grad/Shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "gradients/Reshape_32_grad/Shape"
+  op: "Shape"
+  input: "Cast_7"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+  experimental_type {
+    type_id: TFT_PRODUCT
+    args {
+      type_id: TFT_SHAPE_TENSOR
+      args {
+        type_id: TFT_INT32
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Reshape_32_grad/Reshape"
+  op: "Reshape"
+  input: "gradients/Reshape_34_grad/Reshape"
+  input: "gradients/Reshape_32_grad/Shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "gradients/Cast_7_grad/Cast"
+  op: "Cast"
+  input: "gradients/Reshape_32_grad/Reshape"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "Truncate"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/Reshape_31_grad/Shape"
+  op: "Shape"
+  input: "MatMul_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+  experimental_type {
+    type_id: TFT_PRODUCT
+    args {
+      type_id: TFT_SHAPE_TENSOR
+      args {
+        type_id: TFT_INT32
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Reshape_31_grad/Reshape"
+  op: "Reshape"
+  input: "gradients/Cast_7_grad/Cast"
+  input: "gradients/Reshape_31_grad/Shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_1_grad/MatMul"
+  op: "BatchMatMulV2"
+  input: "Slice_7"
+  input: "gradients/Reshape_31_grad/Reshape"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "adj_x"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "adj_y"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_1_grad/MatMul_1"
+  op: "BatchMatMulV2"
+  input: "truediv"
+  input: "gradients/Reshape_31_grad/Reshape"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "adj_x"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "adj_y"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_1_grad/Shape"
+  op: "Shape"
+  input: "truediv"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+  experimental_type {
+    type_id: TFT_PRODUCT
+    args {
+      type_id: TFT_SHAPE_TENSOR
+      args {
+        type_id: TFT_INT32
+      }
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_1_grad/Shape_1"
+  op: "Shape"
+  input: "Slice_7"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+  experimental_type {
+    type_id: TFT_PRODUCT
+    args {
+      type_id: TFT_SHAPE_TENSOR
+      args {
+        type_id: TFT_INT32
+      }
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_1_grad/strided_slice/stack"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_1_grad/strided_slice/stack_1"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: -2
+      }
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_1_grad/strided_slice/stack_2"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_1_grad/strided_slice"
+  op: "StridedSlice"
+  input: "gradients/MatMul_1_grad/Shape"
+  input: "gradients/MatMul_1_grad/strided_slice/stack"
+  input: "gradients/MatMul_1_grad/strided_slice/stack_1"
+  input: "gradients/MatMul_1_grad/strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "begin_mask"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_1_grad/strided_slice_1/stack"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_1_grad/strided_slice_1/stack_1"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: -2
+      }
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_1_grad/strided_slice_1/stack_2"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_1_grad/strided_slice_1"
+  op: "StridedSlice"
+  input: "gradients/MatMul_1_grad/Shape_1"
+  input: "gradients/MatMul_1_grad/strided_slice_1/stack"
+  input: "gradients/MatMul_1_grad/strided_slice_1/stack_1"
+  input: "gradients/MatMul_1_grad/strided_slice_1/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "begin_mask"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_1_grad/BroadcastGradientArgs"
+  op: "BroadcastGradientArgs"
+  input: "gradients/MatMul_1_grad/strided_slice"
+  input: "gradients/MatMul_1_grad/strided_slice_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_1_grad/Sum"
+  op: "Sum"
+  input: "gradients/MatMul_1_grad/MatMul"
+  input: "gradients/MatMul_1_grad/BroadcastGradientArgs"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_1_grad/Reshape"
+  op: "Reshape"
+  input: "gradients/MatMul_1_grad/Sum"
+  input: "gradients/MatMul_1_grad/Shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_1_grad/Sum_1"
+  op: "Sum"
+  input: "gradients/MatMul_1_grad/MatMul_1"
+  input: "gradients/MatMul_1_grad/BroadcastGradientArgs:1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_1_grad/Reshape_1"
+  op: "Reshape"
+  input: "gradients/MatMul_1_grad/Sum_1"
+  input: "gradients/MatMul_1_grad/Shape_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "gradients/Slice_7_grad/Rank"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 3
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Slice_7_grad/Shape"
+  op: "Shape"
+  input: "Slice_7"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+  experimental_type {
+    type_id: TFT_PRODUCT
+    args {
+      type_id: TFT_SHAPE_TENSOR
+      args {
+        type_id: TFT_INT32
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Slice_7_grad/stack/1"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Slice_7_grad/stack"
+  op: "Pack"
+  input: "gradients/Slice_7_grad/Rank"
+  input: "gradients/Slice_7_grad/stack/1"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "gradients/Slice_7_grad/Reshape"
+  op: "Reshape"
+  input: "Slice_7/begin"
+  input: "gradients/Slice_7_grad/stack"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "gradients/Slice_7_grad/Shape_1"
+  op: "Shape"
+  input: "truediv"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+  experimental_type {
+    type_id: TFT_PRODUCT
+    args {
+      type_id: TFT_SHAPE_TENSOR
+      args {
+        type_id: TFT_INT32
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Slice_7_grad/sub"
+  op: "Sub"
+  input: "gradients/Slice_7_grad/Shape_1"
+  input: "gradients/Slice_7_grad/Shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "gradients/Slice_7_grad/sub_1"
+  op: "Sub"
+  input: "gradients/Slice_7_grad/sub"
+  input: "Slice_7/begin"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "gradients/Slice_7_grad/Reshape_1"
+  op: "Reshape"
+  input: "gradients/Slice_7_grad/sub_1"
+  input: "gradients/Slice_7_grad/stack"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "gradients/Slice_7_grad/concat/axis"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Slice_7_grad/concat"
+  op: "ConcatV2"
+  input: "gradients/Slice_7_grad/Reshape"
+  input: "gradients/Slice_7_grad/Reshape_1"
+  input: "gradients/Slice_7_grad/concat/axis"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "gradients/Slice_7_grad/Pad"
+  op: "Pad"
+  input: "gradients/MatMul_1_grad/Reshape_1"
+  input: "gradients/Slice_7_grad/concat"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tpaddings"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "gradients/AddN"
+  op: "AddN"
+  input: "gradients/MatMul_1_grad/Reshape"
+  input: "gradients/Slice_7_grad/Pad"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@gradients/MatMul_1_grad/Reshape"
+      }
+    }
+  }
+}
+node {
+  name: "gradients/truediv_grad/Shape"
+  op: "Shape"
+  input: "MatMul"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+  experimental_type {
+    type_id: TFT_PRODUCT
+    args {
+      type_id: TFT_SHAPE_TENSOR
+      args {
+        type_id: TFT_INT32
+      }
+    }
+  }
+}
+node {
+  name: "gradients/truediv_grad/Shape_1"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/truediv_grad/BroadcastGradientArgs"
+  op: "BroadcastGradientArgs"
+  input: "gradients/truediv_grad/Shape"
+  input: "gradients/truediv_grad/Shape_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "gradients/truediv_grad/RealDiv"
+  op: "RealDiv"
+  input: "gradients/AddN"
+  input: "truediv/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+}
+node {
+  name: "gradients/truediv_grad/Sum"
+  op: "Sum"
+  input: "gradients/truediv_grad/RealDiv"
+  input: "gradients/truediv_grad/BroadcastGradientArgs"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/truediv_grad/Reshape"
+  op: "Reshape"
+  input: "gradients/truediv_grad/Sum"
+  input: "gradients/truediv_grad/Shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_grad/MatMul"
+  op: "BatchMatMulV2"
+  input: "Reshape_29"
+  input: "gradients/truediv_grad/Reshape"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "adj_x"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "adj_y"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_grad/MatMul_1"
+  op: "BatchMatMulV2"
+  input: "Reshape_30"
+  input: "gradients/truediv_grad/Reshape"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "adj_x"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "adj_y"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_grad/Shape"
+  op: "Shape"
+  input: "Reshape_30"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+  experimental_type {
+    type_id: TFT_PRODUCT
+    args {
+      type_id: TFT_SHAPE_TENSOR
+      args {
+        type_id: TFT_INT32
+      }
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_grad/Shape_1"
+  op: "Shape"
+  input: "Reshape_29"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+  experimental_type {
+    type_id: TFT_PRODUCT
+    args {
+      type_id: TFT_SHAPE_TENSOR
+      args {
+        type_id: TFT_INT32
+      }
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_grad/strided_slice/stack"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_grad/strided_slice/stack_1"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: -2
+      }
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_grad/strided_slice/stack_2"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_grad/strided_slice"
+  op: "StridedSlice"
+  input: "gradients/MatMul_grad/Shape"
+  input: "gradients/MatMul_grad/strided_slice/stack"
+  input: "gradients/MatMul_grad/strided_slice/stack_1"
+  input: "gradients/MatMul_grad/strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "begin_mask"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_grad/strided_slice_1/stack"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_grad/strided_slice_1/stack_1"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: -2
+      }
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_grad/strided_slice_1/stack_2"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_grad/strided_slice_1"
+  op: "StridedSlice"
+  input: "gradients/MatMul_grad/Shape_1"
+  input: "gradients/MatMul_grad/strided_slice_1/stack"
+  input: "gradients/MatMul_grad/strided_slice_1/stack_1"
+  input: "gradients/MatMul_grad/strided_slice_1/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "begin_mask"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_grad/BroadcastGradientArgs"
+  op: "BroadcastGradientArgs"
+  input: "gradients/MatMul_grad/strided_slice"
+  input: "gradients/MatMul_grad/strided_slice_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_grad/Sum"
+  op: "Sum"
+  input: "gradients/MatMul_grad/MatMul"
+  input: "gradients/MatMul_grad/BroadcastGradientArgs"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_grad/Reshape"
+  op: "Reshape"
+  input: "gradients/MatMul_grad/Sum"
+  input: "gradients/MatMul_grad/Shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_grad/Sum_1"
+  op: "Sum"
+  input: "gradients/MatMul_grad/MatMul_1"
+  input: "gradients/MatMul_grad/BroadcastGradientArgs:1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_grad/Reshape_1"
+  op: "Reshape"
+  input: "gradients/MatMul_grad/Sum_1"
+  input: "gradients/MatMul_grad/Shape_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "gradients/Reshape_30_grad/Shape"
+  op: "Shape"
+  input: "Slice_4"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+  experimental_type {
+    type_id: TFT_PRODUCT
+    args {
+      type_id: TFT_SHAPE_TENSOR
+      args {
+        type_id: TFT_INT32
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Reshape_30_grad/Reshape"
+  op: "Reshape"
+  input: "gradients/MatMul_grad/Reshape"
+  input: "gradients/Reshape_30_grad/Shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "gradients/Reshape_29_grad/Shape"
+  op: "Shape"
+  input: "filter_type_all_qm/add_6"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+  experimental_type {
+    type_id: TFT_PRODUCT
+    args {
+      type_id: TFT_SHAPE_TENSOR
+      args {
+        type_id: TFT_INT32
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Reshape_29_grad/Reshape"
+  op: "Reshape"
+  input: "gradients/MatMul_grad/Reshape_1"
+  input: "gradients/Reshape_29_grad/Shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "gradients/filter_type_all_qm/add_6_grad/Shape"
+  op: "Shape"
+  input: "filter_type_all_qm/mul_2"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+  experimental_type {
+    type_id: TFT_PRODUCT
+    args {
+      type_id: TFT_SHAPE_TENSOR
+      args {
+        type_id: TFT_INT32
+      }
+    }
+  }
+}
+node {
+  name: "gradients/filter_type_all_qm/add_6_grad/Shape_1"
+  op: "Shape"
+  input: "filter_type_all_qm/add_2"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+  experimental_type {
+    type_id: TFT_PRODUCT
+    args {
+      type_id: TFT_SHAPE_TENSOR
+      args {
+        type_id: TFT_INT32
+      }
+    }
+  }
+}
+node {
+  name: "gradients/filter_type_all_qm/add_6_grad/BroadcastGradientArgs"
+  op: "BroadcastGradientArgs"
+  input: "gradients/filter_type_all_qm/add_6_grad/Shape"
+  input: "gradients/filter_type_all_qm/add_6_grad/Shape_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "gradients/filter_type_all_qm/add_6_grad/Sum"
+  op: "Sum"
+  input: "gradients/Reshape_29_grad/Reshape"
+  input: "gradients/filter_type_all_qm/add_6_grad/BroadcastGradientArgs"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/filter_type_all_qm/add_6_grad/Reshape"
+  op: "Reshape"
+  input: "gradients/filter_type_all_qm/add_6_grad/Sum"
+  input: "gradients/filter_type_all_qm/add_6_grad/Shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "gradients/filter_type_all_qm/add_6_grad/Sum_1"
+  op: "Sum"
+  input: "gradients/Reshape_29_grad/Reshape"
+  input: "gradients/filter_type_all_qm/add_6_grad/BroadcastGradientArgs:1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/filter_type_all_qm/add_6_grad/Reshape_1"
+  op: "Reshape"
+  input: "gradients/filter_type_all_qm/add_6_grad/Sum_1"
+  input: "gradients/filter_type_all_qm/add_6_grad/Shape_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "gradients/filter_type_all_qm/mul_2_grad/Shape"
+  op: "Shape"
+  input: "filter_type_all_qm/add_2"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+  experimental_type {
+    type_id: TFT_PRODUCT
+    args {
+      type_id: TFT_SHAPE_TENSOR
+      args {
+        type_id: TFT_INT32
+      }
+    }
+  }
+}
+node {
+  name: "gradients/filter_type_all_qm/mul_2_grad/Shape_1"
+  op: "Shape"
+  input: "filter_type_all_qm/mul_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+  experimental_type {
+    type_id: TFT_PRODUCT
+    args {
+      type_id: TFT_SHAPE_TENSOR
+      args {
+        type_id: TFT_INT32
+      }
+    }
+  }
+}
+node {
+  name: "gradients/filter_type_all_qm/mul_2_grad/BroadcastGradientArgs"
+  op: "BroadcastGradientArgs"
+  input: "gradients/filter_type_all_qm/mul_2_grad/Shape"
+  input: "gradients/filter_type_all_qm/mul_2_grad/Shape_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "gradients/filter_type_all_qm/mul_2_grad/Mul"
+  op: "Mul"
+  input: "gradients/filter_type_all_qm/add_6_grad/Reshape"
+  input: "filter_type_all_qm/mul_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+}
+node {
+  name: "gradients/filter_type_all_qm/mul_2_grad/Sum"
+  op: "Sum"
+  input: "gradients/filter_type_all_qm/mul_2_grad/Mul"
+  input: "gradients/filter_type_all_qm/mul_2_grad/BroadcastGradientArgs"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/filter_type_all_qm/mul_2_grad/Reshape"
+  op: "Reshape"
+  input: "gradients/filter_type_all_qm/mul_2_grad/Sum"
+  input: "gradients/filter_type_all_qm/mul_2_grad/Shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "gradients/AddN_1"
+  op: "AddN"
+  input: "gradients/filter_type_all_qm/add_6_grad/Reshape_1"
+  input: "gradients/filter_type_all_qm/mul_2_grad/Reshape"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@gradients/filter_type_all_qm/add_6_grad/Reshape_1"
+      }
+    }
+  }
+}
+node {
+  name: "gradients/filter_type_all_qm/add_2_grad/Shape"
+  op: "Shape"
+  input: "filter_type_all_qm/concat_2"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+  experimental_type {
+    type_id: TFT_PRODUCT
+    args {
+      type_id: TFT_SHAPE_TENSOR
+      args {
+        type_id: TFT_INT32
+      }
+    }
+  }
+}
+node {
+  name: "gradients/filter_type_all_qm/add_2_grad/Shape_1"
+  op: "Shape"
+  input: "filter_type_all_qm/Reshape_2"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+  experimental_type {
+    type_id: TFT_PRODUCT
+    args {
+      type_id: TFT_SHAPE_TENSOR
+      args {
+        type_id: TFT_INT32
+      }
+    }
+  }
+}
+node {
+  name: "gradients/filter_type_all_qm/add_2_grad/BroadcastGradientArgs"
+  op: "BroadcastGradientArgs"
+  input: "gradients/filter_type_all_qm/add_2_grad/Shape"
+  input: "gradients/filter_type_all_qm/add_2_grad/Shape_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "gradients/filter_type_all_qm/add_2_grad/Sum"
+  op: "Sum"
+  input: "gradients/AddN_1"
+  input: "gradients/filter_type_all_qm/add_2_grad/BroadcastGradientArgs"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/filter_type_all_qm/add_2_grad/Reshape"
+  op: "Reshape"
+  input: "gradients/filter_type_all_qm/add_2_grad/Sum"
+  input: "gradients/filter_type_all_qm/add_2_grad/Shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "gradients/filter_type_all_qm/add_2_grad/Sum_1"
+  op: "Sum"
+  input: "gradients/AddN_1"
+  input: "gradients/filter_type_all_qm/add_2_grad/BroadcastGradientArgs:1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/filter_type_all_qm/add_2_grad/Reshape_1"
+  op: "Reshape"
+  input: "gradients/filter_type_all_qm/add_2_grad/Sum_1"
+  input: "gradients/filter_type_all_qm/add_2_grad/Shape_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "gradients/filter_type_all_qm/concat_2_grad/Rank"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "gradients/filter_type_all_qm/concat_2_grad/mod"
+  op: "FloorMod"
+  input: "filter_type_all_qm/concat_2/axis"
+  input: "gradients/filter_type_all_qm/concat_2_grad/Rank"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "gradients/filter_type_all_qm/concat_2_grad/ShapeN"
+  op: "ShapeN"
+  input: "filter_type_all_qm/add_1"
+  input: "filter_type_all_qm/add_1"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "gradients/filter_type_all_qm/concat_2_grad/ConcatOffset"
+  op: "ConcatOffset"
+  input: "gradients/filter_type_all_qm/concat_2_grad/mod"
+  input: "gradients/filter_type_all_qm/concat_2_grad/ShapeN"
+  input: "gradients/filter_type_all_qm/concat_2_grad/ShapeN:1"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+}
+node {
+  name: "gradients/filter_type_all_qm/concat_2_grad/Slice"
+  op: "Slice"
+  input: "gradients/filter_type_all_qm/add_2_grad/Reshape"
+  input: "gradients/filter_type_all_qm/concat_2_grad/ConcatOffset"
+  input: "gradients/filter_type_all_qm/concat_2_grad/ShapeN"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+}
+node {
+  name: "gradients/filter_type_all_qm/concat_2_grad/Slice_1"
+  op: "Slice"
+  input: "gradients/filter_type_all_qm/add_2_grad/Reshape"
+  input: "gradients/filter_type_all_qm/concat_2_grad/ConcatOffset:1"
+  input: "gradients/filter_type_all_qm/concat_2_grad/ShapeN:1"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+}
+node {
+  name: "gradients/filter_type_all_qm/Reshape_2_grad/Shape"
+  op: "Shape"
+  input: "filter_type_all_qm/Tanh_2"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+  experimental_type {
+    type_id: TFT_PRODUCT
+    args {
+      type_id: TFT_SHAPE_TENSOR
+      args {
+        type_id: TFT_INT32
+      }
+    }
+  }
+}
+node {
+  name: "gradients/filter_type_all_qm/Reshape_2_grad/Reshape"
+  op: "Reshape"
+  input: "gradients/filter_type_all_qm/add_2_grad/Reshape_1"
+  input: "gradients/filter_type_all_qm/Reshape_2_grad/Shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_has_manual_control_dependencies"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "gradients/filter_type_all_qm/Tanh_2_grad/TanhGrad"
+  op: "TanhGrad"
+  input: "filter_type_all_qm/Tanh_2"
+  input: "gradients/filter_type_all_qm/Reshape_2_grad/Reshape"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+}
+node {
+  name: "gradients/filter_type_all_qm/MatMul_2_grad/MatMul"
+  op: "MatMul"
+  input: "gradients/filter_type_all_qm/Tanh_2_grad/TanhGrad"
+  input: "filter_type_all_qm/matrix_3/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "gradients/AddN_2"
+  op: "AddN"
+  input: "gradients/filter_type_all_qm/concat_2_grad/Slice"
+  input: "gradients/filter_type_all_qm/concat_2_grad/Slice_1"
+  input: "gradients/filter_type_all_qm/MatMul_2_grad/MatMul"
+  attr {
+    key: "N"
+    value {
+      i: 3
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@gradients/filter_type_all_qm/concat_2_grad/Slice"
+      }
+    }
+  }
+}
+node {
+  name: "gradients/filter_type_all_qm/add_1_grad/Shape"
+  op: "Shape"
+  input: "filter_type_all_qm/concat_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+  experimental_type {
+    type_id: TFT_PRODUCT
+    args {
+      type_id: TFT_SHAPE_TENSOR
+      args {
+        type_id: TFT_INT32
+      }
+    }
+  }
+}
+node {
+  name: "gradients/filter_type_all_qm/add_1_grad/Shape_1"
+  op: "Shape"
+  input: "filter_type_all_qm/Reshape_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+  experimental_type {
+    type_id: TFT_PRODUCT
+    args {
+      type_id: TFT_SHAPE_TENSOR
+      args {
+        type_id: TFT_INT32
+      }
+    }
+  }
+}
+node {
+  name: "gradients/filter_type_all_qm/add_1_grad/BroadcastGradientArgs"
+  op: "BroadcastGradientArgs"
+  input: "gradients/filter_type_all_qm/add_1_grad/Shape"
+  input: "gradients/filter_type_all_qm/add_1_grad/Shape_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "gradients/filter_type_all_qm/add_1_grad/Sum"
+  op: "Sum"
+  input: "gradients/AddN_2"
+  input: "gradients/filter_type_all_qm/add_1_grad/BroadcastGradientArgs"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/filter_type_all_qm/add_1_grad/Reshape"
+  op: "Reshape"
+  input: "gradients/filter_type_all_qm/add_1_grad/Sum"
+  input: "gradients/filter_type_all_qm/add_1_grad/Shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "gradients/filter_type_all_qm/add_1_grad/Sum_1"
+  op: "Sum"
+  input: "gradients/AddN_2"
+  input: "gradients/filter_type_all_qm/add_1_grad/BroadcastGradientArgs:1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/filter_type_all_qm/add_1_grad/Reshape_1"
+  op: "Reshape"
+  input: "gradients/filter_type_all_qm/add_1_grad/Sum_1"
+  input: "gradients/filter_type_all_qm/add_1_grad/Shape_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "gradients/filter_type_all_qm/concat_1_grad/Rank"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "gradients/filter_type_all_qm/concat_1_grad/mod"
+  op: "FloorMod"
+  input: "filter_type_all_qm/concat_1/axis"
+  input: "gradients/filter_type_all_qm/concat_1_grad/Rank"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "gradients/filter_type_all_qm/concat_1_grad/ShapeN"
+  op: "ShapeN"
+  input: "filter_type_all_qm/add"
+  input: "filter_type_all_qm/add"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "gradients/filter_type_all_qm/concat_1_grad/ConcatOffset"
+  op: "ConcatOffset"
+  input: "gradients/filter_type_all_qm/concat_1_grad/mod"
+  input: "gradients/filter_type_all_qm/concat_1_grad/ShapeN"
+  input: "gradients/filter_type_all_qm/concat_1_grad/ShapeN:1"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+}
+node {
+  name: "gradients/filter_type_all_qm/concat_1_grad/Slice"
+  op: "Slice"
+  input: "gradients/filter_type_all_qm/add_1_grad/Reshape"
+  input: "gradients/filter_type_all_qm/concat_1_grad/ConcatOffset"
+  input: "gradients/filter_type_all_qm/concat_1_grad/ShapeN"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+}
+node {
+  name: "gradients/filter_type_all_qm/concat_1_grad/Slice_1"
+  op: "Slice"
+  input: "gradients/filter_type_all_qm/add_1_grad/Reshape"
+  input: "gradients/filter_type_all_qm/concat_1_grad/ConcatOffset:1"
+  input: "gradients/filter_type_all_qm/concat_1_grad/ShapeN:1"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+}
+node {
+  name: "gradients/filter_type_all_qm/Reshape_1_grad/Shape"
+  op: "Shape"
+  input: "filter_type_all_qm/Tanh_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+  experimental_type {
+    type_id: TFT_PRODUCT
+    args {
+      type_id: TFT_SHAPE_TENSOR
+      args {
+        type_id: TFT_INT32
+      }
+    }
+  }
+}
+node {
+  name: "gradients/filter_type_all_qm/Reshape_1_grad/Reshape"
+  op: "Reshape"
+  input: "gradients/filter_type_all_qm/add_1_grad/Reshape_1"
+  input: "gradients/filter_type_all_qm/Reshape_1_grad/Shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_has_manual_control_dependencies"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "gradients/filter_type_all_qm/Tanh_1_grad/TanhGrad"
+  op: "TanhGrad"
+  input: "filter_type_all_qm/Tanh_1"
+  input: "gradients/filter_type_all_qm/Reshape_1_grad/Reshape"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+}
+node {
+  name: "gradients/filter_type_all_qm/MatMul_1_grad/MatMul"
+  op: "MatMul"
+  input: "gradients/filter_type_all_qm/Tanh_1_grad/TanhGrad"
+  input: "filter_type_all_qm/matrix_2/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "gradients/AddN_3"
+  op: "AddN"
+  input: "gradients/filter_type_all_qm/concat_1_grad/Slice"
+  input: "gradients/filter_type_all_qm/concat_1_grad/Slice_1"
+  input: "gradients/filter_type_all_qm/MatMul_1_grad/MatMul"
+  attr {
+    key: "N"
+    value {
+      i: 3
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@gradients/filter_type_all_qm/concat_1_grad/Slice"
+      }
+    }
+  }
+}
+node {
+  name: "gradients/filter_type_all_qm/add_grad/Shape"
+  op: "Shape"
+  input: "filter_type_all_qm/concat"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+  experimental_type {
+    type_id: TFT_PRODUCT
+    args {
+      type_id: TFT_SHAPE_TENSOR
+      args {
+        type_id: TFT_INT32
+      }
+    }
+  }
+}
+node {
+  name: "gradients/filter_type_all_qm/add_grad/Shape_1"
+  op: "Shape"
+  input: "filter_type_all_qm/Reshape"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+  experimental_type {
+    type_id: TFT_PRODUCT
+    args {
+      type_id: TFT_SHAPE_TENSOR
+      args {
+        type_id: TFT_INT32
+      }
+    }
+  }
+}
+node {
+  name: "gradients/filter_type_all_qm/add_grad/BroadcastGradientArgs"
+  op: "BroadcastGradientArgs"
+  input: "gradients/filter_type_all_qm/add_grad/Shape"
+  input: "gradients/filter_type_all_qm/add_grad/Shape_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "gradients/filter_type_all_qm/add_grad/Sum"
+  op: "Sum"
+  input: "gradients/AddN_3"
+  input: "gradients/filter_type_all_qm/add_grad/BroadcastGradientArgs"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/filter_type_all_qm/add_grad/Reshape"
+  op: "Reshape"
+  input: "gradients/filter_type_all_qm/add_grad/Sum"
+  input: "gradients/filter_type_all_qm/add_grad/Shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "gradients/filter_type_all_qm/add_grad/Sum_1"
+  op: "Sum"
+  input: "gradients/AddN_3"
+  input: "gradients/filter_type_all_qm/add_grad/BroadcastGradientArgs:1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/filter_type_all_qm/add_grad/Reshape_1"
+  op: "Reshape"
+  input: "gradients/filter_type_all_qm/add_grad/Sum_1"
+  input: "gradients/filter_type_all_qm/add_grad/Shape_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "gradients/filter_type_all_qm/concat_grad/Rank"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "gradients/filter_type_all_qm/concat_grad/mod"
+  op: "FloorMod"
+  input: "filter_type_all_qm/concat/axis"
+  input: "gradients/filter_type_all_qm/concat_grad/Rank"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "gradients/filter_type_all_qm/concat_grad/ShapeN"
+  op: "ShapeN"
+  input: "Reshape_27"
+  input: "Reshape_27"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "gradients/filter_type_all_qm/concat_grad/ConcatOffset"
+  op: "ConcatOffset"
+  input: "gradients/filter_type_all_qm/concat_grad/mod"
+  input: "gradients/filter_type_all_qm/concat_grad/ShapeN"
+  input: "gradients/filter_type_all_qm/concat_grad/ShapeN:1"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+}
+node {
+  name: "gradients/filter_type_all_qm/concat_grad/Slice"
+  op: "Slice"
+  input: "gradients/filter_type_all_qm/add_grad/Reshape"
+  input: "gradients/filter_type_all_qm/concat_grad/ConcatOffset"
+  input: "gradients/filter_type_all_qm/concat_grad/ShapeN"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+}
+node {
+  name: "gradients/filter_type_all_qm/concat_grad/Slice_1"
+  op: "Slice"
+  input: "gradients/filter_type_all_qm/add_grad/Reshape"
+  input: "gradients/filter_type_all_qm/concat_grad/ConcatOffset:1"
+  input: "gradients/filter_type_all_qm/concat_grad/ShapeN:1"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+}
+node {
+  name: "gradients/filter_type_all_qm/Reshape_grad/Shape"
+  op: "Shape"
+  input: "filter_type_all_qm/Tanh"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+  experimental_type {
+    type_id: TFT_PRODUCT
+    args {
+      type_id: TFT_SHAPE_TENSOR
+      args {
+        type_id: TFT_INT32
+      }
+    }
+  }
+}
+node {
+  name: "gradients/filter_type_all_qm/Reshape_grad/Reshape"
+  op: "Reshape"
+  input: "gradients/filter_type_all_qm/add_grad/Reshape_1"
+  input: "gradients/filter_type_all_qm/Reshape_grad/Shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_has_manual_control_dependencies"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "gradients/filter_type_all_qm/Tanh_grad/TanhGrad"
+  op: "TanhGrad"
+  input: "filter_type_all_qm/Tanh"
+  input: "gradients/filter_type_all_qm/Reshape_grad/Reshape"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+}
+node {
+  name: "gradients/filter_type_all_qm/MatMul_grad/MatMul"
+  op: "MatMul"
+  input: "gradients/filter_type_all_qm/Tanh_grad/TanhGrad"
+  input: "filter_type_all_qm/matrix_1/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "gradients/AddN_4"
+  op: "AddN"
+  input: "gradients/filter_type_all_qm/concat_grad/Slice"
+  input: "gradients/filter_type_all_qm/concat_grad/Slice_1"
+  input: "gradients/filter_type_all_qm/MatMul_grad/MatMul"
+  attr {
+    key: "N"
+    value {
+      i: 3
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@gradients/filter_type_all_qm/concat_grad/Slice"
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Reshape_27_grad/Shape"
+  op: "Shape"
+  input: "Slice_5"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+  experimental_type {
+    type_id: TFT_PRODUCT
+    args {
+      type_id: TFT_SHAPE_TENSOR
+      args {
+        type_id: TFT_INT32
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Reshape_27_grad/Reshape"
+  op: "Reshape"
+  input: "gradients/AddN_4"
+  input: "gradients/Reshape_27_grad/Shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "gradients/Slice_5_grad/Rank"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Slice_5_grad/Shape"
+  op: "Shape"
+  input: "Slice_5"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+  experimental_type {
+    type_id: TFT_PRODUCT
+    args {
+      type_id: TFT_SHAPE_TENSOR
+      args {
+        type_id: TFT_INT32
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Slice_5_grad/stack/1"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Slice_5_grad/stack"
+  op: "Pack"
+  input: "gradients/Slice_5_grad/Rank"
+  input: "gradients/Slice_5_grad/stack/1"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "gradients/Slice_5_grad/Reshape"
+  op: "Reshape"
+  input: "Slice_5/begin"
+  input: "gradients/Slice_5_grad/stack"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "gradients/Slice_5_grad/Shape_1"
+  op: "Shape"
+  input: "Reshape_26"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+  experimental_type {
+    type_id: TFT_PRODUCT
+    args {
+      type_id: TFT_SHAPE_TENSOR
+      args {
+        type_id: TFT_INT32
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Slice_5_grad/sub"
+  op: "Sub"
+  input: "gradients/Slice_5_grad/Shape_1"
+  input: "gradients/Slice_5_grad/Shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "gradients/Slice_5_grad/sub_1"
+  op: "Sub"
+  input: "gradients/Slice_5_grad/sub"
+  input: "Slice_5/begin"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "gradients/Slice_5_grad/Reshape_1"
+  op: "Reshape"
+  input: "gradients/Slice_5_grad/sub_1"
+  input: "gradients/Slice_5_grad/stack"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "gradients/Slice_5_grad/concat/axis"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Slice_5_grad/concat"
+  op: "ConcatV2"
+  input: "gradients/Slice_5_grad/Reshape"
+  input: "gradients/Slice_5_grad/Reshape_1"
+  input: "gradients/Slice_5_grad/concat/axis"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "gradients/Slice_5_grad/Pad"
+  op: "Pad"
+  input: "gradients/Reshape_27_grad/Reshape"
+  input: "gradients/Slice_5_grad/concat"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tpaddings"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "gradients/Reshape_26_grad/Shape"
+  op: "Shape"
+  input: "Slice_4"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+  experimental_type {
+    type_id: TFT_PRODUCT
+    args {
+      type_id: TFT_SHAPE_TENSOR
+      args {
+        type_id: TFT_INT32
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Reshape_26_grad/Reshape"
+  op: "Reshape"
+  input: "gradients/Slice_5_grad/Pad"
+  input: "gradients/Reshape_26_grad/Shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "gradients/AddN_5"
+  op: "AddN"
+  input: "gradients/Reshape_30_grad/Reshape"
+  input: "gradients/Reshape_26_grad/Reshape"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@gradients/Reshape_30_grad/Reshape"
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Slice_4_grad/Rank"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Slice_4_grad/Shape"
+  op: "Shape"
+  input: "Slice_4"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+  experimental_type {
+    type_id: TFT_PRODUCT
+    args {
+      type_id: TFT_SHAPE_TENSOR
+      args {
+        type_id: TFT_INT32
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Slice_4_grad/stack/1"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Slice_4_grad/stack"
+  op: "Pack"
+  input: "gradients/Slice_4_grad/Rank"
+  input: "gradients/Slice_4_grad/stack/1"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "gradients/Slice_4_grad/Reshape"
+  op: "Reshape"
+  input: "Slice_4/begin"
+  input: "gradients/Slice_4_grad/stack"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "gradients/Slice_4_grad/Shape_1"
+  op: "Shape"
+  input: "Cast_5"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+  experimental_type {
+    type_id: TFT_PRODUCT
+    args {
+      type_id: TFT_SHAPE_TENSOR
+      args {
+        type_id: TFT_INT32
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Slice_4_grad/sub"
+  op: "Sub"
+  input: "gradients/Slice_4_grad/Shape_1"
+  input: "gradients/Slice_4_grad/Shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "gradients/Slice_4_grad/sub_1"
+  op: "Sub"
+  input: "gradients/Slice_4_grad/sub"
+  input: "Slice_4/begin"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "gradients/Slice_4_grad/Reshape_1"
+  op: "Reshape"
+  input: "gradients/Slice_4_grad/sub_1"
+  input: "gradients/Slice_4_grad/stack"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "gradients/Slice_4_grad/concat/axis"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Slice_4_grad/concat"
+  op: "ConcatV2"
+  input: "gradients/Slice_4_grad/Reshape"
+  input: "gradients/Slice_4_grad/Reshape_1"
+  input: "gradients/Slice_4_grad/concat/axis"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "gradients/Slice_4_grad/Pad"
+  op: "Pad"
+  input: "gradients/AddN_5"
+  input: "gradients/Slice_4_grad/concat"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tpaddings"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "gradients/Cast_5_grad/Cast"
+  op: "Cast"
+  input: "gradients/Slice_4_grad/Pad"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Truncate"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/Reshape_24_grad/Shape"
+  op: "Shape"
+  input: "Reshape_23"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+  experimental_type {
+    type_id: TFT_PRODUCT
+    args {
+      type_id: TFT_SHAPE_TENSOR
+      args {
+        type_id: TFT_INT32
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Reshape_24_grad/Reshape"
+  op: "Reshape"
+  input: "gradients/Cast_5_grad/Cast"
+  input: "gradients/Reshape_24_grad/Shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "gradients/Reshape_23_grad/Shape"
+  op: "Shape"
+  input: "o_rmat_qm"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+  experimental_type {
+    type_id: TFT_PRODUCT
+    args {
+      type_id: TFT_SHAPE_TENSOR
+      args {
+        type_id: TFT_INT32
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Reshape_23_grad/Reshape"
+  op: "Reshape"
+  input: "gradients/Reshape_24_grad/Reshape"
+  input: "gradients/Reshape_23_grad/Shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "strided_slice_43/stack"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_43/stack_1"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_43/stack_2"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_43"
+  op: "StridedSlice"
+  input: "DprcPairwiseIdx:4"
+  input: "strided_slice_43/stack"
+  input: "strided_slice_43/stack_1"
+  input: "strided_slice_43/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "mul_10/y"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 96
+      }
+    }
+  }
+}
+node {
+  name: "mul_10"
+  op: "Mul"
+  input: "strided_slice_43"
+  input: "mul_10/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "Reshape_46/shape/0"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "Reshape_46/shape"
+  op: "Pack"
+  input: "Reshape_46/shape/0"
+  input: "mul_10"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "Reshape_46"
+  op: "Reshape"
+  input: "gradients/Reshape_23_grad/Reshape"
+  input: "Reshape_46/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "ProdForceSeA"
+  op: "ProdForceSeA"
+  input: "Reshape_46"
+  input: "o_rmat_deriv_qm"
+  input: "o_nlist_qm"
+  input: "DprcPairwiseIdx:4"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "n_a_sel"
+    value {
+      i: 24
+    }
+  }
+  attr {
+    key: "n_r_sel"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "ProdVirialSeA"
+  op: "ProdVirialSeA"
+  input: "Reshape_46"
+  input: "o_rmat_deriv_qm"
+  input: "o_rij_qm"
+  input: "o_nlist_qm"
+  input: "DprcPairwiseIdx:4"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "n_a_sel"
+    value {
+      i: 24
+    }
+  }
+  attr {
+    key: "n_r_sel"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "strided_slice_44/stack"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_44/stack_1"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_44/stack_2"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_44"
+  op: "StridedSlice"
+  input: "DprcPairwiseIdx:4"
+  input: "strided_slice_44/stack"
+  input: "strided_slice_44/stack_1"
+  input: "strided_slice_44/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "mul_11/x"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 3
+      }
+    }
+  }
+}
+node {
+  name: "mul_11"
+  op: "Mul"
+  input: "mul_11/x"
+  input: "strided_slice_44"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "Reshape_47/shape/0"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "Reshape_47/shape"
+  op: "Pack"
+  input: "Reshape_47/shape/0"
+  input: "mul_11"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "Reshape_47"
+  op: "Reshape"
+  input: "ProdForceSeA"
+  input: "Reshape_47/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "strided_slice_45/stack"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_45/stack_1"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_45/stack_2"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_45"
+  op: "StridedSlice"
+  input: "DprcPairwiseIdx:4"
+  input: "strided_slice_45/stack"
+  input: "strided_slice_45/stack_1"
+  input: "strided_slice_45/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "mul_12/x"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 3
+      }
+    }
+  }
+}
+node {
+  name: "mul_12"
+  op: "Mul"
+  input: "mul_12/x"
+  input: "strided_slice_45"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "o_force_qm/shape/0"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "o_force_qm/shape"
+  op: "Pack"
+  input: "o_force_qm/shape/0"
+  input: "mul_12"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "o_force_qm"
+  op: "Reshape"
+  input: "Reshape_47"
+  input: "o_force_qm/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "o_virial_qm/shape"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\377\377\377\377\t\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "o_virial_qm"
+  op: "Reshape"
+  input: "ProdVirialSeA"
+  input: "o_virial_qm/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "strided_slice_46/stack"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_46/stack_1"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_46/stack_2"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_46"
+  op: "StridedSlice"
+  input: "DprcPairwiseIdx:4"
+  input: "strided_slice_46/stack"
+  input: "strided_slice_46/stack_1"
+  input: "strided_slice_46/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "mul_13/x"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 9
+      }
+    }
+  }
+}
+node {
+  name: "mul_13"
+  op: "Mul"
+  input: "mul_13/x"
+  input: "strided_slice_46"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "o_atom_virial_qm/shape/0"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "o_atom_virial_qm/shape"
+  op: "Pack"
+  input: "o_atom_virial_qm/shape/0"
+  input: "mul_13"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "o_atom_virial_qm"
+  op: "Reshape"
+  input: "ProdVirialSeA:1"
+  input: "o_atom_virial_qm/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "strided_slice_47/stack"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_47/stack_1"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_47/stack_2"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_47"
+  op: "StridedSlice"
+  input: "DprcPairwiseIdx:5"
+  input: "strided_slice_47/stack"
+  input: "strided_slice_47/stack_1"
+  input: "strided_slice_47/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "mul_14/y"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 3
+      }
+    }
+  }
+}
+node {
+  name: "mul_14"
+  op: "Mul"
+  input: "strided_slice_47"
+  input: "mul_14/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "Reshape_48/shape/0"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "Reshape_48/shape"
+  op: "Pack"
+  input: "Reshape_48/shape/0"
+  input: "mul_14"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "Reshape_48"
+  op: "Reshape"
+  input: "GatherV2_3"
+  input: "Reshape_48/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "strided_slice_48/stack"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_48/stack_1"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_48/stack_2"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_48"
+  op: "StridedSlice"
+  input: "DprcPairwiseIdx:5"
+  input: "strided_slice_48/stack"
+  input: "strided_slice_48/stack_1"
+  input: "strided_slice_48/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "Reshape_49/shape/0"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "Reshape_49/shape"
+  op: "Pack"
+  input: "Reshape_49/shape/0"
+  input: "strided_slice_48"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "Reshape_49"
+  op: "Reshape"
+  input: "GatherV2_5"
+  input: "Reshape_49/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "Shape_17"
+  op: "Shape"
+  input: "Reshape_48"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+  experimental_type {
+    type_id: TFT_PRODUCT
+    args {
+      type_id: TFT_SHAPE_TENSOR
+      args {
+        type_id: TFT_INT32
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_49/stack"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_49/stack_1"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_49/stack_2"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_49"
+  op: "StridedSlice"
+  input: "Shape_17"
+  input: "strided_slice_49/stack"
+  input: "strided_slice_49/stack_1"
+  input: "strided_slice_49/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "descrpt_attr_qmmm/sel"
+  op: "Const"
+  attr {
+    key: "_has_manual_control_dependencies"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 27
+      }
+    }
+  }
+}
+node {
+  name: "descrpt_attr_qmmm/original_sel"
+  op: "Const"
+  attr {
+    key: "_has_manual_control_dependencies"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 27
+      }
+    }
+  }
+}
+node {
+  name: "descrpt_attr_qmmm/t_avg"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_DOUBLE
+        tensor_shape {
+          dim {
+            size: 6
+          }
+          dim {
+            size: 108
+          }
+        }
+        tensor_content: "\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\0
00\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\0
00\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\0
00\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\0
00\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\0
00\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\0
00\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\0
00\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\0
00\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\0
00\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\0
00\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "descrpt_attr_qmmm/t_avg/read"
+  op: "Identity"
+  input: "descrpt_attr_qmmm/t_avg"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@descrpt_attr_qmmm/t_avg"
+      }
+    }
+  }
+}
+node {
+  name: "descrpt_attr_qmmm/t_std"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_DOUBLE
+        tensor_shape {
+          dim {
+            size: 6
+          }
+          dim {
+            size: 108
+          }
+        }
+        tensor_content: "Z(^\225\340\215\314?\322\2418\227~`\312?\322\2418\227~`\312?\322\2418\227~`\312?Z(^\225\340\215\314?\322\2418\227~`\312?\322\2418\227~`\312?\322\2418\227~`\312?Z(^\225\340\215\314?\322\2418\227~`\312?\322\2418\227~`\312?\322\2418\227~`\312?Z(^\225\340\215\314?\322\2418\227~`\312?\322\2418\227~`\312?\322\2418\227~`\312?Z(^\225\340\215\314?\322\2418\227~`\312?\322\2418\227~`\312?\322\2418\227~`\312?Z(^\225\340\215\314?\322\2418\227~`\312?\322\2418\227~`\312?\322\2418\227~`\312?Z(^\225\340\215\314?\322\2418\227~`\312?\322\2418\227~`\312?\322\2418\227~`\312?Z(^\225\340\215\314?\322\2418\227~`\312?\322\2418\227~`\312?\322\2418\227~`\312?Z(^\225\340\215\314?\322\2418\227~`\312?\322\2418\227~`\312?\322\2418\227~`\312?Z(^\225\340\215\314?\322\2418\227~`\312?\322\2418\227~`\312?\322\2418\227~`\312?Z(^\225\340\215\314?\322\2418\227~`\312?\322\2418\227~`\312?\322\2418\227~`\312?Z(^\225\340\215\314?\322\2418\227~`\312?\322\2418\227~`\312?\322\2418\227~`\312?Z(^\225\340\215\314?\322\2418\227~`\312?\322\2418\227~`\312?\322\2418\227~`\312?Z(^\225\340\215\314?\322\2418\227~`\312?\322\2418\227~`\312?\322\2418\227~`\312?Z(^\225\340\215\314?\322\2418\227~`\312?\322\2418\227~`\312?\322\2418\227~`\312?Z(^\225\340\215\314?\322\2418\227~`\312?\322\2418\227~`\312?\322\2418\227~`\312?Z(^\225\340\215\314?\322\2418\227~`\312?\322\2418\227~`\312?\322\2418\227~`\312?Z(^\225\340\215\314?\322\2418\227~`\312?\322\2418\227~`\312?\322\2418\227~`\312?Z(^\225\340\215\314?\322\2418\227~`\312?\322\2418\227~`\312?\322\2418\227~`\312?Z(^\225\340\215\314?\322\2418\227~`\312?\322\2418\227~`\312?\322\2418\227~`\312?Z(^\225\340\215\314?\322\2418\227~`\312?\322\2418\227~`\312?\322\2418\227~`\312?Z(^\225\340\215\314?\322\2418\227~`\312?\322\2418\227~`\312?\322\2418\227~`\312?Z(^\225\340\215\314?\322\2418\227~`\312?\322\2418\227~`\312?\322\2418\227~`\312?Z(^\225\340\215\314?\322\2418\227~`\312?\322\2418\227~`\312?\322\2418\227~`\312?Z(^\225\340\215\314?\322\2418\227~`\312?\322\2418\227~
`\312?\322\2418\227~`\312?Z(^\225\340\215\314?\322\2418\227~`\312?\322\2418\227~`\312?\322\2418\227~`\312?Z(^\225\340\215\314?\322\2418\227~`\312?\322\2418\227~`\312?\322\2418\227~`\312?zKb\226SU\303?\344\232\210\002\013p\305?\344\232\210\002\013p\305?\344\232\210\002\013p\305?zKb\226SU\303?\344\232\210\002\013p\305?\344\232\210\002\013p\305?\344\232\210\002\013p\305?zKb\226SU\303?\344\232\210\002\013p\305?\344\232\210\002\013p\305?\344\232\210\002\013p\305?zKb\226SU\303?\344\232\210\002\013p\305?\344\232\210\002\013p\305?\344\232\210\002\013p\305?zKb\226SU\303?\344\232\210\002\013p\305?\344\232\210\002\013p\305?\344\232\210\002\013p\305?zKb\226SU\303?\344\232\210\002\013p\305?\344\232\210\002\013p\305?\344\232\210\002\013p\305?zKb\226SU\303?\344\232\210\002\013p\305?\344\232\210\002\013p\305?\344\232\210\002\013p\305?zKb\226SU\303?\344\232\210\002\013p\305?\344\232\210\002\013p\305?\344\232\210\002\013p\305?zKb\226SU\303?\344\232\210\002\013p\305?\344\232\210\002\013p\305?\344\232\210\002\013p\305?zKb\226SU\303?\344\232\210\002\013p\305?\344\232\210\002\013p\305?\344\232\210\002\013p\305?zKb\226SU\303?\344\232\210\002\013p\305?\344\232\210\002\013p\305?\344\232\210\002\013p\305?zKb\226SU\303?\344\232\210\002\013p\305?\344\232\210\002\013p\305?\344\232\210\002\013p\305?zKb\226SU\303?\344\232\210\002\013p\305?\344\232\210\002\013p\305?\344\232\210\002\013p\305?zKb\226SU\303?\344\232\210\002\013p\305?\344\232\210\002\013p\305?\344\232\210\002\013p\305?zKb\226SU\303?\344\232\210\002\013p\305?\344\232\210\002\013p\305?\344\232\210\002\013p\305?zKb\226SU\303?\344\232\210\002\013p\305?\344\232\210\002\013p\305?\344\232\210\002\013p\305?zKb\226SU\303?\344\232\210\002\013p\305?\344\232\210\002\013p\305?\344\232\210\002\013p\305?zKb\226SU\303?\344\232\210\002\013p\305?\344\232\210\002\013p\305?\344\232\210\002\013p\305?zKb\226SU\303?\344\232\210\002\013p\305?\344\232\210\002\013p\305?\344\232\210\002\013p\305?zKb\226SU\303?\344\232\210\002\013p\305?\344\232\210\002\013p\305?
\344\232\210\002\013p\305?zKb\226SU\303?\344\232\210\002\013p\305?\344\232\210\002\013p\305?\344\232\210\002\013p\305?zKb\226SU\303?\344\232\210\002\013p\305?\344\232\210\002\013p\305?\344\232\210\002\013p\305?zKb\226SU\303?\344\232\210\002\013p\305?\344\232\210\002\013p\305?\344\232\210\002\013p\305?zKb\226SU\303?\344\232\210\002\013p\305?\344\232\210\002\013p\305?\344\232\210\002\013p\305?zKb\226SU\303?\344\232\210\002\013p\305?\344\232\210\002\013p\305?\344\232\210\002\013p\305?zKb\226SU\303?\344\232\210\002\013p\305?\344\232\210\002\013p\305?\344\232\210\002\013p\305?zKb\226SU\303?\344\232\210\002\013p\305?\344\232\210\002\013p\305?\344\232\210\002\013p\305?\033\301\010\211\322\033\303?\332\240\004\304 \321\305?\332\240\004\304 \321\305?\332\240\004\304 \321\305?\033\301\010\211\322\033\303?\332\240\004\304 \321\305?\332\240\004\304 \321\305?\332\240\004\304 \321\305?\033\301\010\211\322\033\303?\332\240\004\304 \321\305?\332\240\004\304 \321\305?\332\240\004\304 \321\305?\033\301\010\211\322\033\303?\332\240\004\304 \321\305?\332\240\004\304 \321\305?\332\240\004\304 \321\305?\033\301\010\211\322\033\303?\332\240\004\304 \321\305?\332\240\004\304 \321\305?\332\240\004\304 \321\305?\033\301\010\211\322\033\303?\332\240\004\304 \321\305?\332\240\004\304 \321\305?\332\240\004\304 \321\305?\033\301\010\211\322\033\303?\332\240\004\304 \321\305?\332\240\004\304 \321\305?\332\240\004\304 \321\305?\033\301\010\211\322\033\303?\332\240\004\304 \321\305?\332\240\004\304 \321\305?\332\240\004\304 \321\305?\033\301\010\211\322\033\303?\332\240\004\304 \321\305?\332\240\004\304 \321\305?\332\240\004\304 \321\305?\033\301\010\211\322\033\303?\332\240\004\304 \321\305?\332\240\004\304 \321\305?\332\240\004\304 \321\305?\033\301\010\211\322\033\303?\332\240\004\304 \321\305?\332\240\004\304 \321\305?\332\240\004\304 \321\305?\033\301\010\211\322\033\303?\332\240\004\304 \321\305?\332\240\004\304 \321\305?\332\240\004\304 \321\305?\033\301\010\211\322\033\303?\332\240\004\304 
\321\305?\332\240\004\304 \321\305?\332\240\004\304 \321\305?\033\301\010\211\322\033\303?\332\240\004\304 \321\305?\332\240\004\304 \321\305?\332\240\004\304 \321\305?\033\301\010\211\322\033\303?\332\240\004\304 \321\305?\332\240\004\304 \321\305?\332\240\004\304 \321\305?\033\301\010\211\322\033\303?\332\240\004\304 \321\305?\332\240\004\304 \321\305?\332\240\004\304 \321\305?\033\301\010\211\322\033\303?\332\240\004\304 \321\305?\332\240\004\304 \321\305?\332\240\004\304 \321\305?\033\301\010\211\322\033\303?\332\240\004\304 \321\305?\332\240\004\304 \321\305?\332\240\004\304 \321\305?\033\301\010\211\322\033\303?\332\240\004\304 \321\305?\332\240\004\304 \321\305?\332\240\004\304 \321\305?\033\301\010\211\322\033\303?\332\240\004\304 \321\305?\332\240\004\304 \321\305?\332\240\004\304 \321\305?\033\301\010\211\322\033\303?\332\240\004\304 \321\305?\332\240\004\304 \321\305?\332\240\004\304 \321\305?\033\301\010\211\322\033\303?\332\240\004\304 \321\305?\332\240\004\304 \321\305?\332\240\004\304 \321\305?\033\301\010\211\322\033\303?\332\240\004\304 \321\305?\332\240\004\304 \321\305?\332\240\004\304 \321\305?\033\301\010\211\322\033\303?\332\240\004\304 \321\305?\332\240\004\304 \321\305?\332\240\004\304 \321\305?\033\301\010\211\322\033\303?\332\240\004\304 \321\305?\332\240\004\304 \321\305?\332\240\004\304 \321\305?\033\301\010\211\322\033\303?\332\240\004\304 \321\305?\332\240\004\304 \321\305?\332\240\004\304 \321\305?\033\301\010\211\322\033\303?\332\240\004\304 \321\305?\332\240\004\304 \321\305?\332\240\004\304 
\321\305?\\\327\"\236\207?\306?\212\217\241\303\031V\306?\212\217\241\303\031V\306?\212\217\241\303\031V\306?\\\327\"\236\207?\306?\212\217\241\303\031V\306?\212\217\241\303\031V\306?\212\217\241\303\031V\306?\\\327\"\236\207?\306?\212\217\241\303\031V\306?\212\217\241\303\031V\306?\212\217\241\303\031V\306?\\\327\"\236\207?\306?\212\217\241\303\031V\306?\212\217\241\303\031V\306?\212\217\241\303\031V\306?\\\327\"\236\207?\306?\212\217\241\303\031V\306?\212\217\241\303\031V\306?\212\217\241\303\031V\306?\\\327\"\236\207?\306?\212\217\241\303\031V\306?\212\217\241\303\031V\306?\212\217\241\303\031V\306?\\\327\"\236\207?\306?\212\217\241\303\031V\306?\212\217\241\303\031V\306?\212\217\241\303\031V\306?\\\327\"\236\207?\306?\212\217\241\303\031V\306?\212\217\241\303\031V\306?\212\217\241\303\031V\306?\\\327\"\236\207?\306?\212\217\241\303\031V\306?\212\217\241\303\031V\306?\212\217\241\303\031V\306?\\\327\"\236\207?\306?\212\217\241\303\031V\306?\212\217\241\303\031V\306?\212\217\241\303\031V\306?\\\327\"\236\207?\306?\212\217\241\303\031V\306?\212\217\241\303\031V\306?\212\217\241\303\031V\306?\\\327\"\236\207?\306?\212\217\241\303\031V\306?\212\217\241\303\031V\306?\212\217\241\303\031V\306?\\\327\"\236\207?\306?\212\217\241\303\031V\306?\212\217\241\303\031V\306?\212\217\241\303\031V\306?\\\327\"\236\207?\306?\212\217\241\303\031V\306?\212\217\241\303\031V\306?\212\217\241\303\031V\306?\\\327\"\236\207?\306?\212\217\241\303\031V\306?\212\217\241\303\031V\306?\212\217\241\303\031V\306?\\\327\"\236\207?\306?\212\217\241\303\031V\306?\212\217\241\303\031V\306?\212\217\241\303\031V\306?\\\327\"\236\207?\306?\212\217\241\303\031V\306?\212\217\241\303\031V\306?\212\217\241\303\031V\306?\\\327\"\236\207?\306?\212\217\241\303\031V\306?\212\217\241\303\031V\306?\212\217\241\303\031V\306?\\\327\"\236\207?\306?\212\217\241\303\031V\306?\212\217\241\303\031V\306?\212\217\241\303\031V\306?\\\327\"\236\207?\306?\212\217\241\303\031V\306?\212\217\241\303\031V\306?\212\217\241\303\
031V\306?\\\327\"\236\207?\306?\212\217\241\303\031V\306?\212\217\241\303\031V\306?\212\217\241\303\031V\306?\\\327\"\236\207?\306?\212\217\241\303\031V\306?\212\217\241\303\031V\306?\212\217\241\303\031V\306?\\\327\"\236\207?\306?\212\217\241\303\031V\306?\212\217\241\303\031V\306?\212\217\241\303\031V\306?\\\327\"\236\207?\306?\212\217\241\303\031V\306?\212\217\241\303\031V\306?\212\217\241\303\031V\306?\\\327\"\236\207?\306?\212\217\241\303\031V\306?\212\217\241\303\031V\306?\212\217\241\303\031V\306?\\\327\"\236\207?\306?\212\217\241\303\031V\306?\212\217\241\303\031V\306?\212\217\241\303\031V\306?\\\327\"\236\207?\306?\212\217\241\303\031V\306?\212\217\241\303\031V\306?\212\217\241\303\031V\306?Tg\010\230S!\320?6\257\362\316[\226\307?6\257\362\316[\226\307?6\257\362\316[\226\307?Tg\010\230S!\320?6\257\362\316[\226\307?6\257\362\316[\226\307?6\257\362\316[\226\307?Tg\010\230S!\320?6\257\362\316[\226\307?6\257\362\316[\226\307?6\257\362\316[\226\307?Tg\010\230S!\320?6\257\362\316[\226\307?6\257\362\316[\226\307?6\257\362\316[\226\307?Tg\010\230S!\320?6\257\362\316[\226\307?6\257\362\316[\226\307?6\257\362\316[\226\307?Tg\010\230S!\320?6\257\362\316[\226\307?6\257\362\316[\226\307?6\257\362\316[\226\307?Tg\010\230S!\320?6\257\362\316[\226\307?6\257\362\316[\226\307?6\257\362\316[\226\307?Tg\010\230S!\320?6\257\362\316[\226\307?6\257\362\316[\226\307?6\257\362\316[\226\307?Tg\010\230S!\320?6\257\362\316[\226\307?6\257\362\316[\226\307?6\257\362\316[\226\307?Tg\010\230S!\320?6\257\362\316[\226\307?6\257\362\316[\226\307?6\257\362\316[\226\307?Tg\010\230S!\320?6\257\362\316[\226\307?6\257\362\316[\226\307?6\257\362\316[\226\307?Tg\010\230S!\320?6\257\362\316[\226\307?6\257\362\316[\226\307?6\257\362\316[\226\307?Tg\010\230S!\320?6\257\362\316[\226\307?6\257\362\316[\226\307?6\257\362\316[\226\307?Tg\010\230S!\320?6\257\362\316[\226\307?6\257\362\316[\226\307?6\257\362\316[\226\307?Tg\010\230S!\320?6\257\362\316[\226\307?6\257\362\316[\226\307?6\257\362\316[\226\307?T
g\010\230S!\320?6\257\362\316[\226\307?6\257\362\316[\226\307?6\257\362\316[\226\307?Tg\010\230S!\320?6\257\362\316[\226\307?6\257\362\316[\226\307?6\257\362\316[\226\307?Tg\010\230S!\320?6\257\362\316[\226\307?6\257\362\316[\226\307?6\257\362\316[\226\307?Tg\010\230S!\320?6\257\362\316[\226\307?6\257\362\316[\226\307?6\257\362\316[\226\307?Tg\010\230S!\320?6\257\362\316[\226\307?6\257\362\316[\226\307?6\257\362\316[\226\307?Tg\010\230S!\320?6\257\362\316[\226\307?6\257\362\316[\226\307?6\257\362\316[\226\307?Tg\010\230S!\320?6\257\362\316[\226\307?6\257\362\316[\226\307?6\257\362\316[\226\307?Tg\010\230S!\320?6\257\362\316[\226\307?6\257\362\316[\226\307?6\257\362\316[\226\307?Tg\010\230S!\320?6\257\362\316[\226\307?6\257\362\316[\226\307?6\257\362\316[\226\307?Tg\010\230S!\320?6\257\362\316[\226\307?6\257\362\316[\226\307?6\257\362\316[\226\307?Tg\010\230S!\320?6\257\362\316[\226\307?6\257\362\316[\226\307?6\257\362\316[\226\307?Tg\010\230S!\320?6\257\362\316[\226\307?6\257\362\316[\226\307?6\257\362\316[\226\307?<\232Cu\314]\313?\347j\370\021B\373\304?\347j\370\021B\373\304?\347j\370\021B\373\304?<\232Cu\314]\313?\347j\370\021B\373\304?\347j\370\021B\373\304?\347j\370\021B\373\304?<\232Cu\314]\313?\347j\370\021B\373\304?\347j\370\021B\373\304?\347j\370\021B\373\304?<\232Cu\314]\313?\347j\370\021B\373\304?\347j\370\021B\373\304?\347j\370\021B\373\304?<\232Cu\314]\313?\347j\370\021B\373\304?\347j\370\021B\373\304?\347j\370\021B\373\304?<\232Cu\314]\313?\347j\370\021B\373\304?\347j\370\021B\373\304?\347j\370\021B\373\304?<\232Cu\314]\313?\347j\370\021B\373\304?\347j\370\021B\373\304?\347j\370\021B\373\304?<\232Cu\314]\313?\347j\370\021B\373\304?\347j\370\021B\373\304?\347j\370\021B\373\304?<\232Cu\314]\313?\347j\370\021B\373\304?\347j\370\021B\373\304?\347j\370\021B\373\304?<\232Cu\314]\313?\347j\370\021B\373\304?\347j\370\021B\373\304?\347j\370\021B\373\304?<\232Cu\314]\313?\347j\370\021B\373\304?\347j\370\021B\373\304?\347j\370\021B\373\304?<\232Cu\314]\313?\347j\
370\021B\373\304?\347j\370\021B\373\304?\347j\370\021B\373\304?<\232Cu\314]\313?\347j\370\021B\373\304?\347j\370\021B\373\304?\347j\370\021B\373\304?<\232Cu\314]\313?\347j\370\021B\373\304?\347j\370\021B\373\304?\347j\370\021B\373\304?<\232Cu\314]\313?\347j\370\021B\373\304?\347j\370\021B\373\304?\347j\370\021B\373\304?<\232Cu\314]\313?\347j\370\021B\373\304?\347j\370\021B\373\304?\347j\370\021B\373\304?<\232Cu\314]\313?\347j\370\021B\373\304?\347j\370\021B\373\304?\347j\370\021B\373\304?<\232Cu\314]\313?\347j\370\021B\373\304?\347j\370\021B\373\304?\347j\370\021B\373\304?<\232Cu\314]\313?\347j\370\021B\373\304?\347j\370\021B\373\304?\347j\370\021B\373\304?<\232Cu\314]\313?\347j\370\021B\373\304?\347j\370\021B\373\304?\347j\370\021B\373\304?<\232Cu\314]\313?\347j\370\021B\373\304?\347j\370\021B\373\304?\347j\370\021B\373\304?<\232Cu\314]\313?\347j\370\021B\373\304?\347j\370\021B\373\304?\347j\370\021B\373\304?<\232Cu\314]\313?\347j\370\021B\373\304?\347j\370\021B\373\304?\347j\370\021B\373\304?<\232Cu\314]\313?\347j\370\021B\373\304?\347j\370\021B\373\304?\347j\370\021B\373\304?<\232Cu\314]\313?\347j\370\021B\373\304?\347j\370\021B\373\304?\347j\370\021B\373\304?<\232Cu\314]\313?\347j\370\021B\373\304?\347j\370\021B\373\304?\347j\370\021B\373\304?<\232Cu\314]\313?\347j\370\021B\373\304?\347j\370\021B\373\304?\347j\370\021B\373\304?"
+      }
+    }
+  }
+}
+node {
+  name: "descrpt_attr_qmmm/t_std/read"
+  op: "Identity"
+  input: "descrpt_attr_qmmm/t_std"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@descrpt_attr_qmmm/t_std"
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_50/stack"
+  op: "Const"
+  input: "^descrpt_attr_qmmm/original_sel"
+  input: "^descrpt_attr_qmmm/sel"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_50/stack_1"
+  op: "Const"
+  input: "^descrpt_attr_qmmm/original_sel"
+  input: "^descrpt_attr_qmmm/sel"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_50/stack_2"
+  op: "Const"
+  input: "^descrpt_attr_qmmm/original_sel"
+  input: "^descrpt_attr_qmmm/sel"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_50"
+  op: "StridedSlice"
+  input: "DprcPairwiseIdx:5"
+  input: "strided_slice_50/stack"
+  input: "strided_slice_50/stack_1"
+  input: "strided_slice_50/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "mul_15/y"
+  op: "Const"
+  input: "^descrpt_attr_qmmm/original_sel"
+  input: "^descrpt_attr_qmmm/sel"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 3
+      }
+    }
+  }
+}
+node {
+  name: "mul_15"
+  op: "Mul"
+  input: "strided_slice_50"
+  input: "mul_15/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "Reshape_50/shape/0"
+  op: "Const"
+  input: "^descrpt_attr_qmmm/original_sel"
+  input: "^descrpt_attr_qmmm/sel"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "Reshape_50/shape"
+  op: "Pack"
+  input: "Reshape_50/shape/0"
+  input: "mul_15"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "Reshape_50"
+  op: "Reshape"
+  input: "Reshape_48"
+  input: "Reshape_50/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "Reshape_51/shape"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\377\377\377\377\t\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "Reshape_51"
+  op: "Reshape"
+  input: "GatherV2_6"
+  input: "Reshape_51/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "strided_slice_51/stack"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_51/stack_1"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_51/stack_2"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_51"
+  op: "StridedSlice"
+  input: "DprcPairwiseIdx:5"
+  input: "strided_slice_51/stack"
+  input: "strided_slice_51/stack_1"
+  input: "strided_slice_51/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "Reshape_52/shape/0"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "Reshape_52/shape"
+  op: "Pack"
+  input: "Reshape_52/shape/0"
+  input: "strided_slice_51"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "Reshape_52"
+  op: "Reshape"
+  input: "Reshape_49"
+  input: "Reshape_52/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "ProdEnvMatAMix_1/mesh"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "ProdEnvMatAMix_1"
+  op: "ProdEnvMatAMix"
+  input: "Reshape_50"
+  input: "Reshape_52"
+  input: "DprcPairwiseIdx:5"
+  input: "Reshape_51"
+  input: "ProdEnvMatAMix_1/mesh"
+  input: "descrpt_attr_qmmm/t_avg/read"
+  input: "descrpt_attr_qmmm/t_std/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "rcut_a"
+    value {
+      f: -1.0
+    }
+  }
+  attr {
+    key: "rcut_r"
+    value {
+      f: 6.0
+    }
+  }
+  attr {
+    key: "rcut_r_smth"
+    value {
+      f: 0.5
+    }
+  }
+  attr {
+    key: "sel_a"
+    value {
+      list {
+        i: 27
+      }
+    }
+  }
+  attr {
+    key: "sel_r"
+    value {
+      list {
+        i: 0
+      }
+    }
+  }
+}
+node {
+  name: "Reshape_53/shape"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "Reshape_53"
+  op: "Reshape"
+  input: "ProdEnvMatAMix_1:4"
+  input: "Reshape_53/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "Reshape_55/shape"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\377\377\377\377l\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "Reshape_55"
+  op: "Reshape"
+  input: "ProdEnvMatAMix_1"
+  input: "Reshape_55/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "clip_by_value_3/Minimum/y"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 5
+      }
+    }
+  }
+}
+node {
+  name: "clip_by_value_3/Minimum"
+  op: "Minimum"
+  input: "Reshape_52"
+  input: "clip_by_value_3/Minimum/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "clip_by_value_3/y"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "clip_by_value_3"
+  op: "Maximum"
+  input: "clip_by_value_3/Minimum"
+  input: "clip_by_value_3/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "strided_slice_52/stack"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_52/stack_1"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_52/stack_2"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_52"
+  op: "StridedSlice"
+  input: "DprcPairwiseIdx:5"
+  input: "strided_slice_52/stack"
+  input: "strided_slice_52/stack_1"
+  input: "strided_slice_52/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "Slice_12/begin"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\000\000\000\000\000\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "Slice_12/size/0"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "Slice_12/size"
+  op: "Pack"
+  input: "Slice_12/size/0"
+  input: "strided_slice_52"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "Slice_12"
+  op: "Slice"
+  input: "clip_by_value_3"
+  input: "Slice_12/begin"
+  input: "Slice_12/size"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "Reshape_56/shape"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "Reshape_56"
+  op: "Reshape"
+  input: "Slice_12"
+  input: "Reshape_56/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "o_rmat_qmmm"
+  op: "Identity"
+  input: "Reshape_55"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+}
+node {
+  name: "o_rmat_deriv_qmmm"
+  op: "Identity"
+  input: "ProdEnvMatAMix_1:1"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+}
+node {
+  name: "o_rij_qmmm"
+  op: "Identity"
+  input: "ProdEnvMatAMix_1:2"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+}
+node {
+  name: "o_nlist_qmmm"
+  op: "Identity"
+  input: "ProdEnvMatAMix_1:3"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "Reshape_57/shape"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 3
+          }
+        }
+        tensor_content: "\006\000\000\000\377\377\377\377\004\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "Reshape_57"
+  op: "Reshape"
+  input: "descrpt_attr_qmmm/t_avg/read"
+  input: "Reshape_57/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "Slice_13/begin"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 3
+          }
+        }
+        tensor_content: "\000\000\000\000\000\000\000\000\000\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "Slice_13/size"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 3
+          }
+        }
+        tensor_content: "\377\377\377\377\001\000\000\000\001\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "Slice_13"
+  op: "Slice"
+  input: "Reshape_57"
+  input: "Slice_13/begin"
+  input: "Slice_13/size"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+}
+node {
+  name: "Reshape_58/shape"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\006\000\000\000\001\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "Reshape_58"
+  op: "Reshape"
+  input: "Slice_13"
+  input: "Reshape_58/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "Reshape_59/shape"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 3
+          }
+        }
+        tensor_content: "\006\000\000\000\377\377\377\377\004\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "Reshape_59"
+  op: "Reshape"
+  input: "descrpt_attr_qmmm/t_std/read"
+  input: "Reshape_59/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "Slice_14/begin"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 3
+          }
+        }
+        tensor_content: "\000\000\000\000\000\000\000\000\000\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "Slice_14/size"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 3
+          }
+        }
+        tensor_content: "\377\377\377\377\001\000\000\000\001\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "Slice_14"
+  op: "Slice"
+  input: "Reshape_59"
+  input: "Slice_14/begin"
+  input: "Slice_14/size"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+}
+node {
+  name: "Reshape_60/shape"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\006\000\000\000\001\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "Reshape_60"
+  op: "Reshape"
+  input: "Slice_14"
+  input: "Reshape_60/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "embedding_lookup_4/axis"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@Reshape_58"
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "embedding_lookup_4"
+  op: "GatherV2"
+  input: "Reshape_58"
+  input: "Reshape_56"
+  input: "embedding_lookup_4/axis"
+  attr {
+    key: "Taxis"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tindices"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tparams"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@Reshape_58"
+      }
+    }
+  }
+  attr {
+    key: "batch_dims"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "embedding_lookup_4/Identity"
+  op: "Identity"
+  input: "embedding_lookup_4"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+}
+node {
+  name: "strided_slice_53/stack"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_53/stack_1"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_53/stack_2"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_53"
+  op: "StridedSlice"
+  input: "DprcPairwiseIdx:5"
+  input: "strided_slice_53/stack"
+  input: "strided_slice_53/stack_1"
+  input: "strided_slice_53/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "Reshape_61/shape/0"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "Reshape_61/shape/2"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "Reshape_61/shape"
+  op: "Pack"
+  input: "Reshape_61/shape/0"
+  input: "strided_slice_53"
+  input: "Reshape_61/shape/2"
+  attr {
+    key: "N"
+    value {
+      i: 3
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "Reshape_61"
+  op: "Reshape"
+  input: "embedding_lookup_4/Identity"
+  input: "Reshape_61/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "embedding_lookup_5/axis"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@Reshape_60"
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "embedding_lookup_5"
+  op: "GatherV2"
+  input: "Reshape_60"
+  input: "Reshape_56"
+  input: "embedding_lookup_5/axis"
+  attr {
+    key: "Taxis"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tindices"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tparams"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@Reshape_60"
+      }
+    }
+  }
+  attr {
+    key: "batch_dims"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "embedding_lookup_5/Identity"
+  op: "Identity"
+  input: "embedding_lookup_5"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+}
+node {
+  name: "strided_slice_54/stack"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_54/stack_1"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_54/stack_2"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_54"
+  op: "StridedSlice"
+  input: "DprcPairwiseIdx:5"
+  input: "strided_slice_54/stack"
+  input: "strided_slice_54/stack_1"
+  input: "strided_slice_54/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "Reshape_62/shape/0"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "Reshape_62/shape/2"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "Reshape_62/shape"
+  op: "Pack"
+  input: "Reshape_62/shape/0"
+  input: "strided_slice_54"
+  input: "Reshape_62/shape/2"
+  attr {
+    key: "N"
+    value {
+      i: 3
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "Reshape_62"
+  op: "Reshape"
+  input: "embedding_lookup_5/Identity"
+  input: "Reshape_62/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "Reshape_63/shape"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\377\377\377\377\004\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "Reshape_63"
+  op: "Reshape"
+  input: "ProdEnvMatAMix_1"
+  input: "Reshape_63/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "Slice_15/begin"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\000\000\000\000\000\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "Slice_15/size"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\377\377\377\377\001\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "Slice_15"
+  op: "Slice"
+  input: "Reshape_63"
+  input: "Slice_15/begin"
+  input: "Slice_15/size"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+}
+node {
+  name: "strided_slice_55/stack"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_55/stack_1"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_55/stack_2"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_55"
+  op: "StridedSlice"
+  input: "DprcPairwiseIdx:5"
+  input: "strided_slice_55/stack"
+  input: "strided_slice_55/stack_1"
+  input: "strided_slice_55/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "Reshape_64/shape/0"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "Reshape_64/shape/2"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 27
+      }
+    }
+  }
+}
+node {
+  name: "Reshape_64/shape"
+  op: "Pack"
+  input: "Reshape_64/shape/0"
+  input: "strided_slice_55"
+  input: "Reshape_64/shape/2"
+  attr {
+    key: "N"
+    value {
+      i: 3
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "Reshape_64"
+  op: "Reshape"
+  input: "Slice_15"
+  input: "Reshape_64/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "mul_17"
+  op: "Mul"
+  input: "Reshape_64"
+  input: "Reshape_62"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+}
+node {
+  name: "add_7"
+  op: "AddV2"
+  input: "mul_17"
+  input: "Reshape_61"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+}
+node {
+  name: "mul_18/x"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_DOUBLE
+        tensor_shape {
+        }
+        double_val: 0.5
+      }
+    }
+  }
+}
+node {
+  name: "mul_18"
+  op: "Mul"
+  input: "mul_18/x"
+  input: "add_7"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+}
+node {
+  name: "sub_4/x"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_DOUBLE
+        tensor_shape {
+        }
+        double_val: 1.0
+      }
+    }
+  }
+}
+node {
+  name: "sub_4"
+  op: "Sub"
+  input: "sub_4/x"
+  input: "mul_18"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+}
+node {
+  name: "Neg_1"
+  op: "Neg"
+  input: "sub_4"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+}
+node {
+  name: "mul_19"
+  op: "Mul"
+  input: "Neg_1"
+  input: "sub_4"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+}
+node {
+  name: "mul_20"
+  op: "Mul"
+  input: "mul_19"
+  input: "sub_4"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+}
+node {
+  name: "add_8/y"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_DOUBLE
+        tensor_shape {
+        }
+        double_val: 1.0
+      }
+    }
+  }
+}
+node {
+  name: "add_8"
+  op: "AddV2"
+  input: "mul_20"
+  input: "add_8/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+}
+node {
+  name: "clip_by_value_4/Minimum/y"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_DOUBLE
+        tensor_shape {
+        }
+        double_val: 1.0
+      }
+    }
+  }
+}
+node {
+  name: "clip_by_value_4/Minimum"
+  op: "Minimum"
+  input: "add_8"
+  input: "clip_by_value_4/Minimum/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+}
+node {
+  name: "clip_by_value_4/y"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_DOUBLE
+        tensor_shape {
+        }
+        double_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "clip_by_value_4"
+  op: "Maximum"
+  input: "clip_by_value_4/Minimum"
+  input: "clip_by_value_4/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+}
+node {
+  name: "Cast_16"
+  op: "Cast"
+  input: "clip_by_value_4"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "Truncate"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "strided_slice_56/stack"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_56/stack_1"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_56/stack_2"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_56"
+  op: "StridedSlice"
+  input: "DprcPairwiseIdx:5"
+  input: "strided_slice_56/stack"
+  input: "strided_slice_56/stack_1"
+  input: "strided_slice_56/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "Reshape_65/shape/0"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "Reshape_65/shape/2"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 108
+      }
+    }
+  }
+}
+node {
+  name: "Reshape_65/shape"
+  op: "Pack"
+  input: "Reshape_65/shape/0"
+  input: "strided_slice_56"
+  input: "Reshape_65/shape/2"
+  attr {
+    key: "N"
+    value {
+      i: 3
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "Reshape_65"
+  op: "Reshape"
+  input: "o_rmat_qmmm"
+  input: "Reshape_65/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "Reshape_66/shape"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\377\377\377\377l\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "Reshape_66"
+  op: "Reshape"
+  input: "Reshape_65"
+  input: "Reshape_66/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "Shape_18"
+  op: "Shape"
+  input: "Reshape_66"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+  experimental_type {
+    type_id: TFT_PRODUCT
+    args {
+      type_id: TFT_SHAPE_TENSOR
+      args {
+        type_id: TFT_INT32
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_57/stack"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_57/stack_1"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_57/stack_2"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_57"
+  op: "StridedSlice"
+  input: "Shape_18"
+  input: "strided_slice_57/stack"
+  input: "strided_slice_57/stack_1"
+  input: "strided_slice_57/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "Const_1"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_DOUBLE
+        tensor_shape {
+          dim {
+            size: 6
+          }
+          dim {
+            size: 7
+          }
+        }
+        tensor_content: "\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\360?\000\000\000\000\000\000\360?\000\000\000\000\000\000\360?\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\360?\000\000\000\000\000\000\360?\000\000\000\000\000\000\360?\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\360?\000\000\000\000\000\000\360?\000\000\000\000\000\000\360?\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\360?\000\000\000\000\000\000\360?\000\000\000\000\000\000\360?\000\000\000\000\000\000\360?\000\000\000\000\000\000\360?\000\000\000\000\000\000\360?\000\000\000\000\000\000\360?\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\360?\000\000\000\000\000\000\360?\000\000\000\000\000\000\360?\000\000\000\000\000\000\360?\000\000\000\000\000\000\360?\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\360?"
+      }
+    }
+  }
+}
+node {
+  name: "Reshape_67/shape"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "Reshape_67"
+  op: "Reshape"
+  input: "Const_1"
+  input: "Reshape_67/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "Reshape_68/shape"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\377\377\377\377\001\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "Reshape_68"
+  op: "Reshape"
+  input: "Reshape_56"
+  input: "Reshape_68/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "mul_21/y"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 7
+      }
+    }
+  }
+}
+node {
+  name: "mul_21"
+  op: "Mul"
+  input: "Reshape_68"
+  input: "mul_21/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "Tile/multiples"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\001\000\000\000l\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "Tile"
+  op: "Tile"
+  input: "mul_21"
+  input: "Tile/multiples"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tmultiples"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "Reshape_69/shape/1"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 27
+      }
+    }
+  }
+}
+node {
+  name: "Reshape_69/shape/2"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "Reshape_69/shape"
+  op: "Pack"
+  input: "strided_slice_57"
+  input: "Reshape_69/shape/1"
+  input: "Reshape_69/shape/2"
+  attr {
+    key: "N"
+    value {
+      i: 3
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "Reshape_69"
+  op: "Reshape"
+  input: "Reshape_53"
+  input: "Reshape_69/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "Tile_1/multiples"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 3
+          }
+        }
+        tensor_content: "\001\000\000\000\001\000\000\000\004\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "Tile_1"
+  op: "Tile"
+  input: "Reshape_69"
+  input: "Tile_1/multiples"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tmultiples"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "Reshape_70/shape/1"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 108
+      }
+    }
+  }
+}
+node {
+  name: "Reshape_70/shape"
+  op: "Pack"
+  input: "strided_slice_57"
+  input: "Reshape_70/shape/1"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "Reshape_70"
+  op: "Reshape"
+  input: "Tile_1"
+  input: "Reshape_70/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "add_9"
+  op: "AddV2"
+  input: "Tile"
+  input: "Reshape_70"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "Reshape_71/shape"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "Reshape_71"
+  op: "Reshape"
+  input: "add_9"
+  input: "Reshape_71/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "embedding_lookup_6/axis"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@Reshape_67"
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "embedding_lookup_6"
+  op: "GatherV2"
+  input: "Reshape_67"
+  input: "Reshape_71"
+  input: "embedding_lookup_6/axis"
+  attr {
+    key: "Taxis"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tindices"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tparams"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@Reshape_67"
+      }
+    }
+  }
+  attr {
+    key: "batch_dims"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "embedding_lookup_6/Identity"
+  op: "Identity"
+  input: "embedding_lookup_6"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+}
+node {
+  name: "Reshape_72/shape"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\377\377\377\377l\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "Reshape_72"
+  op: "Reshape"
+  input: "embedding_lookup_6/Identity"
+  input: "Reshape_72/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "mul_22"
+  op: "Mul"
+  input: "Reshape_66"
+  input: "Reshape_72"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+}
+node {
+  name: "Cast_17"
+  op: "Cast"
+  input: "mul_22"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "Truncate"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "Slice_16/begin"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\000\000\000\000\000\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "Slice_16/size"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\377\377\377\377l\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "Slice_16"
+  op: "Slice"
+  input: "Cast_17"
+  input: "Slice_16/begin"
+  input: "Slice_16/size"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+}
+node {
+  name: "Shape_20"
+  op: "Shape"
+  input: "Slice_16"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+  experimental_type {
+    type_id: TFT_PRODUCT
+    args {
+      type_id: TFT_SHAPE_TENSOR
+      args {
+        type_id: TFT_INT32
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_60/stack"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_60/stack_1"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_60/stack_2"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_60"
+  op: "StridedSlice"
+  input: "Shape_20"
+  input: "strided_slice_60/stack"
+  input: "strided_slice_60/stack_1"
+  input: "strided_slice_60/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "Reshape_74/shape"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\377\377\377\377\004\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "Reshape_74"
+  op: "Reshape"
+  input: "Slice_16"
+  input: "Reshape_74/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "Slice_17/begin"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\000\000\000\000\000\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "Slice_17/size"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\377\377\377\377\001\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "Slice_17"
+  op: "Slice"
+  input: "Reshape_74"
+  input: "Slice_17/begin"
+  input: "Slice_17/size"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+}
+node {
+  name: "Reshape_75/shape"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\377\377\377\377\001\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "Reshape_75"
+  op: "Reshape"
+  input: "Slice_17"
+  input: "Reshape_75/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "filter_type_all_qmmm/matrix_1"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\025,/>\337\222\224\274"
+      }
+    }
+  }
+}
+node {
+  name: "filter_type_all_qmmm/matrix_1/read"
+  op: "Identity"
+  input: "filter_type_all_qmmm/matrix_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@filter_type_all_qmmm/matrix_1"
+      }
+    }
+  }
+}
+node {
+  name: "filter_type_all_qmmm/bias_1"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "k\313\226\277\270\2765\277"
+      }
+    }
+  }
+}
+node {
+  name: "filter_type_all_qmmm/bias_1/read"
+  op: "Identity"
+  input: "filter_type_all_qmmm/bias_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@filter_type_all_qmmm/bias_1"
+      }
+    }
+  }
+}
+node {
+  name: "filter_type_all_qmmm/MatMul"
+  op: "MatMul"
+  input: "Reshape_75"
+  input: "filter_type_all_qmmm/matrix_1/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "filter_type_all_qmmm/BiasAdd"
+  op: "BiasAdd"
+  input: "filter_type_all_qmmm/MatMul"
+  input: "filter_type_all_qmmm/bias_1/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+}
+node {
+  name: "filter_type_all_qmmm/Tanh"
+  op: "Tanh"
+  input: "filter_type_all_qmmm/BiasAdd"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+}
+node {
+  name: "filter_type_all_qmmm/Reshape/shape"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\377\377\377\377\002\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "filter_type_all_qmmm/Reshape"
+  op: "Reshape"
+  input: "filter_type_all_qmmm/Tanh"
+  input: "filter_type_all_qmmm/Reshape/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "filter_type_all_qmmm/concat/axis"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "filter_type_all_qmmm/concat"
+  op: "ConcatV2"
+  input: "Reshape_75"
+  input: "Reshape_75"
+  input: "filter_type_all_qmmm/concat/axis"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "filter_type_all_qmmm/add"
+  op: "AddV2"
+  input: "filter_type_all_qmmm/concat"
+  input: "filter_type_all_qmmm/Reshape"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+}
+node {
+  name: "filter_type_all_qmmm/matrix_2"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 2
+          }
+          dim {
+            size: 4
+          }
+        }
+        tensor_content: "\364\000\235\274\271\244\373\276C\345\363>\317\257\364>\336\315\005\276\263&\237\275\305\332\024\277p7\310>"
+      }
+    }
+  }
+}
+node {
+  name: "filter_type_all_qmmm/matrix_2/read"
+  op: "Identity"
+  input: "filter_type_all_qmmm/matrix_2"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@filter_type_all_qmmm/matrix_2"
+      }
+    }
+  }
+}
+node {
+  name: "filter_type_all_qmmm/bias_2"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 4
+          }
+        }
+        tensor_content: "J\223\376\276n\333\375>\306A\010?\347\234\223\276"
+      }
+    }
+  }
+}
+node {
+  name: "filter_type_all_qmmm/bias_2/read"
+  op: "Identity"
+  input: "filter_type_all_qmmm/bias_2"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@filter_type_all_qmmm/bias_2"
+      }
+    }
+  }
+}
+node {
+  name: "filter_type_all_qmmm/MatMul_1"
+  op: "MatMul"
+  input: "filter_type_all_qmmm/add"
+  input: "filter_type_all_qmmm/matrix_2/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "filter_type_all_qmmm/BiasAdd_1"
+  op: "BiasAdd"
+  input: "filter_type_all_qmmm/MatMul_1"
+  input: "filter_type_all_qmmm/bias_2/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+}
+node {
+  name: "filter_type_all_qmmm/Tanh_1"
+  op: "Tanh"
+  input: "filter_type_all_qmmm/BiasAdd_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+}
+node {
+  name: "filter_type_all_qmmm/Reshape_1/shape"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\377\377\377\377\004\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "filter_type_all_qmmm/Reshape_1"
+  op: "Reshape"
+  input: "filter_type_all_qmmm/Tanh_1"
+  input: "filter_type_all_qmmm/Reshape_1/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "filter_type_all_qmmm/concat_1/axis"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "filter_type_all_qmmm/concat_1"
+  op: "ConcatV2"
+  input: "filter_type_all_qmmm/add"
+  input: "filter_type_all_qmmm/add"
+  input: "filter_type_all_qmmm/concat_1/axis"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "filter_type_all_qmmm/add_1"
+  op: "AddV2"
+  input: "filter_type_all_qmmm/concat_1"
+  input: "filter_type_all_qmmm/Reshape_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+}
+node {
+  name: "filter_type_all_qmmm/matrix_3"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 4
+          }
+          dim {
+            size: 8
+          }
+        }
+        tensor_content: "\037\356\361>k\024\224>\322\335\271>\220\016\005>\306\231\244>\014\352\257\276\340G\335\275\225\242\202\276rC\"\274\204a/\276\010P\260\275F<\023\277D\025\365=`\217\302>3\000+\276\251\307\005?C\235\030\276\376v\365=\330\226\331\276\261\003\230>\206\356H>\324\306\340\274\361L\224\276W2B?&\214K\276v\251A\276\350a\213>\006\202\177>o\222U\276$m\230\276\313\335\300>h\255\243="
+      }
+    }
+  }
+}
+node {
+  name: "filter_type_all_qmmm/matrix_3/read"
+  op: "Identity"
+  input: "filter_type_all_qmmm/matrix_3"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@filter_type_all_qmmm/matrix_3"
+      }
+    }
+  }
+}
+node {
+  name: "filter_type_all_qmmm/bias_3"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 8
+          }
+        }
+        tensor_content: "g\264\263?\272E\330\277\244iX\276`8\223?\302\227\301?Q]\333>\000\341,\300m\030\204="
+      }
+    }
+  }
+}
+node {
+  name: "filter_type_all_qmmm/bias_3/read"
+  op: "Identity"
+  input: "filter_type_all_qmmm/bias_3"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@filter_type_all_qmmm/bias_3"
+      }
+    }
+  }
+}
+node {
+  name: "filter_type_all_qmmm/MatMul_2"
+  op: "MatMul"
+  input: "filter_type_all_qmmm/add_1"
+  input: "filter_type_all_qmmm/matrix_3/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "filter_type_all_qmmm/BiasAdd_2"
+  op: "BiasAdd"
+  input: "filter_type_all_qmmm/MatMul_2"
+  input: "filter_type_all_qmmm/bias_3/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+}
+node {
+  name: "filter_type_all_qmmm/Tanh_2"
+  op: "Tanh"
+  input: "filter_type_all_qmmm/BiasAdd_2"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+}
+node {
+  name: "filter_type_all_qmmm/Reshape_2/shape"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\377\377\377\377\010\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "filter_type_all_qmmm/Reshape_2"
+  op: "Reshape"
+  input: "filter_type_all_qmmm/Tanh_2"
+  input: "filter_type_all_qmmm/Reshape_2/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "filter_type_all_qmmm/concat_2/axis"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "filter_type_all_qmmm/concat_2"
+  op: "ConcatV2"
+  input: "filter_type_all_qmmm/add_1"
+  input: "filter_type_all_qmmm/add_1"
+  input: "filter_type_all_qmmm/concat_2/axis"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "filter_type_all_qmmm/add_2"
+  op: "AddV2"
+  input: "filter_type_all_qmmm/concat_2"
+  input: "filter_type_all_qmmm/Reshape_2"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+}
+node {
+  name: "filter_type_all_qmmm/Reshape_3/shape"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\377\377\377\377\001\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "filter_type_all_qmmm/Reshape_3"
+  op: "Reshape"
+  input: "Reshape_56"
+  input: "filter_type_all_qmmm/Reshape_3/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "filter_type_all_qmmm/mul/y"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 7
+      }
+    }
+  }
+}
+node {
+  name: "filter_type_all_qmmm/mul"
+  op: "Mul"
+  input: "filter_type_all_qmmm/Reshape_3"
+  input: "filter_type_all_qmmm/mul/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "filter_type_all_qmmm/Tile/multiples"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\001\000\000\000\033\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "filter_type_all_qmmm/Tile"
+  op: "Tile"
+  input: "filter_type_all_qmmm/mul"
+  input: "filter_type_all_qmmm/Tile/multiples"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tmultiples"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "filter_type_all_qmmm/Reshape_4/shape"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\377\377\377\377\033\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "filter_type_all_qmmm/Reshape_4"
+  op: "Reshape"
+  input: "Reshape_53"
+  input: "filter_type_all_qmmm/Reshape_4/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "filter_type_all_qmmm/add_3"
+  op: "AddV2"
+  input: "filter_type_all_qmmm/Tile"
+  input: "filter_type_all_qmmm/Reshape_4"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "filter_type_all_qmmm/Reshape_5/shape"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "filter_type_all_qmmm/Reshape_5"
+  op: "Reshape"
+  input: "filter_type_all_qmmm/add_3"
+  input: "filter_type_all_qmmm/Reshape_5/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "filter_type_all_qmmm/Reshape_6/shape"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 3
+          }
+        }
+        tensor_content: "\001\000\000\000\007\000\000\000\377\377\377\377"
+      }
+    }
+  }
+}
+node {
+  name: "filter_type_all_qmmm/Reshape_6"
+  op: "Reshape"
+  input: "t_typeebd"
+  input: "filter_type_all_qmmm/Reshape_6/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "filter_type_all_qmmm/Tile_1/multiples"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 3
+          }
+        }
+        tensor_content: "\007\000\000\000\001\000\000\000\001\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "filter_type_all_qmmm/Tile_1"
+  op: "Tile"
+  input: "filter_type_all_qmmm/Reshape_6"
+  input: "filter_type_all_qmmm/Tile_1/multiples"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tmultiples"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "filter_type_all_qmmm/Reshape_7/shape"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 3
+          }
+        }
+        tensor_content: "\007\000\000\000\001\000\000\000\377\377\377\377"
+      }
+    }
+  }
+}
+node {
+  name: "filter_type_all_qmmm/Reshape_7"
+  op: "Reshape"
+  input: "t_typeebd"
+  input: "filter_type_all_qmmm/Reshape_7/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "filter_type_all_qmmm/Tile_2/multiples"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 3
+          }
+        }
+        tensor_content: "\001\000\000\000\007\000\000\000\001\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "filter_type_all_qmmm/Tile_2"
+  op: "Tile"
+  input: "filter_type_all_qmmm/Reshape_7"
+  input: "filter_type_all_qmmm/Tile_2/multiples"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tmultiples"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "filter_type_all_qmmm/concat_3/axis"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "filter_type_all_qmmm/concat_3"
+  op: "ConcatV2"
+  input: "filter_type_all_qmmm/Tile_1"
+  input: "filter_type_all_qmmm/Tile_2"
+  input: "filter_type_all_qmmm/concat_3/axis"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "filter_type_all_qmmm/Reshape_8/shape"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\377\377\377\377\020\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "filter_type_all_qmmm/Reshape_8"
+  op: "Reshape"
+  input: "filter_type_all_qmmm/concat_3"
+  input: "filter_type_all_qmmm/Reshape_8/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "filter_type_all_qmmm/matrix_1_two_side_ebd"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 16
+          }
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\n\007\217=\257\236\362\273IGS\274\rS\021<,\305\032?\260\300\034\276J7%\276\352\343\322=\303\311\263>\302\243g\275+a@<\000\207\311=\365\234\337<@\356S>C\221\206=\343D\277\276\217\003\212>1e\200>\251l\210>\210\337\237>\361\327\376>\230lP\276\023D\336\276\361@\345=U\003o\276\'\017\037\276\326\303\000\276\002\374F>\255\256\211>\364D\014?\346\232%<\375\251\204;"
+      }
+    }
+  }
+}
+node {
+  name: "filter_type_all_qmmm/matrix_1_two_side_ebd/read"
+  op: "Identity"
+  input: "filter_type_all_qmmm/matrix_1_two_side_ebd"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@filter_type_all_qmmm/matrix_1_two_side_ebd"
+      }
+    }
+  }
+}
+node {
+  name: "filter_type_all_qmmm/bias_1_two_side_ebd"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "k\313\226\277\270\2765\277"
+      }
+    }
+  }
+}
+node {
+  name: "filter_type_all_qmmm/bias_1_two_side_ebd/read"
+  op: "Identity"
+  input: "filter_type_all_qmmm/bias_1_two_side_ebd"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@filter_type_all_qmmm/bias_1_two_side_ebd"
+      }
+    }
+  }
+}
+node {
+  name: "filter_type_all_qmmm/MatMul_3"
+  op: "MatMul"
+  input: "filter_type_all_qmmm/Reshape_8"
+  input: "filter_type_all_qmmm/matrix_1_two_side_ebd/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "filter_type_all_qmmm/BiasAdd_3"
+  op: "BiasAdd"
+  input: "filter_type_all_qmmm/MatMul_3"
+  input: "filter_type_all_qmmm/bias_1_two_side_ebd/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+}
+node {
+  name: "filter_type_all_qmmm/Tanh_3"
+  op: "Tanh"
+  input: "filter_type_all_qmmm/BiasAdd_3"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+}
+node {
+  name: "filter_type_all_qmmm/Reshape_9/shape"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\377\377\377\377\002\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "filter_type_all_qmmm/Reshape_9"
+  op: "Reshape"
+  input: "filter_type_all_qmmm/Tanh_3"
+  input: "filter_type_all_qmmm/Reshape_9/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "filter_type_all_qmmm/matrix_2_two_side_ebd"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 2
+          }
+          dim {
+            size: 4
+          }
+        }
+        tensor_content: "\364\000\235\274\271\244\373\276C\345\363>\317\257\364>\336\315\005\276\263&\237\275\305\332\024\277p7\310>"
+      }
+    }
+  }
+}
+node {
+  name: "filter_type_all_qmmm/matrix_2_two_side_ebd/read"
+  op: "Identity"
+  input: "filter_type_all_qmmm/matrix_2_two_side_ebd"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@filter_type_all_qmmm/matrix_2_two_side_ebd"
+      }
+    }
+  }
+}
+node {
+  name: "filter_type_all_qmmm/bias_2_two_side_ebd"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 4
+          }
+        }
+        tensor_content: "J\223\376\276n\333\375>\306A\010?\347\234\223\276"
+      }
+    }
+  }
+}
+node {
+  name: "filter_type_all_qmmm/bias_2_two_side_ebd/read"
+  op: "Identity"
+  input: "filter_type_all_qmmm/bias_2_two_side_ebd"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@filter_type_all_qmmm/bias_2_two_side_ebd"
+      }
+    }
+  }
+}
+node {
+  name: "filter_type_all_qmmm/MatMul_4"
+  op: "MatMul"
+  input: "filter_type_all_qmmm/Reshape_9"
+  input: "filter_type_all_qmmm/matrix_2_two_side_ebd/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "filter_type_all_qmmm/BiasAdd_4"
+  op: "BiasAdd"
+  input: "filter_type_all_qmmm/MatMul_4"
+  input: "filter_type_all_qmmm/bias_2_two_side_ebd/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+}
+node {
+  name: "filter_type_all_qmmm/Tanh_4"
+  op: "Tanh"
+  input: "filter_type_all_qmmm/BiasAdd_4"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+}
+node {
+  name: "filter_type_all_qmmm/Reshape_10/shape"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\377\377\377\377\004\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "filter_type_all_qmmm/Reshape_10"
+  op: "Reshape"
+  input: "filter_type_all_qmmm/Tanh_4"
+  input: "filter_type_all_qmmm/Reshape_10/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "filter_type_all_qmmm/concat_4/axis"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "filter_type_all_qmmm/concat_4"
+  op: "ConcatV2"
+  input: "filter_type_all_qmmm/Reshape_9"
+  input: "filter_type_all_qmmm/Reshape_9"
+  input: "filter_type_all_qmmm/concat_4/axis"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "filter_type_all_qmmm/add_4"
+  op: "AddV2"
+  input: "filter_type_all_qmmm/concat_4"
+  input: "filter_type_all_qmmm/Reshape_10"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+}
+node {
+  name: "filter_type_all_qmmm/matrix_3_two_side_ebd"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 4
+          }
+          dim {
+            size: 8
+          }
+        }
+        tensor_content: "\037\356\361>k\024\224>\322\335\271>\220\016\005>\306\231\244>\014\352\257\276\340G\335\275\225\242\202\276rC\"\274\204a/\276\010P\260\275F<\023\277D\025\365=`\217\302>3\000+\276\251\307\005?C\235\030\276\376v\365=\330\226\331\276\261\003\230>\206\356H>\324\306\340\274\361L\224\276W2B?&\214K\276v\251A\276\350a\213>\006\202\177>o\222U\276$m\230\276\313\335\300>h\255\243="
+      }
+    }
+  }
+}
+node {
+  name: "filter_type_all_qmmm/matrix_3_two_side_ebd/read"
+  op: "Identity"
+  input: "filter_type_all_qmmm/matrix_3_two_side_ebd"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@filter_type_all_qmmm/matrix_3_two_side_ebd"
+      }
+    }
+  }
+}
+node {
+  name: "filter_type_all_qmmm/bias_3_two_side_ebd"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 8
+          }
+        }
+        tensor_content: "g\264\263?\272E\330\277\244iX\276`8\223?\302\227\301?Q]\333>\000\341,\300m\030\204="
+      }
+    }
+  }
+}
+node {
+  name: "filter_type_all_qmmm/bias_3_two_side_ebd/read"
+  op: "Identity"
+  input: "filter_type_all_qmmm/bias_3_two_side_ebd"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@filter_type_all_qmmm/bias_3_two_side_ebd"
+      }
+    }
+  }
+}
+node {
+  name: "filter_type_all_qmmm/MatMul_5"
+  op: "MatMul"
+  input: "filter_type_all_qmmm/add_4"
+  input: "filter_type_all_qmmm/matrix_3_two_side_ebd/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "filter_type_all_qmmm/BiasAdd_5"
+  op: "BiasAdd"
+  input: "filter_type_all_qmmm/MatMul_5"
+  input: "filter_type_all_qmmm/bias_3_two_side_ebd/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+}
+node {
+  name: "filter_type_all_qmmm/Tanh_5"
+  op: "Tanh"
+  input: "filter_type_all_qmmm/BiasAdd_5"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+}
+node {
+  name: "filter_type_all_qmmm/Reshape_11/shape"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\377\377\377\377\010\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "filter_type_all_qmmm/Reshape_11"
+  op: "Reshape"
+  input: "filter_type_all_qmmm/Tanh_5"
+  input: "filter_type_all_qmmm/Reshape_11/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "filter_type_all_qmmm/concat_5/axis"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "filter_type_all_qmmm/concat_5"
+  op: "ConcatV2"
+  input: "filter_type_all_qmmm/add_4"
+  input: "filter_type_all_qmmm/add_4"
+  input: "filter_type_all_qmmm/concat_5/axis"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "filter_type_all_qmmm/add_5"
+  op: "AddV2"
+  input: "filter_type_all_qmmm/concat_5"
+  input: "filter_type_all_qmmm/Reshape_11"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+}
+node {
+  name: "filter_type_all_qmmm/embedding_lookup/axis"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@filter_type_all_qmmm/add_5"
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "filter_type_all_qmmm/embedding_lookup"
+  op: "GatherV2"
+  input: "filter_type_all_qmmm/add_5"
+  input: "filter_type_all_qmmm/Reshape_5"
+  input: "filter_type_all_qmmm/embedding_lookup/axis"
+  attr {
+    key: "Taxis"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tindices"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tparams"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@filter_type_all_qmmm/add_5"
+      }
+    }
+  }
+  attr {
+    key: "batch_dims"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "filter_type_all_qmmm/embedding_lookup/Identity"
+  op: "Identity"
+  input: "filter_type_all_qmmm/embedding_lookup"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+}
+node {
+  name: "filter_type_all_qmmm/Reshape_12/shape"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\377\377\377\377\001\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "filter_type_all_qmmm/Reshape_12"
+  op: "Reshape"
+  input: "Cast_16"
+  input: "filter_type_all_qmmm/Reshape_12/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "filter_type_all_qmmm/mul_1"
+  op: "Mul"
+  input: "filter_type_all_qmmm/embedding_lookup/Identity"
+  input: "filter_type_all_qmmm/Reshape_12"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+}
+node {
+  name: "filter_type_all_qmmm/mul_2"
+  op: "Mul"
+  input: "filter_type_all_qmmm/add_2"
+  input: "filter_type_all_qmmm/mul_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+}
+node {
+  name: "filter_type_all_qmmm/add_6"
+  op: "AddV2"
+  input: "filter_type_all_qmmm/mul_2"
+  input: "filter_type_all_qmmm/add_2"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+}
+node {
+  name: "Reshape_77/shape"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 3
+          }
+        }
+        tensor_content: "\377\377\377\377\033\000\000\000\010\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "Reshape_77"
+  op: "Reshape"
+  input: "filter_type_all_qmmm/add_6"
+  input: "Reshape_77/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "Reshape_78/shape/1"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 27
+      }
+    }
+  }
+}
+node {
+  name: "Reshape_78/shape/2"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 4
+      }
+    }
+  }
+}
+node {
+  name: "Reshape_78/shape"
+  op: "Pack"
+  input: "strided_slice_60"
+  input: "Reshape_78/shape/1"
+  input: "Reshape_78/shape/2"
+  attr {
+    key: "N"
+    value {
+      i: 3
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "Reshape_78"
+  op: "Reshape"
+  input: "Slice_16"
+  input: "Reshape_78/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "MatMul_2"
+  op: "BatchMatMulV2"
+  input: "Reshape_78"
+  input: "Reshape_77"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "adj_x"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "adj_y"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "truediv_1/y"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 27.0
+      }
+    }
+  }
+}
+node {
+  name: "truediv_1"
+  op: "RealDiv"
+  input: "MatMul_2"
+  input: "truediv_1/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+}
+node {
+  name: "Slice_19/begin"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 3
+          }
+        }
+        tensor_content: "\000\000\000\000\000\000\000\000\000\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "Slice_19/size"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 3
+          }
+        }
+        tensor_content: "\377\377\377\377\377\377\377\377\004\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "Slice_19"
+  op: "Slice"
+  input: "truediv_1"
+  input: "Slice_19/begin"
+  input: "Slice_19/size"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+}
+node {
+  name: "MatMul_3"
+  op: "BatchMatMulV2"
+  input: "truediv_1"
+  input: "Slice_19"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "adj_x"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "adj_y"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "Reshape_79/shape"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\377\377\377\377 \000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "Reshape_79"
+  op: "Reshape"
+  input: "MatMul_3"
+  input: "Reshape_79/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "Cast_19"
+  op: "Cast"
+  input: "Reshape_79"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Truncate"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "Shape_21"
+  op: "Shape"
+  input: "Reshape_65"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+  experimental_type {
+    type_id: TFT_PRODUCT
+    args {
+      type_id: TFT_SHAPE_TENSOR
+      args {
+        type_id: TFT_INT32
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_61/stack"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_61/stack_1"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_61/stack_2"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_61"
+  op: "StridedSlice"
+  input: "Shape_21"
+  input: "strided_slice_61/stack"
+  input: "strided_slice_61/stack_1"
+  input: "strided_slice_61/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "strided_slice_62/stack"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_62/stack_1"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_62/stack_2"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_62"
+  op: "StridedSlice"
+  input: "DprcPairwiseIdx:5"
+  input: "strided_slice_62/stack"
+  input: "strided_slice_62/stack_1"
+  input: "strided_slice_62/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "Reshape_80/shape/2"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 32
+      }
+    }
+  }
+}
+node {
+  name: "Reshape_80/shape"
+  op: "Pack"
+  input: "strided_slice_61"
+  input: "strided_slice_62"
+  input: "Reshape_80/shape/2"
+  attr {
+    key: "N"
+    value {
+      i: 3
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "Reshape_80"
+  op: "Reshape"
+  input: "Cast_19"
+  input: "Reshape_80/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "concat_13/concat"
+  op: "Identity"
+  input: "Reshape_80"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+}
+node {
+  name: "o_descriptor_qmmm"
+  op: "Identity"
+  input: "concat_13/concat"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+}
+node {
+  name: "fitting_attr_qmmm/t_bias_atom_e"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_DOUBLE
+        tensor_shape {
+          dim {
+            size: 6
+          }
+        }
+        tensor_content: "\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "fitting_attr_qmmm/t_bias_atom_e/read"
+  op: "Identity"
+  input: "fitting_attr_qmmm/t_bias_atom_e"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@fitting_attr_qmmm/t_bias_atom_e"
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_65/stack"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_65/stack_1"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_65/stack_2"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_65"
+  op: "StridedSlice"
+  input: "DprcPairwiseIdx:5"
+  input: "strided_slice_65/stack"
+  input: "strided_slice_65/stack_1"
+  input: "strided_slice_65/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "Reshape_82/shape/0"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "Reshape_82/shape/2"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 32
+      }
+    }
+  }
+}
+node {
+  name: "Reshape_82/shape"
+  op: "Pack"
+  input: "Reshape_82/shape/0"
+  input: "strided_slice_65"
+  input: "Reshape_82/shape/2"
+  attr {
+    key: "N"
+    value {
+      i: 3
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "Reshape_82"
+  op: "Reshape"
+  input: "o_descriptor_qmmm"
+  input: "Reshape_82/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "strided_slice_66/stack"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_66/stack_1"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_66/stack_2"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_66"
+  op: "StridedSlice"
+  input: "DprcPairwiseIdx:5"
+  input: "strided_slice_66/stack"
+  input: "strided_slice_66/stack_1"
+  input: "strided_slice_66/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "zeros_2/packed/2"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 32
+      }
+    }
+  }
+}
+node {
+  name: "zeros_2/packed"
+  op: "Pack"
+  input: "strided_slice_49"
+  input: "strided_slice_66"
+  input: "zeros_2/packed/2"
+  attr {
+    key: "N"
+    value {
+      i: 3
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "zeros_2/Const"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_DOUBLE
+        tensor_shape {
+        }
+        double_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "zeros_2"
+  op: "Fill"
+  input: "zeros_2/packed"
+  input: "zeros_2/Const"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "index_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "strided_slice_67/stack"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_67/stack_1"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_67/stack_2"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_67"
+  op: "StridedSlice"
+  input: "DprcPairwiseIdx:5"
+  input: "strided_slice_67/stack"
+  input: "strided_slice_67/stack_1"
+  input: "strided_slice_67/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "Reshape_83/shape/0"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "Reshape_83/shape"
+  op: "Pack"
+  input: "Reshape_83/shape/0"
+  input: "strided_slice_67"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "Reshape_83"
+  op: "Reshape"
+  input: "GatherV2_5"
+  input: "Reshape_83/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "strided_slice_68/stack"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_68/stack_1"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_68/stack_2"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_68"
+  op: "StridedSlice"
+  input: "DprcPairwiseIdx:5"
+  input: "strided_slice_68/stack"
+  input: "strided_slice_68/stack_1"
+  input: "strided_slice_68/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "Slice_21/begin"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\000\000\000\000\000\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "Slice_21/size/0"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "Slice_21/size"
+  op: "Pack"
+  input: "Slice_21/size/0"
+  input: "strided_slice_68"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "Slice_21"
+  op: "Slice"
+  input: "Reshape_83"
+  input: "Slice_21/begin"
+  input: "Slice_21/size"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "GreaterEqual_1/y"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "GreaterEqual_1"
+  op: "GreaterEqual"
+  input: "Slice_21"
+  input: "GreaterEqual_1/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "Cast_21"
+  op: "Cast"
+  input: "GreaterEqual_1"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_BOOL
+    }
+  }
+  attr {
+    key: "Truncate"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "Reshape_84/shape"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "Reshape_84"
+  op: "Reshape"
+  input: "Slice_21"
+  input: "Reshape_84/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "clip_by_value_5/Minimum/y"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 5
+      }
+    }
+  }
+}
+node {
+  name: "clip_by_value_5/Minimum"
+  op: "Minimum"
+  input: "Reshape_84"
+  input: "clip_by_value_5/Minimum/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "clip_by_value_5/y"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "clip_by_value_5"
+  op: "Maximum"
+  input: "clip_by_value_5/Minimum"
+  input: "clip_by_value_5/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "embedding_lookup_7/axis"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@t_typeebd"
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "embedding_lookup_7"
+  op: "GatherV2"
+  input: "t_typeebd"
+  input: "clip_by_value_5"
+  input: "embedding_lookup_7/axis"
+  attr {
+    key: "Taxis"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tindices"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tparams"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@t_typeebd"
+      }
+    }
+  }
+  attr {
+    key: "batch_dims"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "embedding_lookup_7/Identity"
+  op: "Identity"
+  input: "embedding_lookup_7"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+}
+node {
+  name: "Cast_22"
+  op: "Cast"
+  input: "embedding_lookup_7/Identity"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Truncate"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "Reshape_85/shape"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\377\377\377\377 \000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "Reshape_85"
+  op: "Reshape"
+  input: "Reshape_82"
+  input: "Reshape_85/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "concat_15/axis"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "concat_15"
+  op: "ConcatV2"
+  input: "Reshape_85"
+  input: "Cast_22"
+  input: "concat_15/axis"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "strided_slice_69/stack"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_69/stack_1"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_69/stack_2"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_69"
+  op: "StridedSlice"
+  input: "DprcPairwiseIdx:5"
+  input: "strided_slice_69/stack"
+  input: "strided_slice_69/stack_1"
+  input: "strided_slice_69/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "Reshape_86/shape/0"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "Reshape_86/shape/2"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 40
+      }
+    }
+  }
+}
+node {
+  name: "Reshape_86/shape"
+  op: "Pack"
+  input: "Reshape_86/shape/0"
+  input: "strided_slice_69"
+  input: "Reshape_86/shape/2"
+  attr {
+    key: "N"
+    value {
+      i: 3
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "Reshape_86"
+  op: "Reshape"
+  input: "concat_15"
+  input: "Reshape_86/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "strided_slice_70/stack"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_70/stack_1"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_70/stack_2"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_70"
+  op: "StridedSlice"
+  input: "DprcPairwiseIdx:5"
+  input: "strided_slice_70/stack"
+  input: "strided_slice_70/stack_1"
+  input: "strided_slice_70/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "Cast_23"
+  op: "Cast"
+  input: "Reshape_86"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "Truncate"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "Slice_22/begin"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 3
+          }
+        }
+        tensor_content: "\000\000\000\000\000\000\000\000\000\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "Slice_22/size/0"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "Slice_22/size/2"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "Slice_22/size"
+  op: "Pack"
+  input: "Slice_22/size/0"
+  input: "strided_slice_70"
+  input: "Slice_22/size/2"
+  attr {
+    key: "N"
+    value {
+      i: 3
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "Slice_22"
+  op: "Slice"
+  input: "Cast_23"
+  input: "Slice_22/begin"
+  input: "Slice_22/size"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+}
+node {
+  name: "Reshape_87/shape"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\377\377\377\377(\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "Reshape_87"
+  op: "Reshape"
+  input: "Slice_22"
+  input: "Reshape_87/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "layer_0_qmmm/matrix"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 40
+          }
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\2241\000\276\224\223j>\333(%<S\373\300\276\027\351z<\261\325\272=\006W\273=\356\267\247\276\363q\344\275\352A\016\274c^\313=Q\017\'\275!\256\222>\rr\263=c\366q\275\261\261j\276C^G>\343\347\177\273\304@\'\275\3331r>s\263\211\276Fa\212\275\336\335\262\275\347\253J=\254\3762>h\375\263\2764\223\230=\254jO\276\364\211a=\271\262\210\276\272\030\313\273\016\'\016>j$Z\276\265\024j>\303(\033\275\267\206\347\275\236\237\367\274\023I5<\306\007\310=1Bx\276\022\rR>Xf9\276\005\177\237;B\301\226=X\024\212=M;H\275nq\216\275h\257\216\2769\341\235<v\211\363\275~\322\353\275\26232\275\277\300\232\276\001aU\2756\307\">\245 o>NN\016=\310\2134\275\242\347)=\333\364\211>N\372\021>\373\254\242\276\315\262F\275\321=?>&\221\333=-](\276?\357\t;\247\264(\276>\256\350\274g@\256\276s\200\220>\253nD>7\266T\275\034K\312\273\370DT\275\340PB\275&\\\362\275\310u5\275\221B:\275\357\360\377\275"
+      }
+    }
+  }
+}
+node {
+  name: "layer_0_qmmm/matrix/read"
+  op: "Identity"
+  input: "layer_0_qmmm/matrix"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@layer_0_qmmm/matrix"
+      }
+    }
+  }
+}
+node {
+  name: "layer_0_qmmm/bias"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\'\255[\277\211WI\276"
+      }
+    }
+  }
+}
+node {
+  name: "layer_0_qmmm/bias/read"
+  op: "Identity"
+  input: "layer_0_qmmm/bias"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@layer_0_qmmm/bias"
+      }
+    }
+  }
+}
+node {
+  name: "layer_0_qmmm/MatMul"
+  op: "MatMul"
+  input: "Reshape_87"
+  input: "layer_0_qmmm/matrix/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "layer_0_qmmm/BiasAdd"
+  op: "BiasAdd"
+  input: "layer_0_qmmm/MatMul"
+  input: "layer_0_qmmm/bias/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+}
+node {
+  name: "layer_0_qmmm/Tanh"
+  op: "Tanh"
+  input: "layer_0_qmmm/BiasAdd"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+}
+node {
+  name: "layer_0_qmmm/Reshape/shape"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\377\377\377\377\002\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "layer_0_qmmm/Reshape"
+  op: "Reshape"
+  input: "layer_0_qmmm/Tanh"
+  input: "layer_0_qmmm/Reshape/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "layer_1_qmmm/matrix"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 2
+          }
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\035\264\027>*\253\200\274;\030\340\274\313#\232<"
+      }
+    }
+  }
+}
+node {
+  name: "layer_1_qmmm/matrix/read"
+  op: "Identity"
+  input: "layer_1_qmmm/matrix"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@layer_1_qmmm/matrix"
+      }
+    }
+  }
+}
+node {
+  name: "layer_1_qmmm/bias"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "k\313\226\277\270\2765\277"
+      }
+    }
+  }
+}
+node {
+  name: "layer_1_qmmm/bias/read"
+  op: "Identity"
+  input: "layer_1_qmmm/bias"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@layer_1_qmmm/bias"
+      }
+    }
+  }
+}
+node {
+  name: "layer_1_qmmm/MatMul"
+  op: "MatMul"
+  input: "layer_0_qmmm/Reshape"
+  input: "layer_1_qmmm/matrix/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "layer_1_qmmm/BiasAdd"
+  op: "BiasAdd"
+  input: "layer_1_qmmm/MatMul"
+  input: "layer_1_qmmm/bias/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+}
+node {
+  name: "layer_1_qmmm/idt"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\207%\316=|,\316="
+      }
+    }
+  }
+}
+node {
+  name: "layer_1_qmmm/idt/read"
+  op: "Identity"
+  input: "layer_1_qmmm/idt"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@layer_1_qmmm/idt"
+      }
+    }
+  }
+}
+node {
+  name: "layer_1_qmmm/Tanh"
+  op: "Tanh"
+  input: "layer_1_qmmm/BiasAdd"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+}
+node {
+  name: "layer_1_qmmm/Reshape/shape"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\377\377\377\377\002\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "layer_1_qmmm/Reshape"
+  op: "Reshape"
+  input: "layer_1_qmmm/Tanh"
+  input: "layer_1_qmmm/Reshape/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "layer_1_qmmm/mul"
+  op: "Mul"
+  input: "layer_1_qmmm/Reshape"
+  input: "layer_1_qmmm/idt/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+}
+node {
+  name: "add_10"
+  op: "AddV2"
+  input: "layer_0_qmmm/Reshape"
+  input: "layer_1_qmmm/mul"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+}
+node {
+  name: "layer_2_qmmm/matrix"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 2
+          }
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\035J\300\274y\031\032\277\333Z\025?\344\326\025?"
+      }
+    }
+  }
+}
+node {
+  name: "layer_2_qmmm/matrix/read"
+  op: "Identity"
+  input: "layer_2_qmmm/matrix"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@layer_2_qmmm/matrix"
+      }
+    }
+  }
+}
+node {
+  name: "layer_2_qmmm/bias"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "J\223\376\276n\333\375>"
+      }
+    }
+  }
+}
+node {
+  name: "layer_2_qmmm/bias/read"
+  op: "Identity"
+  input: "layer_2_qmmm/bias"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@layer_2_qmmm/bias"
+      }
+    }
+  }
+}
+node {
+  name: "layer_2_qmmm/MatMul"
+  op: "MatMul"
+  input: "add_10"
+  input: "layer_2_qmmm/matrix/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "layer_2_qmmm/BiasAdd"
+  op: "BiasAdd"
+  input: "layer_2_qmmm/MatMul"
+  input: "layer_2_qmmm/bias/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+}
+node {
+  name: "layer_2_qmmm/idt"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "d*\317=X\313\313="
+      }
+    }
+  }
+}
+node {
+  name: "layer_2_qmmm/idt/read"
+  op: "Identity"
+  input: "layer_2_qmmm/idt"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@layer_2_qmmm/idt"
+      }
+    }
+  }
+}
+node {
+  name: "layer_2_qmmm/Tanh"
+  op: "Tanh"
+  input: "layer_2_qmmm/BiasAdd"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+}
+node {
+  name: "layer_2_qmmm/Reshape/shape"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\377\377\377\377\002\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "layer_2_qmmm/Reshape"
+  op: "Reshape"
+  input: "layer_2_qmmm/Tanh"
+  input: "layer_2_qmmm/Reshape/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "layer_2_qmmm/mul"
+  op: "Mul"
+  input: "layer_2_qmmm/Reshape"
+  input: "layer_2_qmmm/idt/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+}
+node {
+  name: "add_11"
+  op: "AddV2"
+  input: "add_10"
+  input: "layer_2_qmmm/mul"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+}
+node {
+  name: "final_layer_qmmm/matrix"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 2
+          }
+          dim {
+            size: 1
+          }
+        }
+        tensor_content: "\037\356q?k\024\024?"
+      }
+    }
+  }
+}
+node {
+  name: "final_layer_qmmm/matrix/read"
+  op: "Identity"
+  input: "final_layer_qmmm/matrix"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@final_layer_qmmm/matrix"
+      }
+    }
+  }
+}
+node {
+  name: "final_layer_qmmm/bias"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        float_val: 1.4039429426193237
+      }
+    }
+  }
+}
+node {
+  name: "final_layer_qmmm/bias/read"
+  op: "Identity"
+  input: "final_layer_qmmm/bias"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@final_layer_qmmm/bias"
+      }
+    }
+  }
+}
+node {
+  name: "final_layer_qmmm/MatMul"
+  op: "MatMul"
+  input: "add_11"
+  input: "final_layer_qmmm/matrix/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "final_layer_qmmm/BiasAdd"
+  op: "BiasAdd"
+  input: "final_layer_qmmm/MatMul"
+  input: "final_layer_qmmm/bias/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+}
+node {
+  name: "Cast_24"
+  op: "Cast"
+  input: "final_layer_qmmm/BiasAdd"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Truncate"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "Reshape_88/shape"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\377\377\377\377 \000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "Reshape_88"
+  op: "Reshape"
+  input: "zeros_2"
+  input: "Reshape_88/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "concat_16/axis"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "concat_16"
+  op: "ConcatV2"
+  input: "Reshape_88"
+  input: "Cast_22"
+  input: "concat_16/axis"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "strided_slice_71/stack"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_71/stack_1"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_71/stack_2"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_71"
+  op: "StridedSlice"
+  input: "DprcPairwiseIdx:5"
+  input: "strided_slice_71/stack"
+  input: "strided_slice_71/stack_1"
+  input: "strided_slice_71/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "Reshape_89/shape/0"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "Reshape_89/shape/2"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 40
+      }
+    }
+  }
+}
+node {
+  name: "Reshape_89/shape"
+  op: "Pack"
+  input: "Reshape_89/shape/0"
+  input: "strided_slice_71"
+  input: "Reshape_89/shape/2"
+  attr {
+    key: "N"
+    value {
+      i: 3
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "Reshape_89"
+  op: "Reshape"
+  input: "concat_16"
+  input: "Reshape_89/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "strided_slice_72/stack"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_72/stack_1"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_72/stack_2"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_72"
+  op: "StridedSlice"
+  input: "DprcPairwiseIdx:5"
+  input: "strided_slice_72/stack"
+  input: "strided_slice_72/stack_1"
+  input: "strided_slice_72/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "Cast_25"
+  op: "Cast"
+  input: "Reshape_89"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "Truncate"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "Slice_23/begin"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 3
+          }
+        }
+        tensor_content: "\000\000\000\000\000\000\000\000\000\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "Slice_23/size/0"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "Slice_23/size/2"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "Slice_23/size"
+  op: "Pack"
+  input: "Slice_23/size/0"
+  input: "strided_slice_72"
+  input: "Slice_23/size/2"
+  attr {
+    key: "N"
+    value {
+      i: 3
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "Slice_23"
+  op: "Slice"
+  input: "Cast_25"
+  input: "Slice_23/begin"
+  input: "Slice_23/size"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+}
+node {
+  name: "Reshape_90/shape"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\377\377\377\377(\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "Reshape_90"
+  op: "Reshape"
+  input: "Slice_23"
+  input: "Reshape_90/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "layer_0_qmmm_1/MatMul"
+  op: "MatMul"
+  input: "Reshape_90"
+  input: "layer_0_qmmm/matrix/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "layer_0_qmmm_1/BiasAdd"
+  op: "BiasAdd"
+  input: "layer_0_qmmm_1/MatMul"
+  input: "layer_0_qmmm/bias/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+}
+node {
+  name: "layer_0_qmmm_1/Tanh"
+  op: "Tanh"
+  input: "layer_0_qmmm_1/BiasAdd"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+}
+node {
+  name: "layer_0_qmmm_1/Reshape/shape"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\377\377\377\377\002\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "layer_0_qmmm_1/Reshape"
+  op: "Reshape"
+  input: "layer_0_qmmm_1/Tanh"
+  input: "layer_0_qmmm_1/Reshape/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "layer_1_qmmm_1/MatMul"
+  op: "MatMul"
+  input: "layer_0_qmmm_1/Reshape"
+  input: "layer_1_qmmm/matrix/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "layer_1_qmmm_1/BiasAdd"
+  op: "BiasAdd"
+  input: "layer_1_qmmm_1/MatMul"
+  input: "layer_1_qmmm/bias/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+}
+node {
+  name: "layer_1_qmmm_1/Tanh"
+  op: "Tanh"
+  input: "layer_1_qmmm_1/BiasAdd"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+}
+node {
+  name: "layer_1_qmmm_1/Reshape/shape"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\377\377\377\377\002\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "layer_1_qmmm_1/Reshape"
+  op: "Reshape"
+  input: "layer_1_qmmm_1/Tanh"
+  input: "layer_1_qmmm_1/Reshape/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "layer_1_qmmm_1/mul"
+  op: "Mul"
+  input: "layer_1_qmmm_1/Reshape"
+  input: "layer_1_qmmm/idt/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+}
+node {
+  name: "add_12"
+  op: "AddV2"
+  input: "layer_0_qmmm_1/Reshape"
+  input: "layer_1_qmmm_1/mul"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+}
+node {
+  name: "layer_2_qmmm_1/MatMul"
+  op: "MatMul"
+  input: "add_12"
+  input: "layer_2_qmmm/matrix/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "layer_2_qmmm_1/BiasAdd"
+  op: "BiasAdd"
+  input: "layer_2_qmmm_1/MatMul"
+  input: "layer_2_qmmm/bias/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+}
+node {
+  name: "layer_2_qmmm_1/Tanh"
+  op: "Tanh"
+  input: "layer_2_qmmm_1/BiasAdd"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+}
+node {
+  name: "layer_2_qmmm_1/Reshape/shape"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\377\377\377\377\002\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "layer_2_qmmm_1/Reshape"
+  op: "Reshape"
+  input: "layer_2_qmmm_1/Tanh"
+  input: "layer_2_qmmm_1/Reshape/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "layer_2_qmmm_1/mul"
+  op: "Mul"
+  input: "layer_2_qmmm_1/Reshape"
+  input: "layer_2_qmmm/idt/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+}
+node {
+  name: "add_13"
+  op: "AddV2"
+  input: "add_12"
+  input: "layer_2_qmmm_1/mul"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+}
+node {
+  name: "final_layer_qmmm_1/MatMul"
+  op: "MatMul"
+  input: "add_13"
+  input: "final_layer_qmmm/matrix/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "final_layer_qmmm_1/BiasAdd"
+  op: "BiasAdd"
+  input: "final_layer_qmmm_1/MatMul"
+  input: "final_layer_qmmm/bias/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+}
+node {
+  name: "Cast_26"
+  op: "Cast"
+  input: "final_layer_qmmm_1/BiasAdd"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Truncate"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "sub_5"
+  op: "Sub"
+  input: "Cast_24"
+  input: "Cast_26"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+}
+node {
+  name: "Shape_23"
+  op: "Shape"
+  input: "Reshape_86"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+  experimental_type {
+    type_id: TFT_PRODUCT
+    args {
+      type_id: TFT_SHAPE_TENSOR
+      args {
+        type_id: TFT_INT32
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_73/stack"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_73/stack_1"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_73/stack_2"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_73"
+  op: "StridedSlice"
+  input: "Shape_23"
+  input: "strided_slice_73/stack"
+  input: "strided_slice_73/stack_1"
+  input: "strided_slice_73/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "strided_slice_74/stack"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_74/stack_1"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_74/stack_2"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_74"
+  op: "StridedSlice"
+  input: "DprcPairwiseIdx:5"
+  input: "strided_slice_74/stack"
+  input: "strided_slice_74/stack_1"
+  input: "strided_slice_74/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "Reshape_91/shape"
+  op: "Pack"
+  input: "strided_slice_73"
+  input: "strided_slice_74"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "Reshape_91"
+  op: "Reshape"
+  input: "sub_5"
+  input: "Reshape_91/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "embedding_lookup_8/axis"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@fitting_attr_qmmm/t_bias_atom_e"
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "embedding_lookup_8"
+  op: "GatherV2"
+  input: "fitting_attr_qmmm/t_bias_atom_e/read"
+  input: "clip_by_value_5"
+  input: "embedding_lookup_8/axis"
+  attr {
+    key: "Taxis"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tindices"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tparams"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@fitting_attr_qmmm/t_bias_atom_e"
+      }
+    }
+  }
+  attr {
+    key: "batch_dims"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "embedding_lookup_8/Identity"
+  op: "Identity"
+  input: "embedding_lookup_8"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+}
+node {
+  name: "Shape_24"
+  op: "Shape"
+  input: "Reshape_86"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+  experimental_type {
+    type_id: TFT_PRODUCT
+    args {
+      type_id: TFT_SHAPE_TENSOR
+      args {
+        type_id: TFT_INT32
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_75/stack"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_75/stack_1"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_75/stack_2"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_75"
+  op: "StridedSlice"
+  input: "Shape_24"
+  input: "strided_slice_75/stack"
+  input: "strided_slice_75/stack_1"
+  input: "strided_slice_75/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "strided_slice_76/stack"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_76/stack_1"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 8
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_76/stack_2"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_76"
+  op: "StridedSlice"
+  input: "DprcPairwiseIdx:5"
+  input: "strided_slice_76/stack"
+  input: "strided_slice_76/stack_1"
+  input: "strided_slice_76/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "Rank_1"
+  op: "Rank"
+  input: "strided_slice_76"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "range_1/start"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "range_1/delta"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "range_1"
+  op: "Range"
+  input: "range_1/start"
+  input: "Rank_1"
+  input: "range_1/delta"
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "Sum_1"
+  op: "Sum"
+  input: "strided_slice_76"
+  input: "range_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "Reshape_92/shape"
+  op: "Pack"
+  input: "strided_slice_75"
+  input: "Sum_1"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "Reshape_92"
+  op: "Reshape"
+  input: "embedding_lookup_8/Identity"
+  input: "Reshape_92/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "add_14"
+  op: "AddV2"
+  input: "Reshape_91"
+  input: "Reshape_92"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+}
+node {
+  name: "mul_24"
+  op: "Mul"
+  input: "add_14"
+  input: "Cast_21"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+}
+node {
+  name: "Reshape_93/shape"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "Reshape_93"
+  op: "Reshape"
+  input: "mul_24"
+  input: "Reshape_93/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "strided_slice_77/stack"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_77/stack_1"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_77/stack_2"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_77"
+  op: "StridedSlice"
+  input: "DprcPairwiseIdx:5"
+  input: "strided_slice_77/stack"
+  input: "strided_slice_77/stack_1"
+  input: "strided_slice_77/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "o_atom_energy_qmmm/shape/0"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "o_atom_energy_qmmm/shape"
+  op: "Pack"
+  input: "o_atom_energy_qmmm/shape/0"
+  input: "strided_slice_77"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "o_atom_energy_qmmm"
+  op: "Reshape"
+  input: "Reshape_93"
+  input: "o_atom_energy_qmmm/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "o_energy_qmmm/reduction_indices"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "o_energy_qmmm"
+  op: "Sum"
+  input: "o_atom_energy_qmmm"
+  input: "o_energy_qmmm/reduction_indices"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients_1/Shape"
+  op: "Shape"
+  input: "Reshape_93"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+  experimental_type {
+    type_id: TFT_PRODUCT
+    args {
+      type_id: TFT_SHAPE_TENSOR
+      args {
+        type_id: TFT_INT32
+      }
+    }
+  }
+}
+node {
+  name: "gradients_1/grad_ys_0/Const"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_DOUBLE
+        tensor_shape {
+        }
+        double_val: 1.0
+      }
+    }
+  }
+}
+node {
+  name: "gradients_1/grad_ys_0"
+  op: "Fill"
+  input: "gradients_1/Shape"
+  input: "gradients_1/grad_ys_0/Const"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "index_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "gradients_1/Reshape_93_grad/Shape"
+  op: "Shape"
+  input: "mul_24"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+  experimental_type {
+    type_id: TFT_PRODUCT
+    args {
+      type_id: TFT_SHAPE_TENSOR
+      args {
+        type_id: TFT_INT32
+      }
+    }
+  }
+}
+node {
+  name: "gradients_1/Reshape_93_grad/Reshape"
+  op: "Reshape"
+  input: "gradients_1/grad_ys_0"
+  input: "gradients_1/Reshape_93_grad/Shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "gradients_1/mul_24_grad/Shape"
+  op: "Shape"
+  input: "add_14"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+  experimental_type {
+    type_id: TFT_PRODUCT
+    args {
+      type_id: TFT_SHAPE_TENSOR
+      args {
+        type_id: TFT_INT32
+      }
+    }
+  }
+}
+node {
+  name: "gradients_1/mul_24_grad/Shape_1"
+  op: "Shape"
+  input: "Cast_21"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+  experimental_type {
+    type_id: TFT_PRODUCT
+    args {
+      type_id: TFT_SHAPE_TENSOR
+      args {
+        type_id: TFT_INT32
+      }
+    }
+  }
+}
+node {
+  name: "gradients_1/mul_24_grad/BroadcastGradientArgs"
+  op: "BroadcastGradientArgs"
+  input: "gradients_1/mul_24_grad/Shape"
+  input: "gradients_1/mul_24_grad/Shape_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "gradients_1/mul_24_grad/Mul"
+  op: "Mul"
+  input: "gradients_1/Reshape_93_grad/Reshape"
+  input: "Cast_21"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+}
+node {
+  name: "gradients_1/mul_24_grad/Sum"
+  op: "Sum"
+  input: "gradients_1/mul_24_grad/Mul"
+  input: "gradients_1/mul_24_grad/BroadcastGradientArgs"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients_1/mul_24_grad/Reshape"
+  op: "Reshape"
+  input: "gradients_1/mul_24_grad/Sum"
+  input: "gradients_1/mul_24_grad/Shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "gradients_1/add_14_grad/Shape"
+  op: "Shape"
+  input: "Reshape_91"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+  experimental_type {
+    type_id: TFT_PRODUCT
+    args {
+      type_id: TFT_SHAPE_TENSOR
+      args {
+        type_id: TFT_INT32
+      }
+    }
+  }
+}
+node {
+  name: "gradients_1/add_14_grad/Shape_1"
+  op: "Shape"
+  input: "Reshape_92"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+  experimental_type {
+    type_id: TFT_PRODUCT
+    args {
+      type_id: TFT_SHAPE_TENSOR
+      args {
+        type_id: TFT_INT32
+      }
+    }
+  }
+}
+node {
+  name: "gradients_1/add_14_grad/BroadcastGradientArgs"
+  op: "BroadcastGradientArgs"
+  input: "gradients_1/add_14_grad/Shape"
+  input: "gradients_1/add_14_grad/Shape_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "gradients_1/add_14_grad/Sum"
+  op: "Sum"
+  input: "gradients_1/mul_24_grad/Reshape"
+  input: "gradients_1/add_14_grad/BroadcastGradientArgs"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients_1/add_14_grad/Reshape"
+  op: "Reshape"
+  input: "gradients_1/add_14_grad/Sum"
+  input: "gradients_1/add_14_grad/Shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "gradients_1/Reshape_91_grad/Shape"
+  op: "Shape"
+  input: "sub_5"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+  experimental_type {
+    type_id: TFT_PRODUCT
+    args {
+      type_id: TFT_SHAPE_TENSOR
+      args {
+        type_id: TFT_INT32
+      }
+    }
+  }
+}
+node {
+  name: "gradients_1/Reshape_91_grad/Reshape"
+  op: "Reshape"
+  input: "gradients_1/add_14_grad/Reshape"
+  input: "gradients_1/Reshape_91_grad/Shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "gradients_1/sub_5_grad/Shape"
+  op: "Shape"
+  input: "Cast_24"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+  experimental_type {
+    type_id: TFT_PRODUCT
+    args {
+      type_id: TFT_SHAPE_TENSOR
+      args {
+        type_id: TFT_INT32
+      }
+    }
+  }
+}
+node {
+  name: "gradients_1/sub_5_grad/Shape_1"
+  op: "Shape"
+  input: "Cast_26"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+  experimental_type {
+    type_id: TFT_PRODUCT
+    args {
+      type_id: TFT_SHAPE_TENSOR
+      args {
+        type_id: TFT_INT32
+      }
+    }
+  }
+}
+node {
+  name: "gradients_1/sub_5_grad/BroadcastGradientArgs"
+  op: "BroadcastGradientArgs"
+  input: "gradients_1/sub_5_grad/Shape"
+  input: "gradients_1/sub_5_grad/Shape_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "gradients_1/sub_5_grad/Sum"
+  op: "Sum"
+  input: "gradients_1/Reshape_91_grad/Reshape"
+  input: "gradients_1/sub_5_grad/BroadcastGradientArgs"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients_1/sub_5_grad/Reshape"
+  op: "Reshape"
+  input: "gradients_1/sub_5_grad/Sum"
+  input: "gradients_1/sub_5_grad/Shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "gradients_1/Cast_24_grad/Cast"
+  op: "Cast"
+  input: "gradients_1/sub_5_grad/Reshape"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "Truncate"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients_1/final_layer_qmmm/MatMul_grad/MatMul"
+  op: "MatMul"
+  input: "gradients_1/Cast_24_grad/Cast"
+  input: "final_layer_qmmm/matrix/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "gradients_1/add_11_grad/Shape"
+  op: "Shape"
+  input: "add_10"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+  experimental_type {
+    type_id: TFT_PRODUCT
+    args {
+      type_id: TFT_SHAPE_TENSOR
+      args {
+        type_id: TFT_INT32
+      }
+    }
+  }
+}
+node {
+  name: "gradients_1/add_11_grad/Shape_1"
+  op: "Shape"
+  input: "layer_2_qmmm/mul"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+  experimental_type {
+    type_id: TFT_PRODUCT
+    args {
+      type_id: TFT_SHAPE_TENSOR
+      args {
+        type_id: TFT_INT32
+      }
+    }
+  }
+}
+node {
+  name: "gradients_1/add_11_grad/BroadcastGradientArgs"
+  op: "BroadcastGradientArgs"
+  input: "gradients_1/add_11_grad/Shape"
+  input: "gradients_1/add_11_grad/Shape_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "gradients_1/add_11_grad/Sum"
+  op: "Sum"
+  input: "gradients_1/final_layer_qmmm/MatMul_grad/MatMul"
+  input: "gradients_1/add_11_grad/BroadcastGradientArgs"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients_1/add_11_grad/Reshape"
+  op: "Reshape"
+  input: "gradients_1/add_11_grad/Sum"
+  input: "gradients_1/add_11_grad/Shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "gradients_1/add_11_grad/Sum_1"
+  op: "Sum"
+  input: "gradients_1/final_layer_qmmm/MatMul_grad/MatMul"
+  input: "gradients_1/add_11_grad/BroadcastGradientArgs:1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients_1/add_11_grad/Reshape_1"
+  op: "Reshape"
+  input: "gradients_1/add_11_grad/Sum_1"
+  input: "gradients_1/add_11_grad/Shape_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "gradients_1/layer_2_qmmm/mul_grad/Shape"
+  op: "Shape"
+  input: "layer_2_qmmm/Reshape"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+  experimental_type {
+    type_id: TFT_PRODUCT
+    args {
+      type_id: TFT_SHAPE_TENSOR
+      args {
+        type_id: TFT_INT32
+      }
+    }
+  }
+}
+node {
+  name: "gradients_1/layer_2_qmmm/mul_grad/Shape_1"
+  op: "Shape"
+  input: "layer_2_qmmm/idt/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+  experimental_type {
+    type_id: TFT_PRODUCT
+    args {
+      type_id: TFT_SHAPE_TENSOR
+      args {
+        type_id: TFT_INT32
+      }
+    }
+  }
+}
+node {
+  name: "gradients_1/layer_2_qmmm/mul_grad/BroadcastGradientArgs"
+  op: "BroadcastGradientArgs"
+  input: "gradients_1/layer_2_qmmm/mul_grad/Shape"
+  input: "gradients_1/layer_2_qmmm/mul_grad/Shape_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "gradients_1/layer_2_qmmm/mul_grad/Mul"
+  op: "Mul"
+  input: "gradients_1/add_11_grad/Reshape_1"
+  input: "layer_2_qmmm/idt/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+}
+node {
+  name: "gradients_1/layer_2_qmmm/mul_grad/Sum"
+  op: "Sum"
+  input: "gradients_1/layer_2_qmmm/mul_grad/Mul"
+  input: "gradients_1/layer_2_qmmm/mul_grad/BroadcastGradientArgs"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients_1/layer_2_qmmm/mul_grad/Reshape"
+  op: "Reshape"
+  input: "gradients_1/layer_2_qmmm/mul_grad/Sum"
+  input: "gradients_1/layer_2_qmmm/mul_grad/Shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "gradients_1/layer_2_qmmm/Reshape_grad/Shape"
+  op: "Shape"
+  input: "layer_2_qmmm/Tanh"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+  experimental_type {
+    type_id: TFT_PRODUCT
+    args {
+      type_id: TFT_SHAPE_TENSOR
+      args {
+        type_id: TFT_INT32
+      }
+    }
+  }
+}
+node {
+  name: "gradients_1/layer_2_qmmm/Reshape_grad/Reshape"
+  op: "Reshape"
+  input: "gradients_1/layer_2_qmmm/mul_grad/Reshape"
+  input: "gradients_1/layer_2_qmmm/Reshape_grad/Shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_has_manual_control_dependencies"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "gradients_1/layer_2_qmmm/Tanh_grad/TanhGrad"
+  op: "TanhGrad"
+  input: "layer_2_qmmm/Tanh"
+  input: "gradients_1/layer_2_qmmm/Reshape_grad/Reshape"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+}
+node {
+  name: "gradients_1/layer_2_qmmm/MatMul_grad/MatMul"
+  op: "MatMul"
+  input: "gradients_1/layer_2_qmmm/Tanh_grad/TanhGrad"
+  input: "layer_2_qmmm/matrix/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "gradients_1/AddN"
+  op: "AddN"
+  input: "gradients_1/add_11_grad/Reshape"
+  input: "gradients_1/layer_2_qmmm/MatMul_grad/MatMul"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@gradients_1/add_11_grad/Reshape"
+      }
+    }
+  }
+}
+node {
+  name: "gradients_1/add_10_grad/Shape"
+  op: "Shape"
+  input: "layer_0_qmmm/Reshape"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+  experimental_type {
+    type_id: TFT_PRODUCT
+    args {
+      type_id: TFT_SHAPE_TENSOR
+      args {
+        type_id: TFT_INT32
+      }
+    }
+  }
+}
+node {
+  name: "gradients_1/add_10_grad/Shape_1"
+  op: "Shape"
+  input: "layer_1_qmmm/mul"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+  experimental_type {
+    type_id: TFT_PRODUCT
+    args {
+      type_id: TFT_SHAPE_TENSOR
+      args {
+        type_id: TFT_INT32
+      }
+    }
+  }
+}
+node {
+  name: "gradients_1/add_10_grad/BroadcastGradientArgs"
+  op: "BroadcastGradientArgs"
+  input: "gradients_1/add_10_grad/Shape"
+  input: "gradients_1/add_10_grad/Shape_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "gradients_1/add_10_grad/Sum"
+  op: "Sum"
+  input: "gradients_1/AddN"
+  input: "gradients_1/add_10_grad/BroadcastGradientArgs"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients_1/add_10_grad/Reshape"
+  op: "Reshape"
+  input: "gradients_1/add_10_grad/Sum"
+  input: "gradients_1/add_10_grad/Shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "gradients_1/add_10_grad/Sum_1"
+  op: "Sum"
+  input: "gradients_1/AddN"
+  input: "gradients_1/add_10_grad/BroadcastGradientArgs:1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients_1/add_10_grad/Reshape_1"
+  op: "Reshape"
+  input: "gradients_1/add_10_grad/Sum_1"
+  input: "gradients_1/add_10_grad/Shape_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "gradients_1/layer_1_qmmm/mul_grad/Shape"
+  op: "Shape"
+  input: "layer_1_qmmm/Reshape"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+  experimental_type {
+    type_id: TFT_PRODUCT
+    args {
+      type_id: TFT_SHAPE_TENSOR
+      args {
+        type_id: TFT_INT32
+      }
+    }
+  }
+}
+node {
+  name: "gradients_1/layer_1_qmmm/mul_grad/Shape_1"
+  op: "Shape"
+  input: "layer_1_qmmm/idt/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+  experimental_type {
+    type_id: TFT_PRODUCT
+    args {
+      type_id: TFT_SHAPE_TENSOR
+      args {
+        type_id: TFT_INT32
+      }
+    }
+  }
+}
+node {
+  name: "gradients_1/layer_1_qmmm/mul_grad/BroadcastGradientArgs"
+  op: "BroadcastGradientArgs"
+  input: "gradients_1/layer_1_qmmm/mul_grad/Shape"
+  input: "gradients_1/layer_1_qmmm/mul_grad/Shape_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "gradients_1/layer_1_qmmm/mul_grad/Mul"
+  op: "Mul"
+  input: "gradients_1/add_10_grad/Reshape_1"
+  input: "layer_1_qmmm/idt/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+}
+node {
+  name: "gradients_1/layer_1_qmmm/mul_grad/Sum"
+  op: "Sum"
+  input: "gradients_1/layer_1_qmmm/mul_grad/Mul"
+  input: "gradients_1/layer_1_qmmm/mul_grad/BroadcastGradientArgs"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients_1/layer_1_qmmm/mul_grad/Reshape"
+  op: "Reshape"
+  input: "gradients_1/layer_1_qmmm/mul_grad/Sum"
+  input: "gradients_1/layer_1_qmmm/mul_grad/Shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "gradients_1/layer_1_qmmm/Reshape_grad/Shape"
+  op: "Shape"
+  input: "layer_1_qmmm/Tanh"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+  experimental_type {
+    type_id: TFT_PRODUCT
+    args {
+      type_id: TFT_SHAPE_TENSOR
+      args {
+        type_id: TFT_INT32
+      }
+    }
+  }
+}
+node {
+  name: "gradients_1/layer_1_qmmm/Reshape_grad/Reshape"
+  op: "Reshape"
+  input: "gradients_1/layer_1_qmmm/mul_grad/Reshape"
+  input: "gradients_1/layer_1_qmmm/Reshape_grad/Shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_has_manual_control_dependencies"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "gradients_1/layer_1_qmmm/Tanh_grad/TanhGrad"
+  op: "TanhGrad"
+  input: "layer_1_qmmm/Tanh"
+  input: "gradients_1/layer_1_qmmm/Reshape_grad/Reshape"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+}
+node {
+  name: "gradients_1/layer_1_qmmm/MatMul_grad/MatMul"
+  op: "MatMul"
+  input: "gradients_1/layer_1_qmmm/Tanh_grad/TanhGrad"
+  input: "layer_1_qmmm/matrix/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "gradients_1/AddN_1"
+  op: "AddN"
+  input: "gradients_1/add_10_grad/Reshape"
+  input: "gradients_1/layer_1_qmmm/MatMul_grad/MatMul"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@gradients_1/add_10_grad/Reshape"
+      }
+    }
+  }
+}
+node {
+  name: "gradients_1/layer_0_qmmm/Reshape_grad/Shape"
+  op: "Shape"
+  input: "layer_0_qmmm/Tanh"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+  experimental_type {
+    type_id: TFT_PRODUCT
+    args {
+      type_id: TFT_SHAPE_TENSOR
+      args {
+        type_id: TFT_INT32
+      }
+    }
+  }
+}
+node {
+  name: "gradients_1/layer_0_qmmm/Reshape_grad/Reshape"
+  op: "Reshape"
+  input: "gradients_1/AddN_1"
+  input: "gradients_1/layer_0_qmmm/Reshape_grad/Shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_has_manual_control_dependencies"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "gradients_1/layer_0_qmmm/Tanh_grad/TanhGrad"
+  op: "TanhGrad"
+  input: "layer_0_qmmm/Tanh"
+  input: "gradients_1/layer_0_qmmm/Reshape_grad/Reshape"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+}
+node {
+  name: "gradients_1/layer_0_qmmm/MatMul_grad/MatMul"
+  op: "MatMul"
+  input: "gradients_1/layer_0_qmmm/Tanh_grad/TanhGrad"
+  input: "layer_0_qmmm/matrix/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "gradients_1/Reshape_87_grad/Shape"
+  op: "Shape"
+  input: "Slice_22"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+  experimental_type {
+    type_id: TFT_PRODUCT
+    args {
+      type_id: TFT_SHAPE_TENSOR
+      args {
+        type_id: TFT_INT32
+      }
+    }
+  }
+}
+node {
+  name: "gradients_1/Reshape_87_grad/Reshape"
+  op: "Reshape"
+  input: "gradients_1/layer_0_qmmm/MatMul_grad/MatMul"
+  input: "gradients_1/Reshape_87_grad/Shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "gradients_1/Slice_22_grad/Rank"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 3
+      }
+    }
+  }
+}
+node {
+  name: "gradients_1/Slice_22_grad/Shape"
+  op: "Shape"
+  input: "Slice_22"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+  experimental_type {
+    type_id: TFT_PRODUCT
+    args {
+      type_id: TFT_SHAPE_TENSOR
+      args {
+        type_id: TFT_INT32
+      }
+    }
+  }
+}
+node {
+  name: "gradients_1/Slice_22_grad/stack/1"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "gradients_1/Slice_22_grad/stack"
+  op: "Pack"
+  input: "gradients_1/Slice_22_grad/Rank"
+  input: "gradients_1/Slice_22_grad/stack/1"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "gradients_1/Slice_22_grad/Reshape"
+  op: "Reshape"
+  input: "Slice_22/begin"
+  input: "gradients_1/Slice_22_grad/stack"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "gradients_1/Slice_22_grad/Shape_1"
+  op: "Shape"
+  input: "Cast_23"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+  experimental_type {
+    type_id: TFT_PRODUCT
+    args {
+      type_id: TFT_SHAPE_TENSOR
+      args {
+        type_id: TFT_INT32
+      }
+    }
+  }
+}
+node {
+  name: "gradients_1/Slice_22_grad/sub"
+  op: "Sub"
+  input: "gradients_1/Slice_22_grad/Shape_1"
+  input: "gradients_1/Slice_22_grad/Shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "gradients_1/Slice_22_grad/sub_1"
+  op: "Sub"
+  input: "gradients_1/Slice_22_grad/sub"
+  input: "Slice_22/begin"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "gradients_1/Slice_22_grad/Reshape_1"
+  op: "Reshape"
+  input: "gradients_1/Slice_22_grad/sub_1"
+  input: "gradients_1/Slice_22_grad/stack"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "gradients_1/Slice_22_grad/concat/axis"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "gradients_1/Slice_22_grad/concat"
+  op: "ConcatV2"
+  input: "gradients_1/Slice_22_grad/Reshape"
+  input: "gradients_1/Slice_22_grad/Reshape_1"
+  input: "gradients_1/Slice_22_grad/concat/axis"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "gradients_1/Slice_22_grad/Pad"
+  op: "Pad"
+  input: "gradients_1/Reshape_87_grad/Reshape"
+  input: "gradients_1/Slice_22_grad/concat"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tpaddings"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "gradients_1/Cast_23_grad/Cast"
+  op: "Cast"
+  input: "gradients_1/Slice_22_grad/Pad"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Truncate"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients_1/Reshape_86_grad/Shape"
+  op: "Shape"
+  input: "concat_15"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+  experimental_type {
+    type_id: TFT_PRODUCT
+    args {
+      type_id: TFT_SHAPE_TENSOR
+      args {
+        type_id: TFT_INT32
+      }
+    }
+  }
+}
+node {
+  name: "gradients_1/Reshape_86_grad/Reshape"
+  op: "Reshape"
+  input: "gradients_1/Cast_23_grad/Cast"
+  input: "gradients_1/Reshape_86_grad/Shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "gradients_1/concat_15_grad/Rank"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "gradients_1/concat_15_grad/mod"
+  op: "FloorMod"
+  input: "concat_15/axis"
+  input: "gradients_1/concat_15_grad/Rank"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "gradients_1/concat_15_grad/ShapeN"
+  op: "ShapeN"
+  input: "Reshape_85"
+  input: "Cast_22"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "gradients_1/concat_15_grad/ConcatOffset"
+  op: "ConcatOffset"
+  input: "gradients_1/concat_15_grad/mod"
+  input: "gradients_1/concat_15_grad/ShapeN"
+  input: "gradients_1/concat_15_grad/ShapeN:1"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+}
+node {
+  name: "gradients_1/concat_15_grad/Slice"
+  op: "Slice"
+  input: "gradients_1/Reshape_86_grad/Reshape"
+  input: "gradients_1/concat_15_grad/ConcatOffset"
+  input: "gradients_1/concat_15_grad/ShapeN"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+}
+node {
+  name: "gradients_1/Reshape_85_grad/Shape"
+  op: "Shape"
+  input: "Reshape_82"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+  experimental_type {
+    type_id: TFT_PRODUCT
+    args {
+      type_id: TFT_SHAPE_TENSOR
+      args {
+        type_id: TFT_INT32
+      }
+    }
+  }
+}
+node {
+  name: "gradients_1/Reshape_85_grad/Reshape"
+  op: "Reshape"
+  input: "gradients_1/concat_15_grad/Slice"
+  input: "gradients_1/Reshape_85_grad/Shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "gradients_1/Reshape_82_grad/Shape"
+  op: "Shape"
+  input: "o_descriptor_qmmm"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+  experimental_type {
+    type_id: TFT_PRODUCT
+    args {
+      type_id: TFT_SHAPE_TENSOR
+      args {
+        type_id: TFT_INT32
+      }
+    }
+  }
+}
+node {
+  name: "gradients_1/Reshape_82_grad/Reshape"
+  op: "Reshape"
+  input: "gradients_1/Reshape_85_grad/Reshape"
+  input: "gradients_1/Reshape_82_grad/Shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "gradients_1/Reshape_80_grad/Shape"
+  op: "Shape"
+  input: "Cast_19"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+  experimental_type {
+    type_id: TFT_PRODUCT
+    args {
+      type_id: TFT_SHAPE_TENSOR
+      args {
+        type_id: TFT_INT32
+      }
+    }
+  }
+}
+node {
+  name: "gradients_1/Reshape_80_grad/Reshape"
+  op: "Reshape"
+  input: "gradients_1/Reshape_82_grad/Reshape"
+  input: "gradients_1/Reshape_80_grad/Shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "gradients_1/Cast_19_grad/Cast"
+  op: "Cast"
+  input: "gradients_1/Reshape_80_grad/Reshape"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "Truncate"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients_1/Reshape_79_grad/Shape"
+  op: "Shape"
+  input: "MatMul_3"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+  experimental_type {
+    type_id: TFT_PRODUCT
+    args {
+      type_id: TFT_SHAPE_TENSOR
+      args {
+        type_id: TFT_INT32
+      }
+    }
+  }
+}
+node {
+  name: "gradients_1/Reshape_79_grad/Reshape"
+  op: "Reshape"
+  input: "gradients_1/Cast_19_grad/Cast"
+  input: "gradients_1/Reshape_79_grad/Shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "gradients_1/MatMul_3_grad/MatMul"
+  op: "BatchMatMulV2"
+  input: "Slice_19"
+  input: "gradients_1/Reshape_79_grad/Reshape"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "adj_x"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "adj_y"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "gradients_1/MatMul_3_grad/MatMul_1"
+  op: "BatchMatMulV2"
+  input: "truediv_1"
+  input: "gradients_1/Reshape_79_grad/Reshape"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "adj_x"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "adj_y"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients_1/MatMul_3_grad/Shape"
+  op: "Shape"
+  input: "truediv_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+  experimental_type {
+    type_id: TFT_PRODUCT
+    args {
+      type_id: TFT_SHAPE_TENSOR
+      args {
+        type_id: TFT_INT32
+      }
+    }
+  }
+}
+node {
+  name: "gradients_1/MatMul_3_grad/Shape_1"
+  op: "Shape"
+  input: "Slice_19"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+  experimental_type {
+    type_id: TFT_PRODUCT
+    args {
+      type_id: TFT_SHAPE_TENSOR
+      args {
+        type_id: TFT_INT32
+      }
+    }
+  }
+}
+node {
+  name: "gradients_1/MatMul_3_grad/strided_slice/stack"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "gradients_1/MatMul_3_grad/strided_slice/stack_1"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: -2
+      }
+    }
+  }
+}
+node {
+  name: "gradients_1/MatMul_3_grad/strided_slice/stack_2"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "gradients_1/MatMul_3_grad/strided_slice"
+  op: "StridedSlice"
+  input: "gradients_1/MatMul_3_grad/Shape"
+  input: "gradients_1/MatMul_3_grad/strided_slice/stack"
+  input: "gradients_1/MatMul_3_grad/strided_slice/stack_1"
+  input: "gradients_1/MatMul_3_grad/strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "begin_mask"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "gradients_1/MatMul_3_grad/strided_slice_1/stack"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "gradients_1/MatMul_3_grad/strided_slice_1/stack_1"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: -2
+      }
+    }
+  }
+}
+node {
+  name: "gradients_1/MatMul_3_grad/strided_slice_1/stack_2"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "gradients_1/MatMul_3_grad/strided_slice_1"
+  op: "StridedSlice"
+  input: "gradients_1/MatMul_3_grad/Shape_1"
+  input: "gradients_1/MatMul_3_grad/strided_slice_1/stack"
+  input: "gradients_1/MatMul_3_grad/strided_slice_1/stack_1"
+  input: "gradients_1/MatMul_3_grad/strided_slice_1/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "begin_mask"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "gradients_1/MatMul_3_grad/BroadcastGradientArgs"
+  op: "BroadcastGradientArgs"
+  input: "gradients_1/MatMul_3_grad/strided_slice"
+  input: "gradients_1/MatMul_3_grad/strided_slice_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "gradients_1/MatMul_3_grad/Sum"
+  op: "Sum"
+  input: "gradients_1/MatMul_3_grad/MatMul"
+  input: "gradients_1/MatMul_3_grad/BroadcastGradientArgs"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients_1/MatMul_3_grad/Reshape"
+  op: "Reshape"
+  input: "gradients_1/MatMul_3_grad/Sum"
+  input: "gradients_1/MatMul_3_grad/Shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "gradients_1/MatMul_3_grad/Sum_1"
+  op: "Sum"
+  input: "gradients_1/MatMul_3_grad/MatMul_1"
+  input: "gradients_1/MatMul_3_grad/BroadcastGradientArgs:1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients_1/MatMul_3_grad/Reshape_1"
+  op: "Reshape"
+  input: "gradients_1/MatMul_3_grad/Sum_1"
+  input: "gradients_1/MatMul_3_grad/Shape_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "gradients_1/Slice_19_grad/Rank"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 3
+      }
+    }
+  }
+}
+node {
+  name: "gradients_1/Slice_19_grad/Shape"
+  op: "Shape"
+  input: "Slice_19"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+  experimental_type {
+    type_id: TFT_PRODUCT
+    args {
+      type_id: TFT_SHAPE_TENSOR
+      args {
+        type_id: TFT_INT32
+      }
+    }
+  }
+}
+node {
+  name: "gradients_1/Slice_19_grad/stack/1"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "gradients_1/Slice_19_grad/stack"
+  op: "Pack"
+  input: "gradients_1/Slice_19_grad/Rank"
+  input: "gradients_1/Slice_19_grad/stack/1"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "gradients_1/Slice_19_grad/Reshape"
+  op: "Reshape"
+  input: "Slice_19/begin"
+  input: "gradients_1/Slice_19_grad/stack"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "gradients_1/Slice_19_grad/Shape_1"
+  op: "Shape"
+  input: "truediv_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+  experimental_type {
+    type_id: TFT_PRODUCT
+    args {
+      type_id: TFT_SHAPE_TENSOR
+      args {
+        type_id: TFT_INT32
+      }
+    }
+  }
+}
+node {
+  name: "gradients_1/Slice_19_grad/sub"
+  op: "Sub"
+  input: "gradients_1/Slice_19_grad/Shape_1"
+  input: "gradients_1/Slice_19_grad/Shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "gradients_1/Slice_19_grad/sub_1"
+  op: "Sub"
+  input: "gradients_1/Slice_19_grad/sub"
+  input: "Slice_19/begin"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "gradients_1/Slice_19_grad/Reshape_1"
+  op: "Reshape"
+  input: "gradients_1/Slice_19_grad/sub_1"
+  input: "gradients_1/Slice_19_grad/stack"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "gradients_1/Slice_19_grad/concat/axis"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "gradients_1/Slice_19_grad/concat"
+  op: "ConcatV2"
+  input: "gradients_1/Slice_19_grad/Reshape"
+  input: "gradients_1/Slice_19_grad/Reshape_1"
+  input: "gradients_1/Slice_19_grad/concat/axis"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "gradients_1/Slice_19_grad/Pad"
+  op: "Pad"
+  input: "gradients_1/MatMul_3_grad/Reshape_1"
+  input: "gradients_1/Slice_19_grad/concat"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tpaddings"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "gradients_1/AddN_2"
+  op: "AddN"
+  input: "gradients_1/MatMul_3_grad/Reshape"
+  input: "gradients_1/Slice_19_grad/Pad"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@gradients_1/MatMul_3_grad/Reshape"
+      }
+    }
+  }
+}
+node {
+  name: "gradients_1/truediv_1_grad/Shape"
+  op: "Shape"
+  input: "MatMul_2"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+  experimental_type {
+    type_id: TFT_PRODUCT
+    args {
+      type_id: TFT_SHAPE_TENSOR
+      args {
+        type_id: TFT_INT32
+      }
+    }
+  }
+}
+node {
+  name: "gradients_1/truediv_1_grad/Shape_1"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients_1/truediv_1_grad/BroadcastGradientArgs"
+  op: "BroadcastGradientArgs"
+  input: "gradients_1/truediv_1_grad/Shape"
+  input: "gradients_1/truediv_1_grad/Shape_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "gradients_1/truediv_1_grad/RealDiv"
+  op: "RealDiv"
+  input: "gradients_1/AddN_2"
+  input: "truediv_1/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+}
+node {
+  name: "gradients_1/truediv_1_grad/Sum"
+  op: "Sum"
+  input: "gradients_1/truediv_1_grad/RealDiv"
+  input: "gradients_1/truediv_1_grad/BroadcastGradientArgs"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients_1/truediv_1_grad/Reshape"
+  op: "Reshape"
+  input: "gradients_1/truediv_1_grad/Sum"
+  input: "gradients_1/truediv_1_grad/Shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "gradients_1/MatMul_2_grad/MatMul"
+  op: "BatchMatMulV2"
+  input: "Reshape_77"
+  input: "gradients_1/truediv_1_grad/Reshape"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "adj_x"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "adj_y"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "gradients_1/MatMul_2_grad/MatMul_1"
+  op: "BatchMatMulV2"
+  input: "Reshape_78"
+  input: "gradients_1/truediv_1_grad/Reshape"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "adj_x"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "adj_y"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients_1/MatMul_2_grad/Shape"
+  op: "Shape"
+  input: "Reshape_78"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+  experimental_type {
+    type_id: TFT_PRODUCT
+    args {
+      type_id: TFT_SHAPE_TENSOR
+      args {
+        type_id: TFT_INT32
+      }
+    }
+  }
+}
+node {
+  name: "gradients_1/MatMul_2_grad/Shape_1"
+  op: "Shape"
+  input: "Reshape_77"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+  experimental_type {
+    type_id: TFT_PRODUCT
+    args {
+      type_id: TFT_SHAPE_TENSOR
+      args {
+        type_id: TFT_INT32
+      }
+    }
+  }
+}
+node {
+  name: "gradients_1/MatMul_2_grad/strided_slice/stack"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "gradients_1/MatMul_2_grad/strided_slice/stack_1"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: -2
+      }
+    }
+  }
+}
+node {
+  name: "gradients_1/MatMul_2_grad/strided_slice/stack_2"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "gradients_1/MatMul_2_grad/strided_slice"
+  op: "StridedSlice"
+  input: "gradients_1/MatMul_2_grad/Shape"
+  input: "gradients_1/MatMul_2_grad/strided_slice/stack"
+  input: "gradients_1/MatMul_2_grad/strided_slice/stack_1"
+  input: "gradients_1/MatMul_2_grad/strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "begin_mask"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "gradients_1/MatMul_2_grad/strided_slice_1/stack"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "gradients_1/MatMul_2_grad/strided_slice_1/stack_1"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: -2
+      }
+    }
+  }
+}
+node {
+  name: "gradients_1/MatMul_2_grad/strided_slice_1/stack_2"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "gradients_1/MatMul_2_grad/strided_slice_1"
+  op: "StridedSlice"
+  input: "gradients_1/MatMul_2_grad/Shape_1"
+  input: "gradients_1/MatMul_2_grad/strided_slice_1/stack"
+  input: "gradients_1/MatMul_2_grad/strided_slice_1/stack_1"
+  input: "gradients_1/MatMul_2_grad/strided_slice_1/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "begin_mask"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "gradients_1/MatMul_2_grad/BroadcastGradientArgs"
+  op: "BroadcastGradientArgs"
+  input: "gradients_1/MatMul_2_grad/strided_slice"
+  input: "gradients_1/MatMul_2_grad/strided_slice_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "gradients_1/MatMul_2_grad/Sum"
+  op: "Sum"
+  input: "gradients_1/MatMul_2_grad/MatMul"
+  input: "gradients_1/MatMul_2_grad/BroadcastGradientArgs"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients_1/MatMul_2_grad/Reshape"
+  op: "Reshape"
+  input: "gradients_1/MatMul_2_grad/Sum"
+  input: "gradients_1/MatMul_2_grad/Shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "gradients_1/MatMul_2_grad/Sum_1"
+  op: "Sum"
+  input: "gradients_1/MatMul_2_grad/MatMul_1"
+  input: "gradients_1/MatMul_2_grad/BroadcastGradientArgs:1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients_1/MatMul_2_grad/Reshape_1"
+  op: "Reshape"
+  input: "gradients_1/MatMul_2_grad/Sum_1"
+  input: "gradients_1/MatMul_2_grad/Shape_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "gradients_1/Reshape_78_grad/Shape"
+  op: "Shape"
+  input: "Slice_16"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+  experimental_type {
+    type_id: TFT_PRODUCT
+    args {
+      type_id: TFT_SHAPE_TENSOR
+      args {
+        type_id: TFT_INT32
+      }
+    }
+  }
+}
+node {
+  name: "gradients_1/Reshape_78_grad/Reshape"
+  op: "Reshape"
+  input: "gradients_1/MatMul_2_grad/Reshape"
+  input: "gradients_1/Reshape_78_grad/Shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "gradients_1/Reshape_77_grad/Shape"
+  op: "Shape"
+  input: "filter_type_all_qmmm/add_6"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+  experimental_type {
+    type_id: TFT_PRODUCT
+    args {
+      type_id: TFT_SHAPE_TENSOR
+      args {
+        type_id: TFT_INT32
+      }
+    }
+  }
+}
+node {
+  name: "gradients_1/Reshape_77_grad/Reshape"
+  op: "Reshape"
+  input: "gradients_1/MatMul_2_grad/Reshape_1"
+  input: "gradients_1/Reshape_77_grad/Shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "gradients_1/filter_type_all_qmmm/add_6_grad/Shape"
+  op: "Shape"
+  input: "filter_type_all_qmmm/mul_2"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+  experimental_type {
+    type_id: TFT_PRODUCT
+    args {
+      type_id: TFT_SHAPE_TENSOR
+      args {
+        type_id: TFT_INT32
+      }
+    }
+  }
+}
+node {
+  name: "gradients_1/filter_type_all_qmmm/add_6_grad/Shape_1"
+  op: "Shape"
+  input: "filter_type_all_qmmm/add_2"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+  experimental_type {
+    type_id: TFT_PRODUCT
+    args {
+      type_id: TFT_SHAPE_TENSOR
+      args {
+        type_id: TFT_INT32
+      }
+    }
+  }
+}
+node {
+  name: "gradients_1/filter_type_all_qmmm/add_6_grad/BroadcastGradientArgs"
+  op: "BroadcastGradientArgs"
+  input: "gradients_1/filter_type_all_qmmm/add_6_grad/Shape"
+  input: "gradients_1/filter_type_all_qmmm/add_6_grad/Shape_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "gradients_1/filter_type_all_qmmm/add_6_grad/Sum"
+  op: "Sum"
+  input: "gradients_1/Reshape_77_grad/Reshape"
+  input: "gradients_1/filter_type_all_qmmm/add_6_grad/BroadcastGradientArgs"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients_1/filter_type_all_qmmm/add_6_grad/Reshape"
+  op: "Reshape"
+  input: "gradients_1/filter_type_all_qmmm/add_6_grad/Sum"
+  input: "gradients_1/filter_type_all_qmmm/add_6_grad/Shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "gradients_1/filter_type_all_qmmm/add_6_grad/Sum_1"
+  op: "Sum"
+  input: "gradients_1/Reshape_77_grad/Reshape"
+  input: "gradients_1/filter_type_all_qmmm/add_6_grad/BroadcastGradientArgs:1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients_1/filter_type_all_qmmm/add_6_grad/Reshape_1"
+  op: "Reshape"
+  input: "gradients_1/filter_type_all_qmmm/add_6_grad/Sum_1"
+  input: "gradients_1/filter_type_all_qmmm/add_6_grad/Shape_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "gradients_1/filter_type_all_qmmm/mul_2_grad/Shape"
+  op: "Shape"
+  input: "filter_type_all_qmmm/add_2"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+  experimental_type {
+    type_id: TFT_PRODUCT
+    args {
+      type_id: TFT_SHAPE_TENSOR
+      args {
+        type_id: TFT_INT32
+      }
+    }
+  }
+}
+node {
+  name: "gradients_1/filter_type_all_qmmm/mul_2_grad/Shape_1"
+  op: "Shape"
+  input: "filter_type_all_qmmm/mul_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+  experimental_type {
+    type_id: TFT_PRODUCT
+    args {
+      type_id: TFT_SHAPE_TENSOR
+      args {
+        type_id: TFT_INT32
+      }
+    }
+  }
+}
+node {
+  name: "gradients_1/filter_type_all_qmmm/mul_2_grad/BroadcastGradientArgs"
+  op: "BroadcastGradientArgs"
+  input: "gradients_1/filter_type_all_qmmm/mul_2_grad/Shape"
+  input: "gradients_1/filter_type_all_qmmm/mul_2_grad/Shape_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "gradients_1/filter_type_all_qmmm/mul_2_grad/Mul"
+  op: "Mul"
+  input: "gradients_1/filter_type_all_qmmm/add_6_grad/Reshape"
+  input: "filter_type_all_qmmm/mul_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+}
+node {
+  name: "gradients_1/filter_type_all_qmmm/mul_2_grad/Sum"
+  op: "Sum"
+  input: "gradients_1/filter_type_all_qmmm/mul_2_grad/Mul"
+  input: "gradients_1/filter_type_all_qmmm/mul_2_grad/BroadcastGradientArgs"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients_1/filter_type_all_qmmm/mul_2_grad/Reshape"
+  op: "Reshape"
+  input: "gradients_1/filter_type_all_qmmm/mul_2_grad/Sum"
+  input: "gradients_1/filter_type_all_qmmm/mul_2_grad/Shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "gradients_1/AddN_3"
+  op: "AddN"
+  input: "gradients_1/filter_type_all_qmmm/add_6_grad/Reshape_1"
+  input: "gradients_1/filter_type_all_qmmm/mul_2_grad/Reshape"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@gradients_1/filter_type_all_qmmm/add_6_grad/Reshape_1"
+      }
+    }
+  }
+}
+node {
+  name: "gradients_1/filter_type_all_qmmm/add_2_grad/Shape"
+  op: "Shape"
+  input: "filter_type_all_qmmm/concat_2"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+  experimental_type {
+    type_id: TFT_PRODUCT
+    args {
+      type_id: TFT_SHAPE_TENSOR
+      args {
+        type_id: TFT_INT32
+      }
+    }
+  }
+}
+node {
+  name: "gradients_1/filter_type_all_qmmm/add_2_grad/Shape_1"
+  op: "Shape"
+  input: "filter_type_all_qmmm/Reshape_2"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+  experimental_type {
+    type_id: TFT_PRODUCT
+    args {
+      type_id: TFT_SHAPE_TENSOR
+      args {
+        type_id: TFT_INT32
+      }
+    }
+  }
+}
+node {
+  name: "gradients_1/filter_type_all_qmmm/add_2_grad/BroadcastGradientArgs"
+  op: "BroadcastGradientArgs"
+  input: "gradients_1/filter_type_all_qmmm/add_2_grad/Shape"
+  input: "gradients_1/filter_type_all_qmmm/add_2_grad/Shape_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "gradients_1/filter_type_all_qmmm/add_2_grad/Sum"
+  op: "Sum"
+  input: "gradients_1/AddN_3"
+  input: "gradients_1/filter_type_all_qmmm/add_2_grad/BroadcastGradientArgs"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients_1/filter_type_all_qmmm/add_2_grad/Reshape"
+  op: "Reshape"
+  input: "gradients_1/filter_type_all_qmmm/add_2_grad/Sum"
+  input: "gradients_1/filter_type_all_qmmm/add_2_grad/Shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "gradients_1/filter_type_all_qmmm/add_2_grad/Sum_1"
+  op: "Sum"
+  input: "gradients_1/AddN_3"
+  input: "gradients_1/filter_type_all_qmmm/add_2_grad/BroadcastGradientArgs:1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients_1/filter_type_all_qmmm/add_2_grad/Reshape_1"
+  op: "Reshape"
+  input: "gradients_1/filter_type_all_qmmm/add_2_grad/Sum_1"
+  input: "gradients_1/filter_type_all_qmmm/add_2_grad/Shape_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "gradients_1/filter_type_all_qmmm/concat_2_grad/Rank"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "gradients_1/filter_type_all_qmmm/concat_2_grad/mod"
+  op: "FloorMod"
+  input: "filter_type_all_qmmm/concat_2/axis"
+  input: "gradients_1/filter_type_all_qmmm/concat_2_grad/Rank"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "gradients_1/filter_type_all_qmmm/concat_2_grad/ShapeN"
+  op: "ShapeN"
+  input: "filter_type_all_qmmm/add_1"
+  input: "filter_type_all_qmmm/add_1"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "gradients_1/filter_type_all_qmmm/concat_2_grad/ConcatOffset"
+  op: "ConcatOffset"
+  input: "gradients_1/filter_type_all_qmmm/concat_2_grad/mod"
+  input: "gradients_1/filter_type_all_qmmm/concat_2_grad/ShapeN"
+  input: "gradients_1/filter_type_all_qmmm/concat_2_grad/ShapeN:1"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+}
+node {
+  name: "gradients_1/filter_type_all_qmmm/concat_2_grad/Slice"
+  op: "Slice"
+  input: "gradients_1/filter_type_all_qmmm/add_2_grad/Reshape"
+  input: "gradients_1/filter_type_all_qmmm/concat_2_grad/ConcatOffset"
+  input: "gradients_1/filter_type_all_qmmm/concat_2_grad/ShapeN"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+}
+node {
+  name: "gradients_1/filter_type_all_qmmm/concat_2_grad/Slice_1"
+  op: "Slice"
+  input: "gradients_1/filter_type_all_qmmm/add_2_grad/Reshape"
+  input: "gradients_1/filter_type_all_qmmm/concat_2_grad/ConcatOffset:1"
+  input: "gradients_1/filter_type_all_qmmm/concat_2_grad/ShapeN:1"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+}
+node {
+  name: "gradients_1/filter_type_all_qmmm/Reshape_2_grad/Shape"
+  op: "Shape"
+  input: "filter_type_all_qmmm/Tanh_2"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+  experimental_type {
+    type_id: TFT_PRODUCT
+    args {
+      type_id: TFT_SHAPE_TENSOR
+      args {
+        type_id: TFT_INT32
+      }
+    }
+  }
+}
+node {
+  name: "gradients_1/filter_type_all_qmmm/Reshape_2_grad/Reshape"
+  op: "Reshape"
+  input: "gradients_1/filter_type_all_qmmm/add_2_grad/Reshape_1"
+  input: "gradients_1/filter_type_all_qmmm/Reshape_2_grad/Shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_has_manual_control_dependencies"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "gradients_1/filter_type_all_qmmm/Tanh_2_grad/TanhGrad"
+  op: "TanhGrad"
+  input: "filter_type_all_qmmm/Tanh_2"
+  input: "gradients_1/filter_type_all_qmmm/Reshape_2_grad/Reshape"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+}
+node {
+  name: "gradients_1/filter_type_all_qmmm/MatMul_2_grad/MatMul"
+  op: "MatMul"
+  input: "gradients_1/filter_type_all_qmmm/Tanh_2_grad/TanhGrad"
+  input: "filter_type_all_qmmm/matrix_3/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "gradients_1/AddN_4"
+  op: "AddN"
+  input: "gradients_1/filter_type_all_qmmm/concat_2_grad/Slice"
+  input: "gradients_1/filter_type_all_qmmm/concat_2_grad/Slice_1"
+  input: "gradients_1/filter_type_all_qmmm/MatMul_2_grad/MatMul"
+  attr {
+    key: "N"
+    value {
+      i: 3
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@gradients_1/filter_type_all_qmmm/concat_2_grad/Slice"
+      }
+    }
+  }
+}
+node {
+  name: "gradients_1/filter_type_all_qmmm/add_1_grad/Shape"
+  op: "Shape"
+  input: "filter_type_all_qmmm/concat_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+  experimental_type {
+    type_id: TFT_PRODUCT
+    args {
+      type_id: TFT_SHAPE_TENSOR
+      args {
+        type_id: TFT_INT32
+      }
+    }
+  }
+}
+node {
+  name: "gradients_1/filter_type_all_qmmm/add_1_grad/Shape_1"
+  op: "Shape"
+  input: "filter_type_all_qmmm/Reshape_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+  experimental_type {
+    type_id: TFT_PRODUCT
+    args {
+      type_id: TFT_SHAPE_TENSOR
+      args {
+        type_id: TFT_INT32
+      }
+    }
+  }
+}
+node {
+  name: "gradients_1/filter_type_all_qmmm/add_1_grad/BroadcastGradientArgs"
+  op: "BroadcastGradientArgs"
+  input: "gradients_1/filter_type_all_qmmm/add_1_grad/Shape"
+  input: "gradients_1/filter_type_all_qmmm/add_1_grad/Shape_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "gradients_1/filter_type_all_qmmm/add_1_grad/Sum"
+  op: "Sum"
+  input: "gradients_1/AddN_4"
+  input: "gradients_1/filter_type_all_qmmm/add_1_grad/BroadcastGradientArgs"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients_1/filter_type_all_qmmm/add_1_grad/Reshape"
+  op: "Reshape"
+  input: "gradients_1/filter_type_all_qmmm/add_1_grad/Sum"
+  input: "gradients_1/filter_type_all_qmmm/add_1_grad/Shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "gradients_1/filter_type_all_qmmm/add_1_grad/Sum_1"
+  op: "Sum"
+  input: "gradients_1/AddN_4"
+  input: "gradients_1/filter_type_all_qmmm/add_1_grad/BroadcastGradientArgs:1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients_1/filter_type_all_qmmm/add_1_grad/Reshape_1"
+  op: "Reshape"
+  input: "gradients_1/filter_type_all_qmmm/add_1_grad/Sum_1"
+  input: "gradients_1/filter_type_all_qmmm/add_1_grad/Shape_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "gradients_1/filter_type_all_qmmm/concat_1_grad/Rank"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "gradients_1/filter_type_all_qmmm/concat_1_grad/mod"
+  op: "FloorMod"
+  input: "filter_type_all_qmmm/concat_1/axis"
+  input: "gradients_1/filter_type_all_qmmm/concat_1_grad/Rank"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "gradients_1/filter_type_all_qmmm/concat_1_grad/ShapeN"
+  op: "ShapeN"
+  input: "filter_type_all_qmmm/add"
+  input: "filter_type_all_qmmm/add"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "gradients_1/filter_type_all_qmmm/concat_1_grad/ConcatOffset"
+  op: "ConcatOffset"
+  input: "gradients_1/filter_type_all_qmmm/concat_1_grad/mod"
+  input: "gradients_1/filter_type_all_qmmm/concat_1_grad/ShapeN"
+  input: "gradients_1/filter_type_all_qmmm/concat_1_grad/ShapeN:1"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+}
+node {
+  name: "gradients_1/filter_type_all_qmmm/concat_1_grad/Slice"
+  op: "Slice"
+  input: "gradients_1/filter_type_all_qmmm/add_1_grad/Reshape"
+  input: "gradients_1/filter_type_all_qmmm/concat_1_grad/ConcatOffset"
+  input: "gradients_1/filter_type_all_qmmm/concat_1_grad/ShapeN"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+}
+node {
+  name: "gradients_1/filter_type_all_qmmm/concat_1_grad/Slice_1"
+  op: "Slice"
+  input: "gradients_1/filter_type_all_qmmm/add_1_grad/Reshape"
+  input: "gradients_1/filter_type_all_qmmm/concat_1_grad/ConcatOffset:1"
+  input: "gradients_1/filter_type_all_qmmm/concat_1_grad/ShapeN:1"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+}
+node {
+  name: "gradients_1/filter_type_all_qmmm/Reshape_1_grad/Shape"
+  op: "Shape"
+  input: "filter_type_all_qmmm/Tanh_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+  experimental_type {
+    type_id: TFT_PRODUCT
+    args {
+      type_id: TFT_SHAPE_TENSOR
+      args {
+        type_id: TFT_INT32
+      }
+    }
+  }
+}
+node {
+  name: "gradients_1/filter_type_all_qmmm/Reshape_1_grad/Reshape"
+  op: "Reshape"
+  input: "gradients_1/filter_type_all_qmmm/add_1_grad/Reshape_1"
+  input: "gradients_1/filter_type_all_qmmm/Reshape_1_grad/Shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_has_manual_control_dependencies"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "gradients_1/filter_type_all_qmmm/Tanh_1_grad/TanhGrad"
+  op: "TanhGrad"
+  input: "filter_type_all_qmmm/Tanh_1"
+  input: "gradients_1/filter_type_all_qmmm/Reshape_1_grad/Reshape"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+}
+node {
+  name: "gradients_1/filter_type_all_qmmm/MatMul_1_grad/MatMul"
+  op: "MatMul"
+  input: "gradients_1/filter_type_all_qmmm/Tanh_1_grad/TanhGrad"
+  input: "filter_type_all_qmmm/matrix_2/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "gradients_1/AddN_5"
+  op: "AddN"
+  input: "gradients_1/filter_type_all_qmmm/concat_1_grad/Slice"
+  input: "gradients_1/filter_type_all_qmmm/concat_1_grad/Slice_1"
+  input: "gradients_1/filter_type_all_qmmm/MatMul_1_grad/MatMul"
+  attr {
+    key: "N"
+    value {
+      i: 3
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@gradients_1/filter_type_all_qmmm/concat_1_grad/Slice"
+      }
+    }
+  }
+}
+node {
+  name: "gradients_1/filter_type_all_qmmm/add_grad/Shape"
+  op: "Shape"
+  input: "filter_type_all_qmmm/concat"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+  experimental_type {
+    type_id: TFT_PRODUCT
+    args {
+      type_id: TFT_SHAPE_TENSOR
+      args {
+        type_id: TFT_INT32
+      }
+    }
+  }
+}
+node {
+  name: "gradients_1/filter_type_all_qmmm/add_grad/Shape_1"
+  op: "Shape"
+  input: "filter_type_all_qmmm/Reshape"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+  experimental_type {
+    type_id: TFT_PRODUCT
+    args {
+      type_id: TFT_SHAPE_TENSOR
+      args {
+        type_id: TFT_INT32
+      }
+    }
+  }
+}
+node {
+  name: "gradients_1/filter_type_all_qmmm/add_grad/BroadcastGradientArgs"
+  op: "BroadcastGradientArgs"
+  input: "gradients_1/filter_type_all_qmmm/add_grad/Shape"
+  input: "gradients_1/filter_type_all_qmmm/add_grad/Shape_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "gradients_1/filter_type_all_qmmm/add_grad/Sum"
+  op: "Sum"
+  input: "gradients_1/AddN_5"
+  input: "gradients_1/filter_type_all_qmmm/add_grad/BroadcastGradientArgs"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients_1/filter_type_all_qmmm/add_grad/Reshape"
+  op: "Reshape"
+  input: "gradients_1/filter_type_all_qmmm/add_grad/Sum"
+  input: "gradients_1/filter_type_all_qmmm/add_grad/Shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "gradients_1/filter_type_all_qmmm/add_grad/Sum_1"
+  op: "Sum"
+  input: "gradients_1/AddN_5"
+  input: "gradients_1/filter_type_all_qmmm/add_grad/BroadcastGradientArgs:1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients_1/filter_type_all_qmmm/add_grad/Reshape_1"
+  op: "Reshape"
+  input: "gradients_1/filter_type_all_qmmm/add_grad/Sum_1"
+  input: "gradients_1/filter_type_all_qmmm/add_grad/Shape_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "gradients_1/filter_type_all_qmmm/concat_grad/Rank"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "gradients_1/filter_type_all_qmmm/concat_grad/mod"
+  op: "FloorMod"
+  input: "filter_type_all_qmmm/concat/axis"
+  input: "gradients_1/filter_type_all_qmmm/concat_grad/Rank"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "gradients_1/filter_type_all_qmmm/concat_grad/ShapeN"
+  op: "ShapeN"
+  input: "Reshape_75"
+  input: "Reshape_75"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "gradients_1/filter_type_all_qmmm/concat_grad/ConcatOffset"
+  op: "ConcatOffset"
+  input: "gradients_1/filter_type_all_qmmm/concat_grad/mod"
+  input: "gradients_1/filter_type_all_qmmm/concat_grad/ShapeN"
+  input: "gradients_1/filter_type_all_qmmm/concat_grad/ShapeN:1"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+}
+node {
+  name: "gradients_1/filter_type_all_qmmm/concat_grad/Slice"
+  op: "Slice"
+  input: "gradients_1/filter_type_all_qmmm/add_grad/Reshape"
+  input: "gradients_1/filter_type_all_qmmm/concat_grad/ConcatOffset"
+  input: "gradients_1/filter_type_all_qmmm/concat_grad/ShapeN"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+}
+node {
+  name: "gradients_1/filter_type_all_qmmm/concat_grad/Slice_1"
+  op: "Slice"
+  input: "gradients_1/filter_type_all_qmmm/add_grad/Reshape"
+  input: "gradients_1/filter_type_all_qmmm/concat_grad/ConcatOffset:1"
+  input: "gradients_1/filter_type_all_qmmm/concat_grad/ShapeN:1"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+}
+node {
+  name: "gradients_1/filter_type_all_qmmm/Reshape_grad/Shape"
+  op: "Shape"
+  input: "filter_type_all_qmmm/Tanh"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+  experimental_type {
+    type_id: TFT_PRODUCT
+    args {
+      type_id: TFT_SHAPE_TENSOR
+      args {
+        type_id: TFT_INT32
+      }
+    }
+  }
+}
+node {
+  name: "gradients_1/filter_type_all_qmmm/Reshape_grad/Reshape"
+  op: "Reshape"
+  input: "gradients_1/filter_type_all_qmmm/add_grad/Reshape_1"
+  input: "gradients_1/filter_type_all_qmmm/Reshape_grad/Shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_has_manual_control_dependencies"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "gradients_1/filter_type_all_qmmm/Tanh_grad/TanhGrad"
+  op: "TanhGrad"
+  input: "filter_type_all_qmmm/Tanh"
+  input: "gradients_1/filter_type_all_qmmm/Reshape_grad/Reshape"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+}
+node {
+  name: "gradients_1/filter_type_all_qmmm/MatMul_grad/MatMul"
+  op: "MatMul"
+  input: "gradients_1/filter_type_all_qmmm/Tanh_grad/TanhGrad"
+  input: "filter_type_all_qmmm/matrix_1/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "gradients_1/AddN_6"
+  op: "AddN"
+  input: "gradients_1/filter_type_all_qmmm/concat_grad/Slice"
+  input: "gradients_1/filter_type_all_qmmm/concat_grad/Slice_1"
+  input: "gradients_1/filter_type_all_qmmm/MatMul_grad/MatMul"
+  attr {
+    key: "N"
+    value {
+      i: 3
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@gradients_1/filter_type_all_qmmm/concat_grad/Slice"
+      }
+    }
+  }
+}
+node {
+  name: "gradients_1/Reshape_75_grad/Shape"
+  op: "Shape"
+  input: "Slice_17"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+  experimental_type {
+    type_id: TFT_PRODUCT
+    args {
+      type_id: TFT_SHAPE_TENSOR
+      args {
+        type_id: TFT_INT32
+      }
+    }
+  }
+}
+node {
+  name: "gradients_1/Reshape_75_grad/Reshape"
+  op: "Reshape"
+  input: "gradients_1/AddN_6"
+  input: "gradients_1/Reshape_75_grad/Shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "gradients_1/Slice_17_grad/Rank"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "gradients_1/Slice_17_grad/Shape"
+  op: "Shape"
+  input: "Slice_17"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+  experimental_type {
+    type_id: TFT_PRODUCT
+    args {
+      type_id: TFT_SHAPE_TENSOR
+      args {
+        type_id: TFT_INT32
+      }
+    }
+  }
+}
+node {
+  name: "gradients_1/Slice_17_grad/stack/1"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "gradients_1/Slice_17_grad/stack"
+  op: "Pack"
+  input: "gradients_1/Slice_17_grad/Rank"
+  input: "gradients_1/Slice_17_grad/stack/1"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "gradients_1/Slice_17_grad/Reshape"
+  op: "Reshape"
+  input: "Slice_17/begin"
+  input: "gradients_1/Slice_17_grad/stack"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "gradients_1/Slice_17_grad/Shape_1"
+  op: "Shape"
+  input: "Reshape_74"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+  experimental_type {
+    type_id: TFT_PRODUCT
+    args {
+      type_id: TFT_SHAPE_TENSOR
+      args {
+        type_id: TFT_INT32
+      }
+    }
+  }
+}
+node {
+  name: "gradients_1/Slice_17_grad/sub"
+  op: "Sub"
+  input: "gradients_1/Slice_17_grad/Shape_1"
+  input: "gradients_1/Slice_17_grad/Shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "gradients_1/Slice_17_grad/sub_1"
+  op: "Sub"
+  input: "gradients_1/Slice_17_grad/sub"
+  input: "Slice_17/begin"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "gradients_1/Slice_17_grad/Reshape_1"
+  op: "Reshape"
+  input: "gradients_1/Slice_17_grad/sub_1"
+  input: "gradients_1/Slice_17_grad/stack"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "gradients_1/Slice_17_grad/concat/axis"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "gradients_1/Slice_17_grad/concat"
+  op: "ConcatV2"
+  input: "gradients_1/Slice_17_grad/Reshape"
+  input: "gradients_1/Slice_17_grad/Reshape_1"
+  input: "gradients_1/Slice_17_grad/concat/axis"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "gradients_1/Slice_17_grad/Pad"
+  op: "Pad"
+  input: "gradients_1/Reshape_75_grad/Reshape"
+  input: "gradients_1/Slice_17_grad/concat"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tpaddings"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "gradients_1/Reshape_74_grad/Shape"
+  op: "Shape"
+  input: "Slice_16"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+  experimental_type {
+    type_id: TFT_PRODUCT
+    args {
+      type_id: TFT_SHAPE_TENSOR
+      args {
+        type_id: TFT_INT32
+      }
+    }
+  }
+}
+node {
+  name: "gradients_1/Reshape_74_grad/Reshape"
+  op: "Reshape"
+  input: "gradients_1/Slice_17_grad/Pad"
+  input: "gradients_1/Reshape_74_grad/Shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "gradients_1/AddN_7"
+  op: "AddN"
+  input: "gradients_1/Reshape_78_grad/Reshape"
+  input: "gradients_1/Reshape_74_grad/Reshape"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@gradients_1/Reshape_78_grad/Reshape"
+      }
+    }
+  }
+}
+node {
+  name: "gradients_1/Slice_16_grad/Rank"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "gradients_1/Slice_16_grad/Shape"
+  op: "Shape"
+  input: "Slice_16"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+  experimental_type {
+    type_id: TFT_PRODUCT
+    args {
+      type_id: TFT_SHAPE_TENSOR
+      args {
+        type_id: TFT_INT32
+      }
+    }
+  }
+}
+node {
+  name: "gradients_1/Slice_16_grad/stack/1"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "gradients_1/Slice_16_grad/stack"
+  op: "Pack"
+  input: "gradients_1/Slice_16_grad/Rank"
+  input: "gradients_1/Slice_16_grad/stack/1"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "gradients_1/Slice_16_grad/Reshape"
+  op: "Reshape"
+  input: "Slice_16/begin"
+  input: "gradients_1/Slice_16_grad/stack"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "gradients_1/Slice_16_grad/Shape_1"
+  op: "Shape"
+  input: "Cast_17"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+  experimental_type {
+    type_id: TFT_PRODUCT
+    args {
+      type_id: TFT_SHAPE_TENSOR
+      args {
+        type_id: TFT_INT32
+      }
+    }
+  }
+}
+node {
+  name: "gradients_1/Slice_16_grad/sub"
+  op: "Sub"
+  input: "gradients_1/Slice_16_grad/Shape_1"
+  input: "gradients_1/Slice_16_grad/Shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "gradients_1/Slice_16_grad/sub_1"
+  op: "Sub"
+  input: "gradients_1/Slice_16_grad/sub"
+  input: "Slice_16/begin"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "gradients_1/Slice_16_grad/Reshape_1"
+  op: "Reshape"
+  input: "gradients_1/Slice_16_grad/sub_1"
+  input: "gradients_1/Slice_16_grad/stack"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "gradients_1/Slice_16_grad/concat/axis"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "gradients_1/Slice_16_grad/concat"
+  op: "ConcatV2"
+  input: "gradients_1/Slice_16_grad/Reshape"
+  input: "gradients_1/Slice_16_grad/Reshape_1"
+  input: "gradients_1/Slice_16_grad/concat/axis"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "gradients_1/Slice_16_grad/Pad"
+  op: "Pad"
+  input: "gradients_1/AddN_7"
+  input: "gradients_1/Slice_16_grad/concat"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tpaddings"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "gradients_1/Cast_17_grad/Cast"
+  op: "Cast"
+  input: "gradients_1/Slice_16_grad/Pad"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Truncate"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients_1/mul_22_grad/Shape"
+  op: "Shape"
+  input: "Reshape_66"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+  experimental_type {
+    type_id: TFT_PRODUCT
+    args {
+      type_id: TFT_SHAPE_TENSOR
+      args {
+        type_id: TFT_INT32
+      }
+    }
+  }
+}
+node {
+  name: "gradients_1/mul_22_grad/Shape_1"
+  op: "Shape"
+  input: "Reshape_72"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+  experimental_type {
+    type_id: TFT_PRODUCT
+    args {
+      type_id: TFT_SHAPE_TENSOR
+      args {
+        type_id: TFT_INT32
+      }
+    }
+  }
+}
+node {
+  name: "gradients_1/mul_22_grad/BroadcastGradientArgs"
+  op: "BroadcastGradientArgs"
+  input: "gradients_1/mul_22_grad/Shape"
+  input: "gradients_1/mul_22_grad/Shape_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "gradients_1/mul_22_grad/Mul"
+  op: "Mul"
+  input: "gradients_1/Cast_17_grad/Cast"
+  input: "Reshape_72"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+}
+node {
+  name: "gradients_1/mul_22_grad/Sum"
+  op: "Sum"
+  input: "gradients_1/mul_22_grad/Mul"
+  input: "gradients_1/mul_22_grad/BroadcastGradientArgs"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients_1/mul_22_grad/Reshape"
+  op: "Reshape"
+  input: "gradients_1/mul_22_grad/Sum"
+  input: "gradients_1/mul_22_grad/Shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "gradients_1/Reshape_66_grad/Shape"
+  op: "Shape"
+  input: "Reshape_65"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+  experimental_type {
+    type_id: TFT_PRODUCT
+    args {
+      type_id: TFT_SHAPE_TENSOR
+      args {
+        type_id: TFT_INT32
+      }
+    }
+  }
+}
+node {
+  name: "gradients_1/Reshape_66_grad/Reshape"
+  op: "Reshape"
+  input: "gradients_1/mul_22_grad/Reshape"
+  input: "gradients_1/Reshape_66_grad/Shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "gradients_1/Reshape_65_grad/Shape"
+  op: "Shape"
+  input: "o_rmat_qmmm"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+  experimental_type {
+    type_id: TFT_PRODUCT
+    args {
+      type_id: TFT_SHAPE_TENSOR
+      args {
+        type_id: TFT_INT32
+      }
+    }
+  }
+}
+node {
+  name: "gradients_1/Reshape_65_grad/Reshape"
+  op: "Reshape"
+  input: "gradients_1/Reshape_66_grad/Reshape"
+  input: "gradients_1/Reshape_65_grad/Shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "strided_slice_78/stack"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_78/stack_1"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_78/stack_2"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_78"
+  op: "StridedSlice"
+  input: "DprcPairwiseIdx:5"
+  input: "strided_slice_78/stack"
+  input: "strided_slice_78/stack_1"
+  input: "strided_slice_78/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "mul_26/y"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 108
+      }
+    }
+  }
+}
+node {
+  name: "mul_26"
+  op: "Mul"
+  input: "strided_slice_78"
+  input: "mul_26/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "Reshape_94/shape/0"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "Reshape_94/shape"
+  op: "Pack"
+  input: "Reshape_94/shape/0"
+  input: "mul_26"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "Reshape_94"
+  op: "Reshape"
+  input: "gradients_1/Reshape_65_grad/Reshape"
+  input: "Reshape_94/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "ProdForceSeA_1"
+  op: "ProdForceSeA"
+  input: "Reshape_94"
+  input: "o_rmat_deriv_qmmm"
+  input: "o_nlist_qmmm"
+  input: "DprcPairwiseIdx:5"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "n_a_sel"
+    value {
+      i: 27
+    }
+  }
+  attr {
+    key: "n_r_sel"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "ProdVirialSeA_1"
+  op: "ProdVirialSeA"
+  input: "Reshape_94"
+  input: "o_rmat_deriv_qmmm"
+  input: "o_rij_qmmm"
+  input: "o_nlist_qmmm"
+  input: "DprcPairwiseIdx:5"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "n_a_sel"
+    value {
+      i: 27
+    }
+  }
+  attr {
+    key: "n_r_sel"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "strided_slice_79/stack"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_79/stack_1"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_79/stack_2"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_79"
+  op: "StridedSlice"
+  input: "DprcPairwiseIdx:5"
+  input: "strided_slice_79/stack"
+  input: "strided_slice_79/stack_1"
+  input: "strided_slice_79/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "mul_27/x"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 3
+      }
+    }
+  }
+}
+node {
+  name: "mul_27"
+  op: "Mul"
+  input: "mul_27/x"
+  input: "strided_slice_79"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "Reshape_95/shape/0"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "Reshape_95/shape"
+  op: "Pack"
+  input: "Reshape_95/shape/0"
+  input: "mul_27"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "Reshape_95"
+  op: "Reshape"
+  input: "ProdForceSeA_1"
+  input: "Reshape_95/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "strided_slice_80/stack"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_80/stack_1"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_80/stack_2"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_80"
+  op: "StridedSlice"
+  input: "DprcPairwiseIdx:5"
+  input: "strided_slice_80/stack"
+  input: "strided_slice_80/stack_1"
+  input: "strided_slice_80/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "mul_28/x"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 3
+      }
+    }
+  }
+}
+node {
+  name: "mul_28"
+  op: "Mul"
+  input: "mul_28/x"
+  input: "strided_slice_80"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "o_force_qmmm/shape/0"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "o_force_qmmm/shape"
+  op: "Pack"
+  input: "o_force_qmmm/shape/0"
+  input: "mul_28"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "o_force_qmmm"
+  op: "Reshape"
+  input: "Reshape_95"
+  input: "o_force_qmmm/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "o_virial_qmmm/shape"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\377\377\377\377\t\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "o_virial_qmmm"
+  op: "Reshape"
+  input: "ProdVirialSeA_1"
+  input: "o_virial_qmmm/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "strided_slice_81/stack"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_81/stack_1"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_81/stack_2"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_81"
+  op: "StridedSlice"
+  input: "DprcPairwiseIdx:5"
+  input: "strided_slice_81/stack"
+  input: "strided_slice_81/stack_1"
+  input: "strided_slice_81/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "mul_29/x"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 9
+      }
+    }
+  }
+}
+node {
+  name: "mul_29"
+  op: "Mul"
+  input: "mul_29/x"
+  input: "strided_slice_81"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "o_atom_virial_qmmm/shape/0"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "o_atom_virial_qmmm/shape"
+  op: "Pack"
+  input: "o_atom_virial_qmmm/shape/0"
+  input: "mul_29"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "o_atom_virial_qmmm"
+  op: "Reshape"
+  input: "ProdVirialSeA_1:1"
+  input: "o_atom_virial_qmmm/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "SegmentSum"
+  op: "SegmentSum"
+  input: "o_energy_qmmm"
+  input: "DprcPairwiseIdx:6"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "Tindices"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "add_15"
+  op: "AddV2"
+  input: "o_energy_qm"
+  input: "SegmentSum"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+}
+node {
+  name: "o_energy"
+  op: "Identity"
+  input: "add_15"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+}
+node {
+  name: "strided_slice_82/stack"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_82/stack_1"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_82/stack_2"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_82"
+  op: "StridedSlice"
+  input: "DprcPairwiseIdx:4"
+  input: "strided_slice_82/stack"
+  input: "strided_slice_82/stack_1"
+  input: "strided_slice_82/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "Reshape_96/shape/2"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 3
+      }
+    }
+  }
+}
+node {
+  name: "Reshape_96/shape"
+  op: "Pack"
+  input: "strided_slice"
+  input: "strided_slice_82"
+  input: "Reshape_96/shape/2"
+  attr {
+    key: "N"
+    value {
+      i: 3
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "Reshape_96"
+  op: "Reshape"
+  input: "o_force_qm"
+  input: "Reshape_96/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "Shape_25"
+  op: "Shape"
+  input: "Reshape_96"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+  experimental_type {
+    type_id: TFT_PRODUCT
+    args {
+      type_id: TFT_SHAPE_TENSOR
+      args {
+        type_id: TFT_INT32
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_83/stack"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_83/stack_1"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_83/stack_2"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_83"
+  op: "StridedSlice"
+  input: "Shape_25"
+  input: "strided_slice_83/stack"
+  input: "strided_slice_83/stack_1"
+  input: "strided_slice_83/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "Shape_26"
+  op: "Shape"
+  input: "Reshape_96"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+  experimental_type {
+    type_id: TFT_PRODUCT
+    args {
+      type_id: TFT_SHAPE_TENSOR
+      args {
+        type_id: TFT_INT32
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_84/stack"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_84/stack_1"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_84/stack_2"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_84"
+  op: "StridedSlice"
+  input: "Shape_26"
+  input: "strided_slice_84/stack"
+  input: "strided_slice_84/stack_1"
+  input: "strided_slice_84/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "concat_17/values_0/1"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "concat_17/values_0"
+  op: "Pack"
+  input: "strided_slice_83"
+  input: "concat_17/values_0/1"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "concat_17/axis"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "concat_17"
+  op: "ConcatV2"
+  input: "concat_17/values_0"
+  input: "strided_slice_84"
+  input: "concat_17/axis"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "Fill_4/value"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "Fill_4"
+  op: "Fill"
+  input: "concat_17"
+  input: "Fill_4/value"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "index_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "Cast_27"
+  op: "Cast"
+  input: "Fill_4"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Truncate"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "concat_18/axis"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "concat_18"
+  op: "ConcatV2"
+  input: "Cast_27"
+  input: "Reshape_96"
+  input: "concat_18/axis"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "add_16/y"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "add_16"
+  op: "AddV2"
+  input: "DprcPairwiseIdx:1"
+  input: "add_16/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "GatherV2_7/axis"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "GatherV2_7"
+  op: "GatherV2"
+  input: "concat_18"
+  input: "add_16"
+  input: "GatherV2_7/axis"
+  attr {
+    key: "Taxis"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tindices"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tparams"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "batch_dims"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "strided_slice_85/stack"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_85/stack_1"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_85/stack_2"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_85"
+  op: "StridedSlice"
+  input: "DprcPairwiseIdx:5"
+  input: "strided_slice_85/stack"
+  input: "strided_slice_85/stack_1"
+  input: "strided_slice_85/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "Reshape_97/shape/2"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 3
+      }
+    }
+  }
+}
+node {
+  name: "Reshape_97/shape"
+  op: "Pack"
+  input: "strided_slice_4"
+  input: "strided_slice_85"
+  input: "Reshape_97/shape/2"
+  attr {
+    key: "N"
+    value {
+      i: 3
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "Reshape_97"
+  op: "Reshape"
+  input: "o_force_qmmm"
+  input: "Reshape_97/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "Shape_27"
+  op: "Shape"
+  input: "Reshape_97"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+  experimental_type {
+    type_id: TFT_PRODUCT
+    args {
+      type_id: TFT_SHAPE_TENSOR
+      args {
+        type_id: TFT_INT32
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_86/stack"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_86/stack_1"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_86/stack_2"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_86"
+  op: "StridedSlice"
+  input: "Shape_27"
+  input: "strided_slice_86/stack"
+  input: "strided_slice_86/stack_1"
+  input: "strided_slice_86/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "Shape_28"
+  op: "Shape"
+  input: "Reshape_97"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+  experimental_type {
+    type_id: TFT_PRODUCT
+    args {
+      type_id: TFT_SHAPE_TENSOR
+      args {
+        type_id: TFT_INT32
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_87/stack"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_87/stack_1"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_87/stack_2"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_87"
+  op: "StridedSlice"
+  input: "Shape_28"
+  input: "strided_slice_87/stack"
+  input: "strided_slice_87/stack_1"
+  input: "strided_slice_87/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "concat_19/values_0/1"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "concat_19/values_0"
+  op: "Pack"
+  input: "strided_slice_86"
+  input: "concat_19/values_0/1"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "concat_19/axis"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "concat_19"
+  op: "ConcatV2"
+  input: "concat_19/values_0"
+  input: "strided_slice_87"
+  input: "concat_19/axis"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "Fill_5/value"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "Fill_5"
+  op: "Fill"
+  input: "concat_19"
+  input: "Fill_5/value"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "index_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "Cast_28"
+  op: "Cast"
+  input: "Fill_5"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Truncate"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "concat_20/axis"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "concat_20"
+  op: "ConcatV2"
+  input: "Cast_28"
+  input: "Reshape_97"
+  input: "concat_20/axis"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "add_17/y"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "add_17"
+  op: "AddV2"
+  input: "DprcPairwiseIdx:3"
+  input: "add_17/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "GatherV2_8/axis"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "GatherV2_8"
+  op: "GatherV2"
+  input: "concat_20"
+  input: "add_17"
+  input: "GatherV2_8/axis"
+  attr {
+    key: "Taxis"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tindices"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tparams"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "batch_dims"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "SegmentSum_1"
+  op: "SegmentSum"
+  input: "GatherV2_8"
+  input: "DprcPairwiseIdx:6"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "Tindices"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "add_18"
+  op: "AddV2"
+  input: "GatherV2_7"
+  input: "SegmentSum_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+}
+node {
+  name: "strided_slice_88/stack"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_88/stack_1"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_88/stack_2"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_88"
+  op: "StridedSlice"
+  input: "t_natoms"
+  input: "strided_slice_88/stack"
+  input: "strided_slice_88/stack_1"
+  input: "strided_slice_88/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "mul_30/x"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 3
+      }
+    }
+  }
+}
+node {
+  name: "mul_30"
+  op: "Mul"
+  input: "mul_30/x"
+  input: "strided_slice_88"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "o_force/shape"
+  op: "Pack"
+  input: "strided_slice"
+  input: "mul_30"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "o_force"
+  op: "Reshape"
+  input: "add_18"
+  input: "o_force/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "SegmentSum_2"
+  op: "SegmentSum"
+  input: "o_virial_qmmm"
+  input: "DprcPairwiseIdx:6"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "Tindices"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "add_19"
+  op: "AddV2"
+  input: "o_virial_qm"
+  input: "SegmentSum_2"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+}
+node {
+  name: "o_virial"
+  op: "Identity"
+  input: "add_19"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+}
+node {
+  name: "strided_slice_89/stack"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_89/stack_1"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_89/stack_2"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_89"
+  op: "StridedSlice"
+  input: "t_natoms"
+  input: "strided_slice_89/stack"
+  input: "strided_slice_89/stack_1"
+  input: "strided_slice_89/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "Slice_24/begin"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\000\000\000\000\000\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "Slice_24/size/0"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "Slice_24/size"
+  op: "Pack"
+  input: "Slice_24/size/0"
+  input: "strided_slice_89"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "Slice_24"
+  op: "Slice"
+  input: "DprcPairwiseIdx:1"
+  input: "Slice_24/begin"
+  input: "Slice_24/size"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "strided_slice_90/stack"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_90/stack_1"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_90/stack_2"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_90"
+  op: "StridedSlice"
+  input: "t_natoms"
+  input: "strided_slice_90/stack"
+  input: "strided_slice_90/stack_1"
+  input: "strided_slice_90/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "Slice_25/begin"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\000\000\000\000\000\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "Slice_25/size/0"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "Slice_25/size"
+  op: "Pack"
+  input: "Slice_25/size/0"
+  input: "strided_slice_90"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "Slice_25"
+  op: "Slice"
+  input: "DprcPairwiseIdx:3"
+  input: "Slice_25/begin"
+  input: "Slice_25/size"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "Shape_29"
+  op: "Shape"
+  input: "o_atom_energy_qm"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+  experimental_type {
+    type_id: TFT_PRODUCT
+    args {
+      type_id: TFT_SHAPE_TENSOR
+      args {
+        type_id: TFT_INT32
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_91/stack"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_91/stack_1"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_91/stack_2"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_91"
+  op: "StridedSlice"
+  input: "Shape_29"
+  input: "strided_slice_91/stack"
+  input: "strided_slice_91/stack_1"
+  input: "strided_slice_91/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "Shape_30"
+  op: "Shape"
+  input: "o_atom_energy_qm"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+  experimental_type {
+    type_id: TFT_PRODUCT
+    args {
+      type_id: TFT_SHAPE_TENSOR
+      args {
+        type_id: TFT_INT32
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_92/stack"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_92/stack_1"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_92/stack_2"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_92"
+  op: "StridedSlice"
+  input: "Shape_30"
+  input: "strided_slice_92/stack"
+  input: "strided_slice_92/stack_1"
+  input: "strided_slice_92/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "concat_21/values_0/1"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "concat_21/values_0"
+  op: "Pack"
+  input: "strided_slice_91"
+  input: "concat_21/values_0/1"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "concat_21/axis"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "concat_21"
+  op: "ConcatV2"
+  input: "concat_21/values_0"
+  input: "strided_slice_92"
+  input: "concat_21/axis"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "Fill_6/value"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "Fill_6"
+  op: "Fill"
+  input: "concat_21"
+  input: "Fill_6/value"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "index_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "Cast_29"
+  op: "Cast"
+  input: "Fill_6"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Truncate"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "concat_22/axis"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "concat_22"
+  op: "ConcatV2"
+  input: "Cast_29"
+  input: "o_atom_energy_qm"
+  input: "concat_22/axis"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "add_20/y"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "add_20"
+  op: "AddV2"
+  input: "Slice_24"
+  input: "add_20/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "GatherV2_9/axis"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "GatherV2_9"
+  op: "GatherV2"
+  input: "concat_22"
+  input: "add_20"
+  input: "GatherV2_9/axis"
+  attr {
+    key: "Taxis"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tindices"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tparams"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "batch_dims"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "Shape_31"
+  op: "Shape"
+  input: "o_atom_energy_qmmm"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+  experimental_type {
+    type_id: TFT_PRODUCT
+    args {
+      type_id: TFT_SHAPE_TENSOR
+      args {
+        type_id: TFT_INT32
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_93/stack"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_93/stack_1"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_93/stack_2"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_93"
+  op: "StridedSlice"
+  input: "Shape_31"
+  input: "strided_slice_93/stack"
+  input: "strided_slice_93/stack_1"
+  input: "strided_slice_93/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "Shape_32"
+  op: "Shape"
+  input: "o_atom_energy_qmmm"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+  experimental_type {
+    type_id: TFT_PRODUCT
+    args {
+      type_id: TFT_SHAPE_TENSOR
+      args {
+        type_id: TFT_INT32
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_94/stack"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_94/stack_1"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_94/stack_2"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_94"
+  op: "StridedSlice"
+  input: "Shape_32"
+  input: "strided_slice_94/stack"
+  input: "strided_slice_94/stack_1"
+  input: "strided_slice_94/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "concat_23/values_0/1"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "concat_23/values_0"
+  op: "Pack"
+  input: "strided_slice_93"
+  input: "concat_23/values_0/1"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "concat_23/axis"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "concat_23"
+  op: "ConcatV2"
+  input: "concat_23/values_0"
+  input: "strided_slice_94"
+  input: "concat_23/axis"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "Fill_7/value"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "Fill_7"
+  op: "Fill"
+  input: "concat_23"
+  input: "Fill_7/value"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "index_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "Cast_30"
+  op: "Cast"
+  input: "Fill_7"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Truncate"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "concat_24/axis"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "concat_24"
+  op: "ConcatV2"
+  input: "Cast_30"
+  input: "o_atom_energy_qmmm"
+  input: "concat_24/axis"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "add_21/y"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "add_21"
+  op: "AddV2"
+  input: "Slice_25"
+  input: "add_21/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "GatherV2_10/axis"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "GatherV2_10"
+  op: "GatherV2"
+  input: "concat_24"
+  input: "add_21"
+  input: "GatherV2_10/axis"
+  attr {
+    key: "Taxis"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tindices"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tparams"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "batch_dims"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "SegmentSum_3"
+  op: "SegmentSum"
+  input: "GatherV2_10"
+  input: "DprcPairwiseIdx:6"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "Tindices"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "add_22"
+  op: "AddV2"
+  input: "GatherV2_9"
+  input: "SegmentSum_3"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+}
+node {
+  name: "o_atom_energy"
+  op: "Identity"
+  input: "add_22"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+}
+node {
+  name: "strided_slice_95/stack"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_95/stack_1"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_95/stack_2"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_95"
+  op: "StridedSlice"
+  input: "DprcPairwiseIdx:4"
+  input: "strided_slice_95/stack"
+  input: "strided_slice_95/stack_1"
+  input: "strided_slice_95/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "Reshape_98/shape/2"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 9
+      }
+    }
+  }
+}
+node {
+  name: "Reshape_98/shape"
+  op: "Pack"
+  input: "strided_slice"
+  input: "strided_slice_95"
+  input: "Reshape_98/shape/2"
+  attr {
+    key: "N"
+    value {
+      i: 3
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "Reshape_98"
+  op: "Reshape"
+  input: "o_atom_virial_qm"
+  input: "Reshape_98/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "Shape_33"
+  op: "Shape"
+  input: "Reshape_98"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+  experimental_type {
+    type_id: TFT_PRODUCT
+    args {
+      type_id: TFT_SHAPE_TENSOR
+      args {
+        type_id: TFT_INT32
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_96/stack"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_96/stack_1"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_96/stack_2"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_96"
+  op: "StridedSlice"
+  input: "Shape_33"
+  input: "strided_slice_96/stack"
+  input: "strided_slice_96/stack_1"
+  input: "strided_slice_96/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "Shape_34"
+  op: "Shape"
+  input: "Reshape_98"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+  experimental_type {
+    type_id: TFT_PRODUCT
+    args {
+      type_id: TFT_SHAPE_TENSOR
+      args {
+        type_id: TFT_INT32
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_97/stack"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_97/stack_1"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_97/stack_2"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_97"
+  op: "StridedSlice"
+  input: "Shape_34"
+  input: "strided_slice_97/stack"
+  input: "strided_slice_97/stack_1"
+  input: "strided_slice_97/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "concat_25/values_0/1"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "concat_25/values_0"
+  op: "Pack"
+  input: "strided_slice_96"
+  input: "concat_25/values_0/1"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "concat_25/axis"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "concat_25"
+  op: "ConcatV2"
+  input: "concat_25/values_0"
+  input: "strided_slice_97"
+  input: "concat_25/axis"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "Fill_8/value"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "Fill_8"
+  op: "Fill"
+  input: "concat_25"
+  input: "Fill_8/value"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "index_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "Cast_31"
+  op: "Cast"
+  input: "Fill_8"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Truncate"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "concat_26/axis"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "concat_26"
+  op: "ConcatV2"
+  input: "Cast_31"
+  input: "Reshape_98"
+  input: "concat_26/axis"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "add_23/y"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "add_23"
+  op: "AddV2"
+  input: "DprcPairwiseIdx:1"
+  input: "add_23/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "GatherV2_11/axis"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "GatherV2_11"
+  op: "GatherV2"
+  input: "concat_26"
+  input: "add_23"
+  input: "GatherV2_11/axis"
+  attr {
+    key: "Taxis"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tindices"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tparams"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "batch_dims"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "strided_slice_98/stack"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_98/stack_1"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_98/stack_2"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_98"
+  op: "StridedSlice"
+  input: "DprcPairwiseIdx:5"
+  input: "strided_slice_98/stack"
+  input: "strided_slice_98/stack_1"
+  input: "strided_slice_98/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "Reshape_99/shape/2"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 9
+      }
+    }
+  }
+}
+node {
+  name: "Reshape_99/shape"
+  op: "Pack"
+  input: "strided_slice_4"
+  input: "strided_slice_98"
+  input: "Reshape_99/shape/2"
+  attr {
+    key: "N"
+    value {
+      i: 3
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "Reshape_99"
+  op: "Reshape"
+  input: "o_atom_virial_qmmm"
+  input: "Reshape_99/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "Shape_35"
+  op: "Shape"
+  input: "Reshape_99"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+  experimental_type {
+    type_id: TFT_PRODUCT
+    args {
+      type_id: TFT_SHAPE_TENSOR
+      args {
+        type_id: TFT_INT32
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_99/stack"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_99/stack_1"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_99/stack_2"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_99"
+  op: "StridedSlice"
+  input: "Shape_35"
+  input: "strided_slice_99/stack"
+  input: "strided_slice_99/stack_1"
+  input: "strided_slice_99/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "Shape_36"
+  op: "Shape"
+  input: "Reshape_99"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+  experimental_type {
+    type_id: TFT_PRODUCT
+    args {
+      type_id: TFT_SHAPE_TENSOR
+      args {
+        type_id: TFT_INT32
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_100/stack"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_100/stack_1"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_100/stack_2"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_100"
+  op: "StridedSlice"
+  input: "Shape_36"
+  input: "strided_slice_100/stack"
+  input: "strided_slice_100/stack_1"
+  input: "strided_slice_100/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "concat_27/values_0/1"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "concat_27/values_0"
+  op: "Pack"
+  input: "strided_slice_99"
+  input: "concat_27/values_0/1"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "concat_27/axis"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "concat_27"
+  op: "ConcatV2"
+  input: "concat_27/values_0"
+  input: "strided_slice_100"
+  input: "concat_27/axis"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "Fill_9/value"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "Fill_9"
+  op: "Fill"
+  input: "concat_27"
+  input: "Fill_9/value"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "index_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "Cast_32"
+  op: "Cast"
+  input: "Fill_9"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Truncate"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "concat_28/axis"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "concat_28"
+  op: "ConcatV2"
+  input: "Cast_32"
+  input: "Reshape_99"
+  input: "concat_28/axis"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "add_24/y"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "add_24"
+  op: "AddV2"
+  input: "DprcPairwiseIdx:3"
+  input: "add_24/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "GatherV2_12/axis"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "GatherV2_12"
+  op: "GatherV2"
+  input: "concat_28"
+  input: "add_24"
+  input: "GatherV2_12/axis"
+  attr {
+    key: "Taxis"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tindices"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tparams"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "batch_dims"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "SegmentSum_4"
+  op: "SegmentSum"
+  input: "GatherV2_12"
+  input: "DprcPairwiseIdx:6"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "Tindices"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "add_25"
+  op: "AddV2"
+  input: "GatherV2_11"
+  input: "SegmentSum_4"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+}
+node {
+  name: "strided_slice_101/stack"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_101/stack_1"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_101/stack_2"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_101"
+  op: "StridedSlice"
+  input: "t_natoms"
+  input: "strided_slice_101/stack"
+  input: "strided_slice_101/stack_1"
+  input: "strided_slice_101/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "mul_31/x"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 9
+      }
+    }
+  }
+}
+node {
+  name: "mul_31"
+  op: "Mul"
+  input: "mul_31/x"
+  input: "strided_slice_101"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "o_atom_virial/shape"
+  op: "Pack"
+  input: "strided_slice"
+  input: "mul_31"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "o_atom_virial"
+  op: "Reshape"
+  input: "add_25"
+  input: "o_atom_virial/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+library {
+}
+versions {
+  producer: 1395
+}
diff --git a/source/tests/test_pairwise_dprc.py b/source/tests/test_pairwise_dprc.py
index 04aaa237b1..7fbb4fdf19 100644
--- a/source/tests/test_pairwise_dprc.py
+++ b/source/tests/test_pairwise_dprc.py
@@ -357,6 +357,187 @@ def test_model_ener(self):
         self.assertAllClose(e[1] + e[2] + e[3] - 3 * e[0], e[4] - e[0])
         self.assertAllClose(f[1] + f[2] + f[3] - 3 * f[0], f[4] - f[0])
 
+    def test_nloc(self):
+        jfile = tests_path / "pairwise_dprc.json"
+        jdata = j_loader(jfile)
+        model = Model(**jdata["model"])
+
+        sys = dpdata.LabeledSystem()
+        sys.data["atom_names"] = ["C", "N", "O", "H", "OW", "HW"]
+        sys.data["coords"] = np.array(
+            [
+                2.48693,
+                -0.12642,
+                0.45320,
+                3.86292,
+                -0.00082,
+                0.07286,
+                4.19135,
+                0.35148,
+                -1.21253,
+                3.35886,
+                0.58875,
+                -2.08423,
+                5.67422,
+                0.44076,
+                -1.45160,
+                2.40712,
+                -0.32538,
+                1.52137,
+                2.04219,
+                -0.93912,
+                -0.12445,
+                1.98680,
+                0.81574,
+                0.21261,
+                4.57186,
+                -0.33026,
+                0.71127,
+                6.24532,
+                0.18814,
+                -0.55212,
+                5.92647,
+                1.46447,
+                -1.74069,
+                5.95030,
+                -0.25321,
+                -2.24804,
+                -0.32794,
+                1.50468,
+                0.83176,
+                0.23662,
+                2.24068,
+                1.13166,
+                -0.24528,
+                1.59132,
+                -0.14907,
+                -0.50371,
+                -1.24800,
+                -0.05601,
+                -0.28305,
+                -1.84629,
+                0.67555,
+                -0.68673,
+                -0.40535,
+                0.41384,
+                0.38397,
+                0.80987,
+                -1.90358,
+                1.30191,
+                0.68503,
+                -2.22909,
+                0.11626,
+                -0.11276,
+                -1.70506,
+            ]
+        ).reshape(1, 21, 3)
+        sys.data["atom_types"] = np.array(
+            [0, 1, 0, 2, 0, 3, 3, 3, 3, 3, 3, 3, 4, 5, 5, 4, 5, 5, 4, 5, 5]
+        )
+        sys.data["cells"] = np.array([np.eye(3) * 30])
+        nframes = 1
+        natoms = 21
+        sys.data["coords"] = sys.data["coords"].reshape([nframes, natoms, 3])
+        sys.data["cells"] = sys.data["cells"].reshape([nframes, 3, 3])
+        sys.data["energies"] = np.ones(
+            [
+                nframes,
+            ]
+        )
+        sys.data["forces"] = np.zeros([nframes, natoms, 3])
+        sys.data["nopbc"] = True
+        sys.to_deepmd_npy("system", prec=np.float64)
+        idxs = np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3])
+        np.save("system/set.000/aparam.npy", idxs)
+
+        systems = j_must_have(jdata["training"]["training_data"], "systems")
+        batch_size = 1
+        test_size = 1
+        rcut = model.get_rcut()
+
+        data = DeepmdDataSystem(systems, batch_size, test_size, rcut)
+        data.add("energy", 1, atomic=False, must=True, high_prec=True)
+        data.add("aparam", 1, atomic=True, must=True, high_prec=True)
+        test_data = data.get_test()
+
+        t_energy = tf.placeholder(GLOBAL_ENER_FLOAT_PRECISION, [None], name="t_energy")
+        t_coord = tf.placeholder(GLOBAL_TF_FLOAT_PRECISION, [None], name="i_coord")
+        t_type = tf.placeholder(tf.int32, [None], name="i_type")
+        t_natoms = tf.placeholder(tf.int32, [model.get_ntypes() + 2], name="i_natoms")
+        t_box = tf.placeholder(GLOBAL_TF_FLOAT_PRECISION, [None, 9], name="i_box")
+        t_mesh = tf.placeholder(tf.int32, [None], name="i_mesh")
+        is_training = tf.placeholder(tf.bool)
+        t_aparam = tf.placeholder(GLOBAL_TF_FLOAT_PRECISION, [None], name="i_aparam")
+        input_dict = {}
+        input_dict["aparam"] = t_aparam
+
+        model.data_stat(data)
+        model_pred = model.build(
+            t_coord,
+            t_type,
+            t_natoms,
+            t_box,
+            t_mesh,
+            input_dict,
+            suffix="se_a_atom_ener_0",
+            reuse=False,
+        )
+        energy = model_pred["energy"]
+        force = model_pred["force"]
+        virial = model_pred["virial"]
+
+        test_types = np.array(
+            [
+                [0, 0, 0, 1, 2, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 5, 5, 5, 5, 5, 5],
+            ]
+        )
+        nloc1 = 17
+        # aparam: [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 2. 3. 1. 1. 2. 2. 3. 3.]]
+        feed_dict_test = {
+            t_energy: np.reshape(test_data["energy"], [-1]),
+            t_coord: np.reshape(test_data["coord"], [-1]),
+            t_box: np.reshape(test_data["box"], (1, 9)),
+            t_type: np.reshape(test_types, [-1]),
+            t_natoms: [nloc1, 21, nloc1, 0, 0, 0, 0, 0],
+            t_mesh: test_data["default_mesh"],
+            t_aparam: np.reshape(test_data["aparam"], [-1]),
+            is_training: False,
+        }
+        sess = self.cached_session().__enter__()
+        sess.run(tf.global_variables_initializer())
+        [e1, f1, v1] = sess.run([energy, force, virial], feed_dict=feed_dict_test)
+
+        idx_map = np.concatenate([np.arange(nloc1, 21), np.arange(nloc1)])
+        idx_map_inv = np.argsort(idx_map)
+        feed_dict_test = {
+            t_energy: np.reshape(test_data["energy"], [-1]),
+            t_coord: np.reshape(np.reshape(test_data["coord"], [-1, 3])[idx_map], [-1]),
+            t_box: np.reshape(test_data["box"], (1, 9)),
+            t_type: np.reshape(test_types, [-1])[idx_map],
+            t_natoms: [21 - nloc1, 21, 21 - nloc1, 0, 0, 0, 0, 0],
+            t_mesh: test_data["default_mesh"],
+            t_aparam: np.reshape(test_data["aparam"], [-1])[idx_map],
+            is_training: False,
+        }
+        [e2, f2, v2] = sess.run([energy, force, virial], feed_dict=feed_dict_test)
+        f2 = np.reshape(np.reshape(f2, [-1, 3])[idx_map_inv], f2.shape)
+
+        feed_dict_test = {
+            t_energy: np.reshape(test_data["energy"], [-1]),
+            t_coord: np.reshape(test_data["coord"], [-1]),
+            t_box: np.reshape(test_data["box"], (1, 9)),
+            t_type: np.reshape(test_types, [-1]),
+            t_natoms: [21, 21, 21, 0, 0, 0, 0, 0],
+            t_mesh: test_data["default_mesh"],
+            t_aparam: np.reshape(test_data["aparam"], [-1]),
+            is_training: False,
+        }
+        [e3, f3, v3] = sess.run([energy, force, virial], feed_dict=feed_dict_test)
+
+        np.testing.assert_allclose(e1 + e2, e3, 6)
+        np.testing.assert_allclose(f1 + f2, f3, 6)
+        np.testing.assert_allclose(v1 + v2, v3, 6)
+
 
 def _init_models():
     system = dpdata.LabeledSystem()

From 19c6a688ffc3ce9e5d7ffb482f925dbd59a05880 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Yifan=20Li=E6=9D=8E=E4=B8=80=E5=B8=86?=
 <yifanl0716@gmail.com>
Date: Sun, 17 Sep 2023 19:50:33 -0500
Subject: [PATCH 32/63] lmp: support other units for both pair deepmd and fix
 dplr (#2800)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This PR is intended to do 2 things:
1. support other unit styles like si and nano (except lj);
2. support the non-metal units for fix dplr and compute deeptensor/atom.

Unittests for both features mentioned above have also been added in this
pull request.

---------

Signed-off-by: Yifan Li李一帆 <yifanl0716@gmail.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Jinzhe Zeng <jinzhe.zeng@rutgers.edu>
---
 doc/third-party/lammps-command.md      |  10 ++
 source/lmp/compute_deeptensor_atom.cpp |  31 ++---
 source/lmp/compute_deeptensor_atom.h   |   1 +
 source/lmp/fix_dplr.cpp                |  59 +++++----
 source/lmp/fix_dplr.h                  |   1 +
 source/lmp/pair_deepmd.cpp             | 140 ++++++++++++----------
 source/lmp/pair_deepmd.h               |   2 +-
 source/lmp/tests/constants.py          |  17 +++
 source/lmp/tests/test_deeptensor.py    |  59 ++++++++-
 source/lmp/tests/test_dplr.py          |  80 +++++++++++--
 source/lmp/tests/test_lammps.py        | 160 ++++++++++++++++---------
 source/lmp/tests/write_lmp_data.py     |  22 ++--
 12 files changed, 406 insertions(+), 176 deletions(-)
 create mode 100644 source/lmp/tests/constants.py

diff --git a/doc/third-party/lammps-command.md b/doc/third-party/lammps-command.md
index e1d482381f..a9d849bc7c 100644
--- a/doc/third-party/lammps-command.md
+++ b/doc/third-party/lammps-command.md
@@ -1,5 +1,14 @@
 # LAMMPS commands
 
+## units
+All units in LAMMPS except `lj` are supported. `lj` is not supported.
+
+The most commonly used units are `metal`, since the internal units of distance, energy, force, and charge in DeePMD-kit are `\AA`, `eV`, `eV / \AA`, and `proton charge`, respectively. These units are consistent with the `metal` units in LAMMPS.
+
+If one wants to use other units like `real` or `si`, one is welcome to do so. There is no need to do the unit conversion manually. The unit conversion is done automatically by LAMMPS.
+
+The only thing that one needs to take care of is the unit of the output of `compute deeptensor/atom`. Working with `metal` units for `compute deeptensor/atom` is totally fine, since there is no unit conversion. For other unit styles, we currently assume that the output of the `compute deeptensor/atom` command has the unit of distance and have applied the unit conversion factor of distance. If a user wants to infer quantities with units other than distance, the user is encouraged to open a GitHub feature request, so that the unit conversion factor can be added.
+
 ## Enable DeePMD-kit plugin (plugin mode)
 
 If you are using the plugin mode, enable DeePMD-kit package in LAMMPS with `plugin` command:
@@ -119,6 +128,7 @@ dump            1 all custom 100 water.dump id type c_dipole[1] c_dipole[2] c_di
 
 ### Restrictions
 - The `deeptensor/atom` compute is provided in the USER-DEEPMD package, which is compiled from the DeePMD-kit, visit the [DeePMD-kit website](https://github.com/deepmodeling/deepmd-kit) for more information.
+- For the issue of using a unit style for `compute deeptensor/atom`, refer to the discussions in [units](#units) of this page.
 
 
 ## Long-range interaction
diff --git a/source/lmp/compute_deeptensor_atom.cpp b/source/lmp/compute_deeptensor_atom.cpp
index 0de523e1bf..2f4486002e 100644
--- a/source/lmp/compute_deeptensor_atom.cpp
+++ b/source/lmp/compute_deeptensor_atom.cpp
@@ -26,12 +26,12 @@ using namespace LAMMPS_NS;
 
 ComputeDeeptensorAtom::ComputeDeeptensorAtom(LAMMPS *lmp, int narg, char **arg)
     : Compute(lmp, narg, arg), dp(lmp), tensor(nullptr) {
-  if (!(strcmp(update->unit_style, "metal") == 0 ||
-        strcmp(update->unit_style, "real") == 0)) {
-    error->all(
-        FLERR,
-        "Compute deeptensor/atom requires metal or real unit; please set it by "
-        "\"units metal\" or \"units real\"");
+  if (strcmp(update->unit_style, "lj") == 0) {
+    error->all(FLERR,
+               "Compute deeptensor/atom does not support unit style lj. Please "
+               "use other "
+               "unit styles like metal or real unit instead. You may set it by "
+               "\"units metal\" or \"units real\"");
   }
 
   if (narg < 4) {
@@ -57,6 +57,8 @@ ComputeDeeptensorAtom::ComputeDeeptensorAtom(LAMMPS *lmp, int narg, char **arg)
   timeflag = 1;
 
   nmax = 0;
+
+  dist_unit_cvt_factor = force->angstrom;
 }
 
 /* ---------------------------------------------------------------------- */
@@ -116,16 +118,17 @@ void ComputeDeeptensorAtom::compute_peratom() {
     dtype[ii] = type[ii] - 1;
   }
   // get box
-  dbox[0] = domain->h[0];  // xx
-  dbox[4] = domain->h[1];  // yy
-  dbox[8] = domain->h[2];  // zz
-  dbox[7] = domain->h[3];  // zy
-  dbox[6] = domain->h[4];  // zx
-  dbox[3] = domain->h[5];  // yx
+  dbox[0] = domain->h[0] / dist_unit_cvt_factor;  // xx
+  dbox[4] = domain->h[1] / dist_unit_cvt_factor;  // yy
+  dbox[8] = domain->h[2] / dist_unit_cvt_factor;  // zz
+  dbox[7] = domain->h[3] / dist_unit_cvt_factor;  // zy
+  dbox[6] = domain->h[4] / dist_unit_cvt_factor;  // zx
+  dbox[3] = domain->h[5] / dist_unit_cvt_factor;  // yx
   // get coord
   for (int ii = 0; ii < nall; ++ii) {
     for (int dd = 0; dd < 3; ++dd) {
-      dcoord[ii * 3 + dd] = x[ii][dd] - domain->boxlo[dd];
+      dcoord[ii * 3 + dd] =
+          (x[ii][dd] - domain->boxlo[dd]) / dist_unit_cvt_factor;
     }
   }
 
@@ -155,7 +158,7 @@ void ComputeDeeptensorAtom::compute_peratom() {
     // record when selected and in group
     if (selected && ingroup) {
       for (int jj = 0; jj < size_peratom_cols; ++jj) {
-        tensor[ii][jj] = atensor[iter_tensor + jj];
+        tensor[ii][jj] = atensor[iter_tensor + jj] * dist_unit_cvt_factor;
       }
     }
     // if not selected or not in group set to 0.
diff --git a/source/lmp/compute_deeptensor_atom.h b/source/lmp/compute_deeptensor_atom.h
index ab25da4246..a90283aa9e 100644
--- a/source/lmp/compute_deeptensor_atom.h
+++ b/source/lmp/compute_deeptensor_atom.h
@@ -36,6 +36,7 @@ class ComputeDeeptensorAtom : public Compute {
   void compute_peratom() override;
   double memory_usage() override;
   void init_list(int, class NeighList *) override;
+  double dist_unit_cvt_factor;
 
  private:
   int nmax;
diff --git a/source/lmp/fix_dplr.cpp b/source/lmp/fix_dplr.cpp
index a0df7efd24..77bf0d56c0 100644
--- a/source/lmp/fix_dplr.cpp
+++ b/source/lmp/fix_dplr.cpp
@@ -61,10 +61,11 @@ FixDPLR::FixDPLR(LAMMPS *lmp, int narg, char **arg)
   qe2f = force->qe2f;
   xstyle = ystyle = zstyle = NONE;
 
-  if (strcmp(update->unit_style, "metal") != 0) {
-    error->all(
-        FLERR,
-        "Fix dplr requires metal unit, please set it by \"units metal\"");
+  if (strcmp(update->unit_style, "lj") == 0) {
+    error->all(FLERR,
+               "Fix dplr does not support unit style lj. Please use other "
+               "unit styles like metal or real unit instead. You may set it by "
+               "\"units metal\" or \"units real\"");
   }
 
   int iarg = 3;
@@ -142,6 +143,9 @@ FixDPLR::FixDPLR(LAMMPS *lmp, int narg, char **arg)
   if (!pair_deepmd) {
     error->all(FLERR, "pair_style deepmd should be set before this fix\n");
   }
+  ener_unit_cvt_factor = pair_deepmd->ener_unit_cvt_factor;
+  dist_unit_cvt_factor = pair_deepmd->dist_unit_cvt_factor;
+  force_unit_cvt_factor = pair_deepmd->force_unit_cvt_factor;
 
   int n = atom->ntypes;
   std::vector<std::string> type_names = pair_deepmd->type_names;
@@ -445,16 +449,17 @@ void FixDPLR::pre_force(int vflag) {
     dtype[ii] = type_idx_map[type[ii] - 1];
   }
   // get box
-  dbox[0] = domain->h[0];  // xx
-  dbox[4] = domain->h[1];  // yy
-  dbox[8] = domain->h[2];  // zz
-  dbox[7] = domain->h[3];  // zy
-  dbox[6] = domain->h[4];  // zx
-  dbox[3] = domain->h[5];  // yx
+  dbox[0] = domain->h[0] / dist_unit_cvt_factor;  // xx
+  dbox[4] = domain->h[1] / dist_unit_cvt_factor;  // yy
+  dbox[8] = domain->h[2] / dist_unit_cvt_factor;  // zz
+  dbox[7] = domain->h[3] / dist_unit_cvt_factor;  // zy
+  dbox[6] = domain->h[4] / dist_unit_cvt_factor;  // zx
+  dbox[3] = domain->h[5] / dist_unit_cvt_factor;  // yx
   // get coord
   for (int ii = 0; ii < nall; ++ii) {
     for (int dd = 0; dd < 3; ++dd) {
-      dcoord[ii * 3 + dd] = x[ii][dd] - domain->boxlo[dd];
+      dcoord[ii * 3 + dd] =
+          (x[ii][dd] - domain->boxlo[dd]) / dist_unit_cvt_factor;
     }
   }
   // get lammps nlist
@@ -523,9 +528,11 @@ void FixDPLR::pre_force(int vflag) {
     int res_idx = sel_fwd[idx0];
     // int ret_idx = dpl_bwd[res_idx];
     for (int dd = 0; dd < 3; ++dd) {
-      x[idx1][dd] = x[idx0][dd] + tensor[res_idx * 3 + dd];
+      x[idx1][dd] =
+          x[idx0][dd] + tensor[res_idx * 3 + dd] * dist_unit_cvt_factor;
       // res_buff[idx1 * odim + dd] = tensor[res_idx * odim + dd];
-      dipole_recd[idx0 * 3 + dd] = tensor[res_idx * 3 + dd];
+      dipole_recd[idx0 * 3 + dd] =
+          tensor[res_idx * 3 + dd] * dist_unit_cvt_factor;
     }
   }
   // cout << "-------------------- fix/dplr: pre force " << endl;
@@ -568,17 +575,18 @@ void FixDPLR::post_force(int vflag) {
     for (int ii = 0; ii < nall; ++ii) {
       dtype[ii] = type_idx_map[type[ii] - 1];
     }
-    dbox[0] = domain->h[0];  // xx
-    dbox[4] = domain->h[1];  // yy
-    dbox[8] = domain->h[2];  // zz
-    dbox[7] = domain->h[3];  // zy
-    dbox[6] = domain->h[4];  // zx
-    dbox[3] = domain->h[5];  // yx
+    dbox[0] = domain->h[0] / dist_unit_cvt_factor;  // xx
+    dbox[4] = domain->h[1] / dist_unit_cvt_factor;  // yy
+    dbox[8] = domain->h[2] / dist_unit_cvt_factor;  // zz
+    dbox[7] = domain->h[3] / dist_unit_cvt_factor;  // zy
+    dbox[6] = domain->h[4] / dist_unit_cvt_factor;  // zx
+    dbox[3] = domain->h[5] / dist_unit_cvt_factor;  // yx
     // get coord
     double **x = atom->x;
     for (int ii = 0; ii < nall; ++ii) {
       for (int dd = 0; dd < 3; ++dd) {
-        dcoord[ii * 3 + dd] = x[ii][dd] - domain->boxlo[dd];
+        dcoord[ii * 3 + dd] =
+            (x[ii][dd] - domain->boxlo[dd]) / dist_unit_cvt_factor;
       }
     }
     // revise force according to efield
@@ -599,7 +607,7 @@ void FixDPLR::post_force(int vflag) {
     for (int ii = 0; ii < nlocal; ++ii) {
       double tmpf[3];
       for (int dd = 0; dd < 3; ++dd) {
-        tmpf[dd] = q[ii] * efield[dd];
+        tmpf[dd] = q[ii] * efield[dd] * force->qe2f;
       }
       for (int dd = 0; dd < 3; ++dd) {
         dfele[ii * 3 + dd] += tmpf[dd];
@@ -632,8 +640,17 @@ void FixDPLR::post_force(int vflag) {
   vector<FLOAT_PREC> dfcorr, dvcorr;
   // compute
   try {
+    for (int ii = 0; ii < nlocal * 3; ++ii) {
+      dfele[ii] /= force_unit_cvt_factor;
+    }
     dtm.compute(dfcorr, dvcorr, dcoord, dtype, dbox, valid_pairs, dfele, nghost,
                 lmp_list);
+    for (int ii = 0; ii < nlocal * 3; ++ii) {
+      dfcorr[ii] *= force_unit_cvt_factor;
+    }
+    for (int ii = 0; ii < 9; ++ii) {
+      dvcorr[ii] *= ener_unit_cvt_factor;
+    }
   } catch (deepmd_compat::deepmd_exception &e) {
     error->one(FLERR, e.what());
   }
diff --git a/source/lmp/fix_dplr.h b/source/lmp/fix_dplr.h
index 23ae1c818d..a6822fe4fe 100644
--- a/source/lmp/fix_dplr.h
+++ b/source/lmp/fix_dplr.h
@@ -54,6 +54,7 @@ class FixDPLR : public Fix {
   void unpack_reverse_comm(int, int *, double *) override;
   double compute_scalar(void) override;
   double compute_vector(int) override;
+  double ener_unit_cvt_factor, dist_unit_cvt_factor, force_unit_cvt_factor;
 
  private:
   PairDeepMD *pair_deepmd;
diff --git a/source/lmp/pair_deepmd.cpp b/source/lmp/pair_deepmd.cpp
index 489c31ff19..3fa592bf58 100644
--- a/source/lmp/pair_deepmd.cpp
+++ b/source/lmp/pair_deepmd.cpp
@@ -344,18 +344,16 @@ PairDeepMD::PairDeepMD(LAMMPS *lmp)
   if (lmp->citeme) {
     lmp->citeme->add(cite_user_deepmd_package);
   }
-  int unit_convert;
-  if (strcmp(update->unit_style, "metal") == 0) {
-    unit_convert = utils::NOCONVERT;
-  } else if (strcmp(update->unit_style, "real") == 0) {
-    unit_convert = utils::METAL2REAL;
-  } else {
+  if (strcmp(update->unit_style, "lj") == 0) {
     error->all(FLERR,
-               "Pair deepmd requires metal or real unit, please set it by "
+               "Pair deepmd does not support unit style lj. Please use other "
+               "unit styles like metal or real unit instead. You may set it by "
                "\"units metal\" or \"units real\"");
   }
-  ener_unit_cvt_factor =
-      utils::get_conversion_factor(utils::ENERGY, unit_convert);
+  ener_unit_cvt_factor = force->boltz / 8.617343e-5;
+  dist_unit_cvt_factor = force->angstrom;
+  force_unit_cvt_factor = ener_unit_cvt_factor / dist_unit_cvt_factor;
+
   restartinfo = 1;
 #if LAMMPS_VERSION_NUMBER >= 20201130
   centroidstressflag =
@@ -368,7 +366,6 @@ PairDeepMD::PairDeepMD(LAMMPS *lmp)
   pppmflag = 1;
   respa_enable = 0;
   writedata = 0;
-  unit_convert_flag = utils::get_supported_conversions(utils::ENERGY);
 
   cutoff = 0.;
   numb_types = 0;
@@ -480,17 +477,18 @@ void PairDeepMD::compute(int eflag, int vflag) {
   vector<double> daparam;
 
   // get box
-  dbox[0] = domain->h[0];  // xx
-  dbox[4] = domain->h[1];  // yy
-  dbox[8] = domain->h[2];  // zz
-  dbox[7] = domain->h[3];  // zy
-  dbox[6] = domain->h[4];  // zx
-  dbox[3] = domain->h[5];  // yx
+  dbox[0] = domain->h[0] / dist_unit_cvt_factor;  // xx
+  dbox[4] = domain->h[1] / dist_unit_cvt_factor;  // yy
+  dbox[8] = domain->h[2] / dist_unit_cvt_factor;  // zz
+  dbox[7] = domain->h[3] / dist_unit_cvt_factor;  // zy
+  dbox[6] = domain->h[4] / dist_unit_cvt_factor;  // zx
+  dbox[3] = domain->h[5] / dist_unit_cvt_factor;  // yx
 
   // get coord
   for (int ii = 0; ii < nall; ++ii) {
     for (int dd = 0; dd < 3; ++dd) {
-      dcoord[ii * 3 + dd] = x[ii][dd] - domain->boxlo[dd];
+      dcoord[ii * 3 + dd] =
+          (x[ii][dd] - domain->boxlo[dd]) / dist_unit_cvt_factor;
     }
   }
 
@@ -586,7 +584,7 @@ void PairDeepMD::compute(int eflag, int vflag) {
         }
         if (eflag_atom) {
           for (int ii = 0; ii < nlocal; ++ii) {
-            eatom[ii] += scale[1][1] * deatom[ii];
+            eatom[ii] += scale[1][1] * deatom[ii] * ener_unit_cvt_factor;
           }
         }
         // Added by Davide Tisi 2020
@@ -600,15 +598,24 @@ void PairDeepMD::compute(int eflag, int vflag) {
             // vatom[ii][3] += 1.0 * dvatom[9*ii+3];
             // vatom[ii][4] += 1.0 * dvatom[9*ii+6];
             // vatom[ii][5] += 1.0 * dvatom[9*ii+7];
-            cvatom[ii][0] += scale[1][1] * dvatom[9 * ii + 0];  // xx
-            cvatom[ii][1] += scale[1][1] * dvatom[9 * ii + 4];  // yy
-            cvatom[ii][2] += scale[1][1] * dvatom[9 * ii + 8];  // zz
-            cvatom[ii][3] += scale[1][1] * dvatom[9 * ii + 3];  // xy
-            cvatom[ii][4] += scale[1][1] * dvatom[9 * ii + 6];  // xz
-            cvatom[ii][5] += scale[1][1] * dvatom[9 * ii + 7];  // yz
-            cvatom[ii][6] += scale[1][1] * dvatom[9 * ii + 1];  // yx
-            cvatom[ii][7] += scale[1][1] * dvatom[9 * ii + 2];  // zx
-            cvatom[ii][8] += scale[1][1] * dvatom[9 * ii + 5];  // zy
+            cvatom[ii][0] +=
+                scale[1][1] * dvatom[9 * ii + 0] * ener_unit_cvt_factor;  // xx
+            cvatom[ii][1] +=
+                scale[1][1] * dvatom[9 * ii + 4] * ener_unit_cvt_factor;  // yy
+            cvatom[ii][2] +=
+                scale[1][1] * dvatom[9 * ii + 8] * ener_unit_cvt_factor;  // zz
+            cvatom[ii][3] +=
+                scale[1][1] * dvatom[9 * ii + 3] * ener_unit_cvt_factor;  // xy
+            cvatom[ii][4] +=
+                scale[1][1] * dvatom[9 * ii + 6] * ener_unit_cvt_factor;  // xz
+            cvatom[ii][5] +=
+                scale[1][1] * dvatom[9 * ii + 7] * ener_unit_cvt_factor;  // yz
+            cvatom[ii][6] +=
+                scale[1][1] * dvatom[9 * ii + 1] * ener_unit_cvt_factor;  // yx
+            cvatom[ii][7] +=
+                scale[1][1] * dvatom[9 * ii + 2] * ener_unit_cvt_factor;  // zx
+            cvatom[ii][8] +=
+                scale[1][1] * dvatom[9 * ii + 5] * ener_unit_cvt_factor;  // zy
           }
         }
       }
@@ -638,7 +645,7 @@ void PairDeepMD::compute(int eflag, int vflag) {
       dvatom = all_atom_virial[0];
       if (eflag_atom) {
         for (int ii = 0; ii < nlocal; ++ii) {
-          eatom[ii] += scale[1][1] * deatom[ii];
+          eatom[ii] += scale[1][1] * deatom[ii] * ener_unit_cvt_factor;
         }
       }
       // Added by Davide Tisi 2020
@@ -652,15 +659,24 @@ void PairDeepMD::compute(int eflag, int vflag) {
           // vatom[ii][3] += 1.0 * dvatom[9*ii+3];
           // vatom[ii][4] += 1.0 * dvatom[9*ii+6];
           // vatom[ii][5] += 1.0 * dvatom[9*ii+7];
-          cvatom[ii][0] += scale[1][1] * dvatom[9 * ii + 0];  // xx
-          cvatom[ii][1] += scale[1][1] * dvatom[9 * ii + 4];  // yy
-          cvatom[ii][2] += scale[1][1] * dvatom[9 * ii + 8];  // zz
-          cvatom[ii][3] += scale[1][1] * dvatom[9 * ii + 3];  // xy
-          cvatom[ii][4] += scale[1][1] * dvatom[9 * ii + 6];  // xz
-          cvatom[ii][5] += scale[1][1] * dvatom[9 * ii + 7];  // yz
-          cvatom[ii][6] += scale[1][1] * dvatom[9 * ii + 1];  // yx
-          cvatom[ii][7] += scale[1][1] * dvatom[9 * ii + 2];  // zx
-          cvatom[ii][8] += scale[1][1] * dvatom[9 * ii + 5];  // zy
+          cvatom[ii][0] +=
+              scale[1][1] * dvatom[9 * ii + 0] * ener_unit_cvt_factor;  // xx
+          cvatom[ii][1] +=
+              scale[1][1] * dvatom[9 * ii + 4] * ener_unit_cvt_factor;  // yy
+          cvatom[ii][2] +=
+              scale[1][1] * dvatom[9 * ii + 8] * ener_unit_cvt_factor;  // zz
+          cvatom[ii][3] +=
+              scale[1][1] * dvatom[9 * ii + 3] * ener_unit_cvt_factor;  // xy
+          cvatom[ii][4] +=
+              scale[1][1] * dvatom[9 * ii + 6] * ener_unit_cvt_factor;  // xz
+          cvatom[ii][5] +=
+              scale[1][1] * dvatom[9 * ii + 7] * ener_unit_cvt_factor;  // yz
+          cvatom[ii][6] +=
+              scale[1][1] * dvatom[9 * ii + 1] * ener_unit_cvt_factor;  // yx
+          cvatom[ii][7] +=
+              scale[1][1] * dvatom[9 * ii + 2] * ener_unit_cvt_factor;  // zx
+          cvatom[ii][8] +=
+              scale[1][1] * dvatom[9 * ii + 5] * ener_unit_cvt_factor;  // zy
         }
       }
       if (out_freq > 0 && update->ntimestep % out_freq == 0) {
@@ -729,12 +745,12 @@ void PairDeepMD::compute(int eflag, int vflag) {
           all_v_avg = sqrt(all_v_avg / 9);
         }
         if (rank == 0) {
-          all_v_max *= scale[1][1];
-          all_v_min *= scale[1][1];
-          all_v_avg *= scale[1][1];
-          all_f_max *= scale[1][1];
-          all_f_min *= scale[1][1];
-          all_f_avg *= scale[1][1];
+          all_v_max *= scale[1][1] * ener_unit_cvt_factor;
+          all_v_min *= scale[1][1] * ener_unit_cvt_factor;
+          all_v_avg *= scale[1][1] * ener_unit_cvt_factor;
+          all_f_max *= scale[1][1] * force_unit_cvt_factor;
+          all_f_min *= scale[1][1] * force_unit_cvt_factor;
+          all_f_avg *= scale[1][1] * force_unit_cvt_factor;
           fp << setw(12) << update->ntimestep << " " << setw(18) << all_v_max
              << " " << setw(18) << all_v_min << " " << setw(18) << all_v_avg
              << " " << setw(18) << all_f_max << " " << setw(18) << all_f_min
@@ -760,7 +776,8 @@ void PairDeepMD::compute(int eflag, int vflag) {
                       displacements, MPI_DOUBLE, 0, world);
           if (rank == 0) {
             for (int dd = 0; dd < all_nlocal; ++dd) {
-              std_f_all[tagrecv[dd] - 1] = stdfrecv[dd] * scale[1][1];
+              std_f_all[tagrecv[dd] - 1] =
+                  stdfrecv[dd] * scale[1][1] * force_unit_cvt_factor;
             }
             for (int dd = 0; dd < all_nlocal; ++dd) {
               fp << " " << setw(18) << std_f_all[dd];
@@ -790,7 +807,7 @@ void PairDeepMD::compute(int eflag, int vflag) {
   if (!atom->sp_flag) {
     for (int ii = 0; ii < nall; ++ii) {
       for (int dd = 0; dd < 3; ++dd) {
-        f[ii][dd] += scale[1][1] * dforce[3 * ii + dd];
+        f[ii][dd] += scale[1][1] * dforce[3 * ii + dd] * force_unit_cvt_factor;
       }
     }
   } else {
@@ -799,13 +816,14 @@ void PairDeepMD::compute(int eflag, int vflag) {
     for (int ii = 0; ii < nall; ++ii) {
       for (int dd = 0; dd < 3; ++dd) {
         int new_idx = new_idx_map[ii];
-        f[ii][dd] += scale[1][1] * dforce[3 * new_idx + dd];
+        f[ii][dd] +=
+            scale[1][1] * dforce[3 * new_idx + dd] * force_unit_cvt_factor;
         if (dtype[ii] < numb_types_spin && ii < nlocal) {
           fm[ii][dd] += scale[1][1] * dforce[3 * (new_idx + nlocal) + dd] /
-                        (hbar / spin_norm[dtype[ii]]);
+                        (hbar / spin_norm[dtype[ii]]) * force_unit_cvt_factor;
         } else if (dtype[ii] < numb_types_spin) {
           fm[ii][dd] += scale[1][1] * dforce[3 * (new_idx + nghost) + dd] /
-                        (hbar / spin_norm[dtype[ii]]);
+                        (hbar / spin_norm[dtype[ii]]) * force_unit_cvt_factor;
         }
       }
     }
@@ -819,15 +837,15 @@ void PairDeepMD::compute(int eflag, int vflag) {
 
   // accumulate energy and virial
   if (eflag) {
-    eng_vdwl += scale[1][1] * dener;
+    eng_vdwl += scale[1][1] * dener * ener_unit_cvt_factor;
   }
   if (vflag) {
-    virial[0] += 1.0 * dvirial[0] * scale[1][1];
-    virial[1] += 1.0 * dvirial[4] * scale[1][1];
-    virial[2] += 1.0 * dvirial[8] * scale[1][1];
-    virial[3] += 1.0 * dvirial[3] * scale[1][1];
-    virial[4] += 1.0 * dvirial[6] * scale[1][1];
-    virial[5] += 1.0 * dvirial[7] * scale[1][1];
+    virial[0] += 1.0 * dvirial[0] * scale[1][1] * ener_unit_cvt_factor;
+    virial[1] += 1.0 * dvirial[4] * scale[1][1] * ener_unit_cvt_factor;
+    virial[2] += 1.0 * dvirial[8] * scale[1][1] * ener_unit_cvt_factor;
+    virial[3] += 1.0 * dvirial[3] * scale[1][1] * ener_unit_cvt_factor;
+    virial[4] += 1.0 * dvirial[6] * scale[1][1] * ener_unit_cvt_factor;
+    virial[5] += 1.0 * dvirial[7] * scale[1][1] * ener_unit_cvt_factor;
   }
 }
 
@@ -854,7 +872,7 @@ void PairDeepMD::allocate() {
         continue;
       }
       setflag[i][j] = 1;
-      scale[i][j] = 1.0 * ener_unit_cvt_factor;
+      scale[i][j] = 1.0;
     }
   }
 }
@@ -904,7 +922,7 @@ void PairDeepMD::settings(int narg, char **arg) {
     } catch (deepmd_compat::deepmd_exception &e) {
       error->one(FLERR, e.what());
     }
-    cutoff = deep_pot.cutoff();
+    cutoff = deep_pot.cutoff() * dist_unit_cvt_factor;
     numb_types = deep_pot.numb_types();
     numb_types_spin = deep_pot.numb_types_spin();
     dim_fparam = deep_pot.dim_fparam();
@@ -917,12 +935,12 @@ void PairDeepMD::settings(int narg, char **arg) {
     } catch (deepmd_compat::deepmd_exception &e) {
       error->one(FLERR, e.what());
     }
-    cutoff = deep_pot_model_devi.cutoff();
+    cutoff = deep_pot_model_devi.cutoff() * dist_unit_cvt_factor;
     numb_types = deep_pot_model_devi.numb_types();
     numb_types_spin = deep_pot_model_devi.numb_types_spin();
     dim_fparam = deep_pot_model_devi.dim_fparam();
     dim_aparam = deep_pot_model_devi.dim_aparam();
-    assert(cutoff == deep_pot.cutoff());
+    assert(cutoff == deep_pot.cutoff() * dist_unit_cvt_factor);
     assert(numb_types == deep_pot.numb_types());
     assert(numb_types_spin == deep_pot.numb_types_spin());
     assert(dim_fparam == deep_pot.dim_fparam());
@@ -1197,7 +1215,7 @@ void PairDeepMD::coeff(int narg, char **arg) {
   for (int i = ilo; i <= ihi; i++) {
     for (int j = MAX(jlo, i); j <= jhi; j++) {
       setflag[i][j] = 1;
-      scale[i][j] = 1.0 * ener_unit_cvt_factor;
+      scale[i][j] = 1.0;
       if (i > numb_types || j > numb_types) {
         char warning_msg[1024];
         sprintf(warning_msg,
@@ -1244,7 +1262,7 @@ double PairDeepMD::init_one(int i, int j) {
   }
 
   if (setflag[i][j] == 0) {
-    scale[i][j] = 1.0 * ener_unit_cvt_factor;
+    scale[i][j] = 1.0;
   }
   scale[j][i] = scale[i][j];
 
diff --git a/source/lmp/pair_deepmd.h b/source/lmp/pair_deepmd.h
index d04ea97916..bde7745d36 100644
--- a/source/lmp/pair_deepmd.h
+++ b/source/lmp/pair_deepmd.h
@@ -76,6 +76,7 @@ class PairDeepMD : public Pair {
   std::vector<std::string> get_file_content(
       const std::vector<std::string> &models);
   std::vector<std::string> type_names;
+  double ener_unit_cvt_factor, dist_unit_cvt_factor, force_unit_cvt_factor;
 
  protected:
   virtual void allocate();
@@ -132,7 +133,6 @@ class PairDeepMD : public Pair {
   tagint *tagsend, *tagrecv;
   double *stdfsend, *stdfrecv;
   std::vector<int> type_idx_map;
-  double ener_unit_cvt_factor;
 };
 
 }  // namespace LAMMPS_NS
diff --git a/source/lmp/tests/constants.py b/source/lmp/tests/constants.py
new file mode 100644
index 0000000000..016bf66288
--- /dev/null
+++ b/source/lmp/tests/constants.py
@@ -0,0 +1,17 @@
+# SPDX-License-Identifier: LGPL-3.0-or-later
+# https://github.com/lammps/lammps/blob/1e1311cf401c5fc2614b5d6d0ff3230642b76597/src/update.cpp#L193
+nktv2p = 1.6021765e6
+nktv2p_real = 68568.415
+metal2real = 23.060549
+
+dist_metal2real = 1.0
+ener_metal2real = 23.060549
+force_metal2real = ener_metal2real / dist_metal2real
+mass_metal2real = 1.0
+charge_metal2real = 1.0
+
+dist_metal2si = 1.0e-10
+ener_metal2si = 1.3806504e-23 / 8.617343e-5
+force_metal2si = ener_metal2si / dist_metal2si
+mass_metal2si = 1e-3 / 6.02214e23
+charge_metal2si = 1.6021765e-19
diff --git a/source/lmp/tests/test_deeptensor.py b/source/lmp/tests/test_deeptensor.py
index 8c7d7d9640..3e684b386e 100644
--- a/source/lmp/tests/test_deeptensor.py
+++ b/source/lmp/tests/test_deeptensor.py
@@ -6,6 +6,7 @@
     Path,
 )
 
+import constants
 import numpy as np
 import pytest
 from lammps import (
@@ -23,6 +24,7 @@
 pb_file2 = Path(__file__).parent / "deepdipole_new.pb"
 system_file = Path(__file__).parent.parent.parent / "tests"
 data_file = Path(__file__).parent / "data.lmp"
+data_file_si = Path(__file__).parent / "data.si"
 data_type_map_file = Path(__file__).parent / "data_type_map.lmp"
 
 # this is as the same as python and c++ tests, test_deepdipole.py
@@ -75,24 +77,49 @@ def setup_module():
     write_lmp_data(box, coord, type_OH, data_file)
     # TODO
     # write_lmp_data(box, coord, type_HO, data_type_map_file)
+    write_lmp_data(
+        box * constants.dist_metal2si,
+        coord * constants.dist_metal2si,
+        type_OH,
+        data_file_si,
+    )
 
 
 def teardown_module():
     os.remove(data_file)
     # os.remove(data_type_map_file)
+    os.remove(data_file_si)
 
 
-def _lammps(data_file) -> PyLammps:
+def _lammps(data_file, units="metal") -> PyLammps:
     lammps = PyLammps()
-    lammps.units("metal")
+    lammps.units(units)
     lammps.boundary("p p p")
     lammps.atom_style("atomic")
-    lammps.neighbor("2.0 bin")
+    if units == "metal" or units == "real":
+        lammps.neighbor("2.0 bin")
+    elif units == "si":
+        lammps.neighbor("2.0e-10 bin")
+    else:
+        raise ValueError("units should be metal, real, or si")
     lammps.neigh_modify("every 10 delay 0 check no")
     lammps.read_data(data_file.resolve())
-    lammps.mass("1 16")
-    lammps.mass("2 2")
-    lammps.timestep(0.0005)
+    if units == "metal" or units == "real":
+        lammps.mass("1 16")
+        lammps.mass("2 2")
+    elif units == "si":
+        lammps.mass("1 %.10e" % (16 * constants.mass_metal2si))
+        lammps.mass("2 %.10e" % (2 * constants.mass_metal2si))
+    else:
+        raise ValueError("units should be metal, real, or si")
+    if units == "metal":
+        lammps.timestep(0.0005)
+    elif units == "real":
+        lammps.timestep(0.5)
+    elif units == "si":
+        lammps.timestep(5e-16)
+    else:
+        raise ValueError("units should be metal, real, or si")
     lammps.fix("1 all nve")
     return lammps
 
@@ -109,6 +136,13 @@ def lammps():
 #    yield _lammps(data_file=data_type_map_file)
 
 
+@pytest.fixture
+def lammps_si():
+    lmp = _lammps(data_file=data_file_si, units="si")
+    yield lmp
+    lmp.close()
+
+
 def test_compute_deeptensor_atom(lammps):
     lammps.pair_style(f"deepmd {pb_file.resolve()}")
     lammps.pair_coeff("* *")
@@ -120,3 +154,16 @@ def test_compute_deeptensor_atom(lammps):
     assert np.array(lammps.variables["tensor"].value) == pytest.approx(
         expected_d[idx_map]
     )
+
+
+def test_compute_deeptensor_atom_si(lammps_si):
+    lammps_si.pair_style(f"deepmd {pb_file.resolve()}")
+    lammps_si.pair_coeff("* *")
+    lammps_si.compute(f"tensor all deeptensor/atom {pb_file2.resolve()}")
+    lammps_si.variable("tensor atom c_tensor[1]")
+    lammps_si.dump("1 all custom 1 dump id c_tensor[1]")
+    lammps_si.run(0)
+    idx_map = lammps_si.lmp.numpy.extract_atom("id") - 1
+    assert np.array(lammps_si.variables["tensor"].value) == pytest.approx(
+        expected_d[idx_map] * constants.dist_metal2si
+    )
diff --git a/source/lmp/tests/test_dplr.py b/source/lmp/tests/test_dplr.py
index ceebb71310..9c8f1c0d4f 100644
--- a/source/lmp/tests/test_dplr.py
+++ b/source/lmp/tests/test_dplr.py
@@ -6,6 +6,7 @@
     Path,
 )
 
+import constants
 import numpy as np
 import pytest
 from lammps import (
@@ -20,6 +21,7 @@
 dipole_pbtxt_file = Path(__file__).parent / "lrdipole.pbtxt"
 dipole_pb_file = Path(__file__).parent / "lrdipole.pb"
 data_file = Path(__file__).parent / "data.lmp"
+data_file_si = Path(__file__).parent / "data.si"
 data_type_map_file = Path(__file__).parent / "data_type_map.lmp"
 
 # this is as the same as python and c++ tests, test_deeppot_a.py
@@ -261,9 +263,6 @@
 mesh = 10
 
 
-# https://github.com/lammps/lammps/blob/1e1311cf401c5fc2614b5d6d0ff3230642b76597/src/update.cpp#L193
-nktv2p = 1.6021765e6
-
 sp.check_output(
     "{} -m deepmd convert-from pbtxt -i {} -o {}".format(
         sys.executable,
@@ -280,21 +279,45 @@ def setup_module():
     write_lmp_data_full(
         box, coord, mol_list, type_HO, charge, data_type_map_file, bond_list, mass_list
     )
+    write_lmp_data_full(
+        box * constants.dist_metal2si,
+        coord * constants.dist_metal2si,
+        mol_list,
+        type_OH,
+        charge * constants.charge_metal2si,
+        data_file_si,
+        bond_list,
+        mass_list * constants.mass_metal2si,
+    )
 
 
 def teardown_module():
     os.remove(data_file)
+    os.remove(data_type_map_file)
+    os.remove(data_file_si)
 
 
-def _lammps(data_file, exclude_type="1 3") -> PyLammps:
+def _lammps(data_file, exclude_type="1 3", units="metal") -> PyLammps:
     lammps = PyLammps()
-    lammps.units("metal")
+    lammps.units(units)
     lammps.boundary("p p p")
     lammps.atom_style("full")
-    lammps.neighbor("0.2 bin")
+    if units == "metal" or units == "real":
+        lammps.neighbor("0.2 bin")
+    elif units == "si":
+        lammps.neighbor("2.0e-11 bin")
+    else:
+        raise ValueError("units should be metal, real, or si")
     lammps.neigh_modify("every 1 delay 0 check no exclude type " + exclude_type)
     lammps.read_data(data_file.resolve())
-    lammps.timestep(0.0005)
+    if units == "metal":
+        lammps.timestep(0.0005)
+    elif units == "real":
+        lammps.timestep(0.5)
+    elif units == "si":
+        lammps.timestep(5e-16)
+    else:
+        raise ValueError("units should be metal, real, or si")
     lammps.fix("1 all nve")
     return lammps
 
@@ -313,6 +336,13 @@ def lammps_type_map():
     lmp.close()
 
 
+@pytest.fixture
+def lammps_si():
+    lmp = _lammps(data_file=data_file_si, units="si")
+    yield lmp
+    lmp.close()
+
+
 def test_pair_deepmd_sr(lammps):
     lammps.pair_style(f"deepmd {pb_file.resolve()}")
     lammps.pair_coeff("* *")
@@ -349,7 +379,7 @@ def test_pair_deepmd_sr_virial(lammps):
     for ii in range(9):
         assert np.array(lammps.variables[f"virial{ii}"].value)[
             idx_list
-        ] / nktv2p == pytest.approx(expected_v_sr[:, ii])
+        ] / constants.nktv2p == pytest.approx(expected_v_sr[:, ii])
     os.remove("dump")
 
 
@@ -502,3 +532,37 @@ def test_pair_deepmd_lr_type_map(lammps_type_map):
                 expected_f_lr[lammps_type_map.atoms[ii].id - 1]
             )
     lammps_type_map.run(1)
+
+
+def test_pair_deepmd_lr_si(lammps_si):
+    lammps_si.pair_style(f"deepmd {pb_file.resolve()}")
+    lammps_si.pair_coeff("* *")
+    lammps_si.bond_style("zero")
+    lammps_si.bond_coeff("*")
+    lammps_si.special_bonds("lj/coul 1 1 1 angle no")
+    lammps_si.kspace_style("pppm/dplr 1e-5")
+    lammps_si.kspace_modify(
+        f"gewald {beta / constants.dist_metal2si:.6e} diff ik mesh {mesh:d} {mesh:d} {mesh:d}"
+    )
+    lammps_si.fix(
+        f"0 all dplr model {pb_file.resolve()} type_associate 1 3 bond_type 1"
+    )
+    lammps_si.fix_modify("0 virial yes")
+    lammps_si.run(0)
+    for ii in range(8):
+        if lammps_si.atoms[ii].id > 6:
+            assert lammps_si.atoms[ii].position == pytest.approx(
+                expected_WC[lammps_si.atoms[ii].id - 7] * constants.dist_metal2si
+            )
+    assert lammps_si.eval("elong") == pytest.approx(
+        expected_e_kspace * constants.ener_metal2si
+    )
+    assert lammps_si.eval("pe") == pytest.approx(
+        expected_e_lr * constants.ener_metal2si
+    )
+    for ii in range(8):
+        if lammps_si.atoms[ii].id <= 6:
+            assert lammps_si.atoms[ii].force == pytest.approx(
+                expected_f_lr[lammps_si.atoms[ii].id - 1] * constants.force_metal2si
+            )
+    lammps_si.run(1)
diff --git a/source/lmp/tests/test_lammps.py b/source/lmp/tests/test_lammps.py
index 78eaf7ea4e..028b403abf 100644
--- a/source/lmp/tests/test_lammps.py
+++ b/source/lmp/tests/test_lammps.py
@@ -6,6 +6,7 @@
     Path,
 )
 
+import constants
 import numpy as np
 import pytest
 from lammps import (
@@ -23,6 +24,7 @@
 pb_file2 = Path(__file__).parent / "graph2.pb"
 system_file = Path(__file__).parent.parent.parent / "tests"
 data_file = Path(__file__).parent / "data.lmp"
+data_file_si = Path(__file__).parent / "data.si"
 data_type_map_file = Path(__file__).parent / "data_type_map.lmp"
 md_file = Path(__file__).parent / "md.out"
 
@@ -215,10 +217,6 @@
 type_OH = np.array([1, 2, 2, 1, 2, 2])
 type_HO = np.array([2, 1, 1, 2, 1, 1])
 
-# https://github.com/lammps/lammps/blob/1e1311cf401c5fc2614b5d6d0ff3230642b76597/src/update.cpp#L193
-nktv2p = 1.6021765e6
-nktv2p_real = 68568.415
-metal2real = 23.060549
 
 sp.check_output(
     "{} -m deepmd convert-from pbtxt -i {} -o {}".format(
@@ -239,6 +237,12 @@
 def setup_module():
     write_lmp_data(box, coord, type_OH, data_file)
     write_lmp_data(box, coord, type_HO, data_type_map_file)
+    write_lmp_data(
+        box * constants.dist_metal2si,
+        coord * constants.dist_metal2si,
+        type_OH,
+        data_file_si,
+    )
 
 
 def teardown_module():
@@ -251,17 +255,30 @@ def _lammps(data_file, units="metal") -> PyLammps:
     lammps.units(units)
     lammps.boundary("p p p")
     lammps.atom_style("atomic")
-    lammps.neighbor("2.0 bin")
+    if units == "metal" or units == "real":
+        lammps.neighbor("2.0 bin")
+    elif units == "si":
+        lammps.neighbor("2.0e-10 bin")
+    else:
+        raise ValueError("units should be metal, real, or si")
     lammps.neigh_modify("every 10 delay 0 check no")
     lammps.read_data(data_file.resolve())
-    lammps.mass("1 16")
-    lammps.mass("2 2")
+    if units == "metal" or units == "real":
+        lammps.mass("1 16")
+        lammps.mass("2 2")
+    elif units == "si":
+        lammps.mass("1 %.10e" % (16 * constants.mass_metal2si))
+        lammps.mass("2 %.10e" % (2 * constants.mass_metal2si))
+    else:
+        raise ValueError("units should be metal, real, or si")
     if units == "metal":
         lammps.timestep(0.0005)
     elif units == "real":
         lammps.timestep(0.5)
+    elif units == "si":
+        lammps.timestep(5e-16)
     else:
-        raise ValueError("units should be metal or real")
+        raise ValueError("units should be metal, real, or si")
     lammps.fix("1 all nve")
     return lammps
 
@@ -287,6 +304,13 @@ def lammps_real():
     lmp.close()
 
 
+@pytest.fixture
+def lammps_si():
+    lmp = _lammps(data_file=data_file_si, units="si")
+    yield lmp
+    lmp.close()
+
+
 def test_pair_deepmd(lammps):
     lammps.pair_style(f"deepmd {pb_file.resolve()}")
     lammps.pair_coeff("* *")
@@ -319,7 +343,7 @@ def test_pair_deepmd_virial(lammps):
     for ii in range(9):
         assert np.array(
             lammps.variables[f"virial{ii}"].value
-        ) / nktv2p == pytest.approx(expected_v[idx_map, ii])
+        ) / constants.nktv2p == pytest.approx(expected_v[idx_map, ii])
 
 
 def test_pair_deepmd_model_devi(lammps):
@@ -374,7 +398,7 @@ def test_pair_deepmd_model_devi_virial(lammps):
     for ii in range(9):
         assert np.array(
             lammps.variables[f"virial{ii}"].value
-        ) / nktv2p == pytest.approx(expected_v[idx_map, ii])
+        ) / constants.nktv2p == pytest.approx(expected_v[idx_map, ii])
     # load model devi
     md = np.loadtxt(md_file.resolve())
     expected_md_f = np.linalg.norm(np.std([expected_f, expected_f2], axis=0), axis=1)
@@ -472,10 +496,12 @@ def test_pair_deepmd_real(lammps_real):
     lammps_real.pair_style(f"deepmd {pb_file.resolve()}")
     lammps_real.pair_coeff("* *")
     lammps_real.run(0)
-    assert lammps_real.eval("pe") == pytest.approx(expected_e * metal2real)
+    assert lammps_real.eval("pe") == pytest.approx(
+        expected_e * constants.ener_metal2real
+    )
     for ii in range(6):
         assert lammps_real.atoms[ii].force == pytest.approx(
-            expected_f[lammps_real.atoms[ii].id - 1] * metal2real
+            expected_f[lammps_real.atoms[ii].id - 1] * constants.force_metal2real
         )
     lammps_real.run(1)
 
@@ -491,16 +517,20 @@ def test_pair_deepmd_virial_real(lammps_real):
         "1 all custom 1 dump id " + " ".join([f"v_virial{ii}" for ii in range(9)])
     )
     lammps_real.run(0)
-    assert lammps_real.eval("pe") == pytest.approx(expected_e * metal2real)
+    assert lammps_real.eval("pe") == pytest.approx(
+        expected_e * constants.ener_metal2real
+    )
     for ii in range(6):
         assert lammps_real.atoms[ii].force == pytest.approx(
-            expected_f[lammps_real.atoms[ii].id - 1] * metal2real
+            expected_f[lammps_real.atoms[ii].id - 1] * constants.force_metal2real
         )
     idx_map = lammps_real.lmp.numpy.extract_atom("id") - 1
     for ii in range(9):
         assert np.array(
             lammps_real.variables[f"virial{ii}"].value
-        ) / nktv2p_real == pytest.approx(expected_v[idx_map, ii] * metal2real)
+        ) / constants.nktv2p_real == pytest.approx(
+            expected_v[idx_map, ii] * constants.ener_metal2real
+        )
 
 
 def test_pair_deepmd_model_devi_real(lammps_real):
@@ -511,25 +541,27 @@ def test_pair_deepmd_model_devi_real(lammps_real):
     )
     lammps_real.pair_coeff("* *")
     lammps_real.run(0)
-    assert lammps_real.eval("pe") == pytest.approx(expected_e * metal2real)
+    assert lammps_real.eval("pe") == pytest.approx(
+        expected_e * constants.ener_metal2real
+    )
     for ii in range(6):
         assert lammps_real.atoms[ii].force == pytest.approx(
-            expected_f[lammps_real.atoms[ii].id - 1] * metal2real
+            expected_f[lammps_real.atoms[ii].id - 1] * constants.force_metal2real
         )
     # load model devi
     md = np.loadtxt(md_file.resolve())
     expected_md_f = np.linalg.norm(np.std([expected_f, expected_f2], axis=0), axis=1)
-    assert md[7:] == pytest.approx(expected_md_f * metal2real)
-    assert md[4] == pytest.approx(np.max(expected_md_f) * metal2real)
-    assert md[5] == pytest.approx(np.min(expected_md_f) * metal2real)
-    assert md[6] == pytest.approx(np.mean(expected_md_f) * metal2real)
+    assert md[7:] == pytest.approx(expected_md_f * constants.force_metal2real)
+    assert md[4] == pytest.approx(np.max(expected_md_f) * constants.force_metal2real)
+    assert md[5] == pytest.approx(np.min(expected_md_f) * constants.force_metal2real)
+    assert md[6] == pytest.approx(np.mean(expected_md_f) * constants.force_metal2real)
     expected_md_v = (
         np.std([np.sum(expected_v, axis=0), np.sum(expected_v2, axis=0)], axis=0) / 6
     )
-    assert md[1] == pytest.approx(np.max(expected_md_v) * metal2real)
-    assert md[2] == pytest.approx(np.min(expected_md_v) * metal2real)
+    assert md[1] == pytest.approx(np.max(expected_md_v) * constants.ener_metal2real)
+    assert md[2] == pytest.approx(np.min(expected_md_v) * constants.ener_metal2real)
     assert md[3] == pytest.approx(
-        np.sqrt(np.mean(np.square(expected_md_v))) * metal2real
+        np.sqrt(np.mean(np.square(expected_md_v))) * constants.ener_metal2real
     )
 
 
@@ -548,30 +580,34 @@ def test_pair_deepmd_model_devi_virial_real(lammps_real):
         "1 all custom 1 dump id " + " ".join([f"v_virial{ii}" for ii in range(9)])
     )
     lammps_real.run(0)
-    assert lammps_real.eval("pe") == pytest.approx(expected_e * metal2real)
+    assert lammps_real.eval("pe") == pytest.approx(
+        expected_e * constants.ener_metal2real
+    )
     for ii in range(6):
         assert lammps_real.atoms[ii].force == pytest.approx(
-            expected_f[lammps_real.atoms[ii].id - 1] * metal2real
+            expected_f[lammps_real.atoms[ii].id - 1] * constants.force_metal2real
         )
     idx_map = lammps_real.lmp.numpy.extract_atom("id") - 1
     for ii in range(9):
         assert np.array(
             lammps_real.variables[f"virial{ii}"].value
-        ) / nktv2p_real == pytest.approx(expected_v[idx_map, ii] * metal2real)
+        ) / constants.nktv2p_real == pytest.approx(
+            expected_v[idx_map, ii] * constants.ener_metal2real
+        )
     # load model devi
     md = np.loadtxt(md_file.resolve())
     expected_md_f = np.linalg.norm(np.std([expected_f, expected_f2], axis=0), axis=1)
-    assert md[7:] == pytest.approx(expected_md_f * metal2real)
-    assert md[4] == pytest.approx(np.max(expected_md_f) * metal2real)
-    assert md[5] == pytest.approx(np.min(expected_md_f) * metal2real)
-    assert md[6] == pytest.approx(np.mean(expected_md_f) * metal2real)
+    assert md[7:] == pytest.approx(expected_md_f * constants.force_metal2real)
+    assert md[4] == pytest.approx(np.max(expected_md_f) * constants.force_metal2real)
+    assert md[5] == pytest.approx(np.min(expected_md_f) * constants.force_metal2real)
+    assert md[6] == pytest.approx(np.mean(expected_md_f) * constants.force_metal2real)
     expected_md_v = (
         np.std([np.sum(expected_v, axis=0), np.sum(expected_v2, axis=0)], axis=0) / 6
     )
-    assert md[1] == pytest.approx(np.max(expected_md_v) * metal2real)
-    assert md[2] == pytest.approx(np.min(expected_md_v) * metal2real)
+    assert md[1] == pytest.approx(np.max(expected_md_v) * constants.ener_metal2real)
+    assert md[2] == pytest.approx(np.min(expected_md_v) * constants.ener_metal2real)
     assert md[3] == pytest.approx(
-        np.sqrt(np.mean(np.square(expected_md_v))) * metal2real
+        np.sqrt(np.mean(np.square(expected_md_v))) * constants.ener_metal2real
     )
 
 
@@ -582,32 +618,34 @@ def test_pair_deepmd_model_devi_atomic_relative_real(lammps_real):
             pb_file.resolve(),
             pb_file2.resolve(),
             md_file.resolve(),
-            relative * metal2real,
+            relative * constants.force_metal2real,
         )
     )
     lammps_real.pair_coeff("* *")
     lammps_real.run(0)
-    assert lammps_real.eval("pe") == pytest.approx(expected_e * metal2real)
+    assert lammps_real.eval("pe") == pytest.approx(
+        expected_e * constants.ener_metal2real
+    )
     for ii in range(6):
         assert lammps_real.atoms[ii].force == pytest.approx(
-            expected_f[lammps_real.atoms[ii].id - 1] * metal2real
+            expected_f[lammps_real.atoms[ii].id - 1] * constants.force_metal2real
         )
     # load model devi
     md = np.loadtxt(md_file.resolve())
     norm = np.linalg.norm(np.mean([expected_f, expected_f2], axis=0), axis=1)
     expected_md_f = np.linalg.norm(np.std([expected_f, expected_f2], axis=0), axis=1)
     expected_md_f /= norm + relative
-    assert md[7:] == pytest.approx(expected_md_f * metal2real)
-    assert md[4] == pytest.approx(np.max(expected_md_f) * metal2real)
-    assert md[5] == pytest.approx(np.min(expected_md_f) * metal2real)
-    assert md[6] == pytest.approx(np.mean(expected_md_f) * metal2real)
+    assert md[7:] == pytest.approx(expected_md_f * constants.force_metal2real)
+    assert md[4] == pytest.approx(np.max(expected_md_f) * constants.force_metal2real)
+    assert md[5] == pytest.approx(np.min(expected_md_f) * constants.force_metal2real)
+    assert md[6] == pytest.approx(np.mean(expected_md_f) * constants.force_metal2real)
     expected_md_v = (
         np.std([np.sum(expected_v, axis=0), np.sum(expected_v2, axis=0)], axis=0) / 6
     )
-    assert md[1] == pytest.approx(np.max(expected_md_v) * metal2real)
-    assert md[2] == pytest.approx(np.min(expected_md_v) * metal2real)
+    assert md[1] == pytest.approx(np.max(expected_md_v) * constants.ener_metal2real)
+    assert md[2] == pytest.approx(np.min(expected_md_v) * constants.ener_metal2real)
     assert md[3] == pytest.approx(
-        np.sqrt(np.mean(np.square(expected_md_v))) * metal2real
+        np.sqrt(np.mean(np.square(expected_md_v))) * constants.ener_metal2real
     )
 
 
@@ -618,22 +656,24 @@ def test_pair_deepmd_model_devi_atomic_relative_v_real(lammps_real):
             pb_file.resolve(),
             pb_file2.resolve(),
             md_file.resolve(),
-            relative * metal2real,
+            relative * constants.ener_metal2real,
         )
     )
     lammps_real.pair_coeff("* *")
     lammps_real.run(0)
-    assert lammps_real.eval("pe") == pytest.approx(expected_e * metal2real)
+    assert lammps_real.eval("pe") == pytest.approx(
+        expected_e * constants.ener_metal2real
+    )
     for ii in range(6):
         assert lammps_real.atoms[ii].force == pytest.approx(
-            expected_f[lammps_real.atoms[ii].id - 1] * metal2real
+            expected_f[lammps_real.atoms[ii].id - 1] * constants.force_metal2real
         )
     md = np.loadtxt(md_file.resolve())
     expected_md_f = np.linalg.norm(np.std([expected_f, expected_f2], axis=0), axis=1)
-    assert md[7:] == pytest.approx(expected_md_f * metal2real)
-    assert md[4] == pytest.approx(np.max(expected_md_f) * metal2real)
-    assert md[5] == pytest.approx(np.min(expected_md_f) * metal2real)
-    assert md[6] == pytest.approx(np.mean(expected_md_f) * metal2real)
+    assert md[7:] == pytest.approx(expected_md_f * constants.force_metal2real)
+    assert md[4] == pytest.approx(np.max(expected_md_f) * constants.force_metal2real)
+    assert md[5] == pytest.approx(np.min(expected_md_f) * constants.force_metal2real)
+    assert md[6] == pytest.approx(np.mean(expected_md_f) * constants.force_metal2real)
     expected_md_v = (
         np.std([np.sum(expected_v, axis=0), np.sum(expected_v2, axis=0)], axis=0) / 6
     )
@@ -644,8 +684,20 @@ def test_pair_deepmd_model_devi_atomic_relative_v_real(lammps_real):
         / 6
     )
     expected_md_v /= norm + relative
-    assert md[1] == pytest.approx(np.max(expected_md_v) * metal2real)
-    assert md[2] == pytest.approx(np.min(expected_md_v) * metal2real)
+    assert md[1] == pytest.approx(np.max(expected_md_v) * constants.ener_metal2real)
+    assert md[2] == pytest.approx(np.min(expected_md_v) * constants.ener_metal2real)
     assert md[3] == pytest.approx(
-        np.sqrt(np.mean(np.square(expected_md_v))) * metal2real
+        np.sqrt(np.mean(np.square(expected_md_v))) * constants.ener_metal2real
     )
+
+
+def test_pair_deepmd_si(lammps_si):
+    lammps_si.pair_style(f"deepmd {pb_file.resolve()}")
+    lammps_si.pair_coeff("* *")
+    lammps_si.run(0)
+    assert lammps_si.eval("pe") == pytest.approx(expected_e * constants.ener_metal2si)
+    for ii in range(6):
+        assert lammps_si.atoms[ii].force == pytest.approx(
+            expected_f[lammps_si.atoms[ii].id - 1] * constants.force_metal2si
+        )
+    lammps_si.run(1)
diff --git a/source/lmp/tests/write_lmp_data.py b/source/lmp/tests/write_lmp_data.py
index 05a2760f4c..12e91764f1 100644
--- a/source/lmp/tests/write_lmp_data.py
+++ b/source/lmp/tests/write_lmp_data.py
@@ -11,13 +11,13 @@ def write_lmp_data(box, coord, type_list, file_name):
         f.write(comment_lmp_data + "\n")
         f.write("%d atoms\n" % (natom))
         f.write("%d atom types\n" % (ntype))
-        f.write(f"{box[0]:.8f} {box[1]:.8f} xlo xhi\n")
-        f.write(f"{box[2]:.8f} {box[3]:.8f} ylo yhi\n")
-        f.write(f"{box[4]:.8f} {box[5]:.8f} zlo zhi\n")
-        f.write(f"{box[6]:.8f} {box[7]:.8f} {box[8]:.8f} xy xz yz\n\nAtoms\n\n")
+        f.write(f"{box[0]:.10e} {box[1]:.10e} xlo xhi\n")
+        f.write(f"{box[2]:.10e} {box[3]:.10e} ylo yhi\n")
+        f.write(f"{box[4]:.10e} {box[5]:.10e} zlo zhi\n")
+        f.write(f"{box[6]:.10e} {box[7]:.10e} {box[8]:.10e} xy xz yz\n\nAtoms\n\n")
         for i in range(natom):
             f.write(
-                "%d %d %.8f %.8f %.8f\n"
+                "%d %d %.10e %.10e %.10e\n"
                 % (i + 1, type_list[i], coord[i][0], coord[i][1], coord[i][2])
             )
         f.write("\n")
@@ -38,17 +38,17 @@ def write_lmp_data_full(
         f.write("%d atom types\n" % (ntype))
         f.write("%d bonds\n" % (nbond_list.sum()))
         f.write("%d bond types\n" % (nbond_type))
-        f.write(f"{box[0]:.8f} {box[1]:.8f} xlo xhi\n")
-        f.write(f"{box[2]:.8f} {box[3]:.8f} ylo yhi\n")
-        f.write(f"{box[4]:.8f} {box[5]:.8f} zlo zhi\n")
-        f.write(f"{box[6]:.8f} {box[7]:.8f} {box[8]:.8f} xy xz yz\n")
+        f.write(f"{box[0]:.10e} {box[1]:.10e} xlo xhi\n")
+        f.write(f"{box[2]:.10e} {box[3]:.10e} ylo yhi\n")
+        f.write(f"{box[4]:.10e} {box[5]:.10e} zlo zhi\n")
+        f.write(f"{box[6]:.10e} {box[7]:.10e} {box[8]:.10e} xy xz yz\n")
         f.write("\nMasses\n\n")
         for i in range(3):
-            f.write(f"{i+1:d} {mass_list[i]:.6f}\n")
+            f.write(f"{i+1:d} {mass_list[i]:.10e}\n")
         f.write("\nAtoms\n\n")
         for i in range(natom):
             f.write(
-                "%d %d %d %d %.8f %.8f %.8f\n"
+                "%d %d %d %.10e %.10e %.10e %.10e\n"
                 % (
                     i + 1,
                     mol_list[i],

From c92d1f1a3f24bbf1c0b712a3455a204903c7fa8b Mon Sep 17 00:00:00 2001
From: Jinzhe Zeng <jinzhe.zeng@rutgers.edu>
Date: Sun, 17 Sep 2023 20:55:09 -0400
Subject: [PATCH 33/63] fix CTest (#2828)

1. set `WORKING_DIRECTORY` for the `runUnitTest_cc` and `runUnitTests_c`
tests (helps find the graph files);
2. set `BUILD_RPATH` for `deepmd_cc` (helps find `deepmd_op`).

Then CTest should work, using either the `ctest` or `cmake --build .
--target test` command.

---------

Signed-off-by: Jinzhe Zeng <jinzhe.zeng@rutgers.edu>
---
 source/api_c/tests/CMakeLists.txt  |  5 ++++-
 source/api_cc/CMakeLists.txt       |  3 ++-
 source/api_cc/tests/CMakeLists.txt |  5 ++++-
 source/install/test_cc.sh          | 11 +----------
 source/install/test_cc_local.sh    | 11 +----------
 5 files changed, 12 insertions(+), 23 deletions(-)

diff --git a/source/api_c/tests/CMakeLists.txt b/source/api_c/tests/CMakeLists.txt
index a3559a4e19..d4233a83e8 100644
--- a/source/api_c/tests/CMakeLists.txt
+++ b/source/api_c/tests/CMakeLists.txt
@@ -11,6 +11,9 @@ target_link_libraries(runUnitTests_c PRIVATE GTest::gtest_main ${LIB_DEEPMD_C}
                                              rt coverage_config)
 target_link_libraries(runUnitTests_c PRIVATE ${LIB_DEEPMD} ${LIB_DEEPMD_CC})
 target_precompile_headers(runUnitTests_c PRIVATE test_utils.h [["deepmd.hpp"]])
-add_test(runUnitTests_c runUnitTests_c)
+add_test(
+  NAME runUnitTests_c
+  COMMAND runUnitTests_c
+  WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
 set_target_properties(runUnitTests_c PROPERTIES INSTALL_RPATH "$ORIGIN/../lib")
 install(TARGETS runUnitTests_c DESTINATION bin/)
diff --git a/source/api_cc/CMakeLists.txt b/source/api_cc/CMakeLists.txt
index bdcb51a498..2f296e3dfd 100644
--- a/source/api_cc/CMakeLists.txt
+++ b/source/api_cc/CMakeLists.txt
@@ -25,7 +25,8 @@ if(Protobuf_LIBRARY)
 endif()
 
 set_target_properties(
-  ${libname} PROPERTIES INSTALL_RPATH "$ORIGIN;${TensorFlow_LIBRARY_PATH}")
+  ${libname} PROPERTIES INSTALL_RPATH "$ORIGIN;${TensorFlow_LIBRARY_PATH}"
+                        BUILD_RPATH "$ORIGIN/../op")
 target_compile_definitions(${libname} PRIVATE TF_PRIVATE)
 if(CMAKE_TESTING_ENABLED)
   target_link_libraries(${libname} PRIVATE coverage_config)
diff --git a/source/api_cc/tests/CMakeLists.txt b/source/api_cc/tests/CMakeLists.txt
index ff6e432abd..1511dbe3bc 100644
--- a/source/api_cc/tests/CMakeLists.txt
+++ b/source/api_cc/tests/CMakeLists.txt
@@ -7,7 +7,10 @@ add_executable(runUnitTests_cc ${TEST_SRC})
 target_link_libraries(runUnitTests_cc GTest::gtest_main ${LIB_DEEPMD_CC} rt
                       coverage_config)
 target_precompile_headers(runUnitTests_cc PRIVATE test_utils.h)
-add_test(runUnitTest_cc runUnitTests_cc)
+add_test(
+  NAME runUnitTest_cc
+  COMMAND runUnitTests_cc
+  WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
 set_target_properties(runUnitTests_cc PROPERTIES INSTALL_RPATH "$ORIGIN/../lib")
 target_compile_definitions(runUnitTests_cc PUBLIC ${prec_def})
 install(TARGETS runUnitTests_cc DESTINATION bin/)
diff --git a/source/install/test_cc.sh b/source/install/test_cc.sh
index 55fe03bad8..eeff8c47bc 100755
--- a/source/install/test_cc.sh
+++ b/source/install/test_cc.sh
@@ -14,13 +14,4 @@ cd ${BUILD_TMP_DIR}
 cmake -DINSTALL_TENSORFLOW=TRUE -DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX} -DTENSORFLOW_ROOT=${INSTALL_PREFIX} -DBUILD_TESTING:BOOL=TRUE -DLAMMPS_VERSION=stable_2Aug2023 ..
 cmake --build . -j${NPROC}
 cmake --install .
-
-#------------------
-# go to a subdirectory...
-# TODO: detect directory of graph files
-mkdir -p ${BUILD_TMP_DIR}/exec_tests
-cd ${BUILD_TMP_DIR}/exec_tests
-
-${INSTALL_PREFIX}/bin/runUnitTests_lib
-${INSTALL_PREFIX}/bin/runUnitTests_cc
-${INSTALL_PREFIX}/bin/runUnitTests_c
+ctest --output-on-failure
diff --git a/source/install/test_cc_local.sh b/source/install/test_cc_local.sh
index ec1bfadd69..14f86a6646 100755
--- a/source/install/test_cc_local.sh
+++ b/source/install/test_cc_local.sh
@@ -15,13 +15,4 @@ cd ${BUILD_TMP_DIR}
 cmake -DINSTALL_TENSORFLOW=FALSE -DUSE_TF_PYTHON_LIBS=TRUE -DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX} -DBUILD_TESTING:BOOL=TRUE -DLAMMPS_VERSION=stable_2Aug2023 ..
 cmake --build . -j${NPROC}
 cmake --install .
-
-#------------------
-# go to a subdirectory...
-# TODO: detect directory of graph files
-mkdir -p ${BUILD_TMP_DIR}/exec_tests
-cd ${BUILD_TMP_DIR}/exec_tests
-
-${INSTALL_PREFIX}/bin/runUnitTests_lib
-${INSTALL_PREFIX}/bin/runUnitTests_cc
-${INSTALL_PREFIX}/bin/runUnitTests_c
+ctest --output-on-failure

From 0b4656362b5d74e832c2893c17ecb585ccf2e43c Mon Sep 17 00:00:00 2001
From: Jinzhe Zeng <jinzhe.zeng@rutgers.edu>
Date: Sun, 17 Sep 2023 20:57:16 -0400
Subject: [PATCH 34/63] add tox configuration (#2829)

Now, one can execute `tox` to test the Python package in an isolated
environment. (The required packages can be automatically installed)

---------

Signed-off-by: Jinzhe Zeng <jinzhe.zeng@rutgers.edu>
---
 pyproject.toml | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)

diff --git a/pyproject.toml b/pyproject.toml
index 0ab9390efb..8a63a8727e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -138,6 +138,31 @@ test-command = [
     "dp -h",
 ]
 
+# One can run `tox` or `tox -e gpu`
+# to run pytest in an isolated environment
+# Use with pipx:
+# $ pip install -U pipx
+# $ pipx tox
+[tool.tox]
+legacy_tox_ini = """
+    [tox]
+    min_version = 4.0
+
+    [testenv]
+    extras =
+        test
+        cpu
+    commands = pytest source/tests
+
+    [testenv:gpu]
+    extras =
+        test
+        gpu
+    commands = pytest source/tests
+    setenv =
+        DP_VARIANT = cuda
+"""
+
 # selectively turn of lintner warnings, always include reasoning why any warning should
 # be silenced
 

From a597cc62092e0620de6483286381b0d409c5444b Mon Sep 17 00:00:00 2001
From: Jinzhe Zeng <jinzhe.zeng@rutgers.edu>
Date: Sun, 17 Sep 2023 23:27:36 -0400
Subject: [PATCH 35/63] use parse_version from packaging.version instead of
 pkg_resources (#2830)

it seems pkg_resources is deprecated

Signed-off-by: Jinzhe Zeng <jinzhe.zeng@rutgers.edu>
---
 source/tests/test_pairwise_dprc.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/source/tests/test_pairwise_dprc.py b/source/tests/test_pairwise_dprc.py
index 7fbb4fdf19..0f3f9fad50 100644
--- a/source/tests/test_pairwise_dprc.py
+++ b/source/tests/test_pairwise_dprc.py
@@ -9,9 +9,7 @@
     run_dp,
     tests_path,
 )
-from pkg_resources import (
-    parse_version,
-)
+from packaging.version import parse as parse_version
 
 from deepmd import (
     DeepPotential,

From b8a47719d695a133b8f8eabef2eacc5746848520 Mon Sep 17 00:00:00 2001
From: mingzhong15 <46273005+mingzhong15@users.noreply.github.com>
Date: Mon, 18 Sep 2023 11:45:47 +0800
Subject: [PATCH 36/63] add citation for `aparam` (#2825)

add reference for `aparam`

---------

Signed-off-by: mingzhong15 <46273005+mingzhong15@users.noreply.github.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Jinzhe Zeng <jinzhe.zeng@rutgers.edu>
---
 CITATIONS.bib   | 9 +++++++++
 doc/credits.rst | 7 +++++++
 2 files changed, 16 insertions(+)

diff --git a/CITATIONS.bib b/CITATIONS.bib
index 930b4fc2a5..ac682b28f7 100644
--- a/CITATIONS.bib
+++ b/CITATIONS.bib
@@ -118,6 +118,15 @@ @article{Zhang_PhysPlasmas_2020_v27_p122704
     doi = {10.1063/5.0023265},
 }
 
+@misc{Zeng_2023_TTMDPMD,
+   annote = {atom-specific parameter (e.g. electron temperature) },
+   author = {Zeng, Qiyu and Chen, Bo and Zhang, Shen and Kang, Dongdong and Wang, Han and Yu, Xiaoxiang and Dai, Jiayu},
+   title = {{Full-scale ab initio simulations of laser-driven atomistic dynamics}},
+   publisher = {arXiv},
+   year = {2023},
+   doi = {10.48550/arXiv.2308.13863},
+}
+
 @article{Zhang_PhysRevB_2020_v102_p41121,
     annote = {fit dipole},
     title={{Deep neural network for the dielectric response of insulators}},
diff --git a/doc/credits.rst b/doc/credits.rst
index 3612b8ace8..3fbe1d56d8 100644
--- a/doc/credits.rst
+++ b/doc/credits.rst
@@ -56,6 +56,13 @@ Cite DeePMD-kit and methods
 
    Zhang_PhysPlasmas_2020_v27_p122704
 
+- If atom-specific parameters (`aparam`, e.g. electronic temperature) are used,
+
+.. bibliography::
+   :filter: False
+
+   Zeng_2023_TTMDPMD
+
 - If fitting dipole,
 
 .. bibliography::

From ce7532a14d8fbd3835ea48cc6fa7604833b2dc28 Mon Sep 17 00:00:00 2001
From: Jinzhe Zeng <jinzhe.zeng@rutgers.edu>
Date: Sun, 17 Sep 2023 23:47:17 -0400
Subject: [PATCH 37/63] Fix invalid escape sequence (#2820)

Fix the following warnings: DeprecationWarning: invalid escape sequence
---
 deepmd/utils/argcheck.py | 2 +-
 deepmd/utils/convert.py  | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/deepmd/utils/argcheck.py b/deepmd/utils/argcheck.py
index f670feb578..7bd373b492 100644
--- a/deepmd/utils/argcheck.py
+++ b/deepmd/utils/argcheck.py
@@ -1203,7 +1203,7 @@ def loss_ener_spin():
     doc_start_pref_pf = start_pref("atom_pref")
     doc_limit_pref_pf = limit_pref("atom_pref")
     doc_relative_f = "If provided, relative force error will be used in the loss. The difference of force will be normalized by the magnitude of the force in the label with a shift given by `relative_f`, i.e. DF_i / ( || F || + relative_f ) with DF denoting the difference between prediction and label and || F || denoting the L2 norm of the label."
-    doc_enable_atom_ener_coeff = "If true, the energy will be computed as \sum_i c_i E_i. c_i should be provided by file atom_ener_coeff.npy in each data system, otherwise it's 1."
+    doc_enable_atom_ener_coeff = r"If true, the energy will be computed as \sum_i c_i E_i. c_i should be provided by file atom_ener_coeff.npy in each data system, otherwise it's 1."
     return [
         Argument(
             "start_pref_e",
diff --git a/deepmd/utils/convert.py b/deepmd/utils/convert.py
index cb337088b2..dd26fa1058 100644
--- a/deepmd/utils/convert.py
+++ b/deepmd/utils/convert.py
@@ -33,7 +33,7 @@ def detect_model_version(input_model: str):
     elif file_content.find("model_attr/model_version") == -1:
         name_dsea = file_content.find('name: "DescrptSeA"')
         post_dsea = file_content[name_dsea:]
-        post_dsea2 = post_dsea[:300].find("\}")
+        post_dsea2 = post_dsea[:300].find(r"}")
         search_double = post_dsea[:post_dsea2]
         if search_double.find("DT_DOUBLE") == -1:
             version = "1.2"

From a735bed9470f0855a9c1e8e3f57a454c3e00fe5d Mon Sep 17 00:00:00 2001
From: Jinzhe Zeng <jinzhe.zeng@rutgers.edu>
Date: Tue, 19 Sep 2023 01:18:06 -0400
Subject: [PATCH 38/63] make the pairwise DPRc model 2x faster (#2833)

This PR uses a trick to speed up the pairwise DPRc model. Since #2618
is not ready and is quite difficult to implement, in this PR multiple
frames are merged into one frame before being fed to the `prod_env_mat`
OP, and the mesh is faked so that the OP behaves the same as it would
with multiple frames.
A new `mesh` layout is proposed. The first element stores `nloc`, and the
following 15 elements are unused; they distinguish it from other meshes.
Elements `(16 : 16 + nloc)` store `ilist`, elements `(16 + nloc : 16 +
2 * nloc)` store `numneigh`, and the remaining `sum(numneigh)` elements
store the neighbors. The `nei_mode` is 4 in this
case.

`prod_env_mat` OP is not a bottleneck anymore, as shown below.

![image](https://github.com/deepmodeling/deepmd-kit/assets/9496702/eea64b99-d630-4ea1-99f4-e7d49c126c33)

---------

Signed-off-by: Jinzhe Zeng <jinzhe.zeng@rutgers.edu>
---
 deepmd/model/pairwise_dprc.py          |  55 ++++---
 source/lib/include/gpu_rocm.h          |   2 +-
 source/op/pairwise.cc                  | 200 ++++++++++++++++++++++++-
 source/op/prod_env_mat_multi_device.cc | 128 +++++++++++++++-
 source/tests/test_pairwise_dprc.py     | 168 ++++++++++++++++++++-
 5 files changed, 526 insertions(+), 27 deletions(-)

diff --git a/deepmd/model/pairwise_dprc.py b/deepmd/model/pairwise_dprc.py
index a9e154096a..8f46ec239d 100644
--- a/deepmd/model/pairwise_dprc.py
+++ b/deepmd/model/pairwise_dprc.py
@@ -59,6 +59,10 @@ def __init__(
         compress: Optional[dict] = None,
         **kwargs,
     ) -> None:
+        # internal variable to compare old and new behavior
+        # expect they give the same results
+        self.merge_frames = True
+
         super().__init__(
             type_embedding=type_embedding,
             type_map=type_map,
@@ -151,16 +155,27 @@ def build(
         atype = tf.reshape(atype_, [nframes, natoms[1], 1])
         nframes_qmmm = tf.shape(qmmm_frame_idx)[0]
 
+        if self.merge_frames:
+            (
+                forward_qmmm_map,
+                backward_qmmm_map,
+                natoms_qmmm,
+                mesh_qmmm,
+            ) = op_module.convert_forward_map(forward_qmmm_map, natoms_qmmm, natoms)
+            coord_qmmm = tf.reshape(coord, [1, -1, 3])
+            atype_qmmm = tf.reshape(atype, [1, -1, 1])
+            box_qmmm = tf.reshape(box[0], [1, 9])
+        else:
+            mesh_qmmm = make_default_mesh(False, True)
+            coord_qmmm = tf.gather(coord, qmmm_frame_idx)
+            atype_qmmm = tf.gather(atype, qmmm_frame_idx)
+            box_qmmm = tf.gather(box, qmmm_frame_idx)
+
         coord_qm = gather_placeholder(coord, forward_qm_map)
         atype_qm = gather_placeholder(atype, forward_qm_map, placeholder=-1)
-        coord_qmmm = gather_placeholder(
-            tf.gather(coord, qmmm_frame_idx), forward_qmmm_map
-        )
-        atype_qmmm = gather_placeholder(
-            tf.gather(atype, qmmm_frame_idx), forward_qmmm_map, placeholder=-1
-        )
+        coord_qmmm = gather_placeholder(coord_qmmm, forward_qmmm_map)
+        atype_qmmm = gather_placeholder(atype_qmmm, forward_qmmm_map, placeholder=-1)
         box_qm = box
-        box_qmmm = tf.gather(box, qmmm_frame_idx)
 
         type_embedding = self.typeebd.build(
             self.ntypes,
@@ -189,7 +204,7 @@ def build(
             atype_qmmm,
             natoms_qmmm,
             box_qmmm,
-            mesh_mixed_type,
+            mesh_qmmm,
             input_dict_qmmm,
             frz_model=frz_model,
             ckpt_meta=ckpt_meta,
@@ -197,10 +212,14 @@ def build(
             reuse=reuse,
         )
 
-        energy_qm = qm_dict["energy"]
-        energy_qmmm = tf.math.segment_sum(qmmm_dict["energy"], qmmm_frame_idx)
-        energy = energy_qm + energy_qmmm
-        energy = tf.identity(energy, name="o_energy" + suffix)
+        if self.merge_frames:
+            qmmm_dict = qmmm_dict.copy()
+            sub_nframes = tf.shape(backward_qmmm_map)[0]
+            qmmm_dict["force"] = tf.tile(qmmm_dict["force"], [sub_nframes, 1])
+            qmmm_dict["atom_ener"] = tf.tile(qmmm_dict["atom_ener"], [sub_nframes, 1])
+            qmmm_dict["atom_virial"] = tf.tile(
+                qmmm_dict["atom_virial"], [sub_nframes, 1]
+            )
 
         force_qm = gather_placeholder(
             tf.reshape(qm_dict["force"], (nframes, natoms_qm[1], 3)),
@@ -218,11 +237,6 @@ def build(
         force = force_qm + force_qmmm
         force = tf.reshape(force, (nframes, 3 * natoms[1]), name="o_force" + suffix)
 
-        virial_qm = qm_dict["virial"]
-        virial_qmmm = tf.math.segment_sum(qmmm_dict["virial"], qmmm_frame_idx)
-        virial = virial_qm + virial_qmmm
-        virial = tf.identity(virial, name="o_virial" + suffix)
-
         backward_qm_map_nloc = tf.slice(backward_qm_map, [0, 0], [-1, natoms[0]])
         backward_qmmm_map_nloc = tf.slice(backward_qmmm_map, [0, 0], [-1, natoms[0]])
         atom_ener_qm = gather_placeholder(
@@ -255,6 +269,13 @@ def build(
             atom_virial, (nframes, 9 * natoms[1]), name="o_atom_virial" + suffix
         )
 
+        energy = tf.reduce_sum(atom_ener, axis=1, name="o_energy" + suffix)
+        virial = tf.reduce_sum(
+            tf.reshape(atom_virial, (nframes, natoms[1], 9)),
+            axis=1,
+            name="o_virial" + suffix,
+        )
+
         model_dict = {}
         model_dict["energy"] = energy
         model_dict["force"] = force
diff --git a/source/lib/include/gpu_rocm.h b/source/lib/include/gpu_rocm.h
index e628d109d7..4c3c1b41a9 100644
--- a/source/lib/include/gpu_rocm.h
+++ b/source/lib/include/gpu_rocm.h
@@ -59,7 +59,7 @@ void memcpy_host_to_device(FPTYPE *device, const FPTYPE *host, const int size) {
 }
 
 template <typename FPTYPE>
-void memcpy_device_to_host(FPTYPE *device, std::vector<FPTYPE> &host) {
+void memcpy_device_to_host(const FPTYPE *device, std::vector<FPTYPE> &host) {
   DPErrcheck(hipMemcpy(&host[0], device, sizeof(FPTYPE) * host.size(),
                        hipMemcpyDeviceToHost));
 }
diff --git a/source/op/pairwise.cc b/source/op/pairwise.cc
index ee55c3dff3..d60bc3bccc 100644
--- a/source/op/pairwise.cc
+++ b/source/op/pairwise.cc
@@ -14,6 +14,15 @@ REGISTER_OP("DprcPairwiseIdx")
     .Output("natoms_qmmm: int32")
     .Output("qmmm_frame_idx: int32");
 
+REGISTER_OP("ConvertForwardMap")
+    .Input("sub_forward_map: int32")
+    .Input("sub_natoms: int32")
+    .Input("natoms: int32")
+    .Output("forward_map: int32")
+    .Output("backward_map: int32")
+    .Output("new_natoms: int32")
+    .Output("mesh: int32");
+
 using namespace tensorflow;
 
 using CPUDevice = Eigen::ThreadPoolDevice;
@@ -208,8 +217,193 @@ class PairwiseIdxOp : public OpKernel {
   }
 };
 
+template <typename Device>
+class ConvertForwardMapOp : public OpKernel {
+ public:
+  explicit ConvertForwardMapOp(OpKernelConstruction* context)
+      : OpKernel(context) {}
+
+  void Compute(OpKernelContext* context) override {
+    deepmd::safe_compute(
+        context, [this](OpKernelContext* context) { this->_Compute(context); });
+  }
+
+  void _Compute(OpKernelContext* context) {
+    // Grab the input tensor
+    int tmp_idx = 0;
+    const Tensor& sub_forward_map_tensor = context->input(tmp_idx++);
+    const Tensor& sub_natoms_tensor = context->input(tmp_idx++);
+    const Tensor& natoms_tensor = context->input(tmp_idx++);
+
+    // set size of the sample
+    OP_REQUIRES(context, (sub_forward_map_tensor.shape().dims() == 2),
+                errors::InvalidArgument("Dim of idxs should be 2"));
+    OP_REQUIRES(context, (natoms_tensor.shape().dims() == 1),
+                errors::InvalidArgument("Dim of natoms should be 1"));
+
+    auto sub_forward_map = sub_forward_map_tensor.matrix<int>();
+    int sub_nframes = sub_forward_map_tensor.shape().dim_size(0);
+    auto sub_natoms = sub_natoms_tensor.vec<int>();
+    auto natoms = natoms_tensor.vec<int>();
+    int sub_nloc = sub_natoms(0);
+    int sub_nall = sub_natoms(1);
+    int nloc = natoms(0);
+    int nall = natoms(1);
+
+    // merge multiple sub-frames into one frame
+    // firstly, we need to get the nloc and nghost size to allocate
+    int new_nloc = 0, new_nghost = 0;
+
+    for (int ii = 0; ii < sub_nframes; ++ii) {
+      for (int jj = 0; jj < sub_nloc; ++jj) {
+        if (sub_forward_map(ii, jj) != -1) {
+          new_nloc++;
+        }
+      }
+      for (int jj = sub_nloc; jj < sub_nall; ++jj) {
+        if (sub_forward_map(ii, jj) != -1) {
+          new_nghost++;
+        }
+      }
+    }
+    if (new_nloc == 0) {
+      new_nloc = 1;
+    }
+    int new_nall = new_nloc + new_nghost;
+
+    // Create an output tensor
+    TensorShape forward_map_shape;
+    forward_map_shape.AddDim(1);
+    forward_map_shape.AddDim(new_nall);
+    TensorShape backward_map_shape;
+    // since the atom index can not be repeated, we still need
+    // to split to multiple frames
+    backward_map_shape.AddDim(sub_nframes);
+    backward_map_shape.AddDim(nall);
+    TensorShape new_natoms_shape;
+    new_natoms_shape.AddDim(natoms_tensor.shape().dim_size(0));
+
+    Tensor* forward_map_tensor = NULL;
+    Tensor* backward_map_tensor = NULL;
+    Tensor* new_natoms_tensor = NULL;
+    tmp_idx = 0;
+    OP_REQUIRES_OK(context,
+                   context->allocate_output(tmp_idx++, forward_map_shape,
+                                            &forward_map_tensor));
+    OP_REQUIRES_OK(context,
+                   context->allocate_output(tmp_idx++, backward_map_shape,
+                                            &backward_map_tensor));
+    OP_REQUIRES_OK(context,
+                   context->allocate_output(tmp_idx++, new_natoms_shape,
+                                            &new_natoms_tensor));
+
+    auto forward_map = forward_map_tensor->matrix<int>();
+    auto backward_map = backward_map_tensor->matrix<int>();
+    auto new_natoms = new_natoms_tensor->vec<int>();
+
+    // fill -1 in backward_map_tensor
+    for (int ii = 0; ii < sub_nframes; ++ii) {
+      for (int jj = 0; jj < nall; ++jj) {
+        backward_map(ii, jj) = -1;
+      }
+    }
+
+    std::vector<int> start_kk(sub_nframes),
+        end_kk(sub_nframes);  // current forward map index
+    int kk = 0;
+    // assume nlist to contain all atoms; it should not be a problem for small
+    // residues
+    std::vector<std::vector<int>> jlist(new_nloc);
+    for (int ii = 0; ii < sub_nframes; ++ii) {
+      start_kk[ii] = kk;
+      for (int jj = 0; jj < sub_nloc; ++jj) {
+        if (sub_forward_map(ii, jj) != -1) {
+          forward_map(0, kk) = sub_forward_map(ii, jj);
+          backward_map(ii, sub_forward_map(ii, jj)) = kk;
+          kk++;
+        }
+      }
+      end_kk[ii] = kk;
+      // add neighbors to each other
+      for (int mm = start_kk[ii]; mm < end_kk[ii]; ++mm) {
+        for (int nn = start_kk[ii]; nn < end_kk[ii]; ++nn) {
+          if (mm != nn) {
+            jlist[mm].push_back(nn);
+          }
+        }
+      }
+    }
+    for (int ii = 0; ii < sub_nframes; ++ii) {
+      int start_ghost_kk = kk;
+      for (int jj = sub_nloc; jj < sub_nall; ++jj) {
+        if (sub_forward_map(ii, jj) != -1) {
+          forward_map(0, kk) = sub_forward_map(ii, jj);
+          backward_map(ii, sub_forward_map(ii, jj)) = kk;
+          kk++;
+        }
+      }
+      int end_ghost_kk = kk;
+      // add ghost neighbors to real atoms
+      for (int mm = start_kk[ii]; mm < end_kk[ii]; ++mm) {
+        for (int nn = start_ghost_kk; nn < end_ghost_kk; ++nn) {
+          jlist[mm].push_back(nn);
+        }
+      }
+    }
+
+    // natoms
+    new_natoms(0) = new_nloc;
+    new_natoms(1) = new_nall;
+    new_natoms(2) = new_nloc;
+    for (int ii = 3; ii < new_natoms.size(); ++ii) {
+      new_natoms(ii) = 0;
+    }
+
+    // mesh:
+    //   first element: nloc (a number)
+    //   2~16: empty (to distinguish from other mesh)
+    //   ilist: nloc
+    //   numneigh: nloc
+    //   jlist: sum(numneigh)
+
+    // calculate numneigh
+    std::vector<int> numneigh(new_nloc);
+    for (int ii = 0; ii < new_nloc; ++ii) {
+      numneigh[ii] = jlist[ii].size();
+    }
+    int size_mesh =
+        std::accumulate(numneigh.begin(), numneigh.end(), 2 * new_nloc + 16);
+
+    TensorShape mesh_shape;
+    mesh_shape.AddDim(size_mesh);
+    Tensor* mesh_tensor = NULL;
+    OP_REQUIRES_OK(
+        context, context->allocate_output(tmp_idx++, mesh_shape, &mesh_tensor));
+    auto mesh = mesh_tensor->vec<int>();
+    mesh(0) = new_nloc;
+    for (int ii = 1; ii < 16; ++ii) {
+      mesh(ii) = 0;
+    }
+    for (int ii = 0; ii < new_nloc; ++ii) {
+      mesh(ii + 16) = ii;
+    }
+    for (int ii = 0; ii < new_nloc; ++ii) {
+      mesh(ii + 16 + new_nloc) = numneigh[ii];
+    }
+    kk = 0;
+    for (int ii = 0; ii < new_nloc; ++ii) {
+      for (int jj = 0; jj < numneigh[ii]; ++jj) {
+        mesh(16 + 2 * new_nloc + kk) = jlist[ii][jj];
+        kk++;
+      }
+    }
+  }
+};
+
 // Register the CPU kernels.
-#define REGISTER_CPU(T)                                               \
-  REGISTER_KERNEL_BUILDER(Name("DprcPairwiseIdx").Device(DEVICE_CPU), \
-                          PairwiseIdxOp<CPUDevice>);
+#define REGISTER_CPU(T)                                                 \
+  REGISTER_KERNEL_BUILDER(Name("DprcPairwiseIdx").Device(DEVICE_CPU),   \
+                          PairwiseIdxOp<CPUDevice>);                    \
+  REGISTER_KERNEL_BUILDER(Name("ConvertForwardMap").Device(DEVICE_CPU), \
+                          ConvertForwardMapOp<CPUDevice>);
 REGISTER_CPU();
diff --git a/source/op/prod_env_mat_multi_device.cc b/source/op/prod_env_mat_multi_device.cc
index a8882fb5f4..73a0d3c4c1 100644
--- a/source/op/prod_env_mat_multi_device.cc
+++ b/source/op/prod_env_mat_multi_device.cc
@@ -507,6 +507,9 @@ class ProdEnvMatAOp : public OpKernel {
       // no pbc
       assert(nloc == nall);
       nei_mode = -1;
+    } else if (mesh_tensor.shape().dim_size(0) > 16) {
+      // pass neighbor list inside the tensor
+      nei_mode = 4;
     } else if (mesh_tensor.shape().dim_size(0) == 7 ||
                mesh_tensor.shape().dim_size(0) == 1) {
       throw deepmd::deepmd_exception(
@@ -799,6 +802,9 @@ class ProdEnvMatROp : public OpKernel {
       // no pbc
       assert(nloc == nall);
       nei_mode = -1;
+    } else if (mesh_tensor.shape().dim_size(0) > 16) {
+      // pass neighbor list inside the tensor
+      nei_mode = 4;
     } else if (mesh_tensor.shape().dim_size(0) == 7 ||
                mesh_tensor.shape().dim_size(0) == 1) {
       throw deepmd::deepmd_exception(
@@ -1101,14 +1107,15 @@ class ProdEnvMatAMixOp : public OpKernel {
     } else if (mesh_tensor.shape().dim_size(0) == 6 ||
                mesh_tensor.shape().dim_size(0) == 7) {
       // manual copied pbc
-      assert(nloc == nall);
       nei_mode = 1;
       b_nlist_map = true;
     } else if (mesh_tensor.shape().dim_size(0) == 0 ||
                mesh_tensor.shape().dim_size(0) == 1) {
       // no pbc
-      assert(nloc == nall);
       nei_mode = -1;
+    } else if (mesh_tensor.shape().dim_size(0) > 16) {
+      // pass neighbor list inside the tensor
+      nei_mode = 4;
     } else {
       throw deepmd::deepmd_exception("invalid mesh tensor");
     }
@@ -1429,6 +1436,24 @@ static void _map_nei_info_cpu(int* nlist,
                            ntypes, b_nlist_map);
 }
 
+/**
+ * @param[in] nei_mode -1, 1, 3, or 4.
+ *   - -1: Build neighbor list without PBC. The size of mesh should
+ *     be 0 (no mixed) or 1 (mixed).
+ *   - 1: Build neighbor list with PBC. The size of mesh should
+ *     be 6 (no mixed) or 7 (mixed).
+ *   - 3: Use neighbor list from given pointers. The size of mesh should be 16.
+ *     The first element is ago (whether update the internal neighbour list).
+ *     The second element is the number of local atoms. The 5th-8th, 9th-12th,
+ *     and 13th-16th elements are the pointer (int*, 4x size of int) to
+ *     ilist, numneigh, firstneigh. The pointer should be valid during the
+ *     execution of this op, so it may be created and given by an external
+ *     program calling the TensorFlow session.
+ *   - 4: Use neighbor list stored in the tensor. The size of mesh should be
+ *     16 + 2 * nloc + sum(numneigh). Starting from the 17th element, the
+ *     elements are ilist (size of nloc), numneigh (size of nloc), and neighbors
+ *     (size of numneigh[i] for each i).
+ */
 template <typename FPTYPE>
 static void _prepare_coord_nlist_cpu(OpKernelContext* context,
                                      FPTYPE const** coord,
@@ -1453,7 +1478,7 @@ static void _prepare_coord_nlist_cpu(OpKernelContext* context,
                                      const int& max_cpy_trial,
                                      const int& max_nnei_trial) {
   inlist.inum = nloc;
-  if (nei_mode != 3) {
+  if (nei_mode != 3 && nei_mode != 4) {
     // build nlist by myself
     // normalize and copy coord
     if (nei_mode == 1) {
@@ -1474,6 +1499,19 @@ static void _prepare_coord_nlist_cpu(OpKernelContext* context,
     inlist.ilist = &ilist[0];
     inlist.numneigh = &numneigh[0];
     inlist.firstneigh = &firstneigh[0];
+  } else if (nei_mode == 4) {
+    std::memcpy(&ilist[0], 16 + mesh_tensor_data, sizeof(int) * nloc);
+    std::memcpy(&numneigh[0], 16 + nloc + mesh_tensor_data, sizeof(int) * nloc);
+    for (int ii = 0, kk = 0; ii < nloc; ++ii) {
+      jlist[ii].resize(numneigh[ii]);
+      std::memcpy(&jlist[ii][0], 16 + 2 * nloc + kk + mesh_tensor_data,
+                  sizeof(int) * numneigh[ii]);
+      firstneigh[ii] = &jlist[ii][0];
+      kk += numneigh[ii];
+    }
+    inlist.ilist = &ilist[0];
+    inlist.numneigh = &numneigh[0];
+    inlist.firstneigh = &firstneigh[0];
   } else {
     // copy pointers to nlist data
     memcpy(&inlist.ilist, 4 + mesh_tensor_data, sizeof(int*));
@@ -1675,7 +1713,7 @@ static void _prepare_coord_nlist_gpu(OpKernelContext* context,
                                      const float& rcut_r,
                                      const int& max_cpy_trial,
                                      const int& max_nnei_trial) {
-  if (nei_mode != 3) {
+  if (nei_mode != 3 && nei_mode != 4) {
     inlist.inum = nloc;
     // build nlist by myself
     // normalize and copy coord
@@ -1705,6 +1743,46 @@ static void _prepare_coord_nlist_gpu(OpKernelContext* context,
     inlist.ilist = ilist;
     inlist.numneigh = numneigh;
     inlist.firstneigh = firstneigh;
+  } else if (nei_mode == 4) {
+    // TODO: in theory, it will be faster to put everything on GPUs...
+    std::vector<int> mesh_tensor_data_host(mesh_tensor_size);
+    std::vector<int> ilist_host(nloc);
+    std::vector<int> numneigh_host(nloc);
+    std::vector<int*> firstneigh_host(nloc);
+    std::vector<int> fake_mesh(16);
+
+    // copy from gpu to cpu
+    deepmd::memcpy_device_to_host(mesh_tensor_data, mesh_tensor_data_host);
+    std::memcpy(&ilist_host[0], &mesh_tensor_data_host[16], sizeof(int) * nloc);
+    std::memcpy(&numneigh_host[0], &mesh_tensor_data_host[16 + nloc],
+                sizeof(int) * nloc);
+    for (int ii = 0, kk = 0; ii < nloc; ++ii) {
+      firstneigh_host[ii] = &mesh_tensor_data_host[16 + 2 * nloc + kk];
+      kk += numneigh_host[ii];
+    }
+    // make a fake mesh
+    fake_mesh[0] = 0;
+    fake_mesh[1] = nloc;
+    std::memcpy(&fake_mesh[4], &ilist_host, sizeof(int*));
+    std::memcpy(&fake_mesh[8], &numneigh_host, sizeof(int*));
+    std::memcpy(&fake_mesh[12], &firstneigh_host, sizeof(int**));
+    // copy from cpu to gpu
+    int* fake_mesh_dev = NULL;
+    deepmd::malloc_device_memory(fake_mesh_dev, 16);
+    deepmd::memcpy_host_to_device(fake_mesh_dev, fake_mesh);
+
+    deepmd::InputNlist inlist_temp;
+    inlist_temp.inum = nloc;
+    // everything should be copied to GPU...
+    deepmd::env_mat_nbor_update(inlist_temp, inlist, max_nbor_size,
+                                nbor_list_dev, fake_mesh_dev, 16);
+    OP_REQUIRES(context, (max_numneigh(inlist_temp) <= max_nbor_size),
+                errors::InvalidArgument(
+                    "Assert failed, max neighbor size of atom(lammps) " +
+                    std::to_string(max_numneigh(inlist_temp)) +
+                    " is larger than " + std::to_string(max_nbor_size) +
+                    ", which currently is not supported by deepmd-kit."));
+    deepmd::delete_device_memory(fake_mesh_dev);
   } else {
     // update nbor list
     deepmd::InputNlist inlist_temp;
@@ -1908,7 +1986,7 @@ static void _prepare_coord_nlist_gpu_rocm(OpKernelContext* context,
                                           const float& rcut_r,
                                           const int& max_cpy_trial,
                                           const int& max_nnei_trial) {
-  if (nei_mode != 3) {
+  if (nei_mode != 3 && nei_mode != 4) {
     inlist.inum = nloc;
     // build nlist by myself
     // normalize and copy coord
@@ -1938,6 +2016,46 @@ static void _prepare_coord_nlist_gpu_rocm(OpKernelContext* context,
     inlist.ilist = ilist;
     inlist.numneigh = numneigh;
     inlist.firstneigh = firstneigh;
+  } else if (nei_mode == 4) {
+    // TODO: in theory, it will be faster to put everything on GPUs...
+    std::vector<int> mesh_tensor_data_host(mesh_tensor_size);
+    std::vector<int> ilist_host(nloc);
+    std::vector<int> numneigh_host(nloc);
+    std::vector<int*> firstneigh_host(nloc);
+    std::vector<int> fake_mesh(16);
+
+    // copy from gpu to cpu
+    deepmd::memcpy_device_to_host(mesh_tensor_data, mesh_tensor_data_host);
+    std::memcpy(&ilist_host[0], &mesh_tensor_data_host[16], sizeof(int) * nloc);
+    std::memcpy(&numneigh_host[0], &mesh_tensor_data_host[16 + nloc],
+                sizeof(int) * nloc);
+    for (int ii = 0, kk = 0; ii < nloc; ++ii) {
+      firstneigh_host[ii] = &mesh_tensor_data_host[16 + 2 * nloc + kk];
+      kk += numneigh_host[ii];
+    }
+    // make a fake mesh
+    fake_mesh[0] = 0;
+    fake_mesh[1] = nloc;
+    std::memcpy(&fake_mesh[4], &ilist_host, sizeof(int*));
+    std::memcpy(&fake_mesh[8], &numneigh_host, sizeof(int*));
+    std::memcpy(&fake_mesh[12], &firstneigh_host, sizeof(int**));
+    // copy from cpu to gpu
+    int* fake_mesh_dev = NULL;
+    deepmd::malloc_device_memory(fake_mesh_dev, 16);
+    deepmd::memcpy_host_to_device(fake_mesh_dev, fake_mesh);
+
+    deepmd::InputNlist inlist_temp;
+    inlist_temp.inum = nloc;
+    // everything should be copied to GPU...
+    deepmd::env_mat_nbor_update(inlist_temp, inlist, max_nbor_size,
+                                nbor_list_dev, fake_mesh_dev, 16);
+    OP_REQUIRES(context, (max_numneigh(inlist_temp) <= max_nbor_size),
+                errors::InvalidArgument(
+                    "Assert failed, max neighbor size of atom(lammps) " +
+                    std::to_string(max_numneigh(inlist_temp)) +
+                    " is larger than " + std::to_string(max_nbor_size) +
+                    ", which currently is not supported by deepmd-kit."));
+    deepmd::delete_device_memory(fake_mesh_dev);
   } else {
     // update nbor list
     deepmd::InputNlist inlist_temp;
diff --git a/source/tests/test_pairwise_dprc.py b/source/tests/test_pairwise_dprc.py
index 0f3f9fad50..e95b66c7a0 100644
--- a/source/tests/test_pairwise_dprc.py
+++ b/source/tests/test_pairwise_dprc.py
@@ -97,6 +97,169 @@ def test_op_single_frame(self):
         np.testing.assert_array_equal(qmmm_frame_idx, np.array([0, 0, 0], dtype=int))
 
 
+class TestConvertForwardMapOP(tf.test.TestCase):
+    """Check the four outputs of the convert_forward_map OP against fixed arrays."""
+
+    def test_convert_forward_map(self):  # feeds three fixed inputs to the OP
+        forward_qmmm_map = np.array(  # 3 rows of 7 indices; some entries are -1
+            [
+                [3, 4, 0, 1, 2, 10, 11],
+                [3, 4, 5, 6, 7, 10, -1],
+                [3, 4, 8, 9, -1, 10, -1],
+            ],
+            dtype=int,
+        )
+        natoms_qmmm = np.array([5, 7, 5], dtype=int)
+        natoms = np.array([10, 12, 10], dtype=int)
+        with self.cached_session() as sess:
+            (  # NOTE: forward_qmmm_map and natoms_qmmm are rebound to OP outputs
+                forward_qmmm_map,
+                backward_qmmm_map,
+                natoms_qmmm,
+                mesh_qmmm,
+            ) = run_sess(
+                sess,
+                op_module.convert_forward_map(forward_qmmm_map, natoms_qmmm, natoms),
+            )
+        np.testing.assert_array_equal(  # output 1: a single row of 18 indices, no -1
+            forward_qmmm_map,
+            np.array([[3, 4, 0, 1, 2, 3, 4, 5, 6, 7, 3, 4, 8, 9, 10, 11, 10, 10]]),
+        )
+        np.testing.assert_array_equal(  # output 2: 3 rows of 12 entries, -1 in gaps
+            backward_qmmm_map,
+            np.array(
+                [
+                    [2, 3, 4, 0, 1, -1, -1, -1, -1, -1, 14, 15],
+                    [-1, -1, -1, 5, 6, 7, 8, 9, -1, -1, 16, -1],
+                    [-1, -1, -1, 10, 11, -1, -1, -1, 12, 13, 17, -1],
+                ]
+            ),
+        )
+        np.testing.assert_array_equal(natoms_qmmm, np.array([14, 18, 14], dtype=int))
+        np.testing.assert_array_equal(  # output 4: flat 1-D integer mesh array
+            mesh_qmmm,
+            np.array(
+                [
+                    14,  # leading entry, then 15 zeros (16 values in total)
+                    0,
+                    0,
+                    0,
+                    0,
+                    0,
+                    0,
+                    0,
+                    0,
+                    0,
+                    0,
+                    0,
+                    0,
+                    0,
+                    0,
+                    0,
+                    0,  # next 14 values run 0..13 (presumably atom indices; confirm)
+                    1,
+                    2,
+                    3,
+                    4,
+                    5,
+                    6,
+                    7,
+                    8,
+                    9,
+                    10,
+                    11,
+                    12,
+                    13,
+                    6,  # next 14 values: five 6s, five 5s, four 4s
+                    6,
+                    6,
+                    6,
+                    6,
+                    5,
+                    5,
+                    5,
+                    5,
+                    5,
+                    4,
+                    4,
+                    4,
+                    4,
+                    1,  # rest: looks like per-atom neighbor index lists (TODO confirm)
+                    2,
+                    3,
+                    4,
+                    14,
+                    15,
+                    0,
+                    2,
+                    3,
+                    4,
+                    14,
+                    15,
+                    0,
+                    1,
+                    3,
+                    4,
+                    14,
+                    15,
+                    0,
+                    1,
+                    2,
+                    4,
+                    14,
+                    15,
+                    0,
+                    1,
+                    2,
+                    3,
+                    14,
+                    15,
+                    6,
+                    7,
+                    8,
+                    9,
+                    16,
+                    5,
+                    7,
+                    8,
+                    9,
+                    16,
+                    5,
+                    6,
+                    8,
+                    9,
+                    16,
+                    5,
+                    6,
+                    7,
+                    9,
+                    16,
+                    5,
+                    6,
+                    7,
+                    8,
+                    16,
+                    11,
+                    12,
+                    13,
+                    17,
+                    10,
+                    12,
+                    13,
+                    17,
+                    10,
+                    11,
+                    13,
+                    17,
+                    10,
+                    11,
+                    12,
+                    17,
+                ]
+            ),
+        )
+
+
 @unittest.skipIf(
     parse_version(tf.__version__) < parse_version("1.15"),
     f"The current tf version {tf.__version__} is too low to run the new testing model.",
@@ -291,6 +454,7 @@ def test_model_ener(self):
         input_dict["aparam"] = t_aparam
 
         model.data_stat(data)
+        # model.merge_frames = False
         model_pred = model.build(
             t_coord,
             t_type,
@@ -298,7 +462,7 @@ def test_model_ener(self):
             t_box,
             t_mesh,
             input_dict,
-            suffix="se_a_atom_ener_0",
+            suffix="pairwise_dprc_0",
             reuse=False,
         )
         energy = model_pred["energy"]
@@ -354,6 +518,8 @@ def test_model_ener(self):
         # the model is pairwise!
         self.assertAllClose(e[1] + e[2] + e[3] - 3 * e[0], e[4] - e[0])
         self.assertAllClose(f[1] + f[2] + f[3] - 3 * f[0], f[4] - f[0])
+        self.assertAllClose(e[0], 0.189075, 1e-6)
+        self.assertAllClose(f[0, 0], 0.060047, 1e-6)
 
     def test_nloc(self):
         jfile = tests_path / "pairwise_dprc.json"

From de5cc2966e435ef622a4343ca5c97ab273933711 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Tue, 19 Sep 2023 13:18:40 +0800
Subject: [PATCH 39/63] Bump docker/build-push-action from 4.2.1 to 5.0.0
 (#2835)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Bumps
[docker/build-push-action](https://github.com/docker/build-push-action)
from 4.2.1 to 5.0.0.
<details>
<summary>Release notes</summary>
<p><em>Sourced from <a
href="https://github.com/docker/build-push-action/releases">docker/build-push-action's
releases</a>.</em></p>
<blockquote>
<h2>v5.0.0</h2>
<ul>
<li>Node 20 as default runtime (requires <a
href="https://github.com/actions/runner/releases/tag/v2.308.0">Actions
Runner v2.308.0</a> or later) by <a
href="https://github.com/crazy-max"><code>@​crazy-max</code></a> in <a
href="https://redirect.github.com/docker/build-push-action/pull/954">docker/build-push-action#954</a></li>
<li>Bump <code>@​actions/core</code> from 1.10.0 to 1.10.1 in <a
href="https://redirect.github.com/docker/build-push-action/pull/959">docker/build-push-action#959</a></li>
</ul>
<p><strong>Full Changelog</strong>: <a
href="https://github.com/docker/build-push-action/compare/v4.2.1...v5.0.0">https://github.com/docker/build-push-action/compare/v4.2.1...v5.0.0</a></p>
</blockquote>
</details>
<details>
<summary>Commits</summary>
<ul>
<li><a
href="https://github.com/docker/build-push-action/commit/0565240e2d4ab88bba5387d719585280857ece09"><code>0565240</code></a>
Merge pull request <a
href="https://redirect.github.com/docker/build-push-action/issues/959">#959</a>
from docker/dependabot/npm_and_yarn/actions/core-1.10.1</li>
<li><a
href="https://github.com/docker/build-push-action/commit/3ab07f880128dd3b47d7764b661d608b1e37712a"><code>3ab07f8</code></a>
chore: update generated content</li>
<li><a
href="https://github.com/docker/build-push-action/commit/b9e7e4daec1dd1fed28b226354d2eef8aa92ca38"><code>b9e7e4d</code></a>
chore(deps): Bump <code>@​actions/core</code> from 1.10.0 to 1.10.1</li>
<li><a
href="https://github.com/docker/build-push-action/commit/04d1a3b0491bb1fbd0843d1fea3390e385bf2252"><code>04d1a3b</code></a>
Merge pull request <a
href="https://redirect.github.com/docker/build-push-action/issues/954">#954</a>
from crazy-max/update-node20</li>
<li><a
href="https://github.com/docker/build-push-action/commit/1a4d1a13fb219ebf616f93930a8c4c6a9ff24155"><code>1a4d1a1</code></a>
chore: node 20 as default runtime</li>
<li><a
href="https://github.com/docker/build-push-action/commit/675965c0e16f1a0f94ecafff969d8c966f92c17b"><code>675965c</code></a>
chore: update generated content</li>
<li><a
href="https://github.com/docker/build-push-action/commit/58ee34cb6bad9fc3b471453afb4ed741cb0e6ff3"><code>58ee34c</code></a>
chore: fix author in package.json</li>
<li><a
href="https://github.com/docker/build-push-action/commit/c97c4060bdc51e97b1b2a972eab2f77d6ae8e57a"><code>c97c406</code></a>
fix ProxyConfig type when checking length</li>
<li><a
href="https://github.com/docker/build-push-action/commit/47d5369e0b15ff3b951d5787a265fbecf0fc2bac"><code>47d5369</code></a>
vendor: bump <code>@​docker/actions-toolkit</code> from 0.8.0 to
0.12.0</li>
<li><a
href="https://github.com/docker/build-push-action/commit/8895c7468fbe88881dcc4c5b416553e604722cf2"><code>8895c74</code></a>
chore: update dev dependencies</li>
<li>Additional commits viewable in <a
href="https://github.com/docker/build-push-action/compare/0a97817b6ade9f46837855d676c4cca3a2471fc9...0565240e2d4ab88bba5387d719585280857ece09">compare
view</a></li>
</ul>
</details>
<br />


[![Dependabot compatibility
score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=docker/build-push-action&package-manager=github_actions&previous-version=4.2.1&new-version=5.0.0)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores)

Dependabot will resolve any conflicts with this PR as long as you don't
alter it yourself. You can also trigger a rebase manually by commenting
`@dependabot rebase`.

[//]: # (dependabot-automerge-start)
[//]: # (dependabot-automerge-end)

---

<details>
<summary>Dependabot commands and options</summary>
<br />

You can trigger Dependabot actions by commenting on this PR:
- `@dependabot rebase` will rebase this PR
- `@dependabot recreate` will recreate this PR, overwriting any edits
that have been made to it
- `@dependabot merge` will merge this PR after your CI passes on it
- `@dependabot squash and merge` will squash and merge this PR after
your CI passes on it
- `@dependabot cancel merge` will cancel a previously requested merge
and block automerging
- `@dependabot reopen` will reopen this PR if it is closed
- `@dependabot close` will close this PR and stop Dependabot recreating
it. You can achieve the same result by closing it manually
- `@dependabot show <dependency name> ignore conditions` will show all
of the ignore conditions of the specified dependency
- `@dependabot ignore this major version` will close this PR and stop
Dependabot creating any more for this major version (unless you reopen
the PR or upgrade to it yourself)
- `@dependabot ignore this minor version` will close this PR and stop
Dependabot creating any more for this minor version (unless you reopen
the PR or upgrade to it yourself)
- `@dependabot ignore this dependency` will close this PR and stop
Dependabot creating any more for this dependency (unless you reopen the
PR or upgrade to it yourself)


</details>

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
---
 .github/workflows/build_wheel.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/build_wheel.yml b/.github/workflows/build_wheel.yml
index 85b2d6b884..4b7ac4cb03 100644
--- a/.github/workflows/build_wheel.yml
+++ b/.github/workflows/build_wheel.yml
@@ -114,7 +114,7 @@ jobs:
           images: ghcr.io/deepmodeling/deepmd-kit
 
       - name: Build and push Docker image
-        uses: docker/build-push-action@0a97817b6ade9f46837855d676c4cca3a2471fc9
+        uses: docker/build-push-action@0565240e2d4ab88bba5387d719585280857ece09
         with:
           context: source/install/docker
           push: ${{ github.repository_owner == 'deepmodeling' && github.event_name == 'push' }}

From 4a82b5fd156b3349b4ab5afbba8a6167876e010e Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Tue, 19 Sep 2023 13:19:02 +0800
Subject: [PATCH 40/63] Bump docker/metadata-action from 4.6.0 to 5.0.0 (#2836)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Bumps
[docker/metadata-action](https://github.com/docker/metadata-action) from
4.6.0 to 5.0.0.
<details>
<summary>Release notes</summary>
<p><em>Sourced from <a
href="https://github.com/docker/metadata-action/releases">docker/metadata-action's
releases</a>.</em></p>
<blockquote>
<h2>v5.0.0</h2>
<ul>
<li>Node 20 as default runtime (requires <a
href="https://github.com/actions/runner/releases/tag/v2.308.0">Actions
Runner v2.308.0</a> or later) by <a
href="https://github.com/crazy-max"><code>@​crazy-max</code></a> in <a
href="https://redirect.github.com/docker/metadata-action/pull/328">docker/metadata-action#328</a></li>
<li>Bump <code>@​actions/core</code> from 1.10.0 to 1.10.1 in <a
href="https://redirect.github.com/docker/metadata-action/pull/333">docker/metadata-action#333</a></li>
<li>Bump csv-parse from 5.4.0 to 5.5.0 in <a
href="https://redirect.github.com/docker/metadata-action/pull/320">docker/metadata-action#320</a></li>
<li>Bump semver from 7.5.1 to 7.5.2 in <a
href="https://redirect.github.com/docker/metadata-action/pull/304">docker/metadata-action#304</a></li>
<li>Bump handlebars from 4.7.7 to 4.7.8 in <a
href="https://redirect.github.com/docker/metadata-action/pull/315">docker/metadata-action#315</a></li>
</ul>
<p><strong>Full Changelog</strong>: <a
href="https://github.com/docker/metadata-action/compare/v4.6.0...v5.0.0">https://github.com/docker/metadata-action/compare/v4.6.0...v5.0.0</a></p>
</blockquote>
</details>
<details>
<summary>Upgrade guide</summary>
<p><em>Sourced from <a
href="https://github.com/docker/metadata-action/blob/master/UPGRADE.md">docker/metadata-action's
upgrade guide</a>.</em></p>
<blockquote>
<h1>Upgrade notes</h1>
<h2>v2 to v3</h2>
<ul>
<li>Repository has been moved to docker org. Replace
<code>crazy-max/ghaction-docker-meta@v2</code>
with <code>docker/metadata-action@v5</code></li>
<li>The default bake target has been changed:
<code>ghaction-docker-meta</code> &gt;
<code>docker-metadata-action</code></li>
</ul>
<h2>v1 to v2</h2>
<ul>
<li><a
href="https://github.com/docker/metadata-action/blob/master/#inputs">inputs</a>
<ul>
<li><a
href="https://github.com/docker/metadata-action/blob/master/#tag-sha"><code>tag-sha</code></a></li>
<li><a
href="https://github.com/docker/metadata-action/blob/master/#tag-edge--tag-edge-branch"><code>tag-edge</code>
/ <code>tag-edge-branch</code></a></li>
<li><a
href="https://github.com/docker/metadata-action/blob/master/#tag-semver"><code>tag-semver</code></a></li>
<li><a
href="https://github.com/docker/metadata-action/blob/master/#tag-match--tag-match-group"><code>tag-match</code>
/ <code>tag-match-group</code></a></li>
<li><a
href="https://github.com/docker/metadata-action/blob/master/#tag-latest"><code>tag-latest</code></a></li>
<li><a
href="https://github.com/docker/metadata-action/blob/master/#tag-schedule"><code>tag-schedule</code></a></li>
<li><a
href="https://github.com/docker/metadata-action/blob/master/#tag-custom--tag-custom-only"><code>tag-custom</code>
/ <code>tag-custom-only</code></a></li>
<li><a
href="https://github.com/docker/metadata-action/blob/master/#label-custom"><code>label-custom</code></a></li>
</ul>
</li>
<li><a
href="https://github.com/docker/metadata-action/blob/master/#basic-workflow">Basic
workflow</a></li>
<li><a
href="https://github.com/docker/metadata-action/blob/master/#semver-workflow">Semver
workflow</a></li>
</ul>
<h3>inputs</h3>
<table>
<thead>
<tr>
<th>New</th>
<th>Unchanged</th>
<th>Removed</th>
</tr>
</thead>
<tbody>
<tr>
<td><code>tags</code></td>
<td><code>images</code></td>
<td><code>tag-sha</code></td>
</tr>
<tr>
<td><code>flavor</code></td>
<td><code>sep-tags</code></td>
<td><code>tag-edge</code></td>
</tr>
<tr>
<td><code>labels</code></td>
<td><code>sep-labels</code></td>
<td><code>tag-edge-branch</code></td>
</tr>
<tr>
<td></td>
<td></td>
<td><code>tag-semver</code></td>
</tr>
<tr>
<td></td>
<td></td>
<td><code>tag-match</code></td>
</tr>
<tr>
<td></td>
<td></td>
<td><code>tag-match-group</code></td>
</tr>
<tr>
<td></td>
<td></td>
<td><code>tag-latest</code></td>
</tr>
<tr>
<td></td>
<td></td>
<td><code>tag-schedule</code></td>
</tr>
<tr>
<td></td>
<td></td>
<td><code>tag-custom</code></td>
</tr>
<tr>
<td></td>
<td></td>
<td><code>tag-custom-only</code></td>
</tr>
<tr>
<td></td>
<td></td>
<td><code>label-custom</code></td>
</tr>
</tbody>
</table>
<h4><code>tag-sha</code></h4>
<pre lang="yaml"><code>tags: |
  type=sha
</code></pre>
<h4><code>tag-edge</code> / <code>tag-edge-branch</code></h4>
<pre lang="yaml"><code>tags: |
  # default branch
</code></pre>
</blockquote>
<p>... (truncated)</p>
</details>
<details>
<summary>Commits</summary>
<ul>
<li><a
href="https://github.com/docker/metadata-action/commit/96383f45573cb7f253c731d3b3ab81c87ef81934"><code>96383f4</code></a>
Merge pull request <a
href="https://redirect.github.com/docker/metadata-action/issues/320">#320</a>
from docker/dependabot/npm_and_yarn/csv-parse-5.5.0</li>
<li><a
href="https://github.com/docker/metadata-action/commit/f138b9677be8facef947d88435c8d031d8a79369"><code>f138b96</code></a>
chore: update generated content</li>
<li><a
href="https://github.com/docker/metadata-action/commit/9cf7015b158c3e131ff76dce3394dab1faad5e80"><code>9cf7015</code></a>
Bump csv-parse from 5.4.0 to 5.5.0</li>
<li><a
href="https://github.com/docker/metadata-action/commit/5a8a5ff8df6f0d538e271b850038e246ea183fad"><code>5a8a5ff</code></a>
Merge pull request <a
href="https://redirect.github.com/docker/metadata-action/issues/315">#315</a>
from docker/dependabot/npm_and_yarn/handlebars-4.7.8</li>
<li><a
href="https://github.com/docker/metadata-action/commit/2279d9af58c689771623e3186f32ba64ddbd7f64"><code>2279d9a</code></a>
chore: update generated content</li>
<li><a
href="https://github.com/docker/metadata-action/commit/c65993321368dc48142a65ec99de67a6937ec99d"><code>c659933</code></a>
Bump handlebars from 4.7.7 to 4.7.8</li>
<li><a
href="https://github.com/docker/metadata-action/commit/48d23ccc0584dd6bfdd83362765262c24f6a294b"><code>48d23cc</code></a>
Merge pull request <a
href="https://redirect.github.com/docker/metadata-action/issues/333">#333</a>
from docker/dependabot/npm_and_yarn/actions/core-1.10.1</li>
<li><a
href="https://github.com/docker/metadata-action/commit/b83ffb48d66bbb5ed28a5d5a2db0c67edd743fe1"><code>b83ffb4</code></a>
chore: update generated content</li>
<li><a
href="https://github.com/docker/metadata-action/commit/3207f2405ffc191c8e86fc9366a9fab616ded30e"><code>3207f24</code></a>
Bump <code>@​actions/core</code> from 1.10.0 to 1.10.1</li>
<li><a
href="https://github.com/docker/metadata-action/commit/63f4a263e5ff110c8dad55b848effaa50b9967b8"><code>63f4a26</code></a>
Merge pull request <a
href="https://redirect.github.com/docker/metadata-action/issues/328">#328</a>
from crazy-max/update-node20</li>
<li>Additional commits viewable in <a
href="https://github.com/docker/metadata-action/compare/818d4b7b91585d195f67373fd9cb0332e31a7175...96383f45573cb7f253c731d3b3ab81c87ef81934">compare
view</a></li>
</ul>
</details>
<br />


[![Dependabot compatibility
score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=docker/metadata-action&package-manager=github_actions&previous-version=4.6.0&new-version=5.0.0)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores)

Dependabot will resolve any conflicts with this PR as long as you don't
alter it yourself. You can also trigger a rebase manually by commenting
`@dependabot rebase`.

[//]: # (dependabot-automerge-start)
[//]: # (dependabot-automerge-end)

---

<details>
<summary>Dependabot commands and options</summary>
<br />

You can trigger Dependabot actions by commenting on this PR:
- `@dependabot rebase` will rebase this PR
- `@dependabot recreate` will recreate this PR, overwriting any edits
that have been made to it
- `@dependabot merge` will merge this PR after your CI passes on it
- `@dependabot squash and merge` will squash and merge this PR after
your CI passes on it
- `@dependabot cancel merge` will cancel a previously requested merge
and block automerging
- `@dependabot reopen` will reopen this PR if it is closed
- `@dependabot close` will close this PR and stop Dependabot recreating
it. You can achieve the same result by closing it manually
- `@dependabot show <dependency name> ignore conditions` will show all
of the ignore conditions of the specified dependency
- `@dependabot ignore this major version` will close this PR and stop
Dependabot creating any more for this major version (unless you reopen
the PR or upgrade to it yourself)
- `@dependabot ignore this minor version` will close this PR and stop
Dependabot creating any more for this minor version (unless you reopen
the PR or upgrade to it yourself)
- `@dependabot ignore this dependency` will close this PR and stop
Dependabot creating any more for this dependency (unless you reopen the
PR or upgrade to it yourself)


</details>

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
---
 .github/workflows/build_wheel.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/build_wheel.yml b/.github/workflows/build_wheel.yml
index 4b7ac4cb03..beb46cac70 100644
--- a/.github/workflows/build_wheel.yml
+++ b/.github/workflows/build_wheel.yml
@@ -109,7 +109,7 @@ jobs:
 
       - name: Extract metadata (tags, labels) for Docker
         id: meta
-        uses: docker/metadata-action@818d4b7b91585d195f67373fd9cb0332e31a7175
+        uses: docker/metadata-action@96383f45573cb7f253c731d3b3ab81c87ef81934
         with:
           images: ghcr.io/deepmodeling/deepmd-kit
 

From 158eaddbaa3932b8f475120ad92965b0d07017c4 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Tue, 19 Sep 2023 13:19:29 +0800
Subject: [PATCH 41/63] Bump docker/setup-qemu-action from 2 to 3 (#2837)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Bumps
[docker/setup-qemu-action](https://github.com/docker/setup-qemu-action)
from 2 to 3.
<details>
<summary>Release notes</summary>
<p><em>Sourced from <a
href="https://github.com/docker/setup-qemu-action/releases">docker/setup-qemu-action's
releases</a>.</em></p>
<blockquote>
<h2>v3.0.0</h2>
<ul>
<li>Node 20 as default runtime (requires <a
href="https://github.com/actions/runner/releases/tag/v2.308.0">Actions
Runner v2.308.0</a> or later) by <a
href="https://github.com/crazy-max"><code>@​crazy-max</code></a> in <a
href="https://redirect.github.com/docker/setup-qemu-action/pull/102">docker/setup-qemu-action#102</a></li>
<li>Bump <code>@​actions/core</code> from 1.10.0 to 1.10.1 in <a
href="https://redirect.github.com/docker/setup-qemu-action/pull/103">docker/setup-qemu-action#103</a></li>
<li>Bump semver from 6.3.0 to 6.3.1 in <a
href="https://redirect.github.com/docker/setup-qemu-action/pull/89">docker/setup-qemu-action#89</a></li>
</ul>
<p><strong>Full Changelog</strong>: <a
href="https://github.com/docker/setup-qemu-action/compare/v2.2.0...v3.0.0">https://github.com/docker/setup-qemu-action/compare/v2.2.0...v3.0.0</a></p>
<h2>v2.2.0</h2>
<ul>
<li>Trim off spaces in <code>platforms</code> input by <a
href="https://github.com/Chocobo1"><code>@​Chocobo1</code></a> in <a
href="https://redirect.github.com/docker/setup-qemu-action/pull/64">docker/setup-qemu-action#64</a></li>
<li>Switch to actions-toolkit implementation by <a
href="https://github.com/crazy-max"><code>@​crazy-max</code></a> in <a
href="https://redirect.github.com/docker/setup-qemu-action/pull/70">docker/setup-qemu-action#70</a>
<a
href="https://redirect.github.com/docker/setup-qemu-action/pull/80">docker/setup-qemu-action#80</a>
<a
href="https://redirect.github.com/docker/setup-qemu-action/pull/83">docker/setup-qemu-action#83</a></li>
</ul>
<p><strong>Full Changelog</strong>: <a
href="https://github.com/docker/setup-qemu-action/compare/v2.1.0...v2.2.0">https://github.com/docker/setup-qemu-action/compare/v2.1.0...v2.2.0</a></p>
<h2>v2.1.0</h2>
<ul>
<li>Use context for inputs by <a
href="https://github.com/crazy-max"><code>@​crazy-max</code></a> (<a
href="https://redirect.github.com/docker/setup-qemu-action/issues/62">#62</a>)</li>
<li>Use built-in <code>getExecOutput</code> by <a
href="https://github.com/crazy-max"><code>@​crazy-max</code></a> (<a
href="https://redirect.github.com/docker/setup-qemu-action/issues/61">#61</a>)</li>
<li>Remove workaround for <code>setOutput</code> by <a
href="https://github.com/crazy-max"><code>@​crazy-max</code></a> (<a
href="https://redirect.github.com/docker/setup-qemu-action/issues/63">#63</a>)</li>
<li>Bump <code>@​actions/core</code> from 1.6.0 to 1.10.0 (<a
href="https://redirect.github.com/docker/setup-qemu-action/issues/54">#54</a>
<a
href="https://redirect.github.com/docker/setup-qemu-action/issues/58">#58</a>
<a
href="https://redirect.github.com/docker/setup-qemu-action/issues/59">#59</a>)</li>
</ul>
<p><strong>Full Changelog</strong>: <a
href="https://github.com/docker/setup-qemu-action/compare/v2.0.0...v2.1.0">https://github.com/docker/setup-qemu-action/compare/v2.0.0...v2.1.0</a></p>
</blockquote>
</details>
<details>
<summary>Commits</summary>
<ul>
<li><a
href="https://github.com/docker/setup-qemu-action/commit/68827325e0b33c7199eb31dd4e31fbe9023e06e3"><code>6882732</code></a>
Merge pull request <a
href="https://redirect.github.com/docker/setup-qemu-action/issues/103">#103</a>
from docker/dependabot/npm_and_yarn/actions/core-1.10.1</li>
<li><a
href="https://github.com/docker/setup-qemu-action/commit/183f4af5043c24c5dcd58e6e9868b7df0a201a17"><code>183f4af</code></a>
chore: update generated content</li>
<li><a
href="https://github.com/docker/setup-qemu-action/commit/f17493529ee8f8b9fd58ed31aa1e9816c8345eb8"><code>f174935</code></a>
build(deps): bump <code>@​actions/core</code> from 1.10.0 to 1.10.1</li>
<li><a
href="https://github.com/docker/setup-qemu-action/commit/2e423eb50075d10a6af463e39e267d4057cc07bb"><code>2e423eb</code></a>
Merge pull request <a
href="https://redirect.github.com/docker/setup-qemu-action/issues/89">#89</a>
from docker/dependabot/npm_and_yarn/semver-6.3.1</li>
<li><a
href="https://github.com/docker/setup-qemu-action/commit/ecc406afa7d79acce0ed8a04bb5eb086136fd000"><code>ecc406a</code></a>
Bump semver from 6.3.0 to 6.3.1</li>
<li><a
href="https://github.com/docker/setup-qemu-action/commit/12dec5e201ab2f6dbea47c28f4a138590189069a"><code>12dec5e</code></a>
Merge pull request <a
href="https://redirect.github.com/docker/setup-qemu-action/issues/102">#102</a>
from crazy-max/update-node20</li>
<li><a
href="https://github.com/docker/setup-qemu-action/commit/c29b31213096fed2f8b8b36e66385d52653bf332"><code>c29b312</code></a>
chore: node 20 as default runtime</li>
<li><a
href="https://github.com/docker/setup-qemu-action/commit/34ae628c8f4fb0d5c8921b9a0ffa9dcf66120c07"><code>34ae628</code></a>
chore: update generated content</li>
<li><a
href="https://github.com/docker/setup-qemu-action/commit/1f3d2e1ac09548bbfb7d44b1809ac66e3d551ea4"><code>1f3d2e1</code></a>
chore: fix author in package.json</li>
<li><a
href="https://github.com/docker/setup-qemu-action/commit/277dbe8c9c1990b42dc38cb1f35a593889a31a62"><code>277dbe8</code></a>
vendor: bump <code>@​docker/actions-toolkit</code> from 0.3.0 to
0.12.0</li>
<li>Additional commits viewable in <a
href="https://github.com/docker/setup-qemu-action/compare/v2...v3">compare
view</a></li>
</ul>
</details>
<br />


[![Dependabot compatibility
score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=docker/setup-qemu-action&package-manager=github_actions&previous-version=2&new-version=3)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores)

Dependabot will resolve any conflicts with this PR as long as you don't
alter it yourself. You can also trigger a rebase manually by commenting
`@dependabot rebase`.

[//]: # (dependabot-automerge-start)
[//]: # (dependabot-automerge-end)

---

<details>
<summary>Dependabot commands and options</summary>
<br />

You can trigger Dependabot actions by commenting on this PR:
- `@dependabot rebase` will rebase this PR
- `@dependabot recreate` will recreate this PR, overwriting any edits
that have been made to it
- `@dependabot merge` will merge this PR after your CI passes on it
- `@dependabot squash and merge` will squash and merge this PR after
your CI passes on it
- `@dependabot cancel merge` will cancel a previously requested merge
and block automerging
- `@dependabot reopen` will reopen this PR if it is closed
- `@dependabot close` will close this PR and stop Dependabot recreating
it. You can achieve the same result by closing it manually
- `@dependabot show <dependency name> ignore conditions` will show all
of the ignore conditions of the specified dependency
- `@dependabot ignore this major version` will close this PR and stop
Dependabot creating any more for this major version (unless you reopen
the PR or upgrade to it yourself)
- `@dependabot ignore this minor version` will close this PR and stop
Dependabot creating any more for this minor version (unless you reopen
the PR or upgrade to it yourself)
- `@dependabot ignore this dependency` will close this PR and stop
Dependabot creating any more for this dependency (unless you reopen the
PR or upgrade to it yourself)


</details>

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
---
 .github/workflows/build_wheel.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/build_wheel.yml b/.github/workflows/build_wheel.yml
index beb46cac70..2c0bc5d387 100644
--- a/.github/workflows/build_wheel.yml
+++ b/.github/workflows/build_wheel.yml
@@ -43,7 +43,7 @@ jobs:
           submodules: true
           # https://github.com/pypa/setuptools_scm/issues/480
           fetch-depth: 0
-      - uses: docker/setup-qemu-action@v2
+      - uses: docker/setup-qemu-action@v3
         name: Setup QEMU
         if: matrix.platform_id == 'manylinux_aarch64'
       - name: Build wheels

From 0aae8e5bd9c8e5052e080ded3d71ce4f47ba5638 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Tue, 19 Sep 2023 13:19:36 +0800
Subject: [PATCH 42/63] Bump docker/login-action from 2.2.0 to 3.0.0 (#2834)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Bumps [docker/login-action](https://github.com/docker/login-action) from
2.2.0 to 3.0.0.
<details>
<summary>Release notes</summary>
<p><em>Sourced from <a
href="https://github.com/docker/login-action/releases">docker/login-action's
releases</a>.</em></p>
<blockquote>
<h2>v3.0.0</h2>
<ul>
<li>Node 20 as default runtime (requires <a
href="https://github.com/actions/runner/releases/tag/v2.308.0">Actions
Runner v2.308.0</a> or later) by <a
href="https://github.com/crazy-max"><code>@​crazy-max</code></a> in <a
href="https://redirect.github.com/docker/login-action/pull/593">docker/login-action#593</a></li>
<li>Bump <code>@​actions/core</code> from 1.10.0 to 1.10.1 in <a
href="https://redirect.github.com/docker/login-action/pull/598">docker/login-action#598</a></li>
<li>Bump <code>@​aws-sdk/client-ecr</code> and
<code>@​aws-sdk/client-ecr-public</code> to 3.410.0 in <a
href="https://redirect.github.com/docker/login-action/pull/555">docker/login-action#555</a>
<a
href="https://redirect.github.com/docker/login-action/pull/560">docker/login-action#560</a>
<a
href="https://redirect.github.com/docker/login-action/pull/582">docker/login-action#582</a>
<a
href="https://redirect.github.com/docker/login-action/pull/599">docker/login-action#599</a></li>
<li>Bump semver from 6.3.0 to 6.3.1 in <a
href="https://redirect.github.com/docker/login-action/pull/556">docker/login-action#556</a></li>
<li>Bump https-proxy-agent to 7.0.2 <a
href="https://redirect.github.com/docker/login-action/pull/561">docker/login-action#561</a>
<a
href="https://redirect.github.com/docker/login-action/pull/588">docker/login-action#588</a></li>
</ul>
<p><strong>Full Changelog</strong>: <a
href="https://github.com/docker/login-action/compare/v2.2.0...v3.0.0">https://github.com/docker/login-action/compare/v2.2.0...v3.0.0</a></p>
</blockquote>
</details>
<details>
<summary>Commits</summary>
<ul>
<li><a
href="https://github.com/docker/login-action/commit/343f7c4344506bcbf9b4de18042ae17996df046d"><code>343f7c4</code></a>
Merge pull request <a
href="https://redirect.github.com/docker/login-action/issues/599">#599</a>
from docker/dependabot/npm_and_yarn/aws-sdk-dependenc...</li>
<li><a
href="https://github.com/docker/login-action/commit/aad0f974f21dc644b324e9fa84c4e364f62acbe6"><code>aad0f97</code></a>
chore: update generated content</li>
<li><a
href="https://github.com/docker/login-action/commit/2e0cd391447ec1a654ce199502a5d596fad131a2"><code>2e0cd39</code></a>
build(deps): bump the aws-sdk-dependencies group with 2 updates</li>
<li><a
href="https://github.com/docker/login-action/commit/203bc9c4eff55a7fac1552bc4811dc0ea4814f2e"><code>203bc9c</code></a>
Merge pull request <a
href="https://redirect.github.com/docker/login-action/issues/588">#588</a>
from docker/dependabot/npm_and_yarn/proxy-agent-depen...</li>
<li><a
href="https://github.com/docker/login-action/commit/2199648fc889a2592472959743a8e7d4423bcb29"><code>2199648</code></a>
chore: update generated content</li>
<li><a
href="https://github.com/docker/login-action/commit/b489376173c4ff2c6e783dcb597ba8eff69245fe"><code>b489376</code></a>
build(deps): bump the proxy-agent-dependencies group with 1 update</li>
<li><a
href="https://github.com/docker/login-action/commit/7c309e74e68d0a0055fd02607b10b3d96510544c"><code>7c309e7</code></a>
Merge pull request <a
href="https://redirect.github.com/docker/login-action/issues/598">#598</a>
from docker/dependabot/npm_and_yarn/actions/core-1.10.1</li>
<li><a
href="https://github.com/docker/login-action/commit/0ccf222961de35820c1704a0293ca7483b07d065"><code>0ccf222</code></a>
chore: update generated content</li>
<li><a
href="https://github.com/docker/login-action/commit/56d703e106032867ad04c1e54d781c209f451e26"><code>56d703e</code></a>
Merge pull request <a
href="https://redirect.github.com/docker/login-action/issues/597">#597</a>
from docker/dependabot/github_actions/aws-actions/con...</li>
<li><a
href="https://github.com/docker/login-action/commit/24d3b3519e6e369d4d0a307a02881c2f81318560"><code>24d3b35</code></a>
build(deps): bump <code>@​actions/core</code> from 1.10.0 to 1.10.1</li>
<li>Additional commits viewable in <a
href="https://github.com/docker/login-action/compare/465a07811f14bebb1938fbed4728c6a1ff8901fc...343f7c4344506bcbf9b4de18042ae17996df046d">compare
view</a></li>
</ul>
</details>
<br />


[![Dependabot compatibility
score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=docker/login-action&package-manager=github_actions&previous-version=2.2.0&new-version=3.0.0)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores)

Dependabot will resolve any conflicts with this PR as long as you don't
alter it yourself. You can also trigger a rebase manually by commenting
`@dependabot rebase`.

[//]: # (dependabot-automerge-start)
[//]: # (dependabot-automerge-end)

---

<details>
<summary>Dependabot commands and options</summary>
<br />

You can trigger Dependabot actions by commenting on this PR:
- `@dependabot rebase` will rebase this PR
- `@dependabot recreate` will recreate this PR, overwriting any edits
that have been made to it
- `@dependabot merge` will merge this PR after your CI passes on it
- `@dependabot squash and merge` will squash and merge this PR after
your CI passes on it
- `@dependabot cancel merge` will cancel a previously requested merge
and block automerging
- `@dependabot reopen` will reopen this PR if it is closed
- `@dependabot close` will close this PR and stop Dependabot recreating
it. You can achieve the same result by closing it manually
- `@dependabot show <dependency name> ignore conditions` will show all
of the ignore conditions of the specified dependency
- `@dependabot ignore this major version` will close this PR and stop
Dependabot creating any more for this major version (unless you reopen
the PR or upgrade to it yourself)
- `@dependabot ignore this minor version` will close this PR and stop
Dependabot creating any more for this minor version (unless you reopen
the PR or upgrade to it yourself)
- `@dependabot ignore this dependency` will close this PR and stop
Dependabot creating any more for this dependency (unless you reopen the
PR or upgrade to it yourself)


</details>

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
---
 .github/workflows/build_wheel.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/build_wheel.yml b/.github/workflows/build_wheel.yml
index 2c0bc5d387..49ed433609 100644
--- a/.github/workflows/build_wheel.yml
+++ b/.github/workflows/build_wheel.yml
@@ -101,7 +101,7 @@ jobs:
           name: artifact
           path: source/install/docker/dist
       - name: Log in to the Container registry
-        uses: docker/login-action@465a07811f14bebb1938fbed4728c6a1ff8901fc
+        uses: docker/login-action@343f7c4344506bcbf9b4de18042ae17996df046d
         with:
           registry: ghcr.io
           username: ${{ github.actor }}

From 339ce4743194632ad6c4a906fb7c9fa4fe41dc32 Mon Sep 17 00:00:00 2001
From: Jinzhe Zeng <jinzhe.zeng@rutgers.edu>
Date: Tue, 19 Sep 2023 14:46:01 -0400
Subject: [PATCH 43/63] remove `_cuda` or `_rocm` suffix (#2839)

Remove all `_cuda` and `_rocm` suffixes from function names, as proposed in
#2838. The renamed functions can then be merged in follow-up PRs.

(Replace all: `gpu_cuda` -> `gpu`; `gpu_rocm` -> `gpu`)

Signed-off-by: Jinzhe Zeng <jinzhe.zeng@rutgers.edu>
---
 source/lib/include/coord.h                  |  32 +-
 source/lib/include/fmt_nlist.h              |  72 ++---
 source/lib/include/gelu.h                   |  36 +--
 source/lib/include/neighbor_list.h          |  38 +--
 source/lib/include/prod_env_mat.h           | 140 ++++----
 source/lib/include/prod_force.h             |  64 ++--
 source/lib/include/prod_force_grad.h        |  56 ++--
 source/lib/include/prod_virial.h            |  72 ++---
 source/lib/include/prod_virial_grad.h       |  56 ++--
 source/lib/include/region.h                 |  14 +-
 source/lib/include/tabulate.h               | 176 +++++-----
 source/lib/src/cuda/gelu.cu                 |  44 ++-
 source/lib/src/cuda/neighbor_list.cu        |   4 +-
 source/lib/src/cuda/prod_env_mat.cu         | 334 ++++++++++---------
 source/lib/src/cuda/prod_force.cu           |  96 +++---
 source/lib/src/cuda/prod_force_grad.cu      |  84 ++---
 source/lib/src/cuda/prod_virial.cu          | 108 +++----
 source/lib/src/cuda/prod_virial_grad.cu     |  84 ++---
 source/lib/src/cuda/tabulate.cu             | 328 ++++++++++---------
 source/lib/src/rocm/coord.hip.cu            |  96 +++---
 source/lib/src/rocm/gelu.hip.cu             |  44 ++-
 source/lib/src/rocm/neighbor_list.hip.cu    |  70 ++--
 source/lib/src/rocm/prod_env_mat.hip.cu     | 334 ++++++++++---------
 source/lib/src/rocm/prod_force.hip.cu       |  96 +++---
 source/lib/src/rocm/prod_force_grad.hip.cu  |  84 ++---
 source/lib/src/rocm/prod_virial.hip.cu      | 108 +++----
 source/lib/src/rocm/prod_virial_grad.hip.cu |  84 ++---
 source/lib/src/rocm/region.hip.cu           |  44 ++-
 source/lib/src/rocm/tabulate.hip.cu         | 328 ++++++++++---------
 source/lib/tests/test_coord.cc              |  38 +--
 source/lib/tests/test_env_mat_a.cc          |  40 +--
 source/lib/tests/test_env_mat_a_mix.cc      |  20 +-
 source/lib/tests/test_env_mat_r.cc          |  40 +--
 source/lib/tests/test_fmt_nlist.cc          |  44 +--
 source/lib/tests/test_gelu.cc               |  28 +-
 source/lib/tests/test_neighbor_list.cc      |  10 +-
 source/lib/tests/test_prod_force_a.cc       |  12 +-
 source/lib/tests/test_prod_force_grad_a.cc  |   8 +-
 source/lib/tests/test_prod_force_grad_r.cc  |   8 +-
 source/lib/tests/test_prod_force_r.cc       |  12 +-
 source/lib/tests/test_prod_virial_a.cc      |  16 +-
 source/lib/tests/test_prod_virial_grad_a.cc |   8 +-
 source/lib/tests/test_prod_virial_grad_r.cc |   8 +-
 source/lib/tests/test_prod_virial_r.cc      |  16 +-
 source/lib/tests/test_simulation_region.cc  |  10 +-
 source/lib/tests/test_tabulate_se_a.cc      |  40 +--
 source/lib/tests/test_tabulate_se_r.cc      |  30 +-
 source/lib/tests/test_tabulate_se_t.cc      |  24 +-
 source/op/gelu_multi_device.cc              |  12 +-
 source/op/prod_env_mat_multi_device.cc      | 338 ++++++++++----------
 source/op/prod_force_grad_multi_device.cc   |  16 +-
 source/op/prod_force_multi_device.cc        |  16 +-
 source/op/prod_virial_grad_multi_device.cc  |  16 +-
 source/op/prod_virial_multi_device.cc       |  16 +-
 source/op/tabulate_multi_device.cc          |  96 +++---
 55 files changed, 1999 insertions(+), 2049 deletions(-)

diff --git a/source/lib/include/coord.h b/source/lib/include/coord.h
index 56d90fbb17..fb60f6440b 100644
--- a/source/lib/include/coord.h
+++ b/source/lib/include/coord.h
@@ -92,9 +92,9 @@ int copy_coord_gpu(FPTYPE* out_c,
 // input:
 // natom, box_info: boxt, rec_boxt
 template <typename FPTYPE>
-void normalize_coord_gpu_rocm(FPTYPE* coord,
-                              const int natom,
-                              const deepmd::Region<FPTYPE>& region);
+void normalize_coord_gpu(FPTYPE* coord,
+                         const int natom,
+                         const deepmd::Region<FPTYPE>& region);
 
 // copy coordinates
 // outputs:
@@ -111,19 +111,19 @@ void normalize_coord_gpu_rocm(FPTYPE* coord,
 //	1: the memory is not large enough to hold all copied coords and types.
 //	   i.e. nall > mem_nall
 template <typename FPTYPE>
-int copy_coord_gpu_rocm(FPTYPE* out_c,
-                        int* out_t,
-                        int* mapping,
-                        int* nall,
-                        int* int_data,
-                        const FPTYPE* in_c,
-                        const int* in_t,
-                        const int& nloc,
-                        const int& mem_nall,
-                        const int& loc_cellnum,
-                        const int& total_cellnum,
-                        const int* cell_info,
-                        const deepmd::Region<FPTYPE>& region);
+int copy_coord_gpu(FPTYPE* out_c,
+                   int* out_t,
+                   int* mapping,
+                   int* nall,
+                   int* int_data,
+                   const FPTYPE* in_c,
+                   const int* in_t,
+                   const int& nloc,
+                   const int& mem_nall,
+                   const int& loc_cellnum,
+                   const int& total_cellnum,
+                   const int* cell_info,
+                   const deepmd::Region<FPTYPE>& region);
 #endif  // TENSORFLOW_USE_ROCM
 
 }  // namespace deepmd
diff --git a/source/lib/include/fmt_nlist.h b/source/lib/include/fmt_nlist.h
index 60e34c7da9..1e7c6574cc 100644
--- a/source/lib/include/fmt_nlist.h
+++ b/source/lib/include/fmt_nlist.h
@@ -20,50 +20,50 @@ void format_nlist_cpu(int* nlist,
 
 #if GOOGLE_CUDA
 template <typename FPTYPE>
-void format_nbor_list_gpu_cuda(int* nlist,
-                               const FPTYPE* coord,
-                               const int* type,
-                               const deepmd::InputNlist& gpu_inlist,
-                               int* array_int,
-                               uint_64* array_longlong,
-                               const int max_nbor_size,
-                               const int nloc,
-                               const int nall,
-                               const float rcut,
-                               const std::vector<int> sec);
+void format_nbor_list_gpu(int* nlist,
+                          const FPTYPE* coord,
+                          const int* type,
+                          const deepmd::InputNlist& gpu_inlist,
+                          int* array_int,
+                          uint_64* array_longlong,
+                          const int max_nbor_size,
+                          const int nloc,
+                          const int nall,
+                          const float rcut,
+                          const std::vector<int> sec);
 
 template <typename FPTYPE>
-void test_encoding_decoding_nbor_info_gpu_cuda(uint_64* key,
-                                               int* out_type,
-                                               int* out_index,
-                                               const int* in_type,
-                                               const FPTYPE* in_dist,
-                                               const int* in_index,
-                                               const int size_of_array);
+void test_encoding_decoding_nbor_info_gpu(uint_64* key,
+                                          int* out_type,
+                                          int* out_index,
+                                          const int* in_type,
+                                          const FPTYPE* in_dist,
+                                          const int* in_index,
+                                          const int size_of_array);
 #endif  // GOOGLE_CUDA
 
 #if TENSORFLOW_USE_ROCM
 template <typename FPTYPE>
-void format_nbor_list_gpu_rocm(int* nlist,
-                               const FPTYPE* coord,
-                               const int* type,
-                               const deepmd::InputNlist& gpu_inlist,
-                               int* array_int,
-                               uint_64* array_longlong,
-                               const int max_nbor_size,
-                               const int nloc,
-                               const int nall,
-                               const float rcut,
-                               const std::vector<int> sec);
+void format_nbor_list_gpu(int* nlist,
+                          const FPTYPE* coord,
+                          const int* type,
+                          const deepmd::InputNlist& gpu_inlist,
+                          int* array_int,
+                          uint_64* array_longlong,
+                          const int max_nbor_size,
+                          const int nloc,
+                          const int nall,
+                          const float rcut,
+                          const std::vector<int> sec);
 
 template <typename FPTYPE>
-void test_encoding_decoding_nbor_info_gpu_rocm(uint_64* key,
-                                               int* out_type,
-                                               int* out_index,
-                                               const int* in_type,
-                                               const FPTYPE* in_dist,
-                                               const int* in_index,
-                                               const int size_of_array);
+void test_encoding_decoding_nbor_info_gpu(uint_64* key,
+                                          int* out_type,
+                                          int* out_index,
+                                          const int* in_type,
+                                          const FPTYPE* in_dist,
+                                          const int* in_index,
+                                          const int size_of_array);
 #endif  // TENSORFLOW_USE_ROCM
 
 }  // namespace deepmd
diff --git a/source/lib/include/gelu.h b/source/lib/include/gelu.h
index a3985ce0cc..946c283c8d 100644
--- a/source/lib/include/gelu.h
+++ b/source/lib/include/gelu.h
@@ -22,38 +22,38 @@ void gelu_grad_grad_cpu(FPTYPE* out,
 
 #if GOOGLE_CUDA
 template <typename FPTYPE>
-void gelu_gpu_cuda(FPTYPE* out, const FPTYPE* xx, const int_64 size);
+void gelu_gpu(FPTYPE* out, const FPTYPE* xx, const int_64 size);
 
 template <typename FPTYPE>
-void gelu_grad_gpu_cuda(FPTYPE* out,
+void gelu_grad_gpu(FPTYPE* out,
+                   const FPTYPE* xx,
+                   const FPTYPE* dy,
+                   const int_64 size);
+
+template <typename FPTYPE>
+void gelu_grad_grad_gpu(FPTYPE* out,
                         const FPTYPE* xx,
                         const FPTYPE* dy,
+                        const FPTYPE* dy_2,
                         const int_64 size);
-
-template <typename FPTYPE>
-void gelu_grad_grad_gpu_cuda(FPTYPE* out,
-                             const FPTYPE* xx,
-                             const FPTYPE* dy,
-                             const FPTYPE* dy_2,
-                             const int_64 size);
 #endif  // GOOGLE_CUDA
 
 #if TENSORFLOW_USE_ROCM
 template <typename FPTYPE>
-void gelu_gpu_rocm(FPTYPE* out, const FPTYPE* xx, const int_64 size);
+void gelu_gpu(FPTYPE* out, const FPTYPE* xx, const int_64 size);
 
 template <typename FPTYPE>
-void gelu_grad_gpu_rocm(FPTYPE* out,
+void gelu_grad_gpu(FPTYPE* out,
+                   const FPTYPE* xx,
+                   const FPTYPE* dy,
+                   const int_64 size);
+
+template <typename FPTYPE>
+void gelu_grad_grad_gpu(FPTYPE* out,
                         const FPTYPE* xx,
                         const FPTYPE* dy,
+                        const FPTYPE* dy_2,
                         const int_64 size);
 
-template <typename FPTYPE>
-void gelu_grad_grad_gpu_rocm(FPTYPE* out,
-                             const FPTYPE* xx,
-                             const FPTYPE* dy,
-                             const FPTYPE* dy_2,
-                             const int_64 size);
-
 #endif  // TENSORFLOW_USE_ROCM
 }  // namespace deepmd
diff --git a/source/lib/include/neighbor_list.h b/source/lib/include/neighbor_list.h
index 4e0ce4f2de..5ed2dd4501 100644
--- a/source/lib/include/neighbor_list.h
+++ b/source/lib/include/neighbor_list.h
@@ -150,7 +150,7 @@ int build_nlist_gpu(InputNlist& nlist,
  * @param ftype_in The input atom type.
  * @param nloc The number of atoms.
  */
-void filter_ftype_gpu_cuda(int* ftype_out, const int* ftype_in, const int nloc);
+void filter_ftype_gpu(int* ftype_out, const int* ftype_in, const int nloc);
 
 void use_nei_info_gpu(int* nlist,
                       int* ntype,
@@ -177,14 +177,14 @@ void use_nei_info_gpu(int* nlist,
 //	1: the memory is not large enough to hold all neighbors.
 //	   i.e. max_list_size > mem_nall
 template <typename FPTYPE>
-int build_nlist_gpu_rocm(InputNlist& nlist,
-                         int* max_list_size,
-                         int* nlist_data,
-                         const FPTYPE* c_cpy,
-                         const int& nloc,
-                         const int& nall,
-                         const int& mem_size,
-                         const float& rcut);
+int build_nlist_gpu(InputNlist& nlist,
+                    int* max_list_size,
+                    int* nlist_data,
+                    const FPTYPE* c_cpy,
+                    const int& nloc,
+                    const int& nall,
+                    const int& mem_size,
+                    const float& rcut);
 /**
  * @brief Filter the fake atom type.
  * @details If >=0, set to 0; if <0, set to -1.
@@ -192,17 +192,17 @@ int build_nlist_gpu_rocm(InputNlist& nlist,
  * @param ftype_in The input atom type.
  * @param nloc The number of atoms.
  */
-void filter_ftype_gpu_rocm(int* ftype_out, const int* ftype_in, const int nloc);
+void filter_ftype_gpu(int* ftype_out, const int* ftype_in, const int nloc);
 
-void use_nei_info_gpu_rocm(int* nlist,
-                           int* ntype,
-                           bool* nmask,
-                           const int* type,
-                           const int* nlist_map,
-                           const int nloc,
-                           const int nnei,
-                           const int ntypes,
-                           const bool b_nlist_map);
+void use_nei_info_gpu(int* nlist,
+                      int* ntype,
+                      bool* nmask,
+                      const int* type,
+                      const int* nlist_map,
+                      const int nloc,
+                      const int nnei,
+                      const int ntypes,
+                      const bool b_nlist_map);
 
 #endif  // TENSORFLOW_USE_ROCM
 
diff --git a/source/lib/include/prod_env_mat.h b/source/lib/include/prod_env_mat.h
index a1cd27bef0..91f09f74e7 100644
--- a/source/lib/include/prod_env_mat.h
+++ b/source/lib/include/prod_env_mat.h
@@ -44,43 +44,43 @@ void prod_env_mat_r_cpu(FPTYPE *em,
 
 #if GOOGLE_CUDA
 template <typename FPTYPE>
-void prod_env_mat_a_gpu_cuda(FPTYPE *em,
-                             FPTYPE *em_deriv,
-                             FPTYPE *rij,
-                             int *nlist,
-                             const FPTYPE *coord,
-                             const int *type,
-                             const InputNlist &gpu_inlist,
-                             int *array_int,
-                             unsigned long long *array_longlong,
-                             const int max_nbor_size,
-                             const FPTYPE *avg,
-                             const FPTYPE *std,
-                             const int nloc,
-                             const int nall,
-                             const float rcut,
-                             const float rcut_smth,
-                             const std::vector<int> sec,
-                             const int *f_type = NULL);
+void prod_env_mat_a_gpu(FPTYPE *em,
+                        FPTYPE *em_deriv,
+                        FPTYPE *rij,
+                        int *nlist,
+                        const FPTYPE *coord,
+                        const int *type,
+                        const InputNlist &gpu_inlist,
+                        int *array_int,
+                        unsigned long long *array_longlong,
+                        const int max_nbor_size,
+                        const FPTYPE *avg,
+                        const FPTYPE *std,
+                        const int nloc,
+                        const int nall,
+                        const float rcut,
+                        const float rcut_smth,
+                        const std::vector<int> sec,
+                        const int *f_type = NULL);
 
 template <typename FPTYPE>
-void prod_env_mat_r_gpu_cuda(FPTYPE *em,
-                             FPTYPE *em_deriv,
-                             FPTYPE *rij,
-                             int *nlist,
-                             const FPTYPE *coord,
-                             const int *type,
-                             const InputNlist &gpu_inlist,
-                             int *array_int,
-                             unsigned long long *array_longlong,
-                             const int max_nbor_size,
-                             const FPTYPE *avg,
-                             const FPTYPE *std,
-                             const int nloc,
-                             const int nall,
-                             const float rcut,
-                             const float rcut_smth,
-                             const std::vector<int> sec);
+void prod_env_mat_r_gpu(FPTYPE *em,
+                        FPTYPE *em_deriv,
+                        FPTYPE *rij,
+                        int *nlist,
+                        const FPTYPE *coord,
+                        const int *type,
+                        const InputNlist &gpu_inlist,
+                        int *array_int,
+                        unsigned long long *array_longlong,
+                        const int max_nbor_size,
+                        const FPTYPE *avg,
+                        const FPTYPE *std,
+                        const int nloc,
+                        const int nall,
+                        const float rcut,
+                        const float rcut_smth,
+                        const std::vector<int> sec);
 
 void env_mat_nbor_update(InputNlist &inlist,
                          InputNlist &gpu_inlist,
@@ -92,43 +92,43 @@ void env_mat_nbor_update(InputNlist &inlist,
 
 #if TENSORFLOW_USE_ROCM
 template <typename FPTYPE>
-void prod_env_mat_a_gpu_rocm(FPTYPE *em,
-                             FPTYPE *em_deriv,
-                             FPTYPE *rij,
-                             int *nlist,
-                             const FPTYPE *coord,
-                             const int *type,
-                             const InputNlist &gpu_inlist,
-                             int *array_int,
-                             unsigned long long *array_longlong,
-                             const int max_nbor_size,
-                             const FPTYPE *avg,
-                             const FPTYPE *std,
-                             const int nloc,
-                             const int nall,
-                             const float rcut,
-                             const float rcut_smth,
-                             const std::vector<int> sec,
-                             const int *f_type = NULL);
+void prod_env_mat_a_gpu(FPTYPE *em,
+                        FPTYPE *em_deriv,
+                        FPTYPE *rij,
+                        int *nlist,
+                        const FPTYPE *coord,
+                        const int *type,
+                        const InputNlist &gpu_inlist,
+                        int *array_int,
+                        unsigned long long *array_longlong,
+                        const int max_nbor_size,
+                        const FPTYPE *avg,
+                        const FPTYPE *std,
+                        const int nloc,
+                        const int nall,
+                        const float rcut,
+                        const float rcut_smth,
+                        const std::vector<int> sec,
+                        const int *f_type = NULL);
 
 template <typename FPTYPE>
-void prod_env_mat_r_gpu_rocm(FPTYPE *em,
-                             FPTYPE *em_deriv,
-                             FPTYPE *rij,
-                             int *nlist,
-                             const FPTYPE *coord,
-                             const int *type,
-                             const InputNlist &gpu_inlist,
-                             int *array_int,
-                             unsigned long long *array_longlong,
-                             const int max_nbor_size,
-                             const FPTYPE *avg,
-                             const FPTYPE *std,
-                             const int nloc,
-                             const int nall,
-                             const float rcut,
-                             const float rcut_smth,
-                             const std::vector<int> sec);
+void prod_env_mat_r_gpu(FPTYPE *em,
+                        FPTYPE *em_deriv,
+                        FPTYPE *rij,
+                        int *nlist,
+                        const FPTYPE *coord,
+                        const int *type,
+                        const InputNlist &gpu_inlist,
+                        int *array_int,
+                        unsigned long long *array_longlong,
+                        const int max_nbor_size,
+                        const FPTYPE *avg,
+                        const FPTYPE *std,
+                        const int nloc,
+                        const int nall,
+                        const float rcut,
+                        const float rcut_smth,
+                        const std::vector<int> sec);
 
 void env_mat_nbor_update(InputNlist &inlist,
                          InputNlist &gpu_inlist,
diff --git a/source/lib/include/prod_force.h b/source/lib/include/prod_force.h
index ce3e020a3b..03c72ba661 100644
--- a/source/lib/include/prod_force.h
+++ b/source/lib/include/prod_force.h
@@ -69,46 +69,46 @@ void prod_force_r_cpu(FPTYPE* force,
 
 #if GOOGLE_CUDA
 template <typename FPTYPE>
-void prod_force_a_gpu_cuda(FPTYPE* force,
-                           const FPTYPE* net_deriv,
-                           const FPTYPE* in_deriv,
-                           const int* nlist,
-                           const int nloc,
-                           const int nall,
-                           const int nnei,
-                           const int nframes);
+void prod_force_a_gpu(FPTYPE* force,
+                      const FPTYPE* net_deriv,
+                      const FPTYPE* in_deriv,
+                      const int* nlist,
+                      const int nloc,
+                      const int nall,
+                      const int nnei,
+                      const int nframes);
 
 template <typename FPTYPE>
-void prod_force_r_gpu_cuda(FPTYPE* force,
-                           const FPTYPE* net_deriv,
-                           const FPTYPE* in_deriv,
-                           const int* nlist,
-                           const int nloc,
-                           const int nall,
-                           const int nnei,
-                           const int nframes);
+void prod_force_r_gpu(FPTYPE* force,
+                      const FPTYPE* net_deriv,
+                      const FPTYPE* in_deriv,
+                      const int* nlist,
+                      const int nloc,
+                      const int nall,
+                      const int nnei,
+                      const int nframes);
 #endif  // GOOGLE_CUDA
 
 #if TENSORFLOW_USE_ROCM
 template <typename FPTYPE>
-void prod_force_a_gpu_rocm(FPTYPE* force,
-                           const FPTYPE* net_deriv,
-                           const FPTYPE* in_deriv,
-                           const int* nlist,
-                           const int nloc,
-                           const int nall,
-                           const int nnei,
-                           const int nframes);
+void prod_force_a_gpu(FPTYPE* force,
+                      const FPTYPE* net_deriv,
+                      const FPTYPE* in_deriv,
+                      const int* nlist,
+                      const int nloc,
+                      const int nall,
+                      const int nnei,
+                      const int nframes);
 
 template <typename FPTYPE>
-void prod_force_r_gpu_rocm(FPTYPE* force,
-                           const FPTYPE* net_deriv,
-                           const FPTYPE* in_deriv,
-                           const int* nlist,
-                           const int nloc,
-                           const int nall,
-                           const int nnei,
-                           const int nframes);
+void prod_force_r_gpu(FPTYPE* force,
+                      const FPTYPE* net_deriv,
+                      const FPTYPE* in_deriv,
+                      const int* nlist,
+                      const int nloc,
+                      const int nall,
+                      const int nnei,
+                      const int nframes);
 #endif  // TENSORFLOW_USE_ROCM
 
 }  // namespace deepmd
diff --git a/source/lib/include/prod_force_grad.h b/source/lib/include/prod_force_grad.h
index 4d224ad93f..5d0ab50b68 100644
--- a/source/lib/include/prod_force_grad.h
+++ b/source/lib/include/prod_force_grad.h
@@ -23,41 +23,41 @@ void prod_force_grad_r_cpu(FPTYPE* grad_net,
 
 #if GOOGLE_CUDA
 template <typename FPTYPE>
-void prod_force_grad_a_gpu_cuda(FPTYPE* grad_net,
-                                const FPTYPE* grad,
-                                const FPTYPE* env_deriv,
-                                const int* nlist,
-                                const int nloc,
-                                const int nnei,
-                                const int nframes);
+void prod_force_grad_a_gpu(FPTYPE* grad_net,
+                           const FPTYPE* grad,
+                           const FPTYPE* env_deriv,
+                           const int* nlist,
+                           const int nloc,
+                           const int nnei,
+                           const int nframes);
 
 template <typename FPTYPE>
-void prod_force_grad_r_gpu_cuda(FPTYPE* grad_net,
-                                const FPTYPE* grad,
-                                const FPTYPE* env_deriv,
-                                const int* nlist,
-                                const int nloc,
-                                const int nnei,
-                                const int nframes);
+void prod_force_grad_r_gpu(FPTYPE* grad_net,
+                           const FPTYPE* grad,
+                           const FPTYPE* env_deriv,
+                           const int* nlist,
+                           const int nloc,
+                           const int nnei,
+                           const int nframes);
 #endif  // GOOGLE_CUDA
 
 #if TENSORFLOW_USE_ROCM
 template <typename FPTYPE>
-void prod_force_grad_a_gpu_rocm(FPTYPE* grad_net,
-                                const FPTYPE* grad,
-                                const FPTYPE* env_deriv,
-                                const int* nlist,
-                                const int nloc,
-                                const int nnei,
-                                const int nframes);
+void prod_force_grad_a_gpu(FPTYPE* grad_net,
+                           const FPTYPE* grad,
+                           const FPTYPE* env_deriv,
+                           const int* nlist,
+                           const int nloc,
+                           const int nnei,
+                           const int nframes);
 
 template <typename FPTYPE>
-void prod_force_grad_r_gpu_rocm(FPTYPE* grad_net,
-                                const FPTYPE* grad,
-                                const FPTYPE* env_deriv,
-                                const int* nlist,
-                                const int nloc,
-                                const int nnei,
-                                const int nframes);
+void prod_force_grad_r_gpu(FPTYPE* grad_net,
+                           const FPTYPE* grad,
+                           const FPTYPE* env_deriv,
+                           const int* nlist,
+                           const int nloc,
+                           const int nnei,
+                           const int nframes);
 #endif  // TENSORFLOW_USE_ROCM
 }  // namespace deepmd
diff --git a/source/lib/include/prod_virial.h b/source/lib/include/prod_virial.h
index 46e0ef3ab9..348188874c 100644
--- a/source/lib/include/prod_virial.h
+++ b/source/lib/include/prod_virial.h
@@ -27,50 +27,50 @@ void prod_virial_r_cpu(FPTYPE* virial,
 
 #if GOOGLE_CUDA
 template <typename FPTYPE>
-void prod_virial_a_gpu_cuda(FPTYPE* virial,
-                            FPTYPE* atom_virial,
-                            const FPTYPE* net_deriv,
-                            const FPTYPE* env_deriv,
-                            const FPTYPE* rij,
-                            const int* nlist,
-                            const int nloc,
-                            const int nall,
-                            const int nnei);
+void prod_virial_a_gpu(FPTYPE* virial,
+                       FPTYPE* atom_virial,
+                       const FPTYPE* net_deriv,
+                       const FPTYPE* env_deriv,
+                       const FPTYPE* rij,
+                       const int* nlist,
+                       const int nloc,
+                       const int nall,
+                       const int nnei);
 
 template <typename FPTYPE>
-void prod_virial_r_gpu_cuda(FPTYPE* virial,
-                            FPTYPE* atom_virial,
-                            const FPTYPE* net_deriv,
-                            const FPTYPE* env_deriv,
-                            const FPTYPE* rij,
-                            const int* nlist,
-                            const int nloc,
-                            const int nall,
-                            const int nnei);
+void prod_virial_r_gpu(FPTYPE* virial,
+                       FPTYPE* atom_virial,
+                       const FPTYPE* net_deriv,
+                       const FPTYPE* env_deriv,
+                       const FPTYPE* rij,
+                       const int* nlist,
+                       const int nloc,
+                       const int nall,
+                       const int nnei);
 #endif  // GOOGLE_CUDA
 
 #if TENSORFLOW_USE_ROCM
 template <typename FPTYPE>
-void prod_virial_a_gpu_rocm(FPTYPE* virial,
-                            FPTYPE* atom_virial,
-                            const FPTYPE* net_deriv,
-                            const FPTYPE* env_deriv,
-                            const FPTYPE* rij,
-                            const int* nlist,
-                            const int nloc,
-                            const int nall,
-                            const int nnei);
+void prod_virial_a_gpu(FPTYPE* virial,
+                       FPTYPE* atom_virial,
+                       const FPTYPE* net_deriv,
+                       const FPTYPE* env_deriv,
+                       const FPTYPE* rij,
+                       const int* nlist,
+                       const int nloc,
+                       const int nall,
+                       const int nnei);
 
 template <typename FPTYPE>
-void prod_virial_r_gpu_rocm(FPTYPE* virial,
-                            FPTYPE* atom_virial,
-                            const FPTYPE* net_deriv,
-                            const FPTYPE* env_deriv,
-                            const FPTYPE* rij,
-                            const int* nlist,
-                            const int nloc,
-                            const int nall,
-                            const int nnei);
+void prod_virial_r_gpu(FPTYPE* virial,
+                       FPTYPE* atom_virial,
+                       const FPTYPE* net_deriv,
+                       const FPTYPE* env_deriv,
+                       const FPTYPE* rij,
+                       const int* nlist,
+                       const int nloc,
+                       const int nall,
+                       const int nnei);
 #endif  // TENSORFLOW_USE_ROCM
 
 }  // namespace deepmd
diff --git a/source/lib/include/prod_virial_grad.h b/source/lib/include/prod_virial_grad.h
index d840e6b718..6e0c232f8a 100644
--- a/source/lib/include/prod_virial_grad.h
+++ b/source/lib/include/prod_virial_grad.h
@@ -23,42 +23,42 @@ void prod_virial_grad_r_cpu(FPTYPE* grad_net,
 
 #if GOOGLE_CUDA
 template <typename FPTYPE>
-void prod_virial_grad_a_gpu_cuda(FPTYPE* grad_net,
-                                 const FPTYPE* grad,
-                                 const FPTYPE* env_deriv,
-                                 const FPTYPE* rij,
-                                 const int* nlist,
-                                 const int nloc,
-                                 const int nnei);
+void prod_virial_grad_a_gpu(FPTYPE* grad_net,
+                            const FPTYPE* grad,
+                            const FPTYPE* env_deriv,
+                            const FPTYPE* rij,
+                            const int* nlist,
+                            const int nloc,
+                            const int nnei);
 
 template <typename FPTYPE>
-void prod_virial_grad_r_gpu_cuda(FPTYPE* grad_net,
-                                 const FPTYPE* grad,
-                                 const FPTYPE* env_deriv,
-                                 const FPTYPE* rij,
-                                 const int* nlist,
-                                 const int nloc,
-                                 const int nnei);
+void prod_virial_grad_r_gpu(FPTYPE* grad_net,
+                            const FPTYPE* grad,
+                            const FPTYPE* env_deriv,
+                            const FPTYPE* rij,
+                            const int* nlist,
+                            const int nloc,
+                            const int nnei);
 #endif  // GOOGLE_CUDA
 
 #if TENSORFLOW_USE_ROCM
 template <typename FPTYPE>
-void prod_virial_grad_a_gpu_rocm(FPTYPE* grad_net,
-                                 const FPTYPE* grad,
-                                 const FPTYPE* env_deriv,
-                                 const FPTYPE* rij,
-                                 const int* nlist,
-                                 const int nloc,
-                                 const int nnei);
+void prod_virial_grad_a_gpu(FPTYPE* grad_net,
+                            const FPTYPE* grad,
+                            const FPTYPE* env_deriv,
+                            const FPTYPE* rij,
+                            const int* nlist,
+                            const int nloc,
+                            const int nnei);
 
 template <typename FPTYPE>
-void prod_virial_grad_r_gpu_rocm(FPTYPE* grad_net,
-                                 const FPTYPE* grad,
-                                 const FPTYPE* env_deriv,
-                                 const FPTYPE* rij,
-                                 const int* nlist,
-                                 const int nloc,
-                                 const int nnei);
+void prod_virial_grad_r_gpu(FPTYPE* grad_net,
+                            const FPTYPE* grad,
+                            const FPTYPE* env_deriv,
+                            const FPTYPE* rij,
+                            const int* nlist,
+                            const int nloc,
+                            const int nnei);
 #endif  // TENSORFLOW_USE_ROCM
 
 }  // namespace deepmd
diff --git a/source/lib/include/region.h b/source/lib/include/region.h
index 6aaf805ccd..9db2735462 100644
--- a/source/lib/include/region.h
+++ b/source/lib/include/region.h
@@ -46,16 +46,16 @@ void volume_gpu(FPTYPE* volume, const Region<FPTYPE>& region);
 #if TENSORFLOW_USE_ROCM
 // only for unittest
 template <typename FPTYPE>
-void convert_to_inter_gpu_rocm(FPTYPE* ri,
-                               const Region<FPTYPE>& region,
-                               const FPTYPE* rp);
+void convert_to_inter_gpu(FPTYPE* ri,
+                          const Region<FPTYPE>& region,
+                          const FPTYPE* rp);
 
 template <typename FPTYPE>
-void convert_to_phys_gpu_rocm(FPTYPE* rp,
-                              const Region<FPTYPE>& region,
-                              const FPTYPE* ri);
+void convert_to_phys_gpu(FPTYPE* rp,
+                         const Region<FPTYPE>& region,
+                         const FPTYPE* ri);
 
 template <typename FPTYPE>
-void volume_gpu_rocm(FPTYPE* volume, const Region<FPTYPE>& region);
+void volume_gpu(FPTYPE* volume, const Region<FPTYPE>& region);
 #endif  // TENSORFLOW_USE_ROCM
 }  // namespace deepmd
diff --git a/source/lib/include/tabulate.h b/source/lib/include/tabulate.h
index 2e2c021d9c..96072e6a33 100644
--- a/source/lib/include/tabulate.h
+++ b/source/lib/include/tabulate.h
@@ -110,216 +110,216 @@ void tabulate_fusion_se_r_grad_grad_cpu(FPTYPE* dz_dy,
 
 #if GOOGLE_CUDA
 template <typename FPTYPE>
-void tabulate_fusion_se_a_gpu_cuda(FPTYPE* out,
+void tabulate_fusion_se_a_gpu(FPTYPE* out,
+                              const FPTYPE* table,
+                              const FPTYPE* table_info,
+                              const FPTYPE* em_x,
+                              const FPTYPE* em,
+                              const FPTYPE* two_embed,
+                              const int nloc,
+                              const int nnei,
+                              const int last_layer_size,
+                              const bool is_sorted = true);
+
+template <typename FPTYPE>
+void tabulate_fusion_se_a_grad_gpu(FPTYPE* dy_dem_x,
+                                   FPTYPE* dy_dem,
                                    const FPTYPE* table,
                                    const FPTYPE* table_info,
                                    const FPTYPE* em_x,
                                    const FPTYPE* em,
                                    const FPTYPE* two_embed,
+                                   const FPTYPE* dy,
                                    const int nloc,
                                    const int nnei,
                                    const int last_layer_size,
                                    const bool is_sorted = true);
 
 template <typename FPTYPE>
-void tabulate_fusion_se_a_grad_gpu_cuda(FPTYPE* dy_dem_x,
-                                        FPTYPE* dy_dem,
+void tabulate_fusion_se_a_grad_grad_gpu(FPTYPE* dz_dy,
                                         const FPTYPE* table,
                                         const FPTYPE* table_info,
                                         const FPTYPE* em_x,
                                         const FPTYPE* em,
-                                        const FPTYPE* two_embed,
-                                        const FPTYPE* dy,
+                                        const FPTYPE* dz_dy_dem_x,
+                                        const FPTYPE* dz_dy_dem,
                                         const int nloc,
                                         const int nnei,
                                         const int last_layer_size,
                                         const bool is_sorted = true);
 
 template <typename FPTYPE>
-void tabulate_fusion_se_a_grad_grad_gpu_cuda(FPTYPE* dz_dy,
-                                             const FPTYPE* table,
-                                             const FPTYPE* table_info,
-                                             const FPTYPE* em_x,
-                                             const FPTYPE* em,
-                                             const FPTYPE* dz_dy_dem_x,
-                                             const FPTYPE* dz_dy_dem,
-                                             const int nloc,
-                                             const int nnei,
-                                             const int last_layer_size,
-                                             const bool is_sorted = true);
+void tabulate_fusion_se_t_gpu(FPTYPE* out,
+                              const FPTYPE* table,
+                              const FPTYPE* table_info,
+                              const FPTYPE* em_x,
+                              const FPTYPE* em,
+                              const int nloc,
+                              const int nnei_i,
+                              const int nnei_j,
+                              const int last_layer_size);
 
 template <typename FPTYPE>
-void tabulate_fusion_se_t_gpu_cuda(FPTYPE* out,
+void tabulate_fusion_se_t_grad_gpu(FPTYPE* dy_dem_x,
+                                   FPTYPE* dy_dem,
                                    const FPTYPE* table,
                                    const FPTYPE* table_info,
                                    const FPTYPE* em_x,
                                    const FPTYPE* em,
+                                   const FPTYPE* dy,
                                    const int nloc,
                                    const int nnei_i,
                                    const int nnei_j,
                                    const int last_layer_size);
 
 template <typename FPTYPE>
-void tabulate_fusion_se_t_grad_gpu_cuda(FPTYPE* dy_dem_x,
-                                        FPTYPE* dy_dem,
+void tabulate_fusion_se_t_grad_grad_gpu(FPTYPE* dz_dy,
                                         const FPTYPE* table,
                                         const FPTYPE* table_info,
                                         const FPTYPE* em_x,
                                         const FPTYPE* em,
-                                        const FPTYPE* dy,
+                                        const FPTYPE* dz_dy_dem_x,
+                                        const FPTYPE* dz_dy_dem,
                                         const int nloc,
                                         const int nnei_i,
                                         const int nnei_j,
                                         const int last_layer_size);
 
 template <typename FPTYPE>
-void tabulate_fusion_se_t_grad_grad_gpu_cuda(FPTYPE* dz_dy,
-                                             const FPTYPE* table,
-                                             const FPTYPE* table_info,
-                                             const FPTYPE* em_x,
-                                             const FPTYPE* em,
-                                             const FPTYPE* dz_dy_dem_x,
-                                             const FPTYPE* dz_dy_dem,
-                                             const int nloc,
-                                             const int nnei_i,
-                                             const int nnei_j,
-                                             const int last_layer_size);
+void tabulate_fusion_se_r_gpu(FPTYPE* out,
+                              const FPTYPE* table,
+                              const FPTYPE* table_info,
+                              const FPTYPE* em,
+                              const int nloc,
+                              const int nnei,
+                              const int last_layer_size);
 
 template <typename FPTYPE>
-void tabulate_fusion_se_r_gpu_cuda(FPTYPE* out,
+void tabulate_fusion_se_r_grad_gpu(FPTYPE* dy_dem,
                                    const FPTYPE* table,
                                    const FPTYPE* table_info,
                                    const FPTYPE* em,
+                                   const FPTYPE* dy,
                                    const int nloc,
                                    const int nnei,
                                    const int last_layer_size);
 
 template <typename FPTYPE>
-void tabulate_fusion_se_r_grad_gpu_cuda(FPTYPE* dy_dem,
+void tabulate_fusion_se_r_grad_grad_gpu(FPTYPE* dz_dy,
                                         const FPTYPE* table,
                                         const FPTYPE* table_info,
                                         const FPTYPE* em,
-                                        const FPTYPE* dy,
+                                        const FPTYPE* dz_dy_dem,
                                         const int nloc,
                                         const int nnei,
                                         const int last_layer_size);
-
-template <typename FPTYPE>
-void tabulate_fusion_se_r_grad_grad_gpu_cuda(FPTYPE* dz_dy,
-                                             const FPTYPE* table,
-                                             const FPTYPE* table_info,
-                                             const FPTYPE* em,
-                                             const FPTYPE* dz_dy_dem,
-                                             const int nloc,
-                                             const int nnei,
-                                             const int last_layer_size);
 #endif  // GOOGLE_CUDA
 
 #if TENSORFLOW_USE_ROCM
 template <typename FPTYPE>
-void tabulate_fusion_se_a_gpu_rocm(FPTYPE* out,
+void tabulate_fusion_se_a_gpu(FPTYPE* out,
+                              const FPTYPE* table,
+                              const FPTYPE* table_info,
+                              const FPTYPE* em_x,
+                              const FPTYPE* em,
+                              const FPTYPE* two_embed,
+                              const int nloc,
+                              const int nnei,
+                              const int last_layer_size,
+                              const bool is_sorted = true);
+
+template <typename FPTYPE>
+void tabulate_fusion_se_a_grad_gpu(FPTYPE* dy_dem_x,
+                                   FPTYPE* dy_dem,
                                    const FPTYPE* table,
                                    const FPTYPE* table_info,
                                    const FPTYPE* em_x,
                                    const FPTYPE* em,
                                    const FPTYPE* two_embed,
+                                   const FPTYPE* dy,
                                    const int nloc,
                                    const int nnei,
                                    const int last_layer_size,
                                    const bool is_sorted = true);
 
 template <typename FPTYPE>
-void tabulate_fusion_se_a_grad_gpu_rocm(FPTYPE* dy_dem_x,
-                                        FPTYPE* dy_dem,
+void tabulate_fusion_se_a_grad_grad_gpu(FPTYPE* dz_dy,
                                         const FPTYPE* table,
                                         const FPTYPE* table_info,
                                         const FPTYPE* em_x,
                                         const FPTYPE* em,
-                                        const FPTYPE* two_embed,
-                                        const FPTYPE* dy,
+                                        const FPTYPE* dz_dy_dem_x,
+                                        const FPTYPE* dz_dy_dem,
                                         const int nloc,
                                         const int nnei,
                                         const int last_layer_size,
                                         const bool is_sorted = true);
 
 template <typename FPTYPE>
-void tabulate_fusion_se_a_grad_grad_gpu_rocm(FPTYPE* dz_dy,
-                                             const FPTYPE* table,
-                                             const FPTYPE* table_info,
-                                             const FPTYPE* em_x,
-                                             const FPTYPE* em,
-                                             const FPTYPE* dz_dy_dem_x,
-                                             const FPTYPE* dz_dy_dem,
-                                             const int nloc,
-                                             const int nnei,
-                                             const int last_layer_size,
-                                             const bool is_sorted = true);
+void tabulate_fusion_se_t_gpu(FPTYPE* out,
+                              const FPTYPE* table,
+                              const FPTYPE* table_info,
+                              const FPTYPE* em_x,
+                              const FPTYPE* em,
+                              const int nloc,
+                              const int nnei_i,
+                              const int nnei_j,
+                              const int last_layer_size);
 
 template <typename FPTYPE>
-void tabulate_fusion_se_t_gpu_rocm(FPTYPE* out,
+void tabulate_fusion_se_t_grad_gpu(FPTYPE* dy_dem_x,
+                                   FPTYPE* dy_dem,
                                    const FPTYPE* table,
                                    const FPTYPE* table_info,
                                    const FPTYPE* em_x,
                                    const FPTYPE* em,
+                                   const FPTYPE* dy,
                                    const int nloc,
                                    const int nnei_i,
                                    const int nnei_j,
                                    const int last_layer_size);
 
 template <typename FPTYPE>
-void tabulate_fusion_se_t_grad_gpu_rocm(FPTYPE* dy_dem_x,
-                                        FPTYPE* dy_dem,
+void tabulate_fusion_se_t_grad_grad_gpu(FPTYPE* dz_dy,
                                         const FPTYPE* table,
                                         const FPTYPE* table_info,
                                         const FPTYPE* em_x,
                                         const FPTYPE* em,
-                                        const FPTYPE* dy,
+                                        const FPTYPE* dz_dy_dem_x,
+                                        const FPTYPE* dz_dy_dem,
                                         const int nloc,
                                         const int nnei_i,
                                         const int nnei_j,
                                         const int last_layer_size);
 
 template <typename FPTYPE>
-void tabulate_fusion_se_t_grad_grad_gpu_rocm(FPTYPE* dz_dy,
-                                             const FPTYPE* table,
-                                             const FPTYPE* table_info,
-                                             const FPTYPE* em_x,
-                                             const FPTYPE* em,
-                                             const FPTYPE* dz_dy_dem_x,
-                                             const FPTYPE* dz_dy_dem,
-                                             const int nloc,
-                                             const int nnei_i,
-                                             const int nnei_j,
-                                             const int last_layer_size);
+void tabulate_fusion_se_r_gpu(FPTYPE* out,
+                              const FPTYPE* table,
+                              const FPTYPE* table_info,
+                              const FPTYPE* em,
+                              const int nloc,
+                              const int nnei,
+                              const int last_layer_size);
 
 template <typename FPTYPE>
-void tabulate_fusion_se_r_gpu_rocm(FPTYPE* out,
+void tabulate_fusion_se_r_grad_gpu(FPTYPE* dy_dem,
                                    const FPTYPE* table,
                                    const FPTYPE* table_info,
                                    const FPTYPE* em,
+                                   const FPTYPE* dy,
                                    const int nloc,
                                    const int nnei,
                                    const int last_layer_size);
 
 template <typename FPTYPE>
-void tabulate_fusion_se_r_grad_gpu_rocm(FPTYPE* dy_dem,
+void tabulate_fusion_se_r_grad_grad_gpu(FPTYPE* dz_dy,
                                         const FPTYPE* table,
                                         const FPTYPE* table_info,
                                         const FPTYPE* em,
-                                        const FPTYPE* dy,
+                                        const FPTYPE* dz_dy_dem,
                                         const int nloc,
                                         const int nnei,
                                         const int last_layer_size);
 
-template <typename FPTYPE>
-void tabulate_fusion_se_r_grad_grad_gpu_rocm(FPTYPE* dz_dy,
-                                             const FPTYPE* table,
-                                             const FPTYPE* table_info,
-                                             const FPTYPE* em,
-                                             const FPTYPE* dz_dy_dem,
-                                             const int nloc,
-                                             const int nnei,
-                                             const int last_layer_size);
-
 #endif  // TENSORFLOW_USE_ROCM
 }  // namespace deepmd
diff --git a/source/lib/src/cuda/gelu.cu b/source/lib/src/cuda/gelu.cu
index af78043cca..823a843b2a 100644
--- a/source/lib/src/cuda/gelu.cu
+++ b/source/lib/src/cuda/gelu.cu
@@ -63,7 +63,7 @@ __global__ void gelu_grad_grad(FPTYPE* out,
 
 namespace deepmd {
 template <typename FPTYPE>
-void gelu_gpu_cuda(FPTYPE* out, const FPTYPE* xx, const int_64 size) {
+void gelu_gpu(FPTYPE* out, const FPTYPE* xx, const int_64 size) {
   if (size <= 0) {
     return;
   }
@@ -78,10 +78,10 @@ void gelu_gpu_cuda(FPTYPE* out, const FPTYPE* xx, const int_64 size) {
 }
 
 template <typename FPTYPE>
-void gelu_grad_gpu_cuda(FPTYPE* out,
-                        const FPTYPE* xx,
-                        const FPTYPE* dy,
-                        const int_64 size) {
+void gelu_grad_gpu(FPTYPE* out,
+                   const FPTYPE* xx,
+                   const FPTYPE* dy,
+                   const int_64 size) {
   if (size <= 0) {
     return;
   }
@@ -96,11 +96,11 @@ void gelu_grad_gpu_cuda(FPTYPE* out,
 }
 
 template <typename FPTYPE>
-void gelu_grad_grad_gpu_cuda(FPTYPE* out,
-                             const FPTYPE* xx,
-                             const FPTYPE* dy,
-                             const FPTYPE* dy_2,
-                             const int_64 size) {
+void gelu_grad_grad_gpu(FPTYPE* out,
+                        const FPTYPE* xx,
+                        const FPTYPE* dy,
+                        const FPTYPE* dy_2,
+                        const int_64 size) {
   if (size <= 0) {
     return;
   }
@@ -114,28 +114,24 @@ void gelu_grad_grad_gpu_cuda(FPTYPE* out,
   DPErrcheck(cudaDeviceSynchronize());
 }
 
-template void gelu_gpu_cuda<float>(float* out,
+template void gelu_gpu<float>(float* out, const float* x, const int_64 size);
+template void gelu_gpu<double>(double* out, const double* x, const int_64 size);
+template void gelu_grad_gpu<float>(float* out,
                                    const float* x,
+                                   const float* dy,
                                    const int_64 size);
-template void gelu_gpu_cuda<double>(double* out,
+template void gelu_grad_gpu<double>(double* out,
                                     const double* x,
+                                    const double* dy,
                                     const int_64 size);
-template void gelu_grad_gpu_cuda<float>(float* out,
+template void gelu_grad_grad_gpu<float>(float* out,
                                         const float* x,
                                         const float* dy,
+                                        const float* dy_2,
                                         const int_64 size);
-template void gelu_grad_gpu_cuda<double>(double* out,
+template void gelu_grad_grad_gpu<double>(double* out,
                                          const double* x,
                                          const double* dy,
+                                         const double* dy_2,
                                          const int_64 size);
-template void gelu_grad_grad_gpu_cuda<float>(float* out,
-                                             const float* x,
-                                             const float* dy,
-                                             const float* dy_2,
-                                             const int_64 size);
-template void gelu_grad_grad_gpu_cuda<double>(double* out,
-                                              const double* x,
-                                              const double* dy,
-                                              const double* dy_2,
-                                              const int_64 size);
 }  // namespace deepmd
diff --git a/source/lib/src/cuda/neighbor_list.cu b/source/lib/src/cuda/neighbor_list.cu
index 4fae6f3874..7cac07690b 100644
--- a/source/lib/src/cuda/neighbor_list.cu
+++ b/source/lib/src/cuda/neighbor_list.cu
@@ -294,9 +294,7 @@ __global__ void map_filter_ftype(int *ftype_out,
   }
 }
 
-void filter_ftype_gpu_cuda(int *ftype_out,
-                           const int *ftype_in,
-                           const int nloc) {
+void filter_ftype_gpu(int *ftype_out, const int *ftype_in, const int nloc) {
   DPErrcheck(cudaGetLastError());
   DPErrcheck(cudaDeviceSynchronize());
   int nblock = (nloc + TPB - 1) / TPB;
diff --git a/source/lib/src/cuda/prod_env_mat.cu b/source/lib/src/cuda/prod_env_mat.cu
index 8a085a47b5..e603b25db7 100644
--- a/source/lib/src/cuda/prod_env_mat.cu
+++ b/source/lib/src/cuda/prod_env_mat.cu
@@ -558,17 +558,17 @@ __global__ void compute_env_mat_r(FPTYPE* em,
 
 namespace deepmd {
 template <typename FPTYPE>
-void format_nbor_list_gpu_cuda(int* nlist,
-                               const FPTYPE* coord,
-                               const int* type,
-                               const deepmd::InputNlist& gpu_inlist,
-                               int* array_int,
-                               uint_64* array_longlong,
-                               const int max_nbor_size,
-                               const int nloc,
-                               const int nall,
-                               const float rcut,
-                               const std::vector<int> sec) {
+void format_nbor_list_gpu(int* nlist,
+                          const FPTYPE* coord,
+                          const int* type,
+                          const deepmd::InputNlist& gpu_inlist,
+                          int* array_int,
+                          uint_64* array_longlong,
+                          const int max_nbor_size,
+                          const int nloc,
+                          const int nall,
+                          const float rcut,
+                          const std::vector<int> sec) {
   DPErrcheck(cudaGetLastError());
   DPErrcheck(cudaDeviceSynchronize());
   const int LEN = 256;
@@ -613,24 +613,24 @@ void format_nbor_list_gpu_cuda(int* nlist,
 }
 
 template <typename FPTYPE>
-void prod_env_mat_a_gpu_cuda(FPTYPE* em,
-                             FPTYPE* em_deriv,
-                             FPTYPE* rij,
-                             int* nlist,
-                             const FPTYPE* coord,
-                             const int* type,
-                             const InputNlist& gpu_inlist,
-                             int* array_int,
-                             uint_64* array_longlong,
-                             const int max_nbor_size,
-                             const FPTYPE* avg,
-                             const FPTYPE* std,
-                             const int nloc,
-                             const int nall,
-                             const float rcut,
-                             const float rcut_smth,
-                             const std::vector<int> sec,
-                             const int* f_type) {
+void prod_env_mat_a_gpu(FPTYPE* em,
+                        FPTYPE* em_deriv,
+                        FPTYPE* rij,
+                        int* nlist,
+                        const FPTYPE* coord,
+                        const int* type,
+                        const InputNlist& gpu_inlist,
+                        int* array_int,
+                        uint_64* array_longlong,
+                        const int max_nbor_size,
+                        const FPTYPE* avg,
+                        const FPTYPE* std,
+                        const int nloc,
+                        const int nall,
+                        const float rcut,
+                        const float rcut_smth,
+                        const std::vector<int> sec,
+                        const int* f_type) {
   DPErrcheck(cudaGetLastError());
   DPErrcheck(cudaDeviceSynchronize());
   if (f_type == NULL) {
@@ -643,9 +643,8 @@ void prod_env_mat_a_gpu_cuda(FPTYPE* em,
       cudaMemset(em_deriv, 0, sizeof(FPTYPE) * int_64(nloc) * ndescrpt * 3));
   DPErrcheck(cudaMemset(rij, 0, sizeof(FPTYPE) * int_64(nloc) * nnei * 3));
 
-  format_nbor_list_gpu_cuda(nlist, coord, f_type, gpu_inlist, array_int,
-                            array_longlong, max_nbor_size, nloc, nall, rcut,
-                            sec);
+  format_nbor_list_gpu(nlist, coord, f_type, gpu_inlist, array_int,
+                       array_longlong, max_nbor_size, nloc, nall, rcut, sec);
   nborErrcheck(cudaGetLastError());
   nborErrcheck(cudaDeviceSynchronize());
 
@@ -656,23 +655,23 @@ void prod_env_mat_a_gpu_cuda(FPTYPE* em,
 }
 
 template <typename FPTYPE>
-void prod_env_mat_r_gpu_cuda(FPTYPE* em,
-                             FPTYPE* em_deriv,
-                             FPTYPE* rij,
-                             int* nlist,
-                             const FPTYPE* coord,
-                             const int* type,
-                             const InputNlist& gpu_inlist,
-                             int* array_int,
-                             uint_64* array_longlong,
-                             const int max_nbor_size,
-                             const FPTYPE* avg,
-                             const FPTYPE* std,
-                             const int nloc,
-                             const int nall,
-                             const float rcut,
-                             const float rcut_smth,
-                             const std::vector<int> sec) {
+void prod_env_mat_r_gpu(FPTYPE* em,
+                        FPTYPE* em_deriv,
+                        FPTYPE* rij,
+                        int* nlist,
+                        const FPTYPE* coord,
+                        const int* type,
+                        const InputNlist& gpu_inlist,
+                        int* array_int,
+                        uint_64* array_longlong,
+                        const int max_nbor_size,
+                        const FPTYPE* avg,
+                        const FPTYPE* std,
+                        const int nloc,
+                        const int nall,
+                        const float rcut,
+                        const float rcut_smth,
+                        const std::vector<int> sec) {
   DPErrcheck(cudaGetLastError());
   DPErrcheck(cudaDeviceSynchronize());
   const int nnei = sec.back();
@@ -682,9 +681,8 @@ void prod_env_mat_r_gpu_cuda(FPTYPE* em,
       cudaMemset(em_deriv, 0, sizeof(FPTYPE) * int_64(nloc) * ndescrpt * 3));
   DPErrcheck(cudaMemset(rij, 0, sizeof(FPTYPE) * int_64(nloc) * nnei * 3));
 
-  format_nbor_list_gpu_cuda(nlist, coord, type, gpu_inlist, array_int,
-                            array_longlong, max_nbor_size, nloc, nall, rcut,
-                            sec);
+  format_nbor_list_gpu(nlist, coord, type, gpu_inlist, array_int,
+                       array_longlong, max_nbor_size, nloc, nall, rcut, sec);
   nborErrcheck(cudaGetLastError());
   nborErrcheck(cudaDeviceSynchronize());
 
@@ -695,13 +693,13 @@ void prod_env_mat_r_gpu_cuda(FPTYPE* em,
 }
 
 template <typename FPTYPE>
-void test_encoding_decoding_nbor_info_gpu_cuda(uint_64* key,
-                                               int* out_type,
-                                               int* out_index,
-                                               const int* in_type,
-                                               const FPTYPE* in_dist,
-                                               const int* in_index,
-                                               const int size_of_array) {
+void test_encoding_decoding_nbor_info_gpu(uint_64* key,
+                                          int* out_type,
+                                          int* out_index,
+                                          const int* in_type,
+                                          const FPTYPE* in_dist,
+                                          const int* in_index,
+                                          const int size_of_array) {
   const int nblock = (size_of_array + TPB - 1) / TPB;
   encoding_decoding_nbor_info<<<nblock, TPB>>>(
       key, out_type, out_index, in_type, in_dist, in_index, size_of_array);
@@ -709,116 +707,110 @@ void test_encoding_decoding_nbor_info_gpu_cuda(uint_64* key,
   DPErrcheck(cudaDeviceSynchronize());
 }
 
-template void prod_env_mat_a_gpu_cuda<float>(float* em,
-                                             float* em_deriv,
-                                             float* rij,
-                                             int* nlist,
-                                             const float* coord,
-                                             const int* type,
-                                             const InputNlist& gpu_inlist,
-                                             int* array_int,
-                                             unsigned long long* array_longlong,
-                                             const int max_nbor_size,
-                                             const float* avg,
-                                             const float* std,
-                                             const int nloc,
-                                             const int nall,
-                                             const float rcut,
-                                             const float rcut_smth,
-                                             const std::vector<int> sec,
-                                             const int* f_type);
-template void prod_env_mat_a_gpu_cuda<double>(
-    double* em,
-    double* em_deriv,
-    double* rij,
-    int* nlist,
-    const double* coord,
-    const int* type,
-    const InputNlist& gpu_inlist,
-    int* array_int,
-    unsigned long long* array_longlong,
-    const int max_nbor_size,
-    const double* avg,
-    const double* std,
-    const int nloc,
-    const int nall,
-    const float rcut,
-    const float rcut_smth,
-    const std::vector<int> sec,
-    const int* f_type);
-template void prod_env_mat_r_gpu_cuda<float>(float* em,
-                                             float* em_deriv,
-                                             float* rij,
-                                             int* nlist,
-                                             const float* coord,
-                                             const int* type,
-                                             const InputNlist& gpu_inlist,
-                                             int* array_int,
-                                             unsigned long long* array_longlong,
-                                             const int max_nbor_size,
-                                             const float* avg,
-                                             const float* std,
-                                             const int nloc,
-                                             const int nall,
-                                             const float rcut,
-                                             const float rcut_smth,
-                                             const std::vector<int> sec);
-template void prod_env_mat_r_gpu_cuda<double>(
-    double* em,
-    double* em_deriv,
-    double* rij,
-    int* nlist,
-    const double* coord,
-    const int* type,
-    const InputNlist& gpu_inlist,
-    int* array_int,
-    unsigned long long* array_longlong,
-    const int max_nbor_size,
-    const double* avg,
-    const double* std,
-    const int nloc,
-    const int nall,
-    const float rcut,
-    const float rcut_smth,
-    const std::vector<int> sec);
-template void format_nbor_list_gpu_cuda<float>(
-    int* nlist,
-    const float* coord,
-    const int* type,
-    const deepmd::InputNlist& gpu_inlist,
-    int* array_int,
-    uint_64* array_longlong,
-    const int max_nbor_size,
-    const int nloc,
-    const int nall,
-    const float rcut,
-    const std::vector<int> sec);
-template void format_nbor_list_gpu_cuda<double>(
-    int* nlist,
-    const double* coord,
-    const int* type,
-    const deepmd::InputNlist& gpu_inlist,
-    int* array_int,
-    uint_64* array_longlong,
-    const int max_nbor_size,
-    const int nloc,
-    const int nall,
-    const float rcut,
-    const std::vector<int> sec);
-template void test_encoding_decoding_nbor_info_gpu_cuda(
-    uint_64* key,
-    int* out_type,
-    int* out_index,
-    const int* in_type,
-    const float* in_dist,
-    const int* in_index,
-    const int size_of_array);
-template void test_encoding_decoding_nbor_info_gpu_cuda(
-    uint_64* key,
-    int* out_type,
-    int* out_index,
-    const int* in_type,
-    const double* in_dist,
-    const int* in_index,
-    const int size_of_array);
+template void prod_env_mat_a_gpu<float>(float* em,
+                                        float* em_deriv,
+                                        float* rij,
+                                        int* nlist,
+                                        const float* coord,
+                                        const int* type,
+                                        const InputNlist& gpu_inlist,
+                                        int* array_int,
+                                        unsigned long long* array_longlong,
+                                        const int max_nbor_size,
+                                        const float* avg,
+                                        const float* std,
+                                        const int nloc,
+                                        const int nall,
+                                        const float rcut,
+                                        const float rcut_smth,
+                                        const std::vector<int> sec,
+                                        const int* f_type);
+template void prod_env_mat_a_gpu<double>(double* em,
+                                         double* em_deriv,
+                                         double* rij,
+                                         int* nlist,
+                                         const double* coord,
+                                         const int* type,
+                                         const InputNlist& gpu_inlist,
+                                         int* array_int,
+                                         unsigned long long* array_longlong,
+                                         const int max_nbor_size,
+                                         const double* avg,
+                                         const double* std,
+                                         const int nloc,
+                                         const int nall,
+                                         const float rcut,
+                                         const float rcut_smth,
+                                         const std::vector<int> sec,
+                                         const int* f_type);
+template void prod_env_mat_r_gpu<float>(float* em,
+                                        float* em_deriv,
+                                        float* rij,
+                                        int* nlist,
+                                        const float* coord,
+                                        const int* type,
+                                        const InputNlist& gpu_inlist,
+                                        int* array_int,
+                                        unsigned long long* array_longlong,
+                                        const int max_nbor_size,
+                                        const float* avg,
+                                        const float* std,
+                                        const int nloc,
+                                        const int nall,
+                                        const float rcut,
+                                        const float rcut_smth,
+                                        const std::vector<int> sec);
+template void prod_env_mat_r_gpu<double>(double* em,
+                                         double* em_deriv,
+                                         double* rij,
+                                         int* nlist,
+                                         const double* coord,
+                                         const int* type,
+                                         const InputNlist& gpu_inlist,
+                                         int* array_int,
+                                         unsigned long long* array_longlong,
+                                         const int max_nbor_size,
+                                         const double* avg,
+                                         const double* std,
+                                         const int nloc,
+                                         const int nall,
+                                         const float rcut,
+                                         const float rcut_smth,
+                                         const std::vector<int> sec);
+template void format_nbor_list_gpu<float>(int* nlist,
+                                          const float* coord,
+                                          const int* type,
+                                          const deepmd::InputNlist& gpu_inlist,
+                                          int* array_int,
+                                          uint_64* array_longlong,
+                                          const int max_nbor_size,
+                                          const int nloc,
+                                          const int nall,
+                                          const float rcut,
+                                          const std::vector<int> sec);
+template void format_nbor_list_gpu<double>(int* nlist,
+                                           const double* coord,
+                                           const int* type,
+                                           const deepmd::InputNlist& gpu_inlist,
+                                           int* array_int,
+                                           uint_64* array_longlong,
+                                           const int max_nbor_size,
+                                           const int nloc,
+                                           const int nall,
+                                           const float rcut,
+                                           const std::vector<int> sec);
+template void test_encoding_decoding_nbor_info_gpu(uint_64* key,
+                                                   int* out_type,
+                                                   int* out_index,
+                                                   const int* in_type,
+                                                   const float* in_dist,
+                                                   const int* in_index,
+                                                   const int size_of_array);
+template void test_encoding_decoding_nbor_info_gpu(uint_64* key,
+                                                   int* out_type,
+                                                   int* out_index,
+                                                   const int* in_type,
+                                                   const double* in_dist,
+                                                   const int* in_index,
+                                                   const int size_of_array);
 }  // namespace deepmd
diff --git a/source/lib/src/cuda/prod_force.cu b/source/lib/src/cuda/prod_force.cu
index 04f5b84dcd..d85de26394 100644
--- a/source/lib/src/cuda/prod_force.cu
+++ b/source/lib/src/cuda/prod_force.cu
@@ -102,14 +102,14 @@ __global__ void force_deriv_wrt_neighbors_r(FPTYPE* force,
 
 namespace deepmd {
 template <typename FPTYPE>
-void prod_force_a_gpu_cuda(FPTYPE* force,
-                           const FPTYPE* net_deriv,
-                           const FPTYPE* in_deriv,
-                           const int* nlist,
-                           const int nloc,
-                           const int nall,
-                           const int nnei,
-                           const int nframes) {
+void prod_force_a_gpu(FPTYPE* force,
+                      const FPTYPE* net_deriv,
+                      const FPTYPE* in_deriv,
+                      const int* nlist,
+                      const int nloc,
+                      const int nall,
+                      const int nnei,
+                      const int nframes) {
   DPErrcheck(cudaGetLastError());
   DPErrcheck(cudaDeviceSynchronize());
   const int ndescrpt = nnei * 4;
@@ -131,14 +131,14 @@ void prod_force_a_gpu_cuda(FPTYPE* force,
 }
 
 template <typename FPTYPE>
-void prod_force_r_gpu_cuda(FPTYPE* force,
-                           const FPTYPE* net_deriv,
-                           const FPTYPE* in_deriv,
-                           const int* nlist,
-                           const int nloc,
-                           const int nall,
-                           const int nnei,
-                           const int nframes) {
+void prod_force_r_gpu(FPTYPE* force,
+                      const FPTYPE* net_deriv,
+                      const FPTYPE* in_deriv,
+                      const int* nlist,
+                      const int nloc,
+                      const int nall,
+                      const int nnei,
+                      const int nframes) {
   DPErrcheck(cudaGetLastError());
   DPErrcheck(cudaDeviceSynchronize());
   const int ndescrpt = nnei * 1;
@@ -159,36 +159,36 @@ void prod_force_r_gpu_cuda(FPTYPE* force,
   DPErrcheck(cudaDeviceSynchronize());
 }
 
-template void prod_force_a_gpu_cuda<float>(float* force,
-                                           const float* net_deriv,
-                                           const float* in_deriv,
-                                           const int* nlist,
-                                           const int nloc,
-                                           const int nall,
-                                           const int nnei,
-                                           const int nframes);
-template void prod_force_a_gpu_cuda<double>(double* force,
-                                            const double* net_deriv,
-                                            const double* in_deriv,
-                                            const int* nlist,
-                                            const int nloc,
-                                            const int nall,
-                                            const int nnei,
-                                            const int nframes);
-template void prod_force_r_gpu_cuda<float>(float* force,
-                                           const float* net_deriv,
-                                           const float* in_deriv,
-                                           const int* nlist,
-                                           const int nloc,
-                                           const int nall,
-                                           const int nnei,
-                                           const int nframes);
-template void prod_force_r_gpu_cuda<double>(double* force,
-                                            const double* net_deriv,
-                                            const double* in_deriv,
-                                            const int* nlist,
-                                            const int nloc,
-                                            const int nall,
-                                            const int nnei,
-                                            const int nframes);
+template void prod_force_a_gpu<float>(float* force,
+                                      const float* net_deriv,
+                                      const float* in_deriv,
+                                      const int* nlist,
+                                      const int nloc,
+                                      const int nall,
+                                      const int nnei,
+                                      const int nframes);
+template void prod_force_a_gpu<double>(double* force,
+                                       const double* net_deriv,
+                                       const double* in_deriv,
+                                       const int* nlist,
+                                       const int nloc,
+                                       const int nall,
+                                       const int nnei,
+                                       const int nframes);
+template void prod_force_r_gpu<float>(float* force,
+                                      const float* net_deriv,
+                                      const float* in_deriv,
+                                      const int* nlist,
+                                      const int nloc,
+                                      const int nall,
+                                      const int nnei,
+                                      const int nframes);
+template void prod_force_r_gpu<double>(double* force,
+                                       const double* net_deriv,
+                                       const double* in_deriv,
+                                       const int* nlist,
+                                       const int nloc,
+                                       const int nall,
+                                       const int nnei,
+                                       const int nframes);
 }  // namespace deepmd
diff --git a/source/lib/src/cuda/prod_force_grad.cu b/source/lib/src/cuda/prod_force_grad.cu
index e72ba2ea48..b54676586c 100644
--- a/source/lib/src/cuda/prod_force_grad.cu
+++ b/source/lib/src/cuda/prod_force_grad.cu
@@ -81,13 +81,13 @@ __global__ void force_grad_wrt_neighbors_r(FPTYPE* grad_net,
 
 namespace deepmd {
 template <typename FPTYPE>
-void prod_force_grad_a_gpu_cuda(FPTYPE* grad_net,
-                                const FPTYPE* grad,
-                                const FPTYPE* env_deriv,
-                                const int* nlist,
-                                const int nloc,
-                                const int nnei,
-                                const int nframes) {
+void prod_force_grad_a_gpu(FPTYPE* grad_net,
+                           const FPTYPE* grad,
+                           const FPTYPE* env_deriv,
+                           const int* nlist,
+                           const int nloc,
+                           const int nnei,
+                           const int nframes) {
   DPErrcheck(cudaGetLastError());
   DPErrcheck(cudaDeviceSynchronize());
   const int ndescrpt = nnei * 4;
@@ -112,13 +112,13 @@ void prod_force_grad_a_gpu_cuda(FPTYPE* grad_net,
 }
 
 template <typename FPTYPE>
-void prod_force_grad_r_gpu_cuda(FPTYPE* grad_net,
-                                const FPTYPE* grad,
-                                const FPTYPE* env_deriv,
-                                const int* nlist,
-                                const int nloc,
-                                const int nnei,
-                                const int nframes) {
+void prod_force_grad_r_gpu(FPTYPE* grad_net,
+                           const FPTYPE* grad,
+                           const FPTYPE* env_deriv,
+                           const int* nlist,
+                           const int nloc,
+                           const int nnei,
+                           const int nframes) {
   DPErrcheck(cudaGetLastError());
   DPErrcheck(cudaDeviceSynchronize());
   const int ndescrpt = nnei * 1;
@@ -142,32 +142,32 @@ void prod_force_grad_r_gpu_cuda(FPTYPE* grad_net,
   DPErrcheck(cudaDeviceSynchronize());
 }
 
-template void prod_force_grad_a_gpu_cuda<float>(float* grad_net,
-                                                const float* grad,
-                                                const float* env_deriv,
-                                                const int* nlist,
-                                                const int nloc,
-                                                const int nnei,
-                                                const int nframes);
-template void prod_force_grad_a_gpu_cuda<double>(double* grad_net,
-                                                 const double* grad,
-                                                 const double* env_deriv,
-                                                 const int* nlist,
-                                                 const int nloc,
-                                                 const int nnei,
-                                                 const int nframes);
-template void prod_force_grad_r_gpu_cuda<float>(float* grad_net,
-                                                const float* grad,
-                                                const float* env_deriv,
-                                                const int* nlist,
-                                                const int nloc,
-                                                const int nnei,
-                                                const int nframes);
-template void prod_force_grad_r_gpu_cuda<double>(double* grad_net,
-                                                 const double* grad,
-                                                 const double* env_deriv,
-                                                 const int* nlist,
-                                                 const int nloc,
-                                                 const int nnei,
-                                                 const int nframes);
+template void prod_force_grad_a_gpu<float>(float* grad_net,
+                                           const float* grad,
+                                           const float* env_deriv,
+                                           const int* nlist,
+                                           const int nloc,
+                                           const int nnei,
+                                           const int nframes);
+template void prod_force_grad_a_gpu<double>(double* grad_net,
+                                            const double* grad,
+                                            const double* env_deriv,
+                                            const int* nlist,
+                                            const int nloc,
+                                            const int nnei,
+                                            const int nframes);
+template void prod_force_grad_r_gpu<float>(float* grad_net,
+                                           const float* grad,
+                                           const float* env_deriv,
+                                           const int* nlist,
+                                           const int nloc,
+                                           const int nnei,
+                                           const int nframes);
+template void prod_force_grad_r_gpu<double>(double* grad_net,
+                                            const double* grad,
+                                            const double* env_deriv,
+                                            const int* nlist,
+                                            const int nloc,
+                                            const int nnei,
+                                            const int nframes);
 }  // namespace deepmd
diff --git a/source/lib/src/cuda/prod_virial.cu b/source/lib/src/cuda/prod_virial.cu
index 618f82625d..e96bacf1d3 100644
--- a/source/lib/src/cuda/prod_virial.cu
+++ b/source/lib/src/cuda/prod_virial.cu
@@ -104,15 +104,15 @@ __global__ void virial_deriv_wrt_neighbors_r(FPTYPE* virial,
 
 namespace deepmd {
 template <typename FPTYPE>
-void prod_virial_a_gpu_cuda(FPTYPE* virial,
-                            FPTYPE* atom_virial,
-                            const FPTYPE* net_deriv,
-                            const FPTYPE* in_deriv,
-                            const FPTYPE* rij,
-                            const int* nlist,
-                            const int nloc,
-                            const int nall,
-                            const int nnei) {
+void prod_virial_a_gpu(FPTYPE* virial,
+                       FPTYPE* atom_virial,
+                       const FPTYPE* net_deriv,
+                       const FPTYPE* in_deriv,
+                       const FPTYPE* rij,
+                       const int* nlist,
+                       const int nloc,
+                       const int nall,
+                       const int nnei) {
   DPErrcheck(cudaGetLastError());
   DPErrcheck(cudaDeviceSynchronize());
   DPErrcheck(cudaMemset(virial, 0, sizeof(FPTYPE) * 9));
@@ -134,15 +134,15 @@ void prod_virial_a_gpu_cuda(FPTYPE* virial,
 }
 
 template <typename FPTYPE>
-void prod_virial_r_gpu_cuda(FPTYPE* virial,
-                            FPTYPE* atom_virial,
-                            const FPTYPE* net_deriv,
-                            const FPTYPE* in_deriv,
-                            const FPTYPE* rij,
-                            const int* nlist,
-                            const int nloc,
-                            const int nall,
-                            const int nnei) {
+void prod_virial_r_gpu(FPTYPE* virial,
+                       FPTYPE* atom_virial,
+                       const FPTYPE* net_deriv,
+                       const FPTYPE* in_deriv,
+                       const FPTYPE* rij,
+                       const int* nlist,
+                       const int nloc,
+                       const int nall,
+                       const int nnei) {
   DPErrcheck(cudaGetLastError());
   DPErrcheck(cudaDeviceSynchronize());
   DPErrcheck(cudaMemset(virial, 0, sizeof(FPTYPE) * 9));
@@ -163,40 +163,40 @@ void prod_virial_r_gpu_cuda(FPTYPE* virial,
   DPErrcheck(cudaDeviceSynchronize());
 }
 
-template void prod_virial_a_gpu_cuda<float>(float* virial,
-                                            float* atom_virial,
-                                            const float* net_deriv,
-                                            const float* in_deriv,
-                                            const float* rij,
-                                            const int* nlist,
-                                            const int nloc,
-                                            const int nall,
-                                            const int nnei);
-template void prod_virial_a_gpu_cuda<double>(double* virial,
-                                             double* atom_virial,
-                                             const double* net_deriv,
-                                             const double* in_deriv,
-                                             const double* rij,
-                                             const int* nlist,
-                                             const int nloc,
-                                             const int nall,
-                                             const int nnei);
-template void prod_virial_r_gpu_cuda<float>(float* virial,
-                                            float* atom_virial,
-                                            const float* net_deriv,
-                                            const float* in_deriv,
-                                            const float* rij,
-                                            const int* nlist,
-                                            const int nloc,
-                                            const int nall,
-                                            const int nnei);
-template void prod_virial_r_gpu_cuda<double>(double* virial,
-                                             double* atom_virial,
-                                             const double* net_deriv,
-                                             const double* in_deriv,
-                                             const double* rij,
-                                             const int* nlist,
-                                             const int nloc,
-                                             const int nall,
-                                             const int nnei);
+template void prod_virial_a_gpu<float>(float* virial,
+                                       float* atom_virial,
+                                       const float* net_deriv,
+                                       const float* in_deriv,
+                                       const float* rij,
+                                       const int* nlist,
+                                       const int nloc,
+                                       const int nall,
+                                       const int nnei);
+template void prod_virial_a_gpu<double>(double* virial,
+                                        double* atom_virial,
+                                        const double* net_deriv,
+                                        const double* in_deriv,
+                                        const double* rij,
+                                        const int* nlist,
+                                        const int nloc,
+                                        const int nall,
+                                        const int nnei);
+template void prod_virial_r_gpu<float>(float* virial,
+                                       float* atom_virial,
+                                       const float* net_deriv,
+                                       const float* in_deriv,
+                                       const float* rij,
+                                       const int* nlist,
+                                       const int nloc,
+                                       const int nall,
+                                       const int nnei);
+template void prod_virial_r_gpu<double>(double* virial,
+                                        double* atom_virial,
+                                        const double* net_deriv,
+                                        const double* in_deriv,
+                                        const double* rij,
+                                        const int* nlist,
+                                        const int nloc,
+                                        const int nall,
+                                        const int nnei);
 }  // namespace deepmd
diff --git a/source/lib/src/cuda/prod_virial_grad.cu b/source/lib/src/cuda/prod_virial_grad.cu
index aae7676d3c..047d8ae17f 100644
--- a/source/lib/src/cuda/prod_virial_grad.cu
+++ b/source/lib/src/cuda/prod_virial_grad.cu
@@ -85,13 +85,13 @@ __global__ void virial_grad_wrt_neighbors_r(FPTYPE* grad_net,
 
 namespace deepmd {
 template <typename FPTYPE>
-void prod_virial_grad_a_gpu_cuda(FPTYPE* grad_net,
-                                 const FPTYPE* grad,
-                                 const FPTYPE* env_deriv,
-                                 const FPTYPE* rij,
-                                 const int* nlist,
-                                 const int nloc,
-                                 const int nnei) {
+void prod_virial_grad_a_gpu(FPTYPE* grad_net,
+                            const FPTYPE* grad,
+                            const FPTYPE* env_deriv,
+                            const FPTYPE* rij,
+                            const int* nlist,
+                            const int nloc,
+                            const int nnei) {
   DPErrcheck(cudaGetLastError());
   DPErrcheck(cudaDeviceSynchronize());
   const int ndescrpt = nnei * 4;
@@ -107,13 +107,13 @@ void prod_virial_grad_a_gpu_cuda(FPTYPE* grad_net,
 }
 
 template <typename FPTYPE>
-void prod_virial_grad_r_gpu_cuda(FPTYPE* grad_net,
-                                 const FPTYPE* grad,
-                                 const FPTYPE* env_deriv,
-                                 const FPTYPE* rij,
-                                 const int* nlist,
-                                 const int nloc,
-                                 const int nnei) {
+void prod_virial_grad_r_gpu(FPTYPE* grad_net,
+                            const FPTYPE* grad,
+                            const FPTYPE* env_deriv,
+                            const FPTYPE* rij,
+                            const int* nlist,
+                            const int nloc,
+                            const int nnei) {
   DPErrcheck(cudaGetLastError());
   DPErrcheck(cudaDeviceSynchronize());
   const int ndescrpt = nnei;
@@ -128,32 +128,32 @@ void prod_virial_grad_r_gpu_cuda(FPTYPE* grad_net,
   DPErrcheck(cudaDeviceSynchronize());
 }
 
-template void prod_virial_grad_a_gpu_cuda<float>(float* grad_net,
-                                                 const float* grad,
-                                                 const float* env_deriv,
-                                                 const float* rij,
-                                                 const int* nlist,
-                                                 const int nloc,
-                                                 const int nnei);
-template void prod_virial_grad_a_gpu_cuda<double>(double* grad_net,
-                                                  const double* grad,
-                                                  const double* env_deriv,
-                                                  const double* rij,
-                                                  const int* nlist,
-                                                  const int nloc,
-                                                  const int nnei);
-template void prod_virial_grad_r_gpu_cuda<float>(float* grad_net,
-                                                 const float* grad,
-                                                 const float* env_deriv,
-                                                 const float* rij,
-                                                 const int* nlist,
-                                                 const int nloc,
-                                                 const int nnei);
-template void prod_virial_grad_r_gpu_cuda<double>(double* grad_net,
-                                                  const double* grad,
-                                                  const double* env_deriv,
-                                                  const double* rij,
-                                                  const int* nlist,
-                                                  const int nloc,
-                                                  const int nnei);
+template void prod_virial_grad_a_gpu<float>(float* grad_net,
+                                            const float* grad,
+                                            const float* env_deriv,
+                                            const float* rij,
+                                            const int* nlist,
+                                            const int nloc,
+                                            const int nnei);
+template void prod_virial_grad_a_gpu<double>(double* grad_net,
+                                             const double* grad,
+                                             const double* env_deriv,
+                                             const double* rij,
+                                             const int* nlist,
+                                             const int nloc,
+                                             const int nnei);
+template void prod_virial_grad_r_gpu<float>(float* grad_net,
+                                            const float* grad,
+                                            const float* env_deriv,
+                                            const float* rij,
+                                            const int* nlist,
+                                            const int nloc,
+                                            const int nnei);
+template void prod_virial_grad_r_gpu<double>(double* grad_net,
+                                             const double* grad,
+                                             const double* env_deriv,
+                                             const double* rij,
+                                             const int* nlist,
+                                             const int nloc,
+                                             const int nnei);
 }  // namespace deepmd
diff --git a/source/lib/src/cuda/tabulate.cu b/source/lib/src/cuda/tabulate.cu
index 92f77ed63b..30695a6e05 100644
--- a/source/lib/src/cuda/tabulate.cu
+++ b/source/lib/src/cuda/tabulate.cu
@@ -621,16 +621,16 @@ __global__ void tabulate_fusion_se_r_grad_grad_fifth_order_polynomial(
 
 namespace deepmd {
 template <typename FPTYPE>
-void tabulate_fusion_se_a_gpu_cuda(FPTYPE* out,
-                                   const FPTYPE* table,
-                                   const FPTYPE* table_info,
-                                   const FPTYPE* em_x,
-                                   const FPTYPE* em,
-                                   const FPTYPE* two_embed,
-                                   const int nloc,
-                                   const int nnei,
-                                   const int last_layer_size,
-                                   const bool is_sorted) {
+void tabulate_fusion_se_a_gpu(FPTYPE* out,
+                              const FPTYPE* table,
+                              const FPTYPE* table_info,
+                              const FPTYPE* em_x,
+                              const FPTYPE* em,
+                              const FPTYPE* two_embed,
+                              const int nloc,
+                              const int nnei,
+                              const int last_layer_size,
+                              const bool is_sorted) {
   if (nloc <= 0) {
     return;
   }
@@ -646,18 +646,18 @@ void tabulate_fusion_se_a_gpu_cuda(FPTYPE* out,
 }
 
 template <typename FPTYPE>
-void tabulate_fusion_se_a_grad_gpu_cuda(FPTYPE* dy_dem_x,
-                                        FPTYPE* dy_dem,
-                                        const FPTYPE* table,
-                                        const FPTYPE* table_info,
-                                        const FPTYPE* em_x,
-                                        const FPTYPE* em,
-                                        const FPTYPE* two_embed,
-                                        const FPTYPE* dy,
-                                        const int nloc,
-                                        const int nnei,
-                                        const int last_layer_size,
-                                        const bool is_sorted) {
+void tabulate_fusion_se_a_grad_gpu(FPTYPE* dy_dem_x,
+                                   FPTYPE* dy_dem,
+                                   const FPTYPE* table,
+                                   const FPTYPE* table_info,
+                                   const FPTYPE* em_x,
+                                   const FPTYPE* em,
+                                   const FPTYPE* two_embed,
+                                   const FPTYPE* dy,
+                                   const int nloc,
+                                   const int nnei,
+                                   const int last_layer_size,
+                                   const bool is_sorted) {
   if (nloc <= 0) {
     return;
   }
@@ -676,17 +676,17 @@ void tabulate_fusion_se_a_grad_gpu_cuda(FPTYPE* dy_dem_x,
 }
 
 template <typename FPTYPE>
-void tabulate_fusion_se_a_grad_grad_gpu_cuda(FPTYPE* dz_dy,
-                                             const FPTYPE* table,
-                                             const FPTYPE* table_info,
-                                             const FPTYPE* em_x,
-                                             const FPTYPE* em,
-                                             const FPTYPE* dz_dy_dem_x,
-                                             const FPTYPE* dz_dy_dem,
-                                             const int nloc,
-                                             const int nnei,
-                                             const int last_layer_size,
-                                             const bool is_sorted) {
+void tabulate_fusion_se_a_grad_grad_gpu(FPTYPE* dz_dy,
+                                        const FPTYPE* table,
+                                        const FPTYPE* table_info,
+                                        const FPTYPE* em_x,
+                                        const FPTYPE* em,
+                                        const FPTYPE* dz_dy_dem_x,
+                                        const FPTYPE* dz_dy_dem,
+                                        const int nloc,
+                                        const int nnei,
+                                        const int last_layer_size,
+                                        const bool is_sorted) {
   if (nloc <= 0) {
     return;
   }
@@ -703,15 +703,15 @@ void tabulate_fusion_se_a_grad_grad_gpu_cuda(FPTYPE* dz_dy,
 }
 
 template <typename FPTYPE>
-void tabulate_fusion_se_t_gpu_cuda(FPTYPE* out,
-                                   const FPTYPE* table,
-                                   const FPTYPE* table_info,
-                                   const FPTYPE* em_x,
-                                   const FPTYPE* em,
-                                   const int nloc,
-                                   const int nnei_i,
-                                   const int nnei_j,
-                                   const int last_layer_size) {
+void tabulate_fusion_se_t_gpu(FPTYPE* out,
+                              const FPTYPE* table,
+                              const FPTYPE* table_info,
+                              const FPTYPE* em_x,
+                              const FPTYPE* em,
+                              const int nloc,
+                              const int nnei_i,
+                              const int nnei_j,
+                              const int last_layer_size) {
   if (nloc <= 0) {
     return;
   }
@@ -726,17 +726,17 @@ void tabulate_fusion_se_t_gpu_cuda(FPTYPE* out,
 }
 
 template <typename FPTYPE>
-void tabulate_fusion_se_t_grad_gpu_cuda(FPTYPE* dy_dem_x,
-                                        FPTYPE* dy_dem,
-                                        const FPTYPE* table,
-                                        const FPTYPE* table_info,
-                                        const FPTYPE* em_x,
-                                        const FPTYPE* em,
-                                        const FPTYPE* dy,
-                                        const int nloc,
-                                        const int nnei_i,
-                                        const int nnei_j,
-                                        const int last_layer_size) {
+void tabulate_fusion_se_t_grad_gpu(FPTYPE* dy_dem_x,
+                                   FPTYPE* dy_dem,
+                                   const FPTYPE* table,
+                                   const FPTYPE* table_info,
+                                   const FPTYPE* em_x,
+                                   const FPTYPE* em,
+                                   const FPTYPE* dy,
+                                   const int nloc,
+                                   const int nnei_i,
+                                   const int nnei_j,
+                                   const int last_layer_size) {
   if (nloc <= 0) {
     return;
   }
@@ -755,17 +755,17 @@ void tabulate_fusion_se_t_grad_gpu_cuda(FPTYPE* dy_dem_x,
 }
 
 template <typename FPTYPE>
-void tabulate_fusion_se_t_grad_grad_gpu_cuda(FPTYPE* dz_dy,
-                                             const FPTYPE* table,
-                                             const FPTYPE* table_info,
-                                             const FPTYPE* em_x,
-                                             const FPTYPE* em,
-                                             const FPTYPE* dz_dy_dem_x,
-                                             const FPTYPE* dz_dy_dem,
-                                             const int nloc,
-                                             const int nnei_i,
-                                             const int nnei_j,
-                                             const int last_layer_size) {
+void tabulate_fusion_se_t_grad_grad_gpu(FPTYPE* dz_dy,
+                                        const FPTYPE* table,
+                                        const FPTYPE* table_info,
+                                        const FPTYPE* em_x,
+                                        const FPTYPE* em,
+                                        const FPTYPE* dz_dy_dem_x,
+                                        const FPTYPE* dz_dy_dem,
+                                        const int nloc,
+                                        const int nnei_i,
+                                        const int nnei_j,
+                                        const int last_layer_size) {
   if (nloc <= 0) {
     return;
   }
@@ -783,13 +783,13 @@ void tabulate_fusion_se_t_grad_grad_gpu_cuda(FPTYPE* dz_dy,
 }
 
 template <typename FPTYPE>
-void tabulate_fusion_se_r_gpu_cuda(FPTYPE* out,
-                                   const FPTYPE* table,
-                                   const FPTYPE* table_info,
-                                   const FPTYPE* em,
-                                   const int nloc,
-                                   const int nnei,
-                                   const int last_layer_size) {
+void tabulate_fusion_se_r_gpu(FPTYPE* out,
+                              const FPTYPE* table,
+                              const FPTYPE* table_info,
+                              const FPTYPE* em,
+                              const int nloc,
+                              const int nnei,
+                              const int last_layer_size) {
   if (nloc <= 0) {
     return;
   }
@@ -804,14 +804,14 @@ void tabulate_fusion_se_r_gpu_cuda(FPTYPE* out,
 }
 
 template <typename FPTYPE>
-void tabulate_fusion_se_r_grad_gpu_cuda(FPTYPE* dy_dem,
-                                        const FPTYPE* table,
-                                        const FPTYPE* table_info,
-                                        const FPTYPE* em,
-                                        const FPTYPE* dy,
-                                        const int nloc,
-                                        const int nnei,
-                                        const int last_layer_size) {
+void tabulate_fusion_se_r_grad_gpu(FPTYPE* dy_dem,
+                                   const FPTYPE* table,
+                                   const FPTYPE* table_info,
+                                   const FPTYPE* em,
+                                   const FPTYPE* dy,
+                                   const int nloc,
+                                   const int nnei,
+                                   const int last_layer_size) {
   if (nloc <= 0) {
     return;
   }
@@ -828,14 +828,14 @@ void tabulate_fusion_se_r_grad_gpu_cuda(FPTYPE* dy_dem,
 }
 
 template <typename FPTYPE>
-void tabulate_fusion_se_r_grad_grad_gpu_cuda(FPTYPE* dz_dy,
-                                             const FPTYPE* table,
-                                             const FPTYPE* table_info,
-                                             const FPTYPE* em,
-                                             const FPTYPE* dz_dy_dem,
-                                             const int nloc,
-                                             const int nnei,
-                                             const int last_layer_size) {
+void tabulate_fusion_se_r_grad_grad_gpu(FPTYPE* dz_dy,
+                                        const FPTYPE* table,
+                                        const FPTYPE* table_info,
+                                        const FPTYPE* em,
+                                        const FPTYPE* dz_dy_dem,
+                                        const int nloc,
+                                        const int nnei,
+                                        const int last_layer_size) {
   if (nloc <= 0) {
     return;
   }
@@ -851,53 +851,51 @@ void tabulate_fusion_se_r_grad_grad_gpu_cuda(FPTYPE* dz_dy,
   DPErrcheck(cudaDeviceSynchronize());
 }
 
-template void tabulate_fusion_se_a_gpu_cuda<float>(float* out,
+template void tabulate_fusion_se_a_gpu<float>(float* out,
+                                              const float* table,
+                                              const float* table_info,
+                                              const float* em_x,
+                                              const float* em,
+                                              const float* two_embed,
+                                              const int nloc,
+                                              const int nnei,
+                                              const int last_layer_size,
+                                              const bool is_sorted);
+template void tabulate_fusion_se_a_gpu<double>(double* out,
+                                               const double* table,
+                                               const double* table_info,
+                                               const double* em_x,
+                                               const double* em,
+                                               const double* two_embed,
+                                               const int nloc,
+                                               const int nnei,
+                                               const int last_layer_size,
+                                               const bool is_sorted);
+template void tabulate_fusion_se_a_grad_gpu<float>(float* dy_dem_x,
+                                                   float* dy_dem,
                                                    const float* table,
                                                    const float* table_info,
                                                    const float* em_x,
                                                    const float* em,
                                                    const float* two_embed,
+                                                   const float* dy,
                                                    const int nloc,
                                                    const int nnei,
                                                    const int last_layer_size,
                                                    const bool is_sorted);
-template void tabulate_fusion_se_a_gpu_cuda<double>(double* out,
+template void tabulate_fusion_se_a_grad_gpu<double>(double* dy_dem_x,
+                                                    double* dy_dem,
                                                     const double* table,
                                                     const double* table_info,
                                                     const double* em_x,
                                                     const double* em,
                                                     const double* two_embed,
+                                                    const double* dy,
                                                     const int nloc,
                                                     const int nnei,
                                                     const int last_layer_size,
                                                     const bool is_sorted);
-template void tabulate_fusion_se_a_grad_gpu_cuda<float>(
-    float* dy_dem_x,
-    float* dy_dem,
-    const float* table,
-    const float* table_info,
-    const float* em_x,
-    const float* em,
-    const float* two_embed,
-    const float* dy,
-    const int nloc,
-    const int nnei,
-    const int last_layer_size,
-    const bool is_sorted);
-template void tabulate_fusion_se_a_grad_gpu_cuda<double>(
-    double* dy_dem_x,
-    double* dy_dem,
-    const double* table,
-    const double* table_info,
-    const double* em_x,
-    const double* em,
-    const double* two_embed,
-    const double* dy,
-    const int nloc,
-    const int nnei,
-    const int last_layer_size,
-    const bool is_sorted);
-template void tabulate_fusion_se_a_grad_grad_gpu_cuda<float>(
+template void tabulate_fusion_se_a_grad_grad_gpu<float>(
     float* dz_dy,
     const float* table,
     const float* table_info,
@@ -909,7 +907,7 @@ template void tabulate_fusion_se_a_grad_grad_gpu_cuda<float>(
     const int nnei,
     const int last_layer_size,
     const bool is_sorted);
-template void tabulate_fusion_se_a_grad_grad_gpu_cuda<double>(
+template void tabulate_fusion_se_a_grad_grad_gpu<double>(
     double* dz_dy,
     const double* table,
     const double* table_info,
@@ -922,49 +920,47 @@ template void tabulate_fusion_se_a_grad_grad_gpu_cuda<double>(
     const int last_layer_size,
     const bool is_sorted);
 
-template void tabulate_fusion_se_t_gpu_cuda<float>(float* out,
+template void tabulate_fusion_se_t_gpu<float>(float* out,
+                                              const float* table,
+                                              const float* table_info,
+                                              const float* em_x,
+                                              const float* em,
+                                              const int nloc,
+                                              const int nnei_i,
+                                              const int nnei_j,
+                                              const int last_layer_size);
+template void tabulate_fusion_se_t_gpu<double>(double* out,
+                                               const double* table,
+                                               const double* table_info,
+                                               const double* em_x,
+                                               const double* em,
+                                               const int nloc,
+                                               const int nnei_i,
+                                               const int nnei_j,
+                                               const int last_layer_size);
+template void tabulate_fusion_se_t_grad_gpu<float>(float* dy_dem_x,
+                                                   float* dy_dem,
                                                    const float* table,
                                                    const float* table_info,
                                                    const float* em_x,
                                                    const float* em,
+                                                   const float* dy,
                                                    const int nloc,
                                                    const int nnei_i,
                                                    const int nnei_j,
                                                    const int last_layer_size);
-template void tabulate_fusion_se_t_gpu_cuda<double>(double* out,
+template void tabulate_fusion_se_t_grad_gpu<double>(double* dy_dem_x,
+                                                    double* dy_dem,
                                                     const double* table,
                                                     const double* table_info,
                                                     const double* em_x,
                                                     const double* em,
+                                                    const double* dy,
                                                     const int nloc,
                                                     const int nnei_i,
                                                     const int nnei_j,
                                                     const int last_layer_size);
-template void tabulate_fusion_se_t_grad_gpu_cuda<float>(
-    float* dy_dem_x,
-    float* dy_dem,
-    const float* table,
-    const float* table_info,
-    const float* em_x,
-    const float* em,
-    const float* dy,
-    const int nloc,
-    const int nnei_i,
-    const int nnei_j,
-    const int last_layer_size);
-template void tabulate_fusion_se_t_grad_gpu_cuda<double>(
-    double* dy_dem_x,
-    double* dy_dem,
-    const double* table,
-    const double* table_info,
-    const double* em_x,
-    const double* em,
-    const double* dy,
-    const int nloc,
-    const int nnei_i,
-    const int nnei_j,
-    const int last_layer_size);
-template void tabulate_fusion_se_t_grad_grad_gpu_cuda<float>(
+template void tabulate_fusion_se_t_grad_grad_gpu<float>(
     float* dz_dy,
     const float* table,
     const float* table_info,
@@ -976,7 +972,7 @@ template void tabulate_fusion_se_t_grad_grad_gpu_cuda<float>(
     const int nnei_i,
     const int nnei_j,
     const int last_layer_size);
-template void tabulate_fusion_se_t_grad_grad_gpu_cuda<double>(
+template void tabulate_fusion_se_t_grad_grad_gpu<double>(
     double* dz_dy,
     const double* table,
     const double* table_info,
@@ -989,39 +985,37 @@ template void tabulate_fusion_se_t_grad_grad_gpu_cuda<double>(
     const int nnei_j,
     const int last_layer_size);
 
-template void tabulate_fusion_se_r_gpu_cuda<float>(float* out,
+template void tabulate_fusion_se_r_gpu<float>(float* out,
+                                              const float* table,
+                                              const float* table_info,
+                                              const float* em,
+                                              const int nloc,
+                                              const int nnei,
+                                              const int last_layer_size);
+template void tabulate_fusion_se_r_gpu<double>(double* out,
+                                               const double* table,
+                                               const double* table_info,
+                                               const double* em,
+                                               const int nloc,
+                                               const int nnei,
+                                               const int last_layer_size);
+template void tabulate_fusion_se_r_grad_gpu<float>(float* dy_dem,
                                                    const float* table,
                                                    const float* table_info,
                                                    const float* em,
+                                                   const float* dy,
                                                    const int nloc,
                                                    const int nnei,
                                                    const int last_layer_size);
-template void tabulate_fusion_se_r_gpu_cuda<double>(double* out,
+template void tabulate_fusion_se_r_grad_gpu<double>(double* dy_dem,
                                                     const double* table,
                                                     const double* table_info,
                                                     const double* em,
+                                                    const double* dy,
                                                     const int nloc,
                                                     const int nnei,
                                                     const int last_layer_size);
-template void tabulate_fusion_se_r_grad_gpu_cuda<float>(
-    float* dy_dem,
-    const float* table,
-    const float* table_info,
-    const float* em,
-    const float* dy,
-    const int nloc,
-    const int nnei,
-    const int last_layer_size);
-template void tabulate_fusion_se_r_grad_gpu_cuda<double>(
-    double* dy_dem,
-    const double* table,
-    const double* table_info,
-    const double* em,
-    const double* dy,
-    const int nloc,
-    const int nnei,
-    const int last_layer_size);
-template void tabulate_fusion_se_r_grad_grad_gpu_cuda<float>(
+template void tabulate_fusion_se_r_grad_grad_gpu<float>(
     float* dz_dy,
     const float* table,
     const float* table_info,
@@ -1030,7 +1024,7 @@ template void tabulate_fusion_se_r_grad_grad_gpu_cuda<float>(
     const int nloc,
     const int nnei,
     const int last_layer_size);
-template void tabulate_fusion_se_r_grad_grad_gpu_cuda<double>(
+template void tabulate_fusion_se_r_grad_grad_gpu<double>(
     double* dz_dy,
     const double* table,
     const double* table_info,
diff --git a/source/lib/src/rocm/coord.hip.cu b/source/lib/src/rocm/coord.hip.cu
index 198ef87311..5416022575 100644
--- a/source/lib/src/rocm/coord.hip.cu
+++ b/source/lib/src/rocm/coord.hip.cu
@@ -335,9 +335,9 @@ void copy_coord(FPTYPE *out_c,
 
 namespace deepmd {
 template <typename FPTYPE>
-void normalize_coord_gpu_rocm(FPTYPE *coord,
-                              const int natom,
-                              const Region<FPTYPE> &region) {
+void normalize_coord_gpu(FPTYPE *coord,
+                         const int natom,
+                         const Region<FPTYPE> &region) {
   const FPTYPE *boxt = region.boxt;
   const FPTYPE *rec_boxt = region.rec_boxt;
   const int nblock = (natom + TPB - 1) / TPB;
@@ -348,19 +348,19 @@ void normalize_coord_gpu_rocm(FPTYPE *coord,
 }
 
 template <typename FPTYPE>
-int copy_coord_gpu_rocm(FPTYPE *out_c,
-                        int *out_t,
-                        int *mapping,
-                        int *nall,
-                        int *int_data,
-                        const FPTYPE *in_c,
-                        const int *in_t,
-                        const int &nloc,
-                        const int &mem_nall,
-                        const int &loc_cellnum,
-                        const int &total_cellnum,
-                        const int *cell_info,
-                        const Region<FPTYPE> &region) {
+int copy_coord_gpu(FPTYPE *out_c,
+                   int *out_t,
+                   int *mapping,
+                   int *nall,
+                   int *int_data,
+                   const FPTYPE *in_c,
+                   const int *in_t,
+                   const int &nloc,
+                   const int &mem_nall,
+                   const int &loc_cellnum,
+                   const int &total_cellnum,
+                   const int *cell_info,
+                   const Region<FPTYPE> &region) {
   compute_int_data(int_data, in_c, cell_info, region, nloc, loc_cellnum,
                    total_cellnum);
   int *int_data_cpu = new int
@@ -409,36 +409,36 @@ int copy_coord_gpu_rocm(FPTYPE *out_c,
   return 0;
 }
 
-template void normalize_coord_gpu_rocm<float>(float *coord,
-                                              const int natom,
-                                              const Region<float> &region);
-template void normalize_coord_gpu_rocm<double>(double *coord,
-                                               const int natom,
-                                               const Region<double> &region);
-template int copy_coord_gpu_rocm<float>(float *out_c,
-                                        int *out_t,
-                                        int *mapping,
-                                        int *nall,
-                                        int *int_data,
-                                        const float *in_c,
-                                        const int *in_t,
-                                        const int &nloc,
-                                        const int &mem_nall,
-                                        const int &loc_cellnum,
-                                        const int &total_cellnum,
-                                        const int *cell_info,
-                                        const Region<float> &region);
-template int copy_coord_gpu_rocm<double>(double *out_c,
-                                         int *out_t,
-                                         int *mapping,
-                                         int *nall,
-                                         int *int_data,
-                                         const double *in_c,
-                                         const int *in_t,
-                                         const int &nloc,
-                                         const int &mem_nall,
-                                         const int &loc_cellnum,
-                                         const int &total_cellnum,
-                                         const int *cell_info,
-                                         const Region<double> &region);
+template void normalize_coord_gpu<float>(float *coord,
+                                         const int natom,
+                                         const Region<float> &region);
+template void normalize_coord_gpu<double>(double *coord,
+                                          const int natom,
+                                          const Region<double> &region);
+template int copy_coord_gpu<float>(float *out_c,
+                                   int *out_t,
+                                   int *mapping,
+                                   int *nall,
+                                   int *int_data,
+                                   const float *in_c,
+                                   const int *in_t,
+                                   const int &nloc,
+                                   const int &mem_nall,
+                                   const int &loc_cellnum,
+                                   const int &total_cellnum,
+                                   const int *cell_info,
+                                   const Region<float> &region);
+template int copy_coord_gpu<double>(double *out_c,
+                                    int *out_t,
+                                    int *mapping,
+                                    int *nall,
+                                    int *int_data,
+                                    const double *in_c,
+                                    const int *in_t,
+                                    const int &nloc,
+                                    const int &mem_nall,
+                                    const int &loc_cellnum,
+                                    const int &total_cellnum,
+                                    const int *cell_info,
+                                    const Region<double> &region);
 }  // namespace deepmd
diff --git a/source/lib/src/rocm/gelu.hip.cu b/source/lib/src/rocm/gelu.hip.cu
index 7dfcb45870..76657eea52 100644
--- a/source/lib/src/rocm/gelu.hip.cu
+++ b/source/lib/src/rocm/gelu.hip.cu
@@ -64,7 +64,7 @@ __global__ void gelu_grad_grad(FPTYPE* out,
 
 namespace deepmd {
 template <typename FPTYPE>
-void gelu_gpu_rocm(FPTYPE* out, const FPTYPE* xx, const int_64 size) {
+void gelu_gpu(FPTYPE* out, const FPTYPE* xx, const int_64 size) {
   if (size <= 0) {
     return;
   }
@@ -77,10 +77,10 @@ void gelu_gpu_rocm(FPTYPE* out, const FPTYPE* xx, const int_64 size) {
 }
 
 template <typename FPTYPE>
-void gelu_grad_gpu_rocm(FPTYPE* out,
-                        const FPTYPE* xx,
-                        const FPTYPE* dy,
-                        const int_64 size) {
+void gelu_grad_gpu(FPTYPE* out,
+                   const FPTYPE* xx,
+                   const FPTYPE* dy,
+                   const int_64 size) {
   if (size <= 0) {
     return;
   }
@@ -94,11 +94,11 @@ void gelu_grad_gpu_rocm(FPTYPE* out,
 }
 
 template <typename FPTYPE>
-void gelu_grad_grad_gpu_rocm(FPTYPE* out,
-                             const FPTYPE* xx,
-                             const FPTYPE* dy,
-                             const FPTYPE* dy_2,
-                             const int_64 size) {
+void gelu_grad_grad_gpu(FPTYPE* out,
+                        const FPTYPE* xx,
+                        const FPTYPE* dy,
+                        const FPTYPE* dy_2,
+                        const int_64 size) {
   if (size <= 0) {
     return;
   }
@@ -111,28 +111,24 @@ void gelu_grad_grad_gpu_rocm(FPTYPE* out,
   DPErrcheck(hipDeviceSynchronize());
 }
 
-template void gelu_gpu_rocm<float>(float* out,
+template void gelu_gpu<float>(float* out, const float* x, const int_64 size);
+template void gelu_gpu<double>(double* out, const double* x, const int_64 size);
+template void gelu_grad_gpu<float>(float* out,
                                    const float* x,
+                                   const float* dy,
                                    const int_64 size);
-template void gelu_gpu_rocm<double>(double* out,
+template void gelu_grad_gpu<double>(double* out,
                                     const double* x,
+                                    const double* dy,
                                     const int_64 size);
-template void gelu_grad_gpu_rocm<float>(float* out,
+template void gelu_grad_grad_gpu<float>(float* out,
                                         const float* x,
                                         const float* dy,
+                                        const float* dy_2,
                                         const int_64 size);
-template void gelu_grad_gpu_rocm<double>(double* out,
+template void gelu_grad_grad_gpu<double>(double* out,
                                          const double* x,
                                          const double* dy,
+                                         const double* dy_2,
                                          const int_64 size);
-template void gelu_grad_grad_gpu_rocm<float>(float* out,
-                                             const float* x,
-                                             const float* dy,
-                                             const float* dy_2,
-                                             const int_64 size);
-template void gelu_grad_grad_gpu_rocm<double>(double* out,
-                                              const double* x,
-                                              const double* dy,
-                                              const double* dy_2,
-                                              const int_64 size);
 }  // namespace deepmd
diff --git a/source/lib/src/rocm/neighbor_list.hip.cu b/source/lib/src/rocm/neighbor_list.hip.cu
index 34043233ab..736f2f9e9a 100644
--- a/source/lib/src/rocm/neighbor_list.hip.cu
+++ b/source/lib/src/rocm/neighbor_list.hip.cu
@@ -175,14 +175,14 @@ __global__ void map_nei_info_noconvert(int *nlist,
 
 namespace deepmd {
 template <typename FPTYPE>
-int build_nlist_gpu_rocm(InputNlist &nlist,
-                         int *max_list_size,
-                         int *nlist_data,
-                         const FPTYPE *c_cpy,
-                         const int &nloc,
-                         const int &nall,
-                         const int &mem_size,
-                         const float &rcut) {
+int build_nlist_gpu(InputNlist &nlist,
+                    int *max_list_size,
+                    int *nlist_data,
+                    const FPTYPE *c_cpy,
+                    const int &nloc,
+                    const int &nall,
+                    const int &mem_size,
+                    const float &rcut) {
   if (mem_size < nall) {
     return 1;
   }
@@ -237,15 +237,15 @@ void use_nlist_map(int *nlist,
   DPErrcheck(hipDeviceSynchronize());
 }
 
-void use_nei_info_gpu_rocm(int *nlist,
-                           int *ntype,
-                           bool *nmask,
-                           const int *type,
-                           const int *nlist_map,
-                           const int nloc,
-                           const int nnei,
-                           const int ntypes,
-                           const bool b_nlist_map) {
+void use_nei_info_gpu(int *nlist,
+                      int *ntype,
+                      bool *nmask,
+                      const int *type,
+                      const int *nlist_map,
+                      const int nloc,
+                      const int nnei,
+                      const int ntypes,
+                      const bool b_nlist_map) {
   int nblock = (nnei + TPB - 1) / TPB;
   dim3 block_grid(nloc, nblock);
   dim3 thread_grid(1, TPB);
@@ -262,22 +262,22 @@ void use_nei_info_gpu_rocm(int *nlist,
   DPErrcheck(hipDeviceSynchronize());
 }
 
-template int build_nlist_gpu_rocm<float>(InputNlist &nlist,
-                                         int *max_list_size,
-                                         int *nlist_data,
-                                         const float *c_cpy,
-                                         const int &nloc,
-                                         const int &nall,
-                                         const int &mem_size,
-                                         const float &rcut);
-template int build_nlist_gpu_rocm<double>(InputNlist &nlist,
-                                          int *max_list_size,
-                                          int *nlist_data,
-                                          const double *c_cpy,
-                                          const int &nloc,
-                                          const int &nall,
-                                          const int &mem_size,
-                                          const float &rcut);
+template int build_nlist_gpu<float>(InputNlist &nlist,
+                                    int *max_list_size,
+                                    int *nlist_data,
+                                    const float *c_cpy,
+                                    const int &nloc,
+                                    const int &nall,
+                                    const int &mem_size,
+                                    const float &rcut);
+template int build_nlist_gpu<double>(InputNlist &nlist,
+                                     int *max_list_size,
+                                     int *nlist_data,
+                                     const double *c_cpy,
+                                     const int &nloc,
+                                     const int &nall,
+                                     const int &mem_size,
+                                     const float &rcut);
 __global__ void map_filter_ftype(int *ftype_out,
                                  const int *ftype_in,
                                  const int nloc) {
@@ -287,9 +287,7 @@ __global__ void map_filter_ftype(int *ftype_out,
   }
 }
 
-void filter_ftype_gpu_rocm(int *ftype_out,
-                           const int *ftype_in,
-                           const int nloc) {
+void filter_ftype_gpu(int *ftype_out, const int *ftype_in, const int nloc) {
   int nblock = (nloc + TPB - 1) / TPB;
   map_filter_ftype<<<nblock, TPB>>>(ftype_out, ftype_in, nloc);
   DPErrcheck(hipGetLastError());
diff --git a/source/lib/src/rocm/prod_env_mat.hip.cu b/source/lib/src/rocm/prod_env_mat.hip.cu
index c2bfbd3cac..23e8ce1d0e 100644
--- a/source/lib/src/rocm/prod_env_mat.hip.cu
+++ b/source/lib/src/rocm/prod_env_mat.hip.cu
@@ -565,17 +565,17 @@ __global__ void compute_env_mat_r(FPTYPE* em,
 
 namespace deepmd {
 template <typename FPTYPE>
-void format_nbor_list_gpu_rocm(int* nlist,
-                               const FPTYPE* coord,
-                               const int* type,
-                               const deepmd::InputNlist& gpu_inlist,
-                               int* array_int,
-                               uint_64* array_longlong,
-                               const int max_nbor_size,
-                               const int nloc,
-                               const int nall,
-                               const float rcut,
-                               const std::vector<int> sec) {
+void format_nbor_list_gpu(int* nlist,
+                          const FPTYPE* coord,
+                          const int* type,
+                          const deepmd::InputNlist& gpu_inlist,
+                          int* array_int,
+                          uint_64* array_longlong,
+                          const int max_nbor_size,
+                          const int nloc,
+                          const int nall,
+                          const float rcut,
+                          const std::vector<int> sec) {
   const int LEN = 256;
   const int nnei = sec.back();
   const int nblock = (nloc + LEN - 1) / LEN;
@@ -619,24 +619,24 @@ void format_nbor_list_gpu_rocm(int* nlist,
 }
 
 template <typename FPTYPE>
-void prod_env_mat_a_gpu_rocm(FPTYPE* em,
-                             FPTYPE* em_deriv,
-                             FPTYPE* rij,
-                             int* nlist,
-                             const FPTYPE* coord,
-                             const int* type,
-                             const InputNlist& gpu_inlist,
-                             int* array_int,
-                             uint_64* array_longlong,
-                             const int max_nbor_size,
-                             const FPTYPE* avg,
-                             const FPTYPE* std,
-                             const int nloc,
-                             const int nall,
-                             const float rcut,
-                             const float rcut_smth,
-                             const std::vector<int> sec,
-                             const int* f_type) {
+void prod_env_mat_a_gpu(FPTYPE* em,
+                        FPTYPE* em_deriv,
+                        FPTYPE* rij,
+                        int* nlist,
+                        const FPTYPE* coord,
+                        const int* type,
+                        const InputNlist& gpu_inlist,
+                        int* array_int,
+                        uint_64* array_longlong,
+                        const int max_nbor_size,
+                        const FPTYPE* avg,
+                        const FPTYPE* std,
+                        const int nloc,
+                        const int nall,
+                        const float rcut,
+                        const float rcut_smth,
+                        const std::vector<int> sec,
+                        const int* f_type) {
   if (f_type == NULL) {
     f_type = type;
   }
@@ -647,9 +647,8 @@ void prod_env_mat_a_gpu_rocm(FPTYPE* em,
       hipMemset(em_deriv, 0, sizeof(FPTYPE) * int_64(nloc) * ndescrpt * 3));
   DPErrcheck(hipMemset(rij, 0, sizeof(FPTYPE) * int_64(nloc) * nnei * 3));
 
-  format_nbor_list_gpu_rocm(nlist, coord, f_type, gpu_inlist, array_int,
-                            array_longlong, max_nbor_size, nloc, nall, rcut,
-                            sec);
+  format_nbor_list_gpu(nlist, coord, f_type, gpu_inlist, array_int,
+                       array_longlong, max_nbor_size, nloc, nall, rcut, sec);
   nborErrcheck(hipGetLastError());
   nborErrcheck(hipDeviceSynchronize());
 
@@ -661,23 +660,23 @@ void prod_env_mat_a_gpu_rocm(FPTYPE* em,
 }
 
 template <typename FPTYPE>
-void prod_env_mat_r_gpu_rocm(FPTYPE* em,
-                             FPTYPE* em_deriv,
-                             FPTYPE* rij,
-                             int* nlist,
-                             const FPTYPE* coord,
-                             const int* type,
-                             const InputNlist& gpu_inlist,
-                             int* array_int,
-                             uint_64* array_longlong,
-                             const int max_nbor_size,
-                             const FPTYPE* avg,
-                             const FPTYPE* std,
-                             const int nloc,
-                             const int nall,
-                             const float rcut,
-                             const float rcut_smth,
-                             const std::vector<int> sec) {
+void prod_env_mat_r_gpu(FPTYPE* em,
+                        FPTYPE* em_deriv,
+                        FPTYPE* rij,
+                        int* nlist,
+                        const FPTYPE* coord,
+                        const int* type,
+                        const InputNlist& gpu_inlist,
+                        int* array_int,
+                        uint_64* array_longlong,
+                        const int max_nbor_size,
+                        const FPTYPE* avg,
+                        const FPTYPE* std,
+                        const int nloc,
+                        const int nall,
+                        const float rcut,
+                        const float rcut_smth,
+                        const std::vector<int> sec) {
   const int nnei = sec.back();
   const int ndescrpt = nnei * 1;
   DPErrcheck(hipMemset(em, 0, sizeof(FPTYPE) * int_64(nloc) * ndescrpt));
@@ -685,9 +684,8 @@ void prod_env_mat_r_gpu_rocm(FPTYPE* em,
       hipMemset(em_deriv, 0, sizeof(FPTYPE) * int_64(nloc) * ndescrpt * 3));
   DPErrcheck(hipMemset(rij, 0, sizeof(FPTYPE) * int_64(nloc) * nnei * 3));
 
-  format_nbor_list_gpu_rocm(nlist, coord, type, gpu_inlist, array_int,
-                            array_longlong, max_nbor_size, nloc, nall, rcut,
-                            sec);
+  format_nbor_list_gpu(nlist, coord, type, gpu_inlist, array_int,
+                       array_longlong, max_nbor_size, nloc, nall, rcut, sec);
   nborErrcheck(hipGetLastError());
   nborErrcheck(hipDeviceSynchronize());
 
@@ -699,13 +697,13 @@ void prod_env_mat_r_gpu_rocm(FPTYPE* em,
 }
 
 template <typename FPTYPE>
-void test_encoding_decoding_nbor_info_gpu_rocm(uint_64* key,
-                                               int* out_type,
-                                               int* out_index,
-                                               const int* in_type,
-                                               const FPTYPE* in_dist,
-                                               const int* in_index,
-                                               const int size_of_array) {
+void test_encoding_decoding_nbor_info_gpu(uint_64* key,
+                                          int* out_type,
+                                          int* out_index,
+                                          const int* in_type,
+                                          const FPTYPE* in_dist,
+                                          const int* in_index,
+                                          const int size_of_array) {
   const int nblock = (size_of_array + TPB - 1) / TPB;
   hipLaunchKernelGGL(encoding_decoding_nbor_info, nblock, TPB, 0, 0, key,
                      out_type, out_index, in_type, in_dist, in_index,
@@ -714,116 +712,110 @@ void test_encoding_decoding_nbor_info_gpu_rocm(uint_64* key,
   DPErrcheck(hipDeviceSynchronize());
 }
 
-template void prod_env_mat_a_gpu_rocm<float>(float* em,
-                                             float* em_deriv,
-                                             float* rij,
-                                             int* nlist,
-                                             const float* coord,
-                                             const int* type,
-                                             const InputNlist& gpu_inlist,
-                                             int* array_int,
-                                             unsigned long long* array_longlong,
-                                             const int max_nbor_size,
-                                             const float* avg,
-                                             const float* std,
-                                             const int nloc,
-                                             const int nall,
-                                             const float rcut,
-                                             const float rcut_smth,
-                                             const std::vector<int> sec,
-                                             const int* f_type);
-template void prod_env_mat_a_gpu_rocm<double>(
-    double* em,
-    double* em_deriv,
-    double* rij,
-    int* nlist,
-    const double* coord,
-    const int* type,
-    const InputNlist& gpu_inlist,
-    int* array_int,
-    unsigned long long* array_longlong,
-    const int max_nbor_size,
-    const double* avg,
-    const double* std,
-    const int nloc,
-    const int nall,
-    const float rcut,
-    const float rcut_smth,
-    const std::vector<int> sec,
-    const int* f_type);
-template void prod_env_mat_r_gpu_rocm<float>(float* em,
-                                             float* em_deriv,
-                                             float* rij,
-                                             int* nlist,
-                                             const float* coord,
-                                             const int* type,
-                                             const InputNlist& gpu_inlist,
-                                             int* array_int,
-                                             unsigned long long* array_longlong,
-                                             const int max_nbor_size,
-                                             const float* avg,
-                                             const float* std,
-                                             const int nloc,
-                                             const int nall,
-                                             const float rcut,
-                                             const float rcut_smth,
-                                             const std::vector<int> sec);
-template void prod_env_mat_r_gpu_rocm<double>(
-    double* em,
-    double* em_deriv,
-    double* rij,
-    int* nlist,
-    const double* coord,
-    const int* type,
-    const InputNlist& gpu_inlist,
-    int* array_int,
-    unsigned long long* array_longlong,
-    const int max_nbor_size,
-    const double* avg,
-    const double* std,
-    const int nloc,
-    const int nall,
-    const float rcut,
-    const float rcut_smth,
-    const std::vector<int> sec);
-template void format_nbor_list_gpu_rocm<float>(
-    int* nlist,
-    const float* coord,
-    const int* type,
-    const deepmd::InputNlist& gpu_inlist,
-    int* array_int,
-    uint_64* array_longlong,
-    const int max_nbor_size,
-    const int nloc,
-    const int nall,
-    const float rcut,
-    const std::vector<int> sec);
-template void format_nbor_list_gpu_rocm<double>(
-    int* nlist,
-    const double* coord,
-    const int* type,
-    const deepmd::InputNlist& gpu_inlist,
-    int* array_int,
-    uint_64* array_longlong,
-    const int max_nbor_size,
-    const int nloc,
-    const int nall,
-    const float rcut,
-    const std::vector<int> sec);
-template void test_encoding_decoding_nbor_info_gpu_rocm(
-    uint_64* key,
-    int* out_type,
-    int* out_index,
-    const int* in_type,
-    const float* in_dist,
-    const int* in_index,
-    const int size_of_array);
-template void test_encoding_decoding_nbor_info_gpu_rocm(
-    uint_64* key,
-    int* out_type,
-    int* out_index,
-    const int* in_type,
-    const double* in_dist,
-    const int* in_index,
-    const int size_of_array);
+template void prod_env_mat_a_gpu<float>(float* em,
+                                        float* em_deriv,
+                                        float* rij,
+                                        int* nlist,
+                                        const float* coord,
+                                        const int* type,
+                                        const InputNlist& gpu_inlist,
+                                        int* array_int,
+                                        unsigned long long* array_longlong,
+                                        const int max_nbor_size,
+                                        const float* avg,
+                                        const float* std,
+                                        const int nloc,
+                                        const int nall,
+                                        const float rcut,
+                                        const float rcut_smth,
+                                        const std::vector<int> sec,
+                                        const int* f_type);
+template void prod_env_mat_a_gpu<double>(double* em,
+                                         double* em_deriv,
+                                         double* rij,
+                                         int* nlist,
+                                         const double* coord,
+                                         const int* type,
+                                         const InputNlist& gpu_inlist,
+                                         int* array_int,
+                                         unsigned long long* array_longlong,
+                                         const int max_nbor_size,
+                                         const double* avg,
+                                         const double* std,
+                                         const int nloc,
+                                         const int nall,
+                                         const float rcut,
+                                         const float rcut_smth,
+                                         const std::vector<int> sec,
+                                         const int* f_type);
+template void prod_env_mat_r_gpu<float>(float* em,
+                                        float* em_deriv,
+                                        float* rij,
+                                        int* nlist,
+                                        const float* coord,
+                                        const int* type,
+                                        const InputNlist& gpu_inlist,
+                                        int* array_int,
+                                        unsigned long long* array_longlong,
+                                        const int max_nbor_size,
+                                        const float* avg,
+                                        const float* std,
+                                        const int nloc,
+                                        const int nall,
+                                        const float rcut,
+                                        const float rcut_smth,
+                                        const std::vector<int> sec);
+template void prod_env_mat_r_gpu<double>(double* em,
+                                         double* em_deriv,
+                                         double* rij,
+                                         int* nlist,
+                                         const double* coord,
+                                         const int* type,
+                                         const InputNlist& gpu_inlist,
+                                         int* array_int,
+                                         unsigned long long* array_longlong,
+                                         const int max_nbor_size,
+                                         const double* avg,
+                                         const double* std,
+                                         const int nloc,
+                                         const int nall,
+                                         const float rcut,
+                                         const float rcut_smth,
+                                         const std::vector<int> sec);
+template void format_nbor_list_gpu<float>(int* nlist,
+                                          const float* coord,
+                                          const int* type,
+                                          const deepmd::InputNlist& gpu_inlist,
+                                          int* array_int,
+                                          uint_64* array_longlong,
+                                          const int max_nbor_size,
+                                          const int nloc,
+                                          const int nall,
+                                          const float rcut,
+                                          const std::vector<int> sec);
+template void format_nbor_list_gpu<double>(int* nlist,
+                                           const double* coord,
+                                           const int* type,
+                                           const deepmd::InputNlist& gpu_inlist,
+                                           int* array_int,
+                                           uint_64* array_longlong,
+                                           const int max_nbor_size,
+                                           const int nloc,
+                                           const int nall,
+                                           const float rcut,
+                                           const std::vector<int> sec);
+template void test_encoding_decoding_nbor_info_gpu(uint_64* key,
+                                                   int* out_type,
+                                                   int* out_index,
+                                                   const int* in_type,
+                                                   const float* in_dist,
+                                                   const int* in_index,
+                                                   const int size_of_array);
+template void test_encoding_decoding_nbor_info_gpu(uint_64* key,
+                                                   int* out_type,
+                                                   int* out_index,
+                                                   const int* in_type,
+                                                   const double* in_dist,
+                                                   const int* in_index,
+                                                   const int size_of_array);
 }  // namespace deepmd
diff --git a/source/lib/src/rocm/prod_force.hip.cu b/source/lib/src/rocm/prod_force.hip.cu
index bc4fa15078..5b1f91dd49 100644
--- a/source/lib/src/rocm/prod_force.hip.cu
+++ b/source/lib/src/rocm/prod_force.hip.cu
@@ -102,14 +102,14 @@ __global__ void force_deriv_wrt_neighbors_r(FPTYPE* force,
 
 namespace deepmd {
 template <typename FPTYPE>
-void prod_force_a_gpu_rocm(FPTYPE* force,
-                           const FPTYPE* net_deriv,
-                           const FPTYPE* in_deriv,
-                           const int* nlist,
-                           const int nloc,
-                           const int nall,
-                           const int nnei,
-                           const int nframes) {
+void prod_force_a_gpu(FPTYPE* force,
+                      const FPTYPE* net_deriv,
+                      const FPTYPE* in_deriv,
+                      const int* nlist,
+                      const int nloc,
+                      const int nall,
+                      const int nnei,
+                      const int nframes) {
   const int ndescrpt = nnei * 4;
   DPErrcheck(hipMemset(force, 0, sizeof(FPTYPE) * nframes * nall * 3));
 
@@ -130,14 +130,14 @@ void prod_force_a_gpu_rocm(FPTYPE* force,
 }
 
 template <typename FPTYPE>
-void prod_force_r_gpu_rocm(FPTYPE* force,
-                           const FPTYPE* net_deriv,
-                           const FPTYPE* in_deriv,
-                           const int* nlist,
-                           const int nloc,
-                           const int nall,
-                           const int nnei,
-                           const int nframes) {
+void prod_force_r_gpu(FPTYPE* force,
+                      const FPTYPE* net_deriv,
+                      const FPTYPE* in_deriv,
+                      const int* nlist,
+                      const int nloc,
+                      const int nall,
+                      const int nnei,
+                      const int nframes) {
   const int ndescrpt = nnei * 1;
   DPErrcheck(hipMemset(force, 0, sizeof(FPTYPE) * nframes * nall * 3));
 
@@ -157,37 +157,37 @@ void prod_force_r_gpu_rocm(FPTYPE* force,
   DPErrcheck(hipDeviceSynchronize());
 }
 
-template void prod_force_a_gpu_rocm<float>(float* force,
-                                           const float* net_deriv,
-                                           const float* in_deriv,
-                                           const int* nlist,
-                                           const int nloc,
-                                           const int nall,
-                                           const int nnei,
-                                           const int nframes);
-template void prod_force_a_gpu_rocm<double>(double* force,
-                                            const double* net_deriv,
-                                            const double* in_deriv,
-                                            const int* nlist,
-                                            const int nloc,
-                                            const int nall,
-                                            const int nnei,
-                                            const int nframes);
-template void prod_force_r_gpu_rocm<float>(float* force,
-                                           const float* net_deriv,
-                                           const float* in_deriv,
-                                           const int* nlist,
-                                           const int nloc,
-                                           const int nall,
-                                           const int nnei,
-                                           const int nframes);
-template void prod_force_r_gpu_rocm<double>(double* force,
-                                            const double* net_deriv,
-                                            const double* in_deriv,
-                                            const int* nlist,
-                                            const int nloc,
-                                            const int nall,
-                                            const int nnei,
-                                            const int nframes);
+template void prod_force_a_gpu<float>(float* force,
+                                      const float* net_deriv,
+                                      const float* in_deriv,
+                                      const int* nlist,
+                                      const int nloc,
+                                      const int nall,
+                                      const int nnei,
+                                      const int nframes);
+template void prod_force_a_gpu<double>(double* force,
+                                       const double* net_deriv,
+                                       const double* in_deriv,
+                                       const int* nlist,
+                                       const int nloc,
+                                       const int nall,
+                                       const int nnei,
+                                       const int nframes);
+template void prod_force_r_gpu<float>(float* force,
+                                      const float* net_deriv,
+                                      const float* in_deriv,
+                                      const int* nlist,
+                                      const int nloc,
+                                      const int nall,
+                                      const int nnei,
+                                      const int nframes);
+template void prod_force_r_gpu<double>(double* force,
+                                       const double* net_deriv,
+                                       const double* in_deriv,
+                                       const int* nlist,
+                                       const int nloc,
+                                       const int nall,
+                                       const int nnei,
+                                       const int nframes);
 
 }  // namespace deepmd
diff --git a/source/lib/src/rocm/prod_force_grad.hip.cu b/source/lib/src/rocm/prod_force_grad.hip.cu
index e43ce37af6..2cb7c4f1d6 100644
--- a/source/lib/src/rocm/prod_force_grad.hip.cu
+++ b/source/lib/src/rocm/prod_force_grad.hip.cu
@@ -81,13 +81,13 @@ __global__ void force_grad_wrt_neighbors_r(FPTYPE* grad_net,
 
 namespace deepmd {
 template <typename FPTYPE>
-void prod_force_grad_a_gpu_rocm(FPTYPE* grad_net,
-                                const FPTYPE* grad,
-                                const FPTYPE* env_deriv,
-                                const int* nlist,
-                                const int nloc,
-                                const int nnei,
-                                const int nframes) {
+void prod_force_grad_a_gpu(FPTYPE* grad_net,
+                           const FPTYPE* grad,
+                           const FPTYPE* env_deriv,
+                           const int* nlist,
+                           const int nloc,
+                           const int nnei,
+                           const int nframes) {
   const int ndescrpt = nnei * 4;
   DPErrcheck(
       hipMemset(grad_net, 0, sizeof(FPTYPE) * nframes * nloc * ndescrpt));
@@ -109,13 +109,13 @@ void prod_force_grad_a_gpu_rocm(FPTYPE* grad_net,
 }
 
 template <typename FPTYPE>
-void prod_force_grad_r_gpu_rocm(FPTYPE* grad_net,
-                                const FPTYPE* grad,
-                                const FPTYPE* env_deriv,
-                                const int* nlist,
-                                const int nloc,
-                                const int nnei,
-                                const int nframes) {
+void prod_force_grad_r_gpu(FPTYPE* grad_net,
+                           const FPTYPE* grad,
+                           const FPTYPE* env_deriv,
+                           const int* nlist,
+                           const int nloc,
+                           const int nnei,
+                           const int nframes) {
   const int ndescrpt = nnei * 1;
   DPErrcheck(
       hipMemset(grad_net, 0, sizeof(FPTYPE) * nframes * nloc * ndescrpt));
@@ -137,32 +137,32 @@ void prod_force_grad_r_gpu_rocm(FPTYPE* grad_net,
   DPErrcheck(hipDeviceSynchronize());
 }
 
-template void prod_force_grad_a_gpu_rocm<float>(float* grad_net,
-                                                const float* grad,
-                                                const float* env_deriv,
-                                                const int* nlist,
-                                                const int nloc,
-                                                const int nnei,
-                                                const int nframes);
-template void prod_force_grad_a_gpu_rocm<double>(double* grad_net,
-                                                 const double* grad,
-                                                 const double* env_deriv,
-                                                 const int* nlist,
-                                                 const int nloc,
-                                                 const int nnei,
-                                                 const int nframes);
-template void prod_force_grad_r_gpu_rocm<float>(float* grad_net,
-                                                const float* grad,
-                                                const float* env_deriv,
-                                                const int* nlist,
-                                                const int nloc,
-                                                const int nnei,
-                                                const int nframes);
-template void prod_force_grad_r_gpu_rocm<double>(double* grad_net,
-                                                 const double* grad,
-                                                 const double* env_deriv,
-                                                 const int* nlist,
-                                                 const int nloc,
-                                                 const int nnei,
-                                                 const int nframes);
+template void prod_force_grad_a_gpu<float>(float* grad_net,
+                                           const float* grad,
+                                           const float* env_deriv,
+                                           const int* nlist,
+                                           const int nloc,
+                                           const int nnei,
+                                           const int nframes);
+template void prod_force_grad_a_gpu<double>(double* grad_net,
+                                            const double* grad,
+                                            const double* env_deriv,
+                                            const int* nlist,
+                                            const int nloc,
+                                            const int nnei,
+                                            const int nframes);
+template void prod_force_grad_r_gpu<float>(float* grad_net,
+                                           const float* grad,
+                                           const float* env_deriv,
+                                           const int* nlist,
+                                           const int nloc,
+                                           const int nnei,
+                                           const int nframes);
+template void prod_force_grad_r_gpu<double>(double* grad_net,
+                                            const double* grad,
+                                            const double* env_deriv,
+                                            const int* nlist,
+                                            const int nloc,
+                                            const int nnei,
+                                            const int nframes);
 }  // namespace deepmd
diff --git a/source/lib/src/rocm/prod_virial.hip.cu b/source/lib/src/rocm/prod_virial.hip.cu
index dccd721df6..ff29c07ffb 100644
--- a/source/lib/src/rocm/prod_virial.hip.cu
+++ b/source/lib/src/rocm/prod_virial.hip.cu
@@ -99,15 +99,15 @@ __global__ void virial_deriv_wrt_neighbors_r(FPTYPE* virial,
 
 namespace deepmd {
 template <typename FPTYPE>
-void prod_virial_a_gpu_rocm(FPTYPE* virial,
-                            FPTYPE* atom_virial,
-                            const FPTYPE* net_deriv,
-                            const FPTYPE* in_deriv,
-                            const FPTYPE* rij,
-                            const int* nlist,
-                            const int nloc,
-                            const int nall,
-                            const int nnei) {
+void prod_virial_a_gpu(FPTYPE* virial,
+                       FPTYPE* atom_virial,
+                       const FPTYPE* net_deriv,
+                       const FPTYPE* in_deriv,
+                       const FPTYPE* rij,
+                       const int* nlist,
+                       const int nloc,
+                       const int nall,
+                       const int nnei) {
   DPErrcheck(hipMemset(virial, 0, sizeof(FPTYPE) * 9));
   DPErrcheck(hipMemset(atom_virial, 0, sizeof(FPTYPE) * 9 * nall));
 
@@ -129,15 +129,15 @@ void prod_virial_a_gpu_rocm(FPTYPE* virial,
 }
 
 template <typename FPTYPE>
-void prod_virial_r_gpu_rocm(FPTYPE* virial,
-                            FPTYPE* atom_virial,
-                            const FPTYPE* net_deriv,
-                            const FPTYPE* in_deriv,
-                            const FPTYPE* rij,
-                            const int* nlist,
-                            const int nloc,
-                            const int nall,
-                            const int nnei) {
+void prod_virial_r_gpu(FPTYPE* virial,
+                       FPTYPE* atom_virial,
+                       const FPTYPE* net_deriv,
+                       const FPTYPE* in_deriv,
+                       const FPTYPE* rij,
+                       const int* nlist,
+                       const int nloc,
+                       const int nall,
+                       const int nnei) {
   DPErrcheck(hipMemset(virial, 0, sizeof(FPTYPE) * 9));
   DPErrcheck(hipMemset(atom_virial, 0, sizeof(FPTYPE) * 9 * nall));
 
@@ -158,40 +158,40 @@ void prod_virial_r_gpu_rocm(FPTYPE* virial,
   DPErrcheck(hipDeviceSynchronize());
 }
 
-template void prod_virial_a_gpu_rocm<float>(float* virial,
-                                            float* atom_virial,
-                                            const float* net_deriv,
-                                            const float* in_deriv,
-                                            const float* rij,
-                                            const int* nlist,
-                                            const int nloc,
-                                            const int nall,
-                                            const int nnei);
-template void prod_virial_a_gpu_rocm<double>(double* virial,
-                                             double* atom_virial,
-                                             const double* net_deriv,
-                                             const double* in_deriv,
-                                             const double* rij,
-                                             const int* nlist,
-                                             const int nloc,
-                                             const int nall,
-                                             const int nnei);
-template void prod_virial_r_gpu_rocm<float>(float* virial,
-                                            float* atom_virial,
-                                            const float* net_deriv,
-                                            const float* in_deriv,
-                                            const float* rij,
-                                            const int* nlist,
-                                            const int nloc,
-                                            const int nall,
-                                            const int nnei);
-template void prod_virial_r_gpu_rocm<double>(double* virial,
-                                             double* atom_virial,
-                                             const double* net_deriv,
-                                             const double* in_deriv,
-                                             const double* rij,
-                                             const int* nlist,
-                                             const int nloc,
-                                             const int nall,
-                                             const int nnei);
+template void prod_virial_a_gpu<float>(float* virial,
+                                       float* atom_virial,
+                                       const float* net_deriv,
+                                       const float* in_deriv,
+                                       const float* rij,
+                                       const int* nlist,
+                                       const int nloc,
+                                       const int nall,
+                                       const int nnei);
+template void prod_virial_a_gpu<double>(double* virial,
+                                        double* atom_virial,
+                                        const double* net_deriv,
+                                        const double* in_deriv,
+                                        const double* rij,
+                                        const int* nlist,
+                                        const int nloc,
+                                        const int nall,
+                                        const int nnei);
+template void prod_virial_r_gpu<float>(float* virial,
+                                       float* atom_virial,
+                                       const float* net_deriv,
+                                       const float* in_deriv,
+                                       const float* rij,
+                                       const int* nlist,
+                                       const int nloc,
+                                       const int nall,
+                                       const int nnei);
+template void prod_virial_r_gpu<double>(double* virial,
+                                        double* atom_virial,
+                                        const double* net_deriv,
+                                        const double* in_deriv,
+                                        const double* rij,
+                                        const int* nlist,
+                                        const int nloc,
+                                        const int nall,
+                                        const int nnei);
 }  // namespace deepmd
diff --git a/source/lib/src/rocm/prod_virial_grad.hip.cu b/source/lib/src/rocm/prod_virial_grad.hip.cu
index 81fb9f4bad..d41a1689ce 100644
--- a/source/lib/src/rocm/prod_virial_grad.hip.cu
+++ b/source/lib/src/rocm/prod_virial_grad.hip.cu
@@ -84,13 +84,13 @@ __global__ void virial_grad_wrt_neighbors_r(FPTYPE* grad_net,
 
 namespace deepmd {
 template <typename FPTYPE>
-void prod_virial_grad_a_gpu_rocm(FPTYPE* grad_net,
-                                 const FPTYPE* grad,
-                                 const FPTYPE* env_deriv,
-                                 const FPTYPE* rij,
-                                 const int* nlist,
-                                 const int nloc,
-                                 const int nnei) {
+void prod_virial_grad_a_gpu(FPTYPE* grad_net,
+                            const FPTYPE* grad,
+                            const FPTYPE* env_deriv,
+                            const FPTYPE* rij,
+                            const int* nlist,
+                            const int nloc,
+                            const int nnei) {
   const int ndescrpt = nnei * 4;
   DPErrcheck(hipMemset(grad_net, 0, sizeof(FPTYPE) * nloc * ndescrpt));
   const int LEN = 128;
@@ -104,13 +104,13 @@ void prod_virial_grad_a_gpu_rocm(FPTYPE* grad_net,
 }
 
 template <typename FPTYPE>
-void prod_virial_grad_r_gpu_rocm(FPTYPE* grad_net,
-                                 const FPTYPE* grad,
-                                 const FPTYPE* env_deriv,
-                                 const FPTYPE* rij,
-                                 const int* nlist,
-                                 const int nloc,
-                                 const int nnei) {
+void prod_virial_grad_r_gpu(FPTYPE* grad_net,
+                            const FPTYPE* grad,
+                            const FPTYPE* env_deriv,
+                            const FPTYPE* rij,
+                            const int* nlist,
+                            const int nloc,
+                            const int nnei) {
   const int ndescrpt = nnei;
   DPErrcheck(hipMemset(grad_net, 0, sizeof(FPTYPE) * nloc * ndescrpt));
   const int LEN = 128;
@@ -123,32 +123,32 @@ void prod_virial_grad_r_gpu_rocm(FPTYPE* grad_net,
   DPErrcheck(hipDeviceSynchronize());
 }
 
-template void prod_virial_grad_a_gpu_rocm<float>(float* grad_net,
-                                                 const float* grad,
-                                                 const float* env_deriv,
-                                                 const float* rij,
-                                                 const int* nlist,
-                                                 const int nloc,
-                                                 const int nnei);
-template void prod_virial_grad_a_gpu_rocm<double>(double* grad_net,
-                                                  const double* grad,
-                                                  const double* env_deriv,
-                                                  const double* rij,
-                                                  const int* nlist,
-                                                  const int nloc,
-                                                  const int nnei);
-template void prod_virial_grad_r_gpu_rocm<float>(float* grad_net,
-                                                 const float* grad,
-                                                 const float* env_deriv,
-                                                 const float* rij,
-                                                 const int* nlist,
-                                                 const int nloc,
-                                                 const int nnei);
-template void prod_virial_grad_r_gpu_rocm<double>(double* grad_net,
-                                                  const double* grad,
-                                                  const double* env_deriv,
-                                                  const double* rij,
-                                                  const int* nlist,
-                                                  const int nloc,
-                                                  const int nnei);
+template void prod_virial_grad_a_gpu<float>(float* grad_net,
+                                            const float* grad,
+                                            const float* env_deriv,
+                                            const float* rij,
+                                            const int* nlist,
+                                            const int nloc,
+                                            const int nnei);
+template void prod_virial_grad_a_gpu<double>(double* grad_net,
+                                             const double* grad,
+                                             const double* env_deriv,
+                                             const double* rij,
+                                             const int* nlist,
+                                             const int nloc,
+                                             const int nnei);
+template void prod_virial_grad_r_gpu<float>(float* grad_net,
+                                            const float* grad,
+                                            const float* env_deriv,
+                                            const float* rij,
+                                            const int* nlist,
+                                            const int nloc,
+                                            const int nnei);
+template void prod_virial_grad_r_gpu<double>(double* grad_net,
+                                             const double* grad,
+                                             const double* env_deriv,
+                                             const double* rij,
+                                             const int* nlist,
+                                             const int nloc,
+                                             const int nnei);
 }  // namespace deepmd
diff --git a/source/lib/src/rocm/region.hip.cu b/source/lib/src/rocm/region.hip.cu
index f4ee5517cc..de67ef648c 100644
--- a/source/lib/src/rocm/region.hip.cu
+++ b/source/lib/src/rocm/region.hip.cu
@@ -24,44 +24,42 @@ __global__ void _compute_volume(FPTYPE *volume, const FPTYPE *boxt) {
 namespace deepmd {
 // only for unittest
 template <typename FPTYPE>
-void convert_to_inter_gpu_rocm(FPTYPE *ri,
-                               const Region<FPTYPE> &region,
-                               const FPTYPE *rp) {
+void convert_to_inter_gpu(FPTYPE *ri,
+                          const Region<FPTYPE> &region,
+                          const FPTYPE *rp) {
   hipLaunchKernelGGL(_phys2Inter, 1, 1, 0, 0, ri, rp, region.rec_boxt);
   DPErrcheck(hipGetLastError());
   DPErrcheck(hipDeviceSynchronize());
 }
 
 template <typename FPTYPE>
-void convert_to_phys_gpu_rocm(FPTYPE *rp,
-                              const Region<FPTYPE> &region,
-                              const FPTYPE *ri) {
+void convert_to_phys_gpu(FPTYPE *rp,
+                         const Region<FPTYPE> &region,
+                         const FPTYPE *ri) {
   hipLaunchKernelGGL(_inter2Phys, 1, 1, 0, 0, rp, ri, region.boxt);
   DPErrcheck(hipGetLastError());
   DPErrcheck(hipDeviceSynchronize());
 }
 
 template <typename FPTYPE>
-void volume_gpu_rocm(FPTYPE *volume, const Region<FPTYPE> &region) {
+void volume_gpu(FPTYPE *volume, const Region<FPTYPE> &region) {
   hipLaunchKernelGGL(_compute_volume, 1, 1, 0, 0, volume, region.boxt);
   DPErrcheck(hipGetLastError());
   DPErrcheck(hipDeviceSynchronize());
 }
 
-template void convert_to_inter_gpu_rocm<float>(float *ri,
-                                               const Region<float> &region,
-                                               const float *rp);
-template void convert_to_inter_gpu_rocm<double>(double *ri,
-                                                const Region<double> &region,
-                                                const double *rp);
-template void convert_to_phys_gpu_rocm<float>(float *rp,
-                                              const Region<float> &region,
-                                              const float *ri);
-template void convert_to_phys_gpu_rocm<double>(double *rp,
-                                               const Region<double> &region,
-                                               const double *ri);
-template void volume_gpu_rocm<float>(float *volume,
-                                     const Region<float> &region);
-template void volume_gpu_rocm<double>(double *volume,
-                                      const Region<double> &region);
+template void convert_to_inter_gpu<float>(float *ri,
+                                          const Region<float> &region,
+                                          const float *rp);
+template void convert_to_inter_gpu<double>(double *ri,
+                                           const Region<double> &region,
+                                           const double *rp);
+template void convert_to_phys_gpu<float>(float *rp,
+                                         const Region<float> &region,
+                                         const float *ri);
+template void convert_to_phys_gpu<double>(double *rp,
+                                          const Region<double> &region,
+                                          const double *ri);
+template void volume_gpu<float>(float *volume, const Region<float> &region);
+template void volume_gpu<double>(double *volume, const Region<double> &region);
 }  // namespace deepmd
diff --git a/source/lib/src/rocm/tabulate.hip.cu b/source/lib/src/rocm/tabulate.hip.cu
index f88ae6ec4a..88a1cbb574 100644
--- a/source/lib/src/rocm/tabulate.hip.cu
+++ b/source/lib/src/rocm/tabulate.hip.cu
@@ -621,16 +621,16 @@ __global__ void tabulate_fusion_se_r_grad_grad_fifth_order_polynomial(
 
 namespace deepmd {
 template <typename FPTYPE>
-void tabulate_fusion_se_a_gpu_rocm(FPTYPE* out,
-                                   const FPTYPE* table,
-                                   const FPTYPE* table_info,
-                                   const FPTYPE* em_x,
-                                   const FPTYPE* em,
-                                   const FPTYPE* two_embed,
-                                   const int nloc,
-                                   const int nnei,
-                                   const int last_layer_size,
-                                   const bool is_sorted) {
+void tabulate_fusion_se_a_gpu(FPTYPE* out,
+                              const FPTYPE* table,
+                              const FPTYPE* table_info,
+                              const FPTYPE* em_x,
+                              const FPTYPE* em,
+                              const FPTYPE* two_embed,
+                              const int nloc,
+                              const int nnei,
+                              const int last_layer_size,
+                              const bool is_sorted) {
   if (nloc <= 0) {
     return;
   }
@@ -645,18 +645,18 @@ void tabulate_fusion_se_a_gpu_rocm(FPTYPE* out,
 }
 
 template <typename FPTYPE>
-void tabulate_fusion_se_a_grad_gpu_rocm(FPTYPE* dy_dem_x,
-                                        FPTYPE* dy_dem,
-                                        const FPTYPE* table,
-                                        const FPTYPE* table_info,
-                                        const FPTYPE* em_x,
-                                        const FPTYPE* em,
-                                        const FPTYPE* two_embed,
-                                        const FPTYPE* dy,
-                                        const int nloc,
-                                        const int nnei,
-                                        const int last_layer_size,
-                                        const bool is_sorted) {
+void tabulate_fusion_se_a_grad_gpu(FPTYPE* dy_dem_x,
+                                   FPTYPE* dy_dem,
+                                   const FPTYPE* table,
+                                   const FPTYPE* table_info,
+                                   const FPTYPE* em_x,
+                                   const FPTYPE* em,
+                                   const FPTYPE* two_embed,
+                                   const FPTYPE* dy,
+                                   const int nloc,
+                                   const int nnei,
+                                   const int last_layer_size,
+                                   const bool is_sorted) {
   if (nloc <= 0) {
     return;
   }
@@ -675,17 +675,17 @@ void tabulate_fusion_se_a_grad_gpu_rocm(FPTYPE* dy_dem_x,
 }
 
 template <typename FPTYPE>
-void tabulate_fusion_se_a_grad_grad_gpu_rocm(FPTYPE* dz_dy,
-                                             const FPTYPE* table,
-                                             const FPTYPE* table_info,
-                                             const FPTYPE* em_x,
-                                             const FPTYPE* em,
-                                             const FPTYPE* dz_dy_dem_x,
-                                             const FPTYPE* dz_dy_dem,
-                                             const int nloc,
-                                             const int nnei,
-                                             const int last_layer_size,
-                                             const bool is_sorted) {
+void tabulate_fusion_se_a_grad_grad_gpu(FPTYPE* dz_dy,
+                                        const FPTYPE* table,
+                                        const FPTYPE* table_info,
+                                        const FPTYPE* em_x,
+                                        const FPTYPE* em,
+                                        const FPTYPE* dz_dy_dem_x,
+                                        const FPTYPE* dz_dy_dem,
+                                        const int nloc,
+                                        const int nnei,
+                                        const int last_layer_size,
+                                        const bool is_sorted) {
   if (nloc <= 0) {
     return;
   }
@@ -703,15 +703,15 @@ void tabulate_fusion_se_a_grad_grad_gpu_rocm(FPTYPE* dz_dy,
 }
 
 template <typename FPTYPE>
-void tabulate_fusion_se_t_gpu_rocm(FPTYPE* out,
-                                   const FPTYPE* table,
-                                   const FPTYPE* table_info,
-                                   const FPTYPE* em_x,
-                                   const FPTYPE* em,
-                                   const int nloc,
-                                   const int nnei_i,
-                                   const int nnei_j,
-                                   const int last_layer_size) {
+void tabulate_fusion_se_t_gpu(FPTYPE* out,
+                              const FPTYPE* table,
+                              const FPTYPE* table_info,
+                              const FPTYPE* em_x,
+                              const FPTYPE* em,
+                              const int nloc,
+                              const int nnei_i,
+                              const int nnei_j,
+                              const int last_layer_size) {
   if (nloc <= 0) {
     return;
   }
@@ -726,17 +726,17 @@ void tabulate_fusion_se_t_gpu_rocm(FPTYPE* out,
 }
 
 template <typename FPTYPE>
-void tabulate_fusion_se_t_grad_gpu_rocm(FPTYPE* dy_dem_x,
-                                        FPTYPE* dy_dem,
-                                        const FPTYPE* table,
-                                        const FPTYPE* table_info,
-                                        const FPTYPE* em_x,
-                                        const FPTYPE* em,
-                                        const FPTYPE* dy,
-                                        const int nloc,
-                                        const int nnei_i,
-                                        const int nnei_j,
-                                        const int last_layer_size) {
+void tabulate_fusion_se_t_grad_gpu(FPTYPE* dy_dem_x,
+                                   FPTYPE* dy_dem,
+                                   const FPTYPE* table,
+                                   const FPTYPE* table_info,
+                                   const FPTYPE* em_x,
+                                   const FPTYPE* em,
+                                   const FPTYPE* dy,
+                                   const int nloc,
+                                   const int nnei_i,
+                                   const int nnei_j,
+                                   const int last_layer_size) {
   if (nloc <= 0) {
     return;
   }
@@ -754,17 +754,17 @@ void tabulate_fusion_se_t_grad_gpu_rocm(FPTYPE* dy_dem_x,
 }
 
 template <typename FPTYPE>
-void tabulate_fusion_se_t_grad_grad_gpu_rocm(FPTYPE* dz_dy,
-                                             const FPTYPE* table,
-                                             const FPTYPE* table_info,
-                                             const FPTYPE* em_x,
-                                             const FPTYPE* em,
-                                             const FPTYPE* dz_dy_dem_x,
-                                             const FPTYPE* dz_dy_dem,
-                                             const int nloc,
-                                             const int nnei_i,
-                                             const int nnei_j,
-                                             const int last_layer_size) {
+void tabulate_fusion_se_t_grad_grad_gpu(FPTYPE* dz_dy,
+                                        const FPTYPE* table,
+                                        const FPTYPE* table_info,
+                                        const FPTYPE* em_x,
+                                        const FPTYPE* em,
+                                        const FPTYPE* dz_dy_dem_x,
+                                        const FPTYPE* dz_dy_dem,
+                                        const int nloc,
+                                        const int nnei_i,
+                                        const int nnei_j,
+                                        const int last_layer_size) {
   if (nloc <= 0) {
     return;
   }
@@ -781,13 +781,13 @@ void tabulate_fusion_se_t_grad_grad_gpu_rocm(FPTYPE* dz_dy,
 }
 
 template <typename FPTYPE>
-void tabulate_fusion_se_r_gpu_rocm(FPTYPE* out,
-                                   const FPTYPE* table,
-                                   const FPTYPE* table_info,
-                                   const FPTYPE* em,
-                                   const int nloc,
-                                   const int nnei,
-                                   const int last_layer_size) {
+void tabulate_fusion_se_r_gpu(FPTYPE* out,
+                              const FPTYPE* table,
+                              const FPTYPE* table_info,
+                              const FPTYPE* em,
+                              const int nloc,
+                              const int nnei,
+                              const int last_layer_size) {
   if (nloc <= 0) {
     return;
   }
@@ -802,14 +802,14 @@ void tabulate_fusion_se_r_gpu_rocm(FPTYPE* out,
 }
 
 template <typename FPTYPE>
-void tabulate_fusion_se_r_grad_gpu_rocm(FPTYPE* dy_dem,
-                                        const FPTYPE* table,
-                                        const FPTYPE* table_info,
-                                        const FPTYPE* em,
-                                        const FPTYPE* dy,
-                                        const int nloc,
-                                        const int nnei,
-                                        const int last_layer_size) {
+void tabulate_fusion_se_r_grad_gpu(FPTYPE* dy_dem,
+                                   const FPTYPE* table,
+                                   const FPTYPE* table_info,
+                                   const FPTYPE* em,
+                                   const FPTYPE* dy,
+                                   const int nloc,
+                                   const int nnei,
+                                   const int last_layer_size) {
   if (nloc <= 0) {
     return;
   }
@@ -826,14 +826,14 @@ void tabulate_fusion_se_r_grad_gpu_rocm(FPTYPE* dy_dem,
 }
 
 template <typename FPTYPE>
-void tabulate_fusion_se_r_grad_grad_gpu_rocm(FPTYPE* dz_dy,
-                                             const FPTYPE* table,
-                                             const FPTYPE* table_info,
-                                             const FPTYPE* em,
-                                             const FPTYPE* dz_dy_dem,
-                                             const int nloc,
-                                             const int nnei,
-                                             const int last_layer_size) {
+void tabulate_fusion_se_r_grad_grad_gpu(FPTYPE* dz_dy,
+                                        const FPTYPE* table,
+                                        const FPTYPE* table_info,
+                                        const FPTYPE* em,
+                                        const FPTYPE* dz_dy_dem,
+                                        const int nloc,
+                                        const int nnei,
+                                        const int last_layer_size) {
   if (nloc <= 0) {
     return;
   }
@@ -850,53 +850,51 @@ void tabulate_fusion_se_r_grad_grad_gpu_rocm(FPTYPE* dz_dy,
   DPErrcheck(hipDeviceSynchronize());
 }
 
-template void tabulate_fusion_se_a_gpu_rocm<float>(float* out,
+template void tabulate_fusion_se_a_gpu<float>(float* out,
+                                              const float* table,
+                                              const float* table_info,
+                                              const float* em_x,
+                                              const float* em,
+                                              const float* two_embed,
+                                              const int nloc,
+                                              const int nnei,
+                                              const int last_layer_size,
+                                              const bool is_sorted);
+template void tabulate_fusion_se_a_gpu<double>(double* out,
+                                               const double* table,
+                                               const double* table_info,
+                                               const double* em_x,
+                                               const double* em,
+                                               const double* two_embed,
+                                               const int nloc,
+                                               const int nnei,
+                                               const int last_layer_size,
+                                               const bool is_sorted);
+template void tabulate_fusion_se_a_grad_gpu<float>(float* dy_dem_x,
+                                                   float* dy_dem,
                                                    const float* table,
                                                    const float* table_info,
                                                    const float* em_x,
                                                    const float* em,
                                                    const float* two_embed,
+                                                   const float* dy,
                                                    const int nloc,
                                                    const int nnei,
                                                    const int last_layer_size,
                                                    const bool is_sorted);
-template void tabulate_fusion_se_a_gpu_rocm<double>(double* out,
+template void tabulate_fusion_se_a_grad_gpu<double>(double* dy_dem_x,
+                                                    double* dy_dem,
                                                     const double* table,
                                                     const double* table_info,
                                                     const double* em_x,
                                                     const double* em,
                                                     const double* two_embed,
+                                                    const double* dy,
                                                     const int nloc,
                                                     const int nnei,
                                                     const int last_layer_size,
                                                     const bool is_sorted);
-template void tabulate_fusion_se_a_grad_gpu_rocm<float>(
-    float* dy_dem_x,
-    float* dy_dem,
-    const float* table,
-    const float* table_info,
-    const float* em_x,
-    const float* em,
-    const float* two_embed,
-    const float* dy,
-    const int nloc,
-    const int nnei,
-    const int last_layer_size,
-    const bool is_sorted);
-template void tabulate_fusion_se_a_grad_gpu_rocm<double>(
-    double* dy_dem_x,
-    double* dy_dem,
-    const double* table,
-    const double* table_info,
-    const double* em_x,
-    const double* em,
-    const double* two_embed,
-    const double* dy,
-    const int nloc,
-    const int nnei,
-    const int last_layer_size,
-    const bool is_sorted);
-template void tabulate_fusion_se_a_grad_grad_gpu_rocm<float>(
+template void tabulate_fusion_se_a_grad_grad_gpu<float>(
     float* dz_dy,
     const float* table,
     const float* table_info,
@@ -908,7 +906,7 @@ template void tabulate_fusion_se_a_grad_grad_gpu_rocm<float>(
     const int nnei,
     const int last_layer_size,
     const bool is_sorted);
-template void tabulate_fusion_se_a_grad_grad_gpu_rocm<double>(
+template void tabulate_fusion_se_a_grad_grad_gpu<double>(
     double* dz_dy,
     const double* table,
     const double* table_info,
@@ -921,49 +919,47 @@ template void tabulate_fusion_se_a_grad_grad_gpu_rocm<double>(
     const int last_layer_size,
     const bool is_sorted);
 
-template void tabulate_fusion_se_t_gpu_rocm<float>(float* out,
+template void tabulate_fusion_se_t_gpu<float>(float* out,
+                                              const float* table,
+                                              const float* table_info,
+                                              const float* em_x,
+                                              const float* em,
+                                              const int nloc,
+                                              const int nnei_i,
+                                              const int nnei_j,
+                                              const int last_layer_size);
+template void tabulate_fusion_se_t_gpu<double>(double* out,
+                                               const double* table,
+                                               const double* table_info,
+                                               const double* em_x,
+                                               const double* em,
+                                               const int nloc,
+                                               const int nnei_i,
+                                               const int nnei_j,
+                                               const int last_layer_size);
+template void tabulate_fusion_se_t_grad_gpu<float>(float* dy_dem_x,
+                                                   float* dy_dem,
                                                    const float* table,
                                                    const float* table_info,
                                                    const float* em_x,
                                                    const float* em,
+                                                   const float* dy,
                                                    const int nloc,
                                                    const int nnei_i,
                                                    const int nnei_j,
                                                    const int last_layer_size);
-template void tabulate_fusion_se_t_gpu_rocm<double>(double* out,
+template void tabulate_fusion_se_t_grad_gpu<double>(double* dy_dem_x,
+                                                    double* dy_dem,
                                                     const double* table,
                                                     const double* table_info,
                                                     const double* em_x,
                                                     const double* em,
+                                                    const double* dy,
                                                     const int nloc,
                                                     const int nnei_i,
                                                     const int nnei_j,
                                                     const int last_layer_size);
-template void tabulate_fusion_se_t_grad_gpu_rocm<float>(
-    float* dy_dem_x,
-    float* dy_dem,
-    const float* table,
-    const float* table_info,
-    const float* em_x,
-    const float* em,
-    const float* dy,
-    const int nloc,
-    const int nnei_i,
-    const int nnei_j,
-    const int last_layer_size);
-template void tabulate_fusion_se_t_grad_gpu_rocm<double>(
-    double* dy_dem_x,
-    double* dy_dem,
-    const double* table,
-    const double* table_info,
-    const double* em_x,
-    const double* em,
-    const double* dy,
-    const int nloc,
-    const int nnei_i,
-    const int nnei_j,
-    const int last_layer_size);
-template void tabulate_fusion_se_t_grad_grad_gpu_rocm<float>(
+template void tabulate_fusion_se_t_grad_grad_gpu<float>(
     float* dz_dy,
     const float* table,
     const float* table_info,
@@ -975,7 +971,7 @@ template void tabulate_fusion_se_t_grad_grad_gpu_rocm<float>(
     const int nnei_i,
     const int nnei_j,
     const int last_layer_size);
-template void tabulate_fusion_se_t_grad_grad_gpu_rocm<double>(
+template void tabulate_fusion_se_t_grad_grad_gpu<double>(
     double* dz_dy,
     const double* table,
     const double* table_info,
@@ -988,39 +984,37 @@ template void tabulate_fusion_se_t_grad_grad_gpu_rocm<double>(
     const int nnei_j,
     const int last_layer_size);
 
-template void tabulate_fusion_se_r_gpu_rocm<float>(float* out,
+template void tabulate_fusion_se_r_gpu<float>(float* out,
+                                              const float* table,
+                                              const float* table_info,
+                                              const float* em,
+                                              const int nloc,
+                                              const int nnei,
+                                              const int last_layer_size);
+template void tabulate_fusion_se_r_gpu<double>(double* out,
+                                               const double* table,
+                                               const double* table_info,
+                                               const double* em,
+                                               const int nloc,
+                                               const int nnei,
+                                               const int last_layer_size);
+template void tabulate_fusion_se_r_grad_gpu<float>(float* dy_dem,
                                                    const float* table,
                                                    const float* table_info,
                                                    const float* em,
+                                                   const float* dy,
                                                    const int nloc,
                                                    const int nnei,
                                                    const int last_layer_size);
-template void tabulate_fusion_se_r_gpu_rocm<double>(double* out,
+template void tabulate_fusion_se_r_grad_gpu<double>(double* dy_dem,
                                                     const double* table,
                                                     const double* table_info,
                                                     const double* em,
+                                                    const double* dy,
                                                     const int nloc,
                                                     const int nnei,
                                                     const int last_layer_size);
-template void tabulate_fusion_se_r_grad_gpu_rocm<float>(
-    float* dy_dem,
-    const float* table,
-    const float* table_info,
-    const float* em,
-    const float* dy,
-    const int nloc,
-    const int nnei,
-    const int last_layer_size);
-template void tabulate_fusion_se_r_grad_gpu_rocm<double>(
-    double* dy_dem,
-    const double* table,
-    const double* table_info,
-    const double* em,
-    const double* dy,
-    const int nloc,
-    const int nnei,
-    const int last_layer_size);
-template void tabulate_fusion_se_r_grad_grad_gpu_rocm<float>(
+template void tabulate_fusion_se_r_grad_grad_gpu<float>(
     float* dz_dy,
     const float* table,
     const float* table_info,
@@ -1029,7 +1023,7 @@ template void tabulate_fusion_se_r_grad_grad_gpu_rocm<float>(
     const int nloc,
     const int nnei,
     const int last_layer_size);
-template void tabulate_fusion_se_r_grad_grad_gpu_rocm<double>(
+template void tabulate_fusion_se_r_grad_grad_gpu<double>(
     double* dz_dy,
     const double* table,
     const double* table_info,
diff --git a/source/lib/tests/test_coord.cc b/source/lib/tests/test_coord.cc
index 705c5d57bd..581301b6a7 100644
--- a/source/lib/tests/test_coord.cc
+++ b/source/lib/tests/test_coord.cc
@@ -164,7 +164,7 @@ TEST_F(TestNormCoord, gpu_case0) {
   deepmd::malloc_device_memory_sync(out_c_dev, out_c);
   region_dev.boxt = box_info_dev;
   region_dev.rec_boxt = box_info_dev + 9;
-  deepmd::normalize_coord_gpu_rocm(out_c_dev, natoms, region_dev);
+  deepmd::normalize_coord_gpu(out_c_dev, natoms, region_dev);
   region_dev.boxt = new_boxt;
   region_dev.rec_boxt = new_rec_boxt;
   deepmd::memcpy_device_to_host(out_c_dev, out_c);
@@ -192,7 +192,7 @@ TEST_F(TestNormCoord, gpu_case1) {
   deepmd::malloc_device_memory_sync(out_c_dev, out_c);
   region_dev.boxt = box_info_dev;
   region_dev.rec_boxt = box_info_dev + 9;
-  deepmd::normalize_coord_gpu_rocm(out_c_dev, natoms, region_dev);
+  deepmd::normalize_coord_gpu(out_c_dev, natoms, region_dev);
   region_dev.boxt = new_boxt;
   region_dev.rec_boxt = new_rec_boxt;
   deepmd::memcpy_device_to_host(out_c_dev, out_c);
@@ -220,7 +220,7 @@ TEST_F(TestNormCoord, gpu_case2) {
   deepmd::malloc_device_memory_sync(out_c_dev, out_c);
   region_dev.boxt = box_info_dev;
   region_dev.rec_boxt = box_info_dev + 9;
-  deepmd::normalize_coord_gpu_rocm(out_c_dev, natoms, region_dev);
+  deepmd::normalize_coord_gpu(out_c_dev, natoms, region_dev);
   region_dev.boxt = new_boxt;
   region_dev.rec_boxt = new_rec_boxt;
   deepmd::memcpy_device_to_host(out_c_dev, out_c);
@@ -553,10 +553,10 @@ TEST_F(TestCopyCoord, gpu) {
                         1 + nloc);
   region_dev.boxt = box_info_dev;
   region_dev.rec_boxt = box_info_dev + 9;
-  int ret = deepmd::copy_coord_gpu_rocm(
-      out_c_dev, out_t_dev, mapping_dev, &nall, int_data_dev, in_c_dev,
-      in_t_dev, nloc, mem_size, loc_cellnum, total_cellnum, cell_info_dev,
-      region_dev);
+  int ret = deepmd::copy_coord_gpu(out_c_dev, out_t_dev, mapping_dev, &nall,
+                                   int_data_dev, in_c_dev, in_t_dev, nloc,
+                                   mem_size, loc_cellnum, total_cellnum,
+                                   cell_info_dev, region_dev);
   region_dev.boxt = new_boxt;
   region_dev.rec_boxt = new_rec_boxt;
   deepmd::memcpy_device_to_host(out_c_dev, out_c);
@@ -628,10 +628,10 @@ TEST_F(TestCopyCoord, gpu_lessmem) {
                         1 + nloc);
   region_dev.boxt = box_info_dev;
   region_dev.rec_boxt = box_info_dev + 9;
-  int ret = deepmd::copy_coord_gpu_rocm(
-      out_c_dev, out_t_dev, mapping_dev, &nall, int_data_dev, in_c_dev,
-      in_t_dev, nloc, mem_size, loc_cellnum, total_cellnum, cell_info_dev,
-      region_dev);
+  int ret = deepmd::copy_coord_gpu(out_c_dev, out_t_dev, mapping_dev, &nall,
+                                   int_data_dev, in_c_dev, in_t_dev, nloc,
+                                   mem_size, loc_cellnum, total_cellnum,
+                                   cell_info_dev, region_dev);
   region_dev.boxt = new_boxt;
   region_dev.rec_boxt = new_rec_boxt;
   deepmd::memcpy_device_to_host(out_c_dev, out_c);
@@ -938,10 +938,10 @@ TEST_F(TestCopyCoordMoreCell, gpu) {
                         1 + nloc);
   region_dev.boxt = box_info_dev;
   region_dev.rec_boxt = box_info_dev + 9;
-  int ret = deepmd::copy_coord_gpu_rocm(
-      out_c_dev, out_t_dev, mapping_dev, &nall, int_data_dev, in_c_dev,
-      in_t_dev, nloc, mem_size, loc_cellnum, total_cellnum, cell_info_dev,
-      region_dev);
+  int ret = deepmd::copy_coord_gpu(out_c_dev, out_t_dev, mapping_dev, &nall,
+                                   int_data_dev, in_c_dev, in_t_dev, nloc,
+                                   mem_size, loc_cellnum, total_cellnum,
+                                   cell_info_dev, region_dev);
   region_dev.boxt = new_boxt;
   region_dev.rec_boxt = new_rec_boxt;
   deepmd::memcpy_device_to_host(out_c_dev, out_c);
@@ -1013,10 +1013,10 @@ TEST_F(TestCopyCoordMoreCell, gpu_lessmem) {
                         1 + nloc);
   region_dev.boxt = box_info_dev;
   region_dev.rec_boxt = box_info_dev + 9;
-  int ret = deepmd::copy_coord_gpu_rocm(
-      out_c_dev, out_t_dev, mapping_dev, &nall, int_data_dev, in_c_dev,
-      in_t_dev, nloc, mem_size, loc_cellnum, total_cellnum, cell_info_dev,
-      region_dev);
+  int ret = deepmd::copy_coord_gpu(out_c_dev, out_t_dev, mapping_dev, &nall,
+                                   int_data_dev, in_c_dev, in_t_dev, nloc,
+                                   mem_size, loc_cellnum, total_cellnum,
+                                   cell_info_dev, region_dev);
   region_dev.boxt = new_boxt;
   region_dev.rec_boxt = new_rec_boxt;
   deepmd::memcpy_device_to_host(out_c_dev, out_c);
diff --git a/source/lib/tests/test_env_mat_a.cc b/source/lib/tests/test_env_mat_a.cc
index 594e8a3601..639f99414d 100644
--- a/source/lib/tests/test_env_mat_a.cc
+++ b/source/lib/tests/test_env_mat_a.cc
@@ -591,7 +591,7 @@ TEST_F(TestEnvMatA, prod_cpu_equal_cpu) {
 }
 
 #if GOOGLE_CUDA
-TEST_F(TestEnvMatA, prod_gpu_cuda) {
+TEST_F(TestEnvMatA, prod_gpu) {
   EXPECT_EQ(nlist_r_cpy.size(), nloc);
   int tot_nnei = 0;
   int max_nbor_size = 0;
@@ -641,10 +641,10 @@ TEST_F(TestEnvMatA, prod_gpu_cuda) {
   deepmd::convert_nlist_gpu_device(gpu_inlist, inlist, memory_dev,
                                    max_nbor_size);
 
-  deepmd::prod_env_mat_a_gpu_cuda(
-      em_dev, em_deriv_dev, rij_dev, nlist_dev, posi_cpy_dev, atype_cpy_dev,
-      gpu_inlist, array_int_dev, array_longlong_dev, max_nbor_size, avg_dev,
-      std_dev, nloc, nall, rc, rc_smth, sec_a);
+  deepmd::prod_env_mat_a_gpu(em_dev, em_deriv_dev, rij_dev, nlist_dev,
+                             posi_cpy_dev, atype_cpy_dev, gpu_inlist,
+                             array_int_dev, array_longlong_dev, max_nbor_size,
+                             avg_dev, std_dev, nloc, nall, rc, rc_smth, sec_a);
   deepmd::memcpy_device_to_host(em_dev, em);
   deepmd::delete_device_memory(em_dev);
   deepmd::delete_device_memory(em_deriv_dev);
@@ -669,7 +669,7 @@ TEST_F(TestEnvMatA, prod_gpu_cuda) {
   }
 }
 
-TEST_F(TestEnvMatA, prod_gpu_cuda_equal_cpu) {
+TEST_F(TestEnvMatA, prod_gpu_equal_cpu) {
   EXPECT_EQ(nlist_r_cpy.size(), nloc);
   int tot_nnei = 0;
   int max_nbor_size = 0;
@@ -720,10 +720,10 @@ TEST_F(TestEnvMatA, prod_gpu_cuda_equal_cpu) {
   deepmd::convert_nlist_gpu_device(gpu_inlist, inlist, memory_dev,
                                    max_nbor_size);
 
-  deepmd::prod_env_mat_a_gpu_cuda(
-      em_dev, em_deriv_dev, rij_dev, nlist_dev, posi_cpy_dev, atype_cpy_dev,
-      gpu_inlist, array_int_dev, array_longlong_dev, max_nbor_size, avg_dev,
-      std_dev, nloc, nall, rc, rc_smth, sec_a);
+  deepmd::prod_env_mat_a_gpu(em_dev, em_deriv_dev, rij_dev, nlist_dev,
+                             posi_cpy_dev, atype_cpy_dev, gpu_inlist,
+                             array_int_dev, array_longlong_dev, max_nbor_size,
+                             avg_dev, std_dev, nloc, nall, rc, rc_smth, sec_a);
   deepmd::memcpy_device_to_host(em_dev, em);
   deepmd::memcpy_device_to_host(em_deriv_dev, em_deriv);
   deepmd::memcpy_device_to_host(rij_dev, rij);
@@ -785,7 +785,7 @@ TEST_F(TestEnvMatA, prod_gpu_cuda_equal_cpu) {
 #endif  // GOOGLE_CUDA
 
 #if TENSORFLOW_USE_ROCM
-TEST_F(TestEnvMatA, prod_gpu_rocm) {
+TEST_F(TestEnvMatA, prod_gpu) {
   EXPECT_EQ(nlist_r_cpy.size(), nloc);
   int tot_nnei = 0;
   int max_nbor_size = 0;
@@ -835,10 +835,10 @@ TEST_F(TestEnvMatA, prod_gpu_rocm) {
   deepmd::convert_nlist_gpu_device(gpu_inlist, inlist, memory_dev,
                                    max_nbor_size);
 
-  deepmd::prod_env_mat_a_gpu_rocm(
-      em_dev, em_deriv_dev, rij_dev, nlist_dev, posi_cpy_dev, atype_cpy_dev,
-      gpu_inlist, array_int_dev, array_longlong_dev, max_nbor_size, avg_dev,
-      std_dev, nloc, nall, rc, rc_smth, sec_a);
+  deepmd::prod_env_mat_a_gpu(em_dev, em_deriv_dev, rij_dev, nlist_dev,
+                             posi_cpy_dev, atype_cpy_dev, gpu_inlist,
+                             array_int_dev, array_longlong_dev, max_nbor_size,
+                             avg_dev, std_dev, nloc, nall, rc, rc_smth, sec_a);
   deepmd::memcpy_device_to_host(em_dev, em);
   deepmd::delete_device_memory(em_dev);
   deepmd::delete_device_memory(em_deriv_dev);
@@ -863,7 +863,7 @@ TEST_F(TestEnvMatA, prod_gpu_rocm) {
   }
 }
 
-TEST_F(TestEnvMatA, prod_gpu_rocm_equal_cpu) {
+TEST_F(TestEnvMatA, prod_gpu_equal_cpu) {
   EXPECT_EQ(nlist_r_cpy.size(), nloc);
   int tot_nnei = 0;
   int max_nbor_size = 0;
@@ -914,10 +914,10 @@ TEST_F(TestEnvMatA, prod_gpu_rocm_equal_cpu) {
   deepmd::convert_nlist_gpu_device(gpu_inlist, inlist, memory_dev,
                                    max_nbor_size);
 
-  deepmd::prod_env_mat_a_gpu_rocm(
-      em_dev, em_deriv_dev, rij_dev, nlist_dev, posi_cpy_dev, atype_cpy_dev,
-      gpu_inlist, array_int_dev, array_longlong_dev, max_nbor_size, avg_dev,
-      std_dev, nloc, nall, rc, rc_smth, sec_a);
+  deepmd::prod_env_mat_a_gpu(em_dev, em_deriv_dev, rij_dev, nlist_dev,
+                             posi_cpy_dev, atype_cpy_dev, gpu_inlist,
+                             array_int_dev, array_longlong_dev, max_nbor_size,
+                             avg_dev, std_dev, nloc, nall, rc, rc_smth, sec_a);
   deepmd::memcpy_device_to_host(em_dev, em);
   deepmd::memcpy_device_to_host(em_deriv_dev, em_deriv);
   deepmd::memcpy_device_to_host(rij_dev, rij);
diff --git a/source/lib/tests/test_env_mat_a_mix.cc b/source/lib/tests/test_env_mat_a_mix.cc
index 19d4ea1fd8..f415317929 100644
--- a/source/lib/tests/test_env_mat_a_mix.cc
+++ b/source/lib/tests/test_env_mat_a_mix.cc
@@ -629,7 +629,7 @@ TEST_F(TestEnvMatAMix, prod_cpu_equal_cpu) {
 }
 
 #if GOOGLE_CUDA
-TEST_F(TestEnvMatAMix, prod_gpu_cuda) {
+TEST_F(TestEnvMatAMix, prod_gpu) {
   EXPECT_EQ(nlist_r_cpy.size(), nloc);
   int tot_nnei = 0;
   int max_nbor_size = 0;
@@ -688,7 +688,7 @@ TEST_F(TestEnvMatAMix, prod_gpu_cuda) {
   deepmd::convert_nlist_gpu_device(gpu_inlist, inlist, memory_dev,
                                    max_nbor_size);
 
-  deepmd::prod_env_mat_a_gpu_cuda(
+  deepmd::prod_env_mat_a_gpu(
       em_dev, em_deriv_dev, rij_dev, nlist_dev, posi_cpy_dev, atype_dev,
       gpu_inlist, array_int_dev, array_longlong_dev, max_nbor_size, avg_dev,
       std_dev, nloc, nall, rc, rc_smth, sec_a, f_atype_cpy_dev);
@@ -729,7 +729,7 @@ TEST_F(TestEnvMatAMix, prod_gpu_cuda) {
   delete[] nmask;
 }
 
-TEST_F(TestEnvMatAMix, prod_gpu_cuda_equal_cpu) {
+TEST_F(TestEnvMatAMix, prod_gpu_equal_cpu) {
   EXPECT_EQ(nlist_r_cpy.size(), nloc);
   int tot_nnei = 0;
   int max_nbor_size = 0;
@@ -781,7 +781,7 @@ TEST_F(TestEnvMatAMix, prod_gpu_cuda_equal_cpu) {
   deepmd::convert_nlist_gpu_device(gpu_inlist, inlist, memory_dev,
                                    max_nbor_size);
 
-  deepmd::prod_env_mat_a_gpu_cuda(
+  deepmd::prod_env_mat_a_gpu(
       em_dev, em_deriv_dev, rij_dev, nlist_dev, posi_cpy_dev, atype_dev,
       gpu_inlist, array_int_dev, array_longlong_dev, max_nbor_size, avg_dev,
       std_dev, nloc, nall, rc, rc_smth, sec_a, f_atype_cpy_dev);
@@ -847,7 +847,7 @@ TEST_F(TestEnvMatAMix, prod_gpu_cuda_equal_cpu) {
 #endif  // GOOGLE_CUDA
 
 #if TENSORFLOW_USE_ROCM
-TEST_F(TestEnvMatAMix, prod_gpu_rocm) {
+TEST_F(TestEnvMatAMix, prod_gpu) {
   EXPECT_EQ(nlist_r_cpy.size(), nloc);
   int tot_nnei = 0;
   int max_nbor_size = 0;
@@ -906,13 +906,13 @@ TEST_F(TestEnvMatAMix, prod_gpu_rocm) {
   deepmd::convert_nlist_gpu_device(gpu_inlist, inlist, memory_dev,
                                    max_nbor_size);
 
-  deepmd::prod_env_mat_a_gpu_rocm(
+  deepmd::prod_env_mat_a_gpu(
       em_dev, em_deriv_dev, rij_dev, nlist_dev, posi_cpy_dev, atype_dev,
       gpu_inlist, array_int_dev, array_longlong_dev, max_nbor_size, avg_dev,
       std_dev, nloc, nall, rc, rc_smth, sec_a, f_atype_cpy_dev);
 
-  deepmd::use_nei_info_gpu_rocm(nlist_dev, ntype_dev, nmask_dev, atype_dev,
-                                mapping_dev, nloc, nnei, ntypes, true);
+  deepmd::use_nei_info_gpu(nlist_dev, ntype_dev, nmask_dev, atype_dev,
+                           mapping_dev, nloc, nnei, ntypes, true);
   deepmd::memcpy_device_to_host(em_dev, em);
   deepmd::memcpy_device_to_host(ntype_dev, ntype);
   deepmd::memcpy_device_to_host(nmask_dev, nmask, nloc * nnei);
@@ -947,7 +947,7 @@ TEST_F(TestEnvMatAMix, prod_gpu_rocm) {
   delete[] nmask;
 }
 
-TEST_F(TestEnvMatAMix, prod_gpu_rocm_equal_cpu) {
+TEST_F(TestEnvMatAMix, prod_gpu_equal_cpu) {
   EXPECT_EQ(nlist_r_cpy.size(), nloc);
   int tot_nnei = 0;
   int max_nbor_size = 0;
@@ -999,7 +999,7 @@ TEST_F(TestEnvMatAMix, prod_gpu_rocm_equal_cpu) {
   deepmd::convert_nlist_gpu_device(gpu_inlist, inlist, memory_dev,
                                    max_nbor_size);
 
-  deepmd::prod_env_mat_a_gpu_rocm(
+  deepmd::prod_env_mat_a_gpu(
       em_dev, em_deriv_dev, rij_dev, nlist_dev, posi_cpy_dev, atype_dev,
       gpu_inlist, array_int_dev, array_longlong_dev, max_nbor_size, avg_dev,
       std_dev, nloc, nall, rc, rc_smth, sec_a, f_atype_cpy_dev);
diff --git a/source/lib/tests/test_env_mat_r.cc b/source/lib/tests/test_env_mat_r.cc
index 258aa1000d..f20a8cbbc3 100644
--- a/source/lib/tests/test_env_mat_r.cc
+++ b/source/lib/tests/test_env_mat_r.cc
@@ -359,7 +359,7 @@ TEST_F(TestEnvMatR, prod_cpu_equal_cpu) {
 }
 
 #if GOOGLE_CUDA
-TEST_F(TestEnvMatR, prod_gpu_cuda) {
+TEST_F(TestEnvMatR, prod_gpu) {
   EXPECT_EQ(nlist_r_cpy.size(), nloc);
   int tot_nnei = 0;
   int max_nbor_size = 0;
@@ -410,10 +410,10 @@ TEST_F(TestEnvMatR, prod_gpu_cuda) {
   deepmd::convert_nlist_gpu_device(gpu_inlist, inlist, memory_dev,
                                    max_nbor_size);
 
-  deepmd::prod_env_mat_r_gpu_cuda(
-      em_dev, em_deriv_dev, rij_dev, nlist_dev, posi_cpy_dev, atype_cpy_dev,
-      gpu_inlist, array_int_dev, array_longlong_dev, max_nbor_size, avg_dev,
-      std_dev, nloc, nall, rc, rc_smth, sec_a);
+  deepmd::prod_env_mat_r_gpu(em_dev, em_deriv_dev, rij_dev, nlist_dev,
+                             posi_cpy_dev, atype_cpy_dev, gpu_inlist,
+                             array_int_dev, array_longlong_dev, max_nbor_size,
+                             avg_dev, std_dev, nloc, nall, rc, rc_smth, sec_a);
   deepmd::memcpy_device_to_host(em_dev, em);
   deepmd::delete_device_memory(em_dev);
   deepmd::delete_device_memory(em_deriv_dev);
@@ -438,7 +438,7 @@ TEST_F(TestEnvMatR, prod_gpu_cuda) {
   }
 }
 
-TEST_F(TestEnvMatR, prod_gpu_cuda_equal_cpu) {
+TEST_F(TestEnvMatR, prod_gpu_equal_cpu) {
   EXPECT_EQ(nlist_r_cpy.size(), nloc);
   int tot_nnei = 0;
   int max_nbor_size = 0;
@@ -489,10 +489,10 @@ TEST_F(TestEnvMatR, prod_gpu_cuda_equal_cpu) {
   deepmd::convert_nlist_gpu_device(gpu_inlist, inlist, memory_dev,
                                    max_nbor_size);
 
-  deepmd::prod_env_mat_r_gpu_cuda(
-      em_dev, em_deriv_dev, rij_dev, nlist_dev, posi_cpy_dev, atype_cpy_dev,
-      gpu_inlist, array_int_dev, array_longlong_dev, max_nbor_size, avg_dev,
-      std_dev, nloc, nall, rc, rc_smth, sec_a);
+  deepmd::prod_env_mat_r_gpu(em_dev, em_deriv_dev, rij_dev, nlist_dev,
+                             posi_cpy_dev, atype_cpy_dev, gpu_inlist,
+                             array_int_dev, array_longlong_dev, max_nbor_size,
+                             avg_dev, std_dev, nloc, nall, rc, rc_smth, sec_a);
   deepmd::memcpy_device_to_host(em_dev, em);
   deepmd::memcpy_device_to_host(em_deriv_dev, em_deriv);
   deepmd::memcpy_device_to_host(rij_dev, rij);
@@ -544,7 +544,7 @@ TEST_F(TestEnvMatR, prod_gpu_cuda_equal_cpu) {
 #endif  // GOOGLE_CUDA
 
 #if TENSORFLOW_USE_ROCM
-TEST_F(TestEnvMatR, prod_gpu_rocm) {
+TEST_F(TestEnvMatR, prod_gpu) {
   EXPECT_EQ(nlist_r_cpy.size(), nloc);
   int tot_nnei = 0;
   int max_nbor_size = 0;
@@ -595,10 +595,10 @@ TEST_F(TestEnvMatR, prod_gpu_rocm) {
   deepmd::convert_nlist_gpu_device(gpu_inlist, inlist, memory_dev,
                                    max_nbor_size);
 
-  deepmd::prod_env_mat_r_gpu_rocm(
-      em_dev, em_deriv_dev, rij_dev, nlist_dev, posi_cpy_dev, atype_cpy_dev,
-      gpu_inlist, array_int_dev, array_longlong_dev, max_nbor_size, avg_dev,
-      std_dev, nloc, nall, rc, rc_smth, sec_a);
+  deepmd::prod_env_mat_r_gpu(em_dev, em_deriv_dev, rij_dev, nlist_dev,
+                             posi_cpy_dev, atype_cpy_dev, gpu_inlist,
+                             array_int_dev, array_longlong_dev, max_nbor_size,
+                             avg_dev, std_dev, nloc, nall, rc, rc_smth, sec_a);
   deepmd::memcpy_device_to_host(em_dev, em);
   deepmd::delete_device_memory(em_dev);
   deepmd::delete_device_memory(em_deriv_dev);
@@ -623,7 +623,7 @@ TEST_F(TestEnvMatR, prod_gpu_rocm) {
   }
 }
 
-TEST_F(TestEnvMatR, prod_gpu_rocm_equal_cpu) {
+TEST_F(TestEnvMatR, prod_gpu_equal_cpu) {
   EXPECT_EQ(nlist_r_cpy.size(), nloc);
   int tot_nnei = 0;
   int max_nbor_size = 0;
@@ -674,10 +674,10 @@ TEST_F(TestEnvMatR, prod_gpu_rocm_equal_cpu) {
   deepmd::convert_nlist_gpu_device(gpu_inlist, inlist, memory_dev,
                                    max_nbor_size);
 
-  deepmd::prod_env_mat_r_gpu_rocm(
-      em_dev, em_deriv_dev, rij_dev, nlist_dev, posi_cpy_dev, atype_cpy_dev,
-      gpu_inlist, array_int_dev, array_longlong_dev, max_nbor_size, avg_dev,
-      std_dev, nloc, nall, rc, rc_smth, sec_a);
+  deepmd::prod_env_mat_r_gpu(em_dev, em_deriv_dev, rij_dev, nlist_dev,
+                             posi_cpy_dev, atype_cpy_dev, gpu_inlist,
+                             array_int_dev, array_longlong_dev, max_nbor_size,
+                             avg_dev, std_dev, nloc, nall, rc, rc_smth, sec_a);
   deepmd::memcpy_device_to_host(em_dev, em);
   deepmd::memcpy_device_to_host(em_deriv_dev, em_deriv);
   deepmd::memcpy_device_to_host(rij_dev, rij);
diff --git a/source/lib/tests/test_fmt_nlist.cc b/source/lib/tests/test_fmt_nlist.cc
index 6d9a59cd36..1d995f8fce 100644
--- a/source/lib/tests/test_fmt_nlist.cc
+++ b/source/lib/tests/test_fmt_nlist.cc
@@ -314,7 +314,7 @@ TEST_F(TestFormatNlistShortSel, cpu) {
 }
 
 #if GOOGLE_CUDA
-TEST_F(TestFormatNlist, gpu_cuda) {
+TEST_F(TestFormatNlist, gpu) {
   std::vector<std::vector<int>> nlist_a_0, nlist_r_0;
   build_nlist(nlist_a_0, nlist_r_0, posi_cpy, nloc, rc, rc, nat_stt, ncell,
               ext_stt, ext_end, region, ncell);
@@ -357,9 +357,9 @@ TEST_F(TestFormatNlist, gpu_cuda) {
   deepmd::convert_nlist_gpu_device(gpu_inlist, in_nlist, memory_dev,
                                    max_nbor_size);
   // format nlist
-  format_nbor_list_gpu_cuda(nlist_dev, posi_cpy_dev, atype_cpy_dev, gpu_inlist,
-                            array_int_dev, array_longlong_dev, max_nbor_size,
-                            nloc, nall, rc, sec_a);
+  format_nbor_list_gpu(nlist_dev, posi_cpy_dev, atype_cpy_dev, gpu_inlist,
+                       array_int_dev, array_longlong_dev, max_nbor_size, nloc,
+                       nall, rc, sec_a);
   deepmd::memcpy_device_to_host(nlist_dev, nlist);
   deepmd::delete_device_memory(nlist_dev);
   deepmd::delete_device_memory(posi_cpy_dev);
@@ -374,7 +374,7 @@ TEST_F(TestFormatNlist, gpu_cuda) {
   }
 }
 
-TEST_F(TestFormatNlistShortSel, gpu_cuda) {
+TEST_F(TestFormatNlistShortSel, gpu) {
   std::vector<std::vector<int>> nlist_a_0, nlist_r_0;
   build_nlist(nlist_a_0, nlist_r_0, posi_cpy, nloc, rc, rc, nat_stt, ncell,
               ext_stt, ext_end, region, ncell);
@@ -417,9 +417,9 @@ TEST_F(TestFormatNlistShortSel, gpu_cuda) {
   deepmd::convert_nlist_gpu_device(gpu_inlist, in_nlist, memory_dev,
                                    max_nbor_size);
   // format nlist
-  format_nbor_list_gpu_cuda(nlist_dev, posi_cpy_dev, atype_cpy_dev, gpu_inlist,
-                            array_int_dev, array_longlong_dev, max_nbor_size,
-                            nloc, nall, rc, sec_a);
+  format_nbor_list_gpu(nlist_dev, posi_cpy_dev, atype_cpy_dev, gpu_inlist,
+                       array_int_dev, array_longlong_dev, max_nbor_size, nloc,
+                       nall, rc, sec_a);
   deepmd::memcpy_device_to_host(nlist_dev, nlist);
   deepmd::delete_device_memory(nlist_dev);
   deepmd::delete_device_memory(posi_cpy_dev);
@@ -434,7 +434,7 @@ TEST_F(TestFormatNlistShortSel, gpu_cuda) {
   }
 }
 
-TEST_F(TestEncodingDecodingNborInfo, valid_nbor_info_gpu_cuda) {
+TEST_F(TestEncodingDecodingNborInfo, valid_nbor_info_gpu) {
   int *valid_type_dev = NULL, *valid_index_dev = NULL, *out_type_dev = NULL,
       *out_index_dev = NULL;
   double* valid_dist_dev = NULL;
@@ -449,7 +449,7 @@ TEST_F(TestEncodingDecodingNborInfo, valid_nbor_info_gpu_cuda) {
   deepmd::malloc_device_memory_sync(out_index_dev, out_index);
   deepmd::malloc_device_memory_sync(key_dev, key);
 
-  deepmd::test_encoding_decoding_nbor_info_gpu_cuda(
+  deepmd::test_encoding_decoding_nbor_info_gpu(
       key_dev, out_type_dev, out_index_dev, valid_type_dev, valid_dist_dev,
       valid_index_dev, size_of_array);
 
@@ -470,7 +470,7 @@ TEST_F(TestEncodingDecodingNborInfo, valid_nbor_info_gpu_cuda) {
   }
 }
 
-// TEST_F(TestEncodingDecodingNborInfo, invalid_nbor_info_gpu_cuda)
+// TEST_F(TestEncodingDecodingNborInfo, invalid_nbor_info_gpu)
 // {
 //   int * invalid_type_dev = NULL, * invalid_index_dev = NULL, * out_type_dev =
 //   NULL, * out_index_dev = NULL; double * invalid_dist_dev = NULL; uint_64 *
@@ -485,7 +485,7 @@ TEST_F(TestEncodingDecodingNborInfo, valid_nbor_info_gpu_cuda) {
 //   deepmd::malloc_device_memory_sync(key_dev, key);
 
 //   EXPECT_EQ(cudaGetLastError() == cudaSuccess && cudaDeviceSynchronize() ==
-//   cudaSuccess, true); deepmd::test_encoding_decoding_nbor_info_gpu_cuda(
+//   cudaSuccess, true); deepmd::test_encoding_decoding_nbor_info_gpu(
 //       key_dev, out_type_dev, out_index_dev,
 //       invalid_type_dev, invalid_dist_dev, invalid_index_dev, size_of_array
 //   );
@@ -504,7 +504,7 @@ TEST_F(TestEncodingDecodingNborInfo, valid_nbor_info_gpu_cuda) {
 #endif  // GOOGLE_CUDA
 
 #if TENSORFLOW_USE_ROCM
-TEST_F(TestFormatNlist, gpu_rocm) {
+TEST_F(TestFormatNlist, gpu) {
   std::vector<std::vector<int>> nlist_a_0, nlist_r_0;
   build_nlist(nlist_a_0, nlist_r_0, posi_cpy, nloc, rc, rc, nat_stt, ncell,
               ext_stt, ext_end, region, ncell);
@@ -547,9 +547,9 @@ TEST_F(TestFormatNlist, gpu_rocm) {
   deepmd::convert_nlist_gpu_device(gpu_inlist, in_nlist, memory_dev,
                                    max_nbor_size);
   // format nlist
-  format_nbor_list_gpu_rocm(nlist_dev, posi_cpy_dev, atype_cpy_dev, gpu_inlist,
-                            array_int_dev, array_longlong_dev, max_nbor_size,
-                            nloc, nall, rc, sec_a);
+  format_nbor_list_gpu(nlist_dev, posi_cpy_dev, atype_cpy_dev, gpu_inlist,
+                       array_int_dev, array_longlong_dev, max_nbor_size, nloc,
+                       nall, rc, sec_a);
   deepmd::memcpy_device_to_host(nlist_dev, nlist);
   deepmd::delete_device_memory(nlist_dev);
   deepmd::delete_device_memory(posi_cpy_dev);
@@ -564,7 +564,7 @@ TEST_F(TestFormatNlist, gpu_rocm) {
   }
 }
 
-TEST_F(TestFormatNlistShortSel, gpu_rocm) {
+TEST_F(TestFormatNlistShortSel, gpu) {
   std::vector<std::vector<int>> nlist_a_0, nlist_r_0;
   build_nlist(nlist_a_0, nlist_r_0, posi_cpy, nloc, rc, rc, nat_stt, ncell,
               ext_stt, ext_end, region, ncell);
@@ -607,9 +607,9 @@ TEST_F(TestFormatNlistShortSel, gpu_rocm) {
   deepmd::convert_nlist_gpu_device(gpu_inlist, in_nlist, memory_dev,
                                    max_nbor_size);
   // format nlist
-  format_nbor_list_gpu_rocm(nlist_dev, posi_cpy_dev, atype_cpy_dev, gpu_inlist,
-                            array_int_dev, array_longlong_dev, max_nbor_size,
-                            nloc, nall, rc, sec_a);
+  format_nbor_list_gpu(nlist_dev, posi_cpy_dev, atype_cpy_dev, gpu_inlist,
+                       array_int_dev, array_longlong_dev, max_nbor_size, nloc,
+                       nall, rc, sec_a);
   deepmd::memcpy_device_to_host(nlist_dev, nlist);
   deepmd::delete_device_memory(nlist_dev);
   deepmd::delete_device_memory(posi_cpy_dev);
@@ -624,7 +624,7 @@ TEST_F(TestFormatNlistShortSel, gpu_rocm) {
   }
 }
 
-TEST_F(TestEncodingDecodingNborInfo, valid_nbor_info_gpu_rocm) {
+TEST_F(TestEncodingDecodingNborInfo, valid_nbor_info_gpu) {
   int *valid_type_dev = NULL, *valid_index_dev = NULL, *out_type_dev = NULL,
       *out_index_dev = NULL;
   double* valid_dist_dev = NULL;
@@ -639,7 +639,7 @@ TEST_F(TestEncodingDecodingNborInfo, valid_nbor_info_gpu_rocm) {
   deepmd::malloc_device_memory_sync(out_index_dev, out_index);
   deepmd::malloc_device_memory_sync(key_dev, key);
 
-  deepmd::test_encoding_decoding_nbor_info_gpu_rocm(
+  deepmd::test_encoding_decoding_nbor_info_gpu(
       key_dev, out_type_dev, out_index_dev, valid_type_dev, valid_dist_dev,
       valid_index_dev, size_of_array);
 
diff --git a/source/lib/tests/test_gelu.cc b/source/lib/tests/test_gelu.cc
index cdfe227809..e680567b9c 100644
--- a/source/lib/tests/test_gelu.cc
+++ b/source/lib/tests/test_gelu.cc
@@ -146,13 +146,13 @@ TEST_F(TestGelu, gelu_grad_grad_cpu) {
 }
 
 #if GOOGLE_CUDA
-TEST_F(TestGelu, gelu_gpu_cuda) {
+TEST_F(TestGelu, gelu_gpu) {
   std::vector<double> gelu(nloc, 0.0);
 
   double *gelu_dev = NULL, *xx_dev = NULL;
   deepmd::malloc_device_memory_sync(gelu_dev, gelu);
   deepmd::malloc_device_memory_sync(xx_dev, xx);
-  deepmd::gelu_gpu_cuda<double>(gelu_dev, xx_dev, nloc);
+  deepmd::gelu_gpu<double>(gelu_dev, xx_dev, nloc);
   deepmd::memcpy_device_to_host(gelu_dev, gelu);
   deepmd::delete_device_memory(gelu_dev);
   deepmd::delete_device_memory(xx_dev);
@@ -164,7 +164,7 @@ TEST_F(TestGelu, gelu_gpu_cuda) {
   }
 }
 
-TEST_F(TestGelu, gelu_grad_gpu_cuda) {
+TEST_F(TestGelu, gelu_grad_gpu) {
   std::vector<double> dy(100, 1.0);
   std::vector<double> gelu_grad(nloc, 0.0);
 
@@ -172,7 +172,7 @@ TEST_F(TestGelu, gelu_grad_gpu_cuda) {
   deepmd::malloc_device_memory_sync(gelu_grad_dev, gelu_grad);
   deepmd::malloc_device_memory_sync(xx_dev, xx);
   deepmd::malloc_device_memory_sync(dy_dev, dy);
-  deepmd::gelu_grad_gpu_cuda<double>(gelu_grad_dev, xx_dev, dy_dev, nloc);
+  deepmd::gelu_grad_gpu<double>(gelu_grad_dev, xx_dev, dy_dev, nloc);
   deepmd::memcpy_device_to_host(gelu_grad_dev, gelu_grad);
   deepmd::delete_device_memory(gelu_grad_dev);
   deepmd::delete_device_memory(xx_dev);
@@ -185,7 +185,7 @@ TEST_F(TestGelu, gelu_grad_gpu_cuda) {
   }
 }
 
-TEST_F(TestGelu, gelu_grad_grad_gpu_cuda) {
+TEST_F(TestGelu, gelu_grad_grad_gpu) {
   std::vector<double> dy(100, 1.0);
   std::vector<double> dy_2(100, 1.0);
   std::vector<double> gelu_grad_grad(nloc, 0.0);
@@ -196,8 +196,8 @@ TEST_F(TestGelu, gelu_grad_grad_gpu_cuda) {
   deepmd::malloc_device_memory_sync(xx_dev, xx);
   deepmd::malloc_device_memory_sync(dy_dev, dy);
   deepmd::malloc_device_memory_sync(dy_2_dev, dy_2);
-  deepmd::gelu_grad_grad_gpu_cuda<double>(gelu_grad_grad_dev, xx_dev, dy_dev,
-                                          dy_2_dev, nloc);
+  deepmd::gelu_grad_grad_gpu<double>(gelu_grad_grad_dev, xx_dev, dy_dev,
+                                     dy_2_dev, nloc);
   deepmd::memcpy_device_to_host(gelu_grad_grad_dev, gelu_grad_grad);
   deepmd::delete_device_memory(gelu_grad_grad_dev);
   deepmd::delete_device_memory(xx_dev);
@@ -213,13 +213,13 @@ TEST_F(TestGelu, gelu_grad_grad_gpu_cuda) {
 #endif  // GOOGLE_CUDA
 
 #if TENSORFLOW_USE_ROCM
-TEST_F(TestGelu, gelu_gpu_rocm) {
+TEST_F(TestGelu, gelu_gpu) {
   std::vector<double> gelu(nloc, 0.0);
 
   double *gelu_dev = NULL, *xx_dev = NULL;
   deepmd::malloc_device_memory_sync(gelu_dev, gelu);
   deepmd::malloc_device_memory_sync(xx_dev, xx);
-  deepmd::gelu_gpu_rocm<double>(gelu_dev, xx_dev, nloc);
+  deepmd::gelu_gpu<double>(gelu_dev, xx_dev, nloc);
   deepmd::memcpy_device_to_host(gelu_dev, gelu);
   deepmd::delete_device_memory(gelu_dev);
   deepmd::delete_device_memory(xx_dev);
@@ -231,7 +231,7 @@ TEST_F(TestGelu, gelu_gpu_rocm) {
   }
 }
 
-TEST_F(TestGelu, gelu_grad_gpu_rocm) {
+TEST_F(TestGelu, gelu_grad_gpu) {
   std::vector<double> dy(100, 1.0);
   std::vector<double> gelu_grad(nloc, 0.0);
 
@@ -239,7 +239,7 @@ TEST_F(TestGelu, gelu_grad_gpu_rocm) {
   deepmd::malloc_device_memory_sync(gelu_grad_dev, gelu_grad);
   deepmd::malloc_device_memory_sync(xx_dev, xx);
   deepmd::malloc_device_memory_sync(dy_dev, dy);
-  deepmd::gelu_grad_gpu_rocm<double>(gelu_grad_dev, xx_dev, dy_dev, nloc);
+  deepmd::gelu_grad_gpu<double>(gelu_grad_dev, xx_dev, dy_dev, nloc);
   deepmd::memcpy_device_to_host(gelu_grad_dev, gelu_grad);
   deepmd::delete_device_memory(gelu_grad_dev);
   deepmd::delete_device_memory(xx_dev);
@@ -252,7 +252,7 @@ TEST_F(TestGelu, gelu_grad_gpu_rocm) {
   }
 }
 
-TEST_F(TestGelu, gelu_grad_grad_gpu_rocm) {
+TEST_F(TestGelu, gelu_grad_grad_gpu) {
   std::vector<double> dy(100, 1.0);
   std::vector<double> dy_2(100, 1.0);
   std::vector<double> gelu_grad_grad(nloc, 0.0);
@@ -263,8 +263,8 @@ TEST_F(TestGelu, gelu_grad_grad_gpu_rocm) {
   deepmd::malloc_device_memory_sync(xx_dev, xx);
   deepmd::malloc_device_memory_sync(dy_dev, dy);
   deepmd::malloc_device_memory_sync(dy_2_dev, dy_2);
-  deepmd::gelu_grad_grad_gpu_rocm<double>(gelu_grad_grad_dev, xx_dev, dy_dev,
-                                          dy_2_dev, nloc);
+  deepmd::gelu_grad_grad_gpu<double>(gelu_grad_grad_dev, xx_dev, dy_dev,
+                                     dy_2_dev, nloc);
   deepmd::memcpy_device_to_host(gelu_grad_grad_dev, gelu_grad_grad);
   deepmd::delete_device_memory(gelu_grad_grad_dev);
   deepmd::delete_device_memory(xx_dev);
diff --git a/source/lib/tests/test_neighbor_list.cc b/source/lib/tests/test_neighbor_list.cc
index 3e85b0ee73..985f69b3f4 100644
--- a/source/lib/tests/test_neighbor_list.cc
+++ b/source/lib/tests/test_neighbor_list.cc
@@ -253,9 +253,8 @@ TEST_F(TestNeighborList, gpu) {
   deepmd::InputNlist nlist_dev(nloc, ilist_dev, numneigh_dev, firstneigh_dev);
 
   int max_list_size;
-  int ret =
-      deepmd::build_nlist_gpu_rocm(nlist_dev, &max_list_size, nlist_data_dev,
-                                   c_cpy_dev, nloc, nall, mem_size, rc);
+  int ret = deepmd::build_nlist_gpu(nlist_dev, &max_list_size, nlist_data_dev,
+                                    c_cpy_dev, nloc, nall, mem_size, rc);
 
   EXPECT_EQ(ret, 0);
   int* ilist = new int[nloc];
@@ -314,9 +313,8 @@ TEST_F(TestNeighborList, gpu_lessmem) {
   deepmd::InputNlist nlist_dev(nloc, ilist_dev, numneigh_dev, firstneigh_dev);
 
   int max_list_size;
-  int ret =
-      deepmd::build_nlist_gpu_rocm(nlist_dev, &max_list_size, nlist_data_dev,
-                                   c_cpy_dev, nloc, nall, mem_size, rc);
+  int ret = deepmd::build_nlist_gpu(nlist_dev, &max_list_size, nlist_data_dev,
+                                    c_cpy_dev, nloc, nall, mem_size, rc);
 
   EXPECT_EQ(ret, 1);
   deepmd::delete_device_memory(nlist_data_dev);
diff --git a/source/lib/tests/test_prod_force_a.cc b/source/lib/tests/test_prod_force_a.cc
index 2c3483f845..b51c97e421 100644
--- a/source/lib/tests/test_prod_force_a.cc
+++ b/source/lib/tests/test_prod_force_a.cc
@@ -134,7 +134,7 @@ TEST_F(TestProdForceA, cpu) {
 }
 
 #if GOOGLE_CUDA
-TEST_F(TestProdForceA, gpu_cuda) {
+TEST_F(TestProdForceA, gpu) {
   std::vector<double> force(nframes * nall * 3, 0.0);
   int n_a_sel = nnei;
 
@@ -146,8 +146,8 @@ TEST_F(TestProdForceA, gpu_cuda) {
   deepmd::malloc_device_memory_sync(net_deriv_dev, net_deriv);
   deepmd::malloc_device_memory_sync(env_deriv_dev, env_deriv);
 
-  deepmd::prod_force_a_gpu_cuda<double>(force_dev, net_deriv_dev, env_deriv_dev,
-                                        nlist_dev, nloc, nall, nnei, nframes);
+  deepmd::prod_force_a_gpu<double>(force_dev, net_deriv_dev, env_deriv_dev,
+                                   nlist_dev, nloc, nall, nnei, nframes);
 
   deepmd::memcpy_device_to_host(force_dev, force);
   deepmd::delete_device_memory(nlist_dev);
@@ -164,7 +164,7 @@ TEST_F(TestProdForceA, gpu_cuda) {
 #endif  // GOOGLE_CUDA
 
 #if TENSORFLOW_USE_ROCM
-TEST_F(TestProdForceA, gpu_rocm) {
+TEST_F(TestProdForceA, gpu) {
   std::vector<double> force(nframes * nall * 3, 0.0);
   int n_a_sel = nnei;
 
@@ -176,8 +176,8 @@ TEST_F(TestProdForceA, gpu_rocm) {
   deepmd::malloc_device_memory_sync(net_deriv_dev, net_deriv);
   deepmd::malloc_device_memory_sync(env_deriv_dev, env_deriv);
 
-  deepmd::prod_force_a_gpu_rocm<double>(force_dev, net_deriv_dev, env_deriv_dev,
-                                        nlist_dev, nloc, nall, nnei, nframes);
+  deepmd::prod_force_a_gpu<double>(force_dev, net_deriv_dev, env_deriv_dev,
+                                   nlist_dev, nloc, nall, nnei, nframes);
 
   deepmd::memcpy_device_to_host(force_dev, force);
   deepmd::delete_device_memory(nlist_dev);
diff --git a/source/lib/tests/test_prod_force_grad_a.cc b/source/lib/tests/test_prod_force_grad_a.cc
index 29cac24d1d..4694c4ac3b 100644
--- a/source/lib/tests/test_prod_force_grad_a.cc
+++ b/source/lib/tests/test_prod_force_grad_a.cc
@@ -153,8 +153,8 @@ TEST_F(TestProdForceGradA, gpu) {
   deepmd::malloc_device_memory_sync(grad_dev, grad);
   deepmd::malloc_device_memory_sync(env_deriv_dev, env_deriv);
   deepmd::malloc_device_memory(grad_net_dev, nframes * nloc * ndescrpt);
-  deepmd::prod_force_grad_a_gpu_cuda<double>(
-      grad_net_dev, grad_dev, env_deriv_dev, nlist_dev, nloc, nnei, nframes);
+  deepmd::prod_force_grad_a_gpu<double>(grad_net_dev, grad_dev, env_deriv_dev,
+                                        nlist_dev, nloc, nnei, nframes);
   deepmd::memcpy_device_to_host(grad_net_dev, grad_net);
   deepmd::delete_device_memory(nlist_dev);
   deepmd::delete_device_memory(grad_dev);
@@ -183,8 +183,8 @@ TEST_F(TestProdForceGradA, gpu) {
   deepmd::malloc_device_memory_sync(grad_dev, grad);
   deepmd::malloc_device_memory_sync(env_deriv_dev, env_deriv);
   deepmd::malloc_device_memory(grad_net_dev, nframes * nloc * ndescrpt);
-  deepmd::prod_force_grad_a_gpu_rocm<double>(
-      grad_net_dev, grad_dev, env_deriv_dev, nlist_dev, nloc, nnei, nframes);
+  deepmd::prod_force_grad_a_gpu<double>(grad_net_dev, grad_dev, env_deriv_dev,
+                                        nlist_dev, nloc, nnei, nframes);
   deepmd::memcpy_device_to_host(grad_net_dev, grad_net);
   deepmd::delete_device_memory(nlist_dev);
   deepmd::delete_device_memory(grad_dev);
diff --git a/source/lib/tests/test_prod_force_grad_r.cc b/source/lib/tests/test_prod_force_grad_r.cc
index 2674e1607e..31f8b64982 100644
--- a/source/lib/tests/test_prod_force_grad_r.cc
+++ b/source/lib/tests/test_prod_force_grad_r.cc
@@ -127,8 +127,8 @@ TEST_F(TestProdForceGradR, gpu) {
   deepmd::malloc_device_memory_sync(grad_dev, grad);
   deepmd::malloc_device_memory_sync(env_deriv_dev, env_deriv);
   deepmd::malloc_device_memory(grad_net_dev, nframes * nloc * ndescrpt);
-  deepmd::prod_force_grad_r_gpu_cuda<double>(
-      grad_net_dev, grad_dev, env_deriv_dev, nlist_dev, nloc, nnei, nframes);
+  deepmd::prod_force_grad_r_gpu<double>(grad_net_dev, grad_dev, env_deriv_dev,
+                                        nlist_dev, nloc, nnei, nframes);
   deepmd::memcpy_device_to_host(grad_net_dev, grad_net);
   deepmd::delete_device_memory(nlist_dev);
   deepmd::delete_device_memory(grad_dev);
@@ -157,8 +157,8 @@ TEST_F(TestProdForceGradR, gpu) {
   deepmd::malloc_device_memory_sync(grad_dev, grad);
   deepmd::malloc_device_memory_sync(env_deriv_dev, env_deriv);
   deepmd::malloc_device_memory(grad_net_dev, nframes * nloc * ndescrpt);
-  deepmd::prod_force_grad_r_gpu_rocm<double>(
-      grad_net_dev, grad_dev, env_deriv_dev, nlist_dev, nloc, nnei, nframes);
+  deepmd::prod_force_grad_r_gpu<double>(grad_net_dev, grad_dev, env_deriv_dev,
+                                        nlist_dev, nloc, nnei, nframes);
   deepmd::memcpy_device_to_host(grad_net_dev, grad_net);
   deepmd::delete_device_memory(nlist_dev);
   deepmd::delete_device_memory(grad_dev);
diff --git a/source/lib/tests/test_prod_force_r.cc b/source/lib/tests/test_prod_force_r.cc
index 8920e76b12..7f46aa3244 100644
--- a/source/lib/tests/test_prod_force_r.cc
+++ b/source/lib/tests/test_prod_force_r.cc
@@ -131,7 +131,7 @@ TEST_F(TestProdForceR, cpu) {
 }
 
 #if GOOGLE_CUDA
-TEST_F(TestProdForceR, gpu_cuda) {
+TEST_F(TestProdForceR, gpu) {
   std::vector<double> force(nframes * nall * 3, 0.0);
   int n_a_sel = nnei;
 
@@ -143,8 +143,8 @@ TEST_F(TestProdForceR, gpu_cuda) {
   deepmd::malloc_device_memory_sync(net_deriv_dev, net_deriv);
   deepmd::malloc_device_memory_sync(env_deriv_dev, env_deriv);
 
-  deepmd::prod_force_r_gpu_cuda<double>(force_dev, net_deriv_dev, env_deriv_dev,
-                                        nlist_dev, nloc, nall, nnei, nframes);
+  deepmd::prod_force_r_gpu<double>(force_dev, net_deriv_dev, env_deriv_dev,
+                                   nlist_dev, nloc, nall, nnei, nframes);
 
   deepmd::memcpy_device_to_host(force_dev, force);
   deepmd::delete_device_memory(nlist_dev);
@@ -161,7 +161,7 @@ TEST_F(TestProdForceR, gpu_cuda) {
 #endif  // GOOGLE_CUDA
 
 #if TENSORFLOW_USE_ROCM
-TEST_F(TestProdForceR, gpu_rocm) {
+TEST_F(TestProdForceR, gpu) {
   std::vector<double> force(nframes * nall * 3, 0.0);
   int n_a_sel = nnei;
 
@@ -173,8 +173,8 @@ TEST_F(TestProdForceR, gpu_rocm) {
   deepmd::malloc_device_memory_sync(net_deriv_dev, net_deriv);
   deepmd::malloc_device_memory_sync(env_deriv_dev, env_deriv);
 
-  deepmd::prod_force_r_gpu_rocm<double>(force_dev, net_deriv_dev, env_deriv_dev,
-                                        nlist_dev, nloc, nall, nnei, nframes);
+  deepmd::prod_force_r_gpu<double>(force_dev, net_deriv_dev, env_deriv_dev,
+                                   nlist_dev, nloc, nall, nnei, nframes);
 
   deepmd::memcpy_device_to_host(force_dev, force);
   deepmd::delete_device_memory(nlist_dev);
diff --git a/source/lib/tests/test_prod_virial_a.cc b/source/lib/tests/test_prod_virial_a.cc
index 43244460e6..054a152869 100644
--- a/source/lib/tests/test_prod_virial_a.cc
+++ b/source/lib/tests/test_prod_virial_a.cc
@@ -179,7 +179,7 @@ TEST_F(TestProdVirialA, cpu) {
 }
 
 #if GOOGLE_CUDA
-TEST_F(TestProdVirialA, gpu_cuda) {
+TEST_F(TestProdVirialA, gpu) {
   std::vector<double> virial(9, 0.0);
   std::vector<double> atom_virial(nall * 9, 0.0);
   int n_a_sel = nnei;
@@ -195,9 +195,9 @@ TEST_F(TestProdVirialA, gpu_cuda) {
   deepmd::malloc_device_memory_sync(env_deriv_dev, env_deriv);
   deepmd::malloc_device_memory_sync(rij_dev, rij);
 
-  deepmd::prod_virial_a_gpu_cuda<double>(virial_dev, atom_virial_dev,
-                                         net_deriv_dev, env_deriv_dev, rij_dev,
-                                         nlist_dev, nloc, nall, nnei);
+  deepmd::prod_virial_a_gpu<double>(virial_dev, atom_virial_dev, net_deriv_dev,
+                                    env_deriv_dev, rij_dev, nlist_dev, nloc,
+                                    nall, nnei);
 
   deepmd::memcpy_device_to_host(virial_dev, virial);
   deepmd::memcpy_device_to_host(atom_virial_dev, atom_virial);
@@ -228,7 +228,7 @@ TEST_F(TestProdVirialA, gpu_cuda) {
 #endif  // GOOGLE_CUDA
 
 #if TENSORFLOW_USE_ROCM
-TEST_F(TestProdVirialA, gpu_rocm) {
+TEST_F(TestProdVirialA, gpu) {
   std::vector<double> virial(9, 0.0);
   std::vector<double> atom_virial(nall * 9, 0.0);
   int n_a_sel = nnei;
@@ -244,9 +244,9 @@ TEST_F(TestProdVirialA, gpu_rocm) {
   deepmd::malloc_device_memory_sync(env_deriv_dev, env_deriv);
   deepmd::malloc_device_memory_sync(rij_dev, rij);
 
-  deepmd::prod_virial_a_gpu_rocm<double>(virial_dev, atom_virial_dev,
-                                         net_deriv_dev, env_deriv_dev, rij_dev,
-                                         nlist_dev, nloc, nall, nnei);
+  deepmd::prod_virial_a_gpu<double>(virial_dev, atom_virial_dev, net_deriv_dev,
+                                    env_deriv_dev, rij_dev, nlist_dev, nloc,
+                                    nall, nnei);
 
   deepmd::memcpy_device_to_host(virial_dev, virial);
   deepmd::memcpy_device_to_host(atom_virial_dev, atom_virial);
diff --git a/source/lib/tests/test_prod_virial_grad_a.cc b/source/lib/tests/test_prod_virial_grad_a.cc
index 044d5a07d6..98a08ce5c3 100644
--- a/source/lib/tests/test_prod_virial_grad_a.cc
+++ b/source/lib/tests/test_prod_virial_grad_a.cc
@@ -150,8 +150,8 @@ TEST_F(TestProdVirialGradA, gpu) {
   deepmd::malloc_device_memory_sync(env_deriv_dev, env_deriv);
   deepmd::malloc_device_memory_sync(rij_dev, rij);
   deepmd::malloc_device_memory(grad_net_dev, nloc * ndescrpt);
-  deepmd::prod_virial_grad_a_gpu_cuda<double>(
-      grad_net_dev, grad_dev, env_deriv_dev, rij_dev, nlist_dev, nloc, nnei);
+  deepmd::prod_virial_grad_a_gpu<double>(grad_net_dev, grad_dev, env_deriv_dev,
+                                         rij_dev, nlist_dev, nloc, nnei);
   deepmd::memcpy_device_to_host(grad_net_dev, grad_net);
   deepmd::delete_device_memory(nlist_dev);
   deepmd::delete_device_memory(grad_dev);
@@ -184,8 +184,8 @@ TEST_F(TestProdVirialGradA, gpu) {
   deepmd::malloc_device_memory_sync(env_deriv_dev, env_deriv);
   deepmd::malloc_device_memory_sync(rij_dev, rij);
   deepmd::malloc_device_memory(grad_net_dev, nloc * ndescrpt);
-  deepmd::prod_virial_grad_a_gpu_rocm<double>(
-      grad_net_dev, grad_dev, env_deriv_dev, rij_dev, nlist_dev, nloc, nnei);
+  deepmd::prod_virial_grad_a_gpu<double>(grad_net_dev, grad_dev, env_deriv_dev,
+                                         rij_dev, nlist_dev, nloc, nnei);
   deepmd::memcpy_device_to_host(grad_net_dev, grad_net);
   deepmd::delete_device_memory(nlist_dev);
   deepmd::delete_device_memory(grad_dev);
diff --git a/source/lib/tests/test_prod_virial_grad_r.cc b/source/lib/tests/test_prod_virial_grad_r.cc
index 34e940c73c..a0c7dad0db 100644
--- a/source/lib/tests/test_prod_virial_grad_r.cc
+++ b/source/lib/tests/test_prod_virial_grad_r.cc
@@ -124,8 +124,8 @@ TEST_F(TestProdVirialGradR, gpu) {
   deepmd::malloc_device_memory_sync(env_deriv_dev, env_deriv);
   deepmd::malloc_device_memory_sync(rij_dev, rij);
   deepmd::malloc_device_memory(grad_net_dev, nloc * ndescrpt);
-  deepmd::prod_virial_grad_r_gpu_cuda<double>(
-      grad_net_dev, grad_dev, env_deriv_dev, rij_dev, nlist_dev, nloc, nnei);
+  deepmd::prod_virial_grad_r_gpu<double>(grad_net_dev, grad_dev, env_deriv_dev,
+                                         rij_dev, nlist_dev, nloc, nnei);
   deepmd::memcpy_device_to_host(grad_net_dev, grad_net);
   deepmd::delete_device_memory(nlist_dev);
   deepmd::delete_device_memory(grad_dev);
@@ -158,8 +158,8 @@ TEST_F(TestProdVirialGradR, gpu) {
   deepmd::malloc_device_memory_sync(env_deriv_dev, env_deriv);
   deepmd::malloc_device_memory_sync(rij_dev, rij);
   deepmd::malloc_device_memory(grad_net_dev, nloc * ndescrpt);
-  deepmd::prod_virial_grad_r_gpu_rocm<double>(
-      grad_net_dev, grad_dev, env_deriv_dev, rij_dev, nlist_dev, nloc, nnei);
+  deepmd::prod_virial_grad_r_gpu<double>(grad_net_dev, grad_dev, env_deriv_dev,
+                                         rij_dev, nlist_dev, nloc, nnei);
   deepmd::memcpy_device_to_host(grad_net_dev, grad_net);
   deepmd::delete_device_memory(nlist_dev);
   deepmd::delete_device_memory(grad_dev);
diff --git a/source/lib/tests/test_prod_virial_r.cc b/source/lib/tests/test_prod_virial_r.cc
index e38ed1da7e..f1077b6dbc 100644
--- a/source/lib/tests/test_prod_virial_r.cc
+++ b/source/lib/tests/test_prod_virial_r.cc
@@ -179,7 +179,7 @@ TEST_F(TestProdVirialR, cpu) {
 }
 
 #if GOOGLE_CUDA
-TEST_F(TestProdVirialR, gpu_cuda) {
+TEST_F(TestProdVirialR, gpu) {
   std::vector<double> virial(9, 0.0);
   std::vector<double> atom_virial(nall * 9, 0.0);
   int n_a_sel = nnei;
@@ -195,9 +195,9 @@ TEST_F(TestProdVirialR, gpu_cuda) {
   deepmd::malloc_device_memory_sync(env_deriv_dev, env_deriv);
   deepmd::malloc_device_memory_sync(rij_dev, rij);
 
-  deepmd::prod_virial_r_gpu_cuda<double>(virial_dev, atom_virial_dev,
-                                         net_deriv_dev, env_deriv_dev, rij_dev,
-                                         nlist_dev, nloc, nall, nnei);
+  deepmd::prod_virial_r_gpu<double>(virial_dev, atom_virial_dev, net_deriv_dev,
+                                    env_deriv_dev, rij_dev, nlist_dev, nloc,
+                                    nall, nnei);
 
   deepmd::memcpy_device_to_host(virial_dev, virial);
   deepmd::memcpy_device_to_host(atom_virial_dev, atom_virial);
@@ -228,7 +228,7 @@ TEST_F(TestProdVirialR, gpu_cuda) {
 #endif  // GOOGLE_CUDA
 
 #if TENSORFLOW_USE_ROCM
-TEST_F(TestProdVirialR, gpu_rocm) {
+TEST_F(TestProdVirialR, gpu) {
   std::vector<double> virial(9, 0.0);
   std::vector<double> atom_virial(nall * 9, 0.0);
   int n_a_sel = nnei;
@@ -244,9 +244,9 @@ TEST_F(TestProdVirialR, gpu_rocm) {
   deepmd::malloc_device_memory_sync(env_deriv_dev, env_deriv);
   deepmd::malloc_device_memory_sync(rij_dev, rij);
 
-  deepmd::prod_virial_r_gpu_rocm<double>(virial_dev, atom_virial_dev,
-                                         net_deriv_dev, env_deriv_dev, rij_dev,
-                                         nlist_dev, nloc, nall, nnei);
+  deepmd::prod_virial_r_gpu<double>(virial_dev, atom_virial_dev, net_deriv_dev,
+                                    env_deriv_dev, rij_dev, nlist_dev, nloc,
+                                    nall, nnei);
 
   deepmd::memcpy_device_to_host(virial_dev, virial);
   deepmd::memcpy_device_to_host(atom_virial_dev, atom_virial);
diff --git a/source/lib/tests/test_simulation_region.cc b/source/lib/tests/test_simulation_region.cc
index 467564d44b..6f1db46bb0 100644
--- a/source/lib/tests/test_simulation_region.cc
+++ b/source/lib/tests/test_simulation_region.cc
@@ -180,14 +180,14 @@ TEST_F(TestRegion, gpu) {
   double vol[1];
   double* vol_dev = NULL;
   deepmd::malloc_device_memory(vol_dev, 1);
-  deepmd::volume_gpu_rocm(vol_dev, region_dev);
+  deepmd::volume_gpu(vol_dev, region_dev);
   deepmd::memcpy_device_to_host(vol_dev, vol, 1);
   EXPECT_LT(fabs(vol[0] - expected_vol), 1e-10);
   // check conversion between phys and inter coords.
   double ri[3];
   double* ri_dev = NULL;
   deepmd::malloc_device_memory(ri_dev, 3);
-  deepmd::convert_to_inter_gpu_rocm(ri_dev, region_dev, ref_rp_dev);
+  deepmd::convert_to_inter_gpu(ri_dev, region_dev, ref_rp_dev);
   deepmd::memcpy_device_to_host(ri_dev, ri, 3);
   for (int ii = 0; ii < 3; ++ii) {
     EXPECT_LT(fabs(ri[ii] - ref_ri[ii]), 1e-10);
@@ -195,7 +195,7 @@ TEST_F(TestRegion, gpu) {
   double rp2[3];
   double* rp2_dev = NULL;
   deepmd::malloc_device_memory(rp2_dev, 3);
-  deepmd::convert_to_phys_gpu_rocm(rp2_dev, region_dev, ri_dev);
+  deepmd::convert_to_phys_gpu(rp2_dev, region_dev, ri_dev);
   deepmd::memcpy_device_to_host(rp2_dev, rp2, 3);
   for (int ii = 0; ii < 3; ++ii) {
     EXPECT_LT(fabs(rp2[ii] - ref_rp[ii]), 1e-10);
@@ -203,7 +203,7 @@ TEST_F(TestRegion, gpu) {
   double rp[3];
   double* rp_dev = NULL;
   deepmd::malloc_device_memory(rp_dev, 3);
-  deepmd::convert_to_phys_gpu_rocm(rp_dev, region_dev, ref_ri_dev);
+  deepmd::convert_to_phys_gpu(rp_dev, region_dev, ref_ri_dev);
   deepmd::memcpy_device_to_host(rp_dev, rp, 3);
   for (int ii = 0; ii < 3; ++ii) {
     EXPECT_LT(fabs(rp[ii] - ref_rp[ii]), 1e-10);
@@ -211,7 +211,7 @@ TEST_F(TestRegion, gpu) {
   double ri2[3];
   double* ri2_dev = NULL;
   deepmd::malloc_device_memory(ri2_dev, 3);
-  deepmd::convert_to_inter_gpu_rocm(ri2_dev, region_dev, rp_dev);
+  deepmd::convert_to_inter_gpu(ri2_dev, region_dev, rp_dev);
   deepmd::memcpy_device_to_host(ri2_dev, ri2, 3);
   for (int ii = 0; ii < 3; ++ii) {
     EXPECT_LT(fabs(ri2[ii] - ref_ri[ii]), 1e-10);
diff --git a/source/lib/tests/test_tabulate_se_a.cc b/source/lib/tests/test_tabulate_se_a.cc
index 4c87a24566..6f76f9c2ee 100644
--- a/source/lib/tests/test_tabulate_se_a.cc
+++ b/source/lib/tests/test_tabulate_se_a.cc
@@ -756,7 +756,7 @@ TEST_F(TestTabulateSeA, tabulate_fusion_se_a_grad_cpu) {
 }
 
 #if GOOGLE_CUDA
-TEST_F(TestTabulateSeA, tabulate_fusion_se_a_gpu_cuda) {
+TEST_F(TestTabulateSeA, tabulate_fusion_se_a_gpu) {
   std::vector<double> xyz_scatter(nloc * nnei * last_layer_size, 0.0);
 
   double *xyz_scatter_dev = NULL, *table_dev = NULL, *em_x_dev = NULL,
@@ -765,9 +765,9 @@ TEST_F(TestTabulateSeA, tabulate_fusion_se_a_gpu_cuda) {
   deepmd::malloc_device_memory_sync(table_dev, table);
   deepmd::malloc_device_memory_sync(em_x_dev, em_x);
   deepmd::malloc_device_memory_sync(em_dev, em);
-  deepmd::tabulate_fusion_se_a_gpu_cuda<double>(
-      xyz_scatter_dev, table_dev, &info[0], em_x_dev, em_dev, nullptr, nloc,
-      nnei, last_layer_size);
+  deepmd::tabulate_fusion_se_a_gpu<double>(xyz_scatter_dev, table_dev, &info[0],
+                                           em_x_dev, em_dev, nullptr, nloc,
+                                           nnei, last_layer_size);
   deepmd::memcpy_device_to_host(xyz_scatter_dev, xyz_scatter);
 
   EXPECT_EQ(xyz_scatter.size(), nloc * nnei * last_layer_size);
@@ -779,9 +779,9 @@ TEST_F(TestTabulateSeA, tabulate_fusion_se_a_gpu_cuda) {
   double *two_embed_dev = nullptr;
   deepmd::malloc_device_memory_sync(two_embed_dev, two_embed);
   deepmd::malloc_device_memory_sync(xyz_scatter_dev, xyz_scatter);
-  deepmd::tabulate_fusion_se_a_gpu_cuda<double>(
-      xyz_scatter_dev, table_dev, &info[0], em_x_dev, em_dev, two_embed_dev,
-      nloc, nnei, last_layer_size);
+  deepmd::tabulate_fusion_se_a_gpu<double>(xyz_scatter_dev, table_dev, &info[0],
+                                           em_x_dev, em_dev, two_embed_dev,
+                                           nloc, nnei, last_layer_size);
   deepmd::memcpy_device_to_host(xyz_scatter_dev, xyz_scatter);
 
   EXPECT_EQ(xyz_scatter.size(), nloc * nnei * last_layer_size);
@@ -798,7 +798,7 @@ TEST_F(TestTabulateSeA, tabulate_fusion_se_a_gpu_cuda) {
   deepmd::delete_device_memory(two_embed_dev);
 }
 
-TEST_F(TestTabulateSeA, tabulate_fusion_se_a_grad_gpu_cuda) {
+TEST_F(TestTabulateSeA, tabulate_fusion_se_a_grad_gpu) {
   std::vector<double> dy_dem_x(em_x.size(), 0.0);
   std::vector<double> dy_dem(em.size(), 0.0);
   std::vector<double> dy(nloc * nnei * last_layer_size, 1.0);
@@ -811,7 +811,7 @@ TEST_F(TestTabulateSeA, tabulate_fusion_se_a_grad_gpu_cuda) {
   deepmd::malloc_device_memory_sync(em_x_dev, em_x);
   deepmd::malloc_device_memory_sync(em_dev, em);
   deepmd::malloc_device_memory_sync(dy_dev, dy);
-  deepmd::tabulate_fusion_se_a_grad_gpu_cuda<double>(
+  deepmd::tabulate_fusion_se_a_grad_gpu<double>(
       dy_dem_x_dev, dy_dem_dev, table_dev, &info[0], em_x_dev, em_dev, nullptr,
       dy_dev, nloc, nnei, last_layer_size);
   deepmd::memcpy_device_to_host(dy_dem_x_dev, dy_dem_x);
@@ -832,7 +832,7 @@ TEST_F(TestTabulateSeA, tabulate_fusion_se_a_grad_gpu_cuda) {
   deepmd::malloc_device_memory_sync(two_embed_dev, two_embed);
   deepmd::malloc_device_memory_sync(dy_dem_x_dev, dy_dem_x);
   deepmd::malloc_device_memory_sync(dy_dem_dev, dy_dem);
-  deepmd::tabulate_fusion_se_a_grad_gpu_cuda<double>(
+  deepmd::tabulate_fusion_se_a_grad_gpu<double>(
       dy_dem_x_dev, dy_dem_dev, table_dev, &info[0], em_x_dev, em_dev,
       two_embed_dev, dy_dev, nloc, nnei, last_layer_size);
   deepmd::memcpy_device_to_host(dy_dem_x_dev, dy_dem_x);
@@ -855,7 +855,7 @@ TEST_F(TestTabulateSeA, tabulate_fusion_se_a_grad_gpu_cuda) {
 #endif  // GOOGLE_CUDA
 
 #if TENSORFLOW_USE_ROCM
-TEST_F(TestTabulateSeA, tabulate_fusion_se_a_gpu_rocm) {
+TEST_F(TestTabulateSeA, tabulate_fusion_se_a_gpu) {
   std::vector<double> xyz_scatter(nloc * nnei * last_layer_size, 0.0);
 
   double *xyz_scatter_dev = NULL, *table_dev = NULL, *em_x_dev = NULL,
@@ -864,9 +864,9 @@ TEST_F(TestTabulateSeA, tabulate_fusion_se_a_gpu_rocm) {
   deepmd::malloc_device_memory_sync(table_dev, table);
   deepmd::malloc_device_memory_sync(em_x_dev, em_x);
   deepmd::malloc_device_memory_sync(em_dev, em);
-  deepmd::tabulate_fusion_se_a_gpu_rocm<double>(
-      xyz_scatter_dev, table_dev, &info[0], em_x_dev, em_dev, nullptr, nloc,
-      nnei, last_layer_size);
+  deepmd::tabulate_fusion_se_a_gpu<double>(xyz_scatter_dev, table_dev, &info[0],
+                                           em_x_dev, em_dev, nullptr, nloc,
+                                           nnei, last_layer_size);
   deepmd::memcpy_device_to_host(xyz_scatter_dev, xyz_scatter);
 
   EXPECT_EQ(xyz_scatter.size(), nloc * nnei * last_layer_size);
@@ -878,9 +878,9 @@ TEST_F(TestTabulateSeA, tabulate_fusion_se_a_gpu_rocm) {
   double *two_embed_dev = nullptr;
   deepmd::malloc_device_memory_sync(two_embed_dev, two_embed);
   deepmd::malloc_device_memory_sync(xyz_scatter_dev, xyz_scatter);
-  deepmd::tabulate_fusion_se_a_gpu_rocm<double>(
-      xyz_scatter_dev, table_dev, &info[0], em_x_dev, em_dev, two_embed_dev,
-      nloc, nnei, last_layer_size);
+  deepmd::tabulate_fusion_se_a_gpu<double>(xyz_scatter_dev, table_dev, &info[0],
+                                           em_x_dev, em_dev, two_embed_dev,
+                                           nloc, nnei, last_layer_size);
   deepmd::memcpy_device_to_host(xyz_scatter_dev, xyz_scatter);
 
   EXPECT_EQ(xyz_scatter.size(), nloc * nnei * last_layer_size);
@@ -897,7 +897,7 @@ TEST_F(TestTabulateSeA, tabulate_fusion_se_a_gpu_rocm) {
   deepmd::delete_device_memory(two_embed_dev);
 }
 
-TEST_F(TestTabulateSeA, tabulate_fusion_se_a_grad_gpu_rocm) {
+TEST_F(TestTabulateSeA, tabulate_fusion_se_a_grad_gpu) {
   std::vector<double> dy_dem_x(em_x.size(), 0.0);
   std::vector<double> dy_dem(em.size(), 0.0);
   std::vector<double> dy(nloc * nnei * last_layer_size, 1.0);
@@ -910,7 +910,7 @@ TEST_F(TestTabulateSeA, tabulate_fusion_se_a_grad_gpu_rocm) {
   deepmd::malloc_device_memory_sync(em_x_dev, em_x);
   deepmd::malloc_device_memory_sync(em_dev, em);
   deepmd::malloc_device_memory_sync(dy_dev, dy);
-  deepmd::tabulate_fusion_se_a_grad_gpu_rocm<double>(
+  deepmd::tabulate_fusion_se_a_grad_gpu<double>(
       dy_dem_x_dev, dy_dem_dev, table_dev, &info[0], em_x_dev, em_dev, nullptr,
       dy_dev, nloc, nnei, last_layer_size);
   deepmd::memcpy_device_to_host(dy_dem_x_dev, dy_dem_x);
@@ -931,7 +931,7 @@ TEST_F(TestTabulateSeA, tabulate_fusion_se_a_grad_gpu_rocm) {
   deepmd::malloc_device_memory_sync(two_embed_dev, two_embed);
   deepmd::malloc_device_memory_sync(dy_dem_x_dev, dy_dem_x);
   deepmd::malloc_device_memory_sync(dy_dem_dev, dy_dem);
-  deepmd::tabulate_fusion_se_a_grad_gpu_rocm<double>(
+  deepmd::tabulate_fusion_se_a_grad_gpu<double>(
       dy_dem_x_dev, dy_dem_dev, table_dev, &info[0], em_x_dev, em_dev,
       two_embed_dev, dy_dev, nloc, nnei, last_layer_size);
   deepmd::memcpy_device_to_host(dy_dem_x_dev, dy_dem_x);
diff --git a/source/lib/tests/test_tabulate_se_r.cc b/source/lib/tests/test_tabulate_se_r.cc
index 95ccfdf59e..5097451aab 100644
--- a/source/lib/tests/test_tabulate_se_r.cc
+++ b/source/lib/tests/test_tabulate_se_r.cc
@@ -607,16 +607,15 @@ TEST_F(TestTabulateSeR, tabulate_fusion_se_r_grad_cpu) {
 }
 
 #if GOOGLE_CUDA
-TEST_F(TestTabulateSeR, tabulate_fusion_se_r_gpu_cuda) {
+TEST_F(TestTabulateSeR, tabulate_fusion_se_r_gpu) {
   std::vector<double> xyz_scatter(nloc * nnei * last_layer_size, 0.0);
 
   double *xyz_scatter_dev = NULL, *table_dev = NULL, *em_dev = NULL;
   deepmd::malloc_device_memory_sync(xyz_scatter_dev, xyz_scatter);
   deepmd::malloc_device_memory_sync(table_dev, table);
   deepmd::malloc_device_memory_sync(em_dev, em);
-  deepmd::tabulate_fusion_se_r_gpu_cuda<double>(xyz_scatter_dev, table_dev,
-                                                &info[0], em_dev, nloc, nnei,
-                                                last_layer_size);
+  deepmd::tabulate_fusion_se_r_gpu<double>(xyz_scatter_dev, table_dev, &info[0],
+                                           em_dev, nloc, nnei, last_layer_size);
   deepmd::memcpy_device_to_host(xyz_scatter_dev, xyz_scatter);
   deepmd::delete_device_memory(xyz_scatter_dev);
   deepmd::delete_device_memory(table_dev);
@@ -629,7 +628,7 @@ TEST_F(TestTabulateSeR, tabulate_fusion_se_r_gpu_cuda) {
   }
 }
 
-TEST_F(TestTabulateSeR, tabulate_fusion_se_r_grad_gpu_cuda) {
+TEST_F(TestTabulateSeR, tabulate_fusion_se_r_grad_gpu) {
   std::vector<double> dy_dem(em.size(), 0.0);
   std::vector<double> dy(nloc * nnei * last_layer_size, 1.0);
 
@@ -638,9 +637,9 @@ TEST_F(TestTabulateSeR, tabulate_fusion_se_r_grad_gpu_cuda) {
   deepmd::malloc_device_memory_sync(table_dev, table);
   deepmd::malloc_device_memory_sync(em_dev, em);
   deepmd::malloc_device_memory_sync(dy_dev, dy);
-  deepmd::tabulate_fusion_se_r_grad_gpu_cuda<double>(
-      dy_dem_dev, table_dev, &info[0], em_dev, dy_dev, nloc, nnei,
-      last_layer_size);
+  deepmd::tabulate_fusion_se_r_grad_gpu<double>(dy_dem_dev, table_dev, &info[0],
+                                                em_dev, dy_dev, nloc, nnei,
+                                                last_layer_size);
   deepmd::memcpy_device_to_host(dy_dem_dev, dy_dem);
   deepmd::delete_device_memory(dy_dem_dev);
   deepmd::delete_device_memory(table_dev);
@@ -657,16 +656,15 @@ TEST_F(TestTabulateSeR, tabulate_fusion_se_r_grad_gpu_cuda) {
 #endif  // GOOGLE_CUDA
 
 #if TENSORFLOW_USE_ROCM
-TEST_F(TestTabulateSeR, tabulate_fusion_se_r_gpu_rocm) {
+TEST_F(TestTabulateSeR, tabulate_fusion_se_r_gpu) {
   std::vector<double> xyz_scatter(nloc * nnei * last_layer_size, 0.0);
 
   double *xyz_scatter_dev = NULL, *table_dev = NULL, *em_dev = NULL;
   deepmd::malloc_device_memory_sync(xyz_scatter_dev, xyz_scatter);
   deepmd::malloc_device_memory_sync(table_dev, table);
   deepmd::malloc_device_memory_sync(em_dev, em);
-  deepmd::tabulate_fusion_se_r_gpu_rocm<double>(xyz_scatter_dev, table_dev,
-                                                &info[0], em_dev, nloc, nnei,
-                                                last_layer_size);
+  deepmd::tabulate_fusion_se_r_gpu<double>(xyz_scatter_dev, table_dev, &info[0],
+                                           em_dev, nloc, nnei, last_layer_size);
   deepmd::memcpy_device_to_host(xyz_scatter_dev, xyz_scatter);
   deepmd::delete_device_memory(xyz_scatter_dev);
   deepmd::delete_device_memory(table_dev);
@@ -679,7 +677,7 @@ TEST_F(TestTabulateSeR, tabulate_fusion_se_r_gpu_rocm) {
   }
 }
 
-TEST_F(TestTabulateSeR, tabulate_fusion_se_r_grad_gpu_rocm) {
+TEST_F(TestTabulateSeR, tabulate_fusion_se_r_grad_gpu) {
   std::vector<double> dy_dem(em.size(), 0.0);
   std::vector<double> dy(nloc * nnei * last_layer_size, 1.0);
 
@@ -688,9 +686,9 @@ TEST_F(TestTabulateSeR, tabulate_fusion_se_r_grad_gpu_rocm) {
   deepmd::malloc_device_memory_sync(table_dev, table);
   deepmd::malloc_device_memory_sync(em_dev, em);
   deepmd::malloc_device_memory_sync(dy_dev, dy);
-  deepmd::tabulate_fusion_se_r_grad_gpu_rocm<double>(
-      dy_dem_dev, table_dev, &info[0], em_dev, dy_dev, nloc, nnei,
-      last_layer_size);
+  deepmd::tabulate_fusion_se_r_grad_gpu<double>(dy_dem_dev, table_dev, &info[0],
+                                                em_dev, dy_dev, nloc, nnei,
+                                                last_layer_size);
   deepmd::memcpy_device_to_host(dy_dem_dev, dy_dem);
   deepmd::delete_device_memory(dy_dem_dev);
   deepmd::delete_device_memory(table_dev);
diff --git a/source/lib/tests/test_tabulate_se_t.cc b/source/lib/tests/test_tabulate_se_t.cc
index 522eef48cd..ffb1b41220 100644
--- a/source/lib/tests/test_tabulate_se_t.cc
+++ b/source/lib/tests/test_tabulate_se_t.cc
@@ -5261,7 +5261,7 @@ TEST_F(TestTabulateSeT, tabulate_fusion_se_t_grad_cpu) {
 }
 
 #if GOOGLE_CUDA
-TEST_F(TestTabulateSeT, tabulate_fusion_se_t_gpu_cuda) {
+TEST_F(TestTabulateSeT, tabulate_fusion_se_t_gpu) {
   std::vector<double> xyz_scatter(nloc * last_layer_size, 0.0);
   double *xyz_scatter_dev = NULL, *table_dev = NULL, *em_x_dev = NULL,
          *em_dev = NULL;
@@ -5269,9 +5269,9 @@ TEST_F(TestTabulateSeT, tabulate_fusion_se_t_gpu_cuda) {
   deepmd::malloc_device_memory_sync(table_dev, table);
   deepmd::malloc_device_memory_sync(em_x_dev, em_x);
   deepmd::malloc_device_memory_sync(em_dev, em);
-  deepmd::tabulate_fusion_se_t_gpu_cuda<double>(
-      xyz_scatter_dev, table_dev, &info[0], em_x_dev, em_dev, nloc, nnei_i,
-      nnei_j, last_layer_size);
+  deepmd::tabulate_fusion_se_t_gpu<double>(xyz_scatter_dev, table_dev, &info[0],
+                                           em_x_dev, em_dev, nloc, nnei_i,
+                                           nnei_j, last_layer_size);
   // deepmd::tabulate_fusion_se_t_cpu<double>(&xyz_scatter[0], &table[0],
   // &info[0], &em_x[0], &em[0], nloc, nnei_i, nnei_j, last_layer_size);
   deepmd::memcpy_device_to_host(xyz_scatter_dev, xyz_scatter);
@@ -5287,7 +5287,7 @@ TEST_F(TestTabulateSeT, tabulate_fusion_se_t_gpu_cuda) {
   }
 }
 
-TEST_F(TestTabulateSeT, tabulate_fusion_se_a_grad_gpu_cuda) {
+TEST_F(TestTabulateSeT, tabulate_fusion_se_a_grad_gpu) {
   std::vector<double> dy_dem_x(em_x.size(), 0.0);
   std::vector<double> dy_dem(em.size(), 0.0);
 
@@ -5299,7 +5299,7 @@ TEST_F(TestTabulateSeT, tabulate_fusion_se_a_grad_gpu_cuda) {
   deepmd::malloc_device_memory_sync(em_x_dev, em_x);
   deepmd::malloc_device_memory_sync(em_dev, em);
   deepmd::malloc_device_memory_sync(dy_dev, dy);
-  deepmd::tabulate_fusion_se_t_grad_gpu_cuda<double>(
+  deepmd::tabulate_fusion_se_t_grad_gpu<double>(
       dy_dem_x_dev, dy_dem_dev, table_dev, &info[0], em_x_dev, em_dev, dy_dev,
       nloc, nnei_i, nnei_j, last_layer_size);
   deepmd::memcpy_device_to_host(dy_dem_x_dev, dy_dem_x);
@@ -5325,7 +5325,7 @@ TEST_F(TestTabulateSeT, tabulate_fusion_se_a_grad_gpu_cuda) {
 #endif  // GOOGLE_CUDA
 
 #if TENSORFLOW_USE_ROCM
-TEST_F(TestTabulateSeT, tabulate_fusion_se_t_gpu_rocm) {
+TEST_F(TestTabulateSeT, tabulate_fusion_se_t_gpu) {
   std::vector<double> xyz_scatter(nloc * last_layer_size, 0.0);
   double *xyz_scatter_dev = NULL, *table_dev = NULL, *em_x_dev = NULL,
          *em_dev = NULL;
@@ -5333,9 +5333,9 @@ TEST_F(TestTabulateSeT, tabulate_fusion_se_t_gpu_rocm) {
   deepmd::malloc_device_memory_sync(table_dev, table);
   deepmd::malloc_device_memory_sync(em_x_dev, em_x);
   deepmd::malloc_device_memory_sync(em_dev, em);
-  deepmd::tabulate_fusion_se_t_gpu_rocm<double>(
-      xyz_scatter_dev, table_dev, &info[0], em_x_dev, em_dev, nloc, nnei_i,
-      nnei_j, last_layer_size);
+  deepmd::tabulate_fusion_se_t_gpu<double>(xyz_scatter_dev, table_dev, &info[0],
+                                           em_x_dev, em_dev, nloc, nnei_i,
+                                           nnei_j, last_layer_size);
   deepmd::memcpy_device_to_host(xyz_scatter_dev, xyz_scatter);
   deepmd::delete_device_memory(xyz_scatter_dev);
   deepmd::delete_device_memory(table_dev);
@@ -5349,7 +5349,7 @@ TEST_F(TestTabulateSeT, tabulate_fusion_se_t_gpu_rocm) {
   }
 }
 
-TEST_F(TestTabulateSeT, tabulate_fusion_se_t_grad_gpu_rocm) {
+TEST_F(TestTabulateSeT, tabulate_fusion_se_t_grad_gpu) {
   std::vector<double> dy_dem_x(em_x.size(), 0.0);
   std::vector<double> dy_dem(em.size(), 0.0);
 
@@ -5361,7 +5361,7 @@ TEST_F(TestTabulateSeT, tabulate_fusion_se_t_grad_gpu_rocm) {
   deepmd::malloc_device_memory_sync(em_x_dev, em_x);
   deepmd::malloc_device_memory_sync(em_dev, em);
   deepmd::malloc_device_memory_sync(dy_dev, dy);
-  deepmd::tabulate_fusion_se_t_grad_gpu_rocm<double>(
+  deepmd::tabulate_fusion_se_t_grad_gpu<double>(
       dy_dem_x_dev, dy_dem_dev, table_dev, &info[0], em_x_dev, em_dev, dy_dev,
       nloc, nnei_i, nnei_j, last_layer_size);
   deepmd::memcpy_device_to_host(dy_dem_x_dev, dy_dem_x);
diff --git a/source/op/gelu_multi_device.cc b/source/op/gelu_multi_device.cc
index 92c3968b9c..ccc95aa0e4 100644
--- a/source/op/gelu_multi_device.cc
+++ b/source/op/gelu_multi_device.cc
@@ -65,11 +65,11 @@ class GeluOp : public OpKernel {
 
     if (device == "GPU") {
 #if GOOGLE_CUDA
-      deepmd::gelu_gpu_cuda(out, x, size);
+      deepmd::gelu_gpu(out, x, size);
 #endif  // GOOGLE_CUDA
 
 #if TENSORFLOW_USE_ROCM
-      deepmd::gelu_gpu_rocm(out, x, size);
+      deepmd::gelu_gpu(out, x, size);
 #endif  // TENSORFLOW_USE_ROCM
     } else if (device == "CPU") {
       deepmd::gelu_cpu(out, x, size);
@@ -109,11 +109,11 @@ class GeluGradOp : public OpKernel {
 
     if (device == "GPU") {
 #if GOOGLE_CUDA
-      deepmd::gelu_grad_gpu_cuda(out, x, dy, size);
+      deepmd::gelu_grad_gpu(out, x, dy, size);
 #endif  // GOOGLE_CUDA
 
 #if TENSORFLOW_USE_ROCM
-      deepmd::gelu_grad_gpu_rocm(out, x, dy, size);
+      deepmd::gelu_grad_gpu(out, x, dy, size);
 #endif  // TENSORFLOW_USE_ROCM
     } else if (device == "CPU") {
       deepmd::gelu_grad_cpu(out, x, dy, size);
@@ -155,11 +155,11 @@ class GeluGradGradOp : public OpKernel {
 
     if (device == "GPU") {
 #if GOOGLE_CUDA
-      deepmd::gelu_grad_grad_gpu_cuda(out, x, dy, dy_2, size);
+      deepmd::gelu_grad_grad_gpu(out, x, dy, dy_2, size);
 #endif  // GOOGLE_CUDA
 
 #if TENSORFLOW_USE_ROCM
-      deepmd::gelu_grad_grad_gpu_rocm(out, x, dy, dy_2, size);
+      deepmd::gelu_grad_grad_gpu(out, x, dy, dy_2, size);
 #endif  // TENSORFLOW_USE_ROCM
     } else if (device == "CPU") {
       deepmd::gelu_grad_grad_cpu(out, x, dy, dy_2, size);
diff --git a/source/op/prod_env_mat_multi_device.cc b/source/op/prod_env_mat_multi_device.cc
index 73a0d3c4c1..a90f81d079 100644
--- a/source/op/prod_env_mat_multi_device.cc
+++ b/source/op/prod_env_mat_multi_device.cc
@@ -320,76 +320,76 @@ static void _prepare_coord_nlist_gpu(OpKernelContext* context,
 
 #if TENSORFLOW_USE_ROCM
 template <typename FPTYPE>
-static int _norm_copy_coord_gpu_rocm(OpKernelContext* context,
+static int _norm_copy_coord_gpu(OpKernelContext* context,
+                                Tensor* tensor_list,
+                                FPTYPE*& coord_cpy,
+                                int*& type_cpy,
+                                int*& idx_mapping,
+                                int& nall,
+                                int& mem_cpy,
+                                const FPTYPE* coord,
+                                const FPTYPE* box,
+                                const int* type,
+                                const int& nloc,
+                                const int& max_cpy_trial,
+                                const float& rcut_r);
+
+template <typename FPTYPE>
+static int _build_nlist_gpu(OpKernelContext* context,
+                            Tensor* tensor_list,
+                            int*& ilist,
+                            int*& numneigh,
+                            int**& firstneigh,
+                            int*& jlist,
+                            int& max_nnei,
+                            int& mem_nnei,
+                            const FPTYPE* coord,
+                            const int& nloc,
+                            const int& new_nall,
+                            const int& max_nnei_trial,
+                            const float& rcut_r);
+
+static void _map_nlist_gpu(int* nlist,
+                           const int* idx_mapping,
+                           const int& nloc,
+                           const int& nnei);
+
+static void _map_nei_info_gpu(int* nlist,
+                              int* ntype,
+                              bool* nmask,
+                              const int* type,
+                              const int* idx_mapping,
+                              const int& nloc,
+                              const int& nnei,
+                              const int& ntypes,
+                              const bool& b_nlist_map);
+
+template <typename FPTYPE>
+static void _prepare_coord_nlist_gpu(OpKernelContext* context,
                                      Tensor* tensor_list,
+                                     FPTYPE const** coord,
                                      FPTYPE*& coord_cpy,
+                                     int const** type,
                                      int*& type_cpy,
                                      int*& idx_mapping,
-                                     int& nall,
+                                     deepmd::InputNlist& inlist,
+                                     int*& ilist,
+                                     int*& numneigh,
+                                     int**& firstneigh,
+                                     int*& jlist,
+                                     int*& nbor_list_dev,
+                                     int& new_nall,
                                      int& mem_cpy,
-                                     const FPTYPE* coord,
+                                     int& mem_nnei,
+                                     int& max_nbor_size,
                                      const FPTYPE* box,
-                                     const int* type,
+                                     const int* mesh_tensor_data,
+                                     const int mesh_tensor_size,
                                      const int& nloc,
+                                     const int& nei_mode,
+                                     const float& rcut_r,
                                      const int& max_cpy_trial,
-                                     const float& rcut_r);
-
-template <typename FPTYPE>
-static int _build_nlist_gpu_rocm(OpKernelContext* context,
-                                 Tensor* tensor_list,
-                                 int*& ilist,
-                                 int*& numneigh,
-                                 int**& firstneigh,
-                                 int*& jlist,
-                                 int& max_nnei,
-                                 int& mem_nnei,
-                                 const FPTYPE* coord,
-                                 const int& nloc,
-                                 const int& new_nall,
-                                 const int& max_nnei_trial,
-                                 const float& rcut_r);
-
-static void _map_nlist_gpu_rocm(int* nlist,
-                                const int* idx_mapping,
-                                const int& nloc,
-                                const int& nnei);
-
-static void _map_nei_info_gpu_rocm(int* nlist,
-                                   int* ntype,
-                                   bool* nmask,
-                                   const int* type,
-                                   const int* idx_mapping,
-                                   const int& nloc,
-                                   const int& nnei,
-                                   const int& ntypes,
-                                   const bool& b_nlist_map);
-
-template <typename FPTYPE>
-static void _prepare_coord_nlist_gpu_rocm(OpKernelContext* context,
-                                          Tensor* tensor_list,
-                                          FPTYPE const** coord,
-                                          FPTYPE*& coord_cpy,
-                                          int const** type,
-                                          int*& type_cpy,
-                                          int*& idx_mapping,
-                                          deepmd::InputNlist& inlist,
-                                          int*& ilist,
-                                          int*& numneigh,
-                                          int**& firstneigh,
-                                          int*& jlist,
-                                          int*& nbor_list_dev,
-                                          int& new_nall,
-                                          int& mem_cpy,
-                                          int& mem_nnei,
-                                          int& max_nbor_size,
-                                          const FPTYPE* box,
-                                          const int* mesh_tensor_data,
-                                          const int mesh_tensor_size,
-                                          const int& nloc,
-                                          const int& nei_mode,
-                                          const float& rcut_r,
-                                          const int& max_cpy_trial,
-                                          const int& max_nnei_trial);
+                                     const int& max_nnei_trial);
 
 #endif  // TENSORFLOW_USE_ROCM
 
@@ -605,10 +605,10 @@ class ProdEnvMatAOp : public OpKernel {
         array_longlong = uint64_temp.flat<unsigned long long>().data();
 
         // launch the gpu(nv) compute function
-        deepmd::prod_env_mat_a_gpu_cuda(em, em_deriv, rij, nlist, coord, type,
-                                        gpu_inlist, array_int, array_longlong,
-                                        max_nbor_size, avg, std, nloc,
-                                        frame_nall, rcut_r, rcut_r_smth, sec_a);
+        deepmd::prod_env_mat_a_gpu(em, em_deriv, rij, nlist, coord, type,
+                                   gpu_inlist, array_int, array_longlong,
+                                   max_nbor_size, avg, std, nloc, frame_nall,
+                                   rcut_r, rcut_r_smth, sec_a);
         if (b_nlist_map) {
           _map_nlist_gpu(nlist, idx_mapping, nloc, nnei);
         }
@@ -627,7 +627,7 @@ class ProdEnvMatAOp : public OpKernel {
         int mesh_tensor_size = static_cast<int>(mesh_tensor.NumElements());
         std::vector<Tensor> tensor_list(7);
         // prepare coord and nlist
-        _prepare_coord_nlist_gpu_rocm<FPTYPE>(
+        _prepare_coord_nlist_gpu<FPTYPE>(
             context, &tensor_list[0], &coord, coord_cpy, &type, type_cpy,
             idx_mapping, gpu_inlist, ilist, numneigh, firstneigh, jlist,
             nbor_list_dev, frame_nall, mem_cpy, mem_nnei, max_nbor_size, box,
@@ -650,12 +650,12 @@ class ProdEnvMatAOp : public OpKernel {
         array_longlong = uint64_temp.flat<unsigned long long>().data();
 
         // launch the gpu(nv) compute function
-        deepmd::prod_env_mat_a_gpu_rocm(em, em_deriv, rij, nlist, coord, type,
-                                        gpu_inlist, array_int, array_longlong,
-                                        max_nbor_size, avg, std, nloc,
-                                        frame_nall, rcut_r, rcut_r_smth, sec_a);
+        deepmd::prod_env_mat_a_gpu(em, em_deriv, rij, nlist, coord, type,
+                                   gpu_inlist, array_int, array_longlong,
+                                   max_nbor_size, avg, std, nloc, frame_nall,
+                                   rcut_r, rcut_r_smth, sec_a);
         if (b_nlist_map) {
-          _map_nlist_gpu_rocm(nlist, idx_mapping, nloc, nnei);
+          _map_nlist_gpu(nlist, idx_mapping, nloc, nnei);
         }
         deepmd::delete_device_memory(firstneigh);
 #endif  // TENSORFLOW_USE_ROCM
@@ -900,10 +900,10 @@ class ProdEnvMatROp : public OpKernel {
         array_longlong = uint64_temp.flat<unsigned long long>().data();
 
         // launch the gpu(nv) compute function
-        deepmd::prod_env_mat_r_gpu_cuda(em, em_deriv, rij, nlist, coord, type,
-                                        gpu_inlist, array_int, array_longlong,
-                                        max_nbor_size, avg, std, nloc,
-                                        frame_nall, rcut, rcut_smth, sec);
+        deepmd::prod_env_mat_r_gpu(em, em_deriv, rij, nlist, coord, type,
+                                   gpu_inlist, array_int, array_longlong,
+                                   max_nbor_size, avg, std, nloc, frame_nall,
+                                   rcut, rcut_smth, sec);
         if (b_nlist_map) {
           _map_nlist_gpu(nlist, idx_mapping, nloc, nnei);
         }
@@ -922,7 +922,7 @@ class ProdEnvMatROp : public OpKernel {
         int mesh_tensor_size = static_cast<int>(mesh_tensor.NumElements());
         std::vector<Tensor> tensor_list(7);
         // prepare coord and nlist
-        _prepare_coord_nlist_gpu_rocm<FPTYPE>(
+        _prepare_coord_nlist_gpu<FPTYPE>(
             context, &tensor_list[0], &coord, coord_cpy, &type, type_cpy,
             idx_mapping, gpu_inlist, ilist, numneigh, firstneigh, jlist,
             nbor_list_dev, frame_nall, mem_cpy, mem_nnei, max_nbor_size, box,
@@ -945,12 +945,12 @@ class ProdEnvMatROp : public OpKernel {
         array_longlong = uint64_temp.flat<unsigned long long>().data();
 
         // launch the gpu(nv) compute function
-        deepmd::prod_env_mat_r_gpu_rocm(em, em_deriv, rij, nlist, coord, type,
-                                        gpu_inlist, array_int, array_longlong,
-                                        max_nbor_size, avg, std, nloc,
-                                        frame_nall, rcut, rcut_smth, sec);
+        deepmd::prod_env_mat_r_gpu(em, em_deriv, rij, nlist, coord, type,
+                                   gpu_inlist, array_int, array_longlong,
+                                   max_nbor_size, avg, std, nloc, frame_nall,
+                                   rcut, rcut_smth, sec);
         if (b_nlist_map) {
-          _map_nlist_gpu_rocm(nlist, idx_mapping, nloc, nnei);
+          _map_nlist_gpu(nlist, idx_mapping, nloc, nnei);
         }
         deepmd::delete_device_memory(firstneigh);
 #endif  // TENSORFLOW_USE_ROCM
@@ -1186,10 +1186,10 @@ class ProdEnvMatAMixOp : public OpKernel {
 
     if (device == "GPU") {
 #if GOOGLE_CUDA
-      deepmd::filter_ftype_gpu_cuda(p_f_type, p_type, nsamples * nall);
+      deepmd::filter_ftype_gpu(p_f_type, p_type, nsamples * nall);
 #endif
 #if TENSORFLOW_USE_ROCM
-      deepmd::filter_ftype_gpu_rocm(p_f_type, p_type, nsamples * nall);
+      deepmd::filter_ftype_gpu(p_f_type, p_type, nsamples * nall);
 #endif
     } else if (device == "CPU") {
       for (int ii = 0; ii < nsamples * nall; ii++) {
@@ -1246,10 +1246,10 @@ class ProdEnvMatAMixOp : public OpKernel {
         array_longlong = uint64_temp.flat<unsigned long long>().data();
 
         // launch the gpu(nv) compute function
-        deepmd::prod_env_mat_a_gpu_cuda(
-            em, em_deriv, rij, nlist, coord, type, gpu_inlist, array_int,
-            array_longlong, max_nbor_size, avg, std, nloc, frame_nall, rcut_r,
-            rcut_r_smth, sec_a, f_type);
+        deepmd::prod_env_mat_a_gpu(em, em_deriv, rij, nlist, coord, type,
+                                   gpu_inlist, array_int, array_longlong,
+                                   max_nbor_size, avg, std, nloc, frame_nall,
+                                   rcut_r, rcut_r_smth, sec_a, f_type);
         _map_nei_info_gpu(nlist, ntype, nmask, type, idx_mapping, nloc, nnei,
                           ntypes, b_nlist_map);
         deepmd::delete_device_memory(firstneigh);
@@ -1267,7 +1267,7 @@ class ProdEnvMatAMixOp : public OpKernel {
         int mesh_tensor_size = static_cast<int>(mesh_tensor.NumElements());
         std::vector<Tensor> tensor_list(7);
         // prepare coord and nlist
-        _prepare_coord_nlist_gpu_rocm<FPTYPE>(
+        _prepare_coord_nlist_gpu<FPTYPE>(
             context, &tensor_list[0], &coord, coord_cpy, &f_type, type_cpy,
             idx_mapping, gpu_inlist, ilist, numneigh, firstneigh, jlist,
             nbor_list_dev, frame_nall, mem_cpy, mem_nnei, max_nbor_size, box,
@@ -1290,12 +1290,12 @@ class ProdEnvMatAMixOp : public OpKernel {
         array_longlong = uint64_temp.flat<unsigned long long>().data();
 
         // launch the gpu(nv) compute function
-        deepmd::prod_env_mat_a_gpu_rocm(
-            em, em_deriv, rij, nlist, coord, type, gpu_inlist, array_int,
-            array_longlong, max_nbor_size, avg, std, nloc, frame_nall, rcut_r,
-            rcut_r_smth, sec_a, f_type);
-        _map_nei_info_gpu_rocm(nlist, ntype, nmask, type, idx_mapping, nloc,
-                               nnei, ntypes, b_nlist_map);
+        deepmd::prod_env_mat_a_gpu(em, em_deriv, rij, nlist, coord, type,
+                                   gpu_inlist, array_int, array_longlong,
+                                   max_nbor_size, avg, std, nloc, frame_nall,
+                                   rcut_r, rcut_r_smth, sec_a, f_type);
+        _map_nei_info_gpu(nlist, ntype, nmask, type, idx_mapping, nloc, nnei,
+                          ntypes, b_nlist_map);
         deepmd::delete_device_memory(firstneigh);
 #endif  // TENSORFLOW_USE_ROCM
       } else if (device == "CPU") {
@@ -1802,19 +1802,19 @@ static void _prepare_coord_nlist_gpu(OpKernelContext* context,
 
 #if TENSORFLOW_USE_ROCM
 template <typename FPTYPE>
-static int _norm_copy_coord_gpu_rocm(OpKernelContext* context,
-                                     Tensor* tensor_list,
-                                     FPTYPE*& coord_cpy,
-                                     int*& type_cpy,
-                                     int*& idx_mapping,
-                                     int& nall,
-                                     int& mem_cpy,
-                                     const FPTYPE* coord,
-                                     const FPTYPE* box,
-                                     const int* type,
-                                     const int& nloc,
-                                     const int& max_cpy_trial,
-                                     const float& rcut_r) {
+static int _norm_copy_coord_gpu(OpKernelContext* context,
+                                Tensor* tensor_list,
+                                FPTYPE*& coord_cpy,
+                                int*& type_cpy,
+                                int*& idx_mapping,
+                                int& nall,
+                                int& mem_cpy,
+                                const FPTYPE* coord,
+                                const FPTYPE* box,
+                                const int* type,
+                                const int& nloc,
+                                const int& max_cpy_trial,
+                                const float& rcut_r) {
   // Tensor FPTYPE_temp;
   TensorShape FPTYPE_shape;
   FPTYPE_shape.AddDim(nall * 3);
@@ -1857,7 +1857,7 @@ static int _norm_copy_coord_gpu_rocm(OpKernelContext* context,
   FPTYPE* new_rec_boxt = region_dev.rec_boxt;
   region_dev.boxt = box_info_dev;
   region_dev.rec_boxt = box_info_dev + 9;
-  deepmd::normalize_coord_gpu_rocm(tmp_coord, nall, region_dev);
+  deepmd::normalize_coord_gpu(tmp_coord, nall, region_dev);
   int tt;
   for (tt = 0; tt < max_cpy_trial; ++tt) {
     // Tensor cpy_temp;
@@ -1872,7 +1872,7 @@ static int _norm_copy_coord_gpu_rocm(OpKernelContext* context,
     coord_cpy = (*(tensor_list + 3)).flat<FPTYPE>().data();
     type_cpy = (*(tensor_list + 4)).flat<int>().data();
     idx_mapping = type_cpy + mem_cpy;
-    int ret = deepmd::copy_coord_gpu_rocm(
+    int ret = deepmd::copy_coord_gpu(
         coord_cpy, type_cpy, idx_mapping, &nall, int_data_dev, tmp_coord, type,
         nloc, mem_cpy, loc_cellnum, total_cellnum, cell_info_dev, region_dev);
     if (ret == 0) {
@@ -1887,19 +1887,19 @@ static int _norm_copy_coord_gpu_rocm(OpKernelContext* context,
 }
 
 template <typename FPTYPE>
-static int _build_nlist_gpu_rocm(OpKernelContext* context,
-                                 Tensor* tensor_list,
-                                 int*& ilist,
-                                 int*& numneigh,
-                                 int**& firstneigh,
-                                 int*& jlist,
-                                 int& max_nnei,
-                                 int& mem_nnei,
-                                 const FPTYPE* coord,
-                                 const int& nloc,
-                                 const int& new_nall,
-                                 const int& max_nnei_trial,
-                                 const float& rcut_r) {
+static int _build_nlist_gpu(OpKernelContext* context,
+                            Tensor* tensor_list,
+                            int*& ilist,
+                            int*& numneigh,
+                            int**& firstneigh,
+                            int*& jlist,
+                            int& max_nnei,
+                            int& mem_nnei,
+                            const FPTYPE* coord,
+                            const int& nloc,
+                            const int& new_nall,
+                            const int& max_nnei_trial,
+                            const float& rcut_r) {
   // Tensor nlist_temp;
   TensorShape nlist_shape;
   nlist_shape.AddDim(nloc * 2);
@@ -1929,8 +1929,8 @@ static int _build_nlist_gpu_rocm(OpKernelContext* context,
     }
     deepmd::memcpy_host_to_device(firstneigh, firstneigh_host);
     deepmd::InputNlist inlist(nloc, ilist, numneigh, firstneigh);
-    int ret = deepmd::build_nlist_gpu_rocm(inlist, &max_nnei, ind_data, coord,
-                                           nloc, new_nall, mem_nnei, rcut_r);
+    int ret = deepmd::build_nlist_gpu(inlist, &max_nnei, ind_data, coord, nloc,
+                                      new_nall, mem_nnei, rcut_r);
     if (ret == 0) {
       break;
     } else {
@@ -1940,58 +1940,58 @@ static int _build_nlist_gpu_rocm(OpKernelContext* context,
   return (tt != max_nnei_trial);
 }
 
-static void _map_nlist_gpu_rocm(int* nlist,
-                                const int* idx_mapping,
-                                const int& nloc,
-                                const int& nnei) {
+static void _map_nlist_gpu(int* nlist,
+                           const int* idx_mapping,
+                           const int& nloc,
+                           const int& nnei) {
   deepmd::use_nlist_map(nlist, idx_mapping, nloc, nnei);
 }
 
-static void _map_nei_info_gpu_rocm(int* nlist,
-                                   int* ntype,
-                                   bool* nmask,
-                                   const int* type,
-                                   const int* idx_mapping,
-                                   const int& nloc,
-                                   const int& nnei,
-                                   const int& ntypes,
-                                   const bool& b_nlist_map) {
-  deepmd::use_nei_info_gpu_rocm(nlist, ntype, nmask, type, idx_mapping, nloc,
-                                nnei, ntypes, b_nlist_map);
+static void _map_nei_info_gpu(int* nlist,
+                              int* ntype,
+                              bool* nmask,
+                              const int* type,
+                              const int* idx_mapping,
+                              const int& nloc,
+                              const int& nnei,
+                              const int& ntypes,
+                              const bool& b_nlist_map) {
+  deepmd::use_nei_info_gpu(nlist, ntype, nmask, type, idx_mapping, nloc, nnei,
+                           ntypes, b_nlist_map);
 }
 
 template <typename FPTYPE>
-static void _prepare_coord_nlist_gpu_rocm(OpKernelContext* context,
-                                          Tensor* tensor_list,
-                                          FPTYPE const** coord,
-                                          FPTYPE*& coord_cpy,
-                                          int const** type,
-                                          int*& type_cpy,
-                                          int*& idx_mapping,
-                                          deepmd::InputNlist& inlist,
-                                          int*& ilist,
-                                          int*& numneigh,
-                                          int**& firstneigh,
-                                          int*& jlist,
-                                          int*& nbor_list_dev,
-                                          int& new_nall,
-                                          int& mem_cpy,
-                                          int& mem_nnei,
-                                          int& max_nbor_size,
-                                          const FPTYPE* box,
-                                          const int* mesh_tensor_data,
-                                          const int mesh_tensor_size,
-                                          const int& nloc,
-                                          const int& nei_mode,
-                                          const float& rcut_r,
-                                          const int& max_cpy_trial,
-                                          const int& max_nnei_trial) {
+static void _prepare_coord_nlist_gpu(OpKernelContext* context,
+                                     Tensor* tensor_list,
+                                     FPTYPE const** coord,
+                                     FPTYPE*& coord_cpy,
+                                     int const** type,
+                                     int*& type_cpy,
+                                     int*& idx_mapping,
+                                     deepmd::InputNlist& inlist,
+                                     int*& ilist,
+                                     int*& numneigh,
+                                     int**& firstneigh,
+                                     int*& jlist,
+                                     int*& nbor_list_dev,
+                                     int& new_nall,
+                                     int& mem_cpy,
+                                     int& mem_nnei,
+                                     int& max_nbor_size,
+                                     const FPTYPE* box,
+                                     const int* mesh_tensor_data,
+                                     const int mesh_tensor_size,
+                                     const int& nloc,
+                                     const int& nei_mode,
+                                     const float& rcut_r,
+                                     const int& max_cpy_trial,
+                                     const int& max_nnei_trial) {
   if (nei_mode != 3 && nei_mode != 4) {
     inlist.inum = nloc;
     // build nlist by myself
     // normalize and copy coord
     if (nei_mode == 1) {
-      int copy_ok = _norm_copy_coord_gpu_rocm(
+      int copy_ok = _norm_copy_coord_gpu(
           context, tensor_list, coord_cpy, type_cpy, idx_mapping, new_nall,
           mem_cpy, *coord, box, *type, nloc, max_cpy_trial, rcut_r);
       OP_REQUIRES(context, copy_ok,
@@ -2001,9 +2001,9 @@ static void _prepare_coord_nlist_gpu_rocm(OpKernelContext* context,
     }
     // build nlist
     int build_ok =
-        _build_nlist_gpu_rocm(context, tensor_list + 5, ilist, numneigh,
-                              firstneigh, jlist, max_nbor_size, mem_nnei,
-                              *coord, nloc, new_nall, max_nnei_trial, rcut_r);
+        _build_nlist_gpu(context, tensor_list + 5, ilist, numneigh, firstneigh,
+                         jlist, max_nbor_size, mem_nnei, *coord, nloc, new_nall,
+                         max_nnei_trial, rcut_r);
     OP_REQUIRES(context, build_ok,
                 errors::Aborted("cannot allocate mem for nlist"));
     if (max_nbor_size <= 1024) {
diff --git a/source/op/prod_force_grad_multi_device.cc b/source/op/prod_force_grad_multi_device.cc
index 228b76e962..7d8a664a8d 100644
--- a/source/op/prod_force_grad_multi_device.cc
+++ b/source/op/prod_force_grad_multi_device.cc
@@ -122,13 +122,13 @@ class ProdForceSeAGradOp : public OpKernel {
 
     if (device == "GPU") {
 #if GOOGLE_CUDA
-      deepmd::prod_force_grad_a_gpu_cuda(p_grad_net, p_grad, p_in_deriv,
-                                         p_nlist, nloc, nnei, nframes);
+      deepmd::prod_force_grad_a_gpu(p_grad_net, p_grad, p_in_deriv, p_nlist,
+                                    nloc, nnei, nframes);
 #endif  // GOOGLE_CUDA
 
 #if TENSORFLOW_USE_ROCM
-      deepmd::prod_force_grad_a_gpu_rocm(p_grad_net, p_grad, p_in_deriv,
-                                         p_nlist, nloc, nnei, nframes);
+      deepmd::prod_force_grad_a_gpu(p_grad_net, p_grad, p_in_deriv, p_nlist,
+                                    nloc, nnei, nframes);
 #endif  // TENSORFLOW_USE_ROCM
     } else if (device == "CPU") {
       deepmd::prod_force_grad_a_cpu(p_grad_net, p_grad, p_in_deriv, p_nlist,
@@ -235,13 +235,13 @@ class ProdForceSeRGradOp : public OpKernel {
 
     if (device == "GPU") {
 #if GOOGLE_CUDA
-      deepmd::prod_force_grad_r_gpu_cuda(p_grad_net, p_grad, p_in_deriv,
-                                         p_nlist, nloc, nnei, nframes);
+      deepmd::prod_force_grad_r_gpu(p_grad_net, p_grad, p_in_deriv, p_nlist,
+                                    nloc, nnei, nframes);
 #endif  // GOOGLE_CUDA
 
 #if TENSORFLOW_USE_ROCM
-      deepmd::prod_force_grad_r_gpu_rocm(p_grad_net, p_grad, p_in_deriv,
-                                         p_nlist, nloc, nnei, nframes);
+      deepmd::prod_force_grad_r_gpu(p_grad_net, p_grad, p_in_deriv, p_nlist,
+                                    nloc, nnei, nframes);
 #endif  // TENSORFLOW_USE_ROCM
     } else if (device == "CPU") {
       deepmd::prod_force_grad_r_cpu(p_grad_net, p_grad, p_in_deriv, p_nlist,
diff --git a/source/op/prod_force_multi_device.cc b/source/op/prod_force_multi_device.cc
index 036064b02d..9d553b1f0c 100644
--- a/source/op/prod_force_multi_device.cc
+++ b/source/op/prod_force_multi_device.cc
@@ -143,13 +143,13 @@ class ProdForceSeAOp : public OpKernel {
 
     if (device == "GPU") {
 #if GOOGLE_CUDA
-      deepmd::prod_force_a_gpu_cuda(p_force, p_net_deriv, p_in_deriv, p_nlist,
-                                    nloc, nall, nnei, nframes);
+      deepmd::prod_force_a_gpu(p_force, p_net_deriv, p_in_deriv, p_nlist, nloc,
+                               nall, nnei, nframes);
 #endif  // GOOGLE_CUDA
 
 #if TENSORFLOW_USE_ROCM
-      deepmd::prod_force_a_gpu_rocm(p_force, p_net_deriv, p_in_deriv, p_nlist,
-                                    nloc, nall, nnei, nframes);
+      deepmd::prod_force_a_gpu(p_force, p_net_deriv, p_in_deriv, p_nlist, nloc,
+                               nall, nnei, nframes);
 #endif  // TENSORFLOW_USE_ROCM
     } else if (device == "CPU") {
       deepmd::prod_force_a_cpu(p_force, p_net_deriv, p_in_deriv, p_nlist, nloc,
@@ -229,13 +229,13 @@ class ProdForceSeROp : public OpKernel {
 
     if (device == "GPU") {
 #if GOOGLE_CUDA
-      deepmd::prod_force_r_gpu_cuda(p_force, p_net_deriv, p_in_deriv, p_nlist,
-                                    nloc, nall, nnei, nframes);
+      deepmd::prod_force_r_gpu(p_force, p_net_deriv, p_in_deriv, p_nlist, nloc,
+                               nall, nnei, nframes);
 #endif  // GOOGLE_CUDA
 
 #if TENSORFLOW_USE_ROCM
-      deepmd::prod_force_r_gpu_rocm(p_force, p_net_deriv, p_in_deriv, p_nlist,
-                                    nloc, nall, nnei, nframes);
+      deepmd::prod_force_r_gpu(p_force, p_net_deriv, p_in_deriv, p_nlist, nloc,
+                               nall, nnei, nframes);
 #endif  // TENSORFLOW_USE_ROCM
     } else if (device == "CPU") {
       deepmd::prod_force_r_cpu(p_force, p_net_deriv, p_in_deriv, p_nlist, nloc,
diff --git a/source/op/prod_virial_grad_multi_device.cc b/source/op/prod_virial_grad_multi_device.cc
index 1c035f53ca..ef7d10b3bd 100644
--- a/source/op/prod_virial_grad_multi_device.cc
+++ b/source/op/prod_virial_grad_multi_device.cc
@@ -143,13 +143,13 @@ class ProdVirialSeAGradOp : public OpKernel {
       const int* nlist = p_nlist + kk * nloc * nnei;
       if (device == "GPU") {
 #if GOOGLE_CUDA
-        deepmd::prod_virial_grad_a_gpu_cuda(grad_net, grad, in_deriv, rij,
-                                            nlist, nloc, nnei);
+        deepmd::prod_virial_grad_a_gpu(grad_net, grad, in_deriv, rij, nlist,
+                                       nloc, nnei);
 #endif  // GOOGLE_CUDA
 
 #if TENSORFLOW_USE_ROCM
-        deepmd::prod_virial_grad_a_gpu_rocm(grad_net, grad, in_deriv, rij,
-                                            nlist, nloc, nnei);
+        deepmd::prod_virial_grad_a_gpu(grad_net, grad, in_deriv, rij, nlist,
+                                       nloc, nnei);
 #endif  // TENSORFLOW_USE_ROCM
       } else if (device == "CPU") {
         deepmd::prod_virial_grad_a_cpu(grad_net, grad, in_deriv, rij, nlist,
@@ -276,13 +276,13 @@ class ProdVirialSeRGradOp : public OpKernel {
       const int* nlist = p_nlist + kk * nloc * nnei;
       if (device == "GPU") {
 #if GOOGLE_CUDA
-        deepmd::prod_virial_grad_r_gpu_cuda(grad_net, grad, in_deriv, rij,
-                                            nlist, nloc, nnei);
+        deepmd::prod_virial_grad_r_gpu(grad_net, grad, in_deriv, rij, nlist,
+                                       nloc, nnei);
 #endif  // GOOGLE_CUDA
 
 #if TENSORFLOW_USE_ROCM
-        deepmd::prod_virial_grad_r_gpu_rocm(grad_net, grad, in_deriv, rij,
-                                            nlist, nloc, nnei);
+        deepmd::prod_virial_grad_r_gpu(grad_net, grad, in_deriv, rij, nlist,
+                                       nloc, nnei);
 #endif  // TENSORFLOW_USE_ROCM
       } else if (device == "CPU") {
         deepmd::prod_virial_grad_r_cpu(grad_net, grad, in_deriv, rij, nlist,
diff --git a/source/op/prod_virial_multi_device.cc b/source/op/prod_virial_multi_device.cc
index db13617362..e3960fc37d 100644
--- a/source/op/prod_virial_multi_device.cc
+++ b/source/op/prod_virial_multi_device.cc
@@ -121,13 +121,13 @@ class ProdVirialSeAOp : public OpKernel {
       const int* nlist = p_nlist + kk * nloc * nnei;
       if (device == "GPU") {
 #if GOOGLE_CUDA
-        deepmd::prod_virial_a_gpu_cuda(virial, atom_virial, net_deriv, in_deriv,
-                                       rij, nlist, nloc, nall, nnei);
+        deepmd::prod_virial_a_gpu(virial, atom_virial, net_deriv, in_deriv, rij,
+                                  nlist, nloc, nall, nnei);
 #endif  // GOOGLE_CUDA
 
 #if TENSORFLOW_USE_ROCM
-        deepmd::prod_virial_a_gpu_rocm(virial, atom_virial, net_deriv, in_deriv,
-                                       rij, nlist, nloc, nall, nnei);
+        deepmd::prod_virial_a_gpu(virial, atom_virial, net_deriv, in_deriv, rij,
+                                  nlist, nloc, nall, nnei);
 #endif  // TENSORFLOW_USE_ROCM
       } else if (device == "CPU") {
         deepmd::prod_virial_a_cpu(virial, atom_virial, net_deriv, in_deriv, rij,
@@ -225,13 +225,13 @@ class ProdVirialSeROp : public OpKernel {
       const int* nlist = p_nlist + kk * nloc * nnei;
       if (device == "GPU") {
 #if GOOGLE_CUDA
-        deepmd::prod_virial_r_gpu_cuda(virial, atom_virial, net_deriv, in_deriv,
-                                       rij, nlist, nloc, nall, nnei);
+        deepmd::prod_virial_r_gpu(virial, atom_virial, net_deriv, in_deriv, rij,
+                                  nlist, nloc, nall, nnei);
 #endif  // GOOGLE_CUDA
 
 #if TENSORFLOW_USE_ROCM
-        deepmd::prod_virial_r_gpu_rocm(virial, atom_virial, net_deriv, in_deriv,
-                                       rij, nlist, nloc, nall, nnei);
+        deepmd::prod_virial_r_gpu(virial, atom_virial, net_deriv, in_deriv, rij,
+                                  nlist, nloc, nall, nnei);
 #endif  // TENSORFLOW_USE_ROCM
       } else if (device == "CPU") {
         deepmd::prod_virial_r_cpu(virial, atom_virial, net_deriv, in_deriv, rij,
diff --git a/source/op/tabulate_multi_device.cc b/source/op/tabulate_multi_device.cc
index 0ac8745f64..886b9d9a6d 100644
--- a/source/op/tabulate_multi_device.cc
+++ b/source/op/tabulate_multi_device.cc
@@ -197,15 +197,13 @@ class TabulateFusionSeAOp : public OpKernel {
 
     if (device == "GPU") {
 #if GOOGLE_CUDA
-      deepmd::tabulate_fusion_se_a_gpu_cuda(descriptor, table, table_info, em_x,
-                                            em, two_embed, nloc, nnei,
-                                            last_layer_size);
+      deepmd::tabulate_fusion_se_a_gpu(descriptor, table, table_info, em_x, em,
+                                       two_embed, nloc, nnei, last_layer_size);
 #endif  // GOOGLE_CUDA
 
 #if TENSORFLOW_USE_ROCM
-      deepmd::tabulate_fusion_se_a_gpu_rocm(descriptor, table, table_info, em_x,
-                                            em, two_embed, nloc, nnei,
-                                            last_layer_size);
+      deepmd::tabulate_fusion_se_a_gpu(descriptor, table, table_info, em_x, em,
+                                       two_embed, nloc, nnei, last_layer_size);
 #endif  // TENSORFLOW_USE_ROCM
     } else if (device == "CPU") {
       deepmd::tabulate_fusion_se_a_cpu(descriptor, table, table_info, em_x, em,
@@ -269,15 +267,15 @@ class TabulateFusionSeAGradOp : public OpKernel {
 
     if (device == "GPU") {
 #if GOOGLE_CUDA
-      deepmd::tabulate_fusion_se_a_grad_gpu_cuda(
-          dy_dem_x, dy_dem, table, table_info, em_x, em, two_embed, dy, nloc,
-          nnei, last_layer_size);
+      deepmd::tabulate_fusion_se_a_grad_gpu(dy_dem_x, dy_dem, table, table_info,
+                                            em_x, em, two_embed, dy, nloc, nnei,
+                                            last_layer_size);
 #endif  // GOOGLE_CUDA
 
 #if TENSORFLOW_USE_ROCM
-      deepmd::tabulate_fusion_se_a_grad_gpu_rocm(
-          dy_dem_x, dy_dem, table, table_info, em_x, em, two_embed, dy, nloc,
-          nnei, last_layer_size);
+      deepmd::tabulate_fusion_se_a_grad_gpu(dy_dem_x, dy_dem, table, table_info,
+                                            em_x, em, two_embed, dy, nloc, nnei,
+                                            last_layer_size);
 #endif  // TENSORFLOW_USE_ROCM
     } else if (device == "CPU") {
       deepmd::tabulate_fusion_se_a_grad_cpu(dy_dem_x, dy_dem, table, table_info,
@@ -333,12 +331,12 @@ class TabulateFusionSeAGradGradOp : public OpKernel {
 
     if (device == "GPU") {
 #if GOOGLE_CUDA
-      deepmd::tabulate_fusion_se_a_grad_grad_gpu_cuda(
+      deepmd::tabulate_fusion_se_a_grad_grad_gpu(
           dz_dy, table, table_info, em_x, em, dz_dy_dem_x, dz_dy_dem, nloc,
           nnei, last_layer_size, is_sorted);
 #endif  // GOOGLE_CUDA
 #if TENSORFLOW_USE_ROCM
-      deepmd::tabulate_fusion_se_a_grad_grad_gpu_rocm(
+      deepmd::tabulate_fusion_se_a_grad_grad_gpu(
           dz_dy, table, table_info, em_x, em, dz_dy_dem_x, dz_dy_dem, nloc,
           nnei, last_layer_size, is_sorted);
 #endif  // TENSORFLOW_USE_ROCM
@@ -411,15 +409,15 @@ class TabulateFusionSeAttenOp : public OpKernel {
 
     if (device == "GPU") {
 #if GOOGLE_CUDA
-      deepmd::tabulate_fusion_se_a_gpu_cuda(descriptor, table, table_info, em_x,
-                                            em, two_embed, nloc, nnei,
-                                            last_layer_size, is_sorted);
+      deepmd::tabulate_fusion_se_a_gpu(descriptor, table, table_info, em_x, em,
+                                       two_embed, nloc, nnei, last_layer_size,
+                                       is_sorted);
 #endif  // GOOGLE_CUDA
 
 #if TENSORFLOW_USE_ROCM
-      deepmd::tabulate_fusion_se_a_gpu_rocm(descriptor, table, table_info, em_x,
-                                            em, two_embed, nloc, nnei,
-                                            last_layer_size, is_sorted);
+      deepmd::tabulate_fusion_se_a_gpu(descriptor, table, table_info, em_x, em,
+                                       two_embed, nloc, nnei, last_layer_size,
+                                       is_sorted);
 #endif  // TENSORFLOW_USE_ROCM
     } else if (device == "CPU") {
       deepmd::tabulate_fusion_se_a_cpu(descriptor, table, table_info, em_x, em,
@@ -492,15 +490,15 @@ class TabulateFusionSeAttenGradOp : public OpKernel {
 
     if (device == "GPU") {
 #if GOOGLE_CUDA
-      deepmd::tabulate_fusion_se_a_grad_gpu_cuda(
-          dy_dem_x, dy_dem, table, table_info, em_x, em, two_embed, dy, nloc,
-          nnei, last_layer_size, is_sorted);
+      deepmd::tabulate_fusion_se_a_grad_gpu(dy_dem_x, dy_dem, table, table_info,
+                                            em_x, em, two_embed, dy, nloc, nnei,
+                                            last_layer_size, is_sorted);
 #endif  // GOOGLE_CUDA
 
 #if TENSORFLOW_USE_ROCM
-      deepmd::tabulate_fusion_se_a_grad_gpu_rocm(
-          dy_dem_x, dy_dem, table, table_info, em_x, em, two_embed, dy, nloc,
-          nnei, last_layer_size, is_sorted);
+      deepmd::tabulate_fusion_se_a_grad_gpu(dy_dem_x, dy_dem, table, table_info,
+                                            em_x, em, two_embed, dy, nloc, nnei,
+                                            last_layer_size, is_sorted);
 #endif  // TENSORFLOW_USE_ROCM
     } else if (device == "CPU") {
       deepmd::tabulate_fusion_se_a_grad_cpu(dy_dem_x, dy_dem, table, table_info,
@@ -562,15 +560,13 @@ class TabulateFusionSeTOp : public OpKernel {
 
     if (device == "GPU") {
 #if GOOGLE_CUDA
-      deepmd::tabulate_fusion_se_t_gpu_cuda(descriptor, table, table_info, em_x,
-                                            em, nloc, nnei_i, nnei_j,
-                                            last_layer_size);
+      deepmd::tabulate_fusion_se_t_gpu(descriptor, table, table_info, em_x, em,
+                                       nloc, nnei_i, nnei_j, last_layer_size);
 #endif  // GOOGLE_CUDA
 
 #if TENSORFLOW_USE_ROCM
-      deepmd::tabulate_fusion_se_t_gpu_rocm(descriptor, table, table_info, em_x,
-                                            em, nloc, nnei_i, nnei_j,
-                                            last_layer_size);
+      deepmd::tabulate_fusion_se_t_gpu(descriptor, table, table_info, em_x, em,
+                                       nloc, nnei_i, nnei_j, last_layer_size);
 #endif  // TENSORFLOW_USE_ROCM
     } else if (device == "CPU") {
       deepmd::tabulate_fusion_se_t_cpu(descriptor, table, table_info, em_x, em,
@@ -632,15 +628,15 @@ class TabulateFusionSeTGradOp : public OpKernel {
 
     if (device == "GPU") {
 #if GOOGLE_CUDA
-      deepmd::tabulate_fusion_se_t_grad_gpu_cuda(
-          dy_dem_x, dy_dem, table, table_info, em_x, em, dy, nloc, nnei_i,
-          nnei_j, last_layer_size);
+      deepmd::tabulate_fusion_se_t_grad_gpu(dy_dem_x, dy_dem, table, table_info,
+                                            em_x, em, dy, nloc, nnei_i, nnei_j,
+                                            last_layer_size);
 #endif  // GOOGLE_CUDA
 
 #if TENSORFLOW_USE_ROCM
-      deepmd::tabulate_fusion_se_t_grad_gpu_rocm(
-          dy_dem_x, dy_dem, table, table_info, em_x, em, dy, nloc, nnei_i,
-          nnei_j, last_layer_size);
+      deepmd::tabulate_fusion_se_t_grad_gpu(dy_dem_x, dy_dem, table, table_info,
+                                            em_x, em, dy, nloc, nnei_i, nnei_j,
+                                            last_layer_size);
 #endif  // TENSORFLOW_USE_ROCM
     } else if (device == "CPU") {
       deepmd::tabulate_fusion_se_t_grad_cpu(dy_dem_x, dy_dem, table, table_info,
@@ -695,12 +691,12 @@ class TabulateFusionSeTGradGradOp : public OpKernel {
 
     if (device == "GPU") {
 #if GOOGLE_CUDA
-      deepmd::tabulate_fusion_se_t_grad_grad_gpu_cuda(
+      deepmd::tabulate_fusion_se_t_grad_grad_gpu(
           dz_dy, table, table_info, em_x, em, dz_dy_dem_x, dz_dy_dem, nloc,
           nnei_i, nnei_j, last_layer_size);
 #endif  // GOOGLE_CUDA
 #if TENSORFLOW_USE_ROCM
-      deepmd::tabulate_fusion_se_t_grad_grad_gpu_rocm(
+      deepmd::tabulate_fusion_se_t_grad_grad_gpu(
           dz_dy, table, table_info, em_x, em, dz_dy_dem_x, dz_dy_dem, nloc,
           nnei_i, nnei_j, last_layer_size);
 #endif  // TENSORFLOW_USE_ROCM
@@ -763,13 +759,13 @@ class TabulateFusionSeROp : public OpKernel {
 
     if (device == "GPU") {
 #if GOOGLE_CUDA
-      deepmd::tabulate_fusion_se_r_gpu_cuda(descriptor, table, table_info, em,
-                                            nloc, nnei, last_layer_size);
+      deepmd::tabulate_fusion_se_r_gpu(descriptor, table, table_info, em, nloc,
+                                       nnei, last_layer_size);
 #endif  // GOOGLE_CUDA
 
 #if TENSORFLOW_USE_ROCM
-      deepmd::tabulate_fusion_se_r_gpu_rocm(descriptor, table, table_info, em,
-                                            nloc, nnei, last_layer_size);
+      deepmd::tabulate_fusion_se_r_gpu(descriptor, table, table_info, em, nloc,
+                                       nnei, last_layer_size);
 #endif  // TENSORFLOW_USE_ROCM
     } else if (device == "CPU") {
       deepmd::tabulate_fusion_se_r_cpu(descriptor, table, table_info, em, nloc,
@@ -823,13 +819,13 @@ class TabulateFusionSeRGradOp : public OpKernel {
 
     if (device == "GPU") {
 #if GOOGLE_CUDA
-      deepmd::tabulate_fusion_se_r_grad_gpu_cuda(
-          dy_dem, table, table_info, em, dy, nloc, nnei, last_layer_size);
+      deepmd::tabulate_fusion_se_r_grad_gpu(dy_dem, table, table_info, em, dy,
+                                            nloc, nnei, last_layer_size);
 #endif  // GOOGLE_CUDA
 
 #if TENSORFLOW_USE_ROCM
-      deepmd::tabulate_fusion_se_r_grad_gpu_rocm(
-          dy_dem, table, table_info, em, dy, nloc, nnei, last_layer_size);
+      deepmd::tabulate_fusion_se_r_grad_gpu(dy_dem, table, table_info, em, dy,
+                                            nloc, nnei, last_layer_size);
 #endif  // TENSORFLOW_USE_ROCM
     } else if (device == "CPU") {
       deepmd::tabulate_fusion_se_r_grad_cpu(dy_dem, table, table_info, em, dy,
@@ -876,11 +872,11 @@ class TabulateFusionSeRGradGradOp : public OpKernel {
 
     if (device == "GPU") {
 #if GOOGLE_CUDA
-      deepmd::tabulate_fusion_se_r_grad_grad_gpu_cuda(
+      deepmd::tabulate_fusion_se_r_grad_grad_gpu(
           dz_dy, table, table_info, em, dz_dy_dem, nloc, nnei, last_layer_size);
 #endif  // GOOGLE_CUDA
 #if TENSORFLOW_USE_ROCM
-      deepmd::tabulate_fusion_se_r_grad_grad_gpu_rocm(
+      deepmd::tabulate_fusion_se_r_grad_grad_gpu(
           dz_dy, table, table_info, em, dz_dy_dem, nloc, nnei, last_layer_size);
 #endif  // TENSORFLOW_USE_ROCM
       OP_REQUIRES(context, (last_layer_size <= 1024),

From b34e5a31f8f6e4b38064872ada0cac27fd2929e0 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Wed, 20 Sep 2023 15:43:00 +0800
Subject: [PATCH 44/63] [pre-commit.ci] pre-commit autoupdate (#2840)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

<!--pre-commit.ci start-->
updates:
- [github.com/astral-sh/ruff-pre-commit: v0.0.288 →
v0.0.290](https://github.com/astral-sh/ruff-pre-commit/compare/v0.0.288...v0.0.290)
<!--pre-commit.ci end-->

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
---
 .pre-commit-config.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 19c29c0322..8125324ea1 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -29,7 +29,7 @@ repos:
       files: \.py$
 -   repo: https://github.com/astral-sh/ruff-pre-commit
     # Ruff version.
-    rev: v0.0.288
+    rev: v0.0.290
     hooks:
     - id: ruff
       args: ["--fix"]

From 338018ceffe9017f6225c05c9d3e04011b46b0d1 Mon Sep 17 00:00:00 2001
From: Jinzhe Zeng <jinzhe.zeng@rutgers.edu>
Date: Wed, 20 Sep 2023 03:49:03 -0400
Subject: [PATCH 45/63] merge CUDA and ROCm in header files (#2845)

Signed-off-by: Jinzhe Zeng <jinzhe.zeng@rutgers.edu>
---
 source/lib/include/coord.h              |  45 +---------
 source/lib/include/fmt_nlist.h          |  28 +-----
 source/lib/include/gelu.h               |  23 +----
 source/lib/include/neighbor_list.h      |  46 +---------
 source/lib/include/prod_env_mat.h       |  52 +----------
 source/lib/include/prod_env_mat_nvnmd.h |   8 +-
 source/lib/include/prod_force.h         |  26 +-----
 source/lib/include/prod_force_grad.h    |  23 +----
 source/lib/include/prod_virial.h        |  28 +-----
 source/lib/include/prod_virial_grad.h   |  24 +----
 source/lib/include/region.h             |  19 +---
 source/lib/include/tabulate.h           | 112 +-----------------------
 12 files changed, 24 insertions(+), 410 deletions(-)

diff --git a/source/lib/include/coord.h b/source/lib/include/coord.h
index fb60f6440b..699a90898c 100644
--- a/source/lib/include/coord.h
+++ b/source/lib/include/coord.h
@@ -44,7 +44,7 @@ void compute_cell_info(int* cell_info,
                        const float& rcut,
                        const deepmd::Region<FPTYPE>& region);
 
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 // normalize coords
 // output:
 // coord
@@ -83,47 +83,6 @@ int copy_coord_gpu(FPTYPE* out_c,
                    const int& total_cellnum,
                    const int* cell_info,
                    const deepmd::Region<FPTYPE>& region);
-#endif  // GOOGLE_CUDA
-
-#if TENSORFLOW_USE_ROCM
-// normalize coords
-// output:
-// coord
-// input:
-// natom, box_info: boxt, rec_boxt
-template <typename FPTYPE>
-void normalize_coord_gpu(FPTYPE* coord,
-                         const int natom,
-                         const deepmd::Region<FPTYPE>& region);
-
-// copy coordinates
-// outputs:
-//	out_c, out_t, mapping, nall,
-//  int_data(temp cuda
-//  memory):idx_map,idx_map_noshift,temp_idx_order,loc_cellnum_map,total_cellnum_map,mask_cellnum_map,
-//                             cell_map,cell_shift_map,sec_loc_cellnum_map,sec_total_cellnum_map,loc_clist
-// inputs:
-//	in_c, in_t, nloc, mem_nall, loc_cellnum, total_cellnum, cell_info,
-// box_info 	mem_nall is the size of allocated memory for out_c, out_t,
-// mapping
-// returns
-//	0: succssful
-//	1: the memory is not large enough to hold all copied coords and types.
-//	   i.e. nall > mem_nall
-template <typename FPTYPE>
-int copy_coord_gpu(FPTYPE* out_c,
-                   int* out_t,
-                   int* mapping,
-                   int* nall,
-                   int* int_data,
-                   const FPTYPE* in_c,
-                   const int* in_t,
-                   const int& nloc,
-                   const int& mem_nall,
-                   const int& loc_cellnum,
-                   const int& total_cellnum,
-                   const int* cell_info,
-                   const deepmd::Region<FPTYPE>& region);
-#endif  // TENSORFLOW_USE_ROCM
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 
 }  // namespace deepmd
diff --git a/source/lib/include/fmt_nlist.h b/source/lib/include/fmt_nlist.h
index 1e7c6574cc..18cb319304 100644
--- a/source/lib/include/fmt_nlist.h
+++ b/source/lib/include/fmt_nlist.h
@@ -18,7 +18,7 @@ void format_nlist_cpu(int* nlist,
                       const float rcut,
                       const std::vector<int> sec);
 
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 template <typename FPTYPE>
 void format_nbor_list_gpu(int* nlist,
                           const FPTYPE* coord,
@@ -40,31 +40,7 @@ void test_encoding_decoding_nbor_info_gpu(uint_64* key,
                                           const FPTYPE* in_dist,
                                           const int* in_index,
                                           const int size_of_array);
-#endif  // GOOGLE_CUDA
-
-#if TENSORFLOW_USE_ROCM
-template <typename FPTYPE>
-void format_nbor_list_gpu(int* nlist,
-                          const FPTYPE* coord,
-                          const int* type,
-                          const deepmd::InputNlist& gpu_inlist,
-                          int* array_int,
-                          uint_64* array_longlong,
-                          const int max_nbor_size,
-                          const int nloc,
-                          const int nall,
-                          const float rcut,
-                          const std::vector<int> sec);
-
-template <typename FPTYPE>
-void test_encoding_decoding_nbor_info_gpu(uint_64* key,
-                                          int* out_type,
-                                          int* out_index,
-                                          const int* in_type,
-                                          const FPTYPE* in_dist,
-                                          const int* in_index,
-                                          const int size_of_array);
-#endif  // TENSORFLOW_USE_ROCM
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 
 }  // namespace deepmd
 
diff --git a/source/lib/include/gelu.h b/source/lib/include/gelu.h
index 946c283c8d..013d4ef02b 100644
--- a/source/lib/include/gelu.h
+++ b/source/lib/include/gelu.h
@@ -20,7 +20,7 @@ void gelu_grad_grad_cpu(FPTYPE* out,
                         const FPTYPE* dy_2,
                         const int_64 size);
 
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 template <typename FPTYPE>
 void gelu_gpu(FPTYPE* out, const FPTYPE* xx, const int_64 size);
 
@@ -36,24 +36,5 @@ void gelu_grad_grad_gpu(FPTYPE* out,
                         const FPTYPE* dy,
                         const FPTYPE* dy_2,
                         const int_64 size);
-#endif  // GOOGLE_CUDA
-
-#if TENSORFLOW_USE_ROCM
-template <typename FPTYPE>
-void gelu_gpu(FPTYPE* out, const FPTYPE* xx, const int_64 size);
-
-template <typename FPTYPE>
-void gelu_grad_gpu(FPTYPE* out,
-                   const FPTYPE* xx,
-                   const FPTYPE* dy,
-                   const int_64 size);
-
-template <typename FPTYPE>
-void gelu_grad_grad_gpu(FPTYPE* out,
-                        const FPTYPE* xx,
-                        const FPTYPE* dy,
-                        const FPTYPE* dy_2,
-                        const int_64 size);
-
-#endif  // TENSORFLOW_USE_ROCM
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 }  // namespace deepmd
diff --git a/source/lib/include/neighbor_list.h b/source/lib/include/neighbor_list.h
index 5ed2dd4501..eb510eb25b 100644
--- a/source/lib/include/neighbor_list.h
+++ b/source/lib/include/neighbor_list.h
@@ -121,7 +121,7 @@ void use_nlist_map(int* nlist,
 
 #endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 // build neighbor list.
 // outputs
 //	nlist, max_list_size
@@ -162,49 +162,7 @@ void use_nei_info_gpu(int* nlist,
                       const int ntypes,
                       const bool b_nlist_map);
 
-#endif  // GOOGLE_CUDA
-
-#if TENSORFLOW_USE_ROCM
-// build neighbor list.
-// outputs
-//	nlist, max_list_size
-//	max_list_size is the maximal size of jlist.
-// inputs
-//	c_cpy, nloc, nall, mem_size, rcut, region
-//	mem_size is the size of allocated memory for jlist.
-// returns
-//	0: succssful
-//	1: the memory is not large enough to hold all neighbors.
-//	   i.e. max_list_size > mem_nall
-template <typename FPTYPE>
-int build_nlist_gpu(InputNlist& nlist,
-                    int* max_list_size,
-                    int* nlist_data,
-                    const FPTYPE* c_cpy,
-                    const int& nloc,
-                    const int& nall,
-                    const int& mem_size,
-                    const float& rcut);
-/**
- * @brief Filter the fake atom type.
- * @details If >=0, set to 0; if <0, set to -1.
- * @param ftype_out The output filtered atom type.
- * @param ftype_in The input atom type.
- * @param nloc The number of atoms.
- */
-void filter_ftype_gpu(int* ftype_out, const int* ftype_in, const int nloc);
-
-void use_nei_info_gpu(int* nlist,
-                      int* ntype,
-                      bool* nmask,
-                      const int* type,
-                      const int* nlist_map,
-                      const int nloc,
-                      const int nnei,
-                      const int ntypes,
-                      const bool b_nlist_map);
-
-#endif  // TENSORFLOW_USE_ROCM
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 
 }  // namespace deepmd
 
diff --git a/source/lib/include/prod_env_mat.h b/source/lib/include/prod_env_mat.h
index 91f09f74e7..60da638d68 100644
--- a/source/lib/include/prod_env_mat.h
+++ b/source/lib/include/prod_env_mat.h
@@ -42,7 +42,7 @@ void prod_env_mat_r_cpu(FPTYPE *em,
                         const float rcut_smth,
                         const std::vector<int> sec);
 
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 template <typename FPTYPE>
 void prod_env_mat_a_gpu(FPTYPE *em,
                         FPTYPE *em_deriv,
@@ -88,54 +88,6 @@ void env_mat_nbor_update(InputNlist &inlist,
                          int *&nbor_list_dev,
                          const int *mesh,
                          const int size);
-#endif  // GOOGLE_CUDA
-
-#if TENSORFLOW_USE_ROCM
-template <typename FPTYPE>
-void prod_env_mat_a_gpu(FPTYPE *em,
-                        FPTYPE *em_deriv,
-                        FPTYPE *rij,
-                        int *nlist,
-                        const FPTYPE *coord,
-                        const int *type,
-                        const InputNlist &gpu_inlist,
-                        int *array_int,
-                        unsigned long long *array_longlong,
-                        const int max_nbor_size,
-                        const FPTYPE *avg,
-                        const FPTYPE *std,
-                        const int nloc,
-                        const int nall,
-                        const float rcut,
-                        const float rcut_smth,
-                        const std::vector<int> sec,
-                        const int *f_type = NULL);
-
-template <typename FPTYPE>
-void prod_env_mat_r_gpu(FPTYPE *em,
-                        FPTYPE *em_deriv,
-                        FPTYPE *rij,
-                        int *nlist,
-                        const FPTYPE *coord,
-                        const int *type,
-                        const InputNlist &gpu_inlist,
-                        int *array_int,
-                        unsigned long long *array_longlong,
-                        const int max_nbor_size,
-                        const FPTYPE *avg,
-                        const FPTYPE *std,
-                        const int nloc,
-                        const int nall,
-                        const float rcut,
-                        const float rcut_smth,
-                        const std::vector<int> sec);
-
-void env_mat_nbor_update(InputNlist &inlist,
-                         InputNlist &gpu_inlist,
-                         int &max_nbor_size,
-                         int *&nbor_list_dev,
-                         const int *mesh,
-                         const int size);
-#endif  // TENSORFLOW_USE_ROCM
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 
 }  // namespace deepmd
diff --git a/source/lib/include/prod_env_mat_nvnmd.h b/source/lib/include/prod_env_mat_nvnmd.h
index df70423021..c0a7e32cc4 100644
--- a/source/lib/include/prod_env_mat_nvnmd.h
+++ b/source/lib/include/prod_env_mat_nvnmd.h
@@ -45,12 +45,8 @@ void prod_env_mat_a_nvnmd_quantize_cpu(FPTYPE* em,
                                        const std::vector<int> sec,
                                        const int* f_type = NULL);
 
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 // UNDEFINE
-#endif  // GOOGLE_CUDA
-
-#if TENSORFLOW_USE_ROCM
-// UNDEFINE
-#endif  // TENSORFLOW_USE_ROCM
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 
 }  // namespace deepmd
diff --git a/source/lib/include/prod_force.h b/source/lib/include/prod_force.h
index 03c72ba661..b5ae68bdce 100644
--- a/source/lib/include/prod_force.h
+++ b/source/lib/include/prod_force.h
@@ -67,7 +67,7 @@ void prod_force_r_cpu(FPTYPE* force,
                       const int nnei,
                       const int nframes);
 
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 template <typename FPTYPE>
 void prod_force_a_gpu(FPTYPE* force,
                       const FPTYPE* net_deriv,
@@ -87,28 +87,6 @@ void prod_force_r_gpu(FPTYPE* force,
                       const int nall,
                       const int nnei,
                       const int nframes);
-#endif  // GOOGLE_CUDA
-
-#if TENSORFLOW_USE_ROCM
-template <typename FPTYPE>
-void prod_force_a_gpu(FPTYPE* force,
-                      const FPTYPE* net_deriv,
-                      const FPTYPE* in_deriv,
-                      const int* nlist,
-                      const int nloc,
-                      const int nall,
-                      const int nnei,
-                      const int nframes);
-
-template <typename FPTYPE>
-void prod_force_r_gpu(FPTYPE* force,
-                      const FPTYPE* net_deriv,
-                      const FPTYPE* in_deriv,
-                      const int* nlist,
-                      const int nloc,
-                      const int nall,
-                      const int nnei,
-                      const int nframes);
-#endif  // TENSORFLOW_USE_ROCM
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 
 }  // namespace deepmd
diff --git a/source/lib/include/prod_force_grad.h b/source/lib/include/prod_force_grad.h
index 5d0ab50b68..737d54001d 100644
--- a/source/lib/include/prod_force_grad.h
+++ b/source/lib/include/prod_force_grad.h
@@ -21,7 +21,7 @@ void prod_force_grad_r_cpu(FPTYPE* grad_net,
                            const int nnei,
                            const int nframes);
 
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 template <typename FPTYPE>
 void prod_force_grad_a_gpu(FPTYPE* grad_net,
                            const FPTYPE* grad,
@@ -39,25 +39,6 @@ void prod_force_grad_r_gpu(FPTYPE* grad_net,
                            const int nloc,
                            const int nnei,
                            const int nframes);
-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 
-#if TENSORFLOW_USE_ROCM
-template <typename FPTYPE>
-void prod_force_grad_a_gpu(FPTYPE* grad_net,
-                           const FPTYPE* grad,
-                           const FPTYPE* env_deriv,
-                           const int* nlist,
-                           const int nloc,
-                           const int nnei,
-                           const int nframes);
-
-template <typename FPTYPE>
-void prod_force_grad_r_gpu(FPTYPE* grad_net,
-                           const FPTYPE* grad,
-                           const FPTYPE* env_deriv,
-                           const int* nlist,
-                           const int nloc,
-                           const int nnei,
-                           const int nframes);
-#endif  // TENSORFLOW_USE_ROCM
 }  // namespace deepmd
diff --git a/source/lib/include/prod_virial.h b/source/lib/include/prod_virial.h
index 348188874c..d42b547d32 100644
--- a/source/lib/include/prod_virial.h
+++ b/source/lib/include/prod_virial.h
@@ -25,7 +25,7 @@ void prod_virial_r_cpu(FPTYPE* virial,
                        const int nall,
                        const int nnei);
 
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 template <typename FPTYPE>
 void prod_virial_a_gpu(FPTYPE* virial,
                        FPTYPE* atom_virial,
@@ -47,30 +47,6 @@ void prod_virial_r_gpu(FPTYPE* virial,
                        const int nloc,
                        const int nall,
                        const int nnei);
-#endif  // GOOGLE_CUDA
-
-#if TENSORFLOW_USE_ROCM
-template <typename FPTYPE>
-void prod_virial_a_gpu(FPTYPE* virial,
-                       FPTYPE* atom_virial,
-                       const FPTYPE* net_deriv,
-                       const FPTYPE* env_deriv,
-                       const FPTYPE* rij,
-                       const int* nlist,
-                       const int nloc,
-                       const int nall,
-                       const int nnei);
-
-template <typename FPTYPE>
-void prod_virial_r_gpu(FPTYPE* virial,
-                       FPTYPE* atom_virial,
-                       const FPTYPE* net_deriv,
-                       const FPTYPE* env_deriv,
-                       const FPTYPE* rij,
-                       const int* nlist,
-                       const int nloc,
-                       const int nall,
-                       const int nnei);
-#endif  // TENSORFLOW_USE_ROCM
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 
 }  // namespace deepmd
diff --git a/source/lib/include/prod_virial_grad.h b/source/lib/include/prod_virial_grad.h
index 6e0c232f8a..eda98f9634 100644
--- a/source/lib/include/prod_virial_grad.h
+++ b/source/lib/include/prod_virial_grad.h
@@ -21,7 +21,7 @@ void prod_virial_grad_r_cpu(FPTYPE* grad_net,
                             const int nloc,
                             const int nnei);
 
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 template <typename FPTYPE>
 void prod_virial_grad_a_gpu(FPTYPE* grad_net,
                             const FPTYPE* grad,
@@ -39,26 +39,6 @@ void prod_virial_grad_r_gpu(FPTYPE* grad_net,
                             const int* nlist,
                             const int nloc,
                             const int nnei);
-#endif  // GOOGLE_CUDA
-
-#if TENSORFLOW_USE_ROCM
-template <typename FPTYPE>
-void prod_virial_grad_a_gpu(FPTYPE* grad_net,
-                            const FPTYPE* grad,
-                            const FPTYPE* env_deriv,
-                            const FPTYPE* rij,
-                            const int* nlist,
-                            const int nloc,
-                            const int nnei);
-
-template <typename FPTYPE>
-void prod_virial_grad_r_gpu(FPTYPE* grad_net,
-                            const FPTYPE* grad,
-                            const FPTYPE* env_deriv,
-                            const FPTYPE* rij,
-                            const int* nlist,
-                            const int nloc,
-                            const int nnei);
-#endif  // TENSORFLOW_USE_ROCM
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 
 }  // namespace deepmd
diff --git a/source/lib/include/region.h b/source/lib/include/region.h
index 9db2735462..2f6dbbf4e0 100644
--- a/source/lib/include/region.h
+++ b/source/lib/include/region.h
@@ -27,7 +27,7 @@ void convert_to_phys_cpu(FPTYPE* rp,
                          const Region<FPTYPE>& region,
                          const FPTYPE* ri);
 
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 // only for unittest
 template <typename FPTYPE>
 void convert_to_inter_gpu(FPTYPE* ri,
@@ -41,21 +41,6 @@ void convert_to_phys_gpu(FPTYPE* rp,
 
 template <typename FPTYPE>
 void volume_gpu(FPTYPE* volume, const Region<FPTYPE>& region);
-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 
-#if TENSORFLOW_USE_ROCM
-// only for unittest
-template <typename FPTYPE>
-void convert_to_inter_gpu(FPTYPE* ri,
-                          const Region<FPTYPE>& region,
-                          const FPTYPE* rp);
-
-template <typename FPTYPE>
-void convert_to_phys_gpu(FPTYPE* rp,
-                         const Region<FPTYPE>& region,
-                         const FPTYPE* ri);
-
-template <typename FPTYPE>
-void volume_gpu(FPTYPE* volume, const Region<FPTYPE>& region);
-#endif  // TENSORFLOW_USE_ROCM
 }  // namespace deepmd
diff --git a/source/lib/include/tabulate.h b/source/lib/include/tabulate.h
index 96072e6a33..76a46bbe6c 100644
--- a/source/lib/include/tabulate.h
+++ b/source/lib/include/tabulate.h
@@ -108,7 +108,7 @@ void tabulate_fusion_se_r_grad_grad_cpu(FPTYPE* dz_dy,
                                         const int nnei,
                                         const int last_layer_size);
 
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 template <typename FPTYPE>
 void tabulate_fusion_se_a_gpu(FPTYPE* out,
                               const FPTYPE* table,
@@ -213,113 +213,5 @@ void tabulate_fusion_se_r_grad_grad_gpu(FPTYPE* dz_dy,
                                         const int nloc,
                                         const int nnei,
                                         const int last_layer_size);
-#endif  // GOOGLE_CUDA
-
-#if TENSORFLOW_USE_ROCM
-template <typename FPTYPE>
-void tabulate_fusion_se_a_gpu(FPTYPE* out,
-                              const FPTYPE* table,
-                              const FPTYPE* table_info,
-                              const FPTYPE* em_x,
-                              const FPTYPE* em,
-                              const FPTYPE* two_embed,
-                              const int nloc,
-                              const int nnei,
-                              const int last_layer_size,
-                              const bool is_sorted = true);
-
-template <typename FPTYPE>
-void tabulate_fusion_se_a_grad_gpu(FPTYPE* dy_dem_x,
-                                   FPTYPE* dy_dem,
-                                   const FPTYPE* table,
-                                   const FPTYPE* table_info,
-                                   const FPTYPE* em_x,
-                                   const FPTYPE* em,
-                                   const FPTYPE* two_embed,
-                                   const FPTYPE* dy,
-                                   const int nloc,
-                                   const int nnei,
-                                   const int last_layer_size,
-                                   const bool is_sorted = true);
-
-template <typename FPTYPE>
-void tabulate_fusion_se_a_grad_grad_gpu(FPTYPE* dz_dy,
-                                        const FPTYPE* table,
-                                        const FPTYPE* table_info,
-                                        const FPTYPE* em_x,
-                                        const FPTYPE* em,
-                                        const FPTYPE* dz_dy_dem_x,
-                                        const FPTYPE* dz_dy_dem,
-                                        const int nloc,
-                                        const int nnei,
-                                        const int last_layer_size,
-                                        const bool is_sorted = true);
-
-template <typename FPTYPE>
-void tabulate_fusion_se_t_gpu(FPTYPE* out,
-                              const FPTYPE* table,
-                              const FPTYPE* table_info,
-                              const FPTYPE* em_x,
-                              const FPTYPE* em,
-                              const int nloc,
-                              const int nnei_i,
-                              const int nnei_j,
-                              const int last_layer_size);
-
-template <typename FPTYPE>
-void tabulate_fusion_se_t_grad_gpu(FPTYPE* dy_dem_x,
-                                   FPTYPE* dy_dem,
-                                   const FPTYPE* table,
-                                   const FPTYPE* table_info,
-                                   const FPTYPE* em_x,
-                                   const FPTYPE* em,
-                                   const FPTYPE* dy,
-                                   const int nloc,
-                                   const int nnei_i,
-                                   const int nnei_j,
-                                   const int last_layer_size);
-
-template <typename FPTYPE>
-void tabulate_fusion_se_t_grad_grad_gpu(FPTYPE* dz_dy,
-                                        const FPTYPE* table,
-                                        const FPTYPE* table_info,
-                                        const FPTYPE* em_x,
-                                        const FPTYPE* em,
-                                        const FPTYPE* dz_dy_dem_x,
-                                        const FPTYPE* dz_dy_dem,
-                                        const int nloc,
-                                        const int nnei_i,
-                                        const int nnei_j,
-                                        const int last_layer_size);
-
-template <typename FPTYPE>
-void tabulate_fusion_se_r_gpu(FPTYPE* out,
-                              const FPTYPE* table,
-                              const FPTYPE* table_info,
-                              const FPTYPE* em,
-                              const int nloc,
-                              const int nnei,
-                              const int last_layer_size);
-
-template <typename FPTYPE>
-void tabulate_fusion_se_r_grad_gpu(FPTYPE* dy_dem,
-                                   const FPTYPE* table,
-                                   const FPTYPE* table_info,
-                                   const FPTYPE* em,
-                                   const FPTYPE* dy,
-                                   const int nloc,
-                                   const int nnei,
-                                   const int last_layer_size);
-
-template <typename FPTYPE>
-void tabulate_fusion_se_r_grad_grad_gpu(FPTYPE* dz_dy,
-                                        const FPTYPE* table,
-                                        const FPTYPE* table_info,
-                                        const FPTYPE* em,
-                                        const FPTYPE* dz_dy_dem,
-                                        const int nloc,
-                                        const int nnei,
-                                        const int last_layer_size);
-
-#endif  // TENSORFLOW_USE_ROCM
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 }  // namespace deepmd

From fa2c0b67c80a932bf98e9bc9c38a0c7e8aade710 Mon Sep 17 00:00:00 2001
From: Jinzhe Zeng <jinzhe.zeng@rutgers.edu>
Date: Wed, 20 Sep 2023 03:51:05 -0400
Subject: [PATCH 46/63] merge CUDA and ROCm codes in tests (#2846)

Signed-off-by: Jinzhe Zeng <jinzhe.zeng@rutgers.edu>
---
 source/lib/tests/test_coord.cc              | 375 +-------------------
 source/lib/tests/test_env_mat_a.cc          | 198 +----------
 source/lib/tests/test_env_mat_a_mix.cc      | 222 +-----------
 source/lib/tests/test_env_mat_r.cc          | 189 +---------
 source/lib/tests/test_fmt_nlist.cc          | 163 +--------
 source/lib/tests/test_gelu.cc               |  71 +---
 source/lib/tests/test_neighbor_list.cc      | 100 +-----
 source/lib/tests/test_prod_force_a.cc       |  34 +-
 source/lib/tests/test_prod_force_grad_a.cc  |  34 +-
 source/lib/tests/test_prod_force_grad_r.cc  |  34 +-
 source/lib/tests/test_prod_force_r.cc       |  34 +-
 source/lib/tests/test_prod_virial_a.cc      |  53 +--
 source/lib/tests/test_prod_virial_grad_a.cc |  38 +-
 source/lib/tests/test_prod_virial_grad_r.cc |  38 +-
 source/lib/tests/test_prod_virial_r.cc      |  53 +--
 source/lib/tests/test_simulation_region.cc  |  77 +---
 source/lib/tests/test_tabulate_se_a.cc      | 103 +-----
 source/lib/tests/test_tabulate_se_r.cc      |  53 +--
 source/lib/tests/test_tabulate_se_t.cc      |  66 +---
 19 files changed, 41 insertions(+), 1894 deletions(-)

diff --git a/source/lib/tests/test_coord.cc b/source/lib/tests/test_coord.cc
index 581301b6a7..af320ca3f7 100644
--- a/source/lib/tests/test_coord.cc
+++ b/source/lib/tests/test_coord.cc
@@ -59,7 +59,7 @@ TEST_F(TestNormCoord, cpu_case2) {
   }
 }
 
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 TEST_F(TestNormCoord, gpu_case0) {
   deepmd::Region<double> region;
   deepmd::Region<double> region_dev;
@@ -144,94 +144,7 @@ TEST_F(TestNormCoord, gpu_case2) {
   }
 }
 
-#endif  // GOOGLE_CUDA
-
-#if TENSORFLOW_USE_ROCM
-TEST_F(TestNormCoord, gpu_case0) {
-  deepmd::Region<double> region;
-  deepmd::Region<double> region_dev;
-  double* new_boxt = region_dev.boxt;
-  double* new_rec_boxt = region_dev.rec_boxt;
-  init_region_cpu(region, &boxt[0]);
-  std::vector<double> box_info;
-  box_info.resize(18);
-  memcpy(&box_info[0], &boxt[0], sizeof(double) * 9);
-  memcpy(&box_info[9], region.rec_boxt, sizeof(double) * 9);
-  double* box_info_dev = NULL;
-  double* out_c_dev = NULL;
-  std::vector<double> out_c(r0);
-  deepmd::malloc_device_memory_sync(box_info_dev, box_info);
-  deepmd::malloc_device_memory_sync(out_c_dev, out_c);
-  region_dev.boxt = box_info_dev;
-  region_dev.rec_boxt = box_info_dev + 9;
-  deepmd::normalize_coord_gpu(out_c_dev, natoms, region_dev);
-  region_dev.boxt = new_boxt;
-  region_dev.rec_boxt = new_rec_boxt;
-  deepmd::memcpy_device_to_host(out_c_dev, out_c);
-  deepmd::delete_device_memory(box_info_dev);
-  deepmd::delete_device_memory(out_c_dev);
-  for (int ii = 0; ii < posi.size(); ++ii) {
-    EXPECT_LT(fabs(out_c[ii] - posi[ii]), 1e-12);
-  }
-}
-
-TEST_F(TestNormCoord, gpu_case1) {
-  deepmd::Region<double> region;
-  deepmd::Region<double> region_dev;
-  double* new_boxt = region_dev.boxt;
-  double* new_rec_boxt = region_dev.rec_boxt;
-  init_region_cpu(region, &boxt[0]);
-  std::vector<double> box_info;
-  box_info.resize(18);
-  memcpy(&box_info[0], &boxt[0], sizeof(double) * 9);
-  memcpy(&box_info[9], region.rec_boxt, sizeof(double) * 9);
-  double* box_info_dev = NULL;
-  double* out_c_dev = NULL;
-  std::vector<double> out_c(r1);
-  deepmd::malloc_device_memory_sync(box_info_dev, box_info);
-  deepmd::malloc_device_memory_sync(out_c_dev, out_c);
-  region_dev.boxt = box_info_dev;
-  region_dev.rec_boxt = box_info_dev + 9;
-  deepmd::normalize_coord_gpu(out_c_dev, natoms, region_dev);
-  region_dev.boxt = new_boxt;
-  region_dev.rec_boxt = new_rec_boxt;
-  deepmd::memcpy_device_to_host(out_c_dev, out_c);
-  deepmd::delete_device_memory(box_info_dev);
-  deepmd::delete_device_memory(out_c_dev);
-  for (int ii = 0; ii < posi.size(); ++ii) {
-    EXPECT_LT(fabs(out_c[ii] - posi[ii]), 1e-12);
-  }
-}
-
-TEST_F(TestNormCoord, gpu_case2) {
-  deepmd::Region<double> region;
-  deepmd::Region<double> region_dev;
-  double* new_boxt = region_dev.boxt;
-  double* new_rec_boxt = region_dev.rec_boxt;
-  init_region_cpu(region, &boxt[0]);
-  std::vector<double> box_info;
-  box_info.resize(18);
-  memcpy(&box_info[0], &boxt[0], sizeof(double) * 9);
-  memcpy(&box_info[9], region.rec_boxt, sizeof(double) * 9);
-  double* box_info_dev = NULL;
-  double* out_c_dev = NULL;
-  std::vector<double> out_c(r2);
-  deepmd::malloc_device_memory_sync(box_info_dev, box_info);
-  deepmd::malloc_device_memory_sync(out_c_dev, out_c);
-  region_dev.boxt = box_info_dev;
-  region_dev.rec_boxt = box_info_dev + 9;
-  deepmd::normalize_coord_gpu(out_c_dev, natoms, region_dev);
-  region_dev.boxt = new_boxt;
-  region_dev.rec_boxt = new_rec_boxt;
-  deepmd::memcpy_device_to_host(out_c_dev, out_c);
-  deepmd::delete_device_memory(box_info_dev);
-  deepmd::delete_device_memory(out_c_dev);
-  for (int ii = 0; ii < posi.size(); ++ii) {
-    EXPECT_LT(fabs(out_c[ii] - posi[ii]), 1e-12);
-  }
-}
-
-#endif  // TENSORFLOW_USE_ROCM
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 
 typedef std::pair<std::vector<double>, std::vector<int>> atom;
 
@@ -375,7 +288,7 @@ TEST_F(TestCopyCoord, cpu_lessmem) {
   // 	    << nall << std::endl;
 }
 
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 TEST_F(TestCopyCoord, gpu) {
   int mem_size = 1000;
   std::vector<double> out_c(mem_size * 3);
@@ -514,145 +427,6 @@ TEST_F(TestCopyCoord, gpu_lessmem) {
 }
 #endif  // GOOGLE_CUDA
 
-#if TENSORFLOW_USE_ROCM
-TEST_F(TestCopyCoord, gpu) {
-  int mem_size = 1000;
-  std::vector<double> out_c(mem_size * 3);
-  std::vector<int> out_t(mem_size);
-  std::vector<int> mapping(mem_size);
-  int nall;
-  std::vector<int> cell_info;
-  cell_info.resize(23);
-  deepmd::Region<double> region;
-  deepmd::Region<double> region_dev;
-  double* new_boxt = region_dev.boxt;
-  double* new_rec_boxt = region_dev.rec_boxt;
-  init_region_cpu(region, &boxt[0]);
-  deepmd::compute_cell_info(&cell_info[0], rc, region);
-  std::vector<double> box_info;
-  box_info.resize(18);
-  memcpy(&box_info[0], &boxt[0], sizeof(double) * 9);
-  memcpy(&box_info[9], region.rec_boxt, sizeof(double) * 9);
-  const int loc_cellnum = cell_info[21];
-  const int total_cellnum = cell_info[22];
-  int* cell_info_dev = NULL;
-  double* box_info_dev = NULL;
-  double *out_c_dev = NULL, *in_c_dev = NULL;
-  int *out_t_dev = NULL, *in_t_dev = NULL, *mapping_dev = NULL,
-      *int_data_dev = NULL;
-  deepmd::malloc_device_memory_sync(cell_info_dev, cell_info);
-  deepmd::malloc_device_memory_sync(box_info_dev, box_info);
-  deepmd::malloc_device_memory_sync(in_c_dev, posi);
-  deepmd::malloc_device_memory_sync(in_t_dev, atype);
-  deepmd::malloc_device_memory(out_c_dev, mem_size * 3);
-  deepmd::malloc_device_memory(out_t_dev, mem_size);
-  deepmd::malloc_device_memory(mapping_dev, mem_size);
-  deepmd::malloc_device_memory(
-      int_data_dev, nloc * 3 + loc_cellnum + total_cellnum * 3 +
-                        total_cellnum * 3 + loc_cellnum + 1 + total_cellnum +
-                        1 + nloc);
-  region_dev.boxt = box_info_dev;
-  region_dev.rec_boxt = box_info_dev + 9;
-  int ret = deepmd::copy_coord_gpu(out_c_dev, out_t_dev, mapping_dev, &nall,
-                                   int_data_dev, in_c_dev, in_t_dev, nloc,
-                                   mem_size, loc_cellnum, total_cellnum,
-                                   cell_info_dev, region_dev);
-  region_dev.boxt = new_boxt;
-  region_dev.rec_boxt = new_rec_boxt;
-  deepmd::memcpy_device_to_host(out_c_dev, out_c);
-  deepmd::memcpy_device_to_host(out_t_dev, out_t);
-  deepmd::memcpy_device_to_host(mapping_dev, mapping);
-  deepmd::delete_device_memory(cell_info_dev);
-  deepmd::delete_device_memory(box_info_dev);
-  deepmd::delete_device_memory(in_c_dev);
-  deepmd::delete_device_memory(in_t_dev);
-  deepmd::delete_device_memory(out_c_dev);
-  deepmd::delete_device_memory(out_t_dev);
-  deepmd::delete_device_memory(mapping_dev);
-  deepmd::delete_device_memory(int_data_dev);
-  EXPECT_EQ(ret, 0);
-  EXPECT_EQ(nall, expected_nall);
-  out_c.resize(nall * 3);
-  out_t.resize(nall);
-  mapping.resize(nall);
-
-  std::vector<double> out_c_1(mem_size * 3);
-  std::vector<int> out_t_1(mem_size);
-  std::vector<int> mapping_1(mem_size);
-  sort_atoms(out_c_1, out_t_1, mapping_1, out_c, out_t, mapping, nloc, nall);
-  for (int ii = 0; ii < expected_nall; ++ii) {
-    for (int dd = 0; dd < 3; ++dd) {
-      EXPECT_LT(fabs(out_c_1[ii * 3 + dd] - expected_posi_cpy[ii * 3 + dd]),
-                1e-12);
-    }
-    EXPECT_EQ(out_t_1[ii], expected_atype_cpy[ii]);
-    EXPECT_EQ(mapping_1[ii], expected_mapping[ii]);
-  }
-}
-
-TEST_F(TestCopyCoord, gpu_lessmem) {
-  int mem_size = 40;
-  std::vector<double> out_c(mem_size * 3);
-  std::vector<int> out_t(mem_size);
-  std::vector<int> mapping(mem_size);
-  int nall;
-  std::vector<int> cell_info;
-  cell_info.resize(23);
-  deepmd::Region<double> region;
-  deepmd::Region<double> region_dev;
-  double* new_boxt = region_dev.boxt;
-  double* new_rec_boxt = region_dev.rec_boxt;
-  init_region_cpu(region, &boxt[0]);
-  deepmd::compute_cell_info(&cell_info[0], rc, region);
-  std::vector<double> box_info;
-  box_info.resize(18);
-  memcpy(&box_info[0], &boxt[0], sizeof(double) * 9);
-  memcpy(&box_info[9], region.rec_boxt, sizeof(double) * 9);
-  const int loc_cellnum = cell_info[21];
-  const int total_cellnum = cell_info[22];
-  int* cell_info_dev = NULL;
-  double* box_info_dev = NULL;
-  double *out_c_dev = NULL, *in_c_dev = NULL;
-  int *out_t_dev = NULL, *in_t_dev = NULL, *mapping_dev = NULL,
-      *int_data_dev = NULL;
-  deepmd::malloc_device_memory_sync(cell_info_dev, cell_info);
-  deepmd::malloc_device_memory_sync(box_info_dev, box_info);
-  deepmd::malloc_device_memory_sync(in_c_dev, posi);
-  deepmd::malloc_device_memory_sync(in_t_dev, atype);
-  deepmd::malloc_device_memory(out_c_dev, mem_size * 3);
-  deepmd::malloc_device_memory(out_t_dev, mem_size);
-  deepmd::malloc_device_memory(mapping_dev, mem_size);
-  deepmd::malloc_device_memory(
-      int_data_dev, nloc * 3 + loc_cellnum + total_cellnum * 3 +
-                        total_cellnum * 3 + loc_cellnum + 1 + total_cellnum +
-                        1 + nloc);
-  region_dev.boxt = box_info_dev;
-  region_dev.rec_boxt = box_info_dev + 9;
-  int ret = deepmd::copy_coord_gpu(out_c_dev, out_t_dev, mapping_dev, &nall,
-                                   int_data_dev, in_c_dev, in_t_dev, nloc,
-                                   mem_size, loc_cellnum, total_cellnum,
-                                   cell_info_dev, region_dev);
-  region_dev.boxt = new_boxt;
-  region_dev.rec_boxt = new_rec_boxt;
-  deepmd::memcpy_device_to_host(out_c_dev, out_c);
-  deepmd::memcpy_device_to_host(out_t_dev, out_t);
-  deepmd::memcpy_device_to_host(mapping_dev, mapping);
-  deepmd::delete_device_memory(cell_info_dev);
-  deepmd::delete_device_memory(box_info_dev);
-  deepmd::delete_device_memory(in_c_dev);
-  deepmd::delete_device_memory(in_t_dev);
-  deepmd::delete_device_memory(out_c_dev);
-  deepmd::delete_device_memory(out_t_dev);
-  deepmd::delete_device_memory(mapping_dev);
-  deepmd::delete_device_memory(int_data_dev);
-  EXPECT_EQ(ret, 1);
-  // EXPECT_EQ(nall, expected_nall);
-  // std::cout << "---------------------"
-  // 	    << nloc << " "
-  // 	    << nall << std::endl;
-}
-#endif  // TENSORFLOW_USE_ROCM
-
 class TestCopyCoordMoreCell : public ::testing::Test {
  protected:
   std::vector<double> posi = {0.041, 0.072, 0.100, 4.053, 0.041, 0.068,
@@ -760,146 +534,7 @@ TEST_F(TestCopyCoordMoreCell, cpu_lessmem) {
   // 	    << nall << std::endl;
 }
 
-#if GOOGLE_CUDA
-TEST_F(TestCopyCoordMoreCell, gpu) {
-  int mem_size = 1000;
-  std::vector<double> out_c(mem_size * 3);
-  std::vector<int> out_t(mem_size);
-  std::vector<int> mapping(mem_size);
-  int nall;
-  std::vector<int> cell_info;
-  cell_info.resize(23);
-  deepmd::Region<double> region;
-  deepmd::Region<double> region_dev;
-  double* new_boxt = region_dev.boxt;
-  double* new_rec_boxt = region_dev.rec_boxt;
-  init_region_cpu(region, &boxt[0]);
-  deepmd::compute_cell_info(&cell_info[0], rc, region);
-  std::vector<double> box_info;
-  box_info.resize(18);
-  memcpy(&box_info[0], &boxt[0], sizeof(double) * 9);
-  memcpy(&box_info[9], region.rec_boxt, sizeof(double) * 9);
-  const int loc_cellnum = cell_info[21];
-  const int total_cellnum = cell_info[22];
-  int* cell_info_dev = NULL;
-  double* box_info_dev = NULL;
-  double *out_c_dev = NULL, *in_c_dev = NULL;
-  int *out_t_dev = NULL, *in_t_dev = NULL, *mapping_dev = NULL,
-      *int_data_dev = NULL;
-  deepmd::malloc_device_memory_sync(cell_info_dev, cell_info);
-  deepmd::malloc_device_memory_sync(box_info_dev, box_info);
-  deepmd::malloc_device_memory_sync(in_c_dev, posi);
-  deepmd::malloc_device_memory_sync(in_t_dev, atype);
-  deepmd::malloc_device_memory(out_c_dev, mem_size * 3);
-  deepmd::malloc_device_memory(out_t_dev, mem_size);
-  deepmd::malloc_device_memory(mapping_dev, mem_size);
-  deepmd::malloc_device_memory(
-      int_data_dev, nloc * 3 + loc_cellnum + total_cellnum * 3 +
-                        total_cellnum * 3 + loc_cellnum + 1 + total_cellnum +
-                        1 + nloc);
-  region_dev.boxt = box_info_dev;
-  region_dev.rec_boxt = box_info_dev + 9;
-  int ret = deepmd::copy_coord_gpu(out_c_dev, out_t_dev, mapping_dev, &nall,
-                                   int_data_dev, in_c_dev, in_t_dev, nloc,
-                                   mem_size, loc_cellnum, total_cellnum,
-                                   cell_info_dev, region_dev);
-  region_dev.boxt = new_boxt;
-  region_dev.rec_boxt = new_rec_boxt;
-  deepmd::memcpy_device_to_host(out_c_dev, out_c);
-  deepmd::memcpy_device_to_host(out_t_dev, out_t);
-  deepmd::memcpy_device_to_host(mapping_dev, mapping);
-  deepmd::delete_device_memory(cell_info_dev);
-  deepmd::delete_device_memory(box_info_dev);
-  deepmd::delete_device_memory(in_c_dev);
-  deepmd::delete_device_memory(in_t_dev);
-  deepmd::delete_device_memory(out_c_dev);
-  deepmd::delete_device_memory(out_t_dev);
-  deepmd::delete_device_memory(mapping_dev);
-  deepmd::delete_device_memory(int_data_dev);
-  EXPECT_EQ(ret, 0);
-  EXPECT_EQ(nall, expected_nall);
-  out_c.resize(nall * 3);
-  out_t.resize(nall);
-  mapping.resize(nall);
-
-  std::vector<double> out_c_1(mem_size * 3);
-  std::vector<int> out_t_1(mem_size);
-  std::vector<int> mapping_1(mem_size);
-  sort_atoms(out_c_1, out_t_1, mapping_1, out_c, out_t, mapping, nloc, nall);
-  for (int ii = 0; ii < expected_nall; ++ii) {
-    for (int dd = 0; dd < 3; ++dd) {
-      EXPECT_LT(fabs(out_c_1[ii * 3 + dd] - expected_posi_cpy[ii * 3 + dd]),
-                1e-12);
-    }
-    EXPECT_EQ(out_t_1[ii], expected_atype_cpy[ii]);
-    EXPECT_EQ(mapping_1[ii], expected_mapping[ii]);
-  }
-}
-
-TEST_F(TestCopyCoordMoreCell, gpu_lessmem) {
-  int mem_size = 40;
-  std::vector<double> out_c(mem_size * 3);
-  std::vector<int> out_t(mem_size);
-  std::vector<int> mapping(mem_size);
-  int nall;
-  std::vector<int> cell_info;
-  cell_info.resize(23);
-  deepmd::Region<double> region;
-  deepmd::Region<double> region_dev;
-  double* new_boxt = region_dev.boxt;
-  double* new_rec_boxt = region_dev.rec_boxt;
-  init_region_cpu(region, &boxt[0]);
-  deepmd::compute_cell_info(&cell_info[0], rc, region);
-  std::vector<double> box_info;
-  box_info.resize(18);
-  memcpy(&box_info[0], &boxt[0], sizeof(double) * 9);
-  memcpy(&box_info[9], region.rec_boxt, sizeof(double) * 9);
-  const int loc_cellnum = cell_info[21];
-  const int total_cellnum = cell_info[22];
-  int* cell_info_dev = NULL;
-  double* box_info_dev = NULL;
-  double *out_c_dev = NULL, *in_c_dev = NULL;
-  int *out_t_dev = NULL, *in_t_dev = NULL, *mapping_dev = NULL,
-      *int_data_dev = NULL;
-  deepmd::malloc_device_memory_sync(cell_info_dev, cell_info);
-  deepmd::malloc_device_memory_sync(box_info_dev, box_info);
-  deepmd::malloc_device_memory_sync(in_c_dev, posi);
-  deepmd::malloc_device_memory_sync(in_t_dev, atype);
-  deepmd::malloc_device_memory(out_c_dev, mem_size * 3);
-  deepmd::malloc_device_memory(out_t_dev, mem_size);
-  deepmd::malloc_device_memory(mapping_dev, mem_size);
-  deepmd::malloc_device_memory(
-      int_data_dev, nloc * 3 + loc_cellnum + total_cellnum * 3 +
-                        total_cellnum * 3 + loc_cellnum + 1 + total_cellnum +
-                        1 + nloc);
-  region_dev.boxt = box_info_dev;
-  region_dev.rec_boxt = box_info_dev + 9;
-  int ret = deepmd::copy_coord_gpu(out_c_dev, out_t_dev, mapping_dev, &nall,
-                                   int_data_dev, in_c_dev, in_t_dev, nloc,
-                                   mem_size, loc_cellnum, total_cellnum,
-                                   cell_info_dev, region_dev);
-  region_dev.boxt = new_boxt;
-  region_dev.rec_boxt = new_rec_boxt;
-  deepmd::memcpy_device_to_host(out_c_dev, out_c);
-  deepmd::memcpy_device_to_host(out_t_dev, out_t);
-  deepmd::memcpy_device_to_host(mapping_dev, mapping);
-  deepmd::delete_device_memory(cell_info_dev);
-  deepmd::delete_device_memory(box_info_dev);
-  deepmd::delete_device_memory(in_c_dev);
-  deepmd::delete_device_memory(in_t_dev);
-  deepmd::delete_device_memory(out_c_dev);
-  deepmd::delete_device_memory(out_t_dev);
-  deepmd::delete_device_memory(mapping_dev);
-  deepmd::delete_device_memory(int_data_dev);
-  EXPECT_EQ(ret, 1);
-  // EXPECT_EQ(nall, expected_nall);
-  // std::cout << "---------------------"
-  // 	    << nloc << " "
-  // 	    << nall << std::endl;
-}
-#endif  // GOOGLE_CUDA
-
-#if TENSORFLOW_USE_ROCM
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 TEST_F(TestCopyCoordMoreCell, gpu) {
   int mem_size = 1000;
   std::vector<double> out_c(mem_size * 3);
@@ -1036,4 +671,4 @@ TEST_F(TestCopyCoordMoreCell, gpu_lessmem) {
   // 	    << nloc << " "
   // 	    << nall << std::endl;
 }
-#endif  // TENSORFLOW_USE_ROCM
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
diff --git a/source/lib/tests/test_env_mat_a.cc b/source/lib/tests/test_env_mat_a.cc
index 639f99414d..89756c9fc5 100644
--- a/source/lib/tests/test_env_mat_a.cc
+++ b/source/lib/tests/test_env_mat_a.cc
@@ -590,7 +590,7 @@ TEST_F(TestEnvMatA, prod_cpu_equal_cpu) {
   // }
 }
 
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 TEST_F(TestEnvMatA, prod_gpu) {
   EXPECT_EQ(nlist_r_cpy.size(), nloc);
   int tot_nnei = 0;
@@ -782,198 +782,4 @@ TEST_F(TestEnvMatA, prod_gpu_equal_cpu) {
     }
   }
 }
-#endif  // GOOGLE_CUDA
-
-#if TENSORFLOW_USE_ROCM
-TEST_F(TestEnvMatA, prod_gpu) {
-  EXPECT_EQ(nlist_r_cpy.size(), nloc);
-  int tot_nnei = 0;
-  int max_nbor_size = 0;
-  for (int ii = 0; ii < nlist_a_cpy.size(); ++ii) {
-    tot_nnei += nlist_a_cpy[ii].size();
-    if (nlist_a_cpy[ii].size() > max_nbor_size) {
-      max_nbor_size = nlist_a_cpy[ii].size();
-    }
-  }
-  assert(max_nbor_size <= GPU_MAX_NBOR_SIZE);
-  if (max_nbor_size <= 1024) {
-    max_nbor_size = 1024;
-  } else if (max_nbor_size <= 2048) {
-    max_nbor_size = 2048;
-  } else {
-    max_nbor_size = 4096;
-  }
-  std::vector<int> ilist(nloc), numneigh(nloc);
-  std::vector<int *> firstneigh(nloc);
-  deepmd::InputNlist inlist(nloc, &ilist[0], &numneigh[0], &firstneigh[0]),
-      gpu_inlist;
-  convert_nlist(inlist, nlist_a_cpy);
-  std::vector<double> em(nloc * ndescrpt, 0.0),
-      em_deriv(nloc * ndescrpt * 3, 0.0), rij(nloc * nnei * 3, 0.0);
-  std::vector<int> nlist(nloc * nnei, 0);
-  std::vector<double> avg(ntypes * ndescrpt, 0);
-  std::vector<double> std(ntypes * ndescrpt, 1);
-
-  double *em_dev = NULL, *em_deriv_dev = NULL, *rij_dev = NULL;
-  double *posi_cpy_dev = NULL, *avg_dev = NULL, *std_dev = NULL;
-  int *atype_cpy_dev = NULL, *nlist_dev = NULL, *array_int_dev = NULL,
-      *memory_dev = NULL;
-  uint_64 *array_longlong_dev = NULL;
-  deepmd::malloc_device_memory_sync(em_dev, em);
-  deepmd::malloc_device_memory_sync(em_deriv_dev, em_deriv);
-  deepmd::malloc_device_memory_sync(rij_dev, rij);
-  deepmd::malloc_device_memory_sync(posi_cpy_dev, posi_cpy);
-  deepmd::malloc_device_memory_sync(avg_dev, avg);
-  deepmd::malloc_device_memory_sync(std_dev, std);
-  deepmd::malloc_device_memory_sync(atype_cpy_dev, atype_cpy);
-  deepmd::malloc_device_memory_sync(nlist_dev, nlist);
-  deepmd::malloc_device_memory(array_int_dev,
-                               sec_a.size() + nloc * sec_a.size() + nloc);
-  deepmd::malloc_device_memory(array_longlong_dev,
-                               nloc * GPU_MAX_NBOR_SIZE * 2);
-  deepmd::malloc_device_memory(memory_dev, nloc * max_nbor_size);
-  deepmd::convert_nlist_gpu_device(gpu_inlist, inlist, memory_dev,
-                                   max_nbor_size);
-
-  deepmd::prod_env_mat_a_gpu(em_dev, em_deriv_dev, rij_dev, nlist_dev,
-                             posi_cpy_dev, atype_cpy_dev, gpu_inlist,
-                             array_int_dev, array_longlong_dev, max_nbor_size,
-                             avg_dev, std_dev, nloc, nall, rc, rc_smth, sec_a);
-  deepmd::memcpy_device_to_host(em_dev, em);
-  deepmd::delete_device_memory(em_dev);
-  deepmd::delete_device_memory(em_deriv_dev);
-  deepmd::delete_device_memory(nlist_dev);
-  deepmd::delete_device_memory(posi_cpy_dev);
-  deepmd::delete_device_memory(atype_cpy_dev);
-  deepmd::delete_device_memory(array_int_dev);
-  deepmd::delete_device_memory(array_longlong_dev);
-  deepmd::delete_device_memory(avg_dev);
-  deepmd::delete_device_memory(std_dev);
-  deepmd::delete_device_memory(memory_dev);
-  deepmd::free_nlist_gpu_device(gpu_inlist);
-
-  for (int ii = 0; ii < nloc; ++ii) {
-    for (int jj = 0; jj < nnei; ++jj) {
-      for (int dd = 0; dd < 4; ++dd) {
-        EXPECT_LT(fabs(em[ii * nnei * 4 + jj * 4 + dd] -
-                       expected_env[ii * nnei * 4 + jj * 4 + dd]),
-                  1e-5);
-      }
-    }
-  }
-}
-
-TEST_F(TestEnvMatA, prod_gpu_equal_cpu) {
-  EXPECT_EQ(nlist_r_cpy.size(), nloc);
-  int tot_nnei = 0;
-  int max_nbor_size = 0;
-  for (int ii = 0; ii < nlist_a_cpy.size(); ++ii) {
-    tot_nnei += nlist_a_cpy[ii].size();
-    if (nlist_a_cpy[ii].size() > max_nbor_size) {
-      max_nbor_size = nlist_a_cpy[ii].size();
-    }
-  }
-  assert(max_nbor_size <= GPU_MAX_NBOR_SIZE);
-  if (max_nbor_size <= 1024) {
-    max_nbor_size = 1024;
-  } else if (max_nbor_size <= 2048) {
-    max_nbor_size = 2048;
-  } else {
-    max_nbor_size = 4096;
-  }
-  std::vector<int> ilist(nloc), numneigh(nloc);
-  std::vector<int *> firstneigh(nloc);
-  deepmd::InputNlist inlist(nloc, &ilist[0], &numneigh[0], &firstneigh[0]),
-      gpu_inlist;
-  convert_nlist(inlist, nlist_a_cpy);
-  std::vector<double> em(nloc * ndescrpt, 0.0),
-      em_deriv(nloc * ndescrpt * 3, 0.0), rij(nloc * nnei * 3, 0.0);
-  std::vector<int> nlist(nloc * nnei, 0);
-  std::vector<double> avg(ntypes * ndescrpt, 0);
-  std::vector<double> std(ntypes * ndescrpt, 1);
-
-  double *em_dev = NULL, *em_deriv_dev = NULL, *rij_dev = NULL;
-  double *posi_cpy_dev = NULL, *avg_dev = NULL, *std_dev = NULL;
-  int *atype_cpy_dev = NULL, *nlist_dev = NULL, *array_int_dev = NULL,
-      *memory_dev = NULL;
-  uint_64 *array_longlong_dev = NULL;
-  deepmd::malloc_device_memory_sync(em_dev, em);
-  deepmd::malloc_device_memory_sync(em_deriv_dev, em_deriv);
-  deepmd::malloc_device_memory_sync(rij_dev, rij);
-  deepmd::malloc_device_memory_sync(posi_cpy_dev, posi_cpy);
-  deepmd::malloc_device_memory_sync(avg_dev, avg);
-  deepmd::malloc_device_memory_sync(std_dev, std);
-
-  deepmd::malloc_device_memory_sync(atype_cpy_dev, atype_cpy);
-  deepmd::malloc_device_memory_sync(nlist_dev, nlist);
-  deepmd::malloc_device_memory(array_int_dev,
-                               sec_a.size() + nloc * sec_a.size() + nloc);
-  deepmd::malloc_device_memory(array_longlong_dev,
-                               nloc * GPU_MAX_NBOR_SIZE * 2);
-  deepmd::malloc_device_memory(memory_dev, nloc * max_nbor_size);
-  deepmd::convert_nlist_gpu_device(gpu_inlist, inlist, memory_dev,
-                                   max_nbor_size);
-
-  deepmd::prod_env_mat_a_gpu(em_dev, em_deriv_dev, rij_dev, nlist_dev,
-                             posi_cpy_dev, atype_cpy_dev, gpu_inlist,
-                             array_int_dev, array_longlong_dev, max_nbor_size,
-                             avg_dev, std_dev, nloc, nall, rc, rc_smth, sec_a);
-  deepmd::memcpy_device_to_host(em_dev, em);
-  deepmd::memcpy_device_to_host(em_deriv_dev, em_deriv);
-  deepmd::memcpy_device_to_host(rij_dev, rij);
-  deepmd::memcpy_device_to_host(nlist_dev, nlist);
-  deepmd::delete_device_memory(em_dev);
-  deepmd::delete_device_memory(em_deriv_dev);
-  deepmd::delete_device_memory(nlist_dev);
-  deepmd::delete_device_memory(posi_cpy_dev);
-  deepmd::delete_device_memory(atype_cpy_dev);
-  deepmd::delete_device_memory(array_int_dev);
-  deepmd::delete_device_memory(array_longlong_dev);
-  deepmd::delete_device_memory(avg_dev);
-  deepmd::delete_device_memory(std_dev);
-  deepmd::delete_device_memory(memory_dev);
-  deepmd::free_nlist_gpu_device(gpu_inlist);
-
-  std::vector<int> fmt_nlist_a_1, fmt_nlist_r_1;
-  std::vector<double> env_1, env_deriv_1, rij_a_1;
-  for (int ii = 0; ii < nloc; ++ii) {
-    int ret_1 = format_nlist_i_cpu<double>(fmt_nlist_a_1, posi_cpy, atype_cpy,
-                                           ii, nlist_a_cpy[ii], rc, sec_a);
-    EXPECT_EQ(ret_1, -1);
-    deepmd::env_mat_a_cpu<double>(env_1, env_deriv_1, rij_a_1, posi_cpy,
-                                  atype_cpy, ii, fmt_nlist_a_1, sec_a, rc_smth,
-                                  rc);
-    EXPECT_EQ(env_1.size(), nnei * 4);
-    EXPECT_EQ(env_deriv_1.size(), nnei * 4 * 3);
-    EXPECT_EQ(rij_a_1.size(), nnei * 3);
-    EXPECT_EQ(fmt_nlist_a_1.size(), nnei);
-    EXPECT_EQ(env_1.size() * nloc, em.size());
-    EXPECT_EQ(env_deriv_1.size() * nloc, em_deriv.size());
-    EXPECT_EQ(rij_a_1.size() * nloc, rij.size());
-    EXPECT_EQ(fmt_nlist_a_1.size() * nloc, nlist.size());
-    for (unsigned jj = 0; jj < env_1.size(); ++jj) {
-      EXPECT_LT(fabs(em[ii * nnei * 4 + jj] - env_1[jj]), 1e-10);
-    }
-    for (unsigned jj = 0; jj < env_deriv_1.size(); ++jj) {
-      EXPECT_LT(fabs(em_deriv[ii * nnei * 4 * 3 + jj] - env_deriv_1[jj]),
-                1e-10);
-    }
-    for (unsigned jj = 0; jj < rij_a_1.size(); ++jj) {
-      EXPECT_LT(fabs(rij[ii * nnei * 3 + jj] - rij_a_1[jj]), 1e-10);
-    }
-    for (unsigned jj = 0; jj < fmt_nlist_a_1.size(); ++jj) {
-      EXPECT_EQ(nlist[ii * nnei + jj], fmt_nlist_a_1[jj]);
-    }
-  }
-
-  for (int ii = 0; ii < nloc; ++ii) {
-    for (int jj = 0; jj < nnei; ++jj) {
-      for (int dd = 0; dd < 4; ++dd) {
-        EXPECT_LT(fabs(em[ii * nnei * 4 + jj * 4 + dd] -
-                       expected_env[ii * nnei * 4 + jj * 4 + dd]),
-                  1e-5);
-      }
-    }
-  }
-}
-#endif  // TENSORFLOW_USE_ROCM
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
diff --git a/source/lib/tests/test_env_mat_a_mix.cc b/source/lib/tests/test_env_mat_a_mix.cc
index f415317929..909088d1e3 100644
--- a/source/lib/tests/test_env_mat_a_mix.cc
+++ b/source/lib/tests/test_env_mat_a_mix.cc
@@ -628,7 +628,7 @@ TEST_F(TestEnvMatAMix, prod_cpu_equal_cpu) {
   // }
 }
 
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 TEST_F(TestEnvMatAMix, prod_gpu) {
   EXPECT_EQ(nlist_r_cpy.size(), nloc);
   int tot_nnei = 0;
@@ -844,222 +844,4 @@ TEST_F(TestEnvMatAMix, prod_gpu_equal_cpu) {
     }
   }
 }
-#endif  // GOOGLE_CUDA
-
-#if TENSORFLOW_USE_ROCM
-TEST_F(TestEnvMatAMix, prod_gpu) {
-  EXPECT_EQ(nlist_r_cpy.size(), nloc);
-  int tot_nnei = 0;
-  int max_nbor_size = 0;
-  for (int ii = 0; ii < nlist_a_cpy.size(); ++ii) {
-    tot_nnei += nlist_a_cpy[ii].size();
-    if (nlist_a_cpy[ii].size() > max_nbor_size) {
-      max_nbor_size = nlist_a_cpy[ii].size();
-    }
-  }
-  assert(max_nbor_size <= GPU_MAX_NBOR_SIZE);
-  if (max_nbor_size <= 1024) {
-    max_nbor_size = 1024;
-  } else if (max_nbor_size <= 2048) {
-    max_nbor_size = 2048;
-  } else {
-    max_nbor_size = 4096;
-  }
-  std::vector<int> ilist(nloc), numneigh(nloc);
-  std::vector<int *> firstneigh(nloc);
-  deepmd::InputNlist inlist(nloc, &ilist[0], &numneigh[0], &firstneigh[0]),
-      gpu_inlist;
-  convert_nlist(inlist, nlist_a_cpy);
-  std::vector<double> em(nloc * ndescrpt, 0.0),
-      em_deriv(nloc * ndescrpt * 3, 0.0), rij(nloc * nnei * 3, 0.0);
-  std::vector<int> nlist(nloc * nnei, 0);
-  std::vector<int> ntype(nloc * nnei, 0);
-  bool *nmask = new bool[nloc * nnei];
-  memset(nmask, 0, sizeof(bool) * nloc * nnei);
-  std::vector<double> avg(ntypes * ndescrpt, 0);
-  std::vector<double> std(ntypes * ndescrpt, 1);
-
-  double *em_dev = NULL, *em_deriv_dev = NULL, *rij_dev = NULL,
-         *nmask_dev = NULL;
-  double *posi_cpy_dev = NULL, *avg_dev = NULL, *std_dev = NULL;
-  int *f_atype_cpy_dev = NULL, *atype_dev = NULL, *nlist_dev = NULL,
-      *ntype_dev = NULL, *mapping_dev = NULL, *array_int_dev = NULL,
-      *memory_dev = NULL;
-  uint_64 *array_longlong_dev = NULL;
-  deepmd::malloc_device_memory_sync(em_dev, em);
-  deepmd::malloc_device_memory_sync(em_deriv_dev, em_deriv);
-  deepmd::malloc_device_memory_sync(rij_dev, rij);
-  deepmd::malloc_device_memory_sync(posi_cpy_dev, posi_cpy);
-  deepmd::malloc_device_memory_sync(avg_dev, avg);
-  deepmd::malloc_device_memory_sync(std_dev, std);
-  deepmd::malloc_device_memory_sync(f_atype_cpy_dev, f_atype_cpy);
-  deepmd::malloc_device_memory_sync(atype_dev, atype);
-  deepmd::malloc_device_memory_sync(nlist_dev, nlist);
-  deepmd::malloc_device_memory_sync(ntype_dev, ntype);
-  deepmd::malloc_device_memory_sync(mapping_dev, mapping);
-  deepmd::malloc_device_memory_sync(nmask_dev, nmask, nloc * nnei);
-  deepmd::malloc_device_memory(array_int_dev,
-                               sec_a.size() + nloc * sec_a.size() + nloc);
-  deepmd::malloc_device_memory(array_longlong_dev,
-                               nloc * GPU_MAX_NBOR_SIZE * 2);
-  deepmd::malloc_device_memory(memory_dev, nloc * max_nbor_size);
-  deepmd::convert_nlist_gpu_device(gpu_inlist, inlist, memory_dev,
-                                   max_nbor_size);
-
-  deepmd::prod_env_mat_a_gpu(
-      em_dev, em_deriv_dev, rij_dev, nlist_dev, posi_cpy_dev, atype_dev,
-      gpu_inlist, array_int_dev, array_longlong_dev, max_nbor_size, avg_dev,
-      std_dev, nloc, nall, rc, rc_smth, sec_a, f_atype_cpy_dev);
-
-  deepmd::use_nei_info_gpu(nlist_dev, ntype_dev, nmask_dev, atype_dev,
-                           mapping_dev, nloc, nnei, ntypes, true);
-  deepmd::memcpy_device_to_host(em_dev, em);
-  deepmd::memcpy_device_to_host(ntype_dev, ntype);
-  deepmd::memcpy_device_to_host(nmask_dev, nmask, nloc * nnei);
-  deepmd::delete_device_memory(em_dev);
-  deepmd::delete_device_memory(em_deriv_dev);
-  deepmd::delete_device_memory(rij_dev);
-  deepmd::delete_device_memory(nlist_dev);
-  deepmd::delete_device_memory(ntype_dev);
-  deepmd::delete_device_memory(nmask_dev);
-  deepmd::delete_device_memory(posi_cpy_dev);
-  deepmd::delete_device_memory(f_atype_cpy_dev);
-  deepmd::delete_device_memory(atype_dev);
-  deepmd::delete_device_memory(mapping_dev);
-  deepmd::delete_device_memory(array_int_dev);
-  deepmd::delete_device_memory(array_longlong_dev);
-  deepmd::delete_device_memory(avg_dev);
-  deepmd::delete_device_memory(std_dev);
-  deepmd::delete_device_memory(memory_dev);
-  deepmd::free_nlist_gpu_device(gpu_inlist);
-
-  for (int ii = 0; ii < nloc; ++ii) {
-    for (int jj = 0; jj < nnei; ++jj) {
-      for (int dd = 0; dd < 4; ++dd) {
-        EXPECT_LT(fabs(em[ii * nnei * 4 + jj * 4 + dd] -
-                       expected_env[ii * nnei * 4 + jj * 4 + dd]),
-                  1e-5);
-      }
-      EXPECT_EQ(ntype[ii * nnei + jj], expected_ntype[ii * nnei + jj]);
-      EXPECT_EQ(nmask[ii * nnei + jj], expected_nmask[ii * nnei + jj]);
-    }
-  }
-  delete[] nmask;
-}
-
-TEST_F(TestEnvMatAMix, prod_gpu_equal_cpu) {
-  EXPECT_EQ(nlist_r_cpy.size(), nloc);
-  int tot_nnei = 0;
-  int max_nbor_size = 0;
-  for (int ii = 0; ii < nlist_a_cpy.size(); ++ii) {
-    tot_nnei += nlist_a_cpy[ii].size();
-    if (nlist_a_cpy[ii].size() > max_nbor_size) {
-      max_nbor_size = nlist_a_cpy[ii].size();
-    }
-  }
-  assert(max_nbor_size <= GPU_MAX_NBOR_SIZE);
-  if (max_nbor_size <= 1024) {
-    max_nbor_size = 1024;
-  } else if (max_nbor_size <= 2048) {
-    max_nbor_size = 2048;
-  } else {
-    max_nbor_size = 4096;
-  }
-  std::vector<int> ilist(nloc), numneigh(nloc);
-  std::vector<int *> firstneigh(nloc);
-  deepmd::InputNlist inlist(nloc, &ilist[0], &numneigh[0], &firstneigh[0]),
-      gpu_inlist;
-  convert_nlist(inlist, nlist_a_cpy);
-  std::vector<double> em(nloc * ndescrpt, 0.0),
-      em_deriv(nloc * ndescrpt * 3, 0.0), rij(nloc * nnei * 3, 0.0);
-  std::vector<int> nlist(nloc * nnei, 0);
-  std::vector<double> avg(ntypes * ndescrpt, 0);
-  std::vector<double> std(ntypes * ndescrpt, 1);
-
-  double *em_dev = NULL, *em_deriv_dev = NULL, *rij_dev = NULL;
-  double *posi_cpy_dev = NULL, *avg_dev = NULL, *std_dev = NULL;
-  int *f_atype_cpy_dev = NULL, *atype_dev = NULL, *nlist_dev = NULL,
-      *array_int_dev = NULL, *memory_dev = NULL;
-  uint_64 *array_longlong_dev = NULL;
-  deepmd::malloc_device_memory_sync(em_dev, em);
-  deepmd::malloc_device_memory_sync(em_deriv_dev, em_deriv);
-  deepmd::malloc_device_memory_sync(rij_dev, rij);
-  deepmd::malloc_device_memory_sync(posi_cpy_dev, posi_cpy);
-  deepmd::malloc_device_memory_sync(avg_dev, avg);
-  deepmd::malloc_device_memory_sync(std_dev, std);
-
-  deepmd::malloc_device_memory_sync(f_atype_cpy_dev, f_atype_cpy);
-  deepmd::malloc_device_memory_sync(atype_dev, atype);
-  deepmd::malloc_device_memory_sync(nlist_dev, nlist);
-  deepmd::malloc_device_memory(array_int_dev,
-                               sec_a.size() + nloc * sec_a.size() + nloc);
-  deepmd::malloc_device_memory(array_longlong_dev,
-                               nloc * GPU_MAX_NBOR_SIZE * 2);
-  deepmd::malloc_device_memory(memory_dev, nloc * max_nbor_size);
-  deepmd::convert_nlist_gpu_device(gpu_inlist, inlist, memory_dev,
-                                   max_nbor_size);
-
-  deepmd::prod_env_mat_a_gpu(
-      em_dev, em_deriv_dev, rij_dev, nlist_dev, posi_cpy_dev, atype_dev,
-      gpu_inlist, array_int_dev, array_longlong_dev, max_nbor_size, avg_dev,
-      std_dev, nloc, nall, rc, rc_smth, sec_a, f_atype_cpy_dev);
-  deepmd::memcpy_device_to_host(em_dev, em);
-  deepmd::memcpy_device_to_host(em_deriv_dev, em_deriv);
-  deepmd::memcpy_device_to_host(rij_dev, rij);
-  deepmd::memcpy_device_to_host(nlist_dev, nlist);
-  deepmd::delete_device_memory(em_dev);
-  deepmd::delete_device_memory(em_deriv_dev);
-  deepmd::delete_device_memory(nlist_dev);
-  deepmd::delete_device_memory(posi_cpy_dev);
-  deepmd::delete_device_memory(f_atype_cpy_dev);
-  deepmd::delete_device_memory(atype_dev);
-  deepmd::delete_device_memory(array_int_dev);
-  deepmd::delete_device_memory(array_longlong_dev);
-  deepmd::delete_device_memory(avg_dev);
-  deepmd::delete_device_memory(std_dev);
-  deepmd::delete_device_memory(memory_dev);
-  deepmd::free_nlist_gpu_device(gpu_inlist);
-
-  std::vector<int> fmt_nlist_a_1, fmt_nlist_r_1;
-  std::vector<double> env_1, env_deriv_1, rij_a_1;
-  for (int ii = 0; ii < nloc; ++ii) {
-    int ret_1 = format_nlist_i_cpu<double>(fmt_nlist_a_1, posi_cpy, f_atype_cpy,
-                                           ii, nlist_a_cpy[ii], rc, sec_a);
-    EXPECT_EQ(ret_1, -1);
-    deepmd::env_mat_a_cpu<double>(env_1, env_deriv_1, rij_a_1, posi_cpy,
-                                  f_atype_cpy, ii, fmt_nlist_a_1, sec_a,
-                                  rc_smth, rc);
-    EXPECT_EQ(env_1.size(), nnei * 4);
-    EXPECT_EQ(env_deriv_1.size(), nnei * 4 * 3);
-    EXPECT_EQ(rij_a_1.size(), nnei * 3);
-    EXPECT_EQ(fmt_nlist_a_1.size(), nnei);
-    EXPECT_EQ(env_1.size() * nloc, em.size());
-    EXPECT_EQ(env_deriv_1.size() * nloc, em_deriv.size());
-    EXPECT_EQ(rij_a_1.size() * nloc, rij.size());
-    EXPECT_EQ(fmt_nlist_a_1.size() * nloc, nlist.size());
-    for (unsigned jj = 0; jj < env_1.size(); ++jj) {
-      EXPECT_LT(fabs(em[ii * nnei * 4 + jj] - env_1[jj]), 1e-10);
-    }
-    for (unsigned jj = 0; jj < env_deriv_1.size(); ++jj) {
-      EXPECT_LT(fabs(em_deriv[ii * nnei * 4 * 3 + jj] - env_deriv_1[jj]),
-                1e-10);
-    }
-    for (unsigned jj = 0; jj < rij_a_1.size(); ++jj) {
-      EXPECT_LT(fabs(rij[ii * nnei * 3 + jj] - rij_a_1[jj]), 1e-10);
-    }
-    for (unsigned jj = 0; jj < fmt_nlist_a_1.size(); ++jj) {
-      EXPECT_EQ(nlist[ii * nnei + jj], fmt_nlist_a_1[jj]);
-    }
-  }
-
-  for (int ii = 0; ii < nloc; ++ii) {
-    for (int jj = 0; jj < nnei; ++jj) {
-      for (int dd = 0; dd < 4; ++dd) {
-        EXPECT_LT(fabs(em[ii * nnei * 4 + jj * 4 + dd] -
-                       expected_env[ii * nnei * 4 + jj * 4 + dd]),
-                  1e-5);
-      }
-    }
-  }
-}
-#endif  // TENSORFLOW_USE_ROCM
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
diff --git a/source/lib/tests/test_env_mat_r.cc b/source/lib/tests/test_env_mat_r.cc
index f20a8cbbc3..3024e651d9 100644
--- a/source/lib/tests/test_env_mat_r.cc
+++ b/source/lib/tests/test_env_mat_r.cc
@@ -358,7 +358,7 @@ TEST_F(TestEnvMatR, prod_cpu_equal_cpu) {
   }
 }
 
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 TEST_F(TestEnvMatR, prod_gpu) {
   EXPECT_EQ(nlist_r_cpy.size(), nloc);
   int tot_nnei = 0;
@@ -541,189 +541,4 @@ TEST_F(TestEnvMatR, prod_gpu_equal_cpu) {
     }
   }
 }
-#endif  // GOOGLE_CUDA
-
-#if TENSORFLOW_USE_ROCM
-TEST_F(TestEnvMatR, prod_gpu) {
-  EXPECT_EQ(nlist_r_cpy.size(), nloc);
-  int tot_nnei = 0;
-  int max_nbor_size = 0;
-  for (int ii = 0; ii < nlist_a_cpy.size(); ++ii) {
-    tot_nnei += nlist_a_cpy[ii].size();
-    if (nlist_a_cpy[ii].size() > max_nbor_size) {
-      max_nbor_size = nlist_a_cpy[ii].size();
-    }
-  }
-  assert(max_nbor_size <= GPU_MAX_NBOR_SIZE);
-  if (max_nbor_size <= 1024) {
-    max_nbor_size = 1024;
-  } else if (max_nbor_size <= 2048) {
-    max_nbor_size = 2048;
-  } else {
-    max_nbor_size = 4096;
-  }
-  std::vector<int> ilist(nloc), numneigh(nloc);
-  std::vector<int *> firstneigh(nloc);
-  deepmd::InputNlist inlist(nloc, &ilist[0], &numneigh[0], &firstneigh[0]),
-      gpu_inlist;
-  convert_nlist(inlist, nlist_a_cpy);
-  std::vector<double> em(nloc * ndescrpt, 0.0),
-      em_deriv(nloc * ndescrpt * 3, 0.0), rij(nloc * nnei * 3, 0.0);
-  std::vector<int> nlist(nloc * nnei, 0);
-  std::vector<double> avg(ntypes * ndescrpt, 0);
-  std::vector<double> std(ntypes * ndescrpt, 1);
-
-  double *em_dev = NULL, *em_deriv_dev = NULL, *rij_dev = NULL;
-  double *posi_cpy_dev = NULL, *avg_dev = NULL, *std_dev = NULL;
-  int *atype_cpy_dev = NULL, *nlist_dev = NULL, *array_int_dev = NULL,
-      *memory_dev = NULL;
-  uint_64 *array_longlong_dev = NULL;
-  deepmd::malloc_device_memory_sync(em_dev, em);
-  deepmd::malloc_device_memory_sync(em_deriv_dev, em_deriv);
-  deepmd::malloc_device_memory_sync(rij_dev, rij);
-  deepmd::malloc_device_memory_sync(posi_cpy_dev, posi_cpy);
-  deepmd::malloc_device_memory_sync(avg_dev, avg);
-  deepmd::malloc_device_memory_sync(std_dev, std);
-
-  deepmd::malloc_device_memory_sync(atype_cpy_dev, atype_cpy);
-  deepmd::malloc_device_memory_sync(nlist_dev, nlist);
-  deepmd::malloc_device_memory(array_int_dev,
-                               sec_a.size() + nloc * sec_a.size() + nloc);
-  deepmd::malloc_device_memory(array_longlong_dev,
-                               nloc * GPU_MAX_NBOR_SIZE * 2);
-  deepmd::malloc_device_memory(memory_dev, nloc * max_nbor_size);
-  deepmd::convert_nlist_gpu_device(gpu_inlist, inlist, memory_dev,
-                                   max_nbor_size);
-
-  deepmd::prod_env_mat_r_gpu(em_dev, em_deriv_dev, rij_dev, nlist_dev,
-                             posi_cpy_dev, atype_cpy_dev, gpu_inlist,
-                             array_int_dev, array_longlong_dev, max_nbor_size,
-                             avg_dev, std_dev, nloc, nall, rc, rc_smth, sec_a);
-  deepmd::memcpy_device_to_host(em_dev, em);
-  deepmd::delete_device_memory(em_dev);
-  deepmd::delete_device_memory(em_deriv_dev);
-  deepmd::delete_device_memory(nlist_dev);
-  deepmd::delete_device_memory(posi_cpy_dev);
-  deepmd::delete_device_memory(atype_cpy_dev);
-  deepmd::delete_device_memory(array_int_dev);
-  deepmd::delete_device_memory(array_longlong_dev);
-  deepmd::delete_device_memory(avg_dev);
-  deepmd::delete_device_memory(std_dev);
-  deepmd::delete_device_memory(memory_dev);
-  deepmd::free_nlist_gpu_device(gpu_inlist);
-
-  for (int ii = 0; ii < nloc; ++ii) {
-    for (int jj = 0; jj < nnei; ++jj) {
-      for (int dd = 0; dd < 1; ++dd) {
-        EXPECT_LT(fabs(em[ii * nnei * 1 + jj * 1 + dd] -
-                       expected_env[ii * nnei * 1 + jj * 1 + dd]),
-                  1e-5);
-      }
-    }
-  }
-}
-
-TEST_F(TestEnvMatR, prod_gpu_equal_cpu) {
-  EXPECT_EQ(nlist_r_cpy.size(), nloc);
-  int tot_nnei = 0;
-  int max_nbor_size = 0;
-  for (int ii = 0; ii < nlist_a_cpy.size(); ++ii) {
-    tot_nnei += nlist_a_cpy[ii].size();
-    if (nlist_a_cpy[ii].size() > max_nbor_size) {
-      max_nbor_size = nlist_a_cpy[ii].size();
-    }
-  }
-  assert(max_nbor_size <= GPU_MAX_NBOR_SIZE);
-  if (max_nbor_size <= 1024) {
-    max_nbor_size = 1024;
-  } else if (max_nbor_size <= 2048) {
-    max_nbor_size = 2048;
-  } else {
-    max_nbor_size = 4096;
-  }
-  std::vector<int> ilist(nloc), numneigh(nloc);
-  std::vector<int *> firstneigh(nloc);
-  deepmd::InputNlist inlist(nloc, &ilist[0], &numneigh[0], &firstneigh[0]),
-      gpu_inlist;
-  convert_nlist(inlist, nlist_a_cpy);
-  std::vector<double> em(nloc * ndescrpt, 0.0),
-      em_deriv(nloc * ndescrpt * 3, 0.0), rij(nloc * nnei * 3, 0.0);
-  std::vector<int> nlist(nloc * nnei, 0);
-  std::vector<double> avg(ntypes * ndescrpt, 0);
-  std::vector<double> std(ntypes * ndescrpt, 1);
-
-  double *em_dev = NULL, *em_deriv_dev = NULL, *rij_dev = NULL;
-  double *posi_cpy_dev = NULL, *avg_dev = NULL, *std_dev = NULL;
-  int *atype_cpy_dev = NULL, *nlist_dev = NULL, *array_int_dev = NULL,
-      *memory_dev = NULL;
-  uint_64 *array_longlong_dev = NULL;
-  deepmd::malloc_device_memory_sync(em_dev, em);
-  deepmd::malloc_device_memory_sync(em_deriv_dev, em_deriv);
-  deepmd::malloc_device_memory_sync(rij_dev, rij);
-  deepmd::malloc_device_memory_sync(posi_cpy_dev, posi_cpy);
-  deepmd::malloc_device_memory_sync(avg_dev, avg);
-  deepmd::malloc_device_memory_sync(std_dev, std);
-
-  deepmd::malloc_device_memory_sync(atype_cpy_dev, atype_cpy);
-  deepmd::malloc_device_memory_sync(nlist_dev, nlist);
-  deepmd::malloc_device_memory(array_int_dev,
-                               sec_a.size() + nloc * sec_a.size() + nloc);
-  deepmd::malloc_device_memory(array_longlong_dev,
-                               nloc * GPU_MAX_NBOR_SIZE * 2);
-  deepmd::malloc_device_memory(memory_dev, nloc * max_nbor_size);
-  deepmd::convert_nlist_gpu_device(gpu_inlist, inlist, memory_dev,
-                                   max_nbor_size);
-
-  deepmd::prod_env_mat_r_gpu(em_dev, em_deriv_dev, rij_dev, nlist_dev,
-                             posi_cpy_dev, atype_cpy_dev, gpu_inlist,
-                             array_int_dev, array_longlong_dev, max_nbor_size,
-                             avg_dev, std_dev, nloc, nall, rc, rc_smth, sec_a);
-  deepmd::memcpy_device_to_host(em_dev, em);
-  deepmd::memcpy_device_to_host(em_deriv_dev, em_deriv);
-  deepmd::memcpy_device_to_host(rij_dev, rij);
-  deepmd::memcpy_device_to_host(nlist_dev, nlist);
-  deepmd::delete_device_memory(em_dev);
-  deepmd::delete_device_memory(em_deriv_dev);
-  deepmd::delete_device_memory(nlist_dev);
-  deepmd::delete_device_memory(posi_cpy_dev);
-  deepmd::delete_device_memory(atype_cpy_dev);
-  deepmd::delete_device_memory(array_int_dev);
-  deepmd::delete_device_memory(array_longlong_dev);
-  deepmd::delete_device_memory(avg_dev);
-  deepmd::delete_device_memory(std_dev);
-  deepmd::delete_device_memory(memory_dev);
-  deepmd::free_nlist_gpu_device(gpu_inlist);
-
-  std::vector<int> fmt_nlist_a_1, fmt_nlist_r_1;
-  std::vector<double> env_1, env_deriv_1, rij_a_1;
-  for (int ii = 0; ii < nloc; ++ii) {
-    int ret_1 = format_nlist_i_cpu<double>(fmt_nlist_a_1, posi_cpy, atype_cpy,
-                                           ii, nlist_a_cpy[ii], rc, sec_a);
-    EXPECT_EQ(ret_1, -1);
-    deepmd::env_mat_r_cpu<double>(env_1, env_deriv_1, rij_a_1, posi_cpy,
-                                  atype_cpy, ii, fmt_nlist_a_1, sec_a, rc_smth,
-                                  rc);
-    EXPECT_EQ(env_1.size(), nnei * 1);
-    EXPECT_EQ(env_deriv_1.size(), nnei * 1 * 3);
-    EXPECT_EQ(rij_a_1.size(), nnei * 3);
-    EXPECT_EQ(fmt_nlist_a_1.size(), nnei);
-    EXPECT_EQ(env_1.size() * nloc, em.size());
-    EXPECT_EQ(env_deriv_1.size() * nloc, em_deriv.size());
-    EXPECT_EQ(rij_a_1.size() * nloc, rij.size());
-    EXPECT_EQ(fmt_nlist_a_1.size() * nloc, nlist.size());
-    for (unsigned jj = 0; jj < env_1.size(); ++jj) {
-      EXPECT_LT(fabs(em[ii * nnei * 1 + jj] - env_1[jj]), 1e-10);
-    }
-    for (unsigned jj = 0; jj < env_deriv_1.size(); ++jj) {
-      EXPECT_LT(fabs(em_deriv[ii * nnei * 1 * 3 + jj] - env_deriv_1[jj]),
-                1e-10);
-    }
-    for (unsigned jj = 0; jj < rij_a_1.size(); ++jj) {
-      EXPECT_LT(fabs(rij[ii * nnei * 3 + jj] - rij_a_1[jj]), 1e-10);
-    }
-    for (unsigned jj = 0; jj < fmt_nlist_a_1.size(); ++jj) {
-      EXPECT_EQ(nlist[ii * nnei + jj], fmt_nlist_a_1[jj]);
-    }
-  }
-}
-#endif  // TENSORFLOW_USE_ROCM
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
diff --git a/source/lib/tests/test_fmt_nlist.cc b/source/lib/tests/test_fmt_nlist.cc
index 1d995f8fce..bc79c92ea6 100644
--- a/source/lib/tests/test_fmt_nlist.cc
+++ b/source/lib/tests/test_fmt_nlist.cc
@@ -313,7 +313,7 @@ TEST_F(TestFormatNlistShortSel, cpu) {
   }
 }
 
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 TEST_F(TestFormatNlist, gpu) {
   std::vector<std::vector<int>> nlist_a_0, nlist_r_0;
   build_nlist(nlist_a_0, nlist_r_0, posi_cpy, nloc, rc, rc, nat_stt, ncell,
@@ -501,163 +501,4 @@ TEST_F(TestEncodingDecodingNborInfo, valid_nbor_info_gpu) {
 //   deepmd::delete_device_memory(out_index_dev);
 //   deepmd::delete_device_memory(key_dev);
 // }
-#endif  // GOOGLE_CUDA
-
-#if TENSORFLOW_USE_ROCM
-TEST_F(TestFormatNlist, gpu) {
-  std::vector<std::vector<int>> nlist_a_0, nlist_r_0;
-  build_nlist(nlist_a_0, nlist_r_0, posi_cpy, nloc, rc, rc, nat_stt, ncell,
-              ext_stt, ext_end, region, ncell);
-  // make a input nlist
-  int inum = nlist_a_0.size();
-  std::vector<int> ilist(inum);
-  std::vector<int> numneigh(inum);
-  std::vector<int*> firstneigh(inum);
-  deepmd::InputNlist in_nlist(inum, &ilist[0], &numneigh[0], &firstneigh[0]),
-      gpu_inlist;
-  convert_nlist(in_nlist, nlist_a_0);
-  // allocate the mem for the result
-  std::vector<int> nlist(inum * sec_a.back());
-  EXPECT_EQ(nlist.size(), expect_nlist_cpy.size());
-
-  double* posi_cpy_dev = NULL;
-  int *atype_cpy_dev = NULL, *nlist_dev = NULL, *array_int_dev = NULL,
-      *memory_dev = NULL;
-  uint_64* array_longlong_dev = NULL;
-  for (int ii = 0; ii < inum; ii++) {
-    max_nbor_size =
-        max_nbor_size >= numneigh[ii] ? max_nbor_size : numneigh[ii];
-  }
-  assert(max_nbor_size <= GPU_MAX_NBOR_SIZE);
-  if (max_nbor_size <= 1024) {
-    max_nbor_size = 1024;
-  } else if (max_nbor_size <= 2048) {
-    max_nbor_size = 2048;
-  } else {
-    max_nbor_size = 4096;
-  }
-  deepmd::malloc_device_memory_sync(posi_cpy_dev, posi_cpy);
-  deepmd::malloc_device_memory_sync(atype_cpy_dev, atype_cpy);
-  deepmd::malloc_device_memory_sync(nlist_dev, nlist);
-  deepmd::malloc_device_memory(array_int_dev,
-                               sec_a.size() + nloc * sec_a.size() + nloc);
-  deepmd::malloc_device_memory(array_longlong_dev,
-                               nloc * GPU_MAX_NBOR_SIZE * 2);
-  deepmd::malloc_device_memory(memory_dev, nloc * max_nbor_size);
-  deepmd::convert_nlist_gpu_device(gpu_inlist, in_nlist, memory_dev,
-                                   max_nbor_size);
-  // format nlist
-  format_nbor_list_gpu(nlist_dev, posi_cpy_dev, atype_cpy_dev, gpu_inlist,
-                       array_int_dev, array_longlong_dev, max_nbor_size, nloc,
-                       nall, rc, sec_a);
-  deepmd::memcpy_device_to_host(nlist_dev, nlist);
-  deepmd::delete_device_memory(nlist_dev);
-  deepmd::delete_device_memory(posi_cpy_dev);
-  deepmd::delete_device_memory(atype_cpy_dev);
-  deepmd::delete_device_memory(array_int_dev);
-  deepmd::delete_device_memory(array_longlong_dev);
-  deepmd::delete_device_memory(memory_dev);
-  deepmd::free_nlist_gpu_device(gpu_inlist);
-  // validate
-  for (int ii = 0; ii < nlist.size(); ++ii) {
-    EXPECT_EQ(nlist[ii], expect_nlist_cpy[ii]);
-  }
-}
-
-TEST_F(TestFormatNlistShortSel, gpu) {
-  std::vector<std::vector<int>> nlist_a_0, nlist_r_0;
-  build_nlist(nlist_a_0, nlist_r_0, posi_cpy, nloc, rc, rc, nat_stt, ncell,
-              ext_stt, ext_end, region, ncell);
-  // make a input nlist
-  int inum = nlist_a_0.size();
-  std::vector<int> ilist(inum);
-  std::vector<int> numneigh(inum);
-  std::vector<int*> firstneigh(inum);
-  deepmd::InputNlist in_nlist(inum, &ilist[0], &numneigh[0], &firstneigh[0]),
-      gpu_inlist;
-  convert_nlist(in_nlist, nlist_a_0);
-  // mem
-  std::vector<int> nlist(inum * sec_a.back());
-  EXPECT_EQ(nlist.size(), expect_nlist_cpy.size());
-  // format nlist
-  double* posi_cpy_dev = NULL;
-  int *atype_cpy_dev = NULL, *nlist_dev = NULL, *array_int_dev = NULL,
-      *memory_dev = NULL;
-  uint_64* array_longlong_dev = NULL;
-  for (int ii = 0; ii < inum; ii++) {
-    max_nbor_size =
-        max_nbor_size >= numneigh[ii] ? max_nbor_size : numneigh[ii];
-  }
-  assert(max_nbor_size <= GPU_MAX_NBOR_SIZE);
-  if (max_nbor_size <= 1024) {
-    max_nbor_size = 1024;
-  } else if (max_nbor_size <= 2048) {
-    max_nbor_size = 2048;
-  } else {
-    max_nbor_size = 4096;
-  }
-  deepmd::malloc_device_memory_sync(posi_cpy_dev, posi_cpy);
-  deepmd::malloc_device_memory_sync(atype_cpy_dev, atype_cpy);
-  deepmd::malloc_device_memory_sync(nlist_dev, nlist);
-  deepmd::malloc_device_memory(array_int_dev,
-                               sec_a.size() + nloc * sec_a.size() + nloc);
-  deepmd::malloc_device_memory(array_longlong_dev,
-                               nloc * GPU_MAX_NBOR_SIZE * 2);
-  deepmd::malloc_device_memory(memory_dev, nloc * max_nbor_size);
-  deepmd::convert_nlist_gpu_device(gpu_inlist, in_nlist, memory_dev,
-                                   max_nbor_size);
-  // format nlist
-  format_nbor_list_gpu(nlist_dev, posi_cpy_dev, atype_cpy_dev, gpu_inlist,
-                       array_int_dev, array_longlong_dev, max_nbor_size, nloc,
-                       nall, rc, sec_a);
-  deepmd::memcpy_device_to_host(nlist_dev, nlist);
-  deepmd::delete_device_memory(nlist_dev);
-  deepmd::delete_device_memory(posi_cpy_dev);
-  deepmd::delete_device_memory(atype_cpy_dev);
-  deepmd::delete_device_memory(array_int_dev);
-  deepmd::delete_device_memory(array_longlong_dev);
-  deepmd::delete_device_memory(memory_dev);
-  deepmd::free_nlist_gpu_device(gpu_inlist);
-  // validate
-  for (int ii = 0; ii < nlist.size(); ++ii) {
-    EXPECT_EQ(nlist[ii], expect_nlist_cpy[ii]);
-  }
-}
-
-TEST_F(TestEncodingDecodingNborInfo, valid_nbor_info_gpu) {
-  int *valid_type_dev = NULL, *valid_index_dev = NULL, *out_type_dev = NULL,
-      *out_index_dev = NULL;
-  double* valid_dist_dev = NULL;
-  uint_64* key_dev = NULL;
-  std::vector<int> out_type(size_of_array, 0);
-  std::vector<int> out_index(size_of_array, 0);
-  std::vector<uint_64> key(size_of_array, 0);
-  deepmd::malloc_device_memory_sync(valid_type_dev, valid_type);
-  deepmd::malloc_device_memory_sync(valid_dist_dev, valid_dist);
-  deepmd::malloc_device_memory_sync(valid_index_dev, valid_index);
-  deepmd::malloc_device_memory_sync(out_type_dev, out_type);
-  deepmd::malloc_device_memory_sync(out_index_dev, out_index);
-  deepmd::malloc_device_memory_sync(key_dev, key);
-
-  deepmd::test_encoding_decoding_nbor_info_gpu(
-      key_dev, out_type_dev, out_index_dev, valid_type_dev, valid_dist_dev,
-      valid_index_dev, size_of_array);
-
-  deepmd::memcpy_device_to_host(key_dev, key);
-  deepmd::memcpy_device_to_host(out_type_dev, out_type);
-  deepmd::memcpy_device_to_host(out_index_dev, out_index);
-  deepmd::delete_device_memory(valid_type_dev);
-  deepmd::delete_device_memory(valid_dist_dev);
-  deepmd::delete_device_memory(valid_index_dev);
-  deepmd::delete_device_memory(out_type_dev);
-  deepmd::delete_device_memory(out_index_dev);
-  deepmd::delete_device_memory(key_dev);
-  // validate
-  for (int ii = 0; ii < size_of_array; ii++) {
-    EXPECT_EQ(key[ii], expect_key[ii]);
-    EXPECT_EQ(out_type[ii], expect_type[ii]);
-    EXPECT_EQ(out_index[ii], expect_index[ii]);
-  }
-}
-
-#endif  // TENSORFLOW_USE_ROCM
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
diff --git a/source/lib/tests/test_gelu.cc b/source/lib/tests/test_gelu.cc
index e680567b9c..322625f1ac 100644
--- a/source/lib/tests/test_gelu.cc
+++ b/source/lib/tests/test_gelu.cc
@@ -145,7 +145,7 @@ TEST_F(TestGelu, gelu_grad_grad_cpu) {
   }
 }
 
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 TEST_F(TestGelu, gelu_gpu) {
   std::vector<double> gelu(nloc, 0.0);
 
@@ -210,71 +210,4 @@ TEST_F(TestGelu, gelu_grad_grad_gpu) {
     EXPECT_LT(fabs(gelu_grad_grad[jj] - expected_gelu_grad_grad[jj]), 1e-5);
   }
 }
-#endif  // GOOGLE_CUDA
-
-#if TENSORFLOW_USE_ROCM
-TEST_F(TestGelu, gelu_gpu) {
-  std::vector<double> gelu(nloc, 0.0);
-
-  double *gelu_dev = NULL, *xx_dev = NULL;
-  deepmd::malloc_device_memory_sync(gelu_dev, gelu);
-  deepmd::malloc_device_memory_sync(xx_dev, xx);
-  deepmd::gelu_gpu<double>(gelu_dev, xx_dev, nloc);
-  deepmd::memcpy_device_to_host(gelu_dev, gelu);
-  deepmd::delete_device_memory(gelu_dev);
-  deepmd::delete_device_memory(xx_dev);
-
-  EXPECT_EQ(gelu.size(), nloc);
-  EXPECT_EQ(gelu.size(), expected_gelu.size());
-  for (int jj = 0; jj < gelu.size(); ++jj) {
-    EXPECT_LT(fabs(gelu[jj] - expected_gelu[jj]), 1e-5);
-  }
-}
-
-TEST_F(TestGelu, gelu_grad_gpu) {
-  std::vector<double> dy(100, 1.0);
-  std::vector<double> gelu_grad(nloc, 0.0);
-
-  double *gelu_grad_dev = NULL, *xx_dev = NULL, *dy_dev = NULL;
-  deepmd::malloc_device_memory_sync(gelu_grad_dev, gelu_grad);
-  deepmd::malloc_device_memory_sync(xx_dev, xx);
-  deepmd::malloc_device_memory_sync(dy_dev, dy);
-  deepmd::gelu_grad_gpu<double>(gelu_grad_dev, xx_dev, dy_dev, nloc);
-  deepmd::memcpy_device_to_host(gelu_grad_dev, gelu_grad);
-  deepmd::delete_device_memory(gelu_grad_dev);
-  deepmd::delete_device_memory(xx_dev);
-  deepmd::delete_device_memory(dy_dev);
-
-  EXPECT_EQ(gelu_grad.size(), nloc);
-  EXPECT_EQ(gelu_grad.size(), expected_gelu_grad.size());
-  for (int jj = 0; jj < gelu_grad.size(); ++jj) {
-    EXPECT_LT(fabs(gelu_grad[jj] - expected_gelu_grad[jj]), 1e-5);
-  }
-}
-
-TEST_F(TestGelu, gelu_grad_grad_gpu) {
-  std::vector<double> dy(100, 1.0);
-  std::vector<double> dy_2(100, 1.0);
-  std::vector<double> gelu_grad_grad(nloc, 0.0);
-
-  double *gelu_grad_grad_dev = NULL, *xx_dev = NULL, *dy_dev = NULL,
-         *dy_2_dev = NULL;
-  deepmd::malloc_device_memory_sync(gelu_grad_grad_dev, gelu_grad_grad);
-  deepmd::malloc_device_memory_sync(xx_dev, xx);
-  deepmd::malloc_device_memory_sync(dy_dev, dy);
-  deepmd::malloc_device_memory_sync(dy_2_dev, dy_2);
-  deepmd::gelu_grad_grad_gpu<double>(gelu_grad_grad_dev, xx_dev, dy_dev,
-                                     dy_2_dev, nloc);
-  deepmd::memcpy_device_to_host(gelu_grad_grad_dev, gelu_grad_grad);
-  deepmd::delete_device_memory(gelu_grad_grad_dev);
-  deepmd::delete_device_memory(xx_dev);
-  deepmd::delete_device_memory(dy_dev);
-  deepmd::delete_device_memory(dy_2_dev);
-
-  EXPECT_EQ(gelu_grad_grad.size(), nloc);
-  EXPECT_EQ(gelu_grad_grad.size(), expected_gelu_grad_grad.size());
-  for (int jj = 0; jj < gelu_grad_grad.size(); ++jj) {
-    EXPECT_LT(fabs(gelu_grad_grad[jj] - expected_gelu_grad_grad[jj]), 1e-5);
-  }
-}
-#endif  // TENSORFLOW_USE_ROCM
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
diff --git a/source/lib/tests/test_neighbor_list.cc b/source/lib/tests/test_neighbor_list.cc
index 985f69b3f4..b59fbd4691 100644
--- a/source/lib/tests/test_neighbor_list.cc
+++ b/source/lib/tests/test_neighbor_list.cc
@@ -135,7 +135,7 @@ TEST_F(TestNeighborList, cpu_lessmem) {
   delete[] firstneigh;
 }
 
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 TEST_F(TestNeighborList, gpu) {
   int mem_size = 48;
 
@@ -229,100 +229,4 @@ TEST_F(TestNeighborList, gpu_lessmem) {
   deepmd::delete_device_memory(c_cpy_dev);
 }
 
-#endif  // GOOGLE_CUDA
-
-#if TENSORFLOW_USE_ROCM
-TEST_F(TestNeighborList, gpu) {
-  int mem_size = 48;
-
-  int *nlist_data_dev = NULL, *jlist_dev = NULL, *ilist_dev = NULL,
-      *numneigh_dev = NULL;
-  int** firstneigh_dev = NULL;
-  std::vector<int*> temp_firstneigh(nloc);
-  double* c_cpy_dev = NULL;
-
-  deepmd::malloc_device_memory(nlist_data_dev, 2 * nloc * mem_size);
-  deepmd::malloc_device_memory(jlist_dev, nloc * mem_size);
-  deepmd::malloc_device_memory(ilist_dev, nloc);
-  deepmd::malloc_device_memory(numneigh_dev, nloc);
-  for (int ii = 0; ii < nloc; ++ii) {
-    temp_firstneigh[ii] = jlist_dev + ii * mem_size;
-  }
-  deepmd::malloc_device_memory_sync(firstneigh_dev, temp_firstneigh);
-  deepmd::malloc_device_memory_sync(c_cpy_dev, posi_cpy);
-  deepmd::InputNlist nlist_dev(nloc, ilist_dev, numneigh_dev, firstneigh_dev);
-
-  int max_list_size;
-  int ret = deepmd::build_nlist_gpu(nlist_dev, &max_list_size, nlist_data_dev,
-                                    c_cpy_dev, nloc, nall, mem_size, rc);
-
-  EXPECT_EQ(ret, 0);
-  int* ilist = new int[nloc];
-  int* numneigh = new int[nloc];
-  int** firstneigh = new int*[nloc];
-  int* jlist = new int[nloc * mem_size];
-  deepmd::memcpy_device_to_host(jlist_dev, jlist, nloc * mem_size);
-  deepmd::memcpy_device_to_host(ilist_dev, ilist, nloc);
-  deepmd::memcpy_device_to_host(numneigh_dev, numneigh, nloc);
-  for (int ii = 0; ii < nloc; ++ii) {
-    firstneigh[ii] = jlist + ii * mem_size;
-  }
-
-  deepmd::InputNlist nlist(nlist_dev.inum, ilist, numneigh, firstneigh);
-  EXPECT_EQ(nlist.inum, nloc);
-  EXPECT_EQ(max_list_size, 5);
-  for (int ii = 0; ii < nloc; ++ii) {
-    EXPECT_EQ(nlist.ilist[ii], ii);
-    EXPECT_EQ(nlist.numneigh[ii], expect_nlist_cpy[ii].size());
-    std::sort(nlist.firstneigh[ii], nlist.firstneigh[ii] + nlist.numneigh[ii]);
-    for (int jj = 0; jj < nlist.numneigh[ii]; ++jj) {
-      EXPECT_EQ(nlist.firstneigh[ii][jj], expect_nlist_cpy[ii][jj]);
-    }
-  }
-
-  delete[] ilist;
-  delete[] numneigh;
-  delete[] jlist;
-  delete[] firstneigh;
-  deepmd::delete_device_memory(nlist_data_dev);
-  deepmd::delete_device_memory(jlist_dev);
-  deepmd::delete_device_memory(ilist_dev);
-  deepmd::delete_device_memory(numneigh_dev);
-  deepmd::delete_device_memory(firstneigh_dev);
-  deepmd::delete_device_memory(c_cpy_dev);
-}
-
-TEST_F(TestNeighborList, gpu_lessmem) {
-  int mem_size = 47;
-
-  int *nlist_data_dev = NULL, *jlist_dev = NULL, *ilist_dev = NULL,
-      *numneigh_dev = NULL;
-  int** firstneigh_dev = NULL;
-  std::vector<int*> temp_firstneigh(nloc);
-  double* c_cpy_dev = NULL;
-
-  deepmd::malloc_device_memory(nlist_data_dev, 2 * nloc * mem_size);
-  deepmd::malloc_device_memory(jlist_dev, nloc * mem_size);
-  deepmd::malloc_device_memory(ilist_dev, nloc);
-  deepmd::malloc_device_memory(numneigh_dev, nloc);
-  for (int ii = 0; ii < nloc; ++ii) {
-    temp_firstneigh[ii] = jlist_dev + ii * mem_size;
-  }
-  deepmd::malloc_device_memory_sync(firstneigh_dev, temp_firstneigh);
-  deepmd::malloc_device_memory_sync(c_cpy_dev, posi_cpy);
-  deepmd::InputNlist nlist_dev(nloc, ilist_dev, numneigh_dev, firstneigh_dev);
-
-  int max_list_size;
-  int ret = deepmd::build_nlist_gpu(nlist_dev, &max_list_size, nlist_data_dev,
-                                    c_cpy_dev, nloc, nall, mem_size, rc);
-
-  EXPECT_EQ(ret, 1);
-  deepmd::delete_device_memory(nlist_data_dev);
-  deepmd::delete_device_memory(jlist_dev);
-  deepmd::delete_device_memory(ilist_dev);
-  deepmd::delete_device_memory(numneigh_dev);
-  deepmd::delete_device_memory(firstneigh_dev);
-  deepmd::delete_device_memory(c_cpy_dev);
-}
-
-#endif  // TENSORFLOW_USE_ROCM
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
diff --git a/source/lib/tests/test_prod_force_a.cc b/source/lib/tests/test_prod_force_a.cc
index b51c97e421..2031f086b4 100644
--- a/source/lib/tests/test_prod_force_a.cc
+++ b/source/lib/tests/test_prod_force_a.cc
@@ -133,7 +133,7 @@ TEST_F(TestProdForceA, cpu) {
   // printf("\n");
 }
 
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 TEST_F(TestProdForceA, gpu) {
   std::vector<double> force(nframes * nall * 3, 0.0);
   int n_a_sel = nnei;
@@ -161,34 +161,4 @@ TEST_F(TestProdForceA, gpu) {
     EXPECT_LT(fabs(force[jj] - expected_force[jj]), 1e-5);
   }
 }
-#endif  // GOOGLE_CUDA
-
-#if TENSORFLOW_USE_ROCM
-TEST_F(TestProdForceA, gpu) {
-  std::vector<double> force(nframes * nall * 3, 0.0);
-  int n_a_sel = nnei;
-
-  int* nlist_dev = NULL;
-  double *force_dev = NULL, *net_deriv_dev = NULL, *env_deriv_dev = NULL;
-
-  deepmd::malloc_device_memory_sync(nlist_dev, nlist);
-  deepmd::malloc_device_memory_sync(force_dev, force);
-  deepmd::malloc_device_memory_sync(net_deriv_dev, net_deriv);
-  deepmd::malloc_device_memory_sync(env_deriv_dev, env_deriv);
-
-  deepmd::prod_force_a_gpu<double>(force_dev, net_deriv_dev, env_deriv_dev,
-                                   nlist_dev, nloc, nall, nnei, nframes);
-
-  deepmd::memcpy_device_to_host(force_dev, force);
-  deepmd::delete_device_memory(nlist_dev);
-  deepmd::delete_device_memory(force_dev);
-  deepmd::delete_device_memory(net_deriv_dev);
-  deepmd::delete_device_memory(env_deriv_dev);
-
-  EXPECT_EQ(force.size(), nframes * nall * 3);
-  EXPECT_EQ(force.size(), expected_force.size());
-  for (int jj = 0; jj < force.size(); ++jj) {
-    EXPECT_LT(fabs(force[jj] - expected_force[jj]), 1e-5);
-  }
-}
-#endif  // TENSORFLOW_USE_ROCM
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
diff --git a/source/lib/tests/test_prod_force_grad_a.cc b/source/lib/tests/test_prod_force_grad_a.cc
index 4694c4ac3b..abb04eaa01 100644
--- a/source/lib/tests/test_prod_force_grad_a.cc
+++ b/source/lib/tests/test_prod_force_grad_a.cc
@@ -143,7 +143,7 @@ TEST_F(TestProdForceGradA, cpu) {
   // printf("\n");
 }
 
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 TEST_F(TestProdForceGradA, gpu) {
   std::vector<double> grad_net(nframes * nloc * ndescrpt);
   int* nlist_dev = NULL;
@@ -171,34 +171,4 @@ TEST_F(TestProdForceGradA, gpu) {
   // }
   // printf("\n");
 }
-#endif  // GOOGLE_CUDA
-
-#if TENSORFLOW_USE_ROCM
-TEST_F(TestProdForceGradA, gpu) {
-  std::vector<double> grad_net(nframes * nloc * ndescrpt);
-  int* nlist_dev = NULL;
-  double *grad_net_dev = NULL, *grad_dev = NULL, *env_deriv_dev = NULL;
-
-  deepmd::malloc_device_memory_sync(nlist_dev, nlist);
-  deepmd::malloc_device_memory_sync(grad_dev, grad);
-  deepmd::malloc_device_memory_sync(env_deriv_dev, env_deriv);
-  deepmd::malloc_device_memory(grad_net_dev, nframes * nloc * ndescrpt);
-  deepmd::prod_force_grad_a_gpu<double>(grad_net_dev, grad_dev, env_deriv_dev,
-                                        nlist_dev, nloc, nnei, nframes);
-  deepmd::memcpy_device_to_host(grad_net_dev, grad_net);
-  deepmd::delete_device_memory(nlist_dev);
-  deepmd::delete_device_memory(grad_dev);
-  deepmd::delete_device_memory(env_deriv_dev);
-  deepmd::delete_device_memory(grad_net_dev);
-
-  EXPECT_EQ(grad_net.size(), nframes * nloc * ndescrpt);
-  EXPECT_EQ(grad_net.size(), expected_grad_net.size());
-  for (int jj = 0; jj < grad_net.size(); ++jj) {
-    EXPECT_LT(fabs(grad_net[jj] - expected_grad_net[jj]), 1e-5);
-  }
-  // for (int jj = 0; jj < nloc * ndescrpt; ++jj){
-  //   printf("%8.5f, ", grad_net[jj]);
-  // }
-  // printf("\n");
-}
-#endif  // TENSORFLOW_USE_ROCM
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
diff --git a/source/lib/tests/test_prod_force_grad_r.cc b/source/lib/tests/test_prod_force_grad_r.cc
index 31f8b64982..c8a27077c3 100644
--- a/source/lib/tests/test_prod_force_grad_r.cc
+++ b/source/lib/tests/test_prod_force_grad_r.cc
@@ -117,7 +117,7 @@ TEST_F(TestProdForceGradR, cpu) {
   // printf("\n");
 }
 
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 TEST_F(TestProdForceGradR, gpu) {
   std::vector<double> grad_net(nframes * nloc * ndescrpt);
   int* nlist_dev = NULL;
@@ -145,34 +145,4 @@ TEST_F(TestProdForceGradR, gpu) {
   // }
   // printf("\n");
 }
-#endif  // GOOGLE_CUDA
-
-#if TENSORFLOW_USE_ROCM
-TEST_F(TestProdForceGradR, gpu) {
-  std::vector<double> grad_net(nframes * nloc * ndescrpt);
-  int* nlist_dev = NULL;
-  double *grad_net_dev = NULL, *grad_dev = NULL, *env_deriv_dev = NULL;
-
-  deepmd::malloc_device_memory_sync(nlist_dev, nlist);
-  deepmd::malloc_device_memory_sync(grad_dev, grad);
-  deepmd::malloc_device_memory_sync(env_deriv_dev, env_deriv);
-  deepmd::malloc_device_memory(grad_net_dev, nframes * nloc * ndescrpt);
-  deepmd::prod_force_grad_r_gpu<double>(grad_net_dev, grad_dev, env_deriv_dev,
-                                        nlist_dev, nloc, nnei, nframes);
-  deepmd::memcpy_device_to_host(grad_net_dev, grad_net);
-  deepmd::delete_device_memory(nlist_dev);
-  deepmd::delete_device_memory(grad_dev);
-  deepmd::delete_device_memory(env_deriv_dev);
-  deepmd::delete_device_memory(grad_net_dev);
-
-  EXPECT_EQ(grad_net.size(), nframes * nloc * ndescrpt);
-  EXPECT_EQ(grad_net.size(), expected_grad_net.size());
-  for (int jj = 0; jj < grad_net.size(); ++jj) {
-    EXPECT_LT(fabs(grad_net[jj] - expected_grad_net[jj]), 1e-5);
-  }
-  // for (int jj = 0; jj < nloc * ndescrpt; ++jj){
-  //   printf("%8.5f, ", grad_net[jj]);
-  // }
-  // printf("\n");
-}
-#endif  // TENSORFLOW_USE_ROCM
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
diff --git a/source/lib/tests/test_prod_force_r.cc b/source/lib/tests/test_prod_force_r.cc
index 7f46aa3244..ff3245742d 100644
--- a/source/lib/tests/test_prod_force_r.cc
+++ b/source/lib/tests/test_prod_force_r.cc
@@ -130,7 +130,7 @@ TEST_F(TestProdForceR, cpu) {
   // printf("\n");
 }
 
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 TEST_F(TestProdForceR, gpu) {
   std::vector<double> force(nframes * nall * 3, 0.0);
   int n_a_sel = nnei;
@@ -158,34 +158,4 @@ TEST_F(TestProdForceR, gpu) {
     EXPECT_LT(fabs(force[jj] - expected_force[jj]), 1e-5);
   }
 }
-#endif  // GOOGLE_CUDA
-
-#if TENSORFLOW_USE_ROCM
-TEST_F(TestProdForceR, gpu) {
-  std::vector<double> force(nframes * nall * 3, 0.0);
-  int n_a_sel = nnei;
-
-  int* nlist_dev = NULL;
-  double *force_dev = NULL, *net_deriv_dev = NULL, *env_deriv_dev = NULL;
-
-  deepmd::malloc_device_memory_sync(nlist_dev, nlist);
-  deepmd::malloc_device_memory_sync(force_dev, force);
-  deepmd::malloc_device_memory_sync(net_deriv_dev, net_deriv);
-  deepmd::malloc_device_memory_sync(env_deriv_dev, env_deriv);
-
-  deepmd::prod_force_r_gpu<double>(force_dev, net_deriv_dev, env_deriv_dev,
-                                   nlist_dev, nloc, nall, nnei, nframes);
-
-  deepmd::memcpy_device_to_host(force_dev, force);
-  deepmd::delete_device_memory(nlist_dev);
-  deepmd::delete_device_memory(force_dev);
-  deepmd::delete_device_memory(net_deriv_dev);
-  deepmd::delete_device_memory(env_deriv_dev);
-
-  EXPECT_EQ(force.size(), nframes * nall * 3);
-  EXPECT_EQ(force.size(), expected_force.size());
-  for (int jj = 0; jj < force.size(); ++jj) {
-    EXPECT_LT(fabs(force[jj] - expected_force[jj]), 1e-5);
-  }
-}
-#endif  // TENSORFLOW_USE_ROCM
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
diff --git a/source/lib/tests/test_prod_virial_a.cc b/source/lib/tests/test_prod_virial_a.cc
index 054a152869..b2f2a11989 100644
--- a/source/lib/tests/test_prod_virial_a.cc
+++ b/source/lib/tests/test_prod_virial_a.cc
@@ -178,7 +178,7 @@ TEST_F(TestProdVirialA, cpu) {
   // printf("\n");
 }
 
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 TEST_F(TestProdVirialA, gpu) {
   std::vector<double> virial(9, 0.0);
   std::vector<double> atom_virial(nall * 9, 0.0);
@@ -225,53 +225,4 @@ TEST_F(TestProdVirialA, gpu) {
     EXPECT_LT(fabs(atom_virial[jj] - expected_atom_virial[jj]), 1e-5);
   }
 }
-#endif  // GOOGLE_CUDA
-
-#if TENSORFLOW_USE_ROCM
-TEST_F(TestProdVirialA, gpu) {
-  std::vector<double> virial(9, 0.0);
-  std::vector<double> atom_virial(nall * 9, 0.0);
-  int n_a_sel = nnei;
-
-  int* nlist_dev = NULL;
-  double *virial_dev = NULL, *atom_virial_dev = NULL, *net_deriv_dev = NULL,
-         *env_deriv_dev = NULL, *rij_dev = NULL;
-
-  deepmd::malloc_device_memory_sync(nlist_dev, nlist);
-  deepmd::malloc_device_memory_sync(virial_dev, virial);
-  deepmd::malloc_device_memory_sync(atom_virial_dev, atom_virial);
-  deepmd::malloc_device_memory_sync(net_deriv_dev, net_deriv);
-  deepmd::malloc_device_memory_sync(env_deriv_dev, env_deriv);
-  deepmd::malloc_device_memory_sync(rij_dev, rij);
-
-  deepmd::prod_virial_a_gpu<double>(virial_dev, atom_virial_dev, net_deriv_dev,
-                                    env_deriv_dev, rij_dev, nlist_dev, nloc,
-                                    nall, nnei);
-
-  deepmd::memcpy_device_to_host(virial_dev, virial);
-  deepmd::memcpy_device_to_host(atom_virial_dev, atom_virial);
-  deepmd::delete_device_memory(nlist_dev);
-  deepmd::delete_device_memory(virial_dev);
-  deepmd::delete_device_memory(atom_virial_dev);
-  deepmd::delete_device_memory(net_deriv_dev);
-  deepmd::delete_device_memory(env_deriv_dev);
-  deepmd::delete_device_memory(rij_dev);
-  // virial are not calculated in gpu currently;
-  // for (int ii = 0; ii < 9; ii++) {
-  //   virial[ii] = 0;
-  // }
-  // for (int ii = 0; ii < nall * 9; ii++) {
-  //   virial[ii % 9] += atom_virial[ii];
-  // }
-  EXPECT_EQ(virial.size(), 9);
-  EXPECT_EQ(virial.size(), expected_virial.size());
-  EXPECT_EQ(atom_virial.size(), nall * 9);
-  EXPECT_EQ(atom_virial.size(), expected_atom_virial.size());
-  for (int jj = 0; jj < virial.size(); ++jj) {
-    EXPECT_LT(fabs(virial[jj] - expected_virial[jj]), 1e-5);
-  }
-  for (int jj = 0; jj < atom_virial.size(); ++jj) {
-    EXPECT_LT(fabs(atom_virial[jj] - expected_atom_virial[jj]), 1e-5);
-  }
-}
-#endif  // TENSORFLOW_USE_ROCM
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
diff --git a/source/lib/tests/test_prod_virial_grad_a.cc b/source/lib/tests/test_prod_virial_grad_a.cc
index 98a08ce5c3..09af51d6ed 100644
--- a/source/lib/tests/test_prod_virial_grad_a.cc
+++ b/source/lib/tests/test_prod_virial_grad_a.cc
@@ -137,7 +137,7 @@ TEST_F(TestProdVirialGradA, cpu) {
   // printf("\n");
 }
 
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 TEST_F(TestProdVirialGradA, gpu) {
   std::vector<double> grad_net(nloc * ndescrpt);
   int n_a_sel = nnei;
@@ -169,38 +169,4 @@ TEST_F(TestProdVirialGradA, gpu) {
   // }
   // printf("\n");
 }
-#endif  // GOOGLE_CUDA
-
-#if TENSORFLOW_USE_ROCM
-TEST_F(TestProdVirialGradA, gpu) {
-  std::vector<double> grad_net(nloc * ndescrpt);
-  int n_a_sel = nnei;
-  int* nlist_dev = NULL;
-  double *grad_net_dev = NULL, *grad_dev = NULL, *env_deriv_dev = NULL,
-         *rij_dev = NULL;
-
-  deepmd::malloc_device_memory_sync(nlist_dev, nlist);
-  deepmd::malloc_device_memory_sync(grad_dev, grad);
-  deepmd::malloc_device_memory_sync(env_deriv_dev, env_deriv);
-  deepmd::malloc_device_memory_sync(rij_dev, rij);
-  deepmd::malloc_device_memory(grad_net_dev, nloc * ndescrpt);
-  deepmd::prod_virial_grad_a_gpu<double>(grad_net_dev, grad_dev, env_deriv_dev,
-                                         rij_dev, nlist_dev, nloc, nnei);
-  deepmd::memcpy_device_to_host(grad_net_dev, grad_net);
-  deepmd::delete_device_memory(nlist_dev);
-  deepmd::delete_device_memory(grad_dev);
-  deepmd::delete_device_memory(env_deriv_dev);
-  deepmd::delete_device_memory(rij_dev);
-  deepmd::delete_device_memory(grad_net_dev);
-
-  EXPECT_EQ(grad_net.size(), nloc * ndescrpt);
-  EXPECT_EQ(grad_net.size(), expected_grad_net.size());
-  for (int jj = 0; jj < grad_net.size(); ++jj) {
-    EXPECT_LT(fabs(grad_net[jj] - expected_grad_net[jj]), 1e-5);
-  }
-  // for (int jj = 0; jj < nloc * ndescrpt; ++jj){
-  //   printf("%8.5f, ", grad_net[jj]);
-  // }
-  // printf("\n");
-}
-#endif  // TENSORFLOW_USE_ROCM
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
diff --git a/source/lib/tests/test_prod_virial_grad_r.cc b/source/lib/tests/test_prod_virial_grad_r.cc
index a0c7dad0db..93a7291176 100644
--- a/source/lib/tests/test_prod_virial_grad_r.cc
+++ b/source/lib/tests/test_prod_virial_grad_r.cc
@@ -111,7 +111,7 @@ TEST_F(TestProdVirialGradR, cpu) {
   // printf("\n");
 }
 
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 TEST_F(TestProdVirialGradR, gpu) {
   std::vector<double> grad_net(nloc * ndescrpt);
   int n_a_sel = nnei;
@@ -143,38 +143,4 @@ TEST_F(TestProdVirialGradR, gpu) {
   // }
   // printf("\n");
 }
-#endif  // GOOGLE_CUDA
-
-#if TENSORFLOW_USE_ROCM
-TEST_F(TestProdVirialGradR, gpu) {
-  std::vector<double> grad_net(nloc * ndescrpt);
-  int n_a_sel = nnei;
-  int* nlist_dev = NULL;
-  double *grad_net_dev = NULL, *grad_dev = NULL, *env_deriv_dev = NULL,
-         *rij_dev = NULL;
-
-  deepmd::malloc_device_memory_sync(nlist_dev, nlist);
-  deepmd::malloc_device_memory_sync(grad_dev, grad);
-  deepmd::malloc_device_memory_sync(env_deriv_dev, env_deriv);
-  deepmd::malloc_device_memory_sync(rij_dev, rij);
-  deepmd::malloc_device_memory(grad_net_dev, nloc * ndescrpt);
-  deepmd::prod_virial_grad_r_gpu<double>(grad_net_dev, grad_dev, env_deriv_dev,
-                                         rij_dev, nlist_dev, nloc, nnei);
-  deepmd::memcpy_device_to_host(grad_net_dev, grad_net);
-  deepmd::delete_device_memory(nlist_dev);
-  deepmd::delete_device_memory(grad_dev);
-  deepmd::delete_device_memory(env_deriv_dev);
-  deepmd::delete_device_memory(rij_dev);
-  deepmd::delete_device_memory(grad_net_dev);
-
-  EXPECT_EQ(grad_net.size(), nloc * ndescrpt);
-  EXPECT_EQ(grad_net.size(), expected_grad_net.size());
-  for (int jj = 0; jj < grad_net.size(); ++jj) {
-    EXPECT_LT(fabs(grad_net[jj] - expected_grad_net[jj]), 1e-5);
-  }
-  // for (int jj = 0; jj < nloc * ndescrpt; ++jj){
-  //   printf("%8.5f, ", grad_net[jj]);
-  // }
-  // printf("\n");
-}
-#endif  // TENSORFLOW_USE_ROCM
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
diff --git a/source/lib/tests/test_prod_virial_r.cc b/source/lib/tests/test_prod_virial_r.cc
index f1077b6dbc..aed4abc512 100644
--- a/source/lib/tests/test_prod_virial_r.cc
+++ b/source/lib/tests/test_prod_virial_r.cc
@@ -178,7 +178,7 @@ TEST_F(TestProdVirialR, cpu) {
   // printf("\n");
 }
 
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 TEST_F(TestProdVirialR, gpu) {
   std::vector<double> virial(9, 0.0);
   std::vector<double> atom_virial(nall * 9, 0.0);
@@ -225,53 +225,4 @@ TEST_F(TestProdVirialR, gpu) {
     EXPECT_LT(fabs(atom_virial[jj] - expected_atom_virial[jj]), 1e-5);
   }
 }
-#endif  // GOOGLE_CUDA
-
-#if TENSORFLOW_USE_ROCM
-TEST_F(TestProdVirialR, gpu) {
-  std::vector<double> virial(9, 0.0);
-  std::vector<double> atom_virial(nall * 9, 0.0);
-  int n_a_sel = nnei;
-
-  int* nlist_dev = NULL;
-  double *virial_dev = NULL, *atom_virial_dev = NULL, *net_deriv_dev = NULL,
-         *env_deriv_dev = NULL, *rij_dev = NULL;
-
-  deepmd::malloc_device_memory_sync(nlist_dev, nlist);
-  deepmd::malloc_device_memory_sync(virial_dev, virial);
-  deepmd::malloc_device_memory_sync(atom_virial_dev, atom_virial);
-  deepmd::malloc_device_memory_sync(net_deriv_dev, net_deriv);
-  deepmd::malloc_device_memory_sync(env_deriv_dev, env_deriv);
-  deepmd::malloc_device_memory_sync(rij_dev, rij);
-
-  deepmd::prod_virial_r_gpu<double>(virial_dev, atom_virial_dev, net_deriv_dev,
-                                    env_deriv_dev, rij_dev, nlist_dev, nloc,
-                                    nall, nnei);
-
-  deepmd::memcpy_device_to_host(virial_dev, virial);
-  deepmd::memcpy_device_to_host(atom_virial_dev, atom_virial);
-  deepmd::delete_device_memory(nlist_dev);
-  deepmd::delete_device_memory(virial_dev);
-  deepmd::delete_device_memory(atom_virial_dev);
-  deepmd::delete_device_memory(net_deriv_dev);
-  deepmd::delete_device_memory(env_deriv_dev);
-  deepmd::delete_device_memory(rij_dev);
-  // virial are not calculated in gpu currently;
-  // for (int ii = 0; ii < 9; ii++) {
-  //   virial[ii] = 0;
-  // }
-  // for (int ii = 0; ii < nall * 9; ii++) {
-  //   virial[ii % 9] += atom_virial[ii];
-  // }
-  EXPECT_EQ(virial.size(), 9);
-  EXPECT_EQ(virial.size(), expected_virial.size());
-  EXPECT_EQ(atom_virial.size(), nall * 9);
-  EXPECT_EQ(atom_virial.size(), expected_atom_virial.size());
-  for (int jj = 0; jj < virial.size(); ++jj) {
-    EXPECT_LT(fabs(virial[jj] - expected_virial[jj]), 1e-5);
-  }
-  for (int jj = 0; jj < atom_virial.size(); ++jj) {
-    EXPECT_LT(fabs(atom_virial[jj] - expected_atom_virial[jj]), 1e-5);
-  }
-}
-#endif  // TENSORFLOW_USE_ROCM
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
diff --git a/source/lib/tests/test_simulation_region.cc b/source/lib/tests/test_simulation_region.cc
index 6f1db46bb0..98da9ec350 100644
--- a/source/lib/tests/test_simulation_region.cc
+++ b/source/lib/tests/test_simulation_region.cc
@@ -73,7 +73,7 @@ TEST_F(TestRegion, cpu) {
     EXPECT_LT(fabs(ri2[ii] - ref_ri[ii]), 1e-10);
   }
 }
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 TEST_F(TestRegion, gpu) {
   // check rec_box
   deepmd::Region<double> region;
@@ -144,7 +144,7 @@ TEST_F(TestRegion, gpu) {
   region_dev.boxt = new_boxt;
   region_dev.rec_boxt = new_rec_boxt;
 }
-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 
 // double square_root (const double xx)
 // {
@@ -156,76 +156,3 @@ TEST_F(TestRegion, gpu) {
 //     EXPECT_EQ (25.4, square_root (645.16));
 //     EXPECT_EQ (50.332, square_root (2533.310224));
 // }
-
-#if TENSORFLOW_USE_ROCM
-TEST_F(TestRegion, gpu) {
-  // check rec_box
-  deepmd::Region<double> region;
-  deepmd::Region<double> region_dev;
-  double* new_boxt = region_dev.boxt;
-  double* new_rec_boxt = region_dev.rec_boxt;
-  double *boxt_dev = NULL, *rec_boxt_dev = NULL;
-  double *ref_rp_dev = NULL, *ref_ri_dev = NULL;
-  init_region_cpu(region, &ref_boxt[0]);
-  for (int ii = 0; ii < 9; ++ii) {
-    EXPECT_LT(fabs(region.rec_boxt[ii] - ref_rec_boxt[ii]), 1e-10);
-  }
-  deepmd::malloc_device_memory_sync(boxt_dev, region.boxt, 9);
-  deepmd::malloc_device_memory_sync(rec_boxt_dev, region.rec_boxt, 9);
-  deepmd::malloc_device_memory_sync(ref_rp_dev, ref_rp);
-  deepmd::malloc_device_memory_sync(ref_ri_dev, ref_ri);
-  region_dev.boxt = boxt_dev;
-  region_dev.rec_boxt = rec_boxt_dev;
-  // check volume
-  double vol[1];
-  double* vol_dev = NULL;
-  deepmd::malloc_device_memory(vol_dev, 1);
-  deepmd::volume_gpu(vol_dev, region_dev);
-  deepmd::memcpy_device_to_host(vol_dev, vol, 1);
-  EXPECT_LT(fabs(vol[0] - expected_vol), 1e-10);
-  // check conversion between phys and inter coords.
-  double ri[3];
-  double* ri_dev = NULL;
-  deepmd::malloc_device_memory(ri_dev, 3);
-  deepmd::convert_to_inter_gpu(ri_dev, region_dev, ref_rp_dev);
-  deepmd::memcpy_device_to_host(ri_dev, ri, 3);
-  for (int ii = 0; ii < 3; ++ii) {
-    EXPECT_LT(fabs(ri[ii] - ref_ri[ii]), 1e-10);
-  }
-  double rp2[3];
-  double* rp2_dev = NULL;
-  deepmd::malloc_device_memory(rp2_dev, 3);
-  deepmd::convert_to_phys_gpu(rp2_dev, region_dev, ri_dev);
-  deepmd::memcpy_device_to_host(rp2_dev, rp2, 3);
-  for (int ii = 0; ii < 3; ++ii) {
-    EXPECT_LT(fabs(rp2[ii] - ref_rp[ii]), 1e-10);
-  }
-  double rp[3];
-  double* rp_dev = NULL;
-  deepmd::malloc_device_memory(rp_dev, 3);
-  deepmd::convert_to_phys_gpu(rp_dev, region_dev, ref_ri_dev);
-  deepmd::memcpy_device_to_host(rp_dev, rp, 3);
-  for (int ii = 0; ii < 3; ++ii) {
-    EXPECT_LT(fabs(rp[ii] - ref_rp[ii]), 1e-10);
-  }
-  double ri2[3];
-  double* ri2_dev = NULL;
-  deepmd::malloc_device_memory(ri2_dev, 3);
-  deepmd::convert_to_inter_gpu(ri2_dev, region_dev, rp_dev);
-  deepmd::memcpy_device_to_host(ri2_dev, ri2, 3);
-  for (int ii = 0; ii < 3; ++ii) {
-    EXPECT_LT(fabs(ri2[ii] - ref_ri[ii]), 1e-10);
-  }
-  deepmd::delete_device_memory(boxt_dev);
-  deepmd::delete_device_memory(rec_boxt_dev);
-  deepmd::delete_device_memory(vol_dev);
-  deepmd::delete_device_memory(ref_rp_dev);
-  deepmd::delete_device_memory(ref_ri_dev);
-  deepmd::delete_device_memory(ri_dev);
-  deepmd::delete_device_memory(rp2_dev);
-  deepmd::delete_device_memory(rp_dev);
-  deepmd::delete_device_memory(ri2_dev);
-  region_dev.boxt = new_boxt;
-  region_dev.rec_boxt = new_rec_boxt;
-}
-#endif  // TENSORFLOW_USE_ROCM
diff --git a/source/lib/tests/test_tabulate_se_a.cc b/source/lib/tests/test_tabulate_se_a.cc
index 6f76f9c2ee..fc0fd04980 100644
--- a/source/lib/tests/test_tabulate_se_a.cc
+++ b/source/lib/tests/test_tabulate_se_a.cc
@@ -755,7 +755,7 @@ TEST_F(TestTabulateSeA, tabulate_fusion_se_a_grad_cpu) {
   }
 }
 
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 TEST_F(TestTabulateSeA, tabulate_fusion_se_a_gpu) {
   std::vector<double> xyz_scatter(nloc * nnei * last_layer_size, 0.0);
 
@@ -852,103 +852,4 @@ TEST_F(TestTabulateSeA, tabulate_fusion_se_a_grad_gpu) {
   deepmd::delete_device_memory(dy_dev);
   deepmd::delete_device_memory(two_embed_dev);
 }
-#endif  // GOOGLE_CUDA
-
-#if TENSORFLOW_USE_ROCM
-TEST_F(TestTabulateSeA, tabulate_fusion_se_a_gpu) {
-  std::vector<double> xyz_scatter(nloc * nnei * last_layer_size, 0.0);
-
-  double *xyz_scatter_dev = NULL, *table_dev = NULL, *em_x_dev = NULL,
-         *em_dev = NULL;
-  deepmd::malloc_device_memory_sync(xyz_scatter_dev, xyz_scatter);
-  deepmd::malloc_device_memory_sync(table_dev, table);
-  deepmd::malloc_device_memory_sync(em_x_dev, em_x);
-  deepmd::malloc_device_memory_sync(em_dev, em);
-  deepmd::tabulate_fusion_se_a_gpu<double>(xyz_scatter_dev, table_dev, &info[0],
-                                           em_x_dev, em_dev, nullptr, nloc,
-                                           nnei, last_layer_size);
-  deepmd::memcpy_device_to_host(xyz_scatter_dev, xyz_scatter);
-
-  EXPECT_EQ(xyz_scatter.size(), nloc * nnei * last_layer_size);
-  EXPECT_EQ(xyz_scatter.size(), expected_xyz_scatter.size());
-  for (int jj = 0; jj < xyz_scatter.size(); ++jj) {
-    EXPECT_LT(fabs(xyz_scatter[jj] - expected_xyz_scatter[jj]), 1e-5);
-  }
-
-  double *two_embed_dev = nullptr;
-  deepmd::malloc_device_memory_sync(two_embed_dev, two_embed);
-  deepmd::malloc_device_memory_sync(xyz_scatter_dev, xyz_scatter);
-  deepmd::tabulate_fusion_se_a_gpu<double>(xyz_scatter_dev, table_dev, &info[0],
-                                           em_x_dev, em_dev, two_embed_dev,
-                                           nloc, nnei, last_layer_size);
-  deepmd::memcpy_device_to_host(xyz_scatter_dev, xyz_scatter);
-
-  EXPECT_EQ(xyz_scatter.size(), nloc * nnei * last_layer_size);
-  EXPECT_EQ(xyz_scatter.size(), expected_xyz_scatter.size());
-  for (int jj = 0; jj < xyz_scatter.size(); ++jj) {
-    EXPECT_LT(fabs(xyz_scatter[jj] - expected_xyz_scatter_with_two_embed[jj]),
-              1e-5);
-  }
-
-  deepmd::delete_device_memory(xyz_scatter_dev);
-  deepmd::delete_device_memory(table_dev);
-  deepmd::delete_device_memory(em_x_dev);
-  deepmd::delete_device_memory(em_dev);
-  deepmd::delete_device_memory(two_embed_dev);
-}
-
-TEST_F(TestTabulateSeA, tabulate_fusion_se_a_grad_gpu) {
-  std::vector<double> dy_dem_x(em_x.size(), 0.0);
-  std::vector<double> dy_dem(em.size(), 0.0);
-  std::vector<double> dy(nloc * nnei * last_layer_size, 1.0);
-
-  double *dy_dem_x_dev = NULL, *dy_dem_dev = NULL, *table_dev = NULL,
-         *em_x_dev = NULL, *em_dev = NULL, *dy_dev = NULL;
-  deepmd::malloc_device_memory_sync(dy_dem_x_dev, dy_dem_x);
-  deepmd::malloc_device_memory_sync(dy_dem_dev, dy_dem);
-  deepmd::malloc_device_memory_sync(table_dev, table);
-  deepmd::malloc_device_memory_sync(em_x_dev, em_x);
-  deepmd::malloc_device_memory_sync(em_dev, em);
-  deepmd::malloc_device_memory_sync(dy_dev, dy);
-  deepmd::tabulate_fusion_se_a_grad_gpu<double>(
-      dy_dem_x_dev, dy_dem_dev, table_dev, &info[0], em_x_dev, em_dev, nullptr,
-      dy_dev, nloc, nnei, last_layer_size);
-  deepmd::memcpy_device_to_host(dy_dem_x_dev, dy_dem_x);
-  deepmd::memcpy_device_to_host(dy_dem_dev, dy_dem);
-
-  EXPECT_EQ(dy_dem_x.size(), nloc * nnei);
-  EXPECT_EQ(dy_dem.size(), nloc * nnei * 4);
-  EXPECT_EQ(dy_dem_x.size(), expected_dy_dem_x.size());
-  EXPECT_EQ(dy_dem.size(), expected_dy_dem.size());
-  for (int jj = 0; jj < dy_dem_x.size(); ++jj) {
-    EXPECT_LT(fabs(dy_dem_x[jj] - expected_dy_dem_x[jj]), 1e-5);
-  }
-  for (int jj = 0; jj < dy_dem.size(); ++jj) {
-    EXPECT_LT(fabs(dy_dem[jj] - expected_dy_dem[jj]), 1e-5);
-  }
-
-  double *two_embed_dev = nullptr;
-  deepmd::malloc_device_memory_sync(two_embed_dev, two_embed);
-  deepmd::malloc_device_memory_sync(dy_dem_x_dev, dy_dem_x);
-  deepmd::malloc_device_memory_sync(dy_dem_dev, dy_dem);
-  deepmd::tabulate_fusion_se_a_grad_gpu<double>(
-      dy_dem_x_dev, dy_dem_dev, table_dev, &info[0], em_x_dev, em_dev,
-      two_embed_dev, dy_dev, nloc, nnei, last_layer_size);
-  deepmd::memcpy_device_to_host(dy_dem_x_dev, dy_dem_x);
-  deepmd::memcpy_device_to_host(dy_dem_dev, dy_dem);
-  for (int jj = 0; jj < dy_dem_x.size(); ++jj) {
-    EXPECT_LT(fabs(dy_dem_x[jj] - expected_dy_dem_x_with_two_embed[jj]), 1e-5);
-  }
-  for (int jj = 0; jj < dy_dem.size(); ++jj) {
-    EXPECT_LT(fabs(dy_dem[jj] - expected_dy_dem_with_two_embed[jj]), 1e-5);
-  }
-
-  deepmd::delete_device_memory(dy_dem_x_dev);
-  deepmd::delete_device_memory(dy_dem_dev);
-  deepmd::delete_device_memory(table_dev);
-  deepmd::delete_device_memory(em_x_dev);
-  deepmd::delete_device_memory(em_dev);
-  deepmd::delete_device_memory(dy_dev);
-  deepmd::delete_device_memory(two_embed_dev);
-}
-#endif  // TENSORFLOW_USE_ROCM
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
diff --git a/source/lib/tests/test_tabulate_se_r.cc b/source/lib/tests/test_tabulate_se_r.cc
index 5097451aab..8ac7e13c96 100644
--- a/source/lib/tests/test_tabulate_se_r.cc
+++ b/source/lib/tests/test_tabulate_se_r.cc
@@ -606,7 +606,7 @@ TEST_F(TestTabulateSeR, tabulate_fusion_se_r_grad_cpu) {
   }
 }
 
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 TEST_F(TestTabulateSeR, tabulate_fusion_se_r_gpu) {
   std::vector<double> xyz_scatter(nloc * nnei * last_layer_size, 0.0);
 
@@ -653,53 +653,4 @@ TEST_F(TestTabulateSeR, tabulate_fusion_se_r_grad_gpu) {
     EXPECT_LT(fabs(dy_dem[jj] - expected_dy_dem[jj]), 1e-5);
   }
 }
-#endif  // GOOGLE_CUDA
-
-#if TENSORFLOW_USE_ROCM
-TEST_F(TestTabulateSeR, tabulate_fusion_se_r_gpu) {
-  std::vector<double> xyz_scatter(nloc * nnei * last_layer_size, 0.0);
-
-  double *xyz_scatter_dev = NULL, *table_dev = NULL, *em_dev = NULL;
-  deepmd::malloc_device_memory_sync(xyz_scatter_dev, xyz_scatter);
-  deepmd::malloc_device_memory_sync(table_dev, table);
-  deepmd::malloc_device_memory_sync(em_dev, em);
-  deepmd::tabulate_fusion_se_r_gpu<double>(xyz_scatter_dev, table_dev, &info[0],
-                                           em_dev, nloc, nnei, last_layer_size);
-  deepmd::memcpy_device_to_host(xyz_scatter_dev, xyz_scatter);
-  deepmd::delete_device_memory(xyz_scatter_dev);
-  deepmd::delete_device_memory(table_dev);
-  deepmd::delete_device_memory(em_dev);
-
-  EXPECT_EQ(xyz_scatter.size(), nloc * nnei * last_layer_size);
-  EXPECT_EQ(xyz_scatter.size(), expected_xyz_scatter.size());
-  for (int jj = 0; jj < xyz_scatter.size(); ++jj) {
-    EXPECT_LT(fabs(xyz_scatter[jj] - expected_xyz_scatter[jj]), 1e-5);
-  }
-}
-
-TEST_F(TestTabulateSeR, tabulate_fusion_se_r_grad_gpu) {
-  std::vector<double> dy_dem(em.size(), 0.0);
-  std::vector<double> dy(nloc * nnei * last_layer_size, 1.0);
-
-  double *dy_dem_dev = NULL, *table_dev = NULL, *em_dev = NULL, *dy_dev = NULL;
-  deepmd::malloc_device_memory_sync(dy_dem_dev, dy_dem);
-  deepmd::malloc_device_memory_sync(table_dev, table);
-  deepmd::malloc_device_memory_sync(em_dev, em);
-  deepmd::malloc_device_memory_sync(dy_dev, dy);
-  deepmd::tabulate_fusion_se_r_grad_gpu<double>(dy_dem_dev, table_dev, &info[0],
-                                                em_dev, dy_dev, nloc, nnei,
-                                                last_layer_size);
-  deepmd::memcpy_device_to_host(dy_dem_dev, dy_dem);
-  deepmd::delete_device_memory(dy_dem_dev);
-  deepmd::delete_device_memory(table_dev);
-  deepmd::delete_device_memory(em_dev);
-  deepmd::delete_device_memory(dy_dev);
-
-  EXPECT_EQ(dy_dem.size(), nloc * nnei);
-  EXPECT_EQ(dy_dem.size(), expected_dy_dem.size());
-
-  for (int jj = 0; jj < dy_dem.size(); ++jj) {
-    EXPECT_LT(fabs(dy_dem[jj] - expected_dy_dem[jj]), 1e-5);
-  }
-}
-#endif  // TENSORFLOW_USE_ROCM
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
diff --git a/source/lib/tests/test_tabulate_se_t.cc b/source/lib/tests/test_tabulate_se_t.cc
index ffb1b41220..be82a07ba6 100644
--- a/source/lib/tests/test_tabulate_se_t.cc
+++ b/source/lib/tests/test_tabulate_se_t.cc
@@ -5260,7 +5260,7 @@ TEST_F(TestTabulateSeT, tabulate_fusion_se_t_grad_cpu) {
   }
 }
 
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 TEST_F(TestTabulateSeT, tabulate_fusion_se_t_gpu) {
   std::vector<double> xyz_scatter(nloc * last_layer_size, 0.0);
   double *xyz_scatter_dev = NULL, *table_dev = NULL, *em_x_dev = NULL,
@@ -5322,66 +5322,4 @@ TEST_F(TestTabulateSeT, tabulate_fusion_se_a_grad_gpu) {
     EXPECT_LT(fabs(dy_dem[jj] - expected_dy_dem[jj]), 1e-5);
   }
 }
-#endif  // GOOGLE_CUDA
-
-#if TENSORFLOW_USE_ROCM
-TEST_F(TestTabulateSeT, tabulate_fusion_se_t_gpu) {
-  std::vector<double> xyz_scatter(nloc * last_layer_size, 0.0);
-  double *xyz_scatter_dev = NULL, *table_dev = NULL, *em_x_dev = NULL,
-         *em_dev = NULL;
-  deepmd::malloc_device_memory_sync(xyz_scatter_dev, xyz_scatter);
-  deepmd::malloc_device_memory_sync(table_dev, table);
-  deepmd::malloc_device_memory_sync(em_x_dev, em_x);
-  deepmd::malloc_device_memory_sync(em_dev, em);
-  deepmd::tabulate_fusion_se_t_gpu<double>(xyz_scatter_dev, table_dev, &info[0],
-                                           em_x_dev, em_dev, nloc, nnei_i,
-                                           nnei_j, last_layer_size);
-  deepmd::memcpy_device_to_host(xyz_scatter_dev, xyz_scatter);
-  deepmd::delete_device_memory(xyz_scatter_dev);
-  deepmd::delete_device_memory(table_dev);
-  deepmd::delete_device_memory(em_x_dev);
-  deepmd::delete_device_memory(em_dev);
-
-  EXPECT_EQ(xyz_scatter.size(), nloc * last_layer_size);
-  EXPECT_EQ(xyz_scatter.size(), expected_xyz_scatter.size());
-  for (int jj = 0; jj < xyz_scatter.size() / 100; ++jj) {
-    EXPECT_LT(fabs(xyz_scatter[jj] - expected_xyz_scatter[jj]), 1e-5);
-  }
-}
-
-TEST_F(TestTabulateSeT, tabulate_fusion_se_t_grad_gpu) {
-  std::vector<double> dy_dem_x(em_x.size(), 0.0);
-  std::vector<double> dy_dem(em.size(), 0.0);
-
-  double *dy_dem_x_dev = NULL, *dy_dem_dev = NULL, *table_dev = NULL,
-         *em_x_dev = NULL, *em_dev = NULL, *dy_dev = NULL;
-  deepmd::malloc_device_memory_sync(dy_dem_x_dev, dy_dem_x);
-  deepmd::malloc_device_memory_sync(dy_dem_dev, dy_dem);
-  deepmd::malloc_device_memory_sync(table_dev, table);
-  deepmd::malloc_device_memory_sync(em_x_dev, em_x);
-  deepmd::malloc_device_memory_sync(em_dev, em);
-  deepmd::malloc_device_memory_sync(dy_dev, dy);
-  deepmd::tabulate_fusion_se_t_grad_gpu<double>(
-      dy_dem_x_dev, dy_dem_dev, table_dev, &info[0], em_x_dev, em_dev, dy_dev,
-      nloc, nnei_i, nnei_j, last_layer_size);
-  deepmd::memcpy_device_to_host(dy_dem_x_dev, dy_dem_x);
-  deepmd::memcpy_device_to_host(dy_dem_dev, dy_dem);
-  deepmd::delete_device_memory(dy_dem_x_dev);
-  deepmd::delete_device_memory(dy_dem_dev);
-  deepmd::delete_device_memory(table_dev);
-  deepmd::delete_device_memory(em_x_dev);
-  deepmd::delete_device_memory(em_dev);
-  deepmd::delete_device_memory(dy_dev);
-
-  EXPECT_EQ(dy_dem_x.size(), nloc * nnei_i * nnei_j);
-  EXPECT_EQ(dy_dem.size(), nloc * nnei_i * nnei_j);
-  EXPECT_EQ(dy_dem_x.size(), expected_dy_dem_x.size());
-  EXPECT_EQ(dy_dem.size(), expected_dy_dem.size());
-  for (int jj = 0; jj < dy_dem_x.size(); ++jj) {
-    EXPECT_LT(fabs(dy_dem_x[jj] - expected_dy_dem_x[jj]), 1e-5);
-  }
-  for (int jj = 0; jj < dy_dem.size(); ++jj) {
-    EXPECT_LT(fabs(dy_dem[jj] - expected_dy_dem[jj]), 1e-5);
-  }
-}
-#endif  // TENSORFLOW_USE_ROCM
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

From 7fb1d1126d8d88d112f6b14002f349c1e2c1eae3 Mon Sep 17 00:00:00 2001
From: Jinzhe Zeng <jinzhe.zeng@rutgers.edu>
Date: Wed, 20 Sep 2023 04:22:37 -0400
Subject: [PATCH 47/63] prod_env_mat: allocate GPU memory out of frame loop
 (#2832)

Allocating GPU memory is not a cheap operation. This PR allocates memory
for `int_temp`, `uint64_temp`, and `tensor_list[0, 1, 3, 4, 5, 6]` outside
of the frame loop, so they can be reused in each iteration instead of
being allocated once per frame.
In the original code, `tensor_list[3]`, `tensor_list[4]`, and
`tensor_list[6]` may need to be reallocated if the initially allocated
memory is not enough. This behavior is unchanged.
The shape of `tensor_list[2]` is dynamic, so it is not refactored in
this PR.
With CUDA enabled, the C++ and Python unit tests pass, and the examples
run successfully.
The speedup can be observed when the number of frames (samples) in a
batch is not small.

---------

Signed-off-by: Jinzhe Zeng <jinzhe.zeng@rutgers.edu>
---
 source/op/prod_env_mat_multi_device.cc | 421 ++++++++++++++-----------
 1 file changed, 245 insertions(+), 176 deletions(-)

diff --git a/source/op/prod_env_mat_multi_device.cc b/source/op/prod_env_mat_multi_device.cc
index a90f81d079..ee07dc22fe 100644
--- a/source/op/prod_env_mat_multi_device.cc
+++ b/source/op/prod_env_mat_multi_device.cc
@@ -559,6 +559,69 @@ class ProdEnvMatAOp : public OpKernel {
     const FPTYPE* std = std_tensor.flat<FPTYPE>().data();
     const int* p_type = type_tensor.flat<int>().data();
 
+    // must declar out of if, otherwise the memory will be destroyed!
+    Tensor int_temp;
+    Tensor uint64_temp;
+    std::vector<Tensor> tensor_list(7);
+    if (device == "GPU") {
+      // allocate temp memory only once for multiple frames
+      // allocate temp memory, temp memory must not be used after this
+      // operation!
+      if (nei_mode != 3) {
+        if (nei_mode == 1) {
+          // Tensor FPTYPE_temp;
+          TensorShape FPTYPE_shape;
+          FPTYPE_shape.AddDim(nall * 3);
+          OP_REQUIRES_OK(context,
+                         context->allocate_temp(DataTypeToEnum<FPTYPE>::value,
+                                                FPTYPE_shape, &tensor_list[0]));
+
+          // Tensor double_temp;
+          TensorShape double_shape;
+          double_shape.AddDim(18);
+          OP_REQUIRES_OK(context,
+                         context->allocate_temp(DataTypeToEnum<FPTYPE>::value,
+                                                double_shape, &tensor_list[1]));
+          // Tensor cpy_temp;
+          TensorShape cpy_shape;
+          cpy_shape.AddDim(mem_cpy * 3);
+          OP_REQUIRES_OK(context,
+                         context->allocate_temp(DataTypeToEnum<FPTYPE>::value,
+                                                cpy_shape, &tensor_list[3]));
+          // Tensor t_temp;
+          TensorShape t_shape;
+          t_shape.AddDim(mem_cpy * 2);
+          OP_REQUIRES_OK(context, context->allocate_temp(DT_INT32, t_shape,
+                                                         &tensor_list[4]));
+        }
+
+        // Tensor nlist_temp;
+        TensorShape nlist_shape;
+        nlist_shape.AddDim(nloc * 2);
+        OP_REQUIRES_OK(context, context->allocate_temp(DT_INT32, nlist_shape,
+                                                       &tensor_list[5]));
+
+        TensorShape jlist_shape;
+        jlist_shape.AddDim(3 * int_64(nloc) * mem_nnei);
+        OP_REQUIRES_OK(context, context->allocate_temp(DT_INT32, jlist_shape,
+                                                       &tensor_list[6]));
+      }
+
+      // used for format_nbor_list_gpu_cuda
+
+      TensorShape int_shape;
+      int_shape.AddDim(sec_a.size() + int_64(nloc) * sec_a.size() + nloc);
+      OP_REQUIRES_OK(context,
+                     context->allocate_temp(DT_INT32, int_shape, &int_temp));
+
+      TensorShape uint64_shape;
+      uint64_shape.AddDim(int_64(nloc) * max_nbor_size * 2);
+      OP_REQUIRES_OK(context, context->allocate_temp(DT_UINT64, uint64_shape,
+                                                     &uint64_temp));
+      array_int = int_temp.flat<int>().data();
+      array_longlong = uint64_temp.flat<unsigned long long>().data();
+    }
+
     // loop over samples
     for (int_64 ff = 0; ff < nsamples; ++ff) {
       FPTYPE* em = p_em + ff * nloc * ndescrpt;
@@ -580,7 +643,6 @@ class ProdEnvMatAOp : public OpKernel {
         int* type_cpy;
         int frame_nall = nall;
         int mesh_tensor_size = static_cast<int>(mesh_tensor.NumElements());
-        std::vector<Tensor> tensor_list(7);
         // prepare coord and nlist
         _prepare_coord_nlist_gpu<FPTYPE>(
             context, &tensor_list[0], &coord, coord_cpy, &type, type_cpy,
@@ -589,21 +651,6 @@ class ProdEnvMatAOp : public OpKernel {
             mesh_tensor.flat<int>().data(), mesh_tensor_size, nloc, nei_mode,
             rcut_r, max_cpy_trial, max_nnei_trial);
 
-        // allocate temp memory, temp memory must not be used after this
-        // operation!
-        Tensor int_temp;
-        TensorShape int_shape;
-        int_shape.AddDim(sec_a.size() + int_64(nloc) * sec_a.size() + nloc);
-        OP_REQUIRES_OK(context,
-                       context->allocate_temp(DT_INT32, int_shape, &int_temp));
-        Tensor uint64_temp;
-        TensorShape uint64_shape;
-        uint64_shape.AddDim(int_64(nloc) * max_nbor_size * 2);
-        OP_REQUIRES_OK(context, context->allocate_temp(DT_UINT64, uint64_shape,
-                                                       &uint64_temp));
-        array_int = int_temp.flat<int>().data();
-        array_longlong = uint64_temp.flat<unsigned long long>().data();
-
         // launch the gpu(nv) compute function
         deepmd::prod_env_mat_a_gpu(em, em_deriv, rij, nlist, coord, type,
                                    gpu_inlist, array_int, array_longlong,
@@ -625,7 +672,6 @@ class ProdEnvMatAOp : public OpKernel {
         int* type_cpy;
         int frame_nall = nall;
         int mesh_tensor_size = static_cast<int>(mesh_tensor.NumElements());
-        std::vector<Tensor> tensor_list(7);
         // prepare coord and nlist
         _prepare_coord_nlist_gpu<FPTYPE>(
             context, &tensor_list[0], &coord, coord_cpy, &type, type_cpy,
@@ -634,21 +680,6 @@ class ProdEnvMatAOp : public OpKernel {
             mesh_tensor.flat<int>().data(), mesh_tensor_size, nloc, nei_mode,
             rcut_r, max_cpy_trial, max_nnei_trial);
 
-        // allocate temp memory, temp memory must not be used after this
-        // operation!
-        Tensor int_temp;
-        TensorShape int_shape;
-        int_shape.AddDim(sec_a.size() + int_64(nloc) * sec_a.size() + nloc);
-        OP_REQUIRES_OK(context,
-                       context->allocate_temp(DT_INT32, int_shape, &int_temp));
-        Tensor uint64_temp;
-        TensorShape uint64_shape;
-        uint64_shape.AddDim(int_64(nloc) * max_nbor_size * 2);
-        OP_REQUIRES_OK(context, context->allocate_temp(DT_UINT64, uint64_shape,
-                                                       &uint64_temp));
-        array_int = int_temp.flat<int>().data();
-        array_longlong = uint64_temp.flat<unsigned long long>().data();
-
         // launch the gpu(nv) compute function
         deepmd::prod_env_mat_a_gpu(em, em_deriv, rij, nlist, coord, type,
                                    gpu_inlist, array_int, array_longlong,
@@ -854,6 +885,70 @@ class ProdEnvMatROp : public OpKernel {
     const FPTYPE* std = std_tensor.flat<FPTYPE>().data();
     const int* p_type = type_tensor.flat<int>().data();
 
+    // must declar out of if, otherwise the memory will be destroyed!
+    Tensor int_temp;
+    Tensor uint64_temp;
+    std::vector<Tensor> tensor_list(7);
+    if (device == "GPU") {
+      // allocate temp memory only once for multiple frames
+      // allocate temp memory, temp memory must not be used after this
+      // operation!
+      if (nei_mode != 3) {
+        if (nei_mode == 1) {
+          // Tensor FPTYPE_temp;
+          TensorShape FPTYPE_shape;
+          FPTYPE_shape.AddDim(nall * 3);
+          OP_REQUIRES_OK(context,
+                         context->allocate_temp(DataTypeToEnum<FPTYPE>::value,
+                                                FPTYPE_shape, &tensor_list[0]));
+
+          // Tensor double_temp;
+          TensorShape double_shape;
+          double_shape.AddDim(18);
+          OP_REQUIRES_OK(context,
+                         context->allocate_temp(DataTypeToEnum<FPTYPE>::value,
+                                                double_shape, &tensor_list[1]));
+          // Tensor cpy_temp;
+          TensorShape cpy_shape;
+          cpy_shape.AddDim(mem_cpy * 3);
+          OP_REQUIRES_OK(context,
+                         context->allocate_temp(DataTypeToEnum<FPTYPE>::value,
+                                                cpy_shape, &tensor_list[3]));
+          // Tensor t_temp;
+          TensorShape t_shape;
+          t_shape.AddDim(mem_cpy * 2);
+          OP_REQUIRES_OK(context, context->allocate_temp(DT_INT32, t_shape,
+                                                         &tensor_list[4]));
+        }
+
+        // Tensor nlist_temp;
+        TensorShape nlist_shape;
+        nlist_shape.AddDim(nloc * 2);
+        OP_REQUIRES_OK(context, context->allocate_temp(DT_INT32, nlist_shape,
+                                                       &tensor_list[5]));
+
+        TensorShape jlist_shape;
+        jlist_shape.AddDim(3 * int_64(nloc) * mem_nnei);
+        OP_REQUIRES_OK(context, context->allocate_temp(DT_INT32, jlist_shape,
+                                                       &tensor_list[6]));
+      }
+
+      // used for format_nbor_list_gpu_cuda
+
+      TensorShape int_shape;
+      int_shape.AddDim(sec.size() + int_64(nloc) * sec.size() + nloc);
+      OP_REQUIRES_OK(context,
+                     context->allocate_temp(DT_INT32, int_shape, &int_temp));
+
+      TensorShape uint64_shape;
+      uint64_shape.AddDim(int_64(nloc) * max_nbor_size * 2);
+      OP_REQUIRES_OK(context, context->allocate_temp(DT_UINT64, uint64_shape,
+                                                     &uint64_temp));
+
+      array_int = int_temp.flat<int>().data();
+      array_longlong = uint64_temp.flat<unsigned long long>().data();
+    }
+
     // loop over samples
     for (int_64 ff = 0; ff < nsamples; ++ff) {
       FPTYPE* em = p_em + ff * nloc * ndescrpt;
@@ -875,7 +970,6 @@ class ProdEnvMatROp : public OpKernel {
         int* type_cpy;
         int frame_nall = nall;
         int mesh_tensor_size = static_cast<int>(mesh_tensor.NumElements());
-        std::vector<Tensor> tensor_list(7);
         // prepare coord and nlist
         _prepare_coord_nlist_gpu<FPTYPE>(
             context, &tensor_list[0], &coord, coord_cpy, &type, type_cpy,
@@ -884,21 +978,6 @@ class ProdEnvMatROp : public OpKernel {
             mesh_tensor.flat<int>().data(), mesh_tensor_size, nloc, nei_mode,
             rcut, max_cpy_trial, max_nnei_trial);
 
-        // allocate temp memory, temp memory must not be used after this
-        // operation!
-        Tensor int_temp;
-        TensorShape int_shape;
-        int_shape.AddDim(sec.size() + int_64(nloc) * sec.size() + nloc);
-        OP_REQUIRES_OK(context,
-                       context->allocate_temp(DT_INT32, int_shape, &int_temp));
-        Tensor uint64_temp;
-        TensorShape uint64_shape;
-        uint64_shape.AddDim(int_64(nloc) * max_nbor_size * 2);
-        OP_REQUIRES_OK(context, context->allocate_temp(DT_UINT64, uint64_shape,
-                                                       &uint64_temp));
-        array_int = int_temp.flat<int>().data();
-        array_longlong = uint64_temp.flat<unsigned long long>().data();
-
         // launch the gpu(nv) compute function
         deepmd::prod_env_mat_r_gpu(em, em_deriv, rij, nlist, coord, type,
                                    gpu_inlist, array_int, array_longlong,
@@ -920,7 +999,6 @@ class ProdEnvMatROp : public OpKernel {
         int* type_cpy;
         int frame_nall = nall;
         int mesh_tensor_size = static_cast<int>(mesh_tensor.NumElements());
-        std::vector<Tensor> tensor_list(7);
         // prepare coord and nlist
         _prepare_coord_nlist_gpu<FPTYPE>(
             context, &tensor_list[0], &coord, coord_cpy, &type, type_cpy,
@@ -929,21 +1007,6 @@ class ProdEnvMatROp : public OpKernel {
             mesh_tensor.flat<int>().data(), mesh_tensor_size, nloc, nei_mode,
             rcut, max_cpy_trial, max_nnei_trial);
 
-        // allocate temp memory, temp memory must not be used after this
-        // operation!
-        Tensor int_temp;
-        TensorShape int_shape;
-        int_shape.AddDim(sec.size() + int_64(nloc) * sec.size() + nloc);
-        OP_REQUIRES_OK(context,
-                       context->allocate_temp(DT_INT32, int_shape, &int_temp));
-        Tensor uint64_temp;
-        TensorShape uint64_shape;
-        uint64_shape.AddDim(int_64(nloc) * max_nbor_size * 2);
-        OP_REQUIRES_OK(context, context->allocate_temp(DT_UINT64, uint64_shape,
-                                                       &uint64_temp));
-        array_int = int_temp.flat<int>().data();
-        array_longlong = uint64_temp.flat<unsigned long long>().data();
-
         // launch the gpu(nv) compute function
         deepmd::prod_env_mat_r_gpu(em, em_deriv, rij, nlist, coord, type,
                                    gpu_inlist, array_int, array_longlong,
@@ -1197,6 +1260,70 @@ class ProdEnvMatAMixOp : public OpKernel {
       }
     }
 
+    // must declar out of if, otherwise the memory will be destroyed!
+    Tensor int_temp;
+    Tensor uint64_temp;
+    std::vector<Tensor> tensor_list(7);
+    if (device == "GPU") {
+      // allocate temp memory only once for multiple frames
+      // allocate temp memory, temp memory must not be used after this
+      // operation!
+      if (nei_mode != 3) {
+        if (nei_mode == 1) {
+          // Tensor FPTYPE_temp;
+          TensorShape FPTYPE_shape;
+          FPTYPE_shape.AddDim(nall * 3);
+          OP_REQUIRES_OK(context,
+                         context->allocate_temp(DataTypeToEnum<FPTYPE>::value,
+                                                FPTYPE_shape, &tensor_list[0]));
+
+          // Tensor double_temp;
+          TensorShape double_shape;
+          double_shape.AddDim(18);
+          OP_REQUIRES_OK(context,
+                         context->allocate_temp(DataTypeToEnum<FPTYPE>::value,
+                                                double_shape, &tensor_list[1]));
+          // Tensor cpy_temp;
+          TensorShape cpy_shape;
+          cpy_shape.AddDim(mem_cpy * 3);
+          OP_REQUIRES_OK(context,
+                         context->allocate_temp(DataTypeToEnum<FPTYPE>::value,
+                                                cpy_shape, &tensor_list[3]));
+          // Tensor t_temp;
+          TensorShape t_shape;
+          t_shape.AddDim(mem_cpy * 2);
+          OP_REQUIRES_OK(context, context->allocate_temp(DT_INT32, t_shape,
+                                                         &tensor_list[4]));
+        }
+
+        // Tensor nlist_temp;
+        TensorShape nlist_shape;
+        nlist_shape.AddDim(nloc * 2);
+        OP_REQUIRES_OK(context, context->allocate_temp(DT_INT32, nlist_shape,
+                                                       &tensor_list[5]));
+
+        TensorShape jlist_shape;
+        jlist_shape.AddDim(3 * int_64(nloc) * mem_nnei);
+        OP_REQUIRES_OK(context, context->allocate_temp(DT_INT32, jlist_shape,
+                                                       &tensor_list[6]));
+      }
+
+      // used for format_nbor_list_gpu_cuda
+
+      TensorShape int_shape;
+      int_shape.AddDim(sec_a.size() + int_64(nloc) * sec_a.size() + nloc);
+      OP_REQUIRES_OK(context,
+                     context->allocate_temp(DT_INT32, int_shape, &int_temp));
+
+      TensorShape uint64_shape;
+      uint64_shape.AddDim(int_64(nloc) * max_nbor_size * 2);
+      OP_REQUIRES_OK(context, context->allocate_temp(DT_UINT64, uint64_shape,
+                                                     &uint64_temp));
+
+      array_int = int_temp.flat<int>().data();
+      array_longlong = uint64_temp.flat<unsigned long long>().data();
+    }
+
     // loop over samples
     for (int_64 ff = 0; ff < nsamples; ++ff) {
       FPTYPE* em = p_em + ff * nloc * ndescrpt;
@@ -1221,7 +1348,6 @@ class ProdEnvMatAMixOp : public OpKernel {
         int* type_cpy;
         int frame_nall = nall;
         int mesh_tensor_size = static_cast<int>(mesh_tensor.NumElements());
-        std::vector<Tensor> tensor_list(7);
         // prepare coord and nlist
         _prepare_coord_nlist_gpu<FPTYPE>(
             context, &tensor_list[0], &coord, coord_cpy, &f_type, type_cpy,
@@ -1230,21 +1356,6 @@ class ProdEnvMatAMixOp : public OpKernel {
             mesh_tensor.flat<int>().data(), mesh_tensor_size, nloc, nei_mode,
             rcut_r, max_cpy_trial, max_nnei_trial);
 
-        // allocate temp memory, temp memory must not be used after this
-        // operation!
-        Tensor int_temp;
-        TensorShape int_shape;
-        int_shape.AddDim(sec_a.size() + int_64(nloc) * sec_a.size() + nloc);
-        OP_REQUIRES_OK(context,
-                       context->allocate_temp(DT_INT32, int_shape, &int_temp));
-        Tensor uint64_temp;
-        TensorShape uint64_shape;
-        uint64_shape.AddDim(int_64(nloc) * max_nbor_size * 2);
-        OP_REQUIRES_OK(context, context->allocate_temp(DT_UINT64, uint64_shape,
-                                                       &uint64_temp));
-        array_int = int_temp.flat<int>().data();
-        array_longlong = uint64_temp.flat<unsigned long long>().data();
-
         // launch the gpu(nv) compute function
         deepmd::prod_env_mat_a_gpu(em, em_deriv, rij, nlist, coord, type,
                                    gpu_inlist, array_int, array_longlong,
@@ -1265,7 +1376,6 @@ class ProdEnvMatAMixOp : public OpKernel {
         int* type_cpy;
         int frame_nall = nall;
         int mesh_tensor_size = static_cast<int>(mesh_tensor.NumElements());
-        std::vector<Tensor> tensor_list(7);
         // prepare coord and nlist
         _prepare_coord_nlist_gpu<FPTYPE>(
             context, &tensor_list[0], &coord, coord_cpy, &f_type, type_cpy,
@@ -1274,21 +1384,6 @@ class ProdEnvMatAMixOp : public OpKernel {
             mesh_tensor.flat<int>().data(), mesh_tensor_size, nloc, nei_mode,
             rcut_r, max_cpy_trial, max_nnei_trial);
 
-        // allocate temp memory, temp memory must not be used after this
-        // operation!
-        Tensor int_temp;
-        TensorShape int_shape;
-        int_shape.AddDim(sec_a.size() + int_64(nloc) * sec_a.size() + nloc);
-        OP_REQUIRES_OK(context,
-                       context->allocate_temp(DT_INT32, int_shape, &int_temp));
-        Tensor uint64_temp;
-        TensorShape uint64_shape;
-        uint64_shape.AddDim(int_64(nloc) * max_nbor_size * 2);
-        OP_REQUIRES_OK(context, context->allocate_temp(DT_UINT64, uint64_shape,
-                                                       &uint64_temp));
-        array_int = int_temp.flat<int>().data();
-        array_longlong = uint64_temp.flat<unsigned long long>().data();
-
         // launch the gpu(nv) compute function
         deepmd::prod_env_mat_a_gpu(em, em_deriv, rij, nlist, coord, type,
                                    gpu_inlist, array_int, array_longlong,
@@ -1536,14 +1631,6 @@ static int _norm_copy_coord_gpu(OpKernelContext* context,
                                 const int& nloc,
                                 const int& max_cpy_trial,
                                 const float& rcut_r) {
-  // Tensor FPTYPE_temp;
-  TensorShape FPTYPE_shape;
-  FPTYPE_shape.AddDim(nall * 3);
-  tensorflow::Status status = context->allocate_temp(
-      DataTypeToEnum<FPTYPE>::value, FPTYPE_shape, tensor_list);
-  if (!status.ok()) {
-    return false;
-  }
   FPTYPE* tmp_coord = (*tensor_list).flat<FPTYPE>().data();
   DPErrcheck(cudaMemcpy(tmp_coord, coord, sizeof(FPTYPE) * nall * 3,
                         cudaMemcpyDeviceToDevice));
@@ -1557,20 +1644,16 @@ static int _norm_copy_coord_gpu(OpKernelContext* context,
   deepmd::compute_cell_info(cell_info, rcut_r, region);
   const int loc_cellnum = cell_info[21];
   const int total_cellnum = cell_info[22];
-  // Tensor double_temp;
-  TensorShape double_shape;
-  double_shape.AddDim(18);
-  status = context->allocate_temp(DataTypeToEnum<FPTYPE>::value, double_shape,
-                                  tensor_list + 1);
-  if (!status.ok()) {
-    return false;
-  }
   // Tensor int_temp;
   TensorShape int_shape;
   int_shape.AddDim(23 + nloc * 3 + loc_cellnum + total_cellnum * 3 +
                    total_cellnum * 3 + loc_cellnum + 1 + total_cellnum + 1 +
                    nloc);
-  context, context->allocate_temp(DT_INT32, int_shape, tensor_list + 2);
+  tensorflow::Status status =
+      context->allocate_temp(DT_INT32, int_shape, tensor_list + 2);
+  if (!status.ok()) {
+    return false;
+  }
   FPTYPE* box_info_dev = (*(tensor_list + 1)).flat<FPTYPE>().data();
   int* cell_info_dev = (*(tensor_list + 2)).flat<int>().data();
   int* int_data_dev = cell_info_dev + 23;
@@ -1584,18 +1667,6 @@ static int _norm_copy_coord_gpu(OpKernelContext* context,
   deepmd::normalize_coord_gpu(tmp_coord, nall, region_dev);
   int tt;
   for (tt = 0; tt < max_cpy_trial; ++tt) {
-    // Tensor cpy_temp;
-    TensorShape cpy_shape;
-    cpy_shape.AddDim(mem_cpy * 3);
-    status = context->allocate_temp(DataTypeToEnum<FPTYPE>::value, cpy_shape,
-                                    tensor_list + 3);
-    if (!status.ok()) {
-      return false;
-    }
-    // Tensor t_temp;
-    TensorShape t_shape;
-    t_shape.AddDim(mem_cpy * 2);
-    context, context->allocate_temp(DT_INT32, t_shape, tensor_list + 4);
     coord_cpy = (*(tensor_list + 3)).flat<FPTYPE>().data();
     type_cpy = (*(tensor_list + 4)).flat<int>().data();
     idx_mapping = type_cpy + mem_cpy;
@@ -1606,6 +1677,21 @@ static int _norm_copy_coord_gpu(OpKernelContext* context,
       break;
     } else {
       mem_cpy *= 2;
+      // Tensor cpy_temp;
+      TensorShape cpy_shape;
+      cpy_shape.AddDim(mem_cpy * 3);
+      status = context->allocate_temp(DataTypeToEnum<FPTYPE>::value, cpy_shape,
+                                      tensor_list + 3);
+      if (!status.ok()) {
+        return false;
+      }
+      // Tensor t_temp;
+      TensorShape t_shape;
+      t_shape.AddDim(mem_cpy * 2);
+      status = context->allocate_temp(DT_INT32, t_shape, tensor_list + 4);
+      if (!status.ok()) {
+        return false;
+      }
     }
   }
   region_dev.boxt = new_boxt;
@@ -1627,14 +1713,6 @@ static int _build_nlist_gpu(OpKernelContext* context,
                             const int& new_nall,
                             const int& max_nnei_trial,
                             const float& rcut_r) {
-  // Tensor nlist_temp;
-  TensorShape nlist_shape;
-  nlist_shape.AddDim(nloc * 2);
-  tensorflow::Status status =
-      context->allocate_temp(DT_INT32, nlist_shape, tensor_list);
-  if (!status.ok()) {
-    return false;
-  }
   ilist = (*tensor_list).flat<int>().data();
   numneigh = ilist + nloc;
   // Tensor jlist_temp;
@@ -1643,12 +1721,6 @@ static int _build_nlist_gpu(OpKernelContext* context,
   std::vector<int*> firstneigh_host(nloc);
   int tt;
   for (tt = 0; tt < max_nnei_trial; ++tt) {
-    TensorShape jlist_shape;
-    jlist_shape.AddDim(3 * int_64(nloc) * mem_nnei);
-    status = context->allocate_temp(DT_INT32, jlist_shape, tensor_list + 1);
-    if (!status.ok()) {
-      return false;
-    }
     jlist = (*(tensor_list + 1)).flat<int>().data();
     ind_data = jlist + nloc * mem_nnei;
     for (int_64 ii = 0; ii < nloc; ++ii) {
@@ -1662,6 +1734,13 @@ static int _build_nlist_gpu(OpKernelContext* context,
       break;
     } else {
       mem_nnei *= 2;
+      TensorShape jlist_shape;
+      jlist_shape.AddDim(3 * int_64(nloc) * mem_nnei);
+      tensorflow::Status status =
+          context->allocate_temp(DT_INT32, jlist_shape, tensor_list + 1);
+      if (!status.ok()) {
+        return false;
+      }
     }
   }
   return (tt != max_nnei_trial);
@@ -1815,11 +1894,6 @@ static int _norm_copy_coord_gpu(OpKernelContext* context,
                                 const int& nloc,
                                 const int& max_cpy_trial,
                                 const float& rcut_r) {
-  // Tensor FPTYPE_temp;
-  TensorShape FPTYPE_shape;
-  FPTYPE_shape.AddDim(nall * 3);
-  context->allocate_temp(DataTypeToEnum<FPTYPE>::value, FPTYPE_shape,
-                         tensor_list);
   FPTYPE* tmp_coord = (*tensor_list).flat<FPTYPE>().data();
   DPErrcheck(hipMemcpy(tmp_coord, coord, sizeof(FPTYPE) * nall * 3,
                        hipMemcpyDeviceToDevice));
@@ -1833,20 +1907,16 @@ static int _norm_copy_coord_gpu(OpKernelContext* context,
   deepmd::compute_cell_info(cell_info, rcut_r, region);
   const int loc_cellnum = cell_info[21];
   const int total_cellnum = cell_info[22];
-  // Tensor double_temp;
-  TensorShape double_shape;
-  double_shape.AddDim(18);
-  tensorflow::Status status = context->allocate_temp(
-      DataTypeToEnum<FPTYPE>::value, double_shape, tensor_list + 1);
-  if (!status.ok()) {
-    return false;
-  }
   // Tensor int_temp;
   TensorShape int_shape;
   int_shape.AddDim(23 + nloc * 3 + loc_cellnum + total_cellnum * 3 +
                    total_cellnum * 3 + loc_cellnum + 1 + total_cellnum + 1 +
                    nloc);
-  context, context->allocate_temp(DT_INT32, int_shape, tensor_list + 2);
+  tensorflow::Status status =
+      context->allocate_temp(DT_INT32, int_shape, tensor_list + 2);
+  if (!status.ok()) {
+    return false;
+  }
   FPTYPE* box_info_dev = (*(tensor_list + 1)).flat<FPTYPE>().data();
   int* cell_info_dev = (*(tensor_list + 2)).flat<int>().data();
   int* int_data_dev = cell_info_dev + 23;
@@ -1860,15 +1930,6 @@ static int _norm_copy_coord_gpu(OpKernelContext* context,
   deepmd::normalize_coord_gpu(tmp_coord, nall, region_dev);
   int tt;
   for (tt = 0; tt < max_cpy_trial; ++tt) {
-    // Tensor cpy_temp;
-    TensorShape cpy_shape;
-    cpy_shape.AddDim(mem_cpy * 3);
-    context->allocate_temp(DataTypeToEnum<FPTYPE>::value, cpy_shape,
-                           tensor_list + 3);
-    // Tensor t_temp;
-    TensorShape t_shape;
-    t_shape.AddDim(mem_cpy * 2);
-    context, context->allocate_temp(DT_INT32, t_shape, tensor_list + 4);
     coord_cpy = (*(tensor_list + 3)).flat<FPTYPE>().data();
     type_cpy = (*(tensor_list + 4)).flat<int>().data();
     idx_mapping = type_cpy + mem_cpy;
@@ -1879,6 +1940,21 @@ static int _norm_copy_coord_gpu(OpKernelContext* context,
       break;
     } else {
       mem_cpy *= 2;
+      // Tensor cpy_temp;
+      TensorShape cpy_shape;
+      cpy_shape.AddDim(mem_cpy * 3);
+      tensorflow::Status status = context->allocate_temp(
+          DataTypeToEnum<FPTYPE>::value, cpy_shape, tensor_list + 3);
+      if (!status.ok()) {
+        return false;
+      }
+      // Tensor t_temp;
+      TensorShape t_shape;
+      t_shape.AddDim(mem_cpy * 2);
+      status = context->allocate_temp(DT_INT32, t_shape, tensor_list + 4);
+      if (!status.ok()) {
+        return false;
+      }
     }
   }
   region_dev.boxt = new_boxt;
@@ -1900,14 +1976,6 @@ static int _build_nlist_gpu(OpKernelContext* context,
                             const int& new_nall,
                             const int& max_nnei_trial,
                             const float& rcut_r) {
-  // Tensor nlist_temp;
-  TensorShape nlist_shape;
-  nlist_shape.AddDim(nloc * 2);
-  tensorflow::Status status =
-      context->allocate_temp(DT_INT32, nlist_shape, tensor_list);
-  if (!status.ok()) {
-    return false;
-  }
   ilist = (*tensor_list).flat<int>().data();
   numneigh = ilist + nloc;
   // Tensor jlist_temp;
@@ -1916,12 +1984,6 @@ static int _build_nlist_gpu(OpKernelContext* context,
   std::vector<int*> firstneigh_host(nloc);
   int tt;
   for (tt = 0; tt < max_nnei_trial; ++tt) {
-    TensorShape jlist_shape;
-    jlist_shape.AddDim(3 * int_64(nloc) * mem_nnei);
-    status = context->allocate_temp(DT_INT32, jlist_shape, tensor_list + 1);
-    if (!status.ok()) {
-      return false;
-    }
     jlist = (*(tensor_list + 1)).flat<int>().data();
     ind_data = jlist + nloc * mem_nnei;
     for (int_64 ii = 0; ii < nloc; ++ii) {
@@ -1935,6 +1997,13 @@ static int _build_nlist_gpu(OpKernelContext* context,
       break;
     } else {
       mem_nnei *= 2;
+      TensorShape jlist_shape;
+      jlist_shape.AddDim(3 * int_64(nloc) * mem_nnei);
+      tensorflow::Status status =
+          context->allocate_temp(DT_INT32, jlist_shape, tensor_list + 1);
+      if (!status.ok()) {
+        return false;
+      }
     }
   }
   return (tt != max_nnei_trial);

From 955df79bcf4d40b4f124ea3f74a289d568577036 Mon Sep 17 00:00:00 2001
From: Jinzhe Zeng <jinzhe.zeng@rutgers.edu>
Date: Wed, 20 Sep 2023 21:42:53 -0400
Subject: [PATCH 48/63] fix missing version file with setuptools-scm v8 (#2850)

See https://github.com/scikit-build/scikit-build-core/issues/507.

The version is now written to `{site_packages}/deepmd_cli/_version.py`.
The previous implementation was too complex.

---------

Signed-off-by: Jinzhe Zeng <jinzhe.zeng@rutgers.edu>
---
 deepmd/__init__.py |  2 +-
 deepmd_cli/main.py | 23 ++++-------------------
 pyproject.toml     |  7 ++++++-
 3 files changed, 11 insertions(+), 21 deletions(-)

diff --git a/deepmd/__init__.py b/deepmd/__init__.py
index 12af8f25b2..b02817b6fc 100644
--- a/deepmd/__init__.py
+++ b/deepmd/__init__.py
@@ -32,7 +32,7 @@
 set_mkl()
 
 try:
-    from ._version import version as __version__
+    from deepmd_cli._version import version as __version__
 except ImportError:
     from .__about__ import (
         __version__,
diff --git a/deepmd_cli/main.py b/deepmd_cli/main.py
index fceca239ea..8aa6785681 100644
--- a/deepmd_cli/main.py
+++ b/deepmd_cli/main.py
@@ -1,31 +1,16 @@
 # SPDX-License-Identifier: LGPL-3.0-or-later
 import argparse
-import importlib.util
 import logging
-import os
-import sys
 import textwrap
 from typing import (
     List,
     Optional,
 )
 
-
-def load_child_module(name):
-    """Load a child module without loading its parent module."""
-    names = name.split(".")
-    parent_spec = importlib.util.find_spec(names[0])
-    paths = os.path.join(*names[1:]) + ".py"
-    spec = importlib.util.spec_from_file_location(
-        name, os.path.join(parent_spec.submodule_search_locations[0], paths)
-    )
-    module = importlib.util.module_from_spec(spec)
-    sys.modules[name] = module
-    spec.loader.exec_module(module)
-    return module
-
-
-__version__ = load_child_module("deepmd._version").__version__
+try:
+    from deepmd_cli._version import version as __version__
+except ImportError:
+    __version__ = "unknown"
 
 
 def get_ll(log_level: str) -> int:
diff --git a/pyproject.toml b/pyproject.toml
index 8a63a8727e..f14de6f85e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -59,7 +59,6 @@ documentation = "https://docs.deepmodeling.com/projects/deepmd"
 repository = "https://github.com/deepmodeling/deepmd-kit"
 
 [tool.setuptools_scm]
-write_to = "deepmd/_version.py"
 
 [tool.scikit-build]
 experimental = true
@@ -97,6 +96,12 @@ provider-path = "backend"
 provider = "backend.dynamic_metadata"
 provider-path = "backend"
 
+[[tool.scikit-build.generate]]
+path = "deepmd_cli/_version.py"
+template = '''
+version = "${version}"
+'''
+
 [tool.cibuildwheel]
 test-command = [
     "python -m deepmd -h",

From 80b21958b64feff71066fdd635b85321fee6733a Mon Sep 17 00:00:00 2001
From: Jinzhe Zeng <jinzhe.zeng@rutgers.edu>
Date: Wed, 20 Sep 2023 22:11:14 -0400
Subject: [PATCH 49/63] build linux-aarch64 wheel on self-hosted runner (#2851)

A native ARM64 runner should be much faster than using QEMU on x86. The
runner is a free Oracle ARM Ampere A1 Compute instance.

---------

Signed-off-by: Jinzhe Zeng <jinzhe.zeng@rutgers.edu>
---
 .github/workflows/build_wheel.yml | 20 ++++++++++++++++++--
 1 file changed, 18 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/build_wheel.yml b/.github/workflows/build_wheel.yml
index 49ed433609..2dcec8c0bd 100644
--- a/.github/workflows/build_wheel.yml
+++ b/.github/workflows/build_wheel.yml
@@ -5,8 +5,24 @@ on:
   pull_request:
 
 jobs:
+  determine-arm64-runner:
+    runs-on: ubuntu-latest
+    permissions: read-all
+    outputs:
+      runner: ${{ steps.set-runner.outputs.runner }}
+    steps:
+      - name: Determine which runner to use for ARM64 build
+        id: set-runner
+        run: |
+          if [ "${{ github.repository_owner }}" == "deepmodeling" ]; then
+            echo "runner=[\"Linux\",\"ARM64\"]" >> $GITHUB_OUTPUT
+          else
+            echo "runner=\"ubuntu-latest\"" >> $GITHUB_OUTPUT
+          fi
+
   build_wheels:
     name: Build wheels for cp${{ matrix.python }}-${{ matrix.platform_id }}
+    needs: determine-arm64-runner
     runs-on: ${{ matrix.os }}
     strategy:
       fail-fast: false
@@ -33,7 +49,7 @@ jobs:
             platform_id: win_amd64
             dp_variant: cpu
           # linux-aarch64
-          - os: ubuntu-latest
+          - os: ${{ fromJson(needs.determine-arm64-runner.outputs.runner) }}
             python: 310
             platform_id: manylinux_aarch64
             dp_variant: cpu
@@ -45,7 +61,7 @@ jobs:
           fetch-depth: 0
       - uses: docker/setup-qemu-action@v3
         name: Setup QEMU
-        if: matrix.platform_id == 'manylinux_aarch64'
+        if: matrix.platform_id == 'manylinux_aarch64' && matrix.os == 'ubuntu-latest'
       - name: Build wheels
         uses: pypa/cibuildwheel@v2.15
         env:

From 544875ed570cbbf7fe9cc545565f062417666fe9 Mon Sep 17 00:00:00 2001
From: Jinzhe Zeng <jinzhe.zeng@rutgers.edu>
Date: Thu, 21 Sep 2023 00:57:06 -0400
Subject: [PATCH 50/63] add test cuda workflow (#2848)

Signed-off-by: Jinzhe Zeng <jinzhe.zeng@rutgers.edu>
---
 .github/workflows/remove_test_cuda_label.yml | 18 ++++++
 .github/workflows/test_cuda.yml              | 60 ++++++++++++++++++++
 doc/development/cicd.md                      | 15 +++++
 doc/index.rst                                |  8 ++-
 source/install/test_cc.sh                    |  8 ++-
 source/install/test_cc_local.sh              |  8 ++-
 source/lmp/plugin/CMakeLists.txt             |  5 ++
 7 files changed, 117 insertions(+), 5 deletions(-)
 create mode 100644 .github/workflows/remove_test_cuda_label.yml
 create mode 100644 .github/workflows/test_cuda.yml
 create mode 100644 doc/development/cicd.md

diff --git a/.github/workflows/remove_test_cuda_label.yml b/.github/workflows/remove_test_cuda_label.yml
new file mode 100644
index 0000000000..4702814f7e
--- /dev/null
+++ b/.github/workflows/remove_test_cuda_label.yml
@@ -0,0 +1,18 @@
+on:
+  pull_request_target:
+    types:
+      - "labeled"
+name: Test CUDA
+jobs:
+  remove_label:
+    permissions:
+      contents: read
+      pull-requests: write
+    # so one can re-trigger the workflow without manually removing the label
+    runs-on: ubuntu-latest
+    if: github.repository_owner == 'deepmodeling' && github.event.label.name == 'Test CUDA'
+    steps:
+    - uses: actions-ecosystem/action-remove-labels@v1
+      with:
+        labels: Test CUDA
+        number: ${{ github.event.pull_request.number }}
diff --git a/.github/workflows/test_cuda.yml b/.github/workflows/test_cuda.yml
new file mode 100644
index 0000000000..adc20c27a9
--- /dev/null
+++ b/.github/workflows/test_cuda.yml
@@ -0,0 +1,60 @@
+on:
+  # manually trigger
+  workflow_dispatch:
+  pull_request:
+    types:
+      - "labeled"
+name: Test CUDA
+jobs:
+  test_cuda:
+    name: Test Python and C++ on CUDA
+    runs-on: nvidia
+    if: github.repository_owner == 'deepmodeling' && github.event.label.name == 'Test CUDA' || github.event_name == 'workflow_dispatch'
+    steps:
+    - uses: actions/checkout@v4
+    - uses: actions/setup-python@v4
+      with:
+        python-version: '3.11'
+        cache: 'pip'
+    - name: Setup MPI
+      uses: mpi4py/setup-mpi@v1
+      with:
+        mpi: mpich
+    - uses: lukka/get-cmake@latest
+    - run: |
+         wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.0-1_all.deb \
+         && sudo dpkg -i cuda-keyring_1.0-1_all.deb \
+         && sudo apt-get update \
+         && sudo apt-get -y install cuda-11-8 libcudnn8=8.9.5.*-1+cuda11.8
+    - run: python -m pip install -U "pip>=21.3.1,!=23.0.0"
+    - run: pip install -v -e .[gpu,test,lmp,cu11] "ase @ https://github.com/rosswhitfield/ase/archive/edd03571aff6944b77b4a4b055239f3c3e4eeb66.zip"
+      env:
+        DP_BUILD_TESTING: 1
+        DP_VARIANT: cuda
+        CUDA_PATH: /usr/local/cuda-11.8
+    - run: dp --version
+    - run: pytest -s --cov=deepmd --cov=deepmd_cli source/tests --durations=0
+    - run: source/install/test_cc_local.sh
+      env:
+        OMP_NUM_THREADS: 1
+        TF_INTRA_OP_PARALLELISM_THREADS: 1
+        TF_INTER_OP_PARALLELISM_THREADS: 1
+        LMP_CXX11_ABI_0: 1
+        CMAKE_GENERATOR: Ninja
+        DP_VARIANT: cuda
+        DP_USE_MPICH2: 1
+        CUDA_PATH: /usr/local/cuda-11.8
+    - run: |
+        export LD_LIBRARY_PATH=${{ github.workspace }}/dp_test/lib:$CUDA_PATH/lib64:$LD_LIBRARY_PATH
+        export PATH=${{ github.workspace }}/dp_test/bin:$PATH
+        pytest -s --cov=deepmd source/lmp/tests
+        pytest -s --cov=deepmd source/ipi/tests
+      env:
+        OMP_NUM_THREADS: 1
+        TF_INTRA_OP_PARALLELISM_THREADS: 1
+        TF_INTER_OP_PARALLELISM_THREADS: 1
+        LAMMPS_PLUGIN_PATH: ${{ github.workspace }}/dp_test/lib/deepmd_lmp
+        CUDA_PATH: /usr/local/cuda-11.8
+    - uses: codecov/codecov-action@v3
+      with:
+        gcov: true
diff --git a/doc/development/cicd.md b/doc/development/cicd.md
new file mode 100644
index 0000000000..b323a62385
--- /dev/null
+++ b/doc/development/cicd.md
@@ -0,0 +1,15 @@
+# CI/CD
+
+<!-- TODO: To be written... -->
+
+## CI
+
+<!-- TODO: To be written... -->
+
+### Test CUDA
+
+`Test CUDA` action runs tests on a self-hosted runner with the NVIDIA card. It is not triggered by every PR. The developer who has the permission to manage the label can apply the label `Test CUDA` to a PR to trigger this action.
+
+<!-- ## CD -->
+
+<!-- TODO: To be written... -->
diff --git a/doc/index.rst b/doc/index.rst
index 0924328b26..b60430b566 100644
--- a/doc/index.rst
+++ b/doc/index.rst
@@ -52,7 +52,6 @@ DeePMD-kit is a package written in Python/C++, designed to minimize the effort r
 .. toctree::
    :maxdepth: 2
    :caption: Tutorial
-   :glob:
 
    Tutorials <https://tutorials.deepmodeling.com/>
    Publications <https://deepmodeling.com/blog/papers/deepmd-kit/>
@@ -62,9 +61,12 @@ DeePMD-kit is a package written in Python/C++, designed to minimize the effort r
 .. toctree::
    :maxdepth: 5
    :caption: Developer Guide
-   :glob:
 
-   development/*
+   development/cmake
+   development/create-a-model
+   development/type-embedding
+   development/coding-conventions
+   development/cicd
    api_py/api_py
    api_op
    API_CC/api_cc
diff --git a/source/install/test_cc.sh b/source/install/test_cc.sh
index eeff8c47bc..c874e3bf6c 100755
--- a/source/install/test_cc.sh
+++ b/source/install/test_cc.sh
@@ -1,5 +1,11 @@
 set -e
 
+if [ "$DP_VARIANT" = "cuda" ]; then
+	CUDA_ARGS="-DUSE_CUDA_TOOLKIT=TRUE"
+elif [ "$DP_VARIANT" = "rocm" ]; then
+	CUDA_ARGS="-DUSE_ROCM_TOOLKIT=TRUE"
+fi
+
 #------------------
 
 SCRIPT_PATH=$(dirname $(realpath -s $0))
@@ -11,7 +17,7 @@ INSTALL_PREFIX=${SCRIPT_PATH}/../../dp_test
 BUILD_TMP_DIR=${SCRIPT_PATH}/../build_tests
 mkdir -p ${BUILD_TMP_DIR}
 cd ${BUILD_TMP_DIR}
-cmake -DINSTALL_TENSORFLOW=TRUE -DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX} -DTENSORFLOW_ROOT=${INSTALL_PREFIX} -DBUILD_TESTING:BOOL=TRUE -DLAMMPS_VERSION=stable_2Aug2023 ..
+cmake -DINSTALL_TENSORFLOW=TRUE -DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX} -DTENSORFLOW_ROOT=${INSTALL_PREFIX} -DBUILD_TESTING:BOOL=TRUE -DLAMMPS_VERSION=stable_2Aug2023 ${CUDA_ARGS} ..
 cmake --build . -j${NPROC}
 cmake --install .
 ctest --output-on-failure
diff --git a/source/install/test_cc_local.sh b/source/install/test_cc_local.sh
index 14f86a6646..49f221825b 100755
--- a/source/install/test_cc_local.sh
+++ b/source/install/test_cc_local.sh
@@ -1,5 +1,11 @@
 set -e
 
+if [ "$DP_VARIANT" = "cuda" ]; then
+	CUDA_ARGS="-DUSE_CUDA_TOOLKIT=TRUE"
+elif [ "$DP_VARIANT" = "rocm" ]; then
+	CUDA_ARGS="-DUSE_ROCM_TOOLKIT=TRUE"
+fi
+
 #------------------
 
 SCRIPT_PATH=$(dirname $(realpath -s $0))
@@ -12,7 +18,7 @@ INSTALL_PREFIX=${SCRIPT_PATH}/../../dp_test
 BUILD_TMP_DIR=${SCRIPT_PATH}/../build_tests
 mkdir -p ${BUILD_TMP_DIR}
 cd ${BUILD_TMP_DIR}
-cmake -DINSTALL_TENSORFLOW=FALSE -DUSE_TF_PYTHON_LIBS=TRUE -DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX} -DBUILD_TESTING:BOOL=TRUE -DLAMMPS_VERSION=stable_2Aug2023 ..
+cmake -DINSTALL_TENSORFLOW=FALSE -DUSE_TF_PYTHON_LIBS=TRUE -DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX} -DBUILD_TESTING:BOOL=TRUE -DLAMMPS_VERSION=stable_2Aug2023 ${CUDA_ARGS} ..
 cmake --build . -j${NPROC}
 cmake --install .
 ctest --output-on-failure
diff --git a/source/lmp/plugin/CMakeLists.txt b/source/lmp/plugin/CMakeLists.txt
index 86b99fe7b5..9b5f68b574 100644
--- a/source/lmp/plugin/CMakeLists.txt
+++ b/source/lmp/plugin/CMakeLists.txt
@@ -19,6 +19,11 @@ if(DEFINED LAMMPS_SOURCE_ROOT OR DEFINED LAMMPS_VERSION)
 
   target_include_directories(lammps_interface INTERFACE ${LAMMPS_HEADER_DIR})
 
+  if("$ENV{DP_USE_MPICH2}" STREQUAL "1")
+    # See https://stackoverflow.com/a/47976518/9567349
+    set(MPI_EXECUTABLE_SUFFIX ".mpich")
+  endif()
+
   find_package(MPI)
   if(MPI_FOUND)
     set(LAMMPS_MPI_INCLUDE_DIRS ${MPI_CXX_INCLUDE_DIRS})

From 0f07afa60d9d56e60eb3be0da1fe4b1f37340a66 Mon Sep 17 00:00:00 2001
From: Jinzhe Zeng <jinzhe.zeng@rutgers.edu>
Date: Fri, 22 Sep 2023 08:38:19 -0400
Subject: [PATCH 51/63] merge cuda and rocm files (#2844)

Merge `source/lib/src/cuda` and `source/lib/src/rocm` into
`source/lib/src/gpu`.

- Define macros `gpuGetLastError`, `gpuDeviceSynchronize`, `gpuMemcpy`,
`gpuMemcpyDeviceToHost`, `gpuMemcpyHostToDevice`, and `gpuMemset` to
make them available for both CUDA and ROCm.
- Use the `<<< >>>` syntax for both CUDA and ROCm. Per
https://github.com/ROCm-Developer-Tools/HIP/commit/cf78d85638b22a586317dfee4514d4590e7b2eec,
it has been supported in HIP since 2018.
- Fix several int const numbers that should be double or float.
- For tabulate:
- Fix `WARP_SIZE` for ROCm. Per
https://github.com/pytorch/pytorch/pull/64302, WARP_SIZE can be 32 or
64, so it should not be hardcoded to 64.
- Add `GpuShuffleSync`. Per
https://github.com/ROCm-Developer-Tools/HIP/issues/1491, `__shfl_sync`
is not supported by HIP.
  - After merging the code, #1274 should also work for ROCm.
- Use the same `ii` for #830 and #2357. Both of them work, but `ii` had
different meanings in these two PRs; now it should be the same.
- However, `ii` in `tabulate_fusion_se_a_fifth_order_polynomial` (rocm)
added by #2532 is wrong. After merging the codes, it should be
corrected.
  - Optimization in #830 was not applied to ROCm.
  - `__syncwarp` is not supported by ROCm.
- After merging the code, #2661 will be applied to ROCm. Although TF
ROCm stream is still blocking
(https://github.com/tensorflow/tensorflow/blob/9d1262082e761cd85d6726bcbdfdef331d6d72c6/tensorflow/compiler/xla/stream_executor/rocm/rocm_driver.cc#L566),
we don't know whether it will change to non-blocking.
- There are several other differences between CUDA and ROCm.

---------

Signed-off-by: Jinzhe Zeng <jinzhe.zeng@rutgers.edu>
---
 .github/labeler.yml                           |    4 +-
 .gitmodules                                   |    4 +-
 .pre-commit-config.yaml                       |    4 +-
 doc/install/install-from-source.md            |    4 +-
 source/lib/CMakeLists.txt                     |    4 +-
 source/lib/include/gpu_cuda.h                 |    7 +
 source/lib/include/gpu_rocm.h                 |    7 +
 source/lib/src/cuda/CMakeLists.txt            |   60 -
 source/lib/src/gpu/CMakeLists.txt             |   95 ++
 source/lib/src/{cuda => gpu}/coord.cu         |   50 +-
 source/lib/src/{cuda => gpu}/cub              |    0
 .../src/{cuda => gpu}/cudart/CMakeLists.txt   |    0
 .../cudart/cuda_runtime_10_0.inc              |    0
 .../cudart/cuda_runtime_10_1.inc              |    0
 .../cudart/cuda_runtime_10_2.inc              |    0
 .../cudart/cuda_runtime_11_0.inc              |    0
 .../cudart/cuda_runtime_11_2.inc              |    0
 .../cudart/cuda_runtime_11_8.inc              |    0
 .../cudart/cuda_runtime_12_0.inc              |    0
 .../{cuda => gpu}/cudart/cuda_runtime_9_0.inc |    0
 .../src/{cuda => gpu}/cudart/cudart_stub.cc   |    0
 source/lib/src/{cuda => gpu}/gelu.cu          |   26 +-
 source/lib/src/{cuda => gpu}/neighbor_list.cu |   57 +-
 source/lib/src/{cuda => gpu}/prod_env_mat.cu  |  121 +-
 source/lib/src/{cuda => gpu}/prod_force.cu    |   32 +-
 .../lib/src/{cuda => gpu}/prod_force_grad.cu  |   28 +-
 source/lib/src/{cuda => gpu}/prod_virial.cu   |   32 +-
 .../lib/src/{cuda => gpu}/prod_virial_grad.cu |   20 +-
 source/lib/src/{cuda => gpu}/region.cu        |   24 +-
 source/lib/src/{cuda => gpu}/tabulate.cu      |  242 ++--
 source/lib/src/rocm/CMakeLists.txt            |   39 -
 source/lib/src/rocm/coord.hip.cu              |  444 -------
 source/lib/src/rocm/gelu.hip.cu               |  134 ---
 source/lib/src/rocm/neighbor_list.hip.cu      |  296 -----
 source/lib/src/rocm/prod_env_mat.hip.cu       |  821 -------------
 source/lib/src/rocm/prod_force.hip.cu         |  193 ---
 source/lib/src/rocm/prod_force_grad.hip.cu    |  168 ---
 source/lib/src/rocm/prod_virial.hip.cu        |  197 ----
 source/lib/src/rocm/prod_virial_grad.hip.cu   |  154 ---
 source/lib/src/rocm/region.hip.cu             |   65 --
 source/lib/src/rocm/tabulate.hip.cu           | 1036 -----------------
 41 files changed, 490 insertions(+), 3878 deletions(-)
 delete mode 100644 source/lib/src/cuda/CMakeLists.txt
 create mode 100644 source/lib/src/gpu/CMakeLists.txt
 rename source/lib/src/{cuda => gpu}/coord.cu (93%)
 rename source/lib/src/{cuda => gpu}/cub (100%)
 rename source/lib/src/{cuda => gpu}/cudart/CMakeLists.txt (100%)
 rename source/lib/src/{cuda => gpu}/cudart/cuda_runtime_10_0.inc (100%)
 rename source/lib/src/{cuda => gpu}/cudart/cuda_runtime_10_1.inc (100%)
 rename source/lib/src/{cuda => gpu}/cudart/cuda_runtime_10_2.inc (100%)
 rename source/lib/src/{cuda => gpu}/cudart/cuda_runtime_11_0.inc (100%)
 rename source/lib/src/{cuda => gpu}/cudart/cuda_runtime_11_2.inc (100%)
 rename source/lib/src/{cuda => gpu}/cudart/cuda_runtime_11_8.inc (100%)
 rename source/lib/src/{cuda => gpu}/cudart/cuda_runtime_12_0.inc (100%)
 rename source/lib/src/{cuda => gpu}/cudart/cuda_runtime_9_0.inc (100%)
 rename source/lib/src/{cuda => gpu}/cudart/cudart_stub.cc (100%)
 rename source/lib/src/{cuda => gpu}/gelu.cu (90%)
 rename source/lib/src/{cuda => gpu}/neighbor_list.cu (89%)
 rename source/lib/src/{cuda => gpu}/prod_env_mat.cu (93%)
 rename source/lib/src/{cuda => gpu}/prod_force.cu (91%)
 rename source/lib/src/{cuda => gpu}/prod_force_grad.cu (91%)
 rename source/lib/src/{cuda => gpu}/prod_virial.cu (91%)
 rename source/lib/src/{cuda => gpu}/prod_virial_grad.cu (92%)
 rename source/lib/src/{cuda => gpu}/region.cu (83%)
 rename source/lib/src/{cuda => gpu}/tabulate.cu (86%)
 delete mode 100644 source/lib/src/rocm/CMakeLists.txt
 delete mode 100644 source/lib/src/rocm/coord.hip.cu
 delete mode 100644 source/lib/src/rocm/gelu.hip.cu
 delete mode 100644 source/lib/src/rocm/neighbor_list.hip.cu
 delete mode 100644 source/lib/src/rocm/prod_env_mat.hip.cu
 delete mode 100644 source/lib/src/rocm/prod_force.hip.cu
 delete mode 100644 source/lib/src/rocm/prod_force_grad.hip.cu
 delete mode 100644 source/lib/src/rocm/prod_virial.hip.cu
 delete mode 100644 source/lib/src/rocm/prod_virial_grad.hip.cu
 delete mode 100644 source/lib/src/rocm/region.hip.cu
 delete mode 100644 source/lib/src/rocm/tabulate.hip.cu

diff --git a/.github/labeler.yml b/.github/labeler.yml
index ad10d7cb7d..195d2cd217 100644
--- a/.github/labeler.yml
+++ b/.github/labeler.yml
@@ -5,8 +5,8 @@ Python:
 Docs: doc/**/*
 Examples: examples/**/*
 Core: source/lib/**/*
-CUDA: source/lib/src/cuda/**/*
-ROCM: source/lib/src/rocm/**/*
+CUDA: source/lib/src/gpu/**/*
+ROCM: source/lib/src/gpu/**/*
 OP: source/op/**/*
 C++: source/api_cc/**/*
 C: source/api_c/**/*
diff --git a/.gitmodules b/.gitmodules
index 7f3510b9d6..849b21ced5 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,3 +1,3 @@
-[submodule "source/lib/src/cuda/cub"]
-	path = source/lib/src/cuda/cub
+[submodule "source/lib/src/gpu/cub"]
+	path = source/lib/src/gpu/cub
 	url = https://github.com/NVIDIA/cub.git
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 8125324ea1..7ea4915f6e 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -53,7 +53,7 @@ repos:
     rev: v16.0.6
     hooks:
     -   id: clang-format
-        exclude: ^source/3rdparty|source/lib/src/cuda/cudart/.+\.inc
+        exclude: ^source/3rdparty|source/lib/src/gpu/cudart/.+\.inc
 # CSS
 -   repo: https://github.com/pre-commit/mirrors-csslint
     rev: v1.0.5
@@ -83,7 +83,7 @@ repos:
         - --comment-style
         - //
         - --no-extra-eol
-        exclude: ^source/3rdparty|source/lib/src/cuda/cudart/.+\.inc
+        exclude: ^source/3rdparty|source/lib/src/gpu/cudart/.+\.inc
     # CSS
     -   id: insert-license
         files: \.(css|scss)$
diff --git a/doc/install/install-from-source.md b/doc/install/install-from-source.md
index e6a4b1a7cb..1447823c08 100644
--- a/doc/install/install-from-source.md
+++ b/doc/install/install-from-source.md
@@ -74,7 +74,7 @@ One may set the following environment variables before executing `pip`:
 | Environment variables | Allowed value          | Default value | Usage                      |
 | --------------------- | ---------------------- | ------------- | -------------------------- |
 | DP_VARIANT            | `cpu`, `cuda`, `rocm`  | `cpu`         | Build CPU variant or GPU variant with CUDA or ROCM support. |
-| CUDAToolkit_ROOT | Path                   | Detected automatically | The path to the CUDA toolkit directory. CUDA 7.0 or later is supported. NVCC is required. |
+| CUDAToolkit_ROOT | Path                   | Detected automatically | The path to the CUDA toolkit directory. CUDA 9.0 or later is supported. NVCC is required. |
 | ROCM_ROOT             | Path                   | Detected automatically | The path to the ROCM toolkit directory. |
 | TENSORFLOW_ROOT       | Path                   | Detected automatically | The path to TensorFlow Python library. By default the installer only finds TensorFlow under user site-package directory (`site.getusersitepackages()`) or system site-package directory (`sysconfig.get_path("purelib")`) due to limitation of [PEP-517](https://peps.python.org/pep-0517/). If not found, the latest TensorFlow (or the environment variable `TENSORFLOW_VERSION` if given) from PyPI will be built against.|
 | DP_ENABLE_NATIVE_OPTIMIZATION | 0, 1           | 0             | Enable compilation optimization for the native machine's CPU type. Do not enable it if generated code will run on different CPUs. |
@@ -188,7 +188,7 @@ One may add the following arguments to `cmake`:
 | -DTENSORFLOW_ROOT=&lt;value&gt;  | Path              | -             | The Path to TensorFlow's C++ interface. |
 | -DCMAKE_INSTALL_PREFIX=&lt;value&gt; | Path          | -             | The Path where DeePMD-kit will be installed. |
 | -DUSE_CUDA_TOOLKIT=&lt;value&gt; | `TRUE` or `FALSE` | `FALSE`       | If `TRUE`, Build GPU support with CUDA toolkit. |
-| -DCUDAToolkit_ROOT=&lt;value&gt; | Path         | Detected automatically | The path to the CUDA toolkit directory. CUDA 7.0 or later is supported. NVCC is required. |
+| -DCUDAToolkit_ROOT=&lt;value&gt; | Path         | Detected automatically | The path to the CUDA toolkit directory. CUDA 9.0 or later is supported. NVCC is required. |
 | -DUSE_ROCM_TOOLKIT=&lt;value&gt; | `TRUE` or `FALSE` | `FALSE`       | If `TRUE`, Build GPU support with ROCM toolkit. |
 | -DCMAKE_HIP_COMPILER_ROCM_ROOT=&lt;value&gt; | Path         | Detected automatically | The path to the ROCM toolkit directory. |
 | -DLAMMPS_SOURCE_ROOT=&lt;value&gt; | Path         | - | Only neccessary for LAMMPS plugin mode. The path to the [LAMMPS source code](install-lammps.md). LAMMPS 8Apr2021 or later is supported. If not assigned, the plugin mode will not be enabled. |
diff --git a/source/lib/CMakeLists.txt b/source/lib/CMakeLists.txt
index 5f5528de3e..323bf2d7c0 100644
--- a/source/lib/CMakeLists.txt
+++ b/source/lib/CMakeLists.txt
@@ -11,7 +11,7 @@ target_include_directories(
 
 if(USE_CUDA_TOOLKIT)
   add_definitions("-DGOOGLE_CUDA")
-  add_subdirectory(src/cuda)
+  add_subdirectory(src/gpu)
   set(EXTRA_LIBS ${EXTRA_LIBS} deepmd_op_cuda)
   target_link_libraries(${libname} INTERFACE deepmd_dyn_cudart ${EXTRA_LIBS})
   # gpu_cuda.h
@@ -22,7 +22,7 @@ endif()
 
 if(USE_ROCM_TOOLKIT)
   add_definitions("-DTENSORFLOW_USE_ROCM")
-  add_subdirectory(src/rocm)
+  add_subdirectory(src/gpu)
   set(EXTRA_LIBS ${EXTRA_LIBS} deepmd_op_rocm)
   target_link_libraries(${libname} INTERFACE ${ROCM_LIBRARIES} ${EXTRA_LIBS})
   # gpu_rocm.h
diff --git a/source/lib/include/gpu_cuda.h b/source/lib/include/gpu_cuda.h
index bf8c325b14..73dfed1404 100644
--- a/source/lib/include/gpu_cuda.h
+++ b/source/lib/include/gpu_cuda.h
@@ -8,6 +8,13 @@
 
 #include "errors.h"
 
+#define gpuGetLastError cudaGetLastError
+#define gpuDeviceSynchronize cudaDeviceSynchronize
+#define gpuMemcpy cudaMemcpy
+#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost
+#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice
+#define gpuMemset cudaMemset
+
 #define GPU_MAX_NBOR_SIZE 4096
 #define DPErrcheck(res) \
   { DPAssert((res), __FILE__, __LINE__); }
diff --git a/source/lib/include/gpu_rocm.h b/source/lib/include/gpu_rocm.h
index 4c3c1b41a9..3a65a57b01 100644
--- a/source/lib/include/gpu_rocm.h
+++ b/source/lib/include/gpu_rocm.h
@@ -11,6 +11,13 @@
 
 #define GPU_MAX_NBOR_SIZE 4096
 
+#define gpuGetLastError hipGetLastError
+#define gpuDeviceSynchronize hipDeviceSynchronize
+#define gpuMemcpy hipMemcpy
+#define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost
+#define gpuMemcpyHostToDevice hipMemcpyHostToDevice
+#define gpuMemset hipMemset
+
 #define DPErrcheck(res) \
   { DPAssert((res), __FILE__, __LINE__); }
 inline void DPAssert(hipError_t code,
diff --git a/source/lib/src/cuda/CMakeLists.txt b/source/lib/src/cuda/CMakeLists.txt
deleted file mode 100644
index 1d5ae690e1..0000000000
--- a/source/lib/src/cuda/CMakeLists.txt
+++ /dev/null
@@ -1,60 +0,0 @@
-# required cmake version 3.23: CMAKE_CUDA_ARCHITECTURES all
-cmake_minimum_required(VERSION 3.23)
-# project name
-project(deepmd_op_cuda)
-
-set(CMAKE_CUDA_ARCHITECTURES all)
-enable_language(CUDA)
-set(CMAKE_CUDA_STANDARD 11)
-add_compile_definitions(
-  "$<$<COMPILE_LANGUAGE:CUDA>:_GLIBCXX_USE_CXX11_ABI=${OP_CXX_ABI}>")
-
-find_package(CUDAToolkit REQUIRED)
-
-# take dynamic open cudart library replace of static one so it's not required
-# when using CPUs
-add_subdirectory(cudart)
-
-# nvcc -o libdeepmd_op_cuda.so -I/usr/local/cub-1.8.0 -rdc=true -DHIGH_PREC=true
-# -gencode arch=compute_61,code=sm_61 -shared -Xcompiler -fPIC deepmd_op.cu
-# -L/usr/local/cuda/lib64 -lcudadevrt very important here! Include path to cub.
-# for searching device compute capability,
-# https://developer.nvidia.com/cuda-gpus
-
-# cub has been included in CUDA Toolkit 11, we do not need to include it any
-# more see https://github.com/NVIDIA/cub
-if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_LESS "11")
-  include_directories(cub)
-endif()
-
-message(STATUS "NVCC version is " ${CMAKE_CUDA_COMPILER_VERSION})
-
-# arch will be configured by CMAKE_CUDA_ARCHITECTURES
-set(CMAKE_CUDA_FLAGS
-    "${CMAKE_CUDA_FLAGS} -DCUB_IGNORE_DEPRECATED_CPP_DIALECT -DCUB_IGNORE_DEPRECATED_CPP_DIALECT"
-)
-
-file(GLOB SOURCE_FILES "*.cu")
-
-add_library(deepmd_op_cuda SHARED ${SOURCE_FILES})
-target_link_libraries(deepmd_op_cuda PRIVATE deepmd_dyn_cudart)
-target_include_directories(
-  deepmd_op_cuda
-  PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../../include/>
-         $<INSTALL_INTERFACE:include>)
-target_precompile_headers(deepmd_op_cuda PUBLIC [["device.h"]])
-if(APPLE)
-  set_target_properties(deepmd_op_cuda PROPERTIES INSTALL_RPATH @loader_path)
-else()
-  set_target_properties(deepmd_op_cuda PROPERTIES INSTALL_RPATH "$ORIGIN")
-endif()
-
-if(BUILD_CPP_IF AND NOT BUILD_PY_IF)
-  install(
-    TARGETS deepmd_op_cuda
-    EXPORT ${CMAKE_PROJECT_NAME}Targets
-    DESTINATION lib/)
-endif(BUILD_CPP_IF AND NOT BUILD_PY_IF)
-if(BUILD_PY_IF)
-  install(TARGETS deepmd_op_cuda DESTINATION deepmd/lib/)
-endif(BUILD_PY_IF)
diff --git a/source/lib/src/gpu/CMakeLists.txt b/source/lib/src/gpu/CMakeLists.txt
new file mode 100644
index 0000000000..25223c82bf
--- /dev/null
+++ b/source/lib/src/gpu/CMakeLists.txt
@@ -0,0 +1,95 @@
+if(USE_CUDA_TOOLKIT)
+  # required cmake version 3.23: CMAKE_CUDA_ARCHITECTURES all
+  cmake_minimum_required(VERSION 3.23)
+  # project name
+  project(deepmd_op_cuda)
+  set(GPU_LIB_NAME deepmd_op_cuda)
+
+  set(CMAKE_CUDA_ARCHITECTURES all)
+  enable_language(CUDA)
+  set(CMAKE_CUDA_STANDARD 11)
+  add_compile_definitions(
+    "$<$<COMPILE_LANGUAGE:CUDA>:_GLIBCXX_USE_CXX11_ABI=${OP_CXX_ABI}>")
+
+  find_package(CUDAToolkit REQUIRED)
+
+  # use the dynamically opened cudart library in place of the static one so
+  # that it is not required when using CPUs
+  add_subdirectory(cudart)
+
+  # nvcc -o libdeepmd_op_cuda.so -I/usr/local/cub-1.8.0 -rdc=true
+  # -DHIGH_PREC=true -gencode arch=compute_61,code=sm_61 -shared -Xcompiler
+  # -fPIC deepmd_op.cu -L/usr/local/cuda/lib64 -lcudadevrt very important here!
+  # Include path to cub. for searching device compute capability,
+  # https://developer.nvidia.com/cuda-gpus
+
+  # cub has been included since CUDA Toolkit 11, so we do not need to include
+  # it any more; see https://github.com/NVIDIA/cub
+  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_LESS "11")
+    include_directories(cub)
+  endif()
+  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_LESS "9")
+    message(FATAL_ERROR "CUDA version must be >= 9.0")
+  endif()
+
+  message(STATUS "NVCC version is " ${CMAKE_CUDA_COMPILER_VERSION})
+
+  # arch will be configured by CMAKE_CUDA_ARCHITECTURES
+  set(CMAKE_CUDA_FLAGS
+      "${CMAKE_CUDA_FLAGS} -DCUB_IGNORE_DEPRECATED_CPP_DIALECT -DCUB_IGNORE_DEPRECATED_CPP_DIALECT"
+  )
+
+  file(GLOB SOURCE_FILES "*.cu")
+
+  add_library(${GPU_LIB_NAME} SHARED ${SOURCE_FILES})
+  target_link_libraries(${GPU_LIB_NAME} PRIVATE deepmd_dyn_cudart)
+
+elseif(USE_ROCM_TOOLKIT)
+
+  # required cmake version
+  cmake_minimum_required(VERSION 3.21)
+  # project name
+  project(deepmd_op_rocm)
+  set(GPU_LIB_NAME deepmd_op_rocm)
+  set(CMAKE_LINK_WHAT_YOU_USE TRUE)
+
+  # set the C++ and HIP language standard to C++14
+  set(CMAKE_CXX_STANDARD 14)
+  set(CMAKE_HIP_STANDARD 14)
+  add_definitions("-DCUB_IGNORE_DEPRECATED_CPP_DIALECT")
+  add_definitions("-DCUB_IGNORE_DEPRECATED_CPP_DIALECT")
+
+  message(STATUS "HIP major version is " ${HIP_VERSION_MAJOR})
+
+  set(HIP_HIPCC_FLAGS -fno-gpu-rdc; -fPIC --std=c++14 ${HIP_HIPCC_FLAGS}
+  )# --amdgpu-target=gfx906
+  if(HIP_VERSION VERSION_LESS 3.5.1)
+    set(HIP_HIPCC_FLAGS -hc; ${HIP_HIPCC_FLAGS})
+  endif()
+
+  file(GLOB SOURCE_FILES "*.cu")
+
+  hip_add_library(${GPU_LIB_NAME} SHARED ${SOURCE_FILES})
+
+endif()
+
+target_include_directories(
+  ${GPU_LIB_NAME}
+  PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../../include/>
+         $<INSTALL_INTERFACE:include>)
+target_precompile_headers(${GPU_LIB_NAME} PUBLIC [["device.h"]])
+if(APPLE)
+  set_target_properties(${GPU_LIB_NAME} PROPERTIES INSTALL_RPATH @loader_path)
+else()
+  set_target_properties(${GPU_LIB_NAME} PROPERTIES INSTALL_RPATH "$ORIGIN")
+endif()
+
+if(BUILD_CPP_IF AND NOT BUILD_PY_IF)
+  install(
+    TARGETS ${GPU_LIB_NAME}
+    EXPORT ${CMAKE_PROJECT_NAME}Targets
+    DESTINATION lib/)
+endif(BUILD_CPP_IF AND NOT BUILD_PY_IF)
+if(BUILD_PY_IF)
+  install(TARGETS ${GPU_LIB_NAME} DESTINATION deepmd/lib/)
+endif(BUILD_PY_IF)
diff --git a/source/lib/src/cuda/coord.cu b/source/lib/src/gpu/coord.cu
similarity index 93%
rename from source/lib/src/cuda/coord.cu
rename to source/lib/src/gpu/coord.cu
index d37e5de9cf..52ec9ff09d 100644
--- a/source/lib/src/cuda/coord.cu
+++ b/source/lib/src/gpu/coord.cu
@@ -266,21 +266,21 @@ void compute_int_data(int *int_data,
   _fill_idx_cellmap<<<nblock_loc, TPB>>>(idx_cellmap, idx_cellmap_noshift, in_c,
                                          rec_boxt, nat_stt, nat_end, ext_stt,
                                          ext_end, nloc);
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
 
   const int nblock_loc_cellnum = (loc_cellnum + TPB - 1) / TPB;
   _fill_loc_cellnum_map<<<nblock_loc_cellnum, TPB>>>(
       temp_idx_order, loc_cellnum_map, idx_cellmap_noshift, nloc, loc_cellnum);
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
 
   const int nblock_total_cellnum = (total_cellnum + TPB - 1) / TPB;
   _fill_total_cellnum_map<<<nblock_total_cellnum, TPB>>>(
       total_cellnum_map, mask_cellnum_map, cell_map, cell_shift_map, nat_stt,
       nat_end, ext_stt, ext_end, loc_cellnum_map, total_cellnum);
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
 }
 
 void build_loc_clist(int *int_data,
@@ -297,8 +297,8 @@ void build_loc_clist(int *int_data,
                    total_cellnum * 3 + loc_cellnum + 1 + total_cellnum + 1;
   _build_loc_clist<<<nblock, TPB>>>(loc_clist, idx_cellmap_noshift,
                                     temp_idx_order, sec_loc_cellnum_map, nloc);
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
 }
 
 template <typename FPTYPE>
@@ -326,8 +326,8 @@ void copy_coord(FPTYPE *out_c,
                                cell_shift_map, sec_loc_cellnum_map,
                                sec_total_cellnum_map, loc_clist, nloc, nall,
                                total_cellnum, boxt, rec_boxt);
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
 }
 
 namespace deepmd {
@@ -335,14 +335,14 @@ template <typename FPTYPE>
 void normalize_coord_gpu(FPTYPE *coord,
                          const int natom,
                          const Region<FPTYPE> &region) {
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
   const FPTYPE *boxt = region.boxt;
   const FPTYPE *rec_boxt = region.rec_boxt;
   const int nblock = (natom + TPB - 1) / TPB;
   normalize_one<<<nblock, TPB>>>(coord, boxt, rec_boxt, natom);
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
 }
 
 //  int_data(temp cuda
@@ -362,16 +362,17 @@ int copy_coord_gpu(FPTYPE *out_c,
                    const int &total_cellnum,
                    const int *cell_info,
                    const Region<FPTYPE> &region) {
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
   compute_int_data(int_data, in_c, cell_info, region, nloc, loc_cellnum,
                    total_cellnum);
   int *int_data_cpu = new int
       [loc_cellnum + 2 * total_cellnum + loc_cellnum + 1 + total_cellnum +
        1];  // loc_cellnum_map,total_cellnum_map,mask_cellnum_map,sec_loc_cellnum_map,sec_total_cellnum_map
-  DPErrcheck(cudaMemcpy(int_data_cpu, int_data + 3 * nloc,
-                        sizeof(int) * (loc_cellnum + 2 * total_cellnum),
-                        cudaMemcpyDeviceToHost));
+  DPErrcheck(gpuMemcpy(int_data_cpu, int_data + 3 * nloc,
+                       sizeof(int) * (loc_cellnum + 2 * total_cellnum),
+                       gpuMemcpyDeviceToHost));
+  DPErrcheck(gpuGetLastError());
   int *loc_cellnum_map = int_data_cpu;
   int *total_cellnum_map = loc_cellnum_map + loc_cellnum;
   int *mask_cellnum_map = total_cellnum_map + total_cellnum;
@@ -397,11 +398,12 @@ int copy_coord_gpu(FPTYPE *out_c,
     // size of the output arrays is not large enough
     return 1;
   } else {
-    DPErrcheck(cudaMemcpy(int_data + nloc * 3 + loc_cellnum +
-                              total_cellnum * 3 + total_cellnum * 3,
-                          sec_loc_cellnum_map,
-                          sizeof(int) * (loc_cellnum + 1 + total_cellnum + 1),
-                          cudaMemcpyHostToDevice));
+    DPErrcheck(gpuMemcpy(int_data + nloc * 3 + loc_cellnum + total_cellnum * 3 +
+                             total_cellnum * 3,
+                         sec_loc_cellnum_map,
+                         sizeof(int) * (loc_cellnum + 1 + total_cellnum + 1),
+                         gpuMemcpyHostToDevice));
+    DPErrcheck(gpuGetLastError());
     delete[] int_data_cpu;
     build_loc_clist(int_data, nloc, loc_cellnum, total_cellnum);
     copy_coord(out_c, out_t, mapping, int_data, in_c, in_t, nloc, *nall,
diff --git a/source/lib/src/cuda/cub b/source/lib/src/gpu/cub
similarity index 100%
rename from source/lib/src/cuda/cub
rename to source/lib/src/gpu/cub
diff --git a/source/lib/src/cuda/cudart/CMakeLists.txt b/source/lib/src/gpu/cudart/CMakeLists.txt
similarity index 100%
rename from source/lib/src/cuda/cudart/CMakeLists.txt
rename to source/lib/src/gpu/cudart/CMakeLists.txt
diff --git a/source/lib/src/cuda/cudart/cuda_runtime_10_0.inc b/source/lib/src/gpu/cudart/cuda_runtime_10_0.inc
similarity index 100%
rename from source/lib/src/cuda/cudart/cuda_runtime_10_0.inc
rename to source/lib/src/gpu/cudart/cuda_runtime_10_0.inc
diff --git a/source/lib/src/cuda/cudart/cuda_runtime_10_1.inc b/source/lib/src/gpu/cudart/cuda_runtime_10_1.inc
similarity index 100%
rename from source/lib/src/cuda/cudart/cuda_runtime_10_1.inc
rename to source/lib/src/gpu/cudart/cuda_runtime_10_1.inc
diff --git a/source/lib/src/cuda/cudart/cuda_runtime_10_2.inc b/source/lib/src/gpu/cudart/cuda_runtime_10_2.inc
similarity index 100%
rename from source/lib/src/cuda/cudart/cuda_runtime_10_2.inc
rename to source/lib/src/gpu/cudart/cuda_runtime_10_2.inc
diff --git a/source/lib/src/cuda/cudart/cuda_runtime_11_0.inc b/source/lib/src/gpu/cudart/cuda_runtime_11_0.inc
similarity index 100%
rename from source/lib/src/cuda/cudart/cuda_runtime_11_0.inc
rename to source/lib/src/gpu/cudart/cuda_runtime_11_0.inc
diff --git a/source/lib/src/cuda/cudart/cuda_runtime_11_2.inc b/source/lib/src/gpu/cudart/cuda_runtime_11_2.inc
similarity index 100%
rename from source/lib/src/cuda/cudart/cuda_runtime_11_2.inc
rename to source/lib/src/gpu/cudart/cuda_runtime_11_2.inc
diff --git a/source/lib/src/cuda/cudart/cuda_runtime_11_8.inc b/source/lib/src/gpu/cudart/cuda_runtime_11_8.inc
similarity index 100%
rename from source/lib/src/cuda/cudart/cuda_runtime_11_8.inc
rename to source/lib/src/gpu/cudart/cuda_runtime_11_8.inc
diff --git a/source/lib/src/cuda/cudart/cuda_runtime_12_0.inc b/source/lib/src/gpu/cudart/cuda_runtime_12_0.inc
similarity index 100%
rename from source/lib/src/cuda/cudart/cuda_runtime_12_0.inc
rename to source/lib/src/gpu/cudart/cuda_runtime_12_0.inc
diff --git a/source/lib/src/cuda/cudart/cuda_runtime_9_0.inc b/source/lib/src/gpu/cudart/cuda_runtime_9_0.inc
similarity index 100%
rename from source/lib/src/cuda/cudart/cuda_runtime_9_0.inc
rename to source/lib/src/gpu/cudart/cuda_runtime_9_0.inc
diff --git a/source/lib/src/cuda/cudart/cudart_stub.cc b/source/lib/src/gpu/cudart/cudart_stub.cc
similarity index 100%
rename from source/lib/src/cuda/cudart/cudart_stub.cc
rename to source/lib/src/gpu/cudart/cudart_stub.cc
diff --git a/source/lib/src/cuda/gelu.cu b/source/lib/src/gpu/gelu.cu
similarity index 90%
rename from source/lib/src/cuda/gelu.cu
rename to source/lib/src/gpu/gelu.cu
index 823a843b2a..ac6020ea7a 100644
--- a/source/lib/src/cuda/gelu.cu
+++ b/source/lib/src/gpu/gelu.cu
@@ -32,7 +32,7 @@ __global__ void gelu_grad(FPTYPE* out,
             (xx[idx] + (FPTYPE)0.044715 * xx[idx] * xx[idx] * xx[idx]));
   out[idx] =
       dy[idx] * ((FPTYPE)0.5 * SQRT_2_PI * xx[idx] * ((FPTYPE)1. - var * var) *
-                     ((FPTYPE)0.134145 * xx[idx] * xx[idx] + 1) +
+                     ((FPTYPE)0.134145 * xx[idx] * xx[idx] + (FPTYPE)1.) +
                  (FPTYPE)0.5 * var + (FPTYPE)0.5);
 }
 
@@ -67,14 +67,14 @@ void gelu_gpu(FPTYPE* out, const FPTYPE* xx, const int_64 size) {
   if (size <= 0) {
     return;
   }
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
   const int THREAD_ITEMS = 1024;
   const int BLOCK_NUMS = (size + THREAD_ITEMS - 1) / THREAD_ITEMS;
 
   gelu<<<BLOCK_NUMS, THREAD_ITEMS>>>(out, xx, size);
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
 }
 
 template <typename FPTYPE>
@@ -85,14 +85,14 @@ void gelu_grad_gpu(FPTYPE* out,
   if (size <= 0) {
     return;
   }
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
   const int THREAD_ITEMS = 1024;
   const int BLOCK_NUMS = (size + THREAD_ITEMS - 1) / THREAD_ITEMS;
 
   gelu_grad<<<BLOCK_NUMS, THREAD_ITEMS>>>(out, xx, dy, size);
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
 }
 
 template <typename FPTYPE>
@@ -104,14 +104,14 @@ void gelu_grad_grad_gpu(FPTYPE* out,
   if (size <= 0) {
     return;
   }
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
   const int THREAD_ITEMS = 1024;
   const int BLOCK_NUMS = (size + THREAD_ITEMS - 1) / THREAD_ITEMS;
 
   gelu_grad_grad<<<BLOCK_NUMS, THREAD_ITEMS>>>(out, xx, dy, dy_2, size);
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
 }
 
 template void gelu_gpu<float>(float* out, const float* x, const int_64 size);
diff --git a/source/lib/src/cuda/neighbor_list.cu b/source/lib/src/gpu/neighbor_list.cu
similarity index 89%
rename from source/lib/src/cuda/neighbor_list.cu
rename to source/lib/src/gpu/neighbor_list.cu
index 7cac07690b..fc4e784915 100644
--- a/source/lib/src/cuda/neighbor_list.cu
+++ b/source/lib/src/gpu/neighbor_list.cu
@@ -1,4 +1,11 @@
+#if GOOGLE_CUDA
 #include <cub/block/block_scan.cuh>
+#elif TENSORFLOW_USE_ROCM
+#include <hipcub/hipcub.hpp>
+namespace cub = hipcub;
+#else
+#error "should not touch here"
+#endif
 
 #include "device.h"
 #include "neighbor_list.h"
@@ -187,13 +194,13 @@ int build_nlist_gpu(InputNlist &nlist,
   if (mem_size < nall) {
     return 1;
   }
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
   const int nblock = (nall + TPB - 1) / TPB;
   int *ilist = nlist.ilist;
   int *numneigh = nlist.numneigh;
   int **firstneigh = nlist.firstneigh;
-  DPErrcheck(cudaMemset(nlist_data, -1, sizeof(int) * 2 * nloc * mem_size));
+  DPErrcheck(gpuMemset(nlist_data, -1, sizeof(int) * 2 * nloc * mem_size));
   int *temp_nlist = nlist_data;  // nloc*mem_size
   int *nei_order = temp_nlist + nloc * mem_size;
   nlist.inum = nloc;
@@ -203,19 +210,19 @@ int build_nlist_gpu(InputNlist &nlist,
   dim3 thread_grid(1, TPB);
   build_nlist<<<block_grid, thread_grid>>>(ilist, temp_nlist, c_cpy, rcut2,
                                            nloc, nall, mem_size);
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
   parallel_prefix_scan<TPB>
       <<<nloc, TPB>>>(numneigh, nei_order, temp_nlist, mem_size, nloc, nall);
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
   fill_nlist<<<block_grid, thread_grid>>>(firstneigh, temp_nlist, nei_order,
                                           mem_size, nall);
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
   int *numneigh_host = new int[nloc];
-  DPErrcheck(cudaMemcpy(numneigh_host, numneigh, sizeof(int) * nloc,
-                        cudaMemcpyDeviceToHost));
+  DPErrcheck(gpuMemcpy(numneigh_host, numneigh, sizeof(int) * nloc,
+                       gpuMemcpyDeviceToHost));
   int max_nei = 0;
   for (int ii = 0; ii < nloc; ii++) {
     if (numneigh_host[ii] > max_nei) {
@@ -231,14 +238,14 @@ void use_nlist_map(int *nlist,
                    const int *nlist_map,
                    const int nloc,
                    const int nnei) {
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
   int nblock = (nnei + TPB - 1) / TPB;
   dim3 block_grid(nloc, nblock);
   dim3 thread_grid(1, TPB);
   map_nlist<<<block_grid, thread_grid>>>(nlist, nlist_map, nloc, nnei);
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
 }
 
 void use_nei_info_gpu(int *nlist,
@@ -250,13 +257,13 @@ void use_nei_info_gpu(int *nlist,
                       const int nnei,
                       const int ntypes,
                       const bool b_nlist_map) {
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
   int nblock = (nnei + TPB - 1) / TPB;
   dim3 block_grid(nloc, nblock);
   dim3 thread_grid(1, TPB);
-  DPErrcheck(cudaMemset(ntype, 0, sizeof(int) * nloc * nnei));
-  DPErrcheck(cudaMemset(nmask, 0, sizeof(bool) * nloc * nnei));
+  DPErrcheck(gpuMemset(ntype, 0, sizeof(int) * nloc * nnei));
+  DPErrcheck(gpuMemset(nmask, 0, sizeof(bool) * nloc * nnei));
   if (b_nlist_map) {
     map_nei_info<<<block_grid, thread_grid>>>(nlist, ntype, nmask, type,
                                               nlist_map, nloc, nnei, ntypes);
@@ -264,8 +271,8 @@ void use_nei_info_gpu(int *nlist,
     map_nei_info_noconvert<<<block_grid, thread_grid>>>(
         nlist, ntype, nmask, type, nloc, nnei, ntypes);
   }
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
 }
 
 template int build_nlist_gpu<float>(InputNlist &nlist,
@@ -295,12 +302,12 @@ __global__ void map_filter_ftype(int *ftype_out,
 }
 
 void filter_ftype_gpu(int *ftype_out, const int *ftype_in, const int nloc) {
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
   int nblock = (nloc + TPB - 1) / TPB;
   map_filter_ftype<<<nblock, TPB>>>(ftype_out, ftype_in, nloc);
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
 }
 
 }  // namespace deepmd
diff --git a/source/lib/src/cuda/prod_env_mat.cu b/source/lib/src/gpu/prod_env_mat.cu
similarity index 93%
rename from source/lib/src/cuda/prod_env_mat.cu
rename to source/lib/src/gpu/prod_env_mat.cu
index e603b25db7..a69e014272 100644
--- a/source/lib/src/cuda/prod_env_mat.cu
+++ b/source/lib/src/gpu/prod_env_mat.cu
@@ -1,6 +1,13 @@
+#if GOOGLE_CUDA
 #include <cub/block/block_load.cuh>
 #include <cub/block/block_radix_sort.cuh>
 #include <cub/block/block_store.cuh>
+#elif TENSORFLOW_USE_ROCM
+#include <hipcub/hipcub.hpp>
+namespace cub = hipcub;
+#else
+#error "should not touch here"
+#endif
 
 #include "device.h"
 #include "fmt_nlist.h"
@@ -83,7 +90,13 @@ __device__ inline uint_64 encoding_nbor_info(const int type,
   // the index of nbor atom(including ghost region) must be smaller than
   // 16777216(1 << 24)
   if (type >= 128 || dist >= (FPTYPE)128.0 || index >= (1 << 24)) {
+#if GOOGLE_CUDA
     asm("trap;");
+#elif TENSORFLOW_USE_ROCM
+    __builtin_trap();
+#else
+#error "should not touch here"
+#endif
   }
   return ((uint_64)type << 57) +
          (uint_64)((double)dist * ((uint_64)1 << 50)) / (1 << 24) * (1 << 24) +
@@ -222,16 +235,16 @@ void format_nbor_list_256(uint_64* key,
   format_nlist_fill_a<<<block_grid, thread_grid>>>(
       key, coord, type, gpu_inlist.numneigh, gpu_inlist.firstneigh, rcut, i_idx,
       MAX_NBOR_SIZE);
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
   const int ITEMS_PER_THREAD = 4;
   const int BLOCK_THREADS = MAX_NBOR_SIZE / ITEMS_PER_THREAD;
   // BlockSortKernel<NeighborInfo, BLOCK_THREADS,
   // ITEMS_PER_THREAD><<<g_grid_size, BLOCK_THREADS>>> (
   BlockSortKernel<uint_64, BLOCK_THREADS, ITEMS_PER_THREAD>
       <<<nloc, BLOCK_THREADS>>>(key, key + nloc * MAX_NBOR_SIZE);
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
 }
 
 template <typename FPTYPE>
@@ -250,16 +263,16 @@ void format_nbor_list_512(uint_64* key,
   format_nlist_fill_a<<<block_grid, thread_grid>>>(
       key, coord, type, gpu_inlist.numneigh, gpu_inlist.firstneigh, rcut, i_idx,
       MAX_NBOR_SIZE);
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
   const int ITEMS_PER_THREAD = 4;
   const int BLOCK_THREADS = MAX_NBOR_SIZE / ITEMS_PER_THREAD;
   // BlockSortKernel<NeighborInfo, BLOCK_THREADS,
   // ITEMS_PER_THREAD><<<g_grid_size, BLOCK_THREADS>>> (
   BlockSortKernel<uint_64, BLOCK_THREADS, ITEMS_PER_THREAD>
       <<<nloc, BLOCK_THREADS>>>(key, key + nloc * MAX_NBOR_SIZE);
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
 }
 
 template <typename FPTYPE>
@@ -278,16 +291,16 @@ void format_nbor_list_1024(uint_64* key,
   format_nlist_fill_a<<<block_grid, thread_grid>>>(
       key, coord, type, gpu_inlist.numneigh, gpu_inlist.firstneigh, rcut, i_idx,
       MAX_NBOR_SIZE);
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
   const int ITEMS_PER_THREAD = 8;
   const int BLOCK_THREADS = MAX_NBOR_SIZE / ITEMS_PER_THREAD;
   // BlockSortKernel<NeighborInfo, BLOCK_THREADS,
   // ITEMS_PER_THREAD><<<g_grid_size, BLOCK_THREADS>>> (
   BlockSortKernel<uint_64, BLOCK_THREADS, ITEMS_PER_THREAD>
       <<<nloc, BLOCK_THREADS>>>(key, key + nloc * MAX_NBOR_SIZE);
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
 }
 
 template <typename FPTYPE>
@@ -306,16 +319,16 @@ void format_nbor_list_2048(uint_64* key,
   format_nlist_fill_a<<<block_grid, thread_grid>>>(
       key, coord, type, gpu_inlist.numneigh, gpu_inlist.firstneigh, rcut, i_idx,
       MAX_NBOR_SIZE);
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
   const int ITEMS_PER_THREAD = 8;
   const int BLOCK_THREADS = MAX_NBOR_SIZE / ITEMS_PER_THREAD;
   // BlockSortKernel<NeighborInfo, BLOCK_THREADS,
   // ITEMS_PER_THREAD><<<g_grid_size, BLOCK_THREADS>>> (
   BlockSortKernel<uint_64, BLOCK_THREADS, ITEMS_PER_THREAD>
       <<<nloc, BLOCK_THREADS>>>(key, key + nloc * MAX_NBOR_SIZE);
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
 }
 
 template <typename FPTYPE>
@@ -334,16 +347,16 @@ void format_nbor_list_4096(uint_64* key,
   format_nlist_fill_a<<<block_grid, thread_grid>>>(
       key, coord, type, gpu_inlist.numneigh, gpu_inlist.firstneigh, rcut, i_idx,
       MAX_NBOR_SIZE);
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
   const int ITEMS_PER_THREAD = 16;
   const int BLOCK_THREADS = MAX_NBOR_SIZE / ITEMS_PER_THREAD;
   // BlockSortKernel<NeighborInfo, BLOCK_THREADS,
   // ITEMS_PER_THREAD><<<g_grid_size, BLOCK_THREADS>>> (
   BlockSortKernel<uint_64, BLOCK_THREADS, ITEMS_PER_THREAD>
       <<<nloc, BLOCK_THREADS>>>(key, key + nloc * MAX_NBOR_SIZE);
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
 }
 
 template <typename FPTYPE, int THREADS_PER_BLOCK>
@@ -376,9 +389,9 @@ __global__ void compute_env_mat_a(FPTYPE* em,
     const int idx_value = ii * 4;   // 4 components
     const int idx_deriv = ii * 12;  // 4 components time 3 directions
     if (row_nlist[ii] >= 0) {
-      FPTYPE rr[3] = {0};
-      FPTYPE dd[4] = {0};
-      FPTYPE vv[12] = {0};
+      FPTYPE rr[3] = {(FPTYPE)0.};
+      FPTYPE dd[4] = {(FPTYPE)0.};
+      FPTYPE vv[12] = {(FPTYPE)0.};
       const int j_idx = row_nlist[ii];
       for (int kk = 0; kk < 3; kk++) {
         rr[kk] = coord[j_idx * 3 + kk] - coord[bid * 3 + kk];
@@ -569,8 +582,8 @@ void format_nbor_list_gpu(int* nlist,
                           const int nall,
                           const float rcut,
                           const std::vector<int> sec) {
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
   const int LEN = 256;
   const int nnei = sec.back();
   const int nblock = (nloc + LEN - 1) / LEN;
@@ -581,15 +594,15 @@ void format_nbor_list_gpu(int* nlist,
   assert(max_nbor_size == 256 || max_nbor_size == 512 ||
          max_nbor_size == 1024 || max_nbor_size == 2048 ||
          max_nbor_size == 4096);
-  DPErrcheck(cudaMemset(nlist, -1, sizeof(int) * int_64(nloc) * nnei));
-  DPErrcheck(cudaMemset(key, 0xffffffff,
-                        sizeof(uint_64) * int_64(nloc) * max_nbor_size));
-  DPErrcheck(cudaMemcpy(sec_dev, &sec[0], sizeof(int) * sec.size(),
-                        cudaMemcpyHostToDevice));
+  DPErrcheck(gpuMemset(nlist, -1, sizeof(int) * int_64(nloc) * nnei));
+  DPErrcheck(gpuMemset(key, 0xffffffff,
+                       sizeof(uint_64) * int_64(nloc) * max_nbor_size));
+  DPErrcheck(gpuMemcpy(sec_dev, &sec[0], sizeof(int) * sec.size(),
+                       gpuMemcpyHostToDevice));
 
   get_i_idx<<<nblock, LEN>>>(i_idx, nloc, gpu_inlist.ilist);
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
 
   if (max_nbor_size == 256) {
     format_nbor_list_256(key, coord, type, gpu_inlist, nloc, rcut, i_idx);
@@ -608,8 +621,8 @@ void format_nbor_list_gpu(int* nlist,
 
   format_nlist_fill_b<<<dim3(nloc, (max_nbor_size + LEN - 1) / LEN), LEN>>>(
       nlist, nnei, nloc, key, sec_dev, sec.size(), nei_iter, max_nbor_size);
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
 }
 
 template <typename FPTYPE>
@@ -631,27 +644,27 @@ void prod_env_mat_a_gpu(FPTYPE* em,
                         const float rcut_smth,
                         const std::vector<int> sec,
                         const int* f_type) {
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
   if (f_type == NULL) {
     f_type = type;
   }
   const int nnei = sec.back();
   const int ndescrpt = nnei * 4;
-  DPErrcheck(cudaMemset(em, 0, sizeof(FPTYPE) * int_64(nloc) * ndescrpt));
+  DPErrcheck(gpuMemset(em, 0, sizeof(FPTYPE) * int_64(nloc) * ndescrpt));
   DPErrcheck(
-      cudaMemset(em_deriv, 0, sizeof(FPTYPE) * int_64(nloc) * ndescrpt * 3));
-  DPErrcheck(cudaMemset(rij, 0, sizeof(FPTYPE) * int_64(nloc) * nnei * 3));
+      gpuMemset(em_deriv, 0, sizeof(FPTYPE) * int_64(nloc) * ndescrpt * 3));
+  DPErrcheck(gpuMemset(rij, 0, sizeof(FPTYPE) * int_64(nloc) * nnei * 3));
 
   format_nbor_list_gpu(nlist, coord, f_type, gpu_inlist, array_int,
                        array_longlong, max_nbor_size, nloc, nall, rcut, sec);
-  nborErrcheck(cudaGetLastError());
-  nborErrcheck(cudaDeviceSynchronize());
+  nborErrcheck(gpuGetLastError());
+  nborErrcheck(gpuDeviceSynchronize());
 
   compute_env_mat_a<FPTYPE, TPB><<<nloc, TPB>>>(
       em, em_deriv, rij, coord, avg, std, type, nlist, nnei, rcut_smth, rcut);
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
 }
 
 template <typename FPTYPE>
@@ -672,24 +685,24 @@ void prod_env_mat_r_gpu(FPTYPE* em,
                         const float rcut,
                         const float rcut_smth,
                         const std::vector<int> sec) {
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
   const int nnei = sec.back();
   const int ndescrpt = nnei * 1;
-  DPErrcheck(cudaMemset(em, 0, sizeof(FPTYPE) * int_64(nloc) * ndescrpt));
+  DPErrcheck(gpuMemset(em, 0, sizeof(FPTYPE) * int_64(nloc) * ndescrpt));
   DPErrcheck(
-      cudaMemset(em_deriv, 0, sizeof(FPTYPE) * int_64(nloc) * ndescrpt * 3));
-  DPErrcheck(cudaMemset(rij, 0, sizeof(FPTYPE) * int_64(nloc) * nnei * 3));
+      gpuMemset(em_deriv, 0, sizeof(FPTYPE) * int_64(nloc) * ndescrpt * 3));
+  DPErrcheck(gpuMemset(rij, 0, sizeof(FPTYPE) * int_64(nloc) * nnei * 3));
 
   format_nbor_list_gpu(nlist, coord, type, gpu_inlist, array_int,
                        array_longlong, max_nbor_size, nloc, nall, rcut, sec);
-  nborErrcheck(cudaGetLastError());
-  nborErrcheck(cudaDeviceSynchronize());
+  nborErrcheck(gpuGetLastError());
+  nborErrcheck(gpuDeviceSynchronize());
 
   compute_env_mat_r<FPTYPE, TPB><<<nloc, TPB>>>(
       em, em_deriv, rij, coord, avg, std, type, nlist, nnei, rcut_smth, rcut);
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
 }
 
 template <typename FPTYPE>
@@ -703,8 +716,8 @@ void test_encoding_decoding_nbor_info_gpu(uint_64* key,
   const int nblock = (size_of_array + TPB - 1) / TPB;
   encoding_decoding_nbor_info<<<nblock, TPB>>>(
       key, out_type, out_index, in_type, in_dist, in_index, size_of_array);
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
 }
 
 template void prod_env_mat_a_gpu<float>(float* em,
diff --git a/source/lib/src/cuda/prod_force.cu b/source/lib/src/gpu/prod_force.cu
similarity index 91%
rename from source/lib/src/cuda/prod_force.cu
rename to source/lib/src/gpu/prod_force.cu
index d85de26394..7b1359b3b0 100644
--- a/source/lib/src/cuda/prod_force.cu
+++ b/source/lib/src/gpu/prod_force.cu
@@ -12,7 +12,7 @@ __global__ void force_deriv_wrt_center_atom(FPTYPE* force,
   int_64 bid = blockIdx.x;
   unsigned int tid = threadIdx.x;
   for (int ii = tid; ii < THREADS_PER_BLOCK * 3; ii += THREADS_PER_BLOCK) {
-    data[ii] = 0.f;
+    data[ii] = (FPTYPE)0.;
   }
   for (int ii = tid; ii < ndescrpt; ii += THREADS_PER_BLOCK) {
     for (int jj = 0; jj < 3; jj++) {
@@ -64,7 +64,7 @@ __global__ void force_deriv_wrt_neighbors_a(FPTYPE* force,
   if (j_idx < 0) {
     return;
   }
-  FPTYPE force_tmp = 0.f;
+  FPTYPE force_tmp = (FPTYPE)0.;
   for (int idw = 0; idw < 4; ++idw) {
     force_tmp += net_deriv[idx * ndescrpt + idy * 4 + idw] *
                  in_deriv[idx * ndescrpt * 3 + (idy * 4 + idw) * 3 + idz];
@@ -110,15 +110,15 @@ void prod_force_a_gpu(FPTYPE* force,
                       const int nall,
                       const int nnei,
                       const int nframes) {
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
   const int ndescrpt = nnei * 4;
-  DPErrcheck(cudaMemset(force, 0, sizeof(FPTYPE) * nframes * nall * 3));
+  DPErrcheck(gpuMemset(force, 0, sizeof(FPTYPE) * nframes * nall * 3));
 
   force_deriv_wrt_center_atom<FPTYPE, TPB><<<nframes * nloc, TPB>>>(
       force, net_deriv, in_deriv, ndescrpt, nloc, nall);
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
 
   const int LEN = 64;
   const int nblock = (nnei + LEN - 1) / LEN;
@@ -126,8 +126,8 @@ void prod_force_a_gpu(FPTYPE* force,
   dim3 thread_grid(LEN, 3);
   force_deriv_wrt_neighbors_a<<<block_grid, thread_grid>>>(
       force, net_deriv, in_deriv, nlist, nloc, nall, nnei);
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
 }
 
 template <typename FPTYPE>
@@ -139,15 +139,15 @@ void prod_force_r_gpu(FPTYPE* force,
                       const int nall,
                       const int nnei,
                       const int nframes) {
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
   const int ndescrpt = nnei * 1;
-  DPErrcheck(cudaMemset(force, 0, sizeof(FPTYPE) * nframes * nall * 3));
+  DPErrcheck(gpuMemset(force, 0, sizeof(FPTYPE) * nframes * nall * 3));
 
   force_deriv_wrt_center_atom<FPTYPE, TPB><<<nframes * nloc, TPB>>>(
       force, net_deriv, in_deriv, ndescrpt, nloc, nall);
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
 
   const int LEN = 64;
   const int nblock = (nnei + LEN - 1) / LEN;
@@ -155,8 +155,8 @@ void prod_force_r_gpu(FPTYPE* force,
   dim3 thread_grid(LEN, 3);
   force_deriv_wrt_neighbors_r<<<block_grid, thread_grid>>>(
       force, net_deriv, in_deriv, nlist, nloc, nall, nnei);
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
 }
 
 template void prod_force_a_gpu<float>(float* force,
diff --git a/source/lib/src/cuda/prod_force_grad.cu b/source/lib/src/gpu/prod_force_grad.cu
similarity index 91%
rename from source/lib/src/cuda/prod_force_grad.cu
rename to source/lib/src/gpu/prod_force_grad.cu
index b54676586c..c784d6ba65 100644
--- a/source/lib/src/cuda/prod_force_grad.cu
+++ b/source/lib/src/gpu/prod_force_grad.cu
@@ -88,18 +88,18 @@ void prod_force_grad_a_gpu(FPTYPE* grad_net,
                            const int nloc,
                            const int nnei,
                            const int nframes) {
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
   const int ndescrpt = nnei * 4;
   DPErrcheck(
-      cudaMemset(grad_net, 0, sizeof(FPTYPE) * nframes * nloc * ndescrpt));
+      gpuMemset(grad_net, 0, sizeof(FPTYPE) * nframes * nloc * ndescrpt));
   const int nblock = (ndescrpt + TPB - 1) / TPB;
   dim3 block_grid(nframes * nloc, nblock);
   dim3 thread_grid(TPB, 1);
   force_grad_wrt_center_atom<<<block_grid, thread_grid>>>(grad_net, grad,
                                                           env_deriv, ndescrpt);
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
 
   const int LEN = 128;
   const int nblock_ = (nframes * nloc + LEN - 1) / LEN;
@@ -107,8 +107,8 @@ void prod_force_grad_a_gpu(FPTYPE* grad_net,
   dim3 thread_grid_(LEN, 4);
   force_grad_wrt_neighbors_a<<<block_grid_, thread_grid_>>>(
       grad_net, grad, env_deriv, nlist, nloc, nnei, nframes);
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
 }
 
 template <typename FPTYPE>
@@ -119,18 +119,18 @@ void prod_force_grad_r_gpu(FPTYPE* grad_net,
                            const int nloc,
                            const int nnei,
                            const int nframes) {
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
   const int ndescrpt = nnei * 1;
   DPErrcheck(
-      cudaMemset(grad_net, 0, sizeof(FPTYPE) * nframes * nloc * ndescrpt));
+      gpuMemset(grad_net, 0, sizeof(FPTYPE) * nframes * nloc * ndescrpt));
   const int nblock = (ndescrpt + TPB - 1) / TPB;
   dim3 block_grid(nframes * nloc, nblock);
   dim3 thread_grid(TPB, 1);
   force_grad_wrt_center_atom<<<block_grid, thread_grid>>>(grad_net, grad,
                                                           env_deriv, ndescrpt);
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
 
   const int LEN = 128;
   const int nblock_ = (nframes * nloc + LEN - 1) / LEN;
@@ -138,8 +138,8 @@ void prod_force_grad_r_gpu(FPTYPE* grad_net,
   dim3 thread_grid_(LEN, 1);
   force_grad_wrt_neighbors_r<<<block_grid_, thread_grid_>>>(
       grad_net, grad, env_deriv, nlist, nloc, nnei, nframes);
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
 }
 
 template void prod_force_grad_a_gpu<float>(float* grad_net,
diff --git a/source/lib/src/cuda/prod_virial.cu b/source/lib/src/gpu/prod_virial.cu
similarity index 91%
rename from source/lib/src/cuda/prod_virial.cu
rename to source/lib/src/gpu/prod_virial.cu
index e96bacf1d3..ab9c5326e3 100644
--- a/source/lib/src/cuda/prod_virial.cu
+++ b/source/lib/src/gpu/prod_virial.cu
@@ -113,10 +113,10 @@ void prod_virial_a_gpu(FPTYPE* virial,
                        const int nloc,
                        const int nall,
                        const int nnei) {
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
-  DPErrcheck(cudaMemset(virial, 0, sizeof(FPTYPE) * 9));
-  DPErrcheck(cudaMemset(atom_virial, 0, sizeof(FPTYPE) * 9 * nall));
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
+  DPErrcheck(gpuMemset(virial, 0, sizeof(FPTYPE) * 9));
+  DPErrcheck(gpuMemset(atom_virial, 0, sizeof(FPTYPE) * 9 * nall));
 
   const int LEN = 16;
   int nblock = (nnei + LEN - 1) / LEN;
@@ -125,12 +125,12 @@ void prod_virial_a_gpu(FPTYPE* virial,
   // compute virial of a frame
   virial_deriv_wrt_neighbors_a<<<block_grid, thread_grid>>>(
       virial, atom_virial, net_deriv, in_deriv, rij, nlist, nloc, nnei);
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
   // reduction atom_virial to virial
   atom_virial_reduction<FPTYPE, TPB><<<9, TPB>>>(virial, atom_virial, nall);
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
 }
 
 template <typename FPTYPE>
@@ -143,10 +143,10 @@ void prod_virial_r_gpu(FPTYPE* virial,
                        const int nloc,
                        const int nall,
                        const int nnei) {
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
-  DPErrcheck(cudaMemset(virial, 0, sizeof(FPTYPE) * 9));
-  DPErrcheck(cudaMemset(atom_virial, 0, sizeof(FPTYPE) * 9 * nall));
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
+  DPErrcheck(gpuMemset(virial, 0, sizeof(FPTYPE) * 9));
+  DPErrcheck(gpuMemset(atom_virial, 0, sizeof(FPTYPE) * 9 * nall));
 
   const int LEN = 16;
   int nblock = (nnei + LEN - 1) / LEN;
@@ -155,12 +155,12 @@ void prod_virial_r_gpu(FPTYPE* virial,
   // compute virial of a frame
   virial_deriv_wrt_neighbors_r<<<block_grid, thread_grid>>>(
       virial, atom_virial, net_deriv, in_deriv, rij, nlist, nloc, nnei);
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
   // reduction atom_virial to virial
   atom_virial_reduction<FPTYPE, TPB><<<9, TPB>>>(virial, atom_virial, nall);
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
 }
 
 template void prod_virial_a_gpu<float>(float* virial,
diff --git a/source/lib/src/cuda/prod_virial_grad.cu b/source/lib/src/gpu/prod_virial_grad.cu
similarity index 92%
rename from source/lib/src/cuda/prod_virial_grad.cu
rename to source/lib/src/gpu/prod_virial_grad.cu
index 047d8ae17f..dac5b20ba8 100644
--- a/source/lib/src/cuda/prod_virial_grad.cu
+++ b/source/lib/src/gpu/prod_virial_grad.cu
@@ -92,18 +92,18 @@ void prod_virial_grad_a_gpu(FPTYPE* grad_net,
                             const int* nlist,
                             const int nloc,
                             const int nnei) {
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
   const int ndescrpt = nnei * 4;
-  DPErrcheck(cudaMemset(grad_net, 0, sizeof(FPTYPE) * nloc * ndescrpt));
+  DPErrcheck(gpuMemset(grad_net, 0, sizeof(FPTYPE) * nloc * ndescrpt));
   const int LEN = 128;
   const int nblock = (nloc + LEN - 1) / LEN;
   dim3 block_grid(nblock, nnei);
   dim3 thread_grid(LEN, 4);
   virial_grad_wrt_neighbors_a<<<block_grid, thread_grid>>>(
       grad_net, grad, env_deriv, rij, nlist, nloc, nnei);
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
 }
 
 template <typename FPTYPE>
@@ -114,18 +114,18 @@ void prod_virial_grad_r_gpu(FPTYPE* grad_net,
                             const int* nlist,
                             const int nloc,
                             const int nnei) {
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
   const int ndescrpt = nnei;
-  DPErrcheck(cudaMemset(grad_net, 0, sizeof(FPTYPE) * nloc * ndescrpt));
+  DPErrcheck(gpuMemset(grad_net, 0, sizeof(FPTYPE) * nloc * ndescrpt));
   const int LEN = 128;
   const int nblock = (nloc + LEN - 1) / LEN;
   dim3 block_grid(nblock, nnei);
   dim3 thread_grid(LEN, 1);
   virial_grad_wrt_neighbors_r<<<block_grid, thread_grid>>>(
       grad_net, grad, env_deriv, rij, nlist, nloc, nnei);
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
 }
 
 template void prod_virial_grad_a_gpu<float>(float* grad_net,
diff --git a/source/lib/src/cuda/region.cu b/source/lib/src/gpu/region.cu
similarity index 83%
rename from source/lib/src/cuda/region.cu
rename to source/lib/src/gpu/region.cu
index eb8d191a8c..849eecfc3e 100644
--- a/source/lib/src/cuda/region.cu
+++ b/source/lib/src/gpu/region.cu
@@ -27,31 +27,31 @@ template <typename FPTYPE>
 void convert_to_inter_gpu(FPTYPE *ri,
                           const Region<FPTYPE> &region,
                           const FPTYPE *rp) {
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
   _phys2Inter<<<1, 1>>>(ri, rp, region.rec_boxt);
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
 }
 
 template <typename FPTYPE>
 void convert_to_phys_gpu(FPTYPE *rp,
                          const Region<FPTYPE> &region,
                          const FPTYPE *ri) {
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
   _inter2Phys<<<1, 1>>>(rp, ri, region.boxt);
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
 }
 
 template <typename FPTYPE>
 void volume_gpu(FPTYPE *volume, const Region<FPTYPE> &region) {
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
   _compute_volume<<<1, 1>>>(volume, region.boxt);
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
 }
 
 template void convert_to_inter_gpu<float>(float *ri,
diff --git a/source/lib/src/cuda/tabulate.cu b/source/lib/src/gpu/tabulate.cu
similarity index 86%
rename from source/lib/src/cuda/tabulate.cu
rename to source/lib/src/gpu/tabulate.cu
index 30695a6e05..f424006940 100644
--- a/source/lib/src/cuda/tabulate.cu
+++ b/source/lib/src/gpu/tabulate.cu
@@ -4,9 +4,49 @@
 #define MM 4
 #define KK 4
 #define TPB 256
+#if GOOGLE_CUDA
 #define WARP_SIZE 32
+#elif TENSORFLOW_USE_ROCM
+// See https://github.com/pytorch/pytorch/pull/64302
+#define WARP_SIZE warpSize  // = 64 or 32 (Defined in hip_runtime.h)
+#else
+#error "should not touch here"
+#endif
 #define FULL_MASK 0xffffffff
 
+#if GOOGLE_CUDA
+#define GPU_DYNAMIC_SHARED_MEM_DECL(TYPE, NAME) extern __shared__ TYPE NAME[]
+#elif TENSORFLOW_USE_ROCM
+#define GPU_DYNAMIC_SHARED_MEM_DECL(TYPE, NAME) HIP_DYNAMIC_SHARED(TYPE, NAME)
+#else
+#error "should not touch here"
+#endif
+
+// Copyright 2017 The TensorFlow Authors.
+// Licensed under the Apache License, Version 2.0
+template <typename T>
+__device__ T
+GpuShuffleSync(unsigned mask, T value, int src_lane, int width = warpSize) {
+#if GOOGLE_CUDA
+  return __shfl_sync(mask, value, src_lane, width);
+#elif TENSORFLOW_USE_ROCM
+  return __shfl(value, src_lane, width);
+#else
+#error "should not touch here"
+#endif
+}
+
+__device__ void GpuSyncThreads() {
+#if GOOGLE_CUDA
+  __syncwarp();
+#elif TENSORFLOW_USE_ROCM
+  // ROCm build: __syncthreads() is called here in place of __syncwarp()
+  __syncthreads();
+#else
+#error "should not touch here"
+#endif
+}
+
 template <typename FPTYPE>
 __forceinline__ __device__ void locate_xx_se_a(FPTYPE& xx,
                                                int& table_idx,
@@ -110,8 +150,14 @@ __forceinline__ __device__ FPTYPE dot(FPTYPE ll[4], FPTYPE rr[4]) {
 
 template <typename FPTYPE>
 __forceinline__ __device__ void warp_reduce(FPTYPE& val) {
-  for (int offset = 16; offset > 0; offset >>= 1) {
+  for (int offset = WARP_SIZE / 2; offset > 0; offset >>= 1) {
+#if GOOGLE_CUDA
     val += __shfl_down_sync(FULL_MASK, val, offset);
+#elif TENSORFLOW_USE_ROCM
+    val += __shfl_down(val, offset);  // ROCm path: no mask argument is passed
+#else
+#error "should not touch here"
+#endif
   }
 }
 
@@ -131,13 +177,25 @@ __global__ void tabulate_fusion_se_a_fifth_order_polynomial(
     const int last_layer_size,
     const bool is_sorted) {
   bool enable_se_atten = two_embed != nullptr;
+#if TENSORFLOW_USE_ROCM
+  GPU_DYNAMIC_SHARED_MEM_DECL(int, _data)
+#endif
   const int_64 block_idx = blockIdx.x;  // nloc
   const int thread_idx = threadIdx.x;   // last_layer_size
-  FPTYPE ago = __shfl_sync(0xffffffff, em_x[block_idx * nnei + nnei - 1], 0);
+  FPTYPE ago = GpuShuffleSync(0xffffffff, em_x[block_idx * nnei + nnei - 1], 0);
   bool unloop = false;
   int breakpoint = nnei - 1;
-
+#if GOOGLE_CUDA
   FPTYPE sum[MTILE] = {(FPTYPE)0.};
+#elif TENSORFLOW_USE_ROCM
+  FPTYPE* iteratorC = (FPTYPE*)&_data[0];
+  for (int kk = 0; kk < MTILE; kk++) {
+    iteratorC[kk * last_layer_size + thread_idx] = (FPTYPE)0.;
+  }
+  __syncthreads();
+#else
+#error "should not touch here"
+#endif
   int mark_table_idx = -1;
   FPTYPE var[6];
   for (int ii = 0; ii < nnei; ii++) {
@@ -163,8 +221,15 @@ __global__ void tabulate_fusion_se_a_fifth_order_polynomial(
     }
 
     for (int kk = 0; kk < MTILE; kk++) {
-      sum[kk] += (nnei - breakpoint) *
-                 em[block_idx * nnei * MTILE + ii * MTILE + kk] * res;
+#if GOOGLE_CUDA
+      sum[kk]
+#elif TENSORFLOW_USE_ROCM
+      iteratorC[kk * last_layer_size + thread_idx]
+#else
+#error "should not touch here"
+#endif
+          += (nnei - breakpoint) *
+             em[block_idx * nnei * MTILE + ii * MTILE + kk] * res;
     }
     if (unloop) {
       break;
@@ -173,7 +238,14 @@ __global__ void tabulate_fusion_se_a_fifth_order_polynomial(
   }
   for (int ii = 0; ii < MTILE; ii++) {
     out[block_idx * MTILE * last_layer_size + ii * last_layer_size +
-        thread_idx] = sum[ii];
+        thread_idx] =
+#if GOOGLE_CUDA
+        sum[ii];
+#elif TENSORFLOW_USE_ROCM
+        iteratorC[ii * last_layer_size + thread_idx];
+#else
+#error "should not touch here"
+#endif
   }
 }
 
@@ -195,10 +267,10 @@ __global__ void tabulate_fusion_se_a_grad_fifth_order_polynomial(
     const int last_layer_size,
     const bool is_sorted) {
   bool enable_se_atten = two_embed != nullptr;
-  extern __shared__ int _data[];
+  GPU_DYNAMIC_SHARED_MEM_DECL(int, _data);
   const int_64 block_idx = blockIdx.x;  // nloc
   const int thread_idx = threadIdx.x;   // KTILE * WARP_SIZE, usally 128 here~
-  int warp_idx = __shfl_sync(0xffffffff, threadIdx.x / WARP_SIZE, 0);
+  int warp_idx = GpuShuffleSync(0xffffffff, threadIdx.x / WARP_SIZE, 0);
   int lane_idx = threadIdx.x % WARP_SIZE;
   int breakpoint = nnei - 1;
   bool unloop = false;
@@ -210,7 +282,7 @@ __global__ void tabulate_fusion_se_a_grad_fifth_order_polynomial(
     }
   }
   __syncthreads();
-  FPTYPE ago = __shfl_sync(0xffffffff, em_x[block_idx * nnei + nnei - 1], 0);
+  FPTYPE ago = GpuShuffleSync(0xffffffff, em_x[block_idx * nnei + nnei - 1], 0);
   for (int ii = warp_idx; ii < nnei; ii += KTILE) {
     FPTYPE xx = em_x[block_idx * nnei + ii];
     if (ago == xx && is_sorted) {
@@ -252,12 +324,14 @@ __global__ void tabulate_fusion_se_a_grad_fifth_order_polynomial(
       res += reg_em[3] * iteratorA[3 * last_layer_size + jj];
       Csub +=
           (nnei - breakpoint) *
-          (var[1] + (2 * var[2] +
-                     (3 * var[3] + (4 * var[4] + 5 * var[5] * xx) * xx) * xx) *
+          (var[1] + ((FPTYPE)2. * var[2] +
+                     ((FPTYPE)3. * var[3] +
+                      ((FPTYPE)4. * var[4] + (FPTYPE)5. * var[5] * xx) * xx) *
+                         xx) *
                         xx) *
           (enable_se_atten ? res * t + res : res);
     }
-    __syncwarp();
+    GpuSyncThreads();
     for (int kk = 0; kk < MTILE; kk++) {
       warp_reduce(sum[kk]);
     }
@@ -290,10 +364,10 @@ __global__ void tabulate_fusion_se_a_grad_grad_fifth_order_polynomial(
     const int nnei,
     const int last_layer_size,
     const bool is_sorted) {
-  extern __shared__ int _data[];
+  GPU_DYNAMIC_SHARED_MEM_DECL(int, _data);
   const int_64 block_idx = blockIdx.x;  // nloc
   const int thread_idx = threadIdx.x;   // last_layer_size
-  FPTYPE ago = __shfl_sync(0xffffffff, em_x[block_idx * nnei + nnei - 1], 0);
+  FPTYPE ago = GpuShuffleSync(0xffffffff, em_x[block_idx * nnei + nnei - 1], 0);
   bool unloop = false;
   int breakpoint = nnei - 1;
   FPTYPE* iteratorC = (FPTYPE*)&_data[0];
@@ -323,9 +397,11 @@ __global__ void tabulate_fusion_se_a_grad_grad_fifth_order_polynomial(
         (var[1] + (var[2] + (var[3] + (var[4] + var[5] * xx) * xx) * xx) * xx) *
             xx;
     FPTYPE res_grad =
-        var[1] +
-        (2 * var[2] + (3 * var[3] + (4 * var[4] + 5 * var[5] * xx) * xx) * xx) *
-            xx;
+        var[1] + ((FPTYPE)2. * var[2] +
+                  ((FPTYPE)3. * var[3] +
+                   ((FPTYPE)4. * var[4] + (FPTYPE)5. * var[5] * xx) * xx) *
+                      xx) *
+                     xx;
 
     for (int kk = 0; kk < MTILE; kk++) {
       int em_index = block_idx * nnei * MTILE + ii * MTILE + kk;
@@ -403,10 +479,10 @@ __global__ void tabulate_fusion_se_t_grad_fifth_order_polynomial(
     const int nnei_i,
     const int nnei_j,
     const int last_layer_size) {
-  extern __shared__ int _data[];
+  GPU_DYNAMIC_SHARED_MEM_DECL(int, _data);
   const int_64 block_idx = blockIdx.x;  // nloc
   const int thread_idx = threadIdx.x;   // KTILE * WARP_SIZE, usally 128 here~
-  int warp_idx = __shfl_sync(0xffffffff, threadIdx.x / WARP_SIZE, 0);
+  int warp_idx = GpuShuffleSync(0xffffffff, threadIdx.x / WARP_SIZE, 0);
   int lane_idx = threadIdx.x % WARP_SIZE;
   FPTYPE* iteratorA = (FPTYPE*)&_data[0];  // dy
   for (int ii = thread_idx; ii < last_layer_size; ii += blockDim.x) {
@@ -440,7 +516,7 @@ __global__ void tabulate_fusion_se_t_grad_fifth_order_polynomial(
                            xx) *
                           xx);
       }
-      __syncwarp();
+      GpuSyncThreads();
       warp_reduce(sum);
       warp_reduce(Csub);
       if (lane_idx == 0) {
@@ -551,10 +627,9 @@ __global__ void tabulate_fusion_se_r_grad_fifth_order_polynomial(
     const FPTYPE stride1,
     const int nnei,
     const int last_layer_size) {
-  extern __shared__ int _data[];
   const int_64 block_idx = blockIdx.x;  // nloc
   const int thread_idx = threadIdx.x;   // KTILE * WARP_SIZE, usally 128 here~
-  int warp_idx = __shfl_sync(0xffffffff, thread_idx / WARP_SIZE, 0);
+  int warp_idx = GpuShuffleSync(0xffffffff, thread_idx / WARP_SIZE, 0);
   int lane_idx = thread_idx % WARP_SIZE;
   __syncthreads();
   for (int ii = warp_idx; ii < nnei; ii += KTILE) {
@@ -568,12 +643,14 @@ __global__ void tabulate_fusion_se_r_grad_fifth_order_polynomial(
     for (int jj = lane_idx; jj < last_layer_size; jj += WARP_SIZE) {
       load_polynomial_params(var, table, table_idx, jj, last_layer_size);
       Csub +=
-          (var[1] + (2 * var[2] +
-                     (3 * var[3] + (4 * var[4] + 5 * var[5] * xx) * xx) * xx) *
+          (var[1] + ((FPTYPE)2. * var[2] +
+                     ((FPTYPE)3. * var[3] +
+                      ((FPTYPE)4. * var[4] + (FPTYPE)5. * var[5] * xx) * xx) *
+                         xx) *
                         xx) *
           dy[block_idx * nnei * last_layer_size + ii * last_layer_size + jj];
     }
-    __syncwarp();
+    GpuSyncThreads();
 
     warp_reduce(Csub);
     if (lane_idx == 0) {
@@ -595,10 +672,13 @@ __global__ void tabulate_fusion_se_r_grad_grad_fifth_order_polynomial(
     const FPTYPE stride1,
     const int nnei,
     const int last_layer_size) {
-  extern __shared__ int _data[];
   const int_64 block_idx = blockIdx.x;  // nloc
   const int thread_idx = threadIdx.x;   // last_layer_size
 
+#if TENSORFLOW_USE_ROCM
+  __syncthreads();
+#endif
+
   int mark_table_idx = -1;
   FPTYPE var[6];
   for (int ii = 0; ii < nnei; ii++) {
@@ -610,9 +690,11 @@ __global__ void tabulate_fusion_se_r_grad_grad_fifth_order_polynomial(
                              last_layer_size);
     }
     FPTYPE res_grad =
-        var[1] +
-        (2 * var[2] + (3 * var[3] + (4 * var[4] + 5 * var[5] * xx) * xx) * xx) *
-            xx;
+        var[1] + ((FPTYPE)2. * var[2] +
+                  ((FPTYPE)3. * var[3] +
+                   ((FPTYPE)4. * var[4] + (FPTYPE)5. * var[5] * xx) * xx) *
+                      xx) *
+                     xx;
     mark_table_idx = table_idx;
     dz_dy[block_idx * nnei * last_layer_size + ii * last_layer_size +
           thread_idx] = dz_dy_dem[block_idx * nnei + ii] * res_grad;
@@ -634,15 +716,21 @@ void tabulate_fusion_se_a_gpu(FPTYPE* out,
   if (nloc <= 0) {
     return;
   }
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
   tabulate_fusion_se_a_fifth_order_polynomial<FPTYPE, MM, KK>
-      <<<nloc, last_layer_size>>>(out, table, em_x, em, two_embed,
-                                  table_info[0], table_info[1], table_info[2],
-                                  table_info[3], table_info[4], nnei,
-                                  last_layer_size, is_sorted);
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
+#if GOOGLE_CUDA
+      <<<nloc, last_layer_size>>>
+#elif TENSORFLOW_USE_ROCM
+      <<<nloc, last_layer_size, sizeof(FPTYPE) * MM * last_layer_size>>>
+#else
+#error "should not touch here"
+#endif
+      (out, table, em_x, em, two_embed, table_info[0], table_info[1],
+       table_info[2], table_info[3], table_info[4], nnei, last_layer_size,
+       is_sorted);
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
 }
 
 template <typename FPTYPE>
@@ -661,18 +749,18 @@ void tabulate_fusion_se_a_grad_gpu(FPTYPE* dy_dem_x,
   if (nloc <= 0) {
     return;
   }
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
-  DPErrcheck(cudaMemset(dy_dem_x, 0, sizeof(FPTYPE) * nloc * nnei));
-  DPErrcheck(cudaMemset(dy_dem, 0, sizeof(FPTYPE) * nloc * nnei * 4));
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
+  DPErrcheck(gpuMemset(dy_dem_x, 0, sizeof(FPTYPE) * nloc * nnei));
+  DPErrcheck(gpuMemset(dy_dem, 0, sizeof(FPTYPE) * nloc * nnei * 4));
 
   tabulate_fusion_se_a_grad_fifth_order_polynomial<FPTYPE, MM, KK>
       <<<nloc, KK * WARP_SIZE, sizeof(FPTYPE) * MM * last_layer_size>>>(
           dy_dem_x, dy_dem, table, em_x, em, two_embed, dy, table_info[0],
           table_info[1], table_info[2], table_info[3], table_info[4], nnei,
           last_layer_size, is_sorted);
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
 }
 
 template <typename FPTYPE>
@@ -690,16 +778,16 @@ void tabulate_fusion_se_a_grad_grad_gpu(FPTYPE* dz_dy,
   if (nloc <= 0) {
     return;
   }
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
-  DPErrcheck(cudaMemset(dz_dy, 0, sizeof(FPTYPE) * nloc * 4 * last_layer_size));
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
+  DPErrcheck(gpuMemset(dz_dy, 0, sizeof(FPTYPE) * nloc * 4 * last_layer_size));
   tabulate_fusion_se_a_grad_grad_fifth_order_polynomial<FPTYPE, MM, KK>
       <<<nloc, last_layer_size, sizeof(FPTYPE) * MM * last_layer_size>>>(
           dz_dy, table, em_x, em, dz_dy_dem_x, dz_dy_dem, table_info[0],
           table_info[1], table_info[2], table_info[3], table_info[4], nnei,
           last_layer_size, is_sorted);
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
 }
 
 template <typename FPTYPE>
@@ -715,14 +803,14 @@ void tabulate_fusion_se_t_gpu(FPTYPE* out,
   if (nloc <= 0) {
     return;
   }
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
   tabulate_fusion_se_t_fifth_order_polynomial<FPTYPE, MM, KK>
       <<<nloc, last_layer_size>>>(
           out, table, em_x, em, table_info[0], table_info[1], table_info[2],
           table_info[3], table_info[4], nnei_i, nnei_j, last_layer_size);
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
 }
 
 template <typename FPTYPE>
@@ -740,18 +828,18 @@ void tabulate_fusion_se_t_grad_gpu(FPTYPE* dy_dem_x,
   if (nloc <= 0) {
     return;
   }
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
-  DPErrcheck(cudaMemset(dy_dem_x, 0, sizeof(FPTYPE) * nloc * nnei_i * nnei_j));
-  DPErrcheck(cudaMemset(dy_dem, 0, sizeof(FPTYPE) * nloc * nnei_i * nnei_j));
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
+  DPErrcheck(gpuMemset(dy_dem_x, 0, sizeof(FPTYPE) * nloc * nnei_i * nnei_j));
+  DPErrcheck(gpuMemset(dy_dem, 0, sizeof(FPTYPE) * nloc * nnei_i * nnei_j));
 
   tabulate_fusion_se_t_grad_fifth_order_polynomial<FPTYPE, MM, KK>
       <<<nloc, KK * WARP_SIZE, sizeof(FPTYPE) * last_layer_size>>>(
           dy_dem_x, dy_dem, table, em_x, em, dy, table_info[0], table_info[1],
           table_info[2], table_info[3], table_info[4], nnei_i, nnei_j,
           last_layer_size);
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
 }
 
 template <typename FPTYPE>
@@ -769,17 +857,17 @@ void tabulate_fusion_se_t_grad_grad_gpu(FPTYPE* dz_dy,
   if (nloc <= 0) {
     return;
   }
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
-  DPErrcheck(cudaMemset(dz_dy, 0, sizeof(FPTYPE) * nloc * last_layer_size));
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
+  DPErrcheck(gpuMemset(dz_dy, 0, sizeof(FPTYPE) * nloc * last_layer_size));
 
   tabulate_fusion_se_t_grad_grad_fifth_order_polynomial<FPTYPE, MM, KK>
       <<<nloc, last_layer_size>>>(dz_dy, table, em_x, em, dz_dy_dem_x,
                                   dz_dy_dem, table_info[0], table_info[1],
                                   table_info[2], table_info[3], table_info[4],
                                   nnei_i, nnei_j, last_layer_size);
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
 }
 
 template <typename FPTYPE>
@@ -793,14 +881,14 @@ void tabulate_fusion_se_r_gpu(FPTYPE* out,
   if (nloc <= 0) {
     return;
   }
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
   tabulate_fusion_se_r_fifth_order_polynomial<FPTYPE, MM, KK>
       <<<nloc, last_layer_size>>>(out, table, em, table_info[0], table_info[1],
                                   table_info[2], table_info[3], table_info[4],
                                   nnei, last_layer_size);
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
 }
 
 template <typename FPTYPE>
@@ -815,16 +903,16 @@ void tabulate_fusion_se_r_grad_gpu(FPTYPE* dy_dem,
   if (nloc <= 0) {
     return;
   }
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
-  DPErrcheck(cudaMemset(dy_dem, 0, sizeof(FPTYPE) * nloc * nnei));
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
+  DPErrcheck(gpuMemset(dy_dem, 0, sizeof(FPTYPE) * nloc * nnei));
 
   tabulate_fusion_se_r_grad_fifth_order_polynomial<FPTYPE, MM, KK>
       <<<nloc, KK * WARP_SIZE, sizeof(FPTYPE) * MM * last_layer_size>>>(
           dy_dem, table, em, dy, table_info[0], table_info[1], table_info[2],
           table_info[3], table_info[4], nnei, last_layer_size);
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
 }
 
 template <typename FPTYPE>
@@ -839,16 +927,16 @@ void tabulate_fusion_se_r_grad_grad_gpu(FPTYPE* dz_dy,
   if (nloc <= 0) {
     return;
   }
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
   DPErrcheck(
-      cudaMemset(dz_dy, 0, sizeof(FPTYPE) * nloc * nnei * last_layer_size));
+      gpuMemset(dz_dy, 0, sizeof(FPTYPE) * nloc * nnei * last_layer_size));
   tabulate_fusion_se_r_grad_grad_fifth_order_polynomial<FPTYPE, MM, KK>
       <<<nloc, last_layer_size, sizeof(FPTYPE) * MM * last_layer_size>>>(
           dz_dy, table, em, dz_dy_dem, table_info[0], table_info[1],
           table_info[2], table_info[3], table_info[4], nnei, last_layer_size);
-  DPErrcheck(cudaGetLastError());
-  DPErrcheck(cudaDeviceSynchronize());
+  DPErrcheck(gpuGetLastError());
+  DPErrcheck(gpuDeviceSynchronize());
 }
 
 template void tabulate_fusion_se_a_gpu<float>(float* out,
diff --git a/source/lib/src/rocm/CMakeLists.txt b/source/lib/src/rocm/CMakeLists.txt
deleted file mode 100644
index 1b093977b6..0000000000
--- a/source/lib/src/rocm/CMakeLists.txt
+++ /dev/null
@@ -1,39 +0,0 @@
-# required cmake version
-cmake_minimum_required(VERSION 3.21)
-# project name
-project(deepmd_op_rocm)
-set(CMAKE_LINK_WHAT_YOU_USE TRUE)
-
-# set c++ version c++11
-set(CMAKE_CXX_STANDARD 14)
-set(CMAKE_HIP_STANDARD 14)
-add_definitions("-DCUB_IGNORE_DEPRECATED_CPP_DIALECT")
-add_definitions("-DCUB_IGNORE_DEPRECATED_CPP_DIALECT")
-
-message(STATUS "HIP major version is " ${HIP_VERSION_MAJOR})
-
-set(HIP_HIPCC_FLAGS -fno-gpu-rdc; -fPIC --std=c++14 ${HIP_HIPCC_FLAGS}
-)# --amdgpu-target=gfx906
-if(HIP_VERSION VERSION_LESS 3.5.1)
-  set(HIP_HIPCC_FLAGS -hc; ${HIP_HIPCC_FLAGS})
-endif()
-
-file(GLOB SOURCE_FILES "*.hip.cu")
-
-hip_add_library(deepmd_op_rocm SHARED ${SOURCE_FILES})
-target_include_directories(
-  deepmd_op_rocm
-  PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../../include/>
-         $<INSTALL_INTERFACE:include>)
-target_precompile_headers(deepmd_op_rocm PUBLIC [["device.h"]])
-
-install(TARGETS deepmd_op_rocm DESTINATION lib/)
-if(BUILD_CPP_IF)
-  install(
-    TARGETS deepmd_op_rocm
-    EXPORT ${CMAKE_PROJECT_NAME}Targets
-    DESTINATION lib/)
-endif(BUILD_CPP_IF)
-if(BUILD_PY_IF)
-  install(TARGETS deepmd_op_rocm DESTINATION deepmd/lib/)
-endif(BUILD_PY_IF)
diff --git a/source/lib/src/rocm/coord.hip.cu b/source/lib/src/rocm/coord.hip.cu
deleted file mode 100644
index 5416022575..0000000000
--- a/source/lib/src/rocm/coord.hip.cu
+++ /dev/null
@@ -1,444 +0,0 @@
-#include "coord.h"
-#include "device.h"
-#include "region.cuh"
-
-__device__ inline int collapse_index(const int *idx, const int *size) {
-  return (idx[0] * size[1] + idx[1]) * size[2] + idx[2];
-}
-__device__ inline void index_recover(const int in_idx,
-                                     const int *size,
-                                     int *idx) {
-  idx[2] = in_idx % size[2];
-  idx[1] = int(in_idx / size[2]) % size[1];
-  idx[0] = int(int(in_idx / size[2]) / size[1]);
-}
-__device__ inline void idx_addshift(int *idx, const int *shift) {
-  for (int dd = 0; dd < 3; dd++) {
-    idx[dd] += shift[dd];
-  }
-}
-__device__ inline void idx_unshift(int *idx, const int *shift) {
-  for (int dd = 0; dd < 3; dd++) {
-    idx[dd] -= shift[dd];
-  }
-}
-__device__ inline int compute_pbc_shift(int idx, int ncell) {
-  int shift = 0;
-  if (idx < 0) {
-    shift = 1;
-    while (idx + shift * ncell < 0) {
-      shift++;
-    }
-  } else if (idx >= ncell) {
-    shift = -1;
-    while (idx + shift * ncell >= ncell) {
-      shift--;
-    }
-  }
-  return shift;
-}
-
-__device__ inline double _fmod(double x, double y) { return fmod(x, y); }
-__device__ inline float _fmod(float x, float y) { return fmodf(x, y); }
-
-template <typename FPTYPE>
-__global__ void normalize_one(FPTYPE *out_c,
-                              const FPTYPE *boxt,
-                              const FPTYPE *rec_boxt,
-                              const int nall) {
-  // <<<nall/TPB, TPB>>>
-  int idy = blockIdx.x * blockDim.x + threadIdx.x;
-  if (idy >= nall) {
-    return;
-  }
-  FPTYPE inter[3];
-  phys2Inter(inter, out_c + idy * 3, rec_boxt);
-  for (int dd = 0; dd < 3; ++dd) {
-    inter[dd] = _fmod(inter[dd], (FPTYPE)1.);
-    if (inter[dd] < (FPTYPE)0.) {
-      inter[dd] += (FPTYPE)1.;
-    }
-  }
-  inter2Phys(out_c + idy * 3, inter, boxt);
-}
-
-template <typename FPTYPE>
-__global__ void _fill_idx_cellmap(int *idx_cellmap,
-                                  int *idx_cellmap_noshift,
-                                  const FPTYPE *in_c,
-                                  const FPTYPE *rec_boxt,
-                                  const int *nat_stt,
-                                  const int *nat_end,
-                                  const int *ext_stt,
-                                  const int *ext_end,
-                                  const int nloc) {
-  int idy = blockIdx.x * blockDim.x + threadIdx.x;
-  int ext_ncell[3];
-  int global_grid[3];
-  int idx_orig_shift[3];
-  FPTYPE cell_size[3];
-  FPTYPE nat_orig[3];
-  for (int dd = 0; dd < 3; ++dd) {
-    ext_ncell[dd] = ext_end[dd] - ext_stt[dd];
-    global_grid[dd] = nat_end[dd] - nat_stt[dd];
-    idx_orig_shift[dd] = nat_stt[dd] - ext_stt[dd];
-    cell_size[dd] = (FPTYPE)1. / global_grid[dd];
-    nat_orig[dd] = nat_stt[dd] * cell_size[dd];
-  }
-  if (idy < nloc) {
-    int idx_noshift[3];
-    int idx[3];
-    FPTYPE inter[3];
-    phys2Inter(inter, in_c + idy * 3, rec_boxt);
-    for (int dd = 0; dd < 3; ++dd) {
-      idx_noshift[dd] = (inter[dd] - nat_orig[dd]) / cell_size[dd];
-      if (inter[dd] - nat_orig[dd] < 0.) {
-        idx_noshift[dd]--;
-      }
-      if (idx_noshift[dd] < nat_stt[dd]) {
-        idx_noshift[dd] = nat_stt[dd];
-      } else if (idx_noshift[dd] >= nat_end[dd]) {
-        idx_noshift[dd] = nat_end[dd] - 1;
-      }
-      idx[dd] = idx_noshift[dd] + idx_orig_shift[dd];
-    }
-    idx_cellmap_noshift[idy] = collapse_index(idx_noshift, global_grid);
-    idx_cellmap[idy] = collapse_index(idx, ext_ncell);
-  }
-}
-
-__global__ void _fill_loc_cellnum_map(int *temp_idx_order,
-                                      int *loc_cellnum_map,
-                                      const int *idx_cellmap_noshift,
-                                      const int nloc,
-                                      const int loc_cellnum) {
-  int idy = blockIdx.x * blockDim.x + threadIdx.x;
-  if (idy < loc_cellnum) {
-    int num = 0;
-    for (int ii = 0; ii < nloc; ii++) {
-      if (idx_cellmap_noshift[ii] == idy) {
-        temp_idx_order[ii] = num;
-        num++;
-      }
-    }
-    loc_cellnum_map[idy] = num;
-  }
-}
-
-__global__ void _fill_total_cellnum_map(int *total_cellnum_map,
-                                        int *mask_cellnum_map,
-                                        int *cell_map,
-                                        int *cell_shift_map,
-                                        const int *nat_stt,
-                                        const int *nat_end,
-                                        const int *ext_stt,
-                                        const int *ext_end,
-                                        const int *loc_cellnum_map,
-                                        const int total_cellnum) {
-  int idy = blockIdx.x * blockDim.x + threadIdx.x;
-  int ext_ncell[3];
-  int global_grid[3];
-  int idx_orig_shift[3];
-  for (int dd = 0; dd < 3; ++dd) {
-    ext_ncell[dd] = ext_end[dd] - ext_stt[dd];
-    global_grid[dd] = nat_end[dd] - nat_stt[dd];
-    idx_orig_shift[dd] = nat_stt[dd] - ext_stt[dd];
-  }
-  if (idy < total_cellnum) {
-    int *shift = cell_shift_map + idy * 3;
-    int idx[3];
-    index_recover(idy, ext_ncell, idx);
-    idx_unshift(idx, idx_orig_shift);
-    shift[0] = compute_pbc_shift(idx[0], global_grid[0]);
-    shift[1] = compute_pbc_shift(idx[1], global_grid[1]);
-    shift[2] = compute_pbc_shift(idx[2], global_grid[2]);
-    bool loc = false;
-    if (shift[0] == 0 && shift[1] == 0 && shift[2] == 0) {
-      loc = true;
-    }
-    for (int dd = 0; dd < 3; dd++) {
-      idx[dd] += shift[dd] * global_grid[dd];
-    }
-    int orig_idy = collapse_index(idx, global_grid);
-    mask_cellnum_map[idy] = loc_cellnum_map[orig_idy];
-    total_cellnum_map[idy] = mask_cellnum_map[idy];
-    if (loc) {
-      mask_cellnum_map[idy] = 0;
-    }
-    cell_map[idy] = orig_idy;
-  }
-}
-
-__global__ void _build_loc_clist(int *clist,
-                                 const int *idx_cellmap,
-                                 const int *idx_order,
-                                 const int *sec_num_map,
-                                 const int nloc) {
-  int idy = blockIdx.x * blockDim.x + threadIdx.x;
-  if (idy >= nloc) {
-    return;
-  }
-  int cell_idx = idx_cellmap[idy];
-  int *clist_row = clist + sec_num_map[cell_idx];
-  clist_row[idx_order[idy]] = idy;
-}
-
-template <typename FPTYPE>
-__global__ void _copy_coord(FPTYPE *out_c,
-                            int *out_t,
-                            int *mapping,
-                            const FPTYPE *in_c,
-                            const int *in_t,
-                            const int *cell_map,
-                            const int *cell_shift_map,
-                            const int *sec_loc_cellnum_map,
-                            const int *sec_total_cellnum_map,
-                            const int *loc_clist,
-                            const int nloc,
-                            const int nall,
-                            const int total_cellnum,
-                            const FPTYPE *boxt,
-                            const FPTYPE *rec_boxt) {
-  int idy = blockIdx.x * blockDim.x + threadIdx.x;
-  if (idy >= nall) {
-    return;
-  }
-  if (idy < nloc) {
-    mapping[idy] = idy;
-    out_t[idy] = in_t[idy];
-    for (int dd = 0; dd < 3; dd++) {
-      out_c[idy * 3 + dd] = in_c[idy * 3 + dd];
-    }
-  } else {
-    int cell_idx = 0;
-    int atom_idx = 0;
-    int orig_cell_idx = 0;
-    int orig_idy = 0;
-    int shift[3];
-    FPTYPE d_shift[3];
-    for (int ii = 0; ii < total_cellnum; ii++) {
-      if (idy >= sec_total_cellnum_map[ii + 1]) {
-        cell_idx++;
-      } else {
-        break;
-      }
-    }
-    for (int dd = 0; dd < 3; dd++) {
-      shift[dd] = cell_shift_map[cell_idx * 3 + dd];
-      d_shift[dd] = shift[dd];
-    }
-    atom_idx = idy - sec_total_cellnum_map[cell_idx];
-    orig_cell_idx = cell_map[cell_idx];
-    orig_idy = loc_clist[sec_loc_cellnum_map[orig_cell_idx] + atom_idx];
-    mapping[idy] = orig_idy;
-    out_t[idy] = in_t[orig_idy];
-    FPTYPE shift_v[3];
-    inter2Phys(shift_v, d_shift, boxt);
-    for (int dd = 0; dd < 3; dd++) {
-      out_c[idy * 3 + dd] = in_c[orig_idy * 3 + dd] - shift_v[dd];
-    }
-  }
-}
-
-template <typename FPTYPE>
-void compute_int_data(int *int_data,
-                      const FPTYPE *in_c,
-                      const int *cell_info,
-                      const deepmd::Region<FPTYPE> &region,
-                      const int nloc,
-                      const int loc_cellnum,
-                      const int total_cellnum) {
-  int *idx_cellmap = int_data;
-  int *idx_cellmap_noshift = idx_cellmap + nloc;
-  int *temp_idx_order = idx_cellmap_noshift + nloc;
-  int *loc_cellnum_map = temp_idx_order + nloc;
-  int *total_cellnum_map = loc_cellnum_map + loc_cellnum;
-  int *mask_cellnum_map = total_cellnum_map + total_cellnum;
-  int *cell_map = mask_cellnum_map + total_cellnum;
-  int *cell_shift_map = cell_map + total_cellnum;
-  const int *nat_stt = cell_info;
-  const int *nat_end = cell_info + 3;
-  const int *ext_stt = cell_info + 6;
-  const int *ext_end = cell_info + 9;
-  const FPTYPE *rec_boxt = region.rec_boxt;
-
-  const int nblock_loc = (nloc + TPB - 1) / TPB;
-  hipLaunchKernelGGL(_fill_idx_cellmap, nblock_loc, TPB, 0, 0, idx_cellmap,
-                     idx_cellmap_noshift, in_c, rec_boxt, nat_stt, nat_end,
-                     ext_stt, ext_end, nloc);
-  DPErrcheck(hipGetLastError());
-  DPErrcheck(hipDeviceSynchronize());
-
-  const int nblock_loc_cellnum = (loc_cellnum + TPB - 1) / TPB;
-  hipLaunchKernelGGL(_fill_loc_cellnum_map, nblock_loc_cellnum, TPB, 0, 0,
-                     temp_idx_order, loc_cellnum_map, idx_cellmap_noshift, nloc,
-                     loc_cellnum);
-  DPErrcheck(hipGetLastError());
-  DPErrcheck(hipDeviceSynchronize());
-
-  const int nblock_total_cellnum = (total_cellnum + TPB - 1) / TPB;
-  hipLaunchKernelGGL(_fill_total_cellnum_map, nblock_total_cellnum, TPB, 0, 0,
-                     total_cellnum_map, mask_cellnum_map, cell_map,
-                     cell_shift_map, nat_stt, nat_end, ext_stt, ext_end,
-                     loc_cellnum_map, total_cellnum);
-  DPErrcheck(hipGetLastError());
-  DPErrcheck(hipDeviceSynchronize());
-}
-
-void build_loc_clist(int *int_data,
-                     const int nloc,
-                     const int loc_cellnum,
-                     const int total_cellnum) {
-  const int nblock = (nloc + TPB - 1) / TPB;
-  const int *idx_cellmap_noshift = int_data + nloc;
-  const int *temp_idx_order = idx_cellmap_noshift + nloc;
-  const int *sec_loc_cellnum_map = temp_idx_order + nloc + loc_cellnum +
-                                   2 * total_cellnum + total_cellnum +
-                                   3 * total_cellnum;
-  int *loc_clist = int_data + nloc * 3 + loc_cellnum + total_cellnum * 3 +
-                   total_cellnum * 3 + loc_cellnum + 1 + total_cellnum + 1;
-  hipLaunchKernelGGL(_build_loc_clist, nblock, TPB, 0, 0, loc_clist,
-                     idx_cellmap_noshift, temp_idx_order, sec_loc_cellnum_map,
-                     nloc);
-  DPErrcheck(hipGetLastError());
-  DPErrcheck(hipDeviceSynchronize());
-}
-
-template <typename FPTYPE>
-void copy_coord(FPTYPE *out_c,
-                int *out_t,
-                int *mapping,
-                const int *int_data,
-                const FPTYPE *in_c,
-                const int *in_t,
-                const int nloc,
-                const int nall,
-                const int loc_cellnum,
-                const int total_cellnum,
-                const deepmd::Region<FPTYPE> &region) {
-  const int nblock = (nall + TPB - 1) / TPB;
-  const int *cell_map = int_data + 3 * nloc + loc_cellnum + 2 * total_cellnum;
-  const int *cell_shift_map = cell_map + total_cellnum;
-  const int *sec_loc_cellnum_map = cell_shift_map + 3 * total_cellnum;
-  const int *sec_total_cellnum_map = sec_loc_cellnum_map + loc_cellnum + 1;
-  const int *loc_clist = sec_total_cellnum_map + total_cellnum + 1;
-
-  const FPTYPE *boxt = region.boxt;
-  const FPTYPE *rec_boxt = region.rec_boxt;
-  hipLaunchKernelGGL(_copy_coord, nblock, TPB, 0, 0, out_c, out_t, mapping,
-                     in_c, in_t, cell_map, cell_shift_map, sec_loc_cellnum_map,
-                     sec_total_cellnum_map, loc_clist, nloc, nall,
-                     total_cellnum, boxt, rec_boxt);
-  DPErrcheck(hipGetLastError());
-  DPErrcheck(hipDeviceSynchronize());
-}
-
-namespace deepmd {
-template <typename FPTYPE>
-void normalize_coord_gpu(FPTYPE *coord,
-                         const int natom,
-                         const Region<FPTYPE> &region) {
-  const FPTYPE *boxt = region.boxt;
-  const FPTYPE *rec_boxt = region.rec_boxt;
-  const int nblock = (natom + TPB - 1) / TPB;
-  hipLaunchKernelGGL(normalize_one, nblock, TPB, 0, 0, coord, boxt, rec_boxt,
-                     natom);
-  DPErrcheck(hipGetLastError());
-  DPErrcheck(hipDeviceSynchronize());
-}
-
-template <typename FPTYPE>
-int copy_coord_gpu(FPTYPE *out_c,
-                   int *out_t,
-                   int *mapping,
-                   int *nall,
-                   int *int_data,
-                   const FPTYPE *in_c,
-                   const int *in_t,
-                   const int &nloc,
-                   const int &mem_nall,
-                   const int &loc_cellnum,
-                   const int &total_cellnum,
-                   const int *cell_info,
-                   const Region<FPTYPE> &region) {
-  compute_int_data(int_data, in_c, cell_info, region, nloc, loc_cellnum,
-                   total_cellnum);
-  int *int_data_cpu = new int
-      [loc_cellnum + 2 * total_cellnum + loc_cellnum + 1 + total_cellnum +
-       1];  // loc_cellnum_map,total_cellnum_map,mask_cellnum_map,sec_loc_cellnum_map,sec_total_cellnum_map
-  DPErrcheck(hipMemcpy(int_data_cpu, int_data + 3 * nloc,
-                       sizeof(int) * (loc_cellnum + 2 * total_cellnum),
-                       hipMemcpyDeviceToHost));
-  DPErrcheck(hipGetLastError());
-  DPErrcheck(hipDeviceSynchronize());
-  int *loc_cellnum_map = int_data_cpu;
-  int *total_cellnum_map = loc_cellnum_map + loc_cellnum;
-  int *mask_cellnum_map = total_cellnum_map + total_cellnum;
-  int *sec_loc_cellnum_map = mask_cellnum_map + total_cellnum;
-  int *sec_total_cellnum_map = sec_loc_cellnum_map + loc_cellnum + 1;
-  sec_loc_cellnum_map[0] = 0;
-  sec_total_cellnum_map[0] = nloc;
-  int max_cell = 0;
-  for (int iii = 0; iii < total_cellnum; iii++) {
-    if (max_cell < total_cellnum_map[iii]) {
-      max_cell = total_cellnum_map[iii];
-    }
-    if (iii < loc_cellnum) {
-      sec_loc_cellnum_map[iii + 1] =
-          sec_loc_cellnum_map[iii] + loc_cellnum_map[iii];
-    }
-    sec_total_cellnum_map[iii + 1] =
-        sec_total_cellnum_map[iii] + mask_cellnum_map[iii];
-  }
-  *nall = sec_total_cellnum_map[total_cellnum];
-  if (*nall > mem_nall) {
-    delete[] int_data_cpu;
-    // size of the output arrays is not large enough
-    return 1;
-  } else {
-    DPErrcheck(hipMemcpy(int_data + nloc * 3 + loc_cellnum + total_cellnum * 3 +
-                             total_cellnum * 3,
-                         sec_loc_cellnum_map,
-                         sizeof(int) * (loc_cellnum + 1 + total_cellnum + 1),
-                         hipMemcpyHostToDevice));
-    delete[] int_data_cpu;
-    build_loc_clist(int_data, nloc, loc_cellnum, total_cellnum);
-    copy_coord(out_c, out_t, mapping, int_data, in_c, in_t, nloc, *nall,
-               loc_cellnum, total_cellnum, region);
-  }
-  return 0;
-}
-
-template void normalize_coord_gpu<float>(float *coord,
-                                         const int natom,
-                                         const Region<float> &region);
-template void normalize_coord_gpu<double>(double *coord,
-                                          const int natom,
-                                          const Region<double> &region);
-template int copy_coord_gpu<float>(float *out_c,
-                                   int *out_t,
-                                   int *mapping,
-                                   int *nall,
-                                   int *int_data,
-                                   const float *in_c,
-                                   const int *in_t,
-                                   const int &nloc,
-                                   const int &mem_nall,
-                                   const int &loc_cellnum,
-                                   const int &total_cellnum,
-                                   const int *cell_info,
-                                   const Region<float> &region);
-template int copy_coord_gpu<double>(double *out_c,
-                                    int *out_t,
-                                    int *mapping,
-                                    int *nall,
-                                    int *int_data,
-                                    const double *in_c,
-                                    const int *in_t,
-                                    const int &nloc,
-                                    const int &mem_nall,
-                                    const int &loc_cellnum,
-                                    const int &total_cellnum,
-                                    const int *cell_info,
-                                    const Region<double> &region);
-}  // namespace deepmd
diff --git a/source/lib/src/rocm/gelu.hip.cu b/source/lib/src/rocm/gelu.hip.cu
deleted file mode 100644
index 76657eea52..0000000000
--- a/source/lib/src/rocm/gelu.hip.cu
+++ /dev/null
@@ -1,134 +0,0 @@
-#include "device.h"
-#include "gelu.h"
-
-__device__ inline double _tanh(double x) { return tanh(x); }
-__device__ inline float _tanh(float x) { return tanhf(x); }
-
-template <typename FPTYPE>
-__global__ void gelu(FPTYPE* out, const FPTYPE* xx, const int_64 size) {
-  const int_64 idx = int_64(blockIdx.x) * blockDim.x + threadIdx.x;
-  if (idx >= size) {
-    return;
-  }
-  out[idx] = xx[idx] * (FPTYPE)0.5 *
-             ((FPTYPE)1.0 +
-              _tanh((FPTYPE)SQRT_2_PI * (xx[idx] + (FPTYPE)0.044715 * xx[idx] *
-                                                       xx[idx] * xx[idx])));
-}
-
-template <typename FPTYPE>
-__global__ void gelu_grad(FPTYPE* out,
-                          const FPTYPE* xx,
-                          const FPTYPE* dy,
-                          const int_64 size) {
-  const int_64 idx = int_64(blockIdx.x) * blockDim.x + threadIdx.x;
-  if (idx >= size) {
-    return;
-  }
-  // out[idx] = xx[idx] * 0.5 * (1.0 + tanh(SQRT_2_PI * (xx[idx] + 0.044715 *
-  // xx[idx] * xx[idx] *xx[idx])));
-  const FPTYPE var =
-      _tanh((FPTYPE)SQRT_2_PI *
-            (xx[idx] + (FPTYPE)0.044715 * xx[idx] * xx[idx] * xx[idx]));
-  out[idx] =
-      dy[idx] *
-      ((FPTYPE)0.5 * (FPTYPE)SQRT_2_PI * xx[idx] * ((FPTYPE)1. - var * var) *
-           ((FPTYPE)0.134145 * xx[idx] * xx[idx] + (FPTYPE)1.) +
-       (FPTYPE)0.5 * var + (FPTYPE)0.5);
-}
-
-template <typename FPTYPE>
-__global__ void gelu_grad_grad(FPTYPE* out,
-                               const FPTYPE* xx,
-                               const FPTYPE* dy,
-                               const FPTYPE* dy_2,
-                               const int_64 size) {
-  const int_64 idx = int_64(blockIdx.x) * blockDim.x + threadIdx.x;
-  if (idx >= size) {
-    return;
-  }
-  // out[idx] = xx[idx] * 0.5 * (1.0 + tanh(SQRT_2_PI * (xx[idx] + 0.044715 *
-  // xx[idx] * xx[idx] *xx[idx])));
-  const FPTYPE var1 =
-      _tanh((FPTYPE)SQRT_2_PI *
-            (xx[idx] + (FPTYPE)0.044715 * xx[idx] * xx[idx] * xx[idx]));
-  const FPTYPE var2 = (FPTYPE)SQRT_2_PI * ((FPTYPE)1. - var1 * var1) *
-                      ((FPTYPE)0.134145 * xx[idx] * xx[idx] + (FPTYPE)1.);
-  out[idx] = dy[idx] * dy_2[idx] *
-             ((FPTYPE)0.134145 * (FPTYPE)SQRT_2_PI * xx[idx] * xx[idx] *
-                  ((FPTYPE)1. - var1 * var1) -
-              (FPTYPE)SQRT_2_PI * xx[idx] * var2 *
-                  ((FPTYPE)0.134145 * xx[idx] * xx[idx] + (FPTYPE)1.) * var1 +
-              var2);
-}
-
-namespace deepmd {
-template <typename FPTYPE>
-void gelu_gpu(FPTYPE* out, const FPTYPE* xx, const int_64 size) {
-  if (size <= 0) {
-    return;
-  }
-  const int THREAD_ITEMS = 1024;
-  const int BLOCK_NUMS = (size + THREAD_ITEMS - 1) / THREAD_ITEMS;
-
-  hipLaunchKernelGGL(gelu, BLOCK_NUMS, THREAD_ITEMS, 0, 0, out, xx, size);
-  DPErrcheck(hipGetLastError());
-  DPErrcheck(hipDeviceSynchronize());
-}
-
-template <typename FPTYPE>
-void gelu_grad_gpu(FPTYPE* out,
-                   const FPTYPE* xx,
-                   const FPTYPE* dy,
-                   const int_64 size) {
-  if (size <= 0) {
-    return;
-  }
-  const int THREAD_ITEMS = 1024;
-  const int BLOCK_NUMS = (size + THREAD_ITEMS - 1) / THREAD_ITEMS;
-
-  hipLaunchKernelGGL(gelu_grad, BLOCK_NUMS, THREAD_ITEMS, 0, 0, out, xx, dy,
-                     size);
-  DPErrcheck(hipGetLastError());
-  DPErrcheck(hipDeviceSynchronize());
-}
-
-template <typename FPTYPE>
-void gelu_grad_grad_gpu(FPTYPE* out,
-                        const FPTYPE* xx,
-                        const FPTYPE* dy,
-                        const FPTYPE* dy_2,
-                        const int_64 size) {
-  if (size <= 0) {
-    return;
-  }
-  const int THREAD_ITEMS = 1024;
-  const int BLOCK_NUMS = (size + THREAD_ITEMS - 1) / THREAD_ITEMS;
-
-  hipLaunchKernelGGL(gelu_grad_grad, BLOCK_NUMS, THREAD_ITEMS, 0, 0, out, xx,
-                     dy, dy_2, size);
-  DPErrcheck(hipGetLastError());
-  DPErrcheck(hipDeviceSynchronize());
-}
-
-template void gelu_gpu<float>(float* out, const float* x, const int_64 size);
-template void gelu_gpu<double>(double* out, const double* x, const int_64 size);
-template void gelu_grad_gpu<float>(float* out,
-                                   const float* x,
-                                   const float* dy,
-                                   const int_64 size);
-template void gelu_grad_gpu<double>(double* out,
-                                    const double* x,
-                                    const double* dy,
-                                    const int_64 size);
-template void gelu_grad_grad_gpu<float>(float* out,
-                                        const float* x,
-                                        const float* dy,
-                                        const float* dy_2,
-                                        const int_64 size);
-template void gelu_grad_grad_gpu<double>(double* out,
-                                         const double* x,
-                                         const double* dy,
-                                         const double* dy_2,
-                                         const int_64 size);
-}  // namespace deepmd
diff --git a/source/lib/src/rocm/neighbor_list.hip.cu b/source/lib/src/rocm/neighbor_list.hip.cu
deleted file mode 100644
index 736f2f9e9a..0000000000
--- a/source/lib/src/rocm/neighbor_list.hip.cu
+++ /dev/null
@@ -1,296 +0,0 @@
-#include "device.h"
-#include "hipcub/hipcub.hpp"
-#include "neighbor_list.h"
-// A stateful callback functor that maintains a running prefix to be applied
-// during consecutive scan operations.
-struct parallel_prefix_scan_op {
-  // Running prefix
-  int running_total;
-  // Constructor
-  __device__ parallel_prefix_scan_op(int running_total)
-      : running_total(running_total) {}
-  // Callback operator to be entered by the first warp of threads in the block.
-  // Thread-0 is responsible for returning a value for seeding the block-wide
-  // scan.
-  __device__ int operator()(int block_aggregate) {
-    int old_prefix = running_total;
-    running_total += block_aggregate;
-    return old_prefix;
-  }
-};
-
-template <int THREADS_PER_BLOCK>
-__global__ void parallel_prefix_scan(int *numneigh,
-                                     int *nei_order,
-                                     const int *temp_nlist,
-                                     const int mem_size,
-                                     const int nloc,
-                                     const int nall) {
-  // Specialize BlockLoad, BlockStore, and BlockScan for a 1D block of 128
-  // threads, 4 ints per thread
-  typedef hipcub::BlockScan<int, THREADS_PER_BLOCK> BlockScan;
-  // Allocate aliased shared memory for BlockLoad, BlockStore, and BlockScan
-  __shared__ typename BlockScan::TempStorage temp_storage;
-
-  // Initialize running total
-  parallel_prefix_scan_op prefix_op(0);
-
-  // Have the block iterate over segments of items
-  for (int ii = threadIdx.x; ii < nall; ii += THREADS_PER_BLOCK) {
-    int block_offset = blockIdx.x * mem_size;
-    // Load a segment of consecutive items that are blocked across threads
-    int i_data = temp_nlist[block_offset + ii];
-    int o_data = i_data == -1 ? 0 : 1;
-
-    // Collectively compute the block-wide exclusive prefix sum
-    BlockScan(temp_storage).ExclusiveSum(o_data, o_data, prefix_op);
-
-    __syncthreads();
-    // Store scanned items to output segment
-    if (i_data != -1) {
-      nei_order[block_offset + ii] = o_data;
-    }
-    // Store numneigh into the output array
-    if (ii == nall - 1) {
-      o_data += i_data == -1 ? 0 : 1;
-      numneigh[blockIdx.x] = o_data;
-    }
-  }
-}
-
-template <typename FPTYPE>
-__device__ inline FPTYPE dev_dot(FPTYPE *arr1, FPTYPE *arr2) {
-  return arr1[0] * arr2[0] + arr1[1] * arr2[1] + arr1[2] * arr2[2];
-}
-
-template <typename FPTYPE>
-__global__ void build_nlist(int *ilist,
-                            int *temp_nlist,
-                            const FPTYPE *c_cpy,
-                            const FPTYPE rcut2,
-                            const int nloc,
-                            const int nall,
-                            const int mem_size) {
-  const unsigned int atom_idx = blockIdx.x;
-  const unsigned int neighbor_idx = blockIdx.y * blockDim.y + threadIdx.y;
-  if (neighbor_idx < nall) {
-    int *neighbor_row = temp_nlist + atom_idx * mem_size;
-    if (neighbor_idx == atom_idx) {
-      ilist[atom_idx] = atom_idx;
-    } else {
-      const FPTYPE *ccoord = c_cpy + atom_idx * 3;
-      const FPTYPE *ncoord = c_cpy + neighbor_idx * 3;
-      FPTYPE diff[3];
-      for (int kk = 0; kk < 3; kk++) {
-        diff[kk] = ccoord[kk] - ncoord[kk];
-      }
-      FPTYPE r2 = dev_dot(diff, diff);
-      if (r2 < rcut2) {
-        neighbor_row[neighbor_idx] = neighbor_idx;
-      }
-    }
-  }
-}
-
-__global__ void fill_nlist(int **firstneigh,
-                           const int *temp_nlist,
-                           const int *nei_order,
-                           const int mem_size,
-                           const int nall) {
-  const unsigned int atom_idx = blockIdx.x;
-  const unsigned int neighbor_idx = blockIdx.y * blockDim.y + threadIdx.y;
-  if (neighbor_idx < nall) {
-    const int *in_row = temp_nlist + atom_idx * mem_size;
-    int *out_row = firstneigh[atom_idx];
-    int nei = in_row[neighbor_idx];
-    if (nei != -1) {
-      out_row[nei_order[atom_idx * mem_size + neighbor_idx]] = nei;
-    }
-  }
-}
-
-__global__ void map_nlist(int *nlist,
-                          const int *nlist_map,
-                          const int nloc,
-                          const int nnei) {
-  int atom_idx = blockIdx.x;
-  int nei_idx = blockIdx.y * blockDim.y + threadIdx.y;
-  if (nei_idx >= nnei) {
-    return;
-  }
-  int nlist_idx = atom_idx * nnei + nei_idx;
-  int nlist_item = nlist[nlist_idx];
-  if (nlist_item != -1) {
-    nlist[nlist_idx] = nlist_map[nlist_item];
-  }
-}
-
-__global__ void map_nei_info(int *nlist,
-                             int *ntype,
-                             bool *nmask,
-                             const int *type,
-                             const int *nlist_map,
-                             const int nloc,
-                             const int nnei,
-                             const int ntypes) {
-  int atom_idx = blockIdx.x;
-  int nei_idx = blockIdx.y * blockDim.y + threadIdx.y;
-  if (nei_idx >= nnei) {
-    return;
-  }
-  int nlist_idx = atom_idx * nnei + nei_idx;
-  int nlist_item = nlist[nlist_idx];
-  int temp = 0;
-  if (nlist_item != -1) {
-    temp = nlist_map[nlist_item];
-    nlist[nlist_idx] = temp;
-    ntype[nlist_idx] = type[temp];
-    nmask[nlist_idx] = true;
-  } else {
-    ntype[nlist_idx] = ntypes;
-  }
-}
-
-__global__ void map_nei_info_noconvert(int *nlist,
-                                       int *ntype,
-                                       bool *nmask,
-                                       const int *type,
-                                       const int nloc,
-                                       const int nnei,
-                                       const int ntypes) {
-  int atom_idx = blockIdx.x;
-  int nei_idx = blockIdx.y * blockDim.y + threadIdx.y;
-  if (nei_idx >= nnei) {
-    return;
-  }
-  int nlist_idx = atom_idx * nnei + nei_idx;
-  int nlist_item = nlist[nlist_idx];
-  if (nlist_item != -1) {
-    ntype[nlist_idx] = type[nlist_item];
-    nmask[nlist_idx] = true;
-  } else {
-    ntype[nlist_idx] = ntypes;
-  }
-}
-
-namespace deepmd {
-template <typename FPTYPE>
-int build_nlist_gpu(InputNlist &nlist,
-                    int *max_list_size,
-                    int *nlist_data,
-                    const FPTYPE *c_cpy,
-                    const int &nloc,
-                    const int &nall,
-                    const int &mem_size,
-                    const float &rcut) {
-  if (mem_size < nall) {
-    return 1;
-  }
-  const int nblock = (nall + TPB - 1) / TPB;
-  int *ilist = nlist.ilist;
-  int *numneigh = nlist.numneigh;
-  int **firstneigh = nlist.firstneigh;
-  DPErrcheck(hipMemset(nlist_data, -1, sizeof(int) * 2 * nloc * mem_size));
-  int *temp_nlist = nlist_data;  // nloc*mem_size
-  int *nei_order = temp_nlist + nloc * mem_size;
-  nlist.inum = nloc;
-  FPTYPE rcut2 = rcut * rcut;
-
-  dim3 block_grid(nloc, nblock);
-  dim3 thread_grid(1, TPB);
-  hipLaunchKernelGGL(build_nlist, block_grid, thread_grid, 0, 0, ilist,
-                     temp_nlist, c_cpy, rcut2, nloc, nall, mem_size);
-  DPErrcheck(hipGetLastError());
-  DPErrcheck(hipDeviceSynchronize());
-  hipLaunchKernelGGL(HIP_KERNEL_NAME(parallel_prefix_scan<TPB>), nloc, TPB, 0,
-                     0, numneigh, nei_order, temp_nlist, mem_size, nloc, nall);
-  DPErrcheck(hipGetLastError());
-  DPErrcheck(hipDeviceSynchronize());
-  hipLaunchKernelGGL(fill_nlist, block_grid, thread_grid, 0, 0, firstneigh,
-                     temp_nlist, nei_order, mem_size, nall);
-  DPErrcheck(hipGetLastError());
-  DPErrcheck(hipDeviceSynchronize());
-  int *numneigh_host = new int[nloc];
-  DPErrcheck(hipMemcpy(numneigh_host, numneigh, sizeof(int) * nloc,
-                       hipMemcpyDeviceToHost));
-  int max_nei = 0;
-  for (int ii = 0; ii < nloc; ii++) {
-    if (numneigh_host[ii] > max_nei) {
-      max_nei = numneigh_host[ii];
-    }
-  }
-  *max_list_size = max_nei;
-  delete[] numneigh_host;
-  return 0;
-}
-
-void use_nlist_map(int *nlist,
-                   const int *nlist_map,
-                   const int nloc,
-                   const int nnei) {
-  int nblock = (nnei + TPB - 1) / TPB;
-  dim3 block_grid(nloc, nblock);
-  dim3 thread_grid(1, TPB);
-  hipLaunchKernelGGL(map_nlist, block_grid, thread_grid, 0, 0, nlist, nlist_map,
-                     nloc, nnei);
-  DPErrcheck(hipGetLastError());
-  DPErrcheck(hipDeviceSynchronize());
-}
-
-void use_nei_info_gpu(int *nlist,
-                      int *ntype,
-                      bool *nmask,
-                      const int *type,
-                      const int *nlist_map,
-                      const int nloc,
-                      const int nnei,
-                      const int ntypes,
-                      const bool b_nlist_map) {
-  int nblock = (nnei + TPB - 1) / TPB;
-  dim3 block_grid(nloc, nblock);
-  dim3 thread_grid(1, TPB);
-  DPErrcheck(hipMemset(ntype, 0, sizeof(int) * nloc * nnei));
-  DPErrcheck(hipMemset(nmask, 0, sizeof(bool) * nloc * nnei));
-  if (b_nlist_map) {
-    hipLaunchKernelGGL(map_nei_info, block_grid, thread_grid, 0, 0, nlist,
-                       ntype, nmask, type, nlist_map, nloc, nnei, ntypes);
-  } else {
-    hipLaunchKernelGGL(map_nei_info_noconvert, block_grid, thread_grid, 0, 0,
-                       nlist, ntype, nmask, type, nloc, nnei, ntypes);
-  }
-  DPErrcheck(hipGetLastError());
-  DPErrcheck(hipDeviceSynchronize());
-}
-
-template int build_nlist_gpu<float>(InputNlist &nlist,
-                                    int *max_list_size,
-                                    int *nlist_data,
-                                    const float *c_cpy,
-                                    const int &nloc,
-                                    const int &nall,
-                                    const int &mem_size,
-                                    const float &rcut);
-template int build_nlist_gpu<double>(InputNlist &nlist,
-                                     int *max_list_size,
-                                     int *nlist_data,
-                                     const double *c_cpy,
-                                     const int &nloc,
-                                     const int &nall,
-                                     const int &mem_size,
-                                     const float &rcut);
-__global__ void map_filter_ftype(int *ftype_out,
-                                 const int *ftype_in,
-                                 const int nloc) {
-  int ii = blockIdx.x * blockDim.x + threadIdx.x;
-  if (ii < nloc) {
-    ftype_out[ii] = ftype_in[ii] >= 0 ? 0 : -1;
-  }
-}
-
-void filter_ftype_gpu(int *ftype_out, const int *ftype_in, const int nloc) {
-  int nblock = (nloc + TPB - 1) / TPB;
-  map_filter_ftype<<<nblock, TPB>>>(ftype_out, ftype_in, nloc);
-  DPErrcheck(hipGetLastError());
-  DPErrcheck(hipDeviceSynchronize());
-}
-}  // namespace deepmd
diff --git a/source/lib/src/rocm/prod_env_mat.hip.cu b/source/lib/src/rocm/prod_env_mat.hip.cu
deleted file mode 100644
index 23e8ce1d0e..0000000000
--- a/source/lib/src/rocm/prod_env_mat.hip.cu
+++ /dev/null
@@ -1,821 +0,0 @@
-#include "device.h"
-#include "fmt_nlist.h"
-#include "hipcub/hipcub.hpp"
-#include "prod_env_mat.h"
-
-__device__ inline double _sqrt(double x) { return sqrt(x); }
-__device__ inline float _sqrt(float x) { return sqrtf(x); }
-__device__ inline double _rsqrt(double x) { return rsqrt(x); }
-__device__ inline float _rsqrt(float x) { return rsqrtf(x); }
-
-// common part of prod_env_mat
-template <typename Key, int BLOCK_THREADS, int ITEMS_PER_THREAD>
-__launch_bounds__(BLOCK_THREADS) __global__
-    void BlockSortKernel(Key* d_in,
-                         Key* d_out)  // Tile of output
-{
-  enum { TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD };
-  // Specialize BlockLoad type for our thread block (uses warp-striped loads for
-  // coalescing, then transposes in shared memory to a blocked arrangement)
-  typedef hipcub::BlockLoad<Key, BLOCK_THREADS, ITEMS_PER_THREAD,
-                            hipcub::BLOCK_LOAD_WARP_TRANSPOSE>
-      BlockLoadT;
-  // Specialize BlockRadixSort type for our thread block
-  typedef hipcub::BlockRadixSort<Key, BLOCK_THREADS, ITEMS_PER_THREAD>
-      BlockRadixSortT;
-  // Shared memory
-  __shared__ union TempStorage {
-    typename BlockLoadT::TempStorage load;
-    typename BlockRadixSortT::TempStorage sort;
-  } temp_storage;
-  // Per-thread tile items
-  Key items[ITEMS_PER_THREAD];
-  // Our current block's offset
-  int_64 block_offset = (int_64)blockIdx.x * TILE_SIZE;
-  // Load items into a blocked arrangement
-  BlockLoadT(temp_storage.load).Load(d_in + block_offset, items);
-  // Barrier for smem reuse
-  __syncthreads();
-  // Sort keys
-  BlockRadixSortT(temp_storage.sort).SortBlockedToStriped(items);
-  // Store output in striped fashion
-  hipcub::StoreDirectStriped<BLOCK_THREADS>(threadIdx.x, d_out + block_offset,
-                                            items);
-}
-
-template <typename FPTYPE>
-__device__ inline FPTYPE dev_dot(FPTYPE* arr1, FPTYPE* arr2) {
-  return arr1[0] * arr2[0] + arr1[1] * arr2[1] + arr1[2] * arr2[2];
-}
-
-template <typename FPTYPE>
-__device__ inline void spline5_switch(
-    FPTYPE& vv, FPTYPE& dd, FPTYPE& xx, const float& rmin, const float& rmax) {
-  if (xx < rmin) {
-    dd = (FPTYPE)0.;
-    vv = (FPTYPE)1.;
-  } else if (xx < rmax) {
-    FPTYPE uu = (xx - rmin) / (rmax - rmin);
-    FPTYPE du = (FPTYPE)1. / (rmax - rmin);
-    vv = uu * uu * uu *
-             ((FPTYPE)-6. * uu * uu + (FPTYPE)15. * uu - (FPTYPE)10.) +
-         (FPTYPE)1.;
-    dd = ((FPTYPE)3. * uu * uu *
-              ((FPTYPE)-6. * uu * uu + (FPTYPE)15. * uu - (FPTYPE)10.) +
-          uu * uu * uu * ((FPTYPE)-12. * uu + (FPTYPE)15.)) *
-         du;
-  } else {
-    dd = (FPTYPE)0.;
-    vv = (FPTYPE)0.;
-  }
-}
-
-template <typename FPTYPE>
-__device__ inline uint_64 encoding_nbor_info(const int type,
-                                             const FPTYPE dist,
-                                             const int index) {
-  // nbor info checking:
-  // the type of nbor atom must be smaller than 128
-  // the distance of center atom between nbor atom must be smaller than 128
-  // the index of nbor atom(including ghost region) must be smaller than
-  // 16777216(1 << 24)
-  if (type >= 128 || dist >= (FPTYPE)128.0 || index >= (1 << 24)) {
-    __builtin_trap();
-  }
-  return ((uint_64)type << 57) +
-         (uint_64)((double)dist * ((uint_64)1 << 50)) / (1 << 24) * (1 << 24) +
-         index;
-}
-
-__device__ inline void decoding_nbor_info(int& type,
-                                          int& index,
-                                          const uint_64 key) {
-  type = key >> 57;
-  index = key & 0xFFFFFF;
-}
-
-template <typename FPTYPE>
-__global__ void get_i_idx(FPTYPE* i_idx, const int nloc, const FPTYPE* ilist) {
-  const unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
-  if (idx >= nloc) {
-    return;
-  }
-  i_idx[ilist[idx]] = idx;
-}
-
-template <typename FPTYPE>
-__global__ void format_nlist_fill_a(uint_64* key,
-                                    const FPTYPE* coord,
-                                    const int* type,
-                                    const int* numneigh,
-                                    int** firstneigh,
-                                    const float rcut,
-                                    int* i_idx,
-                                    const int MAX_NBOR_SIZE) {
-  // <<<nloc, MAX_NBOR_SIZE>>>
-  const int_64 idx = blockIdx.x;
-  const unsigned int idy = blockIdx.y * blockDim.y + threadIdx.y;
-
-  const int nsize = numneigh[i_idx[idx]];
-  if (idy >= nsize) {
-    return;
-  }
-
-  const int* nei_idx = firstneigh[i_idx[idx]];
-  // dev_copy(nei_idx, &jlist[jrange[i_idx]], nsize);
-  uint_64* key_in = key + idx * MAX_NBOR_SIZE;
-  FPTYPE diff[3];
-  const int& j_idx = nei_idx[idy];
-  if (type[j_idx] < 0) {
-    return;
-  }
-  for (int dd = 0; dd < 3; dd++) {
-    diff[dd] = coord[j_idx * 3 + dd] - coord[idx * 3 + dd];
-  }
-  FPTYPE rr = _sqrt(dev_dot(diff, diff));
-  if (rr <= rcut) {
-    key_in[idy] = encoding_nbor_info(type[j_idx], rr, j_idx);
-  }
-}
-
-template <typename FPTYPE>
-__global__ void fill_nei_iter(int* nei_iter_dev,
-                              const FPTYPE* key,
-                              const int nloc,
-                              const int max_nbor_size,
-                              const int sec_size) {
-  int_64 row = blockIdx.x;
-  int col = blockIdx.y * blockDim.x + threadIdx.x;
-  const FPTYPE* key_out = key + nloc * max_nbor_size + row * max_nbor_size;
-  int nei_type_cur = -1, nbor_idx_cur = 0;
-  int nei_type_pre = -1, nbor_idx_pre = 0;
-  if (col < max_nbor_size && key_out[col] != key_out[max_nbor_size - 1]) {
-    if (col >= 1) {
-      decoding_nbor_info(nei_type_pre, nbor_idx_pre, key_out[col - 1]);
-    }
-    decoding_nbor_info(nei_type_cur, nbor_idx_cur, key_out[col]);
-  }
-  if (nei_type_cur != nei_type_pre) {
-    nei_iter_dev[row * sec_size + nei_type_cur] = col;
-  }
-}
-
-template <typename FPTYPE>
-__global__ void format_nlist_fill_b(int* nlist,
-                                    const int nlist_size,
-                                    const int nloc,
-                                    FPTYPE* key,
-                                    const int* sec,
-                                    const int sec_size,
-                                    int* nei_iter_dev,
-                                    const int max_nbor_size) {
-  int_64 row = blockIdx.x;
-  int col = blockIdx.y * blockDim.x + threadIdx.x;
-  int* nei_iter = nei_iter_dev + row * sec_size;
-  FPTYPE* key_out = key + nloc * max_nbor_size + row * max_nbor_size;
-  int* row_nlist = nlist + row * nlist_size;
-  if (col < max_nbor_size) {
-    if (key_out[col] != key_out[max_nbor_size - 1]) {
-      int nei_type = 0, nbor_idx = 0;
-      decoding_nbor_info(nei_type, nbor_idx, key_out[col]);
-      int out_indx = col - nei_iter[nei_type] + sec[nei_type];
-      if (out_indx < sec[nei_type + 1]) {
-        row_nlist[out_indx] = nbor_idx;
-      }
-    }
-  }
-}
-
-template <typename FPTYPE>
-__global__ void encoding_decoding_nbor_info(uint_64* key,
-                                            int* out_type,
-                                            int* out_index,
-                                            const int* in_type,
-                                            const FPTYPE* in_dist,
-                                            const int* in_index,
-                                            const int size_of_array) {
-  const unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
-  if (idx >= size_of_array) {
-    return;
-  }
-
-  key[idx] = encoding_nbor_info(in_type[idx], in_dist[idx], in_index[idx]);
-  decoding_nbor_info(out_type[idx], out_index[idx], key[idx]);
-}
-
-template <typename FPTYPE>
-void format_nbor_list_256(uint_64* key,
-                          const FPTYPE* coord,
-                          const int* type,
-                          const deepmd::InputNlist& gpu_inlist,
-                          const int& nloc,
-                          const float& rcut,
-                          int* i_idx) {
-  const int LEN = 256;
-  const int MAX_NBOR_SIZE = 256;
-  const int nblock = (MAX_NBOR_SIZE + LEN - 1) / LEN;
-  dim3 block_grid(nloc, nblock);
-  dim3 thread_grid(1, LEN);
-  hipLaunchKernelGGL(format_nlist_fill_a, block_grid, thread_grid, 0, 0, key,
-                     coord, type, gpu_inlist.numneigh, gpu_inlist.firstneigh,
-                     rcut, i_idx, MAX_NBOR_SIZE);
-  DPErrcheck(hipGetLastError());
-  DPErrcheck(hipDeviceSynchronize());
-  const int ITEMS_PER_THREAD = 4;
-  const int BLOCK_THREADS = MAX_NBOR_SIZE / ITEMS_PER_THREAD;
-  // hipLaunchKernelGGL(HIP_KERNEL_NAME(BlockSortKernel<NeighborInfo,
-  // BLOCK_THREADS, ITEMS_PER_THREAD>), g_grid_size, BLOCK_THREADS, 0, 0,
-  hipLaunchKernelGGL(
-      HIP_KERNEL_NAME(
-          BlockSortKernel<uint_64, BLOCK_THREADS, ITEMS_PER_THREAD>),
-      nloc, BLOCK_THREADS, 0, 0, key, key + nloc * MAX_NBOR_SIZE);
-  DPErrcheck(hipGetLastError());
-  DPErrcheck(hipDeviceSynchronize());
-}
-
-template <typename FPTYPE>
-void format_nbor_list_512(uint_64* key,
-                          const FPTYPE* coord,
-                          const int* type,
-                          const deepmd::InputNlist& gpu_inlist,
-                          const int& nloc,
-                          const float& rcut,
-                          int* i_idx) {
-  const int LEN = 256;
-  const int MAX_NBOR_SIZE = 512;
-  const int nblock = (MAX_NBOR_SIZE + LEN - 1) / LEN;
-  dim3 block_grid(nloc, nblock);
-  dim3 thread_grid(1, LEN);
-  hipLaunchKernelGGL(format_nlist_fill_a, block_grid, thread_grid, 0, 0, key,
-                     coord, type, gpu_inlist.numneigh, gpu_inlist.firstneigh,
-                     rcut, i_idx, MAX_NBOR_SIZE);
-  DPErrcheck(hipGetLastError());
-  DPErrcheck(hipDeviceSynchronize());
-  const int ITEMS_PER_THREAD = 4;
-  const int BLOCK_THREADS = MAX_NBOR_SIZE / ITEMS_PER_THREAD;
-  // hipLaunchKernelGGL(HIP_KERNEL_NAME(BlockSortKernel<NeighborInfo,
-  // BLOCK_THREADS, ITEMS_PER_THREAD>), g_grid_size, BLOCK_THREADS, 0, 0,
-  hipLaunchKernelGGL(
-      HIP_KERNEL_NAME(
-          BlockSortKernel<uint_64, BLOCK_THREADS, ITEMS_PER_THREAD>),
-      nloc, BLOCK_THREADS, 0, 0, key, key + nloc * MAX_NBOR_SIZE);
-  DPErrcheck(hipGetLastError());
-  DPErrcheck(hipDeviceSynchronize());
-}
-
-template <typename FPTYPE>
-void format_nbor_list_1024(uint_64* key,
-                           const FPTYPE* coord,
-                           const int* type,
-                           const deepmd::InputNlist& gpu_inlist,
-                           const int& nloc,
-                           const float& rcut,
-                           int* i_idx) {
-  const int LEN = 256;
-  const int MAX_NBOR_SIZE = 1024;
-  const int nblock = (MAX_NBOR_SIZE + LEN - 1) / LEN;
-  dim3 block_grid(nloc, nblock);
-  dim3 thread_grid(1, LEN);
-  hipLaunchKernelGGL(format_nlist_fill_a, block_grid, thread_grid, 0, 0, key,
-                     coord, type, gpu_inlist.numneigh, gpu_inlist.firstneigh,
-                     rcut, i_idx, MAX_NBOR_SIZE);
-  DPErrcheck(hipGetLastError());
-  DPErrcheck(hipDeviceSynchronize());
-  const int ITEMS_PER_THREAD = 8;
-  const int BLOCK_THREADS = MAX_NBOR_SIZE / ITEMS_PER_THREAD;
-  // hipLaunchKernelGGL(HIP_KERNEL_NAME(BlockSortKernel<NeighborInfo,
-  // BLOCK_THREADS, ITEMS_PER_THREAD>), g_grid_size, BLOCK_THREADS, 0, 0,
-  hipLaunchKernelGGL(
-      HIP_KERNEL_NAME(
-          BlockSortKernel<uint_64, BLOCK_THREADS, ITEMS_PER_THREAD>),
-      nloc, BLOCK_THREADS, 0, 0, key, key + nloc * MAX_NBOR_SIZE);
-  DPErrcheck(hipGetLastError());
-  DPErrcheck(hipDeviceSynchronize());
-}
-
-template <typename FPTYPE>
-void format_nbor_list_2048(uint_64* key,
-                           const FPTYPE* coord,
-                           const int* type,
-                           const deepmd::InputNlist& gpu_inlist,
-                           const int& nloc,
-                           const float& rcut,
-                           int* i_idx) {
-  const int LEN = 256;
-  const int MAX_NBOR_SIZE = 2048;
-  const int nblock = (MAX_NBOR_SIZE + LEN - 1) / LEN;
-  dim3 block_grid(nloc, nblock);
-  dim3 thread_grid(1, LEN);
-  hipLaunchKernelGGL(format_nlist_fill_a, block_grid, thread_grid, 0, 0, key,
-                     coord, type, gpu_inlist.numneigh, gpu_inlist.firstneigh,
-                     rcut, i_idx, MAX_NBOR_SIZE);
-  DPErrcheck(hipGetLastError());
-  DPErrcheck(hipDeviceSynchronize());
-  const int ITEMS_PER_THREAD = 8;
-  const int BLOCK_THREADS = MAX_NBOR_SIZE / ITEMS_PER_THREAD;
-  // hipLaunchKernelGGL(HIP_KERNEL_NAME(BlockSortKernel<NeighborInfo,
-  // BLOCK_THREADS, ITEMS_PER_THREAD>), g_grid_size, BLOCK_THREADS, 0, 0,
-  hipLaunchKernelGGL(
-      HIP_KERNEL_NAME(
-          BlockSortKernel<uint_64, BLOCK_THREADS, ITEMS_PER_THREAD>),
-      nloc, BLOCK_THREADS, 0, 0, key, key + nloc * MAX_NBOR_SIZE);
-  DPErrcheck(hipGetLastError());
-  DPErrcheck(hipDeviceSynchronize());
-}
-
-template <typename FPTYPE>
-void format_nbor_list_4096(uint_64* key,
-                           const FPTYPE* coord,
-                           const int* type,
-                           const deepmd::InputNlist& gpu_inlist,
-                           const int& nloc,
-                           const float& rcut,
-                           int* i_idx) {
-  const int LEN = 256;
-  const int MAX_NBOR_SIZE = 4096;
-  const int nblock = (MAX_NBOR_SIZE + LEN - 1) / LEN;
-  dim3 block_grid(nloc, nblock);
-  dim3 thread_grid(1, LEN);
-  hipLaunchKernelGGL(format_nlist_fill_a, block_grid, thread_grid, 0, 0, key,
-                     coord, type, gpu_inlist.numneigh, gpu_inlist.firstneigh,
-                     rcut, i_idx, MAX_NBOR_SIZE);
-  DPErrcheck(hipGetLastError());
-  DPErrcheck(hipDeviceSynchronize());
-  const int ITEMS_PER_THREAD = 16;
-  const int BLOCK_THREADS = MAX_NBOR_SIZE / ITEMS_PER_THREAD;
-  // hipLaunchKernelGGL(HIP_KERNEL_NAME(BlockSortKernel<NeighborInfo,
-  // BLOCK_THREADS, ITEMS_PER_THREAD>), g_grid_size, BLOCK_THREADS, 0, 0,
-  hipLaunchKernelGGL(
-      HIP_KERNEL_NAME(
-          BlockSortKernel<uint_64, BLOCK_THREADS, ITEMS_PER_THREAD>),
-      nloc, BLOCK_THREADS, 0, 0, key, key + nloc * MAX_NBOR_SIZE);
-  DPErrcheck(hipGetLastError());
-  DPErrcheck(hipDeviceSynchronize());
-}
-
-template <typename FPTYPE, int THREADS_PER_BLOCK>
-__global__ void compute_env_mat_a(FPTYPE* em,
-                                  FPTYPE* em_deriv,
-                                  FPTYPE* rij,
-                                  const FPTYPE* coord,
-                                  const FPTYPE* avg,
-                                  const FPTYPE* std,
-                                  const int* type,
-                                  const int* nlist,
-                                  const int nnei,
-                                  const float rmin,
-                                  const float rmax) {
-  // <<<nloc, TPB>>>
-  const int_64 bid = blockIdx.x;
-  const unsigned int tid = threadIdx.x;
-  if (type[bid] < 0) {
-    return;
-  }
-  if (tid >= nnei) {
-    return;
-  }
-  const int ndescrpt = nnei * 4;
-  const int* row_nlist = nlist + bid * nnei;
-  FPTYPE* row_rij = rij + bid * nnei * 3;
-  FPTYPE* row_descript = em + bid * nnei * 4;
-  FPTYPE* row_descript_deriv = em_deriv + bid * nnei * 12;
-  for (int ii = tid; ii < nnei; ii += THREADS_PER_BLOCK) {
-    const int idx_value = ii * 4;   // 4 components
-    const int idx_deriv = ii * 12;  // 4 components time 3 directions
-    if (row_nlist[ii] >= 0) {
-      FPTYPE rr[3] = {0};
-      FPTYPE dd[4] = {0};
-      FPTYPE vv[12] = {0};
-      const int j_idx = row_nlist[ii];
-      for (int kk = 0; kk < 3; kk++) {
-        rr[kk] = coord[j_idx * 3 + kk] - coord[bid * 3 + kk];
-        row_rij[ii * 3 + kk] = rr[kk];
-      }
-      // const FPTYPE * rr = &row_rij[ii * 3];
-      FPTYPE nr2 = dev_dot(rr, rr);
-      FPTYPE inr = _rsqrt(nr2);
-      FPTYPE nr = nr2 * inr;
-      FPTYPE inr2 = inr * inr;
-      FPTYPE inr4 = inr2 * inr2;
-      FPTYPE inr3 = inr4 * nr;
-      FPTYPE sw, dsw;
-      spline5_switch(sw, dsw, nr, rmin, rmax);
-      dd[0] = ((FPTYPE)1. / nr);  //* sw;
-      dd[1] = (rr[0] / nr2);      //* sw;
-      dd[2] = (rr[1] / nr2);      //* sw;
-      dd[3] = (rr[2] / nr2);      //* sw;
-      vv[0] = (rr[0] * inr3 * sw -
-               dd[0] * dsw * rr[0] *
-                   inr);  // avg[type[(idx_deriv + 0) / (ndescrpt * 3)] *
-                          // ndescrpt + ((idx_deriv + 0) % (ndescrpt * 3)) / 3];
-      vv[1] = (rr[1] * inr3 * sw -
-               dd[0] * dsw * rr[1] *
-                   inr);  // avg[type[(idx_deriv + 1) / (ndescrpt * 3)] *
-                          // ndescrpt + ((idx_deriv + 1) % (ndescrpt * 3)) / 3];
-      vv[2] = (rr[2] * inr3 * sw -
-               dd[0] * dsw * rr[2] *
-                   inr);  // avg[type[(idx_deriv + 2) / (ndescrpt * 3)] *
-                          // ndescrpt + ((idx_deriv + 2) % (ndescrpt * 3)) / 3];
-      // ****deriv of component x/r2
-      vv[3] = (((FPTYPE)2. * rr[0] * rr[0] * inr4 - inr2) * sw -
-               dd[1] * dsw * rr[0] *
-                   inr);  // avg[type[(idx_deriv + 3) / (ndescrpt * 3)] *
-                          // ndescrpt + ((idx_deriv + 3) % (ndescrpt * 3)) / 3];
-      vv[4] = (((FPTYPE)2. * rr[0] * rr[1] * inr4) * sw -
-               dd[1] * dsw * rr[1] *
-                   inr);  // avg[type[(idx_deriv + 4) / (ndescrpt * 3)] *
-                          // ndescrpt + ((idx_deriv + 4) % (ndescrpt * 3)) / 3];
-      vv[5] = (((FPTYPE)2. * rr[0] * rr[2] * inr4) * sw -
-               dd[1] * dsw * rr[2] *
-                   inr);  // avg[type[(idx_deriv + 5) / (ndescrpt * 3)] *
-                          // ndescrpt + ((idx_deriv + 5) % (ndescrpt * 3)) / 3];
-      // ***deriv of component y/r2
-      vv[6] = (((FPTYPE)2. * rr[1] * rr[0] * inr4) * sw -
-               dd[2] * dsw * rr[0] *
-                   inr);  // avg[type[(idx_deriv + 6) / (ndescrpt * 3)] *
-                          // ndescrpt + ((idx_deriv + 6) % (ndescrpt * 3)) / 3];
-      vv[7] = (((FPTYPE)2. * rr[1] * rr[1] * inr4 - inr2) * sw -
-               dd[2] * dsw * rr[1] *
-                   inr);  // avg[type[(idx_deriv + 7) / (ndescrpt * 3)] *
-                          // ndescrpt + ((idx_deriv + 7) % (ndescrpt * 3)) / 3];
-      vv[8] = (((FPTYPE)2. * rr[1] * rr[2] * inr4) * sw -
-               dd[2] * dsw * rr[2] *
-                   inr);  // avg[type[(idx_deriv + 8) / (ndescrpt * 3)] *
-                          // ndescrpt + ((idx_deriv + 8) % (ndescrpt * 3)) / 3];
-      // ***deriv of component z/r2
-      vv[9] = (((FPTYPE)2. * rr[2] * rr[0] * inr4) * sw -
-               dd[3] * dsw * rr[0] *
-                   inr);  // avg[type[(idx_deriv + 9) / (ndescrpt * 3)] *
-                          // ndescrpt + ((idx_deriv + 9) % (ndescrpt * 3)) / 3];
-      vv[10] =
-          (((FPTYPE)2. * rr[2] * rr[1] * inr4) * sw -
-           dd[3] * dsw * rr[1] *
-               inr);  // avg[type[(idx_deriv + 10) / (ndescrpt * 3)] * ndescrpt
-                      // + ((idx_deriv + 10) % (ndescrpt * 3)) / 3];
-      vv[11] =
-          (((FPTYPE)2. * rr[2] * rr[2] * inr4 - inr2) * sw -
-           dd[3] * dsw * rr[2] *
-               inr);  // avg[type[(idx_deriv + 11) / (ndescrpt * 3)] * ndescrpt
-                      // + ((idx_deriv + 11) % (ndescrpt * 3)) / 3];
-      // 4 value components
-      dd[0] *= sw;  // * em[idx * ndescrpt + idx_value + 0]);// - avg[type[idx]
-                    // * ndescrpt + idx_value + 0]) / std[type[idx] * ndescrpt +
-                    // idx_value + 0];
-      dd[1] *= sw;  // * em[idx * ndescrpt + idx_value + 1]);// - avg[type[idx]
-                    // * ndescrpt + idx_value + 1]) / std[type[idx] * ndescrpt +
-                    // idx_value + 1];
-      dd[2] *= sw;  // * em[idx * ndescrpt + idx_value + 2]);// - avg[type[idx]
-                    // * ndescrpt + idx_value + 2]) / std[type[idx] * ndescrpt +
-                    // idx_value + 2];
-      dd[3] *= sw;  // * em[idx * ndescrpt + idx_value + 3]);// - avg[type[idx]
-                    // * ndescrpt + idx_value + 3]) / std[type[idx] * ndescrpt +
-                    // idx_value + 3];
-      for (int ii = 0; ii < 12; ii++) {
-        row_descript_deriv[idx_deriv + ii] =
-            vv[ii] / std[type[bid] * ndescrpt + idx_value + ii / 3];
-      }
-      for (int ii = 0; ii < 4; ii++) {
-        row_descript[idx_value + ii] =
-            (dd[ii] - avg[type[bid] * ndescrpt + idx_value + ii]) /
-            std[type[bid] * ndescrpt + idx_value + ii];
-      }
-    } else {
-      // TODO: move it to the memset.
-      row_descript[idx_value] -= avg[type[bid] * ndescrpt + idx_value] /
-                                 std[type[bid] * ndescrpt + idx_value];
-    }
-  }
-}
-
-template <typename FPTYPE, int THREADS_PER_BLOCK>
-__global__ void compute_env_mat_r(FPTYPE* em,
-                                  FPTYPE* em_deriv,
-                                  FPTYPE* rij,
-                                  const FPTYPE* coord,
-                                  const FPTYPE* avg,
-                                  const FPTYPE* std,
-                                  const int* type,
-                                  const int* nlist,
-                                  const int nnei,
-                                  const float rmin,
-                                  const float rmax) {
-  // <<<nloc, TPB>>>
-  const int_64 bid = blockIdx.x;
-  const unsigned int tid = threadIdx.x;
-  if (tid >= nnei) {
-    return;
-  }
-  const int ndescrpt = nnei;
-  const int* row_nlist = nlist + bid * nnei;
-  FPTYPE* row_rij = rij + bid * nnei * 3;
-  FPTYPE* row_em = em + bid * nnei;
-  FPTYPE* row_em_deriv = em_deriv + bid * nnei * 3;
-  for (int ii = tid; ii < nnei; ii += THREADS_PER_BLOCK) {
-    const int idx_value = ii;      // 4 components
-    const int idx_deriv = ii * 3;  // 4 components time 3 directions
-    if (row_nlist[ii] >= 0) {
-      FPTYPE rr[3] = {(FPTYPE)0.};
-      FPTYPE vv[3] = {(FPTYPE)0.};
-      FPTYPE dd = (FPTYPE)0.;
-      const int& j_idx = row_nlist[ii];
-      for (int kk = 0; kk < 3; kk++) {
-        rr[kk] = coord[j_idx * 3 + kk] - coord[bid * 3 + kk];
-        row_rij[ii * 3 + kk] = rr[kk];
-      }
-      // const FPTYPE * rr = &row_rij[ii * 3];
-      FPTYPE nr2 = dev_dot(rr, rr);
-      FPTYPE inr = _rsqrt(nr2);
-      FPTYPE nr = nr2 * inr;
-      FPTYPE inr2 = inr * inr;
-      FPTYPE inr4 = inr2 * inr2;
-      FPTYPE inr3 = inr4 * nr;
-      FPTYPE sw, dsw;
-      spline5_switch(sw, dsw, nr, rmin, rmax);
-      dd = ((FPTYPE)1. / nr);  //* sw;
-      vv[0] = (rr[0] * inr3 * sw -
-               dd * dsw * rr[0] *
-                   inr);  // avg[type[(idx_deriv + 0) / (ndescrpt * 3)] *
-                          // ndescrpt + ((idx_deriv + 0) % (ndescrpt * 3)) / 3];
-      vv[1] = (rr[1] * inr3 * sw -
-               dd * dsw * rr[1] *
-                   inr);  // avg[type[(idx_deriv + 1) / (ndescrpt * 3)] *
-                          // ndescrpt + ((idx_deriv + 1) % (ndescrpt * 3)) / 3];
-      vv[2] = (rr[2] * inr3 * sw -
-               dd * dsw * rr[2] *
-                   inr);  // avg[type[(idx_deriv + 2) / (ndescrpt * 3)] *
-                          // ndescrpt + ((idx_deriv + 2) % (ndescrpt * 3)) / 3];
-
-      // 4 value components
-      dd *= sw;  // * em[idx * ndescrpt + idx_value + 0]);// - avg[type[idx] *
-                 // ndescrpt + idx_value + 0]) / std[type[idx] * ndescrpt +
-                 // idx_value + 0];
-      for (int ii = 0; ii < 3; ii++) {
-        row_em_deriv[idx_deriv + ii] =
-            vv[ii] / std[type[bid] * ndescrpt + idx_value + ii / 3];
-      }
-      row_em[idx_value] = (dd - avg[type[bid] * ndescrpt + idx_value]) /
-                          std[type[bid] * ndescrpt + idx_value];
-    } else {
-      // TODO: move it to the memset.
-      row_em[idx_value] -= avg[type[bid] * ndescrpt + idx_value] /
-                           std[type[bid] * ndescrpt + idx_value];
-    }
-  }
-}
-
-namespace deepmd {
-template <typename FPTYPE>
-void format_nbor_list_gpu(int* nlist,
-                          const FPTYPE* coord,
-                          const int* type,
-                          const deepmd::InputNlist& gpu_inlist,
-                          int* array_int,
-                          uint_64* array_longlong,
-                          const int max_nbor_size,
-                          const int nloc,
-                          const int nall,
-                          const float rcut,
-                          const std::vector<int> sec) {
-  const int LEN = 256;
-  const int nnei = sec.back();
-  const int nblock = (nloc + LEN - 1) / LEN;
-  int* sec_dev = array_int;
-  int* nei_iter = array_int + sec.size();  // = new int[sec_size];
-  int* i_idx = array_int + sec.size() + nloc * sec.size();
-  uint_64* key = array_longlong;
-  assert(max_nbor_size == 256 || max_nbor_size == 512 || 1024 ||
-         max_nbor_size == 2048 || max_nbor_size == 4096);
-  DPErrcheck(hipMemset(nlist, -1, sizeof(int) * int_64(nloc) * nnei));
-  DPErrcheck(hipMemset(key, 0xffffffff,
-                       sizeof(uint_64) * int_64(nloc) * max_nbor_size));
-  DPErrcheck(hipMemcpy(sec_dev, &sec[0], sizeof(int) * sec.size(),
-                       hipMemcpyHostToDevice));
-
-  hipLaunchKernelGGL(get_i_idx, nblock, LEN, 0, 0, i_idx, nloc,
-                     gpu_inlist.ilist);
-  DPErrcheck(hipGetLastError());
-  DPErrcheck(hipDeviceSynchronize());
-
-  if (max_nbor_size == 256) {
-    format_nbor_list_256(key, coord, type, gpu_inlist, nloc, rcut, i_idx);
-  } else if (max_nbor_size == 512) {
-    format_nbor_list_512(key, coord, type, gpu_inlist, nloc, rcut, i_idx);
-  } else if (max_nbor_size == 1024) {
-    format_nbor_list_1024(key, coord, type, gpu_inlist, nloc, rcut, i_idx);
-  } else if (max_nbor_size == 2048) {
-    format_nbor_list_2048(key, coord, type, gpu_inlist, nloc, rcut, i_idx);
-  } else if (max_nbor_size == 4096) {
-    format_nbor_list_4096(key, coord, type, gpu_inlist, nloc, rcut, i_idx);
-  }
-
-  hipLaunchKernelGGL(fill_nei_iter, dim3(nloc, (max_nbor_size + LEN - 1) / LEN),
-                     LEN, 0, 0, nei_iter, key, nloc, max_nbor_size, sec.size());
-
-  hipLaunchKernelGGL(
-      format_nlist_fill_b, dim3(nloc, (max_nbor_size + LEN - 1) / LEN), LEN, 0,
-      0, nlist, nnei, nloc, key, sec_dev, sec.size(), nei_iter, max_nbor_size);
-  DPErrcheck(hipGetLastError());
-  DPErrcheck(hipDeviceSynchronize());
-}
-
-template <typename FPTYPE>
-void prod_env_mat_a_gpu(FPTYPE* em,
-                        FPTYPE* em_deriv,
-                        FPTYPE* rij,
-                        int* nlist,
-                        const FPTYPE* coord,
-                        const int* type,
-                        const InputNlist& gpu_inlist,
-                        int* array_int,
-                        uint_64* array_longlong,
-                        const int max_nbor_size,
-                        const FPTYPE* avg,
-                        const FPTYPE* std,
-                        const int nloc,
-                        const int nall,
-                        const float rcut,
-                        const float rcut_smth,
-                        const std::vector<int> sec,
-                        const int* f_type) {
-  if (f_type == NULL) {
-    f_type = type;
-  }
-  const int nnei = sec.back();
-  const int ndescrpt = nnei * 4;
-  DPErrcheck(hipMemset(em, 0, sizeof(FPTYPE) * int_64(nloc) * ndescrpt));
-  DPErrcheck(
-      hipMemset(em_deriv, 0, sizeof(FPTYPE) * int_64(nloc) * ndescrpt * 3));
-  DPErrcheck(hipMemset(rij, 0, sizeof(FPTYPE) * int_64(nloc) * nnei * 3));
-
-  format_nbor_list_gpu(nlist, coord, f_type, gpu_inlist, array_int,
-                       array_longlong, max_nbor_size, nloc, nall, rcut, sec);
-  nborErrcheck(hipGetLastError());
-  nborErrcheck(hipDeviceSynchronize());
-
-  hipLaunchKernelGGL(HIP_KERNEL_NAME(compute_env_mat_a<FPTYPE, TPB>), nloc, TPB,
-                     0, 0, em, em_deriv, rij, coord, avg, std, type, nlist,
-                     nnei, rcut_smth, rcut);
-  DPErrcheck(hipGetLastError());
-  DPErrcheck(hipDeviceSynchronize());
-}
-
-template <typename FPTYPE>
-void prod_env_mat_r_gpu(FPTYPE* em,
-                        FPTYPE* em_deriv,
-                        FPTYPE* rij,
-                        int* nlist,
-                        const FPTYPE* coord,
-                        const int* type,
-                        const InputNlist& gpu_inlist,
-                        int* array_int,
-                        uint_64* array_longlong,
-                        const int max_nbor_size,
-                        const FPTYPE* avg,
-                        const FPTYPE* std,
-                        const int nloc,
-                        const int nall,
-                        const float rcut,
-                        const float rcut_smth,
-                        const std::vector<int> sec) {
-  const int nnei = sec.back();
-  const int ndescrpt = nnei * 1;
-  DPErrcheck(hipMemset(em, 0, sizeof(FPTYPE) * int_64(nloc) * ndescrpt));
-  DPErrcheck(
-      hipMemset(em_deriv, 0, sizeof(FPTYPE) * int_64(nloc) * ndescrpt * 3));
-  DPErrcheck(hipMemset(rij, 0, sizeof(FPTYPE) * int_64(nloc) * nnei * 3));
-
-  format_nbor_list_gpu(nlist, coord, type, gpu_inlist, array_int,
-                       array_longlong, max_nbor_size, nloc, nall, rcut, sec);
-  nborErrcheck(hipGetLastError());
-  nborErrcheck(hipDeviceSynchronize());
-
-  hipLaunchKernelGGL(HIP_KERNEL_NAME(compute_env_mat_r<FPTYPE, TPB>), nloc, TPB,
-                     0, 0, em, em_deriv, rij, coord, avg, std, type, nlist,
-                     nnei, rcut_smth, rcut);
-  DPErrcheck(hipGetLastError());
-  DPErrcheck(hipDeviceSynchronize());
-}
-
-template <typename FPTYPE>
-void test_encoding_decoding_nbor_info_gpu(uint_64* key,
-                                          int* out_type,
-                                          int* out_index,
-                                          const int* in_type,
-                                          const FPTYPE* in_dist,
-                                          const int* in_index,
-                                          const int size_of_array) {
-  const int nblock = (size_of_array + TPB - 1) / TPB;
-  hipLaunchKernelGGL(encoding_decoding_nbor_info, nblock, TPB, 0, 0, key,
-                     out_type, out_index, in_type, in_dist, in_index,
-                     size_of_array);
-  DPErrcheck(hipGetLastError());
-  DPErrcheck(hipDeviceSynchronize());
-}
-
-template void prod_env_mat_a_gpu<float>(float* em,
-                                        float* em_deriv,
-                                        float* rij,
-                                        int* nlist,
-                                        const float* coord,
-                                        const int* type,
-                                        const InputNlist& gpu_inlist,
-                                        int* array_int,
-                                        unsigned long long* array_longlong,
-                                        const int max_nbor_size,
-                                        const float* avg,
-                                        const float* std,
-                                        const int nloc,
-                                        const int nall,
-                                        const float rcut,
-                                        const float rcut_smth,
-                                        const std::vector<int> sec,
-                                        const int* f_type);
-template void prod_env_mat_a_gpu<double>(double* em,
-                                         double* em_deriv,
-                                         double* rij,
-                                         int* nlist,
-                                         const double* coord,
-                                         const int* type,
-                                         const InputNlist& gpu_inlist,
-                                         int* array_int,
-                                         unsigned long long* array_longlong,
-                                         const int max_nbor_size,
-                                         const double* avg,
-                                         const double* std,
-                                         const int nloc,
-                                         const int nall,
-                                         const float rcut,
-                                         const float rcut_smth,
-                                         const std::vector<int> sec,
-                                         const int* f_type);
-template void prod_env_mat_r_gpu<float>(float* em,
-                                        float* em_deriv,
-                                        float* rij,
-                                        int* nlist,
-                                        const float* coord,
-                                        const int* type,
-                                        const InputNlist& gpu_inlist,
-                                        int* array_int,
-                                        unsigned long long* array_longlong,
-                                        const int max_nbor_size,
-                                        const float* avg,
-                                        const float* std,
-                                        const int nloc,
-                                        const int nall,
-                                        const float rcut,
-                                        const float rcut_smth,
-                                        const std::vector<int> sec);
-template void prod_env_mat_r_gpu<double>(double* em,
-                                         double* em_deriv,
-                                         double* rij,
-                                         int* nlist,
-                                         const double* coord,
-                                         const int* type,
-                                         const InputNlist& gpu_inlist,
-                                         int* array_int,
-                                         unsigned long long* array_longlong,
-                                         const int max_nbor_size,
-                                         const double* avg,
-                                         const double* std,
-                                         const int nloc,
-                                         const int nall,
-                                         const float rcut,
-                                         const float rcut_smth,
-                                         const std::vector<int> sec);
-template void format_nbor_list_gpu<float>(int* nlist,
-                                          const float* coord,
-                                          const int* type,
-                                          const deepmd::InputNlist& gpu_inlist,
-                                          int* array_int,
-                                          uint_64* array_longlong,
-                                          const int max_nbor_size,
-                                          const int nloc,
-                                          const int nall,
-                                          const float rcut,
-                                          const std::vector<int> sec);
-template void format_nbor_list_gpu<double>(int* nlist,
-                                           const double* coord,
-                                           const int* type,
-                                           const deepmd::InputNlist& gpu_inlist,
-                                           int* array_int,
-                                           uint_64* array_longlong,
-                                           const int max_nbor_size,
-                                           const int nloc,
-                                           const int nall,
-                                           const float rcut,
-                                           const std::vector<int> sec);
-template void test_encoding_decoding_nbor_info_gpu(uint_64* key,
-                                                   int* out_type,
-                                                   int* out_index,
-                                                   const int* in_type,
-                                                   const float* in_dist,
-                                                   const int* in_index,
-                                                   const int size_of_array);
-template void test_encoding_decoding_nbor_info_gpu(uint_64* key,
-                                                   int* out_type,
-                                                   int* out_index,
-                                                   const int* in_type,
-                                                   const double* in_dist,
-                                                   const int* in_index,
-                                                   const int size_of_array);
-}  // namespace deepmd
diff --git a/source/lib/src/rocm/prod_force.hip.cu b/source/lib/src/rocm/prod_force.hip.cu
deleted file mode 100644
index 5b1f91dd49..0000000000
--- a/source/lib/src/rocm/prod_force.hip.cu
+++ /dev/null
@@ -1,193 +0,0 @@
-#include "device.h"
-#include "prod_force.h"
-
-template <typename FPTYPE, int THREADS_PER_BLOCK>
-__global__ void force_deriv_wrt_center_atom(FPTYPE* force,
-                                            const FPTYPE* net_deriv,
-                                            const FPTYPE* in_deriv,
-                                            const int ndescrpt,
-                                            const int nloc,
-                                            const int nall) {
-  __shared__ FPTYPE data[THREADS_PER_BLOCK * 3];
-  int_64 bid = blockIdx.x;
-  unsigned int tid = threadIdx.x;
-  for (int ii = tid; ii < THREADS_PER_BLOCK * 3; ii += THREADS_PER_BLOCK) {
-    data[ii] = (FPTYPE)0.;
-  }
-  for (int ii = tid; ii < ndescrpt; ii += THREADS_PER_BLOCK) {
-    for (int jj = 0; jj < 3; jj++) {
-      data[jj * THREADS_PER_BLOCK + tid] +=
-          net_deriv[bid * ndescrpt + ii] *
-          in_deriv[bid * ndescrpt * 3 + ii * 3 + jj];
-    }
-  }
-  __syncthreads();
-  // do reduction in shared memory
-  for (int ii = THREADS_PER_BLOCK >> 1; ii > 0; ii >>= 1) {
-    if (tid < ii) {
-      for (int jj = 0; jj < 3; jj++) {
-        data[jj * THREADS_PER_BLOCK + tid] +=
-            data[jj * THREADS_PER_BLOCK + tid + ii];
-      }
-    }
-    __syncthreads();
-  }
-  // write result for this block to global memory
-  const int_64 kk = bid / nloc;  // frame index
-  const int_64 ll = bid % nloc;  // atom index
-  const int_64 i_idx_nall = kk * nall + ll;
-  if (tid == 0) {
-    force[i_idx_nall * 3 + 0] -= data[THREADS_PER_BLOCK * 0];
-    force[i_idx_nall * 3 + 1] -= data[THREADS_PER_BLOCK * 1];
-    force[i_idx_nall * 3 + 2] -= data[THREADS_PER_BLOCK * 2];
-  }
-}
-
-template <typename FPTYPE>
-__global__ void force_deriv_wrt_neighbors_a(FPTYPE* force,
-                                            const FPTYPE* net_deriv,
-                                            const FPTYPE* in_deriv,
-                                            const int* nlist,
-                                            const int nloc,
-                                            const int nall,
-                                            const int nnei) {
-  // idy -> nnei
-  const int_64 idx = blockIdx.x;
-  const unsigned int idy = blockIdx.y * blockDim.x + threadIdx.x;
-  const unsigned int idz = threadIdx.y;
-  const int ndescrpt = nnei * 4;
-  if (idy >= nnei) {
-    return;
-  }
-  // deriv wrt neighbors
-  int j_idx = nlist[idx * nnei + idy];
-  if (j_idx < 0) {
-    return;
-  }
-  FPTYPE force_tmp = (FPTYPE)0.;
-  for (int idw = 0; idw < 4; ++idw) {
-    force_tmp += net_deriv[idx * ndescrpt + idy * 4 + idw] *
-                 in_deriv[idx * ndescrpt * 3 + (idy * 4 + idw) * 3 + idz];
-  }
-  const int_64 kk = idx / nloc;  // frame index
-  atomicAdd(force + kk * nall * 3 + j_idx * 3 + idz, force_tmp);
-}
-
-template <typename FPTYPE>
-__global__ void force_deriv_wrt_neighbors_r(FPTYPE* force,
-                                            const FPTYPE* net_deriv,
-                                            const FPTYPE* in_deriv,
-                                            const int* nlist,
-                                            const int nloc,
-                                            const int nall,
-                                            const int nnei) {
-  // idy -> nnei
-  const int_64 idx = blockIdx.x;
-  const unsigned int idy = blockIdx.y * blockDim.x + threadIdx.x;
-  const unsigned int idz = threadIdx.y;
-  const int ndescrpt = nnei * 1;
-  if (idy >= nnei) {
-    return;
-  }
-  // deriv wrt neighbors
-  int j_idx = nlist[idx * nnei + idy];
-  if (j_idx < 0) {
-    return;
-  }
-  const int_64 kk = idx / nloc;  // frame index
-  atomicAdd(force + kk * nall * 3 + j_idx * 3 + idz,
-            net_deriv[idx * ndescrpt + idy] *
-                in_deriv[idx * ndescrpt * 3 + idy * 3 + idz]);
-}
-
-namespace deepmd {
-template <typename FPTYPE>
-void prod_force_a_gpu(FPTYPE* force,
-                      const FPTYPE* net_deriv,
-                      const FPTYPE* in_deriv,
-                      const int* nlist,
-                      const int nloc,
-                      const int nall,
-                      const int nnei,
-                      const int nframes) {
-  const int ndescrpt = nnei * 4;
-  DPErrcheck(hipMemset(force, 0, sizeof(FPTYPE) * nframes * nall * 3));
-
-  hipLaunchKernelGGL(HIP_KERNEL_NAME(force_deriv_wrt_center_atom<FPTYPE, TPB>),
-                     nframes * nloc, TPB, 0, 0, force, net_deriv, in_deriv,
-                     ndescrpt, nloc, nall);
-  DPErrcheck(hipGetLastError());
-  DPErrcheck(hipDeviceSynchronize());
-
-  const int LEN = 64;
-  const int nblock = (nnei + LEN - 1) / LEN;
-  dim3 block_grid(nframes * nloc, nblock);
-  dim3 thread_grid(LEN, 3);
-  hipLaunchKernelGGL(force_deriv_wrt_neighbors_a, block_grid, thread_grid, 0, 0,
-                     force, net_deriv, in_deriv, nlist, nloc, nall, nnei);
-  DPErrcheck(hipGetLastError());
-  DPErrcheck(hipDeviceSynchronize());
-}
-
-template <typename FPTYPE>
-void prod_force_r_gpu(FPTYPE* force,
-                      const FPTYPE* net_deriv,
-                      const FPTYPE* in_deriv,
-                      const int* nlist,
-                      const int nloc,
-                      const int nall,
-                      const int nnei,
-                      const int nframes) {
-  const int ndescrpt = nnei * 1;
-  DPErrcheck(hipMemset(force, 0, sizeof(FPTYPE) * nframes * nall * 3));
-
-  hipLaunchKernelGGL(HIP_KERNEL_NAME(force_deriv_wrt_center_atom<FPTYPE, TPB>),
-                     nframes * nloc, TPB, 0, 0, force, net_deriv, in_deriv,
-                     ndescrpt, nloc, nall);
-  DPErrcheck(hipGetLastError());
-  DPErrcheck(hipDeviceSynchronize());
-
-  const int LEN = 64;
-  const int nblock = (nnei + LEN - 1) / LEN;
-  dim3 block_grid(nframes * nloc, nblock);
-  dim3 thread_grid(LEN, 3);
-  hipLaunchKernelGGL(force_deriv_wrt_neighbors_r, block_grid, thread_grid, 0, 0,
-                     force, net_deriv, in_deriv, nlist, nloc, nall, nnei);
-  DPErrcheck(hipGetLastError());
-  DPErrcheck(hipDeviceSynchronize());
-}
-
-template void prod_force_a_gpu<float>(float* force,
-                                      const float* net_deriv,
-                                      const float* in_deriv,
-                                      const int* nlist,
-                                      const int nloc,
-                                      const int nall,
-                                      const int nnei,
-                                      const int nframes);
-template void prod_force_a_gpu<double>(double* force,
-                                       const double* net_deriv,
-                                       const double* in_deriv,
-                                       const int* nlist,
-                                       const int nloc,
-                                       const int nall,
-                                       const int nnei,
-                                       const int nframes);
-template void prod_force_r_gpu<float>(float* force,
-                                      const float* net_deriv,
-                                      const float* in_deriv,
-                                      const int* nlist,
-                                      const int nloc,
-                                      const int nall,
-                                      const int nnei,
-                                      const int nframes);
-template void prod_force_r_gpu<double>(double* force,
-                                       const double* net_deriv,
-                                       const double* in_deriv,
-                                       const int* nlist,
-                                       const int nloc,
-                                       const int nall,
-                                       const int nnei,
-                                       const int nframes);
-
-}  // namespace deepmd
diff --git a/source/lib/src/rocm/prod_force_grad.hip.cu b/source/lib/src/rocm/prod_force_grad.hip.cu
deleted file mode 100644
index 2cb7c4f1d6..0000000000
--- a/source/lib/src/rocm/prod_force_grad.hip.cu
+++ /dev/null
@@ -1,168 +0,0 @@
-#include "device.h"
-#include "prod_force_grad.h"
-
-template <typename FPTYPE>
-__device__ inline FPTYPE dev_dot(const FPTYPE* arr1, const FPTYPE* arr2) {
-  return arr1[0] * arr2[0] + arr1[1] * arr2[1] + arr1[2] * arr2[2];
-}
-
-template <typename FPTYPE>
-__global__ void force_grad_wrt_center_atom(FPTYPE* grad_net,
-                                           const FPTYPE* grad,
-                                           const FPTYPE* env_deriv,
-                                           const int ndescrpt) {
-  __shared__ FPTYPE grad_one[3];
-  int_64 center_idx = blockIdx.x;
-  unsigned int tid = threadIdx.x;
-  if (tid < 3) {
-    grad_one[tid] = grad[center_idx * 3 + tid];
-  }
-  __syncthreads();
-  unsigned int descrpt_idx = blockIdx.y * blockDim.x + tid;
-  if (descrpt_idx < ndescrpt) {
-    grad_net[center_idx * ndescrpt + descrpt_idx] -= dev_dot(
-        grad_one, env_deriv + center_idx * ndescrpt * 3 + descrpt_idx * 3);
-  }
-}
-
-template <typename FPTYPE>
-__global__ void force_grad_wrt_neighbors_a(FPTYPE* grad_net,
-                                           const FPTYPE* grad,
-                                           const FPTYPE* env_deriv,
-                                           const int* nlist,
-                                           const int nloc,
-                                           const int nnei,
-                                           const int nframes) {
-  // idy -> nnei
-  const int_64 idx = blockIdx.x * blockDim.x + threadIdx.x;
-  const unsigned int idy = blockIdx.y;
-  const unsigned int idw = threadIdx.y;
-  if (idx >= nframes * nloc) {
-    return;
-  }
-  int j_idx = nlist[idx * nnei + idy];
-  if (j_idx < 0) {
-    return;
-  }
-  if (j_idx >= nloc) {
-    j_idx = j_idx % nloc;
-  }
-  const int kk = idx / nloc;  // frame index
-  grad_net[idx * nnei * 4 + idy * 4 + idw] +=
-      dev_dot(grad + kk * nloc * 3 + j_idx * 3,
-              env_deriv + idx * nnei * 4 * 3 + idy * 4 * 3 + idw * 3);
-}
-
-template <typename FPTYPE>
-__global__ void force_grad_wrt_neighbors_r(FPTYPE* grad_net,
-                                           const FPTYPE* grad,
-                                           const FPTYPE* env_deriv,
-                                           const int* nlist,
-                                           const int nloc,
-                                           const int nnei,
-                                           const int nframes) {
-  // idy -> nnei
-  const int_64 idx = blockIdx.x * blockDim.x + threadIdx.x;
-  const unsigned int idy = blockIdx.y;
-  if (idx >= nframes * nloc) {
-    return;
-  }
-  int j_idx = nlist[idx * nnei + idy];
-  if (j_idx < 0) {
-    return;
-  }
-  if (j_idx >= nloc) {
-    j_idx = j_idx % nloc;
-  }
-  const int kk = idx / nloc;  // frame index
-  grad_net[idx * nnei + idy] += dev_dot(grad + kk * nloc * 3 + j_idx * 3,
-                                        env_deriv + idx * nnei * 3 + idy * 3);
-}
-
-namespace deepmd {
-template <typename FPTYPE>
-void prod_force_grad_a_gpu(FPTYPE* grad_net,
-                           const FPTYPE* grad,
-                           const FPTYPE* env_deriv,
-                           const int* nlist,
-                           const int nloc,
-                           const int nnei,
-                           const int nframes) {
-  const int ndescrpt = nnei * 4;
-  DPErrcheck(
-      hipMemset(grad_net, 0, sizeof(FPTYPE) * nframes * nloc * ndescrpt));
-  const int nblock = (ndescrpt + TPB - 1) / TPB;
-  dim3 block_grid(nframes * nloc, nblock);
-  dim3 thread_grid(TPB, 1);
-  hipLaunchKernelGGL(force_grad_wrt_center_atom, block_grid, thread_grid, 0, 0,
-                     grad_net, grad, env_deriv, ndescrpt);
-  DPErrcheck(hipGetLastError());
-  DPErrcheck(hipDeviceSynchronize());
-  const int LEN = 128;
-  const int nblock_ = (nframes * nloc + LEN - 1) / LEN;
-  dim3 block_grid_(nblock_, nnei);
-  dim3 thread_grid_(LEN, 4);
-  hipLaunchKernelGGL(force_grad_wrt_neighbors_a, block_grid_, thread_grid_, 0,
-                     0, grad_net, grad, env_deriv, nlist, nloc, nnei, nframes);
-  DPErrcheck(hipGetLastError());
-  DPErrcheck(hipDeviceSynchronize());
-}
-
-template <typename FPTYPE>
-void prod_force_grad_r_gpu(FPTYPE* grad_net,
-                           const FPTYPE* grad,
-                           const FPTYPE* env_deriv,
-                           const int* nlist,
-                           const int nloc,
-                           const int nnei,
-                           const int nframes) {
-  const int ndescrpt = nnei * 1;
-  DPErrcheck(
-      hipMemset(grad_net, 0, sizeof(FPTYPE) * nframes * nloc * ndescrpt));
-  const int nblock = (ndescrpt + TPB - 1) / TPB;
-  dim3 block_grid(nframes * nloc, nblock);
-  dim3 thread_grid(TPB, 1);
-  hipLaunchKernelGGL(force_grad_wrt_center_atom, block_grid, thread_grid, 0, 0,
-                     grad_net, grad, env_deriv, ndescrpt);
-  DPErrcheck(hipGetLastError());
-  DPErrcheck(hipDeviceSynchronize());
-
-  const int LEN = 128;
-  const int nblock_ = (nframes * nloc + LEN - 1) / LEN;
-  dim3 block_grid_(nblock_, nnei);
-  dim3 thread_grid_(LEN, 1);
-  hipLaunchKernelGGL(force_grad_wrt_neighbors_r, block_grid_, thread_grid_, 0,
-                     0, grad_net, grad, env_deriv, nlist, nloc, nnei, nframes);
-  DPErrcheck(hipGetLastError());
-  DPErrcheck(hipDeviceSynchronize());
-}
-
-template void prod_force_grad_a_gpu<float>(float* grad_net,
-                                           const float* grad,
-                                           const float* env_deriv,
-                                           const int* nlist,
-                                           const int nloc,
-                                           const int nnei,
-                                           const int nframes);
-template void prod_force_grad_a_gpu<double>(double* grad_net,
-                                            const double* grad,
-                                            const double* env_deriv,
-                                            const int* nlist,
-                                            const int nloc,
-                                            const int nnei,
-                                            const int nframes);
-template void prod_force_grad_r_gpu<float>(float* grad_net,
-                                           const float* grad,
-                                           const float* env_deriv,
-                                           const int* nlist,
-                                           const int nloc,
-                                           const int nnei,
-                                           const int nframes);
-template void prod_force_grad_r_gpu<double>(double* grad_net,
-                                            const double* grad,
-                                            const double* env_deriv,
-                                            const int* nlist,
-                                            const int nloc,
-                                            const int nnei,
-                                            const int nframes);
-}  // namespace deepmd
diff --git a/source/lib/src/rocm/prod_virial.hip.cu b/source/lib/src/rocm/prod_virial.hip.cu
deleted file mode 100644
index ff29c07ffb..0000000000
--- a/source/lib/src/rocm/prod_virial.hip.cu
+++ /dev/null
@@ -1,197 +0,0 @@
-#include "device.h"
-#include "prod_virial.h"
-
-template <typename FPTYPE, int THREADS_PER_BLOCK>
-__global__ void atom_virial_reduction(FPTYPE* virial,
-                                      const FPTYPE* atom_virial,
-                                      const int nall) {
-  unsigned int bid = blockIdx.x;
-  unsigned int tid = threadIdx.x;
-  __shared__ FPTYPE data[THREADS_PER_BLOCK];
-  data[tid] = (FPTYPE)0.;
-  for (int ii = tid; ii < nall; ii += THREADS_PER_BLOCK) {
-    data[tid] += atom_virial[ii * 9 + bid];
-  }
-  __syncthreads();
-  // do reduction in shared memory
-  for (int ii = THREADS_PER_BLOCK >> 1; ii > 0; ii >>= 1) {
-    if (tid < ii) {
-      data[tid] += data[tid + ii];
-    }
-    __syncthreads();
-  }
-  // write result for this block to global memory
-  if (tid == 0) {
-    virial[bid] = data[0];
-  }
-}
-
-template <typename FPTYPE>
-__global__ void virial_deriv_wrt_neighbors_a(FPTYPE* virial,
-                                             FPTYPE* atom_virial,
-                                             const FPTYPE* net_deriv,
-                                             const FPTYPE* in_deriv,
-                                             const FPTYPE* rij,
-                                             const int* nlist,
-                                             const int nloc,
-                                             const int nnei) {
-  // idx -> nloc
-  // idy -> nnei
-  // idz = dd0 * 3 + dd1
-  // dd0 = idz / 3
-  // dd1 = idz % 3
-  const int_64 idx = blockIdx.x;
-  const unsigned int idy = blockIdx.y * blockDim.x + threadIdx.x;
-  const unsigned int idz = threadIdx.y;
-  const int ndescrpt = nnei * 4;
-  if (idy >= nnei) {
-    return;
-  }
-  int j_idx = nlist[idx * nnei + idy];
-  if (j_idx < 0) {
-    return;
-  }
-  FPTYPE virial_tmp = (FPTYPE)0.;
-  for (int idw = 0; idw < 4; ++idw) {
-    virial_tmp += net_deriv[idx * ndescrpt + idy * 4 + idw] *
-                  rij[idx * nnei * 3 + idy * 3 + idz % 3] *
-                  in_deriv[idx * ndescrpt * 3 + (idy * 4 + idw) * 3 + idz / 3];
-  }
-  atomicAdd(atom_virial + j_idx * 9 + idz, virial_tmp);
-}
-
-template <typename FPTYPE>
-__global__ void virial_deriv_wrt_neighbors_r(FPTYPE* virial,
-                                             FPTYPE* atom_virial,
-                                             const FPTYPE* net_deriv,
-                                             const FPTYPE* in_deriv,
-                                             const FPTYPE* rij,
-                                             const int* nlist,
-                                             const int nloc,
-                                             const int nnei) {
-  // idx -> nloc
-  // idy -> nnei
-  // idz = dd0 * 3 + dd1
-  // dd0 = idz / 3
-  // dd1 = idz % 3
-  const int_64 idx = blockIdx.x;
-  const unsigned int idy = blockIdx.y * blockDim.x + threadIdx.x;
-  const unsigned int idz = threadIdx.y;
-  const int ndescrpt = nnei * 1;
-
-  if (idy >= nnei) {
-    return;
-  }
-  int j_idx = nlist[idx * nnei + idy];
-  if (j_idx < 0) {
-    return;
-  }
-  // atomicAdd(
-  //    virial + idz,
-  //    net_deriv[idx * ndescrpt + idy * 4 + idw] * rij[idx * nnei * 3 + idy * 3
-  //    + idz / 3] * in_deriv[idx * ndescrpt * 3 + (idy * 4 + idw) * 3 + idz %
-  //    3]);
-  atomicAdd(atom_virial + j_idx * 9 + idz,
-            net_deriv[idx * ndescrpt + idy] *
-                rij[idx * nnei * 3 + idy * 3 + idz % 3] *
-                in_deriv[idx * ndescrpt * 3 + idy * 3 + idz / 3]);
-}
-
-namespace deepmd {
-template <typename FPTYPE>
-void prod_virial_a_gpu(FPTYPE* virial,
-                       FPTYPE* atom_virial,
-                       const FPTYPE* net_deriv,
-                       const FPTYPE* in_deriv,
-                       const FPTYPE* rij,
-                       const int* nlist,
-                       const int nloc,
-                       const int nall,
-                       const int nnei) {
-  DPErrcheck(hipMemset(virial, 0, sizeof(FPTYPE) * 9));
-  DPErrcheck(hipMemset(atom_virial, 0, sizeof(FPTYPE) * 9 * nall));
-
-  const int LEN = 16;
-  int nblock = (nnei + LEN - 1) / LEN;
-  dim3 block_grid(nloc, nblock);
-  dim3 thread_grid(LEN, 9);
-  // compute virial of a frame
-  hipLaunchKernelGGL(virial_deriv_wrt_neighbors_a, block_grid, thread_grid, 0,
-                     0, virial, atom_virial, net_deriv, in_deriv, rij, nlist,
-                     nloc, nnei);
-  DPErrcheck(hipGetLastError());
-  DPErrcheck(hipDeviceSynchronize());
-  // reduction atom_virial to virial
-  hipLaunchKernelGGL(HIP_KERNEL_NAME(atom_virial_reduction<FPTYPE, TPB>), 9,
-                     TPB, 0, 0, virial, atom_virial, nall);
-  DPErrcheck(hipGetLastError());
-  DPErrcheck(hipDeviceSynchronize());
-}
-
-template <typename FPTYPE>
-void prod_virial_r_gpu(FPTYPE* virial,
-                       FPTYPE* atom_virial,
-                       const FPTYPE* net_deriv,
-                       const FPTYPE* in_deriv,
-                       const FPTYPE* rij,
-                       const int* nlist,
-                       const int nloc,
-                       const int nall,
-                       const int nnei) {
-  DPErrcheck(hipMemset(virial, 0, sizeof(FPTYPE) * 9));
-  DPErrcheck(hipMemset(atom_virial, 0, sizeof(FPTYPE) * 9 * nall));
-
-  const int LEN = 16;
-  int nblock = (nnei + LEN - 1) / LEN;
-  dim3 block_grid(nloc, nblock);
-  dim3 thread_grid(LEN, 9);
-  // compute virial of a frame
-  hipLaunchKernelGGL(virial_deriv_wrt_neighbors_r, block_grid, thread_grid, 0,
-                     0, virial, atom_virial, net_deriv, in_deriv, rij, nlist,
-                     nloc, nnei);
-  DPErrcheck(hipGetLastError());
-  DPErrcheck(hipDeviceSynchronize());
-  // reduction atom_virial to virial
-  hipLaunchKernelGGL(HIP_KERNEL_NAME(atom_virial_reduction<FPTYPE, TPB>), 9,
-                     TPB, 0, 0, virial, atom_virial, nall);
-  DPErrcheck(hipGetLastError());
-  DPErrcheck(hipDeviceSynchronize());
-}
-
-template void prod_virial_a_gpu<float>(float* virial,
-                                       float* atom_virial,
-                                       const float* net_deriv,
-                                       const float* in_deriv,
-                                       const float* rij,
-                                       const int* nlist,
-                                       const int nloc,
-                                       const int nall,
-                                       const int nnei);
-template void prod_virial_a_gpu<double>(double* virial,
-                                        double* atom_virial,
-                                        const double* net_deriv,
-                                        const double* in_deriv,
-                                        const double* rij,
-                                        const int* nlist,
-                                        const int nloc,
-                                        const int nall,
-                                        const int nnei);
-template void prod_virial_r_gpu<float>(float* virial,
-                                       float* atom_virial,
-                                       const float* net_deriv,
-                                       const float* in_deriv,
-                                       const float* rij,
-                                       const int* nlist,
-                                       const int nloc,
-                                       const int nall,
-                                       const int nnei);
-template void prod_virial_r_gpu<double>(double* virial,
-                                        double* atom_virial,
-                                        const double* net_deriv,
-                                        const double* in_deriv,
-                                        const double* rij,
-                                        const int* nlist,
-                                        const int nloc,
-                                        const int nall,
-                                        const int nnei);
-}  // namespace deepmd
diff --git a/source/lib/src/rocm/prod_virial_grad.hip.cu b/source/lib/src/rocm/prod_virial_grad.hip.cu
deleted file mode 100644
index d41a1689ce..0000000000
--- a/source/lib/src/rocm/prod_virial_grad.hip.cu
+++ /dev/null
@@ -1,154 +0,0 @@
-#include "device.h"
-#include "prod_virial_grad.h"
-
-template <typename FPTYPE>
-__device__ inline FPTYPE dev_dot9(const FPTYPE* arr1, const FPTYPE* arr2) {
-  FPTYPE result = (FPTYPE)0.0;
-  for (int ii = 0; ii < 9; ii++) {
-    result += arr1[ii] * arr2[ii];
-  }
-  return result;
-}
-
-template <typename FPTYPE>
-__global__ void virial_grad_wrt_neighbors_a(FPTYPE* grad_net,
-                                            const FPTYPE* grad,
-                                            const FPTYPE* env_deriv,
-                                            const FPTYPE* rij,
-                                            const int* nlist,
-                                            const int nloc,
-                                            const int nnei) {
-  // idy -> nnei
-  const unsigned int tid = threadIdx.x;
-  const int_64 idx = blockIdx.x * blockDim.x + tid;
-  const unsigned int idy = blockIdx.y;
-  const unsigned int idw = threadIdx.y;
-  const int ndescrpt = nnei * 4;
-  __shared__ FPTYPE grad_one[9];
-  if (tid < 9) {
-    grad_one[tid] = grad[tid];
-  }
-  __syncthreads();
-  if (idx >= nloc) {
-    return;
-  }
-  int j_idx = nlist[idx * nnei + idy];
-  if (j_idx < 0) {
-    return;
-  }
-  FPTYPE tmp[9];
-  for (int dd0 = 0; dd0 < 3; ++dd0) {
-    for (int dd1 = 0; dd1 < 3; ++dd1) {
-      tmp[dd0 * 3 + dd1] =
-          rij[idx * nnei * 3 + idy * 3 + dd1] *
-          env_deriv[idx * ndescrpt * 3 + idy * 4 * 3 + idw * 3 + dd0];
-    }
-  }
-  grad_net[idx * ndescrpt + idy * 4 + idw] -= -1.0 * dev_dot9(grad_one, tmp);
-}
-
-template <typename FPTYPE>
-__global__ void virial_grad_wrt_neighbors_r(FPTYPE* grad_net,
-                                            const FPTYPE* grad,
-                                            const FPTYPE* env_deriv,
-                                            const FPTYPE* rij,
-                                            const int* nlist,
-                                            const int nloc,
-                                            const int nnei) {
-  // idy -> nnei
-  const unsigned int tid = threadIdx.x;
-  const int_64 idx = blockIdx.x * blockDim.x + tid;
-  const unsigned int idy = blockIdx.y;
-  const int ndescrpt = nnei;
-  __shared__ FPTYPE grad_one[9];
-  if (tid < 9) {
-    grad_one[tid] = grad[tid];
-  }
-  __syncthreads();
-  if (idx >= nloc) {
-    return;
-  }
-  int j_idx = nlist[idx * nnei + idy];
-  if (j_idx < 0) {
-    return;
-  }
-  FPTYPE tmp[9];
-  for (int dd0 = 0; dd0 < 3; ++dd0) {
-    for (int dd1 = 0; dd1 < 3; ++dd1) {
-      tmp[dd0 * 3 + dd1] = rij[idx * nnei * 3 + idy * 3 + dd1] *
-                           env_deriv[idx * ndescrpt * 3 + idy * 3 + dd0];
-    }
-  }
-  grad_net[idx * ndescrpt + idy] -= (FPTYPE)-1.0 * dev_dot9(grad_one, tmp);
-}
-
-namespace deepmd {
-template <typename FPTYPE>
-void prod_virial_grad_a_gpu(FPTYPE* grad_net,
-                            const FPTYPE* grad,
-                            const FPTYPE* env_deriv,
-                            const FPTYPE* rij,
-                            const int* nlist,
-                            const int nloc,
-                            const int nnei) {
-  const int ndescrpt = nnei * 4;
-  DPErrcheck(hipMemset(grad_net, 0, sizeof(FPTYPE) * nloc * ndescrpt));
-  const int LEN = 128;
-  const int nblock = (nloc + LEN - 1) / LEN;
-  dim3 block_grid(nblock, nnei);
-  dim3 thread_grid(LEN, 4);
-  hipLaunchKernelGGL(virial_grad_wrt_neighbors_a, block_grid, thread_grid, 0, 0,
-                     grad_net, grad, env_deriv, rij, nlist, nloc, nnei);
-  DPErrcheck(hipGetLastError());
-  DPErrcheck(hipDeviceSynchronize());
-}
-
-template <typename FPTYPE>
-void prod_virial_grad_r_gpu(FPTYPE* grad_net,
-                            const FPTYPE* grad,
-                            const FPTYPE* env_deriv,
-                            const FPTYPE* rij,
-                            const int* nlist,
-                            const int nloc,
-                            const int nnei) {
-  const int ndescrpt = nnei;
-  DPErrcheck(hipMemset(grad_net, 0, sizeof(FPTYPE) * nloc * ndescrpt));
-  const int LEN = 128;
-  const int nblock = (nloc + LEN - 1) / LEN;
-  dim3 block_grid(nblock, nnei);
-  dim3 thread_grid(LEN, 1);
-  hipLaunchKernelGGL(virial_grad_wrt_neighbors_r, block_grid, thread_grid, 0, 0,
-                     grad_net, grad, env_deriv, rij, nlist, nloc, nnei);
-  DPErrcheck(hipGetLastError());
-  DPErrcheck(hipDeviceSynchronize());
-}
-
-template void prod_virial_grad_a_gpu<float>(float* grad_net,
-                                            const float* grad,
-                                            const float* env_deriv,
-                                            const float* rij,
-                                            const int* nlist,
-                                            const int nloc,
-                                            const int nnei);
-template void prod_virial_grad_a_gpu<double>(double* grad_net,
-                                             const double* grad,
-                                             const double* env_deriv,
-                                             const double* rij,
-                                             const int* nlist,
-                                             const int nloc,
-                                             const int nnei);
-template void prod_virial_grad_r_gpu<float>(float* grad_net,
-                                            const float* grad,
-                                            const float* env_deriv,
-                                            const float* rij,
-                                            const int* nlist,
-                                            const int nloc,
-                                            const int nnei);
-template void prod_virial_grad_r_gpu<double>(double* grad_net,
-                                             const double* grad,
-                                             const double* env_deriv,
-                                             const double* rij,
-                                             const int* nlist,
-                                             const int nloc,
-                                             const int nnei);
-}  // namespace deepmd
diff --git a/source/lib/src/rocm/region.hip.cu b/source/lib/src/rocm/region.hip.cu
deleted file mode 100644
index de67ef648c..0000000000
--- a/source/lib/src/rocm/region.hip.cu
+++ /dev/null
@@ -1,65 +0,0 @@
-#include "device.h"
-#include "region.cuh"
-#include "region.h"
-
-template <typename FPTYPE>
-__global__ void _phys2Inter(FPTYPE *inter,
-                            const FPTYPE *phys,
-                            const FPTYPE *rec_boxt) {
-  phys2Inter(inter, phys, rec_boxt);
-}
-
-template <typename FPTYPE>
-__global__ void _inter2Phys(FPTYPE *phys,
-                            const FPTYPE *inter,
-                            const FPTYPE *boxt) {
-  inter2Phys(phys, inter, boxt);
-}
-
-template <typename FPTYPE>
-__global__ void _compute_volume(FPTYPE *volume, const FPTYPE *boxt) {
-  volume[0] = compute_volume(boxt);
-}
-
-namespace deepmd {
-// only for unittest
-template <typename FPTYPE>
-void convert_to_inter_gpu(FPTYPE *ri,
-                          const Region<FPTYPE> &region,
-                          const FPTYPE *rp) {
-  hipLaunchKernelGGL(_phys2Inter, 1, 1, 0, 0, ri, rp, region.rec_boxt);
-  DPErrcheck(hipGetLastError());
-  DPErrcheck(hipDeviceSynchronize());
-}
-
-template <typename FPTYPE>
-void convert_to_phys_gpu(FPTYPE *rp,
-                         const Region<FPTYPE> &region,
-                         const FPTYPE *ri) {
-  hipLaunchKernelGGL(_inter2Phys, 1, 1, 0, 0, rp, ri, region.boxt);
-  DPErrcheck(hipGetLastError());
-  DPErrcheck(hipDeviceSynchronize());
-}
-
-template <typename FPTYPE>
-void volume_gpu(FPTYPE *volume, const Region<FPTYPE> &region) {
-  hipLaunchKernelGGL(_compute_volume, 1, 1, 0, 0, volume, region.boxt);
-  DPErrcheck(hipGetLastError());
-  DPErrcheck(hipDeviceSynchronize());
-}
-
-template void convert_to_inter_gpu<float>(float *ri,
-                                          const Region<float> &region,
-                                          const float *rp);
-template void convert_to_inter_gpu<double>(double *ri,
-                                           const Region<double> &region,
-                                           const double *rp);
-template void convert_to_phys_gpu<float>(float *rp,
-                                         const Region<float> &region,
-                                         const float *ri);
-template void convert_to_phys_gpu<double>(double *rp,
-                                          const Region<double> &region,
-                                          const double *ri);
-template void volume_gpu<float>(float *volume, const Region<float> &region);
-template void volume_gpu<double>(double *volume, const Region<double> &region);
-}  // namespace deepmd
diff --git a/source/lib/src/rocm/tabulate.hip.cu b/source/lib/src/rocm/tabulate.hip.cu
deleted file mode 100644
index 88a1cbb574..0000000000
--- a/source/lib/src/rocm/tabulate.hip.cu
+++ /dev/null
@@ -1,1036 +0,0 @@
-#include "device.h"
-#include "tabulate.h"
-
-#define MM 4
-#define KK 4
-#define TPB 256
-#define WARP_SIZE 64
-#define FULL_MASK 0xffffffff
-
-template <typename FPTYPE>
-__forceinline__ __device__ void locate_xx(FPTYPE& xx,
-                                          int& table_idx,
-                                          const FPTYPE& lower,
-                                          const FPTYPE& upper,
-                                          const FPTYPE& max,
-                                          const FPTYPE& stride0,
-                                          const FPTYPE& stride1) {
-  if (xx < lower) {
-    table_idx = 0;
-    xx = (FPTYPE)0.;
-  } else if (xx < upper) {
-    table_idx = (int)((xx - lower) / stride0);
-    xx -= (table_idx * stride0 + lower);
-  } else if (xx < max) {
-    int first_stride = int((upper - lower) / stride0);
-    table_idx = first_stride + (int)((xx - upper) / stride1);
-    xx -= ((table_idx - first_stride) * stride1 + upper);
-  } else {
-    table_idx =
-        int((upper - lower) / stride0) + (int)((max - upper) / stride1) - 1;
-    xx = (FPTYPE)0.;
-  }
-}
-
-template <typename FPTYPE>
-__forceinline__ __device__ void locate_xx_se_t(FPTYPE& xx,
-                                               int& table_idx,
-                                               const FPTYPE& lower,
-                                               const FPTYPE& upper,
-                                               const FPTYPE& min,
-                                               const FPTYPE& max,
-                                               const FPTYPE& stride0,
-                                               const FPTYPE& stride1) {
-  if (xx < min) {
-    table_idx = 0;
-    xx = (FPTYPE)0.;
-  } else if (xx < lower) {
-    table_idx = (int)((xx - min) / stride1);
-    xx -= (table_idx * stride1 + min);
-  } else if (xx < upper) {
-    int first_stride = int((lower - min) / stride1);
-    table_idx = first_stride + (int)((xx - lower) / stride0);
-    xx -= ((table_idx - first_stride) * stride0 + lower);
-  } else if (xx < max) {
-    int first_stride =
-        int((lower - min) / stride1) + int((upper - lower) / stride0);
-    table_idx = first_stride + (int)((xx - upper) / stride1);
-    xx -= ((table_idx - first_stride) * stride1 + upper);
-  } else {
-    table_idx = int((lower - min) / stride1) + int((upper - lower) / stride0) +
-                (int)((max - upper) / stride1) - 1;
-    xx = (FPTYPE)0.;
-  }
-}
-
-template <typename FPTYPE>
-__forceinline__ __device__ FPTYPE dot(FPTYPE ll[4], FPTYPE rr[4]) {
-  return ll[0] * rr[0] + ll[1] * rr[1] + ll[2] * rr[2] + ll[3] * rr[3];
-}
-
-template <typename FPTYPE>
-__forceinline__ __device__ void warp_reduce(FPTYPE& val) {
-  for (int offset = 32; offset > 0; offset >>= 1) {
-    val += __shfl_down(val, offset);  // ########????
-  }
-}
-
-template <typename FPTYPE, int MTILE, int KTILE>
-__global__ void tabulate_fusion_se_a_fifth_order_polynomial(
-    FPTYPE* out,
-    const FPTYPE* table,
-    const FPTYPE* em_x,
-    const FPTYPE* em,
-    const FPTYPE* two_embed,
-    const FPTYPE lower,
-    const FPTYPE upper,
-    const FPTYPE max,
-    const FPTYPE stride0,
-    const FPTYPE stride1,
-    const int nnei,
-    const int last_layer_size,
-    const bool is_sorted) {
-  bool enable_se_atten = two_embed != nullptr;
-  HIP_DYNAMIC_SHARED(int, _data)
-  const int_64 block_idx = blockIdx.x;  // nloc
-  const int thread_idx = threadIdx.x;   // last_layer_size
-  FPTYPE ago = __shfl(em_x[block_idx * nnei + nnei - 1], 0);
-  bool unloop = false;
-  int breakpoint = nnei - 1;
-  FPTYPE* iteratorC = (FPTYPE*)&_data[0];
-  for (int kk = 0; kk < MTILE; kk++) {
-    iteratorC[kk * last_layer_size + thread_idx] = (FPTYPE)0.;
-  }
-  __syncthreads();
-
-  for (int ii = 0; ii < nnei; ii++) {
-    FPTYPE var[6];
-    FPTYPE xx = em_x[block_idx * nnei + ii];
-    if (xx == ago && is_sorted) {
-      unloop = true;
-      breakpoint = ii;
-    }
-    int table_idx = 0;
-    locate_xx(xx, table_idx, lower, upper, max, stride0, stride1);
-    var[0] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 0];
-    var[1] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 1];
-    var[2] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 2];
-    var[3] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 3];
-    var[4] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 4];
-    var[5] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 5];
-    FPTYPE res =
-        var[0] +
-        (var[1] + (var[2] + (var[3] + (var[4] + var[5] * xx) * xx) * xx) * xx) *
-            xx;
-    if (enable_se_atten) {
-      FPTYPE t = two_embed[block_idx * nnei * last_layer_size +
-                           ii * last_layer_size + thread_idx];
-      res = res * t + res;
-    }
-
-    for (int kk = 0; kk < MTILE; kk++) {
-      iteratorC[kk * last_layer_size + thread_idx] +=
-          (nnei - breakpoint) * em[block_idx * nnei * MTILE + ii * MTILE + kk] *
-          res;
-    }
-    if (unloop) {
-      break;
-    }
-  }
-  for (int ii = 0; ii < MTILE; ii++) {
-    out[block_idx * MTILE * last_layer_size + ii * last_layer_size +
-        thread_idx] = iteratorC[ii * last_layer_size + thread_idx];
-  }
-}
-
-template <typename FPTYPE, int MTILE, int KTILE>
-__global__ void tabulate_fusion_se_a_grad_fifth_order_polynomial(
-    FPTYPE* dy_dem_x,
-    FPTYPE* dy_dem,
-    const FPTYPE* table,
-    const FPTYPE* em_x,
-    const FPTYPE* em,
-    const FPTYPE* two_embed,
-    const FPTYPE* dy,
-    const FPTYPE lower,
-    const FPTYPE upper,
-    const FPTYPE max,
-    const FPTYPE stride0,
-    const FPTYPE stride1,
-    const int nnei,
-    const int last_layer_size,
-    const bool is_sorted) {
-  bool enable_se_atten = two_embed != nullptr;
-  HIP_DYNAMIC_SHARED(int, _data)
-  const int_64 block_idx = blockIdx.x;  // nloc
-  const int thread_idx = threadIdx.x;   // KTILE * WARP_SIZE, usally 128 here~
-  int warp_idx = __shfl(threadIdx.x / 64, 0);
-  int lane_idx = threadIdx.x % 64;
-  int breakpoint = nnei - 1;
-  bool unloop = false;
-  FPTYPE* iteratorA = (FPTYPE*)&_data[0];  // dy
-  for (int ii = 0; ii < MTILE; ii++) {
-    for (int jj = thread_idx; jj < last_layer_size; jj += blockDim.x) {
-      iteratorA[ii * last_layer_size + jj] =
-          dy[block_idx * MTILE * last_layer_size + ii * last_layer_size + jj];
-    }
-  }
-  __syncthreads();
-  FPTYPE ago = __shfl(em_x[block_idx * nnei + nnei - 1], 0);
-  for (int ii = 0; ii < nnei - warp_idx; ii += KTILE) {
-    FPTYPE xx = em_x[block_idx * nnei + ii + warp_idx];
-    if (ago == xx && is_sorted) {
-      unloop = true;
-      breakpoint = ii + warp_idx;
-    }
-
-    int table_idx = 0;
-    locate_xx(xx, table_idx, lower, upper, max, stride0, stride1);
-    FPTYPE sum[KTILE] = {(FPTYPE)0.};
-    FPTYPE Csub = (FPTYPE)0.;
-    for (int jj = lane_idx; jj < last_layer_size; jj += WARP_SIZE) {
-      FPTYPE var[6];
-      // load iteratorB through table
-      var[0] = table[table_idx * last_layer_size * 6 + 6 * jj + 0];
-      var[1] = table[table_idx * last_layer_size * 6 + 6 * jj + 1];
-      var[2] = table[table_idx * last_layer_size * 6 + 6 * jj + 2];
-      var[3] = table[table_idx * last_layer_size * 6 + 6 * jj + 3];
-      var[4] = table[table_idx * last_layer_size * 6 + 6 * jj + 4];
-      var[5] = table[table_idx * last_layer_size * 6 + 6 * jj + 5];
-      FPTYPE res =
-          var[0] +
-          (var[1] +
-           (var[2] + (var[3] + (var[4] + var[5] * xx) * xx) * xx) * xx) *
-              xx;
-      FPTYPE t;
-      if (enable_se_atten) {
-        t = two_embed[block_idx * nnei * last_layer_size +
-                      ii * last_layer_size + jj];
-        res = res * t + res;
-      }
-
-      for (int kk = 0; kk < KTILE; kk++) {
-        sum[kk] +=
-            (nnei - breakpoint) * iteratorA[kk * last_layer_size + jj] * res;
-      }
-      res = em[block_idx * nnei * MTILE + (ii + warp_idx) * 4 + 0] *
-            iteratorA[0 * last_layer_size + jj];
-      res += em[block_idx * nnei * MTILE + (ii + warp_idx) * 4 + 1] *
-             iteratorA[1 * last_layer_size + jj];
-      res += em[block_idx * nnei * MTILE + (ii + warp_idx) * 4 + 2] *
-             iteratorA[2 * last_layer_size + jj];
-      res += em[block_idx * nnei * MTILE + (ii + warp_idx) * 4 + 3] *
-             iteratorA[3 * last_layer_size + jj];
-      Csub +=
-          (nnei - breakpoint) *
-          (var[1] + ((FPTYPE)2. * var[2] +
-                     ((FPTYPE)3. * var[3] +
-                      ((FPTYPE)4. * var[4] + (FPTYPE)5. * var[5] * xx) * xx) *
-                         xx) *
-                        xx) *
-          (enable_se_atten ? res * t + res : res);
-    }
-    //__syncwarp();->syncwrap
-    __syncthreads();
-    for (int kk = 0; kk < KTILE; kk++) {
-      warp_reduce(sum[kk]);
-    }
-    warp_reduce(Csub);
-    if (lane_idx == 0) {
-      for (int kk = 0; kk < KTILE; kk++) {
-        dy_dem[block_idx * nnei * MTILE + (ii + warp_idx) * 4 + kk] = sum[kk];
-      }
-      dy_dem_x[block_idx * nnei + ii + warp_idx] = Csub;
-    }
-    if (unloop) {
-      break;
-    }
-  }
-}
-
-template <typename FPTYPE, int MTILE, int KTILE>
-__global__ void tabulate_fusion_se_a_grad_grad_fifth_order_polynomial(
-    FPTYPE* dz_dy,
-    const FPTYPE* table,
-    const FPTYPE* em_x,
-    const FPTYPE* em,
-    const FPTYPE* dz_dy_dem_x,
-    const FPTYPE* dz_dy_dem,
-    const FPTYPE lower,
-    const FPTYPE upper,
-    const FPTYPE max,
-    const FPTYPE stride0,
-    const FPTYPE stride1,
-    const int nnei,
-    const int last_layer_size,
-    const bool is_sorted) {
-  extern __shared__ int _data[];
-  const int_64 block_idx = blockIdx.x;  // nloc
-  const int thread_idx = threadIdx.x;   // last_layer_size
-  FPTYPE ago = __shfl(em_x[block_idx * nnei + nnei - 1], 0);
-  bool unloop = false;
-  int breakpoint = nnei - 1;
-  FPTYPE* iteratorC = (FPTYPE*)&_data[0];
-  for (int kk = 0; kk < MTILE; kk++) {
-    iteratorC[kk * last_layer_size + thread_idx] = (FPTYPE)0.;
-  }
-  __syncthreads();
-
-  for (int ii = 0; ii < nnei; ii++) {
-    FPTYPE var[6];
-    FPTYPE xx = em_x[block_idx * nnei + ii];
-    FPTYPE dz_xx = dz_dy_dem_x[block_idx * nnei + ii];
-    if (xx == ago && is_sorted) {
-      unloop = true;
-      breakpoint = ii;
-    }
-    int table_idx = 0;
-    locate_xx(xx, table_idx, lower, upper, max, stride0, stride1);
-    var[0] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 0];
-    var[1] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 1];
-    var[2] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 2];
-    var[3] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 3];
-    var[4] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 4];
-    var[5] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 5];
-    FPTYPE res =
-        var[0] +
-        (var[1] + (var[2] + (var[3] + (var[4] + var[5] * xx) * xx) * xx) * xx) *
-            xx;
-    FPTYPE res_grad =
-        var[1] + ((FPTYPE)2. * var[2] +
-                  ((FPTYPE)3. * var[3] +
-                   ((FPTYPE)4. * var[4] + (FPTYPE)5. * var[5] * xx) * xx) *
-                      xx) *
-                     xx;
-
-    for (int kk = 0; kk < MTILE; kk++) {
-      int em_index = block_idx * nnei * MTILE + ii * MTILE + kk;
-      iteratorC[kk * last_layer_size + thread_idx] +=
-          (nnei - breakpoint) *
-          (em[em_index] * res_grad * dz_xx + dz_dy_dem[em_index] * res);
-    }
-    if (unloop) {
-      break;
-    }
-  }
-  for (int ii = 0; ii < MTILE; ii++) {
-    dz_dy[block_idx * MTILE * last_layer_size + ii * last_layer_size +
-          thread_idx] = iteratorC[ii * last_layer_size + thread_idx];
-  }
-}
-
-template <typename FPTYPE, int MTILE, int KTILE>
-__global__ void tabulate_fusion_se_t_fifth_order_polynomial(
-    FPTYPE* out,
-    const FPTYPE* table,
-    const FPTYPE* em_x,
-    const FPTYPE* em,
-    const FPTYPE lower,
-    const FPTYPE upper,
-    const FPTYPE max,
-    const FPTYPE stride0,
-    const FPTYPE stride1,
-    const int nnei_i,
-    const int nnei_j,
-    const int last_layer_size) {
-  HIP_DYNAMIC_SHARED(int, _data)
-  const int_64 block_idx = blockIdx.x;  // nloc
-  const int thread_idx = threadIdx.x;   // last_layer_size
-
-  FPTYPE sum = (FPTYPE)0.;
-  for (int ii = 0; ii < nnei_i; ii++) {
-    for (int jj = 0; jj < nnei_j; jj++) {
-      FPTYPE xx = em_x[block_idx * nnei_i * nnei_j + ii * nnei_j + jj];
-      FPTYPE tmp = xx;
-      int table_idx = 0;
-      locate_xx_se_t(xx, table_idx, lower, upper, -max, max, stride0, stride1);
-      FPTYPE var[6];
-      var[0] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 0];
-      var[1] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 1];
-      var[2] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 2];
-      var[3] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 3];
-      var[4] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 4];
-      var[5] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 5];
-      FPTYPE res =
-          var[0] +
-          (var[1] +
-           (var[2] + (var[3] + (var[4] + var[5] * xx) * xx) * xx) * xx) *
-              xx;
-
-      sum += tmp * res;
-    }
-  }
-  out[block_idx * last_layer_size + thread_idx] = sum;
-}
-
-template <typename FPTYPE, int MTILE, int KTILE>
-__global__ void tabulate_fusion_se_t_grad_fifth_order_polynomial(
-    FPTYPE* dy_dem_x,
-    FPTYPE* dy_dem,
-    const FPTYPE* table,
-    const FPTYPE* em_x,
-    const FPTYPE* em,
-    const FPTYPE* dy,
-    const FPTYPE lower,
-    const FPTYPE upper,
-    const FPTYPE max,
-    const FPTYPE stride0,
-    const FPTYPE stride1,
-    const int nnei_i,
-    const int nnei_j,
-    const int last_layer_size) {
-  HIP_DYNAMIC_SHARED(int, _data)
-  const int_64 block_idx = blockIdx.x;  // nloc
-  const int thread_idx = threadIdx.x;   // KTILE * WARP_SIZE, usally 128 here~
-  int warp_idx = __shfl(threadIdx.x / 64, 0);
-  int lane_idx = threadIdx.x % 64;
-  FPTYPE* iteratorA = (FPTYPE*)&_data[0];  // dy
-  for (int ii = thread_idx; ii < last_layer_size; ii += blockDim.x) {
-    iteratorA[ii] = dy[block_idx * last_layer_size + ii];
-  }
-  __syncthreads();
-
-  for (int ii = 0; ii < nnei_i; ii++) {
-    FPTYPE ago =
-        __shfl(em_x[block_idx * nnei_i * nnei_j + ii * nnei_j + nnei_j - 1], 0);
-    for (int jj = warp_idx; jj < nnei_j; jj += KTILE) {
-      FPTYPE xx = em_x[block_idx * nnei_i * nnei_j + ii * nnei_j + jj];
-      FPTYPE tmp = xx;
-      int table_idx = 0;
-      locate_xx_se_t(xx, table_idx, lower, upper, -max, max, stride0, stride1);
-      FPTYPE sum = (FPTYPE)0.;
-      FPTYPE Csub = (FPTYPE)0.;
-      for (int kk = lane_idx; kk < last_layer_size; kk += WARP_SIZE) {
-        FPTYPE var[6];
-        // load iteratorB through table
-        var[0] = table[table_idx * last_layer_size * 6 + 6 * kk + 0];
-        var[1] = table[table_idx * last_layer_size * 6 + 6 * kk + 1];
-        var[2] = table[table_idx * last_layer_size * 6 + 6 * kk + 2];
-        var[3] = table[table_idx * last_layer_size * 6 + 6 * kk + 3];
-        var[4] = table[table_idx * last_layer_size * 6 + 6 * kk + 4];
-        var[5] = table[table_idx * last_layer_size * 6 + 6 * kk + 5];
-        FPTYPE res =
-            var[0] +
-            (var[1] +
-             (var[2] + (var[3] + (var[4] + var[5] * xx) * xx) * xx) * xx) *
-                xx;
-
-        sum += iteratorA[kk] * res;
-        Csub +=
-            iteratorA[kk] * tmp *
-            (var[1] + ((FPTYPE)2. * var[2] +
-                       ((FPTYPE)3. * var[3] +
-                        ((FPTYPE)4. * var[4] + (FPTYPE)5. * var[5] * xx) * xx) *
-                           xx) *
-                          xx);
-      }
-      __syncthreads();
-      warp_reduce(sum);
-      warp_reduce(Csub);
-      if (lane_idx == 0) {
-        dy_dem[block_idx * nnei_i * nnei_j + ii * nnei_j + jj] = sum;
-        dy_dem_x[block_idx * nnei_i * nnei_j + ii * nnei_j + jj] = Csub;
-      }
-    }
-  }
-}
-
-template <typename FPTYPE, int MTILE, int KTILE>
-__global__ void tabulate_fusion_se_t_grad_grad_fifth_order_polynomial(
-    FPTYPE* dz_dy,
-    const FPTYPE* table,
-    const FPTYPE* em_x,
-    const FPTYPE* em,
-    const FPTYPE* dz_dy_dem_x,
-    const FPTYPE* dz_dy_dem,
-    const FPTYPE lower,
-    const FPTYPE upper,
-    const FPTYPE max,
-    const FPTYPE stride0,
-    const FPTYPE stride1,
-    const int nnei_i,
-    const int nnei_j,
-    const int last_layer_size) {
-  const int_64 block_idx = blockIdx.x;  // nloc
-  const int thread_idx = threadIdx.x;   // last_layer_size
-
-  FPTYPE sum = (FPTYPE)0.;
-  for (int ii = 0; ii < nnei_i; ii++) {
-    FPTYPE ago =
-        __shfl(em_x[block_idx * nnei_i * nnei_j + ii * nnei_j + nnei_j - 1], 0);
-    for (int jj = 0; ii < nnei_j; jj++) {
-      FPTYPE xx = em_x[block_idx * nnei_i * nnei_j + ii * nnei_j + jj];
-      FPTYPE tmp = xx;
-      FPTYPE dz_xx =
-          dz_dy_dem_x[block_idx * nnei_i * nnei_j + ii * nnei_j + jj];
-      FPTYPE dz_em = dz_dy_dem[block_idx * nnei_i * nnei_j + ii * nnei_j + jj];
-      FPTYPE var[6];
-
-      int table_idx = 0;
-      locate_xx_se_t(xx, table_idx, lower, upper, -max, max, stride0, stride1);
-      var[0] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 0];
-      var[1] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 1];
-      var[2] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 2];
-      var[3] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 3];
-      var[4] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 4];
-      var[5] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 5];
-      FPTYPE res =
-          var[0] +
-          (var[1] +
-           (var[2] + (var[3] + (var[4] + var[5] * xx) * xx) * xx) * xx) *
-              xx;
-      FPTYPE res_grad =
-          var[1] + ((FPTYPE)2. * var[2] +
-                    ((FPTYPE)3. * var[3] +
-                     ((FPTYPE)4. * var[4] + (FPTYPE)5. * var[5] * xx) * xx) *
-                        xx) *
-                       xx;
-
-      sum += (tmp * res_grad * dz_xx + dz_em * res);
-    }
-  }
-  dz_dy[block_idx * last_layer_size + thread_idx] = sum;
-}
-
-template <typename FPTYPE, int MTILE, int KTILE>
-__global__ void tabulate_fusion_se_r_fifth_order_polynomial(
-    FPTYPE* out,
-    const FPTYPE* table,
-    const FPTYPE* em,
-    const FPTYPE lower,
-    const FPTYPE upper,
-    const FPTYPE max,
-    const FPTYPE stride0,
-    const FPTYPE stride1,
-    const int nnei,
-    const int last_layer_size) {
-  HIP_DYNAMIC_SHARED(int, _data)
-  const int_64 block_idx = blockIdx.x;  // nloc
-  const int thread_idx = threadIdx.x;   // last_layer_size
-
-  for (int ii = 0; ii < nnei; ii++) {
-    FPTYPE var[6];
-    FPTYPE xx = em[block_idx * nnei + ii];
-    int table_idx = 0;
-    locate_xx(xx, table_idx, lower, upper, max, stride0, stride1);
-    var[0] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 0];
-    var[1] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 1];
-    var[2] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 2];
-    var[3] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 3];
-    var[4] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 4];
-    var[5] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 5];
-    out[block_idx * nnei * last_layer_size + ii * last_layer_size +
-        thread_idx] =
-        var[0] +
-        (var[1] + (var[2] + (var[3] + (var[4] + var[5] * xx) * xx) * xx) * xx) *
-            xx;
-  }
-}
-
-template <typename FPTYPE, int MTILE, int KTILE>
-__global__ void tabulate_fusion_se_r_grad_fifth_order_polynomial(
-    FPTYPE* dy_dem,
-    const FPTYPE* table,
-    const FPTYPE* em,
-    const FPTYPE* dy,
-    const FPTYPE lower,
-    const FPTYPE upper,
-    const FPTYPE max,
-    const FPTYPE stride0,
-    const FPTYPE stride1,
-    const int nnei,
-    const int last_layer_size) {
-  HIP_DYNAMIC_SHARED(int, _data)
-  const int_64 block_idx = blockIdx.x;  // nloc
-  const int thread_idx = threadIdx.x;   // KTILE * WARP_SIZE, usally 128 here~
-  int warp_idx = __shfl(threadIdx.x / 64, 0);
-  int lane_idx = threadIdx.x % 64;
-
-  for (int ii = 0; ii < nnei; ii += KTILE) {
-    FPTYPE xx = em[block_idx * nnei + ii + warp_idx];
-
-    int table_idx = 0;
-    locate_xx(xx, table_idx, lower, upper, max, stride0, stride1);
-    FPTYPE Csub = 0.f;
-    for (int jj = lane_idx; jj < last_layer_size; jj += WARP_SIZE) {
-      FPTYPE var[6];
-      // load iteratorB through table
-      var[0] = table[table_idx * last_layer_size * 6 + 6 * jj + 0];
-      var[1] = table[table_idx * last_layer_size * 6 + 6 * jj + 1];
-      var[2] = table[table_idx * last_layer_size * 6 + 6 * jj + 2];
-      var[3] = table[table_idx * last_layer_size * 6 + 6 * jj + 3];
-      var[4] = table[table_idx * last_layer_size * 6 + 6 * jj + 4];
-      var[5] = table[table_idx * last_layer_size * 6 + 6 * jj + 5];
-      Csub +=
-          (var[1] + ((FPTYPE)2. * var[2] +
-                     ((FPTYPE)3. * var[3] +
-                      ((FPTYPE)4. * var[4] + (FPTYPE)5. * var[5] * xx) * xx) *
-                         xx) *
-                        xx) *
-          dy[block_idx * nnei * last_layer_size + ii * last_layer_size + jj];
-    }
-    //__syncwarp();->syncwrap
-    __syncthreads();
-    warp_reduce(Csub);
-    if (lane_idx == 0) {
-      dy_dem[block_idx * nnei + ii + warp_idx] = Csub;
-    }
-  }
-}
-
-template <typename FPTYPE, int MTILE, int KTILE>
-__global__ void tabulate_fusion_se_r_grad_grad_fifth_order_polynomial(
-    FPTYPE* dz_dy,
-    const FPTYPE* table,
-    const FPTYPE* em,
-    const FPTYPE* dz_dy_dem,
-    const FPTYPE lower,
-    const FPTYPE upper,
-    const FPTYPE max,
-    const FPTYPE stride0,
-    const FPTYPE stride1,
-    const int nnei,
-    const int last_layer_size) {
-  extern __shared__ int _data[];
-  const int_64 block_idx = blockIdx.x;  // nloc
-  const int thread_idx = threadIdx.x;   // last_layer_size
-
-  __syncthreads();
-
-  for (int ii = 0; ii < nnei; ii++) {
-    FPTYPE var[6];
-    FPTYPE xx = em[block_idx * nnei + ii];
-    int table_idx = 0;
-    locate_xx(xx, table_idx, lower, upper, max, stride0, stride1);
-    var[0] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 0];
-    var[1] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 1];
-    var[2] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 2];
-    var[3] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 3];
-    var[4] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 4];
-    var[5] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 5];
-    FPTYPE res_grad =
-        var[1] + ((FPTYPE)2. * var[2] +
-                  ((FPTYPE)3. * var[3] +
-                   ((FPTYPE)4. * var[4] + (FPTYPE)5. * var[5] * xx) * xx) *
-                      xx) *
-                     xx;
-    dz_dy[block_idx * nnei * last_layer_size + ii * last_layer_size +
-          thread_idx] = dz_dy_dem[block_idx * nnei + ii] * res_grad;
-  }
-}
-
-namespace deepmd {
-template <typename FPTYPE>
-void tabulate_fusion_se_a_gpu(FPTYPE* out,
-                              const FPTYPE* table,
-                              const FPTYPE* table_info,
-                              const FPTYPE* em_x,
-                              const FPTYPE* em,
-                              const FPTYPE* two_embed,
-                              const int nloc,
-                              const int nnei,
-                              const int last_layer_size,
-                              const bool is_sorted) {
-  if (nloc <= 0) {
-    return;
-  }
-  hipLaunchKernelGGL(
-      HIP_KERNEL_NAME(
-          tabulate_fusion_se_a_fifth_order_polynomial<FPTYPE, MM, KK>),
-      nloc, last_layer_size, sizeof(FPTYPE) * MM * last_layer_size, 0, out,
-      table, em_x, em, two_embed, table_info[0], table_info[1], table_info[2],
-      table_info[3], table_info[4], nnei, last_layer_size, is_sorted);
-  DPErrcheck(hipGetLastError());
-  DPErrcheck(hipDeviceSynchronize());
-}
-
-template <typename FPTYPE>
-void tabulate_fusion_se_a_grad_gpu(FPTYPE* dy_dem_x,
-                                   FPTYPE* dy_dem,
-                                   const FPTYPE* table,
-                                   const FPTYPE* table_info,
-                                   const FPTYPE* em_x,
-                                   const FPTYPE* em,
-                                   const FPTYPE* two_embed,
-                                   const FPTYPE* dy,
-                                   const int nloc,
-                                   const int nnei,
-                                   const int last_layer_size,
-                                   const bool is_sorted) {
-  if (nloc <= 0) {
-    return;
-  }
-  DPErrcheck(hipMemset(dy_dem_x, 0, sizeof(FPTYPE) * nloc * nnei));
-  DPErrcheck(hipMemset(dy_dem, 0, sizeof(FPTYPE) * nloc * nnei * 4));
-
-  hipLaunchKernelGGL(
-      HIP_KERNEL_NAME(
-          tabulate_fusion_se_a_grad_fifth_order_polynomial<FPTYPE, MM, KK>),
-      nloc, KK * WARP_SIZE, sizeof(FPTYPE) * MM * last_layer_size, 0, dy_dem_x,
-      dy_dem, table, em_x, em, two_embed, dy, table_info[0], table_info[1],
-      table_info[2], table_info[3], table_info[4], nnei, last_layer_size,
-      is_sorted);
-  DPErrcheck(hipGetLastError());
-  DPErrcheck(hipDeviceSynchronize());
-}
-
-template <typename FPTYPE>
-void tabulate_fusion_se_a_grad_grad_gpu(FPTYPE* dz_dy,
-                                        const FPTYPE* table,
-                                        const FPTYPE* table_info,
-                                        const FPTYPE* em_x,
-                                        const FPTYPE* em,
-                                        const FPTYPE* dz_dy_dem_x,
-                                        const FPTYPE* dz_dy_dem,
-                                        const int nloc,
-                                        const int nnei,
-                                        const int last_layer_size,
-                                        const bool is_sorted) {
-  if (nloc <= 0) {
-    return;
-  }
-  DPErrcheck(hipMemset(dz_dy, 0, sizeof(FPTYPE) * nloc * 4 * last_layer_size));
-  hipLaunchKernelGGL(
-      HIP_KERNEL_NAME(
-          tabulate_fusion_se_a_grad_grad_fifth_order_polynomial<FPTYPE, MM,
-                                                                KK>),
-      nloc, last_layer_size, sizeof(FPTYPE) * MM * last_layer_size, 0, dz_dy,
-      table, em_x, em, dz_dy_dem_x, dz_dy_dem, table_info[0], table_info[1],
-      table_info[2], table_info[3], table_info[4], nnei, last_layer_size,
-      is_sorted);
-  DPErrcheck(hipGetLastError());
-  DPErrcheck(hipDeviceSynchronize());
-}
-
-template <typename FPTYPE>
-void tabulate_fusion_se_t_gpu(FPTYPE* out,
-                              const FPTYPE* table,
-                              const FPTYPE* table_info,
-                              const FPTYPE* em_x,
-                              const FPTYPE* em,
-                              const int nloc,
-                              const int nnei_i,
-                              const int nnei_j,
-                              const int last_layer_size) {
-  if (nloc <= 0) {
-    return;
-  }
-  hipLaunchKernelGGL(
-      HIP_KERNEL_NAME(
-          tabulate_fusion_se_t_fifth_order_polynomial<FPTYPE, MM, KK>),
-      nloc, last_layer_size, 0, 0, out, table, em_x, em, table_info[0],
-      table_info[1], table_info[2], table_info[3], table_info[4], nnei_i,
-      nnei_j, last_layer_size);
-  DPErrcheck(hipGetLastError());
-  DPErrcheck(hipDeviceSynchronize());
-}
-
-template <typename FPTYPE>
-void tabulate_fusion_se_t_grad_gpu(FPTYPE* dy_dem_x,
-                                   FPTYPE* dy_dem,
-                                   const FPTYPE* table,
-                                   const FPTYPE* table_info,
-                                   const FPTYPE* em_x,
-                                   const FPTYPE* em,
-                                   const FPTYPE* dy,
-                                   const int nloc,
-                                   const int nnei_i,
-                                   const int nnei_j,
-                                   const int last_layer_size) {
-  if (nloc <= 0) {
-    return;
-  }
-  DPErrcheck(hipMemset(dy_dem_x, 0, sizeof(FPTYPE) * nloc * nnei_i * nnei_j));
-  DPErrcheck(hipMemset(dy_dem, 0, sizeof(FPTYPE) * nloc * nnei_i * nnei_j));
-
-  hipLaunchKernelGGL(
-      HIP_KERNEL_NAME(
-          tabulate_fusion_se_t_grad_fifth_order_polynomial<FPTYPE, MM, KK>),
-      nloc, KK * WARP_SIZE, sizeof(FPTYPE) * last_layer_size, 0, dy_dem_x,
-      dy_dem, table, em_x, em, dy, table_info[0], table_info[1], table_info[2],
-      table_info[3], table_info[4], nnei_i, nnei_j, last_layer_size);
-  DPErrcheck(hipGetLastError());
-  DPErrcheck(hipDeviceSynchronize());
-}
-
-template <typename FPTYPE>
-void tabulate_fusion_se_t_grad_grad_gpu(FPTYPE* dz_dy,
-                                        const FPTYPE* table,
-                                        const FPTYPE* table_info,
-                                        const FPTYPE* em_x,
-                                        const FPTYPE* em,
-                                        const FPTYPE* dz_dy_dem_x,
-                                        const FPTYPE* dz_dy_dem,
-                                        const int nloc,
-                                        const int nnei_i,
-                                        const int nnei_j,
-                                        const int last_layer_size) {
-  if (nloc <= 0) {
-    return;
-  }
-  DPErrcheck(hipMemset(dz_dy, 0, sizeof(FPTYPE) * nloc * last_layer_size));
-  hipLaunchKernelGGL(
-      HIP_KERNEL_NAME(
-          tabulate_fusion_se_t_grad_grad_fifth_order_polynomial<FPTYPE, MM,
-                                                                KK>),
-      nloc, last_layer_size, 0, 0, dz_dy, table, em_x, em, dz_dy_dem_x,
-      dz_dy_dem, table_info[0], table_info[1], table_info[2], table_info[3],
-      table_info[4], nnei_i, nnei_j, last_layer_size);
-  DPErrcheck(hipGetLastError());
-  DPErrcheck(hipDeviceSynchronize());
-}
-
-template <typename FPTYPE>
-void tabulate_fusion_se_r_gpu(FPTYPE* out,
-                              const FPTYPE* table,
-                              const FPTYPE* table_info,
-                              const FPTYPE* em,
-                              const int nloc,
-                              const int nnei,
-                              const int last_layer_size) {
-  if (nloc <= 0) {
-    return;
-  }
-  hipLaunchKernelGGL(
-      HIP_KERNEL_NAME(
-          tabulate_fusion_se_r_fifth_order_polynomial<FPTYPE, MM, KK>),
-      nloc, last_layer_size, sizeof(FPTYPE) * MM * last_layer_size, 0, out,
-      table, em, table_info[0], table_info[1], table_info[2], table_info[3],
-      table_info[4], nnei, last_layer_size);
-  DPErrcheck(hipGetLastError());
-  DPErrcheck(hipDeviceSynchronize());
-}
-
-template <typename FPTYPE>
-void tabulate_fusion_se_r_grad_gpu(FPTYPE* dy_dem,
-                                   const FPTYPE* table,
-                                   const FPTYPE* table_info,
-                                   const FPTYPE* em,
-                                   const FPTYPE* dy,
-                                   const int nloc,
-                                   const int nnei,
-                                   const int last_layer_size) {
-  if (nloc <= 0) {
-    return;
-  }
-  DPErrcheck(hipMemset(dy_dem, 0, sizeof(FPTYPE) * nloc * nnei));
-
-  hipLaunchKernelGGL(
-      HIP_KERNEL_NAME(
-          tabulate_fusion_se_r_grad_fifth_order_polynomial<FPTYPE, MM, KK>),
-      nloc, KK * WARP_SIZE, sizeof(FPTYPE) * MM * last_layer_size, 0, dy_dem,
-      table, em, dy, table_info[0], table_info[1], table_info[2], table_info[3],
-      table_info[4], nnei, last_layer_size);
-  DPErrcheck(hipGetLastError());
-  DPErrcheck(hipDeviceSynchronize());
-}
-
-template <typename FPTYPE>
-void tabulate_fusion_se_r_grad_grad_gpu(FPTYPE* dz_dy,
-                                        const FPTYPE* table,
-                                        const FPTYPE* table_info,
-                                        const FPTYPE* em,
-                                        const FPTYPE* dz_dy_dem,
-                                        const int nloc,
-                                        const int nnei,
-                                        const int last_layer_size) {
-  if (nloc <= 0) {
-    return;
-  }
-  DPErrcheck(
-      hipMemset(dz_dy, 0, sizeof(FPTYPE) * nloc * nnei * last_layer_size));
-  hipLaunchKernelGGL(
-      HIP_KERNEL_NAME(
-          tabulate_fusion_se_r_grad_grad_fifth_order_polynomial<FPTYPE, MM,
-                                                                KK>),
-      nloc, last_layer_size, sizeof(FPTYPE) * MM * last_layer_size, 0, dz_dy,
-      table, em, dz_dy_dem, table_info[0], table_info[1], table_info[2],
-      table_info[3], table_info[4], nnei, last_layer_size);
-  DPErrcheck(hipGetLastError());
-  DPErrcheck(hipDeviceSynchronize());
-}
-
-template void tabulate_fusion_se_a_gpu<float>(float* out,
-                                              const float* table,
-                                              const float* table_info,
-                                              const float* em_x,
-                                              const float* em,
-                                              const float* two_embed,
-                                              const int nloc,
-                                              const int nnei,
-                                              const int last_layer_size,
-                                              const bool is_sorted);
-template void tabulate_fusion_se_a_gpu<double>(double* out,
-                                               const double* table,
-                                               const double* table_info,
-                                               const double* em_x,
-                                               const double* em,
-                                               const double* two_embed,
-                                               const int nloc,
-                                               const int nnei,
-                                               const int last_layer_size,
-                                               const bool is_sorted);
-template void tabulate_fusion_se_a_grad_gpu<float>(float* dy_dem_x,
-                                                   float* dy_dem,
-                                                   const float* table,
-                                                   const float* table_info,
-                                                   const float* em_x,
-                                                   const float* em,
-                                                   const float* two_embed,
-                                                   const float* dy,
-                                                   const int nloc,
-                                                   const int nnei,
-                                                   const int last_layer_size,
-                                                   const bool is_sorted);
-template void tabulate_fusion_se_a_grad_gpu<double>(double* dy_dem_x,
-                                                    double* dy_dem,
-                                                    const double* table,
-                                                    const double* table_info,
-                                                    const double* em_x,
-                                                    const double* em,
-                                                    const double* two_embed,
-                                                    const double* dy,
-                                                    const int nloc,
-                                                    const int nnei,
-                                                    const int last_layer_size,
-                                                    const bool is_sorted);
-template void tabulate_fusion_se_a_grad_grad_gpu<float>(
-    float* dz_dy,
-    const float* table,
-    const float* table_info,
-    const float* em_x,
-    const float* em,
-    const float* dz_dy_dem_x,
-    const float* dz_dy_dem,
-    const int nloc,
-    const int nnei,
-    const int last_layer_size,
-    const bool is_sorted);
-template void tabulate_fusion_se_a_grad_grad_gpu<double>(
-    double* dz_dy,
-    const double* table,
-    const double* table_info,
-    const double* em_x,
-    const double* em,
-    const double* dz_dy_dem_x,
-    const double* dz_dy_dem,
-    const int nloc,
-    const int nnei,
-    const int last_layer_size,
-    const bool is_sorted);
-
-template void tabulate_fusion_se_t_gpu<float>(float* out,
-                                              const float* table,
-                                              const float* table_info,
-                                              const float* em_x,
-                                              const float* em,
-                                              const int nloc,
-                                              const int nnei_i,
-                                              const int nnei_j,
-                                              const int last_layer_size);
-template void tabulate_fusion_se_t_gpu<double>(double* out,
-                                               const double* table,
-                                               const double* table_info,
-                                               const double* em_x,
-                                               const double* em,
-                                               const int nloc,
-                                               const int nnei_i,
-                                               const int nnei_j,
-                                               const int last_layer_size);
-template void tabulate_fusion_se_t_grad_gpu<float>(float* dy_dem_x,
-                                                   float* dy_dem,
-                                                   const float* table,
-                                                   const float* table_info,
-                                                   const float* em_x,
-                                                   const float* em,
-                                                   const float* dy,
-                                                   const int nloc,
-                                                   const int nnei_i,
-                                                   const int nnei_j,
-                                                   const int last_layer_size);
-template void tabulate_fusion_se_t_grad_gpu<double>(double* dy_dem_x,
-                                                    double* dy_dem,
-                                                    const double* table,
-                                                    const double* table_info,
-                                                    const double* em_x,
-                                                    const double* em,
-                                                    const double* dy,
-                                                    const int nloc,
-                                                    const int nnei_i,
-                                                    const int nnei_j,
-                                                    const int last_layer_size);
-template void tabulate_fusion_se_t_grad_grad_gpu<float>(
-    float* dz_dy,
-    const float* table,
-    const float* table_info,
-    const float* em_x,
-    const float* em,
-    const float* dz_dy_dem_x,
-    const float* dz_dy_dem,
-    const int nloc,
-    const int nnei_i,
-    const int nnei_j,
-    const int last_layer_size);
-template void tabulate_fusion_se_t_grad_grad_gpu<double>(
-    double* dz_dy,
-    const double* table,
-    const double* table_info,
-    const double* em_x,
-    const double* em,
-    const double* dz_dy_dem_x,
-    const double* dz_dy_dem,
-    const int nloc,
-    const int nnei_i,
-    const int nnei_j,
-    const int last_layer_size);
-
-template void tabulate_fusion_se_r_gpu<float>(float* out,
-                                              const float* table,
-                                              const float* table_info,
-                                              const float* em,
-                                              const int nloc,
-                                              const int nnei,
-                                              const int last_layer_size);
-template void tabulate_fusion_se_r_gpu<double>(double* out,
-                                               const double* table,
-                                               const double* table_info,
-                                               const double* em,
-                                               const int nloc,
-                                               const int nnei,
-                                               const int last_layer_size);
-template void tabulate_fusion_se_r_grad_gpu<float>(float* dy_dem,
-                                                   const float* table,
-                                                   const float* table_info,
-                                                   const float* em,
-                                                   const float* dy,
-                                                   const int nloc,
-                                                   const int nnei,
-                                                   const int last_layer_size);
-template void tabulate_fusion_se_r_grad_gpu<double>(double* dy_dem,
-                                                    const double* table,
-                                                    const double* table_info,
-                                                    const double* em,
-                                                    const double* dy,
-                                                    const int nloc,
-                                                    const int nnei,
-                                                    const int last_layer_size);
-template void tabulate_fusion_se_r_grad_grad_gpu<float>(
-    float* dz_dy,
-    const float* table,
-    const float* table_info,
-    const float* em,
-    const float* dz_dy_dem,
-    const int nloc,
-    const int nnei,
-    const int last_layer_size);
-template void tabulate_fusion_se_r_grad_grad_gpu<double>(
-    double* dz_dy,
-    const double* table,
-    const double* table_info,
-    const double* em,
-    const double* dz_dy_dem,
-    const int nloc,
-    const int nnei,
-    const int last_layer_size);
-
-}  // namespace deepmd

From c35ab61f85473ee76192c24f7870c351720240e6 Mon Sep 17 00:00:00 2001
From: Jinzhe Zeng <jinzhe.zeng@rutgers.edu>
Date: Fri, 22 Sep 2023 08:48:42 -0400
Subject: [PATCH 52/63] fix compatibility with NumPy 1.26 (#2853)

Fix #2852.

Signed-off-by: Jinzhe Zeng <jinzhe.zeng@rutgers.edu>
---
 deepmd/env.py | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/deepmd/env.py b/deepmd/env.py
index d8875cabd2..075e37446f 100644
--- a/deepmd/env.py
+++ b/deepmd/env.py
@@ -245,7 +245,20 @@ def set_mkl():
     check whether the numpy is built by mkl, see
     https://github.com/numpy/numpy/issues/14751
     """
-    if "mkl_rt" in np.__config__.get_info("blas_mkl_info").get("libraries", []):
+    try:
+        is_mkl = (
+            np.show_config("dicts")
+            .get("Build Dependencies", {})
+            .get("blas", {})
+            .get("name", "")
+            .lower()
+            .startswith("mkl")
+        )
+    except TypeError:
+        is_mkl = "mkl_rt" in np.__config__.get_info("blas_mkl_info").get(
+            "libraries", []
+        )
+    if is_mkl:
         set_env_if_empty("KMP_BLOCKTIME", "0")
         set_env_if_empty("KMP_AFFINITY", "granularity=fine,verbose,compact,1,0")
         reload(np)

From 9d774a478273bd811834c369075628a9e663bad8 Mon Sep 17 00:00:00 2001
From: Jinzhe Zeng <jinzhe.zeng@rutgers.edu>
Date: Fri, 22 Sep 2023 09:04:00 -0400
Subject: [PATCH 53/63] docs: rewrite coding conventions (#2855)

Update this section to reflect the latest state.

Signed-off-by: Jinzhe Zeng <jinzhe.zeng@rutgers.edu>
---
 doc/development/coding-conventions.rst | 76 +++++++-------------------
 1 file changed, 21 insertions(+), 55 deletions(-)

diff --git a/doc/development/coding-conventions.rst b/doc/development/coding-conventions.rst
index 90531a3d5d..ad4203ee4f 100644
--- a/doc/development/coding-conventions.rst
+++ b/doc/development/coding-conventions.rst
@@ -20,11 +20,14 @@ consistent, clean, and correct, it probably will be accepted.  But
 don't be surprised if the "offending" code gets fiddled with overtime to
 conform to these conventions.
 
-There are also GitHub actions CI checks for python code style which will annotate the
-PR diff for you to see the areas where your code is lacking compared to the set standard.
+There are also pre-commit CI checks for python code style which will automatically fix the
+PR.
+
+Python
+======
 
 Rules
-=====
+-----
 
 The code must be compatible with the oldest supported version of python
 which is 3.7
@@ -74,7 +77,7 @@ Conventions`_ and `Typing Conventions`_ PEPs, clarified and extended as follows:
   convenient than f-strings.
 
 Whitespace
-==========
+----------
 
 Python is not C/C++ so whitespace  should be used sparingly to maintain code readability
 
@@ -114,7 +117,7 @@ Python is not C/C++ so whitespace  should be used sparingly to maintain code rea
                           emacs/Useless-Whitespace.html
 
 General advice
-==============
+--------------
 
 * Get rid of as many ``break`` and ``continue`` statements as possible.
 
@@ -124,7 +127,7 @@ General advice
 * Use descriptive variable names.
 
 Writing documentation in the code
-=================================
+---------------------------------
 
 Here is an example of how to write good docstrings:
 
@@ -132,59 +135,22 @@ Here is an example of how to write good docstrings:
 
 The NumPy docstring documentation can be found `here <https://numpydoc.readthedocs.io/en/latest/format.html>`_
 
-It is a good practice to run `pydocstyle <https://github.com/PyCQA/pydocstyle>`_
-check on your code or use a text editor that does it automatically):
-
-.. code-block:: bash
-
-    $ pydocstyle filename.py
+C++
+===
 
-.. _stylecheck:
+The customized Clang Format style is used for C++ code formatting. The style is defined in
+``.clang-format`` file in the root of the repository. The style is based on the Google C++
+style with some modifications.
 
-Run pycodestyle on your code
-============================
+Run scripts to check the code
+=============================
 
-It's a good idea to run `pycodestyle <https://github.com/PyCQA/pycodestyle>`_
-on your code (or use a text editor that does it automatically):
+It's a good idea to install `pre-commit <https://pre-commit.com>`_ on your repository:
 
 .. code-block:: bash
 
-    $ pycodestyle filename.py
-
-.. _typing:
-
-Run mypy on your code
-=====================
-
-It's a good idea to run `mypy <https://github.com/PyCQA/pycodestyle>`_
-on your code (or use a text editor that does it automatically):
-
-.. code-block:: bash
-
-    $ mypy filename.py
-
-.. _docstyle:
-
-Run pydocstyle on your code
-===========================
-
-It's a good idea to run `pycodestyle <https://github.com/PyCQA/pycodestyle>`_
-on your code (or use a text editor that does it automatically):
-
-.. code-block:: bash
-
-    $ pycodestyle filename.py --max-line-length=88
-
-.. _autoformat:
-
-Run black on your code
-======================
-
-Another method of enforcing PEP8_ is using a tool such as
-`black <https://github.com/psf/black>`_. These tools tend to be
-very effective at cleaning up code but should be used carefully and code
-should be retested after cleaning it. Try:
-
-.. code-block:: bash
+    $ pip install pre-commit
+    $ pre-commit install
 
-  $ black --help
+The scripts will be run automatically before each commit and will fix the code style
+issues automatically.

From c2c647602374d8192355983de637799e4975187a Mon Sep 17 00:00:00 2001
From: Jinzhe Zeng <jinzhe.zeng@rutgers.edu>
Date: Fri, 22 Sep 2023 09:24:10 -0400
Subject: [PATCH 54/63] refactor convert (#2854)

Signed-off-by: Jinzhe Zeng <jinzhe.zeng@rutgers.edu>
---
 deepmd/utils/convert.py        | 117 +++++++++++++--------------------
 source/tests/test_deeppot_a.py |  13 ++--
 2 files changed, 54 insertions(+), 76 deletions(-)

diff --git a/deepmd/utils/convert.py b/deepmd/utils/convert.py
index dd26fa1058..13e07f0885 100644
--- a/deepmd/utils/convert.py
+++ b/deepmd/utils/convert.py
@@ -1,15 +1,28 @@
 # SPDX-License-Identifier: LGPL-3.0-or-later
+import logging
 import os
 import textwrap
+from typing import (
+    Optional,
+)
 
 from google.protobuf import (
     text_format,
 )
+from packaging.specifiers import (
+    SpecifierSet,
+)
+from packaging.version import parse as parse_version
 
+from deepmd import (
+    __version__,
+)
 from deepmd.env import (
     tf,
 )
 
+log = logging.getLogger(__name__)
+
 
 def detect_model_version(input_model: str):
     """Detect DP graph version.
@@ -20,33 +33,33 @@ def detect_model_version(input_model: str):
         filename of the input graph
     """
     convert_pb_to_pbtxt(input_model, "frozen_model.pbtxt")
-    version = "undetected"
+    version = None
     with open("frozen_model.pbtxt") as fp:
         file_content = fp.read()
     if file_content.find("DescrptNorot") > -1:
-        version = "<= 0.12"
+        version = parse_version("0.12")
     elif (
         file_content.find("fitting_attr/dfparam") > -1
         and file_content.find("fitting_attr/daparam") == -1
     ):
-        version = "1.0"
+        version = parse_version("1.0")
     elif file_content.find("model_attr/model_version") == -1:
         name_dsea = file_content.find('name: "DescrptSeA"')
         post_dsea = file_content[name_dsea:]
         post_dsea2 = post_dsea[:300].find(r"}")
         search_double = post_dsea[:post_dsea2]
         if search_double.find("DT_DOUBLE") == -1:
-            version = "1.2"
+            version = parse_version("1.2")
         else:
-            version = "1.3"
+            version = parse_version("1.3")
     elif file_content.find('string_val: "1.0"') > -1:
-        version = "2.0"
+        version = parse_version("2.0")
     elif file_content.find('string_val: "1.1"') > -1:
-        version = ">= 2.1"
+        version = parse_version("2.1")
     return version
 
 
-def convert_to_21(input_model: str, output_model: str):
+def convert_to_21(input_model: str, output_model: str, version: Optional[str] = None):
     """Convert DP graph to 2.1 graph.
 
     Parameters
@@ -55,37 +68,36 @@ def convert_to_21(input_model: str, output_model: str):
         filename of the input graph
     output_model : str
         filename of the output graph
+    version : str
+        version of the input graph, if not specified, it will be detected automatically
     """
-    version = detect_model_version(input_model)
-    if version == "<= 0.12":
+    if version is None:
+        version = detect_model_version(input_model)
+    else:
+        convert_pb_to_pbtxt(input_model, "frozen_model.pbtxt")
+    if version is None:
+        raise ValueError(
+            "The version of the DP graph %s cannot be detected. Please do the conversion manually."
+            % (input_model)
+        )
+    if version in SpecifierSet("<1.0"):
         convert_dp012_to_dp10("frozen_model.pbtxt")
+    if version in SpecifierSet("<1.1"):
         convert_dp10_to_dp11("frozen_model.pbtxt")
+    if version in SpecifierSet("<1.3"):
         convert_dp12_to_dp13("frozen_model.pbtxt")
+    if version in SpecifierSet("<2.0"):
         convert_dp13_to_dp20("frozen_model.pbtxt")
+    if version in SpecifierSet("<2.1"):
         convert_dp20_to_dp21("frozen_model.pbtxt")
-    elif version == "1.0":
-        convert_dp10_to_dp11("frozen_model.pbtxt")
-        convert_dp12_to_dp13("frozen_model.pbtxt")
-        convert_dp13_to_dp20("frozen_model.pbtxt")
-        convert_dp20_to_dp21("frozen_model.pbtxt")
-    elif version == "1.2":
-        convert_dp12_to_dp13("frozen_model.pbtxt")
-        convert_dp13_to_dp20("frozen_model.pbtxt")
-        convert_dp20_to_dp21("frozen_model.pbtxt")
-    elif version == "1.3":
-        convert_dp13_to_dp20("frozen_model.pbtxt")
-        convert_dp20_to_dp21("frozen_model.pbtxt")
-    elif version == "2.0":
-        convert_dp20_to_dp21("frozen_model.pbtxt")
-    elif version == "undetected":
-        raise ValueError(
-            "The version of the DP graph %s cannot be detected. Please do the conversion manually."
-            % (input_model)
-        )
     convert_pbtxt_to_pb("frozen_model.pbtxt", output_model)
     if os.path.isfile("frozen_model.pbtxt"):
         os.remove("frozen_model.pbtxt")
-    print("the converted output model (2.1 support) is saved in %s" % output_model)
+    log.info(
+        "the converted output model (%s support) is saved in %s",
+        __version__,
+        output_model,
+    )
 
 
 def convert_13_to_21(input_model: str, output_model: str):
@@ -98,13 +110,7 @@ def convert_13_to_21(input_model: str, output_model: str):
     output_model : str
         filename of the output graph
     """
-    convert_pb_to_pbtxt(input_model, "frozen_model.pbtxt")
-    convert_dp13_to_dp20("frozen_model.pbtxt")
-    convert_dp20_to_dp21("frozen_model.pbtxt")
-    convert_pbtxt_to_pb("frozen_model.pbtxt", output_model)
-    if os.path.isfile("frozen_model.pbtxt"):
-        os.remove("frozen_model.pbtxt")
-    print("the converted output model (2.1 support) is saved in %s" % output_model)
+    convert_to_21(input_model, output_model, version="1.3")
 
 
 def convert_12_to_21(input_model: str, output_model: str):
@@ -117,14 +123,7 @@ def convert_12_to_21(input_model: str, output_model: str):
     output_model : str
         filename of the output graph
     """
-    convert_pb_to_pbtxt(input_model, "frozen_model.pbtxt")
-    convert_dp12_to_dp13("frozen_model.pbtxt")
-    convert_dp13_to_dp20("frozen_model.pbtxt")
-    convert_dp20_to_dp21("frozen_model.pbtxt")
-    convert_pbtxt_to_pb("frozen_model.pbtxt", output_model)
-    if os.path.isfile("frozen_model.pbtxt"):
-        os.remove("frozen_model.pbtxt")
-    print("the converted output model (2.1 support) is saved in %s" % output_model)
+    convert_to_21(input_model, output_model, version="1.2")
 
 
 def convert_10_to_21(input_model: str, output_model: str):
@@ -137,15 +136,7 @@ def convert_10_to_21(input_model: str, output_model: str):
     output_model : str
         filename of the output graph
     """
-    convert_pb_to_pbtxt(input_model, "frozen_model.pbtxt")
-    convert_dp10_to_dp11("frozen_model.pbtxt")
-    convert_dp12_to_dp13("frozen_model.pbtxt")
-    convert_dp13_to_dp20("frozen_model.pbtxt")
-    convert_dp20_to_dp21("frozen_model.pbtxt")
-    convert_pbtxt_to_pb("frozen_model.pbtxt", output_model)
-    if os.path.isfile("frozen_model.pbtxt"):
-        os.remove("frozen_model.pbtxt")
-    print("the converted output model (2.1 support) is saved in %s" % output_model)
+    convert_to_21(input_model, output_model, version="1.0")
 
 
 def convert_012_to_21(input_model: str, output_model: str):
@@ -158,16 +149,7 @@ def convert_012_to_21(input_model: str, output_model: str):
     output_model : str
         filename of the output graph
     """
-    convert_pb_to_pbtxt(input_model, "frozen_model.pbtxt")
-    convert_dp012_to_dp10("frozen_model.pbtxt")
-    convert_dp10_to_dp11("frozen_model.pbtxt")
-    convert_dp12_to_dp13("frozen_model.pbtxt")
-    convert_dp13_to_dp20("frozen_model.pbtxt")
-    convert_dp20_to_dp21("frozen_model.pbtxt")
-    convert_pbtxt_to_pb("frozen_model.pbtxt", output_model)
-    if os.path.isfile("frozen_model.pbtxt"):
-        os.remove("frozen_model.pbtxt")
-    print("the converted output model (2.1 support) is saved in %s" % output_model)
+    convert_to_21(input_model, output_model, version="0.12")
 
 
 def convert_20_to_21(input_model: str, output_model: str):
@@ -180,12 +162,7 @@ def convert_20_to_21(input_model: str, output_model: str):
     output_model : str
         filename of the output graph
     """
-    convert_pb_to_pbtxt(input_model, "frozen_model.pbtxt")
-    convert_dp20_to_dp21("frozen_model.pbtxt")
-    convert_pbtxt_to_pb("frozen_model.pbtxt", output_model)
-    if os.path.isfile("frozen_model.pbtxt"):
-        os.remove("frozen_model.pbtxt")
-    print("the converted output model (2.1 support) is saved in %s" % output_model)
+    convert_to_21(input_model, output_model, version="2.0")
 
 
 def convert_pb_to_pbtxt(pbfile: str, pbtxtfile: str):
diff --git a/source/tests/test_deeppot_a.py b/source/tests/test_deeppot_a.py
index 1f43121e65..006b391e49 100644
--- a/source/tests/test_deeppot_a.py
+++ b/source/tests/test_deeppot_a.py
@@ -8,6 +8,7 @@
     run_dp,
     tests_path,
 )
+from packaging.version import parse as parse_version
 
 from deepmd.env import (
     GLOBAL_NP_FLOAT_PRECISION,
@@ -750,33 +751,33 @@ def test_detect(self):
         new_model_pb = "deeppot_new.pb"
         convert_pbtxt_to_pb(str(tests_path / "infer" / "sea_012.pbtxt"), old_model)
         version = detect_model_version(old_model)
-        self.assertEqual(version, "<= 0.12")
+        self.assertEqual(version, parse_version("0.12"))
         os.remove(old_model)
         shutil.copyfile(str(tests_path / "infer" / "sea_012.pbtxt"), new_model_txt)
         convert_dp012_to_dp10(new_model_txt)
         convert_pbtxt_to_pb(new_model_txt, new_model_pb)
         version = detect_model_version(new_model_pb)
-        self.assertEqual(version, "1.0")
+        self.assertEqual(version, parse_version("1.0"))
         os.remove(new_model_pb)
         convert_dp10_to_dp11(new_model_txt)
         convert_pbtxt_to_pb(new_model_txt, new_model_pb)
         version = detect_model_version(new_model_pb)
-        self.assertEqual(version, "1.3")
+        self.assertEqual(version, parse_version("1.3"))
         os.remove(new_model_pb)
         convert_dp12_to_dp13(new_model_txt)
         convert_pbtxt_to_pb(new_model_txt, new_model_pb)
         version = detect_model_version(new_model_pb)
-        self.assertEqual(version, "1.3")
+        self.assertEqual(version, parse_version("1.3"))
         os.remove(new_model_pb)
         convert_dp13_to_dp20(new_model_txt)
         convert_pbtxt_to_pb(new_model_txt, new_model_pb)
         version = detect_model_version(new_model_pb)
-        self.assertEqual(version, "2.0")
+        self.assertEqual(version, parse_version("2.0"))
         os.remove(new_model_pb)
         convert_dp20_to_dp21(new_model_txt)
         convert_pbtxt_to_pb(new_model_txt, new_model_pb)
         version = detect_model_version(new_model_pb)
-        self.assertEqual(version, ">= 2.1")
+        self.assertEqual(version, parse_version("2.1"))
         os.remove(new_model_pb)
         os.remove(new_model_txt)
 

From e1ae5e07f7ccb658fa66a9f854591badb1da3d6b Mon Sep 17 00:00:00 2001
From: Jinzhe Zeng <jinzhe.zeng@rutgers.edu>
Date: Sun, 24 Sep 2023 20:56:06 -0400
Subject: [PATCH 55/63] cmake: use pip to install tensorflow (#2858)

Previously, when the `INSTALL_TENSORFLOW` variable was set, CMake called
conda to install TensorFlow. Now pip is used to install it instead.

---------

Signed-off-by: Jinzhe Zeng <jinzhe.zeng@rutgers.edu>
---
 source/CMakeLists.txt             |  7 +++----
 source/cmake/Findtensorflow.cmake | 19 ++++++++++++-------
 2 files changed, 15 insertions(+), 11 deletions(-)

diff --git a/source/CMakeLists.txt b/source/CMakeLists.txt
index c3b5fde054..1a6b82bcf0 100644
--- a/source/CMakeLists.txt
+++ b/source/CMakeLists.txt
@@ -25,10 +25,6 @@ if((NOT BUILD_PY_IF) AND (NOT BUILD_CPP_IF))
 endif()
 
 if(BUILD_CPP_IF AND BUILD_TESTING)
-  if(NOT INSTALL_TENSORFLOW)
-    # some errors in conda packages...
-    find_package(GTest)
-  endif()
   if(NOT GTEST_LIBRARIES)
     configure_file(${CMAKE_CURRENT_SOURCE_DIR}/cmake/googletest.cmake.in
                    googletest-download/CMakeLists.txt @ONLY)
@@ -127,6 +123,9 @@ endif(USE_ROCM_TOOLKIT)
 set(DEEPMD_SOURCE_DIR ${PROJECT_SOURCE_DIR}/..)
 
 # setup tensorflow libraries by python
+if(INSTALL_TENSORFLOW)
+  set(USE_TF_PYTHON_LIBS TRUE)
+endif(INSTALL_TENSORFLOW)
 if(USE_TF_PYTHON_LIBS)
   if(NOT "$ENV{CIBUILDWHEEL}" STREQUAL "1")
     find_package(
diff --git a/source/cmake/Findtensorflow.cmake b/source/cmake/Findtensorflow.cmake
index 6ab5747805..3ebbd4ea62 100644
--- a/source/cmake/Findtensorflow.cmake
+++ b/source/cmake/Findtensorflow.cmake
@@ -19,25 +19,30 @@ if(SKBUILD)
 endif(SKBUILD)
 
 if(BUILD_CPP_IF AND INSTALL_TENSORFLOW)
-  # Here we try to install libtensorflow_cc using conda install.
+  # Here we try to install libtensorflow_cc using pip install.
 
   if(USE_CUDA_TOOLKIT)
-    set(VARIANT cuda)
+    set(VARIANT "")
   else()
-    set(VARIANT cpu)
+    set(VARIANT "-cpu")
   endif()
 
   if(NOT DEFINED TENSORFLOW_ROOT)
     set(TENSORFLOW_ROOT ${CMAKE_INSTALL_PREFIX})
   endif()
-  # execute conda install
-  execute_process(COMMAND conda create libtensorflow_cc=*=${VARIANT}* -c
-                          deepmodeling -y -p ${TENSORFLOW_ROOT})
+  # execute pip install
+  execute_process(
+    COMMAND ${Python_EXECUTABLE} -m pip install tensorflow${VARIANT} --no-deps
+            --target=${TENSORFLOW_ROOT})
+  set(TENSORFLOW_ROOT
+      ${TENSORFLOW_ROOT}/lib/python${Python_VERSION_MAJOR}.${Python_VERSION_MINOR}/site-packages/tensorflow
+  )
 endif()
 
 if(BUILD_CPP_IF
    AND USE_TF_PYTHON_LIBS
-   AND NOT SKBUILD)
+   AND NOT SKBUILD
+   AND NOT INSTALL_TENSORFLOW)
   # Here we try to install libtensorflow_cc.so as well as
   # libtensorflow_framework.so using libs within the python site-package
   # tensorflow folder.

From aa3b58ea1fd69c54e0f00fa50260f9fb2074708f Mon Sep 17 00:00:00 2001
From: Jinzhe Zeng <jinzhe.zeng@rutgers.edu>
Date: Sun, 24 Sep 2023 23:54:22 -0400
Subject: [PATCH 56/63] bump LAMMPS version to stable_2Aug2023_update1 (#2859)

https://github.com/lammps/lammps/releases/tag/stable_2Aug2023_update1
---
 backend/dynamic_metadata.py     |  3 +--
 doc/install/install-lammps.md   | 18 +++++++++---------
 pyproject.toml                  |  4 ++--
 source/install/build_cc.sh      |  2 +-
 source/install/build_from_c.sh  |  2 +-
 source/install/build_lammps.sh  |  2 +-
 source/install/test_cc.sh       |  2 +-
 source/install/test_cc_local.sh |  2 +-
 8 files changed, 17 insertions(+), 18 deletions(-)

diff --git a/backend/dynamic_metadata.py b/backend/dynamic_metadata.py
index 1270b6031e..fa40d332cf 100644
--- a/backend/dynamic_metadata.py
+++ b/backend/dynamic_metadata.py
@@ -55,8 +55,7 @@ def dynamic_metadata(
                 "sphinxcontrib-bibtex",
             ],
             "lmp": [
-                "lammps~=2023.8.2.0.0; platform_system=='Linux'",
-                "lammps~=2023.8.2.0.0; platform_system!='Linux'",
+                "lammps~=2023.8.2.1.0",
                 *find_libpython_requires,
             ],
             "ipi": [
diff --git a/doc/install/install-lammps.md b/doc/install/install-lammps.md
index 80d02d3dfa..d9d6a28bc6 100644
--- a/doc/install/install-lammps.md
+++ b/doc/install/install-lammps.md
@@ -14,12 +14,12 @@ make lammps
 DeePMD-kit will generate a module called `USER-DEEPMD` in the `build` directory, which supports either double or single float precision interface. Now download the LAMMPS code, and uncompress it.
 ```bash
 cd /some/workspace
-wget https://github.com/lammps/lammps/archive/stable_2Aug2023.tar.gz
-tar xf stable_2Aug2023.tar.gz
+wget https://github.com/lammps/lammps/archive/stable_2Aug2023_update1.tar.gz
+tar xf stable_2Aug2023_update1.tar.gz
 ```
-The source code of LAMMPS is stored in the directory `lammps-stable_2Aug2023`. Now go into the LAMMPS code and copy the DeePMD-kit module like this
+The source code of LAMMPS is stored in the directory `lammps-stable_2Aug2023_update1`. Now go into the LAMMPS code and copy the DeePMD-kit module like this
 ```bash
-cd lammps-stable_2Aug2023/src/
+cd lammps-stable_2Aug2023_update1/src/
 cp -r $deepmd_source_dir/source/build/USER-DEEPMD .
 make yes-kspace
 make yes-extra-fix
@@ -46,15 +46,15 @@ Starting from `8Apr2021`, LAMMPS also provides a plugin mode, allowing one to bu
 Now download the LAMMPS code (`8Apr2021` or later), and uncompress it:
 ```bash
 cd /some/workspace
-wget https://github.com/lammps/lammps/archive/stable_2Aug2023.tar.gz
-tar xf stable_2Aug2023.tar.gz
+wget https://github.com/lammps/lammps/archive/stable_2Aug2023_update1.tar.gz
+tar xf stable_2Aug2023_update1.tar.gz
 ```
 
-The source code of LAMMPS is stored in the directory `lammps-stable_2Aug2023`. The directory of the source code should be specified as the CMAKE argument `LAMMPS_SOURCE_ROOT` during installation of the DeePMD-kit C++ interface. Now go into the LAMMPS directory and create a directory called `build`
+The source code of LAMMPS is stored in the directory `lammps-stable_2Aug2023_update1`. The directory of the source code should be specified as the CMAKE argument `LAMMPS_SOURCE_ROOT` during installation of the DeePMD-kit C++ interface. Now go into the LAMMPS directory and create a directory called `build`
 
 ```bash
-mkdir -p lammps-stable_2Aug2023/build/
-cd lammps-stable_2Aug2023/build/
+mkdir -p lammps-stable_2Aug2023_update1/build/
+cd lammps-stable_2Aug2023_update1/build/
 ```
 Now build LAMMPS. Note that `PLUGIN` and `KSPACE` packages must be enabled, and `BUILD_SHARED_LIBS` must be set to `yes`. You can install any other package you want.
 ```bash
diff --git a/pyproject.toml b/pyproject.toml
index f14de6f85e..6c2d5d0601 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -117,7 +117,7 @@ manylinux-x86_64-image = "quay.io/pypa/manylinux_2_28_x86_64:2022-11-19-1b19e81"
 manylinux-aarch64-image = "quay.io/pypa/manylinux_2_28_aarch64:2022-11-19-1b19e81"
 
 [tool.cibuildwheel.macos]
-environment = { PIP_PREFER_BINARY="1", DP_LAMMPS_VERSION="stable_2Aug2023", DP_ENABLE_IPI="1" }
+environment = { PIP_PREFER_BINARY="1", DP_LAMMPS_VERSION="stable_2Aug2023_update1", DP_ENABLE_IPI="1" }
 before-all = [
     """if [[ "$CIBW_BUILD" != *macosx_arm64* ]]; then brew install mpich; fi""",
 ]
@@ -129,7 +129,7 @@ repair-wheel-command = """if [[ "$CIBW_BUILD" == *macosx_arm64* ]]; then rm -rf
 [tool.cibuildwheel.linux]
 repair-wheel-command = "auditwheel repair --exclude libtensorflow_framework.so.2 --exclude libtensorflow_framework.so.1 --exclude libtensorflow_framework.so --exclude _pywrap_tensorflow_internal.so --exclude libtensorflow_cc.so.2 -w {dest_dir} {wheel}"
 environment-pass = ["CIBW_BUILD", "DP_VARIANT"]
-environment = { PIP_PREFER_BINARY="1", DP_VARIANT="cuda", DP_LAMMPS_VERSION="stable_2Aug2023", DP_ENABLE_IPI="1", MPI_HOME="/usr/lib64/mpich", PATH="/usr/lib64/mpich/bin:$PATH" }
+environment = { PIP_PREFER_BINARY="1", DP_VARIANT="cuda", DP_LAMMPS_VERSION="stable_2Aug2023_update1", DP_ENABLE_IPI="1", MPI_HOME="/usr/lib64/mpich", PATH="/usr/lib64/mpich/bin:$PATH" }
 before-all = [
     """{ if [ "$(uname -m)" = "x86_64" ] ; then curl https://developer.download.nvidia.com/compute/cuda/11.8.0/local_installers/cuda_11.8.0_520.61.05_linux.run -O && bash cuda_11.8.0_520.61.05_linux.run --silent --toolkit; fi }""",
     "yum install -y mpich-devel",
diff --git a/source/install/build_cc.sh b/source/install/build_cc.sh
index bfa3cd1ce4..74e3835b74 100755
--- a/source/install/build_cc.sh
+++ b/source/install/build_cc.sh
@@ -20,7 +20,7 @@ NPROC=$(nproc --all)
 BUILD_TMP_DIR=${SCRIPT_PATH}/../build
 mkdir -p ${BUILD_TMP_DIR}
 cd ${BUILD_TMP_DIR}
-cmake -DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX} -DUSE_TF_PYTHON_LIBS=TRUE ${CUDA_ARGS} -DLAMMPS_VERSION=stable_2Aug2023 ..
+cmake -DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX} -DUSE_TF_PYTHON_LIBS=TRUE ${CUDA_ARGS} -DLAMMPS_VERSION=stable_2Aug2023_update1 ..
 cmake --build . -j${NPROC}
 cmake --install .
 
diff --git a/source/install/build_from_c.sh b/source/install/build_from_c.sh
index 3a48d3d46c..cd0aeca089 100755
--- a/source/install/build_from_c.sh
+++ b/source/install/build_from_c.sh
@@ -13,7 +13,7 @@ NPROC=$(nproc --all)
 BUILD_TMP_DIR=${SCRIPT_PATH}/../build
 mkdir -p ${BUILD_TMP_DIR}
 cd ${BUILD_TMP_DIR}
-cmake -DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX} -DDEEPMD_C_ROOT=${DEEPMD_C_ROOT} -DLAMMPS_VERSION=stable_2Aug2023 ..
+cmake -DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX} -DDEEPMD_C_ROOT=${DEEPMD_C_ROOT} -DLAMMPS_VERSION=stable_2Aug2023_update1 ..
 cmake --build . -j${NPROC}
 cmake --install .
 cmake --build . --target=lammps
diff --git a/source/install/build_lammps.sh b/source/install/build_lammps.sh
index 292d226f8a..6798212086 100755
--- a/source/install/build_lammps.sh
+++ b/source/install/build_lammps.sh
@@ -14,7 +14,7 @@ BUILD_TMP_DIR=${SCRIPT_PATH}/../build_lammps
 mkdir -p ${BUILD_TMP_DIR}
 cd ${BUILD_TMP_DIR}
 # download LAMMMPS
-LAMMPS_VERSION=stable_2Aug2023
+LAMMPS_VERSION=stable_2Aug2023_update1
 if [ ! -d "lammps-${LAMMPS_VERSION}" ]; then
 	curl -L -o lammps.tar.gz https://github.com/lammps/lammps/archive/refs/tags/${LAMMPS_VERSION}.tar.gz
 	tar vxzf lammps.tar.gz
diff --git a/source/install/test_cc.sh b/source/install/test_cc.sh
index c874e3bf6c..0a8700b275 100755
--- a/source/install/test_cc.sh
+++ b/source/install/test_cc.sh
@@ -17,7 +17,7 @@ INSTALL_PREFIX=${SCRIPT_PATH}/../../dp_test
 BUILD_TMP_DIR=${SCRIPT_PATH}/../build_tests
 mkdir -p ${BUILD_TMP_DIR}
 cd ${BUILD_TMP_DIR}
-cmake -DINSTALL_TENSORFLOW=TRUE -DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX} -DTENSORFLOW_ROOT=${INSTALL_PREFIX} -DBUILD_TESTING:BOOL=TRUE -DLAMMPS_VERSION=stable_2Aug2023 ${CUDA_ARGS} ..
+cmake -DINSTALL_TENSORFLOW=TRUE -DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX} -DTENSORFLOW_ROOT=${INSTALL_PREFIX} -DBUILD_TESTING:BOOL=TRUE -DLAMMPS_VERSION=stable_2Aug2023_update1 ${CUDA_ARGS} ..
 cmake --build . -j${NPROC}
 cmake --install .
 ctest --output-on-failure
diff --git a/source/install/test_cc_local.sh b/source/install/test_cc_local.sh
index 49f221825b..74477a8c2a 100755
--- a/source/install/test_cc_local.sh
+++ b/source/install/test_cc_local.sh
@@ -18,7 +18,7 @@ INSTALL_PREFIX=${SCRIPT_PATH}/../../dp_test
 BUILD_TMP_DIR=${SCRIPT_PATH}/../build_tests
 mkdir -p ${BUILD_TMP_DIR}
 cd ${BUILD_TMP_DIR}
-cmake -DINSTALL_TENSORFLOW=FALSE -DUSE_TF_PYTHON_LIBS=TRUE -DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX} -DBUILD_TESTING:BOOL=TRUE -DLAMMPS_VERSION=stable_2Aug2023 ${CUDA_ARGS} ..
+cmake -DINSTALL_TENSORFLOW=FALSE -DUSE_TF_PYTHON_LIBS=TRUE -DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX} -DBUILD_TESTING:BOOL=TRUE -DLAMMPS_VERSION=stable_2Aug2023_update1 ${CUDA_ARGS} ..
 cmake --build . -j${NPROC}
 cmake --install .
 ctest --output-on-failure

From ffe10f9d3076cfb7d61c645e4651b71ab9a45945 Mon Sep 17 00:00:00 2001
From: Jinzhe Zeng <jinzhe.zeng@rutgers.edu>
Date: Mon, 25 Sep 2023 00:56:32 -0400
Subject: [PATCH 57/63] cmake: use modern `HIP` language (#2857)

Use modern `HIP` language instead of `hip_add_library`.
Use modern `find_package` to find `hip` and `hipcub` instead of
`FindROCM.cmake` (which is removed in this PR).

---------

Signed-off-by: Jinzhe Zeng <jinzhe.zeng@rutgers.edu>
---
 backend/read_env.py                |  4 +-
 doc/install/install-from-source.md |  4 +-
 source/CMakeLists.txt              | 10 ++--
 source/cmake/FindROCM.cmake        | 84 ------------------------------
 source/lib/CMakeLists.txt          |  6 ++-
 source/lib/src/gpu/CMakeLists.txt  | 17 +++---
 6 files changed, 26 insertions(+), 99 deletions(-)
 delete mode 100644 source/cmake/FindROCM.cmake

diff --git a/backend/read_env.py b/backend/read_env.py
index 575c1a57de..079211d4d7 100644
--- a/backend/read_env.py
+++ b/backend/read_env.py
@@ -57,8 +57,8 @@ def get_argument_from_env() -> Tuple[str, list, list, dict, str]:
         if rocm_root:
             cmake_args.append(f"-DCMAKE_HIP_COMPILER_ROCM_ROOT:STRING={rocm_root}")
         hipcc_flags = os.environ.get("HIP_HIPCC_FLAGS")
-        if hipcc_flags:
-            cmake_args.append(f"-DHIP_HIPCC_FLAGS:STRING={hipcc_flags}")
+        if hipcc_flags is not None:
+            os.environ["HIPFLAGS"] = os.environ.get("HIPFLAGS", "") + " " + hipcc_flags
     else:
         raise RuntimeError("Unsupported DP_VARIANT option: %s" % dp_variant)
 
diff --git a/doc/install/install-from-source.md b/doc/install/install-from-source.md
index 1447823c08..ef3f5d9c36 100644
--- a/doc/install/install-from-source.md
+++ b/doc/install/install-from-source.md
@@ -79,6 +79,7 @@ One may set the following environment variables before executing `pip`:
 | TENSORFLOW_ROOT       | Path                   | Detected automatically | The path to TensorFlow Python library. By default the installer only finds TensorFlow under user site-package directory (`site.getusersitepackages()`) or system site-package directory (`sysconfig.get_path("purelib")`) due to limitation of [PEP-517](https://peps.python.org/pep-0517/). If not found, the latest TensorFlow (or the environment variable `TENSORFLOW_VERSION` if given) from PyPI will be built against.|
 | DP_ENABLE_NATIVE_OPTIMIZATION | 0, 1           | 0             | Enable compilation optimization for the native machine's CPU type. Do not enable it if generated code will run on different CPUs. |
 | CMAKE_ARGS             | str                   | -             | Additional CMake arguments |
+| &lt;LANG&gt;FLAGS (`<LANG>`=`CXX`, `CUDA` or `HIP`)   | str            | -             | Default compilation flags to be used when compiling `<LANG>` files. See [CMake documentation](https://cmake.org/cmake/help/latest/variable/CMAKE_LANG_FLAGS.html). |
 
 To test the installation, one should first jump out of the source directory
 ```
@@ -193,7 +194,8 @@ One may add the following arguments to `cmake`:
 | -DCMAKE_HIP_COMPILER_ROCM_ROOT=&lt;value&gt; | Path         | Detected automatically | The path to the ROCM toolkit directory. |
 | -DLAMMPS_SOURCE_ROOT=&lt;value&gt; | Path         | - | Only neccessary for LAMMPS plugin mode. The path to the [LAMMPS source code](install-lammps.md). LAMMPS 8Apr2021 or later is supported. If not assigned, the plugin mode will not be enabled. |
 | -DUSE_TF_PYTHON_LIBS=&lt;value&gt; | `TRUE` or `FALSE` | `FALSE`       | If `TRUE`, Build C++ interface with TensorFlow's Python libraries(TensorFlow's Python Interface is required). And there's no need for building TensorFlow's C++ interface.|
-| -DENABLE_NATIVE_OPTIMIZATION       | `TRUE` or `FALSE` | `FALSE`       | Enable compilation optimization for the native machine's CPU type. Do not enable it if generated code will run on different CPUs. |
+| -DENABLE_NATIVE_OPTIMIZATION=&lt;value&gt;       | `TRUE` or `FALSE` | `FALSE`       | Enable compilation optimization for the native machine's CPU type. Do not enable it if generated code will run on different CPUs. |
+| -DCMAKE_&lt;LANG&gt;_FLAGS=&lt;value&gt; (`<LANG>`=`CXX`, `CUDA` or `HIP`)   | str            | -             | Default compilation flags to be used when compiling `<LANG>` files. See [CMake documentation](https://cmake.org/cmake/help/latest/variable/CMAKE_LANG_FLAGS.html). |
 
 If the CMake has been executed successfully, then run the following make commands to build the package:
 ```bash
diff --git a/source/CMakeLists.txt b/source/CMakeLists.txt
index 1a6b82bcf0..c1c9b8e7fe 100644
--- a/source/CMakeLists.txt
+++ b/source/CMakeLists.txt
@@ -111,10 +111,14 @@ endif(USE_CUDA_TOOLKIT)
 # define USE_ROCM_TOOLKIT
 if(USE_ROCM_TOOLKIT)
   cmake_minimum_required(VERSION 3.21)
-  find_package(ROCM REQUIRED)
+  include(CMakeDetermineHIPCompiler)
+  list(APPEND CMAKE_PREFIX_PATH ${CMAKE_HIP_COMPILER_ROCM_ROOT})
+  find_package(hip REQUIRED)
+  find_package(hipcub REQUIRED)
   add_definitions("-DTENSORFLOW_USE_ROCM")
-  add_compile_definitions(__HIP_PLATFORM_HCC__)
-  message(STATUS "Found ROCM in ${ROCM_ROOT}, build AMD GPU support")
+  message(
+    STATUS
+      "Found ROCM in ${CMAKE_HIP_COMPILER_ROCM_ROOT}, build AMD GPU support")
   set(DP_VARIANT "rocm")
 else()
   message(STATUS "Will not build AMD GPU support")
diff --git a/source/cmake/FindROCM.cmake b/source/cmake/FindROCM.cmake
deleted file mode 100644
index 6ef575ae39..0000000000
--- a/source/cmake/FindROCM.cmake
+++ /dev/null
@@ -1,84 +0,0 @@
-# Input: ROCM_ROOT
-#
-# Output: ROCM_FOUND ROCM_INCLUDE_DIRS ROCM_LIBRARIES
-
-# define the search path
-cmake_minimum_required(VERSION 3.21)
-include(CMakeDetermineHIPCompiler)
-set(ROCM_PATH ${CMAKE_HIP_COMPILER_ROCM_ROOT})
-set(ROCM_search_PATHS ${CMAKE_HIP_COMPILER_ROCM_ROOT})
-
-# includes
-find_path(
-  ROCM_INCLUDE_DIRS
-  NAMES hip/hip_runtime.h rocprim/rocprim.hpp hipcub/hipcub.hpp
-  PATHS ${ROCM_search_PATHS}
-  PATH_SUFFIXES "include"
-  NO_DEFAULT_PATH)
-if(NOT ROCM_INCLUDE_DIRS AND ROCM_FIND_REQUIRED)
-  message(
-    FATAL_ERROR
-      "Not found 'hip' or 'rocprim' or 'hipcub' directory in path '${ROCM_search_PATHS}' "
-      "You can manually set the ROCM install path by -DROCM_ROOT ")
-endif()
-
-# FindHIP.cmake
-find_path(
-  HIP_CMAKE
-  NAMES FindHIP.cmake
-  PATHS ${ROCM_search_PATHS}
-  PATH_SUFFIXES "hip/cmake"
-  NO_DEFAULT_PATH)
-
-if(NOT HIP_CMAKE AND ROCM_FIND_REQUIRED)
-  message(
-    FATAL_ERROR "Not found 'FindHIP.cmake' file in path '${ROCM_search_PATHS}' "
-                "You can manually set the ROCM install path by -DROCM_ROOT ")
-endif()
-
-list(APPEND CMAKE_MODULE_PATH ${HIP_CMAKE})
-find_package(HIP)
-
-# define the libs to find
-if(NOT ROCM_FIND_COMPONENTS)
-  if(HIP_VERSION VERSION_GREATER_EQUAL 3.5.1)
-    set(ROCM_FIND_COMPONENTS amd_comgr amdhip64)
-  else()
-    set(ROCM_FIND_COMPONENTS hip-hcc hiprtc)
-  endif()
-endif()
-
-# libs
-foreach(module ${ROCM_FIND_COMPONENTS})
-  find_library(
-    ROCM_LIBRARIES_${module}
-    NAMES ${module}
-    PATHS ${ROCM_search_PATHS}
-    PATH_SUFFIXES "lib"
-    NO_DEFAULT_PATH)
-  if(ROCM_LIBRARIES_${module})
-    list(APPEND ROCM_LIBRARIES ${ROCM_LIBRARIES_${module}})
-  elseif(ROCM_FIND_REQUIRED)
-    message(
-      FATAL_ERROR "Not found lib/'${module}' in '${ROCM_search_PATHS}' "
-                  "You can manually set the ROCM install path by -DROCM_ROOT ")
-  endif()
-endforeach()
-
-# define the output variable
-if(ROCM_INCLUDE_DIRS
-   AND ROCM_LIBRARIES
-   AND HIP_CMAKE)
-  set(ROCM_FOUND TRUE)
-else()
-  set(ROCM_FOUND FALSE)
-endif()
-
-# print message
-if(NOT ROCM_FIND_QUIETLY)
-  message(
-    STATUS "Found ROCM: ${ROCM_INCLUDE_DIRS}, ${ROCM_LIBRARIES}, ${HIP_CMAKE}"
-           " in ${ROCM_search_PATHS}, build AMD GPU support")
-endif()
-
-unset(ROCM_search_PATHS)
diff --git a/source/lib/CMakeLists.txt b/source/lib/CMakeLists.txt
index 323bf2d7c0..0f5bdb73fb 100644
--- a/source/lib/CMakeLists.txt
+++ b/source/lib/CMakeLists.txt
@@ -24,10 +24,12 @@ if(USE_ROCM_TOOLKIT)
   add_definitions("-DTENSORFLOW_USE_ROCM")
   add_subdirectory(src/gpu)
   set(EXTRA_LIBS ${EXTRA_LIBS} deepmd_op_rocm)
-  target_link_libraries(${libname} INTERFACE ${ROCM_LIBRARIES} ${EXTRA_LIBS})
+  # to define __HIP_PLATFORM_AMD__ in hip_runtime.h
+  target_link_libraries(${libname} PUBLIC hip::host)
+  target_link_libraries(${libname} INTERFACE ${EXTRA_LIBS})
   # gpu_rocm.h
   target_include_directories(
-    ${libname} PUBLIC $<BUILD_INTERFACE:${ROCM_INCLUDE_DIRS}>
+    ${libname} PUBLIC $<BUILD_INTERFACE:${HIP_INCLUDE_DIRS}>
                       $<INSTALL_INTERFACE:include>)
 endif()
 
diff --git a/source/lib/src/gpu/CMakeLists.txt b/source/lib/src/gpu/CMakeLists.txt
index 25223c82bf..4b491a312d 100644
--- a/source/lib/src/gpu/CMakeLists.txt
+++ b/source/lib/src/gpu/CMakeLists.txt
@@ -50,6 +50,7 @@ elseif(USE_ROCM_TOOLKIT)
   cmake_minimum_required(VERSION 3.21)
   # project name
   project(deepmd_op_rocm)
+  enable_language(HIP)
   set(GPU_LIB_NAME deepmd_op_rocm)
   set(CMAKE_LINK_WHAT_YOU_USE TRUE)
 
@@ -57,19 +58,21 @@ elseif(USE_ROCM_TOOLKIT)
   set(CMAKE_CXX_STANDARD 14)
   set(CMAKE_HIP_STANDARD 14)
   add_definitions("-DCUB_IGNORE_DEPRECATED_CPP_DIALECT")
-  add_definitions("-DCUB_IGNORE_DEPRECATED_CPP_DIALECT")
 
-  message(STATUS "HIP major version is " ${HIP_VERSION_MAJOR})
+  message(STATUS "HIP major version is " ${hip_VERSION_MAJOR})
 
-  set(HIP_HIPCC_FLAGS -fno-gpu-rdc; -fPIC --std=c++14 ${HIP_HIPCC_FLAGS}
-  )# --amdgpu-target=gfx906
-  if(HIP_VERSION VERSION_LESS 3.5.1)
-    set(HIP_HIPCC_FLAGS -hc; ${HIP_HIPCC_FLAGS})
+  set(CMAKE_HIP_FLAGS -fno-gpu-rdc ${CMAKE_HIP_FLAGS}) # --amdgpu-target=gfx906
+  if(hip_VERSION VERSION_LESS 3.5.1)
+    set(CMAKE_HIP_FLAGS -hc ${CMAKE_HIP_FLAGS})
   endif()
 
   file(GLOB SOURCE_FILES "*.cu")
 
-  hip_add_library(${GPU_LIB_NAME} SHARED ${SOURCE_FILES})
+  add_library(${GPU_LIB_NAME} SHARED ${SOURCE_FILES})
+  set_source_files_properties(${SOURCE_FILES} PROPERTIES LANGUAGE HIP)
+  # -fpic
+  set_property(TARGET ${GPU_LIB_NAME} PROPERTY POSITION_INDEPENDENT_CODE ON)
+  target_link_libraries(${GPU_LIB_NAME} PRIVATE hip::hipcub)
 
 endif()
 

From 67f30e005042dac3f6a893d54f7c0a9cf46da089 Mon Sep 17 00:00:00 2001
From: Jinzhe Zeng <jinzhe.zeng@rutgers.edu>
Date: Mon, 25 Sep 2023 02:46:37 -0400
Subject: [PATCH 58/63] fix finetune RMSE and memory issue (#2860)

Fix #2472. The previous implementation tried to allocate an (N, N) array,
and the value it reported as RMSE was actually the MAE.

---------

Signed-off-by: Jinzhe Zeng <jinzhe.zeng@rutgers.edu>
Signed-off-by: Han Wang <92130845+wanghan-iapcm@users.noreply.github.com>
Co-authored-by: Han Wang <92130845+wanghan-iapcm@users.noreply.github.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
---
 deepmd/fit/ener.py | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/deepmd/fit/ener.py b/deepmd/fit/ener.py
index 9bfbf5a5cc..61cf0ce40c 100644
--- a/deepmd/fit/ener.py
+++ b/deepmd/fit/ener.py
@@ -856,9 +856,13 @@ def change_energy_bias(
             delta_bias = np.linalg.lstsq(type_numbs, bias_diff, rcond=None)[0]
             unbias_e = energy_predict + type_numbs @ delta_bias
             atom_numbs = type_numbs.sum(-1)
-            rmse_ae = (
-                np.sqrt(np.square(unbias_e - energy_ground_truth)) / atom_numbs
-            ).mean()
+            rmse_ae = np.sqrt(
+                np.mean(
+                    np.square(
+                        (unbias_e.ravel() - energy_ground_truth.ravel()) / atom_numbs
+                    )
+                )
+            )
             self.bias_atom_e[idx_type_map] += delta_bias.reshape(-1)
             log.info(
                 f"RMSE of atomic energy after linear regression is: {rmse_ae} eV/atom."

From e7a88769ed1253eb85934a3ebdda80e26ff702eb Mon Sep 17 00:00:00 2001
From: Han Wang <92130845+wanghan-iapcm@users.noreply.github.com>
Date: Tue, 26 Sep 2023 14:49:40 +0800
Subject: [PATCH 59/63] fix the issue of applying modifier multiple times when
 batch set is loaded only once (#2864)

fix #2862

Co-authored-by: Han Wang <wang_han@iapcm.ac.cn>
---
 deepmd/utils/data.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/deepmd/utils/data.py b/deepmd/utils/data.py
index 57bea00fac..423745cddf 100644
--- a/deepmd/utils/data.py
+++ b/deepmd/utils/data.py
@@ -265,8 +265,6 @@ def get_batch(self, batch_size: int) -> dict:
             self._load_batch_set(self.train_dirs[self.set_count % self.get_numb_set()])
             self.set_count += 1
             set_size = self.batch_set["coord"].shape[0]
-            if self.modifier is not None:
-                self.modifier.modify_data(self.batch_set, self)
         iterator_1 = self.iterator + batch_size
         if iterator_1 >= set_size:
             iterator_1 = set_size
@@ -410,6 +408,8 @@ def _get_subdata(self, data, idx=None):
     def _load_batch_set(self, set_name: DPPath):
         if not hasattr(self, "batch_set") or self.get_numb_set() > 1:
             self.batch_set = self._load_set(set_name)
+            if self.modifier is not None:
+                self.modifier.modify_data(self.batch_set, self)
         self.batch_set, _ = self._shuffle_data(self.batch_set)
         self.reset_get_batch()
 

From 938cea286dff1e54dd5ad9784ffd36799c2543ca Mon Sep 17 00:00:00 2001
From: Jinzhe Zeng <jinzhe.zeng@rutgers.edu>
Date: Tue, 26 Sep 2023 03:11:22 -0400
Subject: [PATCH 60/63] merge CUDA and ROCm codes in op (#2847)

Signed-off-by: Jinzhe Zeng <jinzhe.zeng@rutgers.edu>
---
 source/lib/include/gpu_cuda.h                |   1 +
 source/lib/include/gpu_rocm.h                |   1 +
 source/op/gelu_multi_device.cc               |  24 +-
 source/op/prod_env_mat_multi_device.cc       | 454 +------------------
 source/op/prod_env_mat_multi_device_nvnmd.cc |  16 +-
 source/op/prod_force_grad_multi_device.cc    |  18 +-
 source/op/prod_force_multi_device.cc         |  18 +-
 source/op/prod_virial_grad_multi_device.cc   |  18 +-
 source/op/prod_virial_multi_device.cc        |  18 +-
 source/op/tabulate_multi_device.cc           | 102 +----
 source/op/unaggregated_grad.cc               |   4 +-
 11 files changed, 66 insertions(+), 608 deletions(-)

diff --git a/source/lib/include/gpu_cuda.h b/source/lib/include/gpu_cuda.h
index 73dfed1404..1e750e0ea0 100644
--- a/source/lib/include/gpu_cuda.h
+++ b/source/lib/include/gpu_cuda.h
@@ -13,6 +13,7 @@
 #define gpuMemcpy cudaMemcpy
 #define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost
 #define gpuMemcpyHostToDevice cudaMemcpyHostToDevice
+#define gpuMemcpyDeviceToDevice cudaMemcpyDeviceToDevice
 #define gpuMemset cudaMemset
 
 #define GPU_MAX_NBOR_SIZE 4096
diff --git a/source/lib/include/gpu_rocm.h b/source/lib/include/gpu_rocm.h
index 3a65a57b01..bb404720bc 100644
--- a/source/lib/include/gpu_rocm.h
+++ b/source/lib/include/gpu_rocm.h
@@ -16,6 +16,7 @@
 #define gpuMemcpy hipMemcpy
 #define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost
 #define gpuMemcpyHostToDevice hipMemcpyHostToDevice
+#define gpuMemcpyDeviceToDevice hipMemcpyDeviceToDevice
 #define gpuMemset hipMemset
 
 #define DPErrcheck(res) \
diff --git a/source/op/gelu_multi_device.cc b/source/op/gelu_multi_device.cc
index ccc95aa0e4..1c76cd25d3 100644
--- a/source/op/gelu_multi_device.cc
+++ b/source/op/gelu_multi_device.cc
@@ -64,13 +64,9 @@ class GeluOp : public OpKernel {
     const int_64 size = static_cast<int_64>(output_tensor->NumElements());
 
     if (device == "GPU") {
-#if GOOGLE_CUDA
-      deepmd::gelu_gpu(out, x, size);
-#endif  // GOOGLE_CUDA
-
-#if TENSORFLOW_USE_ROCM
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
       deepmd::gelu_gpu(out, x, size);
-#endif  // TENSORFLOW_USE_ROCM
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
     } else if (device == "CPU") {
       deepmd::gelu_cpu(out, x, size);
     }
@@ -108,13 +104,9 @@ class GeluGradOp : public OpKernel {
     const int_64 size = static_cast<int_64>(output_tensor->NumElements());
 
     if (device == "GPU") {
-#if GOOGLE_CUDA
-      deepmd::gelu_grad_gpu(out, x, dy, size);
-#endif  // GOOGLE_CUDA
-
-#if TENSORFLOW_USE_ROCM
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
       deepmd::gelu_grad_gpu(out, x, dy, size);
-#endif  // TENSORFLOW_USE_ROCM
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
     } else if (device == "CPU") {
       deepmd::gelu_grad_cpu(out, x, dy, size);
     }
@@ -154,13 +146,9 @@ class GeluGradGradOp : public OpKernel {
     const int_64 size = static_cast<int_64>(output_tensor->NumElements());
 
     if (device == "GPU") {
-#if GOOGLE_CUDA
-      deepmd::gelu_grad_grad_gpu(out, x, dy, dy_2, size);
-#endif  // GOOGLE_CUDA
-
-#if TENSORFLOW_USE_ROCM
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
       deepmd::gelu_grad_grad_gpu(out, x, dy, dy_2, size);
-#endif  // TENSORFLOW_USE_ROCM
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
     } else if (device == "CPU") {
       deepmd::gelu_grad_grad_cpu(out, x, dy, dy_2, size);
     }
diff --git a/source/op/prod_env_mat_multi_device.cc b/source/op/prod_env_mat_multi_device.cc
index ee07dc22fe..47541bc69f 100644
--- a/source/op/prod_env_mat_multi_device.cc
+++ b/source/op/prod_env_mat_multi_device.cc
@@ -1,6 +1,7 @@
 // SPDX-License-Identifier: LGPL-3.0-or-later
 #include "coord.h"
 #include "custom_op.h"
+#include "device.h"
 #include "errors.h"
 #include "neighbor_list.h"
 #include "prod_env_mat.h"
@@ -243,82 +244,7 @@ static void _prepare_coord_nlist_cpu(OpKernelContext* context,
                                      const int& max_cpy_trial,
                                      const int& max_nnei_trial);
 
-#if GOOGLE_CUDA
-template <typename FPTYPE>
-static int _norm_copy_coord_gpu(OpKernelContext* context,
-                                Tensor* tensor_list,
-                                FPTYPE*& coord_cpy,
-                                int*& type_cpy,
-                                int*& idx_mapping,
-                                int& nall,
-                                int& mem_cpy,
-                                const FPTYPE* coord,
-                                const FPTYPE* box,
-                                const int* type,
-                                const int& nloc,
-                                const int& max_cpy_trial,
-                                const float& rcut_r);
-
-template <typename FPTYPE>
-static int _build_nlist_gpu(OpKernelContext* context,
-                            Tensor* tensor_list,
-                            int*& ilist,
-                            int*& numneigh,
-                            int**& firstneigh,
-                            int*& jlist,
-                            int& max_nnei,
-                            int& mem_nnei,
-                            const FPTYPE* coord,
-                            const int& nloc,
-                            const int& new_nall,
-                            const int& max_nnei_trial,
-                            const float& rcut_r);
-
-static void _map_nlist_gpu(int* nlist,
-                           const int* idx_mapping,
-                           const int& nloc,
-                           const int& nnei);
-
-static void _map_nei_info_gpu(int* nlist,
-                              int* ntype,
-                              bool* nmask,
-                              const int* type,
-                              const int* idx_mapping,
-                              const int& nloc,
-                              const int& nnei,
-                              const int& ntypes,
-                              const bool& b_nlist_map);
-
-template <typename FPTYPE>
-static void _prepare_coord_nlist_gpu(OpKernelContext* context,
-                                     Tensor* tensor_list,
-                                     FPTYPE const** coord,
-                                     FPTYPE*& coord_cpy,
-                                     int const** type,
-                                     int*& type_cpy,
-                                     int*& idx_mapping,
-                                     deepmd::InputNlist& inlist,
-                                     int*& ilist,
-                                     int*& numneigh,
-                                     int**& firstneigh,
-                                     int*& jlist,
-                                     int*& nbor_list_dev,
-                                     int& new_nall,
-                                     int& mem_cpy,
-                                     int& mem_nnei,
-                                     int& max_nbor_size,
-                                     const FPTYPE* box,
-                                     const int* mesh_tensor_data,
-                                     const int mesh_tensor_size,
-                                     const int& nloc,
-                                     const int& nei_mode,
-                                     const float& rcut_r,
-                                     const int& max_cpy_trial,
-                                     const int& max_nnei_trial);
-
-#endif  // GOOGLE_CUDA
-
-#if TENSORFLOW_USE_ROCM
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 template <typename FPTYPE>
 static int _norm_copy_coord_gpu(OpKernelContext* context,
                                 Tensor* tensor_list,
@@ -391,7 +317,7 @@ static void _prepare_coord_nlist_gpu(OpKernelContext* context,
                                      const int& max_cpy_trial,
                                      const int& max_nnei_trial);
 
-#endif  // TENSORFLOW_USE_ROCM
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 
 template <typename Device, typename FPTYPE>
 class ProdEnvMatAOp : public OpKernel {
@@ -633,36 +559,7 @@ class ProdEnvMatAOp : public OpKernel {
       const int* type = p_type + ff * nall;
 
       if (device == "GPU") {
-#if GOOGLE_CUDA
-        int* idx_mapping = NULL;
-        int *ilist = NULL, *numneigh = NULL;
-        int** firstneigh = NULL;
-        deepmd::malloc_device_memory(firstneigh, nloc);
-        int* jlist = NULL;
-        FPTYPE* coord_cpy;
-        int* type_cpy;
-        int frame_nall = nall;
-        int mesh_tensor_size = static_cast<int>(mesh_tensor.NumElements());
-        // prepare coord and nlist
-        _prepare_coord_nlist_gpu<FPTYPE>(
-            context, &tensor_list[0], &coord, coord_cpy, &type, type_cpy,
-            idx_mapping, gpu_inlist, ilist, numneigh, firstneigh, jlist,
-            nbor_list_dev, frame_nall, mem_cpy, mem_nnei, max_nbor_size, box,
-            mesh_tensor.flat<int>().data(), mesh_tensor_size, nloc, nei_mode,
-            rcut_r, max_cpy_trial, max_nnei_trial);
-
-        // launch the gpu(nv) compute function
-        deepmd::prod_env_mat_a_gpu(em, em_deriv, rij, nlist, coord, type,
-                                   gpu_inlist, array_int, array_longlong,
-                                   max_nbor_size, avg, std, nloc, frame_nall,
-                                   rcut_r, rcut_r_smth, sec_a);
-        if (b_nlist_map) {
-          _map_nlist_gpu(nlist, idx_mapping, nloc, nnei);
-        }
-        deepmd::delete_device_memory(firstneigh);
-#endif  // GOOGLE_CUDA
-
-#if TENSORFLOW_USE_ROCM
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
         int* idx_mapping = NULL;
         int *ilist = NULL, *numneigh = NULL;
         int** firstneigh = NULL;
@@ -689,7 +586,7 @@ class ProdEnvMatAOp : public OpKernel {
           _map_nlist_gpu(nlist, idx_mapping, nloc, nnei);
         }
         deepmd::delete_device_memory(firstneigh);
-#endif  // TENSORFLOW_USE_ROCM
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
       } else if (device == "CPU") {
         deepmd::InputNlist inlist;
         // some buffers, be freed after the evaluation of this frame
@@ -960,36 +857,7 @@ class ProdEnvMatROp : public OpKernel {
       const int* type = p_type + ff * nall;
 
       if (device == "GPU") {
-#if GOOGLE_CUDA
-        int* idx_mapping = NULL;
-        int *ilist = NULL, *numneigh = NULL;
-        int** firstneigh = NULL;
-        deepmd::malloc_device_memory(firstneigh, nloc);
-        int* jlist = NULL;
-        FPTYPE* coord_cpy;
-        int* type_cpy;
-        int frame_nall = nall;
-        int mesh_tensor_size = static_cast<int>(mesh_tensor.NumElements());
-        // prepare coord and nlist
-        _prepare_coord_nlist_gpu<FPTYPE>(
-            context, &tensor_list[0], &coord, coord_cpy, &type, type_cpy,
-            idx_mapping, gpu_inlist, ilist, numneigh, firstneigh, jlist,
-            nbor_list_dev, frame_nall, mem_cpy, mem_nnei, max_nbor_size, box,
-            mesh_tensor.flat<int>().data(), mesh_tensor_size, nloc, nei_mode,
-            rcut, max_cpy_trial, max_nnei_trial);
-
-        // launch the gpu(nv) compute function
-        deepmd::prod_env_mat_r_gpu(em, em_deriv, rij, nlist, coord, type,
-                                   gpu_inlist, array_int, array_longlong,
-                                   max_nbor_size, avg, std, nloc, frame_nall,
-                                   rcut, rcut_smth, sec);
-        if (b_nlist_map) {
-          _map_nlist_gpu(nlist, idx_mapping, nloc, nnei);
-        }
-        deepmd::delete_device_memory(firstneigh);
-#endif  // GOOGLE_CUDA
-
-#if TENSORFLOW_USE_ROCM
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
         int* idx_mapping = NULL;
         int *ilist = NULL, *numneigh = NULL;
         int** firstneigh = NULL;
@@ -1016,7 +884,7 @@ class ProdEnvMatROp : public OpKernel {
           _map_nlist_gpu(nlist, idx_mapping, nloc, nnei);
         }
         deepmd::delete_device_memory(firstneigh);
-#endif  // TENSORFLOW_USE_ROCM
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
       } else if (device == "CPU") {
         deepmd::InputNlist inlist;
         // some buffers, be freed after the evaluation of this frame
@@ -1248,10 +1116,7 @@ class ProdEnvMatAMixOp : public OpKernel {
     int* p_f_type = fake_type_tensor.flat<int>().data();
 
     if (device == "GPU") {
-#if GOOGLE_CUDA
-      deepmd::filter_ftype_gpu(p_f_type, p_type, nsamples * nall);
-#endif
-#if TENSORFLOW_USE_ROCM
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
       deepmd::filter_ftype_gpu(p_f_type, p_type, nsamples * nall);
 #endif
     } else if (device == "CPU") {
@@ -1338,35 +1203,7 @@ class ProdEnvMatAMixOp : public OpKernel {
       const int* f_type = p_f_type + ff * nall;
 
       if (device == "GPU") {
-#if GOOGLE_CUDA
-        int* idx_mapping = NULL;
-        int *ilist = NULL, *numneigh = NULL;
-        int** firstneigh = NULL;
-        deepmd::malloc_device_memory(firstneigh, nloc);
-        int* jlist = NULL;
-        FPTYPE* coord_cpy;
-        int* type_cpy;
-        int frame_nall = nall;
-        int mesh_tensor_size = static_cast<int>(mesh_tensor.NumElements());
-        // prepare coord and nlist
-        _prepare_coord_nlist_gpu<FPTYPE>(
-            context, &tensor_list[0], &coord, coord_cpy, &f_type, type_cpy,
-            idx_mapping, gpu_inlist, ilist, numneigh, firstneigh, jlist,
-            nbor_list_dev, frame_nall, mem_cpy, mem_nnei, max_nbor_size, box,
-            mesh_tensor.flat<int>().data(), mesh_tensor_size, nloc, nei_mode,
-            rcut_r, max_cpy_trial, max_nnei_trial);
-
-        // launch the gpu(nv) compute function
-        deepmd::prod_env_mat_a_gpu(em, em_deriv, rij, nlist, coord, type,
-                                   gpu_inlist, array_int, array_longlong,
-                                   max_nbor_size, avg, std, nloc, frame_nall,
-                                   rcut_r, rcut_r_smth, sec_a, f_type);
-        _map_nei_info_gpu(nlist, ntype, nmask, type, idx_mapping, nloc, nnei,
-                          ntypes, b_nlist_map);
-        deepmd::delete_device_memory(firstneigh);
-#endif  // GOOGLE_CUDA
-
-#if TENSORFLOW_USE_ROCM
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
         int* idx_mapping = NULL;
         int *ilist = NULL, *numneigh = NULL;
         int** firstneigh = NULL;
@@ -1392,7 +1229,7 @@ class ProdEnvMatAMixOp : public OpKernel {
         _map_nei_info_gpu(nlist, ntype, nmask, type, idx_mapping, nloc, nnei,
                           ntypes, b_nlist_map);
         deepmd::delete_device_memory(firstneigh);
-#endif  // TENSORFLOW_USE_ROCM
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
       } else if (device == "CPU") {
         deepmd::InputNlist inlist;
         // some buffers, be freed after the evaluation of this frame
@@ -1616,7 +1453,7 @@ static void _prepare_coord_nlist_cpu(OpKernelContext* context,
   }
 }
 
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 template <typename FPTYPE>
 static int _norm_copy_coord_gpu(OpKernelContext* context,
                                 Tensor* tensor_list,
@@ -1632,8 +1469,8 @@ static int _norm_copy_coord_gpu(OpKernelContext* context,
                                 const int& max_cpy_trial,
                                 const float& rcut_r) {
   FPTYPE* tmp_coord = (*tensor_list).flat<FPTYPE>().data();
-  DPErrcheck(cudaMemcpy(tmp_coord, coord, sizeof(FPTYPE) * nall * 3,
-                        cudaMemcpyDeviceToDevice));
+  DPErrcheck(gpuMemcpy(tmp_coord, coord, sizeof(FPTYPE) * nall * 3,
+                       gpuMemcpyDeviceToDevice));
 
   deepmd::Region<FPTYPE> region;
   init_region_cpu(region, box);
@@ -1877,270 +1714,7 @@ static void _prepare_coord_nlist_gpu(OpKernelContext* context,
                     ", which currently is not supported by deepmd-kit."));
   }
 }
-#endif  // GOOGLE_CUDA
-
-#if TENSORFLOW_USE_ROCM
-template <typename FPTYPE>
-static int _norm_copy_coord_gpu(OpKernelContext* context,
-                                Tensor* tensor_list,
-                                FPTYPE*& coord_cpy,
-                                int*& type_cpy,
-                                int*& idx_mapping,
-                                int& nall,
-                                int& mem_cpy,
-                                const FPTYPE* coord,
-                                const FPTYPE* box,
-                                const int* type,
-                                const int& nloc,
-                                const int& max_cpy_trial,
-                                const float& rcut_r) {
-  FPTYPE* tmp_coord = (*tensor_list).flat<FPTYPE>().data();
-  DPErrcheck(hipMemcpy(tmp_coord, coord, sizeof(FPTYPE) * nall * 3,
-                       hipMemcpyDeviceToDevice));
-
-  deepmd::Region<FPTYPE> region;
-  init_region_cpu(region, box);
-  FPTYPE box_info[18];
-  std::copy(region.boxt, region.boxt + 9, box_info);
-  std::copy(region.rec_boxt, region.rec_boxt + 9, box_info + 9);
-  int cell_info[23];
-  deepmd::compute_cell_info(cell_info, rcut_r, region);
-  const int loc_cellnum = cell_info[21];
-  const int total_cellnum = cell_info[22];
-  // Tensor int_temp;
-  TensorShape int_shape;
-  int_shape.AddDim(23 + nloc * 3 + loc_cellnum + total_cellnum * 3 +
-                   total_cellnum * 3 + loc_cellnum + 1 + total_cellnum + 1 +
-                   nloc);
-  tensorflow::Status status =
-      context->allocate_temp(DT_INT32, int_shape, tensor_list + 2);
-  if (!status.ok()) {
-    return false;
-  }
-  FPTYPE* box_info_dev = (*(tensor_list + 1)).flat<FPTYPE>().data();
-  int* cell_info_dev = (*(tensor_list + 2)).flat<int>().data();
-  int* int_data_dev = cell_info_dev + 23;
-  deepmd::memcpy_host_to_device(box_info_dev, box_info, 18);
-  deepmd::memcpy_host_to_device(cell_info_dev, cell_info, 23);
-  deepmd::Region<FPTYPE> region_dev;
-  FPTYPE* new_boxt = region_dev.boxt;
-  FPTYPE* new_rec_boxt = region_dev.rec_boxt;
-  region_dev.boxt = box_info_dev;
-  region_dev.rec_boxt = box_info_dev + 9;
-  deepmd::normalize_coord_gpu(tmp_coord, nall, region_dev);
-  int tt;
-  for (tt = 0; tt < max_cpy_trial; ++tt) {
-    coord_cpy = (*(tensor_list + 3)).flat<FPTYPE>().data();
-    type_cpy = (*(tensor_list + 4)).flat<int>().data();
-    idx_mapping = type_cpy + mem_cpy;
-    int ret = deepmd::copy_coord_gpu(
-        coord_cpy, type_cpy, idx_mapping, &nall, int_data_dev, tmp_coord, type,
-        nloc, mem_cpy, loc_cellnum, total_cellnum, cell_info_dev, region_dev);
-    if (ret == 0) {
-      break;
-    } else {
-      mem_cpy *= 2;
-      // Tensor cpy_temp;
-      TensorShape cpy_shape;
-      cpy_shape.AddDim(mem_cpy * 3);
-      tensorflow::Status status = context->allocate_temp(
-          DataTypeToEnum<FPTYPE>::value, cpy_shape, tensor_list + 3);
-      if (!status.ok()) {
-        return false;
-      }
-      // Tensor t_temp;
-      TensorShape t_shape;
-      t_shape.AddDim(mem_cpy * 2);
-      status = context->allocate_temp(DT_INT32, t_shape, tensor_list + 4);
-      if (!status.ok()) {
-        return false;
-      }
-    }
-  }
-  region_dev.boxt = new_boxt;
-  region_dev.rec_boxt = new_rec_boxt;
-  return (tt != max_cpy_trial);
-}
-
-template <typename FPTYPE>
-static int _build_nlist_gpu(OpKernelContext* context,
-                            Tensor* tensor_list,
-                            int*& ilist,
-                            int*& numneigh,
-                            int**& firstneigh,
-                            int*& jlist,
-                            int& max_nnei,
-                            int& mem_nnei,
-                            const FPTYPE* coord,
-                            const int& nloc,
-                            const int& new_nall,
-                            const int& max_nnei_trial,
-                            const float& rcut_r) {
-  ilist = (*tensor_list).flat<int>().data();
-  numneigh = ilist + nloc;
-  // Tensor jlist_temp;
-  int* ind_data = NULL;
-
-  std::vector<int*> firstneigh_host(nloc);
-  int tt;
-  for (tt = 0; tt < max_nnei_trial; ++tt) {
-    jlist = (*(tensor_list + 1)).flat<int>().data();
-    ind_data = jlist + nloc * mem_nnei;
-    for (int_64 ii = 0; ii < nloc; ++ii) {
-      firstneigh_host[ii] = jlist + ii * mem_nnei;
-    }
-    deepmd::memcpy_host_to_device(firstneigh, firstneigh_host);
-    deepmd::InputNlist inlist(nloc, ilist, numneigh, firstneigh);
-    int ret = deepmd::build_nlist_gpu(inlist, &max_nnei, ind_data, coord, nloc,
-                                      new_nall, mem_nnei, rcut_r);
-    if (ret == 0) {
-      break;
-    } else {
-      mem_nnei *= 2;
-      TensorShape jlist_shape;
-      jlist_shape.AddDim(3 * int_64(nloc) * mem_nnei);
-      tensorflow::Status status =
-          context->allocate_temp(DT_INT32, jlist_shape, tensor_list + 1);
-      if (!status.ok()) {
-        return false;
-      }
-    }
-  }
-  return (tt != max_nnei_trial);
-}
-
-static void _map_nlist_gpu(int* nlist,
-                           const int* idx_mapping,
-                           const int& nloc,
-                           const int& nnei) {
-  deepmd::use_nlist_map(nlist, idx_mapping, nloc, nnei);
-}
-
-static void _map_nei_info_gpu(int* nlist,
-                              int* ntype,
-                              bool* nmask,
-                              const int* type,
-                              const int* idx_mapping,
-                              const int& nloc,
-                              const int& nnei,
-                              const int& ntypes,
-                              const bool& b_nlist_map) {
-  deepmd::use_nei_info_gpu(nlist, ntype, nmask, type, idx_mapping, nloc, nnei,
-                           ntypes, b_nlist_map);
-}
-
-template <typename FPTYPE>
-static void _prepare_coord_nlist_gpu(OpKernelContext* context,
-                                     Tensor* tensor_list,
-                                     FPTYPE const** coord,
-                                     FPTYPE*& coord_cpy,
-                                     int const** type,
-                                     int*& type_cpy,
-                                     int*& idx_mapping,
-                                     deepmd::InputNlist& inlist,
-                                     int*& ilist,
-                                     int*& numneigh,
-                                     int**& firstneigh,
-                                     int*& jlist,
-                                     int*& nbor_list_dev,
-                                     int& new_nall,
-                                     int& mem_cpy,
-                                     int& mem_nnei,
-                                     int& max_nbor_size,
-                                     const FPTYPE* box,
-                                     const int* mesh_tensor_data,
-                                     const int mesh_tensor_size,
-                                     const int& nloc,
-                                     const int& nei_mode,
-                                     const float& rcut_r,
-                                     const int& max_cpy_trial,
-                                     const int& max_nnei_trial) {
-  if (nei_mode != 3 && nei_mode != 4) {
-    inlist.inum = nloc;
-    // build nlist by myself
-    // normalize and copy coord
-    if (nei_mode == 1) {
-      int copy_ok = _norm_copy_coord_gpu(
-          context, tensor_list, coord_cpy, type_cpy, idx_mapping, new_nall,
-          mem_cpy, *coord, box, *type, nloc, max_cpy_trial, rcut_r);
-      OP_REQUIRES(context, copy_ok,
-                  errors::Aborted("cannot allocate mem for copied coords"));
-      *coord = coord_cpy;
-      *type = type_cpy;
-    }
-    // build nlist
-    int build_ok =
-        _build_nlist_gpu(context, tensor_list + 5, ilist, numneigh, firstneigh,
-                         jlist, max_nbor_size, mem_nnei, *coord, nloc, new_nall,
-                         max_nnei_trial, rcut_r);
-    OP_REQUIRES(context, build_ok,
-                errors::Aborted("cannot allocate mem for nlist"));
-    if (max_nbor_size <= 1024) {
-      max_nbor_size = 1024;
-    } else if (max_nbor_size <= 2048) {
-      max_nbor_size = 2048;
-    } else {
-      max_nbor_size = 4096;
-    }
-    inlist.ilist = ilist;
-    inlist.numneigh = numneigh;
-    inlist.firstneigh = firstneigh;
-  } else if (nei_mode == 4) {
-    // TODO: in theory, it will be faster to put everything on GPUs...
-    std::vector<int> mesh_tensor_data_host(mesh_tensor_size);
-    std::vector<int> ilist_host(nloc);
-    std::vector<int> numneigh_host(nloc);
-    std::vector<int*> firstneigh_host(nloc);
-    std::vector<int> fake_mesh(16);
-
-    // copy from gpu to cpu
-    deepmd::memcpy_device_to_host(mesh_tensor_data, mesh_tensor_data_host);
-    std::memcpy(&ilist_host[0], &mesh_tensor_data_host[16], sizeof(int) * nloc);
-    std::memcpy(&numneigh_host[0], &mesh_tensor_data_host[16 + nloc],
-                sizeof(int) * nloc);
-    for (int ii = 0, kk = 0; ii < nloc; ++ii) {
-      firstneigh_host[ii] = &mesh_tensor_data_host[16 + 2 * nloc + kk];
-      kk += numneigh_host[ii];
-    }
-    // make a fake mesh
-    fake_mesh[0] = 0;
-    fake_mesh[1] = nloc;
-    std::memcpy(&fake_mesh[4], &ilist_host, sizeof(int*));
-    std::memcpy(&fake_mesh[8], &numneigh_host, sizeof(int*));
-    std::memcpy(&fake_mesh[12], &firstneigh_host, sizeof(int**));
-    // copy from cpu to gpu
-    int* fake_mesh_dev = NULL;
-    deepmd::malloc_device_memory(fake_mesh_dev, 16);
-    deepmd::memcpy_host_to_device(fake_mesh_dev, fake_mesh);
-
-    deepmd::InputNlist inlist_temp;
-    inlist_temp.inum = nloc;
-    // everything should be copied to GPU...
-    deepmd::env_mat_nbor_update(inlist_temp, inlist, max_nbor_size,
-                                nbor_list_dev, fake_mesh_dev, 16);
-    OP_REQUIRES(context, (max_numneigh(inlist_temp) <= max_nbor_size),
-                errors::InvalidArgument(
-                    "Assert failed, max neighbor size of atom(lammps) " +
-                    std::to_string(max_numneigh(inlist_temp)) +
-                    " is larger than " + std::to_string(max_nbor_size) +
-                    ", which currently is not supported by deepmd-kit."));
-    deepmd::delete_device_memory(fake_mesh_dev);
-  } else {
-    // update nbor list
-    deepmd::InputNlist inlist_temp;
-    inlist_temp.inum = nloc;
-    deepmd::env_mat_nbor_update(inlist_temp, inlist, max_nbor_size,
-                                nbor_list_dev, mesh_tensor_data,
-                                mesh_tensor_size);
-    OP_REQUIRES(context, (max_numneigh(inlist_temp) <= max_nbor_size),
-                errors::InvalidArgument(
-                    "Assert failed, max neighbor size of atom(lammps) " +
-                    std::to_string(max_numneigh(inlist_temp)) +
-                    " is larger than " + std::to_string(max_nbor_size) +
-                    ", which currently is not supported by deepmd-kit."));
-  }
-}
-#endif  // TENSORFLOW_USE_ROCM
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 
 // Register the CPU kernels.
 // Compatible with v1.3
diff --git a/source/op/prod_env_mat_multi_device_nvnmd.cc b/source/op/prod_env_mat_multi_device_nvnmd.cc
index b5863d1b71..abca947f0a 100644
--- a/source/op/prod_env_mat_multi_device_nvnmd.cc
+++ b/source/op/prod_env_mat_multi_device_nvnmd.cc
@@ -471,13 +471,9 @@ class ProdEnvMatANvnmdQuantizeOp : public OpKernel {
       const int* type = p_type + ff * nall;
 
       if (device == "GPU") {
-#if GOOGLE_CUDA
-// UNDEFINE
-#endif  // GOOGLE_CUDA
-
-#if TENSORFLOW_USE_ROCM
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 // UNDEFINE
-#endif  // TENSORFLOW_USE_ROCM
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
       } else if (device == "CPU") {
         deepmd::InputNlist inlist;
         // some buffers, be freed after the evaluation of this frame
@@ -720,13 +716,9 @@ class ProdEnvMatAMixNvnmdQuantizeOp : public OpKernel {
       const int* type = p_type + ff * nall;
 
       if (device == "GPU") {
-#if GOOGLE_CUDA
-// UNDEFINE
-#endif  // GOOGLE_CUDA
-
-#if TENSORFLOW_USE_ROCM
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 // UNDEFINE
-#endif  // TENSORFLOW_USE_ROCM
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
       } else if (device == "CPU") {
         deepmd::InputNlist inlist;
         // some buffers, be freed after the evaluation of this frame
diff --git a/source/op/prod_force_grad_multi_device.cc b/source/op/prod_force_grad_multi_device.cc
index 7d8a664a8d..ffcd8f0b8b 100644
--- a/source/op/prod_force_grad_multi_device.cc
+++ b/source/op/prod_force_grad_multi_device.cc
@@ -121,15 +121,10 @@ class ProdForceSeAGradOp : public OpKernel {
     const int* p_nlist = nlist_tensor.flat<int>().data();
 
     if (device == "GPU") {
-#if GOOGLE_CUDA
-      deepmd::prod_force_grad_a_gpu(p_grad_net, p_grad, p_in_deriv, p_nlist,
-                                    nloc, nnei, nframes);
-#endif  // GOOGLE_CUDA
-
-#if TENSORFLOW_USE_ROCM
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
       deepmd::prod_force_grad_a_gpu(p_grad_net, p_grad, p_in_deriv, p_nlist,
                                     nloc, nnei, nframes);
-#endif  // TENSORFLOW_USE_ROCM
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
     } else if (device == "CPU") {
       deepmd::prod_force_grad_a_cpu(p_grad_net, p_grad, p_in_deriv, p_nlist,
                                     nloc, nnei, nframes);
@@ -234,15 +229,10 @@ class ProdForceSeRGradOp : public OpKernel {
     const int* p_nlist = nlist_tensor.flat<int>().data();
 
     if (device == "GPU") {
-#if GOOGLE_CUDA
-      deepmd::prod_force_grad_r_gpu(p_grad_net, p_grad, p_in_deriv, p_nlist,
-                                    nloc, nnei, nframes);
-#endif  // GOOGLE_CUDA
-
-#if TENSORFLOW_USE_ROCM
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
       deepmd::prod_force_grad_r_gpu(p_grad_net, p_grad, p_in_deriv, p_nlist,
                                     nloc, nnei, nframes);
-#endif  // TENSORFLOW_USE_ROCM
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
     } else if (device == "CPU") {
       deepmd::prod_force_grad_r_cpu(p_grad_net, p_grad, p_in_deriv, p_nlist,
                                     nloc, nnei, nframes);
diff --git a/source/op/prod_force_multi_device.cc b/source/op/prod_force_multi_device.cc
index 9d553b1f0c..935b5b9f2f 100644
--- a/source/op/prod_force_multi_device.cc
+++ b/source/op/prod_force_multi_device.cc
@@ -142,15 +142,10 @@ class ProdForceSeAOp : public OpKernel {
     }
 
     if (device == "GPU") {
-#if GOOGLE_CUDA
-      deepmd::prod_force_a_gpu(p_force, p_net_deriv, p_in_deriv, p_nlist, nloc,
-                               nall, nnei, nframes);
-#endif  // GOOGLE_CUDA
-
-#if TENSORFLOW_USE_ROCM
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
       deepmd::prod_force_a_gpu(p_force, p_net_deriv, p_in_deriv, p_nlist, nloc,
                                nall, nnei, nframes);
-#endif  // TENSORFLOW_USE_ROCM
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
     } else if (device == "CPU") {
       deepmd::prod_force_a_cpu(p_force, p_net_deriv, p_in_deriv, p_nlist, nloc,
                                nall, nnei, nframes, nloc_loc,
@@ -228,15 +223,10 @@ class ProdForceSeROp : public OpKernel {
     const int* p_nlist = nlist_tensor.flat<int>().data();
 
     if (device == "GPU") {
-#if GOOGLE_CUDA
-      deepmd::prod_force_r_gpu(p_force, p_net_deriv, p_in_deriv, p_nlist, nloc,
-                               nall, nnei, nframes);
-#endif  // GOOGLE_CUDA
-
-#if TENSORFLOW_USE_ROCM
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
       deepmd::prod_force_r_gpu(p_force, p_net_deriv, p_in_deriv, p_nlist, nloc,
                                nall, nnei, nframes);
-#endif  // TENSORFLOW_USE_ROCM
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
     } else if (device == "CPU") {
       deepmd::prod_force_r_cpu(p_force, p_net_deriv, p_in_deriv, p_nlist, nloc,
                                nall, nnei, nframes);
diff --git a/source/op/prod_virial_grad_multi_device.cc b/source/op/prod_virial_grad_multi_device.cc
index ef7d10b3bd..d3e7025e6e 100644
--- a/source/op/prod_virial_grad_multi_device.cc
+++ b/source/op/prod_virial_grad_multi_device.cc
@@ -142,15 +142,10 @@ class ProdVirialSeAGradOp : public OpKernel {
       const FPTYPE* rij = p_rij + kk * nloc * nnei * 3;
       const int* nlist = p_nlist + kk * nloc * nnei;
       if (device == "GPU") {
-#if GOOGLE_CUDA
-        deepmd::prod_virial_grad_a_gpu(grad_net, grad, in_deriv, rij, nlist,
-                                       nloc, nnei);
-#endif  // GOOGLE_CUDA
-
-#if TENSORFLOW_USE_ROCM
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
         deepmd::prod_virial_grad_a_gpu(grad_net, grad, in_deriv, rij, nlist,
                                        nloc, nnei);
-#endif  // TENSORFLOW_USE_ROCM
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
       } else if (device == "CPU") {
         deepmd::prod_virial_grad_a_cpu(grad_net, grad, in_deriv, rij, nlist,
                                        nloc, nnei);
@@ -275,15 +270,10 @@ class ProdVirialSeRGradOp : public OpKernel {
       const FPTYPE* rij = p_rij + kk * nloc * nnei * 3;
       const int* nlist = p_nlist + kk * nloc * nnei;
       if (device == "GPU") {
-#if GOOGLE_CUDA
-        deepmd::prod_virial_grad_r_gpu(grad_net, grad, in_deriv, rij, nlist,
-                                       nloc, nnei);
-#endif  // GOOGLE_CUDA
-
-#if TENSORFLOW_USE_ROCM
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
         deepmd::prod_virial_grad_r_gpu(grad_net, grad, in_deriv, rij, nlist,
                                        nloc, nnei);
-#endif  // TENSORFLOW_USE_ROCM
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
       } else if (device == "CPU") {
         deepmd::prod_virial_grad_r_cpu(grad_net, grad, in_deriv, rij, nlist,
                                        nloc, nnei);
diff --git a/source/op/prod_virial_multi_device.cc b/source/op/prod_virial_multi_device.cc
index e3960fc37d..445770e85a 100644
--- a/source/op/prod_virial_multi_device.cc
+++ b/source/op/prod_virial_multi_device.cc
@@ -120,15 +120,10 @@ class ProdVirialSeAOp : public OpKernel {
       const FPTYPE* rij = p_rij + kk * nloc * nnei * 3;
       const int* nlist = p_nlist + kk * nloc * nnei;
       if (device == "GPU") {
-#if GOOGLE_CUDA
-        deepmd::prod_virial_a_gpu(virial, atom_virial, net_deriv, in_deriv, rij,
-                                  nlist, nloc, nall, nnei);
-#endif  // GOOGLE_CUDA
-
-#if TENSORFLOW_USE_ROCM
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
         deepmd::prod_virial_a_gpu(virial, atom_virial, net_deriv, in_deriv, rij,
                                   nlist, nloc, nall, nnei);
-#endif  // TENSORFLOW_USE_ROCM
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
       } else if (device == "CPU") {
         deepmd::prod_virial_a_cpu(virial, atom_virial, net_deriv, in_deriv, rij,
                                   nlist, nloc, nall, nnei);
@@ -224,15 +219,10 @@ class ProdVirialSeROp : public OpKernel {
       const FPTYPE* rij = p_rij + kk * nloc * nnei * 3;
       const int* nlist = p_nlist + kk * nloc * nnei;
       if (device == "GPU") {
-#if GOOGLE_CUDA
-        deepmd::prod_virial_r_gpu(virial, atom_virial, net_deriv, in_deriv, rij,
-                                  nlist, nloc, nall, nnei);
-#endif  // GOOGLE_CUDA
-
-#if TENSORFLOW_USE_ROCM
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
         deepmd::prod_virial_r_gpu(virial, atom_virial, net_deriv, in_deriv, rij,
                                   nlist, nloc, nall, nnei);
-#endif  // TENSORFLOW_USE_ROCM
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
       } else if (device == "CPU") {
         deepmd::prod_virial_r_cpu(virial, atom_virial, net_deriv, in_deriv, rij,
                                   nlist, nloc, nall, nnei);
diff --git a/source/op/tabulate_multi_device.cc b/source/op/tabulate_multi_device.cc
index 886b9d9a6d..85ea82803a 100644
--- a/source/op/tabulate_multi_device.cc
+++ b/source/op/tabulate_multi_device.cc
@@ -196,15 +196,10 @@ class TabulateFusionSeAOp : public OpKernel {
     const int nnei = em_tensor.shape().dim_size(1);
 
     if (device == "GPU") {
-#if GOOGLE_CUDA
-      deepmd::tabulate_fusion_se_a_gpu(descriptor, table, table_info, em_x, em,
-                                       two_embed, nloc, nnei, last_layer_size);
-#endif  // GOOGLE_CUDA
-
-#if TENSORFLOW_USE_ROCM
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
       deepmd::tabulate_fusion_se_a_gpu(descriptor, table, table_info, em_x, em,
                                        two_embed, nloc, nnei, last_layer_size);
-#endif  // TENSORFLOW_USE_ROCM
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
     } else if (device == "CPU") {
       deepmd::tabulate_fusion_se_a_cpu(descriptor, table, table_info, em_x, em,
                                        two_embed, nloc, nnei, last_layer_size);
@@ -266,17 +261,11 @@ class TabulateFusionSeAGradOp : public OpKernel {
     const int last_layer_size = descriptor_tensor.shape().dim_size(2);
 
     if (device == "GPU") {
-#if GOOGLE_CUDA
-      deepmd::tabulate_fusion_se_a_grad_gpu(dy_dem_x, dy_dem, table, table_info,
-                                            em_x, em, two_embed, dy, nloc, nnei,
-                                            last_layer_size);
-#endif  // GOOGLE_CUDA
-
-#if TENSORFLOW_USE_ROCM
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
       deepmd::tabulate_fusion_se_a_grad_gpu(dy_dem_x, dy_dem, table, table_info,
                                             em_x, em, two_embed, dy, nloc, nnei,
                                             last_layer_size);
-#endif  // TENSORFLOW_USE_ROCM
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
     } else if (device == "CPU") {
       deepmd::tabulate_fusion_se_a_grad_cpu(dy_dem_x, dy_dem, table, table_info,
                                             em_x, em, two_embed, dy, nloc, nnei,
@@ -330,16 +319,11 @@ class TabulateFusionSeAGradGradOp : public OpKernel {
     const int last_layer_size = descriptor_tensor.shape().dim_size(2);
 
     if (device == "GPU") {
-#if GOOGLE_CUDA
-      deepmd::tabulate_fusion_se_a_grad_grad_gpu(
-          dz_dy, table, table_info, em_x, em, dz_dy_dem_x, dz_dy_dem, nloc,
-          nnei, last_layer_size, is_sorted);
-#endif  // GOOGLE_CUDA
-#if TENSORFLOW_USE_ROCM
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
       deepmd::tabulate_fusion_se_a_grad_grad_gpu(
           dz_dy, table, table_info, em_x, em, dz_dy_dem_x, dz_dy_dem, nloc,
           nnei, last_layer_size, is_sorted);
-#endif  // TENSORFLOW_USE_ROCM
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
       OP_REQUIRES(context, (last_layer_size <= 1024),
                   errors::InvalidArgument(
                       "In the process of model compression, the size of the "
@@ -408,17 +392,11 @@ class TabulateFusionSeAttenOp : public OpKernel {
     const int nnei = em_tensor.shape().dim_size(1);
 
     if (device == "GPU") {
-#if GOOGLE_CUDA
-      deepmd::tabulate_fusion_se_a_gpu(descriptor, table, table_info, em_x, em,
-                                       two_embed, nloc, nnei, last_layer_size,
-                                       is_sorted);
-#endif  // GOOGLE_CUDA
-
-#if TENSORFLOW_USE_ROCM
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
       deepmd::tabulate_fusion_se_a_gpu(descriptor, table, table_info, em_x, em,
                                        two_embed, nloc, nnei, last_layer_size,
                                        is_sorted);
-#endif  // TENSORFLOW_USE_ROCM
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
     } else if (device == "CPU") {
       deepmd::tabulate_fusion_se_a_cpu(descriptor, table, table_info, em_x, em,
                                        two_embed, nloc, nnei, last_layer_size,
@@ -489,17 +467,11 @@ class TabulateFusionSeAttenGradOp : public OpKernel {
     const int last_layer_size = descriptor_tensor.shape().dim_size(2);
 
     if (device == "GPU") {
-#if GOOGLE_CUDA
-      deepmd::tabulate_fusion_se_a_grad_gpu(dy_dem_x, dy_dem, table, table_info,
-                                            em_x, em, two_embed, dy, nloc, nnei,
-                                            last_layer_size, is_sorted);
-#endif  // GOOGLE_CUDA
-
-#if TENSORFLOW_USE_ROCM
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
       deepmd::tabulate_fusion_se_a_grad_gpu(dy_dem_x, dy_dem, table, table_info,
                                             em_x, em, two_embed, dy, nloc, nnei,
                                             last_layer_size, is_sorted);
-#endif  // TENSORFLOW_USE_ROCM
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
     } else if (device == "CPU") {
       deepmd::tabulate_fusion_se_a_grad_cpu(dy_dem_x, dy_dem, table, table_info,
                                             em_x, em, two_embed, dy, nloc, nnei,
@@ -559,15 +531,10 @@ class TabulateFusionSeTOp : public OpKernel {
     const int nnei_j = em_tensor.shape().dim_size(2);
 
     if (device == "GPU") {
-#if GOOGLE_CUDA
-      deepmd::tabulate_fusion_se_t_gpu(descriptor, table, table_info, em_x, em,
-                                       nloc, nnei_i, nnei_j, last_layer_size);
-#endif  // GOOGLE_CUDA
-
-#if TENSORFLOW_USE_ROCM
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
       deepmd::tabulate_fusion_se_t_gpu(descriptor, table, table_info, em_x, em,
                                        nloc, nnei_i, nnei_j, last_layer_size);
-#endif  // TENSORFLOW_USE_ROCM
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
     } else if (device == "CPU") {
       deepmd::tabulate_fusion_se_t_cpu(descriptor, table, table_info, em_x, em,
                                        nloc, nnei_i, nnei_j, last_layer_size);
@@ -627,17 +594,11 @@ class TabulateFusionSeTGradOp : public OpKernel {
     const int last_layer_size = descriptor_tensor.shape().dim_size(1);
 
     if (device == "GPU") {
-#if GOOGLE_CUDA
-      deepmd::tabulate_fusion_se_t_grad_gpu(dy_dem_x, dy_dem, table, table_info,
-                                            em_x, em, dy, nloc, nnei_i, nnei_j,
-                                            last_layer_size);
-#endif  // GOOGLE_CUDA
-
-#if TENSORFLOW_USE_ROCM
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
       deepmd::tabulate_fusion_se_t_grad_gpu(dy_dem_x, dy_dem, table, table_info,
                                             em_x, em, dy, nloc, nnei_i, nnei_j,
                                             last_layer_size);
-#endif  // TENSORFLOW_USE_ROCM
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
     } else if (device == "CPU") {
       deepmd::tabulate_fusion_se_t_grad_cpu(dy_dem_x, dy_dem, table, table_info,
                                             em_x, em, dy, nloc, nnei_i, nnei_j,
@@ -690,16 +651,11 @@ class TabulateFusionSeTGradGradOp : public OpKernel {
     const int last_layer_size = descriptor_tensor.shape().dim_size(1);
 
     if (device == "GPU") {
-#if GOOGLE_CUDA
-      deepmd::tabulate_fusion_se_t_grad_grad_gpu(
-          dz_dy, table, table_info, em_x, em, dz_dy_dem_x, dz_dy_dem, nloc,
-          nnei_i, nnei_j, last_layer_size);
-#endif  // GOOGLE_CUDA
-#if TENSORFLOW_USE_ROCM
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
       deepmd::tabulate_fusion_se_t_grad_grad_gpu(
           dz_dy, table, table_info, em_x, em, dz_dy_dem_x, dz_dy_dem, nloc,
           nnei_i, nnei_j, last_layer_size);
-#endif  // TENSORFLOW_USE_ROCM
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
       OP_REQUIRES(context, (last_layer_size <= 1024),
                   errors::InvalidArgument(
                       "In the process of model compression, the size of the "
@@ -758,15 +714,10 @@ class TabulateFusionSeROp : public OpKernel {
     const int nnei = em_tensor.shape().dim_size(1);
 
     if (device == "GPU") {
-#if GOOGLE_CUDA
-      deepmd::tabulate_fusion_se_r_gpu(descriptor, table, table_info, em, nloc,
-                                       nnei, last_layer_size);
-#endif  // GOOGLE_CUDA
-
-#if TENSORFLOW_USE_ROCM
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
       deepmd::tabulate_fusion_se_r_gpu(descriptor, table, table_info, em, nloc,
                                        nnei, last_layer_size);
-#endif  // TENSORFLOW_USE_ROCM
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
     } else if (device == "CPU") {
       deepmd::tabulate_fusion_se_r_cpu(descriptor, table, table_info, em, nloc,
                                        nnei, last_layer_size);
@@ -818,15 +769,10 @@ class TabulateFusionSeRGradOp : public OpKernel {
     const int last_layer_size = descriptor_tensor.shape().dim_size(2);
 
     if (device == "GPU") {
-#if GOOGLE_CUDA
-      deepmd::tabulate_fusion_se_r_grad_gpu(dy_dem, table, table_info, em, dy,
-                                            nloc, nnei, last_layer_size);
-#endif  // GOOGLE_CUDA
-
-#if TENSORFLOW_USE_ROCM
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
       deepmd::tabulate_fusion_se_r_grad_gpu(dy_dem, table, table_info, em, dy,
                                             nloc, nnei, last_layer_size);
-#endif  // TENSORFLOW_USE_ROCM
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
     } else if (device == "CPU") {
       deepmd::tabulate_fusion_se_r_grad_cpu(dy_dem, table, table_info, em, dy,
                                             nloc, nnei, last_layer_size);
@@ -871,14 +817,10 @@ class TabulateFusionSeRGradGradOp : public OpKernel {
     const int last_layer_size = descriptor_tensor.shape().dim_size(2);
 
     if (device == "GPU") {
-#if GOOGLE_CUDA
-      deepmd::tabulate_fusion_se_r_grad_grad_gpu(
-          dz_dy, table, table_info, em, dz_dy_dem, nloc, nnei, last_layer_size);
-#endif  // GOOGLE_CUDA
-#if TENSORFLOW_USE_ROCM
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
       deepmd::tabulate_fusion_se_r_grad_grad_gpu(
           dz_dy, table, table_info, em, dz_dy_dem, nloc, nnei, last_layer_size);
-#endif  // TENSORFLOW_USE_ROCM
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
       OP_REQUIRES(context, (last_layer_size <= 1024),
                   errors::InvalidArgument(
                       "In the process of model compression, the size of the "
diff --git a/source/op/unaggregated_grad.cc b/source/op/unaggregated_grad.cc
index bc67a9fac9..9a61a3bac9 100644
--- a/source/op/unaggregated_grad.cc
+++ b/source/op/unaggregated_grad.cc
@@ -490,7 +490,7 @@ REGISTER_CPU(float);
 REGISTER_CPU(double);
 // Not required in the current situation
 // // Register the GPU kernels.
-// #if GOOGLE_CUDA
+// #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 // #define REGISTER_GPU(T) \
 // REGISTER_KERNEL_BUILDER( \
 //     Name("UnaggregatedDyDxS").Device(DEVICE_GPU).TypeConstraint<T>("T"), \
@@ -500,4 +500,4 @@ REGISTER_CPU(double);
 //     UnaggregatedDyDxOp<GPUDevice, T>);
 // REGISTER_GPU(float);
 // REGISTER_GPU(double);
-// #endif  // GOOGLE_CUDA
+// #endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

From 5aca4bf7dd9f91edfc2db2a5d055f9308db29500 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Tue, 26 Sep 2023 15:12:09 +0800
Subject: [PATCH 61/63] Bump pypa/cibuildwheel from 2.15 to 2.16 (#2861)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Bumps [pypa/cibuildwheel](https://github.com/pypa/cibuildwheel) from
2.15 to 2.16.
<details>
<summary>Release notes</summary>
<p><em>Sourced from <a
href="https://github.com/pypa/cibuildwheel/releases">pypa/cibuildwheel's
releases</a>.</em></p>
<blockquote>
<h2>v2.16.0</h2>
<ul>
<li>✨ Add the ability to pass additional flags to a build frontend
through the <a
href="https://cibuildwheel.readthedocs.io/en/stable/options/#build-frontend">CIBW_BUILD_FRONTEND</a>
option (<a
href="https://redirect.github.com/pypa/cibuildwheel/issues/1588">#1588</a>).</li>
<li>✨ The environment variable SOURCE_DATE_EPOCH is now automatically
passed through to container Linux builds (useful for <a
href="https://reproducible-builds.org/docs/source-date-epoch/">reproducible
builds</a>!) (<a
href="https://redirect.github.com/pypa/cibuildwheel/issues/1589">#1589</a>)</li>
<li>🛠 Updates the prerelease CPython 3.12 version to 3.12.0rc2 (<a
href="https://redirect.github.com/pypa/cibuildwheel/issues/1604">#1604</a>)</li>
<li>🐛 Fix <code>requires_python</code> auto-detection from setup.py when
the call to <code>setup()</code> is within an <code>if __name__ ==
&quot;__main__&quot;</code> block (<a
href="https://redirect.github.com/pypa/cibuildwheel/issues/1613">#1613</a>)</li>
<li>🐛 Fix a bug that prevented building Linux wheels in Docker on a
Windows host (<a
href="https://redirect.github.com/pypa/cibuildwheel/issues/1573">#1573</a>)</li>
<li>🐛 <code>--only</code> can now select prerelease-pythons (<a
href="https://redirect.github.com/pypa/cibuildwheel/issues/1564">#1564</a>)</li>
<li>📚 Docs &amp; examples updates (<a
href="https://redirect.github.com/pypa/cibuildwheel/issues/1582">#1582</a>,
<a
href="https://redirect.github.com/pypa/cibuildwheel/issues/1593">#1593</a>,
<a
href="https://redirect.github.com/pypa/cibuildwheel/issues/1598">#1598</a>,
<a
href="https://redirect.github.com/pypa/cibuildwheel/issues/1615">#1615</a>)</li>
</ul>
</blockquote>
</details>
<details>
<summary>Changelog</summary>
<p><em>Sourced from <a
href="https://github.com/pypa/cibuildwheel/blob/main/docs/changelog.md">pypa/cibuildwheel's
changelog</a>.</em></p>
<blockquote>
<hr />
<h2>title: Changelog</h2>
<h1>Changelog</h1>
<h3>v2.16.0</h3>
<p><em>18 September 2023</em></p>
<ul>
<li>✨ Add the ability to pass additional flags to a build frontend
through the <a
href="https://cibuildwheel.readthedocs.io/en/stable/options/#build-frontend">CIBW_BUILD_FRONTEND</a>
option (<a
href="https://redirect.github.com/pypa/cibuildwheel/issues/1588">#1588</a>).</li>
<li>✨ The environment variable SOURCE_DATE_EPOCH is now automatically
passed through to container Linux builds (useful for <a
href="https://reproducible-builds.org/docs/source-date-epoch/">reproducible
builds</a>!) (<a
href="https://redirect.github.com/pypa/cibuildwheel/issues/1589">#1589</a>)</li>
<li>🛠 Updates the prerelease CPython 3.12 version to 3.12.0rc2 (<a
href="https://redirect.github.com/pypa/cibuildwheel/issues/1604">#1604</a>)</li>
<li>🐛 Fix <code>requires_python</code> auto-detection from setup.py when
the call to <code>setup()</code> is within an <code>if __name__ ==
&quot;__main__&quot;</code> block (<a
href="https://redirect.github.com/pypa/cibuildwheel/issues/1613">#1613</a>)</li>
<li>🐛 Fix a bug that prevented building Linux wheels in Docker on a
Windows host (<a
href="https://redirect.github.com/pypa/cibuildwheel/issues/1573">#1573</a>)</li>
<li>🐛 <code>--only</code> can now select prerelease-pythons (<a
href="https://redirect.github.com/pypa/cibuildwheel/issues/1564">#1564</a>)</li>
<li>📚 Docs &amp; examples updates (<a
href="https://redirect.github.com/pypa/cibuildwheel/issues/1582">#1582</a>,
<a
href="https://redirect.github.com/pypa/cibuildwheel/issues/1593">#1593</a>,
<a
href="https://redirect.github.com/pypa/cibuildwheel/issues/1598">#1598</a>,
<a
href="https://redirect.github.com/pypa/cibuildwheel/issues/1615">#1615</a>)</li>
</ul>
<h3>v2.15.0</h3>
<p><em>8 August 2023</em></p>
<ul>
<li>🌟 CPython 3.12 wheels are now built by default - without the
CIBW_PRERELEASE_PYTHONS flag. It's time to build and upload these wheels
to PyPI! This release includes CPython 3.12.0rc1, which is guaranteed to
be ABI compatible with the final release. (<a
href="https://redirect.github.com/pypa/cibuildwheel/issues/1565">#1565</a>)</li>
<li>✨ Adds musllinux_1_2 support - this allows packagers to build for
musl-based Linux distributions on a more recent Alpine image, and a
newer musl libc. (<a
href="https://redirect.github.com/pypa/cibuildwheel/issues/1561">#1561</a>)</li>
</ul>
<h3>v2.14.1</h3>
<p><em>15 July 2023</em></p>
<ul>
<li>🛠 Updates the prerelease CPython 3.12 version to 3.12.0b4 (<a
href="https://redirect.github.com/pypa/cibuildwheel/issues/1550">#1550</a>)</li>
</ul>
<h3>v2.14.0</h3>
<p><em>10 July 2023</em></p>
<ul>
<li>✨ Adds support for building PyPy 3.10 wheels. (<a
href="https://redirect.github.com/pypa/cibuildwheel/issues/1525">#1525</a>)</li>
<li>🛠 Updates the prerelease CPython 3.12 version to 3.12.0b3.</li>
<li>✨ Allow the use of the <code>{wheel}</code> placeholder in
CIBW_TEST_COMMAND (<a
href="https://redirect.github.com/pypa/cibuildwheel/issues/1533">#1533</a>)</li>
<li>📚 Docs &amp; examples updates (<a
href="https://redirect.github.com/pypa/cibuildwheel/issues/1532">#1532</a>,
<a
href="https://redirect.github.com/pypa/cibuildwheel/issues/1416">#1416</a>)</li>
<li>⚠️ Removed support for running cibuildwheel in Python 3.7. Python
3.7 is EOL. However, cibuildwheel continues to build Python 3.7 wheels
for the moment. (<a
href="https://redirect.github.com/pypa/cibuildwheel/issues/1175">#1175</a>)</li>
</ul>
<h3>v2.13.1</h3>
<p><em>10 June 2023</em></p>
<ul>
<li>🛠 Updates the prerelease CPython 3.12 version to 3.12.0b2. (<a
href="https://redirect.github.com/pypa/cibuildwheel/issues/1516">#1516</a>)</li>
<li>🛠 Adds a moving <code>v&lt;major&gt;.&lt;minor&gt;</code> tag for
use in GitHub Actions workflow files. If you use this, you'll get the
latest patch release within a minor version. Additionally, Dependabot
won't send you PRs for patch releases. (<a
href="https://redirect.github.com/pypa/cibuildwheel/issues/1517">#1517</a>)</li>
</ul>
<h3>v2.13.0</h3>
<!-- raw HTML omitted -->
</blockquote>
<p>... (truncated)</p>
</details>
<details>
<summary>Commits</summary>
<ul>
<li><a
href="https://github.com/pypa/cibuildwheel/commit/a873dd9cbf9e3c4c73a1fd11ac31cf835f6eb502"><code>a873dd9</code></a>
Bump version: v2.16.0</li>
<li><a
href="https://github.com/pypa/cibuildwheel/commit/e8ba0d49edd2845a1a46395921609f1b7a194bbf"><code>e8ba0d4</code></a>
Merge pull request <a
href="https://redirect.github.com/pypa/cibuildwheel/issues/1615">#1615</a>
from pypa/dependabot/github_actions/docker/setup-qem...</li>
<li><a
href="https://github.com/pypa/cibuildwheel/commit/f0feaffbaabd508d48bb83b5d46b83cac7107181"><code>f0feaff</code></a>
Merge pull request <a
href="https://redirect.github.com/pypa/cibuildwheel/issues/1613">#1613</a>
from henryiii/henryiii/fix/mainif</li>
<li><a
href="https://github.com/pypa/cibuildwheel/commit/80a54b0226a8a4cc643bc24a968570871dd84364"><code>80a54b0</code></a>
Merge pull request <a
href="https://redirect.github.com/pypa/cibuildwheel/issues/1589">#1589</a>
from dalcinl/source_date_epoch</li>
<li><a
href="https://github.com/pypa/cibuildwheel/commit/76dba0b9ba3b5143ff833d8414b023ecf2ce8a90"><code>76dba0b</code></a>
Merge pull request <a
href="https://redirect.github.com/pypa/cibuildwheel/issues/1588">#1588</a>
from pypa/frontend-flags</li>
<li><a
href="https://github.com/pypa/cibuildwheel/commit/0954ffaa6586fffcdc720ab2b788ec1abcdf3481"><code>0954ffa</code></a>
Merge pull request <a
href="https://redirect.github.com/pypa/cibuildwheel/issues/1618">#1618</a>
from pypa/rtd-update</li>
<li><a
href="https://github.com/pypa/cibuildwheel/commit/753cbd1ca526543d317f0678fb5bc16018ed5ee9"><code>753cbd1</code></a>
Update RtD config to include mandatory build.os option</li>
<li><a
href="https://github.com/pypa/cibuildwheel/commit/825d89818ab1c7ca0bb9b9781b0b9beb74925b6d"><code>825d898</code></a>
Merge pull request <a
href="https://redirect.github.com/pypa/cibuildwheel/issues/1614">#1614</a>
from henryiii/henryiii/chore/minor</li>
<li><a
href="https://github.com/pypa/cibuildwheel/commit/f5e60d647fe700ba6e357c30376e8a48f91e5974"><code>f5e60d6</code></a>
fix: include examples too</li>
<li><a
href="https://github.com/pypa/cibuildwheel/commit/adc991c47b1cb56b6de3bb052686f8081791b21b"><code>adc991c</code></a>
[Bot] Update dependencies (<a
href="https://redirect.github.com/pypa/cibuildwheel/issues/1604">#1604</a>)</li>
<li>Additional commits viewable in <a
href="https://github.com/pypa/cibuildwheel/compare/v2.15...v2.16">compare
view</a></li>
</ul>
</details>
<br />


[![Dependabot compatibility
score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=pypa/cibuildwheel&package-manager=github_actions&previous-version=2.15&new-version=2.16)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores)

Dependabot will resolve any conflicts with this PR as long as you don't
alter it yourself. You can also trigger a rebase manually by commenting
`@dependabot rebase`.

[//]: # (dependabot-automerge-start)
[//]: # (dependabot-automerge-end)

---

<details>
<summary>Dependabot commands and options</summary>
<br />

You can trigger Dependabot actions by commenting on this PR:
- `@dependabot rebase` will rebase this PR
- `@dependabot recreate` will recreate this PR, overwriting any edits
that have been made to it
- `@dependabot merge` will merge this PR after your CI passes on it
- `@dependabot squash and merge` will squash and merge this PR after
your CI passes on it
- `@dependabot cancel merge` will cancel a previously requested merge
and block automerging
- `@dependabot reopen` will reopen this PR if it is closed
- `@dependabot close` will close this PR and stop Dependabot recreating
it. You can achieve the same result by closing it manually
- `@dependabot show <dependency name> ignore conditions` will show all
of the ignore conditions of the specified dependency
- `@dependabot ignore this major version` will close this PR and stop
Dependabot creating any more for this major version (unless you reopen
the PR or upgrade to it yourself)
- `@dependabot ignore this minor version` will close this PR and stop
Dependabot creating any more for this minor version (unless you reopen
the PR or upgrade to it yourself)
- `@dependabot ignore this dependency` will close this PR and stop
Dependabot creating any more for this dependency (unless you reopen the
PR or upgrade to it yourself)


</details>

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
---
 .github/workflows/build_wheel.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/build_wheel.yml b/.github/workflows/build_wheel.yml
index 2dcec8c0bd..22ed3f31d0 100644
--- a/.github/workflows/build_wheel.yml
+++ b/.github/workflows/build_wheel.yml
@@ -63,7 +63,7 @@ jobs:
         name: Setup QEMU
         if: matrix.platform_id == 'manylinux_aarch64' && matrix.os == 'ubuntu-latest'
       - name: Build wheels
-        uses: pypa/cibuildwheel@v2.15
+        uses: pypa/cibuildwheel@v2.16
         env:
           CIBW_BUILD_VERBOSITY: 1
           CIBW_ARCHS: all

From 73a536277b6bb294a8fcdb09a50fc806bb19b95e Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Tue, 26 Sep 2023 07:29:48 +0000
Subject: [PATCH 62/63] [pre-commit.ci] pre-commit autoupdate (#2867)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

<!--pre-commit.ci start-->
updates:
- [github.com/astral-sh/ruff-pre-commit: v0.0.290 →
v0.0.291](https://github.com/astral-sh/ruff-pre-commit/compare/v0.0.290...v0.0.291)
<!--pre-commit.ci end-->

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
---
 .pre-commit-config.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 7ea4915f6e..dc1d5e99eb 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -29,7 +29,7 @@ repos:
       files: \.py$
 -   repo: https://github.com/astral-sh/ruff-pre-commit
     # Ruff version.
-    rev: v0.0.290
+    rev: v0.0.291
     hooks:
     - id: ruff
       args: ["--fix"]

From 218ff455d739800a68abbc8395599f13c3ebdaa5 Mon Sep 17 00:00:00 2001
From: Jinzhe Zeng <jinzhe.zeng@rutgers.edu>
Date: Tue, 26 Sep 2023 21:05:08 -0400
Subject: [PATCH 63/63] download cub using CMake FetchContent (#2870)

Many people suggest not using `git submodule`. CMake FetchContent might
be a better option (which we have used for googletest and lammps).

(This only takes effect for CUDA versions below 11; CUDA 11 already includes cub.)

Signed-off-by: Jinzhe Zeng <jinzhe.zeng@rutgers.edu>
---
 .github/workflows/build_cc.yml     |  2 --
 .github/workflows/build_wheel.yml  |  3 ---
 .gitmodules                        |  3 ---
 doc/install/install-from-source.md |  3 +--
 source/lib/src/gpu/CMakeLists.txt  | 12 +++++++++++-
 source/lib/src/gpu/cub             |  1 -
 6 files changed, 12 insertions(+), 12 deletions(-)
 delete mode 100644 .gitmodules
 delete mode 160000 source/lib/src/gpu/cub

diff --git a/.github/workflows/build_cc.yml b/.github/workflows/build_cc.yml
index 55a5a5c4d8..964a11ce37 100644
--- a/.github/workflows/build_cc.yml
+++ b/.github/workflows/build_cc.yml
@@ -21,8 +21,6 @@ jobs:
           dp_variant: clang
     steps:
     - uses: actions/checkout@v4
-      with:
-        submodules: true
     - uses: actions/setup-python@v4
       with:
         python-version: '3.11'
diff --git a/.github/workflows/build_wheel.yml b/.github/workflows/build_wheel.yml
index 22ed3f31d0..98360e41e4 100644
--- a/.github/workflows/build_wheel.yml
+++ b/.github/workflows/build_wheel.yml
@@ -56,7 +56,6 @@ jobs:
     steps:
       - uses: actions/checkout@v4
         with:
-          submodules: true
           # https://github.com/pypa/setuptools_scm/issues/480
           fetch-depth: 0
       - uses: docker/setup-qemu-action@v3
@@ -77,8 +76,6 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - uses: actions/checkout@v4
-        with:
-          submodules: true
       - uses: actions/setup-python@v4
         name: Install Python
         with:
diff --git a/.gitmodules b/.gitmodules
deleted file mode 100644
index 849b21ced5..0000000000
--- a/.gitmodules
+++ /dev/null
@@ -1,3 +0,0 @@
-[submodule "source/lib/src/gpu/cub"]
-	path = source/lib/src/gpu/cub
-	url = https://github.com/NVIDIA/cub.git
diff --git a/doc/install/install-from-source.md b/doc/install/install-from-source.md
index ef3f5d9c36..4f94b9c793 100644
--- a/doc/install/install-from-source.md
+++ b/doc/install/install-from-source.md
@@ -5,9 +5,8 @@ Please follow our [GitHub](https://github.com/deepmodeling/deepmd-kit) webpage t
 Or get the DeePMD-kit source code by `git clone`
 ```bash
 cd /some/workspace
-git clone --recursive https://github.com/deepmodeling/deepmd-kit.git deepmd-kit
+git clone https://github.com/deepmodeling/deepmd-kit.git deepmd-kit
 ```
-The `--recursive` option clones all [submodules](https://git-scm.com/book/en/v2/Git-Tools-Submodules) needed by DeePMD-kit.
 
 For convenience, you may want to record the location of the source to a variable, saying `deepmd_source_dir` by
 ```bash
diff --git a/source/lib/src/gpu/CMakeLists.txt b/source/lib/src/gpu/CMakeLists.txt
index 4b491a312d..c78da978a2 100644
--- a/source/lib/src/gpu/CMakeLists.txt
+++ b/source/lib/src/gpu/CMakeLists.txt
@@ -26,7 +26,17 @@ if(USE_CUDA_TOOLKIT)
   # cub has been included in CUDA Toolkit 11, we do not need to include it any
   # more see https://github.com/NVIDIA/cub
   if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_LESS "11")
-    include_directories(cub)
+    include(FetchContent)
+    FetchContent_Declare(
+      cub_download
+      GIT_REPOSITORY https://github.com/NVIDIA/cub
+      GIT_TAG b229817e3963fc942c7cc2c61715a6b2b2c49bed)
+    FetchContent_GetProperties(cub_download)
+    if(NOT cub_download_POPULATED)
+      FetchContent_Populate(cub_download)
+      set(CUB_SOURCE_ROOT ${cub_download_SOURCE_DIR})
+    endif()
+    include_directories(${CUB_SOURCE_ROOT})
   endif()
   if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_LESS "9")
     message(FATAL_ERROR "CUDA version must be >= 9.0")
diff --git a/source/lib/src/gpu/cub b/source/lib/src/gpu/cub
deleted file mode 160000
index b229817e39..0000000000
--- a/source/lib/src/gpu/cub
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit b229817e3963fc942c7cc2c61715a6b2b2c49bed