From d8b392163ee7f0230d60a963c817efe0787b5194 Mon Sep 17 00:00:00 2001 From: Lu Date: Tue, 8 Sep 2020 01:12:14 +0800 Subject: [PATCH 01/65] device independence for deepmd-kit 1. device independence for deepmd-kit 2. fix some potential bugs with cuda code 3. replace concat with gemm to calculate the descriptor after the embedding net --- source/CMakeLists.txt | 2 +- source/lib/include/CustomeOperation.h | 369 ++++++++++++++++++++++++++ source/lib/include/NNPInter.h | 6 - source/lib/include/common.h | 20 ++ source/lib/src/NNPInter.cc | 341 +++--------------------- source/lib/src/common.cc | 130 +++++++++ source/op/cuda/descrpt_se_a.cu | 315 +++++++++++++++------- source/op/cuda/gelu.cu | 12 +- source/op/cuda/prod_force_se_a.cu | 70 +++-- source/op/cuda/prod_virial_se_a.cu | 69 +++-- source/op/descrpt_se_a_gpu.cc | 236 +++++++++------- source/op/gelu_gpu.cc | 90 +++++-- source/op/prod_force_se_a_gpu.cc | 93 +++---- source/op/prod_virial_se_a_gpu.cc | 97 +++---- source/train/DescrptSeA.py | 12 +- 15 files changed, 1192 insertions(+), 670 deletions(-) create mode 100644 source/lib/include/CustomeOperation.h diff --git a/source/CMakeLists.txt b/source/CMakeLists.txt index 84c3d326da..dc35ee5dc0 100644 --- a/source/CMakeLists.txt +++ b/source/CMakeLists.txt @@ -125,7 +125,7 @@ else() endif() endif() if (USE_CUDA_TOOLKIT) - add_definitions("-DUSE_CUDA_TOOLKIT") + add_definitions("-D GOOGLE_CUDA") endif() # define USE_TTM diff --git a/source/lib/include/CustomeOperation.h b/source/lib/include/CustomeOperation.h new file mode 100644 index 0000000000..ef8b450db4 --- /dev/null +++ b/source/lib/include/CustomeOperation.h @@ -0,0 +1,369 @@ +#pragma once +#include +#include +#include +#include +#include +#include "MathUtilities.h" + +#if GOOGLE_CUDA +#include +#define cudaErrcheck(res) {cudaAssert((res), __FILE__, __LINE__);} +inline void cudaAssert(cudaError_t code, const char *file, int line, bool abort=true) { + if (code != cudaSuccess) { + fprintf(stderr,"cuda assert: %s %s %d\n", cudaGetErrorString(code), file, line); + if (abort) exit(code); + } +} +#endif // GOOGLE_CUDA + +struct NeighborInfo { + int type; + double dist; + int index; + NeighborInfo () : type (0), dist(0), index(0) {} + NeighborInfo (int tt, double dd, int ii) : type (tt), dist(dd), index(ii) {} + + bool operator < (const NeighborInfo & b) const { + return (type < b.type || (type == b.type && (dist < b.dist || (dist == b.dist && index < b.index)))); + } +}; + +template +inline void spline5_switch ( + T & vv, + T & dd, + const T & xx, + const float & rmin, + const float & rmax) +{ + if (xx < rmin) { + dd = 0; + vv = 1; + } + else if (xx < rmax) { + T uu = (xx - rmin) / (rmax - rmin) ; + T du = 1. 
/ (rmax - rmin) ; + vv = uu*uu*uu * (-6 * uu*uu + 15 * uu - 10) + 1; + dd = ( 3 * uu*uu * (-6 * uu*uu + 15 * uu - 10) + uu*uu*uu * (-12 * uu + 15) ) * du; + } + else { + dd = 0; + vv = 0; + } +} + +template +int format_nlist_fill_se_a_cpu ( + vector & fmt_nei_idx_a, + const vector & posi, + const int & ntypes, + const vector & type, + const int & i_idx, + const vector & nei_idx_a, + const float & rcut, + const vector & sec_a) +{ + fmt_nei_idx_a.resize (sec_a.back()); + fill (fmt_nei_idx_a.begin(), fmt_nei_idx_a.end(), -1); + + // gether all neighbors + std::vector nei_idx (nei_idx_a); + // allocate the information for all neighbors + vector sel_nei; + sel_nei.reserve (nei_idx_a.size()); + for (unsigned kk = 0; kk < nei_idx.size(); ++kk) { + T diff[3]; + const int & j_idx = nei_idx[kk]; + for (int dd = 0; dd < 3; ++dd) { + diff[dd] = posi[j_idx * 3 + dd] - posi[i_idx * 3 + dd]; + } + T rr = sqrt(MathUtilities::dot (diff, diff)); + if (rr <= rcut) { + sel_nei.push_back(NeighborInfo(type[j_idx], rr, j_idx)); + } + } + sort(sel_nei.begin(), sel_nei.end()); + + std::vector nei_iter = sec_a; + int overflowed = -1; + for (unsigned kk = 0; kk < sel_nei.size(); ++kk) { + const int & nei_type = sel_nei[kk].type; + if (nei_iter[nei_type] < sec_a[nei_type+1]) { + fmt_nei_idx_a[nei_iter[nei_type] ++] = sel_nei[kk].index; + } + } + return overflowed; +} + +template +void compute_descriptor_se_a_cpu ( + vector & descrpt_a, + vector & descrpt_a_deriv, + vector & rij_a, + const vector & posi, + const int & ntypes, + const vector & type, + const int & i_idx, + const vector & fmt_nlist_a, + const vector & sec_a, + const float & rmin, + const float & rmax) +{ + // compute the diff of the neighbors + rij_a.resize (sec_a.back() * 3); + fill (rij_a.begin(), rij_a.end(), 0.0); + for (int ii = 0; ii < int(sec_a.size()) - 1; ++ii) { + for (int jj = sec_a[ii]; jj < sec_a[ii + 1]; ++jj) { + if (fmt_nlist_a[jj] < 0) break; + const int & j_idx = fmt_nlist_a[jj]; + + for (int dd = 0; dd < 3; ++dd) { + rij_a[jj * 3 + dd] = posi[j_idx * 3 + dd] - posi[i_idx * 3 + dd]; + } + } + } + // 1./rr, cos(theta), cos(phi), sin(phi) + descrpt_a.resize (sec_a.back() * 4); + fill (descrpt_a.begin(), descrpt_a.end(), 0.0); + // deriv wrt center: 3 + descrpt_a_deriv.resize (sec_a.back() * 4 * 3); + fill (descrpt_a_deriv.begin(), descrpt_a_deriv.end(), 0.0); + + for (int sec_iter = 0; sec_iter < int(sec_a.size()) - 1; ++sec_iter) { + for (int nei_iter = sec_a[sec_iter]; nei_iter < sec_a[sec_iter+1]; ++nei_iter) { + if (fmt_nlist_a[nei_iter] < 0) break; + const T * rr = &rij_a[nei_iter * 3]; + T nr2 = MathUtilities::dot(rr, rr); + T inr = 1./sqrt(nr2); + T nr = nr2 * inr; + T inr2 = inr * inr; + T inr4 = inr2 * inr2; + T inr3 = inr4 * nr; + T sw, dsw; + spline5_switch(sw, dsw, nr, rmin, rmax); + int idx_deriv = nei_iter * 4 * 3; // 4 components time 3 directions + int idx_value = nei_iter * 4; // 4 components + // 4 value components + descrpt_a[idx_value + 0] = 1./nr; + descrpt_a[idx_value + 1] = rr[0] / nr2; + descrpt_a[idx_value + 2] = rr[1] / nr2; + descrpt_a[idx_value + 3] = rr[2] / nr2; + // deriv of component 1/r + descrpt_a_deriv[idx_deriv + 0] = rr[0] * inr3 * sw - descrpt_a[idx_value + 0] * dsw * rr[0] * inr; + descrpt_a_deriv[idx_deriv + 1] = rr[1] * inr3 * sw - descrpt_a[idx_value + 0] * dsw * rr[1] * inr; + descrpt_a_deriv[idx_deriv + 2] = rr[2] * inr3 * sw - descrpt_a[idx_value + 0] * dsw * rr[2] * inr; + // deriv of component x/r2 + descrpt_a_deriv[idx_deriv + 3] = (2. 
* rr[0] * rr[0] * inr4 - inr2) * sw - descrpt_a[idx_value + 1] * dsw * rr[0] * inr; + descrpt_a_deriv[idx_deriv + 4] = (2. * rr[0] * rr[1] * inr4 ) * sw - descrpt_a[idx_value + 1] * dsw * rr[1] * inr; + descrpt_a_deriv[idx_deriv + 5] = (2. * rr[0] * rr[2] * inr4 ) * sw - descrpt_a[idx_value + 1] * dsw * rr[2] * inr; + // deriv of component y/r2 + descrpt_a_deriv[idx_deriv + 6] = (2. * rr[1] * rr[0] * inr4 ) * sw - descrpt_a[idx_value + 2] * dsw * rr[0] * inr; + descrpt_a_deriv[idx_deriv + 7] = (2. * rr[1] * rr[1] * inr4 - inr2) * sw - descrpt_a[idx_value + 2] * dsw * rr[1] * inr; + descrpt_a_deriv[idx_deriv + 8] = (2. * rr[1] * rr[2] * inr4 ) * sw - descrpt_a[idx_value + 2] * dsw * rr[2] * inr; + // deriv of component z/r2 + descrpt_a_deriv[idx_deriv + 9] = (2. * rr[2] * rr[0] * inr4 ) * sw - descrpt_a[idx_value + 3] * dsw * rr[0] * inr; + descrpt_a_deriv[idx_deriv +10] = (2. * rr[2] * rr[1] * inr4 ) * sw - descrpt_a[idx_value + 3] * dsw * rr[1] * inr; + descrpt_a_deriv[idx_deriv +11] = (2. * rr[2] * rr[2] * inr4 - inr2) * sw - descrpt_a[idx_value + 3] * dsw * rr[2] * inr; + // 4 value components + descrpt_a[idx_value + 0] *= sw; + descrpt_a[idx_value + 1] *= sw; + descrpt_a[idx_value + 2] *= sw; + descrpt_a[idx_value + 3] *= sw; + } + } +} + +template +void DescrptSeACPULauncher(const T * coord, const int * type, const int * ilist, const int * jrange, const int * jlist, const T * avg, const T * std, T * descrpt, T * descrpt_deriv, T * rij, int * nlist, const int nloc, const int nall, const int nnei, const int ntypes, const int ndescrpt, const float rcut_r, const float rcut_r_smth, const std::vector sec_a, const bool fill_nei_a, const int magic_number) { + // set & normalize coord + std::vector d_coord3(nall * 3); + for (int ii = 0; ii < nall; ++ii) { + for (int dd = 0; dd < 3; ++dd) { + d_coord3[ii * 3 + dd] = coord[ii * 3 + dd]; + } + } + + // set type + std::vector d_type (nall); + for (int ii = 0; ii < nall; ++ii) { + d_type[ii] = type[ii]; + } + + // build nlist + std::vector > d_nlist_a(nloc); + + for (unsigned ii = 0; ii < nloc; ++ii) { + d_nlist_a.reserve (jrange[nloc] / nloc + 10); + } + for (unsigned ii = 0; ii < nloc; ++ii) { + int i_idx = ilist[ii]; + for (unsigned jj = jrange[ii]; jj < jrange[ii+1]; ++jj) { + int j_idx = jlist[jj]; + d_nlist_a[i_idx].push_back (j_idx); + } + } + + #pragma omp parallel for + for (int ii = 0; ii < nloc; ++ii) { + vector fmt_nlist_a; + int ret = -1; + if (fill_nei_a) { + format_nlist_fill_se_a_cpu(fmt_nlist_a, d_coord3, ntypes, d_type, ii, d_nlist_a[ii], rcut_r, sec_a); + } + std::vector d_descrpt_a; + std::vector d_descrpt_a_deriv; + std::vector d_descrpt_r; + std::vector d_descrpt_r_deriv; + std::vector d_rij_a; + compute_descriptor_se_a_cpu (d_descrpt_a, d_descrpt_a_deriv, d_rij_a, d_coord3, ntypes, d_type, ii, fmt_nlist_a, sec_a, rcut_r_smth, rcut_r); + + // check sizes + assert (d_descrpt_a.size() == ndescrpt); + assert (d_descrpt_a_deriv.size() == ndescrpt * 3); + assert (d_rij_a.size() == nnei * 3); + assert (fmt_nlist_a.size() == nnei); + // record outputs + for (int jj = 0; jj < ndescrpt; ++jj) { + descrpt[ii * ndescrpt + jj] = (d_descrpt_a[jj] - avg[d_type[ii] * ndescrpt + jj]) / std[d_type[ii] * ndescrpt + jj]; + } + for (int jj = 0; jj < ndescrpt * 3; ++jj) { + descrpt_deriv[ii * ndescrpt * 3 + jj] = d_descrpt_a_deriv[jj] / std[d_type[ii] * ndescrpt + jj / 3]; + } + for (int jj = 0; jj < nnei * 3; ++jj) { + rij[ii * nnei * 3 + jj] = d_rij_a[jj]; + } + for (int jj = 0; jj < nnei; ++jj) { + nlist[ii * nnei + jj] = fmt_nlist_a[jj]; + } + 
} + + #if DEBUG + std::fstream fout1("nlist.txt", std::ios::out); + fout1 << "tensor nlist, length:\t" << nloc << ",\twidth:\t" << nnei << std::endl; + for (int ii = 0; ii < nloc; ii++) { + for (int jj = 0; jj < nnei; jj++) { + fout1 << "nlist[" << ii << "][" << jj << "]:\t" << nlist[ii * nnei + jj] << std::endl; + } + } + fout1.close(); + + std::fstream fout2("rij.txt", std::ios::out); + fout2 << "tensor rij, length:\t" << nloc << ",\twidth:\t" << nnei * 3 << std::endl; + for (int ii = 0; ii < nloc; ii++) { + for (int jj = 0; jj < nnei * 3; jj++) { + fout2 << "rij[" << ii << "][" << jj << "]:\t" << rij[ii * nnei * 3 + jj] << std::endl; + } + } + fout2.close(); + + std::fstream fout3("descrpt.txt", std::ios::out); + fout3 << "tensor descrpt, length:\t" << nloc << ",\twidth:\t" << ndescrpt << std::endl; + for (int ii = 0; ii < nloc; ii++) { + for (int jj = 0; jj < ndescrpt; jj++) { + fout3 << "descrpt[" << ii << "][" << jj << "]:\t" << descrpt[ii * ndescrpt + jj] << std::endl; + } + } + fout3.close(); + #endif // DEBUG +} + +extern void DescrptSeAGPUExecuteLauncher(const float * coord, const int * type, const int * ilist, const int * jrange, const int * jlist, int * array_int, unsigned long long * array_longlong, const float * avg, const float * std, float * descrpt, float * descrpt_deriv, float * rij, int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const float rcut_r, const float rcut_r_smth, const std::vector sec_a, const bool fill_nei_a, const int magic_number); +extern void DescrptSeAGPUExecuteLauncher(const double * coord, const int * type, const int * ilist, const int * jrange, const int * jlist, int * array_int, unsigned long long * array_longlong, const double * avg, const double * std, double * descrpt, double * descrpt_deriv, double * rij, int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const float rcut_r, const float rcut_r_smth, const std::vector sec_a, const bool fill_nei_a, const int magic_number); + +template +void DescrptSeAGPULauncher(const T * coord, const int * type, const int * ilist, const int * jrange, const int * jlist, int * array_int, unsigned long long * array_longlong, const T * avg, const T * std, T * descrpt, T * descrpt_deriv, T * rij, int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const float rcut_r, const float rcut_r_smth, const std::vector sec_a, const bool fill_nei_a, const int magic_number) { + DescrptSeAGPUExecuteLauncher(coord, type, ilist, jrange, jlist, array_int, array_longlong, avg, std, descrpt, descrpt_deriv, rij, nlist, nloc, nall, nnei, ndescrpt, rcut_r, rcut_r_smth, sec_a, fill_nei_a, magic_number); +} + +// ****************************************************************************** +// end of custome op DescrptSeA +// ****************************************************************************** + +inline void make_descript_range (int & idx_start, int & idx_end, const int & nei_idx, const int& n_a_sel, const int n_a_shift) { + if (nei_idx < n_a_sel) { + idx_start = nei_idx * 4; + idx_end = nei_idx * 4 + 4; + } + else { + idx_start = n_a_shift + (nei_idx - n_a_sel); + idx_end = n_a_shift + (nei_idx - n_a_sel) + 1; + } +} + +template +void ProdForceSeACPULauncher(T * force, const T * net_deriv, const T * in_deriv, const int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const int n_a_sel, const int n_a_shift) { + memset(force, 0.0, sizeof(T) * nall * 3); + // compute force of a frame + for (int i_idx = 0; i_idx < 
nloc; ++i_idx) { + // deriv wrt center atom + for (int aa = 0; aa < ndescrpt; ++aa) { + force[i_idx * 3 + 0] -= net_deriv[i_idx * ndescrpt + aa] * in_deriv[i_idx * ndescrpt * 3 + aa * 3 + 0]; + force[i_idx * 3 + 1] -= net_deriv[i_idx * ndescrpt + aa] * in_deriv[i_idx * ndescrpt * 3 + aa * 3 + 1]; + force[i_idx * 3 + 2] -= net_deriv[i_idx * ndescrpt + aa] * in_deriv[i_idx * ndescrpt * 3 + aa * 3 + 2]; + } + // deriv wrt neighbors + for (int jj = 0; jj < nnei; ++jj) { + int j_idx = nlist[i_idx * nnei + jj]; + if (j_idx < 0) continue; + int aa_start, aa_end; + make_descript_range (aa_start, aa_end, jj, n_a_sel, n_a_shift); + for (int aa = aa_start; aa < aa_end; ++aa) { + force[j_idx * 3 + 0] += net_deriv[i_idx * ndescrpt + aa] * in_deriv[i_idx * ndescrpt * 3 + aa * 3 + 0]; + force[j_idx * 3 + 1] += net_deriv[i_idx * ndescrpt + aa] * in_deriv[i_idx * ndescrpt * 3 + aa * 3 + 1]; + force[j_idx * 3 + 2] += net_deriv[i_idx * ndescrpt + aa] * in_deriv[i_idx * ndescrpt * 3 + aa * 3 + 2]; + } + } + } +} + +extern void ProdForceSeAGPUExecuteLauncher(float * force, const float * net_derive, const float * in_deriv, const int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const int n_a_sel, const int n_a_shift); +extern void ProdForceSeAGPUExecuteLauncher(double * force, const double * net_derive, const double * in_deriv, const int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const int n_a_sel, const int n_a_shift); + + +template +void ProdForceSeAGPULauncher(T * force, const T * net_deriv, const T * in_deriv, const int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const int n_a_sel, const int n_a_shift) { + ProdForceSeAGPUExecuteLauncher(force, net_deriv, in_deriv, nlist, nloc, nall, nnei, ndescrpt, n_a_sel, n_a_shift); +} + +// ****************************************************************************** +// end of custome op ProdForceSeA +// ****************************************************************************** + +template +void ProdVirialSeACPULauncher(T * virial, T * atom_virial, const T * net_deriv, const T * in_deriv, const T * rij, const int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const int n_a_sel, const int n_a_shift) { + memset(virial, 0.0, sizeof(T) * 9); + memset(atom_virial, 0.0, sizeof(T) * nall * 9); + + // compute virial of a frame + for (int i_idx = 0; i_idx < nloc; ++i_idx) { + // deriv wrt neighbors + for (int jj = 0; jj < nnei; ++jj) { + int j_idx = nlist[i_idx * nnei + jj]; + if (j_idx < 0) continue; + int aa_start, aa_end; + make_descript_range (aa_start, aa_end, jj, n_a_sel, n_a_shift); + for (int aa = aa_start; aa < aa_end; ++aa) { + T pref = -1.0 * net_deriv[i_idx * ndescrpt + aa]; + for (int dd0 = 0; dd0 < 3; ++dd0) + for (int dd1 = 0; dd1 < 3; ++dd1) { + T tmp_v = pref * rij[i_idx * nnei * 3 + jj * 3 + dd1] * in_deriv[i_idx * ndescrpt * 3 + aa * 3 + dd0]; + virial[dd0 * 3 + dd1] -= tmp_v; + atom_virial[j_idx * 9 + dd0 * 3 + dd1] -= tmp_v; + } + } + } + } +} + +extern void ProdVirialSeAGPUExecuteLauncher(float * virial, float * atom_virial, const float * net_deriv, const float * in_deriv, const float * rij, const int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const int n_a_sel, const int n_a_shift); +extern void ProdVirialSeAGPUExecuteLauncher(double * virial, double * atom_virial, const double * net_deriv, const double * in_deriv, const double * rij, const int * nlist, const int nloc, const int nall, const 
int nnei, const int ndescrpt, const int n_a_sel, const int n_a_shift); + +template +void ProdVirialSeAGPULauncher(T * virial, T * atom_virial, const T * net_deriv, const T * in_deriv, const T * rij, const int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const int n_a_sel, const int n_a_shift) { + ProdVirialSeAGPUExecuteLauncher(virial, atom_virial, net_deriv, in_deriv, rij, nlist, nloc, nall, nnei, ndescrpt, n_a_sel, n_a_shift); +} + +// ****************************************************************************** +// end of custome op ProdVirialSeA +// ****************************************************************************** diff --git a/source/lib/include/NNPInter.h b/source/lib/include/NNPInter.h index a32940a738..6c37770758 100644 --- a/source/lib/include/NNPInter.h +++ b/source/lib/include/NNPInter.h @@ -98,9 +98,6 @@ class NNPInter // function used for neighbor list copy vector get_sel_a() const; -#ifdef USE_CUDA_TOOLKIT - void update_nbor(const InternalNeighborList & nlist, const int nloc); -#endif }; class NNPInterModelDevi @@ -195,9 +192,6 @@ class NNPInterModelDevi // function used for nborlist copy vector > get_sel() const; void cum_sum(const std::vector > n_sel); -#ifdef USE_CUDA_TOOLKIT - void update_nbor(const InternalNeighborList & nlist, const int nloc); -#endif }; diff --git a/source/lib/include/common.h b/source/lib/include/common.h index 3912f21f7f..4874274305 100644 --- a/source/lib/include/common.h +++ b/source/lib/include/common.h @@ -8,11 +8,17 @@ using namespace tensorflow; using namespace std; +#include +#include #include "NNPAtomMap.h" #include +#include +#include #include "version.h" +using CPUDevice = Eigen::ThreadPoolDevice; +using GPUDevice = Eigen::GpuDevice; #ifdef HIGH_PREC typedef double VALUETYPE; typedef double ENERGYTYPE; @@ -122,6 +128,20 @@ session_input_tensors (std::vector> & input_tensors, const int nghost = 0, const string scope = ""); +int +session_input_tensors (std::vector> & input_tensors, + const vector & dcoord_, + const int & ntypes, + const vector & datype_, + const vector & dbox, + InternalNeighborList & dlist, + const vector & fparam_, + const vector & aparam_, + const NNPAtomMap& nnpmap, + const int nghost, + const int ago, + const string scope = ""); + int session_input_tensors (std::vector> & input_tensors, const vector & dcoord_, diff --git a/source/lib/src/NNPInter.cc b/source/lib/src/NNPInter.cc index aea9d48c9b..b262851450 100644 --- a/source/lib/src/NNPInter.cc +++ b/source/lib/src/NNPInter.cc @@ -4,11 +4,8 @@ #include -#ifdef USE_CUDA_TOOLKIT +#if GOOGLE_CUDA #include "cuda_runtime.h" -#include -#include -#include #define cudaErrcheck(res) { cudaAssert((res), __FILE__, __LINE__); } inline void cudaAssert(cudaError_t code, const char *file, int line, bool abort=true) @@ -57,7 +54,6 @@ run_model (ENERGYTYPE & dener, return; } -#ifdef USE_CUDA_TOOLKIT std::vector output_tensors; checkStatus (session->Run(input_tensors, {"o_energy", "o_force", "o_atom_virial"}, @@ -74,56 +70,23 @@ run_model (ENERGYTYPE & dener, dener = oe(0); vector dforce (3 * nall); - vector datom_virial (9 * nall); dvirial.resize (9); for (unsigned ii = 0; ii < nall * 3; ++ii){ dforce[ii] = of(ii); } - for (int ii = 0; ii < nall * 9; ++ii) { - datom_virial[ii] = oav(ii); - } for (int ii = 0; ii < nall; ++ii) { - dvirial[0] += 1.0 * datom_virial[9*ii+0]; - dvirial[1] += 1.0 * datom_virial[9*ii+1]; - dvirial[2] += 1.0 * datom_virial[9*ii+2]; - dvirial[3] += 1.0 * datom_virial[9*ii+3]; - dvirial[4] += 1.0 * datom_virial[9*ii+4]; 
- dvirial[5] += 1.0 * datom_virial[9*ii+5]; - dvirial[6] += 1.0 * datom_virial[9*ii+6]; - dvirial[7] += 1.0 * datom_virial[9*ii+7]; - dvirial[8] += 1.0 * datom_virial[9*ii+8]; - } - - dforce_ = dforce; - nnpmap.backward (dforce_.begin(), dforce.begin(), 3); -#else - std::vector output_tensors; - - checkStatus (session->Run(input_tensors, - {"o_energy", "o_force", "o_virial"}, - {}, - &output_tensors)); - - Tensor output_e = output_tensors[0]; - Tensor output_f = output_tensors[1]; - Tensor output_v = output_tensors[2]; - - auto oe = output_e.flat (); - auto of = output_f.flat (); - auto ov = output_v.flat (); - - dener = oe(0); - vector dforce (3 * nall); - dvirial.resize (9); - for (unsigned ii = 0; ii < nall * 3; ++ii){ - dforce[ii] = of(ii); - } - for (unsigned ii = 0; ii < 9; ++ii){ - dvirial[ii] = ov(ii); + dvirial[0] += 1.0 * oav(9*ii+0); + dvirial[1] += 1.0 * oav(9*ii+1); + dvirial[2] += 1.0 * oav(9*ii+2); + dvirial[3] += 1.0 * oav(9*ii+3); + dvirial[4] += 1.0 * oav(9*ii+4); + dvirial[5] += 1.0 * oav(9*ii+5); + dvirial[6] += 1.0 * oav(9*ii+6); + dvirial[7] += 1.0 * oav(9*ii+7); + dvirial[8] += 1.0 * oav(9*ii+8); } dforce_ = dforce; nnpmap.backward (dforce_.begin(), dforce.begin(), 3); -#endif } static void run_model (ENERGYTYPE & dener, @@ -155,7 +118,6 @@ static void run_model (ENERGYTYPE & dener, fill(datom_virial_.begin(), datom_virial_.end(), 0.0); return; } -#ifdef USE_CUDA_TOOLKIT std::vector output_tensors; checkStatus (session->Run(input_tensors, @@ -204,50 +166,6 @@ static void run_model (ENERGYTYPE & dener, nnpmap.backward (dforce_.begin(), dforce.begin(), 3); nnpmap.backward (datom_energy_.begin(), datom_energy.begin(), 1); nnpmap.backward (datom_virial_.begin(), datom_virial.begin(), 9); -#else - std::vector output_tensors; - - checkStatus (session->Run(input_tensors, - {"o_energy", "o_force", "o_virial", "o_atom_energy", "o_atom_virial"}, - {}, - &output_tensors)); - - Tensor output_e = output_tensors[0]; - Tensor output_f = output_tensors[1]; - Tensor output_v = output_tensors[2]; - Tensor output_ae = output_tensors[3]; - Tensor output_av = output_tensors[4]; - - auto oe = output_e.flat (); - auto of = output_f.flat (); - auto ov = output_v.flat (); - auto oae = output_ae.flat (); - auto oav = output_av.flat (); - - dener = oe(0); - vector dforce (3 * nall); - vector datom_energy (nall, 0); - vector datom_virial (9 * nall); - dvirial.resize (9); - for (int ii = 0; ii < nall * 3; ++ii) { - dforce[ii] = of(ii); - } - for (int ii = 0; ii < nloc; ++ii) { - datom_energy[ii] = oae(ii); - } - for (int ii = 0; ii < nall * 9; ++ii) { - datom_virial[ii] = oav(ii); - } - for (int ii = 0; ii < 9; ++ii) { - dvirial[ii] = ov(ii); - } - dforce_ = dforce; - datom_energy_ = datom_energy; - datom_virial_ = datom_virial; - nnpmap.backward (dforce_.begin(), dforce.begin(), 3); - nnpmap.backward (datom_energy_.begin(), datom_energy.begin(), 1); - nnpmap.backward (datom_virial_.begin(), datom_virial.begin(), 9); -#endif } @@ -266,50 +184,8 @@ NNPInter (const string & model, const int & gpu_rank) init(model, gpu_rank); } -NNPInter::~NNPInter() { - #ifdef USE_CUDA_TOOLKIT - if (init_nbor) { - cudaErrcheck(cudaFree(ilist)); - cudaErrcheck(cudaFree(jrange)); - cudaErrcheck(cudaFree(jlist)); - } - #endif -} - -#ifdef USE_CUDA_TOOLKIT -void NNPInter::update_nbor(const InternalNeighborList & nlist, const int nloc) { - if (!init_nbor) { - cudaErrcheck(cudaMalloc((void**)&ilist, sizeof(int) * nlist.ilist.size())); - cudaErrcheck(cudaMalloc((void**)&jrange, sizeof(int) * nlist.jrange.size())); - 
cudaErrcheck(cudaMalloc((void**)&jlist, sizeof(int) * nlist.jlist.size())); - ilist_size = nlist.ilist.size(); - jrange_size = nlist.jrange.size(); - jlist_size = nlist.jlist.size(); - init_nbor = true; - } - if (ilist_size < nlist.ilist.size()) { - cudaErrcheck(cudaFree(ilist)); - cudaErrcheck(cudaMalloc((void**)&ilist, sizeof(int) * nlist.ilist.size())); - ilist_size = nlist.ilist.size(); - } - if (jrange_size < nlist.jrange.size()) { - cudaErrcheck(cudaFree(jrange)); - cudaErrcheck(cudaMalloc((void**)&jrange, sizeof(int) * nlist.jrange.size())); - jrange_size = nlist.jrange.size(); - } - if (jlist_size < nlist.jlist.size()) { - cudaErrcheck(cudaFree(jlist)); - cudaErrcheck(cudaMalloc((void**)&jlist, sizeof(int) * nlist.jlist.size())); - jlist_size = nlist.jlist.size(); - } - - cudaErrcheck(cudaMemcpy(ilist, &nlist.ilist[0], sizeof(int) * nlist.ilist.size(), cudaMemcpyHostToDevice)); - cudaErrcheck(cudaMemcpy(jrange, &nlist.jrange[0], sizeof(int) * nlist.jrange.size(), cudaMemcpyHostToDevice)); - cudaErrcheck(cudaMemcpy(jlist, &nlist.jlist[0], sizeof(int) * nlist.jlist.size(), cudaMemcpyHostToDevice)); -} -#endif // USE_CUDA_TOOLKIT +NNPInter::~NNPInter() {} -#ifdef USE_CUDA_TOOLKIT void NNPInter:: init (const string & model, const int & gpu_rank) @@ -318,21 +194,21 @@ init (const string & model, const int & gpu_rank) SessionOptions options; options.config.set_inter_op_parallelism_threads(num_inter_nthreads); options.config.set_intra_op_parallelism_threads(num_intra_nthreads); - options.config.set_allow_soft_placement(true); - options.config.mutable_gpu_options()->set_per_process_gpu_memory_fraction(0.9); - options.config.mutable_gpu_options()->set_allow_growth(true); checkStatus (ReadBinaryProto(Env::Default(), model, &graph_def)); int gpu_num = -1; - cudaGetDeviceCount(&gpu_num); - // std::cout << "current number of devices: " << gpu_num << std::endl; - // set device to GPU only when at least GPU is found + #if GOOGLE_CUDA + cudaGetDeviceCount(&gpu_num); // check current device environment if (gpu_num > 0) { + options.config.set_allow_soft_placement(true); + options.config.mutable_gpu_options()->set_per_process_gpu_memory_fraction(0.9); + options.config.mutable_gpu_options()->set_allow_growth(true); + cudaErrcheck(cudaSetDevice(gpu_rank)); std::string str = "/gpu:"; str += std::to_string(gpu_rank % gpu_num); graph::SetDefaultDevice(str, &graph_def); - // std::cout << "current device rank: " << str << std::endl; } + #endif // GOOGLE_CUDA checkStatus (NewSession(options, &session)); checkStatus (session->Create(graph_def)); rcut = get_scalar("descrpt_attr/rcut"); @@ -340,8 +216,6 @@ init (const string & model, const int & gpu_rank) ntypes = get_scalar("descrpt_attr/ntypes"); dfparam = get_scalar("fitting_attr/dfparam"); daparam = get_scalar("fitting_attr/daparam"); - // assert(rcut == get_rcut()); - // assert(ntypes == get_ntypes()); if (dfparam < 0) dfparam = 0; if (daparam < 0) daparam = 0; inited = true; @@ -350,38 +224,6 @@ init (const string & model, const int & gpu_rank) ilist = NULL; jrange = NULL; jlist = NULL; ilist_size = 0; jrange_size = 0; jlist_size = 0; } -#else -void -NNPInter:: -init (const string & model, const int & gpu_rank) -{ - assert (!inited); - SessionOptions options; - options.config.set_inter_op_parallelism_threads(num_inter_nthreads); - options.config.set_intra_op_parallelism_threads(num_intra_nthreads); - checkStatus (NewSession(options, &session)); - checkStatus (ReadBinaryProto(Env::Default(), model, &graph_def)); - checkStatus (session->Create(graph_def)); 
- rcut = get_scalar("descrpt_attr/rcut"); - cell_size = rcut; - ntypes = get_scalar("descrpt_attr/ntypes"); - dfparam = get_scalar("fitting_attr/dfparam"); - daparam = get_scalar("fitting_attr/daparam"); - // assert(rcut == get_rcut()); - // assert(ntypes == get_ntypes()); - if (dfparam < 0) dfparam = 0; - if (daparam < 0) daparam = 0; - // rcut = get_rcut(); - // cell_size = rcut; - // ntypes = get_ntypes(); - // dfparam = get_dfparam(); - inited = true; - - init_nbor = false; - ilist = NULL; jrange = NULL; jlist = NULL; - ilist_size = 0; jrange_size = 0; jlist_size = 0; -} -#endif void NNPInter:: @@ -554,17 +396,10 @@ compute_inner (ENERGYTYPE & dener, nnpmap = NNPAtomMap (datype_.begin(), datype_.begin() + nloc); assert (nloc == nnpmap.get_type().size()); - shuffle_nlist (nlist, nnpmap); - #ifdef USE_CUDA_TOOLKIT - update_nbor(nlist, nloc); - #endif + shuffle_nlist (nlist, nnpmap); } - #ifdef USE_CUDA_TOOLKIT - int ret = session_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, ilist, jrange, jlist, fparam, aparam, nnpmap, nghost); - #else - int ret = session_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, nlist, fparam, aparam, nnpmap, nghost); - #endif + int ret = session_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, nlist, fparam, aparam, nnpmap, nghost, ago); assert (nloc == ret); run_model (dener, dforce_, dvirial, session, input_tensors, nnpmap, nghost); } @@ -622,16 +457,9 @@ compute (ENERGYTYPE & dener, // InternalNeighborList nlist; convert_nlist_lmp_internal (nlist, lmp_list); shuffle_nlist (nlist, nnpmap); - #ifdef USE_CUDA_TOOLKIT - update_nbor(nlist, nloc); - #endif } - #ifdef USE_CUDA_TOOLKIT - int ret = session_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, ilist, jrange, jlist, fparam, aparam, nnpmap, nghost); - #else - int ret = session_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, nlist, fparam, aparam, nnpmap, nghost); - #endif + int ret = session_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, nlist, fparam, aparam, nnpmap, nghost, ago); assert (nloc == ret); run_model (dener, dforce_, dvirial, datom_energy_, datom_virial_, session, input_tensors, nnpmap, nghost); } @@ -663,17 +491,8 @@ NNPInterModelDevi (const vector & models, const int & gpu_rank) init(models, gpu_rank); } -NNPInterModelDevi::~NNPInterModelDevi() { -#ifdef USE_CUDA_TOOLKIT - if (init_nbor) { - cudaErrcheck(cudaFree(ilist)); - cudaErrcheck(cudaFree(jrange)); - cudaErrcheck(cudaFree(jlist)); - } -#endif -} +NNPInterModelDevi::~NNPInterModelDevi() {} -#ifdef USE_CUDA_TOOLKIT void NNPInterModelDevi:: init (const vector & models, const int & gpu_rank) @@ -682,26 +501,32 @@ init (const vector & models, const int & gpu_rank) numb_models = models.size(); sessions.resize(numb_models); graph_defs.resize(numb_models); + + int gpu_num = -1; + #if GOOGLE_CUDA + cudaGetDeviceCount(&gpu_num); + #endif // GOOGLE_CUDA + SessionOptions options; options.config.set_inter_op_parallelism_threads(num_inter_nthreads); options.config.set_intra_op_parallelism_threads(num_intra_nthreads); - options.config.set_allow_soft_placement(true); - options.config.mutable_gpu_options()->set_per_process_gpu_memory_fraction(0.9); - options.config.mutable_gpu_options()->set_allow_growth(true); - for (unsigned ii = 0; ii < numb_models; ++ii){ checkStatus (ReadBinaryProto(Env::Default(), models[ii], &graph_defs[ii])); } - int gpu_num = -1; - cudaGetDeviceCount(&gpu_num); - // std::cout << "current number of devices: " << gpu_num << std::endl; - for 
(unsigned ii = 0; ii < numb_models; ++ii){ - // set device to GPU only when at least GPU is found + #if GOOGLE_CUDA + if (gpu_num > 0) { + options.config.set_allow_soft_placement(true); + options.config.mutable_gpu_options()->set_per_process_gpu_memory_fraction(0.9); + options.config.mutable_gpu_options()->set_allow_growth(true); + cudaErrcheck(cudaSetDevice(gpu_rank)); + } + #endif // GOOGLE_CUDA + + for (unsigned ii = 0; ii < numb_models; ++ii) { if (gpu_num > 0) { std::string str = "/gpu:"; str += std::to_string(gpu_rank % gpu_num); graph::SetDefaultDevice(str, &graph_defs[ii]); - // std::cout << "current device rank: " << str << std::endl; } checkStatus (NewSession(options, &(sessions[ii]))); checkStatus (sessions[ii]->Create(graph_defs[ii])); @@ -722,40 +547,6 @@ init (const vector & models, const int & gpu_rank) ilist = NULL; jrange = NULL; jlist = NULL; ilist_size = 0; jrange_size = 0; jlist_size = 0; } -#else -void -NNPInterModelDevi:: -init (const vector & models, const int & gpu_rank) -{ - assert (!inited); - numb_models = models.size(); - sessions.resize(numb_models); - graph_defs.resize(numb_models); - SessionOptions options; - options.config.set_inter_op_parallelism_threads(num_inter_nthreads); - options.config.set_intra_op_parallelism_threads(num_intra_nthreads); - for (unsigned ii = 0; ii < numb_models; ++ii){ - checkStatus (NewSession(options, &(sessions[ii]))); - checkStatus (ReadBinaryProto(Env::Default(), models[ii], &graph_defs[ii])); - checkStatus (sessions[ii]->Create(graph_defs[ii])); - } - rcut = get_scalar("descrpt_attr/rcut"); - cell_size = rcut; - ntypes = get_scalar("descrpt_attr/ntypes"); - dfparam = get_scalar("fitting_attr/dfparam"); - daparam = get_scalar("fitting_attr/daparam"); - if (dfparam < 0) dfparam = 0; - if (daparam < 0) daparam = 0; - // rcut = get_rcut(); - // cell_size = rcut; - // ntypes = get_ntypes(); - inited = true; - - init_nbor = false; - ilist = NULL; jrange = NULL; jlist = NULL; - ilist_size = 0; jrange_size = 0; jlist_size = 0; -} -#endif template VT @@ -821,42 +612,6 @@ cum_sum (const std::vector > n_sel) } } -#ifdef USE_CUDA_TOOLKIT -void -NNPInterModelDevi:: -update_nbor(const InternalNeighborList & nlist, const int nloc) -{ - if (!init_nbor) { - cudaErrcheck(cudaMalloc((void**)&ilist, sizeof(int) * nlist.ilist.size())); - cudaErrcheck(cudaMalloc((void**)&jrange, sizeof(int) * nlist.jrange.size())); - cudaErrcheck(cudaMalloc((void**)&jlist, sizeof(int) * nlist.jlist.size())); - ilist_size = nlist.ilist.size(); - jrange_size = nlist.jrange.size(); - jlist_size = nlist.jlist.size(); - init_nbor = true; - } - if (ilist_size < nlist.ilist.size()) { - cudaErrcheck(cudaFree(ilist)); - cudaErrcheck(cudaMalloc((void**)&ilist, sizeof(int) * nlist.ilist.size())); - ilist_size = nlist.ilist.size(); - } - if (jrange_size < nlist.jrange.size()) { - cudaErrcheck(cudaFree(jrange)); - cudaErrcheck(cudaMalloc((void**)&jrange, sizeof(int) * nlist.jrange.size())); - jrange_size = nlist.jrange.size(); - } - if (jlist_size < nlist.jlist.size()) { - cudaErrcheck(cudaFree(jlist)); - cudaErrcheck(cudaMalloc((void**)&jlist, sizeof(int) * nlist.jlist.size())); - jlist_size = nlist.jlist.size(); - } - - cudaErrcheck(cudaMemcpy(ilist, &nlist.ilist[0], sizeof(int) * nlist.ilist.size(), cudaMemcpyHostToDevice)); - cudaErrcheck(cudaMemcpy(jrange, &nlist.jrange[0], sizeof(int) * nlist.jrange.size(), cudaMemcpyHostToDevice)); - cudaErrcheck(cudaMemcpy(jlist, &nlist.jlist[0], sizeof(int) * nlist.jlist.size(), cudaMemcpyHostToDevice)); -} -#endif //USE_CUDA_TOOLKIT - 
void NNPInterModelDevi:: validate_fparam_aparam(const int & nloc, @@ -946,16 +701,8 @@ compute (vector & all_energy, // InternalNeighborList nlist; convert_nlist_lmp_internal (nlist, lmp_list); shuffle_nlist (nlist, nnpmap); - #ifdef USE_CUDA_TOOLKIT - update_nbor(nlist, nloc); - #endif - } - #ifdef USE_CUDA_TOOLKIT - int ret = session_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, ilist, jrange, jlist, fparam, aparam, nnpmap, nghost); - #else - int ret = session_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, nlist, fparam, aparam, nnpmap, nghost); - #endif + int ret = session_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, nlist, fparam, aparam, nnpmap, nghost, ago); all_energy.resize (numb_models); all_force.resize (numb_models); @@ -996,16 +743,8 @@ compute (vector & all_energy, // InternalNeighborList nlist; convert_nlist_lmp_internal (nlist, lmp_list); shuffle_nlist (nlist, nnpmap); - #ifdef USE_CUDA_TOOLKIT - update_nbor(nlist, nloc); - #endif - } - #ifdef USE_CUDA_TOOLKIT - int ret = session_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, ilist, jrange, jlist, fparam, aparam, nnpmap, nghost); - #else - int ret = session_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, nlist, fparam, aparam, nnpmap, nghost); - #endif + int ret = session_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, nlist, fparam, aparam, nnpmap, nghost, ago); all_energy.resize (numb_models); all_force .resize (numb_models); diff --git a/source/lib/src/common.cc b/source/lib/src/common.cc index dd4c3f672a..d990195e6c 100644 --- a/source/lib/src/common.cc +++ b/source/lib/src/common.cc @@ -340,6 +340,136 @@ session_input_tensors (std::vector> & input_tensors, return nloc; } +int +session_input_tensors (std::vector> & input_tensors, + const vector & dcoord_, + const int & ntypes, + const vector & datype_, + const vector & dbox, + InternalNeighborList & dlist, + const vector & fparam_, + const vector & aparam_, + const NNPAtomMap& nnpmap, + const int nghost, + const int ago, + const string scope) +{ + assert (dbox.size() == 9); + + int nframes = 1; + int nall = dcoord_.size() / 3; + int nloc = nall - nghost; + assert (nall == datype_.size()); + + vector datype = nnpmap.get_type(); + vector type_count (ntypes, 0); + for (unsigned ii = 0; ii < datype.size(); ++ii){ + type_count[datype[ii]] ++; + } + datype.insert (datype.end(), datype_.begin() + nloc, datype_.end()); + + TensorShape coord_shape ; + coord_shape.AddDim (nframes); + coord_shape.AddDim (nall * 3); + TensorShape type_shape ; + type_shape.AddDim (nframes); + type_shape.AddDim (nall); + TensorShape box_shape ; + box_shape.AddDim (nframes); + box_shape.AddDim (9); + TensorShape mesh_shape ; + mesh_shape.AddDim (16); + TensorShape natoms_shape ; + natoms_shape.AddDim (2 + ntypes); + TensorShape fparam_shape ; + fparam_shape.AddDim (nframes); + fparam_shape.AddDim (fparam_.size()); + TensorShape aparam_shape ; + aparam_shape.AddDim (nframes); + aparam_shape.AddDim (aparam_.size()); + +#ifdef HIGH_PREC + Tensor coord_tensor (DT_DOUBLE, coord_shape); + Tensor box_tensor (DT_DOUBLE, box_shape); + Tensor fparam_tensor (DT_DOUBLE, fparam_shape); + Tensor aparam_tensor (DT_DOUBLE, aparam_shape); +#else + Tensor coord_tensor (DT_FLOAT, coord_shape); + Tensor box_tensor (DT_FLOAT, box_shape); + Tensor fparam_tensor (DT_FLOAT, fparam_shape); + Tensor aparam_tensor (DT_FLOAT, aparam_shape); +#endif + Tensor type_tensor (DT_INT32, type_shape); + Tensor mesh_tensor (DT_INT32, mesh_shape); + 
Tensor natoms_tensor (DT_INT32, natoms_shape); + + auto coord = coord_tensor.matrix (); + auto type = type_tensor.matrix (); + auto box = box_tensor.matrix (); + auto mesh = mesh_tensor.flat (); + auto natoms = natoms_tensor.flat (); + auto fparam = fparam_tensor.matrix (); + auto aparam = aparam_tensor.matrix (); + + vector dcoord (dcoord_); + nnpmap.forward (dcoord.begin(), dcoord_.begin(), 3); + + for (int ii = 0; ii < nframes; ++ii){ + for (int jj = 0; jj < nall * 3; ++jj){ + coord(ii, jj) = dcoord[jj]; + } + for (int jj = 0; jj < 9; ++jj){ + box(ii, jj) = dbox[jj]; + } + for (int jj = 0; jj < nall; ++jj){ + type(ii, jj) = datype[jj]; + } + for (int jj = 0; jj < fparam_.size(); ++jj){ + fparam(ii, jj) = fparam_[jj]; + } + for (int jj = 0; jj < aparam_.size(); ++jj){ + aparam(ii, jj) = aparam_[jj]; + } + } + + for (int ii = 0; ii < 16; ++ii) mesh(ii) = 0; + + const int stride = sizeof(int *) / sizeof(int); + assert (stride * sizeof(int) == sizeof(int *)); + assert (stride <= 4); + mesh (0) = ago; + mesh (1) = dlist.ilist.size(); + mesh (2) = dlist.jrange.size(); + mesh (3) = dlist.jlist.size(); + dlist.make_ptrs(); + memcpy (&mesh(4), &(dlist.pilist), sizeof(int *)); + memcpy (&mesh(8), &(dlist.pjrange), sizeof(int *)); + memcpy (&mesh(12), &(dlist.pjlist), sizeof(int *)); + + natoms (0) = nloc; + natoms (1) = nall; + for (int ii = 0; ii < ntypes; ++ii) natoms(ii+2) = type_count[ii]; + + string prefix = ""; + if (scope != ""){ + prefix = scope + "/"; + } + input_tensors = { + {prefix+"t_coord", coord_tensor}, + {prefix+"t_type", type_tensor}, + {prefix+"t_box", box_tensor}, + {prefix+"t_mesh", mesh_tensor}, + {prefix+"t_natoms",natoms_tensor}, + }; + if (fparam_.size() > 0) { + input_tensors.push_back({prefix+"t_fparam", fparam_tensor}); + } + if (aparam_.size() > 0) { + input_tensors.push_back({prefix+"t_aparam", aparam_tensor}); + } + return nloc; +} + int session_input_tensors (std::vector> & input_tensors, const vector & dcoord_, diff --git a/source/op/cuda/descrpt_se_a.cu b/source/op/cuda/descrpt_se_a.cu index 1636df8ff5..7fa84335e5 100644 --- a/source/op/cuda/descrpt_se_a.cu +++ b/source/op/cuda/descrpt_se_a.cu @@ -1,15 +1,3 @@ -/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ -#define EIGEN_USE_GPU #include #include #include @@ -18,14 +6,6 @@ limitations under the License. 
#include #include -#ifdef HIGH_PREC - typedef double VALUETYPE; -#else - typedef float VALUETYPE; -#endif - -typedef double compute_t; - typedef unsigned long long int_64; #define cudaErrcheck(res) { cudaAssert((res), __FILE__, __LINE__); } @@ -77,19 +57,20 @@ __device__ inline T dev_dot(T * arr1, T * arr2) { return arr1[0] * arr2[0] + arr1[1] * arr2[1] + arr1[2] * arr2[2]; } -__device__ inline void spline5_switch(compute_t & vv, - compute_t & dd, - compute_t & xx, - const compute_t & rmin, - const compute_t & rmax) +template +__device__ inline void spline5_switch(T & vv, + T & dd, + T & xx, + const float & rmin, + const float & rmax) { if (xx < rmin) { dd = 0; vv = 1; } else if (xx < rmax) { - compute_t uu = (xx - rmin) / (rmax - rmin) ; - compute_t du = 1. / (rmax - rmin) ; + T uu = (xx - rmin) / (rmax - rmin) ; + T du = 1. / (rmax - rmin) ; vv = uu*uu*uu * (-6 * uu*uu + 15 * uu - 10) + 1; dd = ( 3 * uu*uu * (-6 * uu*uu + 15 * uu - 10) + uu*uu*uu * (-12 * uu + 15) ) * du; } @@ -110,7 +91,8 @@ __global__ void get_i_idx_se_a(const int nloc, i_idx[ilist[idy]] = idy; } -__global__ void format_nlist_fill_a_se_a(const VALUETYPE * coord, +template +__global__ void format_nlist_fill_a_se_a(const T * coord, const int * type, const int * jrange, const int * jlist, @@ -120,8 +102,8 @@ __global__ void format_nlist_fill_a_se_a(const VALUETYPE * coord, const int MAGIC_NUMBER) { // <<>> - const unsigned int idx = blockIdx.y; - const unsigned int idy = blockIdx.x * blockDim.x + threadIdx.x; + const unsigned int idx = blockIdx.x; + const unsigned int idy = blockIdx.y * blockDim.y + threadIdx.y; const int nsize = jrange[i_idx[idx] + 1] - jrange[i_idx[idx]]; if (idy >= nsize) { @@ -133,12 +115,12 @@ __global__ void format_nlist_fill_a_se_a(const VALUETYPE * coord, int_64 * key_in = key + idx * MAGIC_NUMBER; - compute_t diff[3]; + T diff[3]; const int & j_idx = nei_idx[idy]; for (int dd = 0; dd < 3; dd++) { diff[dd] = coord[j_idx * 3 + dd] - coord[idx * 3 + dd]; } - compute_t rr = sqrt(dev_dot(diff, diff)); + T rr = sqrt(dev_dot(diff, diff)); if (rr <= rcut) { key_in[idy] = type[j_idx] * 1E15+ (int_64)(rr * 1.0E13) / 100000 * 100000 + j_idx; } @@ -180,33 +162,34 @@ __global__ void format_nlist_fill_b_se_a(int * nlist, } //it's ok! 
-__global__ void compute_descriptor_se_a (VALUETYPE* descript, +template +__global__ void compute_descriptor_se_a (T* descript, const int ndescrpt, - VALUETYPE* descript_deriv, + T* descript_deriv, const int descript_deriv_size, - VALUETYPE* rij, + T* rij, const int rij_size, const int* type, - const VALUETYPE* avg, - const VALUETYPE* std, + const T* avg, + const T* std, int* nlist, const int nlist_size, - const VALUETYPE* coord, - const VALUETYPE rmin, - const VALUETYPE rmax, + const T* coord, + const float rmin, + const float rmax, const int sec_a_size) { // <<>> - const unsigned int idx = blockIdx.y; - const unsigned int idy = blockIdx.x * blockDim.x + threadIdx.x; + const unsigned int idx = blockIdx.x; + const unsigned int idy = blockIdx.y * blockDim.y + threadIdx.y; const int idx_deriv = idy * 4 * 3; // 4 components time 3 directions const int idx_value = idy * 4; // 4 components if (idy >= sec_a_size) {return;} // else {return;} - VALUETYPE * row_descript = descript + idx * ndescrpt; - VALUETYPE * row_descript_deriv = descript_deriv + idx * descript_deriv_size; - VALUETYPE * row_rij = rij + idx * rij_size; + T * row_descript = descript + idx * ndescrpt; + T * row_descript_deriv = descript_deriv + idx * descript_deriv_size; + T * row_rij = rij + idx * rij_size; int * row_nlist = nlist + idx * nlist_size; if (row_nlist[idy] >= 0) { @@ -214,14 +197,14 @@ __global__ void compute_descriptor_se_a (VALUETYPE* descript, for (int kk = 0; kk < 3; kk++) { row_rij[idy * 3 + kk] = coord[j_idx * 3 + kk] - coord[idx * 3 + kk]; } - const compute_t * rr = &row_rij[idy * 3 + 0]; - compute_t nr2 = dev_dot(rr, rr); - compute_t inr = 1./sqrt(nr2); - compute_t nr = nr2 * inr; - compute_t inr2 = inr * inr; - compute_t inr4 = inr2 * inr2; - compute_t inr3 = inr4 * nr; - compute_t sw, dsw; + const T * rr = &row_rij[idy * 3 + 0]; + T nr2 = dev_dot(rr, rr); + T inr = 1./sqrt(nr2); + T nr = nr2 * inr; + T inr2 = inr * inr; + T inr4 = inr2 * inr2; + T inr3 = inr4 * nr; + T sw, dsw; spline5_switch(sw, dsw, nr, rmin, rmax); row_descript[idx_value + 0] = (1./nr) ;//* sw; row_descript[idx_value + 1] = (rr[0] / nr2) ;//* sw; @@ -260,8 +243,9 @@ __global__ void compute_descriptor_se_a (VALUETYPE* descript, } } +template void format_nbor_list_256 ( - const VALUETYPE* coord, + const T* coord, const int* type, const int* jrange, const int* jlist, @@ -274,9 +258,10 @@ void format_nbor_list_256 ( const int LEN = 256; const int MAGIC_NUMBER = 256; const int nblock = (MAGIC_NUMBER + LEN - 1) / LEN; - dim3 block_grid(nblock, nloc); + dim3 block_grid(nloc, nblock); + dim3 thread_grid(1, LEN); format_nlist_fill_a_se_a - <<>> ( + <<>> ( coord, type, jrange, @@ -292,8 +277,9 @@ void format_nbor_list_256 ( BlockSortKernel <<>> (key, key + nloc * MAGIC_NUMBER); } +template void format_nbor_list_512 ( - const VALUETYPE* coord, + const T* coord, const int* type, const int* jrange, const int* jlist, @@ -306,9 +292,10 @@ void format_nbor_list_512 ( const int LEN = 256; const int MAGIC_NUMBER = 512; const int nblock = (MAGIC_NUMBER + LEN - 1) / LEN; - dim3 block_grid(nblock, nloc); + dim3 block_grid(nloc, nblock); + dim3 thread_grid(1, LEN); format_nlist_fill_a_se_a - <<>> ( + <<>> ( coord, type, jrange, @@ -324,8 +311,9 @@ void format_nbor_list_512 ( BlockSortKernel <<>> (key, key + nloc * MAGIC_NUMBER); } +template void format_nbor_list_1024 ( - const VALUETYPE* coord, + const T* coord, const int* type, const int* jrange, const int* jlist, @@ -338,9 +326,10 @@ void format_nbor_list_1024 ( const int LEN = 256; const int MAGIC_NUMBER = 
1024; const int nblock = (MAGIC_NUMBER + LEN - 1) / LEN; - dim3 block_grid(nblock, nloc); + dim3 block_grid(nloc, nblock); + dim3 thread_grid(1, LEN); format_nlist_fill_a_se_a - <<>> ( + <<>> ( coord, type, jrange, @@ -356,8 +345,9 @@ void format_nbor_list_1024 ( BlockSortKernel <<>> (key, key + nloc * MAGIC_NUMBER); } +template void format_nbor_list_2048 ( - const VALUETYPE* coord, + const T* coord, const int* type, const int* jrange, const int* jlist, @@ -370,9 +360,10 @@ void format_nbor_list_2048 ( const int LEN = 256; const int MAGIC_NUMBER = 2048; const int nblock = (MAGIC_NUMBER + LEN - 1) / LEN; - dim3 block_grid(nblock, nloc); + dim3 block_grid(nloc, nblock); + dim3 thread_grid(1, LEN); format_nlist_fill_a_se_a - <<>> ( + <<>> ( coord, type, jrange, @@ -388,8 +379,9 @@ void format_nbor_list_2048 ( BlockSortKernel <<>> (key, key + nloc * MAGIC_NUMBER); } +template void format_nbor_list_4096 ( - const VALUETYPE* coord, + const T* coord, const int* type, const int* jrange, const int* jlist, @@ -402,9 +394,10 @@ void format_nbor_list_4096 ( const int LEN = 256; const int MAGIC_NUMBER = 4096; const int nblock = (MAGIC_NUMBER + LEN - 1) / LEN; - dim3 block_grid(nblock, nloc); + dim3 block_grid(nloc, nblock); + dim3 thread_grid(1, LEN); format_nlist_fill_a_se_a - <<>> ( + <<>> ( coord, type, jrange, @@ -420,26 +413,27 @@ void format_nbor_list_4096 ( BlockSortKernel <<>> (key, key + nloc * MAGIC_NUMBER); } -void DescrptSeALauncher(const VALUETYPE* coord, - const int* type, - const int* ilist, - const int* jrange, - const int* jlist, - int* array_int, - unsigned long long* array_longlong, - const VALUETYPE* avg, - const VALUETYPE* std, - VALUETYPE* descript, - VALUETYPE* descript_deriv, - VALUETYPE* rij, - int* nlist, - const int& nloc, - const int& nnei, - const float& rcut_r, - const float& rcut_r_smth, - const int& ndescrpt, - const std::vector& sec_a, - const bool& fill_nei_a, +void DescrptSeAGPUExecuteLauncher(const float * coord, + const int * type, + const int * ilist, + const int * jrange, + const int * jlist, + int * array_int, + unsigned long long * array_longlong, + const float * avg, + const float * std, + float * descript, + float * descript_deriv, + float * rij, + int * nlist, + const int nloc, + const int nall, + const int nnei, + const int ndescrpt, + const float rcut_r, + const float rcut_r_smth, + const std::vector sec_a, + const bool fill_nei_a, const int MAGIC_NUMBER ) { @@ -454,8 +448,144 @@ void DescrptSeALauncher(const VALUETYPE* coord, res = cudaMemcpy(sec_a_dev, &sec_a[0], sizeof(int) * sec_a.size(), cudaMemcpyHostToDevice); cudaErrcheck(res); res = cudaMemset(key, 0xffffffff, sizeof(int_64) * nloc * MAGIC_NUMBER); cudaErrcheck(res); res = cudaMemset(nlist, -1, sizeof(int) * nloc * nnei); cudaErrcheck(res); - res = cudaMemset(descript, 0.0, sizeof(VALUETYPE) * nloc * ndescrpt); cudaErrcheck(res); - res = cudaMemset(descript_deriv, 0.0, sizeof(VALUETYPE) * nloc * ndescrpt * 3); cudaErrcheck(res); + res = cudaMemset(descript, 0.0, sizeof(float) * nloc * ndescrpt); cudaErrcheck(res); + res = cudaMemset(descript_deriv, 0.0, sizeof(float) * nloc * ndescrpt * 3); cudaErrcheck(res); + + if (fill_nei_a) { + // ~~~ + // cudaProfilerStart(); + get_i_idx_se_a<<>> (nloc, ilist, i_idx); + + if (nnei <= 256) { + format_nbor_list_256 ( + coord, + type, + jrange, + jlist, + nloc, + rcut_r, + i_idx, + key + ); + } else if (nnei <= 512) { + format_nbor_list_512 ( + coord, + type, + jrange, + jlist, + nloc, + rcut_r, + i_idx, + key + ); + } else if (nnei <= 1024) { + 
format_nbor_list_1024 ( + coord, + type, + jrange, + jlist, + nloc, + rcut_r, + i_idx, + key + ); + } else if (nnei <= 2048) { + format_nbor_list_2048 ( + coord, + type, + jrange, + jlist, + nloc, + rcut_r, + i_idx, + key + ); + } else if (nnei <= 4096) { + format_nbor_list_4096 ( + coord, + type, + jrange, + jlist, + nloc, + rcut_r, + i_idx, + key + ); + } + + format_nlist_fill_b_se_a<<>> ( + nlist, + nnei, + nloc, + jrange, + jlist, + key, + sec_a_dev, + sec_a.size(), + nei_iter, + MAGIC_NUMBER + ); + } + + const int nblock_ = (sec_a.back() + LEN -1) / LEN; + dim3 block_grid(nloc, nblock_); + dim3 thread_grid(1, LEN); + compute_descriptor_se_a<<>> ( + descript, + ndescrpt, + descript_deriv, + ndescrpt * 3, + rij, + nnei * 3, + type, + avg, + std, + nlist, + nnei, + coord, + rcut_r_smth, + rcut_r, + sec_a.back() + ); +} + +void DescrptSeAGPUExecuteLauncher(const double * coord, + const int * type, + const int * ilist, + const int * jrange, + const int * jlist, + int * array_int, + unsigned long long * array_longlong, + const double * avg, + const double * std, + double * descript, + double * descript_deriv, + double * rij, + int * nlist, + const int nloc, + const int nall, + const int nnei, + const int ndescrpt, + const float rcut_r, + const float rcut_r_smth, + const std::vector sec_a, + const bool fill_nei_a, + const int MAGIC_NUMBER +) +{ + const int LEN = 256; + int nblock = (nloc + LEN -1) / LEN; + int * sec_a_dev = array_int; + int * nei_iter = array_int + sec_a.size(); // = new int[sec_a_size]; + int * i_idx = array_int + sec_a.size() + nloc * sec_a.size(); + int_64 * key = array_longlong; + + cudaError_t res = cudaSuccess; + res = cudaMemcpy(sec_a_dev, &sec_a[0], sizeof(int) * sec_a.size(), cudaMemcpyHostToDevice); cudaErrcheck(res); + res = cudaMemset(key, 0xffffffff, sizeof(int_64) * nloc * MAGIC_NUMBER); cudaErrcheck(res); + res = cudaMemset(nlist, -1, sizeof(int) * nloc * nnei); cudaErrcheck(res); + res = cudaMemset(descript, 0.0, sizeof(double) * nloc * ndescrpt); cudaErrcheck(res); + res = cudaMemset(descript_deriv, 0.0, sizeof(double) * nloc * ndescrpt * 3); cudaErrcheck(res); if (fill_nei_a) { // ~~~ @@ -534,8 +664,9 @@ void DescrptSeALauncher(const VALUETYPE* coord, } const int nblock_ = (sec_a.back() + LEN -1) / LEN; - dim3 block_grid(nblock_, nloc); - compute_descriptor_se_a<<>> ( + dim3 block_grid(nloc, nblock_); + dim3 thread_grid(1, LEN); + compute_descriptor_se_a<<>> ( descript, ndescrpt, descript_deriv, diff --git a/source/op/cuda/gelu.cu b/source/op/cuda/gelu.cu index 99b7b1aed4..e16a285034 100644 --- a/source/op/cuda/gelu.cu +++ b/source/op/cuda/gelu.cu @@ -34,42 +34,42 @@ __global__ void gelu_grad_grad(const T * dy, const T * dy_, const T * in, T * ou } -void GeluLauncher(const float * in, float * out, int const size) { +void GeluGPULauncher(const float * in, float * out, int const size) { int const THREAD_ITEMS = 1024; int const BLOCK_NUMS = (size + THREAD_ITEMS - 1) / THREAD_ITEMS; gelu<<>>(in, out, size); } -void GeluLauncher(const double * in, double * out, int const size) { +void GeluGPULauncher(const double * in, double * out, int const size) { int const THREAD_ITEMS = 1024; int const BLOCK_NUMS = (size + THREAD_ITEMS - 1) / THREAD_ITEMS; gelu<<>>(in, out, size); } -void GeluGradLauncher(const float * dy, const float * in, float * out, int const size) { +void GeluGradGPULauncher(const float * dy, const float * in, float * out, int const size) { int const THREAD_ITEMS = 1024; int const BLOCK_NUMS = (size + THREAD_ITEMS - 1) / THREAD_ITEMS; gelu_grad<<>>(dy, 
in, out, size); } -void GeluGradLauncher(const double * dy, const double * in, double * out, int const size) { +void GeluGradGPULauncher(const double * dy, const double * in, double * out, int const size) { int const THREAD_ITEMS = 1024; int const BLOCK_NUMS = (size + THREAD_ITEMS - 1) / THREAD_ITEMS; gelu_grad<<>>(dy, in, out, size); } -void GeluGradGradLauncher(const float * dy, const float * dy_, const float * in, float * out, int const size) { +void GeluGradGradGPULauncher(const float * dy, const float * dy_, const float * in, float * out, int const size) { int const THREAD_ITEMS = 1024; int const BLOCK_NUMS = (size + THREAD_ITEMS - 1) / THREAD_ITEMS; gelu_grad_grad<<>>(dy, dy_, in, out, size); } -void GeluGradGradLauncher(const double * dy, const double * dy_, const double * in, double * out, int const size) { +void GeluGradGradGPULauncher(const double * dy, const double * dy_, const double * in, double * out, int const size) { int const THREAD_ITEMS = 1024; int const BLOCK_NUMS = (size + THREAD_ITEMS - 1) / THREAD_ITEMS; diff --git a/source/op/cuda/prod_force_se_a.cu b/source/op/cuda/prod_force_se_a.cu index 080ff8ef75..926f30604b 100644 --- a/source/op/cuda/prod_force_se_a.cu +++ b/source/op/cuda/prod_force_se_a.cu @@ -2,14 +2,8 @@ #include #include -#ifdef HIGH_PREC - typedef double VALUETYPE; -#else - typedef float VALUETYPE; -#endif - #define cudaErrcheck(res) { cudaAssert((res), __FILE__, __LINE__); } -inline void cudaAssert(cudaError_t code, const char *file, int line, bool abort=true) +inline void cudaAssert(cudaError_t code, const char *file, int line, bool abort=true) { if (code != cudaSuccess) { @@ -32,25 +26,25 @@ static __inline__ __device__ double atomicAdd(double* address, double val) { } #endif -__global__ void deriv_wrt_center_atom_se_a(VALUETYPE * force, - const VALUETYPE * net_deriv, - const VALUETYPE * in_deriv, +template +__global__ void deriv_wrt_center_atom_se_a(T * force, + const T * net_deriv, + const T * in_deriv, const int ndescrpt) { - const unsigned int idx = blockIdx.y; - const unsigned int idy = blockIdx.x * blockDim.x + threadIdx.x; - const unsigned int idz = threadIdx.y; + const unsigned int idx = blockIdx.x; + const unsigned int idy = blockIdx.y * blockDim.y + threadIdx.y; + const unsigned int idz = threadIdx.x; - if (idy >= ndescrpt) { - return; - } + if (idy >= ndescrpt) {return;} atomicAdd(force + idx * 3 + idz, -1.0 * net_deriv[idx * ndescrpt + idy] * in_deriv[idx * ndescrpt * 3 + idy * 3 + idz]); } -__global__ void deriv_wrt_neighbors_se_a(VALUETYPE * force, - const VALUETYPE * net_deriv, - const VALUETYPE * in_deriv, +template +__global__ void deriv_wrt_neighbors_se_a(T * force, + const T * net_deriv, + const T * in_deriv, const int * nlist, const int nloc, const int nnei, @@ -75,23 +69,49 @@ __global__ void deriv_wrt_neighbors_se_a(VALUETYPE * force, atomicAdd(force + j_idx * 3 + idz, net_deriv[idx * ndescrpt + idy * 4 + idw] * in_deriv[idx * ndescrpt * 3 + (idy * 4 + idw) * 3 + idz]); } -void ProdForceSeALauncher(VALUETYPE * force, - const VALUETYPE * net_deriv, - const VALUETYPE * in_deriv, +void ProdForceSeAGPUExecuteLauncher(float * force, + const float * net_deriv, + const float * in_deriv, const int * nlist, const int nloc, const int nall, + const int nnei, const int ndescrpt, + const int n_a_sel, + const int n_a_shift) +{ + // std::cout << "I'm here!" 
<< std::endl; + cudaErrcheck(cudaMemset(force, 0.0, sizeof(float) * nall * 3)); + const int LEN1 = 256; + const int nblock1 = (ndescrpt + LEN1 -1) / LEN1; + dim3 grid(nloc, nblock1); + dim3 thread(3, LEN1); + deriv_wrt_center_atom_se_a<<>>(force, net_deriv, in_deriv, ndescrpt); + + const int LEN = 64; + int nblock = (nloc + LEN -1) / LEN; + dim3 block_grid(nblock, nnei); + dim3 thread_grid(LEN, 3, 4); + deriv_wrt_neighbors_se_a<<>>(force, net_deriv, in_deriv, nlist, nloc, nnei, ndescrpt, n_a_sel, n_a_shift); +} + +void ProdForceSeAGPUExecuteLauncher(double * force, + const double * net_deriv, + const double * in_deriv, + const int * nlist, + const int nloc, + const int nall, const int nnei, + const int ndescrpt, const int n_a_sel, const int n_a_shift) { // std::cout << "I'm here!" << std::endl; - cudaErrcheck(cudaMemset(force, 0.0, sizeof(VALUETYPE) * nall * 3)); + cudaErrcheck(cudaMemset(force, 0.0, sizeof(double) * nall * 3)); const int LEN1 = 256; const int nblock1 = (ndescrpt + LEN1 -1) / LEN1; - dim3 grid(nblock1, nloc); - dim3 thread(LEN1, 3); + dim3 grid(nloc, nblock1); + dim3 thread(3, LEN1); deriv_wrt_center_atom_se_a<<>>(force, net_deriv, in_deriv, ndescrpt); const int LEN = 64; diff --git a/source/op/cuda/prod_virial_se_a.cu b/source/op/cuda/prod_virial_se_a.cu index 241e2b7e06..d2524a3c81 100644 --- a/source/op/cuda/prod_virial_se_a.cu +++ b/source/op/cuda/prod_virial_se_a.cu @@ -1,14 +1,6 @@ #include #include -#define MUL 512 - -#ifdef HIGH_PREC - typedef double VALUETYPE; -#else - typedef float VALUETYPE; -#endif - #define cudaErrcheck(res) { cudaAssert((res), __FILE__, __LINE__); } inline void cudaAssert(cudaError_t code, const char *file, int line, bool abort=true) { if (code != cudaSuccess) { @@ -31,11 +23,12 @@ static __inline__ __device__ double atomicAdd(double* address, double val) { } #endif -__global__ void deriv_wrt_neighbors_se_a(VALUETYPE * virial, - VALUETYPE * atom_virial, - const VALUETYPE * net_deriv, - const VALUETYPE * in_deriv, - const VALUETYPE * rij, +template +__global__ void deriv_wrt_neighbors_se_a(T * virial, + T * atom_virial, + const T * net_deriv, + const T * in_deriv, + const T * rij, const int * nlist, const int nloc, const int nnei, @@ -64,11 +57,47 @@ __global__ void deriv_wrt_neighbors_se_a(VALUETYPE * virial, atomicAdd(atom_virial + j_idx * 9 + idz, net_deriv[idx * ndescrpt + idy * 4 + idw] * rij[idx * nnei * 3 + idy * 3 + idz % 3] * in_deriv[idx * ndescrpt * 3 + (idy * 4 + idw) * 3 + idz / 3]); } -void ProdVirialSeALauncher(VALUETYPE * virial, - VALUETYPE * atom_virial, - const VALUETYPE * net_deriv, - const VALUETYPE * in_deriv, - const VALUETYPE * rij, +void ProdVirialSeAGPUExecuteLauncher(float * virial, + float * atom_virial, + const float * net_deriv, + const float * in_deriv, + const float * rij, + const int * nlist, + const int nloc, + const int nall, + const int nnei, + const int ndescrpt, + const int n_a_sel, + const int n_a_shift) +{ + cudaErrcheck(cudaMemset(virial, 0.0, sizeof(float) * 9)); + cudaErrcheck(cudaMemset(atom_virial, 0.0, sizeof(float) * 9 * nall)); + + const int LEN = 16; + int nblock = (nloc + LEN -1) / LEN; + dim3 block_grid(nblock, nnei); + dim3 thread_grid(LEN, 9, 4); + // compute virial of a frame + deriv_wrt_neighbors_se_a<<>>( + virial, + atom_virial, + net_deriv, + in_deriv, + rij, + nlist, + nloc, + nnei, + ndescrpt, + n_a_sel, + n_a_shift + ); +} + +void ProdVirialSeAGPUExecuteLauncher(double * virial, + double * atom_virial, + const double * net_deriv, + const double * in_deriv, + const double * rij, const 
int * nlist, const int nloc, const int nall, @@ -77,8 +106,8 @@ void ProdVirialSeALauncher(VALUETYPE * virial, const int n_a_sel, const int n_a_shift) { - cudaErrcheck(cudaMemset(virial, 0.0, sizeof(VALUETYPE) * 9)); - cudaErrcheck(cudaMemset(atom_virial, 0.0, sizeof(VALUETYPE) * 9 * nall)); + cudaErrcheck(cudaMemset(virial, 0.0, sizeof(double) * 9)); + cudaErrcheck(cudaMemset(atom_virial, 0.0, sizeof(double) * 9 * nall)); const int LEN = 16; int nblock = (nloc + LEN -1) / LEN; diff --git a/source/op/descrpt_se_a_gpu.cc b/source/op/descrpt_se_a_gpu.cc index fd5ae632cb..9c52a62474 100644 --- a/source/op/descrpt_se_a_gpu.cc +++ b/source/op/descrpt_se_a_gpu.cc @@ -1,32 +1,5 @@ -#include -#include -#include -#include -#include "tensorflow/core/framework/op.h" -#include "tensorflow/core/framework/op_kernel.h" -#include "tensorflow/core/framework/shape_inference.h" - -using namespace tensorflow; // NOLINT(build/namespaces) - -#ifdef HIGH_PREC - typedef double VALUETYPE ; -#else - typedef float VALUETYPE ; -#endif - -typedef double compute_t; - -#define cudaErrcheck(res) { cudaAssert((res), __FILE__, __LINE__); } -inline void cudaAssert(cudaError_t code, const char *file, int line, bool abort=true) -{ - if (code != cudaSuccess) - { - fprintf(stderr,"cuda assert: %s %s %d\n", cudaGetErrorString(code), file, line); - if (abort) exit(code); - } -} - -using GPUDevice = Eigen::GpuDevice; +#include "common.h" +#include "CustomeOperation.h" #ifdef HIGH_PREC REGISTER_OP("DescrptSeA") @@ -85,29 +58,32 @@ int get_magic_number(int const nnei) { } } -void DescrptSeALauncher(const VALUETYPE* coord, - const int* type, - const int* ilist, - const int* jrange, - const int* jlist, - int* array_int, - unsigned long long* array_longlong, - const VALUETYPE* avg, - const VALUETYPE* std, - VALUETYPE* descript, - VALUETYPE* descript_deriv, - VALUETYPE* rij, - int* nlist, - const int& nloc, - const int& nnei, - const float& rcut_r, - const float& rcut_r_smth, - const int& ndescrpt, - const std::vector& sec_a, - const bool& fill_nei_a, - const int MAGIC_NUMBER -); +template +struct DeviceFunctor { + void operator()(const CPUDevice& d, std::string& device) { + device = "CPU"; + } + #if GOOGLE_CUDA + void operator()(const GPUDevice& d, std::string& device) { + device = "GPU"; + } + #endif // GOOGLE_CUDA +}; + +template +struct DescrptSeAFunctor { + void operator()(const CPUDevice& d, const T * coord, const int * type, const int * mesh, const int * ilist, const int * jrange, const int * jlist, int * array_int, unsigned long long * array_longlong, const T * avg, const T * std, T * descrpt, T * descrpt_deriv, T * rij, int * nlist, const int nloc, const int nall, const int nnei, const int ntypes, const int ndescrpt, const float rcut_r, const float rcut_r_smth, const std::vector sec_a, const bool fill_nei_a, const int magic_number) { + DescrptSeACPULauncher(coord, type, ilist, jrange, jlist, avg, std, descrpt, descrpt_deriv, rij, nlist, nloc, nall, nnei, ntypes, ndescrpt, rcut_r, rcut_r_smth, sec_a, fill_nei_a, magic_number); + } + + #if GOOGLE_CUDA + void operator()(const GPUDevice& d, const T * coord, const int * type, const int * mesh, const int * ilist, const int * jrange, const int * jlist, int * array_int, unsigned long long * array_longlong, const T * avg, const T * std, T * descrpt, T * descrpt_deriv, T * rij, int * nlist, const int nloc, const int nall, const int nnei, const int ntypes, const int ndescrpt, const float rcut_r, const float rcut_r_smth, const std::vector sec_a, const bool fill_nei_a, const int 
magic_number) { + DescrptSeAGPULauncher(coord, type, ilist, jrange, jlist, array_int, array_longlong, avg, std, descrpt, descrpt_deriv, rij, nlist, nloc, nall, nnei, ndescrpt, rcut_r, rcut_r_smth, sec_a, fill_nei_a, magic_number); + } + #endif // GOOGLE_CUDA +}; +template class DescrptSeAOp : public OpKernel { public: explicit DescrptSeAOp(OpKernelConstruction* context) : OpKernel(context) { @@ -154,8 +130,12 @@ class DescrptSeAOp : public OpKernel { OP_REQUIRES (context, (natoms_tensor.shape().dim_size(0) >= 3), errors::InvalidArgument ("number of atoms should be larger than (or equal to) 3")); - int * natoms = new int[natoms_tensor.shape().dim_size(0)]; - cudaErrcheck(cudaMemcpy(natoms, natoms_tensor.flat().data(), sizeof(int) * natoms_tensor.shape().dim_size(0), cudaMemcpyDeviceToHost)); + DeviceFunctor() ( + context->eigen_device(), + device + ); + + const int * natoms = natoms_tensor.flat().data(); int nloc = natoms[0]; int nall = natoms[1]; int ntypes = natoms_tensor.shape().dim_size(0) - 2; //nloc and nall mean something. @@ -209,51 +189,55 @@ class DescrptSeAOp : public OpKernel { nlist_shape, &nlist_tensor)); - // allocate temp memory, temp memory must not be used after this operation! - Tensor int_temp; - TensorShape int_shape; - int_shape.AddDim(sec_a.size() + nloc * sec_a.size() + nloc); - OP_REQUIRES_OK(context, context->allocate_temp(DT_INT32, int_shape, &int_temp)); - - Tensor uint64_temp; - TensorShape uint64_shape; - uint64_shape.AddDim(nloc * magic_number * 2); - OP_REQUIRES_OK(context, context->allocate_temp(DT_UINT64, uint64_shape, &uint64_temp)); + if(device == "GPU") { + // allocate temp memory, temp memory must not be used after this operation! + Tensor int_temp; + TensorShape int_shape; + int_shape.AddDim(sec_a.size() + nloc * sec_a.size() + nloc); + OP_REQUIRES_OK(context, context->allocate_temp(DT_INT32, int_shape, &int_temp)); + Tensor uint64_temp; + TensorShape uint64_shape; + uint64_shape.AddDim(nloc * magic_number * 2); + OP_REQUIRES_OK(context, context->allocate_temp(DT_UINT64, uint64_shape, &uint64_temp)); - int * ilist = NULL, * jrange = NULL, * jlist = NULL; - int * array_int = int_temp.flat().data(); - unsigned long long * array_longlong = uint64_temp.flat().data(); - cudaErrcheck(cudaMemcpy(&(ilist), 4 + mesh_tensor.flat().data(), sizeof(int *), cudaMemcpyDeviceToHost)); - cudaErrcheck(cudaMemcpy(&(jrange), 8 + mesh_tensor.flat().data(), sizeof(int *), cudaMemcpyDeviceToHost)); - cudaErrcheck(cudaMemcpy(&(jlist), 12 + mesh_tensor.flat().data(), sizeof(int *), cudaMemcpyDeviceToHost)); + array_int = int_temp.flat().data(); + array_longlong = uint64_temp.flat().data(); - // Launch computation - for (int II = 0; II < nsamples; II++) { - DescrptSeALauncher(coord_tensor.matrix().data() + II * (nall * 3), // related to the kk argument - type_tensor.matrix().data() + II * nall, // also related to the kk argument - ilist, - jrange, - jlist, - array_int, - array_longlong, - avg_tensor.matrix().data(), - std_tensor.matrix().data(), - descrpt_tensor->matrix().data() + II * (nloc * ndescrpt), - descrpt_deriv_tensor->matrix().data() + II * (nloc * ndescrpt * 3), - rij_tensor->matrix().data() + II * (nloc * nnei * 3), - nlist_tensor->matrix().data() + II * (nloc * nnei), - nloc, - nnei, - rcut_r, - rcut_r_smth, - ndescrpt, - sec_a, - fill_nei_a, - magic_number - ); + nbor_update(mesh_tensor.flat().data(), static_cast(mesh_tensor.NumElements())); + } + else if (device == "CPU") { + memcpy (&ilist, 4 + mesh_tensor.flat().data(), sizeof(int *)); + memcpy (&jrange, 8 + 
mesh_tensor.flat().data(), sizeof(int *)); + memcpy (&jlist, 12 + mesh_tensor.flat().data(), sizeof(int *)); } - // std::cout << "done" << std::endl; - delete[] natoms; + + DescrptSeAFunctor()( + context->eigen_device(), // define actually graph execution device + coord_tensor.matrix().data(), // related to the kk argument + type_tensor.matrix().data(), // also related to the kk argument + mesh_tensor.flat().data(), + ilist, + jrange, + jlist, + array_int, + array_longlong, + avg_tensor.matrix().data(), + std_tensor.matrix().data(), + descrpt_tensor->matrix().data(), + descrpt_deriv_tensor->matrix().data(), + rij_tensor->matrix().data(), + nlist_tensor->matrix().data(), + nloc, + nall, + nnei, + ntypes, + ndescrpt, + rcut_r, + rcut_r_smth, + sec_a, + fill_nei_a, + magic_number + ); } ///////////////////////////////////////////////////////////////////////////////////////////// @@ -277,6 +261,66 @@ class DescrptSeAOp : public OpKernel { sec[ii] = sec[ii-1] + n_sel[ii-1]; } } + + std::string device; + int *array_int; + unsigned long long*array_longlong; + int * ilist = NULL, * jrange = NULL, * jlist = NULL; + int ilist_size = 0, jrange_size = 0, jlist_size = 0; + bool init = false; + + void nbor_update(const int * mesh, const int size) { + int *mesh_host = new int[size], *ilist_host = NULL, *jrange_host = NULL, *jlist_host = NULL; + cudaErrcheck(cudaMemcpy(mesh_host, mesh, sizeof(int) * size, cudaMemcpyDeviceToHost)); + memcpy (&ilist_host, 4 + mesh_host, sizeof(int *)); + memcpy (&jrange_host, 8 + mesh_host, sizeof(int *)); + memcpy (&jlist_host, 12 + mesh_host, sizeof(int *)); + int const ago = mesh_host[0]; + if (!init) { + ilist_size = (int)(mesh_host[1] * 1.2); + jrange_size = (int)(mesh_host[2] * 1.2); + jlist_size = (int)(mesh_host[3] * 1.2); + cudaErrcheck(cudaMalloc((void **)&ilist, sizeof(int) * ilist_size)); + cudaErrcheck(cudaMalloc((void **)&jrange, sizeof(int) * jrange_size)); + cudaErrcheck(cudaMalloc((void **)&jlist, sizeof(int) * jlist_size)); + init = true; + } + if (ago == 0) { + if (ilist_size < mesh_host[1]) { + ilist_size = (int)(mesh_host[1] * 1.2); + cudaErrcheck(cudaFree(ilist)); + cudaErrcheck(cudaMalloc((void **)&ilist, sizeof(int) * ilist_size)); + } + if (jrange_size < mesh_host[2]) { + jrange_size = (int)(mesh_host[2] * 1.2); + cudaErrcheck(cudaFree(jrange)); + cudaErrcheck(cudaMalloc((void **)&jrange,sizeof(int) * jrange_size)); + } + if (jlist_size < mesh_host[3]) { + jlist_size = (int)(mesh_host[3] * 1.2); + cudaErrcheck(cudaFree(jlist)); + cudaErrcheck(cudaMalloc((void **)&jlist, sizeof(int) * jlist_size)); + } + cudaErrcheck(cudaMemcpy(ilist, ilist_host, sizeof(int) * mesh_host[1], cudaMemcpyHostToDevice)); + cudaErrcheck(cudaMemcpy(jrange, jrange_host, sizeof(int) * mesh_host[2], cudaMemcpyHostToDevice)); + cudaErrcheck(cudaMemcpy(jlist, jlist_host, sizeof(int) * mesh_host[3], cudaMemcpyHostToDevice)); + } + delete [] mesh_host; + } }; -REGISTER_KERNEL_BUILDER(Name("DescrptSeA").Device(DEVICE_GPU), DescrptSeAOp); \ No newline at end of file +// Register the CPU kernels. +#define REGISTER_CPU() \ +REGISTER_KERNEL_BUILDER( \ + Name("DescrptSeA").Device(DEVICE_CPU), \ + DescrptSeAOp); +REGISTER_CPU(); + +// Register the GPU kernels. 
+#if GOOGLE_CUDA +#define REGISTER_GPU() \ +REGISTER_KERNEL_BUILDER( \ + Name("DescrptSeA").Device(DEVICE_GPU).HostMemory("natoms"), \ + DescrptSeAOp); +REGISTER_GPU(); +#endif // GOOGLE_CUDA \ No newline at end of file diff --git a/source/op/gelu_gpu.cc b/source/op/gelu_gpu.cc index 34d4183f98..8aece7e384 100644 --- a/source/op/gelu_gpu.cc +++ b/source/op/gelu_gpu.cc @@ -1,3 +1,4 @@ +#include #include "tensorflow/core/framework/op.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/register_types.h" @@ -7,6 +8,7 @@ using namespace tensorflow; using CPUDevice = Eigen::ThreadPoolDevice; using GPUDevice = Eigen::GpuDevice; +#define SQRT_2_PI 0.7978845608028654 REGISTER_OP("Gelu") .Attr("T: {float, double}") .Input("x: T") @@ -25,35 +27,76 @@ REGISTER_OP("GeluGradGrad") .Input("x: T") .Output("output: T"); +#if GOOGLE_CUDA // maybe instead use cudnn activation forward -void GeluLauncher(const float * in, float * out, int const size); -void GeluLauncher(const double * in, double * out, int const size); +void GeluGPULauncher(const float * in, float * out, int const size); +void GeluGPULauncher(const double * in, double * out, int const size); -void GeluGradLauncher(const float * dy, const float * in, float * out, int const size); -void GeluGradLauncher(const double * dy, const double * in, double * out, int const size); +void GeluGradGPULauncher(const float * dy, const float * in, float * out, int const size); +void GeluGradGPULauncher(const double * dy, const double * in, double * out, int const size); -void GeluGradGradLauncher(const float * dy, const float * dy_, const float * in, float * out, int const size); -void GeluGradGradLauncher(const double * dy, const double * dy_, const double * in, double * out, int const size); +void GeluGradGradGPULauncher(const float * dy, const float * dy_, const float * in, float * out, int const size); +void GeluGradGradGPULauncher(const double * dy, const double * dy_, const double * in, double * out, int const size); +#endif // GOOGLE_CUDA + +template +void GeluCPULauncher(const T * in, T * out, int const size) { + for (int ii = 0; ii < size; ii++) { + out[ii] = in[ii] * 0.5 * (1.0 + tanh(SQRT_2_PI * (in[ii] + 0.044715 * in[ii] * in[ii] *in[ii]))); + } +} + +template +void GeluGradCPULauncher(const T * dy, const T * in, T * out, int const size) { + for (int ii = 0; ii < size; ii++) { + T const var1 = tanh(SQRT_2_PI * (in[ii] + 0.044715 * in[ii] * in[ii] *in[ii])); + out[ii] = dy[ii] * (0.5 * SQRT_2_PI * in[ii] * (1 - var1 * var1) * (0.134145 * in[ii] * in[ii] + 1) + 0.5 * var1 + 0.5); + } +} + +template +void GeluGradGradCPULauncher(const T * dy, const T * dy_, const T * in, T * out, int const size) { + for (int ii = 0; ii < size; ii++) { + T const var1 = tanh(SQRT_2_PI * (in[ii] + 0.044715 * in[ii] * in[ii] *in[ii])); + T const var2 = SQRT_2_PI * (1 - var1 * var1) * (0.134145 * in[ii] * in[ii] + 1); + out[ii] = dy[ii] * dy_[ii] * (0.134145 * SQRT_2_PI * in[ii] * in[ii] * (1 - var1 * var1) - SQRT_2_PI * in[ii] * var2 * (0.134145 * in[ii] * in[ii] + 1) * var1 + var2); + } +} template struct GeluFunctor { - void operator()(const Device& d, const T * in, T * out, int const size) { - GeluLauncher(in, out, size); + void operator()(const CPUDevice& d, const T * in, T * out, int const size) { + GeluCPULauncher(in, out, size); } + #if GOOGLE_CUDA + void operator()(const GPUDevice& d, const T * in, T * out, int const size) { + GeluGPULauncher(in, out, size); + } + #endif }; template struct GeluGradFunctor { - void 
operator()(const Device& d, const T * dy, const T * in, T * out, int const size) { - GeluGradLauncher(dy, in, out, size); + void operator()(const CPUDevice& d, const T * dy, const T * in, T * out, int const size) { + GeluGradCPULauncher(dy, in, out, size); } + #if GOOGLE_CUDA + void operator()(const GPUDevice& d, const T * dy, const T * in, T * out, int const size) { + GeluGradGPULauncher(dy, in, out, size); + } + #endif }; template struct GeluGradGradFunctor { - void operator()(const Device& d, const T * dy, const T * dy_, const T * in, T * out, int const size) { - GeluGradGradLauncher(dy, dy_, in, out, size); + void operator()(const CPUDevice& d, const T * dy, const T * dy_, const T * in, T * out, int const size) { + GeluGradGradCPULauncher(dy, dy_, in, out, size); + } + #if GOOGLE_CUDA + void operator()(const GPUDevice& d, const T * dy, const T * dy_, const T * in, T * out, int const size) { + GeluGradGradGPULauncher(dy, dy_, in, out, size); } + #endif }; // OpKernel definition. @@ -78,7 +121,6 @@ class GeluOp : public OpKernel { output->flat().data(), static_cast(output->NumElements()) ); - // GeluLauncher(x.flat().data(), output->flat().data(), static_cast(output->NumElements())); } }; @@ -107,7 +149,6 @@ class GeluGradOp : public OpKernel { output->flat().data(), static_cast(output->NumElements()) ); - // GeluGradLauncher(dy.flat().data(), x.flat().data(), output->flat().data(), static_cast(output->NumElements())); } }; @@ -138,11 +179,27 @@ class GeluGradGradOp : public OpKernel { output->flat().data(), static_cast(output->NumElements()) ); - // GeluGradGradLauncher(dy.flat().data(), x.flat().data(), output->flat().data(), static_cast(output->NumElements())); } }; -#define REGISTER_GPU(T) \ +#define REGISTER_CPU(T) \ +/* Declare explicit instantiations in kernel_example.cu.cc. */ \ +REGISTER_KERNEL_BUILDER( \ + Name("Gelu").Device(DEVICE_CPU).TypeConstraint("T"), \ + GeluOp); \ +/* Declare explicit instantiations in kernel_example.cu.cc. */ \ +REGISTER_KERNEL_BUILDER( \ + Name("GeluGrad").Device(DEVICE_CPU).TypeConstraint("T"), \ + GeluGradOp); \ +/* Declare explicit instantiations in kernel_example.cu.cc. */ \ +REGISTER_KERNEL_BUILDER( \ + Name("GeluGradGrad").Device(DEVICE_CPU).TypeConstraint("T"), \ + GeluGradGradOp); +REGISTER_CPU(float); +REGISTER_CPU(double); + +#if GOOGLE_CUDA +#define REGISTER_GPU(T) \ /* Declare explicit instantiations in kernel_example.cu.cc. 
*/ \ REGISTER_KERNEL_BUILDER( \ Name("Gelu").Device(DEVICE_GPU).TypeConstraint("T"), \ @@ -157,3 +214,4 @@ class GeluGradGradOp : public OpKernel { GeluGradGradOp); REGISTER_GPU(float); REGISTER_GPU(double); +#endif // GOOGLE_CUDA \ No newline at end of file diff --git a/source/op/prod_force_se_a_gpu.cc b/source/op/prod_force_se_a_gpu.cc index 2d159a8505..c5c6c5c6b7 100644 --- a/source/op/prod_force_se_a_gpu.cc +++ b/source/op/prod_force_se_a_gpu.cc @@ -1,26 +1,5 @@ -#include "tensorflow/core/framework/op.h" -#include "tensorflow/core/framework/op_kernel.h" -#include "tensorflow/core/framework/shape_inference.h" -#include -#include - -using namespace tensorflow; - -#define cudaErrcheck(res) { cudaAssert((res), __FILE__, __LINE__); } -inline void cudaAssert(cudaError_t code, const char *file, int line, bool abort=true) -{ - if (code != cudaSuccess) - { - fprintf(stderr,"cuda assert: %s %s %d\n", cudaGetErrorString(code), file, line); - if (abort) exit(code); - } -} - -#ifdef HIGH_PREC -typedef double VALUETYPE; -#else -typedef float VALUETYPE; -#endif +#include "common.h" +#include "CustomeOperation.h" #ifdef HIGH_PREC REGISTER_OP("ProdForceSeA") @@ -42,17 +21,19 @@ REGISTER_OP("ProdForceSeA") .Output("force: float"); #endif -void ProdForceSeALauncher(VALUETYPE * force, - const VALUETYPE * net_deriv, - const VALUETYPE * in_deriv, - const int * nlist, - const int nloc, - const int nall, - const int ndescrpt, - const int nnei, - const int n_a_sel, - const int n_a_shift); +template +struct ProdForceSeAFunctor { + void operator()(const CPUDevice& d, T * force, const T * net_deriv, const T * in_deriv, const int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const int n_a_sel, const int n_a_shift) { + ProdForceSeACPULauncher(force, net_deriv, in_deriv, nlist, nloc, nall, nnei, ndescrpt, n_a_sel, n_a_shift); + } + #if GOOGLE_CUDA + void operator()(const GPUDevice& d, T * force, const T * net_deriv, const T * in_deriv, const int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const int n_a_sel, const int n_a_shift) { + ProdForceSeAGPULauncher(force, net_deriv, in_deriv, nlist, nloc, nall, nnei, ndescrpt, n_a_sel, n_a_shift); + } + #endif // GOOGLE_CUDA +}; +template class ProdForceSeAOp : public OpKernel { public: explicit ProdForceSeAOp(OpKernelConstruction* context) : OpKernel(context) { @@ -76,8 +57,7 @@ class ProdForceSeAOp : public OpKernel { OP_REQUIRES (context, (natoms_tensor.shape().dims() == 1), errors::InvalidArgument ("Dim of natoms should be 1")); OP_REQUIRES (context, (natoms_tensor.shape().dim_size(0) >= 3), errors::InvalidArgument ("number of atoms should be larger than (or equal to) 3")); - int * natoms = new int[natoms_tensor.shape().dim_size(0)]; - cudaErrcheck(cudaMemcpy(natoms, natoms_tensor.flat().data(), sizeof(int) * natoms_tensor.shape().dim_size(0), cudaMemcpyDeviceToHost)); + const int * natoms = natoms_tensor.flat().data(); int nloc = natoms[0]; int nall = natoms[1]; int nframes = net_deriv_tensor.shape().dim_size(0); @@ -117,23 +97,36 @@ class ProdForceSeAOp : public OpKernel { assert (nloc * nnei == nlist_tensor.shape().dim_size(1)); assert (nnei * 4 == ndescrpt); - for (int II = 0; II < nframes; II++) { - ProdForceSeALauncher(force_tensor->flat().data() + II * (nall * 3), - net_deriv_tensor.flat().data() + II * (nloc * ndescrpt), - in_deriv_tensor.flat().data() + II * (nloc * ndescrpt * 3), - nlist_tensor.flat().data() + II * (nloc * nnei), - nloc, - nall, - ndescrpt, - nnei, - n_a_sel, - n_a_shift - ); - } - 
delete[] natoms; + ProdForceSeAFunctor()( + context->eigen_device(), + force_tensor->flat().data(), + net_deriv_tensor.flat().data(), + in_deriv_tensor.flat().data(), + nlist_tensor.flat().data(), + nloc, + nall, + nnei, + ndescrpt, + n_a_sel, + n_a_shift + ); } private: int n_r_sel, n_a_sel, n_a_shift; }; -REGISTER_KERNEL_BUILDER(Name("ProdForceSeA").Device(DEVICE_GPU), ProdForceSeAOp); \ No newline at end of file +// Register the CPU kernels. +#define REGISTER_CPU() \ +REGISTER_KERNEL_BUILDER( \ + Name("ProdForceSeA").Device(DEVICE_CPU), \ + ProdForceSeAOp); +REGISTER_CPU(); + +// Register the GPU kernels. +#if GOOGLE_CUDA +#define REGISTER_GPU() \ +REGISTER_KERNEL_BUILDER( \ + Name("ProdForceSeA").Device(DEVICE_GPU).HostMemory("natoms"), \ + ProdForceSeAOp); +REGISTER_GPU(); +#endif // GOOGLE_CUDA \ No newline at end of file diff --git a/source/op/prod_virial_se_a_gpu.cc b/source/op/prod_virial_se_a_gpu.cc index 42f70d06d2..34a7a03574 100644 --- a/source/op/prod_virial_se_a_gpu.cc +++ b/source/op/prod_virial_se_a_gpu.cc @@ -1,14 +1,5 @@ -#include "tensorflow/core/framework/op.h" -#include "tensorflow/core/framework/op_kernel.h" -#include "tensorflow/core/framework/shape_inference.h" -#include -#include - -#ifdef HIGH_PREC -typedef double VALUETYPE; -#else -typedef float VALUETYPE; -#endif +#include "common.h" +#include "CustomeOperation.h" #ifdef HIGH_PREC REGISTER_OP("ProdVirialSeA") @@ -34,31 +25,19 @@ REGISTER_OP("ProdVirialSeA") .Output("atom_virial: float"); #endif -using namespace tensorflow; - -#define cudaErrcheck(res) { cudaAssert((res), __FILE__, __LINE__); } -inline void cudaAssert(cudaError_t code, const char *file, int line, bool abort=true) -{ - if (code != cudaSuccess) - { - fprintf(stderr,"cuda assert: %s %s %d\n", cudaGetErrorString(code), file, line); - if (abort) exit(code); +template +struct ProdVirialSeAFunctor { + void operator()(const CPUDevice& d, T * virial, T * atom_virial, const T * net_deriv, const T * in_deriv, const T * rij, const int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const int n_a_sel, const int n_a_shift) { + ProdVirialSeACPULauncher(virial, atom_virial, net_deriv, in_deriv, rij, nlist, nloc, nall, nnei, ndescrpt, n_a_sel, n_a_shift); } -} - -void ProdVirialSeALauncher(VALUETYPE * virial, - VALUETYPE * atom_virial, - const VALUETYPE * net_deriv, - const VALUETYPE * in_deriv, - const VALUETYPE * rij, - const int * nlist, - const int nloc, - const int nall, - const int nnei, - const int ndescrpt, - const int n_a_sel, - const int n_a_shift); + #if GOOGLE_CUDA + void operator()(const GPUDevice& d, T * virial, T * atom_virial, const T * net_deriv, const T * in_deriv, const T * rij, const int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const int n_a_sel, const int n_a_shift) { + ProdVirialSeAGPULauncher(virial, atom_virial, net_deriv, in_deriv, rij, nlist, nloc, nall, nnei, ndescrpt, n_a_sel, n_a_shift); + } + #endif // GOOGLE_CUDA +}; +template class ProdVirialSeAOp : public OpKernel { public: explicit ProdVirialSeAOp(OpKernelConstruction* context) : OpKernel(context) { @@ -84,8 +63,7 @@ class ProdVirialSeAOp : public OpKernel { OP_REQUIRES (context, (natoms_tensor.shape().dims() == 1), errors::InvalidArgument ("Dim of natoms should be 1")); OP_REQUIRES (context, (natoms_tensor.shape().dim_size(0) >= 3), errors::InvalidArgument ("number of atoms should be larger than (or equal to) 3")); - int * natoms = new int[natoms_tensor.shape().dim_size(0)]; - cudaErrcheck(cudaMemcpy(natoms, 
natoms_tensor.flat().data(), sizeof(int) * natoms_tensor.shape().dim_size(0), cudaMemcpyDeviceToHost)); + const int * natoms = natoms_tensor.flat().data(); int nloc = natoms[0]; int nall = natoms[1]; int nnei = nlist_tensor.shape().dim_size(1) / nloc; @@ -121,25 +99,38 @@ class ProdVirialSeAOp : public OpKernel { auto virial = virial_tensor->flat(); auto atom_virial = atom_virial_tensor->flat(); - for (int II = 0; II < nframes; II++) { - ProdVirialSeALauncher(virial_tensor->flat().data() + II * 9, - atom_virial_tensor->flat().data() + II * (nall * 9), - net_deriv_tensor.flat().data() + II * (nloc * ndescrpt), - in_deriv_tensor.flat().data() + II * (nloc * ndescrpt * 3), - rij_tensor.flat().data() + II * (nloc * nnei * 3), - nlist_tensor.flat().data() + II * (nloc * nnei), - nloc, - nall, - nnei, - ndescrpt, - n_a_sel, - n_a_shift - ); - } - delete[] natoms; + ProdVirialSeAFunctor()( + context->eigen_device(), + virial_tensor->flat().data(), + atom_virial_tensor->flat().data(), + net_deriv_tensor.flat().data(), + in_deriv_tensor.flat().data(), + rij_tensor.flat().data(), + nlist_tensor.flat().data(), + nloc, + nall, + nnei, + ndescrpt, + n_a_sel, + n_a_shift + ); } private: int n_r_sel, n_a_sel, n_a_shift; }; -REGISTER_KERNEL_BUILDER(Name("ProdVirialSeA").Device(DEVICE_GPU), ProdVirialSeAOp); \ No newline at end of file +// Register the CPU kernels. +#define REGISTER_CPU() \ +REGISTER_KERNEL_BUILDER( \ + Name("ProdVirialSeA").Device(DEVICE_CPU), \ + ProdVirialSeAOp); +REGISTER_CPU(); + +// Register the GPU kernels. +#if GOOGLE_CUDA +#define REGISTER_GPU() \ +REGISTER_KERNEL_BUILDER( \ + Name("ProdVirialSeA").Device(DEVICE_GPU).HostMemory("natoms"), \ + ProdVirialSeAOp); +REGISTER_GPU(); +#endif // GOOGLE_CUDA \ No newline at end of file diff --git a/source/train/DescrptSeA.py b/source/train/DescrptSeA.py index e46265231f..1c93e92f02 100644 --- a/source/train/DescrptSeA.py +++ b/source/train/DescrptSeA.py @@ -391,14 +391,18 @@ def _filter(self, xyz_scatter = tf.matmul(xyz_scatter, w) # natom x nei_type_i x out_size xyz_scatter = tf.reshape(xyz_scatter, (-1, shape_i[1]//4, outputs_size[-1])) - xyz_scatter_total.append(xyz_scatter) + # xyz_scatter_total.append(xyz_scatter) + if type_i == 0 : + xyz_scatter_1 = tf.matmul(tf.reshape(inputs_i, [-1, shape_i[1]//4, 4]), xyz_scatter, transpose_a = True) + else : + xyz_scatter_1 += tf.matmul(tf.reshape(inputs_i, [-1, shape_i[1]//4, 4]), xyz_scatter, transpose_a = True) # natom x nei x outputs_size - xyz_scatter = tf.concat(xyz_scatter_total, axis=1) + # xyz_scatter = tf.concat(xyz_scatter_total, axis=1) # natom x nei x 4 - inputs_reshape = tf.reshape(inputs, [-1, shape[1]//4, 4]) + # inputs_reshape = tf.reshape(inputs, [-1, shape[1]//4, 4]) # natom x 4 x outputs_size - xyz_scatter_1 = tf.matmul(inputs_reshape, xyz_scatter, transpose_a = True) + # xyz_scatter_1 = tf.matmul(inputs_reshape, xyz_scatter, transpose_a = True) xyz_scatter_1 = xyz_scatter_1 * (4.0 / shape[1]) # natom x 4 x outputs_size_2 xyz_scatter_2 = tf.slice(xyz_scatter_1, [0,0,0],[-1,-1,outputs_size_2]) From b55309f10434566b14e0068255b6982da4e50094 Mon Sep 17 00:00:00 2001 From: Lu Date: Fri, 11 Sep 2020 04:16:31 +0800 Subject: [PATCH 02/65] use standard operator style --- source/lib/include/CustomeOperation.h | 111 ++++++----- source/lib/include/DeviceFunctor.h | 47 +++++ source/op/CMakeLists.txt | 2 +- source/op/cuda/descrpt_se_a.cu | 186 +----------------- source/op/cuda/gelu.cu | 49 +++-- source/op/cuda/prod_force_se_a.cu | 50 +---- source/op/cuda/prod_virial_se_a.cu | 63 ++---- 
source/op/descrpt.cc | 64 +++--- source/op/descrpt_se_a.cc | 77 ++++---- ..._a_gpu.cc => descrpt_se_a_multi_device.cc} | 81 +++----- source/op/descrpt_se_r.cc | 55 +++--- source/op/descrpt_se_r_gpu.cc | 39 ++-- source/op/ewald_recp.cc | 60 +++--- source/op/gelu_gpu.cc | 90 ++------- source/op/gelu_multi_device.cc | 167 ++++++++++++++++ source/op/prod_force.cc | 36 ++-- source/op/prod_force_grad.cc | 39 ++-- source/op/prod_force_se_a.cc | 38 ++-- source/op/prod_force_se_a_grad.cc | 30 ++- ...gpu.cc => prod_force_se_a_multi_device.cc} | 59 +++--- source/op/prod_force_se_r.cc | 32 +-- source/op/prod_force_se_r_gpu.cc | 31 +-- source/op/prod_force_se_r_grad.cc | 36 ++-- source/op/prod_virial.cc | 60 +++--- source/op/prod_virial_grad.cc | 34 ++-- source/op/prod_virial_se_a.cc | 51 +++-- source/op/prod_virial_se_a_grad.cc | 44 ++--- ...pu.cc => prod_virial_se_a_multi_device.cc} | 73 +++---- source/op/prod_virial_se_r.cc | 47 ++--- source/op/prod_virial_se_r_gpu.cc | 33 ++-- source/op/prod_virial_se_r_grad.cc | 32 ++- source/op/soft_min.cc | 71 +++---- source/op/soft_min_force.cc | 35 ++-- source/op/soft_min_force_grad.cc | 39 ++-- source/op/soft_min_virial.cc | 37 ++-- source/op/soft_min_virial_grad.cc | 44 ++--- source/op/tab_inter.cc | 53 +++-- 37 files changed, 935 insertions(+), 1160 deletions(-) create mode 100644 source/lib/include/DeviceFunctor.h rename source/op/{descrpt_se_a_gpu.cc => descrpt_se_a_multi_device.cc} (89%) create mode 100644 source/op/gelu_multi_device.cc rename source/op/{prod_force_se_a_gpu.cc => prod_force_se_a_multi_device.cc} (80%) rename source/op/{prod_virial_se_a_gpu.cc => prod_virial_se_a_multi_device.cc} (77%) diff --git a/source/lib/include/CustomeOperation.h b/source/lib/include/CustomeOperation.h index ef8b450db4..c0bd8fdbd5 100644 --- a/source/lib/include/CustomeOperation.h +++ b/source/lib/include/CustomeOperation.h @@ -1,22 +1,16 @@ #pragma once #include -#include #include #include #include #include "MathUtilities.h" - #if GOOGLE_CUDA -#include -#define cudaErrcheck(res) {cudaAssert((res), __FILE__, __LINE__);} -inline void cudaAssert(cudaError_t code, const char *file, int line, bool abort=true) { - if (code != cudaSuccess) { - fprintf(stderr,"cuda assert: %s %s %d\n", cudaGetErrorString(code), file, line); - if (abort) exit(code); - } -} +#include "DeviceFunctor.h" #endif // GOOGLE_CUDA +using CPUDevice = Eigen::ThreadPoolDevice; +using GPUDevice = Eigen::GpuDevice; + struct NeighborInfo { int type; double dist; @@ -237,45 +231,14 @@ void DescrptSeACPULauncher(const T * coord, const int * type, const int * ilist, nlist[ii * nnei + jj] = fmt_nlist_a[jj]; } } - - #if DEBUG - std::fstream fout1("nlist.txt", std::ios::out); - fout1 << "tensor nlist, length:\t" << nloc << ",\twidth:\t" << nnei << std::endl; - for (int ii = 0; ii < nloc; ii++) { - for (int jj = 0; jj < nnei; jj++) { - fout1 << "nlist[" << ii << "][" << jj << "]:\t" << nlist[ii * nnei + jj] << std::endl; - } - } - fout1.close(); - - std::fstream fout2("rij.txt", std::ios::out); - fout2 << "tensor rij, length:\t" << nloc << ",\twidth:\t" << nnei * 3 << std::endl; - for (int ii = 0; ii < nloc; ii++) { - for (int jj = 0; jj < nnei * 3; jj++) { - fout2 << "rij[" << ii << "][" << jj << "]:\t" << rij[ii * nnei * 3 + jj] << std::endl; - } - } - fout2.close(); - - std::fstream fout3("descrpt.txt", std::ios::out); - fout3 << "tensor descrpt, length:\t" << nloc << ",\twidth:\t" << ndescrpt << std::endl; - for (int ii = 0; ii < nloc; ii++) { - for (int jj = 0; jj < ndescrpt; jj++) { - fout3 << "descrpt[" 
<< ii << "][" << jj << "]:\t" << descrpt[ii * ndescrpt + jj] << std::endl; - } - } - fout3.close(); - #endif // DEBUG } -extern void DescrptSeAGPUExecuteLauncher(const float * coord, const int * type, const int * ilist, const int * jrange, const int * jlist, int * array_int, unsigned long long * array_longlong, const float * avg, const float * std, float * descrpt, float * descrpt_deriv, float * rij, int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const float rcut_r, const float rcut_r_smth, const std::vector sec_a, const bool fill_nei_a, const int magic_number); -extern void DescrptSeAGPUExecuteLauncher(const double * coord, const int * type, const int * ilist, const int * jrange, const int * jlist, int * array_int, unsigned long long * array_longlong, const double * avg, const double * std, double * descrpt, double * descrpt_deriv, double * rij, int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const float rcut_r, const float rcut_r_smth, const std::vector sec_a, const bool fill_nei_a, const int magic_number); - +#if GOOGLE_CUDA template void DescrptSeAGPULauncher(const T * coord, const int * type, const int * ilist, const int * jrange, const int * jlist, int * array_int, unsigned long long * array_longlong, const T * avg, const T * std, T * descrpt, T * descrpt_deriv, T * rij, int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const float rcut_r, const float rcut_r_smth, const std::vector sec_a, const bool fill_nei_a, const int magic_number) { - DescrptSeAGPUExecuteLauncher(coord, type, ilist, jrange, jlist, array_int, array_longlong, avg, std, descrpt, descrpt_deriv, rij, nlist, nloc, nall, nnei, ndescrpt, rcut_r, rcut_r_smth, sec_a, fill_nei_a, magic_number); + DescrptSeAGPUExecuteFunctor()(coord, type, ilist, jrange, jlist, array_int, array_longlong, avg, std, descrpt, descrpt_deriv, rij, nlist, nloc, nall, nnei, ndescrpt, rcut_r, rcut_r_smth, sec_a, fill_nei_a, magic_number); } - +#endif // GOOGLE_CUDA // ****************************************************************************** // end of custome op DescrptSeA // ****************************************************************************** @@ -317,14 +280,12 @@ void ProdForceSeACPULauncher(T * force, const T * net_deriv, const T * in_deriv, } } -extern void ProdForceSeAGPUExecuteLauncher(float * force, const float * net_derive, const float * in_deriv, const int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const int n_a_sel, const int n_a_shift); -extern void ProdForceSeAGPUExecuteLauncher(double * force, const double * net_derive, const double * in_deriv, const int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const int n_a_sel, const int n_a_shift); - - +#if GOOGLE_CUDA template void ProdForceSeAGPULauncher(T * force, const T * net_deriv, const T * in_deriv, const int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const int n_a_sel, const int n_a_shift) { - ProdForceSeAGPUExecuteLauncher(force, net_deriv, in_deriv, nlist, nloc, nall, nnei, ndescrpt, n_a_sel, n_a_shift); + ProdForceSeAGPUExecuteFunctor()(force, net_deriv, in_deriv, nlist, nloc, nall, nnei, ndescrpt, n_a_sel, n_a_shift); } +#endif // GOOGLE_CUDA // ****************************************************************************** // end of custome op ProdForceSeA @@ -356,14 +317,56 @@ void ProdVirialSeACPULauncher(T * virial, T * atom_virial, const T * net_deriv, } } -extern void 
ProdVirialSeAGPUExecuteLauncher(float * virial, float * atom_virial, const float * net_deriv, const float * in_deriv, const float * rij, const int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const int n_a_sel, const int n_a_shift); -extern void ProdVirialSeAGPUExecuteLauncher(double * virial, double * atom_virial, const double * net_deriv, const double * in_deriv, const double * rij, const int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const int n_a_sel, const int n_a_shift); - +#if GOOGLE_CUDA template void ProdVirialSeAGPULauncher(T * virial, T * atom_virial, const T * net_deriv, const T * in_deriv, const T * rij, const int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const int n_a_sel, const int n_a_shift) { - ProdVirialSeAGPUExecuteLauncher(virial, atom_virial, net_deriv, in_deriv, rij, nlist, nloc, nall, nnei, ndescrpt, n_a_sel, n_a_shift); + ProdVirialSeAGPUExecuteFunctor()(virial, atom_virial, net_deriv, in_deriv, rij, nlist, nloc, nall, nnei, ndescrpt, n_a_sel, n_a_shift); } - +#endif // GOOGLE_CUDA // ****************************************************************************** // end of custome op ProdVirialSeA // ****************************************************************************** + +template +void GeluCPULauncher(const T * in, T * out, int const size) { + for (int ii = 0; ii < size; ii++) { + out[ii] = in[ii] * 0.5 * (1.0 + tanh(SQRT_2_PI * (in[ii] + 0.044715 * in[ii] * in[ii] *in[ii]))); + } +} + +template +void GeluGradCPULauncher(const T * dy, const T * in, T * out, int const size) { + for (int ii = 0; ii < size; ii++) { + T const var1 = tanh(SQRT_2_PI * (in[ii] + 0.044715 * in[ii] * in[ii] *in[ii])); + out[ii] = dy[ii] * (0.5 * SQRT_2_PI * in[ii] * (1 - var1 * var1) * (0.134145 * in[ii] * in[ii] + 1) + 0.5 * var1 + 0.5); + } +} + +template +void GeluGradGradCPULauncher(const T * dy, const T * dy_, const T * in, T * out, int const size) { + for (int ii = 0; ii < size; ii++) { + T const var1 = tanh(SQRT_2_PI * (in[ii] + 0.044715 * in[ii] * in[ii] *in[ii])); + T const var2 = SQRT_2_PI * (1 - var1 * var1) * (0.134145 * in[ii] * in[ii] + 1); + out[ii] = dy[ii] * dy_[ii] * (0.134145 * SQRT_2_PI * in[ii] * in[ii] * (1 - var1 * var1) - SQRT_2_PI * in[ii] * var2 * (0.134145 * in[ii] * in[ii] + 1) * var1 + var2); + } +} + +#if GOOGLE_CUDA +template +void GeluGPULauncher(const T * in, T * out, int const size) { + GeluGPUExecuteFunctor()(in, out, size); +} + +template +void GeluGradGPULauncher(const T * dy, const T * in, T * out, int const size) { + GeluGradGPUExecuteFunctor()(dy, in, out, size); +} + +template +void GeluGradGradGPULauncher(const T * dy, const T * dy_, const T * in, T * out, int const size) { + GeluGradGradGPUExecuteFunctor()(dy, dy_, in, out, size); +} +#endif // GOOGLE_CUDA +// ****************************************************************************** +// end of custome op Gelu +// ****************************************************************************** diff --git a/source/lib/include/DeviceFunctor.h b/source/lib/include/DeviceFunctor.h new file mode 100644 index 0000000000..f02ef55651 --- /dev/null +++ b/source/lib/include/DeviceFunctor.h @@ -0,0 +1,47 @@ +#pragma once +#include +#include +#include +#include +#include + +typedef unsigned long long int_64; +#define SQRT_2_PI 0.7978845608028654 + +#define cudaErrcheck(res) {cudaAssert((res), __FILE__, __LINE__);} +inline void cudaAssert(cudaError_t code, const char *file, int line, bool abort=true) { 
+ if (code != cudaSuccess) { + fprintf(stderr,"cuda assert: %s %s %d\n", cudaGetErrorString(code), file, line); + if (abort) exit(code); + } +} + +template +struct DescrptSeAGPUExecuteFunctor { + void operator()(const T * coord, const int * type, const int * ilist, const int * jrange, const int * jlist, int * array_int, unsigned long long * array_longlong, const T * avg, const T * std, T * descript, T * descript_deriv, T * rij, int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const float rcut_r, const float rcut_r_smth, const std::vector sec_a, const bool fill_nei_a, const int MAGIC_NUMBER); +}; + +template +struct ProdForceSeAGPUExecuteFunctor { + void operator()(T * force, const T * net_derive, const T * in_deriv, const int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const int n_a_sel, const int n_a_shift); +}; + +template +struct ProdVirialSeAGPUExecuteFunctor { + void operator()(T * virial, T * atom_virial, const T * net_deriv, const T * in_deriv, const T * rij, const int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const int n_a_sel, const int n_a_shift); +}; + +template +struct GeluGPUExecuteFunctor { + void operator()(const T * in, T * out, const int size); +}; + +template +struct GeluGradGPUExecuteFunctor { + void operator()(const T * dy, const T * in, T * out, const int size); +}; + +template +struct GeluGradGradGPUExecuteFunctor { + void operator()(const T * dy, const T * dy_, const T * in, T * out, const int size); +}; \ No newline at end of file diff --git a/source/op/CMakeLists.txt b/source/op/CMakeLists.txt index 993a1b6fd4..21f675c845 100644 --- a/source/op/CMakeLists.txt +++ b/source/op/CMakeLists.txt @@ -5,7 +5,7 @@ set(OP_LIB ${PROJECT_SOURCE_DIR}/lib/src/SimulationRegion.cpp ${PROJECT_SOURCE_D set (OP_CXX_FLAG -D_GLIBCXX_USE_CXX11_ABI=${OP_CXX_ABI} ) file(GLOB OP_SRC prod_force.cc prod_virial.cc descrpt.cc descrpt_se_a.cc descrpt_se_r.cc tab_inter.cc prod_force_se_a.cc prod_virial_se_a.cc prod_force_se_r.cc prod_virial_se_r.cc soft_min.cc soft_min_force.cc soft_min_virial.cc ewald_recp.cc gelu.cc) file(GLOB OP_PY_CUDA_SRC prod_force.cc prod_virial.cc descrpt.cc descrpt_se_a.cc descrpt_se_r.cc tab_inter.cc prod_force_se_a.cc prod_virial_se_a.cc prod_force_se_r.cc prod_virial_se_r.cc soft_min.cc soft_min_force.cc soft_min_virial.cc ewald_recp.cc gelu_gpu.cc) -file(GLOB OP_CUDA_SRC prod_force.cc prod_virial.cc descrpt.cc descrpt_se_a_gpu.cc descrpt_se_r_gpu.cc tab_inter.cc prod_force_se_a_gpu.cc prod_virial_se_a_gpu.cc prod_force_se_r_gpu.cc prod_virial_se_r_gpu.cc soft_min.cc soft_min_force.cc soft_min_virial.cc gelu_gpu.cc) +file(GLOB OP_CUDA_SRC prod_force.cc prod_virial.cc descrpt.cc descrpt_se_a_multi_device.cc descrpt_se_r_gpu.cc tab_inter.cc prod_force_se_a_multi_device.cc prod_virial_se_a_multi_device.cc prod_force_se_r_gpu.cc prod_virial_se_r_gpu.cc soft_min.cc soft_min_force.cc soft_min_virial.cc gelu_multi_device.cc) file(GLOB OP_GRADS_SRC prod_force_grad.cc prod_force_se_a_grad.cc prod_force_se_r_grad.cc prod_virial_grad.cc prod_virial_se_a_grad.cc prod_virial_se_r_grad.cc soft_min_force_grad.cc soft_min_virial_grad.cc ) file(GLOB OP_PY *.py) diff --git a/source/op/cuda/descrpt_se_a.cu b/source/op/cuda/descrpt_se_a.cu index 7fa84335e5..afed0dbe83 100644 --- a/source/op/cuda/descrpt_se_a.cu +++ b/source/op/cuda/descrpt_se_a.cu @@ -1,22 +1,7 @@ -#include -#include -#include #include #include #include -#include - -typedef unsigned long long int_64; - -#define 
cudaErrcheck(res) { cudaAssert((res), __FILE__, __LINE__); } -inline void cudaAssert(cudaError_t code, const char *file, int line, bool abort=true) -{ - if (code != cudaSuccess) - { - fprintf(stderr,"cuda assert: %s %s %d\n", cudaGetErrorString(code), file, line); - if (abort) exit(code); - } -} +#include "DeviceFunctor.h" template < typename Key, @@ -413,30 +398,8 @@ void format_nbor_list_4096 ( BlockSortKernel <<>> (key, key + nloc * MAGIC_NUMBER); } -void DescrptSeAGPUExecuteLauncher(const float * coord, - const int * type, - const int * ilist, - const int * jrange, - const int * jlist, - int * array_int, - unsigned long long * array_longlong, - const float * avg, - const float * std, - float * descript, - float * descript_deriv, - float * rij, - int * nlist, - const int nloc, - const int nall, - const int nnei, - const int ndescrpt, - const float rcut_r, - const float rcut_r_smth, - const std::vector sec_a, - const bool fill_nei_a, - const int MAGIC_NUMBER -) -{ +template +void DescrptSeAGPUExecuteFunctor::operator()(const T * coord, const int * type, const int * ilist, const int * jrange, const int * jlist, int * array_int, unsigned long long * array_longlong, const T * avg, const T * std, T * descript, T * descript_deriv, T * rij, int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const float rcut_r, const float rcut_r_smth, const std::vector sec_a, const bool fill_nei_a, const int MAGIC_NUMBER) { const int LEN = 256; int nblock = (nloc + LEN -1) / LEN; int * sec_a_dev = array_int; @@ -448,8 +411,8 @@ void DescrptSeAGPUExecuteLauncher(const float * coord, res = cudaMemcpy(sec_a_dev, &sec_a[0], sizeof(int) * sec_a.size(), cudaMemcpyHostToDevice); cudaErrcheck(res); res = cudaMemset(key, 0xffffffff, sizeof(int_64) * nloc * MAGIC_NUMBER); cudaErrcheck(res); res = cudaMemset(nlist, -1, sizeof(int) * nloc * nnei); cudaErrcheck(res); - res = cudaMemset(descript, 0.0, sizeof(float) * nloc * ndescrpt); cudaErrcheck(res); - res = cudaMemset(descript_deriv, 0.0, sizeof(float) * nloc * ndescrpt * 3); cudaErrcheck(res); + res = cudaMemset(descript, 0.0, sizeof(T) * nloc * ndescrpt); cudaErrcheck(res); + res = cudaMemset(descript_deriv, 0.0, sizeof(T) * nloc * ndescrpt * 3); cudaErrcheck(res); if (fill_nei_a) { // ~~~ @@ -547,140 +510,7 @@ void DescrptSeAGPUExecuteLauncher(const float * coord, rcut_r, sec_a.back() ); -} - -void DescrptSeAGPUExecuteLauncher(const double * coord, - const int * type, - const int * ilist, - const int * jrange, - const int * jlist, - int * array_int, - unsigned long long * array_longlong, - const double * avg, - const double * std, - double * descript, - double * descript_deriv, - double * rij, - int * nlist, - const int nloc, - const int nall, - const int nnei, - const int ndescrpt, - const float rcut_r, - const float rcut_r_smth, - const std::vector sec_a, - const bool fill_nei_a, - const int MAGIC_NUMBER -) -{ - const int LEN = 256; - int nblock = (nloc + LEN -1) / LEN; - int * sec_a_dev = array_int; - int * nei_iter = array_int + sec_a.size(); // = new int[sec_a_size]; - int * i_idx = array_int + sec_a.size() + nloc * sec_a.size(); - int_64 * key = array_longlong; - - cudaError_t res = cudaSuccess; - res = cudaMemcpy(sec_a_dev, &sec_a[0], sizeof(int) * sec_a.size(), cudaMemcpyHostToDevice); cudaErrcheck(res); - res = cudaMemset(key, 0xffffffff, sizeof(int_64) * nloc * MAGIC_NUMBER); cudaErrcheck(res); - res = cudaMemset(nlist, -1, sizeof(int) * nloc * nnei); cudaErrcheck(res); - res = cudaMemset(descript, 0.0, sizeof(double) * nloc * 
ndescrpt); cudaErrcheck(res); - res = cudaMemset(descript_deriv, 0.0, sizeof(double) * nloc * ndescrpt * 3); cudaErrcheck(res); - - if (fill_nei_a) { - // ~~~ - // cudaProfilerStart(); - get_i_idx_se_a<<>> (nloc, ilist, i_idx); - - if (nnei <= 256) { - format_nbor_list_256 ( - coord, - type, - jrange, - jlist, - nloc, - rcut_r, - i_idx, - key - ); - } else if (nnei <= 512) { - format_nbor_list_512 ( - coord, - type, - jrange, - jlist, - nloc, - rcut_r, - i_idx, - key - ); - } else if (nnei <= 1024) { - format_nbor_list_1024 ( - coord, - type, - jrange, - jlist, - nloc, - rcut_r, - i_idx, - key - ); - } else if (nnei <= 2048) { - format_nbor_list_2048 ( - coord, - type, - jrange, - jlist, - nloc, - rcut_r, - i_idx, - key - ); - } else if (nnei <= 4096) { - format_nbor_list_4096 ( - coord, - type, - jrange, - jlist, - nloc, - rcut_r, - i_idx, - key - ); - } - - format_nlist_fill_b_se_a<<>> ( - nlist, - nnei, - nloc, - jrange, - jlist, - key, - sec_a_dev, - sec_a.size(), - nei_iter, - MAGIC_NUMBER - ); - } +} - const int nblock_ = (sec_a.back() + LEN -1) / LEN; - dim3 block_grid(nloc, nblock_); - dim3 thread_grid(1, LEN); - compute_descriptor_se_a<<>> ( - descript, - ndescrpt, - descript_deriv, - ndescrpt * 3, - rij, - nnei * 3, - type, - avg, - std, - nlist, - nnei, - coord, - rcut_r_smth, - rcut_r, - sec_a.back() - ); -} \ No newline at end of file +template struct DescrptSeAGPUExecuteFunctor; +template struct DescrptSeAGPUExecuteFunctor; \ No newline at end of file diff --git a/source/op/cuda/gelu.cu b/source/op/cuda/gelu.cu index e16a285034..078a31da33 100644 --- a/source/op/cuda/gelu.cu +++ b/source/op/cuda/gelu.cu @@ -1,7 +1,4 @@ -#include -#include - -#define SQRT_2_PI 0.7978845608028654 +#include "DeviceFunctor.h" template __global__ void gelu(const T * in, T * out, int const size) { @@ -33,45 +30,75 @@ __global__ void gelu_grad_grad(const T * dy, const T * dy_, const T * in, T * ou out[idx] = dy[idx] * dy_[idx] * (0.134145 * SQRT_2_PI * in[idx] * in[idx] * (1 - var1 * var1) - SQRT_2_PI * in[idx] * var2 * (0.134145 * in[idx] * in[idx] + 1) * var1 + var2); } - -void GeluGPULauncher(const float * in, float * out, int const size) { +void GeluLauncher(const float * in, float * out, int const size) { int const THREAD_ITEMS = 1024; int const BLOCK_NUMS = (size + THREAD_ITEMS - 1) / THREAD_ITEMS; gelu<<>>(in, out, size); } -void GeluGPULauncher(const double * in, double * out, int const size) { +void GeluLauncher(const double * in, double * out, int const size) { int const THREAD_ITEMS = 1024; int const BLOCK_NUMS = (size + THREAD_ITEMS - 1) / THREAD_ITEMS; gelu<<>>(in, out, size); } -void GeluGradGPULauncher(const float * dy, const float * in, float * out, int const size) { +void GeluGradLauncher(const float * dy, const float * in, float * out, int const size) { int const THREAD_ITEMS = 1024; int const BLOCK_NUMS = (size + THREAD_ITEMS - 1) / THREAD_ITEMS; gelu_grad<<>>(dy, in, out, size); } -void GeluGradGPULauncher(const double * dy, const double * in, double * out, int const size) { +void GeluGradLauncher(const double * dy, const double * in, double * out, int const size) { int const THREAD_ITEMS = 1024; int const BLOCK_NUMS = (size + THREAD_ITEMS - 1) / THREAD_ITEMS; gelu_grad<<>>(dy, in, out, size); } -void GeluGradGradGPULauncher(const float * dy, const float * dy_, const float * in, float * out, int const size) { +void GeluGradGradLauncher(const float * dy, const float * dy_, const float * in, float * out, int const size) { int const THREAD_ITEMS = 1024; int const BLOCK_NUMS = (size 
+ THREAD_ITEMS - 1) / THREAD_ITEMS; gelu_grad_grad<<>>(dy, dy_, in, out, size); } -void GeluGradGradGPULauncher(const double * dy, const double * dy_, const double * in, double * out, int const size) { +void GeluGradGradLauncher(const double * dy, const double * dy_, const double * in, double * out, int const size) { int const THREAD_ITEMS = 1024; int const BLOCK_NUMS = (size + THREAD_ITEMS - 1) / THREAD_ITEMS; gelu_grad_grad<<>>(dy, dy_, in, out, size); } + +template +void GeluGPUExecuteFunctor::operator()(const T * in, T * out, int const size) { + int const THREAD_ITEMS = 1024; + int const BLOCK_NUMS = (size + THREAD_ITEMS - 1) / THREAD_ITEMS; + + gelu<<>>(in, out, size); +} + +template +void GeluGradGPUExecuteFunctor::operator()(const T * dy, const T * in, T * out, int const size) { + int const THREAD_ITEMS = 1024; + int const BLOCK_NUMS = (size + THREAD_ITEMS - 1) / THREAD_ITEMS; + + gelu_grad<<>>(dy, in, out, size); +} + +template +void GeluGradGradGPUExecuteFunctor::operator()(const T * dy, const T * dy_, const T * in, T * out, int const size) { + int const THREAD_ITEMS = 1024; + int const BLOCK_NUMS = (size + THREAD_ITEMS - 1) / THREAD_ITEMS; + + gelu_grad_grad<<>>(dy, dy_, in, out, size); +} + +template struct GeluGPUExecuteFunctor; +template struct GeluGPUExecuteFunctor; +template struct GeluGradGPUExecuteFunctor; +template struct GeluGradGPUExecuteFunctor; +template struct GeluGradGradGPUExecuteFunctor; +template struct GeluGradGradGPUExecuteFunctor; \ No newline at end of file diff --git a/source/op/cuda/prod_force_se_a.cu b/source/op/cuda/prod_force_se_a.cu index 926f30604b..ee826449af 100644 --- a/source/op/cuda/prod_force_se_a.cu +++ b/source/op/cuda/prod_force_se_a.cu @@ -1,16 +1,4 @@ -#include -#include -#include - -#define cudaErrcheck(res) { cudaAssert((res), __FILE__, __LINE__); } -inline void cudaAssert(cudaError_t code, const char *file, int line, bool abort=true) -{ - if (code != cudaSuccess) - { - fprintf(stderr,"cuda assert: %s %s %d\n", cudaGetErrorString(code), file, line); - if (abort) exit(code); - } -} +#include "DeviceFunctor.h" #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 600 static __inline__ __device__ double atomicAdd(double* address, double val) { @@ -69,9 +57,10 @@ __global__ void deriv_wrt_neighbors_se_a(T * force, atomicAdd(force + j_idx * 3 + idz, net_deriv[idx * ndescrpt + idy * 4 + idw] * in_deriv[idx * ndescrpt * 3 + (idy * 4 + idw) * 3 + idz]); } -void ProdForceSeAGPUExecuteLauncher(float * force, - const float * net_deriv, - const float * in_deriv, +template +void ProdForceSeAGPUExecuteFunctor::operator()(T * force, + const T * net_deriv, + const T * in_deriv, const int * nlist, const int nloc, const int nall, @@ -81,7 +70,7 @@ void ProdForceSeAGPUExecuteLauncher(float * force, const int n_a_shift) { // std::cout << "I'm here!" << std::endl; - cudaErrcheck(cudaMemset(force, 0.0, sizeof(float) * nall * 3)); + cudaErrcheck(cudaMemset(force, 0.0, sizeof(T) * nall * 3)); const int LEN1 = 256; const int nblock1 = (ndescrpt + LEN1 -1) / LEN1; dim3 grid(nloc, nblock1); @@ -95,28 +84,5 @@ void ProdForceSeAGPUExecuteLauncher(float * force, deriv_wrt_neighbors_se_a<<>>(force, net_deriv, in_deriv, nlist, nloc, nnei, ndescrpt, n_a_sel, n_a_shift); } -void ProdForceSeAGPUExecuteLauncher(double * force, - const double * net_deriv, - const double * in_deriv, - const int * nlist, - const int nloc, - const int nall, - const int nnei, - const int ndescrpt, - const int n_a_sel, - const int n_a_shift) -{ - // std::cout << "I'm here!" 
<< std::endl; - cudaErrcheck(cudaMemset(force, 0.0, sizeof(double) * nall * 3)); - const int LEN1 = 256; - const int nblock1 = (ndescrpt + LEN1 -1) / LEN1; - dim3 grid(nloc, nblock1); - dim3 thread(3, LEN1); - deriv_wrt_center_atom_se_a<<>>(force, net_deriv, in_deriv, ndescrpt); - - const int LEN = 64; - int nblock = (nloc + LEN -1) / LEN; - dim3 block_grid(nblock, nnei); - dim3 thread_grid(LEN, 3, 4); - deriv_wrt_neighbors_se_a<<>>(force, net_deriv, in_deriv, nlist, nloc, nnei, ndescrpt, n_a_sel, n_a_shift); -} \ No newline at end of file +template struct ProdForceSeAGPUExecuteFunctor; +template struct ProdForceSeAGPUExecuteFunctor; \ No newline at end of file diff --git a/source/op/cuda/prod_virial_se_a.cu b/source/op/cuda/prod_virial_se_a.cu index d2524a3c81..5825120970 100644 --- a/source/op/cuda/prod_virial_se_a.cu +++ b/source/op/cuda/prod_virial_se_a.cu @@ -1,13 +1,4 @@ -#include -#include - -#define cudaErrcheck(res) { cudaAssert((res), __FILE__, __LINE__); } -inline void cudaAssert(cudaError_t code, const char *file, int line, bool abort=true) { - if (code != cudaSuccess) { - fprintf(stderr,"cuda assert: %s %s %d\n", cudaGetErrorString(code), file, line); - if (abort) exit(code); - } -} +#include "DeviceFunctor.h" #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 600 static __inline__ __device__ double atomicAdd(double* address, double val) { @@ -57,11 +48,12 @@ __global__ void deriv_wrt_neighbors_se_a(T * virial, atomicAdd(atom_virial + j_idx * 9 + idz, net_deriv[idx * ndescrpt + idy * 4 + idw] * rij[idx * nnei * 3 + idy * 3 + idz % 3] * in_deriv[idx * ndescrpt * 3 + (idy * 4 + idw) * 3 + idz / 3]); } -void ProdVirialSeAGPUExecuteLauncher(float * virial, - float * atom_virial, - const float * net_deriv, - const float * in_deriv, - const float * rij, +template +void ProdVirialSeAGPUExecuteFunctor::operator()(T * virial, + T * atom_virial, + const T * net_deriv, + const T * in_deriv, + const T * rij, const int * nlist, const int nloc, const int nall, @@ -70,8 +62,8 @@ void ProdVirialSeAGPUExecuteLauncher(float * virial, const int n_a_sel, const int n_a_shift) { - cudaErrcheck(cudaMemset(virial, 0.0, sizeof(float) * 9)); - cudaErrcheck(cudaMemset(atom_virial, 0.0, sizeof(float) * 9 * nall)); + cudaErrcheck(cudaMemset(virial, 0.0, sizeof(T) * 9)); + cudaErrcheck(cudaMemset(atom_virial, 0.0, sizeof(T) * 9 * nall)); const int LEN = 16; int nblock = (nloc + LEN -1) / LEN; @@ -93,38 +85,5 @@ void ProdVirialSeAGPUExecuteLauncher(float * virial, ); } -void ProdVirialSeAGPUExecuteLauncher(double * virial, - double * atom_virial, - const double * net_deriv, - const double * in_deriv, - const double * rij, - const int * nlist, - const int nloc, - const int nall, - const int nnei, - const int ndescrpt, - const int n_a_sel, - const int n_a_shift) -{ - cudaErrcheck(cudaMemset(virial, 0.0, sizeof(double) * 9)); - cudaErrcheck(cudaMemset(atom_virial, 0.0, sizeof(double) * 9 * nall)); - - const int LEN = 16; - int nblock = (nloc + LEN -1) / LEN; - dim3 block_grid(nblock, nnei); - dim3 thread_grid(LEN, 9, 4); - // compute virial of a frame - deriv_wrt_neighbors_se_a<<>>( - virial, - atom_virial, - net_deriv, - in_deriv, - rij, - nlist, - nloc, - nnei, - ndescrpt, - n_a_sel, - n_a_shift - ); -} +template struct ProdVirialSeAGPUExecuteFunctor; +template struct ProdVirialSeAGPUExecuteFunctor; \ No newline at end of file diff --git a/source/op/descrpt.cc b/source/op/descrpt.cc index 75c7640b2b..c59ba817b1 100644 --- a/source/op/descrpt.cc +++ b/source/op/descrpt.cc @@ -18,48 +18,30 @@ typedef double 
VALUETYPE ; typedef float VALUETYPE ; #endif -#ifdef HIGH_PREC -REGISTER_OP("Descrpt") -.Input("coord: double") -.Input("type: int32") -.Input("natoms: int32") -.Input("box: double") -.Input("mesh: int32") -.Input("davg: double") -.Input("dstd: double") -.Attr("rcut_a: float") -.Attr("rcut_r: float") -.Attr("sel_a: list(int)") -.Attr("sel_r: list(int)") -.Attr("axis_rule: list(int)") -.Output("descrpt: double") -.Output("descrpt_deriv: double") -.Output("rij: double") -.Output("nlist: int32") -.Output("axis: int32") -.Output("rot_mat: double"); -#else +using CPUDevice = Eigen::ThreadPoolDevice; + REGISTER_OP("Descrpt") -.Input("coord: float") +.Attr("T: {float, double}") +.Input("coord: T") .Input("type: int32") .Input("natoms: int32") -.Input("box: float") +.Input("box: T") .Input("mesh: int32") -.Input("davg: float") -.Input("dstd: float") +.Input("davg: T") +.Input("dstd: T") .Attr("rcut_a: float") .Attr("rcut_r: float") .Attr("sel_a: list(int)") .Attr("sel_r: list(int)") .Attr("axis_rule: list(int)") -.Output("descrpt: float") -.Output("descrpt_deriv: float") -.Output("rij: float") +.Output("descrpt: T") +.Output("descrpt_deriv: T") +.Output("rij: T") .Output("nlist: int32") .Output("axis: int32") -.Output("rot_mat: float"); -#endif +.Output("rot_mat: T"); +template class DescrptOp : public OpKernel { public: explicit DescrptOp(OpKernelConstruction* context) : OpKernel(context) { @@ -182,18 +164,18 @@ class DescrptOp : public OpKernel { Tensor* rot_mat_tensor = NULL; OP_REQUIRES_OK(context, context->allocate_output(5, rot_mat_shape, &rot_mat_tensor)); - auto coord = coord_tensor .matrix(); + auto coord = coord_tensor .matrix(); auto type = type_tensor .matrix(); - auto box = box_tensor .matrix(); + auto box = box_tensor .matrix(); auto mesh = mesh_tensor .flat(); - auto avg = avg_tensor .matrix(); - auto std = std_tensor .matrix(); - auto descrpt = descrpt_tensor ->matrix(); - auto descrpt_deriv = descrpt_deriv_tensor ->matrix(); - auto rij = rij_tensor ->matrix(); + auto avg = avg_tensor .matrix(); + auto std = std_tensor .matrix(); + auto descrpt = descrpt_tensor ->matrix(); + auto descrpt_deriv = descrpt_deriv_tensor ->matrix(); + auto rij = rij_tensor ->matrix(); auto nlist = nlist_tensor ->matrix(); auto axis = axis_tensor ->matrix(); - auto rot_mat = rot_mat_tensor ->matrix(); + auto rot_mat = rot_mat_tensor ->matrix(); // // check the types // int max_type_v = 0; @@ -624,5 +606,9 @@ class DescrptOp : public OpKernel { } }; -REGISTER_KERNEL_BUILDER(Name("Descrpt").Device(DEVICE_CPU), DescrptOp); +#define REGISTER_CPU(T) \ +REGISTER_KERNEL_BUILDER( \ + Name("Descrpt").Device(DEVICE_CPU).TypeConstraint("T"), \ + DescrptOp); +REGISTER_CPU(VALUETYPE); diff --git a/source/op/descrpt_se_a.cc b/source/op/descrpt_se_a.cc index b970f6dbf0..3dca8040aa 100644 --- a/source/op/descrpt_se_a.cc +++ b/source/op/descrpt_se_a.cc @@ -18,44 +18,29 @@ typedef double VALUETYPE ; typedef float VALUETYPE ; #endif -#ifdef HIGH_PREC -REGISTER_OP("DescrptSeA") -.Input("coord: double") -.Input("type: int32") -.Input("natoms: int32") -.Input("box: double") -.Input("mesh: int32") -.Input("davg: double") -.Input("dstd: double") -.Attr("rcut_a: float") -.Attr("rcut_r: float") -.Attr("rcut_r_smth: float") -.Attr("sel_a: list(int)") -.Attr("sel_r: list(int)") -.Output("descrpt: double") -.Output("descrpt_deriv: double") -.Output("rij: double") -.Output("nlist: int32"); -#else +using CPUDevice = Eigen::ThreadPoolDevice; +using GPUDevice = Eigen::GpuDevice; + REGISTER_OP("DescrptSeA") -.Input("coord: float") 
-.Input("type: int32") -.Input("natoms: int32") -.Input("box: float") -.Input("mesh: int32") -.Input("davg: float") -.Input("dstd: float") -.Attr("rcut_a: float") -.Attr("rcut_r: float") -.Attr("rcut_r_smth: float") -.Attr("sel_a: list(int)") -.Attr("sel_r: list(int)") -.Output("descrpt: float") -.Output("descrpt_deriv: float") -.Output("rij: float") -.Output("nlist: int32"); -#endif + .Attr("T: {float, double}") + .Input("coord: T") //atomic coordinates + .Input("type: int32") //atomic type + .Input("natoms: int32") //local atomic number; each type atomic number; daizheyingxiangqude atomic numbers + .Input("box : T") + .Input("mesh : int32") + .Input("davg: T") //average value of data + .Input("dstd: T") //standard deviation + .Attr("rcut_a: float") //no use + .Attr("rcut_r: float") + .Attr("rcut_r_smth: float") + .Attr("sel_a: list(int)") + .Attr("sel_r: list(int)") //all zero + .Output("descrpt: T") + .Output("descrpt_deriv: T") + .Output("rij: T") + .Output("nlist: int32"); +template class DescrptSeAOp : public OpKernel { public: explicit DescrptSeAOp(OpKernelConstruction* context) : OpKernel(context) { @@ -180,15 +165,15 @@ class DescrptSeAOp : public OpKernel { nlist_shape, &nlist_tensor)); - auto coord = coord_tensor .matrix(); + auto coord = coord_tensor .matrix(); auto type = type_tensor .matrix(); - auto box = box_tensor .matrix(); + auto box = box_tensor .matrix(); auto mesh = mesh_tensor .flat(); - auto avg = avg_tensor .matrix(); - auto std = std_tensor .matrix(); - auto descrpt = descrpt_tensor ->matrix(); - auto descrpt_deriv = descrpt_deriv_tensor ->matrix(); - auto rij = rij_tensor ->matrix(); + auto avg = avg_tensor .matrix(); + auto std = std_tensor .matrix(); + auto descrpt = descrpt_tensor ->matrix(); + auto descrpt_deriv = descrpt_deriv_tensor ->matrix(); + auto rij = rij_tensor ->matrix(); auto nlist = nlist_tensor ->matrix(); // // check the types @@ -369,5 +354,9 @@ class DescrptSeAOp : public OpKernel { } }; -REGISTER_KERNEL_BUILDER(Name("DescrptSeA").Device(DEVICE_CPU), DescrptSeAOp); +#define REGISTER_CPU(T) \ +REGISTER_KERNEL_BUILDER( \ + Name("DescrptSeA").Device(DEVICE_CPU).TypeConstraint("T"), \ + DescrptSeAOp); +REGISTER_CPU(VALUETYPE); diff --git a/source/op/descrpt_se_a_gpu.cc b/source/op/descrpt_se_a_multi_device.cc similarity index 89% rename from source/op/descrpt_se_a_gpu.cc rename to source/op/descrpt_se_a_multi_device.cc index 9c52a62474..0069bb5375 100644 --- a/source/op/descrpt_se_a_gpu.cc +++ b/source/op/descrpt_se_a_multi_device.cc @@ -1,44 +1,25 @@ #include "common.h" #include "CustomeOperation.h" -#ifdef HIGH_PREC REGISTER_OP("DescrptSeA") - .Input("coord: double") //atomic coordinates + .Attr("T: {float, double}") + .Input("coord: T") //atomic coordinates .Input("type: int32") //atomic type .Input("natoms: int32") //local atomic number; each type atomic number; daizheyingxiangqude atomic numbers - .Input("box : double") + .Input("box : T") .Input("mesh : int32") - .Input("davg: double") //average value of data - .Input("dstd: double") //standard deviation + .Input("davg: T") //average value of data + .Input("dstd: T") //standard deviation .Attr("rcut_a: float") //no use .Attr("rcut_r: float") .Attr("rcut_r_smth: float") .Attr("sel_a: list(int)") .Attr("sel_r: list(int)") //all zero - .Output("descrpt: double") - .Output("descrpt_deriv: double") - .Output("rij: double") + .Output("descrpt: T") + .Output("descrpt_deriv: T") + .Output("rij: T") .Output("nlist: int32"); // only sel_a and rcut_r uesd. 
-#else -REGISTER_OP("DescrptSeA") - .Input("coord: float") - .Input("type: int32") - .Input("natoms: int32") - .Input("box : float") - .Input("mesh : int32") - .Input("davg: float") - .Input("dstd: float") - .Attr("rcut_a: float") - .Attr("rcut_r: float") - .Attr("rcut_r_smth: float") - .Attr("sel_a: list(int)") - .Attr("sel_r: list(int)") - .Output("descrpt: float") - .Output("descrpt_deriv: float") - .Output("rij: float") - .Output("nlist: int32"); -#endif int get_magic_number(int const nnei) { if (nnei <= 256) { @@ -58,7 +39,6 @@ int get_magic_number(int const nnei) { } } -template struct DeviceFunctor { void operator()(const CPUDevice& d, std::string& device) { device = "CPU"; @@ -70,7 +50,7 @@ struct DeviceFunctor { #endif // GOOGLE_CUDA }; -template +template struct DescrptSeAFunctor { void operator()(const CPUDevice& d, const T * coord, const int * type, const int * mesh, const int * ilist, const int * jrange, const int * jlist, int * array_int, unsigned long long * array_longlong, const T * avg, const T * std, T * descrpt, T * descrpt_deriv, T * rij, int * nlist, const int nloc, const int nall, const int nnei, const int ntypes, const int ndescrpt, const float rcut_r, const float rcut_r_smth, const std::vector sec_a, const bool fill_nei_a, const int magic_number) { DescrptSeACPULauncher(coord, type, ilist, jrange, jlist, avg, std, descrpt, descrpt_deriv, rij, nlist, nloc, nall, nnei, ntypes, ndescrpt, rcut_r, rcut_r_smth, sec_a, fill_nei_a, magic_number); @@ -80,10 +60,10 @@ struct DescrptSeAFunctor { void operator()(const GPUDevice& d, const T * coord, const int * type, const int * mesh, const int * ilist, const int * jrange, const int * jlist, int * array_int, unsigned long long * array_longlong, const T * avg, const T * std, T * descrpt, T * descrpt_deriv, T * rij, int * nlist, const int nloc, const int nall, const int nnei, const int ntypes, const int ndescrpt, const float rcut_r, const float rcut_r_smth, const std::vector sec_a, const bool fill_nei_a, const int magic_number) { DescrptSeAGPULauncher(coord, type, ilist, jrange, jlist, array_int, array_longlong, avg, std, descrpt, descrpt_deriv, rij, nlist, nloc, nall, nnei, ndescrpt, rcut_r, rcut_r_smth, sec_a, fill_nei_a, magic_number); } - #endif // GOOGLE_CUDA + #endif // GOOGLE_CUDA }; -template +template class DescrptSeAOp : public OpKernel { public: explicit DescrptSeAOp(OpKernelConstruction* context) : OpKernel(context) { @@ -130,7 +110,7 @@ class DescrptSeAOp : public OpKernel { OP_REQUIRES (context, (natoms_tensor.shape().dim_size(0) >= 3), errors::InvalidArgument ("number of atoms should be larger than (or equal to) 3")); - DeviceFunctor() ( + DeviceFunctor() ( context->eigen_device(), device ); @@ -211,9 +191,9 @@ class DescrptSeAOp : public OpKernel { memcpy (&jlist, 12 + mesh_tensor.flat().data(), sizeof(int *)); } - DescrptSeAFunctor()( + DescrptSeAFunctor()( context->eigen_device(), // define actually graph execution device - coord_tensor.matrix().data(), // related to the kk argument + coord_tensor.matrix().data(), // related to the kk argument type_tensor.matrix().data(), // also related to the kk argument mesh_tensor.flat().data(), ilist, @@ -221,11 +201,11 @@ class DescrptSeAOp : public OpKernel { jlist, array_int, array_longlong, - avg_tensor.matrix().data(), - std_tensor.matrix().data(), - descrpt_tensor->matrix().data(), - descrpt_deriv_tensor->matrix().data(), - rij_tensor->matrix().data(), + avg_tensor.matrix().data(), + std_tensor.matrix().data(), + descrpt_tensor->matrix().data(), + 
descrpt_deriv_tensor->matrix().data(), + rij_tensor->matrix().data(), nlist_tensor->matrix().data(), nloc, nall, @@ -310,17 +290,18 @@ class DescrptSeAOp : public OpKernel { }; // Register the CPU kernels. -#define REGISTER_CPU() \ -REGISTER_KERNEL_BUILDER( \ - Name("DescrptSeA").Device(DEVICE_CPU), \ - DescrptSeAOp); -REGISTER_CPU(); - +#define REGISTER_CPU(T) \ +REGISTER_KERNEL_BUILDER( \ + Name("DescrptSeA").Device(DEVICE_CPU).TypeConstraint("T"), \ + DescrptSeAOp); +REGISTER_CPU(float); +REGISTER_CPU(double); // Register the GPU kernels. #if GOOGLE_CUDA -#define REGISTER_GPU() \ -REGISTER_KERNEL_BUILDER( \ - Name("DescrptSeA").Device(DEVICE_GPU).HostMemory("natoms"), \ - DescrptSeAOp); -REGISTER_GPU(); +#define REGISTER_GPU(T) \ +REGISTER_KERNEL_BUILDER( \ + Name("DescrptSeA").Device(DEVICE_GPU).TypeConstraint("T").HostMemory("natoms"), \ + DescrptSeAOp); +REGISTER_GPU(float); +REGISTER_GPU(double); #endif // GOOGLE_CUDA \ No newline at end of file diff --git a/source/op/descrpt_se_r.cc b/source/op/descrpt_se_r.cc index 6798df503c..7a8dbb1541 100644 --- a/source/op/descrpt_se_r.cc +++ b/source/op/descrpt_se_r.cc @@ -18,39 +18,26 @@ typedef double VALUETYPE ; typedef float VALUETYPE ; #endif +using CPUDevice = Eigen::ThreadPoolDevice; + REGISTER_OP("DescrptSeR") -#ifdef HIGH_PREC -.Input("coord: double") -.Input("type: int32") -.Input("natoms: int32") -.Input("box: double") -.Input("mesh: int32") -.Input("davg: double") -.Input("dstd: double") -.Attr("rcut: float") -.Attr("rcut_smth: float") -.Attr("sel: list(int)") -.Output("descrpt: double") -.Output("descrpt_deriv: double") -.Output("rij: double") -.Output("nlist: int32"); -#else -.Input("coord: float") +.Attr("T: {float, double}") +.Input("coord: T") .Input("type: int32") .Input("natoms: int32") -.Input("box: float") +.Input("box: T") .Input("mesh: int32") -.Input("davg: float") -.Input("dstd: float") +.Input("davg: T") +.Input("dstd: T") .Attr("rcut: float") .Attr("rcut_smth: float") .Attr("sel: list(int)") -.Output("descrpt: float") -.Output("descrpt_deriv: float") -.Output("rij: float") +.Output("descrpt: T") +.Output("descrpt_deriv: T") +.Output("rij: T") .Output("nlist: int32"); -#endif +template class DescrptSeROp : public OpKernel { public: explicit DescrptSeROp(OpKernelConstruction* context) : OpKernel(context) { @@ -169,15 +156,15 @@ class DescrptSeROp : public OpKernel { nlist_shape, &nlist_tensor)); - auto coord = coord_tensor .matrix(); + auto coord = coord_tensor .matrix(); auto type = type_tensor .matrix(); - auto box = box_tensor .matrix(); + auto box = box_tensor .matrix(); auto mesh = mesh_tensor .flat(); - auto avg = avg_tensor .matrix(); - auto std = std_tensor .matrix(); - auto descrpt = descrpt_tensor ->matrix(); - auto descrpt_deriv = descrpt_deriv_tensor ->matrix(); - auto rij = rij_tensor ->matrix(); + auto avg = avg_tensor .matrix(); + auto std = std_tensor .matrix(); + auto descrpt = descrpt_tensor ->matrix(); + auto descrpt_deriv = descrpt_deriv_tensor ->matrix(); + auto rij = rij_tensor ->matrix(); auto nlist = nlist_tensor ->matrix(); OP_REQUIRES (context, (ntypes == int(sel.size())), errors::InvalidArgument ("number of types should match the length of sel array")); @@ -351,5 +338,9 @@ class DescrptSeROp : public OpKernel { } }; -REGISTER_KERNEL_BUILDER(Name("DescrptSeR").Device(DEVICE_CPU), DescrptSeROp); +#define REGISTER_CPU(T) \ +REGISTER_KERNEL_BUILDER( \ + Name("DescrptSeR").Device(DEVICE_CPU).TypeConstraint("T"), \ + DescrptSeROp); +REGISTER_CPU(VALUETYPE); diff --git 
a/source/op/descrpt_se_r_gpu.cc b/source/op/descrpt_se_r_gpu.cc index 65e2682ef0..bf2f5b0b12 100644 --- a/source/op/descrpt_se_r_gpu.cc +++ b/source/op/descrpt_se_r_gpu.cc @@ -31,8 +31,8 @@ inline void cudaAssert(cudaError_t code, const char *file, int line, bool abort= using GPUDevice = Eigen::GpuDevice; -#ifdef HIGH_PREC REGISTER_OP("DescrptSeR") + .Attr("T: {float, double}") .Input("coord: double") .Input("type: int32") .Input("natoms: int32") @@ -47,23 +47,7 @@ REGISTER_OP("DescrptSeR") .Output("descrpt_deriv: double") .Output("rij: double") .Output("nlist: int32"); -#else -REGISTER_OP("DescrptSeR") - .Input("coord: float") - .Input("type: int32") - .Input("natoms: int32") - .Input("box: float") - .Input("mesh: int32") - .Input("davg: float") - .Input("dstd: float") - .Attr("rcut: float") - .Attr("rcut_smth: float") - .Attr("sel: list(int)") - .Output("descrpt: float") - .Output("descrpt_deriv: float") - .Output("rij: float") - .Output("nlist: int32"); -#endif + void DescrptSeRLauncher(const VALUETYPE* coord, const int* type, @@ -90,6 +74,7 @@ void DescrptSeRLauncher(const VALUETYPE* coord, const bool& fill_nei_a ); +template class DescrptSeROp : public OpKernel { public: explicit DescrptSeROp(OpKernelConstruction* context) : OpKernel(context) { @@ -193,7 +178,7 @@ class DescrptSeROp : public OpKernel { // cudaErrcheck(cudaMemcpy(jlist, host_jlist, sizeof(int) * nloc * MAGIC_NUMBER, cudaMemcpyHostToDevice)); // Launch computation for (int II = 0; II < nsamples; II++) { - DescrptSeRLauncher( coord_tensor.matrix().data() + II * (nall * 3), + DescrptSeRLauncher( coord_tensor.matrix().data() + II * (nall * 3), type_tensor.matrix().data() + II * nall, ilist, jrange, @@ -201,11 +186,11 @@ class DescrptSeROp : public OpKernel { array_int, array_longlong, array_double, - avg_tensor.matrix().data(), - std_tensor.matrix().data(), - descrpt_tensor->matrix().data() + II * (nloc * ndescrpt), - descrpt_deriv_tensor->matrix().data() + II * (nloc * ndescrpt * 3), - rij_tensor->matrix().data() + II * (nloc * nnei * 3), + avg_tensor.matrix().data(), + std_tensor.matrix().data(), + descrpt_tensor->matrix().data() + II * (nloc * ndescrpt), + descrpt_deriv_tensor->matrix().data() + II * (nloc * ndescrpt * 3), + rij_tensor->matrix().data() + II * (nloc * nnei * 3), nlist_tensor->matrix().data() + II * (nloc * nnei), ntypes, nloc, @@ -244,4 +229,8 @@ class DescrptSeROp : public OpKernel { } }; -REGISTER_KERNEL_BUILDER(Name("DescrptSeR").Device(DEVICE_GPU), DescrptSeROp); \ No newline at end of file +#define REGISTER_GPU(T) \ +REGISTER_KERNEL_BUILDER( \ + Name("DescrptSeR").Device(DEVICE_GPU).TypeConstraint("T"), \ + DescrptSeROp); +REGISTER_GPU(VALUETYPE); \ No newline at end of file diff --git a/source/op/ewald_recp.cc b/source/op/ewald_recp.cc index 29daaa53c8..3833de1f7b 100644 --- a/source/op/ewald_recp.cc +++ b/source/op/ewald_recp.cc @@ -16,30 +16,21 @@ typedef double VALUETYPE ; typedef float VALUETYPE ; #endif -#ifdef HIGH_PREC -REGISTER_OP("EwaldRecp") -.Input("coord: double") -.Input("charge: double") -.Input("natoms: int32") -.Input("box: double") -.Attr("ewald_beta: float") -.Attr("ewald_h: float") -.Output("energy: double") -.Output("force: double") -.Output("virial: double"); -#else +using CPUDevice = Eigen::ThreadPoolDevice; + REGISTER_OP("EwaldRecp") -.Input("coord: float") -.Input("charge: float") +.Attr("T: {float, double}") +.Input("coord: T") +.Input("charge: T") .Input("natoms: int32") -.Input("box: float") +.Input("box: T") .Attr("ewald_beta: float") .Attr("ewald_h: float") 
-.Output("energy: float") -.Output("force: float") -.Output("virial: float"); -#endif +.Output("energy: T") +.Output("force: T") +.Output("virial: T"); +template class EwaldRecpOp : public OpKernel { public: explicit EwaldRecpOp(OpKernelConstruction* context) : OpKernel(context) { @@ -90,12 +81,12 @@ class EwaldRecpOp : public OpKernel { Tensor* virial_tensor = NULL; OP_REQUIRES_OK(context, context->allocate_output(cc++, virial_shape, &virial_tensor)); - auto coord = coord_tensor .flat(); - auto charge = charge_tensor .flat(); - auto box = box_tensor .flat(); - auto energy = energy_tensor ->flat(); - auto force = force_tensor ->matrix(); - auto virial = virial_tensor ->matrix(); + auto coord = coord_tensor .flat(); + auto charge = charge_tensor .flat(); + auto box = box_tensor .flat(); + auto energy = energy_tensor ->flat(); + auto force = force_tensor ->matrix(); + auto virial = virial_tensor ->matrix(); for (int kk = 0; kk < nsamples; ++kk){ int box_iter = kk * 9; @@ -122,19 +113,19 @@ class EwaldRecpOp : public OpKernel { else if (inter[dd] >= 1) inter[dd] -= 1.; } } - vector d_coord3 (nloc*3); + vector d_coord3 (nloc*3); for (int ii = 0; ii < nloc * 3; ++ii) { d_coord3[ii] = d_coord3_[ii]; } // set charge - vector d_charge (nloc); + vector d_charge (nloc); for (int ii = 0; ii < nloc; ++ii) d_charge[ii] = charge(charge_iter + ii); // prepare outputs vectors - VALUETYPE d_ener; - vector d_force(nloc*3); - vector d_virial(9); + T d_ener; + vector d_force(nloc*3); + vector d_virial(9); // compute EwaldReciprocal(d_ener, d_force, d_virial, d_coord3, d_charge, region, ep); @@ -150,8 +141,11 @@ class EwaldRecpOp : public OpKernel { } } private: - EwaldParameters ep; + EwaldParameters ep; }; -REGISTER_KERNEL_BUILDER(Name("EwaldRecp").Device(DEVICE_CPU), EwaldRecpOp); - +#define REGISTER_CPU(T) \ +REGISTER_KERNEL_BUILDER( \ + Name("EwaldRecp").Device(DEVICE_CPU).TypeConstraint("T"), \ + EwaldRecpOp); +REGISTER_CPU(VALUETYPE); diff --git a/source/op/gelu_gpu.cc b/source/op/gelu_gpu.cc index 8aece7e384..34d4183f98 100644 --- a/source/op/gelu_gpu.cc +++ b/source/op/gelu_gpu.cc @@ -1,4 +1,3 @@ -#include #include "tensorflow/core/framework/op.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/register_types.h" @@ -8,7 +7,6 @@ using namespace tensorflow; using CPUDevice = Eigen::ThreadPoolDevice; using GPUDevice = Eigen::GpuDevice; -#define SQRT_2_PI 0.7978845608028654 REGISTER_OP("Gelu") .Attr("T: {float, double}") .Input("x: T") @@ -27,76 +25,35 @@ REGISTER_OP("GeluGradGrad") .Input("x: T") .Output("output: T"); -#if GOOGLE_CUDA // maybe instead use cudnn activation forward -void GeluGPULauncher(const float * in, float * out, int const size); -void GeluGPULauncher(const double * in, double * out, int const size); +void GeluLauncher(const float * in, float * out, int const size); +void GeluLauncher(const double * in, double * out, int const size); -void GeluGradGPULauncher(const float * dy, const float * in, float * out, int const size); -void GeluGradGPULauncher(const double * dy, const double * in, double * out, int const size); +void GeluGradLauncher(const float * dy, const float * in, float * out, int const size); +void GeluGradLauncher(const double * dy, const double * in, double * out, int const size); -void GeluGradGradGPULauncher(const float * dy, const float * dy_, const float * in, float * out, int const size); -void GeluGradGradGPULauncher(const double * dy, const double * dy_, const double * in, double * out, int const size); -#endif // GOOGLE_CUDA - 
-template -void GeluCPULauncher(const T * in, T * out, int const size) { - for (int ii = 0; ii < size; ii++) { - out[ii] = in[ii] * 0.5 * (1.0 + tanh(SQRT_2_PI * (in[ii] + 0.044715 * in[ii] * in[ii] *in[ii]))); - } -} - -template -void GeluGradCPULauncher(const T * dy, const T * in, T * out, int const size) { - for (int ii = 0; ii < size; ii++) { - T const var1 = tanh(SQRT_2_PI * (in[ii] + 0.044715 * in[ii] * in[ii] *in[ii])); - out[ii] = dy[ii] * (0.5 * SQRT_2_PI * in[ii] * (1 - var1 * var1) * (0.134145 * in[ii] * in[ii] + 1) + 0.5 * var1 + 0.5); - } -} - -template -void GeluGradGradCPULauncher(const T * dy, const T * dy_, const T * in, T * out, int const size) { - for (int ii = 0; ii < size; ii++) { - T const var1 = tanh(SQRT_2_PI * (in[ii] + 0.044715 * in[ii] * in[ii] *in[ii])); - T const var2 = SQRT_2_PI * (1 - var1 * var1) * (0.134145 * in[ii] * in[ii] + 1); - out[ii] = dy[ii] * dy_[ii] * (0.134145 * SQRT_2_PI * in[ii] * in[ii] * (1 - var1 * var1) - SQRT_2_PI * in[ii] * var2 * (0.134145 * in[ii] * in[ii] + 1) * var1 + var2); - } -} +void GeluGradGradLauncher(const float * dy, const float * dy_, const float * in, float * out, int const size); +void GeluGradGradLauncher(const double * dy, const double * dy_, const double * in, double * out, int const size); template struct GeluFunctor { - void operator()(const CPUDevice& d, const T * in, T * out, int const size) { - GeluCPULauncher(in, out, size); + void operator()(const Device& d, const T * in, T * out, int const size) { + GeluLauncher(in, out, size); } - #if GOOGLE_CUDA - void operator()(const GPUDevice& d, const T * in, T * out, int const size) { - GeluGPULauncher(in, out, size); - } - #endif }; template struct GeluGradFunctor { - void operator()(const CPUDevice& d, const T * dy, const T * in, T * out, int const size) { - GeluGradCPULauncher(dy, in, out, size); + void operator()(const Device& d, const T * dy, const T * in, T * out, int const size) { + GeluGradLauncher(dy, in, out, size); } - #if GOOGLE_CUDA - void operator()(const GPUDevice& d, const T * dy, const T * in, T * out, int const size) { - GeluGradGPULauncher(dy, in, out, size); - } - #endif }; template struct GeluGradGradFunctor { - void operator()(const CPUDevice& d, const T * dy, const T * dy_, const T * in, T * out, int const size) { - GeluGradGradCPULauncher(dy, dy_, in, out, size); - } - #if GOOGLE_CUDA - void operator()(const GPUDevice& d, const T * dy, const T * dy_, const T * in, T * out, int const size) { - GeluGradGradGPULauncher(dy, dy_, in, out, size); + void operator()(const Device& d, const T * dy, const T * dy_, const T * in, T * out, int const size) { + GeluGradGradLauncher(dy, dy_, in, out, size); } - #endif }; // OpKernel definition. @@ -121,6 +78,7 @@ class GeluOp : public OpKernel { output->flat().data(), static_cast(output->NumElements()) ); + // GeluLauncher(x.flat().data(), output->flat().data(), static_cast(output->NumElements())); } }; @@ -149,6 +107,7 @@ class GeluGradOp : public OpKernel { output->flat().data(), static_cast(output->NumElements()) ); + // GeluGradLauncher(dy.flat().data(), x.flat().data(), output->flat().data(), static_cast(output->NumElements())); } }; @@ -179,27 +138,11 @@ class GeluGradGradOp : public OpKernel { output->flat().data(), static_cast(output->NumElements()) ); + // GeluGradGradLauncher(dy.flat().data(), x.flat().data(), output->flat().data(), static_cast(output->NumElements())); } }; -#define REGISTER_CPU(T) \ -/* Declare explicit instantiations in kernel_example.cu.cc. 
*/ \ -REGISTER_KERNEL_BUILDER( \ - Name("Gelu").Device(DEVICE_CPU).TypeConstraint("T"), \ - GeluOp); \ -/* Declare explicit instantiations in kernel_example.cu.cc. */ \ -REGISTER_KERNEL_BUILDER( \ - Name("GeluGrad").Device(DEVICE_CPU).TypeConstraint("T"), \ - GeluGradOp); \ -/* Declare explicit instantiations in kernel_example.cu.cc. */ \ -REGISTER_KERNEL_BUILDER( \ - Name("GeluGradGrad").Device(DEVICE_CPU).TypeConstraint("T"), \ - GeluGradGradOp); -REGISTER_CPU(float); -REGISTER_CPU(double); - -#if GOOGLE_CUDA -#define REGISTER_GPU(T) \ +#define REGISTER_GPU(T) \ /* Declare explicit instantiations in kernel_example.cu.cc. */ \ REGISTER_KERNEL_BUILDER( \ Name("Gelu").Device(DEVICE_GPU).TypeConstraint("T"), \ @@ -214,4 +157,3 @@ REGISTER_CPU(double); GeluGradGradOp); REGISTER_GPU(float); REGISTER_GPU(double); -#endif // GOOGLE_CUDA \ No newline at end of file diff --git a/source/op/gelu_multi_device.cc b/source/op/gelu_multi_device.cc new file mode 100644 index 0000000000..2a5656e6d7 --- /dev/null +++ b/source/op/gelu_multi_device.cc @@ -0,0 +1,167 @@ +#include "common.h" +#include "CustomeOperation.h" + +REGISTER_OP("Gelu") + .Attr("T: {float, double}") + .Input("x: T") + .Output("output: T"); + +REGISTER_OP("GeluGrad") + .Attr("T: {float, double}") + .Input("dy: T") + .Input("x: T") + .Output("output: T"); + +REGISTER_OP("GeluGradGrad") + .Attr("T: {float, double}") + .Input("dy: T") + .Input("dy_: T") + .Input("x: T") + .Output("output: T"); + +template +struct GeluFunctor { + void operator()(const CPUDevice& d, const T * in, T * out, int const size) { + GeluCPULauncher(in, out, size); + } + #if GOOGLE_CUDA + void operator()(const GPUDevice& d, const T * in, T * out, int const size) { + GeluGPULauncher(in, out, size); + } + #endif +}; + +template +struct GeluGradFunctor { + void operator()(const CPUDevice& d, const T * dy, const T * in, T * out, int const size) { + GeluGradCPULauncher(dy, in, out, size); + } + #if GOOGLE_CUDA + void operator()(const GPUDevice& d, const T * dy, const T * in, T * out, int const size) { + GeluGradGPULauncher(dy, in, out, size); + } + #endif +}; + +template +struct GeluGradGradFunctor { + void operator()(const CPUDevice& d, const T * dy, const T * dy_, const T * in, T * out, int const size) { + GeluGradGradCPULauncher(dy, dy_, in, out, size); + } + #if GOOGLE_CUDA + void operator()(const GPUDevice& d, const T * dy, const T * dy_, const T * in, T * out, int const size) { + GeluGradGradGPULauncher(dy, dy_, in, out, size); + } + #endif +}; + +// OpKernel definition. +// template parameter is the datatype of the tensors. +template +class GeluOp : public OpKernel { + public : + explicit GeluOp(OpKernelConstruction* context) : OpKernel(context) {} + + void Compute(OpKernelContext* context) override { + // Grab the input tensor + const Tensor& x = context->input(0); + Tensor * output = NULL; + int context_output_index = 0; + OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, + x.shape(), + &output)); + + GeluFunctor()( + context->eigen_device(), + x.flat().data(), + output->flat().data(), + static_cast(output->NumElements()) + ); + } +}; + +// OpKernel definition. +// template parameter is the datatype of the tensors. 
+template +class GeluGradOp : public OpKernel { + public : + explicit GeluGradOp(OpKernelConstruction* context) : OpKernel(context) {} + + void Compute(OpKernelContext* context) override { + // Grab the input tensor + const Tensor& dy = context->input(0); + const Tensor& x = context->input(1); + + Tensor * output = NULL; + int context_output_index = 0; + OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, + x.shape(), + &output)); + + GeluGradFunctor()( + context->eigen_device(), + dy.flat().data(), + x.flat().data(), + output->flat().data(), + static_cast(output->NumElements()) + ); + } +}; + +// OpKernel definition. +// template parameter is the datatype of the tensors. +template +class GeluGradGradOp : public OpKernel { + public : + explicit GeluGradGradOp(OpKernelConstruction* context) : OpKernel(context) {} + + void Compute(OpKernelContext* context) override { + // Grab the input tensor + const Tensor& dy = context->input(0); + const Tensor& dy_ = context->input(1); + const Tensor& x = context->input(2); + + Tensor * output = NULL; + int context_output_index = 0; + OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, + x.shape(), + &output)); + + GeluGradGradFunctor()( + context->eigen_device(), + dy.flat().data(), + dy_.flat().data(), + x.flat().data(), + output->flat().data(), + static_cast(output->NumElements()) + ); + } +}; + +#define REGISTER_CPU(T) \ +REGISTER_KERNEL_BUILDER( \ + Name("Gelu").Device(DEVICE_CPU).TypeConstraint("T"), \ + GeluOp); \ +REGISTER_KERNEL_BUILDER( \ + Name("GeluGrad").Device(DEVICE_CPU).TypeConstraint("T"), \ + GeluGradOp); \ +REGISTER_KERNEL_BUILDER( \ + Name("GeluGradGrad").Device(DEVICE_CPU).TypeConstraint("T"), \ + GeluGradGradOp); +REGISTER_CPU(float); +REGISTER_CPU(double); + +#if GOOGLE_CUDA +#define REGISTER_GPU(T) \ +REGISTER_KERNEL_BUILDER( \ + Name("Gelu").Device(DEVICE_GPU).TypeConstraint("T"), \ + GeluOp); \ +REGISTER_KERNEL_BUILDER( \ + Name("GeluGrad").Device(DEVICE_GPU).TypeConstraint("T"), \ + GeluGradOp); \ +REGISTER_KERNEL_BUILDER( \ + Name("GeluGradGrad").Device(DEVICE_GPU).TypeConstraint("T"), \ + GeluGradGradOp); +REGISTER_GPU(float); +REGISTER_GPU(double); +#endif // GOOGLE_CUDA \ No newline at end of file diff --git a/source/op/prod_force.cc b/source/op/prod_force.cc index e41b908d41..e1c6c362c8 100644 --- a/source/op/prod_force.cc +++ b/source/op/prod_force.cc @@ -12,30 +12,23 @@ typedef double VALUETYPE; typedef float VALUETYPE; #endif -#ifdef HIGH_PREC -REGISTER_OP("ProdForce") -.Input("net_deriv: double") -.Input("in_deriv: double") -.Input("nlist: int32") -.Input("axis: int32") -.Input("natoms: int32") -.Attr("n_a_sel: int") -.Attr("n_r_sel: int") -.Output("force: double"); -#else REGISTER_OP("ProdForce") -.Input("net_deriv: float") -.Input("in_deriv: float") +.Attr("T: {float, double}") +.Input("net_deriv: T") +.Input("in_deriv: T") .Input("nlist: int32") .Input("axis: int32") .Input("natoms: int32") .Attr("n_a_sel: int") .Attr("n_r_sel: int") -.Output("force: float"); -#endif +.Output("force: T"); + using namespace tensorflow; +using CPUDevice = Eigen::ThreadPoolDevice; + +template class ProdForceOp : public OpKernel { public: explicit ProdForceOp(OpKernelConstruction* context) : OpKernel(context) { @@ -86,11 +79,11 @@ class ProdForceOp : public OpKernel { OP_REQUIRES_OK(context, context->allocate_output(0, force_shape, &force_tensor)); // flat the tensors - auto net_deriv = net_deriv_tensor.flat(); - auto in_deriv = in_deriv_tensor.flat(); + auto net_deriv = net_deriv_tensor.flat(); 
+ auto in_deriv = in_deriv_tensor.flat(); auto nlist = nlist_tensor.flat(); auto axis = axis_tensor.flat(); - auto force = force_tensor->flat(); + auto force = force_tensor->flat(); // loop over samples #pragma omp parallel for @@ -176,7 +169,10 @@ class ProdForceOp : public OpKernel { } }; -REGISTER_KERNEL_BUILDER(Name("ProdForce").Device(DEVICE_CPU), ProdForceOp); - +#define REGISTER_CPU(T) \ +REGISTER_KERNEL_BUILDER( \ + Name("ProdForce").Device(DEVICE_CPU).TypeConstraint("T"), \ + ProdForceOp); +REGISTER_CPU(VALUETYPE); diff --git a/source/op/prod_force_grad.cc b/source/op/prod_force_grad.cc index 14e884c5c9..48f55a977b 100644 --- a/source/op/prod_force_grad.cc +++ b/source/op/prod_force_grad.cc @@ -12,30 +12,21 @@ typedef double VALUETYPE; typedef float VALUETYPE; #endif -#ifdef HIGH_PREC -REGISTER_OP("ProdForceGrad") -.Input("grad: double") -.Input("net_deriv: double") -.Input("in_deriv: double") -.Input("nlist: int32") -.Input("axis: int32") -.Input("natoms: int32") -.Attr("n_a_sel: int") -.Attr("n_r_sel: int") -.Output("grad_net: double"); -#else REGISTER_OP("ProdForceGrad") -.Input("grad: float") -.Input("net_deriv: float") -.Input("in_deriv: float") +.Attr("T: {float, double}") +.Input("grad: T") +.Input("net_deriv: T") +.Input("in_deriv: T") .Input("nlist: int32") .Input("axis: int32") .Input("natoms: int32") .Attr("n_a_sel: int") .Attr("n_r_sel: int") -.Output("grad_net: float"); -#endif +.Output("grad_net: T"); + +using CPUDevice = Eigen::ThreadPoolDevice; +template class ProdForceGradOp : public OpKernel { public: @@ -97,12 +88,12 @@ class ProdForceGradOp : public OpKernel OP_REQUIRES_OK(context, context->allocate_output(0, grad_net_shape, &grad_net_tensor)); // flat the tensors - auto grad = grad_tensor .flat(); - auto net_deriv = net_deriv_tensor .flat(); - auto in_deriv = in_deriv_tensor .flat(); + auto grad = grad_tensor .flat(); + auto net_deriv = net_deriv_tensor .flat(); + auto in_deriv = in_deriv_tensor .flat(); auto nlist = nlist_tensor .flat(); auto axis = axis_tensor .flat(); - auto grad_net = grad_net_tensor ->flat(); + auto grad_net = grad_net_tensor ->flat(); // loop over frames #pragma omp parallel for @@ -190,4 +181,8 @@ class ProdForceGradOp : public OpKernel } }; -REGISTER_KERNEL_BUILDER(Name("ProdForceGrad").Device(DEVICE_CPU), ProdForceGradOp); +#define REGISTER_CPU(T) \ +REGISTER_KERNEL_BUILDER( \ + Name("ProdForceGrad").Device(DEVICE_CPU).TypeConstraint("T"), \ + ProdForceGradOp); +REGISTER_CPU(VALUETYPE); \ No newline at end of file diff --git a/source/op/prod_force_se_a.cc b/source/op/prod_force_se_a.cc index af0e712492..c762bf71fd 100644 --- a/source/op/prod_force_se_a.cc +++ b/source/op/prod_force_se_a.cc @@ -12,28 +12,23 @@ typedef double VALUETYPE; typedef float VALUETYPE; #endif -#ifdef HIGH_PREC -REGISTER_OP("ProdForceSeA") -.Input("net_deriv: double") -.Input("in_deriv: double") -.Input("nlist: int32") -.Input("natoms: int32") -.Attr("n_a_sel: int") -.Attr("n_r_sel: int") -.Output("force: double"); -#else REGISTER_OP("ProdForceSeA") -.Input("net_deriv: float") -.Input("in_deriv: float") +.Attr("T: {float, double}") +.Input("net_deriv: T") +.Input("in_deriv: T") .Input("nlist: int32") .Input("natoms: int32") .Attr("n_a_sel: int") .Attr("n_r_sel: int") -.Output("force: float"); -#endif +.Output("force: T"); + using namespace tensorflow; +using CPUDevice = Eigen::ThreadPoolDevice; +using GPUDevice = Eigen::GpuDevice; + +template class ProdForceSeAOp : public OpKernel { public: explicit ProdForceSeAOp(OpKernelConstruction* context) : 
OpKernel(context) { @@ -83,10 +78,10 @@ class ProdForceSeAOp : public OpKernel { force_shape, &force_tensor)); // flat the tensors - auto net_deriv = net_deriv_tensor.flat(); - auto in_deriv = in_deriv_tensor.flat(); + auto net_deriv = net_deriv_tensor.flat(); + auto in_deriv = in_deriv_tensor.flat(); auto nlist = nlist_tensor.flat(); - auto force = force_tensor->flat(); + auto force = force_tensor->flat(); assert (nframes == force_shape.dim_size(0)); assert (nframes == net_deriv_tensor.shape().dim_size(0)); @@ -155,7 +150,10 @@ class ProdForceSeAOp : public OpKernel { } }; -REGISTER_KERNEL_BUILDER(Name("ProdForceSeA").Device(DEVICE_CPU), ProdForceSeAOp); - - +// Register the CPU kernels. +#define REGISTER_CPU(T) \ +REGISTER_KERNEL_BUILDER( \ + Name("ProdForceSeA").Device(DEVICE_CPU).TypeConstraint("T"), \ + ProdForceSeAOp); +REGISTER_CPU(VALUETYPE); diff --git a/source/op/prod_force_se_a_grad.cc b/source/op/prod_force_se_a_grad.cc index eda965974a..d884782f17 100644 --- a/source/op/prod_force_se_a_grad.cc +++ b/source/op/prod_force_se_a_grad.cc @@ -12,28 +12,20 @@ typedef double VALUETYPE; typedef float VALUETYPE; #endif -#ifdef HIGH_PREC REGISTER_OP("ProdForceSeAGrad") -.Input("grad: double") -.Input("net_deriv: double") -.Input("in_deriv: double") +.Attr("T: {float, double}") +.Input("grad: T") +.Input("net_deriv: T") +.Input("in_deriv: T") .Input("nlist: int32") .Input("natoms: int32") .Attr("n_a_sel: int") .Attr("n_r_sel: int") -.Output("grad_net: double"); -#else -REGISTER_OP("ProdForceSeAGrad") -.Input("grad: float") -.Input("net_deriv: float") -.Input("in_deriv: float") -.Input("nlist: int32") -.Input("natoms: int32") -.Attr("n_a_sel: int") -.Attr("n_r_sel: int") -.Output("grad_net: float"); -#endif +.Output("grad_net: T"); + +using CPUDevice = Eigen::ThreadPoolDevice; +template class ProdForceSeAGradOp : public OpKernel { public: @@ -158,4 +150,8 @@ class ProdForceSeAGradOp : public OpKernel } }; -REGISTER_KERNEL_BUILDER(Name("ProdForceSeAGrad").Device(DEVICE_CPU), ProdForceSeAGradOp); +#define REGISTER_CPU(T) \ +REGISTER_KERNEL_BUILDER( \ + Name("ProdForceSeAGrad").Device(DEVICE_CPU).TypeConstraint("T"), \ + ProdForceSeAGradOp); +REGISTER_CPU(VALUETYPE); \ No newline at end of file diff --git a/source/op/prod_force_se_a_gpu.cc b/source/op/prod_force_se_a_multi_device.cc similarity index 80% rename from source/op/prod_force_se_a_gpu.cc rename to source/op/prod_force_se_a_multi_device.cc index c5c6c5c6b7..a864617208 100644 --- a/source/op/prod_force_se_a_gpu.cc +++ b/source/op/prod_force_se_a_multi_device.cc @@ -1,27 +1,17 @@ #include "common.h" #include "CustomeOperation.h" -#ifdef HIGH_PREC REGISTER_OP("ProdForceSeA") - .Input("net_deriv: double") - .Input("in_deriv: double") + .Attr("T: {float, double}") + .Input("net_deriv: T") + .Input("in_deriv: T") .Input("nlist: int32") .Input("natoms: int32") .Attr("n_a_sel: int") .Attr("n_r_sel: int") - .Output("force: double"); -#else -REGISTER_OP("ProdForceSeA") - .Input("net_deriv: float") - .Input("in_deriv: float") - .Input("nlist: int32") - .Input("natoms: int32") - .Attr("n_a_sel: int") - .Attr("n_r_sel: int") - .Output("force: float"); -#endif + .Output("force: T"); -template +template struct ProdForceSeAFunctor { void operator()(const CPUDevice& d, T * force, const T * net_deriv, const T * in_deriv, const int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const int n_a_sel, const int n_a_shift) { ProdForceSeACPULauncher(force, net_deriv, in_deriv, nlist, nloc, nall, nnei, ndescrpt, n_a_sel, 
n_a_shift); @@ -33,7 +23,7 @@ struct ProdForceSeAFunctor { #endif // GOOGLE_CUDA }; -template +template class ProdForceSeAOp : public OpKernel { public: explicit ProdForceSeAOp(OpKernelConstruction* context) : OpKernel(context) { @@ -82,10 +72,10 @@ class ProdForceSeAOp : public OpKernel { force_shape, &force_tensor)); // flat the tensors - auto net_deriv = net_deriv_tensor.flat(); - auto in_deriv = in_deriv_tensor.flat(); + auto net_deriv = net_deriv_tensor.flat(); + auto in_deriv = in_deriv_tensor.flat(); auto nlist = nlist_tensor.flat(); - auto force = force_tensor->flat(); + auto force = force_tensor->flat(); assert (nframes == force_shape.dim_size(0)); assert (nframes == net_deriv_tensor.shape().dim_size(0)); @@ -97,11 +87,11 @@ class ProdForceSeAOp : public OpKernel { assert (nloc * nnei == nlist_tensor.shape().dim_size(1)); assert (nnei * 4 == ndescrpt); - ProdForceSeAFunctor()( + ProdForceSeAFunctor()( context->eigen_device(), - force_tensor->flat().data(), - net_deriv_tensor.flat().data(), - in_deriv_tensor.flat().data(), + force_tensor->flat().data(), + net_deriv_tensor.flat().data(), + in_deriv_tensor.flat().data(), nlist_tensor.flat().data(), nloc, nall, @@ -116,17 +106,18 @@ class ProdForceSeAOp : public OpKernel { }; // Register the CPU kernels. -#define REGISTER_CPU() \ -REGISTER_KERNEL_BUILDER( \ - Name("ProdForceSeA").Device(DEVICE_CPU), \ - ProdForceSeAOp); -REGISTER_CPU(); - +#define REGISTER_CPU(T) \ +REGISTER_KERNEL_BUILDER( \ + Name("ProdForceSeA").Device(DEVICE_CPU).TypeConstraint("T"), \ + ProdForceSeAOp); +REGISTER_CPU(float); +REGISTER_CPU(double); // Register the GPU kernels. #if GOOGLE_CUDA -#define REGISTER_GPU() \ -REGISTER_KERNEL_BUILDER( \ - Name("ProdForceSeA").Device(DEVICE_GPU).HostMemory("natoms"), \ - ProdForceSeAOp); -REGISTER_GPU(); +#define REGISTER_GPU(T) \ +REGISTER_KERNEL_BUILDER( \ + Name("ProdForceSeA").Device(DEVICE_GPU).TypeConstraint("T").HostMemory("natoms"), \ + ProdForceSeAOp); +REGISTER_GPU(float); +REGISTER_GPU(double); #endif // GOOGLE_CUDA \ No newline at end of file diff --git a/source/op/prod_force_se_r.cc b/source/op/prod_force_se_r.cc index b4933c5b4a..a8e5b69162 100644 --- a/source/op/prod_force_se_r.cc +++ b/source/op/prod_force_se_r.cc @@ -13,22 +13,18 @@ typedef float VALUETYPE; #endif REGISTER_OP("ProdForceSeR") -#ifdef HIGH_PREC -.Input("net_deriv: double") -.Input("in_deriv: double") -.Input("nlist: int32") -.Input("natoms: int32") -.Output("force: double"); -#else -.Input("net_deriv: float") -.Input("in_deriv: float") +.Attr("T: {float, double}") +.Input("net_deriv: T") +.Input("in_deriv: T") .Input("nlist: int32") .Input("natoms: int32") -.Output("force: float"); -#endif +.Output("force: T"); using namespace tensorflow; +using CPUDevice = Eigen::ThreadPoolDevice; + +template class ProdForceSeROp : public OpKernel { public: explicit ProdForceSeROp(OpKernelConstruction* context) : OpKernel(context) { @@ -73,10 +69,10 @@ class ProdForceSeROp : public OpKernel { force_shape, &force_tensor)); // flat the tensors - auto net_deriv = net_deriv_tensor.flat(); - auto in_deriv = in_deriv_tensor.flat(); + auto net_deriv = net_deriv_tensor.flat(); + auto in_deriv = in_deriv_tensor.flat(); auto nlist = nlist_tensor.flat(); - auto force = force_tensor->flat(); + auto force = force_tensor->flat(); assert (nframes == force_shape.dim_size(0)); assert (nframes == net_deriv_tensor.shape().dim_size(0)); @@ -126,7 +122,11 @@ class ProdForceSeROp : public OpKernel { } }; -REGISTER_KERNEL_BUILDER(Name("ProdForceSeR").Device(DEVICE_CPU), 
ProdForceSeROp); - +// Register the CPU kernels. +#define REGISTER_CPU(T) \ +REGISTER_KERNEL_BUILDER( \ + Name("ProdForceSeR").Device(DEVICE_CPU).TypeConstraint("T"), \ + ProdForceSeROp); +REGISTER_CPU(VALUETYPE); diff --git a/source/op/prod_force_se_r_gpu.cc b/source/op/prod_force_se_r_gpu.cc index 3dc0bc7853..8f6ee24910 100644 --- a/source/op/prod_force_se_r_gpu.cc +++ b/source/op/prod_force_se_r_gpu.cc @@ -22,21 +22,16 @@ inline void cudaAssert(cudaError_t code, const char *file, int line, bool abort= typedef float VALUETYPE; #endif -#ifdef HIGH_PREC -REGISTER_OP("ProdForceSeR") - .Input("net_deriv: double") - .Input("in_deriv: double") - .Input("nlist: int32") - .Input("natoms: int32") - .Output("force: double"); -#else REGISTER_OP("ProdForceSeR") - .Input("net_deriv: float") - .Input("in_deriv: float") - .Input("nlist: int32") - .Input("natoms: int32") - .Output("force: float"); -#endif +.Attr("T: {float, double}") +.Input("net_deriv: T") +.Input("in_deriv: T") +.Input("nlist: int32") +.Input("natoms: int32") +.Output("force: T"); + +using CPUDevice = Eigen::ThreadPoolDevice; +using GPUDevice = Eigen::GpuDevice; void ProdForceSeRLauncher(VALUETYPE * force, const VALUETYPE * net_deriv, @@ -49,6 +44,7 @@ void ProdForceSeRLauncher(VALUETYPE * force, const int n_a_sel, const int n_a_shift); +template class ProdForceSeROp : public OpKernel { public: explicit ProdForceSeROp(OpKernelConstruction* context) : OpKernel(context) { @@ -130,4 +126,9 @@ class ProdForceSeROp : public OpKernel { int n_r_sel, n_a_sel, n_a_shift; }; -REGISTER_KERNEL_BUILDER(Name("ProdForceSeR").Device(DEVICE_GPU), ProdForceSeROp); \ No newline at end of file +// Register the CPU kernels. +#define REGISTER_GPU(T) \ +REGISTER_KERNEL_BUILDER( \ + Name("ProdForceSeR").Device(DEVICE_GPU).TypeConstraint("T"), \ + ProdForceSeROp); +REGISTER_GPU(VALUETYPE); \ No newline at end of file diff --git a/source/op/prod_force_se_r_grad.cc b/source/op/prod_force_se_r_grad.cc index 3866ef9b86..488492f699 100644 --- a/source/op/prod_force_se_r_grad.cc +++ b/source/op/prod_force_se_r_grad.cc @@ -12,24 +12,18 @@ typedef double VALUETYPE; typedef float VALUETYPE; #endif -#ifdef HIGH_PREC REGISTER_OP("ProdForceSeRGrad") -.Input("grad: double") -.Input("net_deriv: double") -.Input("in_deriv: double") +.Attr("T: {float, double}") +.Input("grad: T") +.Input("net_deriv: T") +.Input("in_deriv: T") .Input("nlist: int32") .Input("natoms: int32") -.Output("grad_net: double"); -#else -REGISTER_OP("ProdForceSeRGrad") -.Input("grad: float") -.Input("net_deriv: float") -.Input("in_deriv: float") -.Input("nlist: int32") -.Input("natoms: int32") -.Output("grad_net: float"); -#endif +.Output("grad_net: T"); + +using CPUDevice = Eigen::ThreadPoolDevice; +template class ProdForceSeRGradOp : public OpKernel { public: @@ -83,11 +77,11 @@ class ProdForceSeRGradOp : public OpKernel OP_REQUIRES_OK(context, context->allocate_output(0, grad_net_shape, &grad_net_tensor)); // flat the tensors - auto grad = grad_tensor .flat(); - auto net_deriv = net_deriv_tensor .flat(); - auto in_deriv = in_deriv_tensor .flat(); + auto grad = grad_tensor .flat(); + auto net_deriv = net_deriv_tensor .flat(); + auto in_deriv = in_deriv_tensor .flat(); auto nlist = nlist_tensor .flat(); - auto grad_net = grad_net_tensor ->flat(); + auto grad_net = grad_net_tensor ->flat(); // loop over frames #pragma omp parallel for @@ -131,4 +125,8 @@ class ProdForceSeRGradOp : public OpKernel } }; -REGISTER_KERNEL_BUILDER(Name("ProdForceSeRGrad").Device(DEVICE_CPU), ProdForceSeRGradOp); +#define 
REGISTER_CPU(T) \ +REGISTER_KERNEL_BUILDER( \ + Name("ProdForceSeRGrad").Device(DEVICE_CPU).TypeConstraint("T"), \ + ProdForceSeRGradOp); +REGISTER_CPU(VALUETYPE); \ No newline at end of file diff --git a/source/op/prod_virial.cc b/source/op/prod_virial.cc index 55b0b4b60d..8f607e0cf0 100644 --- a/source/op/prod_virial.cc +++ b/source/op/prod_virial.cc @@ -12,36 +12,25 @@ typedef double VALUETYPE; typedef float VALUETYPE; #endif -#ifdef HIGH_PREC -REGISTER_OP("ProdVirial") -.Input("net_deriv: double") -.Input("in_deriv: double") -.Input("rij: double") -.Input("nlist: int32") -.Input("axis: int32") -.Input("natoms: int32") -.Attr("n_a_sel: int") -.Attr("n_r_sel: int") -.Output("virial: double") -.Output("atom_virial: double") -; -#else REGISTER_OP("ProdVirial") -.Input("net_deriv: float") -.Input("in_deriv: float") -.Input("rij: float") +.Attr("T: {float, double}") +.Input("net_deriv: T") +.Input("in_deriv: T") +.Input("rij: T") .Input("nlist: int32") .Input("axis: int32") .Input("natoms: int32") .Attr("n_a_sel: int") .Attr("n_r_sel: int") -.Output("virial: float") -.Output("atom_virial: float") -; -#endif +.Output("virial: T") +.Output("atom_virial: T"); + using namespace tensorflow; +using CPUDevice = Eigen::ThreadPoolDevice; + +template class ProdVirialOp : public OpKernel { public: explicit ProdVirialOp(OpKernelConstruction* context) : OpKernel(context) { @@ -100,13 +89,13 @@ class ProdVirialOp : public OpKernel { OP_REQUIRES_OK(context, context->allocate_output(1, atom_virial_shape, &atom_virial_tensor)); // flat the tensors - auto net_deriv = net_deriv_tensor.flat(); - auto in_deriv = in_deriv_tensor.flat(); - auto rij = rij_tensor.flat(); + auto net_deriv = net_deriv_tensor.flat(); + auto in_deriv = in_deriv_tensor.flat(); + auto rij = rij_tensor.flat(); auto nlist = nlist_tensor.flat(); auto axis = axis_tensor.flat(); - auto virial = virial_tensor->flat(); - auto atom_virial = atom_virial_tensor->flat(); + auto virial = virial_tensor->flat(); + auto atom_virial = atom_virial_tensor->flat(); // loop over samples #pragma omp parallel for @@ -144,10 +133,10 @@ class ProdVirialOp : public OpKernel { if (j_idx < 0) continue; if (jj == axis_0) { for (int aa = 0; aa < ndescrpt; ++aa){ - VALUETYPE pref = -1.0 * net_deriv (net_iter + i_idx * ndescrpt + aa); + T pref = -1.0 * net_deriv (net_iter + i_idx * ndescrpt + aa); for (int dd0 = 0; dd0 < 3; ++dd0){ for (int dd1 = 0; dd1 < 3; ++dd1){ - VALUETYPE tmp_v = pref * rij (rij_iter + i_idx * nnei * 3 + jj * 3 + dd1) * in_deriv (in_iter + i_idx * ndescrpt * 12 + aa * 12 + 3 + dd0); + T tmp_v = pref * rij (rij_iter + i_idx * nnei * 3 + jj * 3 + dd1) * in_deriv (in_iter + i_idx * ndescrpt * 12 + aa * 12 + 3 + dd0); virial (virial_iter + dd0 * 3 + dd1) += tmp_v; atom_virial (atom_virial_iter + j_idx * 9 + dd0 * 3 + dd1) += tmp_v; } @@ -156,10 +145,10 @@ class ProdVirialOp : public OpKernel { } else if (jj == axis_1) { for (int aa = 0; aa < ndescrpt; ++aa){ - VALUETYPE pref = -1.0 * net_deriv (net_iter + i_idx * ndescrpt + aa); + T pref = -1.0 * net_deriv (net_iter + i_idx * ndescrpt + aa); for (int dd0 = 0; dd0 < 3; ++dd0){ for (int dd1 = 0; dd1 < 3; ++dd1){ - VALUETYPE tmp_v = pref * rij (rij_iter + i_idx * nnei * 3 + jj * 3 + dd1) * in_deriv (in_iter + i_idx * ndescrpt * 12 + aa * 12 + 6 + dd0); + T tmp_v = pref * rij (rij_iter + i_idx * nnei * 3 + jj * 3 + dd1) * in_deriv (in_iter + i_idx * ndescrpt * 12 + aa * 12 + 6 + dd0); virial (virial_iter + dd0 * 3 + dd1) += tmp_v; atom_virial (atom_virial_iter + j_idx * 9 + dd0 * 3 + dd1) += tmp_v; } @@ 
-170,10 +159,10 @@ class ProdVirialOp : public OpKernel { int aa_start, aa_end; make_descript_range (aa_start, aa_end, jj); for (int aa = aa_start; aa < aa_end; ++aa) { - VALUETYPE pref = -1.0 * net_deriv (net_iter + i_idx * ndescrpt + aa); + T pref = -1.0 * net_deriv (net_iter + i_idx * ndescrpt + aa); for (int dd0 = 0; dd0 < 3; ++dd0){ for (int dd1 = 0; dd1 < 3; ++dd1){ - VALUETYPE tmp_v = pref * rij (rij_iter + i_idx * nnei * 3 + jj * 3 + dd1) * in_deriv (in_iter + i_idx * ndescrpt * 12 + aa * 12 + 9 + dd0); + T tmp_v = pref * rij (rij_iter + i_idx * nnei * 3 + jj * 3 + dd1) * in_deriv (in_iter + i_idx * ndescrpt * 12 + aa * 12 + 9 + dd0); virial (virial_iter + dd0 * 3 + dd1) += tmp_v; atom_virial (atom_virial_iter + j_idx * 9 + dd0 * 3 + dd1) += tmp_v; } @@ -201,7 +190,10 @@ class ProdVirialOp : public OpKernel { } }; -REGISTER_KERNEL_BUILDER(Name("ProdVirial").Device(DEVICE_CPU), ProdVirialOp); - +#define REGISTER_CPU(T) \ +REGISTER_KERNEL_BUILDER( \ + Name("ProdVirial").Device(DEVICE_CPU).TypeConstraint("T"), \ + ProdVirialOp); +REGISTER_CPU(VALUETYPE); diff --git a/source/op/prod_virial_grad.cc b/source/op/prod_virial_grad.cc index 5257467029..3d8c7e4639 100644 --- a/source/op/prod_virial_grad.cc +++ b/source/op/prod_virial_grad.cc @@ -12,32 +12,22 @@ typedef double VALUETYPE; typedef float VALUETYPE; #endif -#ifdef HIGH_PREC -REGISTER_OP("ProdVirialGrad") -.Input("grad: double") -.Input("net_deriv: double") -.Input("in_deriv: double") -.Input("rij: double") -.Input("nlist: int32") -.Input("axis: int32") -.Input("natoms: int32") -.Attr("n_a_sel: int") -.Attr("n_r_sel: int") -.Output("grad_net: double"); -#else REGISTER_OP("ProdVirialGrad") -.Input("grad: float") -.Input("net_deriv: float") -.Input("in_deriv: float") -.Input("rij: float") +.Attr("T: {float, double}") +.Input("grad: T") +.Input("net_deriv: T") +.Input("in_deriv: T") +.Input("rij: T") .Input("nlist: int32") .Input("axis: int32") .Input("natoms: int32") .Attr("n_a_sel: int") .Attr("n_r_sel: int") -.Output("grad_net: float"); -#endif +.Output("grad_net: T"); + +using CPUDevice = Eigen::ThreadPoolDevice; +template class ProdVirialGradOp : public OpKernel { public: @@ -200,4 +190,8 @@ class ProdVirialGradOp : public OpKernel } }; -REGISTER_KERNEL_BUILDER(Name("ProdVirialGrad").Device(DEVICE_CPU), ProdVirialGradOp); +#define REGISTER_CPU(T) \ +REGISTER_KERNEL_BUILDER( \ + Name("ProdVirialGrad").Device(DEVICE_CPU).TypeConstraint("T"), \ + ProdVirialGradOp); +REGISTER_CPU(VALUETYPE); \ No newline at end of file diff --git a/source/op/prod_virial_se_a.cc b/source/op/prod_virial_se_a.cc index 2f71d37505..d975913d88 100644 --- a/source/op/prod_virial_se_a.cc +++ b/source/op/prod_virial_se_a.cc @@ -12,34 +12,24 @@ typedef double VALUETYPE; typedef float VALUETYPE; #endif -#ifdef HIGH_PREC REGISTER_OP("ProdVirialSeA") -.Input("net_deriv: double") -.Input("in_deriv: double") -.Input("rij: double") +.Attr("T: {float, double}") +.Input("net_deriv: T") +.Input("in_deriv: T") +.Input("rij: T") .Input("nlist: int32") .Input("natoms: int32") .Attr("n_a_sel: int") .Attr("n_r_sel: int") -.Output("virial: double") -.Output("atom_virial: double") -; -#else -REGISTER_OP("ProdVirialSeA") -.Input("net_deriv: float") -.Input("in_deriv: float") -.Input("rij: float") -.Input("nlist: int32") -.Input("natoms: int32") -.Attr("n_a_sel: int") -.Attr("n_r_sel: int") -.Output("virial: float") -.Output("atom_virial: float") -; -#endif +.Output("virial: T") +.Output("atom_virial: T"); using namespace tensorflow; +using CPUDevice = Eigen::ThreadPoolDevice; 
+using GPUDevice = Eigen::GpuDevice; + +template class ProdVirialSeAOp : public OpKernel { public: explicit ProdVirialSeAOp(OpKernelConstruction* context) : OpKernel(context) { @@ -95,12 +85,12 @@ class ProdVirialSeAOp : public OpKernel { OP_REQUIRES_OK(context, context->allocate_output(1, atom_virial_shape, &atom_virial_tensor)); // flat the tensors - auto net_deriv = net_deriv_tensor.flat(); - auto in_deriv = in_deriv_tensor.flat(); - auto rij = rij_tensor.flat(); + auto net_deriv = net_deriv_tensor.flat(); + auto in_deriv = in_deriv_tensor.flat(); + auto rij = rij_tensor.flat(); auto nlist = nlist_tensor.flat(); - auto virial = virial_tensor->flat(); - auto atom_virial = atom_virial_tensor->flat(); + auto virial = virial_tensor->flat(); + auto atom_virial = atom_virial_tensor->flat(); // loop over samples #pragma omp parallel for @@ -131,10 +121,10 @@ class ProdVirialSeAOp : public OpKernel { int aa_start, aa_end; make_descript_range (aa_start, aa_end, jj); for (int aa = aa_start; aa < aa_end; ++aa) { - VALUETYPE pref = -1.0 * net_deriv (net_iter + i_idx * ndescrpt + aa); + T pref = -1.0 * net_deriv (net_iter + i_idx * ndescrpt + aa); for (int dd0 = 0; dd0 < 3; ++dd0){ for (int dd1 = 0; dd1 < 3; ++dd1){ - VALUETYPE tmp_v = pref * rij (rij_iter + i_idx * nnei * 3 + jj * 3 + dd1) * in_deriv (in_iter + i_idx * ndescrpt * 3 + aa * 3 + dd0); + T tmp_v = pref * rij (rij_iter + i_idx * nnei * 3 + jj * 3 + dd1) * in_deriv (in_iter + i_idx * ndescrpt * 3 + aa * 3 + dd0); virial (virial_iter + dd0 * 3 + dd1) -= tmp_v; atom_virial (atom_virial_iter + j_idx * 9 + dd0 * 3 + dd1) -= tmp_v; } @@ -161,7 +151,12 @@ class ProdVirialSeAOp : public OpKernel { } }; -REGISTER_KERNEL_BUILDER(Name("ProdVirialSeA").Device(DEVICE_CPU), ProdVirialSeAOp); +// Register the CPU kernels. 
+#define REGISTER_CPU(T) \ +REGISTER_KERNEL_BUILDER( \ + Name("ProdVirialSeA").Device(DEVICE_CPU).TypeConstraint("T"), \ + ProdVirialSeAOp); +REGISTER_CPU(VALUETYPE); diff --git a/source/op/prod_virial_se_a_grad.cc b/source/op/prod_virial_se_a_grad.cc index 660f652566..0f506fc51a 100644 --- a/source/op/prod_virial_se_a_grad.cc +++ b/source/op/prod_virial_se_a_grad.cc @@ -12,30 +12,21 @@ typedef double VALUETYPE; typedef float VALUETYPE; #endif -#ifdef HIGH_PREC REGISTER_OP("ProdVirialSeAGrad") -.Input("grad: double") -.Input("net_deriv: double") -.Input("in_deriv: double") -.Input("rij: double") +.Attr("T: {float, double}") +.Input("grad: T") +.Input("net_deriv: T") +.Input("in_deriv: T") +.Input("rij: T") .Input("nlist: int32") .Input("natoms: int32") .Attr("n_a_sel: int") .Attr("n_r_sel: int") -.Output("grad_net: double"); -#else -REGISTER_OP("ProdVirialSeAGrad") -.Input("grad: float") -.Input("net_deriv: float") -.Input("in_deriv: float") -.Input("rij: float") -.Input("nlist: int32") -.Input("natoms: int32") -.Attr("n_a_sel: int") -.Attr("n_r_sel: int") -.Output("grad_net: float"); -#endif +.Output("grad_net: T"); + +using CPUDevice = Eigen::ThreadPoolDevice; +template class ProdVirialSeAGradOp : public OpKernel { public: @@ -98,12 +89,12 @@ class ProdVirialSeAGradOp : public OpKernel OP_REQUIRES_OK(context, context->allocate_output(0, grad_net_shape, &grad_net_tensor)); // flat the tensors - auto grad = grad_tensor .flat(); - auto net_deriv = net_deriv_tensor .flat(); - auto in_deriv = in_deriv_tensor .flat(); - auto rij = rij_tensor .flat(); + auto grad = grad_tensor .flat(); + auto net_deriv = net_deriv_tensor .flat(); + auto in_deriv = in_deriv_tensor .flat(); + auto rij = rij_tensor .flat(); auto nlist = nlist_tensor .flat(); - auto grad_net = grad_net_tensor ->flat(); + auto grad_net = grad_net_tensor ->flat(); // loop over frames #pragma omp parallel for @@ -162,4 +153,9 @@ class ProdVirialSeAGradOp : public OpKernel } }; -REGISTER_KERNEL_BUILDER(Name("ProdVirialSeAGrad").Device(DEVICE_CPU), ProdVirialSeAGradOp); +// Register the CPU kernels. 
+#define REGISTER_CPU(T) \ +REGISTER_KERNEL_BUILDER( \ + Name("ProdVirialSeAGrad").Device(DEVICE_CPU).TypeConstraint("T"), \ + ProdVirialSeAGradOp); +REGISTER_CPU(VALUETYPE); \ No newline at end of file diff --git a/source/op/prod_virial_se_a_gpu.cc b/source/op/prod_virial_se_a_multi_device.cc similarity index 77% rename from source/op/prod_virial_se_a_gpu.cc rename to source/op/prod_virial_se_a_multi_device.cc index 34a7a03574..21cd78c83c 100644 --- a/source/op/prod_virial_se_a_gpu.cc +++ b/source/op/prod_virial_se_a_multi_device.cc @@ -1,31 +1,19 @@ #include "common.h" #include "CustomeOperation.h" -#ifdef HIGH_PREC REGISTER_OP("ProdVirialSeA") - .Input("net_deriv: double") - .Input("in_deriv: double") - .Input("rij: double") + .Attr("T: {float, double}") + .Input("net_deriv: T") + .Input("in_deriv: T") + .Input("rij: T") .Input("nlist: int32") .Input("natoms: int32") .Attr("n_a_sel: int") .Attr("n_r_sel: int") - .Output("virial: double") - .Output("atom_virial: double"); -#else -REGISTER_OP("ProdVirialSeA") - .Input("net_deriv: float") - .Input("in_deriv: float") - .Input("rij: float") - .Input("nlist: int32") - .Input("natoms: int32") - .Attr("n_a_sel: int") - .Attr("n_r_sel: int") - .Output("virial: float") - .Output("atom_virial: float"); -#endif + .Output("virial: T") + .Output("atom_virial: T"); -template +template struct ProdVirialSeAFunctor { void operator()(const CPUDevice& d, T * virial, T * atom_virial, const T * net_deriv, const T * in_deriv, const T * rij, const int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const int n_a_sel, const int n_a_shift) { ProdVirialSeACPULauncher(virial, atom_virial, net_deriv, in_deriv, rij, nlist, nloc, nall, nnei, ndescrpt, n_a_sel, n_a_shift); @@ -37,7 +25,7 @@ struct ProdVirialSeAFunctor { #endif // GOOGLE_CUDA }; -template +template class ProdVirialSeAOp : public OpKernel { public: explicit ProdVirialSeAOp(OpKernelConstruction* context) : OpKernel(context) { @@ -92,20 +80,20 @@ class ProdVirialSeAOp : public OpKernel { OP_REQUIRES_OK(context, context->allocate_output(1, atom_virial_shape, &atom_virial_tensor)); // flat the tensors - auto net_deriv = net_deriv_tensor.flat(); - auto in_deriv = in_deriv_tensor.flat(); - auto rij = rij_tensor.flat(); + auto net_deriv = net_deriv_tensor.flat(); + auto in_deriv = in_deriv_tensor.flat(); + auto rij = rij_tensor.flat(); auto nlist = nlist_tensor.flat(); - auto virial = virial_tensor->flat(); - auto atom_virial = atom_virial_tensor->flat(); + auto virial = virial_tensor->flat(); + auto atom_virial = atom_virial_tensor->flat(); - ProdVirialSeAFunctor()( + ProdVirialSeAFunctor()( context->eigen_device(), - virial_tensor->flat().data(), - atom_virial_tensor->flat().data(), - net_deriv_tensor.flat().data(), - in_deriv_tensor.flat().data(), - rij_tensor.flat().data(), + virial_tensor->flat().data(), + atom_virial_tensor->flat().data(), + net_deriv_tensor.flat().data(), + in_deriv_tensor.flat().data(), + rij_tensor.flat().data(), nlist_tensor.flat().data(), nloc, nall, @@ -120,17 +108,18 @@ class ProdVirialSeAOp : public OpKernel { }; // Register the CPU kernels. -#define REGISTER_CPU() \ -REGISTER_KERNEL_BUILDER( \ - Name("ProdVirialSeA").Device(DEVICE_CPU), \ - ProdVirialSeAOp); -REGISTER_CPU(); - +#define REGISTER_CPU(T) \ +REGISTER_KERNEL_BUILDER( \ + Name("ProdVirialSeA").Device(DEVICE_CPU).TypeConstraint("T"), \ + ProdVirialSeAOp); +REGISTER_CPU(float); +REGISTER_CPU(double); // Register the GPU kernels. 
#if GOOGLE_CUDA -#define REGISTER_GPU() \ -REGISTER_KERNEL_BUILDER( \ - Name("ProdVirialSeA").Device(DEVICE_GPU).HostMemory("natoms"), \ - ProdVirialSeAOp); -REGISTER_GPU(); +#define REGISTER_GPU(T) \ +REGISTER_KERNEL_BUILDER( \ + Name("ProdVirialSeA").Device(DEVICE_GPU).TypeConstraint("T").HostMemory("natoms"), \ + ProdVirialSeAOp); +REGISTER_GPU(float); +REGISTER_GPU(double); #endif // GOOGLE_CUDA \ No newline at end of file diff --git a/source/op/prod_virial_se_r.cc b/source/op/prod_virial_se_r.cc index 1d21234421..63b74398ea 100644 --- a/source/op/prod_virial_se_r.cc +++ b/source/op/prod_virial_se_r.cc @@ -12,30 +12,21 @@ typedef double VALUETYPE; typedef float VALUETYPE; #endif -#ifdef HIGH_PREC REGISTER_OP("ProdVirialSeR") -.Input("net_deriv: double") -.Input("in_deriv: double") -.Input("rij: double") +.Attr("T: {float, double}") +.Input("net_deriv: T") +.Input("in_deriv: T") +.Input("rij: T") .Input("nlist: int32") .Input("natoms: int32") -.Output("virial: double") -.Output("atom_virial: double") -; -#else -REGISTER_OP("ProdVirialSeR") -.Input("net_deriv: float") -.Input("in_deriv: float") -.Input("rij: float") -.Input("nlist: int32") -.Input("natoms: int32") -.Output("virial: float") -.Output("atom_virial: float") -; -#endif +.Output("virial: T") +.Output("atom_virial: T"); using namespace tensorflow; +using CPUDevice = Eigen::ThreadPoolDevice; + +template class ProdVirialSeROp : public OpKernel { public: explicit ProdVirialSeROp(OpKernelConstruction* context) : OpKernel(context) { @@ -87,12 +78,12 @@ class ProdVirialSeROp : public OpKernel { OP_REQUIRES_OK(context, context->allocate_output(1, atom_virial_shape, &atom_virial_tensor)); // flat the tensors - auto net_deriv = net_deriv_tensor.flat(); - auto in_deriv = in_deriv_tensor.flat(); - auto rij = rij_tensor.flat(); + auto net_deriv = net_deriv_tensor.flat(); + auto in_deriv = in_deriv_tensor.flat(); + auto rij = rij_tensor.flat(); auto nlist = nlist_tensor.flat(); - auto virial = virial_tensor->flat(); - auto atom_virial = atom_virial_tensor->flat(); + auto virial = virial_tensor->flat(); + auto atom_virial = atom_virial_tensor->flat(); // loop over samples #pragma omp parallel for @@ -119,10 +110,10 @@ class ProdVirialSeROp : public OpKernel { for (int jj = 0; jj < nnei; ++jj){ int j_idx = nlist (nlist_iter + i_idx * nnei + jj); if (j_idx < 0) continue; - VALUETYPE pref = -1.0 * net_deriv (net_iter + i_idx * ndescrpt + jj); + T pref = -1.0 * net_deriv (net_iter + i_idx * ndescrpt + jj); for (int dd0 = 0; dd0 < 3; ++dd0){ for (int dd1 = 0; dd1 < 3; ++dd1){ - VALUETYPE tmp_v = pref * rij (rij_iter + i_idx * nnei * 3 + jj * 3 + dd1) * in_deriv (in_iter + i_idx * ndescrpt * 3 + jj * 3 + dd0); + T tmp_v = pref * rij (rij_iter + i_idx * nnei * 3 + jj * 3 + dd1) * in_deriv (in_iter + i_idx * ndescrpt * 3 + jj * 3 + dd0); virial (virial_iter + dd0 * 3 + dd1) -= tmp_v; atom_virial (atom_virial_iter + j_idx * 9 + dd0 * 3 + dd1) -= tmp_v; } @@ -133,7 +124,11 @@ class ProdVirialSeROp : public OpKernel { } }; -REGISTER_KERNEL_BUILDER(Name("ProdVirialSeR").Device(DEVICE_CPU), ProdVirialSeROp); +#define REGISTER_CPU(T) \ +REGISTER_KERNEL_BUILDER( \ + Name("ProdVirialSeR").Device(DEVICE_CPU).TypeConstraint("T"), \ + ProdVirialSeROp); +REGISTER_CPU(VALUETYPE); diff --git a/source/op/prod_virial_se_r_gpu.cc b/source/op/prod_virial_se_r_gpu.cc index 91f965b72c..6324bcf88d 100644 --- a/source/op/prod_virial_se_r_gpu.cc +++ b/source/op/prod_virial_se_r_gpu.cc @@ -10,28 +10,21 @@ typedef float VALUETYPE; #endif -#ifdef HIGH_PREC 
REGISTER_OP("ProdVirialSeR") - .Input("net_deriv: double") - .Input("in_deriv: double") - .Input("rij: double") + .Attr("T: {float, double}") + .Input("net_deriv: T") + .Input("in_deriv: T") + .Input("rij: T") .Input("nlist: int32") .Input("natoms: int32") - .Output("virial: double") - .Output("atom_virial: double"); -#else -REGISTER_OP("ProdVirialSeR") - .Input("net_deriv: float") - .Input("in_deriv: float") - .Input("rij: float") - .Input("nlist: int32") - .Input("natoms: int32") - .Output("virial: float") - .Output("atom_virial: float"); -#endif + .Output("virial: T") + .Output("atom_virial: T"); using namespace tensorflow; +using CPUDevice = Eigen::ThreadPoolDevice; +using GPUDevice = Eigen::GpuDevice; + #define cudaErrcheck(res) { cudaAssert((res), __FILE__, __LINE__); } inline void cudaAssert(cudaError_t code, const char *file, int line, bool abort=true) { @@ -55,6 +48,7 @@ void ProdVirialSeRLauncher(VALUETYPE * virial, const int n_a_sel, const int n_a_shift); +template class ProdVirialSeROp : public OpKernel { public: explicit ProdVirialSeROp(OpKernelConstruction* context) : OpKernel(context) { @@ -134,4 +128,9 @@ class ProdVirialSeROp : public OpKernel { int n_r_sel, n_a_sel, n_a_shift; }; -REGISTER_KERNEL_BUILDER(Name("ProdVirialSeR").Device(DEVICE_GPU), ProdVirialSeROp); \ No newline at end of file +// Register the GPU kernels. +#define REGISTER_GPU(T) \ +REGISTER_KERNEL_BUILDER( \ + Name("ProdVirialSeR").Device(DEVICE_GPU).TypeConstraint("T"), \ + ProdVirialSeROp); +REGISTER_GPU(VALUETYPE); \ No newline at end of file diff --git a/source/op/prod_virial_se_r_grad.cc b/source/op/prod_virial_se_r_grad.cc index 20b53cf3c9..b125c9d783 100644 --- a/source/op/prod_virial_se_r_grad.cc +++ b/source/op/prod_virial_se_r_grad.cc @@ -12,26 +12,19 @@ typedef double VALUETYPE; typedef float VALUETYPE; #endif -#ifdef HIGH_PREC REGISTER_OP("ProdVirialSeRGrad") -.Input("grad: double") -.Input("net_deriv: double") -.Input("in_deriv: double") -.Input("rij: double") +.Attr("T: {float, double}") +.Input("grad: T") +.Input("net_deriv: T") +.Input("in_deriv: T") +.Input("rij: T") .Input("nlist: int32") .Input("natoms: int32") -.Output("grad_net: double"); -#else -REGISTER_OP("ProdVirialSeRGrad") -.Input("grad: float") -.Input("net_deriv: float") -.Input("in_deriv: float") -.Input("rij: float") -.Input("nlist: int32") -.Input("natoms: int32") -.Output("grad_net: float"); -#endif +.Output("grad_net: T"); + +using CPUDevice = Eigen::ThreadPoolDevice; +template class ProdVirialSeRGradOp : public OpKernel { public: @@ -135,4 +128,9 @@ class ProdVirialSeRGradOp : public OpKernel } }; -REGISTER_KERNEL_BUILDER(Name("ProdVirialSeRGrad").Device(DEVICE_CPU), ProdVirialSeRGradOp); +// Register the GPU kernels. 
+#define REGISTER_CPU(T) \ +REGISTER_KERNEL_BUILDER( \ + Name("ProdVirialSeRGrad").Device(DEVICE_CPU).TypeConstraint("T"), \ + ProdVirialSeRGradOp); +REGISTER_CPU(VALUETYPE); \ No newline at end of file diff --git a/source/op/soft_min.cc b/source/op/soft_min.cc index 6b23305d3e..17ab6bc1f5 100644 --- a/source/op/soft_min.cc +++ b/source/op/soft_min.cc @@ -14,23 +14,10 @@ typedef double VALUETYPE; typedef float VALUETYPE; #endif -#ifdef HIGH_PREC -REGISTER_OP("SoftMinSwitch") -.Input("type: int32") -.Input("rij: double") -.Input("nlist: int32") -.Input("natoms: int32") -.Attr("sel_a: list(int)") -.Attr("sel_r: list(int)") -.Attr("alpha: float") -.Attr("rmin: float") -.Attr("rmax: float") -.Output("sw_value: double") -.Output("sw_deriv: double"); -#else REGISTER_OP("SoftMinSwitch") +.Attr("T: {float, double}") .Input("type: int32") -.Input("rij: float") +.Input("rij: T") .Input("nlist: int32") .Input("natoms: int32") .Attr("sel_a: list(int)") @@ -38,12 +25,14 @@ REGISTER_OP("SoftMinSwitch") .Attr("alpha: float") .Attr("rmin: float") .Attr("rmax: float") -.Output("sw_value: float") -.Output("sw_deriv: float"); -#endif +.Output("sw_value: T") +.Output("sw_deriv: T"); using namespace tensorflow; +using CPUDevice = Eigen::ThreadPoolDevice; + +template class SoftMinSwitchOp : public OpKernel { public: explicit SoftMinSwitchOp(OpKernelConstruction* context) : OpKernel(context) { @@ -106,10 +95,10 @@ class SoftMinSwitchOp : public OpKernel { // flat the tensors auto type = type_tensor .matrix(); - auto rij = rij_tensor .matrix(); + auto rij = rij_tensor .matrix(); auto nlist = nlist_tensor .matrix(); - auto sw_value = sw_value_tensor ->matrix(); - auto sw_deriv = sw_deriv_tensor ->matrix(); + auto sw_value = sw_value_tensor ->matrix(); + auto sw_deriv = sw_deriv_tensor ->matrix(); // loop over samples #pragma omp parallel for @@ -126,26 +115,26 @@ class SoftMinSwitchOp : public OpKernel { // compute force of a frame for (int ii = 0; ii < nloc; ++ii){ int i_idx = ii; - VALUETYPE aa = 0; - VALUETYPE bb = 0; + T aa = 0; + T bb = 0; for (int jj = 0; jj < nnei; ++jj){ int j_idx = nlist (kk, i_idx * nnei + jj); if (j_idx < 0) continue; int rij_idx_shift = (i_idx * nnei + jj) * 3; - VALUETYPE dr[3] = { + T dr[3] = { rij(kk, rij_idx_shift + 0), rij(kk, rij_idx_shift + 1), rij(kk, rij_idx_shift + 2) }; - VALUETYPE rr2 = dr[0] * dr[0] + dr[1] * dr[1] + dr[2] * dr[2]; - VALUETYPE rr = sqrt(rr2); - VALUETYPE ee = exp(-rr / alpha); + T rr2 = dr[0] * dr[0] + dr[1] * dr[1] + dr[2] * dr[2]; + T rr = sqrt(rr2); + T ee = exp(-rr / alpha); aa += ee; bb += rr * ee; } - VALUETYPE smin = bb / aa; - VALUETYPE vv, dd; - spline5_switch(vv, dd, smin, static_cast(rmin), static_cast(rmax)); + T smin = bb / aa; + T vv, dd; + spline5_switch(vv, dd, smin, static_cast(rmin), static_cast(rmax)); // value of switch sw_value(kk, i_idx) = vv; // deriv of switch distributed as force @@ -153,17 +142,17 @@ class SoftMinSwitchOp : public OpKernel { int j_idx = nlist (kk, i_idx * nnei + jj); if (j_idx < 0) continue; int rij_idx_shift = (ii * nnei + jj) * 3; - VALUETYPE dr[3] = { + T dr[3] = { rij(kk, rij_idx_shift + 0), rij(kk, rij_idx_shift + 1), rij(kk, rij_idx_shift + 2) }; - VALUETYPE rr2 = dr[0] * dr[0] + dr[1] * dr[1] + dr[2] * dr[2]; - VALUETYPE rr = sqrt(rr2); - VALUETYPE ee = exp(-rr / alpha); - VALUETYPE pref_c = (1./rr - 1./alpha) * ee ; - VALUETYPE pref_d = 1./(rr * alpha) * ee; - VALUETYPE ts; + T rr2 = dr[0] * dr[0] + dr[1] * dr[1] + dr[2] * dr[2]; + T rr = sqrt(rr2); + T ee = exp(-rr / alpha); + T pref_c = (1./rr - 
1./alpha) * ee ; + T pref_d = 1./(rr * alpha) * ee; + T ts; ts = dd / (aa * aa) * (aa * pref_c + bb * pref_d); sw_deriv(kk, rij_idx_shift + 0) += ts * dr[0]; sw_deriv(kk, rij_idx_shift + 1) += ts * dr[1]; @@ -196,7 +185,11 @@ class SoftMinSwitchOp : public OpKernel { } }; -REGISTER_KERNEL_BUILDER(Name("SoftMinSwitch").Device(DEVICE_CPU), SoftMinSwitchOp); - +// Register the CPU kernels. +#define REGISTER_CPU(T) \ +REGISTER_KERNEL_BUILDER( \ + Name("SoftMinSwitch").Device(DEVICE_CPU).TypeConstraint("T"), \ + SoftMinSwitchOp); +REGISTER_CPU(VALUETYPE); diff --git a/source/op/soft_min_force.cc b/source/op/soft_min_force.cc index e51aadbc79..43344e7352 100644 --- a/source/op/soft_min_force.cc +++ b/source/op/soft_min_force.cc @@ -12,28 +12,22 @@ typedef double VALUETYPE; typedef float VALUETYPE; #endif -#ifdef HIGH_PREC -REGISTER_OP("SoftMinForce") -.Input("du: double") -.Input("sw_deriv: double") -.Input("nlist: int32") -.Input("natoms: int32") -.Attr("n_a_sel: int") -.Attr("n_r_sel: int") -.Output("force: double"); -#else REGISTER_OP("SoftMinForce") -.Input("du: float") -.Input("sw_deriv: float") +.Attr("T: {float, double}") +.Input("du: T") +.Input("sw_deriv: T") .Input("nlist: int32") .Input("natoms: int32") .Attr("n_a_sel: int") .Attr("n_r_sel: int") -.Output("force: float"); -#endif +.Output("force: T"); + using namespace tensorflow; +using CPUDevice = Eigen::ThreadPoolDevice; + +template class SoftMinForceOp : public OpKernel { public: explicit SoftMinForceOp(OpKernelConstruction* context) : OpKernel(context) { @@ -78,10 +72,10 @@ class SoftMinForceOp : public OpKernel { OP_REQUIRES_OK(context, context->allocate_output(0, force_shape, &force_tensor)); // flat the tensors - auto du = du_tensor.matrix(); - auto sw_deriv = sw_deriv_tensor.matrix(); + auto du = du_tensor.matrix(); + auto sw_deriv = sw_deriv_tensor.matrix(); auto nlist = nlist_tensor.matrix(); - auto force = force_tensor->matrix(); + auto force = force_tensor->matrix(); // loop over samples #pragma omp parallel for @@ -118,4 +112,9 @@ class SoftMinForceOp : public OpKernel { int n_r_sel, n_a_sel; }; -REGISTER_KERNEL_BUILDER(Name("SoftMinForce").Device(DEVICE_CPU), SoftMinForceOp); +// Register the CPU kernels. 
+#define REGISTER_CPU(T) \ +REGISTER_KERNEL_BUILDER( \ + Name("SoftMinForce").Device(DEVICE_CPU).TypeConstraint("T"), \ + SoftMinForceOp); +REGISTER_CPU(VALUETYPE); diff --git a/source/op/soft_min_force_grad.cc b/source/op/soft_min_force_grad.cc index 4c8a4b21ff..bfaa8aca76 100644 --- a/source/op/soft_min_force_grad.cc +++ b/source/op/soft_min_force_grad.cc @@ -12,28 +12,20 @@ typedef double VALUETYPE; typedef float VALUETYPE; #endif -#ifdef HIGH_PREC REGISTER_OP("SoftMinForceGrad") -.Input("grad: double") -.Input("du: double") -.Input("sw_deriv: double") +.Attr("T: {float, double}") +.Input("grad: T") +.Input("du: T") +.Input("sw_deriv: T") .Input("nlist: int32") .Input("natoms: int32") .Attr("n_a_sel: int") .Attr("n_r_sel: int") -.Output("grad_net: double"); -#else -REGISTER_OP("SoftMinForceGrad") -.Input("grad: float") -.Input("du: float") -.Input("sw_deriv: float") -.Input("nlist: int32") -.Input("natoms: int32") -.Attr("n_a_sel: int") -.Attr("n_r_sel: int") -.Output("grad_net: float"); -#endif +.Output("grad_net: T"); + +using CPUDevice = Eigen::ThreadPoolDevice; +template class SoftMinForceGradOp : public OpKernel { public: @@ -89,11 +81,11 @@ class SoftMinForceGradOp : public OpKernel OP_REQUIRES_OK(context, context->allocate_output(0, grad_net_shape, &grad_net_tensor)); // flat the tensors - auto grad = grad_tensor .matrix(); - auto du = du_tensor .matrix(); - auto sw_deriv = sw_deriv_tensor .matrix(); + auto grad = grad_tensor .matrix(); + auto du = du_tensor .matrix(); + auto sw_deriv = sw_deriv_tensor .matrix(); auto nlist = nlist_tensor .matrix(); - auto grad_net = grad_net_tensor ->matrix(); + auto grad_net = grad_net_tensor ->matrix(); // loop over frames #pragma omp parallel for @@ -125,4 +117,9 @@ class SoftMinForceGradOp : public OpKernel int n_r_sel, n_a_sel; }; -REGISTER_KERNEL_BUILDER(Name("SoftMinForceGrad").Device(DEVICE_CPU), SoftMinForceGradOp); +// Register the CPU kernels. +#define REGISTER_CPU(T) \ +REGISTER_KERNEL_BUILDER( \ + Name("SoftMinForceGrad").Device(DEVICE_CPU).TypeConstraint("T"), \ + SoftMinForceGradOp); +REGISTER_CPU(VALUETYPE); \ No newline at end of file diff --git a/source/op/soft_min_virial.cc b/source/op/soft_min_virial.cc index 193c34f981..7829c4b7ed 100644 --- a/source/op/soft_min_virial.cc +++ b/source/op/soft_min_virial.cc @@ -12,34 +12,23 @@ typedef double VALUETYPE; typedef float VALUETYPE; #endif -#ifdef HIGH_PREC -REGISTER_OP("SoftMinVirial") -.Input("du: double") -.Input("sw_deriv: double") -.Input("rij: double") -.Input("nlist: int32") -.Input("natoms: int32") -.Attr("n_a_sel: int") -.Attr("n_r_sel: int") -.Output("virial: double") -.Output("atom_virial: double") -; -#else REGISTER_OP("SoftMinVirial") -.Input("du: float") -.Input("sw_deriv: float") -.Input("rij: float") +.Attr("T: {float, double}") +.Input("du: T") +.Input("sw_deriv: T") +.Input("rij: T") .Input("nlist: int32") .Input("natoms: int32") .Attr("n_a_sel: int") .Attr("n_r_sel: int") -.Output("virial: float") -.Output("atom_virial: float") -; -#endif +.Output("virial: T") +.Output("atom_virial: T"); using namespace tensorflow; +using CPUDevice = Eigen::ThreadPoolDevice; + +template class SoftMinVirialOp : public OpKernel { public: explicit SoftMinVirialOp(OpKernelConstruction* context) : OpKernel(context) { @@ -135,7 +124,11 @@ class SoftMinVirialOp : public OpKernel { int n_r_sel, n_a_sel; }; -REGISTER_KERNEL_BUILDER(Name("SoftMinVirial").Device(DEVICE_CPU), SoftMinVirialOp); - +// Register the CPU kernels. 
+#define REGISTER_CPU(T) \ +REGISTER_KERNEL_BUILDER( \ + Name("SoftMinVirial").Device(DEVICE_CPU).TypeConstraint("T"), \ + SoftMinVirialOp); +REGISTER_CPU(VALUETYPE); diff --git a/source/op/soft_min_virial_grad.cc b/source/op/soft_min_virial_grad.cc index 6f8703bdee..b454612895 100644 --- a/source/op/soft_min_virial_grad.cc +++ b/source/op/soft_min_virial_grad.cc @@ -12,30 +12,21 @@ typedef double VALUETYPE; typedef float VALUETYPE; #endif -#ifdef HIGH_PREC REGISTER_OP("SoftMinVirialGrad") -.Input("grad: double") -.Input("du: double") -.Input("sw_deriv: double") -.Input("rij: double") +.Attr("T: {float, double}") +.Input("grad: T") +.Input("du: T") +.Input("sw_deriv: T") +.Input("rij: T") .Input("nlist: int32") .Input("natoms: int32") .Attr("n_a_sel: int") .Attr("n_r_sel: int") -.Output("grad_net: double"); -#else -REGISTER_OP("SoftMinVirialGrad") -.Input("grad: float") -.Input("du: float") -.Input("sw_deriv: float") -.Input("rij: float") -.Input("nlist: int32") -.Input("natoms: int32") -.Attr("n_a_sel: int") -.Attr("n_r_sel: int") -.Output("grad_net: float"); -#endif +.Output("grad_net: T"); + +using CPUDevice = Eigen::ThreadPoolDevice; +template class SoftMinVirialGradOp : public OpKernel { public: @@ -97,12 +88,12 @@ class SoftMinVirialGradOp : public OpKernel OP_REQUIRES_OK(context, context->allocate_output(0, grad_net_shape, &grad_net_tensor)); // flat the tensors - auto grad = grad_tensor .matrix(); - auto du = du_tensor .matrix(); - auto sw_deriv = sw_deriv_tensor .matrix(); - auto rij = rij_tensor .matrix(); + auto grad = grad_tensor .matrix(); + auto du = du_tensor .matrix(); + auto sw_deriv = sw_deriv_tensor .matrix(); + auto rij = rij_tensor .matrix(); auto nlist = nlist_tensor .matrix(); - auto grad_net = grad_net_tensor ->matrix(); + auto grad_net = grad_net_tensor ->matrix(); // loop over frames #pragma omp parallel for @@ -148,4 +139,9 @@ class SoftMinVirialGradOp : public OpKernel } }; -REGISTER_KERNEL_BUILDER(Name("SoftMinVirialGrad").Device(DEVICE_CPU), SoftMinVirialGradOp); +// Register the CPU kernels. 
+#define REGISTER_CPU(T) \ +REGISTER_KERNEL_BUILDER( \ + Name("SoftMinVirialGrad").Device(DEVICE_CPU).TypeConstraint("T"), \ + SoftMinVirialGradOp); +REGISTER_CPU(VALUETYPE); \ No newline at end of file diff --git a/source/op/tab_inter.cc b/source/op/tab_inter.cc index 242e52a6e7..5e86608e9a 100644 --- a/source/op/tab_inter.cc +++ b/source/op/tab_inter.cc @@ -12,38 +12,25 @@ typedef double VALUETYPE; typedef float VALUETYPE; #endif -#ifdef HIGH_PREC -REGISTER_OP("TabInter") -.Input("table_info: double") -.Input("table_data: double") -.Input("type: int32") -.Input("rij: double") -.Input("nlist: int32") -.Input("natoms: int32") -.Input("scale: double") -.Attr("sel_a: list(int)") -.Attr("sel_r: list(int)") -.Output("atom_energy: double") -.Output("force: double") -.Output("atom_virial: double"); -#else REGISTER_OP("TabInter") +.Attr("T: {float, double}") .Input("table_info: double") .Input("table_data: double") .Input("type: int32") -.Input("rij: float") +.Input("rij: T") .Input("nlist: int32") .Input("natoms: int32") -.Input("scale: float") +.Input("scale: T") .Attr("sel_a: list(int)") .Attr("sel_r: list(int)") -.Output("atom_energy: float") -.Output("force: float") -.Output("atom_virial: float"); -#endif +.Output("atom_energy: T") +.Output("force: T") +.Output("atom_virial: T"); using namespace tensorflow; +using CPUDevice = Eigen::ThreadPoolDevice; + inline void tabulated_inter (double & ener, double & fscale, @@ -86,6 +73,7 @@ void tabulated_inter (double & ener, fscale *= -hi; } +template class TabInterOp : public OpKernel { public: explicit TabInterOp(OpKernelConstruction* context) : OpKernel(context) { @@ -156,15 +144,15 @@ class TabInterOp : public OpKernel { OP_REQUIRES_OK(context, context->allocate_output(tmp_idx++, virial_shape, &virial_tensor)); // flat the tensors - auto table_info = table_info_tensor.flat(); - auto table_data = table_data_tensor.flat(); + auto table_info = table_info_tensor.flat(); + auto table_data = table_data_tensor.flat(); auto type = type_tensor .matrix(); - auto rij = rij_tensor .matrix(); + auto rij = rij_tensor .matrix(); auto nlist = nlist_tensor .matrix(); - auto scale = scale_tensor .matrix(); - auto energy = energy_tensor ->matrix(); - auto force = force_tensor ->matrix(); - auto virial = virial_tensor ->matrix(); + auto scale = scale_tensor .matrix(); + auto energy = energy_tensor ->matrix(); + auto force = force_tensor ->matrix(); + auto virial = virial_tensor ->matrix(); OP_REQUIRES (context, (ntypes == int(table_info(3)+0.1)), errors::InvalidArgument ("ntypes provided in table does not match deeppot")); int nspline = table_info(2)+0.1; @@ -203,7 +191,7 @@ class TabInterOp : public OpKernel { for (int tt = 0; tt < ntypes; ++tt) { for (int ii = 0; ii < natoms(2+tt); ++ii){ int i_type = type(kk, i_idx); - VALUETYPE i_scale = scale(kk, i_idx); + T i_scale = scale(kk, i_idx); assert(i_type == tt) ; int jiter = 0; // a neighbor @@ -318,7 +306,12 @@ class TabInterOp : public OpKernel { } }; -REGISTER_KERNEL_BUILDER(Name("TabInter").Device(DEVICE_CPU), TabInterOp); +// Register the CPU kernels. 
+#define REGISTER_CPU(T) \ +REGISTER_KERNEL_BUILDER( \ + Name("TabInter").Device(DEVICE_CPU).TypeConstraint("T"), \ + TabInterOp); +REGISTER_CPU(VALUETYPE); From 4dd842c9e7b0cccf59221fd497da623cb18f5024 Mon Sep 17 00:00:00 2001 From: Lu Date: Sun, 13 Sep 2020 16:14:09 +0800 Subject: [PATCH 03/65] use standard styles for custome operators --- source/lib/include/CustomeOperation.h | 334 ++++++++++--- source/lib/include/DeviceFunctor.h | 39 +- source/lib/src/NNPInter.cc | 4 +- source/op/CMakeLists.txt | 5 +- source/op/cuda/descrpt_se_a.cu | 88 ++-- source/op/cuda/descrpt_se_r.cu | 455 ++++++++++++------ source/op/cuda/gelu.cu | 30 +- source/op/cuda/prod_force_se_a.cu | 26 +- source/op/cuda/prod_force_se_r.cu | 76 ++- source/op/cuda/prod_virial_se_a.cu | 28 +- source/op/cuda/prod_virial_se_r.cu | 67 +-- source/op/descrpt.cc | 27 +- source/op/descrpt_se_a.cc | 25 +- source/op/descrpt_se_a_multi_device.cc | 58 +-- source/op/descrpt_se_r.cc | 25 +- source/op/descrpt_se_r_gpu.cc | 236 --------- source/op/descrpt_se_r_multi_device.cc | 297 ++++++++++++ source/op/ewald_recp.cc | 35 +- source/op/gelu.cc | 132 +++-- source/op/gelu_gpu.cc | 159 ------ source/op/gelu_multi_device.cc | 54 +-- source/op/prod_force.cc | 17 +- source/op/prod_force_grad.cc | 19 +- source/op/prod_force_se_a.cc | 17 +- source/op/prod_force_se_a_grad.cc | 19 +- source/op/prod_force_se_a_multi_device.cc | 22 +- source/op/prod_force_se_r.cc | 17 +- source/op/prod_force_se_r_grad.cc | 19 +- ...gpu.cc => prod_force_se_r_multi_device.cc} | 109 ++--- source/op/prod_virial.cc | 33 +- source/op/prod_virial_grad.cc | 21 +- source/op/prod_virial_se_a.cc | 24 +- source/op/prod_virial_se_a_grad.cc | 21 +- source/op/prod_virial_se_a_multi_device.cc | 30 +- source/op/prod_virial_se_r.cc | 24 +- source/op/prod_virial_se_r_grad.cc | 21 +- ...pu.cc => prod_virial_se_r_multi_device.cc} | 124 ++--- source/op/soft_min.cc | 48 +- source/op/soft_min_force.cc | 18 +- source/op/soft_min_force_grad.cc | 19 +- source/op/soft_min_virial.cc | 23 +- source/op/soft_min_virial_grad.cc | 20 +- source/op/tab_inter.cc | 26 +- 43 files changed, 1513 insertions(+), 1378 deletions(-) delete mode 100644 source/op/descrpt_se_r_gpu.cc create mode 100644 source/op/descrpt_se_r_multi_device.cc delete mode 100644 source/op/gelu_gpu.cc rename source/op/{prod_force_se_r_gpu.cc => prod_force_se_r_multi_device.cc} (55%) rename source/op/{prod_virial_se_r_gpu.cc => prod_virial_se_r_multi_device.cc} (53%) diff --git a/source/lib/include/CustomeOperation.h b/source/lib/include/CustomeOperation.h index c0bd8fdbd5..c446db8130 100644 --- a/source/lib/include/CustomeOperation.h +++ b/source/lib/include/CustomeOperation.h @@ -23,11 +23,11 @@ struct NeighborInfo { } }; -template +template inline void spline5_switch ( - T & vv, - T & dd, - const T & xx, + FPTYPE & vv, + FPTYPE & dd, + const FPTYPE & xx, const float & rmin, const float & rmax) { @@ -36,8 +36,8 @@ inline void spline5_switch ( vv = 1; } else if (xx < rmax) { - T uu = (xx - rmin) / (rmax - rmin) ; - T du = 1. / (rmax - rmin) ; + FPTYPE uu = (xx - rmin) / (rmax - rmin) ; + FPTYPE du = 1. 
/ (rmax - rmin) ; vv = uu*uu*uu * (-6 * uu*uu + 15 * uu - 10) + 1; dd = ( 3 * uu*uu * (-6 * uu*uu + 15 * uu - 10) + uu*uu*uu * (-12 * uu + 15) ) * du; } @@ -47,10 +47,10 @@ inline void spline5_switch ( } } -template +template int format_nlist_fill_se_a_cpu ( vector & fmt_nei_idx_a, - const vector & posi, + const vector & posi, const int & ntypes, const vector & type, const int & i_idx, @@ -67,12 +67,12 @@ int format_nlist_fill_se_a_cpu ( vector sel_nei; sel_nei.reserve (nei_idx_a.size()); for (unsigned kk = 0; kk < nei_idx.size(); ++kk) { - T diff[3]; + FPTYPE diff[3]; const int & j_idx = nei_idx[kk]; for (int dd = 0; dd < 3; ++dd) { diff[dd] = posi[j_idx * 3 + dd] - posi[i_idx * 3 + dd]; } - T rr = sqrt(MathUtilities::dot (diff, diff)); + FPTYPE rr = sqrt(MathUtilities::dot (diff, diff)); if (rr <= rcut) { sel_nei.push_back(NeighborInfo(type[j_idx], rr, j_idx)); } @@ -90,12 +90,12 @@ int format_nlist_fill_se_a_cpu ( return overflowed; } -template +template void compute_descriptor_se_a_cpu ( - vector & descrpt_a, - vector & descrpt_a_deriv, - vector & rij_a, - const vector & posi, + vector & descrpt_a, + vector & descrpt_a_deriv, + vector & rij_a, + const vector & posi, const int & ntypes, const vector & type, const int & i_idx, @@ -127,14 +127,14 @@ void compute_descriptor_se_a_cpu ( for (int sec_iter = 0; sec_iter < int(sec_a.size()) - 1; ++sec_iter) { for (int nei_iter = sec_a[sec_iter]; nei_iter < sec_a[sec_iter+1]; ++nei_iter) { if (fmt_nlist_a[nei_iter] < 0) break; - const T * rr = &rij_a[nei_iter * 3]; - T nr2 = MathUtilities::dot(rr, rr); - T inr = 1./sqrt(nr2); - T nr = nr2 * inr; - T inr2 = inr * inr; - T inr4 = inr2 * inr2; - T inr3 = inr4 * nr; - T sw, dsw; + const FPTYPE * rr = &rij_a[nei_iter * 3]; + FPTYPE nr2 = MathUtilities::dot(rr, rr); + FPTYPE inr = 1./sqrt(nr2); + FPTYPE nr = nr2 * inr; + FPTYPE inr2 = inr * inr; + FPTYPE inr4 = inr2 * inr2; + FPTYPE inr3 = inr4 * nr; + FPTYPE sw, dsw; spline5_switch(sw, dsw, nr, rmin, rmax); int idx_deriv = nei_iter * 4 * 3; // 4 components time 3 directions int idx_value = nei_iter * 4; // 4 components @@ -168,10 +168,10 @@ void compute_descriptor_se_a_cpu ( } } -template -void DescrptSeACPULauncher(const T * coord, const int * type, const int * ilist, const int * jrange, const int * jlist, const T * avg, const T * std, T * descrpt, T * descrpt_deriv, T * rij, int * nlist, const int nloc, const int nall, const int nnei, const int ntypes, const int ndescrpt, const float rcut_r, const float rcut_r_smth, const std::vector sec_a, const bool fill_nei_a, const int magic_number) { +template +void DescrptSeACPULauncher(const FPTYPE * coord, const int * type, const int * ilist, const int * jrange, const int * jlist, const FPTYPE * avg, const FPTYPE * std, FPTYPE * descrpt, FPTYPE * descrpt_deriv, FPTYPE * rij, int * nlist, const int nloc, const int nall, const int nnei, const int ntypes, const int ndescrpt, const float rcut_r, const float rcut_r_smth, const std::vector sec_a, const bool fill_nei_a, const int magic_number) { // set & normalize coord - std::vector d_coord3(nall * 3); + std::vector d_coord3(nall * 3); for (int ii = 0; ii < nall; ++ii) { for (int dd = 0; dd < 3; ++dd) { d_coord3[ii * 3 + dd] = coord[ii * 3 + dd]; @@ -205,11 +205,11 @@ void DescrptSeACPULauncher(const T * coord, const int * type, const int * ilist, if (fill_nei_a) { format_nlist_fill_se_a_cpu(fmt_nlist_a, d_coord3, ntypes, d_type, ii, d_nlist_a[ii], rcut_r, sec_a); } - std::vector d_descrpt_a; - std::vector d_descrpt_a_deriv; - std::vector d_descrpt_r; - 
std::vector d_descrpt_r_deriv; - std::vector d_rij_a; + std::vector d_descrpt_a; + std::vector d_descrpt_a_deriv; + std::vector d_descrpt_r; + std::vector d_descrpt_r_deriv; + std::vector d_rij_a; compute_descriptor_se_a_cpu (d_descrpt_a, d_descrpt_a_deriv, d_rij_a, d_coord3, ntypes, d_type, ii, fmt_nlist_a, sec_a, rcut_r_smth, rcut_r); // check sizes @@ -234,9 +234,9 @@ void DescrptSeACPULauncher(const T * coord, const int * type, const int * ilist, } #if GOOGLE_CUDA -template -void DescrptSeAGPULauncher(const T * coord, const int * type, const int * ilist, const int * jrange, const int * jlist, int * array_int, unsigned long long * array_longlong, const T * avg, const T * std, T * descrpt, T * descrpt_deriv, T * rij, int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const float rcut_r, const float rcut_r_smth, const std::vector sec_a, const bool fill_nei_a, const int magic_number) { - DescrptSeAGPUExecuteFunctor()(coord, type, ilist, jrange, jlist, array_int, array_longlong, avg, std, descrpt, descrpt_deriv, rij, nlist, nloc, nall, nnei, ndescrpt, rcut_r, rcut_r_smth, sec_a, fill_nei_a, magic_number); +template +void DescrptSeAGPULauncher(const FPTYPE * coord, const int * type, const int * ilist, const int * jrange, const int * jlist, int * array_int, unsigned long long * array_longlong, const FPTYPE * avg, const FPTYPE * std, FPTYPE * descrpt, FPTYPE * descrpt_deriv, FPTYPE * rij, int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const float rcut_r, const float rcut_r_smth, const std::vector sec_a, const bool fill_nei_a, const int magic_number) { + DescrptSeAGPUExecuteFunctor()(coord, type, ilist, jrange, jlist, array_int, array_longlong, avg, std, descrpt, descrpt_deriv, rij, nlist, nloc, nall, nnei, ndescrpt, rcut_r, rcut_r_smth, sec_a, fill_nei_a, magic_number); } #endif // GOOGLE_CUDA // ****************************************************************************** @@ -254,9 +254,9 @@ inline void make_descript_range (int & idx_start, int & idx_end, const int & nei } } -template -void ProdForceSeACPULauncher(T * force, const T * net_deriv, const T * in_deriv, const int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const int n_a_sel, const int n_a_shift) { - memset(force, 0.0, sizeof(T) * nall * 3); +template +void ProdForceSeACPULauncher(FPTYPE * force, const FPTYPE * net_deriv, const FPTYPE * in_deriv, const int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const int n_a_sel, const int n_a_shift) { + memset(force, 0.0, sizeof(FPTYPE) * nall * 3); // compute force of a frame for (int i_idx = 0; i_idx < nloc; ++i_idx) { // deriv wrt center atom @@ -281,9 +281,9 @@ void ProdForceSeACPULauncher(T * force, const T * net_deriv, const T * in_deriv, } #if GOOGLE_CUDA -template -void ProdForceSeAGPULauncher(T * force, const T * net_deriv, const T * in_deriv, const int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const int n_a_sel, const int n_a_shift) { - ProdForceSeAGPUExecuteFunctor()(force, net_deriv, in_deriv, nlist, nloc, nall, nnei, ndescrpt, n_a_sel, n_a_shift); +template +void ProdForceSeAGPULauncher(FPTYPE * force, const FPTYPE * net_deriv, const FPTYPE * in_deriv, const int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const int n_a_sel, const int n_a_shift) { + ProdForceSeAGPUExecuteFunctor()(force, net_deriv, in_deriv, nlist, nloc, nall, nnei, ndescrpt, n_a_sel, n_a_shift); } #endif // 
GOOGLE_CUDA @@ -291,10 +291,10 @@ void ProdForceSeAGPULauncher(T * force, const T * net_deriv, const T * in_deriv, // end of custome op ProdForceSeA // ****************************************************************************** -template -void ProdVirialSeACPULauncher(T * virial, T * atom_virial, const T * net_deriv, const T * in_deriv, const T * rij, const int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const int n_a_sel, const int n_a_shift) { - memset(virial, 0.0, sizeof(T) * 9); - memset(atom_virial, 0.0, sizeof(T) * nall * 9); +template +void ProdVirialSeACPULauncher(FPTYPE * virial, FPTYPE * atom_virial, const FPTYPE * net_deriv, const FPTYPE * in_deriv, const FPTYPE * rij, const int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const int n_a_sel, const int n_a_shift) { + memset(virial, 0.0, sizeof(FPTYPE) * 9); + memset(atom_virial, 0.0, sizeof(FPTYPE) * nall * 9); // compute virial of a frame for (int i_idx = 0; i_idx < nloc; ++i_idx) { @@ -305,10 +305,10 @@ void ProdVirialSeACPULauncher(T * virial, T * atom_virial, const T * net_deriv, int aa_start, aa_end; make_descript_range (aa_start, aa_end, jj, n_a_sel, n_a_shift); for (int aa = aa_start; aa < aa_end; ++aa) { - T pref = -1.0 * net_deriv[i_idx * ndescrpt + aa]; + FPTYPE pref = -1.0 * net_deriv[i_idx * ndescrpt + aa]; for (int dd0 = 0; dd0 < 3; ++dd0) for (int dd1 = 0; dd1 < 3; ++dd1) { - T tmp_v = pref * rij[i_idx * nnei * 3 + jj * 3 + dd1] * in_deriv[i_idx * ndescrpt * 3 + aa * 3 + dd0]; + FPTYPE tmp_v = pref * rij[i_idx * nnei * 3 + jj * 3 + dd1] * in_deriv[i_idx * ndescrpt * 3 + aa * 3 + dd0]; virial[dd0 * 3 + dd1] -= tmp_v; atom_virial[j_idx * 9 + dd0 * 3 + dd1] -= tmp_v; } @@ -318,55 +318,255 @@ void ProdVirialSeACPULauncher(T * virial, T * atom_virial, const T * net_deriv, } #if GOOGLE_CUDA -template -void ProdVirialSeAGPULauncher(T * virial, T * atom_virial, const T * net_deriv, const T * in_deriv, const T * rij, const int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const int n_a_sel, const int n_a_shift) { - ProdVirialSeAGPUExecuteFunctor()(virial, atom_virial, net_deriv, in_deriv, rij, nlist, nloc, nall, nnei, ndescrpt, n_a_sel, n_a_shift); +template +void ProdVirialSeAGPULauncher(FPTYPE * virial, FPTYPE * atom_virial, const FPTYPE * net_deriv, const FPTYPE * in_deriv, const FPTYPE * rij, const int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const int n_a_sel, const int n_a_shift) { + ProdVirialSeAGPUExecuteFunctor()(virial, atom_virial, net_deriv, in_deriv, rij, nlist, nloc, nall, nnei, ndescrpt, n_a_sel, n_a_shift); } #endif // GOOGLE_CUDA // ****************************************************************************** // end of custome op ProdVirialSeA // ****************************************************************************** -template -void GeluCPULauncher(const T * in, T * out, int const size) { +template +void GeluCPULauncher(const FPTYPE * in, FPTYPE * out, int const size) { for (int ii = 0; ii < size; ii++) { out[ii] = in[ii] * 0.5 * (1.0 + tanh(SQRT_2_PI * (in[ii] + 0.044715 * in[ii] * in[ii] *in[ii]))); } } -template -void GeluGradCPULauncher(const T * dy, const T * in, T * out, int const size) { +template +void GeluGradCPULauncher(const FPTYPE * dy, const FPTYPE * in, FPTYPE * out, int const size) { for (int ii = 0; ii < size; ii++) { - T const var1 = tanh(SQRT_2_PI * (in[ii] + 0.044715 * in[ii] * in[ii] *in[ii])); + FPTYPE const var1 = tanh(SQRT_2_PI * (in[ii] + 
0.044715 * in[ii] * in[ii] *in[ii])); out[ii] = dy[ii] * (0.5 * SQRT_2_PI * in[ii] * (1 - var1 * var1) * (0.134145 * in[ii] * in[ii] + 1) + 0.5 * var1 + 0.5); } } -template -void GeluGradGradCPULauncher(const T * dy, const T * dy_, const T * in, T * out, int const size) { +template +void GeluGradGradCPULauncher(const FPTYPE * dy, const FPTYPE * dy_, const FPTYPE * in, FPTYPE * out, int const size) { for (int ii = 0; ii < size; ii++) { - T const var1 = tanh(SQRT_2_PI * (in[ii] + 0.044715 * in[ii] * in[ii] *in[ii])); - T const var2 = SQRT_2_PI * (1 - var1 * var1) * (0.134145 * in[ii] * in[ii] + 1); + FPTYPE const var1 = tanh(SQRT_2_PI * (in[ii] + 0.044715 * in[ii] * in[ii] *in[ii])); + FPTYPE const var2 = SQRT_2_PI * (1 - var1 * var1) * (0.134145 * in[ii] * in[ii] + 1); out[ii] = dy[ii] * dy_[ii] * (0.134145 * SQRT_2_PI * in[ii] * in[ii] * (1 - var1 * var1) - SQRT_2_PI * in[ii] * var2 * (0.134145 * in[ii] * in[ii] + 1) * var1 + var2); } } #if GOOGLE_CUDA -template -void GeluGPULauncher(const T * in, T * out, int const size) { - GeluGPUExecuteFunctor()(in, out, size); +template +void GeluGPULauncher(const FPTYPE * in, FPTYPE * out, int const size) { + GeluGPUExecuteFunctor()(in, out, size); } -template -void GeluGradGPULauncher(const T * dy, const T * in, T * out, int const size) { - GeluGradGPUExecuteFunctor()(dy, in, out, size); +template +void GeluGradGPULauncher(const FPTYPE * dy, const FPTYPE * in, FPTYPE * out, int const size) { + GeluGradGPUExecuteFunctor()(dy, in, out, size); } -template -void GeluGradGradGPULauncher(const T * dy, const T * dy_, const T * in, T * out, int const size) { - GeluGradGradGPUExecuteFunctor()(dy, dy_, in, out, size); +template +void GeluGradGradGPULauncher(const FPTYPE * dy, const FPTYPE * dy_, const FPTYPE * in, FPTYPE * out, int const size) { + GeluGradGradGPUExecuteFunctor()(dy, dy_, in, out, size); } #endif // GOOGLE_CUDA // ****************************************************************************** // end of custome op Gelu // ****************************************************************************** + +template +void compute_descriptor_se_r_cpu ( + vector & descrpt_a, + vector & descrpt_a_deriv, + vector & rij_a, + const vector & posi, + const int & ntypes, + const vector & type, + const int & i_idx, + const vector & fmt_nlist_a, + const vector & sec_a, + const float & rmin, + const float & rmax) +{ + // compute the diff of the neighbors + rij_a.resize (sec_a.back() * 3); + fill (rij_a.begin(), rij_a.end(), 0.0); + for (int ii = 0; ii < int(sec_a.size()) - 1; ++ii) { + for (int jj = sec_a[ii]; jj < sec_a[ii + 1]; ++jj) { + if (fmt_nlist_a[jj] < 0) break; + const int & j_idx = fmt_nlist_a[jj]; + + for (int dd = 0; dd < 3; ++dd) { + rij_a[jj * 3 + dd] = posi[j_idx * 3 + dd] - posi[i_idx * 3 + dd]; + } + } + } + // 1./rr, cos(theta), cos(phi), sin(phi) + descrpt_a.resize (sec_a.back()); + fill (descrpt_a.begin(), descrpt_a.end(), 0.0); + // deriv wrt center: 3 + descrpt_a_deriv.resize (sec_a.back() * 3); + fill (descrpt_a_deriv.begin(), descrpt_a_deriv.end(), 0.0); + + for (int sec_iter = 0; sec_iter < int(sec_a.size()) - 1; ++sec_iter) { + for (int nei_iter = sec_a[sec_iter]; nei_iter < sec_a[sec_iter+1]; ++nei_iter) { + if (fmt_nlist_a[nei_iter] < 0) break; + const FPTYPE * rr = &rij_a[nei_iter * 3]; + FPTYPE nr2 = MathUtilities::dot(rr, rr); + FPTYPE inr = 1./sqrt(nr2); + FPTYPE nr = nr2 * inr; + FPTYPE inr2 = inr * inr; + FPTYPE inr4 = inr2 * inr2; + FPTYPE inr3 = inr4 * nr; + FPTYPE sw, dsw; + spline5_switch(sw, dsw, nr, rmin, rmax); + int 
idx_deriv = nei_iter * 3; // 1 components time 3 directions + int idx_value = nei_iter; // 1 components + // 4 value components + descrpt_a[idx_value + 0] = 1./nr; + // deriv of component 1/r + descrpt_a_deriv[idx_deriv + 0] = rr[0] * inr3 * sw - descrpt_a[idx_value + 0] * dsw * rr[0] * inr; + descrpt_a_deriv[idx_deriv + 1] = rr[1] * inr3 * sw - descrpt_a[idx_value + 0] * dsw * rr[1] * inr; + descrpt_a_deriv[idx_deriv + 2] = rr[2] * inr3 * sw - descrpt_a[idx_value + 0] * dsw * rr[2] * inr; + // 4 value components + descrpt_a[idx_value + 0] *= sw; + } + } +} + +template +void DescrptSeRCPULauncher(const FPTYPE * coord, const int * type, const int * ilist, const int * jrange, const int * jlist, const FPTYPE * avg, const FPTYPE * std, FPTYPE * descrpt, FPTYPE * descrpt_deriv, FPTYPE * rij, int * nlist, const int nloc, const int nall, const int nnei, const int ntypes, const int ndescrpt, const float rcut_r, const float rcut_r_smth, const std::vector sec_a, const bool fill_nei_a, const int magic_number) { + // set & normalize coord + std::vector d_coord3(nall * 3); + for (int ii = 0; ii < nall; ++ii) { + for (int dd = 0; dd < 3; ++dd) { + d_coord3[ii * 3 + dd] = coord[ii * 3 + dd]; + } + } + + // set type + std::vector d_type (nall); + for (int ii = 0; ii < nall; ++ii) { + d_type[ii] = type[ii]; + } + + // build nlist + std::vector > d_nlist_a(nloc); + + for (unsigned ii = 0; ii < nloc; ++ii) { + d_nlist_a.reserve (jrange[nloc] / nloc + 10); + } + for (unsigned ii = 0; ii < nloc; ++ii) { + int i_idx = ilist[ii]; + for (unsigned jj = jrange[ii]; jj < jrange[ii+1]; ++jj) { + int j_idx = jlist[jj]; + d_nlist_a[i_idx].push_back (j_idx); + } + } + + #pragma omp parallel for + for (int ii = 0; ii < nloc; ++ii) { + vector fmt_nlist_a; + int ret = -1; + if (fill_nei_a) { + format_nlist_fill_se_a_cpu(fmt_nlist_a, d_coord3, ntypes, d_type, ii, d_nlist_a[ii], rcut_r, sec_a); + } + std::vector d_descrpt_a; + std::vector d_descrpt_a_deriv; + std::vector d_descrpt_r; + std::vector d_descrpt_r_deriv; + std::vector d_rij_a; + compute_descriptor_se_r_cpu (d_descrpt_a, d_descrpt_a_deriv, d_rij_a, d_coord3, ntypes, d_type, ii, fmt_nlist_a, sec_a, rcut_r_smth, rcut_r); + + // check sizes + assert (d_descrpt_a.size() == ndescrpt); + assert (d_descrpt_a_deriv.size() == ndescrpt * 3); + assert (d_rij_a.size() == nnei * 3); + assert (fmt_nlist_a.size() == nnei); + // record outputs + for (int jj = 0; jj < ndescrpt; ++jj) { + descrpt[ii * ndescrpt + jj] = (d_descrpt_a[jj] - avg[d_type[ii] * ndescrpt + jj]) / std[d_type[ii] * ndescrpt + jj]; + } + for (int jj = 0; jj < ndescrpt * 3; ++jj) { + descrpt_deriv[ii * ndescrpt * 3 + jj] = d_descrpt_a_deriv[jj] / std[d_type[ii] * ndescrpt + jj / 3]; + } + for (int jj = 0; jj < nnei * 3; ++jj) { + rij[ii * nnei * 3 + jj] = d_rij_a[jj]; + } + for (int jj = 0; jj < nnei; ++jj) { + nlist[ii * nnei + jj] = fmt_nlist_a[jj]; + } + } +} + +#if GOOGLE_CUDA +template +void DescrptSeRGPULauncher(const FPTYPE * coord, const int * type, const int * ilist, const int * jrange, const int * jlist, int * array_int, unsigned long long * array_longlong, const FPTYPE * avg, const FPTYPE * std, FPTYPE * descrpt, FPTYPE * descrpt_deriv, FPTYPE * rij, int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const float rcut_r, const float rcut_r_smth, const std::vector sec_a, const bool fill_nei_a, const int magic_number) { + DescrptSeRGPUExecuteFunctor()(coord, type, ilist, jrange, jlist, array_int, array_longlong, avg, std, descrpt, descrpt_deriv, rij, nlist, nloc, nall, 
nnei, ndescrpt, rcut_r, rcut_r_smth, sec_a, fill_nei_a, magic_number); +} +#endif // GOOGLE_CUDA +// ****************************************************************************** +// end of custome op DescrptSeR +// ****************************************************************************** + +template +void ProdForceSeRCPULauncher(FPTYPE * force, const FPTYPE * net_deriv, const FPTYPE * in_deriv, const int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt) { + memset(force, 0.0, sizeof(FPTYPE) * nall * 3); + // compute force of a frame + for (int i_idx = 0; i_idx < nloc; ++i_idx) { + // deriv wrt center atom + for (int aa = 0; aa < ndescrpt; ++aa) { + force[i_idx * 3 + 0] -= net_deriv[i_idx * ndescrpt + aa] * in_deriv[i_idx * ndescrpt * 3 + aa * 3 + 0]; + force[i_idx * 3 + 1] -= net_deriv[i_idx * ndescrpt + aa] * in_deriv[i_idx * ndescrpt * 3 + aa * 3 + 1]; + force[i_idx * 3 + 2] -= net_deriv[i_idx * ndescrpt + aa] * in_deriv[i_idx * ndescrpt * 3 + aa * 3 + 2]; + } + // deriv wrt neighbors + for (int jj = 0; jj < nnei; ++jj) { + int j_idx = nlist[i_idx * nnei + jj]; + if (j_idx < 0) continue; + force[j_idx * 3 + 0] += net_deriv[i_idx * ndescrpt + jj] * in_deriv[i_idx * ndescrpt * 3 + jj * 3 + 0]; + force[j_idx * 3 + 1] += net_deriv[i_idx * ndescrpt + jj] * in_deriv[i_idx * ndescrpt * 3 + jj * 3 + 1]; + force[j_idx * 3 + 2] += net_deriv[i_idx * ndescrpt + jj] * in_deriv[i_idx * ndescrpt * 3 + jj * 3 + 2]; + } + } +} + +#if GOOGLE_CUDA +template +void ProdForceSeRGPULauncher(FPTYPE * force, const FPTYPE * net_deriv, const FPTYPE * in_deriv, const int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt) { + ProdForceSeRGPUExecuteFunctor()(force, net_deriv, in_deriv, nlist, nloc, nall, nnei, ndescrpt); +} +#endif // GOOGLE_CUDA + +// ****************************************************************************** +// end of custome op ProdForceSeR +// ****************************************************************************** + +template +void ProdVirialSeRCPULauncher(FPTYPE * virial, FPTYPE * atom_virial, const FPTYPE * net_deriv, const FPTYPE * in_deriv, const FPTYPE * rij, const int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt) { + memset(virial, 0.0, sizeof(FPTYPE) * 9); + memset(atom_virial, 0.0, sizeof(FPTYPE) * nall * 9); + + // compute virial of a frame + for (int i_idx = 0; i_idx < nloc; ++i_idx) { + // deriv wrt neighbors + for (int jj = 0; jj < nnei; ++jj) { + int j_idx = nlist[i_idx * nnei + jj]; + if (j_idx < 0) continue; + FPTYPE pref = -1.0 * net_deriv[i_idx * ndescrpt + jj]; + for (int dd0 = 0; dd0 < 3; ++dd0) + for (int dd1 = 0; dd1 < 3; ++dd1) { + FPTYPE tmp_v = pref * rij[i_idx * nnei * 3 + jj * 3 + dd1] * in_deriv[i_idx * ndescrpt * 3 + jj * 3 + dd0]; + virial[dd0 * 3 + dd1] -= tmp_v; + atom_virial[j_idx * 9 + dd0 * 3 + dd1] -= tmp_v; + } + } + } +} + +#if GOOGLE_CUDA +template +void ProdVirialSeRGPULauncher(FPTYPE * virial, FPTYPE * atom_virial, const FPTYPE * net_deriv, const FPTYPE * in_deriv, const FPTYPE * rij, const int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt) { + ProdVirialSeRGPUExecuteFunctor()(virial, atom_virial, net_deriv, in_deriv, rij, nlist, nloc, nall, nnei, ndescrpt); +} +#endif // GOOGLE_CUDA +// ****************************************************************************** +// end of custome op ProdVirialSeR +// ****************************************************************************** diff --git 
a/source/lib/include/DeviceFunctor.h b/source/lib/include/DeviceFunctor.h index f02ef55651..d51d617f84 100644 --- a/source/lib/include/DeviceFunctor.h +++ b/source/lib/include/DeviceFunctor.h @@ -16,32 +16,47 @@ inline void cudaAssert(cudaError_t code, const char *file, int line, bool abort= } } -template +template struct DescrptSeAGPUExecuteFunctor { - void operator()(const T * coord, const int * type, const int * ilist, const int * jrange, const int * jlist, int * array_int, unsigned long long * array_longlong, const T * avg, const T * std, T * descript, T * descript_deriv, T * rij, int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const float rcut_r, const float rcut_r_smth, const std::vector sec_a, const bool fill_nei_a, const int MAGIC_NUMBER); + void operator()(const FPTYPE * coord, const int * type, const int * ilist, const int * jrange, const int * jlist, int * array_int, unsigned long long * array_longlong, const FPTYPE * avg, const FPTYPE * std, FPTYPE * descript, FPTYPE * descript_deriv, FPTYPE * rij, int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const float rcut_r, const float rcut_r_smth, const std::vector sec_a, const bool fill_nei_a, const int MAGIC_NUMBER); }; -template +template +struct DescrptSeRGPUExecuteFunctor { + void operator()(const FPTYPE * coord, const int * type, const int * ilist, const int * jrange, const int * jlist, int * array_int, unsigned long long * array_longlong, const FPTYPE * avg, const FPTYPE * std, FPTYPE * descript, FPTYPE * descript_deriv, FPTYPE * rij, int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const float rcut_r, const float rcut_r_smth, const std::vector sec_a, const bool fill_nei_a, const int MAGIC_NUMBER); +}; + +template struct ProdForceSeAGPUExecuteFunctor { - void operator()(T * force, const T * net_derive, const T * in_deriv, const int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const int n_a_sel, const int n_a_shift); + void operator()(FPTYPE * force, const FPTYPE * net_derive, const FPTYPE * in_deriv, const int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const int n_a_sel, const int n_a_shift); +}; + +template +struct ProdForceSeRGPUExecuteFunctor { + void operator()(FPTYPE * force, const FPTYPE * net_derive, const FPTYPE * in_deriv, const int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt); }; -template +template struct ProdVirialSeAGPUExecuteFunctor { - void operator()(T * virial, T * atom_virial, const T * net_deriv, const T * in_deriv, const T * rij, const int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const int n_a_sel, const int n_a_shift); + void operator()(FPTYPE * virial, FPTYPE * atom_virial, const FPTYPE * net_deriv, const FPTYPE * in_deriv, const FPTYPE * rij, const int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const int n_a_sel, const int n_a_shift); +}; + +template +struct ProdVirialSeRGPUExecuteFunctor { + void operator()(FPTYPE * virial, FPTYPE * atom_virial, const FPTYPE * net_deriv, const FPTYPE * in_deriv, const FPTYPE * rij, const int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt); }; -template +template struct GeluGPUExecuteFunctor { - void operator()(const T * in, T * out, const int size); + void operator()(const FPTYPE * in, FPTYPE * out, const int size); }; -template +template struct GeluGradGPUExecuteFunctor { - void 
operator()(const T * dy, const T * in, T * out, const int size); + void operator()(const FPTYPE * dy, const FPTYPE * in, FPTYPE * out, const int size); }; -template +template struct GeluGradGradGPUExecuteFunctor { - void operator()(const T * dy, const T * dy_, const T * in, T * out, const int size); + void operator()(const FPTYPE * dy, const FPTYPE * dy_, const FPTYPE * in, FPTYPE * out, const int size); }; \ No newline at end of file diff --git a/source/lib/src/NNPInter.cc b/source/lib/src/NNPInter.cc index b262851450..879c36d5c0 100644 --- a/source/lib/src/NNPInter.cc +++ b/source/lib/src/NNPInter.cc @@ -203,7 +203,7 @@ init (const string & model, const int & gpu_rank) options.config.set_allow_soft_placement(true); options.config.mutable_gpu_options()->set_per_process_gpu_memory_fraction(0.9); options.config.mutable_gpu_options()->set_allow_growth(true); - cudaErrcheck(cudaSetDevice(gpu_rank)); + cudaErrcheck(cudaSetDevice(gpu_rank % gpu_num)); std::string str = "/gpu:"; str += std::to_string(gpu_rank % gpu_num); graph::SetDefaultDevice(str, &graph_def); @@ -518,7 +518,7 @@ init (const vector & models, const int & gpu_rank) options.config.set_allow_soft_placement(true); options.config.mutable_gpu_options()->set_per_process_gpu_memory_fraction(0.9); options.config.mutable_gpu_options()->set_allow_growth(true); - cudaErrcheck(cudaSetDevice(gpu_rank)); + cudaErrcheck(cudaSetDevice(gpu_rank % gpu_num)); } #endif // GOOGLE_CUDA diff --git a/source/op/CMakeLists.txt b/source/op/CMakeLists.txt index 21f675c845..c4d97cd85a 100644 --- a/source/op/CMakeLists.txt +++ b/source/op/CMakeLists.txt @@ -4,8 +4,7 @@ set(OP_LIB ${PROJECT_SOURCE_DIR}/lib/src/SimulationRegion.cpp ${PROJECT_SOURCE_D set (OP_CXX_FLAG -D_GLIBCXX_USE_CXX11_ABI=${OP_CXX_ABI} ) file(GLOB OP_SRC prod_force.cc prod_virial.cc descrpt.cc descrpt_se_a.cc descrpt_se_r.cc tab_inter.cc prod_force_se_a.cc prod_virial_se_a.cc prod_force_se_r.cc prod_virial_se_r.cc soft_min.cc soft_min_force.cc soft_min_virial.cc ewald_recp.cc gelu.cc) -file(GLOB OP_PY_CUDA_SRC prod_force.cc prod_virial.cc descrpt.cc descrpt_se_a.cc descrpt_se_r.cc tab_inter.cc prod_force_se_a.cc prod_virial_se_a.cc prod_force_se_r.cc prod_virial_se_r.cc soft_min.cc soft_min_force.cc soft_min_virial.cc ewald_recp.cc gelu_gpu.cc) -file(GLOB OP_CUDA_SRC prod_force.cc prod_virial.cc descrpt.cc descrpt_se_a_multi_device.cc descrpt_se_r_gpu.cc tab_inter.cc prod_force_se_a_multi_device.cc prod_virial_se_a_multi_device.cc prod_force_se_r_gpu.cc prod_virial_se_r_gpu.cc soft_min.cc soft_min_force.cc soft_min_virial.cc gelu_multi_device.cc) +file(GLOB OP_CUDA_SRC prod_force.cc prod_virial.cc descrpt.cc descrpt_se_a_multi_device.cc descrpt_se_r_multi_device.cc tab_inter.cc prod_force_se_a_multi_device.cc prod_virial_se_a_multi_device.cc prod_force_se_r_multi_device.cc prod_virial_se_r_multi_device.cc soft_min.cc soft_min_force.cc soft_min_virial.cc gelu_multi_device.cc) file(GLOB OP_GRADS_SRC prod_force_grad.cc prod_force_se_a_grad.cc prod_force_se_r_grad.cc prod_virial_grad.cc prod_virial_se_a_grad.cc prod_virial_se_r_grad.cc soft_min_force_grad.cc soft_min_virial_grad.cc ) file(GLOB OP_PY *.py) @@ -27,7 +26,7 @@ if (BUILD_PY_IF) set(CMAKE_BUILD_WITH_INSTALL_RPATH TRUE) set(CMAKE_INSTALL_RPATH DESTINATION ${DP_PIP_INSTALL_PATH} ${DP_SETUP_INSTALL_PATH} ${CMAKE_BINARY_DIR}/op/cuda) if (USE_CUDA_TOOLKIT) - add_library(op_abi SHARED ${OP_PY_CUDA_SRC} ${OP_LIB}) + add_library(op_abi SHARED ${OP_SRC} ${OP_LIB}) add_library(op_grads SHARED ${OP_GRADS_SRC}) add_subdirectory(cuda) 
find_package(CUDA REQUIRED) diff --git a/source/op/cuda/descrpt_se_a.cu b/source/op/cuda/descrpt_se_a.cu index afed0dbe83..5965254111 100644 --- a/source/op/cuda/descrpt_se_a.cu +++ b/source/op/cuda/descrpt_se_a.cu @@ -37,15 +37,15 @@ __global__ void BlockSortKernel( cub::StoreDirectStriped(threadIdx.x, d_out + block_offset, items); } -template -__device__ inline T dev_dot(T * arr1, T * arr2) { +template +__device__ inline FPTYPE dev_dot(FPTYPE * arr1, FPTYPE * arr2) { return arr1[0] * arr2[0] + arr1[1] * arr2[1] + arr1[2] * arr2[2]; } -template -__device__ inline void spline5_switch(T & vv, - T & dd, - T & xx, +template +__device__ inline void spline5_switch(FPTYPE & vv, + FPTYPE & dd, + FPTYPE & xx, const float & rmin, const float & rmax) { @@ -54,8 +54,8 @@ __device__ inline void spline5_switch(T & vv, vv = 1; } else if (xx < rmax) { - T uu = (xx - rmin) / (rmax - rmin) ; - T du = 1. / (rmax - rmin) ; + FPTYPE uu = (xx - rmin) / (rmax - rmin) ; + FPTYPE du = 1. / (rmax - rmin) ; vv = uu*uu*uu * (-6 * uu*uu + 15 * uu - 10) + 1; dd = ( 3 * uu*uu * (-6 * uu*uu + 15 * uu - 10) + uu*uu*uu * (-12 * uu + 15) ) * du; } @@ -76,8 +76,8 @@ __global__ void get_i_idx_se_a(const int nloc, i_idx[ilist[idy]] = idy; } -template -__global__ void format_nlist_fill_a_se_a(const T * coord, +template +__global__ void format_nlist_fill_a_se_a(const FPTYPE * coord, const int * type, const int * jrange, const int * jlist, @@ -100,12 +100,12 @@ __global__ void format_nlist_fill_a_se_a(const T * coord, int_64 * key_in = key + idx * MAGIC_NUMBER; - T diff[3]; + FPTYPE diff[3]; const int & j_idx = nei_idx[idy]; for (int dd = 0; dd < 3; dd++) { diff[dd] = coord[j_idx * 3 + dd] - coord[idx * 3 + dd]; } - T rr = sqrt(dev_dot(diff, diff)); + FPTYPE rr = sqrt(dev_dot(diff, diff)); if (rr <= rcut) { key_in[idy] = type[j_idx] * 1E15+ (int_64)(rr * 1.0E13) / 100000 * 100000 + j_idx; } @@ -147,19 +147,19 @@ __global__ void format_nlist_fill_b_se_a(int * nlist, } //it's ok! 
-template -__global__ void compute_descriptor_se_a (T* descript, +template +__global__ void compute_descriptor_se_a (FPTYPE* descript, const int ndescrpt, - T* descript_deriv, + FPTYPE* descript_deriv, const int descript_deriv_size, - T* rij, + FPTYPE* rij, const int rij_size, const int* type, - const T* avg, - const T* std, + const FPTYPE* avg, + const FPTYPE* std, int* nlist, const int nlist_size, - const T* coord, + const FPTYPE* coord, const float rmin, const float rmax, const int sec_a_size) @@ -172,9 +172,9 @@ __global__ void compute_descriptor_se_a (T* descript, if (idy >= sec_a_size) {return;} // else {return;} - T * row_descript = descript + idx * ndescrpt; - T * row_descript_deriv = descript_deriv + idx * descript_deriv_size; - T * row_rij = rij + idx * rij_size; + FPTYPE * row_descript = descript + idx * ndescrpt; + FPTYPE * row_descript_deriv = descript_deriv + idx * descript_deriv_size; + FPTYPE * row_rij = rij + idx * rij_size; int * row_nlist = nlist + idx * nlist_size; if (row_nlist[idy] >= 0) { @@ -182,14 +182,14 @@ __global__ void compute_descriptor_se_a (T* descript, for (int kk = 0; kk < 3; kk++) { row_rij[idy * 3 + kk] = coord[j_idx * 3 + kk] - coord[idx * 3 + kk]; } - const T * rr = &row_rij[idy * 3 + 0]; - T nr2 = dev_dot(rr, rr); - T inr = 1./sqrt(nr2); - T nr = nr2 * inr; - T inr2 = inr * inr; - T inr4 = inr2 * inr2; - T inr3 = inr4 * nr; - T sw, dsw; + const FPTYPE * rr = &row_rij[idy * 3 + 0]; + FPTYPE nr2 = dev_dot(rr, rr); + FPTYPE inr = 1./sqrt(nr2); + FPTYPE nr = nr2 * inr; + FPTYPE inr2 = inr * inr; + FPTYPE inr4 = inr2 * inr2; + FPTYPE inr3 = inr4 * nr; + FPTYPE sw, dsw; spline5_switch(sw, dsw, nr, rmin, rmax); row_descript[idx_value + 0] = (1./nr) ;//* sw; row_descript[idx_value + 1] = (rr[0] / nr2) ;//* sw; @@ -228,9 +228,9 @@ __global__ void compute_descriptor_se_a (T* descript, } } -template +template void format_nbor_list_256 ( - const T* coord, + const FPTYPE* coord, const int* type, const int* jrange, const int* jlist, @@ -262,9 +262,9 @@ void format_nbor_list_256 ( BlockSortKernel <<>> (key, key + nloc * MAGIC_NUMBER); } -template +template void format_nbor_list_512 ( - const T* coord, + const FPTYPE* coord, const int* type, const int* jrange, const int* jlist, @@ -296,9 +296,9 @@ void format_nbor_list_512 ( BlockSortKernel <<>> (key, key + nloc * MAGIC_NUMBER); } -template +template void format_nbor_list_1024 ( - const T* coord, + const FPTYPE* coord, const int* type, const int* jrange, const int* jlist, @@ -330,9 +330,9 @@ void format_nbor_list_1024 ( BlockSortKernel <<>> (key, key + nloc * MAGIC_NUMBER); } -template +template void format_nbor_list_2048 ( - const T* coord, + const FPTYPE* coord, const int* type, const int* jrange, const int* jlist, @@ -364,9 +364,9 @@ void format_nbor_list_2048 ( BlockSortKernel <<>> (key, key + nloc * MAGIC_NUMBER); } -template +template void format_nbor_list_4096 ( - const T* coord, + const FPTYPE* coord, const int* type, const int* jrange, const int* jlist, @@ -398,8 +398,8 @@ void format_nbor_list_4096 ( BlockSortKernel <<>> (key, key + nloc * MAGIC_NUMBER); } -template -void DescrptSeAGPUExecuteFunctor::operator()(const T * coord, const int * type, const int * ilist, const int * jrange, const int * jlist, int * array_int, unsigned long long * array_longlong, const T * avg, const T * std, T * descript, T * descript_deriv, T * rij, int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const float rcut_r, const float rcut_r_smth, const std::vector sec_a, const bool fill_nei_a, const 
int MAGIC_NUMBER) { +template +void DescrptSeAGPUExecuteFunctor::operator()(const FPTYPE * coord, const int * type, const int * ilist, const int * jrange, const int * jlist, int * array_int, unsigned long long * array_longlong, const FPTYPE * avg, const FPTYPE * std, FPTYPE * descript, FPTYPE * descript_deriv, FPTYPE * rij, int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const float rcut_r, const float rcut_r_smth, const std::vector sec_a, const bool fill_nei_a, const int MAGIC_NUMBER) { const int LEN = 256; int nblock = (nloc + LEN -1) / LEN; int * sec_a_dev = array_int; @@ -411,8 +411,8 @@ void DescrptSeAGPUExecuteFunctor::operator()(const T * coord, const int * typ res = cudaMemcpy(sec_a_dev, &sec_a[0], sizeof(int) * sec_a.size(), cudaMemcpyHostToDevice); cudaErrcheck(res); res = cudaMemset(key, 0xffffffff, sizeof(int_64) * nloc * MAGIC_NUMBER); cudaErrcheck(res); res = cudaMemset(nlist, -1, sizeof(int) * nloc * nnei); cudaErrcheck(res); - res = cudaMemset(descript, 0.0, sizeof(T) * nloc * ndescrpt); cudaErrcheck(res); - res = cudaMemset(descript_deriv, 0.0, sizeof(T) * nloc * ndescrpt * 3); cudaErrcheck(res); + res = cudaMemset(descript, 0.0, sizeof(FPTYPE) * nloc * ndescrpt); cudaErrcheck(res); + res = cudaMemset(descript_deriv, 0.0, sizeof(FPTYPE) * nloc * ndescrpt * 3); cudaErrcheck(res); if (fill_nei_a) { // ~~~ diff --git a/source/op/cuda/descrpt_se_r.cu b/source/op/cuda/descrpt_se_r.cu index fa9678be34..a65ba5887a 100644 --- a/source/op/cuda/descrpt_se_r.cu +++ b/source/op/cuda/descrpt_se_r.cu @@ -1,45 +1,7 @@ -/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-==============================================================================*/ -#define EIGEN_USE_GPU -#include -#include -#include #include #include #include -#include -#include - -#define MAGIC_NUMBER 256 - -#ifdef HIGH_PREC - typedef double VALUETYPE; -#else - typedef float VALUETYPE; -#endif - -typedef double compute_t; - -typedef unsigned long long int_64; - -#define cudaErrcheck(res) { cudaAssert((res), __FILE__, __LINE__); } -inline void cudaAssert(cudaError_t code, const char *file, int line, bool abort=true) -{ - if (code != cudaSuccess) - { - fprintf(stderr,"cuda assert: %s %s %d\n", cudaGetErrorString(code), file, line); - if (abort) exit(code); - } -} +#include "DeviceFunctor.h" template < typename Key, @@ -48,7 +10,7 @@ template < __launch_bounds__ (BLOCK_THREADS) __global__ void BlockSortKernel( Key * d_in, - Key * d_out) // Tile of output + Key * d_out) // Tile of output { enum { TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD }; // Specialize BlockLoad type for our thread block (uses warp-striped loads for coalescing, then transposes in shared memory to a blocked arrangement) @@ -75,24 +37,25 @@ __global__ void BlockSortKernel( cub::StoreDirectStriped(threadIdx.x, d_out + block_offset, items); } -template -__device__ inline T dev_dot(T * arr1, T * arr2) { +template +__device__ inline FPTYPE dev_dot(FPTYPE * arr1, FPTYPE * arr2) { return arr1[0] * arr2[0] + arr1[1] * arr2[1] + arr1[2] * arr2[2]; } -__device__ inline void spline5_switch(compute_t & vv, - compute_t & dd, - compute_t & xx, - const compute_t & rmin, - const compute_t & rmax) +template +__device__ inline void spline5_switch(FPTYPE & vv, + FPTYPE & dd, + FPTYPE & xx, + const float & rmin, + const float & rmax) { if (xx < rmin) { dd = 0; vv = 1; } else if (xx < rmax) { - compute_t uu = (xx - rmin) / (rmax - rmin) ; - compute_t du = 1. / (rmax - rmin) ; + FPTYPE uu = (xx - rmin) / (rmax - rmin) ; + FPTYPE du = 1. 
/ (rmax - rmin) ; vv = uu*uu*uu * (-6 * uu*uu + 15 * uu - 10) + 1; dd = ( 3 * uu*uu * (-6 * uu*uu + 15 * uu - 10) + uu*uu*uu * (-12 * uu + 15) ) * du; } @@ -104,7 +67,7 @@ __device__ inline void spline5_switch(compute_t & vv, __global__ void get_i_idx_se_r(const int nloc, const int * ilist, - int * i_idx) + int * i_idx) { const unsigned int idy = blockIdx.x * blockDim.x + threadIdx.x; if(idy >= nloc) { @@ -113,17 +76,19 @@ __global__ void get_i_idx_se_r(const int nloc, i_idx[ilist[idy]] = idy; } -__global__ void format_nlist_fill_a_se_r(const VALUETYPE * coord, +template +__global__ void format_nlist_fill_a_se_r(const FPTYPE * coord, const int * type, const int * jrange, const int * jlist, const float rcut, int_64 * key, - int * i_idx) + int * i_idx, + const int MAGIC_NUMBER) { // <<>> const unsigned int idx = blockIdx.x; - const unsigned int idy = threadIdx.x; + const unsigned int idy = blockIdx.y * blockDim.y + threadIdx.y; const int nsize = jrange[i_idx[idx] + 1] - jrange[i_idx[idx]]; if (idy >= nsize) { @@ -135,12 +100,12 @@ __global__ void format_nlist_fill_a_se_r(const VALUETYPE * coord, int_64 * key_in = key + idx * MAGIC_NUMBER; - compute_t diff[3]; + FPTYPE diff[3]; const int & j_idx = nei_idx[idy]; for (int dd = 0; dd < 3; dd++) { diff[dd] = coord[j_idx * 3 + dd] - coord[idx * 3 + dd]; } - compute_t rr = sqrt(dev_dot(diff, diff)); + FPTYPE rr = sqrt(dev_dot(diff, diff)); if (rr <= rcut) { key_in[idy] = type[j_idx] * 1E15+ (int_64)(rr * 1.0E13) / 100000 * 100000 + j_idx; } @@ -153,9 +118,10 @@ __global__ void format_nlist_fill_b_se_r(int * nlist, const int * jrange, const int * jlist, int_64 * key, - const int * sec, + const int * sec_a, const int sec_a_size, - int * nei_iter_dev) + int * nei_iter_dev, + const int MAGIC_NUMBER) { const unsigned int idy = blockIdx.x * blockDim.x + threadIdx.x; @@ -169,137 +135,329 @@ __global__ void format_nlist_fill_b_se_r(int * nlist, int_64 * key_out = key + nloc * MAGIC_NUMBER + idy * MAGIC_NUMBER; for (int ii = 0; ii < sec_a_size; ii++) { - nei_iter[ii] = sec[ii]; + nei_iter[ii] = sec_a[ii]; } for (unsigned int kk = 0; key_out[kk] != key_out[MAGIC_NUMBER - 1]; kk++) { const int & nei_type = key_out[kk] / 1E15; - if (nei_iter[nei_type] < sec[nei_type + 1]) { + if (nei_iter[nei_type] < sec_a[nei_type + 1]) { row_nlist[nei_iter[nei_type]++] = key_out[kk] % 100000; } } } //it's ok! 
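The spline5_switch device function above (and its CPU counterpart) is the standard quintic smoothstep used to taper the descriptor between rcut_smth and rcut. Written out with u = (x - rmin) / (rmax - rmin), the code computes

    s(u)  = u^3 (-6 u^2 + 15 u - 10) + 1 = 1 - 10 u^3 + 15 u^4 - 6 u^5,
    ds/dx = -30 u^2 (1 - u)^2 / (rmax - rmin),

so the switch falls smoothly from s = 1 at x = rmin to s = 0 at x = rmax with vanishing first and second derivatives at both ends, which keeps the resulting forces continuous across the cutoff.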
-__global__ void compute_descriptor_se_r (VALUETYPE* descript, +template +__global__ void compute_descriptor_se_r (FPTYPE* descript, const int ndescrpt, - VALUETYPE* descript_deriv, + FPTYPE* descript_deriv, const int descript_deriv_size, - VALUETYPE* rij, + FPTYPE* rij, const int rij_size, const int* type, - const VALUETYPE* avg, - const VALUETYPE* std, + const FPTYPE* avg, + const FPTYPE* std, int* nlist, const int nlist_size, - const VALUETYPE* coord, - const VALUETYPE rmin, - const VALUETYPE rmax, - compute_t* sel_diff_dev, - const int sec_size) + const FPTYPE* coord, + const float rmin, + const float rmax, + const int sec_a_size) { - // <<>> - const unsigned int idx = blockIdx.y; - const unsigned int idy = blockIdx.x * blockDim.x + threadIdx.x; - const int idx_deriv = idy * 3; // 1 components time 3 directions - const int idx_value = idy; // 1 components - if (idy >= sec_size) {return;} - + // <<>> + const unsigned int idx = blockIdx.x; + const unsigned int idy = blockIdx.y * blockDim.y + threadIdx.y; + const int idx_deriv = idy * 3; // 4 components time 3 directions + const int idx_value = idy; // 4 components + if (idy >= sec_a_size) {return;} + // else {return;} - VALUETYPE * row_descript = descript + idx * ndescrpt; - VALUETYPE * row_descript_deriv = descript_deriv + idx * descript_deriv_size; - VALUETYPE * row_rij = rij + idx * rij_size; - compute_t * sel_diff = sel_diff_dev + idx * nlist_size * 3; + FPTYPE * row_descript = descript + idx * ndescrpt; + FPTYPE * row_descript_deriv = descript_deriv + idx * descript_deriv_size; + FPTYPE * row_rij = rij + idx * rij_size; int * row_nlist = nlist + idx * nlist_size; - + if (row_nlist[idy] >= 0) { const int & j_idx = row_nlist[idy]; for (int kk = 0; kk < 3; kk++) { - sel_diff[idy * 3 + kk] = coord[j_idx * 3 + kk] - coord[idx * 3 + kk]; - row_rij[idy * 3 + kk] = sel_diff[idy * 3 + kk]; + row_rij[idy * 3 + kk] = coord[j_idx * 3 + kk] - coord[idx * 3 + kk]; } - const compute_t * rr = &sel_diff[idy * 3 + 0]; - compute_t nr2 = dev_dot(rr, rr); - compute_t inr = 1./sqrt(nr2); - compute_t nr = nr2 * inr; - compute_t inr2 = inr * inr; - compute_t inr4 = inr2 * inr2; - compute_t inr3 = inr4 * nr; - compute_t sw, dsw; + const FPTYPE * rr = &row_rij[idy * 3 + 0]; + FPTYPE nr2 = dev_dot(rr, rr); + FPTYPE inr = 1./sqrt(nr2); + FPTYPE nr = nr2 * inr; + FPTYPE inr2 = inr * inr; + FPTYPE inr4 = inr2 * inr2; + FPTYPE inr3 = inr4 * nr; + FPTYPE sw, dsw; spline5_switch(sw, dsw, nr, rmin, rmax); row_descript[idx_value + 0] = (1./nr) ;//* sw; row_descript_deriv[idx_deriv + 0] = (rr[0] * inr3 * sw - row_descript[idx_value + 0] * dsw * rr[0] * inr); // avg[type[(idx_deriv + 0) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 0) % (ndescrpt * 3)) / 3]; row_descript_deriv[idx_deriv + 1] = (rr[1] * inr3 * sw - row_descript[idx_value + 0] * dsw * rr[1] * inr); // avg[type[(idx_deriv + 1) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 1) % (ndescrpt * 3)) / 3]; row_descript_deriv[idx_deriv + 2] = (rr[2] * inr3 * sw - row_descript[idx_value + 0] * dsw * rr[2] * inr); // avg[type[(idx_deriv + 2) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 2) % (ndescrpt * 3)) / 3]; - // 1 value components + // 4 value components row_descript[idx_value + 0] *= sw; // * descript[idx * ndescrpt + idx_value + 0]);// - avg[type[idx] * ndescrpt + idx_value + 0]) / std[type[idx] * ndescrpt + idx_value + 0]; } for (int ii = 0; ii < 1; ii++) { row_descript[idx_value + ii] = (row_descript[idx_value + ii] - avg[type[idx] * ndescrpt + idx_value + ii]) / std[type[idx] * ndescrpt + idx_value + ii]; } 
+ // idy nloc, idx ndescrpt * 3 + // descript_deriv[idy * ndescrpt * 3 + idx] = (descript_deriv_dev[idy * (ndescrpt * 3) + idx]) / std[type[idy] * ndescrpt + idx / 3]; for (int ii = 0; ii < 3; ii++) { row_descript_deriv[idx_deriv + ii] /= std[type[idx] * ndescrpt + (idx_deriv + ii) / 3]; } } -void DescrptSeRLauncher(const VALUETYPE* coord, - const int* type, - const int* ilist, - const int* jrange, - const int* jlist, - int* array_int, - unsigned long long* array_longlong, - compute_t* array_double, - const VALUETYPE* avg, - const VALUETYPE* std, - VALUETYPE* descript, - VALUETYPE* descript_deriv, - VALUETYPE* rij, - int* nlist, - const int& ntypes, - const int& nloc, - const int& nall, - const int& nnei, - const float& rcut, - const float& rcut_smth, - const int& ndescrpt, - const std::vector& sec, - const bool& fill_nei_a -) +template +void format_nbor_list_256 ( + const FPTYPE* coord, + const int* type, + const int* jrange, + const int* jlist, + const int& nloc, + const float& rcut_r, + int * i_idx, + int_64 * key +) +{ + const int LEN = 256; + const int MAGIC_NUMBER = 256; + const int nblock = (MAGIC_NUMBER + LEN - 1) / LEN; + dim3 block_grid(nloc, nblock); + dim3 thread_grid(1, LEN); + format_nlist_fill_a_se_r + <<>> ( + coord, + type, + jrange, + jlist, + rcut_r, + key, + i_idx, + MAGIC_NUMBER + ); + const int ITEMS_PER_THREAD = 4; + const int BLOCK_THREADS = MAGIC_NUMBER / ITEMS_PER_THREAD; + // BlockSortKernel<<>> ( + BlockSortKernel <<>> (key, key + nloc * MAGIC_NUMBER); +} + +template +void format_nbor_list_512 ( + const FPTYPE* coord, + const int* type, + const int* jrange, + const int* jlist, + const int& nloc, + const float& rcut_r, + int * i_idx, + int_64 * key +) +{ + const int LEN = 256; + const int MAGIC_NUMBER = 512; + const int nblock = (MAGIC_NUMBER + LEN - 1) / LEN; + dim3 block_grid(nloc, nblock); + dim3 thread_grid(1, LEN); + format_nlist_fill_a_se_r + <<>> ( + coord, + type, + jrange, + jlist, + rcut_r, + key, + i_idx, + MAGIC_NUMBER + ); + const int ITEMS_PER_THREAD = 4; + const int BLOCK_THREADS = MAGIC_NUMBER / ITEMS_PER_THREAD; + // BlockSortKernel<<>> ( + BlockSortKernel <<>> (key, key + nloc * MAGIC_NUMBER); +} + +template +void format_nbor_list_1024 ( + const FPTYPE* coord, + const int* type, + const int* jrange, + const int* jlist, + const int& nloc, + const float& rcut_r, + int * i_idx, + int_64 * key +) { + const int LEN = 256; + const int MAGIC_NUMBER = 1024; + const int nblock = (MAGIC_NUMBER + LEN - 1) / LEN; + dim3 block_grid(nloc, nblock); + dim3 thread_grid(1, LEN); + format_nlist_fill_a_se_r + <<>> ( + coord, + type, + jrange, + jlist, + rcut_r, + key, + i_idx, + MAGIC_NUMBER + ); + const int ITEMS_PER_THREAD = 8; + const int BLOCK_THREADS = MAGIC_NUMBER / ITEMS_PER_THREAD; + // BlockSortKernel<<>> ( + BlockSortKernel <<>> (key, key + nloc * MAGIC_NUMBER); +} + +template +void format_nbor_list_2048 ( + const FPTYPE* coord, + const int* type, + const int* jrange, + const int* jlist, + const int& nloc, + const float& rcut_r, + int * i_idx, + int_64 * key +) +{ + const int LEN = 256; + const int MAGIC_NUMBER = 2048; + const int nblock = (MAGIC_NUMBER + LEN - 1) / LEN; + dim3 block_grid(nloc, nblock); + dim3 thread_grid(1, LEN); + format_nlist_fill_a_se_r + <<>> ( + coord, + type, + jrange, + jlist, + rcut_r, + key, + i_idx, + MAGIC_NUMBER + ); + const int ITEMS_PER_THREAD = 8; + const int BLOCK_THREADS = MAGIC_NUMBER / ITEMS_PER_THREAD; + // BlockSortKernel<<>> ( + BlockSortKernel <<>> (key, key + nloc * MAGIC_NUMBER); +} + +template +void 
format_nbor_list_4096 ( + const FPTYPE* coord, + const int* type, + const int* jrange, + const int* jlist, + const int& nloc, + const float& rcut_r, + int * i_idx, + int_64 * key +) +{ + const int LEN = 256; + const int MAGIC_NUMBER = 4096; + const int nblock = (MAGIC_NUMBER + LEN - 1) / LEN; + dim3 block_grid(nloc, nblock); + dim3 thread_grid(1, LEN); + format_nlist_fill_a_se_r + <<>> ( + coord, + type, + jrange, + jlist, + rcut_r, + key, + i_idx, + MAGIC_NUMBER + ); + const int ITEMS_PER_THREAD = 16; + const int BLOCK_THREADS = MAGIC_NUMBER / ITEMS_PER_THREAD; + // BlockSortKernel<<>> ( + BlockSortKernel <<>> (key, key + nloc * MAGIC_NUMBER); +} + +template +void DescrptSeRGPUExecuteFunctor::operator()(const FPTYPE * coord, const int * type, const int * ilist, const int * jrange, const int * jlist, int * array_int, unsigned long long * array_longlong, const FPTYPE * avg, const FPTYPE * std, FPTYPE * descript, FPTYPE * descript_deriv, FPTYPE * rij, int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const float rcut_r, const float rcut_r_smth, const std::vector sec_a, const bool fill_nei_a, const int MAGIC_NUMBER) { const int LEN = 256; int nblock = (nloc + LEN -1) / LEN; - int * sec_dev = array_int; - int * nei_iter = array_int + sec.size(); // = new int[sec_a_size]; - int * i_idx = array_int + sec.size() + nloc * sec.size(); + int * sec_a_dev = array_int; + int * nei_iter = array_int + sec_a.size(); // = new int[sec_a_size]; + int * i_idx = array_int + sec_a.size() + nloc * sec_a.size(); int_64 * key = array_longlong; - compute_t * sel_diff = array_double; // = new VALUETYPE *[nlist_size]; nnei cudaError_t res = cudaSuccess; - res = cudaMemcpy(sec_dev, &sec[0], sizeof(int) * sec.size(), cudaMemcpyHostToDevice); cudaErrcheck(res); + res = cudaMemcpy(sec_a_dev, &sec_a[0], sizeof(int) * sec_a.size(), cudaMemcpyHostToDevice); cudaErrcheck(res); res = cudaMemset(key, 0xffffffff, sizeof(int_64) * nloc * MAGIC_NUMBER); cudaErrcheck(res); res = cudaMemset(nlist, -1, sizeof(int) * nloc * nnei); cudaErrcheck(res); - res = cudaMemset(descript, 0.0, sizeof(VALUETYPE) * nloc * ndescrpt); cudaErrcheck(res); - res = cudaMemset(descript_deriv, 0.0, sizeof(VALUETYPE) * nloc * ndescrpt * 3); cudaErrcheck(res); + res = cudaMemset(descript, 0.0, sizeof(FPTYPE) * nloc * ndescrpt); cudaErrcheck(res); + res = cudaMemset(descript_deriv, 0.0, sizeof(FPTYPE) * nloc * ndescrpt * 3); cudaErrcheck(res); if (fill_nei_a) { + // ~~~ // cudaProfilerStart(); get_i_idx_se_r<<>> (nloc, ilist, i_idx); - format_nlist_fill_a_se_r<<>> ( - coord, - type, - jrange, - jlist, - rcut, - key, - i_idx - ); - const int ITEMS_PER_THREAD = 4; - const int BLOCK_THREADS = 64; - BlockSortKernel <<>> (key, key + nloc * MAGIC_NUMBER); + if (nnei <= 256) { + format_nbor_list_256 ( + coord, + type, + jrange, + jlist, + nloc, + rcut_r, + i_idx, + key + ); + } else if (nnei <= 512) { + format_nbor_list_512 ( + coord, + type, + jrange, + jlist, + nloc, + rcut_r, + i_idx, + key + ); + } else if (nnei <= 1024) { + format_nbor_list_1024 ( + coord, + type, + jrange, + jlist, + nloc, + rcut_r, + i_idx, + key + ); + } else if (nnei <= 2048) { + format_nbor_list_2048 ( + coord, + type, + jrange, + jlist, + nloc, + rcut_r, + i_idx, + key + ); + } else if (nnei <= 4096) { + format_nbor_list_4096 ( + coord, + type, + jrange, + jlist, + nloc, + rcut_r, + i_idx, + key + ); + } + format_nlist_fill_b_se_r<<>> ( nlist, nnei, @@ -307,14 +465,17 @@ void DescrptSeRLauncher(const VALUETYPE* coord, jrange, jlist, key, - sec_dev, - 
sec.size(), - nei_iter + sec_a_dev, + sec_a.size(), + nei_iter, + MAGIC_NUMBER ); } - const int nblock_ = (sec.back() + LEN -1) / LEN; - dim3 block_grid(nblock_, nloc); - compute_descriptor_se_r<<>> ( + + const int nblock_ = (sec_a.back() + LEN -1) / LEN; + dim3 block_grid(nloc, nblock_); + dim3 thread_grid(1, LEN); + compute_descriptor_se_r<<>> ( descript, ndescrpt, descript_deriv, @@ -327,9 +488,11 @@ void DescrptSeRLauncher(const VALUETYPE* coord, nlist, nnei, coord, - rcut_smth, - rcut, - sel_diff, - sec.back() + rcut_r_smth, + rcut_r, + sec_a.back() ); -} \ No newline at end of file +} + +template struct DescrptSeRGPUExecuteFunctor; +template struct DescrptSeRGPUExecuteFunctor; \ No newline at end of file diff --git a/source/op/cuda/gelu.cu b/source/op/cuda/gelu.cu index 078a31da33..6329c8f085 100644 --- a/source/op/cuda/gelu.cu +++ b/source/op/cuda/gelu.cu @@ -1,31 +1,31 @@ #include "DeviceFunctor.h" -template -__global__ void gelu(const T * in, T * out, int const size) { +template +__global__ void gelu(const FPTYPE * in, FPTYPE * out, int const size) { int const idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx >= size) {return;} out[idx] = in[idx] * 0.5 * (1.0 + tanh(SQRT_2_PI * (in[idx] + 0.044715 * in[idx] * in[idx] *in[idx]))); } -template -__global__ void gelu_grad(const T * dy, const T * in, T * out, int const size) { +template +__global__ void gelu_grad(const FPTYPE * dy, const FPTYPE * in, FPTYPE * out, int const size) { int const idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx >= size) {return;} // out[idx] = in[idx] * 0.5 * (1.0 + tanh(SQRT_2_PI * (in[idx] + 0.044715 * in[idx] * in[idx] *in[idx]))); - T const var1 = tanh(SQRT_2_PI * (in[idx] + 0.044715 * in[idx] * in[idx] *in[idx])); + FPTYPE const var1 = tanh(SQRT_2_PI * (in[idx] + 0.044715 * in[idx] * in[idx] *in[idx])); out[idx] = dy[idx] * (0.5 * SQRT_2_PI * in[idx] * (1 - var1 * var1) * (0.134145 * in[idx] * in[idx] + 1) + 0.5 * var1 + 0.5); } -template -__global__ void gelu_grad_grad(const T * dy, const T * dy_, const T * in, T * out, int const size) { +template +__global__ void gelu_grad_grad(const FPTYPE * dy, const FPTYPE * dy_, const FPTYPE * in, FPTYPE * out, int const size) { int const idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx >= size) {return;} // out[idx] = in[idx] * 0.5 * (1.0 + tanh(SQRT_2_PI * (in[idx] + 0.044715 * in[idx] * in[idx] *in[idx]))); - T const var1 = tanh(SQRT_2_PI * (in[idx] + 0.044715 * in[idx] * in[idx] *in[idx])); - T const var2 = SQRT_2_PI * (1 - var1 * var1) * (0.134145 * in[idx] * in[idx] + 1); + FPTYPE const var1 = tanh(SQRT_2_PI * (in[idx] + 0.044715 * in[idx] * in[idx] *in[idx])); + FPTYPE const var2 = SQRT_2_PI * (1 - var1 * var1) * (0.134145 * in[idx] * in[idx] + 1); out[idx] = dy[idx] * dy_[idx] * (0.134145 * SQRT_2_PI * in[idx] * in[idx] * (1 - var1 * var1) - SQRT_2_PI * in[idx] * var2 * (0.134145 * in[idx] * in[idx] + 1) * var1 + var2); } @@ -72,24 +72,24 @@ void GeluGradGradLauncher(const double * dy, const double * dy_, const double * gelu_grad_grad<<>>(dy, dy_, in, out, size); } -template -void GeluGPUExecuteFunctor::operator()(const T * in, T * out, int const size) { +template +void GeluGPUExecuteFunctor::operator()(const FPTYPE * in, FPTYPE * out, int const size) { int const THREAD_ITEMS = 1024; int const BLOCK_NUMS = (size + THREAD_ITEMS - 1) / THREAD_ITEMS; gelu<<>>(in, out, size); } -template -void GeluGradGPUExecuteFunctor::operator()(const T * dy, const T * in, T * out, int const size) { +template +void GeluGradGPUExecuteFunctor::operator()(const 
FPTYPE * dy, const FPTYPE * in, FPTYPE * out, int const size) { int const THREAD_ITEMS = 1024; int const BLOCK_NUMS = (size + THREAD_ITEMS - 1) / THREAD_ITEMS; gelu_grad<<>>(dy, in, out, size); } -template -void GeluGradGradGPUExecuteFunctor::operator()(const T * dy, const T * dy_, const T * in, T * out, int const size) { +template +void GeluGradGradGPUExecuteFunctor::operator()(const FPTYPE * dy, const FPTYPE * dy_, const FPTYPE * in, FPTYPE * out, int const size) { int const THREAD_ITEMS = 1024; int const BLOCK_NUMS = (size + THREAD_ITEMS - 1) / THREAD_ITEMS; diff --git a/source/op/cuda/prod_force_se_a.cu b/source/op/cuda/prod_force_se_a.cu index ee826449af..1667c15f90 100644 --- a/source/op/cuda/prod_force_se_a.cu +++ b/source/op/cuda/prod_force_se_a.cu @@ -14,10 +14,10 @@ static __inline__ __device__ double atomicAdd(double* address, double val) { } #endif -template -__global__ void deriv_wrt_center_atom_se_a(T * force, - const T * net_deriv, - const T * in_deriv, +template +__global__ void deriv_wrt_center_atom_se_a(FPTYPE * force, + const FPTYPE * net_deriv, + const FPTYPE * in_deriv, const int ndescrpt) { const unsigned int idx = blockIdx.x; @@ -29,10 +29,10 @@ __global__ void deriv_wrt_center_atom_se_a(T * force, atomicAdd(force + idx * 3 + idz, -1.0 * net_deriv[idx * ndescrpt + idy] * in_deriv[idx * ndescrpt * 3 + idy * 3 + idz]); } -template -__global__ void deriv_wrt_neighbors_se_a(T * force, - const T * net_deriv, - const T * in_deriv, +template +__global__ void deriv_wrt_neighbors_se_a(FPTYPE * force, + const FPTYPE * net_deriv, + const FPTYPE * in_deriv, const int * nlist, const int nloc, const int nnei, @@ -57,10 +57,10 @@ __global__ void deriv_wrt_neighbors_se_a(T * force, atomicAdd(force + j_idx * 3 + idz, net_deriv[idx * ndescrpt + idy * 4 + idw] * in_deriv[idx * ndescrpt * 3 + (idy * 4 + idw) * 3 + idz]); } -template -void ProdForceSeAGPUExecuteFunctor::operator()(T * force, - const T * net_deriv, - const T * in_deriv, +template +void ProdForceSeAGPUExecuteFunctor::operator()(FPTYPE * force, + const FPTYPE * net_deriv, + const FPTYPE * in_deriv, const int * nlist, const int nloc, const int nall, @@ -70,7 +70,7 @@ void ProdForceSeAGPUExecuteFunctor::operator()(T * force, const int n_a_shift) { // std::cout << "I'm here!" 
<< std::endl; - cudaErrcheck(cudaMemset(force, 0.0, sizeof(T) * nall * 3)); + cudaErrcheck(cudaMemset(force, 0.0, sizeof(FPTYPE) * nall * 3)); const int LEN1 = 256; const int nblock1 = (ndescrpt + LEN1 -1) / LEN1; dim3 grid(nloc, nblock1); diff --git a/source/op/cuda/prod_force_se_r.cu b/source/op/cuda/prod_force_se_r.cu index 765842d9c3..5a4b582dd0 100644 --- a/source/op/cuda/prod_force_se_r.cu +++ b/source/op/cuda/prod_force_se_r.cu @@ -1,21 +1,4 @@ -#include -#include - -#ifdef HIGH_PREC - typedef double VALUETYPE; -#else - typedef float VALUETYPE; -#endif - -#define cudaErrcheck(res) { cudaAssert((res), __FILE__, __LINE__); } -inline void cudaAssert(cudaError_t code, const char *file, int line, bool abort=true) -{ - if (code != cudaSuccess) - { - fprintf(stderr,"cuda assert: %s %s %d\n", cudaGetErrorString(code), file, line); - if (abort) exit(code); - } -} +#include "DeviceFunctor.h" #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 600 static __inline__ __device__ double atomicAdd(double* address, double val) { @@ -31,31 +14,29 @@ static __inline__ __device__ double atomicAdd(double* address, double val) { } #endif -__global__ void deriv_wrt_center_atom_se_r(VALUETYPE * force, - const VALUETYPE * net_deriv, - const VALUETYPE * in_deriv, +template +__global__ void deriv_wrt_center_atom_se_r(FPTYPE * force, + const FPTYPE * net_deriv, + const FPTYPE * in_deriv, const int ndescrpt) { - const unsigned int idx = blockIdx.y; - const unsigned int idy = blockIdx.x * blockDim.x + threadIdx.x; - const unsigned int idz = threadIdx.y; - - if (idy >= ndescrpt) { - return; - } + const unsigned int idx = blockIdx.x; + const unsigned int idy = blockIdx.y * blockDim.y + threadIdx.y; + const unsigned int idz = threadIdx.x; + if (idy >= ndescrpt) {return;} + atomicAdd(force + idx * 3 + idz, -1.0 * net_deriv[idx * ndescrpt + idy] * in_deriv[idx * ndescrpt * 3 + idy * 3 + idz]); } -__global__ void deriv_wrt_neighbors_se_r(VALUETYPE * force, - const VALUETYPE * net_deriv, - const VALUETYPE * in_deriv, +template +__global__ void deriv_wrt_neighbors_se_r(FPTYPE * force, + const FPTYPE * net_deriv, + const FPTYPE * in_deriv, const int * nlist, const int nloc, const int nnei, - const int ndescrpt, - const int n_a_sel, - const int n_a_shift) + const int ndescrpt) { // idy -> nnei const unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; @@ -73,27 +54,30 @@ __global__ void deriv_wrt_neighbors_se_r(VALUETYPE * force, atomicAdd(force + j_idx * 3 + idz, net_deriv[idx * ndescrpt + idy] * in_deriv[idx * ndescrpt * 3 + idy * 3 + idz]); } -void ProdForceSeRLauncher(VALUETYPE * force, - const VALUETYPE * net_deriv, - const VALUETYPE * in_deriv, +template +void ProdForceSeRGPUExecuteFunctor::operator()(FPTYPE * force, + const FPTYPE * net_deriv, + const FPTYPE * in_deriv, const int * nlist, const int nloc, const int nall, - const int ndescrpt, const int nnei, - const int n_a_sel, - const int n_a_shift) -{ - cudaErrcheck(cudaMemset(force, 0.0, sizeof(VALUETYPE) * nall * 3)); + const int ndescrpt) +{ + // std::cout << "I'm here!" 
<< std::endl; + cudaErrcheck(cudaMemset(force, 0.0, sizeof(FPTYPE) * nall * 3)); const int LEN1 = 256; const int nblock1 = (ndescrpt + LEN1 -1) / LEN1; - dim3 grid(nblock1, nloc); - dim3 thread(LEN1, 3); + dim3 grid(nloc, nblock1); + dim3 thread(3, LEN1); deriv_wrt_center_atom_se_r<<>>(force, net_deriv, in_deriv, ndescrpt); const int LEN = 64; int nblock = (nloc + LEN -1) / LEN; dim3 block_grid(nblock, nnei); dim3 thread_grid(LEN, 3); - deriv_wrt_neighbors_se_r<<>>(force, net_deriv, in_deriv, nlist, nloc, nnei, ndescrpt, n_a_sel, n_a_shift); -} \ No newline at end of file + deriv_wrt_neighbors_se_r<<>>(force, net_deriv, in_deriv, nlist, nloc, nnei, ndescrpt); +} + +template struct ProdForceSeRGPUExecuteFunctor; +template struct ProdForceSeRGPUExecuteFunctor; \ No newline at end of file diff --git a/source/op/cuda/prod_virial_se_a.cu b/source/op/cuda/prod_virial_se_a.cu index 5825120970..e084720c6d 100644 --- a/source/op/cuda/prod_virial_se_a.cu +++ b/source/op/cuda/prod_virial_se_a.cu @@ -14,12 +14,12 @@ static __inline__ __device__ double atomicAdd(double* address, double val) { } #endif -template -__global__ void deriv_wrt_neighbors_se_a(T * virial, - T * atom_virial, - const T * net_deriv, - const T * in_deriv, - const T * rij, +template +__global__ void deriv_wrt_neighbors_se_a(FPTYPE * virial, + FPTYPE * atom_virial, + const FPTYPE * net_deriv, + const FPTYPE * in_deriv, + const FPTYPE * rij, const int * nlist, const int nloc, const int nnei, @@ -48,12 +48,12 @@ __global__ void deriv_wrt_neighbors_se_a(T * virial, atomicAdd(atom_virial + j_idx * 9 + idz, net_deriv[idx * ndescrpt + idy * 4 + idw] * rij[idx * nnei * 3 + idy * 3 + idz % 3] * in_deriv[idx * ndescrpt * 3 + (idy * 4 + idw) * 3 + idz / 3]); } -template -void ProdVirialSeAGPUExecuteFunctor::operator()(T * virial, - T * atom_virial, - const T * net_deriv, - const T * in_deriv, - const T * rij, +template +void ProdVirialSeAGPUExecuteFunctor::operator()(FPTYPE * virial, + FPTYPE * atom_virial, + const FPTYPE * net_deriv, + const FPTYPE * in_deriv, + const FPTYPE * rij, const int * nlist, const int nloc, const int nall, @@ -62,8 +62,8 @@ void ProdVirialSeAGPUExecuteFunctor::operator()(T * virial, const int n_a_sel, const int n_a_shift) { - cudaErrcheck(cudaMemset(virial, 0.0, sizeof(T) * 9)); - cudaErrcheck(cudaMemset(atom_virial, 0.0, sizeof(T) * 9 * nall)); + cudaErrcheck(cudaMemset(virial, 0.0, sizeof(FPTYPE) * 9)); + cudaErrcheck(cudaMemset(atom_virial, 0.0, sizeof(FPTYPE) * 9 * nall)); const int LEN = 16; int nblock = (nloc + LEN -1) / LEN; diff --git a/source/op/cuda/prod_virial_se_r.cu b/source/op/cuda/prod_virial_se_r.cu index a2c02007fc..9b8f43543f 100644 --- a/source/op/cuda/prod_virial_se_r.cu +++ b/source/op/cuda/prod_virial_se_r.cu @@ -1,24 +1,5 @@ -#include -#include -#include +#include "DeviceFunctor.h" -#define MUL 512 - -#ifdef HIGH_PREC - typedef double VALUETYPE; -#else - typedef float VALUETYPE; -#endif - -#define cudaErrcheck(res) { cudaAssert((res), __FILE__, __LINE__); } -inline void cudaAssert(cudaError_t code, const char *file, int line, bool abort=true) { - if (code != cudaSuccess) { - fprintf(stderr,"cuda assert: %s %s %d\n", cudaGetErrorString(code), file, line); - if (abort) exit(code); - } -} - -// currently, double precision atomicAdd only support arch number larger than 6.0 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 600 static __inline__ __device__ double atomicAdd(double* address, double val) { unsigned long long int* address_as_ull = (unsigned long long int*)address; @@ -33,17 +14,16 @@ static 
__inline__ __device__ double atomicAdd(double* address, double val) { } #endif -__global__ void deriv_wrt_neighbors_se_r(VALUETYPE * virial, - VALUETYPE * atom_virial, - const VALUETYPE * net_deriv, - const VALUETYPE * in_deriv, - const VALUETYPE * rij, +template +__global__ void deriv_wrt_neighbors_se_r(FPTYPE * virial, + FPTYPE * atom_virial, + const FPTYPE * net_deriv, + const FPTYPE * in_deriv, + const FPTYPE * rij, const int * nlist, const int nloc, const int nnei, - const int ndescrpt, - const int n_a_sel, - const int n_a_shift) + const int ndescrpt) { // idx -> nloc // idy -> nnei @@ -61,26 +41,26 @@ __global__ void deriv_wrt_neighbors_se_r(VALUETYPE * virial, if (j_idx < 0) { return; } + // atomicAdd(virial + idz, net_deriv[idx * ndescrpt + idy * 4 + idw] * rij[idx * nnei * 3 + idy * 3 + idz / 3] * in_deriv[idx * ndescrpt * 3 + (idy * 4 + idw) * 3 + idz % 3]); atomicAdd(atom_virial + j_idx * 9 + idz, net_deriv[idx * ndescrpt + idy] * rij[idx * nnei * 3 + idy * 3 + idz % 3] * in_deriv[idx * ndescrpt * 3 + idy * 3 + idz / 3]); } -void ProdVirialSeRLauncher(VALUETYPE * virial, - VALUETYPE * atom_virial, - const VALUETYPE * net_deriv, - const VALUETYPE * in_deriv, - const VALUETYPE * rij, +template +void ProdVirialSeRGPUExecuteFunctor::operator()(FPTYPE * virial, + FPTYPE * atom_virial, + const FPTYPE * net_deriv, + const FPTYPE * in_deriv, + const FPTYPE * rij, const int * nlist, const int nloc, const int nall, const int nnei, - const int ndescrpt, - const int n_a_sel, - const int n_a_shift) + const int ndescrpt) { - cudaErrcheck(cudaMemset(virial, 0.0, sizeof(VALUETYPE) * 9)); - cudaErrcheck(cudaMemset(atom_virial, 0.0, sizeof(VALUETYPE) * 9 * nall)); + cudaErrcheck(cudaMemset(virial, 0.0, sizeof(FPTYPE) * 9)); + cudaErrcheck(cudaMemset(atom_virial, 0.0, sizeof(FPTYPE) * 9 * nall)); - const int LEN = 16; + const int LEN = 64; int nblock = (nloc + LEN -1) / LEN; dim3 block_grid(nblock, nnei); dim3 thread_grid(LEN, 9); @@ -94,8 +74,9 @@ void ProdVirialSeRLauncher(VALUETYPE * virial, nlist, nloc, nnei, - ndescrpt, - n_a_sel, - n_a_shift + ndescrpt ); -} \ No newline at end of file +} + +template struct ProdVirialSeRGPUExecuteFunctor; +template struct ProdVirialSeRGPUExecuteFunctor; \ No newline at end of file diff --git a/source/op/descrpt.cc b/source/op/descrpt.cc index c59ba817b1..147ca687cf 100644 --- a/source/op/descrpt.cc +++ b/source/op/descrpt.cc @@ -12,12 +12,6 @@ typedef double compute_t; using namespace tensorflow; using namespace std; -#ifdef HIGH_PREC -typedef double VALUETYPE ; -#else -typedef float VALUETYPE ; -#endif - using CPUDevice = Eigen::ThreadPoolDevice; REGISTER_OP("Descrpt") @@ -41,7 +35,7 @@ REGISTER_OP("Descrpt") .Output("axis: int32") .Output("rot_mat: T"); -template +template class DescrptOp : public OpKernel { public: explicit DescrptOp(OpKernelConstruction* context) : OpKernel(context) { @@ -164,18 +158,18 @@ class DescrptOp : public OpKernel { Tensor* rot_mat_tensor = NULL; OP_REQUIRES_OK(context, context->allocate_output(5, rot_mat_shape, &rot_mat_tensor)); - auto coord = coord_tensor .matrix(); + auto coord = coord_tensor .matrix(); auto type = type_tensor .matrix(); - auto box = box_tensor .matrix(); + auto box = box_tensor .matrix(); auto mesh = mesh_tensor .flat(); - auto avg = avg_tensor .matrix(); - auto std = std_tensor .matrix(); - auto descrpt = descrpt_tensor ->matrix(); - auto descrpt_deriv = descrpt_deriv_tensor ->matrix(); - auto rij = rij_tensor ->matrix(); + auto avg = avg_tensor .matrix(); + auto std = std_tensor .matrix(); + auto descrpt 
= descrpt_tensor ->matrix(); + auto descrpt_deriv = descrpt_deriv_tensor ->matrix(); + auto rij = rij_tensor ->matrix(); auto nlist = nlist_tensor ->matrix(); auto axis = axis_tensor ->matrix(); - auto rot_mat = rot_mat_tensor ->matrix(); + auto rot_mat = rot_mat_tensor ->matrix(); // // check the types // int max_type_v = 0; @@ -610,5 +604,6 @@ class DescrptOp : public OpKernel { REGISTER_KERNEL_BUILDER( \ Name("Descrpt").Device(DEVICE_CPU).TypeConstraint("T"), \ DescrptOp); -REGISTER_CPU(VALUETYPE); +REGISTER_CPU(float); +REGISTER_CPU(double); diff --git a/source/op/descrpt_se_a.cc b/source/op/descrpt_se_a.cc index 3dca8040aa..88f866944d 100644 --- a/source/op/descrpt_se_a.cc +++ b/source/op/descrpt_se_a.cc @@ -12,12 +12,6 @@ typedef double compute_t; using namespace tensorflow; using namespace std; -#ifdef HIGH_PREC -typedef double VALUETYPE ; -#else -typedef float VALUETYPE ; -#endif - using CPUDevice = Eigen::ThreadPoolDevice; using GPUDevice = Eigen::GpuDevice; @@ -40,7 +34,7 @@ REGISTER_OP("DescrptSeA") .Output("rij: T") .Output("nlist: int32"); -template +template class DescrptSeAOp : public OpKernel { public: explicit DescrptSeAOp(OpKernelConstruction* context) : OpKernel(context) { @@ -165,15 +159,15 @@ class DescrptSeAOp : public OpKernel { nlist_shape, &nlist_tensor)); - auto coord = coord_tensor .matrix(); + auto coord = coord_tensor .matrix(); auto type = type_tensor .matrix(); - auto box = box_tensor .matrix(); + auto box = box_tensor .matrix(); auto mesh = mesh_tensor .flat(); - auto avg = avg_tensor .matrix(); - auto std = std_tensor .matrix(); - auto descrpt = descrpt_tensor ->matrix(); - auto descrpt_deriv = descrpt_deriv_tensor ->matrix(); - auto rij = rij_tensor ->matrix(); + auto avg = avg_tensor .matrix(); + auto std = std_tensor .matrix(); + auto descrpt = descrpt_tensor ->matrix(); + auto descrpt_deriv = descrpt_deriv_tensor ->matrix(); + auto rij = rij_tensor ->matrix(); auto nlist = nlist_tensor ->matrix(); // // check the types @@ -358,5 +352,6 @@ class DescrptSeAOp : public OpKernel { REGISTER_KERNEL_BUILDER( \ Name("DescrptSeA").Device(DEVICE_CPU).TypeConstraint("T"), \ DescrptSeAOp); -REGISTER_CPU(VALUETYPE); +REGISTER_CPU(float); +REGISTER_CPU(double); diff --git a/source/op/descrpt_se_a_multi_device.cc b/source/op/descrpt_se_a_multi_device.cc index 0069bb5375..ae5e623171 100644 --- a/source/op/descrpt_se_a_multi_device.cc +++ b/source/op/descrpt_se_a_multi_device.cc @@ -21,24 +21,6 @@ REGISTER_OP("DescrptSeA") .Output("nlist: int32"); // only sel_a and rcut_r uesd. 
-int get_magic_number(int const nnei) { - if (nnei <= 256) { - return 256; - } - else if (nnei <= 512) { - return 512; - } - else if (nnei <= 1024) { - return 1024; - } - else if (nnei <= 2048) { - return 2048; - } - else if (nnei <= 4096) { - return 4096; - } -} - struct DeviceFunctor { void operator()(const CPUDevice& d, std::string& device) { device = "CPU"; @@ -50,20 +32,20 @@ struct DeviceFunctor { #endif // GOOGLE_CUDA }; -template +template struct DescrptSeAFunctor { - void operator()(const CPUDevice& d, const T * coord, const int * type, const int * mesh, const int * ilist, const int * jrange, const int * jlist, int * array_int, unsigned long long * array_longlong, const T * avg, const T * std, T * descrpt, T * descrpt_deriv, T * rij, int * nlist, const int nloc, const int nall, const int nnei, const int ntypes, const int ndescrpt, const float rcut_r, const float rcut_r_smth, const std::vector sec_a, const bool fill_nei_a, const int magic_number) { + void operator()(const CPUDevice& d, const FPTYPE * coord, const int * type, const int * mesh, const int * ilist, const int * jrange, const int * jlist, int * array_int, unsigned long long * array_longlong, const FPTYPE * avg, const FPTYPE * std, FPTYPE * descrpt, FPTYPE * descrpt_deriv, FPTYPE * rij, int * nlist, const int nloc, const int nall, const int nnei, const int ntypes, const int ndescrpt, const float rcut_r, const float rcut_r_smth, const std::vector sec_a, const bool fill_nei_a, const int magic_number) { DescrptSeACPULauncher(coord, type, ilist, jrange, jlist, avg, std, descrpt, descrpt_deriv, rij, nlist, nloc, nall, nnei, ntypes, ndescrpt, rcut_r, rcut_r_smth, sec_a, fill_nei_a, magic_number); } #if GOOGLE_CUDA - void operator()(const GPUDevice& d, const T * coord, const int * type, const int * mesh, const int * ilist, const int * jrange, const int * jlist, int * array_int, unsigned long long * array_longlong, const T * avg, const T * std, T * descrpt, T * descrpt_deriv, T * rij, int * nlist, const int nloc, const int nall, const int nnei, const int ntypes, const int ndescrpt, const float rcut_r, const float rcut_r_smth, const std::vector sec_a, const bool fill_nei_a, const int magic_number) { + void operator()(const GPUDevice& d, const FPTYPE * coord, const int * type, const int * mesh, const int * ilist, const int * jrange, const int * jlist, int * array_int, unsigned long long * array_longlong, const FPTYPE * avg, const FPTYPE * std, FPTYPE * descrpt, FPTYPE * descrpt_deriv, FPTYPE * rij, int * nlist, const int nloc, const int nall, const int nnei, const int ntypes, const int ndescrpt, const float rcut_r, const float rcut_r_smth, const std::vector sec_a, const bool fill_nei_a, const int magic_number) { DescrptSeAGPULauncher(coord, type, ilist, jrange, jlist, array_int, array_longlong, avg, std, descrpt, descrpt_deriv, rij, nlist, nloc, nall, nnei, ndescrpt, rcut_r, rcut_r_smth, sec_a, fill_nei_a, magic_number); } #endif // GOOGLE_CUDA }; -template +template class DescrptSeAOp : public OpKernel { public: explicit DescrptSeAOp(OpKernelConstruction* context) : OpKernel(context) { @@ -191,9 +173,9 @@ class DescrptSeAOp : public OpKernel { memcpy (&jlist, 12 + mesh_tensor.flat().data(), sizeof(int *)); } - DescrptSeAFunctor()( + DescrptSeAFunctor()( context->eigen_device(), // define actually graph execution device - coord_tensor.matrix().data(), // related to the kk argument + coord_tensor.matrix().data(), // related to the kk argument type_tensor.matrix().data(), // also related to the kk argument mesh_tensor.flat().data(), 
ilist, @@ -201,11 +183,11 @@ class DescrptSeAOp : public OpKernel { jlist, array_int, array_longlong, - avg_tensor.matrix().data(), - std_tensor.matrix().data(), - descrpt_tensor->matrix().data(), - descrpt_deriv_tensor->matrix().data(), - rij_tensor->matrix().data(), + avg_tensor.matrix().data(), + std_tensor.matrix().data(), + descrpt_tensor->matrix().data(), + descrpt_deriv_tensor->matrix().data(), + rij_tensor->matrix().data(), nlist_tensor->matrix().data(), nloc, nall, @@ -287,6 +269,24 @@ class DescrptSeAOp : public OpKernel { } delete [] mesh_host; } + + int get_magic_number(int const nnei) { + if (nnei <= 256) { + return 256; + } + else if (nnei <= 512) { + return 512; + } + else if (nnei <= 1024) { + return 1024; + } + else if (nnei <= 2048) { + return 2048; + } + else if (nnei <= 4096) { + return 4096; + } + } }; // Register the CPU kernels. diff --git a/source/op/descrpt_se_r.cc b/source/op/descrpt_se_r.cc index 7a8dbb1541..a4bfe341ac 100644 --- a/source/op/descrpt_se_r.cc +++ b/source/op/descrpt_se_r.cc @@ -12,12 +12,6 @@ typedef double compute_t; using namespace tensorflow; using namespace std; -#ifdef HIGH_PREC -typedef double VALUETYPE ; -#else -typedef float VALUETYPE ; -#endif - using CPUDevice = Eigen::ThreadPoolDevice; REGISTER_OP("DescrptSeR") @@ -37,7 +31,7 @@ REGISTER_OP("DescrptSeR") .Output("rij: T") .Output("nlist: int32"); -template +template class DescrptSeROp : public OpKernel { public: explicit DescrptSeROp(OpKernelConstruction* context) : OpKernel(context) { @@ -156,15 +150,15 @@ class DescrptSeROp : public OpKernel { nlist_shape, &nlist_tensor)); - auto coord = coord_tensor .matrix(); + auto coord = coord_tensor .matrix(); auto type = type_tensor .matrix(); - auto box = box_tensor .matrix(); + auto box = box_tensor .matrix(); auto mesh = mesh_tensor .flat(); - auto avg = avg_tensor .matrix(); - auto std = std_tensor .matrix(); - auto descrpt = descrpt_tensor ->matrix(); - auto descrpt_deriv = descrpt_deriv_tensor ->matrix(); - auto rij = rij_tensor ->matrix(); + auto avg = avg_tensor .matrix(); + auto std = std_tensor .matrix(); + auto descrpt = descrpt_tensor ->matrix(); + auto descrpt_deriv = descrpt_deriv_tensor ->matrix(); + auto rij = rij_tensor ->matrix(); auto nlist = nlist_tensor ->matrix(); OP_REQUIRES (context, (ntypes == int(sel.size())), errors::InvalidArgument ("number of types should match the length of sel array")); @@ -342,5 +336,6 @@ class DescrptSeROp : public OpKernel { REGISTER_KERNEL_BUILDER( \ Name("DescrptSeR").Device(DEVICE_CPU).TypeConstraint("T"), \ DescrptSeROp); -REGISTER_CPU(VALUETYPE); +REGISTER_CPU(float); +REGISTER_CPU(double); diff --git a/source/op/descrpt_se_r_gpu.cc b/source/op/descrpt_se_r_gpu.cc deleted file mode 100644 index bf2f5b0b12..0000000000 --- a/source/op/descrpt_se_r_gpu.cc +++ /dev/null @@ -1,236 +0,0 @@ -#include -#include -#include -#include -#include "tensorflow/core/framework/op.h" -#include "tensorflow/core/framework/op_kernel.h" -#include "tensorflow/core/framework/shape_inference.h" - -using namespace tensorflow; // NOLINT(build/namespaces) -#define MAGIC_NUMBER 256 - -#ifdef HIGH_PREC - typedef double VALUETYPE ; -#else - typedef float VALUETYPE ; -#endif - -typedef double compute_t; - -#define cudaErrcheck(res) { cudaAssert((res), __FILE__, __LINE__); } -inline void cudaAssert(cudaError_t code, const char *file, int line, bool abort=true) -{ - if (code != cudaSuccess) - { - fprintf(stderr,"cuda assert: %s %s %d\n", cudaGetErrorString(code), file, line); - if (abort) exit(code); - } -} - -// sec_a 
kao,sec_r, - -using GPUDevice = Eigen::GpuDevice; - -REGISTER_OP("DescrptSeR") - .Attr("T: {float, double}") - .Input("coord: double") - .Input("type: int32") - .Input("natoms: int32") - .Input("box: double") - .Input("mesh: int32") - .Input("davg: double") - .Input("dstd: double") - .Attr("rcut: float") - .Attr("rcut_smth: float") - .Attr("sel: list(int)") - .Output("descrpt: double") - .Output("descrpt_deriv: double") - .Output("rij: double") - .Output("nlist: int32"); - - -void DescrptSeRLauncher(const VALUETYPE* coord, - const int* type, - const int* ilist, - const int* jrange, - const int* jlist, - int* array_int, - unsigned long long* array_longlong, - compute_t* array_double, - const VALUETYPE* avg, - const VALUETYPE* std, - VALUETYPE* descript, - VALUETYPE* descript_deriv, - VALUETYPE* rij, - int* nlist, - const int& ntypes, - const int& nloc, - const int& nall, - const int& nnei, - const float& rcut, - const float& rcut_smth, - const int& ndescrpt, - const std::vector& sec, - const bool& fill_nei_a -); - -template -class DescrptSeROp : public OpKernel { -public: - explicit DescrptSeROp(OpKernelConstruction* context) : OpKernel(context) { - float nloc_f, nall_f; - OP_REQUIRES_OK(context, context->GetAttr("rcut", &rcut)); - OP_REQUIRES_OK(context, context->GetAttr("rcut_smth", &rcut_smth)); - OP_REQUIRES_OK(context, context->GetAttr("sel", &sel)); - cum_sum (sec, sel); - sel_null.resize(3, 0); - cum_sum (sec_null, sel_null); - ndescrpt = sec.back() * 1; - nnei = sec.back(); - fill_nei_a = true; - // count_nei_idx_overflow = 0; - // std::cout << "I'm in descrpt_se_r_gpu.cc" << std::endl; - } - - void Compute(OpKernelContext* context) override { - // Grab the input tensor - int context_input_index = 0; - const Tensor& coord_tensor = context->input(context_input_index++); - const Tensor& type_tensor = context->input(context_input_index++); - const Tensor& natoms_tensor = context->input(context_input_index++); - const Tensor& box_tensor = context->input(context_input_index++); - const Tensor& mesh_tensor = context->input(context_input_index++); - const Tensor& avg_tensor = context->input(context_input_index++); - const Tensor& std_tensor = context->input(context_input_index++); - // set size of the sample. 
assume 't' is [[[1, 1, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]], then shape(t) ==> [2, 2, 3] - OP_REQUIRES (context, (coord_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of coord should be 2")); - OP_REQUIRES (context, (type_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of type should be 2")); - OP_REQUIRES (context, (natoms_tensor.shape().dims() == 1), errors::InvalidArgument ("Dim of natoms should be 1")); - OP_REQUIRES (context, (box_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of box should be 2")); - OP_REQUIRES (context, (mesh_tensor.shape().dims() == 1), errors::InvalidArgument ("Dim of mesh should be 1")); - OP_REQUIRES (context, (avg_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of avg should be 2")); - OP_REQUIRES (context, (std_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of std should be 2")); - OP_REQUIRES (context, (fill_nei_a), errors::InvalidArgument ("Rotational free descriptor only support the case rcut_a < 0")); - - OP_REQUIRES (context, (natoms_tensor.shape().dim_size(0) >= 3), errors::InvalidArgument ("number of atoms should be larger than (or equal to) 3")); - - int * natoms = new int[natoms_tensor.shape().dim_size(0)]; - cudaErrcheck(cudaMemcpy(natoms, natoms_tensor.flat().data(), sizeof(int) * natoms_tensor.shape().dim_size(0), cudaMemcpyDeviceToHost)); - int nloc = natoms[0]; - int nall = natoms[1]; - int ntypes = natoms_tensor.shape().dim_size(0) - 2; //nloc and nall mean something. - int nsamples = coord_tensor.shape().dim_size(0); - // - //// check the sizes - OP_REQUIRES (context, (nsamples == type_tensor.shape().dim_size(0)), errors::InvalidArgument ("number of samples should match")); - OP_REQUIRES (context, (nsamples == box_tensor.shape().dim_size(0)), errors::InvalidArgument ("number of samples should match")); - OP_REQUIRES (context, (ntypes == int(sel.size())), errors::InvalidArgument ("number of types should match the length of sel array")); - OP_REQUIRES (context, (ntypes == avg_tensor.shape().dim_size(0)), errors::InvalidArgument ("number of avg should be ntype")); - OP_REQUIRES (context, (ntypes == std_tensor.shape().dim_size(0)), errors::InvalidArgument ("number of std should be ntype")); - - OP_REQUIRES (context, (nall * 3 == coord_tensor.shape().dim_size(1)), errors::InvalidArgument ("number of atoms should match")); - OP_REQUIRES (context, (nall == type_tensor.shape().dim_size(1)), errors::InvalidArgument ("number of atoms should match")); - OP_REQUIRES (context, (9 == box_tensor.shape().dim_size(1)), errors::InvalidArgument ("number of box should be 9")); - OP_REQUIRES (context, (ndescrpt == avg_tensor.shape().dim_size(1)), errors::InvalidArgument ("number of avg should be ndescrpt")); - OP_REQUIRES (context, (ndescrpt == std_tensor.shape().dim_size(1)), errors::InvalidArgument ("number of std should be ndescrpt")); - - // Create output tensors - TensorShape descrpt_shape ; - descrpt_shape.AddDim (nsamples); - descrpt_shape.AddDim (nloc * ndescrpt); - TensorShape descrpt_deriv_shape ; - descrpt_deriv_shape.AddDim (nsamples); - descrpt_deriv_shape.AddDim (nloc * ndescrpt * 3); - TensorShape rij_shape ; - rij_shape.AddDim (nsamples); - rij_shape.AddDim (nloc * nnei * 3); - TensorShape nlist_shape ; - nlist_shape.AddDim (nsamples); - nlist_shape.AddDim (nloc * nnei); - - int context_output_index = 0; - Tensor* descrpt_tensor = NULL; - OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, - descrpt_shape, - &descrpt_tensor)); - Tensor* descrpt_deriv_tensor = NULL; - 
OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, - descrpt_deriv_shape, - &descrpt_deriv_tensor)); - Tensor* rij_tensor = NULL; - OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, - rij_shape, - &rij_tensor)); - Tensor* nlist_tensor = NULL; - OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, - nlist_shape, - &nlist_tensor)); - - int * ilist = NULL, *jrange = NULL, *jlist = NULL; - int *array_int = NULL; unsigned long long *array_longlong = NULL; compute_t *array_double = NULL; - cudaErrcheck(cudaMemcpy(&(ilist), 4 + mesh_tensor.flat().data(), sizeof(int *), cudaMemcpyDeviceToHost)); - cudaErrcheck(cudaMemcpy(&(jrange), 8 + mesh_tensor.flat().data(), sizeof(int *), cudaMemcpyDeviceToHost)); - cudaErrcheck(cudaMemcpy(&(jlist), 12 + mesh_tensor.flat().data(), sizeof(int *), cudaMemcpyDeviceToHost)); - cudaErrcheck(cudaMemcpy(&(array_int), 16 + mesh_tensor.flat().data(), sizeof(int *), cudaMemcpyDeviceToHost)); - cudaErrcheck(cudaMemcpy(&(array_longlong), 20 + mesh_tensor.flat().data(), sizeof(unsigned long long *), cudaMemcpyDeviceToHost)); - cudaErrcheck(cudaMemcpy(&(array_double), 24 + mesh_tensor.flat().data(), sizeof(compute_t *), cudaMemcpyDeviceToHost)); - - // cudaErrcheck(cudaMemcpy(jlist, host_jlist, sizeof(int) * nloc * MAGIC_NUMBER, cudaMemcpyHostToDevice)); - // Launch computation - for (int II = 0; II < nsamples; II++) { - DescrptSeRLauncher( coord_tensor.matrix().data() + II * (nall * 3), - type_tensor.matrix().data() + II * nall, - ilist, - jrange, - jlist, - array_int, - array_longlong, - array_double, - avg_tensor.matrix().data(), - std_tensor.matrix().data(), - descrpt_tensor->matrix().data() + II * (nloc * ndescrpt), - descrpt_deriv_tensor->matrix().data() + II * (nloc * ndescrpt * 3), - rij_tensor->matrix().data() + II * (nloc * nnei * 3), - nlist_tensor->matrix().data() + II * (nloc * nnei), - ntypes, - nloc, - nall, - nnei, - rcut, - rcut_smth, - ndescrpt, - sec, - fill_nei_a - ); - } - // std::cout << "done" << std::endl; - delete[] natoms; - } - -///////////////////////////////////////////////////////////////////////////////////////////// - -private: - float rcut; - float rcut_smth; - std::vector sel; - std::vector sel_null; - std::vector sec; - std::vector sec_null; - int nnei, ndescrpt, nloc, nall; - bool fill_nei_a; - - //private func - void cum_sum (std::vector & sec, const std::vector & n_sel) const { - sec.resize (n_sel.size() + 1); - sec[0] = 0; - for (int ii = 1; ii < sec.size(); ++ii) { - sec[ii] = sec[ii-1] + n_sel[ii-1]; - } - } -}; - -#define REGISTER_GPU(T) \ -REGISTER_KERNEL_BUILDER( \ - Name("DescrptSeR").Device(DEVICE_GPU).TypeConstraint("T"), \ - DescrptSeROp); -REGISTER_GPU(VALUETYPE); \ No newline at end of file diff --git a/source/op/descrpt_se_r_multi_device.cc b/source/op/descrpt_se_r_multi_device.cc new file mode 100644 index 0000000000..c5eaff616c --- /dev/null +++ b/source/op/descrpt_se_r_multi_device.cc @@ -0,0 +1,297 @@ +#include "common.h" +#include "CustomeOperation.h" + +REGISTER_OP("DescrptSeR") + .Attr("T: {float, double}") + .Input("coord: T") + .Input("type: int32") + .Input("natoms: int32") + .Input("box: T") + .Input("mesh: int32") + .Input("davg: T") + .Input("dstd: T") + .Attr("rcut: float") + .Attr("rcut_smth: float") + .Attr("sel: list(int)") + .Output("descrpt: T") + .Output("descrpt_deriv: T") + .Output("rij: T") + .Output("nlist: int32"); + +struct DeviceFunctor { + void operator()(const CPUDevice& d, std::string& device) { + device = "CPU"; + } + #if 
GOOGLE_CUDA + void operator()(const GPUDevice& d, std::string& device) { + device = "GPU"; + } + #endif // GOOGLE_CUDA +}; + +template +struct DescrptSeRFunctor { + void operator()(const CPUDevice& d, const T * coord, const int * type, const int * mesh, const int * ilist, const int * jrange, const int * jlist, int * array_int, unsigned long long * array_longlong, const T * avg, const T * std, T * descrpt, T * descrpt_deriv, T * rij, int * nlist, const int nloc, const int nall, const int nnei, const int ntypes, const int ndescrpt, const float rcut_r, const float rcut_r_smth, const std::vector sec_a, const bool fill_nei_a, const int magic_number) { + DescrptSeRCPULauncher(coord, type, ilist, jrange, jlist, avg, std, descrpt, descrpt_deriv, rij, nlist, nloc, nall, nnei, ntypes, ndescrpt, rcut_r, rcut_r_smth, sec_a, fill_nei_a, magic_number); + } + + #if GOOGLE_CUDA + void operator()(const GPUDevice& d, const T * coord, const int * type, const int * mesh, const int * ilist, const int * jrange, const int * jlist, int * array_int, unsigned long long * array_longlong, const T * avg, const T * std, T * descrpt, T * descrpt_deriv, T * rij, int * nlist, const int nloc, const int nall, const int nnei, const int ntypes, const int ndescrpt, const float rcut_r, const float rcut_r_smth, const std::vector sec_a, const bool fill_nei_a, const int magic_number) { + DescrptSeRGPULauncher(coord, type, ilist, jrange, jlist, array_int, array_longlong, avg, std, descrpt, descrpt_deriv, rij, nlist, nloc, nall, nnei, ndescrpt, rcut_r, rcut_r_smth, sec_a, fill_nei_a, magic_number); + } + #endif // GOOGLE_CUDA +}; + +template +class DescrptSeROp : public OpKernel { +public: + explicit DescrptSeROp(OpKernelConstruction* context) : OpKernel(context) { + OP_REQUIRES_OK(context, context->GetAttr("rcut", &rcut)); + OP_REQUIRES_OK(context, context->GetAttr("rcut_smth", &rcut_smth)); + OP_REQUIRES_OK(context, context->GetAttr("sel", &sel)); + cum_sum (sec, sel); + sel_null.resize(3, 0); + cum_sum (sec_null, sel_null); + ndescrpt = sec.back() * 1; + nnei = sec.back(); + fill_nei_a = true; + magic_number = get_magic_number(nnei); + // count_nei_idx_overflow = 0; + // std::cout << "I'm in descrpt_se_r_gpu.cc" << std::endl; + } + + void Compute(OpKernelContext* context) override { + // Grab the input tensor + int context_input_index = 0; + const Tensor& coord_tensor = context->input(context_input_index++); + const Tensor& type_tensor = context->input(context_input_index++); + const Tensor& natoms_tensor = context->input(context_input_index++); + const Tensor& box_tensor = context->input(context_input_index++); + const Tensor& mesh_tensor = context->input(context_input_index++); + const Tensor& avg_tensor = context->input(context_input_index++); + const Tensor& std_tensor = context->input(context_input_index++); + + // set size of the sample + OP_REQUIRES (context, (coord_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of coord should be 2")); + OP_REQUIRES (context, (type_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of type should be 2")); + OP_REQUIRES (context, (natoms_tensor.shape().dims() == 1), errors::InvalidArgument ("Dim of natoms should be 1")); + OP_REQUIRES (context, (box_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of box should be 2")); + OP_REQUIRES (context, (mesh_tensor.shape().dims() == 1), errors::InvalidArgument ("Dim of mesh should be 1")); + OP_REQUIRES (context, (avg_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of avg should be 2")); + OP_REQUIRES 
(context, (std_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of std should be 2")); + OP_REQUIRES (context, (fill_nei_a), errors::InvalidArgument ("Rotational free descriptor only support the case rcut_a < 0")); + + OP_REQUIRES (context, (natoms_tensor.shape().dim_size(0) >= 3), errors::InvalidArgument ("number of atoms should be larger than (or equal to) 3")); + + DeviceFunctor() ( + context->eigen_device(), + device + ); + + const int * natoms = natoms_tensor.flat().data(); + int nloc = natoms[0]; + int nall = natoms[1]; + int ntypes = natoms_tensor.shape().dim_size(0) - 2; //nloc and nall mean something. + int nsamples = coord_tensor.shape().dim_size(0); + // + //// check the sizes + // check the sizes + OP_REQUIRES (context, (nsamples == type_tensor.shape().dim_size(0)), errors::InvalidArgument ("number of samples should match")); + OP_REQUIRES (context, (nsamples == box_tensor.shape().dim_size(0)), errors::InvalidArgument ("number of samples should match")); + OP_REQUIRES (context, (ntypes == avg_tensor.shape().dim_size(0)), errors::InvalidArgument ("number of avg should be ntype")); + OP_REQUIRES (context, (ntypes == std_tensor.shape().dim_size(0)), errors::InvalidArgument ("number of std should be ntype")); + + OP_REQUIRES (context, (nall * 3 == coord_tensor.shape().dim_size(1)), errors::InvalidArgument ("number of atoms should match")); + OP_REQUIRES (context, (nall == type_tensor.shape().dim_size(1)), errors::InvalidArgument ("number of atoms should match")); + OP_REQUIRES (context, (9 == box_tensor.shape().dim_size(1)), errors::InvalidArgument ("number of box should be 9")); + OP_REQUIRES (context, (ndescrpt == avg_tensor.shape().dim_size(1)), errors::InvalidArgument ("number of avg should be ndescrpt")); + OP_REQUIRES (context, (ndescrpt == std_tensor.shape().dim_size(1)), errors::InvalidArgument ("number of std should be ndescrpt")); + + OP_REQUIRES (context, (nnei <= 4096), errors::InvalidArgument ("Assert failed, max neighbor size of atom(nnei) " + std::to_string(nnei) + " is larger than 4096, which currently is not supported by deepmd-kit.")); + + // Create an output tensor + TensorShape descrpt_shape ; + descrpt_shape.AddDim (nsamples); + descrpt_shape.AddDim (nloc * ndescrpt); + TensorShape descrpt_deriv_shape ; + descrpt_deriv_shape.AddDim (nsamples); + descrpt_deriv_shape.AddDim (nloc * ndescrpt * 3); + TensorShape rij_shape ; + rij_shape.AddDim (nsamples); + rij_shape.AddDim (nloc * nnei * 3); + TensorShape nlist_shape ; + nlist_shape.AddDim (nsamples); + nlist_shape.AddDim (nloc * nnei); + + int context_output_index = 0; + Tensor* descrpt_tensor = NULL; + OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, + descrpt_shape, + &descrpt_tensor)); + Tensor* descrpt_deriv_tensor = NULL; + OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, + descrpt_deriv_shape, + &descrpt_deriv_tensor)); + Tensor* rij_tensor = NULL; + OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, + rij_shape, + &rij_tensor)); + Tensor* nlist_tensor = NULL; + OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, + nlist_shape, + &nlist_tensor)); + + if(device == "GPU") { + // allocate temp memory, temp memory must not be used after this operation! 
+ Tensor int_temp; + TensorShape int_shape; + int_shape.AddDim(sec.size() + nloc * sec.size() + nloc); + OP_REQUIRES_OK(context, context->allocate_temp(DT_INT32, int_shape, &int_temp)); + Tensor uint64_temp; + TensorShape uint64_shape; + uint64_shape.AddDim(nloc * magic_number * 2); + OP_REQUIRES_OK(context, context->allocate_temp(DT_UINT64, uint64_shape, &uint64_temp)); + + array_int = int_temp.flat().data(); + array_longlong = uint64_temp.flat().data(); + + nbor_update(mesh_tensor.flat().data(), static_cast(mesh_tensor.NumElements())); + } + else if (device == "CPU") { + memcpy (&ilist, 4 + mesh_tensor.flat().data(), sizeof(int *)); + memcpy (&jrange, 8 + mesh_tensor.flat().data(), sizeof(int *)); + memcpy (&jlist, 12 + mesh_tensor.flat().data(), sizeof(int *)); + } + + DescrptSeRFunctor()( + context->eigen_device(), // define actually graph execution device + coord_tensor.matrix().data(), // related to the kk argument + type_tensor.matrix().data(), // also related to the kk argument + mesh_tensor.flat().data(), + ilist, + jrange, + jlist, + array_int, + array_longlong, + avg_tensor.matrix().data(), + std_tensor.matrix().data(), + descrpt_tensor->matrix().data(), + descrpt_deriv_tensor->matrix().data(), + rij_tensor->matrix().data(), + nlist_tensor->matrix().data(), + nloc, + nall, + nnei, + ntypes, + ndescrpt, + rcut, + rcut_smth, + sec, + fill_nei_a, + magic_number + ); + } + +///////////////////////////////////////////////////////////////////////////////////////////// + +private: + float rcut; + float rcut_smth; + std::vector sel; + std::vector sel_null; + std::vector sec; + std::vector sec_null; + int nnei, ndescrpt, nloc, nall; + bool fill_nei_a; + + //private func + void cum_sum (std::vector & sec, const std::vector & n_sel) const { + sec.resize (n_sel.size() + 1); + sec[0] = 0; + for (int ii = 1; ii < sec.size(); ++ii) { + sec[ii] = sec[ii-1] + n_sel[ii-1]; + } + } + + int magic_number; + std::string device; + int *array_int; + unsigned long long*array_longlong; + int * ilist = NULL, * jrange = NULL, * jlist = NULL; + int ilist_size = 0, jrange_size = 0, jlist_size = 0; + bool init = false; + + void nbor_update(const int * mesh, const int size) { + int *mesh_host = new int[size], *ilist_host = NULL, *jrange_host = NULL, *jlist_host = NULL; + cudaErrcheck(cudaMemcpy(mesh_host, mesh, sizeof(int) * size, cudaMemcpyDeviceToHost)); + memcpy (&ilist_host, 4 + mesh_host, sizeof(int *)); + memcpy (&jrange_host, 8 + mesh_host, sizeof(int *)); + memcpy (&jlist_host, 12 + mesh_host, sizeof(int *)); + int const ago = mesh_host[0]; + if (!init) { + ilist_size = (int)(mesh_host[1] * 1.2); + jrange_size = (int)(mesh_host[2] * 1.2); + jlist_size = (int)(mesh_host[3] * 1.2); + cudaErrcheck(cudaMalloc((void **)&ilist, sizeof(int) * ilist_size)); + cudaErrcheck(cudaMalloc((void **)&jrange, sizeof(int) * jrange_size)); + cudaErrcheck(cudaMalloc((void **)&jlist, sizeof(int) * jlist_size)); + init = true; + } + if (ago == 0) { + if (ilist_size < mesh_host[1]) { + ilist_size = (int)(mesh_host[1] * 1.2); + cudaErrcheck(cudaFree(ilist)); + cudaErrcheck(cudaMalloc((void **)&ilist, sizeof(int) * ilist_size)); + } + if (jrange_size < mesh_host[2]) { + jrange_size = (int)(mesh_host[2] * 1.2); + cudaErrcheck(cudaFree(jrange)); + cudaErrcheck(cudaMalloc((void **)&jrange,sizeof(int) * jrange_size)); + } + if (jlist_size < mesh_host[3]) { + jlist_size = (int)(mesh_host[3] * 1.2); + cudaErrcheck(cudaFree(jlist)); + cudaErrcheck(cudaMalloc((void **)&jlist, sizeof(int) * jlist_size)); + } + 
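+            // ilist_host/jrange_host/jlist_host are raw pointers into the caller's
+            // neighbor-list arrays, passed through the mesh tensor; when the list has
+            // just been rebuilt (ago == 0) the fresh contents are copied to the device
+            // buffers below. The 1.2x over-allocation above leaves slack so the device
+            // buffers are not reallocated on every rebuild.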
cudaErrcheck(cudaMemcpy(ilist, ilist_host, sizeof(int) * mesh_host[1], cudaMemcpyHostToDevice)); + cudaErrcheck(cudaMemcpy(jrange, jrange_host, sizeof(int) * mesh_host[2], cudaMemcpyHostToDevice)); + cudaErrcheck(cudaMemcpy(jlist, jlist_host, sizeof(int) * mesh_host[3], cudaMemcpyHostToDevice)); + } + delete [] mesh_host; + } + + int get_magic_number(int const nnei) { + if (nnei <= 256) { + return 256; + } + else if (nnei <= 512) { + return 512; + } + else if (nnei <= 1024) { + return 1024; + } + else if (nnei <= 2048) { + return 2048; + } + else if (nnei <= 4096) { + return 4096; + } + } +}; + +// Register the CPU kernels. +#define REGISTER_CPU(T) \ +REGISTER_KERNEL_BUILDER( \ + Name("DescrptSeR").Device(DEVICE_CPU).TypeConstraint("T"), \ + DescrptSeROp); +REGISTER_CPU(float); +REGISTER_CPU(double); +// Register the GPU kernels. +#if GOOGLE_CUDA +#define REGISTER_GPU(T) \ +REGISTER_KERNEL_BUILDER( \ + Name("DescrptSeR").Device(DEVICE_GPU).TypeConstraint("T").HostMemory("natoms"), \ + DescrptSeROp); +REGISTER_GPU(float); +REGISTER_GPU(double); +#endif // GOOGLE_CUDA \ No newline at end of file diff --git a/source/op/ewald_recp.cc b/source/op/ewald_recp.cc index 3833de1f7b..22c61b7429 100644 --- a/source/op/ewald_recp.cc +++ b/source/op/ewald_recp.cc @@ -10,12 +10,6 @@ typedef double boxtensor_t ; using namespace tensorflow; using namespace std; -#ifdef HIGH_PREC -typedef double VALUETYPE ; -#else -typedef float VALUETYPE ; -#endif - using CPUDevice = Eigen::ThreadPoolDevice; REGISTER_OP("EwaldRecp") @@ -30,7 +24,7 @@ REGISTER_OP("EwaldRecp") .Output("force: T") .Output("virial: T"); -template +template class EwaldRecpOp : public OpKernel { public: explicit EwaldRecpOp(OpKernelConstruction* context) : OpKernel(context) { @@ -81,12 +75,12 @@ class EwaldRecpOp : public OpKernel { Tensor* virial_tensor = NULL; OP_REQUIRES_OK(context, context->allocate_output(cc++, virial_shape, &virial_tensor)); - auto coord = coord_tensor .flat(); - auto charge = charge_tensor .flat(); - auto box = box_tensor .flat(); - auto energy = energy_tensor ->flat(); - auto force = force_tensor ->matrix(); - auto virial = virial_tensor ->matrix(); + auto coord = coord_tensor .flat(); + auto charge = charge_tensor .flat(); + auto box = box_tensor .flat(); + auto energy = energy_tensor ->flat(); + auto force = force_tensor ->matrix(); + auto virial = virial_tensor ->matrix(); for (int kk = 0; kk < nsamples; ++kk){ int box_iter = kk * 9; @@ -113,19 +107,19 @@ class EwaldRecpOp : public OpKernel { else if (inter[dd] >= 1) inter[dd] -= 1.; } } - vector d_coord3 (nloc*3); + vector d_coord3 (nloc*3); for (int ii = 0; ii < nloc * 3; ++ii) { d_coord3[ii] = d_coord3_[ii]; } // set charge - vector d_charge (nloc); + vector d_charge (nloc); for (int ii = 0; ii < nloc; ++ii) d_charge[ii] = charge(charge_iter + ii); // prepare outputs vectors - T d_ener; - vector d_force(nloc*3); - vector d_virial(9); + FPTYPE d_ener; + vector d_force(nloc*3); + vector d_virial(9); // compute EwaldReciprocal(d_ener, d_force, d_virial, d_coord3, d_charge, region, ep); @@ -141,11 +135,12 @@ class EwaldRecpOp : public OpKernel { } } private: - EwaldParameters ep; + EwaldParameters ep; }; #define REGISTER_CPU(T) \ REGISTER_KERNEL_BUILDER( \ Name("EwaldRecp").Device(DEVICE_CPU).TypeConstraint("T"), \ EwaldRecpOp); -REGISTER_CPU(VALUETYPE); +REGISTER_CPU(float); +REGISTER_CPU(double); diff --git a/source/op/gelu.cc b/source/op/gelu.cc index 26c53c8511..7012438db9 100644 --- a/source/op/gelu.cc +++ b/source/op/gelu.cc @@ -2,11 +2,11 @@ #include 
"tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/register_types.h" #include "tensorflow/core/framework/shape_inference.h" -#define SQRT_2_PI 0.7978845608028654 using namespace tensorflow; using CPUDevice = Eigen::ThreadPoolDevice; using GPUDevice = Eigen::GpuDevice; +#define SQRT_2_PI 0.7978845608028654 REGISTER_OP("Gelu") .Attr("T: {float, double}") @@ -26,43 +26,68 @@ REGISTER_OP("GeluGradGrad") .Input("x: T") .Output("output: T"); -template +#if GOOGLE_CUDA +// maybe instead use cudnn activation forward +void GeluLauncher(const float * in, float * out, int const size); +void GeluLauncher(const double * in, double * out, int const size); +void GeluGradLauncher(const float * dy, const float * in, float * out, int const size); +void GeluGradLauncher(const double * dy, const double * in, double * out, int const size); +void GeluGradGradLauncher(const float * dy, const float * dy_, const float * in, float * out, int const size); +void GeluGradGradLauncher(const double * dy, const double * dy_, const double * in, double * out, int const size); +#endif // GOOGLE_CUDa + +template struct GeluFunctor { - void operator()(const Device& d, const T * in, T * out, int const size) { + void operator()(const CPUDevice& d, const FPTYPE * in, FPTYPE * out, int const size) { #pragma omp parallel for for (int ii = 0; ii < size; ii++) { out[ii] = in[ii] * 0.5 * (1.0 + tanh(SQRT_2_PI * (in[ii] + 0.044715 * in[ii] * in[ii] * in[ii]))); } } + #if GOOGLE_CUDA + void operator()(const GPUDevice& d, const FPTYPE * in, FPTYPE * out, int const size) { + GeluLauncher(in, out, size); + } + #endif }; -template +template struct GeluGradFunctor { - void operator()(const Device& d, const T * dy, const T * in, T * out, int const size) { + void operator()(const CPUDevice& d, const FPTYPE * dy, const FPTYPE * in, FPTYPE * out, int const size) { #pragma omp parallel for for (int ii = 0; ii < size; ii++) { - T const var1 = tanh(SQRT_2_PI * (in[ii] + 0.044715 * in[ii] * in[ii] *in[ii])); + FPTYPE const var1 = tanh(SQRT_2_PI * (in[ii] + 0.044715 * in[ii] * in[ii] *in[ii])); out[ii] = dy[ii] * (0.5 * SQRT_2_PI * in[ii] * (1 - var1 * var1) * (0.134145 * in[ii] * in[ii] + 1) + 0.5 * var1 + 0.5); } } + #if GOOGLE_CUDA + void operator()(const GPUDevice& d, const FPTYPE * dy, const FPTYPE * in, FPTYPE * out, int const size) { + GeluGradLauncher(dy, in, out, size); + } + #endif }; -template +template struct GeluGradGradFunctor { - void operator()(const Device& d, const T * dy, const T * dy_, const T * in, T * out, int const size) { + void operator()(const CPUDevice& d, const FPTYPE * dy, const FPTYPE * dy_, const FPTYPE * in, FPTYPE * out, int const size) { #pragma omp parallel for for (int ii = 0; ii < size; ii++) { - T const var1 = tanh(SQRT_2_PI * (in[ii] + 0.044715 * in[ii] * in[ii] *in[ii])); - T const var2 = SQRT_2_PI * (1 - var1 * var1) * (0.134145 * in[ii] * in[ii] + 1); + FPTYPE const var1 = tanh(SQRT_2_PI * (in[ii] + 0.044715 * in[ii] * in[ii] *in[ii])); + FPTYPE const var2 = SQRT_2_PI * (1 - var1 * var1) * (0.134145 * in[ii] * in[ii] + 1); out[ii] = dy[ii] * dy_[ii] * (0.134145 * SQRT_2_PI * in[ii] * in[ii] * (1 - var1 * var1) - SQRT_2_PI * in[ii] * var2 * (0.134145 * in[ii] * in[ii] + 1) * var1 + var2); } } + #if GOOGLE_CUDA + void operator()(const GPUDevice& d, const FPTYPE * dy, const FPTYPE * dy_, const FPTYPE * in, FPTYPE * out, int const size) { + GeluGradGradLauncher(dy, dy_, in, out, size); + } + #endif }; // OpKernel definition. -// template parameter is the datatype of the tensors. 
-template +// template parameter is the datatype of the tensors. +template class GeluOp : public OpKernel { public : explicit GeluOp(OpKernelConstruction* context) : OpKernel(context) {} @@ -70,26 +95,25 @@ class GeluOp : public OpKernel { void Compute(OpKernelContext* context) override { // Grab the input tensor const Tensor& x = context->input(0); - Tensor * output = NULL; int context_output_index = 0; OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, x.shape(), &output)); - GeluFunctor()( + GeluFunctor()( context->eigen_device(), - x.flat().data(), - output->flat().data(), + x.flat().data(), + output->flat().data(), static_cast(output->NumElements()) ); - // GeluLauncher(x.flat().data(), output->flat().data(), static_cast(output->NumElements())); + // GeluLauncher(x.flat().data(), output->flat().data(), static_cast(output->NumElements())); } }; // OpKernel definition. -// template parameter is the datatype of the tensors. -template +// template parameter is the datatype of the tensors. +template class GeluGradOp : public OpKernel { public : explicit GeluGradOp(OpKernelConstruction* context) : OpKernel(context) {} @@ -105,20 +129,20 @@ class GeluGradOp : public OpKernel { x.shape(), &output)); - GeluGradFunctor()( + GeluGradFunctor()( context->eigen_device(), - dy.flat().data(), - x.flat().data(), - output->flat().data(), + dy.flat().data(), + x.flat().data(), + output->flat().data(), static_cast(output->NumElements()) ); - // GeluGradLauncher(dy.flat().data(), x.flat().data(), output->flat().data(), static_cast(output->NumElements())); + // GeluGradLauncher(dy.flat().data(), x.flat().data(), output->flat().data(), static_cast(output->NumElements())); } }; // OpKernel definition. -// template parameter is the datatype of the tensors. -template +// template parameter is the datatype of the tensors. +template class GeluGradGradOp : public OpKernel { public : explicit GeluGradGradOp(OpKernelConstruction* context) : OpKernel(context) {} @@ -135,12 +159,12 @@ class GeluGradGradOp : public OpKernel { x.shape(), &output)); - GeluGradGradFunctor()( + GeluGradGradFunctor()( context->eigen_device(), - dy.flat().data(), - dy_.flat().data(), - x.flat().data(), - output->flat().data(), + dy.flat().data(), + dy_.flat().data(), + x.flat().data(), + output->flat().data(), static_cast(output->NumElements()) ); // GeluGradGradLauncher(dy.flat().data(), x.flat().data(), output->flat().data(), static_cast(output->NumElements())); @@ -148,17 +172,35 @@ class GeluGradGradOp : public OpKernel { }; #define REGISTER_CPU(T) \ - /* Declare explicit instantiations in kernel_example.cu.cc. */ \ - REGISTER_KERNEL_BUILDER( \ - Name("Gelu").Device(DEVICE_CPU).TypeConstraint("T"), \ - GeluOp); \ - /* Declare explicit instantiations in kernel_example.cu.cc. */ \ - REGISTER_KERNEL_BUILDER( \ - Name("GeluGrad").Device(DEVICE_CPU).TypeConstraint("T"), \ - GeluGradOp); \ - /* Declare explicit instantiations in kernel_example.cu.cc. */ \ - REGISTER_KERNEL_BUILDER( \ - Name("GeluGradGrad").Device(DEVICE_CPU).TypeConstraint("T"), \ - GeluGradGradOp); - REGISTER_CPU(float); - REGISTER_CPU(double); \ No newline at end of file +/* Declare explicit instantiations in kernel_example.cu.cc. */ \ +REGISTER_KERNEL_BUILDER( \ + Name("Gelu").Device(DEVICE_CPU).TypeConstraint("T"), \ + GeluOp); \ +/* Declare explicit instantiations in kernel_example.cu.cc. 
*/ \ +REGISTER_KERNEL_BUILDER( \ + Name("GeluGrad").Device(DEVICE_CPU).TypeConstraint("T"), \ + GeluGradOp); \ +/* Declare explicit instantiations in kernel_example.cu.cc. */ \ +REGISTER_KERNEL_BUILDER( \ + Name("GeluGradGrad").Device(DEVICE_CPU).TypeConstraint("T"), \ + GeluGradGradOp); +REGISTER_CPU(float); +REGISTER_CPU(double); + +#if GOOGLE_CUDA +#define REGISTER_GPU(T) \ +/* Declare explicit instantiations in kernel_example.cu.cc. */ \ +REGISTER_KERNEL_BUILDER( \ + Name("Gelu").Device(DEVICE_GPU).TypeConstraint("T"), \ + GeluOp); \ +/* Declare explicit instantiations in kernel_example.cu.cc. */ \ +REGISTER_KERNEL_BUILDER( \ + Name("GeluGrad").Device(DEVICE_GPU).TypeConstraint("T"), \ + GeluGradOp); \ +/* Declare explicit instantiations in kernel_example.cu.cc. */ \ +REGISTER_KERNEL_BUILDER( \ + Name("GeluGradGrad").Device(DEVICE_GPU).TypeConstraint("T"), \ + GeluGradGradOp); +REGISTER_GPU(float); +REGISTER_GPU(double); +#endif // GOOGLE_CUDA diff --git a/source/op/gelu_gpu.cc b/source/op/gelu_gpu.cc deleted file mode 100644 index 34d4183f98..0000000000 --- a/source/op/gelu_gpu.cc +++ /dev/null @@ -1,159 +0,0 @@ -#include "tensorflow/core/framework/op.h" -#include "tensorflow/core/framework/op_kernel.h" -#include "tensorflow/core/framework/register_types.h" -#include "tensorflow/core/framework/shape_inference.h" - -using namespace tensorflow; -using CPUDevice = Eigen::ThreadPoolDevice; -using GPUDevice = Eigen::GpuDevice; - -REGISTER_OP("Gelu") - .Attr("T: {float, double}") - .Input("x: T") - .Output("output: T"); - -REGISTER_OP("GeluGrad") - .Attr("T: {float, double}") - .Input("dy: T") - .Input("x: T") - .Output("output: T"); - -REGISTER_OP("GeluGradGrad") - .Attr("T: {float, double}") - .Input("dy: T") - .Input("dy_: T") - .Input("x: T") - .Output("output: T"); - -// maybe instead use cudnn activation forward -void GeluLauncher(const float * in, float * out, int const size); -void GeluLauncher(const double * in, double * out, int const size); - -void GeluGradLauncher(const float * dy, const float * in, float * out, int const size); -void GeluGradLauncher(const double * dy, const double * in, double * out, int const size); - -void GeluGradGradLauncher(const float * dy, const float * dy_, const float * in, float * out, int const size); -void GeluGradGradLauncher(const double * dy, const double * dy_, const double * in, double * out, int const size); - -template -struct GeluFunctor { - void operator()(const Device& d, const T * in, T * out, int const size) { - GeluLauncher(in, out, size); - } -}; - -template -struct GeluGradFunctor { - void operator()(const Device& d, const T * dy, const T * in, T * out, int const size) { - GeluGradLauncher(dy, in, out, size); - } -}; - -template -struct GeluGradGradFunctor { - void operator()(const Device& d, const T * dy, const T * dy_, const T * in, T * out, int const size) { - GeluGradGradLauncher(dy, dy_, in, out, size); - } -}; - -// OpKernel definition. -// template parameter is the datatype of the tensors. 
-template -class GeluOp : public OpKernel { - public : - explicit GeluOp(OpKernelConstruction* context) : OpKernel(context) {} - - void Compute(OpKernelContext* context) override { - // Grab the input tensor - const Tensor& x = context->input(0); - Tensor * output = NULL; - int context_output_index = 0; - OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, - x.shape(), - &output)); - - GeluFunctor()( - context->eigen_device(), - x.flat().data(), - output->flat().data(), - static_cast(output->NumElements()) - ); - // GeluLauncher(x.flat().data(), output->flat().data(), static_cast(output->NumElements())); - } -}; - -// OpKernel definition. -// template parameter is the datatype of the tensors. -template -class GeluGradOp : public OpKernel { - public : - explicit GeluGradOp(OpKernelConstruction* context) : OpKernel(context) {} - - void Compute(OpKernelContext* context) override { - // Grab the input tensor - const Tensor& dy = context->input(0); - const Tensor& x = context->input(1); - - Tensor * output = NULL; - int context_output_index = 0; - OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, - x.shape(), - &output)); - - GeluGradFunctor()( - context->eigen_device(), - dy.flat().data(), - x.flat().data(), - output->flat().data(), - static_cast(output->NumElements()) - ); - // GeluGradLauncher(dy.flat().data(), x.flat().data(), output->flat().data(), static_cast(output->NumElements())); - } -}; - -// OpKernel definition. -// template parameter is the datatype of the tensors. -template -class GeluGradGradOp : public OpKernel { - public : - explicit GeluGradGradOp(OpKernelConstruction* context) : OpKernel(context) {} - - void Compute(OpKernelContext* context) override { - // Grab the input tensor - const Tensor& dy = context->input(0); - const Tensor& dy_ = context->input(1); - const Tensor& x = context->input(2); - - Tensor * output = NULL; - int context_output_index = 0; - OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, - x.shape(), - &output)); - - GeluGradGradFunctor()( - context->eigen_device(), - dy.flat().data(), - dy_.flat().data(), - x.flat().data(), - output->flat().data(), - static_cast(output->NumElements()) - ); - // GeluGradGradLauncher(dy.flat().data(), x.flat().data(), output->flat().data(), static_cast(output->NumElements())); - } -}; - -#define REGISTER_GPU(T) \ - /* Declare explicit instantiations in kernel_example.cu.cc. */ \ - REGISTER_KERNEL_BUILDER( \ - Name("Gelu").Device(DEVICE_GPU).TypeConstraint("T"), \ - GeluOp); \ - /* Declare explicit instantiations in kernel_example.cu.cc. */ \ - REGISTER_KERNEL_BUILDER( \ - Name("GeluGrad").Device(DEVICE_GPU).TypeConstraint("T"), \ - GeluGradOp); \ - /* Declare explicit instantiations in kernel_example.cu.cc. 
*/ \ - REGISTER_KERNEL_BUILDER( \ - Name("GeluGradGrad").Device(DEVICE_GPU).TypeConstraint("T"), \ - GeluGradGradOp); - REGISTER_GPU(float); - REGISTER_GPU(double); diff --git a/source/op/gelu_multi_device.cc b/source/op/gelu_multi_device.cc index 2a5656e6d7..f84c9c0f9f 100644 --- a/source/op/gelu_multi_device.cc +++ b/source/op/gelu_multi_device.cc @@ -19,45 +19,45 @@ REGISTER_OP("GeluGradGrad") .Input("x: T") .Output("output: T"); -template +template struct GeluFunctor { - void operator()(const CPUDevice& d, const T * in, T * out, int const size) { + void operator()(const CPUDevice& d, const FPTYPE * in, FPTYPE * out, int const size) { GeluCPULauncher(in, out, size); } #if GOOGLE_CUDA - void operator()(const GPUDevice& d, const T * in, T * out, int const size) { + void operator()(const GPUDevice& d, const FPTYPE * in, FPTYPE * out, int const size) { GeluGPULauncher(in, out, size); } #endif }; -template +template struct GeluGradFunctor { - void operator()(const CPUDevice& d, const T * dy, const T * in, T * out, int const size) { + void operator()(const CPUDevice& d, const FPTYPE * dy, const FPTYPE * in, FPTYPE * out, int const size) { GeluGradCPULauncher(dy, in, out, size); } #if GOOGLE_CUDA - void operator()(const GPUDevice& d, const T * dy, const T * in, T * out, int const size) { + void operator()(const GPUDevice& d, const FPTYPE * dy, const FPTYPE * in, FPTYPE * out, int const size) { GeluGradGPULauncher(dy, in, out, size); } #endif }; -template +template struct GeluGradGradFunctor { - void operator()(const CPUDevice& d, const T * dy, const T * dy_, const T * in, T * out, int const size) { + void operator()(const CPUDevice& d, const FPTYPE * dy, const FPTYPE * dy_, const FPTYPE * in, FPTYPE * out, int const size) { GeluGradGradCPULauncher(dy, dy_, in, out, size); } #if GOOGLE_CUDA - void operator()(const GPUDevice& d, const T * dy, const T * dy_, const T * in, T * out, int const size) { + void operator()(const GPUDevice& d, const FPTYPE * dy, const FPTYPE * dy_, const FPTYPE * in, FPTYPE * out, int const size) { GeluGradGradGPULauncher(dy, dy_, in, out, size); } #endif }; // OpKernel definition. -// template parameter is the datatype of the tensors. -template +// template parameter is the datatype of the tensors. +template class GeluOp : public OpKernel { public : explicit GeluOp(OpKernelConstruction* context) : OpKernel(context) {} @@ -71,18 +71,18 @@ class GeluOp : public OpKernel { x.shape(), &output)); - GeluFunctor()( + GeluFunctor()( context->eigen_device(), - x.flat().data(), - output->flat().data(), + x.flat().data(), + output->flat().data(), static_cast(output->NumElements()) ); } }; // OpKernel definition. -// template parameter is the datatype of the tensors. -template +// template parameter is the datatype of the tensors. +template class GeluGradOp : public OpKernel { public : explicit GeluGradOp(OpKernelConstruction* context) : OpKernel(context) {} @@ -98,19 +98,19 @@ class GeluGradOp : public OpKernel { x.shape(), &output)); - GeluGradFunctor()( + GeluGradFunctor()( context->eigen_device(), - dy.flat().data(), - x.flat().data(), - output->flat().data(), + dy.flat().data(), + x.flat().data(), + output->flat().data(), static_cast(output->NumElements()) ); } }; // OpKernel definition. -// template parameter is the datatype of the tensors. -template +// template parameter is the datatype of the tensors. 
+template class GeluGradGradOp : public OpKernel { public : explicit GeluGradGradOp(OpKernelConstruction* context) : OpKernel(context) {} @@ -127,12 +127,12 @@ class GeluGradGradOp : public OpKernel { x.shape(), &output)); - GeluGradGradFunctor()( + GeluGradGradFunctor()( context->eigen_device(), - dy.flat().data(), - dy_.flat().data(), - x.flat().data(), - output->flat().data(), + dy.flat().data(), + dy_.flat().data(), + x.flat().data(), + output->flat().data(), static_cast(output->NumElements()) ); } diff --git a/source/op/prod_force.cc b/source/op/prod_force.cc index e1c6c362c8..7ba99b6e81 100644 --- a/source/op/prod_force.cc +++ b/source/op/prod_force.cc @@ -6,12 +6,6 @@ using namespace tensorflow; using namespace std; -#ifdef HIGH_PREC -typedef double VALUETYPE; -#else -typedef float VALUETYPE; -#endif - REGISTER_OP("ProdForce") .Attr("T: {float, double}") .Input("net_deriv: T") @@ -28,7 +22,7 @@ using namespace tensorflow; using CPUDevice = Eigen::ThreadPoolDevice; -template +template class ProdForceOp : public OpKernel { public: explicit ProdForceOp(OpKernelConstruction* context) : OpKernel(context) { @@ -79,11 +73,11 @@ class ProdForceOp : public OpKernel { OP_REQUIRES_OK(context, context->allocate_output(0, force_shape, &force_tensor)); // flat the tensors - auto net_deriv = net_deriv_tensor.flat(); - auto in_deriv = in_deriv_tensor.flat(); + auto net_deriv = net_deriv_tensor.flat(); + auto in_deriv = in_deriv_tensor.flat(); auto nlist = nlist_tensor.flat(); auto axis = axis_tensor.flat(); - auto force = force_tensor->flat(); + auto force = force_tensor->flat(); // loop over samples #pragma omp parallel for @@ -173,6 +167,7 @@ class ProdForceOp : public OpKernel { REGISTER_KERNEL_BUILDER( \ Name("ProdForce").Device(DEVICE_CPU).TypeConstraint("T"), \ ProdForceOp); -REGISTER_CPU(VALUETYPE); +REGISTER_CPU(float); +REGISTER_CPU(double); diff --git a/source/op/prod_force_grad.cc b/source/op/prod_force_grad.cc index 48f55a977b..2c8e62550c 100644 --- a/source/op/prod_force_grad.cc +++ b/source/op/prod_force_grad.cc @@ -6,12 +6,6 @@ using namespace tensorflow; using namespace std; -#ifdef HIGH_PREC -typedef double VALUETYPE; -#else -typedef float VALUETYPE; -#endif - REGISTER_OP("ProdForceGrad") .Attr("T: {float, double}") .Input("grad: T") @@ -26,7 +20,7 @@ REGISTER_OP("ProdForceGrad") using CPUDevice = Eigen::ThreadPoolDevice; -template +template class ProdForceGradOp : public OpKernel { public: @@ -88,12 +82,12 @@ class ProdForceGradOp : public OpKernel OP_REQUIRES_OK(context, context->allocate_output(0, grad_net_shape, &grad_net_tensor)); // flat the tensors - auto grad = grad_tensor .flat(); - auto net_deriv = net_deriv_tensor .flat(); - auto in_deriv = in_deriv_tensor .flat(); + auto grad = grad_tensor .flat(); + auto net_deriv = net_deriv_tensor .flat(); + auto in_deriv = in_deriv_tensor .flat(); auto nlist = nlist_tensor .flat(); auto axis = axis_tensor .flat(); - auto grad_net = grad_net_tensor ->flat(); + auto grad_net = grad_net_tensor ->flat(); // loop over frames #pragma omp parallel for @@ -185,4 +179,5 @@ class ProdForceGradOp : public OpKernel REGISTER_KERNEL_BUILDER( \ Name("ProdForceGrad").Device(DEVICE_CPU).TypeConstraint("T"), \ ProdForceGradOp); -REGISTER_CPU(VALUETYPE); \ No newline at end of file +REGISTER_CPU(float); +REGISTER_CPU(double); \ No newline at end of file diff --git a/source/op/prod_force_se_a.cc b/source/op/prod_force_se_a.cc index c762bf71fd..1b5053377c 100644 --- a/source/op/prod_force_se_a.cc +++ b/source/op/prod_force_se_a.cc @@ -6,12 +6,6 @@ using 
namespace tensorflow; using namespace std; -#ifdef HIGH_PREC -typedef double VALUETYPE; -#else -typedef float VALUETYPE; -#endif - REGISTER_OP("ProdForceSeA") .Attr("T: {float, double}") .Input("net_deriv: T") @@ -28,7 +22,7 @@ using namespace tensorflow; using CPUDevice = Eigen::ThreadPoolDevice; using GPUDevice = Eigen::GpuDevice; -template +template class ProdForceSeAOp : public OpKernel { public: explicit ProdForceSeAOp(OpKernelConstruction* context) : OpKernel(context) { @@ -78,10 +72,10 @@ class ProdForceSeAOp : public OpKernel { force_shape, &force_tensor)); // flat the tensors - auto net_deriv = net_deriv_tensor.flat(); - auto in_deriv = in_deriv_tensor.flat(); + auto net_deriv = net_deriv_tensor.flat(); + auto in_deriv = in_deriv_tensor.flat(); auto nlist = nlist_tensor.flat(); - auto force = force_tensor->flat(); + auto force = force_tensor->flat(); assert (nframes == force_shape.dim_size(0)); assert (nframes == net_deriv_tensor.shape().dim_size(0)); @@ -155,5 +149,6 @@ class ProdForceSeAOp : public OpKernel { REGISTER_KERNEL_BUILDER( \ Name("ProdForceSeA").Device(DEVICE_CPU).TypeConstraint("T"), \ ProdForceSeAOp); -REGISTER_CPU(VALUETYPE); +REGISTER_CPU(float); +REGISTER_CPU(double); diff --git a/source/op/prod_force_se_a_grad.cc b/source/op/prod_force_se_a_grad.cc index d884782f17..884e46f9a8 100644 --- a/source/op/prod_force_se_a_grad.cc +++ b/source/op/prod_force_se_a_grad.cc @@ -6,12 +6,6 @@ using namespace tensorflow; using namespace std; -#ifdef HIGH_PREC -typedef double VALUETYPE; -#else -typedef float VALUETYPE; -#endif - REGISTER_OP("ProdForceSeAGrad") .Attr("T: {float, double}") .Input("grad: T") @@ -25,7 +19,7 @@ REGISTER_OP("ProdForceSeAGrad") using CPUDevice = Eigen::ThreadPoolDevice; -template +template class ProdForceSeAGradOp : public OpKernel { public: @@ -83,11 +77,11 @@ class ProdForceSeAGradOp : public OpKernel OP_REQUIRES_OK(context, context->allocate_output(0, grad_net_shape, &grad_net_tensor)); // flat the tensors - auto grad = grad_tensor .flat(); - auto net_deriv = net_deriv_tensor .flat(); - auto in_deriv = in_deriv_tensor .flat(); + auto grad = grad_tensor .flat(); + auto net_deriv = net_deriv_tensor .flat(); + auto in_deriv = in_deriv_tensor .flat(); auto nlist = nlist_tensor .flat(); - auto grad_net = grad_net_tensor ->flat(); + auto grad_net = grad_net_tensor ->flat(); // loop over frames #pragma omp parallel for @@ -154,4 +148,5 @@ class ProdForceSeAGradOp : public OpKernel REGISTER_KERNEL_BUILDER( \ Name("ProdForceSeAGrad").Device(DEVICE_CPU).TypeConstraint("T"), \ ProdForceSeAGradOp); -REGISTER_CPU(VALUETYPE); \ No newline at end of file +REGISTER_CPU(float); +REGISTER_CPU(double); \ No newline at end of file diff --git a/source/op/prod_force_se_a_multi_device.cc b/source/op/prod_force_se_a_multi_device.cc index a864617208..87a3ae3ecc 100644 --- a/source/op/prod_force_se_a_multi_device.cc +++ b/source/op/prod_force_se_a_multi_device.cc @@ -11,19 +11,19 @@ REGISTER_OP("ProdForceSeA") .Attr("n_r_sel: int") .Output("force: T"); -template +template struct ProdForceSeAFunctor { - void operator()(const CPUDevice& d, T * force, const T * net_deriv, const T * in_deriv, const int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const int n_a_sel, const int n_a_shift) { + void operator()(const CPUDevice& d, FPTYPE * force, const FPTYPE * net_deriv, const FPTYPE * in_deriv, const int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const int n_a_sel, const int n_a_shift) { 
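    // Chain-rule contraction performed by the launcher: for each local atom the
    // force accumulates -sum_a net_deriv(i,a) * in_deriv(i,a,:), where net_deriv
    // holds dE/dD and in_deriv holds dD/dr; per-neighbor contributions are
    // scattered to the atoms listed in nlist (same contract as the CPU reference
    // op in prod_force_se_a.cc above).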
ProdForceSeACPULauncher(force, net_deriv, in_deriv, nlist, nloc, nall, nnei, ndescrpt, n_a_sel, n_a_shift); } #if GOOGLE_CUDA - void operator()(const GPUDevice& d, T * force, const T * net_deriv, const T * in_deriv, const int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const int n_a_sel, const int n_a_shift) { + void operator()(const GPUDevice& d, FPTYPE * force, const FPTYPE * net_deriv, const FPTYPE * in_deriv, const int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const int n_a_sel, const int n_a_shift) { ProdForceSeAGPULauncher(force, net_deriv, in_deriv, nlist, nloc, nall, nnei, ndescrpt, n_a_sel, n_a_shift); } #endif // GOOGLE_CUDA }; -template +template class ProdForceSeAOp : public OpKernel { public: explicit ProdForceSeAOp(OpKernelConstruction* context) : OpKernel(context) { @@ -72,10 +72,10 @@ class ProdForceSeAOp : public OpKernel { force_shape, &force_tensor)); // flat the tensors - auto net_deriv = net_deriv_tensor.flat(); - auto in_deriv = in_deriv_tensor.flat(); + auto net_deriv = net_deriv_tensor.flat(); + auto in_deriv = in_deriv_tensor.flat(); auto nlist = nlist_tensor.flat(); - auto force = force_tensor->flat(); + auto force = force_tensor->flat(); assert (nframes == force_shape.dim_size(0)); assert (nframes == net_deriv_tensor.shape().dim_size(0)); @@ -87,11 +87,11 @@ class ProdForceSeAOp : public OpKernel { assert (nloc * nnei == nlist_tensor.shape().dim_size(1)); assert (nnei * 4 == ndescrpt); - ProdForceSeAFunctor()( + ProdForceSeAFunctor()( context->eigen_device(), - force_tensor->flat().data(), - net_deriv_tensor.flat().data(), - in_deriv_tensor.flat().data(), + force_tensor->flat().data(), + net_deriv_tensor.flat().data(), + in_deriv_tensor.flat().data(), nlist_tensor.flat().data(), nloc, nall, diff --git a/source/op/prod_force_se_r.cc b/source/op/prod_force_se_r.cc index a8e5b69162..4347442167 100644 --- a/source/op/prod_force_se_r.cc +++ b/source/op/prod_force_se_r.cc @@ -6,12 +6,6 @@ using namespace tensorflow; using namespace std; -#ifdef HIGH_PREC -typedef double VALUETYPE; -#else -typedef float VALUETYPE; -#endif - REGISTER_OP("ProdForceSeR") .Attr("T: {float, double}") .Input("net_deriv: T") @@ -24,7 +18,7 @@ using namespace tensorflow; using CPUDevice = Eigen::ThreadPoolDevice; -template +template class ProdForceSeROp : public OpKernel { public: explicit ProdForceSeROp(OpKernelConstruction* context) : OpKernel(context) { @@ -69,10 +63,10 @@ class ProdForceSeROp : public OpKernel { force_shape, &force_tensor)); // flat the tensors - auto net_deriv = net_deriv_tensor.flat(); - auto in_deriv = in_deriv_tensor.flat(); + auto net_deriv = net_deriv_tensor.flat(); + auto in_deriv = in_deriv_tensor.flat(); auto nlist = nlist_tensor.flat(); - auto force = force_tensor->flat(); + auto force = force_tensor->flat(); assert (nframes == force_shape.dim_size(0)); assert (nframes == net_deriv_tensor.shape().dim_size(0)); @@ -127,6 +121,7 @@ class ProdForceSeROp : public OpKernel { REGISTER_KERNEL_BUILDER( \ Name("ProdForceSeR").Device(DEVICE_CPU).TypeConstraint("T"), \ ProdForceSeROp); -REGISTER_CPU(VALUETYPE); +REGISTER_CPU(float); +REGISTER_CPU(double); diff --git a/source/op/prod_force_se_r_grad.cc b/source/op/prod_force_se_r_grad.cc index 488492f699..dfe9a5ff98 100644 --- a/source/op/prod_force_se_r_grad.cc +++ b/source/op/prod_force_se_r_grad.cc @@ -6,12 +6,6 @@ using namespace tensorflow; using namespace std; -#ifdef HIGH_PREC -typedef double VALUETYPE; -#else -typedef float VALUETYPE; -#endif - 
REGISTER_OP("ProdForceSeRGrad") .Attr("T: {float, double}") .Input("grad: T") @@ -23,7 +17,7 @@ REGISTER_OP("ProdForceSeRGrad") using CPUDevice = Eigen::ThreadPoolDevice; -template +template class ProdForceSeRGradOp : public OpKernel { public: @@ -77,11 +71,11 @@ class ProdForceSeRGradOp : public OpKernel OP_REQUIRES_OK(context, context->allocate_output(0, grad_net_shape, &grad_net_tensor)); // flat the tensors - auto grad = grad_tensor .flat(); - auto net_deriv = net_deriv_tensor .flat(); - auto in_deriv = in_deriv_tensor .flat(); + auto grad = grad_tensor .flat(); + auto net_deriv = net_deriv_tensor .flat(); + auto in_deriv = in_deriv_tensor .flat(); auto nlist = nlist_tensor .flat(); - auto grad_net = grad_net_tensor ->flat(); + auto grad_net = grad_net_tensor ->flat(); // loop over frames #pragma omp parallel for @@ -129,4 +123,5 @@ class ProdForceSeRGradOp : public OpKernel REGISTER_KERNEL_BUILDER( \ Name("ProdForceSeRGrad").Device(DEVICE_CPU).TypeConstraint("T"), \ ProdForceSeRGradOp); -REGISTER_CPU(VALUETYPE); \ No newline at end of file +REGISTER_CPU(float); +REGISTER_CPU(double); \ No newline at end of file diff --git a/source/op/prod_force_se_r_gpu.cc b/source/op/prod_force_se_r_multi_device.cc similarity index 55% rename from source/op/prod_force_se_r_gpu.cc rename to source/op/prod_force_se_r_multi_device.cc index 8f6ee24910..0a97d17742 100644 --- a/source/op/prod_force_se_r_gpu.cc +++ b/source/op/prod_force_se_r_multi_device.cc @@ -1,26 +1,5 @@ -#include "tensorflow/core/framework/op.h" -#include "tensorflow/core/framework/op_kernel.h" -#include "tensorflow/core/framework/shape_inference.h" -#include -#include - -using namespace tensorflow; - -#define cudaErrcheck(res) { cudaAssert((res), __FILE__, __LINE__); } -inline void cudaAssert(cudaError_t code, const char *file, int line, bool abort=true) -{ - if (code != cudaSuccess) - { - fprintf(stderr,"cuda assert: %s %s %d\n", cudaGetErrorString(code), file, line); - if (abort) exit(code); - } -} - -#ifdef HIGH_PREC - typedef double VALUETYPE; -#else - typedef float VALUETYPE; -#endif +#include "common.h" +#include "CustomeOperation.h" REGISTER_OP("ProdForceSeR") .Attr("T: {float, double}") @@ -30,26 +9,22 @@ REGISTER_OP("ProdForceSeR") .Input("natoms: int32") .Output("force: T"); -using CPUDevice = Eigen::ThreadPoolDevice; -using GPUDevice = Eigen::GpuDevice; - -void ProdForceSeRLauncher(VALUETYPE * force, - const VALUETYPE * net_deriv, - const VALUETYPE * in_deriv, - const int * nlist, - const int nloc, - const int nall, - const int ndescrpt, - const int nnei, - const int n_a_sel, - const int n_a_shift); +template +struct ProdForceSeRFunctor { + void operator()(const CPUDevice& d, T * force, const T * net_deriv, const T * in_deriv, const int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt) { + ProdForceSeRCPULauncher(force, net_deriv, in_deriv, nlist, nloc, nall, nnei, ndescrpt); + } + #if GOOGLE_CUDA + void operator()(const GPUDevice& d, T * force, const T * net_deriv, const T * in_deriv, const int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt) { + ProdForceSeRGPULauncher(force, net_deriv, in_deriv, nlist, nloc, nall, nnei, ndescrpt); + } + #endif // GOOGLE_CUDA +}; template class ProdForceSeROp : public OpKernel { public: - explicit ProdForceSeROp(OpKernelConstruction* context) : OpKernel(context) { - // std::cout << "I'm in prod_force_se_r_gpu.cc" << std::endl; - } + explicit ProdForceSeROp(OpKernelConstruction* context) : OpKernel(context) {} void 
Compute(OpKernelContext* context) override { // Grab the input tensor @@ -66,8 +41,7 @@ class ProdForceSeROp : public OpKernel { OP_REQUIRES (context, (natoms_tensor.shape().dims() == 1), errors::InvalidArgument ("Dim of natoms should be 1")); OP_REQUIRES (context, (natoms_tensor.shape().dim_size(0) >= 3), errors::InvalidArgument ("number of atoms should be larger than (or equal to) 3")); - int * natoms = new int[natoms_tensor.shape().dim_size(0)]; - cudaErrcheck(cudaMemcpy(natoms, natoms_tensor.flat().data(), sizeof(int) * natoms_tensor.shape().dim_size(0), cudaMemcpyDeviceToHost)); + const int * natoms = natoms_tensor.flat().data(); int nloc = natoms[0]; int nall = natoms[1]; int nframes = net_deriv_tensor.shape().dim_size(0); @@ -79,11 +53,9 @@ class ProdForceSeROp : public OpKernel { OP_REQUIRES (context, (nframes == nlist_tensor.shape().dim_size(0)), errors::InvalidArgument ("number of samples should match")); OP_REQUIRES (context, (nloc * ndescrpt * 3 == in_deriv_tensor.shape().dim_size(1)), errors::InvalidArgument ("number of descriptors should match")); - // OP_REQUIRES (context, (nnei == n_a_sel + n_r_sel), errors::InvalidArgument ("number of neighbors should match")); - // OP_REQUIRES (context, (0 == n_r_sel), errors::InvalidArgument ("Rotational free only support all-angular information")); // Create an output tensor - TensorShape force_shape; + TensorShape force_shape ; force_shape.AddDim (nframes); force_shape.AddDim (3 * nall); Tensor* force_tensor = NULL; @@ -92,10 +64,10 @@ class ProdForceSeROp : public OpKernel { force_shape, &force_tensor)); // flat the tensors - auto net_deriv = net_deriv_tensor.flat(); - auto in_deriv = in_deriv_tensor.flat(); + auto net_deriv = net_deriv_tensor.flat(); + auto in_deriv = in_deriv_tensor.flat(); auto nlist = nlist_tensor.flat(); - auto force = force_tensor->flat(); + auto force = force_tensor->flat(); assert (nframes == force_shape.dim_size(0)); assert (nframes == net_deriv_tensor.shape().dim_size(0)); @@ -107,28 +79,33 @@ class ProdForceSeROp : public OpKernel { assert (nloc * nnei == nlist_tensor.shape().dim_size(1)); assert (nnei * 4 == ndescrpt); - for (int II = 0; II < nframes; II++) { - ProdForceSeRLauncher(force_tensor->flat().data() + II * (nall * 3), - net_deriv_tensor.flat().data() + II * (nloc * ndescrpt), - in_deriv_tensor.flat().data() + II * (nloc * ndescrpt * 3), - nlist_tensor.flat().data() + II * (nloc * nnei), - nloc, - nall, - ndescrpt, - nnei, - n_a_sel, - n_a_shift - ); - } - delete[] natoms; + ProdForceSeRFunctor()( + context->eigen_device(), + force_tensor->flat().data(), + net_deriv_tensor.flat().data(), + in_deriv_tensor.flat().data(), + nlist_tensor.flat().data(), + nloc, + nall, + nnei, + ndescrpt + ); } -private: - int n_r_sel, n_a_sel, n_a_shift; }; // Register the CPU kernels. +#define REGISTER_CPU(T) \ +REGISTER_KERNEL_BUILDER( \ + Name("ProdForceSeR").Device(DEVICE_CPU).TypeConstraint("T"), \ + ProdForceSeROp); +REGISTER_CPU(float); +REGISTER_CPU(double); +// Register the GPU kernels. 
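+// Note: natoms is registered with HostMemory below so that nloc/nall can be
+// read directly on the CPU inside Compute(); this replaces the explicit
+// cudaMemcpy of the natoms tensor that the old prod_force_se_r_gpu.cc used.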
+#if GOOGLE_CUDA #define REGISTER_GPU(T) \ REGISTER_KERNEL_BUILDER( \ - Name("ProdForceSeR").Device(DEVICE_GPU).TypeConstraint("T"), \ - ProdForceSeROp); -REGISTER_GPU(VALUETYPE); \ No newline at end of file + Name("ProdForceSeR").Device(DEVICE_GPU).TypeConstraint("T").HostMemory("natoms"), \ + ProdForceSeROp); +REGISTER_GPU(float); +REGISTER_GPU(double); +#endif // GOOGLE_CUDA \ No newline at end of file diff --git a/source/op/prod_virial.cc b/source/op/prod_virial.cc index 8f607e0cf0..f42a5055c2 100644 --- a/source/op/prod_virial.cc +++ b/source/op/prod_virial.cc @@ -6,12 +6,6 @@ using namespace tensorflow; using namespace std; -#ifdef HIGH_PREC -typedef double VALUETYPE; -#else -typedef float VALUETYPE; -#endif - REGISTER_OP("ProdVirial") .Attr("T: {float, double}") .Input("net_deriv: T") @@ -30,7 +24,7 @@ using namespace tensorflow; using CPUDevice = Eigen::ThreadPoolDevice; -template +template class ProdVirialOp : public OpKernel { public: explicit ProdVirialOp(OpKernelConstruction* context) : OpKernel(context) { @@ -89,13 +83,13 @@ class ProdVirialOp : public OpKernel { OP_REQUIRES_OK(context, context->allocate_output(1, atom_virial_shape, &atom_virial_tensor)); // flat the tensors - auto net_deriv = net_deriv_tensor.flat(); - auto in_deriv = in_deriv_tensor.flat(); - auto rij = rij_tensor.flat(); + auto net_deriv = net_deriv_tensor.flat(); + auto in_deriv = in_deriv_tensor.flat(); + auto rij = rij_tensor.flat(); auto nlist = nlist_tensor.flat(); auto axis = axis_tensor.flat(); - auto virial = virial_tensor->flat(); - auto atom_virial = atom_virial_tensor->flat(); + auto virial = virial_tensor->flat(); + auto atom_virial = atom_virial_tensor->flat(); // loop over samples #pragma omp parallel for @@ -133,10 +127,10 @@ class ProdVirialOp : public OpKernel { if (j_idx < 0) continue; if (jj == axis_0) { for (int aa = 0; aa < ndescrpt; ++aa){ - T pref = -1.0 * net_deriv (net_iter + i_idx * ndescrpt + aa); + FPTYPE pref = -1.0 * net_deriv (net_iter + i_idx * ndescrpt + aa); for (int dd0 = 0; dd0 < 3; ++dd0){ for (int dd1 = 0; dd1 < 3; ++dd1){ - T tmp_v = pref * rij (rij_iter + i_idx * nnei * 3 + jj * 3 + dd1) * in_deriv (in_iter + i_idx * ndescrpt * 12 + aa * 12 + 3 + dd0); + FPTYPE tmp_v = pref * rij (rij_iter + i_idx * nnei * 3 + jj * 3 + dd1) * in_deriv (in_iter + i_idx * ndescrpt * 12 + aa * 12 + 3 + dd0); virial (virial_iter + dd0 * 3 + dd1) += tmp_v; atom_virial (atom_virial_iter + j_idx * 9 + dd0 * 3 + dd1) += tmp_v; } @@ -145,10 +139,10 @@ class ProdVirialOp : public OpKernel { } else if (jj == axis_1) { for (int aa = 0; aa < ndescrpt; ++aa){ - T pref = -1.0 * net_deriv (net_iter + i_idx * ndescrpt + aa); + FPTYPE pref = -1.0 * net_deriv (net_iter + i_idx * ndescrpt + aa); for (int dd0 = 0; dd0 < 3; ++dd0){ for (int dd1 = 0; dd1 < 3; ++dd1){ - T tmp_v = pref * rij (rij_iter + i_idx * nnei * 3 + jj * 3 + dd1) * in_deriv (in_iter + i_idx * ndescrpt * 12 + aa * 12 + 6 + dd0); + FPTYPE tmp_v = pref * rij (rij_iter + i_idx * nnei * 3 + jj * 3 + dd1) * in_deriv (in_iter + i_idx * ndescrpt * 12 + aa * 12 + 6 + dd0); virial (virial_iter + dd0 * 3 + dd1) += tmp_v; atom_virial (atom_virial_iter + j_idx * 9 + dd0 * 3 + dd1) += tmp_v; } @@ -159,10 +153,10 @@ class ProdVirialOp : public OpKernel { int aa_start, aa_end; make_descript_range (aa_start, aa_end, jj); for (int aa = aa_start; aa < aa_end; ++aa) { - T pref = -1.0 * net_deriv (net_iter + i_idx * ndescrpt + aa); + FPTYPE pref = -1.0 * net_deriv (net_iter + i_idx * ndescrpt + aa); for (int dd0 = 0; dd0 < 3; ++dd0){ for (int dd1 = 0; dd1 < 
3; ++dd1){ - T tmp_v = pref * rij (rij_iter + i_idx * nnei * 3 + jj * 3 + dd1) * in_deriv (in_iter + i_idx * ndescrpt * 12 + aa * 12 + 9 + dd0); + FPTYPE tmp_v = pref * rij (rij_iter + i_idx * nnei * 3 + jj * 3 + dd1) * in_deriv (in_iter + i_idx * ndescrpt * 12 + aa * 12 + 9 + dd0); virial (virial_iter + dd0 * 3 + dd1) += tmp_v; atom_virial (atom_virial_iter + j_idx * 9 + dd0 * 3 + dd1) += tmp_v; } @@ -194,6 +188,7 @@ class ProdVirialOp : public OpKernel { REGISTER_KERNEL_BUILDER( \ Name("ProdVirial").Device(DEVICE_CPU).TypeConstraint("T"), \ ProdVirialOp); -REGISTER_CPU(VALUETYPE); +REGISTER_CPU(float); +REGISTER_CPU(double); diff --git a/source/op/prod_virial_grad.cc b/source/op/prod_virial_grad.cc index 3d8c7e4639..5d75f5f649 100644 --- a/source/op/prod_virial_grad.cc +++ b/source/op/prod_virial_grad.cc @@ -6,12 +6,6 @@ using namespace tensorflow; using namespace std; -#ifdef HIGH_PREC -typedef double VALUETYPE; -#else -typedef float VALUETYPE; -#endif - REGISTER_OP("ProdVirialGrad") .Attr("T: {float, double}") .Input("grad: T") @@ -27,7 +21,7 @@ REGISTER_OP("ProdVirialGrad") using CPUDevice = Eigen::ThreadPoolDevice; -template +template class ProdVirialGradOp : public OpKernel { public: @@ -94,13 +88,13 @@ class ProdVirialGradOp : public OpKernel OP_REQUIRES_OK(context, context->allocate_output(0, grad_net_shape, &grad_net_tensor)); // flat the tensors - auto grad = grad_tensor .flat(); - auto net_deriv = net_deriv_tensor .flat(); - auto in_deriv = in_deriv_tensor .flat(); - auto rij = rij_tensor .flat(); + auto grad = grad_tensor .flat(); + auto net_deriv = net_deriv_tensor .flat(); + auto in_deriv = in_deriv_tensor .flat(); + auto rij = rij_tensor .flat(); auto nlist = nlist_tensor .flat(); auto axis = axis_tensor .flat(); - auto grad_net = grad_net_tensor ->flat(); + auto grad_net = grad_net_tensor ->flat(); // loop over frames #pragma omp parallel for @@ -194,4 +188,5 @@ class ProdVirialGradOp : public OpKernel REGISTER_KERNEL_BUILDER( \ Name("ProdVirialGrad").Device(DEVICE_CPU).TypeConstraint("T"), \ ProdVirialGradOp); -REGISTER_CPU(VALUETYPE); \ No newline at end of file +REGISTER_CPU(float); +REGISTER_CPU(double); \ No newline at end of file diff --git a/source/op/prod_virial_se_a.cc b/source/op/prod_virial_se_a.cc index d975913d88..ba934fa54e 100644 --- a/source/op/prod_virial_se_a.cc +++ b/source/op/prod_virial_se_a.cc @@ -6,11 +6,6 @@ using namespace tensorflow; using namespace std; -#ifdef HIGH_PREC -typedef double VALUETYPE; -#else -typedef float VALUETYPE; -#endif REGISTER_OP("ProdVirialSeA") .Attr("T: {float, double}") @@ -29,7 +24,7 @@ using namespace tensorflow; using CPUDevice = Eigen::ThreadPoolDevice; using GPUDevice = Eigen::GpuDevice; -template +template class ProdVirialSeAOp : public OpKernel { public: explicit ProdVirialSeAOp(OpKernelConstruction* context) : OpKernel(context) { @@ -85,12 +80,12 @@ class ProdVirialSeAOp : public OpKernel { OP_REQUIRES_OK(context, context->allocate_output(1, atom_virial_shape, &atom_virial_tensor)); // flat the tensors - auto net_deriv = net_deriv_tensor.flat(); - auto in_deriv = in_deriv_tensor.flat(); - auto rij = rij_tensor.flat(); + auto net_deriv = net_deriv_tensor.flat(); + auto in_deriv = in_deriv_tensor.flat(); + auto rij = rij_tensor.flat(); auto nlist = nlist_tensor.flat(); - auto virial = virial_tensor->flat(); - auto atom_virial = atom_virial_tensor->flat(); + auto virial = virial_tensor->flat(); + auto atom_virial = atom_virial_tensor->flat(); // loop over samples #pragma omp parallel for @@ -121,10 +116,10 @@ class 
ProdVirialSeAOp : public OpKernel { int aa_start, aa_end; make_descript_range (aa_start, aa_end, jj); for (int aa = aa_start; aa < aa_end; ++aa) { - T pref = -1.0 * net_deriv (net_iter + i_idx * ndescrpt + aa); + FPTYPE pref = -1.0 * net_deriv (net_iter + i_idx * ndescrpt + aa); for (int dd0 = 0; dd0 < 3; ++dd0){ for (int dd1 = 0; dd1 < 3; ++dd1){ - T tmp_v = pref * rij (rij_iter + i_idx * nnei * 3 + jj * 3 + dd1) * in_deriv (in_iter + i_idx * ndescrpt * 3 + aa * 3 + dd0); + FPTYPE tmp_v = pref * rij (rij_iter + i_idx * nnei * 3 + jj * 3 + dd1) * in_deriv (in_iter + i_idx * ndescrpt * 3 + aa * 3 + dd0); virial (virial_iter + dd0 * 3 + dd1) -= tmp_v; atom_virial (atom_virial_iter + j_idx * 9 + dd0 * 3 + dd1) -= tmp_v; } @@ -156,7 +151,8 @@ class ProdVirialSeAOp : public OpKernel { REGISTER_KERNEL_BUILDER( \ Name("ProdVirialSeA").Device(DEVICE_CPU).TypeConstraint("T"), \ ProdVirialSeAOp); -REGISTER_CPU(VALUETYPE); +REGISTER_CPU(float); +REGISTER_CPU(double); diff --git a/source/op/prod_virial_se_a_grad.cc b/source/op/prod_virial_se_a_grad.cc index 0f506fc51a..ebbd857bf0 100644 --- a/source/op/prod_virial_se_a_grad.cc +++ b/source/op/prod_virial_se_a_grad.cc @@ -6,12 +6,6 @@ using namespace tensorflow; using namespace std; -#ifdef HIGH_PREC -typedef double VALUETYPE; -#else -typedef float VALUETYPE; -#endif - REGISTER_OP("ProdVirialSeAGrad") .Attr("T: {float, double}") .Input("grad: T") @@ -26,7 +20,7 @@ REGISTER_OP("ProdVirialSeAGrad") using CPUDevice = Eigen::ThreadPoolDevice; -template +template class ProdVirialSeAGradOp : public OpKernel { public: @@ -89,12 +83,12 @@ class ProdVirialSeAGradOp : public OpKernel OP_REQUIRES_OK(context, context->allocate_output(0, grad_net_shape, &grad_net_tensor)); // flat the tensors - auto grad = grad_tensor .flat(); - auto net_deriv = net_deriv_tensor .flat(); - auto in_deriv = in_deriv_tensor .flat(); - auto rij = rij_tensor .flat(); + auto grad = grad_tensor .flat(); + auto net_deriv = net_deriv_tensor .flat(); + auto in_deriv = in_deriv_tensor .flat(); + auto rij = rij_tensor .flat(); auto nlist = nlist_tensor .flat(); - auto grad_net = grad_net_tensor ->flat(); + auto grad_net = grad_net_tensor ->flat(); // loop over frames #pragma omp parallel for @@ -158,4 +152,5 @@ class ProdVirialSeAGradOp : public OpKernel REGISTER_KERNEL_BUILDER( \ Name("ProdVirialSeAGrad").Device(DEVICE_CPU).TypeConstraint("T"), \ ProdVirialSeAGradOp); -REGISTER_CPU(VALUETYPE); \ No newline at end of file +REGISTER_CPU(float); +REGISTER_CPU(double); \ No newline at end of file diff --git a/source/op/prod_virial_se_a_multi_device.cc b/source/op/prod_virial_se_a_multi_device.cc index 21cd78c83c..7929fbd588 100644 --- a/source/op/prod_virial_se_a_multi_device.cc +++ b/source/op/prod_virial_se_a_multi_device.cc @@ -13,19 +13,19 @@ REGISTER_OP("ProdVirialSeA") .Output("virial: T") .Output("atom_virial: T"); -template +template struct ProdVirialSeAFunctor { - void operator()(const CPUDevice& d, T * virial, T * atom_virial, const T * net_deriv, const T * in_deriv, const T * rij, const int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const int n_a_sel, const int n_a_shift) { + void operator()(const CPUDevice& d, FPTYPE * virial, FPTYPE * atom_virial, const FPTYPE * net_deriv, const FPTYPE * in_deriv, const FPTYPE * rij, const int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const int n_a_sel, const int n_a_shift) { ProdVirialSeACPULauncher(virial, atom_virial, net_deriv, in_deriv, rij, nlist, nloc, nall, nnei, ndescrpt, 
n_a_sel, n_a_shift); } #if GOOGLE_CUDA - void operator()(const GPUDevice& d, T * virial, T * atom_virial, const T * net_deriv, const T * in_deriv, const T * rij, const int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const int n_a_sel, const int n_a_shift) { + void operator()(const GPUDevice& d, FPTYPE * virial, FPTYPE * atom_virial, const FPTYPE * net_deriv, const FPTYPE * in_deriv, const FPTYPE * rij, const int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const int n_a_sel, const int n_a_shift) { ProdVirialSeAGPULauncher(virial, atom_virial, net_deriv, in_deriv, rij, nlist, nloc, nall, nnei, ndescrpt, n_a_sel, n_a_shift); } #endif // GOOGLE_CUDA }; -template +template class ProdVirialSeAOp : public OpKernel { public: explicit ProdVirialSeAOp(OpKernelConstruction* context) : OpKernel(context) { @@ -80,20 +80,20 @@ class ProdVirialSeAOp : public OpKernel { OP_REQUIRES_OK(context, context->allocate_output(1, atom_virial_shape, &atom_virial_tensor)); // flat the tensors - auto net_deriv = net_deriv_tensor.flat(); - auto in_deriv = in_deriv_tensor.flat(); - auto rij = rij_tensor.flat(); + auto net_deriv = net_deriv_tensor.flat(); + auto in_deriv = in_deriv_tensor.flat(); + auto rij = rij_tensor.flat(); auto nlist = nlist_tensor.flat(); - auto virial = virial_tensor->flat(); - auto atom_virial = atom_virial_tensor->flat(); + auto virial = virial_tensor->flat(); + auto atom_virial = atom_virial_tensor->flat(); - ProdVirialSeAFunctor()( + ProdVirialSeAFunctor()( context->eigen_device(), - virial_tensor->flat().data(), - atom_virial_tensor->flat().data(), - net_deriv_tensor.flat().data(), - in_deriv_tensor.flat().data(), - rij_tensor.flat().data(), + virial_tensor->flat().data(), + atom_virial_tensor->flat().data(), + net_deriv_tensor.flat().data(), + in_deriv_tensor.flat().data(), + rij_tensor.flat().data(), nlist_tensor.flat().data(), nloc, nall, diff --git a/source/op/prod_virial_se_r.cc b/source/op/prod_virial_se_r.cc index 63b74398ea..5337246c21 100644 --- a/source/op/prod_virial_se_r.cc +++ b/source/op/prod_virial_se_r.cc @@ -6,11 +6,6 @@ using namespace tensorflow; using namespace std; -#ifdef HIGH_PREC -typedef double VALUETYPE; -#else -typedef float VALUETYPE; -#endif REGISTER_OP("ProdVirialSeR") .Attr("T: {float, double}") @@ -26,7 +21,7 @@ using namespace tensorflow; using CPUDevice = Eigen::ThreadPoolDevice; -template +template class ProdVirialSeROp : public OpKernel { public: explicit ProdVirialSeROp(OpKernelConstruction* context) : OpKernel(context) { @@ -78,12 +73,12 @@ class ProdVirialSeROp : public OpKernel { OP_REQUIRES_OK(context, context->allocate_output(1, atom_virial_shape, &atom_virial_tensor)); // flat the tensors - auto net_deriv = net_deriv_tensor.flat(); - auto in_deriv = in_deriv_tensor.flat(); - auto rij = rij_tensor.flat(); + auto net_deriv = net_deriv_tensor.flat(); + auto in_deriv = in_deriv_tensor.flat(); + auto rij = rij_tensor.flat(); auto nlist = nlist_tensor.flat(); - auto virial = virial_tensor->flat(); - auto atom_virial = atom_virial_tensor->flat(); + auto virial = virial_tensor->flat(); + auto atom_virial = atom_virial_tensor->flat(); // loop over samples #pragma omp parallel for @@ -110,10 +105,10 @@ class ProdVirialSeROp : public OpKernel { for (int jj = 0; jj < nnei; ++jj){ int j_idx = nlist (nlist_iter + i_idx * nnei + jj); if (j_idx < 0) continue; - T pref = -1.0 * net_deriv (net_iter + i_idx * ndescrpt + jj); + FPTYPE pref = -1.0 * net_deriv (net_iter + i_idx * ndescrpt + jj); for (int 
dd0 = 0; dd0 < 3; ++dd0){ for (int dd1 = 0; dd1 < 3; ++dd1){ - T tmp_v = pref * rij (rij_iter + i_idx * nnei * 3 + jj * 3 + dd1) * in_deriv (in_iter + i_idx * ndescrpt * 3 + jj * 3 + dd0); + FPTYPE tmp_v = pref * rij (rij_iter + i_idx * nnei * 3 + jj * 3 + dd1) * in_deriv (in_iter + i_idx * ndescrpt * 3 + jj * 3 + dd0); virial (virial_iter + dd0 * 3 + dd1) -= tmp_v; atom_virial (atom_virial_iter + j_idx * 9 + dd0 * 3 + dd1) -= tmp_v; } @@ -128,7 +123,8 @@ class ProdVirialSeROp : public OpKernel { REGISTER_KERNEL_BUILDER( \ Name("ProdVirialSeR").Device(DEVICE_CPU).TypeConstraint("T"), \ ProdVirialSeROp); -REGISTER_CPU(VALUETYPE); +REGISTER_CPU(float); +REGISTER_CPU(double); diff --git a/source/op/prod_virial_se_r_grad.cc b/source/op/prod_virial_se_r_grad.cc index b125c9d783..61b8970e71 100644 --- a/source/op/prod_virial_se_r_grad.cc +++ b/source/op/prod_virial_se_r_grad.cc @@ -6,12 +6,6 @@ using namespace tensorflow; using namespace std; -#ifdef HIGH_PREC -typedef double VALUETYPE; -#else -typedef float VALUETYPE; -#endif - REGISTER_OP("ProdVirialSeRGrad") .Attr("T: {float, double}") .Input("grad: T") @@ -24,7 +18,7 @@ REGISTER_OP("ProdVirialSeRGrad") using CPUDevice = Eigen::ThreadPoolDevice; -template +template class ProdVirialSeRGradOp : public OpKernel { public: @@ -83,12 +77,12 @@ class ProdVirialSeRGradOp : public OpKernel OP_REQUIRES_OK(context, context->allocate_output(0, grad_net_shape, &grad_net_tensor)); // flat the tensors - auto grad = grad_tensor .flat(); - auto net_deriv = net_deriv_tensor .flat(); - auto in_deriv = in_deriv_tensor .flat(); - auto rij = rij_tensor .flat(); + auto grad = grad_tensor .flat(); + auto net_deriv = net_deriv_tensor .flat(); + auto in_deriv = in_deriv_tensor .flat(); + auto rij = rij_tensor .flat(); auto nlist = nlist_tensor .flat(); - auto grad_net = grad_net_tensor ->flat(); + auto grad_net = grad_net_tensor ->flat(); // loop over frames #pragma omp parallel for @@ -133,4 +127,5 @@ class ProdVirialSeRGradOp : public OpKernel REGISTER_KERNEL_BUILDER( \ Name("ProdVirialSeRGrad").Device(DEVICE_CPU).TypeConstraint("T"), \ ProdVirialSeRGradOp); -REGISTER_CPU(VALUETYPE); \ No newline at end of file +REGISTER_CPU(float); +REGISTER_CPU(double); \ No newline at end of file diff --git a/source/op/prod_virial_se_r_gpu.cc b/source/op/prod_virial_se_r_multi_device.cc similarity index 53% rename from source/op/prod_virial_se_r_gpu.cc rename to source/op/prod_virial_se_r_multi_device.cc index 6324bcf88d..61e37eb215 100644 --- a/source/op/prod_virial_se_r_gpu.cc +++ b/source/op/prod_virial_se_r_multi_device.cc @@ -1,14 +1,5 @@ -#include "tensorflow/core/framework/op.h" -#include "tensorflow/core/framework/op_kernel.h" -#include "tensorflow/core/framework/shape_inference.h" -#include -#include - -#ifdef HIGH_PREC - typedef double VALUETYPE; -#else - typedef float VALUETYPE; -#endif +#include "common.h" +#include "CustomeOperation.h" REGISTER_OP("ProdVirialSeR") .Attr("T: {float, double}") @@ -20,40 +11,22 @@ REGISTER_OP("ProdVirialSeR") .Output("virial: T") .Output("atom_virial: T"); -using namespace tensorflow; - -using CPUDevice = Eigen::ThreadPoolDevice; -using GPUDevice = Eigen::GpuDevice; - -#define cudaErrcheck(res) { cudaAssert((res), __FILE__, __LINE__); } -inline void cudaAssert(cudaError_t code, const char *file, int line, bool abort=true) -{ - if (code != cudaSuccess) - { - fprintf(stderr,"cuda assert: %s %s %d\n", cudaGetErrorString(code), file, line); - if (abort) exit(code); +template +struct ProdVirialSeRFunctor { + void operator()(const CPUDevice& 
d, T * virial, T * atom_virial, const T * net_deriv, const T * in_deriv, const T * rij, const int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt) { + ProdVirialSeRCPULauncher(virial, atom_virial, net_deriv, in_deriv, rij, nlist, nloc, nall, nnei, ndescrpt); } -} - -void ProdVirialSeRLauncher(VALUETYPE * virial, - VALUETYPE * atom_virial, - const VALUETYPE * net_deriv, - const VALUETYPE * in_deriv, - const VALUETYPE * rij, - const int * nlist, - const int nloc, - const int nall, - const int nnei, - const int ndescrpt, - const int n_a_sel, - const int n_a_shift); + #if GOOGLE_CUDA + void operator()(const GPUDevice& d, T * virial, T * atom_virial, const T * net_deriv, const T * in_deriv, const T * rij, const int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt) { + ProdVirialSeRGPULauncher(virial, atom_virial, net_deriv, in_deriv, rij, nlist, nloc, nall, nnei, ndescrpt); + } + #endif // GOOGLE_CUDA +}; template class ProdVirialSeROp : public OpKernel { public: - explicit ProdVirialSeROp(OpKernelConstruction* context) : OpKernel(context) { - // std::cout << "I'm in prod_virial_se_r_gpu.cc" << std::endl; - } + explicit ProdVirialSeROp(OpKernelConstruction* context) : OpKernel(context) {} void Compute(OpKernelContext* context) override { // Grab the input tensor @@ -63,16 +36,16 @@ class ProdVirialSeROp : public OpKernel { const Tensor& rij_tensor = context->input(context_input_index++); const Tensor& nlist_tensor = context->input(context_input_index++); const Tensor& natoms_tensor = context->input(context_input_index++); + // set size of the sample OP_REQUIRES (context, (net_deriv_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of net deriv should be 2")); OP_REQUIRES (context, (in_deriv_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of input deriv should be 2")); OP_REQUIRES (context, (rij_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of rij should be 2")); OP_REQUIRES (context, (nlist_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of nlist should be 2")); OP_REQUIRES (context, (natoms_tensor.shape().dims() == 1), errors::InvalidArgument ("Dim of natoms should be 1")); - cudaErrcheck(cudaDeviceSynchronize()); + OP_REQUIRES (context, (natoms_tensor.shape().dim_size(0) >= 3), errors::InvalidArgument ("number of atoms should be larger than (or equal to) 3")); - int * natoms = new int[natoms_tensor.shape().dim_size(0)]; - cudaErrcheck(cudaMemcpy(natoms, natoms_tensor.flat().data(), sizeof(int) * natoms_tensor.shape().dim_size(0), cudaMemcpyDeviceToHost)); + const int * natoms = natoms_tensor.flat().data(); int nloc = natoms[0]; int nall = natoms[1]; int nnei = nlist_tensor.shape().dim_size(1) / nloc; @@ -85,52 +58,57 @@ class ProdVirialSeROp : public OpKernel { OP_REQUIRES (context, (nframes == nlist_tensor.shape().dim_size(0)), errors::InvalidArgument ("number of samples should match")); OP_REQUIRES (context, (nloc * ndescrpt * 3 == in_deriv_tensor.shape().dim_size(1)), errors::InvalidArgument ("number of descriptors should match")); - OP_REQUIRES (context, (nloc * nnei * 3 == rij_tensor.shape().dim_size(1)), errors::InvalidArgument ("dim of rij should be nnei * 3")); + OP_REQUIRES (context, (nloc * nnei * 3 == rij_tensor.shape().dim_size(1)), errors::InvalidArgument ("dim of rij should be nnei * 3")); // Create an output tensor - TensorShape virial_shape; + TensorShape virial_shape ; virial_shape.AddDim (nframes); virial_shape.AddDim (9); Tensor* virial_tensor = NULL; OP_REQUIRES_OK(context, 
context->allocate_output(0, virial_shape, &virial_tensor)); - TensorShape atom_virial_shape ; + TensorShape atom_virial_shape; atom_virial_shape.AddDim (nframes); atom_virial_shape.AddDim (9 * nall); Tensor* atom_virial_tensor = NULL; OP_REQUIRES_OK(context, context->allocate_output(1, atom_virial_shape, &atom_virial_tensor)); // flat the tensors - auto net_deriv = net_deriv_tensor.flat(); - auto in_deriv = in_deriv_tensor.flat(); - auto rij = rij_tensor.flat(); + auto net_deriv = net_deriv_tensor.flat(); + auto in_deriv = in_deriv_tensor.flat(); + auto rij = rij_tensor.flat(); auto nlist = nlist_tensor.flat(); - auto virial = virial_tensor->flat(); - auto atom_virial = atom_virial_tensor->flat(); - - for (int II = 0; II < nframes; II++) { - ProdVirialSeRLauncher(virial_tensor->flat().data() + II * 9, - atom_virial_tensor->flat().data() + II * (nall * 9), - net_deriv_tensor.flat().data() + II * (nloc * ndescrpt), - in_deriv_tensor.flat().data() + II * (nloc * ndescrpt * 3), - rij_tensor.flat().data() + II * (nloc * nnei * 3), - nlist_tensor.flat().data() + II * (nloc * nnei), - nloc, - nall, - nnei, - ndescrpt, - n_a_sel, - n_a_shift - ); - } - delete[] natoms; + auto virial = virial_tensor->flat(); + auto atom_virial = atom_virial_tensor->flat(); + + ProdVirialSeRFunctor()( + context->eigen_device(), + virial_tensor->flat().data(), + atom_virial_tensor->flat().data(), + net_deriv_tensor.flat().data(), + in_deriv_tensor.flat().data(), + rij_tensor.flat().data(), + nlist_tensor.flat().data(), + nloc, + nall, + nnei, + ndescrpt + ); } -private: - int n_r_sel, n_a_sel, n_a_shift; }; +// Register the CPU kernels. +#define REGISTER_CPU(T) \ +REGISTER_KERNEL_BUILDER( \ + Name("ProdVirialSeR").Device(DEVICE_CPU).TypeConstraint("T"), \ + ProdVirialSeROp); +REGISTER_CPU(float); +REGISTER_CPU(double); // Register the GPU kernels. 
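// The block below is an illustrative, self-contained sketch of the device-dispatch pattern used by
// ProdVirialSeRFunctor above; it is not part of this diff. Every name in it (ExampleCPUDevice,
// ExampleGPUDevice, ExampleFunctor, example_cpu_launcher, example_gpu_launcher) is hypothetical.
// The idea: one functor templated on the value type, with one operator() overload per device type
// forwarding to the matching launcher, so a single templated OpKernel can serve both DEVICE_CPU and
// DEVICE_GPU by passing context->eigen_device<Device>() to the functor.
#include <cstdio>

struct ExampleCPUDevice {};                     // stand-in for Eigen::ThreadPoolDevice

// Hypothetical CPU launcher: in the real ops this is a plain C++ loop in a .cc file.
template <typename FPTYPE>
void example_cpu_launcher(FPTYPE * out, const FPTYPE * in, int n) {
  for (int ii = 0; ii < n; ++ii) out[ii] = in[ii] + in[ii];
}

#if GOOGLE_CUDA
struct ExampleGPUDevice {};                     // stand-in for Eigen::GpuDevice
// Hypothetical GPU launcher: in the real ops this wraps a CUDA kernel launch in a .cu file.
template <typename FPTYPE>
void example_gpu_launcher(FPTYPE * out, const FPTYPE * in, int n);
#endif

template <typename FPTYPE>
struct ExampleFunctor {
  void operator()(const ExampleCPUDevice &, FPTYPE * out, const FPTYPE * in, int n) {
    example_cpu_launcher(out, in, n);           // CPU overload
  }
#if GOOGLE_CUDA
  void operator()(const ExampleGPUDevice &, FPTYPE * out, const FPTYPE * in, int n) {
    example_gpu_launcher(out, in, n);           // GPU overload, compiled only with CUDA support
  }
#endif
};

int main() {
  // Usage mirrors the Compute() call shown above: the device argument selects the overload.
  float in[3] = {1.f, 2.f, 3.f}, out[3];
  ExampleFunctor<float>()(ExampleCPUDevice(), out, in, 3);
  std::printf("%g %g %g\n", out[0], out[1], out[2]);
  return 0;
}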
+#if GOOGLE_CUDA #define REGISTER_GPU(T) \ REGISTER_KERNEL_BUILDER( \ - Name("ProdVirialSeR").Device(DEVICE_GPU).TypeConstraint("T"), \ - ProdVirialSeROp); -REGISTER_GPU(VALUETYPE); \ No newline at end of file + Name("ProdVirialSeR").Device(DEVICE_GPU).TypeConstraint("T").HostMemory("natoms"), \ + ProdVirialSeROp); +REGISTER_GPU(float); +REGISTER_GPU(double); +#endif // GOOGLE_CUDA \ No newline at end of file diff --git a/source/op/soft_min.cc b/source/op/soft_min.cc index 17ab6bc1f5..6f3bc58932 100644 --- a/source/op/soft_min.cc +++ b/source/op/soft_min.cc @@ -8,11 +8,6 @@ using namespace tensorflow; using namespace std; -#ifdef HIGH_PREC -typedef double VALUETYPE; -#else -typedef float VALUETYPE; -#endif REGISTER_OP("SoftMinSwitch") .Attr("T: {float, double}") @@ -32,7 +27,7 @@ using namespace tensorflow; using CPUDevice = Eigen::ThreadPoolDevice; -template +template class SoftMinSwitchOp : public OpKernel { public: explicit SoftMinSwitchOp(OpKernelConstruction* context) : OpKernel(context) { @@ -95,10 +90,10 @@ class SoftMinSwitchOp : public OpKernel { // flat the tensors auto type = type_tensor .matrix(); - auto rij = rij_tensor .matrix(); + auto rij = rij_tensor .matrix(); auto nlist = nlist_tensor .matrix(); - auto sw_value = sw_value_tensor ->matrix(); - auto sw_deriv = sw_deriv_tensor ->matrix(); + auto sw_value = sw_value_tensor ->matrix(); + auto sw_deriv = sw_deriv_tensor ->matrix(); // loop over samples #pragma omp parallel for @@ -115,26 +110,26 @@ class SoftMinSwitchOp : public OpKernel { // compute force of a frame for (int ii = 0; ii < nloc; ++ii){ int i_idx = ii; - T aa = 0; - T bb = 0; + FPTYPE aa = 0; + FPTYPE bb = 0; for (int jj = 0; jj < nnei; ++jj){ int j_idx = nlist (kk, i_idx * nnei + jj); if (j_idx < 0) continue; int rij_idx_shift = (i_idx * nnei + jj) * 3; - T dr[3] = { + FPTYPE dr[3] = { rij(kk, rij_idx_shift + 0), rij(kk, rij_idx_shift + 1), rij(kk, rij_idx_shift + 2) }; - T rr2 = dr[0] * dr[0] + dr[1] * dr[1] + dr[2] * dr[2]; - T rr = sqrt(rr2); - T ee = exp(-rr / alpha); + FPTYPE rr2 = dr[0] * dr[0] + dr[1] * dr[1] + dr[2] * dr[2]; + FPTYPE rr = sqrt(rr2); + FPTYPE ee = exp(-rr / alpha); aa += ee; bb += rr * ee; } - T smin = bb / aa; - T vv, dd; - spline5_switch(vv, dd, smin, static_cast(rmin), static_cast(rmax)); + FPTYPE smin = bb / aa; + FPTYPE vv, dd; + spline5_switch(vv, dd, smin, static_cast(rmin), static_cast(rmax)); // value of switch sw_value(kk, i_idx) = vv; // deriv of switch distributed as force @@ -142,17 +137,17 @@ class SoftMinSwitchOp : public OpKernel { int j_idx = nlist (kk, i_idx * nnei + jj); if (j_idx < 0) continue; int rij_idx_shift = (ii * nnei + jj) * 3; - T dr[3] = { + FPTYPE dr[3] = { rij(kk, rij_idx_shift + 0), rij(kk, rij_idx_shift + 1), rij(kk, rij_idx_shift + 2) }; - T rr2 = dr[0] * dr[0] + dr[1] * dr[1] + dr[2] * dr[2]; - T rr = sqrt(rr2); - T ee = exp(-rr / alpha); - T pref_c = (1./rr - 1./alpha) * ee ; - T pref_d = 1./(rr * alpha) * ee; - T ts; + FPTYPE rr2 = dr[0] * dr[0] + dr[1] * dr[1] + dr[2] * dr[2]; + FPTYPE rr = sqrt(rr2); + FPTYPE ee = exp(-rr / alpha); + FPTYPE pref_c = (1./rr - 1./alpha) * ee ; + FPTYPE pref_d = 1./(rr * alpha) * ee; + FPTYPE ts; ts = dd / (aa * aa) * (aa * pref_c + bb * pref_d); sw_deriv(kk, rij_idx_shift + 0) += ts * dr[0]; sw_deriv(kk, rij_idx_shift + 1) += ts * dr[1]; @@ -190,6 +185,7 @@ class SoftMinSwitchOp : public OpKernel { REGISTER_KERNEL_BUILDER( \ Name("SoftMinSwitch").Device(DEVICE_CPU).TypeConstraint("T"), \ SoftMinSwitchOp); -REGISTER_CPU(VALUETYPE); +REGISTER_CPU(float); 
+REGISTER_CPU(double); diff --git a/source/op/soft_min_force.cc b/source/op/soft_min_force.cc index 43344e7352..ffd6442a1a 100644 --- a/source/op/soft_min_force.cc +++ b/source/op/soft_min_force.cc @@ -6,12 +6,6 @@ using namespace tensorflow; using namespace std; -#ifdef HIGH_PREC -typedef double VALUETYPE; -#else -typedef float VALUETYPE; -#endif - REGISTER_OP("SoftMinForce") .Attr("T: {float, double}") .Input("du: T") @@ -22,12 +16,11 @@ REGISTER_OP("SoftMinForce") .Attr("n_r_sel: int") .Output("force: T"); - using namespace tensorflow; using CPUDevice = Eigen::ThreadPoolDevice; -template +template class SoftMinForceOp : public OpKernel { public: explicit SoftMinForceOp(OpKernelConstruction* context) : OpKernel(context) { @@ -72,10 +65,10 @@ class SoftMinForceOp : public OpKernel { OP_REQUIRES_OK(context, context->allocate_output(0, force_shape, &force_tensor)); // flat the tensors - auto du = du_tensor.matrix(); - auto sw_deriv = sw_deriv_tensor.matrix(); + auto du = du_tensor.matrix(); + auto sw_deriv = sw_deriv_tensor.matrix(); auto nlist = nlist_tensor.matrix(); - auto force = force_tensor->matrix(); + auto force = force_tensor->matrix(); // loop over samples #pragma omp parallel for @@ -117,4 +110,5 @@ class SoftMinForceOp : public OpKernel { REGISTER_KERNEL_BUILDER( \ Name("SoftMinForce").Device(DEVICE_CPU).TypeConstraint("T"), \ SoftMinForceOp); -REGISTER_CPU(VALUETYPE); +REGISTER_CPU(float); +REGISTER_CPU(double); diff --git a/source/op/soft_min_force_grad.cc b/source/op/soft_min_force_grad.cc index bfaa8aca76..2461828853 100644 --- a/source/op/soft_min_force_grad.cc +++ b/source/op/soft_min_force_grad.cc @@ -6,12 +6,6 @@ using namespace tensorflow; using namespace std; -#ifdef HIGH_PREC -typedef double VALUETYPE; -#else -typedef float VALUETYPE; -#endif - REGISTER_OP("SoftMinForceGrad") .Attr("T: {float, double}") .Input("grad: T") @@ -25,7 +19,7 @@ REGISTER_OP("SoftMinForceGrad") using CPUDevice = Eigen::ThreadPoolDevice; -template +template class SoftMinForceGradOp : public OpKernel { public: @@ -81,11 +75,11 @@ class SoftMinForceGradOp : public OpKernel OP_REQUIRES_OK(context, context->allocate_output(0, grad_net_shape, &grad_net_tensor)); // flat the tensors - auto grad = grad_tensor .matrix(); - auto du = du_tensor .matrix(); - auto sw_deriv = sw_deriv_tensor .matrix(); + auto grad = grad_tensor .matrix(); + auto du = du_tensor .matrix(); + auto sw_deriv = sw_deriv_tensor .matrix(); auto nlist = nlist_tensor .matrix(); - auto grad_net = grad_net_tensor ->matrix(); + auto grad_net = grad_net_tensor ->matrix(); // loop over frames #pragma omp parallel for @@ -122,4 +116,5 @@ class SoftMinForceGradOp : public OpKernel REGISTER_KERNEL_BUILDER( \ Name("SoftMinForceGrad").Device(DEVICE_CPU).TypeConstraint("T"), \ SoftMinForceGradOp); -REGISTER_CPU(VALUETYPE); \ No newline at end of file +REGISTER_CPU(float); +REGISTER_CPU(double); \ No newline at end of file diff --git a/source/op/soft_min_virial.cc b/source/op/soft_min_virial.cc index 7829c4b7ed..f369808685 100644 --- a/source/op/soft_min_virial.cc +++ b/source/op/soft_min_virial.cc @@ -6,12 +6,6 @@ using namespace tensorflow; using namespace std; -#ifdef HIGH_PREC -typedef double VALUETYPE; -#else -typedef float VALUETYPE; -#endif - REGISTER_OP("SoftMinVirial") .Attr("T: {float, double}") .Input("du: T") @@ -28,7 +22,7 @@ using namespace tensorflow; using CPUDevice = Eigen::ThreadPoolDevice; -template +template class SoftMinVirialOp : public OpKernel { public: explicit SoftMinVirialOp(OpKernelConstruction* context) : 
OpKernel(context) { @@ -83,12 +77,12 @@ class SoftMinVirialOp : public OpKernel { OP_REQUIRES_OK(context, context->allocate_output(1, atom_virial_shape, &atom_virial_tensor)); // flat the tensors - auto du = du_tensor.matrix(); - auto sw_deriv = sw_deriv_tensor.matrix(); - auto rij = rij_tensor.matrix(); + auto du = du_tensor.matrix(); + auto sw_deriv = sw_deriv_tensor.matrix(); + auto rij = rij_tensor.matrix(); auto nlist = nlist_tensor.matrix(); - auto virial = virial_tensor->matrix(); - auto atom_virial = atom_virial_tensor->matrix(); + auto virial = virial_tensor->matrix(); + auto atom_virial = atom_virial_tensor->matrix(); // loop over samples #pragma omp parallel for @@ -111,7 +105,7 @@ class SoftMinVirialOp : public OpKernel { int rij_idx_shift = (ii * nnei + jj) * 3; for (int dd0 = 0; dd0 < 3; ++dd0){ for (int dd1 = 0; dd1 < 3; ++dd1){ - VALUETYPE tmp_v = du(kk, i_idx) * sw_deriv(kk, rij_idx_shift + dd0) * rij(kk, rij_idx_shift + dd1); + FPTYPE tmp_v = du(kk, i_idx) * sw_deriv(kk, rij_idx_shift + dd0) * rij(kk, rij_idx_shift + dd1); virial(kk, dd0 * 3 + dd1) -= tmp_v; atom_virial(kk, j_idx * 9 + dd0 * 3 + dd1) -= tmp_v; } @@ -129,6 +123,7 @@ class SoftMinVirialOp : public OpKernel { REGISTER_KERNEL_BUILDER( \ Name("SoftMinVirial").Device(DEVICE_CPU).TypeConstraint("T"), \ SoftMinVirialOp); -REGISTER_CPU(VALUETYPE); +REGISTER_CPU(float); +REGISTER_CPU(double); diff --git a/source/op/soft_min_virial_grad.cc b/source/op/soft_min_virial_grad.cc index b454612895..0bafaa5bfa 100644 --- a/source/op/soft_min_virial_grad.cc +++ b/source/op/soft_min_virial_grad.cc @@ -6,11 +6,6 @@ using namespace tensorflow; using namespace std; -#ifdef HIGH_PREC -typedef double VALUETYPE; -#else -typedef float VALUETYPE; -#endif REGISTER_OP("SoftMinVirialGrad") .Attr("T: {float, double}") @@ -26,7 +21,7 @@ REGISTER_OP("SoftMinVirialGrad") using CPUDevice = Eigen::ThreadPoolDevice; -template +template class SoftMinVirialGradOp : public OpKernel { public: @@ -88,12 +83,12 @@ class SoftMinVirialGradOp : public OpKernel OP_REQUIRES_OK(context, context->allocate_output(0, grad_net_shape, &grad_net_tensor)); // flat the tensors - auto grad = grad_tensor .matrix(); - auto du = du_tensor .matrix(); - auto sw_deriv = sw_deriv_tensor .matrix(); - auto rij = rij_tensor .matrix(); + auto grad = grad_tensor .matrix(); + auto du = du_tensor .matrix(); + auto sw_deriv = sw_deriv_tensor .matrix(); + auto rij = rij_tensor .matrix(); auto nlist = nlist_tensor .matrix(); - auto grad_net = grad_net_tensor ->matrix(); + auto grad_net = grad_net_tensor ->matrix(); // loop over frames #pragma omp parallel for @@ -144,4 +139,5 @@ class SoftMinVirialGradOp : public OpKernel REGISTER_KERNEL_BUILDER( \ Name("SoftMinVirialGrad").Device(DEVICE_CPU).TypeConstraint("T"), \ SoftMinVirialGradOp); -REGISTER_CPU(VALUETYPE); \ No newline at end of file +REGISTER_CPU(float); +REGISTER_CPU(double); \ No newline at end of file diff --git a/source/op/tab_inter.cc b/source/op/tab_inter.cc index 5e86608e9a..63c7af210c 100644 --- a/source/op/tab_inter.cc +++ b/source/op/tab_inter.cc @@ -6,11 +6,6 @@ using namespace tensorflow; using namespace std; -#ifdef HIGH_PREC -typedef double VALUETYPE; -#else -typedef float VALUETYPE; -#endif REGISTER_OP("TabInter") .Attr("T: {float, double}") @@ -73,7 +68,7 @@ void tabulated_inter (double & ener, fscale *= -hi; } -template +template class TabInterOp : public OpKernel { public: explicit TabInterOp(OpKernelConstruction* context) : OpKernel(context) { @@ -144,15 +139,15 @@ class TabInterOp : public OpKernel { 
OP_REQUIRES_OK(context, context->allocate_output(tmp_idx++, virial_shape, &virial_tensor)); // flat the tensors - auto table_info = table_info_tensor.flat(); - auto table_data = table_data_tensor.flat(); + auto table_info = table_info_tensor.flat(); + auto table_data = table_data_tensor.flat(); auto type = type_tensor .matrix(); - auto rij = rij_tensor .matrix(); + auto rij = rij_tensor .matrix(); auto nlist = nlist_tensor .matrix(); - auto scale = scale_tensor .matrix(); - auto energy = energy_tensor ->matrix(); - auto force = force_tensor ->matrix(); - auto virial = virial_tensor ->matrix(); + auto scale = scale_tensor .matrix(); + auto energy = energy_tensor ->matrix(); + auto force = force_tensor ->matrix(); + auto virial = virial_tensor ->matrix(); OP_REQUIRES (context, (ntypes == int(table_info(3)+0.1)), errors::InvalidArgument ("ntypes provided in table does not match deeppot")); int nspline = table_info(2)+0.1; @@ -191,7 +186,7 @@ class TabInterOp : public OpKernel { for (int tt = 0; tt < ntypes; ++tt) { for (int ii = 0; ii < natoms(2+tt); ++ii){ int i_type = type(kk, i_idx); - T i_scale = scale(kk, i_idx); + FPTYPE i_scale = scale(kk, i_idx); assert(i_type == tt) ; int jiter = 0; // a neighbor @@ -311,7 +306,8 @@ class TabInterOp : public OpKernel { REGISTER_KERNEL_BUILDER( \ Name("TabInter").Device(DEVICE_CPU).TypeConstraint("T"), \ TabInterOp); -REGISTER_CPU(VALUETYPE); +REGISTER_CPU(float); +REGISTER_CPU(double); From a3f467e3e68a64d5f980c1dff68458e45e8e367d Mon Sep 17 00:00:00 2001 From: Han Wang Date: Wed, 16 Sep 2020 13:32:53 +0800 Subject: [PATCH 04/65] global_polar: print loss not normalized by sqrt(natoms). add dp test for global_polar --- source/train/Loss.py | 7 ++++--- source/train/test.py | 27 +++++++++++++++++++++++---- 2 files changed, 27 insertions(+), 7 deletions(-) diff --git a/source/train/Loss.py b/source/train/Loss.py index d939273f26..1f336325a3 100644 --- a/source/train/Loss.py +++ b/source/train/Loss.py @@ -301,11 +301,12 @@ def build (self, polar_hat = label_dict[self.label_name] polar = model_dict[self.tensor_name] l2_loss = tf.reduce_mean( tf.square(self.scale*(polar - polar_hat)), name='l2_'+suffix) + more_loss = {'nonorm': l2_loss} if not self.atomic : atom_norm = 1./ global_cvt_2_tf_float(natoms[0]) l2_loss = l2_loss * atom_norm self.l2_l = l2_loss - more_loss = {} + self.l2_more = more_loss['nonorm'] return l2_loss, more_loss @@ -321,10 +322,10 @@ def print_on_training(self, feed_dict_test, feed_dict_batch) : error_test\ - = sess.run([self.l2_l], \ + = sess.run([self.l2_more], \ feed_dict=feed_dict_test) error_train\ - = sess.run([self.l2_l], \ + = sess.run([self.l2_more], \ feed_dict=feed_dict_batch) print_str = "" prop_fmt = " %9.2e %9.2e" diff --git a/source/train/test.py b/source/train/test.py index d8639020ce..c01b81d4d0 100644 --- a/source/train/test.py +++ b/source/train/test.py @@ -12,6 +12,7 @@ from deepmd import DeepPot from deepmd import DeepDipole from deepmd import DeepPolar +from deepmd import DeepGlobalPolar from deepmd import DeepWFC from tensorflow.python.framework import ops @@ -28,6 +29,8 @@ def test (args): dp = DeepDipole(args.model) elif de.model_type == 'polar': dp = DeepPolar(args.model) + elif de.model_type == 'global_polar': + dp = DeepGlobalPolar(args.model) elif de.model_type == 'wfc': dp = DeepWFC(args.model) else : @@ -41,7 +44,9 @@ def test (args): elif de.model_type == 'dipole': err, siz = test_dipole(dp, args) elif de.model_type == 'polar': - err, siz = test_polar(dp, args) + err, siz = test_polar(dp, args, 
global_polar=False) + elif de.model_type == 'global_polar': + err, siz = test_polar(dp, args, global_polar=True) elif de.model_type == 'wfc': err, siz = test_wfc(dp, args) else : @@ -61,6 +66,8 @@ def test (args): print_dipole_sys_avg(avg_err) elif de.model_type == 'polar': print_polar_sys_avg(avg_err) + elif de.model_type == 'global_polar': + print_polar_sys_avg(avg_err) elif de.model_type == 'wfc': print_wfc_sys_avg(avg_err) else : @@ -223,12 +230,15 @@ def print_wfc_sys_avg(avg): print ("WFC L2err : %e eV/A" % avg[0]) -def test_polar (dp, args) : +def test_polar (dp, args, global_polar = False) : if args.rand_seed is not None : np.random.seed(args.rand_seed % (2**32)) data = DeepmdData(args.system, args.set_prefix, shuffle_test = args.shuffle_test) - data.add('polarizability', 9, atomic=True, must=True, high_prec=False, type_sel = dp.get_sel_type()) + if not global_polar: + data.add('polarizability', 9, atomic=True, must=True, high_prec=False, type_sel = dp.get_sel_type()) + else: + data.add('polarizability', 9, atomic=False, must=True, high_prec=False, type_sel = dp.get_sel_type()) test_data = data.get_test () numb_test = args.numb_test natoms = len(test_data["type"][0]) @@ -239,12 +249,21 @@ def test_polar (dp, args) : box = test_data["box"][:numb_test] atype = test_data["type"][0] polar = dp.eval(coord, box, atype) + sel_type = dp.get_sel_type() + sel_natoms = 0 + for ii in sel_type: + sel_natoms += sum(atype == ii) polar = polar.reshape([numb_test,-1]) l2f = (l2err (polar - test_data["polarizability"] [:numb_test])) + l2fs = l2f/np.sqrt(sel_natoms) + l2fa = l2f/sel_natoms print ("# number of test data : %d " % numb_test) - print ("Polarizability L2err : %e eV/A" % l2f) + print ("Polarizability L2err : %e eV/A" % l2f) + if global_polar: + print ("Polarizability L2err/sqrtN : %e eV/A" % l2fs) + print ("Polarizability L2err/N : %e eV/A" % l2fa) detail_file = args.detail_file if detail_file is not None : From c9fd68d391f09787f13cea1aa983bf59c2943588 Mon Sep 17 00:00:00 2001 From: Han Wang Date: Thu, 17 Sep 2020 14:47:40 +0800 Subject: [PATCH 05/65] correct way of getting site package path --- setup.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/setup.py b/setup.py index 99a86d3da8..2e147160e1 100644 --- a/setup.py +++ b/setup.py @@ -4,17 +4,16 @@ from setuptools_scm import get_version from packaging.version import LegacyVersion from os import path, makedirs -import imp, sys, platform +import os, imp, sys, platform, sysconfig def get_dp_install_path() : - site_packages_path = path.join(path.dirname(path.__file__), 'site-packages') - dp_scm_version = get_version(root="./", relative_to=__file__) + site_packages_path = sysconfig.get_paths()['purelib'] + dp_scm_version = get_version(root=".", relative_to=__file__) python_version = 'py' + str(sys.version_info.major + sys.version_info.minor * 0.1) os_info = sys.platform machine_info = platform.machine() - dp_pip_install_path = site_packages_path + '/deepmd' - dp_setup_install_path = site_packages_path + '/deepmd_kit-' + dp_scm_version + '-' + python_version + '-' + os_info + '-' + machine_info + '.egg/deepmd' - + dp_pip_install_path = os.path.join(site_packages_path, 'deepmd') + dp_setup_install_path = os.path.join(site_packages_path, 'deepmd_kit-' + dp_scm_version + '-' + python_version + '-' + os_info + '-' + machine_info + '.egg', 'deepmd') return dp_pip_install_path, dp_setup_install_path readme_file = path.join(path.dirname(path.abspath(__file__)), 'README.md') From 
173afe5f270ba43d4e332d3e5098203eb225d5b5 Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Wed, 30 Sep 2020 09:42:36 -0400 Subject: [PATCH 06/65] set RPATH to $ORIGIN --- setup.py | 12 ------------ source/op/CMakeLists.txt | 2 +- 2 files changed, 1 insertion(+), 13 deletions(-) diff --git a/setup.py b/setup.py index 2e147160e1..858f84aa5a 100644 --- a/setup.py +++ b/setup.py @@ -6,15 +6,6 @@ from os import path, makedirs import os, imp, sys, platform, sysconfig -def get_dp_install_path() : - site_packages_path = sysconfig.get_paths()['purelib'] - dp_scm_version = get_version(root=".", relative_to=__file__) - python_version = 'py' + str(sys.version_info.major + sys.version_info.minor * 0.1) - os_info = sys.platform - machine_info = platform.machine() - dp_pip_install_path = os.path.join(site_packages_path, 'deepmd') - dp_setup_install_path = os.path.join(site_packages_path, 'deepmd_kit-' + dp_scm_version + '-' + python_version + '-' + os_info + '-' + machine_info + '.egg', 'deepmd') - return dp_pip_install_path, dp_setup_install_path readme_file = path.join(path.dirname(path.abspath(__file__)), 'README.md') try: @@ -45,7 +36,6 @@ def get_dp_install_path() : except OSError: pass -dp_pip_install_path, dp_setup_install_path = get_dp_install_path() setup( name="deepmd-kit", @@ -69,8 +59,6 @@ def get_dp_install_path() : '-DBUILD_PY_IF:BOOL=TRUE', '-DBUILD_CPP_IF:BOOL=FALSE', '-DFLOAT_PREC:STRING=high', - '-DDP_PIP_INSTALL_PATH=%s' % dp_pip_install_path, - '-DDP_SETUP_INSTALL_PATH=%s' % dp_setup_install_path, ], cmake_source_dir='source', cmake_minimum_required_version='3.0', diff --git a/source/op/CMakeLists.txt b/source/op/CMakeLists.txt index c4d97cd85a..1ddaaf3255 100644 --- a/source/op/CMakeLists.txt +++ b/source/op/CMakeLists.txt @@ -24,7 +24,7 @@ endif (BUILD_CPP_IF) if (BUILD_PY_IF) set(CMAKE_BUILD_WITH_INSTALL_RPATH TRUE) - set(CMAKE_INSTALL_RPATH DESTINATION ${DP_PIP_INSTALL_PATH} ${DP_SETUP_INSTALL_PATH} ${CMAKE_BINARY_DIR}/op/cuda) + set(CMAKE_INSTALL_RPATH $ORIGIN) if (USE_CUDA_TOOLKIT) add_library(op_abi SHARED ${OP_SRC} ${OP_LIB}) add_library(op_grads SHARED ${OP_GRADS_SRC}) From 2039b4342148862f41d116c8a335d7465ca4cae4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mari=C3=A1n=20Rynik?= Date: Wed, 7 Oct 2020 15:27:55 +0200 Subject: [PATCH 07/65] enable setting test size individually for each system --- .gitignore | 1 + README.md | 9 ++--- source/train/DataSystem.py | 68 ++++++++++++++++++++++++++++++-------- source/train/Trainer.py | 23 ++++++++----- source/train/common.py | 16 ++++++--- 5 files changed, 86 insertions(+), 31 deletions(-) diff --git a/.gitignore b/.gitignore index c41f53e077..435a560708 100644 --- a/.gitignore +++ b/.gitignore @@ -23,3 +23,4 @@ dist .eggs _version.py venv* +.vscode/** diff --git a/README.md b/README.md index 5520daa57f..abf3fcc35a 100644 --- a/README.md +++ b/README.md @@ -398,10 +398,10 @@ Since we do not have virial data, the virial prefactors `start_pref_v` and `limi An example of `training` is ```json "training" : { - "systems": ["../data/"], + "systems": ["../data1/", "../data2/"], "set_prefix": "set", "stop_batch": 1000000, - "batch_size": 1, + "batch_size": 1, # or "auto" or [10, 20] "seed": 1, @@ -409,7 +409,7 @@ An example of `training` is "_comment": " frequencies counted in batch", "disp_file": "lcurve.out", "disp_freq": 100, - "numb_test": 10, + "numb_test": 10, # or "XX%" or [10, 20] "save_freq": 1000, "save_ckpt": "model.ckpt", "load_ckpt": "model.ckpt", @@ -422,9 +422,10 @@ An example of `training` is ``` The option **`systems`** provide 
location of the systems (path to `set.*` and `type.raw`). It is a vector, thus DeePMD-kit allows you to provide multiple systems. DeePMD-kit will train the model with the systems in the vector one by one in a cyclic manner. **It is warned that the example water data (in folder `examples/data/water`) is of very limited amount, is provided only for testing purpose, and should not be used to train a productive model.** -The option **`batch_size`** specifies the number of frames in each batch. It can be set to `"auto"` to enable a automatic batch size. +The option **`batch_size`** specifies the number of frames in each batch. It can be set to `"auto"` to enable an automatic batch size, or it can be input as a list setting the batch size individually for each system. The option **`stop_batch`** specifies the total number of batches will be used in the training. +The option **`numb_test`** specifies the number of tests that will be used for each system. If it is an integer, each system will be tested with the same number of tests. It can be set to a percentage `"XX%"` to use XX% of the frames of each system for its testing, or it can be input as a list setting the number of tests individually for each system (the order should correspond to the ordering of the `systems` key in the json). ### Training diff --git a/source/train/DataSystem.py b/source/train/DataSystem.py index 61d59f5cea..03971d196d 100644 --- a/source/train/DataSystem.py +++ b/source/train/DataSystem.py @@ -65,6 +65,29 @@ def __init__ (self, type_map_list.append(self.data_systems[ii].get_type_map()) self.type_map = self._check_type_map_consistency(type_map_list) + # ! altered by Marián Rynik + # test size + # now test size can be set as a percentage of systems data or test size + # can be set for each system individually in the same manner as batch + # size. This enables one to use systems with diverse number of + # structures and different number of atoms. + self.test_size = test_size + if isinstance(self.test_size, int): + self.test_size = self.test_size * np.ones(self.nsystems, dtype=int) + elif isinstance(self.test_size, str): + words = self.test_size.split('%') + try: + percent = int(words[0]) + except ValueError: + raise RuntimeError('unknown test_size rule ' + words[0]) + self.test_size = self._make_auto_ts(percent) + elif isinstance(self.test_size, list): + pass + else : + raise RuntimeError('invalid test_size') + assert(isinstance(self.test_size, (list,np.ndarray))) + assert(len(self.test_size) == self.nsystems) + # prob of batch, init pick idx self.prob_nbatches = [ float(i) for i in self.nbatches] / np.sum(self.nbatches) self.pick_idx = 0 @@ -75,10 +98,10 @@ def __init__ (self, if chk_ret is not None : warnings.warn("system %s required batch size is larger than the size of the dataset %s (%d > %d)" % \ (self.system_dirs[ii], chk_ret[0], self.batch_size[ii], chk_ret[1])) - chk_ret = self.data_systems[ii].check_test_size(test_size) + chk_ret = self.data_systems[ii].check_test_size(self.test_size[ii]) if chk_ret is not None : warnings.warn("system %s required test size is larger than the size of the dataset %s (%d > %d)" % \ - (self.system_dirs[ii], chk_ret[0], test_size, chk_ret[1])) + (self.system_dirs[ii], chk_ret[0], self.test_size[ii], chk_ret[1])) def _load_test(self, ntests = -1): @@ -207,17 +230,23 @@ def get_batch (self, b_data["default_mesh"] = self.default_mesh[self.pick_idx] return b_data + # ! 
altered by Marián Rynik def get_test (self, - sys_idx = None, - ntests = -1) : - if not hasattr(self, 'default_mesh') : - self._make_default_mesh() - if not hasattr(self, 'test_data') : - self._load_test(ntests = ntests) + sys_idx = None) : + + # need to get idx first to get the appropriate test size for the + # current system if sys_idx is not None : idx = sys_idx else : + # idx get selected in get batch, it is the index of a system idx = self.pick_idx + + if not hasattr(self, 'default_mesh') : + self._make_default_mesh() + if not hasattr(self, 'test_data') : + self._load_test(ntests = self.test_size[idx]) + test_system_data = {} for nn in self.test_data: test_system_data[nn] = self.test_data[nn][idx] @@ -261,20 +290,21 @@ def print_summary(self, # width 65 sys_width = 42 tmp_msg += "---Summary of DataSystem------------------------------------------------\n" - tmp_msg += "find %d system(s):\n" % self.nsystems + tmp_msg += "found %d system(s):\n" % self.nsystems tmp_msg += "%s " % self._format_name_length('system', sys_width) - tmp_msg += "%s %s %s %5s\n" % ('natoms', 'bch_sz', 'n_bch', 'prob') + tmp_msg += "%s %s %s %s %5s\n" % ('natoms', 'bch_sz', 'n_bch', "n_test", 'prob') for ii in range(self.nsystems) : - tmp_msg += ("%s %6d %6d %5d %5.3f\n" % + tmp_msg += ("%s %6d %6d %6d %6d %5.3f\n" % (self._format_name_length(self.system_dirs[ii], sys_width), self.natoms[ii], - self.batch_size[ii], - self.nbatches[ii], + # TODO batch size * nbatches = number of structures + self.batch_size[ii], + self.nbatches[ii], + self.test_size[ii], prob[ii]) ) tmp_msg += "------------------------------------------------------------------------\n" run_opt.message(tmp_msg) - def _make_auto_bs(self, rule) : bs = [] for ii in self.data_systems: @@ -285,6 +315,16 @@ def _make_auto_bs(self, rule) : bs.append(bsi) return bs + # ! added by Marián Rynik + def _make_auto_ts(self, percent): + ts = [] + for ii in range(self.nsystems): + ni = self.batch_size[ii] * self.nbatches[ii] + tsi = int(ni * percent / 100) + ts.append(tsi) + + return ts + def _check_type_map_consistency(self, type_map_list): ret = [] for ii in type_map_list: diff --git a/source/train/Trainer.py b/source/train/Trainer.py index b6428c987e..b5beb18ef6 100644 --- a/source/train/Trainer.py +++ b/source/train/Trainer.py @@ -169,8 +169,9 @@ def _init_param(self, jdata): # training training_param = j_must_have(jdata, 'training') + # ! first .add() altered by Marián Rynik tr_args = ClassArg()\ - .add('numb_test', int, default = 1)\ + .add('numb_test', [int, list, str], default = 1)\ .add('disp_file', str, default = 'lcurve.out')\ .add('disp_freq', int, default = 100)\ .add('save_freq', int, default = 1000)\ @@ -182,7 +183,8 @@ def _init_param(self, jdata): .add('sys_probs', list )\ .add('auto_prob_style', str, default = "prob_sys_size") tr_data = tr_args.parse(training_param) - self.numb_test = tr_data['numb_test'] + # not needed + # self.numb_test = tr_data['numb_test'] self.disp_file = tr_data['disp_file'] self.disp_freq = tr_data['disp_freq'] self.save_freq = tr_data['save_freq'] @@ -458,7 +460,10 @@ def test_on_the_fly (self, fp, data, feed_dict_batch) : - test_data = data.get_test(ntests = self.numb_test) + # ! altered by Marián Rynik + # Do not need to pass numb_test here as data object already knows it. 
+ # Both DeepmdDataSystem and ClassArg parse the same json file + test_data = data.get_test() feed_dict_test = {} for kk in test_data.keys(): if kk == 'find_type' or kk == 'type' : @@ -466,9 +471,13 @@ def test_on_the_fly (self, if 'find_' in kk: feed_dict_test[self.place_holders[kk]] = test_data[kk] else: - feed_dict_test[self.place_holders[kk]] = np.reshape(test_data[kk][:self.numb_test], [-1]) + # ! altered by Marián Rynik + # again the data object knows appropriate test data shape, + # there is no need to slice again! + # feed_dict_test[self.place_holders[kk]] = np.reshape(test_data[kk][:self.numb_test[data.pick_idx]], [-1]) + feed_dict_test[self.place_holders[kk]] = np.reshape(test_data[kk], [-1]) for ii in ['type'] : - feed_dict_test[self.place_holders[ii]] = np.reshape(test_data[ii][:self.numb_test], [-1]) + feed_dict_test[self.place_holders[ii]] = np.reshape(test_data[ii], [-1]) for ii in ['natoms_vec', 'default_mesh'] : feed_dict_test[self.place_holders[ii]] = test_data[ii] feed_dict_test[self.place_holders['is_training']] = False @@ -483,6 +492,4 @@ def test_on_the_fly (self, feed_dict_batch) print_str += " %8.1e\n" % current_lr fp.write(print_str) - fp.flush () - - + fp.flush () \ No newline at end of file diff --git a/source/train/common.py b/source/train/common.py index 887669a278..f5092bbbe9 100644 --- a/source/train/common.py +++ b/source/train/common.py @@ -110,11 +110,17 @@ def add (self, def _add_single(self, key, data) : vtype = type(data) if not(vtype in self.arg_dict[key]['types']) : - # try the type convertion to the first listed type - try : - vv = (self.arg_dict[key]['types'][0])(data) - except TypeError: - raise TypeError ("cannot convert provided key \"%s\" to type %s " % (key, str(self.arg_dict[key]['types'][0])) ) + # ! altered by Marián Rynik + # try the type convertion to one of the types + for tp in self.arg_dict[key]['types']: + try : + vv = tp(data) + except TypeError: + pass + else: + break + else: + raise TypeError ("cannot convert provided key \"%s\" to type(s) %s " % (key, str(self.arg_dict[key]['types'])) ) else : vv = data self.arg_dict[key]['value'] = vv From 45849aac04519dedafd18d60d43759b882625db6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mari=C3=A1n=20Rynik?= Date: Wed, 7 Oct 2020 16:45:07 +0200 Subject: [PATCH 08/65] add ability to parse yaml config files --- setup.py | 2 +- source/tests/test_compat_input_v0_v1.py | 15 ++++++--------- source/tests/test_data_modifier.py | 7 +++---- source/tests/test_fitting_stat.py | 8 ++++---- source/tests/test_model_loc_frame.py | 7 +++---- source/tests/test_model_se_a.py | 8 ++++---- source/tests/test_model_se_a_aparam.py | 7 +++---- source/tests/test_model_se_a_fparam.py | 8 ++++---- source/tests/test_model_se_a_srtab.py | 8 ++++---- source/tests/test_model_se_r.py | 8 ++++---- source/tests/test_polar_se_a.py | 8 ++++---- source/tests/test_wfc.py | 8 ++++---- source/train/common.py | 14 +++++++++++++- source/train/print_old_model.py | 7 +++---- source/train/train.py | 7 +++---- 15 files changed, 63 insertions(+), 59 deletions(-) diff --git a/setup.py b/setup.py index 98bcfded26..172904a506 100644 --- a/setup.py +++ b/setup.py @@ -30,7 +30,7 @@ def get_dp_install_path() : site_packages_path = path.join(path.dirname(path.__file__), 'site-packages') tf_install_dir = imp.find_module('tensorflow', [site_packages_path])[1] -install_requires=['numpy', 'scipy'] +install_requires=['numpy', 'scipy', 'pyyaml'] setup_requires=['setuptools_scm', 'scikit-build'] # add cmake as a build requirement if cmake>3.0 is not 
installed diff --git a/source/tests/test_compat_input_v0_v1.py b/source/tests/test_compat_input_v0_v1.py index 6890a89301..7c4a46c361 100644 --- a/source/tests/test_compat_input_v0_v1.py +++ b/source/tests/test_compat_input_v0_v1.py @@ -1,23 +1,20 @@ -import os,sys,json +import os,sys import numpy as np import unittest from deepmd.compat import convert_input_v0_v1 +from deepmd.common import j_loader class TestConvertInputV0V1 (unittest.TestCase) : def test_convert_smth(self): - with open(os.path.join('compat_inputs', 'water_se_a_v0.json')) as fp: - jdata0 = json.load(fp) - with open(os.path.join('compat_inputs', 'water_se_a_v1.json')) as fp: - jdata1 = json.load(fp) + jdata0 = j_loader(os.path.join('compat_inputs', 'water_se_a_v0.json')) + jdata1 = j_loader(os.path.join('compat_inputs', 'water_se_a_v1.json')) jdata = convert_input_v0_v1(jdata0, warning = False, dump = None) self.assertEqual(jdata, jdata1) def test_convert_nonsmth(self): - with open(os.path.join('compat_inputs', 'water_v0.json')) as fp: - jdata0 = json.load(fp) - with open(os.path.join('compat_inputs', 'water_v1.json')) as fp: - jdata1 = json.load(fp) + jdata0 = j_loader(os.path.join('compat_inputs', 'water_v0.json')) + jdata1 = j_loader(os.path.join('compat_inputs', 'water_v1.json')) jdata = convert_input_v0_v1(jdata0, warning = False, dump = None) self.assertEqual(jdata, jdata1) diff --git a/source/tests/test_data_modifier.py b/source/tests/test_data_modifier.py index 4e7b43663e..f71e31377d 100644 --- a/source/tests/test_data_modifier.py +++ b/source/tests/test_data_modifier.py @@ -1,9 +1,9 @@ -import os,sys,platform,json +import os,sys,platform import numpy as np import unittest from deepmd.env import tf -from deepmd.common import j_must_have, data_requirement +from deepmd.common import j_must_have, data_requirement, j_loader from deepmd.RunOptions import RunOptions from deepmd.Trainer import NNPTrainer from deepmd.DataSystem import DeepmdDataSystem @@ -45,8 +45,7 @@ def tearDown(self): def _setUp(self): args = Args() run_opt = RunOptions(args, False) - with open (args.INPUT, 'r') as fp: - jdata = json.load (fp) + jdata = j_loader(args.INPUT) # init model model = NNPTrainer (jdata, run_opt = run_opt) diff --git a/source/tests/test_fitting_stat.py b/source/tests/test_fitting_stat.py index 0cbd693ae1..1e9c48b30b 100644 --- a/source/tests/test_fitting_stat.py +++ b/source/tests/test_fitting_stat.py @@ -1,10 +1,11 @@ -import os,sys,json +import os,sys import numpy as np import unittest from collections import defaultdict from deepmd.DescrptSeA import DescrptSeA from deepmd.Fitting import EnerFitting +from deepmd.common import j_loader input_json = 'water_se_a_afparam.json' @@ -57,9 +58,8 @@ def _brute_aparam(data, ndim): class TestEnerFittingStat (unittest.TestCase) : def test (self) : - with open(input_json) as fp: - jdata = json.load(fp) - jdata = jdata['model'] + jdata = j_loader(input_json) + jdata = jdata['model'] descrpt = DescrptSeA(jdata['descriptor']) fitting = EnerFitting(jdata['fitting_net'], descrpt) avgs = [0, 10] diff --git a/source/tests/test_model_loc_frame.py b/source/tests/test_model_loc_frame.py index b651862885..e79e59de1a 100644 --- a/source/tests/test_model_loc_frame.py +++ b/source/tests/test_model_loc_frame.py @@ -1,4 +1,4 @@ -import dpdata,os,sys,json,unittest +import dpdata,os,sys,unittest import numpy as np from deepmd.env import tf from common import Data,gen_data @@ -8,7 +8,7 @@ from deepmd.DescrptLocFrame import DescrptLocFrame from deepmd.Fitting import EnerFitting from deepmd.Model import 
Model -from deepmd.common import j_must_have, j_must_have_d, j_have +from deepmd.common import j_must_have, j_must_have_d, j_have, j_loader global_ener_float_precision = tf.float64 global_tf_float_precision = tf.float64 @@ -21,8 +21,7 @@ def setUp(self) : def test_model(self): jfile = 'water.json' - with open(jfile) as fp: - jdata = json.load (fp) + jdata = j_loader(jfile) run_opt = RunOptions(None) systems = j_must_have(jdata, 'systems') set_pfx = j_must_have(jdata, 'set_prefix') diff --git a/source/tests/test_model_se_a.py b/source/tests/test_model_se_a.py index 0d54f14c5f..2d32d89e45 100644 --- a/source/tests/test_model_se_a.py +++ b/source/tests/test_model_se_a.py @@ -1,5 +1,5 @@ -import dpdata,os,sys,json,unittest +import dpdata,os,sys,unittest import numpy as np from deepmd.env import tf from common import Data,gen_data @@ -9,7 +9,7 @@ from deepmd.DescrptSeA import DescrptSeA from deepmd.Fitting import EnerFitting from deepmd.Model import Model -from deepmd.common import j_must_have, j_must_have_d, j_have +from deepmd.common import j_must_have, j_must_have_d, j_have, j_loader global_ener_float_precision = tf.float64 global_tf_float_precision = tf.float64 @@ -21,8 +21,8 @@ def setUp(self) : def test_model(self): jfile = 'water_se_a.json' - with open(jfile) as fp: - jdata = json.load (fp) + jdata = j_loader(jfile) + run_opt = RunOptions(None) systems = j_must_have(jdata, 'systems') set_pfx = j_must_have(jdata, 'set_prefix') diff --git a/source/tests/test_model_se_a_aparam.py b/source/tests/test_model_se_a_aparam.py index 58b060225c..f22629ca19 100644 --- a/source/tests/test_model_se_a_aparam.py +++ b/source/tests/test_model_se_a_aparam.py @@ -1,4 +1,4 @@ -import dpdata,os,sys,json,unittest +import dpdata,os,sys,unittest import numpy as np from deepmd.env import tf from common import Data,gen_data @@ -8,7 +8,7 @@ from deepmd.DescrptSeA import DescrptSeA from deepmd.Fitting import EnerFitting from deepmd.Model import Model -from deepmd.common import j_must_have, j_must_have_d, j_have +from deepmd.common import j_must_have, j_must_have_d, j_have, j_loader global_ener_float_precision = tf.float64 global_tf_float_precision = tf.float64 @@ -20,8 +20,7 @@ def setUp(self) : def test_model(self): jfile = 'water_se_a_aparam.json' - with open(jfile) as fp: - jdata = json.load (fp) + jdata = j_loader(jfile) run_opt = RunOptions(None) systems = j_must_have(jdata, 'systems') set_pfx = j_must_have(jdata, 'set_prefix') diff --git a/source/tests/test_model_se_a_fparam.py b/source/tests/test_model_se_a_fparam.py index ec4a46c7d4..7c5ca2dfc6 100644 --- a/source/tests/test_model_se_a_fparam.py +++ b/source/tests/test_model_se_a_fparam.py @@ -1,4 +1,4 @@ -import dpdata,os,sys,json,unittest +import dpdata,os,sys,unittest import numpy as np from deepmd.env import tf from common import Data,gen_data @@ -8,7 +8,7 @@ from deepmd.DescrptSeA import DescrptSeA from deepmd.Fitting import EnerFitting from deepmd.Model import Model -from deepmd.common import j_must_have, j_must_have_d, j_have +from deepmd.common import j_must_have, j_must_have_d, j_have, j_loader global_ener_float_precision = tf.float64 global_tf_float_precision = tf.float64 @@ -20,8 +20,8 @@ def setUp(self) : def test_model(self): jfile = 'water_se_a_fparam.json' - with open(jfile) as fp: - jdata = json.load (fp) + jdata = j_loader(jfile) + run_opt = RunOptions(None) systems = j_must_have(jdata, 'systems') set_pfx = j_must_have(jdata, 'set_prefix') diff --git a/source/tests/test_model_se_a_srtab.py b/source/tests/test_model_se_a_srtab.py index 
c2950fe788..2eeda45b50 100644 --- a/source/tests/test_model_se_a_srtab.py +++ b/source/tests/test_model_se_a_srtab.py @@ -1,4 +1,4 @@ -import dpdata,os,sys,json,unittest +import dpdata,os,sys,unittest import numpy as np from deepmd.env import tf from common import Data,gen_data @@ -8,7 +8,7 @@ from deepmd.DescrptSeA import DescrptSeA from deepmd.Fitting import EnerFitting from deepmd.Model import Model -from deepmd.common import j_must_have, j_must_have_d, j_have +from deepmd.common import j_must_have, j_must_have_d, j_have, j_loader global_ener_float_precision = tf.float64 global_tf_float_precision = tf.float64 @@ -30,8 +30,8 @@ def setUp(self) : def test_model(self): jfile = 'water_se_a.json' - with open(jfile) as fp: - jdata = json.load (fp) + jdata = j_loader(jfile) + run_opt = RunOptions(None) systems = j_must_have(jdata, 'systems') set_pfx = j_must_have(jdata, 'set_prefix') diff --git a/source/tests/test_model_se_r.py b/source/tests/test_model_se_r.py index d3607a9164..32e3276760 100644 --- a/source/tests/test_model_se_r.py +++ b/source/tests/test_model_se_r.py @@ -1,4 +1,4 @@ -import dpdata,os,sys,json,unittest +import dpdata,os,sys,unittest import numpy as np from deepmd.env import tf from common import Data,gen_data @@ -8,7 +8,7 @@ from deepmd.DescrptSeR import DescrptSeR from deepmd.Fitting import EnerFitting from deepmd.Model import Model -from deepmd.common import j_must_have, j_must_have_d, j_have +from deepmd.common import j_must_have, j_must_have_d, j_have, j_loader global_ener_float_precision = tf.float64 global_tf_float_precision = tf.float64 @@ -20,8 +20,8 @@ def setUp(self) : def test_model(self): jfile = 'water_se_r.json' - with open(jfile) as fp: - jdata = json.load (fp) + jdata = j_loader(jfile) + run_opt = RunOptions(None) systems = j_must_have(jdata, 'systems') set_pfx = j_must_have(jdata, 'set_prefix') diff --git a/source/tests/test_polar_se_a.py b/source/tests/test_polar_se_a.py index 275b4fa707..ad2168dcb5 100644 --- a/source/tests/test_polar_se_a.py +++ b/source/tests/test_polar_se_a.py @@ -1,4 +1,4 @@ -import dpdata,os,sys,json,unittest +import dpdata,os,sys,unittest import numpy as np from deepmd.env import tf from common import Data,gen_data @@ -8,7 +8,7 @@ from deepmd.DescrptSeA import DescrptSeA from deepmd.Fitting import PolarFittingSeA from deepmd.Model import PolarModel -from deepmd.common import j_must_have, j_must_have_d, j_have +from deepmd.common import j_must_have, j_must_have_d, j_have, j_loader global_ener_float_precision = tf.float64 global_tf_float_precision = tf.float64 @@ -20,8 +20,8 @@ def setUp(self) : def test_model(self): jfile = 'polar_se_a.json' - with open(jfile) as fp: - jdata = json.load (fp) + jdata = j_loader(jfile) + run_opt = RunOptions(None) systems = j_must_have(jdata, 'systems') set_pfx = j_must_have(jdata, 'set_prefix') diff --git a/source/tests/test_wfc.py b/source/tests/test_wfc.py index d4b408cd60..876f4dba0a 100644 --- a/source/tests/test_wfc.py +++ b/source/tests/test_wfc.py @@ -1,4 +1,4 @@ -import dpdata,os,sys,json,unittest +import dpdata,os,sys,unittest import numpy as np from deepmd.env import tf from common import Data,gen_data @@ -8,7 +8,7 @@ from deepmd.DescrptLocFrame import DescrptLocFrame from deepmd.Fitting import WFCFitting from deepmd.Model import WFCModel -from deepmd.common import j_must_have, j_must_have_d, j_have +from deepmd.common import j_must_have, j_must_have_d, j_have, j_loader global_ener_float_precision = tf.float64 global_tf_float_precision = tf.float64 @@ -20,8 +20,8 @@ def setUp(self) : def 
test_model(self): jfile = 'wfc.json' - with open(jfile) as fp: - jdata = json.load (fp) + jdata = j_loader(jfile) + run_opt = RunOptions(None) systems = j_must_have(jdata, 'systems') set_pfx = j_must_have(jdata, 'set_prefix') diff --git a/source/train/common.py b/source/train/common.py index 887669a278..a68f8d27b8 100644 --- a/source/train/common.py +++ b/source/train/common.py @@ -4,6 +4,8 @@ from deepmd.env import tf from deepmd.env import op_module from deepmd.RunOptions import global_tf_float_precision +import json +import yaml # def gelu(x): # """Gaussian Error Linear Unit. @@ -163,7 +165,17 @@ def j_must_have_d (jdata, key, deprecated_key) : def j_have (jdata, key) : return key in jdata.keys() - + +def j_loader(filename): + + with open(filename, 'r') as fp: + if filename.endswith("json"): + return json.load(fp) + elif filename.endswith(("yml", "yaml")): + return yaml.safe_load(fp) + else: + raise TypeError("config file must be json, or yaml/yml") + def get_activation_func(activation_fn): if activation_fn not in activation_fn_dict: raise RuntimeError(activation_fn+" is not a valid activation function") diff --git a/source/train/print_old_model.py b/source/train/print_old_model.py index 14719723f9..d125e7f8b6 100644 --- a/source/train/print_old_model.py +++ b/source/train/print_old_model.py @@ -1,4 +1,4 @@ -import dpdata,os,sys,json +import dpdata,os,sys import numpy as np import tensorflow as tf from common import Data @@ -12,7 +12,7 @@ from deepmd.DataSystem import DataSystem from deepmd.Model import NNPModel from deepmd.Model import LearingRate -from deepmd.common import j_must_have +from deepmd.common import j_must_have, j_loader def gen_data() : tmpdata = Data(rand_pert = 0.1, seed = 1) @@ -32,8 +32,7 @@ def gen_data() : np.save('system/set.000/fparam.npy', tmpdata.fparam) def compute_efv(jfile): - fp = open (jfile, 'r') - jdata = json.load (fp) + jdata = j_loader(jfile) run_opt = RunOptions(None) systems = j_must_have(jdata, 'systems') set_pfx = j_must_have(jdata, 'set_prefix') diff --git a/source/train/train.py b/source/train/train.py index c89760fa4d..3e7ba2955b 100755 --- a/source/train/train.py +++ b/source/train/train.py @@ -4,13 +4,12 @@ import sys import time import numpy as np -import json from deepmd.env import tf from deepmd.compat import convert_input_v0_v1 from deepmd.RunOptions import RunOptions from deepmd.DataSystem import DeepmdDataSystem from deepmd.Trainer import NNPTrainer -from deepmd.common import data_requirement, expand_sys_str +from deepmd.common import data_requirement, expand_sys_str, j_loader from deepmd.DataModifier import DipoleChargeModifier def create_done_queue(cluster_spec, task_index): @@ -49,8 +48,8 @@ def j_must_have (jdata, key) : def train (args) : # load json database - with open (args.INPUT, 'r') as fp: - jdata = json.load (fp) + jdata = j_loader(args.INPUT) + if not 'model' in jdata.keys(): jdata = convert_input_v0_v1(jdata, warning = True, From faae175e52da30ed93cbf6fe969200beae4ed7b1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mari=C3=A1n=20Rynik?= Date: Wed, 7 Oct 2020 21:36:02 +0200 Subject: [PATCH 09/65] added script to convert json to yaml --- data/raw/json2yaml.py | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) create mode 100644 data/raw/json2yaml.py diff --git a/data/raw/json2yaml.py b/data/raw/json2yaml.py new file mode 100644 index 0000000000..8f47da40dc --- /dev/null +++ b/data/raw/json2yaml.py @@ -0,0 +1,32 @@ +#!/usr/bin/env python3 + +import argparse +from pathlib import Path +import json +import yaml + + 
+def _main(): + parser = argparse.ArgumentParser( + description="convert json config file to yaml", + formatter_class=argparse.ArgumentDefaultsHelpFormatter) + + # get all json files in dir + jsons = [p for p in Path.cwd().glob("*.json")] + # use the newest as autosuggestion + jsons.sort(key=lambda x: x.stat().st_mtime, reverse=True) + jfile = jsons[0] + yfile = jfile.with_suffix(".yaml") + + parser.add_argument("INPUT", default=jfile, type=Path, nargs="?", + help="input json file") + parser.add_argument("OUTPUT", default=yfile, type=Path, nargs="?", + help="output yaml file") + args = parser.parse_args() + + with args.INPUT.open("r") as infile, args.OUTPUT.open("w") as outfile: + yaml.dump(json.load(infile), outfile, default_flow_style=False, + sort_keys=False) + +if __name__ == "__main__": + _main() From af3867a4cceef3aa0b513a043cc4825d63eacbf6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mari=C3=A1n=20Rynik?= Date: Wed, 7 Oct 2020 22:55:14 +0200 Subject: [PATCH 10/65] fix failing test_get_test --- source/tests/test_deepmd_data_sys.py | 5 +++-- source/train/DataSystem.py | 9 ++++++--- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/source/tests/test_deepmd_data_sys.py b/source/tests/test_deepmd_data_sys.py index d08b148f3a..c684de7c0c 100644 --- a/source/tests/test_deepmd_data_sys.py +++ b/source/tests/test_deepmd_data_sys.py @@ -83,7 +83,7 @@ def test_get_test(self): ds.add('test', self.test_ndof, atomic = True, must = True) ds.add('null', self.test_ndof, atomic = True, must = False) sys_idx = 0 - data = ds.get_test(sys_idx=sys_idx) + data = ds.get_test(sys_idx=sys_idx, n_test=-1) self.assertEqual(list(data['type'][0]), list(np.sort(self.atom_type[sys_idx]))) self._in_array(np.load('sys_0/set.002/coord.npy'), ds.get_sys(sys_idx).idx_map, @@ -98,8 +98,9 @@ def test_get_test(self): - data['null'] ), 0.0) + sys_idx = 2 - data = ds.get_test(sys_idx=sys_idx) + data = ds.get_test(sys_idx=sys_idx, n_test=-1) self.assertEqual(list(data['type'][0]), list(np.sort(self.atom_type[sys_idx]))) self._in_array(np.load('sys_2/set.002/coord.npy'), ds.get_sys(sys_idx).idx_map, diff --git a/source/train/DataSystem.py b/source/train/DataSystem.py index 03971d196d..271d898774 100644 --- a/source/train/DataSystem.py +++ b/source/train/DataSystem.py @@ -232,20 +232,23 @@ def get_batch (self, # ! 
altered by Marián Rynik def get_test (self, - sys_idx = None) : + sys_idx = None, + n_test = None) : # need to get idx first to get the appropriate test size for the # current system if sys_idx is not None : idx = sys_idx else : - # idx get selected in get batch, it is the index of a system + # idx get selected in get_batch method, it must be run first + # otherwise this will get messed-up idx = self.pick_idx if not hasattr(self, 'default_mesh') : self._make_default_mesh() if not hasattr(self, 'test_data') : - self._load_test(ntests = self.test_size[idx]) + n_test = n_test if n_test is not None else self.test_size[idx] + self._load_test(ntests = n_test) test_system_data = {} for nn in self.test_data: From 265c559beee40230d9981541cf991ad71cf0ab6c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mari=C3=A1n=20Rynik?= Date: Thu, 8 Oct 2020 10:35:48 +0200 Subject: [PATCH 11/65] some small alterations to better preserve the original logic of the code --- source/tests/test_deepmd_data_sys.py | 4 ++-- source/train/DataSystem.py | 23 ++++++++++++----------- source/train/Trainer.py | 2 +- 3 files changed, 15 insertions(+), 14 deletions(-) diff --git a/source/tests/test_deepmd_data_sys.py b/source/tests/test_deepmd_data_sys.py index c684de7c0c..4cbf2af1b7 100644 --- a/source/tests/test_deepmd_data_sys.py +++ b/source/tests/test_deepmd_data_sys.py @@ -83,7 +83,7 @@ def test_get_test(self): ds.add('test', self.test_ndof, atomic = True, must = True) ds.add('null', self.test_ndof, atomic = True, must = False) sys_idx = 0 - data = ds.get_test(sys_idx=sys_idx, n_test=-1) + data = ds.get_test(sys_idx=sys_idx) self.assertEqual(list(data['type'][0]), list(np.sort(self.atom_type[sys_idx]))) self._in_array(np.load('sys_0/set.002/coord.npy'), ds.get_sys(sys_idx).idx_map, @@ -100,7 +100,7 @@ def test_get_test(self): ), 0.0) sys_idx = 2 - data = ds.get_test(sys_idx=sys_idx, n_test=-1) + data = ds.get_test(sys_idx=sys_idx) self.assertEqual(list(data['type'][0]), list(np.sort(self.atom_type[sys_idx]))) self._in_array(np.load('sys_2/set.002/coord.npy'), ds.get_sys(sys_idx).idx_map, diff --git a/source/train/DataSystem.py b/source/train/DataSystem.py index 271d898774..5aa866d550 100644 --- a/source/train/DataSystem.py +++ b/source/train/DataSystem.py @@ -233,23 +233,17 @@ def get_batch (self, # ! 
altered by Marián Rynik def get_test (self, sys_idx = None, - n_test = None) : + n_test = -1) : - # need to get idx first to get the appropriate test size for the - # current system + if not hasattr(self, 'default_mesh') : + self._make_default_mesh() + if not hasattr(self, 'test_data') : + self._load_test(ntests = n_test) if sys_idx is not None : idx = sys_idx else : - # idx get selected in get_batch method, it must be run first - # otherwise this will get messed-up idx = self.pick_idx - if not hasattr(self, 'default_mesh') : - self._make_default_mesh() - if not hasattr(self, 'test_data') : - n_test = n_test if n_test is not None else self.test_size[idx] - self._load_test(ntests = n_test) - test_system_data = {} for nn in self.test_data: test_system_data[nn] = self.test_data[nn][idx] @@ -257,6 +251,13 @@ def get_test (self, test_system_data["default_mesh"] = self.default_mesh[idx] return test_system_data + def get_sys_ntest(self, sys_idx=None): + """Get number of tests for the currently selected system, + or one defined by sys_idx.""" + if sys_idx is not None : + return self.test_size[sys_idx] + else : + return self.test_size[self.pick_idx] def get_type_map(self): return self.type_map diff --git a/source/train/Trainer.py b/source/train/Trainer.py index b5beb18ef6..31a95346a9 100644 --- a/source/train/Trainer.py +++ b/source/train/Trainer.py @@ -463,7 +463,7 @@ def test_on_the_fly (self, # ! altered by Marián Rynik # Do not need to pass numb_test here as data object already knows it. # Both DeepmdDataSystem and ClassArg parse the same json file - test_data = data.get_test() + test_data = data.get_test(n_test=data.get_sys_ntest()) feed_dict_test = {} for kk in test_data.keys(): if kk == 'find_type' or kk == 'type' : From 9806168fe1cb2a7a7d1be0202f5e987ff42b1b1f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mari=C3=A1n=20Rynik?= Date: Thu, 8 Oct 2020 13:01:56 +0200 Subject: [PATCH 12/65] added test for json yaml equality --- data/{raw => json}/json2yaml.py | 7 ++- ...at_input_v0_v1.py => test_compat_input.py} | 18 +++++- source/tests/yaml_inputs/water_se_a_v1.json | 55 +++++++++++++++++++ source/tests/yaml_inputs/water_se_a_v1.yaml | 50 +++++++++++++++++ source/tests/yaml_inputs/water_v1.json | 51 +++++++++++++++++ source/tests/yaml_inputs/water_v1.yaml | 48 ++++++++++++++++ source/train/common.py | 11 ++-- 7 files changed, 232 insertions(+), 8 deletions(-) rename data/{raw => json}/json2yaml.py (86%) rename source/tests/{test_compat_input_v0_v1.py => test_compat_input.py} (54%) create mode 100644 source/tests/yaml_inputs/water_se_a_v1.json create mode 100644 source/tests/yaml_inputs/water_se_a_v1.yaml create mode 100644 source/tests/yaml_inputs/water_v1.json create mode 100644 source/tests/yaml_inputs/water_v1.yaml diff --git a/data/raw/json2yaml.py b/data/json/json2yaml.py similarity index 86% rename from data/raw/json2yaml.py rename to data/json/json2yaml.py index 8f47da40dc..f601928427 100644 --- a/data/raw/json2yaml.py +++ b/data/json/json2yaml.py @@ -1,8 +1,10 @@ #!/usr/bin/env python3 import argparse -from pathlib import Path import json +from pathlib import Path +from warnings import warn + import yaml @@ -28,5 +30,8 @@ def _main(): yaml.dump(json.load(infile), outfile, default_flow_style=False, sort_keys=False) + warn("The order of the keys won't be preserved!", SyntaxWarning) + warn("_comment keys will also be lostt in the conversion") + if __name__ == "__main__": _main() diff --git a/source/tests/test_compat_input_v0_v1.py b/source/tests/test_compat_input.py similarity index 54% rename 
from source/tests/test_compat_input_v0_v1.py rename to source/tests/test_compat_input.py index 7c4a46c361..61eb16faf3 100644 --- a/source/tests/test_compat_input_v0_v1.py +++ b/source/tests/test_compat_input.py @@ -3,9 +3,11 @@ import unittest from deepmd.compat import convert_input_v0_v1 -from deepmd.common import j_loader +sys.path.append("/mnt/md0/OneDrive/dizertacka/code/deepmd-kit/source") +from train.common import j_loader +#from deepmd.common import j_loader -class TestConvertInputV0V1 (unittest.TestCase) : +class TestConvertInput (unittest.TestCase) : def test_convert_smth(self): jdata0 = j_loader(os.path.join('compat_inputs', 'water_se_a_v0.json')) jdata1 = j_loader(os.path.join('compat_inputs', 'water_se_a_v1.json')) @@ -18,3 +20,15 @@ def test_convert_nonsmth(self): jdata = convert_input_v0_v1(jdata0, warning = False, dump = None) self.assertEqual(jdata, jdata1) + def test_json_yaml_equal(self): + + inputs = ("water_v1", "water_se_a_v1") + + for i in inputs: + jdata = j_loader(os.path.join('yaml_inputs', f'{i}.json')) + ydata = j_loader(os.path.join('yaml_inputs', f'{i}.yaml')) + self.assertEqual(jdata, ydata) + + with self.assertRaises(TypeError): + j_loader("path_with_wrong.extension") + diff --git a/source/tests/yaml_inputs/water_se_a_v1.json b/source/tests/yaml_inputs/water_se_a_v1.json new file mode 100644 index 0000000000..402da962ca --- /dev/null +++ b/source/tests/yaml_inputs/water_se_a_v1.json @@ -0,0 +1,55 @@ +{ + "model": { + "descriptor" :{ + "type": "se_a", + "sel": [46, 92], + "rcut_smth": 5.80, + "rcut": 6.00, + "neuron": [25, 50, 100], + "axis_neuron": 16, + "resnet_dt": false, + "seed": 1 + }, + "fitting_net" : { + "neuron": [240, 240, 240], + "resnet_dt": true, + "seed": 1 + } + }, + + "learning_rate" :{ + "type": "exp", + "decay_steps": 5000, + "decay_rate": 0.95, + "start_lr": 0.001 + }, + + "loss" :{ + "start_pref_e": 0.02, + "limit_pref_e": 1, + "start_pref_f": 1000, + "limit_pref_f": 1, + "start_pref_v": 0, + "limit_pref_v": 0 + }, + + "training" : { + "systems": ["../data/"], + "set_prefix": "set", + "stop_batch": 1000000, + "batch_size": [1], + + "seed": 1, + + "disp_file": "lcurve.out", + "disp_freq": 100, + "numb_test": 10, + "save_freq": 1000, + "save_ckpt": "model.ckpt", + "disp_training":true, + "time_training":true, + "profiling": true, + "profiling_file":"timeline.json" + } +} + diff --git a/source/tests/yaml_inputs/water_se_a_v1.yaml b/source/tests/yaml_inputs/water_se_a_v1.yaml new file mode 100644 index 0000000000..55580daf1e --- /dev/null +++ b/source/tests/yaml_inputs/water_se_a_v1.yaml @@ -0,0 +1,50 @@ +model: + descriptor: + type: se_a + sel: + - 46 + - 92 + rcut_smth: 5.8 + rcut: 6.0 + neuron: + - 25 + - 50 + - 100 + axis_neuron: 16 + resnet_dt: false + seed: 1 + fitting_net: + neuron: + - 240 + - 240 + - 240 + resnet_dt: true + seed: 1 +learning_rate: + type: exp + decay_steps: 5000 + decay_rate: 0.95 + start_lr: 0.001 +loss: + start_pref_e: 0.02 + limit_pref_e: 1 + start_pref_f: 1000 + limit_pref_f: 1 + start_pref_v: 0 + limit_pref_v: 0 +training: + systems: ['../data/'] + set_prefix: set + stop_batch: 1000000 + batch_size: + - 1 + seed: 1 + disp_file: lcurve.out + disp_freq: 100 + numb_test: 10 + save_freq: 1000 + save_ckpt: model.ckpt + disp_training: true + time_training: true + profiling: true + profiling_file: timeline.json diff --git a/source/tests/yaml_inputs/water_v1.json b/source/tests/yaml_inputs/water_v1.json new file mode 100644 index 0000000000..e5f2032ea2 --- /dev/null +++ b/source/tests/yaml_inputs/water_v1.json @@ -0,0 
+1,51 @@ +{ + "with_distrib": false, + "model":{ + "descriptor": { + "type": "loc_frame", + "sel_a": [16, 32], + "sel_r": [30, 60], + "rcut": 6.00, + "axis_rule": [0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0] + }, + "fitting_net": { + "neuron": [240, 120, 60, 30, 10], + "resnet_dt": true, + "seed": 1 + } + }, + + "learning_rate" :{ + "type": "exp", + "decay_steps": 5000, + "decay_rate": 0.95, + "start_lr": 0.001 + }, + + "loss" : { + "start_pref_e": 0.02, + "limit_pref_e": 8, + "start_pref_f": 1000, + "limit_pref_f": 1, + "start_pref_v": 0, + "limit_pref_v": 0 + }, + + "training": { + "systems": ["../data/"], + "set_prefix": "set", + "stop_batch": 1000000, + "batch_size": [4], + + "seed": 1, + + "disp_file": "lcurve.out", + "disp_freq": 100, + "numb_test": 10, + "save_freq": 1000, + "save_ckpt": "model.ckpt", + "disp_training":true, + "time_training":true + } +} + diff --git a/source/tests/yaml_inputs/water_v1.yaml b/source/tests/yaml_inputs/water_v1.yaml new file mode 100644 index 0000000000..5121a961b0 --- /dev/null +++ b/source/tests/yaml_inputs/water_v1.yaml @@ -0,0 +1,48 @@ +with_distrib: false +model: + descriptor: + type: loc_frame + sel_a: + - 16 + - 32 + sel_r: + - 30 + - 60 + rcut: 6.0 + axis_rule: [0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0] + fitting_net: + neuron: + - 240 + - 120 + - 60 + - 30 + - 10 + resnet_dt: true + seed: 1 +learning_rate: + type: exp + decay_steps: 5000 + decay_rate: 0.95 + start_lr: 0.001 +loss: + start_pref_e: 0.02 + limit_pref_e: 8 + start_pref_f: 1000 + limit_pref_f: 1 + start_pref_v: 0 + limit_pref_v: 0 +training: + systems: + - ../data/ + set_prefix: set + stop_batch: 1000000 + batch_size: + - 4 + seed: 1 + disp_file: lcurve.out + disp_freq: 100 + numb_test: 10 + save_freq: 1000 + save_ckpt: model.ckpt + disp_training: true + time_training: true diff --git a/source/train/common.py b/source/train/common.py index a68f8d27b8..83f5e6ecf3 100644 --- a/source/train/common.py +++ b/source/train/common.py @@ -168,13 +168,14 @@ def j_have (jdata, key) : def j_loader(filename): - with open(filename, 'r') as fp: - if filename.endswith("json"): + if filename.endswith("json"): + with open(filename, 'r') as fp: return json.load(fp) - elif filename.endswith(("yml", "yaml")): + elif filename.endswith(("yml", "yaml")): + with open(filename, 'r') as fp: return yaml.safe_load(fp) - else: - raise TypeError("config file must be json, or yaml/yml") + else: + raise TypeError("config file must be json, or yaml/yml") def get_activation_func(activation_fn): if activation_fn not in activation_fn_dict: From 865a1f46c198e4061744aedb63fd192bd2d00c60 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mari=C3=A1n=20Rynik?= Date: Thu, 8 Oct 2020 13:30:40 +0200 Subject: [PATCH 13/65] minor bugfix --- source/tests/test_compat_input.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/source/tests/test_compat_input.py b/source/tests/test_compat_input.py index 61eb16faf3..c0a29283dd 100644 --- a/source/tests/test_compat_input.py +++ b/source/tests/test_compat_input.py @@ -3,9 +3,7 @@ import unittest from deepmd.compat import convert_input_v0_v1 -sys.path.append("/mnt/md0/OneDrive/dizertacka/code/deepmd-kit/source") -from train.common import j_loader -#from deepmd.common import j_loader +from deepmd.common import j_loader class TestConvertInput (unittest.TestCase) : def test_convert_smth(self): From 373b0aebfb22cd3ccf6da98f40cc9d3acaddaf7c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mari=C3=A1n=20Rynik?= Date: Thu, 8 Oct 2020 14:09:28 +0200 Subject: [PATCH 14/65] trigger new travis build --- 
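Aside on the `j_loader` refactor shown a few hunks above: it reduces to a small extension-based dispatcher, and the new test reduces to a round-trip equality between the json and yaml parsers. A minimal self-contained sketch follows; the `load_config` name and the inline strings are illustrative only, not repository code.

```python
import json
import yaml

def load_config(filename):
    # dispatch on the file extension, as the patched j_loader does
    if filename.endswith("json"):
        with open(filename) as fp:
            return json.load(fp)
    elif filename.endswith(("yml", "yaml")):
        with open(filename) as fp:
            return yaml.safe_load(fp)
    raise TypeError("config file must be json, or yaml/yml")

# the json/yaml equality test boils down to: both parsers yield the same dict
assert json.loads('{"rcut": 6.0, "sel": [46, 92]}') == \
    yaml.safe_load("rcut: 6.0\nsel: [46, 92]")
```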
source/train/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/train/common.py b/source/train/common.py index f5092bbbe9..b4a8b42ce8 100644 --- a/source/train/common.py +++ b/source/train/common.py @@ -16,7 +16,7 @@ # """ # cdf = 0.5 * (1.0 + tf.tanh((math.sqrt(2 / math.pi) * (x + 0.044715 * tf.pow(x, 3))))) # return x * cdf -def gelu(x) : +def gelu(x): return op_module.gelu(x) data_requirement = {} From ad3cb1904c79d884f656104fa439588194679964 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mari=C3=A1n=20Rynik?= Date: Mon, 12 Oct 2020 11:20:10 +0200 Subject: [PATCH 15/65] add info about yaml to docs and parser --- README.md | 2 +- examples/water/train/water_se_a.yaml | 68 ++++++++++++++++++++++++++++ source/train/main.py | 2 +- 3 files changed, 70 insertions(+), 2 deletions(-) create mode 100644 examples/water/train/water_se_a.yaml diff --git a/README.md b/README.md index 5520daa57f..5862ed40b5 100644 --- a/README.md +++ b/README.md @@ -330,7 +330,7 @@ The method of training is explained in our [DeePMD][2] and [DeepPot-SE][3] paper $ cd $deepmd_source_dir/examples/water/train/ $ dp train water_se_a.json ``` -where `water_se_a.json` is the `json` format parameter file that controls the training. The components of the `water.json` contains three parts, `model`, `learning_rate`, `loss` and `training`. +where `water_se_a.json` is the `json` format parameter file that controls the training. It is also possible to use a `yaml` format file with the same keys as json (see the `water_se_a.yaml` example). You can use the script `json2yaml.py` in the `data/json/` dir to convert your json files to yaml. The components of the `water.json` contain four parts, `model`, `learning_rate`, `loss` and `training`. The `model` section specify how the deep potential model is built.
An example of the smooth-edition is provided as follows ```json diff --git a/examples/water/train/water_se_a.yaml b/examples/water/train/water_se_a.yaml new file mode 100644 index 0000000000..b92bf2ab4e --- /dev/null +++ b/examples/water/train/water_se_a.yaml @@ -0,0 +1,68 @@ +# model parameters +model: + type_map: + - O + - H + descriptor: + type: se_a + sel: + - 46 + - 92 + rcut_smth: 5.8 + rcut: 6.0 + neuron: + - 25 + - 50 + - 100 + resnet_dt: false + axis_neuron: 16 + seed: 1 + # that's all for descriptor + fitting_net: + neuron: + - 240 + - 240 + - 240 + resnet_dt: true + seed: 1 + # that's all for fitting net + # that's all for model + +learning_rate: + type: exp + decay_steps: 5000 + start_lr: 0.001 + stop_lr: 3.51e-08 + # that's all for learnnig rate + +loss: + start_pref_e: 0.02 + limit_pref_e: 1 + start_pref_f: 1000 + limit_pref_f: 1 + start_pref_v: 0 + limit_pref_v: 0 + # that's all for loss + +# training contols +training: + systems: + - ../data/ + set_prefix: set + stop_batch: 1000000 + batch_size: 1 + seed: 1 + # display and restart + # frequencies counted in batch + disp_file: lcurve.out + disp_freq: 100 + numb_test: 10 + save_freq: 1000 + save_ckpt: model.ckpt + load_ckpt: model.ckpt + disp_training: true + time_training: true + profiling: false + profiling_file: timeline.json + # that's all for training +# that's all \ No newline at end of file diff --git a/source/train/main.py b/source/train/main.py index 1e35d3bf17..56100ec54e 100644 --- a/source/train/main.py +++ b/source/train/main.py @@ -25,7 +25,7 @@ def main () : help = "the model after passing parameters") parser_train = subparsers.add_parser('train', help='train a model') parser_train.add_argument('INPUT', - help='the input parameter file in json format') + help='the input parameter file in json or yaml format') parser_train.add_argument('--init-model', type = str, help= 'Initialize the model by the provided checkpoint.') From 3553e264a0b43f4b158acac66f11bc44e8e081ef Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mari=C3=A1n=20Rynik?= Date: Wed, 7 Oct 2020 21:36:02 +0200 Subject: [PATCH 16/65] added script to convert json to yaml --- data/raw/json2yaml.py | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) create mode 100644 data/raw/json2yaml.py diff --git a/data/raw/json2yaml.py b/data/raw/json2yaml.py new file mode 100644 index 0000000000..8f47da40dc --- /dev/null +++ b/data/raw/json2yaml.py @@ -0,0 +1,32 @@ +#!/usr/bin/env python3 + +import argparse +from pathlib import Path +import json +import yaml + + +def _main(): + parser = argparse.ArgumentParser( + description="convert json config file to yaml", + formatter_class=argparse.ArgumentDefaultsHelpFormatter) + + # get all json files in dir + jsons = [p for p in Path.cwd().glob("*.json")] + # use the newest as autosuggestion + jsons.sort(key=lambda x: x.stat().st_mtime, reverse=True) + jfile = jsons[0] + yfile = jfile.with_suffix(".yaml") + + parser.add_argument("INPUT", default=jfile, type=Path, nargs="?", + help="input json file") + parser.add_argument("OUTPUT", default=yfile, type=Path, nargs="?", + help="output yaml file") + args = parser.parse_args() + + with args.INPUT.open("r") as infile, args.OUTPUT.open("w") as outfile: + yaml.dump(json.load(infile), outfile, default_flow_style=False, + sort_keys=False) + +if __name__ == "__main__": + _main() From 2f6e77c167e791a9f373fb53bb26fe1a56e48b03 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mari=C3=A1n=20Rynik?= Date: Thu, 8 Oct 2020 13:01:56 +0200 Subject: [PATCH 17/65] added test for json yaml 
equality --- data/raw/json2yaml.py | 32 ------------------------------- source/tests/test_compat_input.py | 4 +++- 2 files changed, 3 insertions(+), 33 deletions(-) delete mode 100644 data/raw/json2yaml.py diff --git a/data/raw/json2yaml.py b/data/raw/json2yaml.py deleted file mode 100644 index 8f47da40dc..0000000000 --- a/data/raw/json2yaml.py +++ /dev/null @@ -1,32 +0,0 @@ -#!/usr/bin/env python3 - -import argparse -from pathlib import Path -import json -import yaml - - -def _main(): - parser = argparse.ArgumentParser( - description="convert json config file to yaml", - formatter_class=argparse.ArgumentDefaultsHelpFormatter) - - # get all json files in dir - jsons = [p for p in Path.cwd().glob("*.json")] - # use the newest as autosuggestion - jsons.sort(key=lambda x: x.stat().st_mtime, reverse=True) - jfile = jsons[0] - yfile = jfile.with_suffix(".yaml") - - parser.add_argument("INPUT", default=jfile, type=Path, nargs="?", - help="input json file") - parser.add_argument("OUTPUT", default=yfile, type=Path, nargs="?", - help="output yaml file") - args = parser.parse_args() - - with args.INPUT.open("r") as infile, args.OUTPUT.open("w") as outfile: - yaml.dump(json.load(infile), outfile, default_flow_style=False, - sort_keys=False) - -if __name__ == "__main__": - _main() diff --git a/source/tests/test_compat_input.py b/source/tests/test_compat_input.py index c0a29283dd..61eb16faf3 100644 --- a/source/tests/test_compat_input.py +++ b/source/tests/test_compat_input.py @@ -3,7 +3,9 @@ import unittest from deepmd.compat import convert_input_v0_v1 -from deepmd.common import j_loader +sys.path.append("/mnt/md0/OneDrive/dizertacka/code/deepmd-kit/source") +from train.common import j_loader +#from deepmd.common import j_loader class TestConvertInput (unittest.TestCase) : def test_convert_smth(self): From 2c5bd3243e1dd8ad1303c1c822b6e036dc9e40bb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mari=C3=A1n=20Rynik?= Date: Thu, 8 Oct 2020 13:30:40 +0200 Subject: [PATCH 18/65] minor bugfix --- source/tests/test_compat_input.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/source/tests/test_compat_input.py b/source/tests/test_compat_input.py index 61eb16faf3..c0a29283dd 100644 --- a/source/tests/test_compat_input.py +++ b/source/tests/test_compat_input.py @@ -3,9 +3,7 @@ import unittest from deepmd.compat import convert_input_v0_v1 -sys.path.append("/mnt/md0/OneDrive/dizertacka/code/deepmd-kit/source") -from train.common import j_loader -#from deepmd.common import j_loader +from deepmd.common import j_loader class TestConvertInput (unittest.TestCase) : def test_convert_smth(self): From 4df0eae03de17234d98424267bf5055dcb75ee6f Mon Sep 17 00:00:00 2001 From: Han Wang Date: Fri, 16 Oct 2020 19:19:26 +0800 Subject: [PATCH 19/65] change default rcut_smth --- source/train/DescrptSeA.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/train/DescrptSeA.py b/source/train/DescrptSeA.py index 1c93e92f02..edfe8bcc31 100644 --- a/source/train/DescrptSeA.py +++ b/source/train/DescrptSeA.py @@ -11,7 +11,7 @@ def __init__ (self, jdata): args = ClassArg()\ .add('sel', list, must = True) \ .add('rcut', float, default = 6.0) \ - .add('rcut_smth',float, default = 5.5) \ + .add('rcut_smth',float, default = 0.5) \ .add('neuron', list, default = [10, 20, 40]) \ .add('axis_neuron', int, default = 4, alias = 'n_axis_neuron') \ .add('resnet_dt',bool, default = False) \ From db257860d628efbabb7ef03da389bc51775c19e4 Mon Sep 17 00:00:00 2001 From: Han Wang Date: Fri, 16 Oct 2020 19:20:29 +0800 
Subject: [PATCH 20/65] change energy loss type from `std` to `ener` --- source/train/Trainer.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/source/train/Trainer.py b/source/train/Trainer.py index b6428c987e..e6392c6ea9 100644 --- a/source/train/Trainer.py +++ b/source/train/Trainer.py @@ -126,13 +126,13 @@ def _init_param(self, jdata): # infer loss type by fitting_type try : loss_param = jdata['loss'] - loss_type = loss_param.get('type', 'std') + loss_type = loss_param.get('type', 'ener') except: loss_param = None - loss_type = 'std' + loss_type = 'ener' if fitting_type == 'ener': - if loss_type == 'std': + if loss_type == 'ener': self.loss = EnerStdLoss(loss_param, starter_learning_rate = self.lr.start_lr()) elif loss_type == 'ener_dipole': self.loss = EnerDipoleLoss(loss_param, starter_learning_rate = self.lr.start_lr()) From ffc8cbdafadfe8af2741b116c56164a7c762c45c Mon Sep 17 00:00:00 2001 From: Han Wang Date: Fri, 16 Oct 2020 19:26:58 +0800 Subject: [PATCH 21/65] add type_one_side to se_r --- source/train/DescrptSeR.py | 36 ++++++++++++++++++++++++++++-------- 1 file changed, 28 insertions(+), 8 deletions(-) diff --git a/source/train/DescrptSeR.py b/source/train/DescrptSeR.py index 5c4af52d4d..5d4bc4e4de 100644 --- a/source/train/DescrptSeR.py +++ b/source/train/DescrptSeR.py @@ -11,11 +11,12 @@ def __init__ (self, jdata): args = ClassArg()\ .add('sel', list, must = True) \ .add('rcut', float, default = 6.0) \ - .add('rcut_smth',float, default = 5.5) \ + .add('rcut_smth',float, default = 0.5) \ .add('neuron', list, default = [10, 20, 40]) \ .add('resnet_dt',bool, default = False) \ .add('trainable',bool, default = True) \ .add('seed', int) \ + .add('type_one_side', bool, default = False) \ .add('exclude_types', list, default = []) \ .add('set_davg_zero', bool, default = False) \ .add("activation_function", str, default = "tanh") \ @@ -37,6 +38,7 @@ def __init__ (self, jdata): self.exclude_types.add((tt[0], tt[1])) self.exclude_types.add((tt[1], tt[0])) self.set_davg_zero = class_data['set_davg_zero'] + self.type_one_side = class_data['type_one_side'] # descrpt config self.sel_a = [ 0 for ii in range(len(self.sel_r)) ] @@ -143,6 +145,12 @@ def build (self, t_ntypes = tf.constant(self.ntypes, name = 'ntypes', dtype = tf.int32) + t_ndescrpt = tf.constant(self.ndescrpt, + name = 'ndescrpt', + dtype = tf.int32) + t_sel = tf.constant(self.sel_a, + name = 'sel', + dtype = tf.int32) self.t_avg = tf.get_variable('t_avg', davg.shape, dtype = global_tf_float_precision, @@ -171,6 +179,10 @@ def build (self, sel = self.sel_r) self.descrpt_reshape = tf.reshape(self.descrpt, [-1, self.ndescrpt]) + self.descrpt_reshape = tf.identity(self.descrpt_reshape, name = 'o_rmat') + self.descrpt_deriv = tf.identity(self.descrpt_deriv, name = 'o_rmat_deriv') + self.rij = tf.identity(self.rij, name = 'o_rij') + self.nlist = tf.identity(self.nlist, name = 'o_nlist') self.dout = self._pass_filter(self.descrpt_reshape, natoms, suffix = suffix, reuse = reuse, trainable = self.trainable) @@ -203,15 +215,23 @@ def _pass_filter(self, start_index = 0 inputs = tf.reshape(inputs, [-1, self.ndescrpt * natoms[0]]) output = [] - for type_i in range(self.ntypes): - inputs_i = tf.slice (inputs, - [ 0, start_index* self.ndescrpt], - [-1, natoms[2+type_i]* self.ndescrpt] ) + if not self.type_one_side: + for type_i in range(self.ntypes): + inputs_i = tf.slice (inputs, + [ 0, start_index* self.ndescrpt], + [-1, natoms[2+type_i]* self.ndescrpt] ) + inputs_i = tf.reshape(inputs_i, [-1, self.ndescrpt]) + 
layer = self._filter_r(tf.cast(inputs_i, self.filter_precision), type_i, name='filter_type_'+str(type_i)+suffix, natoms=natoms, reuse=reuse, seed = self.seed, trainable = trainable, activation_fn = self.filter_activation_fn) + layer = tf.reshape(layer, [tf.shape(inputs)[0], natoms[2+type_i] * self.get_dim_out()]) + output.append(layer) + start_index += natoms[2+type_i] + else : + inputs_i = inputs inputs_i = tf.reshape(inputs_i, [-1, self.ndescrpt]) - layer = self._filter_r(tf.cast(inputs_i, self.filter_precision), type_i, name='filter_type_'+str(type_i)+suffix, natoms=natoms, reuse=reuse, seed = self.seed, trainable = trainable, activation_fn = self.filter_activation_fn) - layer = tf.reshape(layer, [tf.shape(inputs)[0], natoms[2+type_i] * self.get_dim_out()]) + type_i = -1 + layer = self._filter_r(tf.cast(inputs_i, self.filter_precision), type_i, name='filter_type_all'+suffix, natoms=natoms, reuse=reuse, seed = self.seed, trainable = trainable, activation_fn = self.filter_activation_fn) + layer = tf.reshape(layer, [tf.shape(inputs)[0], natoms[0] * self.get_dim_out()]) output.append(layer) - start_index += natoms[2+type_i] output = tf.concat(output, axis = 1) return output From 44b889c1b7fe54dafd6d54bd459ef41459185ad9 Mon Sep 17 00:00:00 2001 From: marian-code Date: Fri, 16 Oct 2020 16:30:56 +0200 Subject: [PATCH 22/65] resolve requested changes clear the confusion caused by adding python style comments to json file --- README.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index abf3fcc35a..c16fe7dd8c 100644 --- a/README.md +++ b/README.md @@ -401,7 +401,8 @@ An example of `training` is "systems": ["../data1/", "../data2/"], "set_prefix": "set", "stop_batch": 1000000, - "batch_size": 1, # or "auto" or [10, 20] + "_comment": " batch_size can be supplied with, e.g. 1, or auto (string) or [10, 20]" + "batch_size": 1, "seed": 1, @@ -409,7 +410,8 @@ An example of `training` is "_comment": " frequencies counted in batch", "disp_file": "lcurve.out", "disp_freq": 100, - "numb_test": 10, # or "XX%" or [10, 20] + "_comment": " numb_test can be supplied with, e.g. 
1, or XX% (string) or [10, 20]" + "numb_test": 10, "save_freq": 1000, "save_ckpt": "model.ckpt", "load_ckpt": "model.ckpt", From b576caa62e34a8b62d44304c07777ebd755d0960 Mon Sep 17 00:00:00 2001 From: Han Wang Date: Sat, 17 Oct 2020 23:04:07 +0800 Subject: [PATCH 23/65] fix the bug when getting an NoneType value --- source/train/common.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/source/train/common.py b/source/train/common.py index 83f5e6ecf3..4ddf41b229 100644 --- a/source/train/common.py +++ b/source/train/common.py @@ -111,6 +111,8 @@ def add (self, def _add_single(self, key, data) : vtype = type(data) + if data is None: + return data if not(vtype in self.arg_dict[key]['types']) : # try the type convertion to the first listed type try : From 041c45baaa895db0c62ab8c2d4d2e60a50c0ed3a Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Fri, 7 Aug 2020 20:00:30 -0400 Subject: [PATCH 24/65] test on tensorflow 2.3 --- .travis.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.travis.yml b/.travis.yml index dd4b0eed9e..f72c76fb79 100644 --- a/.travis.yml +++ b/.travis.yml @@ -66,12 +66,12 @@ jobs: env: - CC=gcc-5 - CXX=g++-5 - - TENSORFLOW_VERSION=2.1 + - TENSORFLOW_VERSION=2.3 - python: 3.7 env: - CC=gcc-8 - CXX=g++-8 - - TENSORFLOW_VERSION=2.1 + - TENSORFLOW_VERSION=2.3 - stage: build whls services: docker env: From 9f5332699d493effcc817845a9fe1c7c31a859f7 Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Sat, 17 Oct 2020 19:01:13 -0400 Subject: [PATCH 25/65] bump tf version to 2.3; add tf to requirement if it's not detected; don't add cmake when it's installed --- .travis.yml | 10 +++------- README.md | 4 ++-- setup.py | 29 +++++++++++++++++++---------- 3 files changed, 24 insertions(+), 19 deletions(-) diff --git a/.travis.yml b/.travis.yml index f72c76fb79..a9c42a696e 100644 --- a/.travis.yml +++ b/.travis.yml @@ -77,22 +77,18 @@ jobs: env: - TWINE_USERNAME=__token__ - CIBW_BUILD="cp36-* cp37-*" - - CIBW_BEFORE_BUILD="pip install tensorflow && sed -i 's/libresolv.so.2\"/libresolv.so.2\", \"libtensorflow_framework.so.2\"/g' \$(find / -name policy.json)" + - CIBW_BEFORE_BUILD="sed -i 's/libresolv.so.2\"/libresolv.so.2\", \"libtensorflow_framework.so.2\"/g' \$(find / -name policy.json)" - CIBW_SKIP="*-win32 *-manylinux_i686" - CC=gcc-7 - CXX=g++-7 - - TENSORFLOW_VERSION=2.1 + - TENSORFLOW_VERSION=2.3 install: - - python -m pip install twine cibuildwheel==1.1.0 scikit-build setuptools_scm + - python -m pip install twine cibuildwheel==1.6.3 scikit-build setuptools_scm script: - python -m cibuildwheel --output-dir wheelhouse - python setup.py sdist after_success: - if [[ $TRAVIS_TAG ]]; then python -m twine upload wheelhouse/*; python -m twine upload dist/*.tar.gz; fi -before_install: - #- pip install --upgrade pip - - pip install --upgrade setuptools - - pip install tensorflow==$TENSORFLOW_VERSION install: - pip install --verbose .[test] script: diff --git a/README.md b/README.md index 5862ed40b5..0d33c8bd12 100644 --- a/README.md +++ b/README.md @@ -133,7 +133,7 @@ We follow the virtual environment approach to install the tensorflow's Python in virtualenv -p python3 $tensorflow_venv source $tensorflow_venv/bin/activate pip install --upgrade pip -pip install --upgrade tensorflow==2.1.0 +pip install --upgrade tensorflow==2.3.0 ``` It is notice that everytime a new shell is started and one wants to use `DeePMD-kit`, the virtual environment should be activated by ```bash @@ -149,7 +149,7 @@ virtualenv -p python3.7 $tensorflow_venv ``` If one does not need the GPU 
support of deepmd-kit and is concerned about package size, the CPU-only version of tensorflow should be installed by ```bash -pip install --upgrade tensorflow-cpu==2.1.0 +pip install --upgrade tensorflow-cpu==2.3.0 ``` To verify the installation, run ```bash diff --git a/setup.py b/setup.py index 117ccec2a2..ca5a589f3c 100644 --- a/setup.py +++ b/setup.py @@ -4,7 +4,7 @@ from setuptools_scm import get_version from packaging.version import LegacyVersion from os import path, makedirs -import os, imp, sys, platform, sysconfig +import os, importlib, sys, platform, sysconfig readme_file = path.join(path.dirname(path.abspath(__file__)), 'README.md') @@ -15,19 +15,28 @@ with open(readme_file) as f: readme = f.read() -try: - tf_install_dir = imp.find_module('tensorflow')[1] -except ImportError: - site_packages_path = path.join(path.dirname(path.__file__), 'site-packages') - tf_install_dir = imp.find_module('tensorflow', [site_packages_path])[1] +install_requires=['numpy', 'scipy', 'pyyaml'] +setup_requires=['setuptools_scm', 'scikit-build'] +def find_tf(path): + return importlib.machinery.FileFinder(path).find_spec("tensorflow") -install_requires=['numpy', 'scipy', 'pyyaml'] -setup_requires=['setuptools_scm', 'scikit-build', 'cmake'] +tf_spec = importlib.util.find_spec("tensorflow") +if tf_spec: + tf_install_dir = tf_spec.submodule_search_locations[1] +else: + site_packages_path = path.join(path.dirname(path.__file__), 'site-packages') + tf_spec = importlib.machinery.FileFinder(path).find_spec("tensorflow") + if tf_spec: + tf_install_dir = tf_spec.submodule_search_locations[1] + else: + tf_version = os.environ.get('TENSORFLOW_VERSION', '2.3') + setup_requires.append("tensorflow==" + tf_version) + install_requires.append("tensorflow==" + tf_version) -# add cmake as a build requirement if cmake>3.0 is not installed +# add cmake as a build requirement if cmake>3.7 is not installed try: - if LegacyVersion(get_cmake_version()) < LegacyVersion("3.0"): + if LegacyVersion(get_cmake_version()) < LegacyVersion("3.7"): setup_requires.append('cmake') except SKBuildError: setup_requires.append('cmake') From d77ffd3c29dc34f463644209550eed87d218b9a3 Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Sat, 17 Oct 2020 19:05:50 -0400 Subject: [PATCH 26/65] remove useless code --- setup.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/setup.py b/setup.py index ca5a589f3c..d599ad4adc 100644 --- a/setup.py +++ b/setup.py @@ -18,9 +18,6 @@ install_requires=['numpy', 'scipy', 'pyyaml'] setup_requires=['setuptools_scm', 'scikit-build'] -def find_tf(path): - return importlib.machinery.FileFinder(path).find_spec("tensorflow") - tf_spec = importlib.util.find_spec("tensorflow") if tf_spec: tf_install_dir = tf_spec.submodule_search_locations[1] From d8a4b77dbdae1c824dd2eb44c839871e8a4a73ef Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Sat, 17 Oct 2020 22:55:26 -0400 Subject: [PATCH 27/65] fix bug --- .travis.yml | 2 ++ setup.py | 4 +++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index a9c42a696e..380f151b39 100644 --- a/.travis.yml +++ b/.travis.yml @@ -89,6 +89,8 @@ jobs: - python setup.py sdist after_success: - if [[ $TRAVIS_TAG ]]; then python -m twine upload wheelhouse/*; python -m twine upload dist/*.tar.gz; fi +before_install: + - pip install tensorflow==$TENSORFLOW_VERSION install: - pip install --verbose .[test] script: diff --git a/setup.py b/setup.py index ca5a589f3c..0015070266 100644 --- a/setup.py +++ b/setup.py @@ -21,6 +21,7 @@ def find_tf(path): return 
importlib.machinery.FileFinder(path).find_spec("tensorflow") +extras_require = dict() tf_spec = importlib.util.find_spec("tensorflow") if tf_spec: tf_install_dir = tf_spec.submodule_search_locations[1] @@ -32,7 +33,7 @@ def find_tf(path): else: tf_version = os.environ.get('TENSORFLOW_VERSION', '2.3') setup_requires.append("tensorflow==" + tf_version) - install_requires.append("tensorflow==" + tf_version) + extras_require = {"cpu": "tensorflow_cpu==" + tf_version, "gpu": "tensorflow==" + tf_version} # add cmake as a build requirement if cmake>3.7 is not installed try: @@ -74,6 +75,7 @@ def find_tf(path): cmake_minimum_required_version='3.0', extras_require={ 'test': ['dpdata>=0.1.9'], + **extras_require, }, entry_points={ 'console_scripts': ['dp = deepmd.main:main'] From f0da8521f667b8e056bca4fe6759e85f97e9f1b5 Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Sat, 17 Oct 2020 23:57:40 -0400 Subject: [PATCH 28/65] bugfix --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 150130d38d..9b6c7cca2f 100644 --- a/setup.py +++ b/setup.py @@ -24,7 +24,7 @@ tf_install_dir = tf_spec.submodule_search_locations[1] else: site_packages_path = path.join(path.dirname(path.__file__), 'site-packages') - tf_spec = importlib.machinery.FileFinder(path).find_spec("tensorflow") + tf_spec = importlib.machinery.FileFinder(site_packages_path).find_spec("tensorflow") if tf_spec: tf_install_dir = tf_spec.submodule_search_locations[1] else: From e760f6c1293d0a4957792ad8072631de6177d2ba Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Sun, 18 Oct 2020 09:29:56 -0400 Subject: [PATCH 29/65] bugfix --- setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 9b6c7cca2f..10000aee8f 100644 --- a/setup.py +++ b/setup.py @@ -21,12 +21,12 @@ extras_require = dict() tf_spec = importlib.util.find_spec("tensorflow") if tf_spec: - tf_install_dir = tf_spec.submodule_search_locations[1] + tf_install_dir = tf_spec.submodule_search_locations[0] else: site_packages_path = path.join(path.dirname(path.__file__), 'site-packages') tf_spec = importlib.machinery.FileFinder(site_packages_path).find_spec("tensorflow") if tf_spec: - tf_install_dir = tf_spec.submodule_search_locations[1] + tf_install_dir = tf_spec.submodule_search_locations[0] else: tf_version = os.environ.get('TENSORFLOW_VERSION', '2.3') setup_requires.append("tensorflow==" + tf_version) From 99492e38f6a07524a268eb83b23e12528f93baf5 Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Sun, 18 Oct 2020 11:15:16 -0400 Subject: [PATCH 30/65] add tf_install_dir --- setup.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 10000aee8f..6f9adc35e7 100644 --- a/setup.py +++ b/setup.py @@ -4,7 +4,9 @@ from setuptools_scm import get_version from packaging.version import LegacyVersion from os import path, makedirs -import os, importlib, sys, platform, sysconfig +import os, importlib +import pkg_resources +from distutils.util import get_platform readme_file = path.join(path.dirname(path.abspath(__file__)), 'README.md') @@ -31,6 +33,10 @@ tf_version = os.environ.get('TENSORFLOW_VERSION', '2.3') setup_requires.append("tensorflow==" + tf_version) extras_require = {"cpu": "tensorflow_cpu==" + tf_version, "gpu": "tensorflow==" + tf_version} + tf_install_dir = path.join(path.dirname(path.abspath(__file__)), '.egg', + pkg_resources.Distribution(project_name="tensorflow", version=tf_version, + platform=get_platform()).egg_name(), + 'tensorflow') # add 
cmake as a build requirement if cmake>3.7 is not installed try: From fccd45b542d5768951b1098c120e485c71ace040 Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Sun, 18 Oct 2020 17:47:37 -0400 Subject: [PATCH 31/65] remove before_install --- .travis.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.travis.yml b/.travis.yml index 380f151b39..a9c42a696e 100644 --- a/.travis.yml +++ b/.travis.yml @@ -89,8 +89,6 @@ jobs: - python setup.py sdist after_success: - if [[ $TRAVIS_TAG ]]; then python -m twine upload wheelhouse/*; python -m twine upload dist/*.tar.gz; fi -before_install: - - pip install tensorflow==$TENSORFLOW_VERSION install: - pip install --verbose .[test] script: From 609a3238783e2e93283591946505e53b1db58c43 Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Sun, 18 Oct 2020 18:09:33 -0400 Subject: [PATCH 32/65] remove verbose --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index a9c42a696e..f6b4c0d137 100644 --- a/.travis.yml +++ b/.travis.yml @@ -90,6 +90,6 @@ jobs: after_success: - if [[ $TRAVIS_TAG ]]; then python -m twine upload wheelhouse/*; python -m twine upload dist/*.tar.gz; fi install: - - pip install --verbose .[test] + - pip install .[test] script: - cd source/tests && python -m unittest From 7dc6cef72410890c8c636bbdb4b2781b63659116 Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Sun, 18 Oct 2020 20:52:30 -0400 Subject: [PATCH 33/65] install tf --- .travis.yml | 2 +- setup.py | 7 +++++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/.travis.yml b/.travis.yml index f6b4c0d137..b434af6819 100644 --- a/.travis.yml +++ b/.travis.yml @@ -90,6 +90,6 @@ jobs: after_success: - if [[ $TRAVIS_TAG ]]; then python -m twine upload wheelhouse/*; python -m twine upload dist/*.tar.gz; fi install: - - pip install .[test] + - pip install .[cpu,test] script: - cd source/tests && python -m unittest diff --git a/setup.py b/setup.py index 6f9adc35e7..4ff61dfd66 100644 --- a/setup.py +++ b/setup.py @@ -20,7 +20,7 @@ install_requires=['numpy', 'scipy', 'pyyaml'] setup_requires=['setuptools_scm', 'scikit-build'] -extras_require = dict() +extras_require = {"cpu": "", "gpu": ""} tf_spec = importlib.util.find_spec("tensorflow") if tf_spec: tf_install_dir = tf_spec.submodule_search_locations[0] @@ -32,7 +32,10 @@ else: tf_version = os.environ.get('TENSORFLOW_VERSION', '2.3') setup_requires.append("tensorflow==" + tf_version) - extras_require = {"cpu": "tensorflow_cpu==" + tf_version, "gpu": "tensorflow==" + tf_version} + if LegacyVersion(tf_version) < LegacyVersion("1.15") or (LegacyVersion(tf_version) >= LegacyVersion("2.0") and LegacyVersion(tf_version) < LegacyVersion("2.1")): + extras_require = {"cpu": "tensorflow==" + tf_version, "gpu": "tensorflow_gpu==" + tf_version} + else: + extras_require = {"cpu": "tensorflow_cpu==" + tf_version, "gpu": "tensorflow==" + tf_version} tf_install_dir = path.join(path.dirname(path.abspath(__file__)), '.egg', pkg_resources.Distribution(project_name="tensorflow", version=tf_version, platform=get_platform()).egg_name(), From 98710c440f469f807843c6aaf8e73389f4ca3aaa Mon Sep 17 00:00:00 2001 From: Han Wang Date: Mon, 19 Oct 2020 08:53:39 +0800 Subject: [PATCH 34/65] check and generate doc for the options in the training input file. 
--- doc/train-input.rst | 585 ++++++++++++++++++++++++++++++++++++ setup.py | 2 +- source/train/CMakeLists.txt | 2 +- source/train/argcheck.py | 364 ++++++++++++++++++++++ source/train/doc.py | 5 + source/train/main.py | 9 + source/train/train.py | 7 + 7 files changed, 972 insertions(+), 2 deletions(-) create mode 100644 doc/train-input.rst create mode 100644 source/train/argcheck.py create mode 100644 source/train/doc.py diff --git a/doc/train-input.rst b/doc/train-input.rst new file mode 100644 index 0000000000..7a53b8a979 --- /dev/null +++ b/doc/train-input.rst @@ -0,0 +1,585 @@ +model: ``dict`` + Argument path: model + + type_map: ``list``, optional + Argument path: model/type_map + A list of strings. Give the name to each type of atoms. + + data_stat_nbatch: ``int``, optional + Argument path: model/data_stat_nbatch + The model determines the normalization from the statistics of the + data. This key specifies the number of `frames` in each `system` used + for statistics. + + descriptor: ``dict`` + Argument path: model/descriptor + The descriptor of atomic environment. + + Depending on the value of *type*, different sub args are accepted. + + type: ``str`` + Argument path: model/descriptor/type + The type of the descritpor. Valid types are `loc_frame`, `se_a`, + `se_r` and `se_ar`. + - `loc_frame`: Defines a local frame at each + atom, and the compute the descriptor as local coordinates under this + frame. + - `se_a`: Used by the smooth edition of Deep Potential. The + full relative coordinates are used to construct the descriptor. + - + `se_r`: Used by the smooth edition of Deep Potential. Only the + distance between atoms is used to construct the descriptor. + - `se_ar`: + A hybrid of `se_a` and `se_r`. Typically `se_a` has a smaller cut-off + while the `se_r` has a larger cut-off. + + When *type* is set to ``loc_frame``: + + sel_a: ``list`` + Argument path: model/descriptor/loc_frame/sel_a + A list of integers. The length of the list should be the same as the + number of atom types in the system. `sel_a[i]` gives the selected + number of type-i neighbors. The full relative coordinates of the + neighbors are used by the descriptor. + + sel_r: ``list`` + Argument path: model/descriptor/loc_frame/sel_r + A list of integers. The length of the list should be the same as the + number of atom types in the system. `sel_r[i]` gives the selected + number of type-i neighbors. Only relative distance of the neighbors + are used by the descriptor. sel_a[i] + sel_r[i] is recommended to be + larger than the maximally possible number of type-i neighbors in the + cut-off radius. + + rcut: ``float``, optional + Argument path: model/descriptor/loc_frame/rcut + The cut-off radius. The default value is 6.0 + + axis_rule: ``list`` + Argument path: model/descriptor/loc_frame/axis_rule + A list of integers. The length should be 6 times of the number of + types. + + - axis_rule[i*6+0]: class of the atom defining the first axis + of type-i atom. 0 for neighbors with full coordinates and 1 for + neighbors only with relative distance. + + - axis_rule[i*6+1]: type of + the atom defining the first axis of type-i atom. + + - axis_rule[i*6+2]: + index of the axis atom defining the first axis. Note that the + neighbors with the same class and type are sorted according to their + relative distance. + - axis_rule[i*6+3]: class of the atom defining the + first axis of type-i atom. 0 for neighbors with full coordinates and 1 + for neighbors only with relative distance. 
+ - axis_rule[i*6+4]: type of + the atom defining the second axis of type-i atom. + - axis_rule[i*6+5]: + class of the atom defining the second axis of type-i atom. 0 for + neighbors with full coordinates and 1 for neighbors only with relative + distance. + + When *type* is set to ``se_a``: + + sel: ``list`` + Argument path: model/descriptor/se_a/sel + A list of integers. The length of the list should be the same as the + number of atom types in the system. `sel[i]` gives the selected number + of type-i neighbors. `sel[i]` is recommended to be larger than the + maximally possible number of type-i neighbors in the cut-off radius. + + rcut: ``float``, optional + Argument path: model/descriptor/se_a/rcut + The cut-off radius. + + rcut_smth: ``float``, optional + Argument path: model/descriptor/se_a/rcut_smth + Where to start smoothing. For example the 1/r term is smoothed from + `rcut` to `rcut_smth` + + neuron: ``list``, optional + Argument path: model/descriptor/se_a/neuron + Number of neurons in each hidden layers of the embedding net. When two + layers are of the same size or one layer is twice as large as the + previous layer, a skip connection is built. + + axis_neuron: ``int``, optional + Argument path: model/descriptor/se_a/axis_neuron + Size of the submatrix of G (embedding matrix). + + activation_function: ``str``, optional + Argument path: model/descriptor/se_a/activation_function + The activation function in the embedding net. Supported activation + functions are "relu", "relu6", "softplus", "sigmoid", "tanh", "gelu". + + resnet_dt: ``bool``, optional + Argument path: model/descriptor/se_a/resnet_dt + Whether to use a "Timestep" in the skip connection + + type_one_side: ``bool``, optional + Argument path: model/descriptor/se_a/type_one_side + Try to build N_types embedding nets. Otherwise, building N_types^2 + embedding nets + + precision: ``str``, optional + Argument path: model/descriptor/se_a/precision + The precision of the embedding net parameters, supported options are + "float64", "float32", "float16". + + trainable: ``bool``, optional + Argument path: model/descriptor/se_a/trainable + If the parameters in the embedding net is trainable + + seed: ``int``|``NoneType``, optional + Argument path: model/descriptor/se_a/seed + Random seed for parameter initialization + + exclude_types: ``list``, optional + Argument path: model/descriptor/se_a/exclude_types + The Excluded types + + set_davg_zero: ``bool``, optional + Argument path: model/descriptor/se_a/set_davg_zero + Set the normalization average to zero. This option should be set when + `atom_ener` in the energy fitting is used + + When *type* is set to ``se_r``: + + sel: ``list`` + Argument path: model/descriptor/se_r/sel + A list of integers. The length of the list should be the same as the + number of atom types in the system. `sel[i]` gives the selected number + of type-i neighbors. `sel[i]` is recommended to be larger than the + maximally possible number of type-i neighbors in the cut-off radius. + + rcut: ``float``, optional + Argument path: model/descriptor/se_r/rcut + The cut-off radius. + + rcut_smth: ``float``, optional + Argument path: model/descriptor/se_r/rcut_smth + Where to start smoothing. For example the 1/r term is smoothed from + `rcut` to `rcut_smth` + + neuron: ``list``, optional + Argument path: model/descriptor/se_r/neuron + Number of neurons in each hidden layers of the embedding net. 
When two + layers are of the same size or one layer is twice as large as the + previous layer, a skip connection is built. + + activation_function: ``str``, optional + Argument path: model/descriptor/se_r/activation_function + The activation function in the embedding net. Supported activation + functions are "relu", "relu6", "softplus", "sigmoid", "tanh", "gelu". + + resnet_dt: ``bool``, optional + Argument path: model/descriptor/se_r/resnet_dt + Whether to use a "Timestep" in the skip connection + + type_one_side: ``bool``, optional + Argument path: model/descriptor/se_r/type_one_side + Try to build N_types embedding nets. Otherwise, building N_types^2 + embedding nets + + precision: ``str``, optional + Argument path: model/descriptor/se_r/precision + The precision of the embedding net parameters, supported options are + "float64", "float32", "float16". + + trainable: ``bool``, optional + Argument path: model/descriptor/se_r/trainable + If the parameters in the embedding net is trainable + + seed: ``int``|``NoneType``, optional + Argument path: model/descriptor/se_r/seed + Random seed for parameter initialization + + exclude_types: ``list``, optional + Argument path: model/descriptor/se_r/exclude_types + The Excluded types + + set_davg_zero: ``bool``, optional + Argument path: model/descriptor/se_r/set_davg_zero + Set the normalization average to zero. This option should be set when + `atom_ener` in the energy fitting is used + + When *type* is set to ``se_ar``: + + a: ``dict`` + Argument path: model/descriptor/se_ar/a + The parameters of descriptor `se_a` + + r: ``dict`` + Argument path: model/descriptor/se_ar/r + The parameters of descriptor `se_r` + + fitting_net: ``dict`` + Argument path: model/fitting_net + The fitting of physical properties. + + Depending on the value of *type*, different sub args are accepted. + + type: ``str``, default: ``ener`` + Argument path: model/fitting_net/type + The type of the fitting. Valid types are `ener`, `dipole`, `polar` and + `global_polar`. + - `ener`: Fit an energy model (potential energy + surface). + - `dipole`: Fit an atomic dipole model. Atomic dipole labels + for all the selected atoms (see `sel_type`) should be provided by + `dipole.npy` in each data system. The file has number of frames lines + and 3 times of number of selected atoms columns. + - `polar`: Fit an + atomic polarizability model. Atomic polarizability labels for all the + selected atoms (see `sel_type`) should be provided by + `polarizability.npy` in each data system. The file has number of + frames lines and 9 times of number of selected atoms columns. + - + `global_polar`: Fit a polarizability model. Polarizability labels + should be provided by `polarizability.npy` in each data system. The + file has number of frames lines and 9 columns. + + When *type* is set to ``ener``: + + numb_fparam: ``int``, optional + Argument path: model/fitting_net/ener/numb_fparam + The dimension of the frame parameter. If set to >0, file `fparam.npy` + should be included to provided the input fparams. + + numb_aparam: ``int``, optional + Argument path: model/fitting_net/ener/numb_aparam + The dimension of the atomic parameter. If set to >0, file `aparam.npy` + should be included to provided the input aparams. + + neuron: ``list``, optional + Argument path: model/fitting_net/ener/neuron + The number of neurons in each hidden layers of the fitting net. When + two hidden layers are of the same size, a skip connection is built. 
+ + activation_function: ``str``, optional + Argument path: model/fitting_net/ener/activation_function + The activation function in the fitting net. Supported activation + functions are "relu", "relu6", "softplus", "sigmoid", "tanh", "gelu". + + precision: ``str``, optional + Argument path: model/fitting_net/ener/precision + The precision of the fitting net parameters, supported options are + "float64", "float32", "float16". + + resnet_dt: ``bool``, optional + Argument path: model/fitting_net/ener/resnet_dt + Whether to use a "Timestep" in the skip connection + + trainable: ``bool``|``list``, optional + Argument path: model/fitting_net/ener/trainable + Whether the parameters in the fitting net are trainable. This option + can be + - bool: True if all parameters of the fitting net are + trainable, False otherwise. + - list of bool: Specifies if each layer is + trainable. Since the fitting net is composed by hidden layers followed + by a output layer, the length of tihs list should be equal to + len(`neuron`)+1. + + rcond: ``float``, optional + Argument path: model/fitting_net/ener/rcond + The condition number used to determine the inital energy shift for + each type of atoms. + + seed: ``int``|``NoneType``, optional + Argument path: model/fitting_net/ener/seed + Random seed for parameter initialization of the fitting net + + atom_ener: ``list``, optional + Argument path: model/fitting_net/ener/atom_ener + Specify the atomic energy in vacuum for each type + + When *type* is set to ``dipole``: + + neuron: ``list``, optional + Argument path: model/fitting_net/dipole/neuron + The number of neurons in each hidden layers of the fitting net. When + two hidden layers are of the same size, a skip connection is built. + + activation_function: ``str``, optional + Argument path: model/fitting_net/dipole/activation_function + The activation function in the fitting net. Supported activation + functions are "relu", "relu6", "softplus", "sigmoid", "tanh", "gelu". + + resnet_dt: ``bool``, optional + Argument path: model/fitting_net/dipole/resnet_dt + Whether to use a "Timestep" in the skip connection + + precision: ``str``, optional + Argument path: model/fitting_net/dipole/precision + The precision of the fitting net parameters, supported options are + "float64", "float32", "float16". + + sel_type: ``int``|``NoneType``|``list``, optional + Argument path: model/fitting_net/dipole/sel_type + The atom types for which the atomic dipole will be provided. If not + set, all types will be selected. + + seed: ``int``|``NoneType``, optional + Argument path: model/fitting_net/dipole/seed + Random seed for parameter initialization of the fitting net + + When *type* is set to ``polar``: + + neuron: ``list``, optional + Argument path: model/fitting_net/polar/neuron + The number of neurons in each hidden layers of the fitting net. When + two hidden layers are of the same size, a skip connection is built. + + activation_function: ``str``, optional + Argument path: model/fitting_net/polar/activation_function + The activation function in the fitting net. Supported activation + functions are "relu", "relu6", "softplus", "sigmoid", "tanh", "gelu". + + resnet_dt: ``bool``, optional + Argument path: model/fitting_net/polar/resnet_dt + Whether to use a "Timestep" in the skip connection + + precision: ``str``, optional + Argument path: model/fitting_net/polar/precision + The precision of the fitting net parameters, supported options are + "float64", "float32", "float16". 
+ + fit_diag: ``bool``, optional + Argument path: model/fitting_net/polar/fit_diag + The diagonal part of the polarizability matrix will be shifted by + `fit_diag`. The shift operation is carried out after `scale`. + + scale: ``float``|``list``, optional + Argument path: model/fitting_net/polar/scale + The output of the fitting net (polarizability matrix) will be scaled + by `scale` + + diag_shift: ``float``|``list``, optional + Argument path: model/fitting_net/polar/diag_shift + The diagonal part of the polarizability matrix will be shifted by + `fit_diag`. The shift operation is carried out after `scale`. + + sel_type: ``int``|``NoneType``|``list``, optional + Argument path: model/fitting_net/polar/sel_type + The atom types for which the atomic polarizability will be provided. + If not set, all types will be selected. + + seed: ``int``|``NoneType``, optional + Argument path: model/fitting_net/polar/seed + Random seed for parameter initialization of the fitting net + + When *type* is set to ``global_polar``: + + neuron: ``list``, optional + Argument path: model/fitting_net/global_polar/neuron + The number of neurons in each hidden layers of the fitting net. When + two hidden layers are of the same size, a skip connection is built. + + activation_function: ``str``, optional + Argument path: model/fitting_net/global_polar/activation_function + The activation function in the fitting net. Supported activation + functions are "relu", "relu6", "softplus", "sigmoid", "tanh", "gelu". + + resnet_dt: ``bool``, optional + Argument path: model/fitting_net/global_polar/resnet_dt + Whether to use a "Timestep" in the skip connection + + precision: ``str``, optional + Argument path: model/fitting_net/global_polar/precision + The precision of the fitting net parameters, supported options are + "float64", "float32", "float16". + + fit_diag: ``bool``, optional + Argument path: model/fitting_net/global_polar/fit_diag + The diagonal part of the polarizability matrix will be shifted by + `fit_diag`. The shift operation is carried out after `scale`. + + scale: ``float``|``list``, optional + Argument path: model/fitting_net/global_polar/scale + The output of the fitting net (polarizability matrix) will be scaled + by `scale` + + diag_shift: ``float``|``list``, optional + Argument path: model/fitting_net/global_polar/diag_shift + The diagonal part of the polarizability matrix will be shifted by + `fit_diag`. The shift operation is carried out after `scale`. + + sel_type: ``int``|``NoneType``|``list``, optional + Argument path: model/fitting_net/global_polar/sel_type + The atom types for which the atomic polarizability will be provided. + If not set, all types will be selected. + + seed: ``int``|``NoneType``, optional + Argument path: model/fitting_net/global_polar/seed + Random seed for parameter initialization of the fitting net + +loss: ``dict`` + Argument path: loss + The definition of loss function. The type of the loss depends on the + type of the fitting. For fitting type `ener`, the prefactors before + energy, force, virial and atomic energy losses may be provided. For + fitting type `dipole`, `polar` and `global_polar`, the loss may be an + empty `dict` or unset. + + Depending on the value of *type*, different sub args are accepted. + + type: ``str``, default: ``ener`` + Argument path: loss/type + The type of the loss. For fitting type `ener`, the loss type should be + set to `ener` or left unset. For tensorial fitting types `dipole`, + `polar` and `global_polar`, the type should be left unset. + \. 
+ + When *type* is set to ``ener``: + + start_pref_e: ``float``|``int``, optional + Argument path: loss/ener/start_pref_e + The prefactor of energy loss at the start of the training. Should be + larger than or equal to 0. If set to none-zero value, the energy label + should be provided by file energy.npy in each data system. If both + start_pref_energy and limit_pref_energy are set to 0, then the energy + will be ignored. + + limit_pref_e: ``float``|``int``, optional + Argument path: loss/ener/limit_pref_e + The prefactor of energy loss at the limit of the training, Should be + larger than or equal to 0. i.e. the training step goes to infinity. + + start_pref_f: ``float``|``int``, optional + Argument path: loss/ener/start_pref_f + The prefactor of force loss at the start of the training. Should be + larger than or equal to 0. If set to none-zero value, the force label + should be provided by file force.npy in each data system. If both + start_pref_force and limit_pref_force are set to 0, then the force + will be ignored. + + limit_pref_f: ``float``|``int``, optional + Argument path: loss/ener/limit_pref_f + The prefactor of force loss at the limit of the training, Should be + larger than or equal to 0. i.e. the training step goes to infinity. + + start_pref_v: ``float``|``int``, optional + Argument path: loss/ener/start_pref_v + The prefactor of virial loss at the start of the training. Should be + larger than or equal to 0. If set to none-zero value, the virial label + should be provided by file virial.npy in each data system. If both + start_pref_virial and limit_pref_virial are set to 0, then the virial + will be ignored. + + limit_pref_v: ``float``|``int``, optional + Argument path: loss/ener/limit_pref_v + The prefactor of virial loss at the limit of the training, Should be + larger than or equal to 0. i.e. the training step goes to infinity. + + start_pref_ae: ``float``|``int``, optional + Argument path: loss/ener/start_pref_ae + The prefactor of virial loss at the start of the training. Should be + larger than or equal to 0. If set to none-zero value, the virial label + should be provided by file virial.npy in each data system. If both + start_pref_virial and limit_pref_virial are set to 0, then the virial + will be ignored. + + limit_pref_ae: ``float``|``int``, optional + Argument path: loss/ener/limit_pref_ae + The prefactor of virial loss at the limit of the training, Should be + larger than or equal to 0. i.e. the training step goes to infinity. + + relative_f: ``float``|``NoneType``, optional + Argument path: loss/ener/relative_f + If provided, relative force error will be used in the loss. The + difference of force will be normalized by the magnitude of the force + in the label with a shift given by `relative_f`, i.e. DF_i / ( || F || + + relative_f ) with DF denoting the difference between prediction and + label and || F || denoting the L2 norm of the label. + +learning_rate: ``dict`` + Argument path: learning_rate + The learning rate options + + start_lr: ``float``, optional + Argument path: learning_rate/start_lr + The learning rate the start of the training. + + stop_lr: ``float``, optional + Argument path: learning_rate/stop_lr + The desired learning rate at the end of the training. + + decay_steps: ``int``, optional + Argument path: learning_rate/decay_steps + The learning rate is decaying every this number of training steps. 
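The `exp` schedule described above is fully determined by `start_lr`, `stop_lr` and `decay_steps` (plus the total number of training batches). A hedged sketch of one common closed form follows; the exact expression used by the LearningRate class is not shown in this patch series, so treat the formula as an assumption, though it reproduces the example values used elsewhere in these inputs (start_lr 0.001 and stop_lr 3.51e-08 over one million batches give a decay rate of about 0.95 per 5000 steps).

```python
def exp_learning_rate(step, start_lr=0.001, stop_lr=3.51e-8,
                      decay_steps=5000, stop_batch=1000000):
    # choose the decay rate so that the schedule reaches stop_lr at stop_batch
    decay_rate = (stop_lr / start_lr) ** (decay_steps / stop_batch)
    return start_lr * decay_rate ** (step // decay_steps)

print(exp_learning_rate(0))        # 0.001
print(exp_learning_rate(1000000))  # roughly 3.5e-08
```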
+ +training: ``dict`` + Argument path: training + The training options + + systems: ``list``|``str`` + Argument path: training/systems + The data systems. This key can be provided with a list that specifies + the systems, or be provided with a string by which the prefix of all + systems is given and the list of the systems is automatically + generated. + + set_prefix: ``str``, optional + Argument path: training/set_prefix + The prefix of the sets in the systems. + + stop_batch: ``int`` + Argument path: training/stop_batch + Number of training batches. Each training uses one batch of data. + + batch_size: ``int``|``list``|``str``, optional + Argument path: training/batch_size + This key can be + - list: the length of which is the same as the + `systems`. The batch size of each system is given by the elements of + the list. + - int: all `systems` use the same batch size. + - string + "auto": automatically determines the batch size so that the batch_size + times the number of atoms in the system is no less than 32. + - string + "auto:N": automatically determines the batch size so that the + batch_size times the number of atoms in the system is no less than N. + + seed: ``int``|``NoneType``, optional + Argument path: training/seed + The random seed for training. + + disp_file: ``str``, optional + Argument path: training/disp_file + The file for printing the learning curve. + + disp_freq: ``int``, optional + Argument path: training/disp_freq + The frequency of printing the learning curve. + + numb_test: ``int``, optional + Argument path: training/numb_test + Number of frames used for the test during training. + + save_freq: ``int``, optional + Argument path: training/save_freq + The frequency of saving check points. + + save_ckpt: ``str``, optional + Argument path: training/save_ckpt + The file name for saving the check point. + + disp_training: ``bool``, optional + Argument path: training/disp_training + Displaying verbose information during training. + + time_training: ``bool``, optional + Argument path: training/time_training + Timing during training. + + profiling: ``bool``, optional + Argument path: training/profiling + Profiling during training. + + profiling_file: ``str``, optional + Argument path: training/profiling_file + Output file for profiling.
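Read together, the argument documentation above implies that only a handful of keys are mandatory; everything else falls back to a documented default. The dict below is a hypothetical minimal input assembled from those descriptions, not a file from the repository, and whether the checker accepts completely empty `fitting_net`, `learning_rate` and `loss` sections in practice is an assumption.

```python
# Hypothetical minimal training input, read off the argument docs above.
# Only keys not marked "optional" are given; defaults cover the rest.
minimal_input = {
    "model": {
        "descriptor": {"type": "se_a", "sel": [46, 92]},  # `sel` is the only required se_a key
        "fitting_net": {},                                 # type defaults to "ener"
    },
    "learning_rate": {},                                   # exp decay with default start/stop lr
    "loss": {},                                            # energy loss with default prefactors
    "training": {"systems": ["../data/"], "stop_batch": 1000000},
}
```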
diff --git a/setup.py b/setup.py index 117ccec2a2..0df59a98a5 100644 --- a/setup.py +++ b/setup.py @@ -22,7 +22,7 @@ tf_install_dir = imp.find_module('tensorflow', [site_packages_path])[1] -install_requires=['numpy', 'scipy', 'pyyaml'] +install_requires=['numpy', 'scipy', 'pyyaml', 'dargs'] setup_requires=['setuptools_scm', 'scikit-build', 'cmake'] # add cmake as a build requirement if cmake>3.0 is not installed diff --git a/source/train/CMakeLists.txt b/source/train/CMakeLists.txt index 1875d2097c..34363a6826 100644 --- a/source/train/CMakeLists.txt +++ b/source/train/CMakeLists.txt @@ -2,7 +2,7 @@ configure_file("RunOptions.py.in" "${CMAKE_CURRENT_BINARY_DIR}/RunOptions.py" @ONLY) -file(GLOB LIB_PY main.py common.py env.py compat.py calculator.py Network.py Deep*.py Data.py DataSystem.py Model*.py Descrpt*.py Fitting.py Loss.py LearningRate.py Trainer.py TabInter.py EwaldRecp.py DataModifier.py ${CMAKE_CURRENT_BINARY_DIR}/RunOptions.py transform.py) +file(GLOB LIB_PY main.py common.py env.py compat.py calculator.py Network.py Deep*.py Data.py DataSystem.py Model*.py Descrpt*.py Fitting.py Loss.py LearningRate.py Trainer.py TabInter.py EwaldRecp.py DataModifier.py ${CMAKE_CURRENT_BINARY_DIR}/RunOptions.py transform.py argcheck.py doc.py) file(GLOB CLS_PY Local.py Slurm.py) diff --git a/source/train/argcheck.py b/source/train/argcheck.py new file mode 100644 index 0000000000..15c384d843 --- /dev/null +++ b/source/train/argcheck.py @@ -0,0 +1,364 @@ +from dargs import Argument, Variant +from deepmd.common import activation_fn_dict + +def list_to_doc (xx): + items = [] + for ii in xx: + if len(items) == 0: + items.append(f'"{ii}"') + else: + items.append(f', "{ii}"') + items.append('.') + return ''.join(items) + + +def supported_precision() : + return list_to_doc(['float64', 'float32', 'float16']) + + +def descrpt_local_frame_args (): + doc_sel_a = 'A list of integers. The length of the list should be the same as the number of atom types in the system. `sel_a[i]` gives the selected number of type-i neighbors. The full relative coordinates of the neighbors are used by the descriptor.' + doc_sel_r = 'A list of integers. The length of the list should be the same as the number of atom types in the system. `sel_r[i]` gives the selected number of type-i neighbors. Only relative distance of the neighbors are used by the descriptor. sel_a[i] + sel_r[i] is recommended to be larger than the maximally possible number of type-i neighbors in the cut-off radius.' + doc_rcut = 'The cut-off radius. The default value is 6.0' + doc_axis_rule = 'A list of integers. The length should be 6 times of the number of types. \n\n\ +- axis_rule[i*6+0]: class of the atom defining the first axis of type-i atom. 0 for neighbors with full coordinates and 1 for neighbors only with relative distance.\n\n\ +- axis_rule[i*6+1]: type of the atom defining the first axis of type-i atom.\n\n\ +- axis_rule[i*6+2]: index of the axis atom defining the first axis. Note that the neighbors with the same class and type are sorted according to their relative distance.\n\ +- axis_rule[i*6+3]: class of the atom defining the first axis of type-i atom. 0 for neighbors with full coordinates and 1 for neighbors only with relative distance.\n\ +- axis_rule[i*6+4]: type of the atom defining the second axis of type-i atom.\n\ +- axis_rule[i*6+5]: class of the atom defining the second axis of type-i atom. 0 for neighbors with full coordinates and 1 for neighbors only with relative distance.' 
+ + return [ + Argument("sel_a", list, optional = False, doc = doc_sel_a), + Argument("sel_r", list, optional = False, doc = doc_sel_r), + Argument("rcut", float, optional = True, default = 6.0, doc = doc_rcut), + Argument("axis_rule", list, optional = False, doc = doc_axis_rule) + ] + + +def descrpt_se_a_args(): + doc_sel = 'A list of integers. The length of the list should be the same as the number of atom types in the system. `sel[i]` gives the selected number of type-i neighbors. `sel[i]` is recommended to be larger than the maximally possible number of type-i neighbors in the cut-off radius.' + doc_rcut = 'The cut-off radius.' + doc_rcut_smth = 'Where to start smoothing. For example the 1/r term is smoothed from `rcut` to `rcut_smth`' + doc_neuron = 'Number of neurons in each hidden layers of the embedding net. When two layers are of the same size or one layer is twice as large as the previous layer, a skip connection is built.' + doc_axis_neuron = 'Size of the submatrix of G (embedding matrix).' + doc_activation_function = f'The activation function in the embedding net. Supported activation functions are {list_to_doc(activation_fn_dict.keys())}' + doc_resnet_dt = 'Whether to use a "Timestep" in the skip connection' + doc_type_one_side = 'Try to build N_types embedding nets. Otherwise, building N_types^2 embedding nets' + doc_precision = f'The precision of the embedding net parameters, supported options are {supported_precision()}' + doc_trainable = 'If the parameters in the embedding net is trainable' + doc_seed = 'Random seed for parameter initialization' + doc_exclude_types = 'The Excluded types' + doc_set_davg_zero = 'Set the normalization average to zero. This option should be set when `atom_ener` in the energy fitting is used' + + return [ + Argument("sel", list, optional = False, doc = doc_sel), + Argument("rcut", float, optional = True, default = 6.0, doc = doc_rcut), + Argument("rcut_smth", float, optional = True, default = 0.5, doc = doc_rcut_smth), + Argument("neuron", list, optional = True, default = [10,20,40], doc = doc_neuron), + Argument("axis_neuron", int, optional = True, default = 4, doc = doc_axis_neuron), + Argument("activation_function", str, optional = True, default = 'tanh', doc = doc_activation_function), + Argument("resnet_dt", bool, optional = True, default = False, doc = doc_resnet_dt), + Argument("type_one_side", bool, optional = True, default = False, doc = doc_type_one_side), + Argument("precision", str, optional = True, default = "float64", doc = doc_precision), + Argument("trainable", bool, optional = True, default = True, doc = doc_trainable), + Argument("seed", [int,None], optional = True, doc = doc_seed), + Argument("exclude_types", list, optional = True, default = [], doc = doc_exclude_types), + Argument("set_davg_zero", bool, optional = True, default = False, doc = doc_set_davg_zero) + ] + + +def descrpt_se_r_args(): + doc_sel = 'A list of integers. The length of the list should be the same as the number of atom types in the system. `sel[i]` gives the selected number of type-i neighbors. `sel[i]` is recommended to be larger than the maximally possible number of type-i neighbors in the cut-off radius.' + doc_rcut = 'The cut-off radius.' + doc_rcut_smth = 'Where to start smoothing. For example the 1/r term is smoothed from `rcut` to `rcut_smth`' + doc_neuron = 'Number of neurons in each hidden layers of the embedding net. When two layers are of the same size or one layer is twice as large as the previous layer, a skip connection is built.' 
+ doc_activation_function = f'The activation function in the embedding net. Supported activation functions are {list_to_doc(activation_fn_dict.keys())}' + doc_resnet_dt = 'Whether to use a "Timestep" in the skip connection' + doc_type_one_side = 'Try to build N_types embedding nets. Otherwise, building N_types^2 embedding nets' + doc_precision = f'The precision of the embedding net parameters, supported options are {supported_precision()}' + doc_trainable = 'If the parameters in the embedding net is trainable' + doc_seed = 'Random seed for parameter initialization' + doc_exclude_types = 'The Excluded types' + doc_set_davg_zero = 'Set the normalization average to zero. This option should be set when `atom_ener` in the energy fitting is used' + + return [ + Argument("sel", list, optional = False, doc = doc_sel), + Argument("rcut", float, optional = True, default = 6.0, doc = doc_rcut), + Argument("rcut_smth", float, optional = True, default = 0.5, doc = doc_rcut_smth), + Argument("neuron", list, optional = True, default = [10,20,40], doc = doc_neuron), + Argument("activation_function", str, optional = True, default = 'tanh', doc = doc_activation_function), + Argument("resnet_dt", bool, optional = True, default = False, doc = doc_resnet_dt), + Argument("type_one_side", bool, optional = True, default = False, doc = doc_type_one_side), + Argument("precision", str, optional = True, default = "float64", doc = doc_precision), + Argument("trainable", bool, optional = True, default = True, doc = doc_trainable), + Argument("seed", [int,None], optional = True, doc = doc_seed), + Argument("exclude_types", list, optional = True, default = [], doc = doc_exclude_types), + Argument("set_davg_zero", bool, optional = True, default = False, doc = doc_set_davg_zero) + ] + + +def descrpt_se_ar_args(): + doc_a = 'The parameters of descriptor `se_a`' + doc_r = 'The parameters of descriptor `se_r`' + + return [ + Argument("a", dict, optional = False, doc = doc_a), + Argument("r", dict, optional = False, doc = doc_r), + ] + + +def descrpt_variant_type_args(): + doc_descrpt_type = 'The type of the descritpor. Valid types are `loc_frame`, `se_a`, `se_r` and `se_ar`. \n\ +- `loc_frame`: Defines a local frame at each atom, and the compute the descriptor as local coordinates under this frame.\n\ +- `se_a`: Used by the smooth edition of Deep Potential. The full relative coordinates are used to construct the descriptor.\n\ +- `se_r`: Used by the smooth edition of Deep Potential. Only the distance between atoms is used to construct the descriptor.\n\ +- `se_ar`: A hybrid of `se_a` and `se_r`. Typically `se_a` has a smaller cut-off while the `se_r` has a larger cut-off.' + + return Variant("type", [ + Argument("loc_frame", dict, descrpt_local_frame_args()), + Argument("se_a", dict, descrpt_se_a_args()), + Argument("se_r", dict, descrpt_se_r_args()), + Argument("se_ar", dict, descrpt_se_ar_args()) + ], doc = doc_descrpt_type) + + +def fitting_ener(): + doc_numb_fparam = 'The dimension of the frame parameter. If set to >0, file `fparam.npy` should be included to provided the input fparams.' + doc_numb_aparam = 'The dimension of the atomic parameter. If set to >0, file `aparam.npy` should be included to provided the input aparams.' + doc_neuron = 'The number of neurons in each hidden layers of the fitting net. When two hidden layers are of the same size, a skip connection is built.' + doc_activation_function = f'The activation function in the fitting net. 
Supported activation functions are {list_to_doc(activation_fn_dict.keys())}'
+    doc_precision = f'The precision of the fitting net parameters, supported options are {supported_precision()}'
+    doc_resnet_dt = 'Whether to use a "Timestep" in the skip connection'
+    doc_trainable = 'Whether the parameters in the fitting net are trainable. This option can be\n\
+- bool: True if all parameters of the fitting net are trainable, False otherwise.\n\
+- list of bool: Specifies if each layer is trainable. Since the fitting net is composed of hidden layers followed by an output layer, the length of this list should be equal to len(`neuron`)+1.'
+    doc_rcond = 'The condition number used to determine the initial energy shift for each type of atoms.'
+    doc_seed = 'Random seed for parameter initialization of the fitting net'
+    doc_atom_ener = 'Specify the atomic energy in vacuum for each type'
+
+    return [
+        Argument("numb_fparam", int, optional = True, default = 0, doc = doc_numb_fparam),
+        Argument("numb_aparam", int, optional = True, default = 0, doc = doc_numb_aparam),
+        Argument("neuron", list, optional = True, default = [120,120,120], doc = doc_neuron),
+        Argument("activation_function", str, optional = True, default = 'tanh', doc = doc_activation_function),
+        Argument("precision", str, optional = True, default = 'float64', doc = doc_precision),
+        Argument("resnet_dt", bool, optional = True, default = True, doc = doc_resnet_dt),
+        Argument("trainable", [list,bool], optional = True, default = True, doc = doc_trainable),
+        Argument("rcond", float, optional = True, default = 1e-3, doc = doc_rcond),
+        Argument("seed", [int,None], optional = True, doc = doc_seed),
+        Argument("atom_ener", list, optional = True, default = [], doc = doc_atom_ener)
+    ]
+
+
+def fitting_polar():
+    doc_neuron = 'The number of neurons in each hidden layers of the fitting net. When two hidden layers are of the same size, a skip connection is built.'
+    doc_activation_function = f'The activation function in the fitting net. Supported activation functions are {list_to_doc(activation_fn_dict.keys())}'
+    doc_resnet_dt = 'Whether to use a "Timestep" in the skip connection'
+    doc_precision = f'The precision of the fitting net parameters, supported options are {supported_precision()}'
+    doc_scale = 'The output of the fitting net (polarizability matrix) will be scaled by `scale`'
+    doc_diag_shift = 'The diagonal part of the polarizability matrix will be shifted by `diag_shift`. The shift operation is carried out after `scale`.'
+    doc_fit_diag = 'Whether to fit only the diagonal part of the polarizability matrix'
+    doc_sel_type = 'The atom types for which the atomic polarizability will be provided. If not set, all types will be selected.'
+ doc_seed = 'Random seed for parameter initialization of the fitting net' + + return [ + Argument("neuron", list, optional = True, default = [120,120,120], doc = doc_neuron), + Argument("activation_function", str, optional = True, default = 'tanh', doc = doc_activation_function), + Argument("resnet_dt", bool, optional = True, default = True, doc = doc_resnet_dt), + Argument("precision", str, optional = True, default = 'float64', doc = doc_precision), + Argument("fit_diag", bool, optional = True, default = True, doc = doc_fit_diag), + Argument("scale", [list,float], optional = True, default = 1.0, doc = doc_scale), + Argument("diag_shift", [list,float], optional = True, default = 0.0, doc = doc_diag_shift), + Argument("sel_type", [list,int,None], optional = True, doc = doc_sel_type), + Argument("seed", [int,None], optional = True, doc = doc_seed) + ] + + +def fitting_global_polar(): + return fitting_polar() + + +def fitting_dipole(): + doc_neuron = 'The number of neurons in each hidden layers of the fitting net. When two hidden layers are of the same size, a skip connection is built.' + doc_activation_function = f'The activation function in the fitting net. Supported activation functions are {list_to_doc(activation_fn_dict.keys())}' + doc_resnet_dt = 'Whether to use a "Timestep" in the skip connection' + doc_precision = f'The precision of the fitting net parameters, supported options are {supported_precision()}' + doc_sel_type = 'The atom types for which the atomic dipole will be provided. If not set, all types will be selected.' + doc_seed = 'Random seed for parameter initialization of the fitting net' + return [ + Argument("neuron", list, optional = True, default = [120,120,120], doc = doc_neuron), + Argument("activation_function", str, optional = True, default = 'tanh', doc = doc_activation_function), + Argument("resnet_dt", bool, optional = True, default = True, doc = doc_resnet_dt), + Argument("precision", str, optional = True, default = 'float64', doc = doc_precision), + Argument("sel_type", [list,int,None], optional = True, doc = doc_sel_type), + Argument("seed", [int,None], optional = True, doc = doc_seed) + ] + + +def fitting_variant_type_args(): + doc_descrpt_type = 'The type of the fitting. Valid types are `ener`, `dipole`, `polar` and `global_polar`. \n\ +- `ener`: Fit an energy model (potential energy surface).\n\ +- `dipole`: Fit an atomic dipole model. Atomic dipole labels for all the selected atoms (see `sel_type`) should be provided by `dipole.npy` in each data system. The file has number of frames lines and 3 times of number of selected atoms columns.\n\ +- `polar`: Fit an atomic polarizability model. Atomic polarizability labels for all the selected atoms (see `sel_type`) should be provided by `polarizability.npy` in each data system. The file has number of frames lines and 9 times of number of selected atoms columns.\n\ +- `global_polar`: Fit a polarizability model. Polarizability labels should be provided by `polarizability.npy` in each data system. The file has number of frames lines and 9 columns.' + + return Variant("type", [Argument("ener", dict, fitting_ener()), + Argument("dipole", dict, fitting_dipole()), + Argument("polar", dict, fitting_polar()), + Argument("global_polar", dict, fitting_global_polar())], + optional = True, + default_tag = 'ener', + doc = doc_descrpt_type) + + +def model_args (): + doc_type_map = 'A list of strings. Give the name to each type of atoms.' 
+
+    doc_data_stat_nbatch = 'The model determines the normalization from the statistics of the data. This key specifies the number of `frames` in each `system` used for statistics.'
+    doc_descrpt = 'The descriptor of atomic environment.'
+    doc_fitting = 'The fitting of physical properties.'
+    ca = Argument("model", dict,
+                  [Argument("type_map", list, optional = True, doc = doc_type_map),
+                   Argument("data_stat_nbatch", int, optional = True, default = 10, doc = doc_data_stat_nbatch),
+                   Argument("descriptor", dict, [], [descrpt_variant_type_args()], doc = doc_descrpt),
+                   Argument("fitting_net", dict, [], [fitting_variant_type_args()], doc = doc_fitting)
+                  ])
+    # print(ca.gen_doc())
+    return ca
+
+
+def learning_rate_args():
+    doc_start_lr = 'The learning rate at the start of the training.'
+    doc_stop_lr = 'The desired learning rate at the end of the training.'
+    doc_decay_steps = 'The learning rate decays every this number of training steps.'
+
+    args = [
+        Argument("start_lr", float, optional = True, default = 1e-3, doc = doc_start_lr),
+        Argument("stop_lr", float, optional = True, default = 1e-8, doc = doc_stop_lr),
+        Argument("decay_steps", int, optional = True, default = 5000, doc = doc_decay_steps)
+    ]
+
+    doc_lr = "The learning rate options"
+    return Argument("learning_rate", dict, args, [], doc = doc_lr)
+
+
+def start_pref(item):
+    return f'The prefactor of {item} loss at the start of the training. Should be larger than or equal to 0. If set to a non-zero value, the {item} label should be provided by file {item}.npy in each data system. If both start_pref_{item} and limit_pref_{item} are set to 0, then the {item} will be ignored.'
+
+def limit_pref(item):
+    return f'The prefactor of {item} loss at the limit of the training, i.e. as the training step goes to infinity. Should be larger than or equal to 0.'
+
+def loss_ener():
+    doc_start_pref_e = start_pref('energy')
+    doc_limit_pref_e = limit_pref('energy')
+    doc_start_pref_f = start_pref('force')
+    doc_limit_pref_f = limit_pref('force')
+    doc_start_pref_v = start_pref('virial')
+    doc_limit_pref_v = limit_pref('virial')
+    doc_start_pref_ae = start_pref('atom_ener')
+    doc_limit_pref_ae = limit_pref('atom_ener')
+    doc_relative_f = 'If provided, relative force error will be used in the loss. The difference of force will be normalized by the magnitude of the force in the label with a shift given by `relative_f`, i.e. DF_i / ( || F || + relative_f ) with DF denoting the difference between prediction and label and || F || denoting the L2 norm of the label.'
+    return [
+        Argument("start_pref_e", [float,int], optional = True, default = 0.02, doc = doc_start_pref_e),
+        Argument("limit_pref_e", [float,int], optional = True, default = 1.00, doc = doc_limit_pref_e),
+        Argument("start_pref_f", [float,int], optional = True, default = 1000, doc = doc_start_pref_f),
+        Argument("limit_pref_f", [float,int], optional = True, default = 1.00, doc = doc_limit_pref_f),
+        Argument("start_pref_v", [float,int], optional = True, default = 0.00, doc = doc_start_pref_v),
+        Argument("limit_pref_v", [float,int], optional = True, default = 0.00, doc = doc_limit_pref_v),
+        Argument("start_pref_ae", [float,int], optional = True, default = 0.00, doc = doc_start_pref_ae),
+        Argument("limit_pref_ae", [float,int], optional = True, default = 0.00, doc = doc_limit_pref_ae),
+        Argument("relative_f", [float,None], optional = True, doc = doc_relative_f)
+    ]
+
+
+def loss_variant_type_args():
+    doc_loss = 'The type of the loss. 
For fitting type `ener`, the loss type should be set to `ener` or left unset. For tensorial fitting types `dipole`, `polar` and `global_polar`, the type should be left unset.'
+
+    return Variant("type",
+                   [Argument("ener", dict, loss_ener())],
+                   optional = True,
+                   default_tag = 'ener',
+                   doc = doc_loss)
+
+def loss_args():
+    doc_loss = 'The definition of loss function. The type of the loss depends on the type of the fitting. For fitting type `ener`, the prefactors before energy, force, virial and atomic energy losses may be provided. For fitting type `dipole`, `polar` and `global_polar`, the loss may be an empty `dict` or unset.'
+    ca = Argument('loss', dict, [],
+                  [loss_variant_type_args()],
+                  doc = doc_loss)
+    return ca
+
+def training_args():
+    doc_systems = 'The data systems. This key can be provided with a list that specifies the systems, or be provided with a string by which the prefix of all systems is given and the list of the systems is automatically generated.'
+    doc_set_prefix = 'The prefix of the sets in the systems.'
+    doc_stop_batch = 'Number of training batches. Each training step uses one batch of data.'
+    doc_batch_size = 'This key can be \n\
+- list: the length of which is the same as the `systems`. The batch size of each system is given by the elements of the list.\n\
+- int: all `systems` use the same batch size.\n\
+- string "auto": automatically determines the batch size so that the batch_size times the number of atoms in the system is no less than 32.\n\
+- string "auto:N": automatically determines the batch size so that the batch_size times the number of atoms in the system is no less than N.'
+    doc_seed = 'The random seed for training.'
+    doc_disp_file = 'The file for printing the learning curve.'
+    doc_disp_freq = 'The frequency of printing the learning curve.'
+    doc_numb_test = 'Number of frames used for the test during training.'
+    doc_save_freq = 'The frequency of saving checkpoints.'
+    doc_save_ckpt = 'The file name of the saved checkpoint.'
+    doc_disp_training = 'Displaying verbose information during training.'
+    doc_time_training = 'Timing during training.'
+    doc_profiling = 'Profiling during training.'
+    doc_profiling_file = 'Output file for profiling.'
+
+    args = [
+        Argument("systems", [list,str], optional = False, doc = doc_systems),
+        Argument("set_prefix", str, optional = True, default = 'set', doc = doc_set_prefix),
+        Argument("stop_batch", int, optional = False, doc = doc_stop_batch),
+        Argument("batch_size", [list,int,str], optional = True, default = 'auto', doc = doc_batch_size),
+        Argument("seed", [int,None], optional = True, doc = doc_seed),
+        Argument("disp_file", str, optional = True, default = 'lcurve.out', doc = doc_disp_file),
+        Argument("disp_freq", int, optional = True, default = 1000, doc = doc_disp_freq),
+        Argument("numb_test", int, optional = True, default = 1, doc = doc_numb_test),
+        Argument("save_freq", int, optional = True, default = 1000, doc = doc_save_freq),
+        Argument("save_ckpt", str, optional = True, default = 'model.ckpt', doc = doc_save_ckpt),
+        Argument("disp_training", bool, optional = True, default = True, doc = doc_disp_training),
+        Argument("time_training", bool, optional = True, default = True, doc = doc_time_training),
+        Argument("profiling", bool, optional = True, default = False, doc = doc_profiling),
+        Argument("profiling_file", str, optional = True, default = 'timeline.json', doc = doc_profiling_file)
+    ]
+
+    doc_training = 'The training options'
+    return Argument("training", dict, args, [], doc = doc_training)
+
+
+def gen_doc():
+    ma = model_args()
+    lra = learning_rate_args()
+    la = loss_args()
+    ta = training_args()
+    ptr = []
+    ptr.append(ma.gen_doc())
+    ptr.append(la.gen_doc())
+    ptr.append(lra.gen_doc())
+    ptr.append(ta.gen_doc())
+    return "\n\n".join(ptr)
+
+def normalize(data):
+    ma = model_args()
+    lra = learning_rate_args()
+    la = loss_args()
+    ta = training_args()
+
+    data_m = ma .normalize({'model': data.get('model', {})}, trim_pattern = "_*")
+    data_lr = lra.normalize({'learning_rate': data.get('learning_rate', {})}, trim_pattern = "_*")
+    data_l = la .normalize({'loss': data.get('loss', {})}, trim_pattern = "_*")
+    data_t = ta .normalize({'training': data.get('training', {})}, trim_pattern = "_*")
+
+    ma .check(data_m)
+    lra.check(data_lr)
+    la .check(data_l)
+    ta .check(data_t)
+
+    return {**data_m, **data_lr, **data_l, **data_t}
+
+
+if __name__ == '__main__':
+    gen_doc()
+
diff --git a/source/train/doc.py b/source/train/doc.py
new file mode 100644
index 0000000000..939efe411a
--- /dev/null
+++ b/source/train/doc.py
@@ -0,0 +1,5 @@
+from deepmd.argcheck import gen_doc
+
+def doc_train_input(args):
+    doc_str = gen_doc()
+    print(doc_str)
diff --git a/source/train/main.py b/source/train/main.py
index 56100ec54e..8cb039bc0e 100644
--- a/source/train/main.py
+++ b/source/train/main.py
@@ -5,6 +5,7 @@
 from .config import config
 from .test import test
 from .transform import transform
+from .doc import doc_train_input
 
 def main () :
     parser = argparse.ArgumentParser(
@@ -32,6 +33,9 @@ def main () :
     parser_train.add_argument('--restart', type = str, 
                               help=
                               'Restart the training from the provided checkpoint.')
+    parser_train.add_argument('-o','--output', type = str, default = 'out.json', 
+                              help=
+                              'The output file of the parameters used in training.')
 
     parser_frz = subparsers.add_parser('freeze', help='freeze the model')
     parser_frz.add_argument("-d", "--folder", type=str, default = ".", 
@@ -57,6 +61,9 @@ def main () :
     parser_tst.add_argument("-d", "--detail-file", type=str, 
                             help="The file containing details of energy force and virial accuracy")
 
+    parser_train = subparsers.add_parser('doc-train-input', 
+                                         help='print the documentation (in rst format) of input training parameters.')
+
    args = 
parser.parse_args() if args.command is None : @@ -72,5 +79,7 @@ def main () : test(args) elif args.command == 'transform' : transform(args) + elif args.command == 'doc-train-input' : + doc_train_input(args) else : raise RuntimeError('unknown command ' + args.command) diff --git a/source/train/train.py b/source/train/train.py index 3e7ba2955b..e7978361b2 100755 --- a/source/train/train.py +++ b/source/train/train.py @@ -3,6 +3,7 @@ import os import sys import time +import json import numpy as np from deepmd.env import tf from deepmd.compat import convert_input_v0_v1 @@ -11,6 +12,7 @@ from deepmd.Trainer import NNPTrainer from deepmd.common import data_requirement, expand_sys_str, j_loader from deepmd.DataModifier import DipoleChargeModifier +from deepmd.argcheck import normalize def create_done_queue(cluster_spec, task_index): with tf.device("/job:ps/task:%d" % (task_index)): @@ -54,6 +56,11 @@ def train (args) : jdata = convert_input_v0_v1(jdata, warning = True, dump = 'input_v1_compat.json') + + jdata = normalize(jdata) + with open(args.output, 'w') as fp: + json.dump(jdata, fp, indent=4) + # run options with_distrib = False if 'with_distrib' in jdata: From 7928151af0ae4917ef3b6dd28015d3186495db47 Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Sun, 18 Oct 2020 20:59:35 -0400 Subject: [PATCH 35/65] try to fix it --- setup.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/setup.py b/setup.py index 4ff61dfd66..b21a42223e 100644 --- a/setup.py +++ b/setup.py @@ -20,7 +20,7 @@ install_requires=['numpy', 'scipy', 'pyyaml'] setup_requires=['setuptools_scm', 'scikit-build'] -extras_require = {"cpu": "", "gpu": ""} +extras_require = {"cpu": [], "gpu": []} tf_spec = importlib.util.find_spec("tensorflow") if tf_spec: tf_install_dir = tf_spec.submodule_search_locations[0] @@ -33,9 +33,9 @@ tf_version = os.environ.get('TENSORFLOW_VERSION', '2.3') setup_requires.append("tensorflow==" + tf_version) if LegacyVersion(tf_version) < LegacyVersion("1.15") or (LegacyVersion(tf_version) >= LegacyVersion("2.0") and LegacyVersion(tf_version) < LegacyVersion("2.1")): - extras_require = {"cpu": "tensorflow==" + tf_version, "gpu": "tensorflow_gpu==" + tf_version} + extras_require = {"cpu": ["tensorflow==" + tf_version], "gpu": ["tensorflow-gpu==" + tf_version]} else: - extras_require = {"cpu": "tensorflow_cpu==" + tf_version, "gpu": "tensorflow==" + tf_version} + extras_require = {"cpu": ["tensorflow-cpu==" + tf_version], "gpu": ["tensorflow==" + tf_version]} tf_install_dir = path.join(path.dirname(path.abspath(__file__)), '.egg', pkg_resources.Distribution(project_name="tensorflow", version=tf_version, platform=get_platform()).egg_name(), From 7d98beb2d4dc68ca92f2c70d9505b68035af0e2e Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Sun, 18 Oct 2020 21:07:14 -0400 Subject: [PATCH 36/65] set extras_require --- setup.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/setup.py b/setup.py index b21a42223e..db86971589 100644 --- a/setup.py +++ b/setup.py @@ -20,7 +20,11 @@ install_requires=['numpy', 'scipy', 'pyyaml'] setup_requires=['setuptools_scm', 'scikit-build'] -extras_require = {"cpu": [], "gpu": []} +tf_version = os.environ.get('TENSORFLOW_VERSION', '2.3') +if LegacyVersion(tf_version) < LegacyVersion("1.15") or (LegacyVersion(tf_version) >= LegacyVersion("2.0") and LegacyVersion(tf_version) < LegacyVersion("2.1")): + extras_require = {"cpu": ["tensorflow==" + tf_version], "gpu": ["tensorflow-gpu==" + tf_version]} +else: + extras_require = {"cpu": 
["tensorflow-cpu==" + tf_version], "gpu": ["tensorflow==" + tf_version]} tf_spec = importlib.util.find_spec("tensorflow") if tf_spec: tf_install_dir = tf_spec.submodule_search_locations[0] @@ -30,12 +34,7 @@ if tf_spec: tf_install_dir = tf_spec.submodule_search_locations[0] else: - tf_version = os.environ.get('TENSORFLOW_VERSION', '2.3') setup_requires.append("tensorflow==" + tf_version) - if LegacyVersion(tf_version) < LegacyVersion("1.15") or (LegacyVersion(tf_version) >= LegacyVersion("2.0") and LegacyVersion(tf_version) < LegacyVersion("2.1")): - extras_require = {"cpu": ["tensorflow==" + tf_version], "gpu": ["tensorflow-gpu==" + tf_version]} - else: - extras_require = {"cpu": ["tensorflow-cpu==" + tf_version], "gpu": ["tensorflow==" + tf_version]} tf_install_dir = path.join(path.dirname(path.abspath(__file__)), '.egg', pkg_resources.Distribution(project_name="tensorflow", version=tf_version, platform=get_platform()).egg_name(), From 289e532865e8f03e6f5a2bdab4478d97cfd74e9d Mon Sep 17 00:00:00 2001 From: Han Wang Date: Mon, 19 Oct 2020 10:11:39 +0800 Subject: [PATCH 37/65] Update README.md Co-authored-by: Jinzhe Zeng --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index c16fe7dd8c..b23f76f75d 100644 --- a/README.md +++ b/README.md @@ -401,7 +401,7 @@ An example of `training` is "systems": ["../data1/", "../data2/"], "set_prefix": "set", "stop_batch": 1000000, - "_comment": " batch_size can be supplied with, e.g. 1, or auto (string) or [10, 20]" + "_comment": " batch_size can be supplied with, e.g. 1, or auto (string) or [10, 20]", "batch_size": 1, "seed": 1, From b7c523cacb50a5265bed1154c48e49b0349ead2c Mon Sep 17 00:00:00 2001 From: Han Wang Date: Mon, 19 Oct 2020 10:11:52 +0800 Subject: [PATCH 38/65] Update README.md Co-authored-by: Jinzhe Zeng --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index b23f76f75d..23cf621471 100644 --- a/README.md +++ b/README.md @@ -410,7 +410,7 @@ An example of `training` is "_comment": " frequencies counted in batch", "disp_file": "lcurve.out", "disp_freq": 100, - "_comment": " numb_test can be supplied with, e.g. 1, or XX% (string) or [10, 20]" + "_comment": " numb_test can be supplied with, e.g. 1, or XX% (string) or [10, 20]", "numb_test": 10, "save_freq": 1000, "save_ckpt": "model.ckpt", From 3330844aa3ae81be927a45ac4ea553180e9e57ca Mon Sep 17 00:00:00 2001 From: Han Wang Date: Mon, 19 Oct 2020 11:30:57 +0800 Subject: [PATCH 39/65] restructure the readme. 
Fix bugs in argcheck --- README.md | 547 +-------------------------------------- doc/install.md | 200 ++++++++++++++ doc/train-input.rst | 55 ++-- doc/use-deepmd-kit.md | 352 +++++++++++++++++++++++++ source/train/argcheck.py | 34 +-- 5 files changed, 616 insertions(+), 572 deletions(-) create mode 100644 doc/install.md create mode 100644 doc/use-deepmd-kit.md diff --git a/README.md b/README.md index f7ddec083b..cdec110a92 100644 --- a/README.md +++ b/README.md @@ -9,30 +9,7 @@ - [License and credits](#license-and-credits) - [Deep Potential in a nutshell](#deep-potential-in-a-nutshell) - [Download and install](#download-and-install) - - [Easy installation methods](#easy-installation-methods) - - [Offline packages](#offline-packages) - - [With Docker](#with-docker) - - [With conda](#with-conda) - - [Install the python interaction](#install-the-python-interface) - - [Install the Tensorflow's python interface](#install-the-tensorflows-python-interface) - - [Install the DeePMD-kit's python interface](#install-the-deepmd-kits-python-interface) - - [Install the C++ interface](#install-the-c-interface) - - [Install the Tensorflow's C++ interface](#install-the-tensorflows-c-interface) - - [Install the DeePMD-kit's C++ interface](#install-the-deepmd-kits-c-interface) - - [Install LAMMPS's DeePMD-kit module](#install-lammpss-deepmd-kit-module) - [Use DeePMD-kit](#use-deepmd-kit) - - [Prepare data](#prepare-data) - - [Train a model](#train-a-model) - - [The DeePMD model](#the-deepmd-model) - - [The DeepPot-SE model](#the-deeppot-se-model) - - [Freeze a model](#freeze-a-model) - - [Test a model](#test-a-model) - - [Model inference](#model-inference) - - [Run MD with Lammps](#run-md-with-lammps) - - [Include deepmd in the pair style](#include-deepmd-in-the-pair-style) - - [Long-range interaction](#long-range-interaction) - - [Run path-integral MD with i-PI](#run-path-integral-md-with-i-pi) - - [Use deep potential with ASE](#use-deep-potential-with-ase) - [Troubleshooting](#troubleshooting) # About DeePMD-kit @@ -85,529 +62,27 @@ In addition to building up potential energy models, DeePMD-kit can also be used # Download and install -Please follow our [github](https://github.com/deepmodeling/deepmd-kit) webpage to see the latest released version and development version. +Please follow our [github](https://github.com/deepmodeling/deepmd-kit) webpage to download the [latest released version](https://github.com/deepmodeling/deepmd-kit/tree/master) and [development version](https://github.com/deepmodeling/deepmd-kit/tree/devel). -## Easy installation methods -There various easy methods to install DeePMD-kit. Choose one that you prefer. If you want to build by yourself, jump to the next two sections. +DeePMD-kit offers multiple installation methods. It is recommend using easily methods like [offline packages](doc/install.md#offline-packages), [conda](doc/install.md#with-conda) and [docker](doc/install.md#with-docker). -After your easy installation, DeePMD-kit (`dp`) and LAMMPS (`lmp`) will be available to execute. You can try `dp -h` and `lmp -h` to see the help. `mpirun` is also available considering you may want to run LAMMPS in parallel. +One may manually install DeePMD-kit by following the instuctions on [installing the python interface](doc/install.md#install-the-python-interface) and [installing the C++ interface](doc/install.md#install-the-c-interface). The C++ interface is necessary when using DeePMD-kit with LAMMPS and i-PI. 
-### Offline packages -Both CPU and GPU version offline packages are avaiable in [the Releases page](https://github.com/deepmodeling/deepmd-kit/releases). - -### With conda -DeePMD-kit is avaiable with [conda](https://github.com/conda/conda). Install [Anaconda](https://www.anaconda.com/distribution/#download-section) or [Miniconda](https://docs.conda.io/en/latest/miniconda.html) first. - -To install the CPU version: -```bash -conda install deepmd-kit=*=*cpu lammps-dp=*=*cpu -c deepmodeling -``` - -To install the GPU version containing [CUDA 10.1](https://docs.nvidia.com/deploy/cuda-compatibility/index.html#binary-compatibility__table-toolkit-driver): -```bash -conda install deepmd-kit=*=*gpu lammps-dp=*=*gpu -c deepmodeling -``` - -### With Docker -A docker for installing the DeePMD-kit is available [here](https://github.com/orgs/deepmodeling/packages/container/deepmd-kit). - -To pull the CPU version: -```bash -docker pull ghcr.io/deepmodeling/deepmd-kit:1.2.0_cpu -``` - -To pull the GPU version: -```bash -docker pull ghcr.io/deepmodeling/deepmd-kit:1.2.0_cuda10.1_gpu -``` - -## Install the python interface -### Install the Tensorflow's python interface -First, check the python version on your machine -```bash -python --version -``` - -We follow the virtual environment approach to install the tensorflow's Python interface. The full instruction can be found on [the tensorflow's official website](https://www.tensorflow.org/install/pip). Now we assume that the Python interface will be installed to virtual environment directory `$tensorflow_venv` -```bash -virtualenv -p python3 $tensorflow_venv -source $tensorflow_venv/bin/activate -pip install --upgrade pip -pip install --upgrade tensorflow==2.1.0 -``` -It is notice that everytime a new shell is started and one wants to use `DeePMD-kit`, the virtual environment should be activated by -```bash -source $tensorflow_venv/bin/activate -``` -if one wants to skip out of the virtual environment, he/she can do -```bash -deactivate -``` -If one has multiple python interpreters named like python3.x, it can be specified by, for example -```bash -virtualenv -p python3.7 $tensorflow_venv -``` -If one does not need the GPU support of deepmd-kit and is concerned about package size, the CPU-only version of tensorflow should be installed by -```bash -pip install --upgrade tensorflow-cpu==2.1.0 -``` -To verify the installation, run -```bash -python -c "import tensorflow as tf;print(tf.reduce_sum(tf.random.normal([1000, 1000])))" -``` -One should remember to activate the virtual environment every time he/she uses deepmd-kit. - -### Install the DeePMD-kit's python interface - -Execute -```bash -pip install deepmd-kit -``` -To test the installation, one may execute -```bash -dp -h -``` -It will print the help information like -```text -usage: dp [-h] {train,freeze,test} ... - -DeePMD-kit: A deep learning package for many-body potential energy -representation and molecular dynamics - -optional arguments: - -h, --help show this help message and exit - -Valid subcommands: - {train,freeze,test} - train train a model - freeze freeze the model - test test the model -``` - -## Install the C++ interface - -If one does not need to use DeePMD-kit with Lammps or I-Pi, then the python interface installed in the previous section does everything and he/she can safely skip this section. - -### Install the Tensorflow's C++ interface - -Check the compiler version on your machine - -``` -gcc --version -``` - -The C++ interface of DeePMD-kit was tested with compiler gcc >= 4.8. 
It is noticed that the I-Pi support is only compiled with gcc >= 4.9. - -First the C++ interface of Tensorflow should be installed. It is noted that the version of Tensorflow should be in consistent with the python interface. We assume that you have followed our instruction and installed tensorflow python interface 1.14.0 with, then you may follow [the instruction for CPU](doc/install-tf.1.14.md) to install the corresponding C++ interface (CPU only). If one wants GPU supports, he/she should follow [the instruction for GPU](doc/install-tf.1.14-gpu.md) to install the C++ interface. - -### Install the DeePMD-kit's C++ interface - -Clone the DeePMD-kit source code -```bash -cd /some/workspace -git clone --recursive https://github.com/deepmodeling/deepmd-kit.git deepmd-kit -``` - -For convenience, you may want to record the location of source to a variable, saying `deepmd_source_dir` by -```bash -cd deepmd-kit -deepmd_source_dir=`pwd` -``` - -Now goto the source code directory of DeePMD-kit and make a build place. -```bash -cd $deepmd_source_dir/source -mkdir build -cd build -``` -I assume you want to install DeePMD-kit into path `$deepmd_root`, then execute cmake -```bash -cmake -DTENSORFLOW_ROOT=$tensorflow_root -DCMAKE_INSTALL_PREFIX=$deepmd_root .. -``` -where the variable `tensorflow_root` stores the location where the tensorflow's C++ interface is installed. The DeePMD-kit will automatically detect if a CUDA tool-kit is available on your machine and build the GPU support accordingly. If you want to force the cmake to find CUDA tool-kit, you can speicify the key `USE_CUDA_TOOLKIT`, -```bash -cmake -DUSE_CUDA_TOOLKIT=true -DTENSORFLOW_ROOT=$tensorflow_root -DCMAKE_INSTALL_PREFIX=$deepmd_root .. -``` -and you may further asked to provide `CUDA_TOOLKIT_ROOT_DIR`. If the cmake has executed successfully, then -```bash -make -make install -``` -If everything works fine, you will have the following executable and libraries installed in `$deepmd_root/bin` and `$deepmd_root/lib` -```bash -$ ls $deepmd_root/bin -dp_ipi -$ ls $deepmd_root/lib -libdeepmd_ipi.so libdeepmd_op.so libdeepmd.so -``` - -### Install LAMMPS's DeePMD-kit module -DeePMD-kit provide module for running MD simulation with LAMMPS. Now make the DeePMD-kit module for LAMMPS. -```bash -cd $deepmd_source_dir/source/build -make lammps -``` -DeePMD-kit will generate a module called `USER-DEEPMD` in the `build` directory. Now download your favorite LAMMPS code, and uncompress it (I assume that you have downloaded the tar `lammps-stable.tar.gz`) -```bash -cd /some/workspace -tar xf lammps-stable.tar.gz -``` -The source code of LAMMPS is stored in directory, for example `lammps-31Mar17`. Now go into the LAMMPS code and copy the DeePMD-kit module like this -```bash -cd lammps-31Mar17/src/ -cp -r $deepmd_source_dir/source/build/USER-DEEPMD . -``` -Now build LAMMPS -```bash -make yes-user-deepmd -make mpi -j4 -``` -The option `-j4` means using 4 processes in parallel. You may want to use a different number according to your hardware. - -If everything works fine, you will end up with an executable `lmp_mpi`. - -The DeePMD-kit module can be removed from LAMMPS source code by -```bash -make no-user-deepmd -``` # Use DeePMD-kit -In this text, we will call the deep neural network that is used to represent the interatomic interactions (Deep Potential) the **model**. The typical procedure of using DeePMD-kit is - -1. Prepare data -2. Train a model -3. Freeze the model -4. 
MD runs with the model (Native MD code or LAMMPS) - -## Prepare data -One needs to provide the following information to train a model: the atom type, the simulation box, the atom coordinate, the atom force, system energy and virial. A snapshot of a system that contains these information is called a **frame**. We use the following convention of units: - -Property| Unit ---- | :---: -Time | ps -Length | Å -Energy | eV -Force | eV/Å -Pressure| Bar - -The frames of the system are stored in two formats. A raw file is a plain text file with each information item written in one file and one frame written on one line. The default files that provide box, coordinate, force, energy and virial are `box.raw`, `coord.raw`, `force.raw`, `energy.raw` and `virial.raw`, respectively. *We recommend you use these file names*. Here is an example of force.raw: -```bash -$ cat force.raw --0.724 2.039 -0.951 0.841 -0.464 0.363 - 6.737 1.554 -5.587 -2.803 0.062 2.222 --1.968 -0.163 1.020 -0.225 -0.789 0.343 -``` -This `force.raw` contains 3 frames with each frame having the forces of 2 atoms, thus it has 3 lines and 6 columns. Each line provides all the 3 force components of 2 atoms in 1 frame. The first three numbers are the 3 force components of the first atom, while the second three numbers are the 3 force components of the second atom. The coordinate file `coord.raw` is organized similarly. In `box.raw`, the 9 components of the box vectors should be provided on each line. In `virial.raw`, the 9 components of the virial tensor should be provided on each line. The number of lines of all raw files should be identical. - -We assume that the atom types do not change in all frames. It is provided by `type.raw`, which has one line with the types of atoms written one by one. The atom types should be integers. For example the `type.raw` of a system that has 2 atoms with 0 and 1: -```bash -$ cat type.raw -0 1 -``` - -The second format is the data sets of `numpy` binary data that are directly used by the training program. User can use the script `$deepmd_source_dir/data/raw/raw_to_set.sh` to convert the prepared raw files to data sets. For example, if we have a raw file that contains 6000 frames, -```bash -$ ls -box.raw coord.raw energy.raw force.raw type.raw virial.raw -$ $deepmd_source_dir/data/raw/raw_to_set.sh 2000 -nframe is 6000 -nline per set is 2000 -will make 3 sets -making set 0 ... -making set 1 ... -making set 2 ... -$ ls -box.raw coord.raw energy.raw force.raw set.000 set.001 set.002 type.raw virial.raw -``` -It generates three sets `set.000`, `set.001` and `set.002`, with each set contains 2000 frames. The last set (`set.002`) is used as testing set, while the rest sets (`set.000` and `set.001`) are used as training sets. One do not need to take care of the binary data files in each of the `set.*` directories. The path containing `set.*` and `type.raw` is called a *system*. - -## Train a model - -### Write the input script - -The method of training is explained in our [DeePMD][2] and [DeepPot-SE][3] papers. With the source code we provide a small training dataset taken from 400 frames generated by NVT ab-initio water MD trajectory with 300 frames for training and 100 for testing. [An example training parameter file](./examples/water/train/water_se_a.json) is provided. One can try with the training by -```bash -$ cd $deepmd_source_dir/examples/water/train/ -$ dp train water_se_a.json -``` -where `water_se_a.json` is the `json` format parameter file that controls the training. 
It is also possible to use `yaml` format file with the same keys as json (see `water_se_a.yaml` example). You can use script `json2yaml.py` in `data/json/` dir to convert your json files to yaml. The components of the `water.json` contains four parts, `model`, `learning_rate`, `loss` and `training`. - -The `model` section specify how the deep potential model is built. An example of the smooth-edition is provided as follows -```json - "model": { - "type_map": ["O", "H"], - "descriptor" :{ - "type": "se_a", - "rcut_smth": 5.80, - "rcut": 6.00, - "sel": [46, 92], - "neuron": [25, 50, 100], - "axis_neuron": 16, - "resnet_dt": false, - "seed": 1, - "_comment": " that's all" - }, - "fitting_net" : { - "neuron": [240, 240, 240], - "resnet_dt": true, - "seed": 1, - "_comment": " that's all" - }, - "_comment": " that's all" - } -``` -The **`type_map`** is optional, which provide the element names (but not restricted to) for corresponding atom types. -The construction of the descriptor is given by option **`descriptor`**. The **`type`** of the descriptor is set to `"se_a"`, which means smooth-edition, angular infomation. The **`rcut`** is the cut-off radius for neighbor searching, and the **`rcut_smth`** gives where the smoothing starts. **`sel`** gives the maximum possible number of neighbors in the cut-off radius. It is a list, the length of which is the same as the number of atom types in the system, and `sel[i]` denote the maximum possible number of neighbors with type `i`. The **`neuron`** specifies the size of the embedding net. From left to right the members denote the sizes of each hidden layers from input end to the output end, respectively. The **`axis_neuron`** specifies the size of submatrix of the embedding matrix, the axis matrix as explained in the [DeepPot-SE paper][3]. If the outer layer is of twice size as the inner layer, then the inner layer is copied and concatenated, then a [ResNet architecture](https://arxiv.org/abs/1512.03385) is build between them. If the option **`resnet_dt`** is set `true`, then a timestep is used in the ResNet. **`seed`** gives the random seed that is used to generate random numbers when initializing the model parameters. +The typical procedure of using DeePMD-kit includes 5 steps -The construction of the fitting net is give by **`fitting_net`**. The key **`neuron`** specifies the size of the fitting net. If two neighboring layers are of the same size, then a [ResNet architecture](https://arxiv.org/abs/1512.03385) is build between them. If the option **`resnet_dt`** is set `true`, then a timestep is used in the ResNet. **`seed`** gives the random seed that is used to generate random numbers when initializing the model parameters. +1. [Prepare data](doc/use-deepmd-kit.md#prepare-data) +2. [Train a model](doc/use-deepmd-kit.md#train-a-model) +3. [Freeze the model](doc/use-deepmd-kit.md#freeze-a-model) +4. [Test the model](doc/use-deepmd-kit.md#test-a-model) +5. [Inference the model in python](doc/use-deepmd-kit.md#model-inference) or using the model in other molecular simulation packages like [LAMMPS](doc/use-deepmd-kit.md#run-md-with-lammps), [i-PI](doc/use-deepmd-kit.md#run-path-integral-md-with-i-pi) or [ASE](doc/use-deepmd-kit.md#use-deep-potential-with-ase). 
-An example of the `learning_rate` is given as follows -```json - "learning_rate" :{ - "type": "exp", - "start_lr": 0.005, - "decay_steps": 5000, - "decay_rate": 0.95, - "_comment": "that's all" - } -``` -The option **`start_lr`**, **`decay_rate`** and **`decay_steps`** specify how the learning rate changes. For example, the `t`th batch will be trained with learning rate: -```math -lr(t) = start_lr * decay_rate ^ ( t / decay_steps ) -``` - -An example of the `loss` is -```json - "loss" : { - "start_pref_e": 0.02, - "limit_pref_e": 1, - "start_pref_f": 1000, - "limit_pref_f": 1, - "start_pref_v": 0, - "limit_pref_v": 0, - "_comment": " that's all" - } -``` -The options **`start_pref_e`**, **`limit_pref_e`**, **`start_pref_f`**, **`limit_pref_f`**, **`start_pref_v`** and **`limit_pref_v`** determine how the prefactors of energy error, force error and virial error changes in the loss function (see the appendix of the [DeePMD paper][2] for details). Taking the prefactor of force error for example, the prefactor at batch `t` is -```math -w_f(t) = start_pref_f * ( lr(t) / start_lr ) + limit_pref_f * ( 1 - lr(t) / start_lr ) -``` -Since we do not have virial data, the virial prefactors `start_pref_v` and `limit_pref_v` are set to 0. - -An example of `training` is -```json - "training" : { - "systems": ["../data1/", "../data2/"], - "set_prefix": "set", - "stop_batch": 1000000, - "_comment": " batch_size can be supplied with, e.g. 1, or auto (string) or [10, 20]", - "batch_size": 1, - - "seed": 1, - - "_comment": " display and restart", - "_comment": " frequencies counted in batch", - "disp_file": "lcurve.out", - "disp_freq": 100, - "_comment": " numb_test can be supplied with, e.g. 1, or XX% (string) or [10, 20]", - "numb_test": 10, - "save_freq": 1000, - "save_ckpt": "model.ckpt", - "load_ckpt": "model.ckpt", - "disp_training":true, - "time_training":true, - "profiling": false, - "profiling_file":"timeline.json", - "_comment": "that's all" - } -``` -The option **`systems`** provide location of the systems (path to `set.*` and `type.raw`). It is a vector, thus DeePMD-kit allows you to provide multiple systems. DeePMD-kit will train the model with the systems in the vector one by one in a cyclic manner. **It is warned that the example water data (in folder `examples/data/water`) is of very limited amount, is provided only for testing purpose, and should not be used to train a productive model.** - -The option **`batch_size`** specifies the number of frames in each batch. It can be set to `"auto"` to enable a automatic batch size or it can be input as a list setting batch size individually for each system. -The option **`stop_batch`** specifies the total number of batches will be used in the training. - -The option **`numb_test`** specifies the number of tests that will be used for each system. If it is an integer each system will be tested with the same number of tests. It can be set to percentage `"XX%"` to use XX% of frames of each system for its testing or it can be input as a list setting numer of tests individually for each system (the order should correspond to ordering of the systems key in json). - -### Training - -The training can be invoked by -```bash -$ dp train water_se_a.json -``` - -During the training, the error of the model is tested every **`disp_freq`** batches with **`numb_test`** frames from the last set in the **`systems`** directory on the fly, and the results are output to **`disp_file`**. 
A typical `disp_file` looks like -```bash -# batch l2_tst l2_trn l2_e_tst l2_e_trn l2_f_tst l2_f_trn lr - 0 2.67e+01 2.57e+01 2.21e-01 2.22e-01 8.44e-01 8.12e-01 1.0e-03 - 100 6.14e+00 5.40e+00 3.01e-01 2.99e-01 1.93e-01 1.70e-01 1.0e-03 - 200 5.02e+00 4.49e+00 1.53e-01 1.53e-01 1.58e-01 1.42e-01 1.0e-03 - 300 4.36e+00 3.71e+00 7.32e-02 7.27e-02 1.38e-01 1.17e-01 1.0e-03 - 400 4.04e+00 3.29e+00 3.16e-02 3.22e-02 1.28e-01 1.04e-01 1.0e-03 -``` -The first column displays the number of batches. The second and third columns display the loss function evaluated by `numb_test` frames randomly chosen from the test set and that evaluated by the current training batch, respectively. The fourth and fifth columns display the RMS energy error (normalized by number of atoms) evaluated by `numb_test` frames randomly chosen from the test set and that evaluated by the current training batch, respectively. The sixth and seventh columns display the RMS force error (component-wise) evaluated by `numb_test` frames randomly chosen from the test set and that evaluated by the current training batch, respectively. The last column displays the current learning rate. +A quick-start on using DeePMD-kit can be found [here](doc/use-deepmd-kit.md). -Checkpoints will be written to files with prefix **`save_ckpt`** every **`save_freq`** batches. If **`restart`** is set to `true`, then the training will start from the checkpoint named **`load_ckpt`**, rather than from scratch. - -Several command line options can be passed to `dp train`, which can be checked with -```bash -$ dp train --help -``` -An explanation will be provided -``` -positional arguments: - INPUT the input json database - -optional arguments: - -h, --help show this help message and exit - --init-model INIT_MODEL - Initialize a model by the provided checkpoint - --restart RESTART Restart the training from the provided checkpoint -``` -The keys `intra_op_parallelism_threads` and `inter_op_parallelism_threads` are Tensorflow configurations for multithreading, which are explained [here](https://www.tensorflow.org/performance/performance_guide#optimizing_for_cpu). Skipping `-t` and `OMP_NUM_THREADS` leads to the default setting of these keys in the Tensorflow. +A full [document](doc/train-input.rst) on options in the training input script is available. -**`--init-model model.ckpt`**, for example, initializes the model training with an existing model that is stored in the checkpoint `model.ckpt`, the network architectures should match. - -**`--restart model.ckpt`**, continues the training from the checkpoint `model.ckpt`. - -On some resources limited machines, one may want to control the number of threads used by DeePMD-kit. This is achieved by three environmental variables: `OMP_NUM_THREADS`, `TF_INTRA_OP_PARALLELISM_THREADS` and `TF_INTER_OP_PARALLELISM_THREADS`. `OMP_NUM_THREADS` controls the multithreading of DeePMD-kit implemented operations. `TF_INTRA_OP_PARALLELISM_THREADS` and `TF_INTER_OP_PARALLELISM_THREADS` controls `intra_op_parallelism_threads` and `inter_op_parallelism_threads`, which are Tensorflow configurations for multithreading. An explanation is found [here](https://stackoverflow.com/questions/41233635/meaning-of-inter-op-parallelism-threads-and-intra-op-parallelism-threads). 
- -For example if you wish to use 3 cores of 2 CPUs on one node, you may set the environmental variables and run DeePMD-kit as follows: -```bash -export OMP_NUM_THREADS=6 -export TF_INTRA_OP_PARALLELISM_THREADS=3 -export TF_INTER_OP_PARALLELISM_THREADS=2 -dp train input.json -``` - -## Freeze a model - -The trained neural network is extracted from a checkpoint and dumped into a database. This process is called "freezing" a model. The idea and part of our code are from [Morgan](https://blog.metaflow.fr/tensorflow-how-to-freeze-a-model-and-serve-it-with-a-python-api-d4f3596b3adc). To freeze a model, typically one does -```bash -$ dp freeze -o graph.pb -``` -in the folder where the model is trained. The output database is called `graph.pb`. - - -## Test a model - -The frozen model can be used in many ways. The most straightforward test can be performed using `dp test`. A typical usage of `dp test` is -```bash -dp test -m graph.pb -s /path/to/system -n 30 -``` -where `-m` gives the tested model, `-s` the path to the tested system and `-n` the number of tested frames. Several other command line options can be passed to `dp test`, which can be checked with -```bash -$ dp test --help -``` -An explanation will be provided -``` -usage: dp test [-h] [-m MODEL] [-s SYSTEM] [-S SET_PREFIX] [-n NUMB_TEST] - [-r RAND_SEED] [--shuffle-test] [-d DETAIL_FILE] - -optional arguments: - -h, --help show this help message and exit - -m MODEL, --model MODEL - Frozen model file to import - -s SYSTEM, --system SYSTEM - The system dir - -S SET_PREFIX, --set-prefix SET_PREFIX - The set prefix - -n NUMB_TEST, --numb-test NUMB_TEST - The number of data for test - -r RAND_SEED, --rand-seed RAND_SEED - The random seed - --shuffle-test Shuffle test data - -d DETAIL_FILE, --detail-file DETAIL_FILE - The file containing details of energy force and virial - accuracy -``` - -## Model inference -One may use the python interface of DeePMD-kit for model inference, an example is given as follows -```python -import deepmd.DeepPot as DP -import numpy as np -dp = DP('graph.pb') -coord = np.array([[1,0,0], [0,0,1.5], [1,0,3]]).reshape([1, -1]) -cell = np.diag(10 * np.ones(3)).reshape([1, -1]) -atype = [1,0,1] -e, f, v = dp.eval(coord, cell, atype) -``` -where `e`, `f` and `v` are predicted energy, force and virial of the system, respectively. - - -## Run MD with LAMMPS -### Include deepmd in the pair style -Running an MD simulation with LAMMPS is simpler. In the LAMMPS input file, one needs to specify the pair style as follows -```bash -pair_style deepmd graph.pb -pair_coeff -``` -where `graph.pb` is the file name of the frozen model. The `pair_coeff` should be left blank. It should be noted that LAMMPS counts atom types starting from 1, therefore, all LAMMPS atom type will be firstly subtracted by 1, and then passed into the DeePMD-kit engine to compute the interactions. [A detailed documentation of this pair style is available.](doc/lammps-pair-style-deepmd.md). - -### Long-range interaction -The reciprocal space part of the long-range interaction can be calculated by LAMMPS command `kspace_style`. To use it with DeePMD-kit, one writes -```bash -pair_style deepmd graph.pb -pair_coeff -kspace_style pppm 1.0e-5 -kspace_modify gewald 0.45 -``` -Please notice that the DeePMD does nothing to the direct space part of the electrostatic interaction, because this part is assumed to be fitted in the DeePMD model (the direct space cut-off is thus the cut-off of the DeePMD model). 
The splitting parameter `gewald` is modified by the `kspace_modify` command. - -## Run path-integral MD with i-PI -The i-PI works in a client-server model. The i-PI provides the server for integrating the replica positions of atoms, while the DeePMD-kit provides a client named `dp_ipi` that computes the interactions (including energy, force and virial). The server and client communicates via the Unix domain socket or the Internet socket. The client can be started by -```bash -$ dp_ipi water.json -``` -It is noted that multiple instances of the client is allow for computing, in parallel, the interactions of multiple replica of the path-integral MD. - -`water.json` is the parameter file for the client `dp_ipi`, and [an example](./examples/ipi/water.json) is provided: -```json -{ - "verbose": false, - "use_unix": true, - "port": 31415, - "host": "localhost", - "graph_file": "graph.pb", - "coord_file": "conf.xyz", - "atom_type" : { - "OW": 0, - "HW1": 1, - "HW2": 1 - } -} -``` -The option **`use_unix`** is set to `true` to activate the Unix domain socket, otherwise, the Internet socket is used. - -The option **`graph_file`** provides the file name of the frozen model. - -The `dp_ipi` gets the atom names from an [XYZ file](https://en.wikipedia.org/wiki/XYZ_file_format) provided by **`coord_file`** (meanwhile ignores all coordinates in it), and translates the names to atom types by rules provided by **`atom_type`**. - -## Use deep potential with ASE - -Deep potential can be set up as a calculator with ASE to obtain potential energies and forces. -```python -from ase import Atoms -from deepmd.calculator import DP - -water = Atoms('H2O', - positions=[(0.7601, 1.9270, 1), - (1.9575, 1, 1), - (1., 1., 1.)], - cell=[100, 100, 100], - calculator=DP(model="frozen_model.pb")) -print(water.get_potential_energy()) -print(water.get_forces()) -``` - -Optimization is also available: -```python -from ase.optimize import BFGS -dyn = BFGS(water) -dyn.run(fmax=1e-6) -print(water.get_positions()) -``` # Troubleshooting In consequence of various differences of computers or systems, problems may occur. Some common circumstances are listed as follows. diff --git a/doc/install.md b/doc/install.md new file mode 100644 index 0000000000..91a1f8353d --- /dev/null +++ b/doc/install.md @@ -0,0 +1,200 @@ +- [Download and install](#download-and-install) + - [Easy installation methods](#easy-installation-methods) + - [Offline packages](#offline-packages) + - [With Docker](#with-docker) + - [With conda](#with-conda) + - [Install the python interaction](#install-the-python-interface) + - [Install the Tensorflow's python interface](#install-the-tensorflows-python-interface) + - [Install the DeePMD-kit's python interface](#install-the-deepmd-kits-python-interface) + - [Install the C++ interface](#install-the-c-interface) + - [Install the Tensorflow's C++ interface](#install-the-tensorflows-c-interface) + - [Install the DeePMD-kit's C++ interface](#install-the-deepmd-kits-c-interface) + - [Install LAMMPS's DeePMD-kit module](#install-lammpss-deepmd-kit-module) + +# Download and install + +Please follow our [github](https://github.com/deepmodeling/deepmd-kit) webpage to see the latest released version and development version. + +## Easy installation methods +There various easy methods to install DeePMD-kit. Choose one that you prefer. If you want to build by yourself, jump to the next two sections. + +After your easy installation, DeePMD-kit (`dp`) and LAMMPS (`lmp`) will be available to execute. 
You can try `dp -h` and `lmp -h` to see the help. `mpirun` is also available in case you want to run LAMMPS in parallel.
+
+### Offline packages
+Both CPU and GPU version offline packages are available on [the Releases page](https://github.com/deepmodeling/deepmd-kit/releases).
+
+### With conda
+DeePMD-kit is available with [conda](https://github.com/conda/conda). Install [Anaconda](https://www.anaconda.com/distribution/#download-section) or [Miniconda](https://docs.conda.io/en/latest/miniconda.html) first.
+
+To install the CPU version:
+```bash
+conda install deepmd-kit=*=*cpu lammps-dp=*=*cpu -c deepmodeling
+```
+
+To install the GPU version containing [CUDA 10.1](https://docs.nvidia.com/deploy/cuda-compatibility/index.html#binary-compatibility__table-toolkit-driver):
+```bash
+conda install deepmd-kit=*=*gpu lammps-dp=*=*gpu -c deepmodeling
+```
+
+### With Docker
+A Docker image for installing the DeePMD-kit is available [here](https://github.com/orgs/deepmodeling/packages/container/deepmd-kit).
+
+To pull the CPU version:
+```bash
+docker pull ghcr.io/deepmodeling/deepmd-kit:1.2.0_cpu
+```
+
+To pull the GPU version:
+```bash
+docker pull ghcr.io/deepmodeling/deepmd-kit:1.2.0_cuda10.1_gpu
+```
+
+## Install the python interface
+### Install the Tensorflow's python interface
+First, check the python version on your machine
+```bash
+python --version
+```
+
+We follow the virtual environment approach to install the Tensorflow Python interface. The full instructions can be found on [the Tensorflow official website](https://www.tensorflow.org/install/pip). We assume that the Python interface will be installed in the virtual environment directory `$tensorflow_venv`
+```bash
+virtualenv -p python3 $tensorflow_venv
+source $tensorflow_venv/bin/activate
+pip install --upgrade pip
+pip install --upgrade tensorflow==2.1.0
+```
+Note that every time a new shell is started and one wants to use `DeePMD-kit`, the virtual environment should be activated by
+```bash
+source $tensorflow_venv/bin/activate
+```
+If one wants to leave the virtual environment, he/she can run
+```bash
+deactivate
+```
+If one has multiple python interpreters named like python3.x, a specific one can be selected by, for example
+```bash
+virtualenv -p python3.7 $tensorflow_venv
+```
+If one does not need the GPU support of deepmd-kit and is concerned about package size, the CPU-only version of tensorflow can be installed by
+```bash
+pip install --upgrade tensorflow-cpu==2.1.0
+```
+To verify the installation, run
+```bash
+python -c "import tensorflow as tf;print(tf.reduce_sum(tf.random.normal([1000, 1000])))"
+```
+One should remember to activate the virtual environment every time he/she uses deepmd-kit.
+
+### Install the DeePMD-kit's python interface
+
+Execute
+```bash
+pip install deepmd-kit
+```
+To test the installation, one may execute
+```bash
+dp -h
+```
+It will print the help information like
+```text
+usage: dp [-h] {train,freeze,test} ...
+
+DeePMD-kit: A deep learning package for many-body potential energy
+representation and molecular dynamics
+
+optional arguments:
+  -h, --help           show this help message and exit
+
+Valid subcommands:
+  {train,freeze,test}
+    train              train a model
+    freeze             freeze the model
+    test               test the model
+```
+
+## Install the C++ interface
+
+If one does not need to use DeePMD-kit with LAMMPS or i-PI, then the python interface installed in the previous section does everything and he/she can safely skip this section.
+
+### Install the Tensorflow's C++ interface
+
+Check the compiler version on your machine
+
+```
+gcc --version
+```
+
+The C++ interface of DeePMD-kit was tested with compiler gcc >= 4.8. Note that the i-PI support is only compiled with gcc >= 4.9.
+
+First the C++ interface of Tensorflow should be installed. Note that the version of Tensorflow should be consistent with the python interface. We assume that you have followed our instructions and installed the tensorflow python interface 1.14.0; then you may follow [the instruction for CPU](install-tf.1.14.md) to install the corresponding C++ interface (CPU only). If one wants GPU support, he/she should follow [the instruction for GPU](install-tf.1.14-gpu.md) to install the C++ interface.
+
+### Install the DeePMD-kit's C++ interface
+
+Clone the DeePMD-kit source code
+```bash
+cd /some/workspace
+git clone --recursive https://github.com/deepmodeling/deepmd-kit.git deepmd-kit
+```
+
+For convenience, you may want to record the location of the source in a variable, say `deepmd_source_dir`, by
+```bash
+cd deepmd-kit
+deepmd_source_dir=`pwd`
+```
+
+Now go to the source code directory of DeePMD-kit and make a build directory.
+```bash
+cd $deepmd_source_dir/source
+mkdir build
+cd build
+```
+I assume you want to install DeePMD-kit into the path `$deepmd_root`; then execute cmake
+```bash
+cmake -DTENSORFLOW_ROOT=$tensorflow_root -DCMAKE_INSTALL_PREFIX=$deepmd_root ..
+```
+where the variable `tensorflow_root` stores the location where the tensorflow C++ interface is installed. DeePMD-kit will automatically detect whether a CUDA toolkit is available on your machine and build the GPU support accordingly. If you want to force cmake to find the CUDA toolkit, you can specify the key `USE_CUDA_TOOLKIT`,
+```bash
+cmake -DUSE_CUDA_TOOLKIT=true -DTENSORFLOW_ROOT=$tensorflow_root -DCMAKE_INSTALL_PREFIX=$deepmd_root ..
+```
+and you may be further asked to provide `CUDA_TOOLKIT_ROOT_DIR`. If cmake has executed successfully, then
+```bash
+make
+make install
+```
+If everything works fine, you will have the following executable and libraries installed in `$deepmd_root/bin` and `$deepmd_root/lib`
+```bash
+$ ls $deepmd_root/bin
+dp_ipi
+$ ls $deepmd_root/lib
+libdeepmd_ipi.so libdeepmd_op.so libdeepmd.so
+```
+
+### Install LAMMPS's DeePMD-kit module
+DeePMD-kit provides a module for running MD simulations with LAMMPS. Now make the DeePMD-kit module for LAMMPS.
+```bash
+cd $deepmd_source_dir/source/build
+make lammps
+```
+DeePMD-kit will generate a module called `USER-DEEPMD` in the `build` directory. Now download your favorite LAMMPS code, and uncompress it (I assume that you have downloaded the tar `lammps-stable.tar.gz`)
+```bash
+cd /some/workspace
+tar xf lammps-stable.tar.gz
+```
+The source code of LAMMPS is stored in a directory, for example `lammps-31Mar17`. Now go into the LAMMPS source and copy the DeePMD-kit module like this
+```bash
+cd lammps-31Mar17/src/
+cp -r $deepmd_source_dir/source/build/USER-DEEPMD .
+```
+Now build LAMMPS
+```bash
+make yes-user-deepmd
+make mpi -j4
+```
+The option `-j4` means using 4 processes in parallel. You may want to use a different number according to your hardware.
+
+If everything works fine, you will end up with an executable `lmp_mpi`.
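+
+As a quick smoke test of the freshly built binary, one may run it on a LAMMPS input script that uses the DeePMD-kit pair style (described in the usage documentation). This is only a minimal sketch: `in.water` is a hypothetical input script name, and the number of MPI ranks should be adapted to your machine.
+```bash
+# Run the freshly built LAMMPS executable on 4 MPI ranks.
+# in.water is a placeholder for an input script that contains "pair_style deepmd graph.pb".
+mpirun -np 4 ./lmp_mpi -in in.water
+```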
+ +The DeePMD-kit module can be removed from LAMMPS source code by +```bash +make no-user-deepmd +``` diff --git a/doc/train-input.rst b/doc/train-input.rst index 7a53b8a979..6914497c55 100644 --- a/doc/train-input.rst +++ b/doc/train-input.rst @@ -21,17 +21,21 @@ model: ``dict`` Argument path: model/descriptor/type The type of the descritpor. Valid types are `loc_frame`, `se_a`, `se_r` and `se_ar`. + - `loc_frame`: Defines a local frame at each atom, and the compute the descriptor as local coordinates under this frame. + - `se_a`: Used by the smooth edition of Deep Potential. The full relative coordinates are used to construct the descriptor. + - `se_r`: Used by the smooth edition of Deep Potential. Only the distance between atoms is used to construct the descriptor. - - `se_ar`: - A hybrid of `se_a` and `se_r`. Typically `se_a` has a smaller cut-off - while the `se_r` has a larger cut-off. + + - + `se_ar`: A hybrid of `se_a` and `se_r`. Typically `se_a` has a smaller + cut-off while the `se_r` has a larger cut-off. When *type* is set to ``loc_frame``: @@ -71,15 +75,18 @@ model: ``dict`` index of the axis atom defining the first axis. Note that the neighbors with the same class and type are sorted according to their relative distance. + - axis_rule[i*6+3]: class of the atom defining the first axis of type-i atom. 0 for neighbors with full coordinates and 1 for neighbors only with relative distance. - - axis_rule[i*6+4]: type of - the atom defining the second axis of type-i atom. - - axis_rule[i*6+5]: - class of the atom defining the second axis of type-i atom. 0 for - neighbors with full coordinates and 1 for neighbors only with relative - distance. + + - axis_rule[i*6+4]: type + of the atom defining the second axis of type-i atom. + + - + axis_rule[i*6+5]: class of the atom defining the second axis of type-i + atom. 0 for neighbors with full coordinates and 1 for neighbors only + with relative distance. When *type* is set to ``se_a``: @@ -225,17 +232,21 @@ model: ``dict`` Argument path: model/fitting_net/type The type of the fitting. Valid types are `ener`, `dipole`, `polar` and `global_polar`. + - `ener`: Fit an energy model (potential energy surface). - - `dipole`: Fit an atomic dipole model. Atomic dipole labels - for all the selected atoms (see `sel_type`) should be provided by - `dipole.npy` in each data system. The file has number of frames lines - and 3 times of number of selected atoms columns. - - `polar`: Fit an - atomic polarizability model. Atomic polarizability labels for all the - selected atoms (see `sel_type`) should be provided by + + - `dipole`: Fit an atomic dipole model. Atomic dipole + labels for all the selected atoms (see `sel_type`) should be provided + by `dipole.npy` in each data system. The file has number of frames + lines and 3 times of number of selected atoms columns. + + - `polar`: Fit + an atomic polarizability model. Atomic polarizability labels for all + the selected atoms (see `sel_type`) should be provided by `polarizability.npy` in each data system. The file has number of frames lines and 9 times of number of selected atoms columns. + - `global_polar`: Fit a polarizability model. Polarizability labels should be provided by `polarizability.npy` in each data system. The @@ -276,11 +287,13 @@ model: ``dict`` Argument path: model/fitting_net/ener/trainable Whether the parameters in the fitting net are trainable. This option can be + - bool: True if all parameters of the fitting net are trainable, False otherwise. 
- - list of bool: Specifies if each layer is - trainable. Since the fitting net is composed by hidden layers followed - by a output layer, the length of tihs list should be equal to + + - list of bool: Specifies if each layer + is trainable. Since the fitting net is composed by hidden layers + followed by a output layer, the length of tihs list should be equal to len(`neuron`)+1. rcond: ``float``, optional @@ -533,13 +546,17 @@ training: ``dict`` batch_size: ``int``|``list``|``str``, optional Argument path: training/batch_size This key can be + - list: the length of which is the same as the `systems`. The batch size of each system is given by the elements of the list. + - int: all `systems` uses the same batch size. + - string "auto": automatically determines the batch size os that the batch_size times the number of atoms in the system is no less than 32. + - string "auto:N": automatically determines the batch size os that the batch_size times the number of atoms in the system is no less than N. diff --git a/doc/use-deepmd-kit.md b/doc/use-deepmd-kit.md new file mode 100644 index 0000000000..cbf7a91cbc --- /dev/null +++ b/doc/use-deepmd-kit.md @@ -0,0 +1,352 @@ +- [Use DeePMD-kit](#use-deepmd-kit) + - [Prepare data](#prepare-data) + - [Train a model](#train-a-model) + - [The DeePMD model](#the-deepmd-model) + - [The DeepPot-SE model](#the-deeppot-se-model) + - [Freeze a model](#freeze-a-model) + - [Test a model](#test-a-model) + - [Model inference](#model-inference) + - [Run MD with Lammps](#run-md-with-lammps) + - [Include deepmd in the pair style](#include-deepmd-in-the-pair-style) + - [Long-range interaction](#long-range-interaction) + - [Run path-integral MD with i-PI](#run-path-integral-md-with-i-pi) + - [Use deep potential with ASE](#use-deep-potential-with-ase) + +# Use DeePMD-kit +In this text, we will call the deep neural network that is used to represent the interatomic interactions (Deep Potential) the **model**. The typical procedure of using DeePMD-kit is + +1. Prepare data +2. Train a model +3. Freeze the model +4. Test the model +5. Inference with the model + +## Prepare data +One needs to provide the following information to train a model: the atom type, the simulation box, the atom coordinate, the atom force, system energy and virial. A snapshot of a system that contains these information is called a **frame**. We use the following convention of units: + +Property| Unit +--- | :---: +Time | ps +Length | Å +Energy | eV +Force | eV/Å +Pressure| Bar + +The frames of the system are stored in two formats. A raw file is a plain text file with each information item written in one file and one frame written on one line. The default files that provide box, coordinate, force, energy and virial are `box.raw`, `coord.raw`, `force.raw`, `energy.raw` and `virial.raw`, respectively. *We recommend you use these file names*. Here is an example of force.raw: +```bash +$ cat force.raw +-0.724 2.039 -0.951 0.841 -0.464 0.363 + 6.737 1.554 -5.587 -2.803 0.062 2.222 +-1.968 -0.163 1.020 -0.225 -0.789 0.343 +``` +This `force.raw` contains 3 frames with each frame having the forces of 2 atoms, thus it has 3 lines and 6 columns. Each line provides all the 3 force components of 2 atoms in 1 frame. The first three numbers are the 3 force components of the first atom, while the second three numbers are the 3 force components of the second atom. The coordinate file `coord.raw` is organized similarly. In `box.raw`, the 9 components of the box vectors should be provided on each line. 
In `virial.raw`, the 9 components of the virial tensor should be provided on each line. The number of lines of all raw files should be identical.
+
+We assume that the atom types do not change across frames. They are provided by `type.raw`, which has one line with the types of atoms written one by one. The atom types should be integers. For example, the `type.raw` of a system that has 2 atoms with types 0 and 1 reads:
+```bash
+$ cat type.raw
+0 1
+```
+
+The second format is the data sets of `numpy` binary data that are directly used by the training program. Users can use the script `$deepmd_source_dir/data/raw/raw_to_set.sh` to convert the prepared raw files to data sets. For example, if we have a raw file that contains 6000 frames,
+```bash
+$ ls
+box.raw coord.raw energy.raw force.raw type.raw virial.raw
+$ $deepmd_source_dir/data/raw/raw_to_set.sh 2000
+nframe is 6000
+nline per set is 2000
+will make 3 sets
+making set 0 ...
+making set 1 ...
+making set 2 ...
+$ ls
+box.raw coord.raw energy.raw force.raw set.000 set.001 set.002 type.raw virial.raw
+```
+It generates three sets `set.000`, `set.001` and `set.002`, each containing 2000 frames. The last set (`set.002`) is used as the testing set, while the remaining sets (`set.000` and `set.001`) are used as training sets. One does not need to take care of the binary data files in each of the `set.*` directories. The path containing `set.*` and `type.raw` is called a *system*.
+
+## Train a model
+
+### Write the input script
+
+The method of training is explained in our [DeePMD][2] and [DeepPot-SE][3] papers. With the source code we provide a small training dataset taken from 400 frames generated by an NVT ab-initio water MD trajectory, with 300 frames for training and 100 for testing. [An example training parameter file](./examples/water/train/water_se_a.json) is provided. One can try the training by
+```bash
+$ cd $deepmd_source_dir/examples/water/train/
+$ dp train water_se_a.json
+```
+where `water_se_a.json` is the `json` format parameter file that controls the training. It is also possible to use a `yaml` format file with the same keys as json (see the `water_se_a.yaml` example). You can use the script `json2yaml.py` in the `data/json/` dir to convert your json files to yaml. The parameter file contains four sections: `model`, `learning_rate`, `loss` and `training`.
+
+The `model` section specifies how the deep potential model is built. An example of the smooth edition is provided as follows
+```json
+    "model": {
+        "type_map": ["O", "H"],
+        "descriptor" :{
+            "type": "se_a",
+            "rcut_smth": 5.80,
+            "rcut": 6.00,
+            "sel": [46, 92],
+            "neuron": [25, 50, 100],
+            "axis_neuron": 16,
+            "resnet_dt": false,
+            "seed": 1,
+            "_comment": " that's all"
+        },
+        "fitting_net" : {
+            "neuron": [240, 240, 240],
+            "resnet_dt": true,
+            "seed": 1,
+            "_comment": " that's all"
+        },
+        "_comment": " that's all"
+    }
+```
+The **`type_map`** is optional; it provides the element names (though not restricted to element names) for the corresponding atom types.
+
+The construction of the descriptor is given by the option **`descriptor`**. The **`type`** of the descriptor is set to `"se_a"`, which means the smooth edition with angular information. The **`rcut`** is the cut-off radius for neighbor searching, and the **`rcut_smth`** gives where the smoothing starts. **`sel`** gives the maximum possible number of neighbors in the cut-off radius. It is a list, the length of which is the same as the number of atom types in the system, and `sel[i]` denotes the maximum possible number of neighbors with type `i`.
The **`neuron`** specifies the size of the embedding net. From left to right the members denote the sizes of each hidden layers from input end to the output end, respectively. The **`axis_neuron`** specifies the size of submatrix of the embedding matrix, the axis matrix as explained in the [DeepPot-SE paper][3]. If the outer layer is of twice size as the inner layer, then the inner layer is copied and concatenated, then a [ResNet architecture](https://arxiv.org/abs/1512.03385) is build between them. If the option **`resnet_dt`** is set `true`, then a timestep is used in the ResNet. **`seed`** gives the random seed that is used to generate random numbers when initializing the model parameters. + +The construction of the fitting net is give by **`fitting_net`**. The key **`neuron`** specifies the size of the fitting net. If two neighboring layers are of the same size, then a [ResNet architecture](https://arxiv.org/abs/1512.03385) is build between them. If the option **`resnet_dt`** is set `true`, then a timestep is used in the ResNet. **`seed`** gives the random seed that is used to generate random numbers when initializing the model parameters. + +An example of the `learning_rate` is given as follows +```json + "learning_rate" :{ + "type": "exp", + "start_lr": 0.005, + "decay_steps": 5000, + "decay_rate": 0.95, + "_comment": "that's all" + } +``` +The option **`start_lr`**, **`decay_rate`** and **`decay_steps`** specify how the learning rate changes. For example, the `t`th batch will be trained with learning rate: +```math +lr(t) = start_lr * decay_rate ^ ( t / decay_steps ) +``` + +An example of the `loss` is +```json + "loss" : { + "start_pref_e": 0.02, + "limit_pref_e": 1, + "start_pref_f": 1000, + "limit_pref_f": 1, + "start_pref_v": 0, + "limit_pref_v": 0, + "_comment": " that's all" + } +``` +The options **`start_pref_e`**, **`limit_pref_e`**, **`start_pref_f`**, **`limit_pref_f`**, **`start_pref_v`** and **`limit_pref_v`** determine how the prefactors of energy error, force error and virial error changes in the loss function (see the appendix of the [DeePMD paper][2] for details). Taking the prefactor of force error for example, the prefactor at batch `t` is +```math +w_f(t) = start_pref_f * ( lr(t) / start_lr ) + limit_pref_f * ( 1 - lr(t) / start_lr ) +``` +Since we do not have virial data, the virial prefactors `start_pref_v` and `limit_pref_v` are set to 0. + +An example of `training` is +```json + "training" : { + "systems": ["../data1/", "../data2/"], + "set_prefix": "set", + "stop_batch": 1000000, + "_comment": " batch_size can be supplied with, e.g. 1, or auto (string) or [10, 20]", + "batch_size": 1, + + "seed": 1, + + "_comment": " display and restart", + "_comment": " frequencies counted in batch", + "disp_file": "lcurve.out", + "disp_freq": 100, + "_comment": " numb_test can be supplied with, e.g. 1, or XX% (string) or [10, 20]", + "numb_test": 10, + "save_freq": 1000, + "save_ckpt": "model.ckpt", + "load_ckpt": "model.ckpt", + "disp_training":true, + "time_training":true, + "profiling": false, + "profiling_file":"timeline.json", + "_comment": "that's all" + } +``` +The option **`systems`** provide location of the systems (path to `set.*` and `type.raw`). It is a vector, thus DeePMD-kit allows you to provide multiple systems. DeePMD-kit will train the model with the systems in the vector one by one in a cyclic manner. 
**It is warned that the example water data (in folder `examples/data/water`) is of very limited amount, is provided only for testing purpose, and should not be used to train a productive model.** + +The option **`batch_size`** specifies the number of frames in each batch. It can be set to `"auto"` to enable a automatic batch size or it can be input as a list setting batch size individually for each system. +The option **`stop_batch`** specifies the total number of batches will be used in the training. + +The option **`numb_test`** specifies the number of tests that will be used for each system. If it is an integer each system will be tested with the same number of tests. It can be set to percentage `"XX%"` to use XX% of frames of each system for its testing or it can be input as a list setting numer of tests individually for each system (the order should correspond to ordering of the systems key in json). + +### Training + +The training can be invoked by +```bash +$ dp train water_se_a.json +``` + +During the training, the error of the model is tested every **`disp_freq`** batches with **`numb_test`** frames from the last set in the **`systems`** directory on the fly, and the results are output to **`disp_file`**. A typical `disp_file` looks like +```bash +# batch l2_tst l2_trn l2_e_tst l2_e_trn l2_f_tst l2_f_trn lr + 0 2.67e+01 2.57e+01 2.21e-01 2.22e-01 8.44e-01 8.12e-01 1.0e-03 + 100 6.14e+00 5.40e+00 3.01e-01 2.99e-01 1.93e-01 1.70e-01 1.0e-03 + 200 5.02e+00 4.49e+00 1.53e-01 1.53e-01 1.58e-01 1.42e-01 1.0e-03 + 300 4.36e+00 3.71e+00 7.32e-02 7.27e-02 1.38e-01 1.17e-01 1.0e-03 + 400 4.04e+00 3.29e+00 3.16e-02 3.22e-02 1.28e-01 1.04e-01 1.0e-03 +``` +The first column displays the number of batches. The second and third columns display the loss function evaluated by `numb_test` frames randomly chosen from the test set and that evaluated by the current training batch, respectively. The fourth and fifth columns display the RMS energy error (normalized by number of atoms) evaluated by `numb_test` frames randomly chosen from the test set and that evaluated by the current training batch, respectively. The sixth and seventh columns display the RMS force error (component-wise) evaluated by `numb_test` frames randomly chosen from the test set and that evaluated by the current training batch, respectively. The last column displays the current learning rate. + +Checkpoints will be written to files with prefix **`save_ckpt`** every **`save_freq`** batches. If **`restart`** is set to `true`, then the training will start from the checkpoint named **`load_ckpt`**, rather than from scratch. + +Several command line options can be passed to `dp train`, which can be checked with +```bash +$ dp train --help +``` +An explanation will be provided +``` +positional arguments: + INPUT the input json database + +optional arguments: + -h, --help show this help message and exit + --init-model INIT_MODEL + Initialize a model by the provided checkpoint + --restart RESTART Restart the training from the provided checkpoint +``` +The keys `intra_op_parallelism_threads` and `inter_op_parallelism_threads` are Tensorflow configurations for multithreading, which are explained [here](https://www.tensorflow.org/performance/performance_guide#optimizing_for_cpu). Skipping `-t` and `OMP_NUM_THREADS` leads to the default setting of these keys in the Tensorflow. 
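+
+For instance, the options listed above are combined with the positional input file. As a minimal sketch (assuming the checkpoint prefix `model.ckpt` written by a previous run of this example, i.e. the value of `save_ckpt` above), an interrupted training can be resumed with
+```bash
+# resume training of the water example from an existing checkpoint
+dp train water_se_a.json --restart model.ckpt
+```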
+ +**`--init-model model.ckpt`**, for example, initializes the model training with an existing model that is stored in the checkpoint `model.ckpt`, the network architectures should match. + +**`--restart model.ckpt`**, continues the training from the checkpoint `model.ckpt`. + +On some resources limited machines, one may want to control the number of threads used by DeePMD-kit. This is achieved by three environmental variables: `OMP_NUM_THREADS`, `TF_INTRA_OP_PARALLELISM_THREADS` and `TF_INTER_OP_PARALLELISM_THREADS`. `OMP_NUM_THREADS` controls the multithreading of DeePMD-kit implemented operations. `TF_INTRA_OP_PARALLELISM_THREADS` and `TF_INTER_OP_PARALLELISM_THREADS` controls `intra_op_parallelism_threads` and `inter_op_parallelism_threads`, which are Tensorflow configurations for multithreading. An explanation is found [here](https://stackoverflow.com/questions/41233635/meaning-of-inter-op-parallelism-threads-and-intra-op-parallelism-threads). + +For example if you wish to use 3 cores of 2 CPUs on one node, you may set the environmental variables and run DeePMD-kit as follows: +```bash +export OMP_NUM_THREADS=6 +export TF_INTRA_OP_PARALLELISM_THREADS=3 +export TF_INTER_OP_PARALLELISM_THREADS=2 +dp train input.json +``` + +## Freeze a model + +The trained neural network is extracted from a checkpoint and dumped into a database. This process is called "freezing" a model. The idea and part of our code are from [Morgan](https://blog.metaflow.fr/tensorflow-how-to-freeze-a-model-and-serve-it-with-a-python-api-d4f3596b3adc). To freeze a model, typically one does +```bash +$ dp freeze -o graph.pb +``` +in the folder where the model is trained. The output database is called `graph.pb`. + + +## Test a model + +The frozen model can be used in many ways. The most straightforward test can be performed using `dp test`. A typical usage of `dp test` is +```bash +dp test -m graph.pb -s /path/to/system -n 30 +``` +where `-m` gives the tested model, `-s` the path to the tested system and `-n` the number of tested frames. Several other command line options can be passed to `dp test`, which can be checked with +```bash +$ dp test --help +``` +An explanation will be provided +``` +usage: dp test [-h] [-m MODEL] [-s SYSTEM] [-S SET_PREFIX] [-n NUMB_TEST] + [-r RAND_SEED] [--shuffle-test] [-d DETAIL_FILE] + +optional arguments: + -h, --help show this help message and exit + -m MODEL, --model MODEL + Frozen model file to import + -s SYSTEM, --system SYSTEM + The system dir + -S SET_PREFIX, --set-prefix SET_PREFIX + The set prefix + -n NUMB_TEST, --numb-test NUMB_TEST + The number of data for test + -r RAND_SEED, --rand-seed RAND_SEED + The random seed + --shuffle-test Shuffle test data + -d DETAIL_FILE, --detail-file DETAIL_FILE + The file containing details of energy force and virial + accuracy +``` + +## Model inference +One may use the python interface of DeePMD-kit for model inference, an example is given as follows +```python +import deepmd.DeepPot as DP +import numpy as np +dp = DP('graph.pb') +coord = np.array([[1,0,0], [0,0,1.5], [1,0,3]]).reshape([1, -1]) +cell = np.diag(10 * np.ones(3)).reshape([1, -1]) +atype = [1,0,1] +e, f, v = dp.eval(coord, cell, atype) +``` +where `e`, `f` and `v` are predicted energy, force and virial of the system, respectively. + + +## Run MD with LAMMPS +### Include deepmd in the pair style +Running an MD simulation with LAMMPS is simpler. 
In the LAMMPS input file, one needs to specify the pair style as follows +```bash +pair_style deepmd graph.pb +pair_coeff +``` +where `graph.pb` is the file name of the frozen model. The `pair_coeff` should be left blank. It should be noted that LAMMPS counts atom types starting from 1, therefore, all LAMMPS atom type will be firstly subtracted by 1, and then passed into the DeePMD-kit engine to compute the interactions. [A detailed documentation of this pair style is available.](doc/lammps-pair-style-deepmd.md). + +### Long-range interaction +The reciprocal space part of the long-range interaction can be calculated by LAMMPS command `kspace_style`. To use it with DeePMD-kit, one writes +```bash +pair_style deepmd graph.pb +pair_coeff +kspace_style pppm 1.0e-5 +kspace_modify gewald 0.45 +``` +Please notice that the DeePMD does nothing to the direct space part of the electrostatic interaction, because this part is assumed to be fitted in the DeePMD model (the direct space cut-off is thus the cut-off of the DeePMD model). The splitting parameter `gewald` is modified by the `kspace_modify` command. + +## Run path-integral MD with i-PI +The i-PI works in a client-server model. The i-PI provides the server for integrating the replica positions of atoms, while the DeePMD-kit provides a client named `dp_ipi` that computes the interactions (including energy, force and virial). The server and client communicates via the Unix domain socket or the Internet socket. The client can be started by +```bash +$ dp_ipi water.json +``` +It is noted that multiple instances of the client is allow for computing, in parallel, the interactions of multiple replica of the path-integral MD. + +`water.json` is the parameter file for the client `dp_ipi`, and [an example](./examples/ipi/water.json) is provided: +```json +{ + "verbose": false, + "use_unix": true, + "port": 31415, + "host": "localhost", + "graph_file": "graph.pb", + "coord_file": "conf.xyz", + "atom_type" : { + "OW": 0, + "HW1": 1, + "HW2": 1 + } +} +``` +The option **`use_unix`** is set to `true` to activate the Unix domain socket, otherwise, the Internet socket is used. + +The option **`graph_file`** provides the file name of the frozen model. + +The `dp_ipi` gets the atom names from an [XYZ file](https://en.wikipedia.org/wiki/XYZ_file_format) provided by **`coord_file`** (meanwhile ignores all coordinates in it), and translates the names to atom types by rules provided by **`atom_type`**. + +## Use deep potential with ASE + +Deep potential can be set up as a calculator with ASE to obtain potential energies and forces. +```python +from ase import Atoms +from deepmd.calculator import DP + +water = Atoms('H2O', + positions=[(0.7601, 1.9270, 1), + (1.9575, 1, 1), + (1., 1., 1.)], + cell=[100, 100, 100], + calculator=DP(model="frozen_model.pb")) +print(water.get_potential_energy()) +print(water.get_forces()) +``` + +Optimization is also available: +```python +from ase.optimize import BFGS +dyn = BFGS(water) +dyn.run(fmax=1e-6) +print(water.get_positions()) +``` diff --git a/source/train/argcheck.py b/source/train/argcheck.py index 15c384d843..d852b1c210 100644 --- a/source/train/argcheck.py +++ b/source/train/argcheck.py @@ -23,9 +23,9 @@ def descrpt_local_frame_args (): doc_axis_rule = 'A list of integers. The length should be 6 times of the number of types. \n\n\ - axis_rule[i*6+0]: class of the atom defining the first axis of type-i atom. 
0 for neighbors with full coordinates and 1 for neighbors only with relative distance.\n\n\ - axis_rule[i*6+1]: type of the atom defining the first axis of type-i atom.\n\n\ -- axis_rule[i*6+2]: index of the axis atom defining the first axis. Note that the neighbors with the same class and type are sorted according to their relative distance.\n\ -- axis_rule[i*6+3]: class of the atom defining the first axis of type-i atom. 0 for neighbors with full coordinates and 1 for neighbors only with relative distance.\n\ -- axis_rule[i*6+4]: type of the atom defining the second axis of type-i atom.\n\ +- axis_rule[i*6+2]: index of the axis atom defining the first axis. Note that the neighbors with the same class and type are sorted according to their relative distance.\n\n\ +- axis_rule[i*6+3]: class of the atom defining the first axis of type-i atom. 0 for neighbors with full coordinates and 1 for neighbors only with relative distance.\n\n\ +- axis_rule[i*6+4]: type of the atom defining the second axis of type-i atom.\n\n\ - axis_rule[i*6+5]: class of the atom defining the second axis of type-i atom. 0 for neighbors with full coordinates and 1 for neighbors only with relative distance.' return [ @@ -109,10 +109,10 @@ def descrpt_se_ar_args(): def descrpt_variant_type_args(): - doc_descrpt_type = 'The type of the descritpor. Valid types are `loc_frame`, `se_a`, `se_r` and `se_ar`. \n\ -- `loc_frame`: Defines a local frame at each atom, and the compute the descriptor as local coordinates under this frame.\n\ -- `se_a`: Used by the smooth edition of Deep Potential. The full relative coordinates are used to construct the descriptor.\n\ -- `se_r`: Used by the smooth edition of Deep Potential. Only the distance between atoms is used to construct the descriptor.\n\ + doc_descrpt_type = 'The type of the descritpor. Valid types are `loc_frame`, `se_a`, `se_r` and `se_ar`. \n\n\ +- `loc_frame`: Defines a local frame at each atom, and the compute the descriptor as local coordinates under this frame.\n\n\ +- `se_a`: Used by the smooth edition of Deep Potential. The full relative coordinates are used to construct the descriptor.\n\n\ +- `se_r`: Used by the smooth edition of Deep Potential. Only the distance between atoms is used to construct the descriptor.\n\n\ - `se_ar`: A hybrid of `se_a` and `se_r`. Typically `se_a` has a smaller cut-off while the `se_r` has a larger cut-off.' return Variant("type", [ @@ -130,8 +130,8 @@ def fitting_ener(): doc_activation_function = f'The activation function in the fitting net. Supported activation functions are {list_to_doc(activation_fn_dict.keys())}' doc_precision = f'The precision of the fitting net parameters, supported options are {supported_precision()}' doc_resnet_dt = 'Whether to use a "Timestep" in the skip connection' - doc_trainable = 'Whether the parameters in the fitting net are trainable. This option can be\n\ -- bool: True if all parameters of the fitting net are trainable, False otherwise.\n\ + doc_trainable = 'Whether the parameters in the fitting net are trainable. This option can be\n\n\ +- bool: True if all parameters of the fitting net are trainable, False otherwise.\n\n\ - list of bool: Specifies if each layer is trainable. Since the fitting net is composed by hidden layers followed by a output layer, the length of tihs list should be equal to len(`neuron`)+1.' doc_rcond = 'The condition number used to determine the inital energy shift for each type of atoms.' 
doc_seed = 'Random seed for parameter initialization of the fitting net' @@ -197,10 +197,10 @@ def fitting_dipole(): def fitting_variant_type_args(): - doc_descrpt_type = 'The type of the fitting. Valid types are `ener`, `dipole`, `polar` and `global_polar`. \n\ -- `ener`: Fit an energy model (potential energy surface).\n\ -- `dipole`: Fit an atomic dipole model. Atomic dipole labels for all the selected atoms (see `sel_type`) should be provided by `dipole.npy` in each data system. The file has number of frames lines and 3 times of number of selected atoms columns.\n\ -- `polar`: Fit an atomic polarizability model. Atomic polarizability labels for all the selected atoms (see `sel_type`) should be provided by `polarizability.npy` in each data system. The file has number of frames lines and 9 times of number of selected atoms columns.\n\ + doc_descrpt_type = 'The type of the fitting. Valid types are `ener`, `dipole`, `polar` and `global_polar`. \n\n\ +- `ener`: Fit an energy model (potential energy surface).\n\n\ +- `dipole`: Fit an atomic dipole model. Atomic dipole labels for all the selected atoms (see `sel_type`) should be provided by `dipole.npy` in each data system. The file has number of frames lines and 3 times of number of selected atoms columns.\n\n\ +- `polar`: Fit an atomic polarizability model. Atomic polarizability labels for all the selected atoms (see `sel_type`) should be provided by `polarizability.npy` in each data system. The file has number of frames lines and 9 times of number of selected atoms columns.\n\n\ - `global_polar`: Fit a polarizability model. Polarizability labels should be provided by `polarizability.npy` in each data system. The file has number of frames lines and 9 columns.' return Variant("type", [Argument("ener", dict, fitting_ener()), @@ -291,10 +291,10 @@ def training_args(): doc_systems = 'The data systems. This key can be provided with a listthat specifies the systems, or be provided with a string by which the prefix of all systems are given and the list of the systems is automatically generated.' doc_set_prefix = 'The prefix of the sets in the systems.' doc_stop_batch = 'Number of training batch. Each training uses one batch of data.' - doc_batch_size = 'This key can be \n\ -- list: the length of which is the same as the `systems`. The batch size of each system is given by the elements of the list.\n\ -- int: all `systems` uses the same batch size.\n\ -- string "auto": automatically determines the batch size os that the batch_size times the number of atoms in the system is no less than 32.\n\ + doc_batch_size = 'This key can be \n\n\ +- list: the length of which is the same as the `systems`. The batch size of each system is given by the elements of the list.\n\n\ +- int: all `systems` uses the same batch size.\n\n\ +- string "auto": automatically determines the batch size os that the batch_size times the number of atoms in the system is no less than 32.\n\n\ - string "auto:N": automatically determines the batch size os that the batch_size times the number of atoms in the system is no less than N.' doc_seed = 'The random seed for training.' doc_disp_file = 'The file for printing learning curve.' 
From cacf428ea86c75875375378b1ed97c8af0f4b47a Mon Sep 17 00:00:00 2001 From: Han Wang Date: Mon, 19 Oct 2020 23:48:53 +0800 Subject: [PATCH 40/65] doc generated by updated dargs --- doc/train-input.rst | 962 ++++++++++++++++++++++++-------------------- 1 file changed, 525 insertions(+), 437 deletions(-) diff --git a/doc/train-input.rst b/doc/train-input.rst index 6914497c55..88b7a412d8 100644 --- a/doc/train-input.rst +++ b/doc/train-input.rst @@ -1,602 +1,690 @@ -model: ``dict`` - Argument path: model +model: + | type: ``dict`` + | argument path: ``model`` + + type_map: + | type: ``list``, optional + | argument path: ``model/type_map`` - type_map: ``list``, optional - Argument path: model/type_map A list of strings. Give the name to each type of atoms. - data_stat_nbatch: ``int``, optional - Argument path: model/data_stat_nbatch - The model determines the normalization from the statistics of the - data. This key specifies the number of `frames` in each `system` used - for statistics. + data_stat_nbatch: + | type: ``int``, optional, default: ``10`` + | argument path: ``model/data_stat_nbatch`` + + The model determines the normalization from the statistics of the data. This key specifies the number of `frames` in each `system` used for statistics. + + descriptor: + | type: ``dict`` + | argument path: ``model/descriptor`` - descriptor: ``dict`` - Argument path: model/descriptor The descriptor of atomic environment. + Depending on the value of *type*, different sub args are accepted. - type: ``str`` - Argument path: model/descriptor/type - The type of the descritpor. Valid types are `loc_frame`, `se_a`, - `se_r` and `se_ar`. + type: + | type: ``str`` (flag key) + | argument path: ``model/descriptor/type`` + + The type of the descritpor. Valid types are `loc_frame`, `se_a`, `se_r` and `se_ar`. - - `loc_frame`: Defines a local frame at each - atom, and the compute the descriptor as local coordinates under this - frame. + - `loc_frame`: Defines a local frame at each atom, and the compute the descriptor as local coordinates under this frame. - - `se_a`: Used by the smooth edition of Deep Potential. The - full relative coordinates are used to construct the descriptor. + - `se_a`: Used by the smooth edition of Deep Potential. The full relative coordinates are used to construct the descriptor. - - - `se_r`: Used by the smooth edition of Deep Potential. Only the - distance between atoms is used to construct the descriptor. + - `se_r`: Used by the smooth edition of Deep Potential. Only the distance between atoms is used to construct the descriptor. + + - `se_ar`: A hybrid of `se_a` and `se_r`. Typically `se_a` has a smaller cut-off while the `se_r` has a larger cut-off. - - - `se_ar`: A hybrid of `se_a` and `se_r`. Typically `se_a` has a smaller - cut-off while the `se_r` has a larger cut-off. When *type* is set to ``loc_frame``: - sel_a: ``list`` - Argument path: model/descriptor/loc_frame/sel_a - A list of integers. The length of the list should be the same as the - number of atom types in the system. `sel_a[i]` gives the selected - number of type-i neighbors. The full relative coordinates of the - neighbors are used by the descriptor. - - sel_r: ``list`` - Argument path: model/descriptor/loc_frame/sel_r - A list of integers. The length of the list should be the same as the - number of atom types in the system. `sel_r[i]` gives the selected - number of type-i neighbors. Only relative distance of the neighbors - are used by the descriptor. 
sel_a[i] + sel_r[i] is recommended to be - larger than the maximally possible number of type-i neighbors in the - cut-off radius. - - rcut: ``float``, optional - Argument path: model/descriptor/loc_frame/rcut + sel_a: + | type: ``list`` + | argument path: ``model/descriptor[loc_frame]/sel_a`` + + A list of integers. The length of the list should be the same as the number of atom types in the system. `sel_a[i]` gives the selected number of type-i neighbors. The full relative coordinates of the neighbors are used by the descriptor. + + sel_r: + | type: ``list`` + | argument path: ``model/descriptor[loc_frame]/sel_r`` + + A list of integers. The length of the list should be the same as the number of atom types in the system. `sel_r[i]` gives the selected number of type-i neighbors. Only relative distance of the neighbors are used by the descriptor. sel_a[i] + sel_r[i] is recommended to be larger than the maximally possible number of type-i neighbors in the cut-off radius. + + rcut: + | type: ``float``, optional, default: ``6.0`` + | argument path: ``model/descriptor[loc_frame]/rcut`` + The cut-off radius. The default value is 6.0 - axis_rule: ``list`` - Argument path: model/descriptor/loc_frame/axis_rule - A list of integers. The length should be 6 times of the number of - types. + axis_rule: + | type: ``list`` + | argument path: ``model/descriptor[loc_frame]/axis_rule`` + + A list of integers. The length should be 6 times of the number of types. - - axis_rule[i*6+0]: class of the atom defining the first axis - of type-i atom. 0 for neighbors with full coordinates and 1 for - neighbors only with relative distance. + - axis_rule[i*6+0]: class of the atom defining the first axis of type-i atom. 0 for neighbors with full coordinates and 1 for neighbors only with relative distance. - - axis_rule[i*6+1]: type of - the atom defining the first axis of type-i atom. + - axis_rule[i*6+1]: type of the atom defining the first axis of type-i atom. - - axis_rule[i*6+2]: - index of the axis atom defining the first axis. Note that the - neighbors with the same class and type are sorted according to their - relative distance. + - axis_rule[i*6+2]: index of the axis atom defining the first axis. Note that the neighbors with the same class and type are sorted according to their relative distance. - - axis_rule[i*6+3]: class of the atom defining the - first axis of type-i atom. 0 for neighbors with full coordinates and 1 - for neighbors only with relative distance. + - axis_rule[i*6+3]: class of the atom defining the first axis of type-i atom. 0 for neighbors with full coordinates and 1 for neighbors only with relative distance. - - axis_rule[i*6+4]: type - of the atom defining the second axis of type-i atom. + - axis_rule[i*6+4]: type of the atom defining the second axis of type-i atom. + + - axis_rule[i*6+5]: class of the atom defining the second axis of type-i atom. 0 for neighbors with full coordinates and 1 for neighbors only with relative distance. - - - axis_rule[i*6+5]: class of the atom defining the second axis of type-i - atom. 0 for neighbors with full coordinates and 1 for neighbors only - with relative distance. When *type* is set to ``se_a``: - sel: ``list`` - Argument path: model/descriptor/se_a/sel - A list of integers. The length of the list should be the same as the - number of atom types in the system. `sel[i]` gives the selected number - of type-i neighbors. `sel[i]` is recommended to be larger than the - maximally possible number of type-i neighbors in the cut-off radius. 
+ sel: + | type: ``list`` + | argument path: ``model/descriptor[se_a]/sel`` + + A list of integers. The length of the list should be the same as the number of atom types in the system. `sel[i]` gives the selected number of type-i neighbors. `sel[i]` is recommended to be larger than the maximally possible number of type-i neighbors in the cut-off radius. + + rcut: + | type: ``float``, optional, default: ``6.0`` + | argument path: ``model/descriptor[se_a]/rcut`` - rcut: ``float``, optional - Argument path: model/descriptor/se_a/rcut The cut-off radius. - rcut_smth: ``float``, optional - Argument path: model/descriptor/se_a/rcut_smth - Where to start smoothing. For example the 1/r term is smoothed from - `rcut` to `rcut_smth` + rcut_smth: + | type: ``float``, optional, default: ``0.5`` + | argument path: ``model/descriptor[se_a]/rcut_smth`` + + Where to start smoothing. For example the 1/r term is smoothed from `rcut` to `rcut_smth` + + neuron: + | type: ``list``, optional, default: ``[10, 20, 40]`` + | argument path: ``model/descriptor[se_a]/neuron`` + + Number of neurons in each hidden layers of the embedding net. When two layers are of the same size or one layer is twice as large as the previous layer, a skip connection is built. - neuron: ``list``, optional - Argument path: model/descriptor/se_a/neuron - Number of neurons in each hidden layers of the embedding net. When two - layers are of the same size or one layer is twice as large as the - previous layer, a skip connection is built. + axis_neuron: + | type: ``int``, optional, default: ``4`` + | argument path: ``model/descriptor[se_a]/axis_neuron`` - axis_neuron: ``int``, optional - Argument path: model/descriptor/se_a/axis_neuron Size of the submatrix of G (embedding matrix). - activation_function: ``str``, optional - Argument path: model/descriptor/se_a/activation_function - The activation function in the embedding net. Supported activation - functions are "relu", "relu6", "softplus", "sigmoid", "tanh", "gelu". + activation_function: + | type: ``str``, optional, default: ``tanh`` + | argument path: ``model/descriptor[se_a]/activation_function`` + + The activation function in the embedding net. Supported activation functions are "relu", "relu6", "softplus", "sigmoid", "tanh", "gelu". + + resnet_dt: + | type: ``bool``, optional, default: ``False`` + | argument path: ``model/descriptor[se_a]/resnet_dt`` - resnet_dt: ``bool``, optional - Argument path: model/descriptor/se_a/resnet_dt Whether to use a "Timestep" in the skip connection - type_one_side: ``bool``, optional - Argument path: model/descriptor/se_a/type_one_side - Try to build N_types embedding nets. Otherwise, building N_types^2 - embedding nets + type_one_side: + | type: ``bool``, optional, default: ``False`` + | argument path: ``model/descriptor[se_a]/type_one_side`` + + Try to build N_types embedding nets. Otherwise, building N_types^2 embedding nets + + precision: + | type: ``str``, optional, default: ``float64`` + | argument path: ``model/descriptor[se_a]/precision`` + + The precision of the embedding net parameters, supported options are "float64", "float32", "float16". - precision: ``str``, optional - Argument path: model/descriptor/se_a/precision - The precision of the embedding net parameters, supported options are - "float64", "float32", "float16". 
+ trainable: + | type: ``bool``, optional, default: ``True`` + | argument path: ``model/descriptor[se_a]/trainable`` - trainable: ``bool``, optional - Argument path: model/descriptor/se_a/trainable If the parameters in the embedding net is trainable - seed: ``int``|``NoneType``, optional - Argument path: model/descriptor/se_a/seed + seed: + | type: ``int`` | ``NoneType``, optional + | argument path: ``model/descriptor[se_a]/seed`` + Random seed for parameter initialization - exclude_types: ``list``, optional - Argument path: model/descriptor/se_a/exclude_types + exclude_types: + | type: ``list``, optional, default: ``[]`` + | argument path: ``model/descriptor[se_a]/exclude_types`` + The Excluded types - set_davg_zero: ``bool``, optional - Argument path: model/descriptor/se_a/set_davg_zero - Set the normalization average to zero. This option should be set when - `atom_ener` in the energy fitting is used + set_davg_zero: + | type: ``bool``, optional, default: ``False`` + | argument path: ``model/descriptor[se_a]/set_davg_zero`` + + Set the normalization average to zero. This option should be set when `atom_ener` in the energy fitting is used + When *type* is set to ``se_r``: - sel: ``list`` - Argument path: model/descriptor/se_r/sel - A list of integers. The length of the list should be the same as the - number of atom types in the system. `sel[i]` gives the selected number - of type-i neighbors. `sel[i]` is recommended to be larger than the - maximally possible number of type-i neighbors in the cut-off radius. + sel: + | type: ``list`` + | argument path: ``model/descriptor[se_r]/sel`` + + A list of integers. The length of the list should be the same as the number of atom types in the system. `sel[i]` gives the selected number of type-i neighbors. `sel[i]` is recommended to be larger than the maximally possible number of type-i neighbors in the cut-off radius. + + rcut: + | type: ``float``, optional, default: ``6.0`` + | argument path: ``model/descriptor[se_r]/rcut`` - rcut: ``float``, optional - Argument path: model/descriptor/se_r/rcut The cut-off radius. - rcut_smth: ``float``, optional - Argument path: model/descriptor/se_r/rcut_smth - Where to start smoothing. For example the 1/r term is smoothed from - `rcut` to `rcut_smth` + rcut_smth: + | type: ``float``, optional, default: ``0.5`` + | argument path: ``model/descriptor[se_r]/rcut_smth`` + + Where to start smoothing. For example the 1/r term is smoothed from `rcut` to `rcut_smth` + + neuron: + | type: ``list``, optional, default: ``[10, 20, 40]`` + | argument path: ``model/descriptor[se_r]/neuron`` + + Number of neurons in each hidden layers of the embedding net. When two layers are of the same size or one layer is twice as large as the previous layer, a skip connection is built. - neuron: ``list``, optional - Argument path: model/descriptor/se_r/neuron - Number of neurons in each hidden layers of the embedding net. When two - layers are of the same size or one layer is twice as large as the - previous layer, a skip connection is built. + activation_function: + | type: ``str``, optional, default: ``tanh`` + | argument path: ``model/descriptor[se_r]/activation_function`` - activation_function: ``str``, optional - Argument path: model/descriptor/se_r/activation_function - The activation function in the embedding net. Supported activation - functions are "relu", "relu6", "softplus", "sigmoid", "tanh", "gelu". + The activation function in the embedding net. 
Supported activation functions are "relu", "relu6", "softplus", "sigmoid", "tanh", "gelu". + + resnet_dt: + | type: ``bool``, optional, default: ``False`` + | argument path: ``model/descriptor[se_r]/resnet_dt`` - resnet_dt: ``bool``, optional - Argument path: model/descriptor/se_r/resnet_dt Whether to use a "Timestep" in the skip connection - type_one_side: ``bool``, optional - Argument path: model/descriptor/se_r/type_one_side - Try to build N_types embedding nets. Otherwise, building N_types^2 - embedding nets + type_one_side: + | type: ``bool``, optional, default: ``False`` + | argument path: ``model/descriptor[se_r]/type_one_side`` + + Try to build N_types embedding nets. Otherwise, building N_types^2 embedding nets + + precision: + | type: ``str``, optional, default: ``float64`` + | argument path: ``model/descriptor[se_r]/precision`` + + The precision of the embedding net parameters, supported options are "float64", "float32", "float16". - precision: ``str``, optional - Argument path: model/descriptor/se_r/precision - The precision of the embedding net parameters, supported options are - "float64", "float32", "float16". + trainable: + | type: ``bool``, optional, default: ``True`` + | argument path: ``model/descriptor[se_r]/trainable`` - trainable: ``bool``, optional - Argument path: model/descriptor/se_r/trainable If the parameters in the embedding net is trainable - seed: ``int``|``NoneType``, optional - Argument path: model/descriptor/se_r/seed + seed: + | type: ``int`` | ``NoneType``, optional + | argument path: ``model/descriptor[se_r]/seed`` + Random seed for parameter initialization - exclude_types: ``list``, optional - Argument path: model/descriptor/se_r/exclude_types + exclude_types: + | type: ``list``, optional, default: ``[]`` + | argument path: ``model/descriptor[se_r]/exclude_types`` + The Excluded types - set_davg_zero: ``bool``, optional - Argument path: model/descriptor/se_r/set_davg_zero - Set the normalization average to zero. This option should be set when - `atom_ener` in the energy fitting is used + set_davg_zero: + | type: ``bool``, optional, default: ``False`` + | argument path: ``model/descriptor[se_r]/set_davg_zero`` + + Set the normalization average to zero. This option should be set when `atom_ener` in the energy fitting is used + When *type* is set to ``se_ar``: - a: ``dict`` - Argument path: model/descriptor/se_ar/a + a: + | type: ``dict`` + | argument path: ``model/descriptor[se_ar]/a`` + The parameters of descriptor `se_a` - r: ``dict`` - Argument path: model/descriptor/se_ar/r + r: + | type: ``dict`` + | argument path: ``model/descriptor[se_ar]/r`` + The parameters of descriptor `se_r` - fitting_net: ``dict`` - Argument path: model/fitting_net + fitting_net: + | type: ``dict`` + | argument path: ``model/fitting_net`` + The fitting of physical properties. + Depending on the value of *type*, different sub args are accepted. - type: ``str``, default: ``ener`` - Argument path: model/fitting_net/type - The type of the fitting. Valid types are `ener`, `dipole`, `polar` and - `global_polar`. + type: + | type: ``str`` (flag key), default: ``ener`` + | argument path: ``model/fitting_net/type`` + + The type of the fitting. Valid types are `ener`, `dipole`, `polar` and `global_polar`. - - `ener`: Fit an energy model (potential energy - surface). + - `ener`: Fit an energy model (potential energy surface). - - `dipole`: Fit an atomic dipole model. 
Atomic dipole - labels for all the selected atoms (see `sel_type`) should be provided - by `dipole.npy` in each data system. The file has number of frames - lines and 3 times of number of selected atoms columns. + - `dipole`: Fit an atomic dipole model. Atomic dipole labels for all the selected atoms (see `sel_type`) should be provided by `dipole.npy` in each data system. The file has number of frames lines and 3 times of number of selected atoms columns. - - `polar`: Fit - an atomic polarizability model. Atomic polarizability labels for all - the selected atoms (see `sel_type`) should be provided by - `polarizability.npy` in each data system. The file has number of - frames lines and 9 times of number of selected atoms columns. + - `polar`: Fit an atomic polarizability model. Atomic polarizability labels for all the selected atoms (see `sel_type`) should be provided by `polarizability.npy` in each data system. The file has number of frames lines and 9 times of number of selected atoms columns. + + - `global_polar`: Fit a polarizability model. Polarizability labels should be provided by `polarizability.npy` in each data system. The file has number of frames lines and 9 columns. - - - `global_polar`: Fit a polarizability model. Polarizability labels - should be provided by `polarizability.npy` in each data system. The - file has number of frames lines and 9 columns. When *type* is set to ``ener``: - numb_fparam: ``int``, optional - Argument path: model/fitting_net/ener/numb_fparam - The dimension of the frame parameter. If set to >0, file `fparam.npy` - should be included to provided the input fparams. - - numb_aparam: ``int``, optional - Argument path: model/fitting_net/ener/numb_aparam - The dimension of the atomic parameter. If set to >0, file `aparam.npy` - should be included to provided the input aparams. - - neuron: ``list``, optional - Argument path: model/fitting_net/ener/neuron - The number of neurons in each hidden layers of the fitting net. When - two hidden layers are of the same size, a skip connection is built. - - activation_function: ``str``, optional - Argument path: model/fitting_net/ener/activation_function - The activation function in the fitting net. Supported activation - functions are "relu", "relu6", "softplus", "sigmoid", "tanh", "gelu". - - precision: ``str``, optional - Argument path: model/fitting_net/ener/precision - The precision of the fitting net parameters, supported options are - "float64", "float32", "float16". - - resnet_dt: ``bool``, optional - Argument path: model/fitting_net/ener/resnet_dt + numb_fparam: + | type: ``int``, optional, default: ``0`` + | argument path: ``model/fitting_net[ener]/numb_fparam`` + + The dimension of the frame parameter. If set to >0, file `fparam.npy` should be included to provided the input fparams. + + numb_aparam: + | type: ``int``, optional, default: ``0`` + | argument path: ``model/fitting_net[ener]/numb_aparam`` + + The dimension of the atomic parameter. If set to >0, file `aparam.npy` should be included to provided the input aparams. + + neuron: + | type: ``list``, optional, default: ``[120, 120, 120]`` + | argument path: ``model/fitting_net[ener]/neuron`` + + The number of neurons in each hidden layers of the fitting net. When two hidden layers are of the same size, a skip connection is built. + + activation_function: + | type: ``str``, optional, default: ``tanh`` + | argument path: ``model/fitting_net[ener]/activation_function`` + + The activation function in the fitting net. 
Supported activation functions are "relu", "relu6", "softplus", "sigmoid", "tanh", "gelu". + + precision: + | type: ``str``, optional, default: ``float64`` + | argument path: ``model/fitting_net[ener]/precision`` + + The precision of the fitting net parameters, supported options are "float64", "float32", "float16". + + resnet_dt: + | type: ``bool``, optional, default: ``True`` + | argument path: ``model/fitting_net[ener]/resnet_dt`` + Whether to use a "Timestep" in the skip connection - trainable: ``bool``|``list``, optional - Argument path: model/fitting_net/ener/trainable - Whether the parameters in the fitting net are trainable. This option - can be + trainable: + | type: ``bool`` | ``list``, optional, default: ``True`` + | argument path: ``model/fitting_net[ener]/trainable`` + + Whether the parameters in the fitting net are trainable. This option can be + + - bool: True if all parameters of the fitting net are trainable, False otherwise. - - bool: True if all parameters of the fitting net are - trainable, False otherwise. + - list of bool: Specifies if each layer is trainable. Since the fitting net is composed by hidden layers followed by a output layer, the length of tihs list should be equal to len(`neuron`)+1. - - list of bool: Specifies if each layer - is trainable. Since the fitting net is composed by hidden layers - followed by a output layer, the length of tihs list should be equal to - len(`neuron`)+1. + rcond: + | type: ``float``, optional, default: ``0.001`` + | argument path: ``model/fitting_net[ener]/rcond`` - rcond: ``float``, optional - Argument path: model/fitting_net/ener/rcond - The condition number used to determine the inital energy shift for - each type of atoms. + The condition number used to determine the inital energy shift for each type of atoms. + + seed: + | type: ``int`` | ``NoneType``, optional + | argument path: ``model/fitting_net[ener]/seed`` - seed: ``int``|``NoneType``, optional - Argument path: model/fitting_net/ener/seed Random seed for parameter initialization of the fitting net - atom_ener: ``list``, optional - Argument path: model/fitting_net/ener/atom_ener + atom_ener: + | type: ``list``, optional, default: ``[]`` + | argument path: ``model/fitting_net[ener]/atom_ener`` + Specify the atomic energy in vacuum for each type + When *type* is set to ``dipole``: - neuron: ``list``, optional - Argument path: model/fitting_net/dipole/neuron - The number of neurons in each hidden layers of the fitting net. When - two hidden layers are of the same size, a skip connection is built. + neuron: + | type: ``list``, optional, default: ``[120, 120, 120]`` + | argument path: ``model/fitting_net[dipole]/neuron`` + + The number of neurons in each hidden layers of the fitting net. When two hidden layers are of the same size, a skip connection is built. - activation_function: ``str``, optional - Argument path: model/fitting_net/dipole/activation_function - The activation function in the fitting net. Supported activation - functions are "relu", "relu6", "softplus", "sigmoid", "tanh", "gelu". + activation_function: + | type: ``str``, optional, default: ``tanh`` + | argument path: ``model/fitting_net[dipole]/activation_function`` + + The activation function in the fitting net. Supported activation functions are "relu", "relu6", "softplus", "sigmoid", "tanh", "gelu". 
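A note on the list form of ``trainable`` documented above: the stated rule is one flag per hidden layer plus one for the output layer, i.e. len(`neuron`)+1 entries. A minimal sketch of that rule follows; the helper name and values are illustrative, not part of the patch.

```python
# Sketch of the per-layer `trainable` rule described above: a bool covers all
# layers, a list must have len(neuron) + 1 entries (hidden layers + output).
def expand_trainable(neuron, trainable):
    if isinstance(trainable, bool):
        return [trainable] * (len(neuron) + 1)
    if len(trainable) != len(neuron) + 1:
        raise ValueError("trainable needs %d entries, got %d"
                         % (len(neuron) + 1, len(trainable)))
    return list(trainable)

print(expand_trainable([120, 120, 120], True))                       # [True, True, True, True]
print(expand_trainable([120, 120, 120], [True, True, True, False]))  # freeze only the output layer
```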
+ + resnet_dt: + | type: ``bool``, optional, default: ``True`` + | argument path: ``model/fitting_net[dipole]/resnet_dt`` - resnet_dt: ``bool``, optional - Argument path: model/fitting_net/dipole/resnet_dt Whether to use a "Timestep" in the skip connection - precision: ``str``, optional - Argument path: model/fitting_net/dipole/precision - The precision of the fitting net parameters, supported options are - "float64", "float32", "float16". + precision: + | type: ``str``, optional, default: ``float64`` + | argument path: ``model/fitting_net[dipole]/precision`` + + The precision of the fitting net parameters, supported options are "float64", "float32", "float16". + + sel_type: + | type: ``int`` | ``NoneType`` | ``list``, optional + | argument path: ``model/fitting_net[dipole]/sel_type`` + + The atom types for which the atomic dipole will be provided. If not set, all types will be selected. - sel_type: ``int``|``NoneType``|``list``, optional - Argument path: model/fitting_net/dipole/sel_type - The atom types for which the atomic dipole will be provided. If not - set, all types will be selected. + seed: + | type: ``int`` | ``NoneType``, optional + | argument path: ``model/fitting_net[dipole]/seed`` - seed: ``int``|``NoneType``, optional - Argument path: model/fitting_net/dipole/seed Random seed for parameter initialization of the fitting net + When *type* is set to ``polar``: - neuron: ``list``, optional - Argument path: model/fitting_net/polar/neuron - The number of neurons in each hidden layers of the fitting net. When - two hidden layers are of the same size, a skip connection is built. + neuron: + | type: ``list``, optional, default: ``[120, 120, 120]`` + | argument path: ``model/fitting_net[polar]/neuron`` + + The number of neurons in each hidden layers of the fitting net. When two hidden layers are of the same size, a skip connection is built. + + activation_function: + | type: ``str``, optional, default: ``tanh`` + | argument path: ``model/fitting_net[polar]/activation_function`` + + The activation function in the fitting net. Supported activation functions are "relu", "relu6", "softplus", "sigmoid", "tanh", "gelu". - activation_function: ``str``, optional - Argument path: model/fitting_net/polar/activation_function - The activation function in the fitting net. Supported activation - functions are "relu", "relu6", "softplus", "sigmoid", "tanh", "gelu". + resnet_dt: + | type: ``bool``, optional, default: ``True`` + | argument path: ``model/fitting_net[polar]/resnet_dt`` - resnet_dt: ``bool``, optional - Argument path: model/fitting_net/polar/resnet_dt Whether to use a "Timestep" in the skip connection - precision: ``str``, optional - Argument path: model/fitting_net/polar/precision - The precision of the fitting net parameters, supported options are - "float64", "float32", "float16". - - fit_diag: ``bool``, optional - Argument path: model/fitting_net/polar/fit_diag - The diagonal part of the polarizability matrix will be shifted by - `fit_diag`. The shift operation is carried out after `scale`. - - scale: ``float``|``list``, optional - Argument path: model/fitting_net/polar/scale - The output of the fitting net (polarizability matrix) will be scaled - by `scale` - - diag_shift: ``float``|``list``, optional - Argument path: model/fitting_net/polar/diag_shift - The diagonal part of the polarizability matrix will be shifted by - `fit_diag`. The shift operation is carried out after `scale`. 
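Because ``scale`` and ``diag_shift`` interact, the order of operations stated above (scale first, then shift the diagonal) is worth spelling out. A short NumPy sketch with made-up values:

```python
import numpy as np

# Illustrative only: the fitting-net output (a 3x3 polarizability matrix) is
# first multiplied by `scale`, then `diag_shift` is added to its diagonal.
def scale_then_shift(polar, scale=1.0, diag_shift=0.0):
    polar = scale * np.asarray(polar, dtype=float)
    return polar + diag_shift * np.eye(polar.shape[-1])

raw = np.arange(9.0).reshape(3, 3)
print(scale_then_shift(raw, scale=2.0, diag_shift=0.5))
```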
- - sel_type: ``int``|``NoneType``|``list``, optional - Argument path: model/fitting_net/polar/sel_type - The atom types for which the atomic polarizability will be provided. - If not set, all types will be selected. - - seed: ``int``|``NoneType``, optional - Argument path: model/fitting_net/polar/seed + precision: + | type: ``str``, optional, default: ``float64`` + | argument path: ``model/fitting_net[polar]/precision`` + + The precision of the fitting net parameters, supported options are "float64", "float32", "float16". + + fit_diag: + | type: ``bool``, optional, default: ``True`` + | argument path: ``model/fitting_net[polar]/fit_diag`` + + The diagonal part of the polarizability matrix will be shifted by `fit_diag`. The shift operation is carried out after `scale`. + + scale: + | type: ``float`` | ``list``, optional, default: ``1.0`` + | argument path: ``model/fitting_net[polar]/scale`` + + The output of the fitting net (polarizability matrix) will be scaled by `scale` + + diag_shift: + | type: ``float`` | ``list``, optional, default: ``0.0`` + | argument path: ``model/fitting_net[polar]/diag_shift`` + + The diagonal part of the polarizability matrix will be shifted by `fit_diag`. The shift operation is carried out after `scale`. + + sel_type: + | type: ``int`` | ``NoneType`` | ``list``, optional + | argument path: ``model/fitting_net[polar]/sel_type`` + + The atom types for which the atomic polarizability will be provided. If not set, all types will be selected. + + seed: + | type: ``int`` | ``NoneType``, optional + | argument path: ``model/fitting_net[polar]/seed`` + Random seed for parameter initialization of the fitting net + When *type* is set to ``global_polar``: - neuron: ``list``, optional - Argument path: model/fitting_net/global_polar/neuron - The number of neurons in each hidden layers of the fitting net. When - two hidden layers are of the same size, a skip connection is built. + neuron: + | type: ``list``, optional, default: ``[120, 120, 120]`` + | argument path: ``model/fitting_net[global_polar]/neuron`` + + The number of neurons in each hidden layers of the fitting net. When two hidden layers are of the same size, a skip connection is built. + + activation_function: + | type: ``str``, optional, default: ``tanh`` + | argument path: ``model/fitting_net[global_polar]/activation_function`` - activation_function: ``str``, optional - Argument path: model/fitting_net/global_polar/activation_function - The activation function in the fitting net. Supported activation - functions are "relu", "relu6", "softplus", "sigmoid", "tanh", "gelu". + The activation function in the fitting net. Supported activation functions are "relu", "relu6", "softplus", "sigmoid", "tanh", "gelu". + + resnet_dt: + | type: ``bool``, optional, default: ``True`` + | argument path: ``model/fitting_net[global_polar]/resnet_dt`` - resnet_dt: ``bool``, optional - Argument path: model/fitting_net/global_polar/resnet_dt Whether to use a "Timestep" in the skip connection - precision: ``str``, optional - Argument path: model/fitting_net/global_polar/precision - The precision of the fitting net parameters, supported options are - "float64", "float32", "float16". - - fit_diag: ``bool``, optional - Argument path: model/fitting_net/global_polar/fit_diag - The diagonal part of the polarizability matrix will be shifted by - `fit_diag`. The shift operation is carried out after `scale`. 
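For reference, the label files required by the tensorial fittings have the shapes given in the ``type`` description above. A small sketch of those layouts (placeholder arrays, illustrative only):

```python
import numpy as np

# Shapes of the label files described above (placeholder data only):
nframes, n_sel_atoms = 5, 2
dipole_labels       = np.zeros((nframes, 3 * n_sel_atoms))  # dipole.npy, type `dipole`
atomic_polar_labels = np.zeros((nframes, 9 * n_sel_atoms))  # polarizability.npy, type `polar`
global_polar_labels = np.zeros((nframes, 9))                # polarizability.npy, type `global_polar`
print(dipole_labels.shape, atomic_polar_labels.shape, global_polar_labels.shape)
```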
- - scale: ``float``|``list``, optional - Argument path: model/fitting_net/global_polar/scale - The output of the fitting net (polarizability matrix) will be scaled - by `scale` - - diag_shift: ``float``|``list``, optional - Argument path: model/fitting_net/global_polar/diag_shift - The diagonal part of the polarizability matrix will be shifted by - `fit_diag`. The shift operation is carried out after `scale`. - - sel_type: ``int``|``NoneType``|``list``, optional - Argument path: model/fitting_net/global_polar/sel_type - The atom types for which the atomic polarizability will be provided. - If not set, all types will be selected. - - seed: ``int``|``NoneType``, optional - Argument path: model/fitting_net/global_polar/seed + precision: + | type: ``str``, optional, default: ``float64`` + | argument path: ``model/fitting_net[global_polar]/precision`` + + The precision of the fitting net parameters, supported options are "float64", "float32", "float16". + + fit_diag: + | type: ``bool``, optional, default: ``True`` + | argument path: ``model/fitting_net[global_polar]/fit_diag`` + + The diagonal part of the polarizability matrix will be shifted by `fit_diag`. The shift operation is carried out after `scale`. + + scale: + | type: ``float`` | ``list``, optional, default: ``1.0`` + | argument path: ``model/fitting_net[global_polar]/scale`` + + The output of the fitting net (polarizability matrix) will be scaled by `scale` + + diag_shift: + | type: ``float`` | ``list``, optional, default: ``0.0`` + | argument path: ``model/fitting_net[global_polar]/diag_shift`` + + The diagonal part of the polarizability matrix will be shifted by `fit_diag`. The shift operation is carried out after `scale`. + + sel_type: + | type: ``int`` | ``NoneType`` | ``list``, optional + | argument path: ``model/fitting_net[global_polar]/sel_type`` + + The atom types for which the atomic polarizability will be provided. If not set, all types will be selected. + + seed: + | type: ``int`` | ``NoneType``, optional + | argument path: ``model/fitting_net[global_polar]/seed`` + Random seed for parameter initialization of the fitting net -loss: ``dict`` - Argument path: loss - The definition of loss function. The type of the loss depends on the - type of the fitting. For fitting type `ener`, the prefactors before - energy, force, virial and atomic energy losses may be provided. For - fitting type `dipole`, `polar` and `global_polar`, the loss may be an - empty `dict` or unset. + +loss: + | type: ``dict`` + | argument path: ``loss`` + + The definition of loss function. The type of the loss depends on the type of the fitting. For fitting type `ener`, the prefactors before energy, force, virial and atomic energy losses may be provided. For fitting type `dipole`, `polar` and `global_polar`, the loss may be an empty `dict` or unset. + Depending on the value of *type*, different sub args are accepted. - type: ``str``, default: ``ener`` - Argument path: loss/type - The type of the loss. For fitting type `ener`, the loss type should be - set to `ener` or left unset. For tensorial fitting types `dipole`, - `polar` and `global_polar`, the type should be left unset. + type: + | type: ``str`` (flag key), default: ``ener`` + | argument path: ``loss/type`` + + The type of the loss. For fitting type `ener`, the loss type should be set to `ener` or left unset. For tensorial fitting types `dipole`, `polar` and `global_polar`, the type should be left unset. \. 
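For orientation, here is a minimal ``loss`` block for an energy fitting, written as a Python dict (in the input it is a JSON object); the numbers simply echo the defaults documented below and are not a recommendation.

```python
# Minimal `loss` section for fitting type `ener`; values are examples only.
example_loss = {
    "type": "ener",       # may also be left unset for an energy fitting
    "start_pref_e": 0.02,
    "limit_pref_e": 1.0,
    "start_pref_f": 1000,
    "limit_pref_f": 1.0,
    "start_pref_v": 0.0,  # virial ignored when both virial prefactors are 0
    "limit_pref_v": 0.0,
}
```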
+ When *type* is set to ``ener``: - start_pref_e: ``float``|``int``, optional - Argument path: loss/ener/start_pref_e - The prefactor of energy loss at the start of the training. Should be - larger than or equal to 0. If set to none-zero value, the energy label - should be provided by file energy.npy in each data system. If both - start_pref_energy and limit_pref_energy are set to 0, then the energy - will be ignored. - - limit_pref_e: ``float``|``int``, optional - Argument path: loss/ener/limit_pref_e - The prefactor of energy loss at the limit of the training, Should be - larger than or equal to 0. i.e. the training step goes to infinity. - - start_pref_f: ``float``|``int``, optional - Argument path: loss/ener/start_pref_f - The prefactor of force loss at the start of the training. Should be - larger than or equal to 0. If set to none-zero value, the force label - should be provided by file force.npy in each data system. If both - start_pref_force and limit_pref_force are set to 0, then the force - will be ignored. - - limit_pref_f: ``float``|``int``, optional - Argument path: loss/ener/limit_pref_f - The prefactor of force loss at the limit of the training, Should be - larger than or equal to 0. i.e. the training step goes to infinity. - - start_pref_v: ``float``|``int``, optional - Argument path: loss/ener/start_pref_v - The prefactor of virial loss at the start of the training. Should be - larger than or equal to 0. If set to none-zero value, the virial label - should be provided by file virial.npy in each data system. If both - start_pref_virial and limit_pref_virial are set to 0, then the virial - will be ignored. - - limit_pref_v: ``float``|``int``, optional - Argument path: loss/ener/limit_pref_v - The prefactor of virial loss at the limit of the training, Should be - larger than or equal to 0. i.e. the training step goes to infinity. - - start_pref_ae: ``float``|``int``, optional - Argument path: loss/ener/start_pref_ae - The prefactor of virial loss at the start of the training. Should be - larger than or equal to 0. If set to none-zero value, the virial label - should be provided by file virial.npy in each data system. If both - start_pref_virial and limit_pref_virial are set to 0, then the virial - will be ignored. - - limit_pref_ae: ``float``|``int``, optional - Argument path: loss/ener/limit_pref_ae - The prefactor of virial loss at the limit of the training, Should be - larger than or equal to 0. i.e. the training step goes to infinity. - - relative_f: ``float``|``NoneType``, optional - Argument path: loss/ener/relative_f - If provided, relative force error will be used in the loss. The - difference of force will be normalized by the magnitude of the force - in the label with a shift given by `relative_f`, i.e. DF_i / ( || F || - + relative_f ) with DF denoting the difference between prediction and - label and || F || denoting the L2 norm of the label. - -learning_rate: ``dict`` - Argument path: learning_rate + start_pref_e: + | type: ``float`` | ``int``, optional, default: ``0.02`` + | argument path: ``loss[ener]/start_pref_e`` + + The prefactor of energy loss at the start of the training. Should be larger than or equal to 0. If set to none-zero value, the energy label should be provided by file energy.npy in each data system. If both start_pref_energy and limit_pref_energy are set to 0, then the energy will be ignored. 
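The ``start_pref_*`` and ``limit_pref_*`` pairs are interpolated during training. The sketch below is my reading of how DeePMD-kit couples the prefactors to the learning-rate decay; it is an assumption for illustration, not text from this patch.

```python
# Assumed behaviour (illustrative): a prefactor follows the learning rate from
# its start value towards its limit value.
def current_pref(start_pref, limit_pref, current_lr, start_lr):
    return limit_pref + (start_pref - limit_pref) * current_lr / start_lr

# With the documented defaults start_pref_f=1000, limit_pref_f=1.0:
print(current_pref(1000.0, 1.0, current_lr=1.0e-4, start_lr=1.0e-3))  # ~100.9
```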
+ + limit_pref_e: + | type: ``float`` | ``int``, optional, default: ``1.0`` + | argument path: ``loss[ener]/limit_pref_e`` + + The prefactor of energy loss at the limit of the training, Should be larger than or equal to 0. i.e. the training step goes to infinity. + + start_pref_f: + | type: ``float`` | ``int``, optional, default: ``1000`` + | argument path: ``loss[ener]/start_pref_f`` + + The prefactor of force loss at the start of the training. Should be larger than or equal to 0. If set to none-zero value, the force label should be provided by file force.npy in each data system. If both start_pref_force and limit_pref_force are set to 0, then the force will be ignored. + + limit_pref_f: + | type: ``float`` | ``int``, optional, default: ``1.0`` + | argument path: ``loss[ener]/limit_pref_f`` + + The prefactor of force loss at the limit of the training, Should be larger than or equal to 0. i.e. the training step goes to infinity. + + start_pref_v: + | type: ``float`` | ``int``, optional, default: ``0.0`` + | argument path: ``loss[ener]/start_pref_v`` + + The prefactor of virial loss at the start of the training. Should be larger than or equal to 0. If set to none-zero value, the virial label should be provided by file virial.npy in each data system. If both start_pref_virial and limit_pref_virial are set to 0, then the virial will be ignored. + + limit_pref_v: + | type: ``float`` | ``int``, optional, default: ``0.0`` + | argument path: ``loss[ener]/limit_pref_v`` + + The prefactor of virial loss at the limit of the training, Should be larger than or equal to 0. i.e. the training step goes to infinity. + + start_pref_ae: + | type: ``float`` | ``int``, optional, default: ``0.0`` + | argument path: ``loss[ener]/start_pref_ae`` + + The prefactor of virial loss at the start of the training. Should be larger than or equal to 0. If set to none-zero value, the virial label should be provided by file virial.npy in each data system. If both start_pref_virial and limit_pref_virial are set to 0, then the virial will be ignored. + + limit_pref_ae: + | type: ``float`` | ``int``, optional, default: ``0.0`` + | argument path: ``loss[ener]/limit_pref_ae`` + + The prefactor of virial loss at the limit of the training, Should be larger than or equal to 0. i.e. the training step goes to infinity. + + relative_f: + | type: ``float`` | ``NoneType``, optional + | argument path: ``loss[ener]/relative_f`` + + If provided, relative force error will be used in the loss. The difference of force will be normalized by the magnitude of the force in the label with a shift given by `relative_f`, i.e. DF_i / ( || F || + relative_f ) with DF denoting the difference between prediction and label and || F || denoting the L2 norm of the label. + + +learning_rate: + | type: ``dict`` + | argument path: ``learning_rate`` + The learning rate options - start_lr: ``float``, optional - Argument path: learning_rate/start_lr + start_lr: + | type: ``float``, optional, default: ``0.001`` + | argument path: ``learning_rate/start_lr`` + The learning rate the start of the training. - stop_lr: ``float``, optional - Argument path: learning_rate/stop_lr + stop_lr: + | type: ``float``, optional, default: ``1e-08`` + | argument path: ``learning_rate/stop_lr`` + The desired learning rate at the end of the training. 
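The ``relative_f`` expression quoted above, DF_i / ( || F || + relative_f ), written out with NumPy (example numbers only):

```python
import numpy as np

# Relative force error as described above: the force difference is normalized
# by the L2 norm of the label force plus `relative_f`. Example values only.
def relative_force_error(f_pred, f_label, relative_f):
    diff = np.asarray(f_pred) - np.asarray(f_label)
    return diff / (np.linalg.norm(f_label) + relative_f)

print(relative_force_error([1.1, -1.8, 0.4], [1.0, -2.0, 0.5], relative_f=0.1))
```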
- decay_steps: ``int``, optional - Argument path: learning_rate/decay_steps + decay_steps: + | type: ``int``, optional, default: ``5000`` + | argument path: ``learning_rate/decay_steps`` + The learning rate is decaying every this number of training steps. -training: ``dict`` - Argument path: training + +training: + | type: ``dict`` + | argument path: ``training`` + The training options - systems: ``list``|``str`` - Argument path: training/systems - The data systems. This key can be provided with a listthat specifies - the systems, or be provided with a string by which the prefix of all - systems are given and the list of the systems is automatically - generated. + systems: + | type: ``list`` | ``str`` + | argument path: ``training/systems`` + + The data systems. This key can be provided with a listthat specifies the systems, or be provided with a string by which the prefix of all systems are given and the list of the systems is automatically generated. + + set_prefix: + | type: ``str``, optional, default: ``set`` + | argument path: ``training/set_prefix`` - set_prefix: ``str``, optional - Argument path: training/set_prefix The prefix of the sets in the systems. - stop_batch: ``int`` - Argument path: training/stop_batch + stop_batch: + | type: ``int`` + | argument path: ``training/stop_batch`` + Number of training batch. Each training uses one batch of data. - batch_size: ``int``|``list``|``str``, optional - Argument path: training/batch_size + batch_size: + | type: ``int`` | ``list`` | ``str``, optional, default: ``auto`` + | argument path: ``training/batch_size`` + This key can be - - list: the length of which is the same as the - `systems`. The batch size of each system is given by the elements of - the list. + - list: the length of which is the same as the `systems`. The batch size of each system is given by the elements of the list. - int: all `systems` uses the same batch size. - - string - "auto": automatically determines the batch size os that the batch_size - times the number of atoms in the system is no less than 32. + - string "auto": automatically determines the batch size os that the batch_size times the number of atoms in the system is no less than 32. + + - string "auto:N": automatically determines the batch size os that the batch_size times the number of atoms in the system is no less than N. - - string - "auto:N": automatically determines the batch size os that the - batch_size times the number of atoms in the system is no less than N. + seed: + | type: ``int`` | ``NoneType``, optional + | argument path: ``training/seed`` - seed: ``int``|``NoneType``, optional - Argument path: training/seed The random seed for training. - disp_file: ``str``, optional - Argument path: training/disp_file + disp_file: + | type: ``str``, optional, default: ``lcueve.out`` + | argument path: ``training/disp_file`` + The file for printing learning curve. - disp_freq: ``int``, optional - Argument path: training/disp_freq + disp_freq: + | type: ``int``, optional, default: ``1000`` + | argument path: ``training/disp_freq`` + The frequency of printing learning curve. - numb_test: ``int``, optional - Argument path: training/numb_test + numb_test: + | type: ``int``, optional, default: ``1`` + | argument path: ``training/numb_test`` + Number of frames used for the test during training. - save_freq: ``int``, optional - Argument path: training/save_freq + save_freq: + | type: ``int``, optional, default: ``1000`` + | argument path: ``training/save_freq`` + The frequency of saving check point. 
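The ``auto`` / ``auto:N`` rule for ``batch_size`` above reduces to a ceiling division; a small sketch (the helper name is illustrative):

```python
import math

# Documented rule: pick the smallest batch size such that
# batch_size * natoms >= N, with N = 32 for plain "auto".
def auto_batch_size(natoms, spec="auto"):
    n = 32 if spec == "auto" else int(spec.split(":", 1)[1])
    return max(1, math.ceil(n / natoms))

print(auto_batch_size(64))             # -> 1
print(auto_batch_size(3))              # -> 11
print(auto_batch_size(3, "auto:256"))  # -> 86
```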
- save_ckpt: ``str``, optional - Argument path: training/save_ckpt + save_ckpt: + | type: ``str``, optional, default: ``model.ckpt`` + | argument path: ``training/save_ckpt`` + The file name of saving check point. - disp_training: ``bool``, optional - Argument path: training/disp_training + disp_training: + | type: ``bool``, optional, default: ``True`` + | argument path: ``training/disp_training`` + Displaying verbose information during training. - time_training: ``bool``, optional - Argument path: training/time_training + time_training: + | type: ``bool``, optional, default: ``True`` + | argument path: ``training/time_training`` + Timing durining training. - profiling: ``bool``, optional - Argument path: training/profiling + profiling: + | type: ``bool``, optional, default: ``False`` + | argument path: ``training/profiling`` + Profiling during training. - profiling_file: ``str``, optional - Argument path: training/profiling_file + profiling_file: + | type: ``str``, optional, default: ``timeline.json`` + | argument path: ``training/profiling_file`` + Output file for profiling. + From 17f04798953cae11c9834289884bdc958686ad3a Mon Sep 17 00:00:00 2001 From: Han Wang Date: Mon, 19 Oct 2020 23:49:30 +0800 Subject: [PATCH 41/65] better way of checking and normalizing data --- source/train/argcheck.py | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/source/train/argcheck.py b/source/train/argcheck.py index d852b1c210..bb5303b263 100644 --- a/source/train/argcheck.py +++ b/source/train/argcheck.py @@ -346,17 +346,11 @@ def normalize(data): la = loss_args() ta = training_args() - data_m = ma .normalize({'model': data.get('model', {})}, trim_pattern = "_*") - data_lr = lra.normalize({'learning_rate': data.get('learning_rate', {})}, trim_pattern = "_*") - data_l = la .normalize({'loss': data.get('loss', {})}, trim_pattern = "_*") - data_t = ta .normalize({'training': data.get('training', {})}, trim_pattern = "_*") + base = Argument("base", dict, [ma, lra, la, ta]) + data = base.normalize_value(data, trim_pattern = "_*") + base.check_value(data) - ma .check(data_m) - lra.check(data_lr) - la .check(data_l) - ta .check(data_t) - - return {**data_m, **data_lr, **data_l, **data_t} + return data if __name__ == '__main__': From e764da1ccf4c5dffb0c483e117d8c30a3a817fe4 Mon Sep 17 00:00:00 2001 From: Han Wang Date: Fri, 30 Oct 2020 16:21:18 +0800 Subject: [PATCH 42/65] fix bug in polar doc --- doc/train-input.rst | 69 ++++++++++++++-------------------------- source/train/argcheck.py | 34 +++++++++++++++----- source/train/doc.py | 2 +- 3 files changed, 50 insertions(+), 55 deletions(-) diff --git a/doc/train-input.rst b/doc/train-input.rst index 88b7a412d8..db1b6b05cc 100644 --- a/doc/train-input.rst +++ b/doc/train-input.rst @@ -27,16 +27,6 @@ model: | type: ``str`` (flag key) | argument path: ``model/descriptor/type`` - The type of the descritpor. Valid types are `loc_frame`, `se_a`, `se_r` and `se_ar`. - - - `loc_frame`: Defines a local frame at each atom, and the compute the descriptor as local coordinates under this frame. - - - `se_a`: Used by the smooth edition of Deep Potential. The full relative coordinates are used to construct the descriptor. - - - `se_r`: Used by the smooth edition of Deep Potential. Only the distance between atoms is used to construct the descriptor. - - - `se_ar`: A hybrid of `se_a` and `se_r`. Typically `se_a` has a smaller cut-off while the `se_r` has a larger cut-off. 
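Stepping back to the argcheck.py hunk above (PATCH 41): the refactor replaces four per-section normalize/check calls with a single combined ``Argument``. A toy dargs example of that pattern, using a made-up schema rather than the real deepmd one:

```python
from dargs import Argument

# Toy schema illustrating the combined normalize/check pattern from the
# argcheck.py hunk above; the arguments here are invented for the example.
lr = Argument("learning_rate", dict, [
    Argument("start_lr", float, optional=True, default=1e-3),
])
tr = Argument("training", dict, [
    Argument("stop_batch", int),
])
base = Argument("base", dict, [lr, tr])

data = {"learning_rate": {}, "training": {"stop_batch": 10, "_comment": "dropped"}}
data = base.normalize_value(data, trim_pattern="_*")  # fill defaults, trim "_" keys
base.check_value(data)                                # raises on invalid input
print(data)
```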
- When *type* is set to ``loc_frame``: @@ -260,16 +250,6 @@ model: | type: ``str`` (flag key), default: ``ener`` | argument path: ``model/fitting_net/type`` - The type of the fitting. Valid types are `ener`, `dipole`, `polar` and `global_polar`. - - - `ener`: Fit an energy model (potential energy surface). - - - `dipole`: Fit an atomic dipole model. Atomic dipole labels for all the selected atoms (see `sel_type`) should be provided by `dipole.npy` in each data system. The file has number of frames lines and 3 times of number of selected atoms columns. - - - `polar`: Fit an atomic polarizability model. Atomic polarizability labels for all the selected atoms (see `sel_type`) should be provided by `polarizability.npy` in each data system. The file has number of frames lines and 9 times of number of selected atoms columns. - - - `global_polar`: Fit a polarizability model. Polarizability labels should be provided by `polarizability.npy` in each data system. The file has number of frames lines and 9 columns. - When *type* is set to ``ener``: @@ -310,7 +290,7 @@ model: Whether to use a "Timestep" in the skip connection trainable: - | type: ``bool`` | ``list``, optional, default: ``True`` + | type: ``list`` | ``bool``, optional, default: ``True`` | argument path: ``model/fitting_net[ener]/trainable`` Whether the parameters in the fitting net are trainable. This option can be @@ -365,7 +345,7 @@ model: The precision of the fitting net parameters, supported options are "float64", "float32", "float16". sel_type: - | type: ``int`` | ``NoneType`` | ``list``, optional + | type: ``list`` | ``int`` | ``NoneType``, optional | argument path: ``model/fitting_net[dipole]/sel_type`` The atom types for which the atomic dipole will be provided. If not set, all types will be selected. @@ -407,22 +387,22 @@ model: | type: ``bool``, optional, default: ``True`` | argument path: ``model/fitting_net[polar]/fit_diag`` - The diagonal part of the polarizability matrix will be shifted by `fit_diag`. The shift operation is carried out after `scale`. + Fit the diagonal part of the rotational invariant polarizability matrix, which will be converted to normal polarizability matrix by contracting with the rotation matrix. scale: - | type: ``float`` | ``list``, optional, default: ``1.0`` + | type: ``list`` | ``float``, optional, default: ``1.0`` | argument path: ``model/fitting_net[polar]/scale`` - The output of the fitting net (polarizability matrix) will be scaled by `scale` + The output of the fitting net (polarizability matrix) will be scaled by ``scale`` diag_shift: - | type: ``float`` | ``list``, optional, default: ``0.0`` + | type: ``list`` | ``float``, optional, default: ``0.0`` | argument path: ``model/fitting_net[polar]/diag_shift`` - The diagonal part of the polarizability matrix will be shifted by `fit_diag`. The shift operation is carried out after `scale`. + The diagonal part of the polarizability matrix will be shifted by ``diag_shift``. The shift operation is carried out after ``scale``. sel_type: - | type: ``int`` | ``NoneType`` | ``list``, optional + | type: ``list`` | ``int`` | ``NoneType``, optional | argument path: ``model/fitting_net[polar]/sel_type`` The atom types for which the atomic polarizability will be provided. If not set, all types will be selected. @@ -464,22 +444,22 @@ model: | type: ``bool``, optional, default: ``True`` | argument path: ``model/fitting_net[global_polar]/fit_diag`` - The diagonal part of the polarizability matrix will be shifted by `fit_diag`. 
The shift operation is carried out after `scale`. + Fit the diagonal part of the rotational invariant polarizability matrix, which will be converted to normal polarizability matrix by contracting with the rotation matrix. scale: - | type: ``float`` | ``list``, optional, default: ``1.0`` + | type: ``list`` | ``float``, optional, default: ``1.0`` | argument path: ``model/fitting_net[global_polar]/scale`` - The output of the fitting net (polarizability matrix) will be scaled by `scale` + The output of the fitting net (polarizability matrix) will be scaled by ``scale`` diag_shift: - | type: ``float`` | ``list``, optional, default: ``0.0`` + | type: ``list`` | ``float``, optional, default: ``0.0`` | argument path: ``model/fitting_net[global_polar]/diag_shift`` - The diagonal part of the polarizability matrix will be shifted by `fit_diag`. The shift operation is carried out after `scale`. + The diagonal part of the polarizability matrix will be shifted by ``diag_shift``. The shift operation is carried out after ``scale``. sel_type: - | type: ``int`` | ``NoneType`` | ``list``, optional + | type: ``list`` | ``int`` | ``NoneType``, optional | argument path: ``model/fitting_net[global_polar]/sel_type`` The atom types for which the atomic polarizability will be provided. If not set, all types will be selected. @@ -504,56 +484,53 @@ loss: | type: ``str`` (flag key), default: ``ener`` | argument path: ``loss/type`` - The type of the loss. For fitting type `ener`, the loss type should be set to `ener` or left unset. For tensorial fitting types `dipole`, `polar` and `global_polar`, the type should be left unset. - \. - When *type* is set to ``ener``: start_pref_e: - | type: ``float`` | ``int``, optional, default: ``0.02`` + | type: ``int`` | ``float``, optional, default: ``0.02`` | argument path: ``loss[ener]/start_pref_e`` The prefactor of energy loss at the start of the training. Should be larger than or equal to 0. If set to none-zero value, the energy label should be provided by file energy.npy in each data system. If both start_pref_energy and limit_pref_energy are set to 0, then the energy will be ignored. limit_pref_e: - | type: ``float`` | ``int``, optional, default: ``1.0`` + | type: ``int`` | ``float``, optional, default: ``1.0`` | argument path: ``loss[ener]/limit_pref_e`` The prefactor of energy loss at the limit of the training, Should be larger than or equal to 0. i.e. the training step goes to infinity. start_pref_f: - | type: ``float`` | ``int``, optional, default: ``1000`` + | type: ``int`` | ``float``, optional, default: ``1000`` | argument path: ``loss[ener]/start_pref_f`` The prefactor of force loss at the start of the training. Should be larger than or equal to 0. If set to none-zero value, the force label should be provided by file force.npy in each data system. If both start_pref_force and limit_pref_force are set to 0, then the force will be ignored. limit_pref_f: - | type: ``float`` | ``int``, optional, default: ``1.0`` + | type: ``int`` | ``float``, optional, default: ``1.0`` | argument path: ``loss[ener]/limit_pref_f`` The prefactor of force loss at the limit of the training, Should be larger than or equal to 0. i.e. the training step goes to infinity. start_pref_v: - | type: ``float`` | ``int``, optional, default: ``0.0`` + | type: ``int`` | ``float``, optional, default: ``0.0`` | argument path: ``loss[ener]/start_pref_v`` The prefactor of virial loss at the start of the training. Should be larger than or equal to 0. 
If set to none-zero value, the virial label should be provided by file virial.npy in each data system. If both start_pref_virial and limit_pref_virial are set to 0, then the virial will be ignored. limit_pref_v: - | type: ``float`` | ``int``, optional, default: ``0.0`` + | type: ``int`` | ``float``, optional, default: ``0.0`` | argument path: ``loss[ener]/limit_pref_v`` The prefactor of virial loss at the limit of the training, Should be larger than or equal to 0. i.e. the training step goes to infinity. start_pref_ae: - | type: ``float`` | ``int``, optional, default: ``0.0`` + | type: ``int`` | ``float``, optional, default: ``0.0`` | argument path: ``loss[ener]/start_pref_ae`` The prefactor of virial loss at the start of the training. Should be larger than or equal to 0. If set to none-zero value, the virial label should be provided by file virial.npy in each data system. If both start_pref_virial and limit_pref_virial are set to 0, then the virial will be ignored. limit_pref_ae: - | type: ``float`` | ``int``, optional, default: ``0.0`` + | type: ``int`` | ``float``, optional, default: ``0.0`` | argument path: ``loss[ener]/limit_pref_ae`` The prefactor of virial loss at the limit of the training, Should be larger than or equal to 0. i.e. the training step goes to infinity. @@ -615,7 +592,7 @@ training: Number of training batch. Each training uses one batch of data. batch_size: - | type: ``int`` | ``list`` | ``str``, optional, default: ``auto`` + | type: ``list`` | ``str`` | ``int``, optional, default: ``auto`` | argument path: ``training/batch_size`` This key can be diff --git a/source/train/argcheck.py b/source/train/argcheck.py index bb5303b263..a544346c81 100644 --- a/source/train/argcheck.py +++ b/source/train/argcheck.py @@ -16,6 +16,10 @@ def supported_precision() : return list_to_doc(['float64', 'float32', 'float16']) +def make_link(content, ref_key) : + return f'`{content} <#{ref_key}>`__' + + def descrpt_local_frame_args (): doc_sel_a = 'A list of integers. The length of the list should be the same as the number of atom types in the system. `sel_a[i]` gives the selected number of type-i neighbors. The full relative coordinates of the neighbors are used by the descriptor.' doc_sel_r = 'A list of integers. The length of the list should be the same as the number of atom types in the system. `sel_r[i]` gives the selected number of type-i neighbors. Only relative distance of the neighbors are used by the descriptor. sel_a[i] + sel_r[i] is recommended to be larger than the maximally possible number of type-i neighbors in the cut-off radius.' @@ -156,9 +160,9 @@ def fitting_polar(): doc_activation_function = f'The activation function in the fitting net. Supported activation functions are {list_to_doc(activation_fn_dict.keys())}' doc_resnet_dt = 'Whether to use a "Timestep" in the skip connection' doc_precision = f'The precision of the fitting net parameters, supported options are {supported_precision()}' - doc_scale = 'The output of the fitting net (polarizability matrix) will be scaled by `scale`' - doc_diag_shift = 'The diagonal part of the polarizability matrix will be shifted by `fit_diag`. The shift operation is carried out after `scale`.' - doc_fit_diag = 'The diagonal part of the polarizability matrix will be shifted by `fit_diag`. The shift operation is carried out after `scale`.' + doc_scale = 'The output of the fitting net (polarizability matrix) will be scaled by ``scale``' + doc_diag_shift = 'The diagonal part of the polarizability matrix will be shifted by ``diag_shift``. 
The shift operation is carried out after ``scale``.' + doc_fit_diag = 'Fit the diagonal part of the rotational invariant polarizability matrix, which will be converted to normal polarizability matrix by contracting with the rotation matrix.' doc_sel_type = 'The atom types for which the atomic polarizability will be provided. If not set, all types will be selected.' doc_seed = 'Random seed for parameter initialization of the fitting net' @@ -328,16 +332,30 @@ def training_args(): return Argument("training", dict, args, [], doc = doc_training) -def gen_doc(): +def make_index(keys): + ret = [] + for ii in keys: + ret.append(make_link(ii, ii)) + return ', '.join(ret) + + +def gen_doc(**kwargs): ma = model_args() lra = learning_rate_args() la = loss_args() ta = training_args() ptr = [] - ptr.append(ma.gen_doc()) - ptr.append(la.gen_doc()) - ptr.append(lra.gen_doc()) - ptr.append(ta.gen_doc()) + ptr.append(ma.gen_doc(**kwargs)) + ptr.append(la.gen_doc(**kwargs)) + ptr.append(lra.gen_doc(**kwargs)) + ptr.append(ta.gen_doc(**kwargs)) + + key_words = [] + for ii in "\n\n".join(ptr).split('\n'): + if 'argument path' in ii: + key_words.append(ii.split(':')[1].replace('`','').strip()) + #ptr.insert(0, make_index(key_words)) + return "\n\n".join(ptr) def normalize(data): diff --git a/source/train/doc.py b/source/train/doc.py index 939efe411a..acd481541e 100644 --- a/source/train/doc.py +++ b/source/train/doc.py @@ -1,5 +1,5 @@ from deepmd.argcheck import gen_doc def doc_train_input(args): - doc_str = gen_doc() + doc_str = gen_doc(make_anchor=False) print(doc_str) From f0fe0854fa7b92c253b91a3d557076ccaffe520b Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Sun, 1 Nov 2020 22:54:35 -0500 Subject: [PATCH 43/65] bump docker version in README --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 3aed033a13..0d61de70b6 100644 --- a/README.md +++ b/README.md @@ -109,16 +109,16 @@ conda install deepmd-kit=*=*gpu lammps-dp=*=*gpu -c deepmodeling ``` ### With Docker -A docker for installing the DeePMD-kit is available [here](https://github.com/orgs/deepmodeling/packages/container/deepmd-kit). +A docker for installing the DeePMD-kit is available [here](https://github.com/orgs/deepmodeling/packages/container/package/deepmd-kit). To pull the CPU version: ```bash -docker pull ghcr.io/deepmodeling/deepmd-kit:1.2.0_cpu +docker pull ghcr.io/deepmodeling/deepmd-kit:1.2.2_cpu ``` To pull the GPU version: ```bash -docker pull ghcr.io/deepmodeling/deepmd-kit:1.2.0_cuda10.1_gpu +docker pull ghcr.io/deepmodeling/deepmd-kit:1.2.2_cuda10.1_gpu ``` ## Install the python interface From 2ef41c31fe3cd216e4e3f85aea762ee443068fd8 Mon Sep 17 00:00:00 2001 From: Han Wang Date: Mon, 9 Nov 2020 13:33:53 +0800 Subject: [PATCH 44/65] update doc --- doc/train-input.rst | 57 +++++++++++++++++++++++++++++++-------------- 1 file changed, 40 insertions(+), 17 deletions(-) diff --git a/doc/train-input.rst b/doc/train-input.rst index db1b6b05cc..0162c80a90 100644 --- a/doc/train-input.rst +++ b/doc/train-input.rst @@ -27,6 +27,16 @@ model: | type: ``str`` (flag key) | argument path: ``model/descriptor/type`` + The type of the descritpor. Valid types are `loc_frame`, `se_a`, `se_r` and `se_ar`. + + - `loc_frame`: Defines a local frame at each atom, and the compute the descriptor as local coordinates under this frame. + + - `se_a`: Used by the smooth edition of Deep Potential. The full relative coordinates are used to construct the descriptor. 
+ + - `se_r`: Used by the smooth edition of Deep Potential. Only the distance between atoms is used to construct the descriptor. + + - `se_ar`: A hybrid of `se_a` and `se_r`. Typically `se_a` has a smaller cut-off while the `se_r` has a larger cut-off. + When *type* is set to ``loc_frame``: @@ -250,6 +260,16 @@ model: | type: ``str`` (flag key), default: ``ener`` | argument path: ``model/fitting_net/type`` + The type of the fitting. Valid types are `ener`, `dipole`, `polar` and `global_polar`. + + - `ener`: Fit an energy model (potential energy surface). + + - `dipole`: Fit an atomic dipole model. Atomic dipole labels for all the selected atoms (see `sel_type`) should be provided by `dipole.npy` in each data system. The file has number of frames lines and 3 times of number of selected atoms columns. + + - `polar`: Fit an atomic polarizability model. Atomic polarizability labels for all the selected atoms (see `sel_type`) should be provided by `polarizability.npy` in each data system. The file has number of frames lines and 9 times of number of selected atoms columns. + + - `global_polar`: Fit a polarizability model. Polarizability labels should be provided by `polarizability.npy` in each data system. The file has number of frames lines and 9 columns. + When *type* is set to ``ener``: @@ -290,7 +310,7 @@ model: Whether to use a "Timestep" in the skip connection trainable: - | type: ``list`` | ``bool``, optional, default: ``True`` + | type: ``bool`` | ``list``, optional, default: ``True`` | argument path: ``model/fitting_net[ener]/trainable`` Whether the parameters in the fitting net are trainable. This option can be @@ -345,7 +365,7 @@ model: The precision of the fitting net parameters, supported options are "float64", "float32", "float16". sel_type: - | type: ``list`` | ``int`` | ``NoneType``, optional + | type: ``int`` | ``NoneType`` | ``list``, optional | argument path: ``model/fitting_net[dipole]/sel_type`` The atom types for which the atomic dipole will be provided. If not set, all types will be selected. @@ -390,19 +410,19 @@ model: Fit the diagonal part of the rotational invariant polarizability matrix, which will be converted to normal polarizability matrix by contracting with the rotation matrix. scale: - | type: ``list`` | ``float``, optional, default: ``1.0`` + | type: ``float`` | ``list``, optional, default: ``1.0`` | argument path: ``model/fitting_net[polar]/scale`` The output of the fitting net (polarizability matrix) will be scaled by ``scale`` diag_shift: - | type: ``list`` | ``float``, optional, default: ``0.0`` + | type: ``float`` | ``list``, optional, default: ``0.0`` | argument path: ``model/fitting_net[polar]/diag_shift`` The diagonal part of the polarizability matrix will be shifted by ``diag_shift``. The shift operation is carried out after ``scale``. sel_type: - | type: ``list`` | ``int`` | ``NoneType``, optional + | type: ``int`` | ``NoneType`` | ``list``, optional | argument path: ``model/fitting_net[polar]/sel_type`` The atom types for which the atomic polarizability will be provided. If not set, all types will be selected. @@ -447,19 +467,19 @@ model: Fit the diagonal part of the rotational invariant polarizability matrix, which will be converted to normal polarizability matrix by contracting with the rotation matrix. 
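The new ``fit_diag`` wording above ("contracting with the rotation matrix") can be pictured as building the full matrix from fitted diagonal values. A hedged NumPy sketch; the exact convention in the code may differ, so treat this as intuition only:

```python
import numpy as np

# Assumed construction for intuition: fit rotationally invariant diagonal
# values d and recover a full polarizability matrix as P = R^T diag(d) R.
def diag_to_polar(diag_values, rotation):
    return rotation.T @ np.diag(diag_values) @ rotation

theta = 0.3
R = np.array([[np.cos(theta), -np.sin(theta), 0.0],
              [np.sin(theta),  np.cos(theta), 0.0],
              [0.0,            0.0,           1.0]])
print(diag_to_polar([1.0, 2.0, 3.0], R))
```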
scale: - | type: ``list`` | ``float``, optional, default: ``1.0`` + | type: ``float`` | ``list``, optional, default: ``1.0`` | argument path: ``model/fitting_net[global_polar]/scale`` The output of the fitting net (polarizability matrix) will be scaled by ``scale`` diag_shift: - | type: ``list`` | ``float``, optional, default: ``0.0`` + | type: ``float`` | ``list``, optional, default: ``0.0`` | argument path: ``model/fitting_net[global_polar]/diag_shift`` The diagonal part of the polarizability matrix will be shifted by ``diag_shift``. The shift operation is carried out after ``scale``. sel_type: - | type: ``list`` | ``int`` | ``NoneType``, optional + | type: ``int`` | ``NoneType`` | ``list``, optional | argument path: ``model/fitting_net[global_polar]/sel_type`` The atom types for which the atomic polarizability will be provided. If not set, all types will be selected. @@ -484,53 +504,56 @@ loss: | type: ``str`` (flag key), default: ``ener`` | argument path: ``loss/type`` + The type of the loss. For fitting type `ener`, the loss type should be set to `ener` or left unset. For tensorial fitting types `dipole`, `polar` and `global_polar`, the type should be left unset. + \. + When *type* is set to ``ener``: start_pref_e: - | type: ``int`` | ``float``, optional, default: ``0.02`` + | type: ``float`` | ``int``, optional, default: ``0.02`` | argument path: ``loss[ener]/start_pref_e`` The prefactor of energy loss at the start of the training. Should be larger than or equal to 0. If set to none-zero value, the energy label should be provided by file energy.npy in each data system. If both start_pref_energy and limit_pref_energy are set to 0, then the energy will be ignored. limit_pref_e: - | type: ``int`` | ``float``, optional, default: ``1.0`` + | type: ``float`` | ``int``, optional, default: ``1.0`` | argument path: ``loss[ener]/limit_pref_e`` The prefactor of energy loss at the limit of the training, Should be larger than or equal to 0. i.e. the training step goes to infinity. start_pref_f: - | type: ``int`` | ``float``, optional, default: ``1000`` + | type: ``float`` | ``int``, optional, default: ``1000`` | argument path: ``loss[ener]/start_pref_f`` The prefactor of force loss at the start of the training. Should be larger than or equal to 0. If set to none-zero value, the force label should be provided by file force.npy in each data system. If both start_pref_force and limit_pref_force are set to 0, then the force will be ignored. limit_pref_f: - | type: ``int`` | ``float``, optional, default: ``1.0`` + | type: ``float`` | ``int``, optional, default: ``1.0`` | argument path: ``loss[ener]/limit_pref_f`` The prefactor of force loss at the limit of the training, Should be larger than or equal to 0. i.e. the training step goes to infinity. start_pref_v: - | type: ``int`` | ``float``, optional, default: ``0.0`` + | type: ``float`` | ``int``, optional, default: ``0.0`` | argument path: ``loss[ener]/start_pref_v`` The prefactor of virial loss at the start of the training. Should be larger than or equal to 0. If set to none-zero value, the virial label should be provided by file virial.npy in each data system. If both start_pref_virial and limit_pref_virial are set to 0, then the virial will be ignored. limit_pref_v: - | type: ``int`` | ``float``, optional, default: ``0.0`` + | type: ``float`` | ``int``, optional, default: ``0.0`` | argument path: ``loss[ener]/limit_pref_v`` The prefactor of virial loss at the limit of the training, Should be larger than or equal to 0. i.e. 
the training step goes to infinity. start_pref_ae: - | type: ``int`` | ``float``, optional, default: ``0.0`` + | type: ``float`` | ``int``, optional, default: ``0.0`` | argument path: ``loss[ener]/start_pref_ae`` The prefactor of virial loss at the start of the training. Should be larger than or equal to 0. If set to none-zero value, the virial label should be provided by file virial.npy in each data system. If both start_pref_virial and limit_pref_virial are set to 0, then the virial will be ignored. limit_pref_ae: - | type: ``int`` | ``float``, optional, default: ``0.0`` + | type: ``float`` | ``int``, optional, default: ``0.0`` | argument path: ``loss[ener]/limit_pref_ae`` The prefactor of virial loss at the limit of the training, Should be larger than or equal to 0. i.e. the training step goes to infinity. @@ -592,7 +615,7 @@ training: Number of training batch. Each training uses one batch of data. batch_size: - | type: ``list`` | ``str`` | ``int``, optional, default: ``auto`` + | type: ``int`` | ``list`` | ``str``, optional, default: ``auto`` | argument path: ``training/batch_size`` This key can be From 55baecf1b72a65a8c3bef147ec7c340d0a95c76b Mon Sep 17 00:00:00 2001 From: Han Wang Date: Mon, 9 Nov 2020 13:53:27 +0800 Subject: [PATCH 45/65] add hyperlinks --- doc/train-input.rst | 337 +++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 335 insertions(+), 2 deletions(-) diff --git a/doc/train-input.rst b/doc/train-input.rst index 0162c80a90..e29dd5799b 100644 --- a/doc/train-input.rst +++ b/doc/train-input.rst @@ -1,19 +1,31 @@ +.. raw:: html + + model: | type: ``dict`` | argument path: ``model`` + .. raw:: html + + type_map: | type: ``list``, optional | argument path: ``model/type_map`` A list of strings. Give the name to each type of atoms. + .. raw:: html + + data_stat_nbatch: | type: ``int``, optional, default: ``10`` | argument path: ``model/data_stat_nbatch`` The model determines the normalization from the statistics of the data. This key specifies the number of `frames` in each `system` used for statistics. + .. raw:: html + + descriptor: | type: ``dict`` | argument path: ``model/descriptor`` @@ -23,6 +35,9 @@ model: Depending on the value of *type*, different sub args are accepted. + .. raw:: html + + type: | type: ``str`` (flag key) | argument path: ``model/descriptor/type`` @@ -38,26 +53,41 @@ model: - `se_ar`: A hybrid of `se_a` and `se_r`. Typically `se_a` has a smaller cut-off while the `se_r` has a larger cut-off. + .. raw:: html + + When *type* is set to ``loc_frame``: + .. raw:: html + + sel_a: | type: ``list`` | argument path: ``model/descriptor[loc_frame]/sel_a`` A list of integers. The length of the list should be the same as the number of atom types in the system. `sel_a[i]` gives the selected number of type-i neighbors. The full relative coordinates of the neighbors are used by the descriptor. + .. raw:: html + + sel_r: | type: ``list`` | argument path: ``model/descriptor[loc_frame]/sel_r`` A list of integers. The length of the list should be the same as the number of atom types in the system. `sel_r[i]` gives the selected number of type-i neighbors. Only relative distance of the neighbors are used by the descriptor. sel_a[i] + sel_r[i] is recommended to be larger than the maximally possible number of type-i neighbors in the cut-off radius. + .. raw:: html + + rcut: | type: ``float``, optional, default: ``6.0`` | argument path: ``model/descriptor[loc_frame]/rcut`` The cut-off radius. The default value is 6.0 + .. 
raw:: html + + axis_rule: | type: ``list`` | argument path: ``model/descriptor[loc_frame]/axis_rule`` @@ -77,80 +107,122 @@ model: - axis_rule[i*6+5]: class of the atom defining the second axis of type-i atom. 0 for neighbors with full coordinates and 1 for neighbors only with relative distance. + .. raw:: html + + When *type* is set to ``se_a``: + .. raw:: html + + sel: | type: ``list`` | argument path: ``model/descriptor[se_a]/sel`` A list of integers. The length of the list should be the same as the number of atom types in the system. `sel[i]` gives the selected number of type-i neighbors. `sel[i]` is recommended to be larger than the maximally possible number of type-i neighbors in the cut-off radius. + .. raw:: html + + rcut: | type: ``float``, optional, default: ``6.0`` | argument path: ``model/descriptor[se_a]/rcut`` The cut-off radius. + .. raw:: html + + rcut_smth: | type: ``float``, optional, default: ``0.5`` | argument path: ``model/descriptor[se_a]/rcut_smth`` Where to start smoothing. For example the 1/r term is smoothed from `rcut` to `rcut_smth` + .. raw:: html + + neuron: | type: ``list``, optional, default: ``[10, 20, 40]`` | argument path: ``model/descriptor[se_a]/neuron`` Number of neurons in each hidden layers of the embedding net. When two layers are of the same size or one layer is twice as large as the previous layer, a skip connection is built. + .. raw:: html + + axis_neuron: | type: ``int``, optional, default: ``4`` | argument path: ``model/descriptor[se_a]/axis_neuron`` Size of the submatrix of G (embedding matrix). + .. raw:: html + + activation_function: | type: ``str``, optional, default: ``tanh`` | argument path: ``model/descriptor[se_a]/activation_function`` The activation function in the embedding net. Supported activation functions are "relu", "relu6", "softplus", "sigmoid", "tanh", "gelu". + .. raw:: html + + resnet_dt: | type: ``bool``, optional, default: ``False`` | argument path: ``model/descriptor[se_a]/resnet_dt`` Whether to use a "Timestep" in the skip connection + .. raw:: html + + type_one_side: | type: ``bool``, optional, default: ``False`` | argument path: ``model/descriptor[se_a]/type_one_side`` Try to build N_types embedding nets. Otherwise, building N_types^2 embedding nets + .. raw:: html + + precision: | type: ``str``, optional, default: ``float64`` | argument path: ``model/descriptor[se_a]/precision`` The precision of the embedding net parameters, supported options are "float64", "float32", "float16". + .. raw:: html + + trainable: | type: ``bool``, optional, default: ``True`` | argument path: ``model/descriptor[se_a]/trainable`` If the parameters in the embedding net is trainable + .. raw:: html + + seed: | type: ``int`` | ``NoneType``, optional | argument path: ``model/descriptor[se_a]/seed`` Random seed for parameter initialization + .. raw:: html + + exclude_types: | type: ``list``, optional, default: ``[]`` | argument path: ``model/descriptor[se_a]/exclude_types`` The Excluded types + .. raw:: html + + set_davg_zero: | type: ``bool``, optional, default: ``False`` | argument path: ``model/descriptor[se_a]/set_davg_zero`` @@ -158,74 +230,113 @@ model: Set the normalization average to zero. This option should be set when `atom_ener` in the energy fitting is used + .. raw:: html + + When *type* is set to ``se_r``: + .. raw:: html + + sel: | type: ``list`` | argument path: ``model/descriptor[se_r]/sel`` A list of integers. The length of the list should be the same as the number of atom types in the system. 
`sel[i]` gives the selected number of type-i neighbors. `sel[i]` is recommended to be larger than the maximally possible number of type-i neighbors in the cut-off radius. + .. raw:: html + + rcut: | type: ``float``, optional, default: ``6.0`` | argument path: ``model/descriptor[se_r]/rcut`` The cut-off radius. + .. raw:: html + + rcut_smth: | type: ``float``, optional, default: ``0.5`` | argument path: ``model/descriptor[se_r]/rcut_smth`` Where to start smoothing. For example the 1/r term is smoothed from `rcut` to `rcut_smth` + .. raw:: html + + neuron: | type: ``list``, optional, default: ``[10, 20, 40]`` | argument path: ``model/descriptor[se_r]/neuron`` Number of neurons in each hidden layers of the embedding net. When two layers are of the same size or one layer is twice as large as the previous layer, a skip connection is built. + .. raw:: html + + activation_function: | type: ``str``, optional, default: ``tanh`` | argument path: ``model/descriptor[se_r]/activation_function`` The activation function in the embedding net. Supported activation functions are "relu", "relu6", "softplus", "sigmoid", "tanh", "gelu". + .. raw:: html + + resnet_dt: | type: ``bool``, optional, default: ``False`` | argument path: ``model/descriptor[se_r]/resnet_dt`` Whether to use a "Timestep" in the skip connection + .. raw:: html + + type_one_side: | type: ``bool``, optional, default: ``False`` | argument path: ``model/descriptor[se_r]/type_one_side`` Try to build N_types embedding nets. Otherwise, building N_types^2 embedding nets + .. raw:: html + + precision: | type: ``str``, optional, default: ``float64`` | argument path: ``model/descriptor[se_r]/precision`` The precision of the embedding net parameters, supported options are "float64", "float32", "float16". + .. raw:: html + + trainable: | type: ``bool``, optional, default: ``True`` | argument path: ``model/descriptor[se_r]/trainable`` If the parameters in the embedding net is trainable + .. raw:: html + + seed: | type: ``int`` | ``NoneType``, optional | argument path: ``model/descriptor[se_r]/seed`` Random seed for parameter initialization + .. raw:: html + + exclude_types: | type: ``list``, optional, default: ``[]`` | argument path: ``model/descriptor[se_r]/exclude_types`` The Excluded types + .. raw:: html + + set_davg_zero: | type: ``bool``, optional, default: ``False`` | argument path: ``model/descriptor[se_r]/set_davg_zero`` @@ -233,20 +344,32 @@ model: Set the normalization average to zero. This option should be set when `atom_ener` in the energy fitting is used + .. raw:: html + + When *type* is set to ``se_ar``: + .. raw:: html + + a: | type: ``dict`` | argument path: ``model/descriptor[se_ar]/a`` - The parameters of descriptor `se_a` + The parameters of descriptor `se_a <#model/descriptor[se_a]>`__ + .. raw:: html + + r: | type: ``dict`` | argument path: ``model/descriptor[se_ar]/r`` - The parameters of descriptor `se_r` + The parameters of descriptor `se_r <#model/descriptor[se_r]>`__ + + .. raw:: html + fitting_net: | type: ``dict`` | argument path: ``model/fitting_net`` @@ -256,6 +379,9 @@ model: Depending on the value of *type*, different sub args are accepted. + .. raw:: html + + type: | type: ``str`` (flag key), default: ``ener`` | argument path: ``model/fitting_net/type`` @@ -271,44 +397,68 @@ model: - `global_polar`: Fit a polarizability model. Polarizability labels should be provided by `polarizability.npy` in each data system. The file has number of frames lines and 9 columns. + .. raw:: html + + When *type* is set to ``ener``: + .. 
raw:: html + + numb_fparam: | type: ``int``, optional, default: ``0`` | argument path: ``model/fitting_net[ener]/numb_fparam`` The dimension of the frame parameter. If set to >0, file `fparam.npy` should be included to provided the input fparams. + .. raw:: html + + numb_aparam: | type: ``int``, optional, default: ``0`` | argument path: ``model/fitting_net[ener]/numb_aparam`` The dimension of the atomic parameter. If set to >0, file `aparam.npy` should be included to provided the input aparams. + .. raw:: html + + neuron: | type: ``list``, optional, default: ``[120, 120, 120]`` | argument path: ``model/fitting_net[ener]/neuron`` The number of neurons in each hidden layers of the fitting net. When two hidden layers are of the same size, a skip connection is built. + .. raw:: html + + activation_function: | type: ``str``, optional, default: ``tanh`` | argument path: ``model/fitting_net[ener]/activation_function`` The activation function in the fitting net. Supported activation functions are "relu", "relu6", "softplus", "sigmoid", "tanh", "gelu". + .. raw:: html + + precision: | type: ``str``, optional, default: ``float64`` | argument path: ``model/fitting_net[ener]/precision`` The precision of the fitting net parameters, supported options are "float64", "float32", "float16". + .. raw:: html + + resnet_dt: | type: ``bool``, optional, default: ``True`` | argument path: ``model/fitting_net[ener]/resnet_dt`` Whether to use a "Timestep" in the skip connection + .. raw:: html + + trainable: | type: ``bool`` | ``list``, optional, default: ``True`` | argument path: ``model/fitting_net[ener]/trainable`` @@ -319,18 +469,27 @@ model: - list of bool: Specifies if each layer is trainable. Since the fitting net is composed by hidden layers followed by a output layer, the length of tihs list should be equal to len(`neuron`)+1. + .. raw:: html + + rcond: | type: ``float``, optional, default: ``0.001`` | argument path: ``model/fitting_net[ener]/rcond`` The condition number used to determine the inital energy shift for each type of atoms. + .. raw:: html + + seed: | type: ``int`` | ``NoneType``, optional | argument path: ``model/fitting_net[ener]/seed`` Random seed for parameter initialization of the fitting net + .. raw:: html + + atom_ener: | type: ``list``, optional, default: ``[]`` | argument path: ``model/fitting_net[ener]/atom_ener`` @@ -338,38 +497,59 @@ model: Specify the atomic energy in vacuum for each type + .. raw:: html + + When *type* is set to ``dipole``: + .. raw:: html + + neuron: | type: ``list``, optional, default: ``[120, 120, 120]`` | argument path: ``model/fitting_net[dipole]/neuron`` The number of neurons in each hidden layers of the fitting net. When two hidden layers are of the same size, a skip connection is built. + .. raw:: html + + activation_function: | type: ``str``, optional, default: ``tanh`` | argument path: ``model/fitting_net[dipole]/activation_function`` The activation function in the fitting net. Supported activation functions are "relu", "relu6", "softplus", "sigmoid", "tanh", "gelu". + .. raw:: html + + resnet_dt: | type: ``bool``, optional, default: ``True`` | argument path: ``model/fitting_net[dipole]/resnet_dt`` Whether to use a "Timestep" in the skip connection + .. raw:: html + + precision: | type: ``str``, optional, default: ``float64`` | argument path: ``model/fitting_net[dipole]/precision`` The precision of the fitting net parameters, supported options are "float64", "float32", "float16". + .. 
raw:: html + + sel_type: | type: ``int`` | ``NoneType`` | ``list``, optional | argument path: ``model/fitting_net[dipole]/sel_type`` The atom types for which the atomic dipole will be provided. If not set, all types will be selected. + .. raw:: html + + seed: | type: ``int`` | ``NoneType``, optional | argument path: ``model/fitting_net[dipole]/seed`` @@ -377,56 +557,86 @@ model: Random seed for parameter initialization of the fitting net + .. raw:: html + + When *type* is set to ``polar``: + .. raw:: html + + neuron: | type: ``list``, optional, default: ``[120, 120, 120]`` | argument path: ``model/fitting_net[polar]/neuron`` The number of neurons in each hidden layers of the fitting net. When two hidden layers are of the same size, a skip connection is built. + .. raw:: html + + activation_function: | type: ``str``, optional, default: ``tanh`` | argument path: ``model/fitting_net[polar]/activation_function`` The activation function in the fitting net. Supported activation functions are "relu", "relu6", "softplus", "sigmoid", "tanh", "gelu". + .. raw:: html + + resnet_dt: | type: ``bool``, optional, default: ``True`` | argument path: ``model/fitting_net[polar]/resnet_dt`` Whether to use a "Timestep" in the skip connection + .. raw:: html + + precision: | type: ``str``, optional, default: ``float64`` | argument path: ``model/fitting_net[polar]/precision`` The precision of the fitting net parameters, supported options are "float64", "float32", "float16". + .. raw:: html + + fit_diag: | type: ``bool``, optional, default: ``True`` | argument path: ``model/fitting_net[polar]/fit_diag`` Fit the diagonal part of the rotational invariant polarizability matrix, which will be converted to normal polarizability matrix by contracting with the rotation matrix. + .. raw:: html + + scale: | type: ``float`` | ``list``, optional, default: ``1.0`` | argument path: ``model/fitting_net[polar]/scale`` The output of the fitting net (polarizability matrix) will be scaled by ``scale`` + .. raw:: html + + diag_shift: | type: ``float`` | ``list``, optional, default: ``0.0`` | argument path: ``model/fitting_net[polar]/diag_shift`` The diagonal part of the polarizability matrix will be shifted by ``diag_shift``. The shift operation is carried out after ``scale``. + .. raw:: html + + sel_type: | type: ``int`` | ``NoneType`` | ``list``, optional | argument path: ``model/fitting_net[polar]/sel_type`` The atom types for which the atomic polarizability will be provided. If not set, all types will be selected. + .. raw:: html + + seed: | type: ``int`` | ``NoneType``, optional | argument path: ``model/fitting_net[polar]/seed`` @@ -434,56 +644,86 @@ model: Random seed for parameter initialization of the fitting net + .. raw:: html + + When *type* is set to ``global_polar``: + .. raw:: html + + neuron: | type: ``list``, optional, default: ``[120, 120, 120]`` | argument path: ``model/fitting_net[global_polar]/neuron`` The number of neurons in each hidden layers of the fitting net. When two hidden layers are of the same size, a skip connection is built. + .. raw:: html + + activation_function: | type: ``str``, optional, default: ``tanh`` | argument path: ``model/fitting_net[global_polar]/activation_function`` The activation function in the fitting net. Supported activation functions are "relu", "relu6", "softplus", "sigmoid", "tanh", "gelu". + .. 
raw:: html + + resnet_dt: | type: ``bool``, optional, default: ``True`` | argument path: ``model/fitting_net[global_polar]/resnet_dt`` Whether to use a "Timestep" in the skip connection + .. raw:: html + + precision: | type: ``str``, optional, default: ``float64`` | argument path: ``model/fitting_net[global_polar]/precision`` The precision of the fitting net parameters, supported options are "float64", "float32", "float16". + .. raw:: html + + fit_diag: | type: ``bool``, optional, default: ``True`` | argument path: ``model/fitting_net[global_polar]/fit_diag`` Fit the diagonal part of the rotational invariant polarizability matrix, which will be converted to normal polarizability matrix by contracting with the rotation matrix. + .. raw:: html + + scale: | type: ``float`` | ``list``, optional, default: ``1.0`` | argument path: ``model/fitting_net[global_polar]/scale`` The output of the fitting net (polarizability matrix) will be scaled by ``scale`` + .. raw:: html + + diag_shift: | type: ``float`` | ``list``, optional, default: ``0.0`` | argument path: ``model/fitting_net[global_polar]/diag_shift`` The diagonal part of the polarizability matrix will be shifted by ``diag_shift``. The shift operation is carried out after ``scale``. + .. raw:: html + + sel_type: | type: ``int`` | ``NoneType`` | ``list``, optional | argument path: ``model/fitting_net[global_polar]/sel_type`` The atom types for which the atomic polarizability will be provided. If not set, all types will be selected. + .. raw:: html + + seed: | type: ``int`` | ``NoneType``, optional | argument path: ``model/fitting_net[global_polar]/seed`` @@ -491,6 +731,9 @@ model: Random seed for parameter initialization of the fitting net +.. raw:: html + + loss: | type: ``dict`` | argument path: ``loss`` @@ -500,6 +743,9 @@ loss: Depending on the value of *type*, different sub args are accepted. + .. raw:: html + + type: | type: ``str`` (flag key), default: ``ener`` | argument path: ``loss/type`` @@ -508,56 +754,86 @@ loss: \. + .. raw:: html + + When *type* is set to ``ener``: + .. raw:: html + + start_pref_e: | type: ``float`` | ``int``, optional, default: ``0.02`` | argument path: ``loss[ener]/start_pref_e`` The prefactor of energy loss at the start of the training. Should be larger than or equal to 0. If set to none-zero value, the energy label should be provided by file energy.npy in each data system. If both start_pref_energy and limit_pref_energy are set to 0, then the energy will be ignored. + .. raw:: html + + limit_pref_e: | type: ``float`` | ``int``, optional, default: ``1.0`` | argument path: ``loss[ener]/limit_pref_e`` The prefactor of energy loss at the limit of the training, Should be larger than or equal to 0. i.e. the training step goes to infinity. + .. raw:: html + + start_pref_f: | type: ``float`` | ``int``, optional, default: ``1000`` | argument path: ``loss[ener]/start_pref_f`` The prefactor of force loss at the start of the training. Should be larger than or equal to 0. If set to none-zero value, the force label should be provided by file force.npy in each data system. If both start_pref_force and limit_pref_force are set to 0, then the force will be ignored. + .. raw:: html + + limit_pref_f: | type: ``float`` | ``int``, optional, default: ``1.0`` | argument path: ``loss[ener]/limit_pref_f`` The prefactor of force loss at the limit of the training, Should be larger than or equal to 0. i.e. the training step goes to infinity. + .. 
raw:: html + + start_pref_v: | type: ``float`` | ``int``, optional, default: ``0.0`` | argument path: ``loss[ener]/start_pref_v`` The prefactor of virial loss at the start of the training. Should be larger than or equal to 0. If set to none-zero value, the virial label should be provided by file virial.npy in each data system. If both start_pref_virial and limit_pref_virial are set to 0, then the virial will be ignored. + .. raw:: html + + limit_pref_v: | type: ``float`` | ``int``, optional, default: ``0.0`` | argument path: ``loss[ener]/limit_pref_v`` The prefactor of virial loss at the limit of the training, Should be larger than or equal to 0. i.e. the training step goes to infinity. + .. raw:: html + + start_pref_ae: | type: ``float`` | ``int``, optional, default: ``0.0`` | argument path: ``loss[ener]/start_pref_ae`` The prefactor of virial loss at the start of the training. Should be larger than or equal to 0. If set to none-zero value, the virial label should be provided by file virial.npy in each data system. If both start_pref_virial and limit_pref_virial are set to 0, then the virial will be ignored. + .. raw:: html + + limit_pref_ae: | type: ``float`` | ``int``, optional, default: ``0.0`` | argument path: ``loss[ener]/limit_pref_ae`` The prefactor of virial loss at the limit of the training, Should be larger than or equal to 0. i.e. the training step goes to infinity. + .. raw:: html + + relative_f: | type: ``float`` | ``NoneType``, optional | argument path: ``loss[ener]/relative_f`` @@ -565,24 +841,36 @@ loss: If provided, relative force error will be used in the loss. The difference of force will be normalized by the magnitude of the force in the label with a shift given by `relative_f`, i.e. DF_i / ( || F || + relative_f ) with DF denoting the difference between prediction and label and || F || denoting the L2 norm of the label. +.. raw:: html + + learning_rate: | type: ``dict`` | argument path: ``learning_rate`` The learning rate options + .. raw:: html + + start_lr: | type: ``float``, optional, default: ``0.001`` | argument path: ``learning_rate/start_lr`` The learning rate the start of the training. + .. raw:: html + + stop_lr: | type: ``float``, optional, default: ``1e-08`` | argument path: ``learning_rate/stop_lr`` The desired learning rate at the end of the training. + .. raw:: html + + decay_steps: | type: ``int``, optional, default: ``5000`` | argument path: ``learning_rate/decay_steps`` @@ -590,30 +878,45 @@ learning_rate: The learning rate is decaying every this number of training steps. +.. raw:: html + + training: | type: ``dict`` | argument path: ``training`` The training options + .. raw:: html + + systems: | type: ``list`` | ``str`` | argument path: ``training/systems`` The data systems. This key can be provided with a listthat specifies the systems, or be provided with a string by which the prefix of all systems are given and the list of the systems is automatically generated. + .. raw:: html + + set_prefix: | type: ``str``, optional, default: ``set`` | argument path: ``training/set_prefix`` The prefix of the sets in the systems. + .. raw:: html + + stop_batch: | type: ``int`` | argument path: ``training/stop_batch`` Number of training batch. Each training uses one batch of data. + .. 
raw:: html + + batch_size: | type: ``int`` | ``list`` | ``str``, optional, default: ``auto`` | argument path: ``training/batch_size`` @@ -628,60 +931,90 @@ training: - string "auto:N": automatically determines the batch size os that the batch_size times the number of atoms in the system is no less than N. + .. raw:: html + + seed: | type: ``int`` | ``NoneType``, optional | argument path: ``training/seed`` The random seed for training. + .. raw:: html + + disp_file: | type: ``str``, optional, default: ``lcueve.out`` | argument path: ``training/disp_file`` The file for printing learning curve. + .. raw:: html + + disp_freq: | type: ``int``, optional, default: ``1000`` | argument path: ``training/disp_freq`` The frequency of printing learning curve. + .. raw:: html + + numb_test: | type: ``int``, optional, default: ``1`` | argument path: ``training/numb_test`` Number of frames used for the test during training. + .. raw:: html + + save_freq: | type: ``int``, optional, default: ``1000`` | argument path: ``training/save_freq`` The frequency of saving check point. + .. raw:: html + + save_ckpt: | type: ``str``, optional, default: ``model.ckpt`` | argument path: ``training/save_ckpt`` The file name of saving check point. + .. raw:: html + + disp_training: | type: ``bool``, optional, default: ``True`` | argument path: ``training/disp_training`` Displaying verbose information during training. + .. raw:: html + + time_training: | type: ``bool``, optional, default: ``True`` | argument path: ``training/time_training`` Timing durining training. + .. raw:: html + + profiling: | type: ``bool``, optional, default: ``False`` | argument path: ``training/profiling`` Profiling during training. + .. raw:: html + + profiling_file: | type: ``str``, optional, default: ``timeline.json`` | argument path: ``training/profiling_file`` From 1e483c9656af8aa7de8729307a57b9a402e79388 Mon Sep 17 00:00:00 2001 From: Han Wang Date: Mon, 9 Nov 2020 13:54:14 +0800 Subject: [PATCH 46/65] revise the source for hyperlink --- source/train/argcheck.py | 6 ++++-- source/train/doc.py | 2 +- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/source/train/argcheck.py b/source/train/argcheck.py index a544346c81..89034180ba 100644 --- a/source/train/argcheck.py +++ b/source/train/argcheck.py @@ -103,8 +103,10 @@ def descrpt_se_r_args(): def descrpt_se_ar_args(): - doc_a = 'The parameters of descriptor `se_a`' - doc_r = 'The parameters of descriptor `se_r`' + link = make_link('se_a', 'model/descriptor[se_a]') + doc_a = f'The parameters of descriptor {link}' + link = make_link('se_r', 'model/descriptor[se_r]') + doc_r = f'The parameters of descriptor {link}' return [ Argument("a", dict, optional = False, doc = doc_a), diff --git a/source/train/doc.py b/source/train/doc.py index acd481541e..599f64e247 100644 --- a/source/train/doc.py +++ b/source/train/doc.py @@ -1,5 +1,5 @@ from deepmd.argcheck import gen_doc def doc_train_input(args): - doc_str = gen_doc(make_anchor=False) + doc_str = gen_doc(make_anchor=True) print(doc_str) From 4300687deb1c498ce8c230c81d16f69f22209e25 Mon Sep 17 00:00:00 2001 From: bwang-ecnu <66714922+bwang-ecnu@users.noreply.github.com> Date: Mon, 9 Nov 2020 14:21:04 +0800 Subject: [PATCH 47/65] Update Data.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When condition “if coeff_atom_ener==1:” is established and one would like to ignore the energy_file, it seems to need to replace "coeff_atom_ener=1" with "coeff_ener=0". 
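
For readers following this change and the follow-up in the next patch, a minimal sketch of the branch in `load_energy` that is being edited may help. The sketch below is assembled from the diff context (the names `load_energy`, `load_data`, `coeff_atom_ener`, `coeff_ener`, `atom_ener`, `ener`, `energy_file` all appear in the hunk); it is a simplified illustration, not the verbatim deepmd-kit implementation.

```python
import numpy as np

def load_energy_sketch(set_name, energy_file, nframes,
                       atom_ener, coeff_atom_ener, load_data):
    """Simplified sketch of the branch touched by this patch (assumed, not verbatim).

    load_data(set_name, file_name, shape, must) is assumed to return
    (coeff, data), with coeff == 1 when the file exists in the set and 0 otherwise.
    atom_ener is the per-atom energy array and coeff_atom_ener its coefficient.
    """
    if coeff_atom_ener == 1:
        # Per-atom energies are available: frame energies are their per-frame
        # sums and energy_file is ignored.  The coefficient assignment below is
        # the line this patch edits (and the next patch finally sets it to 1).
        ener = np.sum(atom_ener, axis=1)
        coeff_ener = 1
    else:
        # Otherwise fall back to the frame energies stored in energy_file.
        coeff_ener, ener = load_data(set_name, energy_file, [nframes], False)
    return coeff_ener, ener
```

The follow-up patch below keeps this branch but sets `coeff_ener = 1`, presumably so that the summed per-atom energies are still treated as a valid frame-energy label downstream.
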
--- source/train/Data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/train/Data.py b/source/train/Data.py index 149c76bbe8..80b508e97d 100644 --- a/source/train/Data.py +++ b/source/train/Data.py @@ -476,7 +476,7 @@ def load_energy(self, # ignore energy_file if coeff_atom_ener == 1: ener = np.sum(atom_ener, axis = 1) - coeff_atom_ener = 1 + coeff_ener = 0 # load energy_file else: coeff_ener, ener = self.load_data(set_name, energy_file, [nframes], False) From bec05d71f032bcea83cd8c4293679fc9685c9665 Mon Sep 17 00:00:00 2001 From: bwang-ecnu <66714922+bwang-ecnu@users.noreply.github.com> Date: Fri, 13 Nov 2020 17:18:09 +0800 Subject: [PATCH 48/65] change "coeff_ener=1" to "coeff" --- source/train/Data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/train/Data.py b/source/train/Data.py index 80b508e97d..b38839a546 100644 --- a/source/train/Data.py +++ b/source/train/Data.py @@ -476,7 +476,7 @@ def load_energy(self, # ignore energy_file if coeff_atom_ener == 1: ener = np.sum(atom_ener, axis = 1) - coeff_ener = 0 + coeff_ener = 1 # load energy_file else: coeff_ener, ener = self.load_data(set_name, energy_file, [nframes], False) From b0b62b0d16c2bffbca80d7936d3f84952f4e5072 Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Fri, 20 Nov 2020 14:54:28 -0500 Subject: [PATCH 49/65] allow ntypes_model > ntypes_data (fix #261) Usually, the type number of the model should be equal to that of the data However, nt_model > nt_data should be allowed, since users may only want to train using a dataset that only have some of elements --- source/train/Trainer.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/source/train/Trainer.py b/source/train/Trainer.py index f4830a2eba..b7112370ae 100644 --- a/source/train/Trainer.py +++ b/source/train/Trainer.py @@ -209,7 +209,10 @@ def build (self, data, stop_batch = 0) : self.ntypes = self.model.get_ntypes() - assert (self.ntypes == data.get_ntypes()), "ntypes should match that found in data" + # Usually, the type number of the model should be equal to that of the data + # However, nt_model > nt_data should be allowed, since users may only want to + # train using a dataset that only have some of elements + assert (self.ntypes >= data.get_ntypes()), "ntypes should match that found in data" self.stop_batch = stop_batch self.batch_size = data.get_batch_size() @@ -492,4 +495,4 @@ def test_on_the_fly (self, feed_dict_batch) print_str += " %8.1e\n" % current_lr fp.write(print_str) - fp.flush () \ No newline at end of file + fp.flush () From 35b6eae0015c6ca436b8925da27a1e7a49564f57 Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Sat, 21 Nov 2020 18:43:00 -0500 Subject: [PATCH 50/65] use sphinx to build docs --- .gitignore | 3 +++ doc/Makefile | 20 ++++++++++++++++++++ doc/conf.py | 52 +++++++++++++++++++++++++++++++++++++++++++++++++++ doc/index.rst | 20 ++++++++++++++++++++ doc/make.bat | 35 ++++++++++++++++++++++++++++++++++ 5 files changed, 130 insertions(+) create mode 100644 doc/Makefile create mode 100644 doc/conf.py create mode 100644 doc/index.rst create mode 100644 doc/make.bat diff --git a/.gitignore b/.gitignore index 435a560708..1fb9d5e0ba 100644 --- a/.gitignore +++ b/.gitignore @@ -24,3 +24,6 @@ dist _version.py venv* .vscode/** +_build +_static +_templates diff --git a/doc/Makefile b/doc/Makefile new file mode 100644 index 0000000000..d4bb2cbb9e --- /dev/null +++ b/doc/Makefile @@ -0,0 +1,20 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from 
the command line, and also +# from the environment for the first two. +SPHINXOPTS ?= +SPHINXBUILD ?= sphinx-build +SOURCEDIR = . +BUILDDIR = _build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/doc/conf.py b/doc/conf.py new file mode 100644 index 0000000000..6ba8586e9b --- /dev/null +++ b/doc/conf.py @@ -0,0 +1,52 @@ +# Configuration file for the Sphinx documentation builder. +# +# This file only contains a selection of the most common options. For a full +# list see the documentation: +# https://www.sphinx-doc.org/en/master/usage/configuration.html + +# -- Path setup -------------------------------------------------------------- + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +# +# import os +# import sys +# sys.path.insert(0, os.path.abspath('.')) + + +# -- Project information ----------------------------------------------------- + +project = 'deepmd-kit' +copyright = '2020, dp' +author = 'dp' + + +# -- General configuration --------------------------------------------------- + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. +extensions = [ +] + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates'] + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +# This pattern also affects html_static_path and html_extra_path. +exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] + + +# -- Options for HTML output ------------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +# +html_theme = 'alabaster' + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +html_static_path = ['_static'] \ No newline at end of file diff --git a/doc/index.rst b/doc/index.rst new file mode 100644 index 0000000000..45720203f9 --- /dev/null +++ b/doc/index.rst @@ -0,0 +1,20 @@ +.. deepmd-kit documentation master file, created by + sphinx-quickstart on Sat Nov 21 18:36:24 2020. + You can adapt this file completely to your liking, but it should at least + contain the root `toctree` directive. + +Welcome to deepmd-kit's documentation! +====================================== + +.. toctree:: + :maxdepth: 2 + :caption: Contents: + + + +Indices and tables +================== + +* :ref:`genindex` +* :ref:`modindex` +* :ref:`search` diff --git a/doc/make.bat b/doc/make.bat new file mode 100644 index 0000000000..2119f51099 --- /dev/null +++ b/doc/make.bat @@ -0,0 +1,35 @@ +@ECHO OFF + +pushd %~dp0 + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set SOURCEDIR=. 
+set BUILDDIR=_build + +if "%1" == "" goto help + +%SPHINXBUILD% >NUL 2>NUL +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. + echo.If you don't have Sphinx installed, grab it from + echo.http://sphinx-doc.org/ + exit /b 1 +) + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% + +:end +popd From 106ab31b4ac028a7bec1bbee92b88e510fd93b74 Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Sat, 21 Nov 2020 19:44:42 -0500 Subject: [PATCH 51/65] update docs --- doc/api.rst | 91 ++++ doc/conf.py | 15 +- doc/index.rst | 8 +- doc/install.md | 2 +- doc/train-input-auto.rst | 1023 +++++++++++++++++++++++++++++++++++++ doc/train-input.rst | 1026 +------------------------------------- 6 files changed, 1135 insertions(+), 1030 deletions(-) create mode 100644 doc/api.rst create mode 100644 doc/train-input-auto.rst diff --git a/doc/api.rst b/doc/api.rst new file mode 100644 index 0000000000..17604ae010 --- /dev/null +++ b/doc/api.rst @@ -0,0 +1,91 @@ +DeePMD-kit API +=============== + +.. toctree:: + :maxdepth: 2 + :caption: Contents: + +.. automodule:: deepmd.Data + :members: + :undoc-members: + +.. automodule:: deepmd.DataModifier + :members: + :undoc-members: + +.. automodule:: deepmd.DataSystem + :members: + :undoc-members: + +.. automodule:: deepmd.DeepDipole + :members: + :undoc-members: + +.. automodule:: deepmd.DeepEval + :members: + :undoc-members: + +.. automodule:: deepmd.DeepPolar + :members: + :undoc-members: + +.. automodule:: deepmd.DeepPot + :members: + :undoc-members: + +.. automodule:: deepmd.DeepWFC + :members: + :undoc-members: + +.. automodule:: deepmd.DescrptLocFrame + :members: + :undoc-members: + +.. automodule:: deepmd.DescrptSeA + :members: + :undoc-members: + +.. automodule:: deepmd.DescrptSeAR + :members: + :undoc-members: + +.. automodule:: deepmd.DescrptSeR + :members: + :undoc-members: + +.. automodule:: deepmd.EwaldRecp + :members: + :undoc-members: + +.. automodule:: deepmd.Fitting + :members: + :undoc-members: + +.. automodule:: deepmd.LearningRate + :members: + :undoc-members: + +.. automodule:: deepmd.Local + :members: + :undoc-members: + +.. automodule:: deepmd.Loss + :members: + :undoc-members: + +.. automodule:: deepmd.Model + :members: + :undoc-members: + +.. automodule:: deepmd.Network + :members: + :undoc-members: + +.. automodule:: deepmd.TabInter + :members: + :undoc-members: + +.. automodule:: deepmd.Trainer + :members: + :undoc-members: + diff --git a/doc/conf.py b/doc/conf.py index 6ba8586e9b..790711aa8d 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -17,9 +17,9 @@ # -- Project information ----------------------------------------------------- -project = 'deepmd-kit' -copyright = '2020, dp' -author = 'dp' +project = 'DeePMD-kit' +copyright = '2020, Deep Potential' +author = 'Deep Potential' # -- General configuration --------------------------------------------------- @@ -28,6 +28,9 @@ # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. extensions = [ + 'recommonmark', + "sphinx_rtd_theme", + 'sphinx.ext.autosummary' ] # Add any paths that contain templates here, relative to this directory. @@ -44,9 +47,11 @@ # The theme to use for HTML and HTML Help pages. 
See the documentation for # a list of builtin themes. # -html_theme = 'alabaster' +html_theme = 'sphinx_rtd_theme' # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". -html_static_path = ['_static'] \ No newline at end of file +html_static_path = ['_static'] +autodoc_default_flags = ['members'] +autosummary_generate = True diff --git a/doc/index.rst b/doc/index.rst index 45720203f9..13c0c45b7d 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -3,13 +3,19 @@ You can adapt this file completely to your liking, but it should at least contain the root `toctree` directive. -Welcome to deepmd-kit's documentation! +DeePMD-kit's documentation ====================================== .. toctree:: :maxdepth: 2 :caption: Contents: + + install + use-deepmd-kit + train-input + lammps-pair-style-deepmd + api Indices and tables diff --git a/doc/install.md b/doc/install.md index cf5ddbddbd..5e00d3275a 100644 --- a/doc/install.md +++ b/doc/install.md @@ -126,7 +126,7 @@ gcc --version The C++ interface of DeePMD-kit was tested with compiler gcc >= 4.8. It is noticed that the I-Pi support is only compiled with gcc >= 4.9. -First the C++ interface of Tensorflow should be installed. It is noted that the version of Tensorflow should be in consistent with the python interface. We assume that you have followed our instruction and installed tensorflow python interface 1.14.0 with, then you may follow [the instruction for CPU](doc/install-tf.1.14.md) to install the corresponding C++ interface (CPU only). If one wants GPU supports, he/she should follow [the instruction for GPU](doc/install-tf.1.14-gpu.md) to install the C++ interface. +First the C++ interface of Tensorflow should be installed. It is noted that the version of Tensorflow should be in consistent with the python interface. We assume that you have followed our instruction and installed tensorflow python interface 1.14.0 with, then you may follow [the instruction for CPU](install-tf.1.14.md) to install the corresponding C++ interface (CPU only). If one wants GPU supports, he/she should follow [the instruction for GPU](install-tf.1.14-gpu.md) to install the C++ interface. ### Install the DeePMD-kit's C++ interface diff --git a/doc/train-input-auto.rst b/doc/train-input-auto.rst new file mode 100644 index 0000000000..e29dd5799b --- /dev/null +++ b/doc/train-input-auto.rst @@ -0,0 +1,1023 @@ +.. raw:: html + + +model: + | type: ``dict`` + | argument path: ``model`` + + .. raw:: html + + + type_map: + | type: ``list``, optional + | argument path: ``model/type_map`` + + A list of strings. Give the name to each type of atoms. + + .. raw:: html + + + data_stat_nbatch: + | type: ``int``, optional, default: ``10`` + | argument path: ``model/data_stat_nbatch`` + + The model determines the normalization from the statistics of the data. This key specifies the number of `frames` in each `system` used for statistics. + + .. raw:: html + + + descriptor: + | type: ``dict`` + | argument path: ``model/descriptor`` + + The descriptor of atomic environment. + + + Depending on the value of *type*, different sub args are accepted. + + .. raw:: html + + + type: + | type: ``str`` (flag key) + | argument path: ``model/descriptor/type`` + + The type of the descritpor. Valid types are `loc_frame`, `se_a`, `se_r` and `se_ar`. 
+ + - `loc_frame`: Defines a local frame at each atom, and the compute the descriptor as local coordinates under this frame. + + - `se_a`: Used by the smooth edition of Deep Potential. The full relative coordinates are used to construct the descriptor. + + - `se_r`: Used by the smooth edition of Deep Potential. Only the distance between atoms is used to construct the descriptor. + + - `se_ar`: A hybrid of `se_a` and `se_r`. Typically `se_a` has a smaller cut-off while the `se_r` has a larger cut-off. + + + .. raw:: html + + + When *type* is set to ``loc_frame``: + + .. raw:: html + + + sel_a: + | type: ``list`` + | argument path: ``model/descriptor[loc_frame]/sel_a`` + + A list of integers. The length of the list should be the same as the number of atom types in the system. `sel_a[i]` gives the selected number of type-i neighbors. The full relative coordinates of the neighbors are used by the descriptor. + + .. raw:: html + + + sel_r: + | type: ``list`` + | argument path: ``model/descriptor[loc_frame]/sel_r`` + + A list of integers. The length of the list should be the same as the number of atom types in the system. `sel_r[i]` gives the selected number of type-i neighbors. Only relative distance of the neighbors are used by the descriptor. sel_a[i] + sel_r[i] is recommended to be larger than the maximally possible number of type-i neighbors in the cut-off radius. + + .. raw:: html + + + rcut: + | type: ``float``, optional, default: ``6.0`` + | argument path: ``model/descriptor[loc_frame]/rcut`` + + The cut-off radius. The default value is 6.0 + + .. raw:: html + + + axis_rule: + | type: ``list`` + | argument path: ``model/descriptor[loc_frame]/axis_rule`` + + A list of integers. The length should be 6 times of the number of types. + + - axis_rule[i*6+0]: class of the atom defining the first axis of type-i atom. 0 for neighbors with full coordinates and 1 for neighbors only with relative distance. + + - axis_rule[i*6+1]: type of the atom defining the first axis of type-i atom. + + - axis_rule[i*6+2]: index of the axis atom defining the first axis. Note that the neighbors with the same class and type are sorted according to their relative distance. + + - axis_rule[i*6+3]: class of the atom defining the first axis of type-i atom. 0 for neighbors with full coordinates and 1 for neighbors only with relative distance. + + - axis_rule[i*6+4]: type of the atom defining the second axis of type-i atom. + + - axis_rule[i*6+5]: class of the atom defining the second axis of type-i atom. 0 for neighbors with full coordinates and 1 for neighbors only with relative distance. + + + .. raw:: html + + + When *type* is set to ``se_a``: + + .. raw:: html + + + sel: + | type: ``list`` + | argument path: ``model/descriptor[se_a]/sel`` + + A list of integers. The length of the list should be the same as the number of atom types in the system. `sel[i]` gives the selected number of type-i neighbors. `sel[i]` is recommended to be larger than the maximally possible number of type-i neighbors in the cut-off radius. + + .. raw:: html + + + rcut: + | type: ``float``, optional, default: ``6.0`` + | argument path: ``model/descriptor[se_a]/rcut`` + + The cut-off radius. + + .. raw:: html + + + rcut_smth: + | type: ``float``, optional, default: ``0.5`` + | argument path: ``model/descriptor[se_a]/rcut_smth`` + + Where to start smoothing. For example the 1/r term is smoothed from `rcut` to `rcut_smth` + + .. 
raw:: html + + + neuron: + | type: ``list``, optional, default: ``[10, 20, 40]`` + | argument path: ``model/descriptor[se_a]/neuron`` + + Number of neurons in each hidden layers of the embedding net. When two layers are of the same size or one layer is twice as large as the previous layer, a skip connection is built. + + .. raw:: html + + + axis_neuron: + | type: ``int``, optional, default: ``4`` + | argument path: ``model/descriptor[se_a]/axis_neuron`` + + Size of the submatrix of G (embedding matrix). + + .. raw:: html + + + activation_function: + | type: ``str``, optional, default: ``tanh`` + | argument path: ``model/descriptor[se_a]/activation_function`` + + The activation function in the embedding net. Supported activation functions are "relu", "relu6", "softplus", "sigmoid", "tanh", "gelu". + + .. raw:: html + + + resnet_dt: + | type: ``bool``, optional, default: ``False`` + | argument path: ``model/descriptor[se_a]/resnet_dt`` + + Whether to use a "Timestep" in the skip connection + + .. raw:: html + + + type_one_side: + | type: ``bool``, optional, default: ``False`` + | argument path: ``model/descriptor[se_a]/type_one_side`` + + Try to build N_types embedding nets. Otherwise, building N_types^2 embedding nets + + .. raw:: html + + + precision: + | type: ``str``, optional, default: ``float64`` + | argument path: ``model/descriptor[se_a]/precision`` + + The precision of the embedding net parameters, supported options are "float64", "float32", "float16". + + .. raw:: html + + + trainable: + | type: ``bool``, optional, default: ``True`` + | argument path: ``model/descriptor[se_a]/trainable`` + + If the parameters in the embedding net is trainable + + .. raw:: html + + + seed: + | type: ``int`` | ``NoneType``, optional + | argument path: ``model/descriptor[se_a]/seed`` + + Random seed for parameter initialization + + .. raw:: html + + + exclude_types: + | type: ``list``, optional, default: ``[]`` + | argument path: ``model/descriptor[se_a]/exclude_types`` + + The Excluded types + + .. raw:: html + + + set_davg_zero: + | type: ``bool``, optional, default: ``False`` + | argument path: ``model/descriptor[se_a]/set_davg_zero`` + + Set the normalization average to zero. This option should be set when `atom_ener` in the energy fitting is used + + + .. raw:: html + + + When *type* is set to ``se_r``: + + .. raw:: html + + + sel: + | type: ``list`` + | argument path: ``model/descriptor[se_r]/sel`` + + A list of integers. The length of the list should be the same as the number of atom types in the system. `sel[i]` gives the selected number of type-i neighbors. `sel[i]` is recommended to be larger than the maximally possible number of type-i neighbors in the cut-off radius. + + .. raw:: html + + + rcut: + | type: ``float``, optional, default: ``6.0`` + | argument path: ``model/descriptor[se_r]/rcut`` + + The cut-off radius. + + .. raw:: html + + + rcut_smth: + | type: ``float``, optional, default: ``0.5`` + | argument path: ``model/descriptor[se_r]/rcut_smth`` + + Where to start smoothing. For example the 1/r term is smoothed from `rcut` to `rcut_smth` + + .. raw:: html + + + neuron: + | type: ``list``, optional, default: ``[10, 20, 40]`` + | argument path: ``model/descriptor[se_r]/neuron`` + + Number of neurons in each hidden layers of the embedding net. When two layers are of the same size or one layer is twice as large as the previous layer, a skip connection is built. + + .. 
raw:: html + + + activation_function: + | type: ``str``, optional, default: ``tanh`` + | argument path: ``model/descriptor[se_r]/activation_function`` + + The activation function in the embedding net. Supported activation functions are "relu", "relu6", "softplus", "sigmoid", "tanh", "gelu". + + .. raw:: html + + + resnet_dt: + | type: ``bool``, optional, default: ``False`` + | argument path: ``model/descriptor[se_r]/resnet_dt`` + + Whether to use a "Timestep" in the skip connection + + .. raw:: html + + + type_one_side: + | type: ``bool``, optional, default: ``False`` + | argument path: ``model/descriptor[se_r]/type_one_side`` + + Try to build N_types embedding nets. Otherwise, building N_types^2 embedding nets + + .. raw:: html + + + precision: + | type: ``str``, optional, default: ``float64`` + | argument path: ``model/descriptor[se_r]/precision`` + + The precision of the embedding net parameters, supported options are "float64", "float32", "float16". + + .. raw:: html + + + trainable: + | type: ``bool``, optional, default: ``True`` + | argument path: ``model/descriptor[se_r]/trainable`` + + If the parameters in the embedding net is trainable + + .. raw:: html + + + seed: + | type: ``int`` | ``NoneType``, optional + | argument path: ``model/descriptor[se_r]/seed`` + + Random seed for parameter initialization + + .. raw:: html + + + exclude_types: + | type: ``list``, optional, default: ``[]`` + | argument path: ``model/descriptor[se_r]/exclude_types`` + + The Excluded types + + .. raw:: html + + + set_davg_zero: + | type: ``bool``, optional, default: ``False`` + | argument path: ``model/descriptor[se_r]/set_davg_zero`` + + Set the normalization average to zero. This option should be set when `atom_ener` in the energy fitting is used + + + .. raw:: html + + + When *type* is set to ``se_ar``: + + .. raw:: html + + + a: + | type: ``dict`` + | argument path: ``model/descriptor[se_ar]/a`` + + The parameters of descriptor `se_a <#model/descriptor[se_a]>`__ + + .. raw:: html + + + r: + | type: ``dict`` + | argument path: ``model/descriptor[se_ar]/r`` + + The parameters of descriptor `se_r <#model/descriptor[se_r]>`__ + + .. raw:: html + + + fitting_net: + | type: ``dict`` + | argument path: ``model/fitting_net`` + + The fitting of physical properties. + + + Depending on the value of *type*, different sub args are accepted. + + .. raw:: html + + + type: + | type: ``str`` (flag key), default: ``ener`` + | argument path: ``model/fitting_net/type`` + + The type of the fitting. Valid types are `ener`, `dipole`, `polar` and `global_polar`. + + - `ener`: Fit an energy model (potential energy surface). + + - `dipole`: Fit an atomic dipole model. Atomic dipole labels for all the selected atoms (see `sel_type`) should be provided by `dipole.npy` in each data system. The file has number of frames lines and 3 times of number of selected atoms columns. + + - `polar`: Fit an atomic polarizability model. Atomic polarizability labels for all the selected atoms (see `sel_type`) should be provided by `polarizability.npy` in each data system. The file has number of frames lines and 9 times of number of selected atoms columns. + + - `global_polar`: Fit a polarizability model. Polarizability labels should be provided by `polarizability.npy` in each data system. The file has number of frames lines and 9 columns. + + + .. raw:: html + + + When *type* is set to ``ener``: + + .. 
raw:: html + + + numb_fparam: + | type: ``int``, optional, default: ``0`` + | argument path: ``model/fitting_net[ener]/numb_fparam`` + + The dimension of the frame parameter. If set to >0, file `fparam.npy` should be included to provided the input fparams. + + .. raw:: html + + + numb_aparam: + | type: ``int``, optional, default: ``0`` + | argument path: ``model/fitting_net[ener]/numb_aparam`` + + The dimension of the atomic parameter. If set to >0, file `aparam.npy` should be included to provided the input aparams. + + .. raw:: html + + + neuron: + | type: ``list``, optional, default: ``[120, 120, 120]`` + | argument path: ``model/fitting_net[ener]/neuron`` + + The number of neurons in each hidden layers of the fitting net. When two hidden layers are of the same size, a skip connection is built. + + .. raw:: html + + + activation_function: + | type: ``str``, optional, default: ``tanh`` + | argument path: ``model/fitting_net[ener]/activation_function`` + + The activation function in the fitting net. Supported activation functions are "relu", "relu6", "softplus", "sigmoid", "tanh", "gelu". + + .. raw:: html + + + precision: + | type: ``str``, optional, default: ``float64`` + | argument path: ``model/fitting_net[ener]/precision`` + + The precision of the fitting net parameters, supported options are "float64", "float32", "float16". + + .. raw:: html + + + resnet_dt: + | type: ``bool``, optional, default: ``True`` + | argument path: ``model/fitting_net[ener]/resnet_dt`` + + Whether to use a "Timestep" in the skip connection + + .. raw:: html + + + trainable: + | type: ``bool`` | ``list``, optional, default: ``True`` + | argument path: ``model/fitting_net[ener]/trainable`` + + Whether the parameters in the fitting net are trainable. This option can be + + - bool: True if all parameters of the fitting net are trainable, False otherwise. + + - list of bool: Specifies if each layer is trainable. Since the fitting net is composed by hidden layers followed by a output layer, the length of tihs list should be equal to len(`neuron`)+1. + + .. raw:: html + + + rcond: + | type: ``float``, optional, default: ``0.001`` + | argument path: ``model/fitting_net[ener]/rcond`` + + The condition number used to determine the inital energy shift for each type of atoms. + + .. raw:: html + + + seed: + | type: ``int`` | ``NoneType``, optional + | argument path: ``model/fitting_net[ener]/seed`` + + Random seed for parameter initialization of the fitting net + + .. raw:: html + + + atom_ener: + | type: ``list``, optional, default: ``[]`` + | argument path: ``model/fitting_net[ener]/atom_ener`` + + Specify the atomic energy in vacuum for each type + + + .. raw:: html + + + When *type* is set to ``dipole``: + + .. raw:: html + + + neuron: + | type: ``list``, optional, default: ``[120, 120, 120]`` + | argument path: ``model/fitting_net[dipole]/neuron`` + + The number of neurons in each hidden layers of the fitting net. When two hidden layers are of the same size, a skip connection is built. + + .. raw:: html + + + activation_function: + | type: ``str``, optional, default: ``tanh`` + | argument path: ``model/fitting_net[dipole]/activation_function`` + + The activation function in the fitting net. Supported activation functions are "relu", "relu6", "softplus", "sigmoid", "tanh", "gelu". + + .. raw:: html + + + resnet_dt: + | type: ``bool``, optional, default: ``True`` + | argument path: ``model/fitting_net[dipole]/resnet_dt`` + + Whether to use a "Timestep" in the skip connection + + .. 
raw:: html + + + precision: + | type: ``str``, optional, default: ``float64`` + | argument path: ``model/fitting_net[dipole]/precision`` + + The precision of the fitting net parameters, supported options are "float64", "float32", "float16". + + .. raw:: html + + + sel_type: + | type: ``int`` | ``NoneType`` | ``list``, optional + | argument path: ``model/fitting_net[dipole]/sel_type`` + + The atom types for which the atomic dipole will be provided. If not set, all types will be selected. + + .. raw:: html + + + seed: + | type: ``int`` | ``NoneType``, optional + | argument path: ``model/fitting_net[dipole]/seed`` + + Random seed for parameter initialization of the fitting net + + + .. raw:: html + + + When *type* is set to ``polar``: + + .. raw:: html + + + neuron: + | type: ``list``, optional, default: ``[120, 120, 120]`` + | argument path: ``model/fitting_net[polar]/neuron`` + + The number of neurons in each hidden layers of the fitting net. When two hidden layers are of the same size, a skip connection is built. + + .. raw:: html + + + activation_function: + | type: ``str``, optional, default: ``tanh`` + | argument path: ``model/fitting_net[polar]/activation_function`` + + The activation function in the fitting net. Supported activation functions are "relu", "relu6", "softplus", "sigmoid", "tanh", "gelu". + + .. raw:: html + + + resnet_dt: + | type: ``bool``, optional, default: ``True`` + | argument path: ``model/fitting_net[polar]/resnet_dt`` + + Whether to use a "Timestep" in the skip connection + + .. raw:: html + + + precision: + | type: ``str``, optional, default: ``float64`` + | argument path: ``model/fitting_net[polar]/precision`` + + The precision of the fitting net parameters, supported options are "float64", "float32", "float16". + + .. raw:: html + + + fit_diag: + | type: ``bool``, optional, default: ``True`` + | argument path: ``model/fitting_net[polar]/fit_diag`` + + Fit the diagonal part of the rotational invariant polarizability matrix, which will be converted to normal polarizability matrix by contracting with the rotation matrix. + + .. raw:: html + + + scale: + | type: ``float`` | ``list``, optional, default: ``1.0`` + | argument path: ``model/fitting_net[polar]/scale`` + + The output of the fitting net (polarizability matrix) will be scaled by ``scale`` + + .. raw:: html + + + diag_shift: + | type: ``float`` | ``list``, optional, default: ``0.0`` + | argument path: ``model/fitting_net[polar]/diag_shift`` + + The diagonal part of the polarizability matrix will be shifted by ``diag_shift``. The shift operation is carried out after ``scale``. + + .. raw:: html + + + sel_type: + | type: ``int`` | ``NoneType`` | ``list``, optional + | argument path: ``model/fitting_net[polar]/sel_type`` + + The atom types for which the atomic polarizability will be provided. If not set, all types will be selected. + + .. raw:: html + + + seed: + | type: ``int`` | ``NoneType``, optional + | argument path: ``model/fitting_net[polar]/seed`` + + Random seed for parameter initialization of the fitting net + + + .. raw:: html + + + When *type* is set to ``global_polar``: + + .. raw:: html + + + neuron: + | type: ``list``, optional, default: ``[120, 120, 120]`` + | argument path: ``model/fitting_net[global_polar]/neuron`` + + The number of neurons in each hidden layers of the fitting net. When two hidden layers are of the same size, a skip connection is built. + + .. 
raw:: html + + + activation_function: + | type: ``str``, optional, default: ``tanh`` + | argument path: ``model/fitting_net[global_polar]/activation_function`` + + The activation function in the fitting net. Supported activation functions are "relu", "relu6", "softplus", "sigmoid", "tanh", "gelu". + + .. raw:: html + + + resnet_dt: + | type: ``bool``, optional, default: ``True`` + | argument path: ``model/fitting_net[global_polar]/resnet_dt`` + + Whether to use a "Timestep" in the skip connection + + .. raw:: html + + + precision: + | type: ``str``, optional, default: ``float64`` + | argument path: ``model/fitting_net[global_polar]/precision`` + + The precision of the fitting net parameters, supported options are "float64", "float32", "float16". + + .. raw:: html + + + fit_diag: + | type: ``bool``, optional, default: ``True`` + | argument path: ``model/fitting_net[global_polar]/fit_diag`` + + Fit the diagonal part of the rotational invariant polarizability matrix, which will be converted to normal polarizability matrix by contracting with the rotation matrix. + + .. raw:: html + + + scale: + | type: ``float`` | ``list``, optional, default: ``1.0`` + | argument path: ``model/fitting_net[global_polar]/scale`` + + The output of the fitting net (polarizability matrix) will be scaled by ``scale`` + + .. raw:: html + + + diag_shift: + | type: ``float`` | ``list``, optional, default: ``0.0`` + | argument path: ``model/fitting_net[global_polar]/diag_shift`` + + The diagonal part of the polarizability matrix will be shifted by ``diag_shift``. The shift operation is carried out after ``scale``. + + .. raw:: html + + + sel_type: + | type: ``int`` | ``NoneType`` | ``list``, optional + | argument path: ``model/fitting_net[global_polar]/sel_type`` + + The atom types for which the atomic polarizability will be provided. If not set, all types will be selected. + + .. raw:: html + + + seed: + | type: ``int`` | ``NoneType``, optional + | argument path: ``model/fitting_net[global_polar]/seed`` + + Random seed for parameter initialization of the fitting net + + +.. raw:: html + + +loss: + | type: ``dict`` + | argument path: ``loss`` + + The definition of loss function. The type of the loss depends on the type of the fitting. For fitting type `ener`, the prefactors before energy, force, virial and atomic energy losses may be provided. For fitting type `dipole`, `polar` and `global_polar`, the loss may be an empty `dict` or unset. + + + Depending on the value of *type*, different sub args are accepted. + + .. raw:: html + + + type: + | type: ``str`` (flag key), default: ``ener`` + | argument path: ``loss/type`` + + The type of the loss. For fitting type `ener`, the loss type should be set to `ener` or left unset. For tensorial fitting types `dipole`, `polar` and `global_polar`, the type should be left unset. + \. + + + .. raw:: html + + + When *type* is set to ``ener``: + + .. raw:: html + + + start_pref_e: + | type: ``float`` | ``int``, optional, default: ``0.02`` + | argument path: ``loss[ener]/start_pref_e`` + + The prefactor of energy loss at the start of the training. Should be larger than or equal to 0. If set to none-zero value, the energy label should be provided by file energy.npy in each data system. If both start_pref_energy and limit_pref_energy are set to 0, then the energy will be ignored. + + .. 
raw:: html + + + limit_pref_e: + | type: ``float`` | ``int``, optional, default: ``1.0`` + | argument path: ``loss[ener]/limit_pref_e`` + + The prefactor of energy loss at the limit of the training, Should be larger than or equal to 0. i.e. the training step goes to infinity. + + .. raw:: html + + + start_pref_f: + | type: ``float`` | ``int``, optional, default: ``1000`` + | argument path: ``loss[ener]/start_pref_f`` + + The prefactor of force loss at the start of the training. Should be larger than or equal to 0. If set to none-zero value, the force label should be provided by file force.npy in each data system. If both start_pref_force and limit_pref_force are set to 0, then the force will be ignored. + + .. raw:: html + + + limit_pref_f: + | type: ``float`` | ``int``, optional, default: ``1.0`` + | argument path: ``loss[ener]/limit_pref_f`` + + The prefactor of force loss at the limit of the training, Should be larger than or equal to 0. i.e. the training step goes to infinity. + + .. raw:: html + + + start_pref_v: + | type: ``float`` | ``int``, optional, default: ``0.0`` + | argument path: ``loss[ener]/start_pref_v`` + + The prefactor of virial loss at the start of the training. Should be larger than or equal to 0. If set to none-zero value, the virial label should be provided by file virial.npy in each data system. If both start_pref_virial and limit_pref_virial are set to 0, then the virial will be ignored. + + .. raw:: html + + + limit_pref_v: + | type: ``float`` | ``int``, optional, default: ``0.0`` + | argument path: ``loss[ener]/limit_pref_v`` + + The prefactor of virial loss at the limit of the training, Should be larger than or equal to 0. i.e. the training step goes to infinity. + + .. raw:: html + + + start_pref_ae: + | type: ``float`` | ``int``, optional, default: ``0.0`` + | argument path: ``loss[ener]/start_pref_ae`` + + The prefactor of virial loss at the start of the training. Should be larger than or equal to 0. If set to none-zero value, the virial label should be provided by file virial.npy in each data system. If both start_pref_virial and limit_pref_virial are set to 0, then the virial will be ignored. + + .. raw:: html + + + limit_pref_ae: + | type: ``float`` | ``int``, optional, default: ``0.0`` + | argument path: ``loss[ener]/limit_pref_ae`` + + The prefactor of virial loss at the limit of the training, Should be larger than or equal to 0. i.e. the training step goes to infinity. + + .. raw:: html + + + relative_f: + | type: ``float`` | ``NoneType``, optional + | argument path: ``loss[ener]/relative_f`` + + If provided, relative force error will be used in the loss. The difference of force will be normalized by the magnitude of the force in the label with a shift given by `relative_f`, i.e. DF_i / ( || F || + relative_f ) with DF denoting the difference between prediction and label and || F || denoting the L2 norm of the label. + + +.. raw:: html + + +learning_rate: + | type: ``dict`` + | argument path: ``learning_rate`` + + The learning rate options + + .. raw:: html + + + start_lr: + | type: ``float``, optional, default: ``0.001`` + | argument path: ``learning_rate/start_lr`` + + The learning rate the start of the training. + + .. raw:: html + + + stop_lr: + | type: ``float``, optional, default: ``1e-08`` + | argument path: ``learning_rate/stop_lr`` + + The desired learning rate at the end of the training. + + .. 
raw:: html + + + decay_steps: + | type: ``int``, optional, default: ``5000`` + | argument path: ``learning_rate/decay_steps`` + + The learning rate is decaying every this number of training steps. + + +.. raw:: html + + +training: + | type: ``dict`` + | argument path: ``training`` + + The training options + + .. raw:: html + + + systems: + | type: ``list`` | ``str`` + | argument path: ``training/systems`` + + The data systems. This key can be provided with a listthat specifies the systems, or be provided with a string by which the prefix of all systems are given and the list of the systems is automatically generated. + + .. raw:: html + + + set_prefix: + | type: ``str``, optional, default: ``set`` + | argument path: ``training/set_prefix`` + + The prefix of the sets in the systems. + + .. raw:: html + + + stop_batch: + | type: ``int`` + | argument path: ``training/stop_batch`` + + Number of training batch. Each training uses one batch of data. + + .. raw:: html + + + batch_size: + | type: ``int`` | ``list`` | ``str``, optional, default: ``auto`` + | argument path: ``training/batch_size`` + + This key can be + + - list: the length of which is the same as the `systems`. The batch size of each system is given by the elements of the list. + + - int: all `systems` uses the same batch size. + + - string "auto": automatically determines the batch size os that the batch_size times the number of atoms in the system is no less than 32. + + - string "auto:N": automatically determines the batch size os that the batch_size times the number of atoms in the system is no less than N. + + .. raw:: html + + + seed: + | type: ``int`` | ``NoneType``, optional + | argument path: ``training/seed`` + + The random seed for training. + + .. raw:: html + + + disp_file: + | type: ``str``, optional, default: ``lcueve.out`` + | argument path: ``training/disp_file`` + + The file for printing learning curve. + + .. raw:: html + + + disp_freq: + | type: ``int``, optional, default: ``1000`` + | argument path: ``training/disp_freq`` + + The frequency of printing learning curve. + + .. raw:: html + + + numb_test: + | type: ``int``, optional, default: ``1`` + | argument path: ``training/numb_test`` + + Number of frames used for the test during training. + + .. raw:: html + + + save_freq: + | type: ``int``, optional, default: ``1000`` + | argument path: ``training/save_freq`` + + The frequency of saving check point. + + .. raw:: html + + + save_ckpt: + | type: ``str``, optional, default: ``model.ckpt`` + | argument path: ``training/save_ckpt`` + + The file name of saving check point. + + .. raw:: html + + + disp_training: + | type: ``bool``, optional, default: ``True`` + | argument path: ``training/disp_training`` + + Displaying verbose information during training. + + .. raw:: html + + + time_training: + | type: ``bool``, optional, default: ``True`` + | argument path: ``training/time_training`` + + Timing durining training. + + .. raw:: html + + + profiling: + | type: ``bool``, optional, default: ``False`` + | argument path: ``training/profiling`` + + Profiling during training. + + .. raw:: html + + + profiling_file: + | type: ``str``, optional, default: ``timeline.json`` + | argument path: ``training/profiling_file`` + + Output file for profiling. + diff --git a/doc/train-input.rst b/doc/train-input.rst index e29dd5799b..aa6c7bd01e 100644 --- a/doc/train-input.rst +++ b/doc/train-input.rst @@ -1,1023 +1,3 @@ -.. raw:: html - - -model: - | type: ``dict`` - | argument path: ``model`` - - .. 
raw:: html - - - type_map: - | type: ``list``, optional - | argument path: ``model/type_map`` - - A list of strings. Give the name to each type of atoms. - - .. raw:: html - - - data_stat_nbatch: - | type: ``int``, optional, default: ``10`` - | argument path: ``model/data_stat_nbatch`` - - The model determines the normalization from the statistics of the data. This key specifies the number of `frames` in each `system` used for statistics. - - .. raw:: html - - - descriptor: - | type: ``dict`` - | argument path: ``model/descriptor`` - - The descriptor of atomic environment. - - - Depending on the value of *type*, different sub args are accepted. - - .. raw:: html - - - type: - | type: ``str`` (flag key) - | argument path: ``model/descriptor/type`` - - The type of the descritpor. Valid types are `loc_frame`, `se_a`, `se_r` and `se_ar`. - - - `loc_frame`: Defines a local frame at each atom, and the compute the descriptor as local coordinates under this frame. - - - `se_a`: Used by the smooth edition of Deep Potential. The full relative coordinates are used to construct the descriptor. - - - `se_r`: Used by the smooth edition of Deep Potential. Only the distance between atoms is used to construct the descriptor. - - - `se_ar`: A hybrid of `se_a` and `se_r`. Typically `se_a` has a smaller cut-off while the `se_r` has a larger cut-off. - - - .. raw:: html - - - When *type* is set to ``loc_frame``: - - .. raw:: html - - - sel_a: - | type: ``list`` - | argument path: ``model/descriptor[loc_frame]/sel_a`` - - A list of integers. The length of the list should be the same as the number of atom types in the system. `sel_a[i]` gives the selected number of type-i neighbors. The full relative coordinates of the neighbors are used by the descriptor. - - .. raw:: html - - - sel_r: - | type: ``list`` - | argument path: ``model/descriptor[loc_frame]/sel_r`` - - A list of integers. The length of the list should be the same as the number of atom types in the system. `sel_r[i]` gives the selected number of type-i neighbors. Only relative distance of the neighbors are used by the descriptor. sel_a[i] + sel_r[i] is recommended to be larger than the maximally possible number of type-i neighbors in the cut-off radius. - - .. raw:: html - - - rcut: - | type: ``float``, optional, default: ``6.0`` - | argument path: ``model/descriptor[loc_frame]/rcut`` - - The cut-off radius. The default value is 6.0 - - .. raw:: html - - - axis_rule: - | type: ``list`` - | argument path: ``model/descriptor[loc_frame]/axis_rule`` - - A list of integers. The length should be 6 times of the number of types. - - - axis_rule[i*6+0]: class of the atom defining the first axis of type-i atom. 0 for neighbors with full coordinates and 1 for neighbors only with relative distance. - - - axis_rule[i*6+1]: type of the atom defining the first axis of type-i atom. - - - axis_rule[i*6+2]: index of the axis atom defining the first axis. Note that the neighbors with the same class and type are sorted according to their relative distance. - - - axis_rule[i*6+3]: class of the atom defining the first axis of type-i atom. 0 for neighbors with full coordinates and 1 for neighbors only with relative distance. - - - axis_rule[i*6+4]: type of the atom defining the second axis of type-i atom. - - - axis_rule[i*6+5]: class of the atom defining the second axis of type-i atom. 0 for neighbors with full coordinates and 1 for neighbors only with relative distance. - - - .. raw:: html - - - When *type* is set to ``se_a``: - - .. 
raw:: html - - - sel: - | type: ``list`` - | argument path: ``model/descriptor[se_a]/sel`` - - A list of integers. The length of the list should be the same as the number of atom types in the system. `sel[i]` gives the selected number of type-i neighbors. `sel[i]` is recommended to be larger than the maximally possible number of type-i neighbors in the cut-off radius. - - .. raw:: html - - - rcut: - | type: ``float``, optional, default: ``6.0`` - | argument path: ``model/descriptor[se_a]/rcut`` - - The cut-off radius. - - .. raw:: html - - - rcut_smth: - | type: ``float``, optional, default: ``0.5`` - | argument path: ``model/descriptor[se_a]/rcut_smth`` - - Where to start smoothing. For example the 1/r term is smoothed from `rcut` to `rcut_smth` - - .. raw:: html - - - neuron: - | type: ``list``, optional, default: ``[10, 20, 40]`` - | argument path: ``model/descriptor[se_a]/neuron`` - - Number of neurons in each hidden layers of the embedding net. When two layers are of the same size or one layer is twice as large as the previous layer, a skip connection is built. - - .. raw:: html - - - axis_neuron: - | type: ``int``, optional, default: ``4`` - | argument path: ``model/descriptor[se_a]/axis_neuron`` - - Size of the submatrix of G (embedding matrix). - - .. raw:: html - - - activation_function: - | type: ``str``, optional, default: ``tanh`` - | argument path: ``model/descriptor[se_a]/activation_function`` - - The activation function in the embedding net. Supported activation functions are "relu", "relu6", "softplus", "sigmoid", "tanh", "gelu". - - .. raw:: html - - - resnet_dt: - | type: ``bool``, optional, default: ``False`` - | argument path: ``model/descriptor[se_a]/resnet_dt`` - - Whether to use a "Timestep" in the skip connection - - .. raw:: html - - - type_one_side: - | type: ``bool``, optional, default: ``False`` - | argument path: ``model/descriptor[se_a]/type_one_side`` - - Try to build N_types embedding nets. Otherwise, building N_types^2 embedding nets - - .. raw:: html - - - precision: - | type: ``str``, optional, default: ``float64`` - | argument path: ``model/descriptor[se_a]/precision`` - - The precision of the embedding net parameters, supported options are "float64", "float32", "float16". - - .. raw:: html - - - trainable: - | type: ``bool``, optional, default: ``True`` - | argument path: ``model/descriptor[se_a]/trainable`` - - If the parameters in the embedding net is trainable - - .. raw:: html - - - seed: - | type: ``int`` | ``NoneType``, optional - | argument path: ``model/descriptor[se_a]/seed`` - - Random seed for parameter initialization - - .. raw:: html - - - exclude_types: - | type: ``list``, optional, default: ``[]`` - | argument path: ``model/descriptor[se_a]/exclude_types`` - - The Excluded types - - .. raw:: html - - - set_davg_zero: - | type: ``bool``, optional, default: ``False`` - | argument path: ``model/descriptor[se_a]/set_davg_zero`` - - Set the normalization average to zero. This option should be set when `atom_ener` in the energy fitting is used - - - .. raw:: html - - - When *type* is set to ``se_r``: - - .. raw:: html - - - sel: - | type: ``list`` - | argument path: ``model/descriptor[se_r]/sel`` - - A list of integers. The length of the list should be the same as the number of atom types in the system. `sel[i]` gives the selected number of type-i neighbors. `sel[i]` is recommended to be larger than the maximally possible number of type-i neighbors in the cut-off radius. - - .. 
raw:: html - - - rcut: - | type: ``float``, optional, default: ``6.0`` - | argument path: ``model/descriptor[se_r]/rcut`` - - The cut-off radius. - - .. raw:: html - - - rcut_smth: - | type: ``float``, optional, default: ``0.5`` - | argument path: ``model/descriptor[se_r]/rcut_smth`` - - Where to start smoothing. For example the 1/r term is smoothed from `rcut` to `rcut_smth` - - .. raw:: html - - - neuron: - | type: ``list``, optional, default: ``[10, 20, 40]`` - | argument path: ``model/descriptor[se_r]/neuron`` - - Number of neurons in each hidden layers of the embedding net. When two layers are of the same size or one layer is twice as large as the previous layer, a skip connection is built. - - .. raw:: html - - - activation_function: - | type: ``str``, optional, default: ``tanh`` - | argument path: ``model/descriptor[se_r]/activation_function`` - - The activation function in the embedding net. Supported activation functions are "relu", "relu6", "softplus", "sigmoid", "tanh", "gelu". - - .. raw:: html - - - resnet_dt: - | type: ``bool``, optional, default: ``False`` - | argument path: ``model/descriptor[se_r]/resnet_dt`` - - Whether to use a "Timestep" in the skip connection - - .. raw:: html - - - type_one_side: - | type: ``bool``, optional, default: ``False`` - | argument path: ``model/descriptor[se_r]/type_one_side`` - - Try to build N_types embedding nets. Otherwise, building N_types^2 embedding nets - - .. raw:: html - - - precision: - | type: ``str``, optional, default: ``float64`` - | argument path: ``model/descriptor[se_r]/precision`` - - The precision of the embedding net parameters, supported options are "float64", "float32", "float16". - - .. raw:: html - - - trainable: - | type: ``bool``, optional, default: ``True`` - | argument path: ``model/descriptor[se_r]/trainable`` - - If the parameters in the embedding net is trainable - - .. raw:: html - - - seed: - | type: ``int`` | ``NoneType``, optional - | argument path: ``model/descriptor[se_r]/seed`` - - Random seed for parameter initialization - - .. raw:: html - - - exclude_types: - | type: ``list``, optional, default: ``[]`` - | argument path: ``model/descriptor[se_r]/exclude_types`` - - The Excluded types - - .. raw:: html - - - set_davg_zero: - | type: ``bool``, optional, default: ``False`` - | argument path: ``model/descriptor[se_r]/set_davg_zero`` - - Set the normalization average to zero. This option should be set when `atom_ener` in the energy fitting is used - - - .. raw:: html - - - When *type* is set to ``se_ar``: - - .. raw:: html - - - a: - | type: ``dict`` - | argument path: ``model/descriptor[se_ar]/a`` - - The parameters of descriptor `se_a <#model/descriptor[se_a]>`__ - - .. raw:: html - - - r: - | type: ``dict`` - | argument path: ``model/descriptor[se_ar]/r`` - - The parameters of descriptor `se_r <#model/descriptor[se_r]>`__ - - .. raw:: html - - - fitting_net: - | type: ``dict`` - | argument path: ``model/fitting_net`` - - The fitting of physical properties. - - - Depending on the value of *type*, different sub args are accepted. - - .. raw:: html - - - type: - | type: ``str`` (flag key), default: ``ener`` - | argument path: ``model/fitting_net/type`` - - The type of the fitting. Valid types are `ener`, `dipole`, `polar` and `global_polar`. - - - `ener`: Fit an energy model (potential energy surface). - - - `dipole`: Fit an atomic dipole model. Atomic dipole labels for all the selected atoms (see `sel_type`) should be provided by `dipole.npy` in each data system. 
The file has number of frames lines and 3 times of number of selected atoms columns. - - - `polar`: Fit an atomic polarizability model. Atomic polarizability labels for all the selected atoms (see `sel_type`) should be provided by `polarizability.npy` in each data system. The file has number of frames lines and 9 times of number of selected atoms columns. - - - `global_polar`: Fit a polarizability model. Polarizability labels should be provided by `polarizability.npy` in each data system. The file has number of frames lines and 9 columns. - - - .. raw:: html - - - When *type* is set to ``ener``: - - .. raw:: html - - - numb_fparam: - | type: ``int``, optional, default: ``0`` - | argument path: ``model/fitting_net[ener]/numb_fparam`` - - The dimension of the frame parameter. If set to >0, file `fparam.npy` should be included to provided the input fparams. - - .. raw:: html - - - numb_aparam: - | type: ``int``, optional, default: ``0`` - | argument path: ``model/fitting_net[ener]/numb_aparam`` - - The dimension of the atomic parameter. If set to >0, file `aparam.npy` should be included to provided the input aparams. - - .. raw:: html - - - neuron: - | type: ``list``, optional, default: ``[120, 120, 120]`` - | argument path: ``model/fitting_net[ener]/neuron`` - - The number of neurons in each hidden layers of the fitting net. When two hidden layers are of the same size, a skip connection is built. - - .. raw:: html - - - activation_function: - | type: ``str``, optional, default: ``tanh`` - | argument path: ``model/fitting_net[ener]/activation_function`` - - The activation function in the fitting net. Supported activation functions are "relu", "relu6", "softplus", "sigmoid", "tanh", "gelu". - - .. raw:: html - - - precision: - | type: ``str``, optional, default: ``float64`` - | argument path: ``model/fitting_net[ener]/precision`` - - The precision of the fitting net parameters, supported options are "float64", "float32", "float16". - - .. raw:: html - - - resnet_dt: - | type: ``bool``, optional, default: ``True`` - | argument path: ``model/fitting_net[ener]/resnet_dt`` - - Whether to use a "Timestep" in the skip connection - - .. raw:: html - - - trainable: - | type: ``bool`` | ``list``, optional, default: ``True`` - | argument path: ``model/fitting_net[ener]/trainable`` - - Whether the parameters in the fitting net are trainable. This option can be - - - bool: True if all parameters of the fitting net are trainable, False otherwise. - - - list of bool: Specifies if each layer is trainable. Since the fitting net is composed by hidden layers followed by a output layer, the length of tihs list should be equal to len(`neuron`)+1. - - .. raw:: html - - - rcond: - | type: ``float``, optional, default: ``0.001`` - | argument path: ``model/fitting_net[ener]/rcond`` - - The condition number used to determine the inital energy shift for each type of atoms. - - .. raw:: html - - - seed: - | type: ``int`` | ``NoneType``, optional - | argument path: ``model/fitting_net[ener]/seed`` - - Random seed for parameter initialization of the fitting net - - .. raw:: html - - - atom_ener: - | type: ``list``, optional, default: ``[]`` - | argument path: ``model/fitting_net[ener]/atom_ener`` - - Specify the atomic energy in vacuum for each type - - - .. raw:: html - - - When *type* is set to ``dipole``: - - .. raw:: html - - - neuron: - | type: ``list``, optional, default: ``[120, 120, 120]`` - | argument path: ``model/fitting_net[dipole]/neuron`` - - The number of neurons in each hidden layers of the fitting net. 
When two hidden layers are of the same size, a skip connection is built. - - .. raw:: html - - - activation_function: - | type: ``str``, optional, default: ``tanh`` - | argument path: ``model/fitting_net[dipole]/activation_function`` - - The activation function in the fitting net. Supported activation functions are "relu", "relu6", "softplus", "sigmoid", "tanh", "gelu". - - .. raw:: html - - - resnet_dt: - | type: ``bool``, optional, default: ``True`` - | argument path: ``model/fitting_net[dipole]/resnet_dt`` - - Whether to use a "Timestep" in the skip connection - - .. raw:: html - - - precision: - | type: ``str``, optional, default: ``float64`` - | argument path: ``model/fitting_net[dipole]/precision`` - - The precision of the fitting net parameters, supported options are "float64", "float32", "float16". - - .. raw:: html - - - sel_type: - | type: ``int`` | ``NoneType`` | ``list``, optional - | argument path: ``model/fitting_net[dipole]/sel_type`` - - The atom types for which the atomic dipole will be provided. If not set, all types will be selected. - - .. raw:: html - - - seed: - | type: ``int`` | ``NoneType``, optional - | argument path: ``model/fitting_net[dipole]/seed`` - - Random seed for parameter initialization of the fitting net - - - .. raw:: html - - - When *type* is set to ``polar``: - - .. raw:: html - - - neuron: - | type: ``list``, optional, default: ``[120, 120, 120]`` - | argument path: ``model/fitting_net[polar]/neuron`` - - The number of neurons in each hidden layers of the fitting net. When two hidden layers are of the same size, a skip connection is built. - - .. raw:: html - - - activation_function: - | type: ``str``, optional, default: ``tanh`` - | argument path: ``model/fitting_net[polar]/activation_function`` - - The activation function in the fitting net. Supported activation functions are "relu", "relu6", "softplus", "sigmoid", "tanh", "gelu". - - .. raw:: html - - - resnet_dt: - | type: ``bool``, optional, default: ``True`` - | argument path: ``model/fitting_net[polar]/resnet_dt`` - - Whether to use a "Timestep" in the skip connection - - .. raw:: html - - - precision: - | type: ``str``, optional, default: ``float64`` - | argument path: ``model/fitting_net[polar]/precision`` - - The precision of the fitting net parameters, supported options are "float64", "float32", "float16". - - .. raw:: html - - - fit_diag: - | type: ``bool``, optional, default: ``True`` - | argument path: ``model/fitting_net[polar]/fit_diag`` - - Fit the diagonal part of the rotational invariant polarizability matrix, which will be converted to normal polarizability matrix by contracting with the rotation matrix. - - .. raw:: html - - - scale: - | type: ``float`` | ``list``, optional, default: ``1.0`` - | argument path: ``model/fitting_net[polar]/scale`` - - The output of the fitting net (polarizability matrix) will be scaled by ``scale`` - - .. raw:: html - - - diag_shift: - | type: ``float`` | ``list``, optional, default: ``0.0`` - | argument path: ``model/fitting_net[polar]/diag_shift`` - - The diagonal part of the polarizability matrix will be shifted by ``diag_shift``. The shift operation is carried out after ``scale``. - - .. raw:: html - - - sel_type: - | type: ``int`` | ``NoneType`` | ``list``, optional - | argument path: ``model/fitting_net[polar]/sel_type`` - - The atom types for which the atomic polarizability will be provided. If not set, all types will be selected. - - .. 
raw:: html - - - seed: - | type: ``int`` | ``NoneType``, optional - | argument path: ``model/fitting_net[polar]/seed`` - - Random seed for parameter initialization of the fitting net - - - .. raw:: html - - - When *type* is set to ``global_polar``: - - .. raw:: html - - - neuron: - | type: ``list``, optional, default: ``[120, 120, 120]`` - | argument path: ``model/fitting_net[global_polar]/neuron`` - - The number of neurons in each hidden layers of the fitting net. When two hidden layers are of the same size, a skip connection is built. - - .. raw:: html - - - activation_function: - | type: ``str``, optional, default: ``tanh`` - | argument path: ``model/fitting_net[global_polar]/activation_function`` - - The activation function in the fitting net. Supported activation functions are "relu", "relu6", "softplus", "sigmoid", "tanh", "gelu". - - .. raw:: html - - - resnet_dt: - | type: ``bool``, optional, default: ``True`` - | argument path: ``model/fitting_net[global_polar]/resnet_dt`` - - Whether to use a "Timestep" in the skip connection - - .. raw:: html - - - precision: - | type: ``str``, optional, default: ``float64`` - | argument path: ``model/fitting_net[global_polar]/precision`` - - The precision of the fitting net parameters, supported options are "float64", "float32", "float16". - - .. raw:: html - - - fit_diag: - | type: ``bool``, optional, default: ``True`` - | argument path: ``model/fitting_net[global_polar]/fit_diag`` - - Fit the diagonal part of the rotational invariant polarizability matrix, which will be converted to normal polarizability matrix by contracting with the rotation matrix. - - .. raw:: html - - - scale: - | type: ``float`` | ``list``, optional, default: ``1.0`` - | argument path: ``model/fitting_net[global_polar]/scale`` - - The output of the fitting net (polarizability matrix) will be scaled by ``scale`` - - .. raw:: html - - - diag_shift: - | type: ``float`` | ``list``, optional, default: ``0.0`` - | argument path: ``model/fitting_net[global_polar]/diag_shift`` - - The diagonal part of the polarizability matrix will be shifted by ``diag_shift``. The shift operation is carried out after ``scale``. - - .. raw:: html - - - sel_type: - | type: ``int`` | ``NoneType`` | ``list``, optional - | argument path: ``model/fitting_net[global_polar]/sel_type`` - - The atom types for which the atomic polarizability will be provided. If not set, all types will be selected. - - .. raw:: html - - - seed: - | type: ``int`` | ``NoneType``, optional - | argument path: ``model/fitting_net[global_polar]/seed`` - - Random seed for parameter initialization of the fitting net - - -.. raw:: html - - -loss: - | type: ``dict`` - | argument path: ``loss`` - - The definition of loss function. The type of the loss depends on the type of the fitting. For fitting type `ener`, the prefactors before energy, force, virial and atomic energy losses may be provided. For fitting type `dipole`, `polar` and `global_polar`, the loss may be an empty `dict` or unset. - - - Depending on the value of *type*, different sub args are accepted. - - .. raw:: html - - - type: - | type: ``str`` (flag key), default: ``ener`` - | argument path: ``loss/type`` - - The type of the loss. For fitting type `ener`, the loss type should be set to `ener` or left unset. For tensorial fitting types `dipole`, `polar` and `global_polar`, the type should be left unset. - \. - - - .. raw:: html - - - When *type* is set to ``ener``: - - .. 
raw:: html - - - start_pref_e: - | type: ``float`` | ``int``, optional, default: ``0.02`` - | argument path: ``loss[ener]/start_pref_e`` - - The prefactor of energy loss at the start of the training. Should be larger than or equal to 0. If set to none-zero value, the energy label should be provided by file energy.npy in each data system. If both start_pref_energy and limit_pref_energy are set to 0, then the energy will be ignored. - - .. raw:: html - - - limit_pref_e: - | type: ``float`` | ``int``, optional, default: ``1.0`` - | argument path: ``loss[ener]/limit_pref_e`` - - The prefactor of energy loss at the limit of the training, Should be larger than or equal to 0. i.e. the training step goes to infinity. - - .. raw:: html - - - start_pref_f: - | type: ``float`` | ``int``, optional, default: ``1000`` - | argument path: ``loss[ener]/start_pref_f`` - - The prefactor of force loss at the start of the training. Should be larger than or equal to 0. If set to none-zero value, the force label should be provided by file force.npy in each data system. If both start_pref_force and limit_pref_force are set to 0, then the force will be ignored. - - .. raw:: html - - - limit_pref_f: - | type: ``float`` | ``int``, optional, default: ``1.0`` - | argument path: ``loss[ener]/limit_pref_f`` - - The prefactor of force loss at the limit of the training, Should be larger than or equal to 0. i.e. the training step goes to infinity. - - .. raw:: html - - - start_pref_v: - | type: ``float`` | ``int``, optional, default: ``0.0`` - | argument path: ``loss[ener]/start_pref_v`` - - The prefactor of virial loss at the start of the training. Should be larger than or equal to 0. If set to none-zero value, the virial label should be provided by file virial.npy in each data system. If both start_pref_virial and limit_pref_virial are set to 0, then the virial will be ignored. - - .. raw:: html - - - limit_pref_v: - | type: ``float`` | ``int``, optional, default: ``0.0`` - | argument path: ``loss[ener]/limit_pref_v`` - - The prefactor of virial loss at the limit of the training, Should be larger than or equal to 0. i.e. the training step goes to infinity. - - .. raw:: html - - - start_pref_ae: - | type: ``float`` | ``int``, optional, default: ``0.0`` - | argument path: ``loss[ener]/start_pref_ae`` - - The prefactor of virial loss at the start of the training. Should be larger than or equal to 0. If set to none-zero value, the virial label should be provided by file virial.npy in each data system. If both start_pref_virial and limit_pref_virial are set to 0, then the virial will be ignored. - - .. raw:: html - - - limit_pref_ae: - | type: ``float`` | ``int``, optional, default: ``0.0`` - | argument path: ``loss[ener]/limit_pref_ae`` - - The prefactor of virial loss at the limit of the training, Should be larger than or equal to 0. i.e. the training step goes to infinity. - - .. raw:: html - - - relative_f: - | type: ``float`` | ``NoneType``, optional - | argument path: ``loss[ener]/relative_f`` - - If provided, relative force error will be used in the loss. The difference of force will be normalized by the magnitude of the force in the label with a shift given by `relative_f`, i.e. DF_i / ( || F || + relative_f ) with DF denoting the difference between prediction and label and || F || denoting the L2 norm of the label. - - -.. raw:: html - - -learning_rate: - | type: ``dict`` - | argument path: ``learning_rate`` - - The learning rate options - - .. 
raw:: html - - - start_lr: - | type: ``float``, optional, default: ``0.001`` - | argument path: ``learning_rate/start_lr`` - - The learning rate the start of the training. - - .. raw:: html - - - stop_lr: - | type: ``float``, optional, default: ``1e-08`` - | argument path: ``learning_rate/stop_lr`` - - The desired learning rate at the end of the training. - - .. raw:: html - - - decay_steps: - | type: ``int``, optional, default: ``5000`` - | argument path: ``learning_rate/decay_steps`` - - The learning rate is decaying every this number of training steps. - - -.. raw:: html - - -training: - | type: ``dict`` - | argument path: ``training`` - - The training options - - .. raw:: html - - - systems: - | type: ``list`` | ``str`` - | argument path: ``training/systems`` - - The data systems. This key can be provided with a listthat specifies the systems, or be provided with a string by which the prefix of all systems are given and the list of the systems is automatically generated. - - .. raw:: html - - - set_prefix: - | type: ``str``, optional, default: ``set`` - | argument path: ``training/set_prefix`` - - The prefix of the sets in the systems. - - .. raw:: html - - - stop_batch: - | type: ``int`` - | argument path: ``training/stop_batch`` - - Number of training batch. Each training uses one batch of data. - - .. raw:: html - - - batch_size: - | type: ``int`` | ``list`` | ``str``, optional, default: ``auto`` - | argument path: ``training/batch_size`` - - This key can be - - - list: the length of which is the same as the `systems`. The batch size of each system is given by the elements of the list. - - - int: all `systems` uses the same batch size. - - - string "auto": automatically determines the batch size os that the batch_size times the number of atoms in the system is no less than 32. - - - string "auto:N": automatically determines the batch size os that the batch_size times the number of atoms in the system is no less than N. - - .. raw:: html - - - seed: - | type: ``int`` | ``NoneType``, optional - | argument path: ``training/seed`` - - The random seed for training. - - .. raw:: html - - - disp_file: - | type: ``str``, optional, default: ``lcueve.out`` - | argument path: ``training/disp_file`` - - The file for printing learning curve. - - .. raw:: html - - - disp_freq: - | type: ``int``, optional, default: ``1000`` - | argument path: ``training/disp_freq`` - - The frequency of printing learning curve. - - .. raw:: html - - - numb_test: - | type: ``int``, optional, default: ``1`` - | argument path: ``training/numb_test`` - - Number of frames used for the test during training. - - .. raw:: html - - - save_freq: - | type: ``int``, optional, default: ``1000`` - | argument path: ``training/save_freq`` - - The frequency of saving check point. - - .. raw:: html - - - save_ckpt: - | type: ``str``, optional, default: ``model.ckpt`` - | argument path: ``training/save_ckpt`` - - The file name of saving check point. - - .. raw:: html - - - disp_training: - | type: ``bool``, optional, default: ``True`` - | argument path: ``training/disp_training`` - - Displaying verbose information during training. - - .. raw:: html - - - time_training: - | type: ``bool``, optional, default: ``True`` - | argument path: ``training/time_training`` - - Timing durining training. - - .. raw:: html - - - profiling: - | type: ``bool``, optional, default: ``False`` - | argument path: ``training/profiling`` - - Profiling during training. - - .. 
raw:: html - - - profiling_file: - | type: ``str``, optional, default: ``timeline.json`` - | argument path: ``training/profiling_file`` - - Output file for profiling. - +Training parameters +====================================== +.. include:: train-input-auto.rst From 0f6bc0f2dea85d848db936b2a3abd8ec0c7ce3bf Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Sat, 21 Nov 2020 20:45:50 -0500 Subject: [PATCH 52/65] add requirements for docs --- doc/requirements.txt | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 doc/requirements.txt diff --git a/doc/requirements.txt b/doc/requirements.txt new file mode 100644 index 0000000000..120e5d601b --- /dev/null +++ b/doc/requirements.txt @@ -0,0 +1,4 @@ +.. +sphinx +recommonmark +sphinx_rtd_theme From f450c46973312590de341ab865dd06f74f9ae54d Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Sat, 21 Nov 2020 20:57:03 -0500 Subject: [PATCH 53/65] requirements.txt resolves relative path to cwd... --- doc/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/requirements.txt b/doc/requirements.txt index 120e5d601b..24ed3e2f96 100644 --- a/doc/requirements.txt +++ b/doc/requirements.txt @@ -1,4 +1,4 @@ -.. +. sphinx recommonmark sphinx_rtd_theme From 92682239d65a01acfb2cd717eeb60c1cd85b92bc Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Sat, 21 Nov 2020 21:06:54 -0500 Subject: [PATCH 54/65] install tf; move docs require to setup.py --- doc/requirements.txt | 5 +---- setup.py | 1 + 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/doc/requirements.txt b/doc/requirements.txt index 24ed3e2f96..1d39662bb4 100644 --- a/doc/requirements.txt +++ b/doc/requirements.txt @@ -1,4 +1 @@ -. -sphinx -recommonmark -sphinx_rtd_theme +.[docs,cpu] diff --git a/setup.py b/setup.py index e03d2dcf7f..885022d196 100644 --- a/setup.py +++ b/setup.py @@ -80,6 +80,7 @@ cmake_minimum_required_version='3.0', extras_require={ 'test': ['dpdata>=0.1.9'], + 'docs': ['sphinx', 'recommonmark', 'sphinx_rtd_theme'], **extras_require, }, entry_points={ From 6c68f8e81af84df789eef42086b212efb845f3e8 Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Sat, 21 Nov 2020 21:13:02 -0500 Subject: [PATCH 55/65] overrides Read the Doc's default conf https://stackoverflow.com/questions/56336234/build-fail-sphinx-error-contents-rst-not-found --- doc/conf.py | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/conf.py b/doc/conf.py index 790711aa8d..d2ade97acc 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -55,3 +55,4 @@ html_static_path = ['_static'] autodoc_default_flags = ['members'] autosummary_generate = True +master_doc = 'index' From d537243123de400c52b316578554ce6c70019b19 Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Sat, 21 Nov 2020 22:03:49 -0500 Subject: [PATCH 56/65] add overflow for pre --- .gitignore | 1 - doc/_static/css/custom.css | 3 +++ doc/conf.py | 2 ++ 3 files changed, 5 insertions(+), 1 deletion(-) create mode 100644 doc/_static/css/custom.css diff --git a/.gitignore b/.gitignore index 1fb9d5e0ba..b392cdbca5 100644 --- a/.gitignore +++ b/.gitignore @@ -25,5 +25,4 @@ _version.py venv* .vscode/** _build -_static _templates diff --git a/doc/_static/css/custom.css b/doc/_static/css/custom.css new file mode 100644 index 0000000000..120e099e0e --- /dev/null +++ b/doc/_static/css/custom.css @@ -0,0 +1,3 @@ +pre{ + overflow: auto; +} diff --git a/doc/conf.py b/doc/conf.py index d2ade97acc..b757da9771 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -53,6 +53,8 @@ # relative to this directory. 
They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". html_static_path = ['_static'] +html_css_files = ['css/custom.css'] + autodoc_default_flags = ['members'] autosummary_generate = True master_doc = 'index' From e90d33db1e82b150839d300678efe5e47c2f0632 Mon Sep 17 00:00:00 2001 From: marian-code Date: Tue, 24 Nov 2020 13:34:45 +0100 Subject: [PATCH 57/65] fix error in definition of numb_test possible type numb_test can be input as str, int, or list not only list --- source/train/argcheck.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/train/argcheck.py b/source/train/argcheck.py index 89034180ba..d87d41857c 100644 --- a/source/train/argcheck.py +++ b/source/train/argcheck.py @@ -321,7 +321,7 @@ def training_args(): Argument("seed", [int,None], optional = True, doc = doc_seed), Argument("disp_file", str, optional = True, default = 'lcueve.out', doc = doc_disp_file), Argument("disp_freq", int, optional = True, default = 1000, doc = doc_disp_freq), - Argument("numb_test", int, optional = True, default = 1, doc = doc_numb_test), + Argument("numb_test", [list,int,str], optional = True, default = 1, doc = doc_numb_test), Argument("save_freq", int, optional = True, default = 1000, doc = doc_save_freq), Argument("save_ckpt", str, optional = True, default = 'model.ckpt', doc = doc_save_ckpt), Argument("disp_training", bool, optional = True, default = True, doc = doc_disp_training), From 0b4101f951159f71427365d1b63857eb008c218e Mon Sep 17 00:00:00 2001 From: Han Wang Date: Sun, 29 Nov 2020 22:25:02 +0800 Subject: [PATCH 58/65] compatible to lammps stable_29Oct2020 --- source/lmp/pair_nnp.cpp | 4 +-- source/lmp/pppm_dplr.cpp | 75 ++++++++++++---------------------------- 2 files changed, 25 insertions(+), 54 deletions(-) diff --git a/source/lmp/pair_nnp.cpp b/source/lmp/pair_nnp.cpp index ba3dcd3286..e8cff008f5 100644 --- a/source/lmp/pair_nnp.cpp +++ b/source/lmp/pair_nnp.cpp @@ -816,8 +816,8 @@ void PairNNP::coeff(int narg, char **arg) ihi = n; jhi = n; if (narg == 2) { - force->bounds(FLERR,arg[0],atom->ntypes,ilo,ihi); - force->bounds(FLERR,arg[1],atom->ntypes,jlo,jhi); + utils::bounds(FLERR,arg[0],1,atom->ntypes,ilo,ihi,error); + utils::bounds(FLERR,arg[1],1,atom->ntypes,jlo,jhi,error); if (ilo != 1 || jlo != 1 || ihi != n || jhi != n) { error->all(FLERR,"deepmd requires that the scale should be set to all atom types, i.e. pair_coeff * *."); } diff --git a/source/lmp/pppm_dplr.cpp b/source/lmp/pppm_dplr.cpp index e5643e114f..da95f58c9d 100644 --- a/source/lmp/pppm_dplr.cpp +++ b/source/lmp/pppm_dplr.cpp @@ -1,6 +1,6 @@ /* ---------------------------------------------------------------------- LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - http://lammps.sandia.gov, Sandia National Laboratories + https://lammps.sandia.gov/, Sandia National Laboratories Steve Plimpton, sjplimp@sandia.gov Copyright (2003) Sandia Corporation. 
Under the terms of Contract @@ -38,6 +38,7 @@ enum{FORWARD_IK,FORWARD_AD,FORWARD_IK_PERATOM,FORWARD_AD_PERATOM}; #define ONEF 1.0 #endif + /* ---------------------------------------------------------------------- */ #ifdef OLD_LMP_PPPM @@ -68,7 +69,6 @@ void PPPMDPLR::init() fill(fele.begin(), fele.end(), 0.0); } - /* ---------------------------------------------------------------------- compute the PPPM long-range force, energy, virial ------------------------------------------------------------------------- */ @@ -80,15 +80,9 @@ void PPPMDPLR::compute(int eflag, int vflag) // set energy/virial flags // invoke allocate_peratom() if needed for first time - if (eflag || vflag) ev_setup(eflag,vflag); - else evflag = evflag_atom = eflag_global = vflag_global = - eflag_atom = vflag_atom = 0; + ev_init(eflag,vflag); - if (evflag_atom && !peratom_allocate_flag) { - allocate_peratom(); - cg_peratom->ghost_notify(); - cg_peratom->setup(); - } + if (evflag_atom && !peratom_allocate_flag) allocate_peratom(); // if atom count has changed, update qsum and qsqsum @@ -127,7 +121,8 @@ void PPPMDPLR::compute(int eflag, int vflag) // to fully sum contribution in their 3d bricks // remap from 3d decomposition to FFT decomposition - cg->reverse_comm(this,REVERSE_RHO); + gc->reverse_comm_kspace(this,1,sizeof(FFT_SCALAR),REVERSE_RHO, + gc_buf1,gc_buf2,MPI_FFT_SCALAR); brick2fft(); // compute potential gradient on my FFT grid and @@ -140,16 +135,22 @@ void PPPMDPLR::compute(int eflag, int vflag) // all procs communicate E-field values // to fill ghost cells surrounding their 3d bricks - if (differentiation_flag == 1) cg->forward_comm(this,FORWARD_AD); - else cg->forward_comm(this,FORWARD_IK); + if (differentiation_flag == 1) + gc->forward_comm_kspace(this,1,sizeof(FFT_SCALAR),FORWARD_AD, + gc_buf1,gc_buf2,MPI_FFT_SCALAR); + else + gc->forward_comm_kspace(this,3,sizeof(FFT_SCALAR),FORWARD_IK, + gc_buf1,gc_buf2,MPI_FFT_SCALAR); // extra per-atom energy/virial communication if (evflag_atom) { if (differentiation_flag == 1 && vflag_atom) - cg_peratom->forward_comm(this,FORWARD_AD_PERATOM); + gc->forward_comm_kspace(this,6,sizeof(FFT_SCALAR),FORWARD_AD_PERATOM, + gc_buf1,gc_buf2,MPI_FFT_SCALAR); else if (differentiation_flag == 0) - cg_peratom->forward_comm(this,FORWARD_IK_PERATOM); + gc->forward_comm_kspace(this,7,sizeof(FFT_SCALAR),FORWARD_IK_PERATOM, + gc_buf1,gc_buf2,MPI_FFT_SCALAR); } // calculate the force on my particles @@ -183,14 +184,6 @@ void PPPMDPLR::compute(int eflag, int vflag) MPI_Allreduce(virial,virial_all,6,MPI_DOUBLE,MPI_SUM,world); for (i = 0; i < 6; i++) virial[i] = 0.5*qscale*volume*virial_all[i]; } - // std::cout<< "energy in pppm -------------------" << std::endl; - // std::cout << energy << " " - // << std::endl; - // std::cout<< "virial in pppm -------------------" << std::endl; - // for (int ii = 0; ii < 6; ++ii){ - // std::cout << virial[ii] << " " ; - // } - // std::cout << std::endl; // per-atom energy/virial // energy includes self-energy correction @@ -227,10 +220,10 @@ void PPPMDPLR::compute(int eflag, int vflag) if (triclinic) domain->lamda2x(atom->nlocal); } - /* ---------------------------------------------------------------------- interpolate from grid to get electric field & force on my particles for ik ------------------------------------------------------------------------- */ + void PPPMDPLR::fieldforce_ik() { int i,l,m,n,nx,ny,nz,mx,my,mz; @@ -288,21 +281,6 @@ void PPPMDPLR::fieldforce_ik() fele[i*3+1] += qfactor*eky; if (slabflag != 2) fele[i*3+2] += qfactor*ekz; } - - // 
vector dcoord(nall*3), dbox(9); - // vector dtype(nall); - // { - // double ** xx = atom->x; - // for(int ii = 0; ii < nall; ++ii){ - // for (int dd = 0; dd < 3; +=dd){ - // dcoord[ii*3+dd] = xx[ii][dd]; - // } - // } - // int *type = atom->type; - // for (int ii = 0; ii < nall; ++ii){ - // dtype[ii] = type[ii] - 1; - // } - // } } /* ---------------------------------------------------------------------- @@ -335,11 +313,14 @@ void PPPMDPLR::fieldforce_ad() double *q = atom->q; double **x = atom->x; - double **f = atom->f; + // double **f = atom->f; int nlocal = atom->nlocal; + int nghost = atom->nghost; + int nall = nlocal + nghost; - vector fele(nlocal, 0.0); + fele.resize(nlocal*3); + fill(fele.begin(), fele.end(), 0.0); for (i = 0; i < nlocal; i++) { nx = part2grid[i][0]; @@ -369,7 +350,7 @@ void PPPMDPLR::fieldforce_ad() eky *= hy_inv; ekz *= hz_inv; - // convert E-field to force and substract self forces + // convert E-field to force and subtract self forces const double qfactor = qqrd2e * scale; @@ -392,15 +373,5 @@ void PPPMDPLR::fieldforce_ad() sf *= 2*q[i]*q[i]; if (slabflag != 2) fele[i*3+2] += qfactor*(ekz*q[i] - sf); } - - // for (int ii = 0; ii < nlocal; ++ii){ - // cout << ii << "\t "; - // for (int dd = 0; dd < 3; ++dd){ - // cout << fele[ii*3+dd] << " " ; - // } - // cout << endl; - // } } - - From 5165c2afa69df17c8d9e17d28f28bbac273a81aa Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Thu, 3 Dec 2020 01:21:08 -0500 Subject: [PATCH 59/65] add documentation link to readme --- README.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index cdec110a92..778277ffe9 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,7 @@ DeePMD-kit Manual ======== - +[![GitHub release](https://img.shields.io/github/release/deepmodeling/deepmd-kit.svg?maxAge=86400)](https://github.com/deepmodeling/deepmd-kit/releases) +[![Documentation Status](https://readthedocs.org/projects/deepmd/badge/?version=latest)](https://deepmd.readthedocs.io/en/latest/?badge=latest) # Table of contents - [About DeePMD-kit](#about-deepmd-kit) @@ -15,6 +16,8 @@ # About DeePMD-kit DeePMD-kit is a package written in Python/C++, designed to minimize the effort required to build deep learning based model of interatomic potential energy and force field and to perform molecular dynamics (MD). This brings new hopes to addressing the accuracy-versus-efficiency dilemma in molecular simulations. Applications of DeePMD-kit span from finite molecules to extended systems and from metallic systems to chemically bonded systems. +For more information, check the [documentation](https://deepmd.readthedocs.io/). + ## Highlighted features * **interfaced with TensorFlow**, one of the most popular deep learning frameworks, making the training process highly automatic and efficient. * **interfaced with high-performance classical MD and quantum (path-integral) MD packages**, i.e., LAMMPS and i-PI, respectively. From 3c3b7fb40f8a2184887ed0f462ab6124f4613534 Mon Sep 17 00:00:00 2001 From: Han Wang Date: Tue, 22 Dec 2020 12:56:39 +0800 Subject: [PATCH 60/65] bug fixing: not displacing input doc on github --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 778277ffe9..2ce48317fd 100644 --- a/README.md +++ b/README.md @@ -84,7 +84,7 @@ The typical procedure of using DeePMD-kit includes 5 steps A quick-start on using DeePMD-kit can be found [here](doc/use-deepmd-kit.md). 
-A full [document](doc/train-input.rst) on options in the training input script is available. +A full [document](doc/train-input-auto.rst) on options in the training input script is available. # Troubleshooting From 45d70baf85378a7c18777aab7b54d761b3317d22 Mon Sep 17 00:00:00 2001 From: Han Wang Date: Tue, 22 Dec 2020 12:56:39 +0800 Subject: [PATCH 61/65] bug fixing: not displaying input doc on github --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 778277ffe9..2ce48317fd 100644 --- a/README.md +++ b/README.md @@ -84,7 +84,7 @@ The typical procedure of using DeePMD-kit includes 5 steps A quick-start on using DeePMD-kit can be found [here](doc/use-deepmd-kit.md). -A full [document](doc/train-input.rst) on options in the training input script is available. +A full [document](doc/train-input-auto.rst) on options in the training input script is available. # Troubleshooting From b83d4c8fb37c6a5353db10493fb88bf98ed419e6 Mon Sep 17 00:00:00 2001 From: Han Wang Date: Fri, 1 Jan 2021 17:10:53 +0800 Subject: [PATCH 62/65] add doc for short-range tabulated interaction --- doc/train-input-auto.rst | 38 +++++++++++++++++++++++++++++++++++++- source/train/argcheck.py | 9 +++++++++ 2 files changed, 46 insertions(+), 1 deletion(-) diff --git a/doc/train-input-auto.rst b/doc/train-input-auto.rst index e29dd5799b..c7dbe1a9e0 100644 --- a/doc/train-input-auto.rst +++ b/doc/train-input-auto.rst @@ -23,6 +23,42 @@ model: The model determines the normalization from the statistics of the data. This key specifies the number of `frames` in each `system` used for statistics. + .. raw:: html + + + use_srtab: + | type: ``str``, optional + | argument path: ``model/use_srtab`` + + The table for the short-range pairwise interaction added on top of DP. The table is a text data file with (N_t + 1) * N_t / 2 + 1 columes. The first colume is the distance between atoms. The second to the last columes are energies for pairs of certain types. For example we have two atom types, 0 and 1. The columes from 2nd to 4th are for 0-0, 0-1 and 1-1 correspondingly. + + .. raw:: html + + + smin_alpha: + | type: ``float``, optional + | argument path: ``model/smin_alpha`` + + The short-range tabulated interaction will be swithed according to the distance of the nearest neighbor. This distance is calculated by softmin. This parameter is the decaying parameter in the softmin. It is only required when `use_srtab` is provided. + + .. raw:: html + + + sw_rmin: + | type: ``float``, optional + | argument path: ``model/sw_rmin`` + + The lower boundary of the interpolation between short-range tabulated interaction and DP. It is only required when `use_srtab` is provided. + + .. raw:: html + + + sw_rmax: + | type: ``float``, optional + | argument path: ``model/sw_rmax`` + + The upper boundary of the interpolation between short-range tabulated interaction and DP. It is only required when `use_srtab` is provided. + .. raw:: html @@ -962,7 +998,7 @@ training: numb_test: - | type: ``int``, optional, default: ``1`` + | type: ``int`` | ``list`` | ``str``, optional, default: ``1`` | argument path: ``training/numb_test`` Number of frames used for the test during training. diff --git a/source/train/argcheck.py b/source/train/argcheck.py index d87d41857c..032d71ad17 100644 --- a/source/train/argcheck.py +++ b/source/train/argcheck.py @@ -223,9 +223,18 @@ def model_args (): doc_data_stat_nbatch = 'The model determines the normalization from the statistics of the data. 
This key specifies the number of `frames` in each `system` used for statistics.' doc_descrpt = 'The descriptor of atomic environment.' doc_fitting = 'The fitting of physical properties.' + doc_use_srtab = 'The table for the short-range pairwise interaction added on top of DP. The table is a text data file with (N_t + 1) * N_t / 2 + 1 columns. The first column is the distance between atoms. The second to the last columns are energies for pairs of certain types. For example, if we have two atom types 0 and 1, the columns from the 2nd to the 4th are for 0-0, 0-1 and 1-1 correspondingly.' + doc_smin_alpha = 'The short-range tabulated interaction will be switched according to the distance of the nearest neighbor. This distance is calculated by softmin. This parameter is the decaying parameter in the softmin. It is only required when `use_srtab` is provided.' + doc_sw_rmin = 'The lower boundary of the interpolation between short-range tabulated interaction and DP. It is only required when `use_srtab` is provided.' + doc_sw_rmax = 'The upper boundary of the interpolation between short-range tabulated interaction and DP. It is only required when `use_srtab` is provided.' + + ca = Argument("model", dict, [Argument("type_map", list, optional = True, doc = doc_type_map), Argument("data_stat_nbatch", int, optional = True, default = 10, doc = doc_data_stat_nbatch), + Argument("use_srtab", str, optional = True, doc = doc_use_srtab), + Argument("smin_alpha", float, optional = True, doc = doc_smin_alpha), + Argument("sw_rmin", float, optional = True, doc = doc_sw_rmin), + Argument("sw_rmax", float, optional = True, doc = doc_sw_rmax), Argument("descriptor", dict, [], [descrpt_variant_type_args()], doc = doc_descrpt), Argument("fitting_net", dict, [], [fitting_variant_type_args()], doc = doc_fitting) ]) From 468b5fe25635dcc1f0e5f5fbe76eb344efbf8267 Mon Sep 17 00:00:00 2001 From: Han Wang Date: Tue, 12 Jan 2021 10:07:38 +0800 Subject: [PATCH 63/65] fix bug of compulsory key `loss` --- source/train/argcheck.py | 1 + 1 file changed, 1 insertion(+) diff --git a/source/train/argcheck.py b/source/train/argcheck.py index 032d71ad17..c358c114ff 100644 --- a/source/train/argcheck.py +++ b/source/train/argcheck.py @@ -299,6 +299,7 @@ def loss_args(): doc_loss = 'The definition of loss function. The type of the loss depends on the type of the fitting. For fitting type `ener`, the prefactors before energy, force, virial and atomic energy losses may be provided. For fitting type `dipole`, `polar` and `global_polar`, the loss may be an empty `dict` or unset.'
ca = Argument('loss', dict, [], [loss_variant_type_args()], + optional = True, doc = doc_loss) return ca From e6a3ce6be198338f96af6e0142921e9ced9d7e22 Mon Sep 17 00:00:00 2001 From: Han Wang Date: Tue, 12 Jan 2021 10:39:02 +0800 Subject: [PATCH 64/65] fix bug of compulsory label requirement --- source/train/Loss.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/source/train/Loss.py b/source/train/Loss.py index 1f336325a3..68fd8dd660 100644 --- a/source/train/Loss.py +++ b/source/train/Loss.py @@ -38,11 +38,11 @@ def __init__ (self, jdata, **kwarg) : self.has_ae = (self.start_pref_ae != 0 or self.limit_pref_ae != 0) self.has_pf = (self.start_pref_pf != 0 or self.limit_pref_pf != 0) # data required - add_data_requirement('energy', 1, atomic=False, must=self.has_e, high_prec=True) - add_data_requirement('force', 3, atomic=True, must=self.has_f, high_prec=False) - add_data_requirement('virial', 9, atomic=False, must=self.has_v, high_prec=False) - add_data_requirement('atom_ener', 1, atomic=True, must=self.has_ae, high_prec=False) - add_data_requirement('atom_pref', 1, atomic=True, must=self.has_pf, high_prec=False, repeat=3) + add_data_requirement('energy', 1, atomic=False, must=False, high_prec=True) + add_data_requirement('force', 3, atomic=True, must=False, high_prec=False) + add_data_requirement('virial', 9, atomic=False, must=False, high_prec=False) + add_data_requirement('atom_ener', 1, atomic=True, must=False, high_prec=False) + add_data_requirement('atom_pref', 1, atomic=True, must=False, high_prec=False, repeat=3) def build (self, learning_rate, From 2b21c22367a6ebf9417d2fadb0b7425eac820c26 Mon Sep 17 00:00:00 2001 From: Han Wang Date: Tue, 12 Jan 2021 11:41:03 +0800 Subject: [PATCH 65/65] add notice for the consistency of the tf version --- doc/install.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/doc/install.md b/doc/install.md index 5e00d3275a..05a0d3bdb0 100644 --- a/doc/install.md +++ b/doc/install.md @@ -63,7 +63,8 @@ source $tensorflow_venv/bin/activate pip install --upgrade pip pip install --upgrade tensorflow==2.3.0 ``` -It is notice that everytime a new shell is started and one wants to use `DeePMD-kit`, the virtual environment should be activated by +It is highly recommended to keep the TensorFlow version consistent between the Python and C++ interfaces. +Every time a new shell is started and one wants to use `DeePMD-kit`, the virtual environment should be activated by ```bash source $tensorflow_venv/bin/activate ```
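A note on how the `loss[ener]` prefactors and the `learning_rate` options documented in the patches above interact: each prefactor is typically interpolated between its `start_pref_*` and `limit_pref_*` value in proportion to the current learning rate, which itself decays exponentially every `decay_steps` training steps. The sketch below only illustrates that scheme and is not the exact code in `source/train/Loss.py`; the helper names `lr_at_step` and `pref_at_step`, the default values, and the interpolation formula are assumptions for demonstration.

```python
# Minimal sketch (assumed behaviour, for illustration only; not the DeePMD-kit source).

def lr_at_step(step, start_lr=0.001, stop_lr=1e-8, stop_batch=1000000, decay_steps=5000):
    """Hypothetical helper: exponential decay from start_lr towards stop_lr."""
    decay_rate = (stop_lr / start_lr) ** (decay_steps / stop_batch)
    return start_lr * decay_rate ** (step // decay_steps)

def pref_at_step(step, start_pref, limit_pref, start_lr=0.001, **lr_kwargs):
    """Hypothetical helper: loss prefactor interpolated by the current learning rate."""
    lr = lr_at_step(step, start_lr=start_lr, **lr_kwargs)
    return limit_pref + (start_pref - limit_pref) * lr / start_lr

# Example: with start_pref_f=1000 and limit_pref_f=1.0, the force prefactor starts
# near 1000 and approaches 1.0 as the learning rate decays towards stop_lr.
print(pref_at_step(0, start_pref=1000, limit_pref=1.0))        # ~1000
print(pref_at_step(1000000, start_pref=1000, limit_pref=1.0))  # ~1.0
```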