diff --git a/.gitignore b/.gitignore
index c41f53e077..b392cdbca5 100644
--- a/.gitignore
+++ b/.gitignore
@@ -23,3 +23,6 @@ dist
.eggs
_version.py
venv*
+.vscode/**
+_build
+_templates
diff --git a/.travis.yml b/.travis.yml
index dd4b0eed9e..b434af6819 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -66,34 +66,30 @@ jobs:
env:
- CC=gcc-5
- CXX=g++-5
- - TENSORFLOW_VERSION=2.1
+ - TENSORFLOW_VERSION=2.3
- python: 3.7
env:
- CC=gcc-8
- CXX=g++-8
- - TENSORFLOW_VERSION=2.1
+ - TENSORFLOW_VERSION=2.3
- stage: build whls
services: docker
env:
- TWINE_USERNAME=__token__
- CIBW_BUILD="cp36-* cp37-*"
- - CIBW_BEFORE_BUILD="pip install tensorflow && sed -i 's/libresolv.so.2\"/libresolv.so.2\", \"libtensorflow_framework.so.2\"/g' \$(find / -name policy.json)"
+ - CIBW_BEFORE_BUILD="sed -i 's/libresolv.so.2\"/libresolv.so.2\", \"libtensorflow_framework.so.2\"/g' \$(find / -name policy.json)"
- CIBW_SKIP="*-win32 *-manylinux_i686"
- CC=gcc-7
- CXX=g++-7
- - TENSORFLOW_VERSION=2.1
+ - TENSORFLOW_VERSION=2.3
install:
- - python -m pip install twine cibuildwheel==1.1.0 scikit-build setuptools_scm
+ - python -m pip install twine cibuildwheel==1.6.3 scikit-build setuptools_scm
script:
- python -m cibuildwheel --output-dir wheelhouse
- python setup.py sdist
after_success:
- if [[ $TRAVIS_TAG ]]; then python -m twine upload wheelhouse/*; python -m twine upload dist/*.tar.gz; fi
-before_install:
- #- pip install --upgrade pip
- - pip install --upgrade setuptools
- - pip install tensorflow==$TENSORFLOW_VERSION
install:
- - pip install --verbose .[test]
+ - pip install .[cpu,test]
script:
- cd source/tests && python -m unittest
diff --git a/README.md b/README.md
index 5520daa57f..2ce48317fd 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,7 @@
DeePMD-kit Manual
========
-
+[![GitHub release](https://img.shields.io/github/release/deepmodeling/deepmd-kit.svg?maxAge=86400)](https://github.com/deepmodeling/deepmd-kit/releases)
+[![Documentation Status](https://readthedocs.org/projects/deepmd/badge/?version=latest)](https://deepmd.readthedocs.io/en/latest/?badge=latest)
# Table of contents
- [About DeePMD-kit](#about-deepmd-kit)
@@ -9,35 +10,14 @@
- [License and credits](#license-and-credits)
- [Deep Potential in a nutshell](#deep-potential-in-a-nutshell)
- [Download and install](#download-and-install)
- - [Easy installation methods](#easy-installation-methods)
- - [Offline packages](#offline-packages)
- - [With Docker](#with-docker)
- - [With conda](#with-conda)
- - [Install the python interaction](#install-the-python-interface)
- - [Install the Tensorflow's python interface](#install-the-tensorflows-python-interface)
- - [Install the DeePMD-kit's python interface](#install-the-deepmd-kits-python-interface)
- - [Install the C++ interface](#install-the-c-interface)
- - [Install the Tensorflow's C++ interface](#install-the-tensorflows-c-interface)
- - [Install the DeePMD-kit's C++ interface](#install-the-deepmd-kits-c-interface)
- - [Install LAMMPS's DeePMD-kit module](#install-lammpss-deepmd-kit-module)
- [Use DeePMD-kit](#use-deepmd-kit)
- - [Prepare data](#prepare-data)
- - [Train a model](#train-a-model)
- - [The DeePMD model](#the-deepmd-model)
- - [The DeepPot-SE model](#the-deeppot-se-model)
- - [Freeze a model](#freeze-a-model)
- - [Test a model](#test-a-model)
- - [Model inference](#model-inference)
- - [Run MD with Lammps](#run-md-with-lammps)
- - [Include deepmd in the pair style](#include-deepmd-in-the-pair-style)
- - [Long-range interaction](#long-range-interaction)
- - [Run path-integral MD with i-PI](#run-path-integral-md-with-i-pi)
- - [Use deep potential with ASE](#use-deep-potential-with-ase)
- [Troubleshooting](#troubleshooting)
# About DeePMD-kit
DeePMD-kit is a package written in Python/C++, designed to minimize the effort required to build deep learning based model of interatomic potential energy and force field and to perform molecular dynamics (MD). This brings new hopes to addressing the accuracy-versus-efficiency dilemma in molecular simulations. Applications of DeePMD-kit span from finite molecules to extended systems and from metallic systems to chemically bonded systems.
+For more information, check the [documentation](https://deepmd.readthedocs.io/).
+
## Highlighted features
* **interfaced with TensorFlow**, one of the most popular deep learning frameworks, making the training process highly automatic and efficient.
* **interfaced with high-performance classical MD and quantum (path-integral) MD packages**, i.e., LAMMPS and i-PI, respectively.
@@ -85,526 +65,27 @@ In addition to building up potential energy models, DeePMD-kit can also be used
# Download and install
-Please follow our [github](https://github.com/deepmodeling/deepmd-kit) webpage to see the latest released version and development version.
-
-## Easy installation methods
-There various easy methods to install DeePMD-kit. Choose one that you prefer. If you want to build by yourself, jump to the next two sections.
-
-After your easy installation, DeePMD-kit (`dp`) and LAMMPS (`lmp`) will be available to execute. You can try `dp -h` and `lmp -h` to see the help. `mpirun` is also available considering you may want to run LAMMPS in parallel.
-
-### Offline packages
-Both CPU and GPU version offline packages are avaiable in [the Releases page](https://github.com/deepmodeling/deepmd-kit/releases).
-
-### With conda
-DeePMD-kit is avaiable with [conda](https://github.com/conda/conda). Install [Anaconda](https://www.anaconda.com/distribution/#download-section) or [Miniconda](https://docs.conda.io/en/latest/miniconda.html) first.
-
-To install the CPU version:
-```bash
-conda install deepmd-kit=*=*cpu lammps-dp=*=*cpu -c deepmodeling
-```
-
-To install the GPU version containing [CUDA 10.1](https://docs.nvidia.com/deploy/cuda-compatibility/index.html#binary-compatibility__table-toolkit-driver):
-```bash
-conda install deepmd-kit=*=*gpu lammps-dp=*=*gpu -c deepmodeling
-```
-
-### With Docker
-A docker for installing the DeePMD-kit is available [here](https://github.com/orgs/deepmodeling/packages/container/deepmd-kit).
-
-To pull the CPU version:
-```bash
-docker pull ghcr.io/deepmodeling/deepmd-kit:1.2.0_cpu
-```
-
-To pull the GPU version:
-```bash
-docker pull ghcr.io/deepmodeling/deepmd-kit:1.2.0_cuda10.1_gpu
-```
-
-## Install the python interface
-### Install the Tensorflow's python interface
-First, check the python version on your machine
-```bash
-python --version
-```
-
-We follow the virtual environment approach to install the tensorflow's Python interface. The full instruction can be found on [the tensorflow's official website](https://www.tensorflow.org/install/pip). Now we assume that the Python interface will be installed to virtual environment directory `$tensorflow_venv`
-```bash
-virtualenv -p python3 $tensorflow_venv
-source $tensorflow_venv/bin/activate
-pip install --upgrade pip
-pip install --upgrade tensorflow==2.1.0
-```
-It is notice that everytime a new shell is started and one wants to use `DeePMD-kit`, the virtual environment should be activated by
-```bash
-source $tensorflow_venv/bin/activate
-```
-if one wants to skip out of the virtual environment, he/she can do
-```bash
-deactivate
-```
-If one has multiple python interpreters named like python3.x, it can be specified by, for example
-```bash
-virtualenv -p python3.7 $tensorflow_venv
-```
-If one does not need the GPU support of deepmd-kit and is concerned about package size, the CPU-only version of tensorflow should be installed by
-```bash
-pip install --upgrade tensorflow-cpu==2.1.0
-```
-To verify the installation, run
-```bash
-python -c "import tensorflow as tf;print(tf.reduce_sum(tf.random.normal([1000, 1000])))"
-```
-One should remember to activate the virtual environment every time he/she uses deepmd-kit.
-
-### Install the DeePMD-kit's python interface
-
-Execute
-```bash
-pip install deepmd-kit
-```
-To test the installation, one may execute
-```bash
-dp -h
-```
-It will print the help information like
-```text
-usage: dp [-h] {train,freeze,test} ...
-
-DeePMD-kit: A deep learning package for many-body potential energy
-representation and molecular dynamics
-
-optional arguments:
- -h, --help show this help message and exit
-
-Valid subcommands:
- {train,freeze,test}
- train train a model
- freeze freeze the model
- test test the model
-```
-
-## Install the C++ interface
-
-If one does not need to use DeePMD-kit with Lammps or I-Pi, then the python interface installed in the previous section does everything and he/she can safely skip this section.
-
-### Install the Tensorflow's C++ interface
-
-Check the compiler version on your machine
-
-```
-gcc --version
-```
-
-The C++ interface of DeePMD-kit was tested with compiler gcc >= 4.8. It is noticed that the I-Pi support is only compiled with gcc >= 4.9.
-
-First the C++ interface of Tensorflow should be installed. It is noted that the version of Tensorflow should be in consistent with the python interface. We assume that you have followed our instruction and installed tensorflow python interface 1.14.0 with, then you may follow [the instruction for CPU](doc/install-tf.1.14.md) to install the corresponding C++ interface (CPU only). If one wants GPU supports, he/she should follow [the instruction for GPU](doc/install-tf.1.14-gpu.md) to install the C++ interface.
-
-### Install the DeePMD-kit's C++ interface
-
-Clone the DeePMD-kit source code
-```bash
-cd /some/workspace
-git clone --recursive https://github.com/deepmodeling/deepmd-kit.git deepmd-kit
-```
-
-For convenience, you may want to record the location of source to a variable, saying `deepmd_source_dir` by
-```bash
-cd deepmd-kit
-deepmd_source_dir=`pwd`
-```
-
-Now goto the source code directory of DeePMD-kit and make a build place.
-```bash
-cd $deepmd_source_dir/source
-mkdir build
-cd build
-```
-I assume you want to install DeePMD-kit into path `$deepmd_root`, then execute cmake
-```bash
-cmake -DTENSORFLOW_ROOT=$tensorflow_root -DCMAKE_INSTALL_PREFIX=$deepmd_root ..
-```
-where the variable `tensorflow_root` stores the location where the tensorflow's C++ interface is installed. The DeePMD-kit will automatically detect if a CUDA tool-kit is available on your machine and build the GPU support accordingly. If you want to force the cmake to find CUDA tool-kit, you can speicify the key `USE_CUDA_TOOLKIT`,
-```bash
-cmake -DUSE_CUDA_TOOLKIT=true -DTENSORFLOW_ROOT=$tensorflow_root -DCMAKE_INSTALL_PREFIX=$deepmd_root ..
-```
-and you may further asked to provide `CUDA_TOOLKIT_ROOT_DIR`. If the cmake has executed successfully, then
-```bash
-make
-make install
-```
-If everything works fine, you will have the following executable and libraries installed in `$deepmd_root/bin` and `$deepmd_root/lib`
-```bash
-$ ls $deepmd_root/bin
-dp_ipi
-$ ls $deepmd_root/lib
-libdeepmd_ipi.so libdeepmd_op.so libdeepmd.so
-```
+Please follow our [github](https://github.com/deepmodeling/deepmd-kit) webpage to download the [latest released version](https://github.com/deepmodeling/deepmd-kit/tree/master) and [development version](https://github.com/deepmodeling/deepmd-kit/tree/devel).
-### Install LAMMPS's DeePMD-kit module
-DeePMD-kit provide module for running MD simulation with LAMMPS. Now make the DeePMD-kit module for LAMMPS.
-```bash
-cd $deepmd_source_dir/source/build
-make lammps
-```
-DeePMD-kit will generate a module called `USER-DEEPMD` in the `build` directory. Now download your favorite LAMMPS code, and uncompress it (I assume that you have downloaded the tar `lammps-stable.tar.gz`)
-```bash
-cd /some/workspace
-tar xf lammps-stable.tar.gz
-```
-The source code of LAMMPS is stored in directory, for example `lammps-31Mar17`. Now go into the LAMMPS code and copy the DeePMD-kit module like this
-```bash
-cd lammps-31Mar17/src/
-cp -r $deepmd_source_dir/source/build/USER-DEEPMD .
-```
-Now build LAMMPS
-```bash
-make yes-user-deepmd
-make mpi -j4
-```
-The option `-j4` means using 4 processes in parallel. You may want to use a different number according to your hardware.
+DeePMD-kit offers multiple installation methods. It is recommend using easily methods like [offline packages](doc/install.md#offline-packages), [conda](doc/install.md#with-conda) and [docker](doc/install.md#with-docker).
-If everything works fine, you will end up with an executable `lmp_mpi`.
+One may manually install DeePMD-kit by following the instuctions on [installing the python interface](doc/install.md#install-the-python-interface) and [installing the C++ interface](doc/install.md#install-the-c-interface). The C++ interface is necessary when using DeePMD-kit with LAMMPS and i-PI.
-The DeePMD-kit module can be removed from LAMMPS source code by
-```bash
-make no-user-deepmd
-```
# Use DeePMD-kit
-In this text, we will call the deep neural network that is used to represent the interatomic interactions (Deep Potential) the **model**. The typical procedure of using DeePMD-kit is
-
-1. Prepare data
-2. Train a model
-3. Freeze the model
-4. MD runs with the model (Native MD code or LAMMPS)
-
-## Prepare data
-One needs to provide the following information to train a model: the atom type, the simulation box, the atom coordinate, the atom force, system energy and virial. A snapshot of a system that contains these information is called a **frame**. We use the following convention of units:
-
-Property| Unit
---- | :---:
-Time | ps
-Length | Å
-Energy | eV
-Force | eV/Å
-Pressure| Bar
-
-The frames of the system are stored in two formats. A raw file is a plain text file with each information item written in one file and one frame written on one line. The default files that provide box, coordinate, force, energy and virial are `box.raw`, `coord.raw`, `force.raw`, `energy.raw` and `virial.raw`, respectively. *We recommend you use these file names*. Here is an example of force.raw:
-```bash
-$ cat force.raw
--0.724 2.039 -0.951 0.841 -0.464 0.363
- 6.737 1.554 -5.587 -2.803 0.062 2.222
--1.968 -0.163 1.020 -0.225 -0.789 0.343
-```
-This `force.raw` contains 3 frames with each frame having the forces of 2 atoms, thus it has 3 lines and 6 columns. Each line provides all the 3 force components of 2 atoms in 1 frame. The first three numbers are the 3 force components of the first atom, while the second three numbers are the 3 force components of the second atom. The coordinate file `coord.raw` is organized similarly. In `box.raw`, the 9 components of the box vectors should be provided on each line. In `virial.raw`, the 9 components of the virial tensor should be provided on each line. The number of lines of all raw files should be identical.
-
-We assume that the atom types do not change in all frames. It is provided by `type.raw`, which has one line with the types of atoms written one by one. The atom types should be integers. For example the `type.raw` of a system that has 2 atoms with 0 and 1:
-```bash
-$ cat type.raw
-0 1
-```
-
-The second format is the data sets of `numpy` binary data that are directly used by the training program. User can use the script `$deepmd_source_dir/data/raw/raw_to_set.sh` to convert the prepared raw files to data sets. For example, if we have a raw file that contains 6000 frames,
-```bash
-$ ls
-box.raw coord.raw energy.raw force.raw type.raw virial.raw
-$ $deepmd_source_dir/data/raw/raw_to_set.sh 2000
-nframe is 6000
-nline per set is 2000
-will make 3 sets
-making set 0 ...
-making set 1 ...
-making set 2 ...
-$ ls
-box.raw coord.raw energy.raw force.raw set.000 set.001 set.002 type.raw virial.raw
-```
-It generates three sets `set.000`, `set.001` and `set.002`, with each set contains 2000 frames. The last set (`set.002`) is used as testing set, while the rest sets (`set.000` and `set.001`) are used as training sets. One do not need to take care of the binary data files in each of the `set.*` directories. The path containing `set.*` and `type.raw` is called a *system*.
-
-## Train a model
-
-### Write the input script
-
-The method of training is explained in our [DeePMD][2] and [DeepPot-SE][3] papers. With the source code we provide a small training dataset taken from 400 frames generated by NVT ab-initio water MD trajectory with 300 frames for training and 100 for testing. [An example training parameter file](./examples/water/train/water_se_a.json) is provided. One can try with the training by
-```bash
-$ cd $deepmd_source_dir/examples/water/train/
-$ dp train water_se_a.json
-```
-where `water_se_a.json` is the `json` format parameter file that controls the training. The components of the `water.json` contains three parts, `model`, `learning_rate`, `loss` and `training`.
-
-The `model` section specify how the deep potential model is built. An example of the smooth-edition is provided as follows
-```json
- "model": {
- "type_map": ["O", "H"],
- "descriptor" :{
- "type": "se_a",
- "rcut_smth": 5.80,
- "rcut": 6.00,
- "sel": [46, 92],
- "neuron": [25, 50, 100],
- "axis_neuron": 16,
- "resnet_dt": false,
- "seed": 1,
- "_comment": " that's all"
- },
- "fitting_net" : {
- "neuron": [240, 240, 240],
- "resnet_dt": true,
- "seed": 1,
- "_comment": " that's all"
- },
- "_comment": " that's all"
- }
-```
-The **`type_map`** is optional, which provide the element names (but not restricted to) for corresponding atom types.
-
-The construction of the descriptor is given by option **`descriptor`**. The **`type`** of the descriptor is set to `"se_a"`, which means smooth-edition, angular infomation. The **`rcut`** is the cut-off radius for neighbor searching, and the **`rcut_smth`** gives where the smoothing starts. **`sel`** gives the maximum possible number of neighbors in the cut-off radius. It is a list, the length of which is the same as the number of atom types in the system, and `sel[i]` denote the maximum possible number of neighbors with type `i`. The **`neuron`** specifies the size of the embedding net. From left to right the members denote the sizes of each hidden layers from input end to the output end, respectively. The **`axis_neuron`** specifies the size of submatrix of the embedding matrix, the axis matrix as explained in the [DeepPot-SE paper][3]. If the outer layer is of twice size as the inner layer, then the inner layer is copied and concatenated, then a [ResNet architecture](https://arxiv.org/abs/1512.03385) is build between them. If the option **`resnet_dt`** is set `true`, then a timestep is used in the ResNet. **`seed`** gives the random seed that is used to generate random numbers when initializing the model parameters.
-
-The construction of the fitting net is give by **`fitting_net`**. The key **`neuron`** specifies the size of the fitting net. If two neighboring layers are of the same size, then a [ResNet architecture](https://arxiv.org/abs/1512.03385) is build between them. If the option **`resnet_dt`** is set `true`, then a timestep is used in the ResNet. **`seed`** gives the random seed that is used to generate random numbers when initializing the model parameters.
-
-An example of the `learning_rate` is given as follows
-```json
- "learning_rate" :{
- "type": "exp",
- "start_lr": 0.005,
- "decay_steps": 5000,
- "decay_rate": 0.95,
- "_comment": "that's all"
- }
-```
-The option **`start_lr`**, **`decay_rate`** and **`decay_steps`** specify how the learning rate changes. For example, the `t`th batch will be trained with learning rate:
-```math
-lr(t) = start_lr * decay_rate ^ ( t / decay_steps )
-```
-
-An example of the `loss` is
-```json
- "loss" : {
- "start_pref_e": 0.02,
- "limit_pref_e": 1,
- "start_pref_f": 1000,
- "limit_pref_f": 1,
- "start_pref_v": 0,
- "limit_pref_v": 0,
- "_comment": " that's all"
- }
-```
-The options **`start_pref_e`**, **`limit_pref_e`**, **`start_pref_f`**, **`limit_pref_f`**, **`start_pref_v`** and **`limit_pref_v`** determine how the prefactors of energy error, force error and virial error changes in the loss function (see the appendix of the [DeePMD paper][2] for details). Taking the prefactor of force error for example, the prefactor at batch `t` is
-```math
-w_f(t) = start_pref_f * ( lr(t) / start_lr ) + limit_pref_f * ( 1 - lr(t) / start_lr )
-```
-Since we do not have virial data, the virial prefactors `start_pref_v` and `limit_pref_v` are set to 0.
-
-An example of `training` is
-```json
- "training" : {
- "systems": ["../data/"],
- "set_prefix": "set",
- "stop_batch": 1000000,
- "batch_size": 1,
-
- "seed": 1,
-
- "_comment": " display and restart",
- "_comment": " frequencies counted in batch",
- "disp_file": "lcurve.out",
- "disp_freq": 100,
- "numb_test": 10,
- "save_freq": 1000,
- "save_ckpt": "model.ckpt",
- "load_ckpt": "model.ckpt",
- "disp_training":true,
- "time_training":true,
- "profiling": false,
- "profiling_file":"timeline.json",
- "_comment": "that's all"
- }
-```
-The option **`systems`** provide location of the systems (path to `set.*` and `type.raw`). It is a vector, thus DeePMD-kit allows you to provide multiple systems. DeePMD-kit will train the model with the systems in the vector one by one in a cyclic manner. **It is warned that the example water data (in folder `examples/data/water`) is of very limited amount, is provided only for testing purpose, and should not be used to train a productive model.**
-
-The option **`batch_size`** specifies the number of frames in each batch. It can be set to `"auto"` to enable a automatic batch size.
-The option **`stop_batch`** specifies the total number of batches will be used in the training.
-
-
-### Training
-
-The training can be invoked by
-```bash
-$ dp train water_se_a.json
-```
-
-During the training, the error of the model is tested every **`disp_freq`** batches with **`numb_test`** frames from the last set in the **`systems`** directory on the fly, and the results are output to **`disp_file`**. A typical `disp_file` looks like
-```bash
-# batch l2_tst l2_trn l2_e_tst l2_e_trn l2_f_tst l2_f_trn lr
- 0 2.67e+01 2.57e+01 2.21e-01 2.22e-01 8.44e-01 8.12e-01 1.0e-03
- 100 6.14e+00 5.40e+00 3.01e-01 2.99e-01 1.93e-01 1.70e-01 1.0e-03
- 200 5.02e+00 4.49e+00 1.53e-01 1.53e-01 1.58e-01 1.42e-01 1.0e-03
- 300 4.36e+00 3.71e+00 7.32e-02 7.27e-02 1.38e-01 1.17e-01 1.0e-03
- 400 4.04e+00 3.29e+00 3.16e-02 3.22e-02 1.28e-01 1.04e-01 1.0e-03
-```
-The first column displays the number of batches. The second and third columns display the loss function evaluated by `numb_test` frames randomly chosen from the test set and that evaluated by the current training batch, respectively. The fourth and fifth columns display the RMS energy error (normalized by number of atoms) evaluated by `numb_test` frames randomly chosen from the test set and that evaluated by the current training batch, respectively. The sixth and seventh columns display the RMS force error (component-wise) evaluated by `numb_test` frames randomly chosen from the test set and that evaluated by the current training batch, respectively. The last column displays the current learning rate.
-
-Checkpoints will be written to files with prefix **`save_ckpt`** every **`save_freq`** batches. If **`restart`** is set to `true`, then the training will start from the checkpoint named **`load_ckpt`**, rather than from scratch.
-Several command line options can be passed to `dp train`, which can be checked with
-```bash
-$ dp train --help
-```
-An explanation will be provided
-```
-positional arguments:
- INPUT the input json database
-
-optional arguments:
- -h, --help show this help message and exit
- --init-model INIT_MODEL
- Initialize a model by the provided checkpoint
- --restart RESTART Restart the training from the provided checkpoint
-```
-The keys `intra_op_parallelism_threads` and `inter_op_parallelism_threads` are Tensorflow configurations for multithreading, which are explained [here](https://www.tensorflow.org/performance/performance_guide#optimizing_for_cpu). Skipping `-t` and `OMP_NUM_THREADS` leads to the default setting of these keys in the Tensorflow.
-
-**`--init-model model.ckpt`**, for example, initializes the model training with an existing model that is stored in the checkpoint `model.ckpt`, the network architectures should match.
-
-**`--restart model.ckpt`**, continues the training from the checkpoint `model.ckpt`.
-
-On some resources limited machines, one may want to control the number of threads used by DeePMD-kit. This is achieved by three environmental variables: `OMP_NUM_THREADS`, `TF_INTRA_OP_PARALLELISM_THREADS` and `TF_INTER_OP_PARALLELISM_THREADS`. `OMP_NUM_THREADS` controls the multithreading of DeePMD-kit implemented operations. `TF_INTRA_OP_PARALLELISM_THREADS` and `TF_INTER_OP_PARALLELISM_THREADS` controls `intra_op_parallelism_threads` and `inter_op_parallelism_threads`, which are Tensorflow configurations for multithreading. An explanation is found [here](https://stackoverflow.com/questions/41233635/meaning-of-inter-op-parallelism-threads-and-intra-op-parallelism-threads).
-
-For example if you wish to use 3 cores of 2 CPUs on one node, you may set the environmental variables and run DeePMD-kit as follows:
-```bash
-export OMP_NUM_THREADS=6
-export TF_INTRA_OP_PARALLELISM_THREADS=3
-export TF_INTER_OP_PARALLELISM_THREADS=2
-dp train input.json
-```
-
-## Freeze a model
-
-The trained neural network is extracted from a checkpoint and dumped into a database. This process is called "freezing" a model. The idea and part of our code are from [Morgan](https://blog.metaflow.fr/tensorflow-how-to-freeze-a-model-and-serve-it-with-a-python-api-d4f3596b3adc). To freeze a model, typically one does
-```bash
-$ dp freeze -o graph.pb
-```
-in the folder where the model is trained. The output database is called `graph.pb`.
-
-
-## Test a model
-
-The frozen model can be used in many ways. The most straightforward test can be performed using `dp test`. A typical usage of `dp test` is
-```bash
-dp test -m graph.pb -s /path/to/system -n 30
-```
-where `-m` gives the tested model, `-s` the path to the tested system and `-n` the number of tested frames. Several other command line options can be passed to `dp test`, which can be checked with
-```bash
-$ dp test --help
-```
-An explanation will be provided
-```
-usage: dp test [-h] [-m MODEL] [-s SYSTEM] [-S SET_PREFIX] [-n NUMB_TEST]
- [-r RAND_SEED] [--shuffle-test] [-d DETAIL_FILE]
-
-optional arguments:
- -h, --help show this help message and exit
- -m MODEL, --model MODEL
- Frozen model file to import
- -s SYSTEM, --system SYSTEM
- The system dir
- -S SET_PREFIX, --set-prefix SET_PREFIX
- The set prefix
- -n NUMB_TEST, --numb-test NUMB_TEST
- The number of data for test
- -r RAND_SEED, --rand-seed RAND_SEED
- The random seed
- --shuffle-test Shuffle test data
- -d DETAIL_FILE, --detail-file DETAIL_FILE
- The file containing details of energy force and virial
- accuracy
-```
-
-## Model inference
-One may use the python interface of DeePMD-kit for model inference, an example is given as follows
-```python
-import deepmd.DeepPot as DP
-import numpy as np
-dp = DP('graph.pb')
-coord = np.array([[1,0,0], [0,0,1.5], [1,0,3]]).reshape([1, -1])
-cell = np.diag(10 * np.ones(3)).reshape([1, -1])
-atype = [1,0,1]
-e, f, v = dp.eval(coord, cell, atype)
-```
-where `e`, `f` and `v` are predicted energy, force and virial of the system, respectively.
-
-
-## Run MD with LAMMPS
-### Include deepmd in the pair style
-Running an MD simulation with LAMMPS is simpler. In the LAMMPS input file, one needs to specify the pair style as follows
-```bash
-pair_style deepmd graph.pb
-pair_coeff
-```
-where `graph.pb` is the file name of the frozen model. The `pair_coeff` should be left blank. It should be noted that LAMMPS counts atom types starting from 1, therefore, all LAMMPS atom type will be firstly subtracted by 1, and then passed into the DeePMD-kit engine to compute the interactions. [A detailed documentation of this pair style is available.](doc/lammps-pair-style-deepmd.md).
-
-### Long-range interaction
-The reciprocal space part of the long-range interaction can be calculated by LAMMPS command `kspace_style`. To use it with DeePMD-kit, one writes
-```bash
-pair_style deepmd graph.pb
-pair_coeff
-kspace_style pppm 1.0e-5
-kspace_modify gewald 0.45
-```
-Please notice that the DeePMD does nothing to the direct space part of the electrostatic interaction, because this part is assumed to be fitted in the DeePMD model (the direct space cut-off is thus the cut-off of the DeePMD model). The splitting parameter `gewald` is modified by the `kspace_modify` command.
-
-## Run path-integral MD with i-PI
-The i-PI works in a client-server model. The i-PI provides the server for integrating the replica positions of atoms, while the DeePMD-kit provides a client named `dp_ipi` that computes the interactions (including energy, force and virial). The server and client communicates via the Unix domain socket or the Internet socket. The client can be started by
-```bash
-$ dp_ipi water.json
-```
-It is noted that multiple instances of the client is allow for computing, in parallel, the interactions of multiple replica of the path-integral MD.
-
-`water.json` is the parameter file for the client `dp_ipi`, and [an example](./examples/ipi/water.json) is provided:
-```json
-{
- "verbose": false,
- "use_unix": true,
- "port": 31415,
- "host": "localhost",
- "graph_file": "graph.pb",
- "coord_file": "conf.xyz",
- "atom_type" : {
- "OW": 0,
- "HW1": 1,
- "HW2": 1
- }
-}
-```
-The option **`use_unix`** is set to `true` to activate the Unix domain socket, otherwise, the Internet socket is used.
-
-The option **`graph_file`** provides the file name of the frozen model.
+The typical procedure of using DeePMD-kit includes 5 steps
-The `dp_ipi` gets the atom names from an [XYZ file](https://en.wikipedia.org/wiki/XYZ_file_format) provided by **`coord_file`** (meanwhile ignores all coordinates in it), and translates the names to atom types by rules provided by **`atom_type`**.
+1. [Prepare data](doc/use-deepmd-kit.md#prepare-data)
+2. [Train a model](doc/use-deepmd-kit.md#train-a-model)
+3. [Freeze the model](doc/use-deepmd-kit.md#freeze-a-model)
+4. [Test the model](doc/use-deepmd-kit.md#test-a-model)
+5. [Inference the model in python](doc/use-deepmd-kit.md#model-inference) or using the model in other molecular simulation packages like [LAMMPS](doc/use-deepmd-kit.md#run-md-with-lammps), [i-PI](doc/use-deepmd-kit.md#run-path-integral-md-with-i-pi) or [ASE](doc/use-deepmd-kit.md#use-deep-potential-with-ase).
-## Use deep potential with ASE
+A quick-start on using DeePMD-kit can be found [here](doc/use-deepmd-kit.md).
-Deep potential can be set up as a calculator with ASE to obtain potential energies and forces.
-```python
-from ase import Atoms
-from deepmd.calculator import DP
+A full [document](doc/train-input-auto.rst) on options in the training input script is available.
-water = Atoms('H2O',
- positions=[(0.7601, 1.9270, 1),
- (1.9575, 1, 1),
- (1., 1., 1.)],
- cell=[100, 100, 100],
- calculator=DP(model="frozen_model.pb"))
-print(water.get_potential_energy())
-print(water.get_forces())
-```
-
-Optimization is also available:
-```python
-from ase.optimize import BFGS
-dyn = BFGS(water)
-dyn.run(fmax=1e-6)
-print(water.get_positions())
-```
# Troubleshooting
In consequence of various differences of computers or systems, problems may occur. Some common circumstances are listed as follows.
diff --git a/data/json/json2yaml.py b/data/json/json2yaml.py
new file mode 100644
index 0000000000..f601928427
--- /dev/null
+++ b/data/json/json2yaml.py
@@ -0,0 +1,37 @@
+#!/usr/bin/env python3
+
+import argparse
+import json
+from pathlib import Path
+from warnings import warn
+
+import yaml
+
+
+def _main():
+ parser = argparse.ArgumentParser(
+ description="convert json config file to yaml",
+ formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+
+ # get all json files in dir
+ jsons = [p for p in Path.cwd().glob("*.json")]
+ # use the newest as autosuggestion
+ jsons.sort(key=lambda x: x.stat().st_mtime, reverse=True)
+ jfile = jsons[0]
+ yfile = jfile.with_suffix(".yaml")
+
+ parser.add_argument("INPUT", default=jfile, type=Path, nargs="?",
+ help="input json file")
+ parser.add_argument("OUTPUT", default=yfile, type=Path, nargs="?",
+ help="output yaml file")
+ args = parser.parse_args()
+
+ with args.INPUT.open("r") as infile, args.OUTPUT.open("w") as outfile:
+ yaml.dump(json.load(infile), outfile, default_flow_style=False,
+ sort_keys=False)
+
+ warn("The order of the keys won't be preserved!", SyntaxWarning)
+ warn("_comment keys will also be lostt in the conversion")
+
+if __name__ == "__main__":
+ _main()
diff --git a/doc/Makefile b/doc/Makefile
new file mode 100644
index 0000000000..d4bb2cbb9e
--- /dev/null
+++ b/doc/Makefile
@@ -0,0 +1,20 @@
+# Minimal makefile for Sphinx documentation
+#
+
+# You can set these variables from the command line, and also
+# from the environment for the first two.
+SPHINXOPTS ?=
+SPHINXBUILD ?= sphinx-build
+SOURCEDIR = .
+BUILDDIR = _build
+
+# Put it first so that "make" without argument is like "make help".
+help:
+ @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
+
+.PHONY: help Makefile
+
+# Catch-all target: route all unknown targets to Sphinx using the new
+# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
+%: Makefile
+ @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
diff --git a/doc/_static/css/custom.css b/doc/_static/css/custom.css
new file mode 100644
index 0000000000..120e099e0e
--- /dev/null
+++ b/doc/_static/css/custom.css
@@ -0,0 +1,3 @@
+pre{
+ overflow: auto;
+}
diff --git a/doc/api.rst b/doc/api.rst
new file mode 100644
index 0000000000..17604ae010
--- /dev/null
+++ b/doc/api.rst
@@ -0,0 +1,91 @@
+DeePMD-kit API
+===============
+
+.. toctree::
+ :maxdepth: 2
+ :caption: Contents:
+
+.. automodule:: deepmd.Data
+ :members:
+ :undoc-members:
+
+.. automodule:: deepmd.DataModifier
+ :members:
+ :undoc-members:
+
+.. automodule:: deepmd.DataSystem
+ :members:
+ :undoc-members:
+
+.. automodule:: deepmd.DeepDipole
+ :members:
+ :undoc-members:
+
+.. automodule:: deepmd.DeepEval
+ :members:
+ :undoc-members:
+
+.. automodule:: deepmd.DeepPolar
+ :members:
+ :undoc-members:
+
+.. automodule:: deepmd.DeepPot
+ :members:
+ :undoc-members:
+
+.. automodule:: deepmd.DeepWFC
+ :members:
+ :undoc-members:
+
+.. automodule:: deepmd.DescrptLocFrame
+ :members:
+ :undoc-members:
+
+.. automodule:: deepmd.DescrptSeA
+ :members:
+ :undoc-members:
+
+.. automodule:: deepmd.DescrptSeAR
+ :members:
+ :undoc-members:
+
+.. automodule:: deepmd.DescrptSeR
+ :members:
+ :undoc-members:
+
+.. automodule:: deepmd.EwaldRecp
+ :members:
+ :undoc-members:
+
+.. automodule:: deepmd.Fitting
+ :members:
+ :undoc-members:
+
+.. automodule:: deepmd.LearningRate
+ :members:
+ :undoc-members:
+
+.. automodule:: deepmd.Local
+ :members:
+ :undoc-members:
+
+.. automodule:: deepmd.Loss
+ :members:
+ :undoc-members:
+
+.. automodule:: deepmd.Model
+ :members:
+ :undoc-members:
+
+.. automodule:: deepmd.Network
+ :members:
+ :undoc-members:
+
+.. automodule:: deepmd.TabInter
+ :members:
+ :undoc-members:
+
+.. automodule:: deepmd.Trainer
+ :members:
+ :undoc-members:
+
diff --git a/doc/conf.py b/doc/conf.py
new file mode 100644
index 0000000000..b757da9771
--- /dev/null
+++ b/doc/conf.py
@@ -0,0 +1,60 @@
+# Configuration file for the Sphinx documentation builder.
+#
+# This file only contains a selection of the most common options. For a full
+# list see the documentation:
+# https://www.sphinx-doc.org/en/master/usage/configuration.html
+
+# -- Path setup --------------------------------------------------------------
+
+# If extensions (or modules to document with autodoc) are in another directory,
+# add these directories to sys.path here. If the directory is relative to the
+# documentation root, use os.path.abspath to make it absolute, like shown here.
+#
+# import os
+# import sys
+# sys.path.insert(0, os.path.abspath('.'))
+
+
+# -- Project information -----------------------------------------------------
+
+project = 'DeePMD-kit'
+copyright = '2020, Deep Potential'
+author = 'Deep Potential'
+
+
+# -- General configuration ---------------------------------------------------
+
+# Add any Sphinx extension module names here, as strings. They can be
+# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
+# ones.
+extensions = [
+ 'recommonmark',
+ "sphinx_rtd_theme",
+ 'sphinx.ext.autosummary'
+]
+
+# Add any paths that contain templates here, relative to this directory.
+templates_path = ['_templates']
+
+# List of patterns, relative to source directory, that match files and
+# directories to ignore when looking for source files.
+# This pattern also affects html_static_path and html_extra_path.
+exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']
+
+
+# -- Options for HTML output -------------------------------------------------
+
+# The theme to use for HTML and HTML Help pages. See the documentation for
+# a list of builtin themes.
+#
+html_theme = 'sphinx_rtd_theme'
+
+# Add any paths that contain custom static files (such as style sheets) here,
+# relative to this directory. They are copied after the builtin static files,
+# so a file named "default.css" will overwrite the builtin "default.css".
+html_static_path = ['_static']
+html_css_files = ['css/custom.css']
+
+autodoc_default_flags = ['members']
+autosummary_generate = True
+master_doc = 'index'
diff --git a/doc/index.rst b/doc/index.rst
new file mode 100644
index 0000000000..13c0c45b7d
--- /dev/null
+++ b/doc/index.rst
@@ -0,0 +1,26 @@
+.. deepmd-kit documentation master file, created by
+ sphinx-quickstart on Sat Nov 21 18:36:24 2020.
+ You can adapt this file completely to your liking, but it should at least
+ contain the root `toctree` directive.
+
+DeePMD-kit's documentation
+======================================
+
+.. toctree::
+ :maxdepth: 2
+ :caption: Contents:
+
+
+ install
+ use-deepmd-kit
+ train-input
+ lammps-pair-style-deepmd
+ api
+
+
+Indices and tables
+==================
+
+* :ref:`genindex`
+* :ref:`modindex`
+* :ref:`search`
diff --git a/doc/install.md b/doc/install.md
new file mode 100644
index 0000000000..05a0d3bdb0
--- /dev/null
+++ b/doc/install.md
@@ -0,0 +1,201 @@
+- [Download and install](#download-and-install)
+ - [Easy installation methods](#easy-installation-methods)
+ - [Offline packages](#offline-packages)
+ - [With Docker](#with-docker)
+ - [With conda](#with-conda)
+ - [Install the python interaction](#install-the-python-interface)
+ - [Install the Tensorflow's python interface](#install-the-tensorflows-python-interface)
+ - [Install the DeePMD-kit's python interface](#install-the-deepmd-kits-python-interface)
+ - [Install the C++ interface](#install-the-c-interface)
+ - [Install the Tensorflow's C++ interface](#install-the-tensorflows-c-interface)
+ - [Install the DeePMD-kit's C++ interface](#install-the-deepmd-kits-c-interface)
+ - [Install LAMMPS's DeePMD-kit module](#install-lammpss-deepmd-kit-module)
+
+# Download and install
+
+Please follow our [github](https://github.com/deepmodeling/deepmd-kit) webpage to download the [latest released version](https://github.com/deepmodeling/deepmd-kit/tree/master) and [development version](https://github.com/deepmodeling/deepmd-kit/tree/devel).
+
+## Easy installation methods
+There various easy methods to install DeePMD-kit. Choose one that you prefer. If you want to build by yourself, jump to the next two sections.
+
+After your easy installation, DeePMD-kit (`dp`) and LAMMPS (`lmp`) will be available to execute. You can try `dp -h` and `lmp -h` to see the help. `mpirun` is also available considering you may want to run LAMMPS in parallel.
+
+### Offline packages
+Both CPU and GPU version offline packages are avaiable in [the Releases page](https://github.com/deepmodeling/deepmd-kit/releases).
+
+### With conda
+DeePMD-kit is avaiable with [conda](https://github.com/conda/conda). Install [Anaconda](https://www.anaconda.com/distribution/#download-section) or [Miniconda](https://docs.conda.io/en/latest/miniconda.html) first.
+
+To install the CPU version:
+```bash
+conda install deepmd-kit=*=*cpu lammps-dp=*=*cpu -c deepmodeling
+```
+
+To install the GPU version containing [CUDA 10.1](https://docs.nvidia.com/deploy/cuda-compatibility/index.html#binary-compatibility__table-toolkit-driver):
+```bash
+conda install deepmd-kit=*=*gpu lammps-dp=*=*gpu -c deepmodeling
+```
+
+### With Docker
+A docker for installing the DeePMD-kit is available [here](https://github.com/orgs/deepmodeling/packages/container/package/deepmd-kit).
+
+To pull the CPU version:
+```bash
+docker pull ghcr.io/deepmodeling/deepmd-kit:1.2.2_cpu
+```
+
+To pull the GPU version:
+```bash
+docker pull ghcr.io/deepmodeling/deepmd-kit:1.2.2_cuda10.1_gpu
+```
+
+## Install the python interface
+### Install the Tensorflow's python interface
+First, check the python version on your machine
+```bash
+python --version
+```
+
+We follow the virtual environment approach to install the tensorflow's Python interface. The full instruction can be found on [the tensorflow's official website](https://www.tensorflow.org/install/pip). Now we assume that the Python interface will be installed to virtual environment directory `$tensorflow_venv`
+```bash
+virtualenv -p python3 $tensorflow_venv
+source $tensorflow_venv/bin/activate
+pip install --upgrade pip
+pip install --upgrade tensorflow==2.3.0
+```
+It is highly recommanded to keep the consistency of the TensorFlow version for the python and C++ interfaces.
+Everytime a new shell is started and one wants to use `DeePMD-kit`, the virtual environment should be activated by
+```bash
+source $tensorflow_venv/bin/activate
+```
+if one wants to skip out of the virtual environment, he/she can do
+```bash
+deactivate
+```
+If one has multiple python interpreters named like python3.x, it can be specified by, for example
+```bash
+virtualenv -p python3.7 $tensorflow_venv
+```
+If one does not need the GPU support of deepmd-kit and is concerned about package size, the CPU-only version of tensorflow should be installed by
+```bash
+pip install --upgrade tensorflow-cpu==2.3.0
+```
+To verify the installation, run
+```bash
+python -c "import tensorflow as tf;print(tf.reduce_sum(tf.random.normal([1000, 1000])))"
+```
+One should remember to activate the virtual environment every time he/she uses deepmd-kit.
+
+### Install the DeePMD-kit's python interface
+
+Execute
+```bash
+pip install deepmd-kit
+```
+To test the installation, one may execute
+```bash
+dp -h
+```
+It will print the help information like
+```text
+usage: dp [-h] {train,freeze,test} ...
+
+DeePMD-kit: A deep learning package for many-body potential energy
+representation and molecular dynamics
+
+optional arguments:
+ -h, --help show this help message and exit
+
+Valid subcommands:
+ {train,freeze,test}
+ train train a model
+ freeze freeze the model
+ test test the model
+```
+
+## Install the C++ interface
+
+If one does not need to use DeePMD-kit with Lammps or I-Pi, then the python interface installed in the previous section does everything and he/she can safely skip this section.
+
+### Install the Tensorflow's C++ interface
+
+Check the compiler version on your machine
+
+```
+gcc --version
+```
+
+The C++ interface of DeePMD-kit was tested with compiler gcc >= 4.8. It is noticed that the I-Pi support is only compiled with gcc >= 4.9.
+
+First the C++ interface of Tensorflow should be installed. It is noted that the version of Tensorflow should be in consistent with the python interface. We assume that you have followed our instruction and installed tensorflow python interface 1.14.0 with, then you may follow [the instruction for CPU](install-tf.1.14.md) to install the corresponding C++ interface (CPU only). If one wants GPU supports, he/she should follow [the instruction for GPU](install-tf.1.14-gpu.md) to install the C++ interface.
+
+### Install the DeePMD-kit's C++ interface
+
+Clone the DeePMD-kit source code
+```bash
+cd /some/workspace
+git clone --recursive https://github.com/deepmodeling/deepmd-kit.git deepmd-kit
+```
+
+For convenience, you may want to record the location of source to a variable, saying `deepmd_source_dir` by
+```bash
+cd deepmd-kit
+deepmd_source_dir=`pwd`
+```
+
+Now goto the source code directory of DeePMD-kit and make a build place.
+```bash
+cd $deepmd_source_dir/source
+mkdir build
+cd build
+```
+I assume you want to install DeePMD-kit into path `$deepmd_root`, then execute cmake
+```bash
+cmake -DTENSORFLOW_ROOT=$tensorflow_root -DCMAKE_INSTALL_PREFIX=$deepmd_root ..
+```
+where the variable `tensorflow_root` stores the location where the tensorflow's C++ interface is installed. The DeePMD-kit will automatically detect if a CUDA tool-kit is available on your machine and build the GPU support accordingly. If you want to force the cmake to find CUDA tool-kit, you can speicify the key `USE_CUDA_TOOLKIT`,
+```bash
+cmake -DUSE_CUDA_TOOLKIT=true -DTENSORFLOW_ROOT=$tensorflow_root -DCMAKE_INSTALL_PREFIX=$deepmd_root ..
+```
+and you may further asked to provide `CUDA_TOOLKIT_ROOT_DIR`. If the cmake has executed successfully, then
+```bash
+make
+make install
+```
+If everything works fine, you will have the following executable and libraries installed in `$deepmd_root/bin` and `$deepmd_root/lib`
+```bash
+$ ls $deepmd_root/bin
+dp_ipi
+$ ls $deepmd_root/lib
+libdeepmd_ipi.so libdeepmd_op.so libdeepmd.so
+```
+
+### Install LAMMPS's DeePMD-kit module
+DeePMD-kit provide module for running MD simulation with LAMMPS. Now make the DeePMD-kit module for LAMMPS.
+```bash
+cd $deepmd_source_dir/source/build
+make lammps
+```
+DeePMD-kit will generate a module called `USER-DEEPMD` in the `build` directory. Now download your favorite LAMMPS code, and uncompress it (I assume that you have downloaded the tar `lammps-stable.tar.gz`)
+```bash
+cd /some/workspace
+tar xf lammps-stable.tar.gz
+```
+The source code of LAMMPS is stored in directory, for example `lammps-31Mar17`. Now go into the LAMMPS code and copy the DeePMD-kit module like this
+```bash
+cd lammps-31Mar17/src/
+cp -r $deepmd_source_dir/source/build/USER-DEEPMD .
+```
+Now build LAMMPS
+```bash
+make yes-user-deepmd
+make mpi -j4
+```
+The option `-j4` means using 4 processes in parallel. You may want to use a different number according to your hardware.
+
+If everything works fine, you will end up with an executable `lmp_mpi`.
+
+The DeePMD-kit module can be removed from LAMMPS source code by
+```bash
+make no-user-deepmd
+```
diff --git a/doc/make.bat b/doc/make.bat
new file mode 100644
index 0000000000..2119f51099
--- /dev/null
+++ b/doc/make.bat
@@ -0,0 +1,35 @@
+@ECHO OFF
+
+pushd %~dp0
+
+REM Command file for Sphinx documentation
+
+if "%SPHINXBUILD%" == "" (
+ set SPHINXBUILD=sphinx-build
+)
+set SOURCEDIR=.
+set BUILDDIR=_build
+
+if "%1" == "" goto help
+
+%SPHINXBUILD% >NUL 2>NUL
+if errorlevel 9009 (
+ echo.
+ echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
+ echo.installed, then set the SPHINXBUILD environment variable to point
+ echo.to the full path of the 'sphinx-build' executable. Alternatively you
+ echo.may add the Sphinx directory to PATH.
+ echo.
+ echo.If you don't have Sphinx installed, grab it from
+ echo.http://sphinx-doc.org/
+ exit /b 1
+)
+
+%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
+goto end
+
+:help
+%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
+
+:end
+popd
diff --git a/doc/requirements.txt b/doc/requirements.txt
new file mode 100644
index 0000000000..1d39662bb4
--- /dev/null
+++ b/doc/requirements.txt
@@ -0,0 +1 @@
+.[docs,cpu]
diff --git a/doc/train-input-auto.rst b/doc/train-input-auto.rst
new file mode 100644
index 0000000000..c7dbe1a9e0
--- /dev/null
+++ b/doc/train-input-auto.rst
@@ -0,0 +1,1059 @@
+.. raw:: html
+
+
+model:
+ | type: ``dict``
+ | argument path: ``model``
+
+ .. raw:: html
+
+
+ type_map:
+ | type: ``list``, optional
+ | argument path: ``model/type_map``
+
+ A list of strings. Give the name to each type of atoms.
+
+ .. raw:: html
+
+
+ data_stat_nbatch:
+ | type: ``int``, optional, default: ``10``
+ | argument path: ``model/data_stat_nbatch``
+
+ The model determines the normalization from the statistics of the data. This key specifies the number of `frames` in each `system` used for statistics.
+
+ .. raw:: html
+
+
+ use_srtab:
+ | type: ``str``, optional
+ | argument path: ``model/use_srtab``
+
+ The table for the short-range pairwise interaction added on top of DP. The table is a text data file with (N_t + 1) * N_t / 2 + 1 columes. The first colume is the distance between atoms. The second to the last columes are energies for pairs of certain types. For example we have two atom types, 0 and 1. The columes from 2nd to 4th are for 0-0, 0-1 and 1-1 correspondingly.
+
+ .. raw:: html
+
+
+ smin_alpha:
+ | type: ``float``, optional
+ | argument path: ``model/smin_alpha``
+
+ The short-range tabulated interaction will be swithed according to the distance of the nearest neighbor. This distance is calculated by softmin. This parameter is the decaying parameter in the softmin. It is only required when `use_srtab` is provided.
+
+ .. raw:: html
+
+
+ sw_rmin:
+ | type: ``float``, optional
+ | argument path: ``model/sw_rmin``
+
+ The lower boundary of the interpolation between short-range tabulated interaction and DP. It is only required when `use_srtab` is provided.
+
+ .. raw:: html
+
+
+ sw_rmax:
+ | type: ``float``, optional
+ | argument path: ``model/sw_rmax``
+
+ The upper boundary of the interpolation between short-range tabulated interaction and DP. It is only required when `use_srtab` is provided.
+
+ .. raw:: html
+
+
+ descriptor:
+ | type: ``dict``
+ | argument path: ``model/descriptor``
+
+ The descriptor of atomic environment.
+
+
+ Depending on the value of *type*, different sub args are accepted.
+
+ .. raw:: html
+
+
+ type:
+ | type: ``str`` (flag key)
+ | argument path: ``model/descriptor/type``
+
+ The type of the descritpor. Valid types are `loc_frame`, `se_a`, `se_r` and `se_ar`.
+
+ - `loc_frame`: Defines a local frame at each atom, and the compute the descriptor as local coordinates under this frame.
+
+ - `se_a`: Used by the smooth edition of Deep Potential. The full relative coordinates are used to construct the descriptor.
+
+ - `se_r`: Used by the smooth edition of Deep Potential. Only the distance between atoms is used to construct the descriptor.
+
+ - `se_ar`: A hybrid of `se_a` and `se_r`. Typically `se_a` has a smaller cut-off while the `se_r` has a larger cut-off.
+
+
+ .. raw:: html
+
+
+ When *type* is set to ``loc_frame``:
+
+ .. raw:: html
+
+
+ sel_a:
+ | type: ``list``
+ | argument path: ``model/descriptor[loc_frame]/sel_a``
+
+ A list of integers. The length of the list should be the same as the number of atom types in the system. `sel_a[i]` gives the selected number of type-i neighbors. The full relative coordinates of the neighbors are used by the descriptor.
+
+ .. raw:: html
+
+
+ sel_r:
+ | type: ``list``
+ | argument path: ``model/descriptor[loc_frame]/sel_r``
+
+ A list of integers. The length of the list should be the same as the number of atom types in the system. `sel_r[i]` gives the selected number of type-i neighbors. Only relative distance of the neighbors are used by the descriptor. sel_a[i] + sel_r[i] is recommended to be larger than the maximally possible number of type-i neighbors in the cut-off radius.
+
+ .. raw:: html
+
+
+ rcut:
+ | type: ``float``, optional, default: ``6.0``
+ | argument path: ``model/descriptor[loc_frame]/rcut``
+
+ The cut-off radius. The default value is 6.0
+
+ .. raw:: html
+
+
+ axis_rule:
+ | type: ``list``
+ | argument path: ``model/descriptor[loc_frame]/axis_rule``
+
+ A list of integers. The length should be 6 times of the number of types.
+
+ - axis_rule[i*6+0]: class of the atom defining the first axis of type-i atom. 0 for neighbors with full coordinates and 1 for neighbors only with relative distance.
+
+ - axis_rule[i*6+1]: type of the atom defining the first axis of type-i atom.
+
+ - axis_rule[i*6+2]: index of the axis atom defining the first axis. Note that the neighbors with the same class and type are sorted according to their relative distance.
+
+ - axis_rule[i*6+3]: class of the atom defining the first axis of type-i atom. 0 for neighbors with full coordinates and 1 for neighbors only with relative distance.
+
+ - axis_rule[i*6+4]: type of the atom defining the second axis of type-i atom.
+
+ - axis_rule[i*6+5]: class of the atom defining the second axis of type-i atom. 0 for neighbors with full coordinates and 1 for neighbors only with relative distance.
+
+
+ .. raw:: html
+
+
+ When *type* is set to ``se_a``:
+
+ .. raw:: html
+
+
+ sel:
+ | type: ``list``
+ | argument path: ``model/descriptor[se_a]/sel``
+
+ A list of integers. The length of the list should be the same as the number of atom types in the system. `sel[i]` gives the selected number of type-i neighbors. `sel[i]` is recommended to be larger than the maximally possible number of type-i neighbors in the cut-off radius.
+
+ .. raw:: html
+
+
+ rcut:
+ | type: ``float``, optional, default: ``6.0``
+ | argument path: ``model/descriptor[se_a]/rcut``
+
+ The cut-off radius.
+
+ .. raw:: html
+
+
+ rcut_smth:
+ | type: ``float``, optional, default: ``0.5``
+ | argument path: ``model/descriptor[se_a]/rcut_smth``
+
+ Where to start smoothing. For example the 1/r term is smoothed from `rcut` to `rcut_smth`
+
+ .. raw:: html
+
+
+ neuron:
+ | type: ``list``, optional, default: ``[10, 20, 40]``
+ | argument path: ``model/descriptor[se_a]/neuron``
+
+ Number of neurons in each hidden layers of the embedding net. When two layers are of the same size or one layer is twice as large as the previous layer, a skip connection is built.
+
+ .. raw:: html
+
+
+ axis_neuron:
+ | type: ``int``, optional, default: ``4``
+ | argument path: ``model/descriptor[se_a]/axis_neuron``
+
+ Size of the submatrix of G (embedding matrix).
+
+ .. raw:: html
+
+
+ activation_function:
+ | type: ``str``, optional, default: ``tanh``
+ | argument path: ``model/descriptor[se_a]/activation_function``
+
+ The activation function in the embedding net. Supported activation functions are "relu", "relu6", "softplus", "sigmoid", "tanh", "gelu".
+
+ .. raw:: html
+
+
+ resnet_dt:
+ | type: ``bool``, optional, default: ``False``
+ | argument path: ``model/descriptor[se_a]/resnet_dt``
+
+ Whether to use a "Timestep" in the skip connection
+
+ .. raw:: html
+
+
+ type_one_side:
+ | type: ``bool``, optional, default: ``False``
+ | argument path: ``model/descriptor[se_a]/type_one_side``
+
+ Try to build N_types embedding nets. Otherwise, building N_types^2 embedding nets
+
+ .. raw:: html
+
+
+ precision:
+ | type: ``str``, optional, default: ``float64``
+ | argument path: ``model/descriptor[se_a]/precision``
+
+ The precision of the embedding net parameters, supported options are "float64", "float32", "float16".
+
+ .. raw:: html
+
+
+ trainable:
+ | type: ``bool``, optional, default: ``True``
+ | argument path: ``model/descriptor[se_a]/trainable``
+
+ If the parameters in the embedding net is trainable
+
+ .. raw:: html
+
+
+ seed:
+ | type: ``int`` | ``NoneType``, optional
+ | argument path: ``model/descriptor[se_a]/seed``
+
+ Random seed for parameter initialization
+
+ .. raw:: html
+
+
+ exclude_types:
+ | type: ``list``, optional, default: ``[]``
+ | argument path: ``model/descriptor[se_a]/exclude_types``
+
+ The Excluded types
+
+ .. raw:: html
+
+
+ set_davg_zero:
+ | type: ``bool``, optional, default: ``False``
+ | argument path: ``model/descriptor[se_a]/set_davg_zero``
+
+ Set the normalization average to zero. This option should be set when `atom_ener` in the energy fitting is used
+
+
+ .. raw:: html
+
+
+ When *type* is set to ``se_r``:
+
+ .. raw:: html
+
+
+ sel:
+ | type: ``list``
+ | argument path: ``model/descriptor[se_r]/sel``
+
+ A list of integers. The length of the list should be the same as the number of atom types in the system. `sel[i]` gives the selected number of type-i neighbors. `sel[i]` is recommended to be larger than the maximally possible number of type-i neighbors in the cut-off radius.
+
+ .. raw:: html
+
+
+ rcut:
+ | type: ``float``, optional, default: ``6.0``
+ | argument path: ``model/descriptor[se_r]/rcut``
+
+ The cut-off radius.
+
+ .. raw:: html
+
+
+ rcut_smth:
+ | type: ``float``, optional, default: ``0.5``
+ | argument path: ``model/descriptor[se_r]/rcut_smth``
+
+ Where to start smoothing. For example the 1/r term is smoothed from `rcut` to `rcut_smth`
+
+ .. raw:: html
+
+
+ neuron:
+ | type: ``list``, optional, default: ``[10, 20, 40]``
+ | argument path: ``model/descriptor[se_r]/neuron``
+
+ Number of neurons in each hidden layers of the embedding net. When two layers are of the same size or one layer is twice as large as the previous layer, a skip connection is built.
+
+ .. raw:: html
+
+
+ activation_function:
+ | type: ``str``, optional, default: ``tanh``
+ | argument path: ``model/descriptor[se_r]/activation_function``
+
+ The activation function in the embedding net. Supported activation functions are "relu", "relu6", "softplus", "sigmoid", "tanh", "gelu".
+
+ .. raw:: html
+
+
+ resnet_dt:
+ | type: ``bool``, optional, default: ``False``
+ | argument path: ``model/descriptor[se_r]/resnet_dt``
+
+ Whether to use a "Timestep" in the skip connection
+
+ .. raw:: html
+
+
+ type_one_side:
+ | type: ``bool``, optional, default: ``False``
+ | argument path: ``model/descriptor[se_r]/type_one_side``
+
+ Try to build N_types embedding nets. Otherwise, building N_types^2 embedding nets
+
+ .. raw:: html
+
+
+ precision:
+ | type: ``str``, optional, default: ``float64``
+ | argument path: ``model/descriptor[se_r]/precision``
+
+ The precision of the embedding net parameters, supported options are "float64", "float32", "float16".
+
+ .. raw:: html
+
+
+ trainable:
+ | type: ``bool``, optional, default: ``True``
+ | argument path: ``model/descriptor[se_r]/trainable``
+
+ If the parameters in the embedding net is trainable
+
+ .. raw:: html
+
+
+ seed:
+ | type: ``int`` | ``NoneType``, optional
+ | argument path: ``model/descriptor[se_r]/seed``
+
+ Random seed for parameter initialization
+
+ .. raw:: html
+
+
+ exclude_types:
+ | type: ``list``, optional, default: ``[]``
+ | argument path: ``model/descriptor[se_r]/exclude_types``
+
+ The Excluded types
+
+ .. raw:: html
+
+
+ set_davg_zero:
+ | type: ``bool``, optional, default: ``False``
+ | argument path: ``model/descriptor[se_r]/set_davg_zero``
+
+ Set the normalization average to zero. This option should be set when `atom_ener` in the energy fitting is used
+
+
+ .. raw:: html
+
+
+ When *type* is set to ``se_ar``:
+
+ .. raw:: html
+
+
+ a:
+ | type: ``dict``
+ | argument path: ``model/descriptor[se_ar]/a``
+
+ The parameters of descriptor `se_a <#model/descriptor[se_a]>`__
+
+ .. raw:: html
+
+
+ r:
+ | type: ``dict``
+ | argument path: ``model/descriptor[se_ar]/r``
+
+ The parameters of descriptor `se_r <#model/descriptor[se_r]>`__
+
+ .. raw:: html
+
+
+ fitting_net:
+ | type: ``dict``
+ | argument path: ``model/fitting_net``
+
+ The fitting of physical properties.
+
+
+ Depending on the value of *type*, different sub args are accepted.
+
+ .. raw:: html
+
+
+ type:
+ | type: ``str`` (flag key), default: ``ener``
+ | argument path: ``model/fitting_net/type``
+
+ The type of the fitting. Valid types are `ener`, `dipole`, `polar` and `global_polar`.
+
+ - `ener`: Fit an energy model (potential energy surface).
+
+ - `dipole`: Fit an atomic dipole model. Atomic dipole labels for all the selected atoms (see `sel_type`) should be provided by `dipole.npy` in each data system. The file has number of frames lines and 3 times of number of selected atoms columns.
+
+ - `polar`: Fit an atomic polarizability model. Atomic polarizability labels for all the selected atoms (see `sel_type`) should be provided by `polarizability.npy` in each data system. The file has number of frames lines and 9 times of number of selected atoms columns.
+
+ - `global_polar`: Fit a polarizability model. Polarizability labels should be provided by `polarizability.npy` in each data system. The file has number of frames lines and 9 columns.
+
+
+ .. raw:: html
+
+
+ When *type* is set to ``ener``:
+
+ .. raw:: html
+
+
+ numb_fparam:
+ | type: ``int``, optional, default: ``0``
+ | argument path: ``model/fitting_net[ener]/numb_fparam``
+
+ The dimension of the frame parameter. If set to >0, file `fparam.npy` should be included to provided the input fparams.
+
+ .. raw:: html
+
+
+ numb_aparam:
+ | type: ``int``, optional, default: ``0``
+ | argument path: ``model/fitting_net[ener]/numb_aparam``
+
+ The dimension of the atomic parameter. If set to >0, file `aparam.npy` should be included to provided the input aparams.
+
+ .. raw:: html
+
+
+ neuron:
+ | type: ``list``, optional, default: ``[120, 120, 120]``
+ | argument path: ``model/fitting_net[ener]/neuron``
+
+ The number of neurons in each hidden layers of the fitting net. When two hidden layers are of the same size, a skip connection is built.
+
+ .. raw:: html
+
+
+ activation_function:
+ | type: ``str``, optional, default: ``tanh``
+ | argument path: ``model/fitting_net[ener]/activation_function``
+
+ The activation function in the fitting net. Supported activation functions are "relu", "relu6", "softplus", "sigmoid", "tanh", "gelu".
+
+ .. raw:: html
+
+
+ precision:
+ | type: ``str``, optional, default: ``float64``
+ | argument path: ``model/fitting_net[ener]/precision``
+
+ The precision of the fitting net parameters, supported options are "float64", "float32", "float16".
+
+ .. raw:: html
+
+
+ resnet_dt:
+ | type: ``bool``, optional, default: ``True``
+ | argument path: ``model/fitting_net[ener]/resnet_dt``
+
+ Whether to use a "Timestep" in the skip connection
+
+ .. raw:: html
+
+
+ trainable:
+ | type: ``bool`` | ``list``, optional, default: ``True``
+ | argument path: ``model/fitting_net[ener]/trainable``
+
+ Whether the parameters in the fitting net are trainable. This option can be
+
+ - bool: True if all parameters of the fitting net are trainable, False otherwise.
+
+ - list of bool: Specifies if each layer is trainable. Since the fitting net is composed by hidden layers followed by a output layer, the length of tihs list should be equal to len(`neuron`)+1.
+
+ .. raw:: html
+
+
+ rcond:
+ | type: ``float``, optional, default: ``0.001``
+ | argument path: ``model/fitting_net[ener]/rcond``
+
+ The condition number used to determine the inital energy shift for each type of atoms.
+
+ .. raw:: html
+
+
+ seed:
+ | type: ``int`` | ``NoneType``, optional
+ | argument path: ``model/fitting_net[ener]/seed``
+
+ Random seed for parameter initialization of the fitting net
+
+ .. raw:: html
+
+
+ atom_ener:
+ | type: ``list``, optional, default: ``[]``
+ | argument path: ``model/fitting_net[ener]/atom_ener``
+
+ Specify the atomic energy in vacuum for each type
+
+
+ .. raw:: html
+
+
+ When *type* is set to ``dipole``:
+
+ .. raw:: html
+
+
+ neuron:
+ | type: ``list``, optional, default: ``[120, 120, 120]``
+ | argument path: ``model/fitting_net[dipole]/neuron``
+
+ The number of neurons in each hidden layers of the fitting net. When two hidden layers are of the same size, a skip connection is built.
+
+ .. raw:: html
+
+
+ activation_function:
+ | type: ``str``, optional, default: ``tanh``
+ | argument path: ``model/fitting_net[dipole]/activation_function``
+
+ The activation function in the fitting net. Supported activation functions are "relu", "relu6", "softplus", "sigmoid", "tanh", "gelu".
+
+ .. raw:: html
+
+
+ resnet_dt:
+ | type: ``bool``, optional, default: ``True``
+ | argument path: ``model/fitting_net[dipole]/resnet_dt``
+
+ Whether to use a "Timestep" in the skip connection
+
+ .. raw:: html
+
+
+ precision:
+ | type: ``str``, optional, default: ``float64``
+ | argument path: ``model/fitting_net[dipole]/precision``
+
+ The precision of the fitting net parameters, supported options are "float64", "float32", "float16".
+
+ .. raw:: html
+
+
+ sel_type:
+ | type: ``int`` | ``NoneType`` | ``list``, optional
+ | argument path: ``model/fitting_net[dipole]/sel_type``
+
+ The atom types for which the atomic dipole will be provided. If not set, all types will be selected.
+
+ .. raw:: html
+
+
+ seed:
+ | type: ``int`` | ``NoneType``, optional
+ | argument path: ``model/fitting_net[dipole]/seed``
+
+ Random seed for parameter initialization of the fitting net
+
+
+ .. raw:: html
+
+
+ When *type* is set to ``polar``:
+
+ .. raw:: html
+
+
+ neuron:
+ | type: ``list``, optional, default: ``[120, 120, 120]``
+ | argument path: ``model/fitting_net[polar]/neuron``
+
+ The number of neurons in each hidden layers of the fitting net. When two hidden layers are of the same size, a skip connection is built.
+
+ .. raw:: html
+
+
+ activation_function:
+ | type: ``str``, optional, default: ``tanh``
+ | argument path: ``model/fitting_net[polar]/activation_function``
+
+ The activation function in the fitting net. Supported activation functions are "relu", "relu6", "softplus", "sigmoid", "tanh", "gelu".
+
+ .. raw:: html
+
+
+ resnet_dt:
+ | type: ``bool``, optional, default: ``True``
+ | argument path: ``model/fitting_net[polar]/resnet_dt``
+
+ Whether to use a "Timestep" in the skip connection
+
+ .. raw:: html
+
+
+ precision:
+ | type: ``str``, optional, default: ``float64``
+ | argument path: ``model/fitting_net[polar]/precision``
+
+ The precision of the fitting net parameters, supported options are "float64", "float32", "float16".
+
+ .. raw:: html
+
+
+ fit_diag:
+ | type: ``bool``, optional, default: ``True``
+ | argument path: ``model/fitting_net[polar]/fit_diag``
+
+ Fit the diagonal part of the rotational invariant polarizability matrix, which will be converted to normal polarizability matrix by contracting with the rotation matrix.
+
+ .. raw:: html
+
+
+ scale:
+ | type: ``float`` | ``list``, optional, default: ``1.0``
+ | argument path: ``model/fitting_net[polar]/scale``
+
+ The output of the fitting net (polarizability matrix) will be scaled by ``scale``
+
+ .. raw:: html
+
+
+ diag_shift:
+ | type: ``float`` | ``list``, optional, default: ``0.0``
+ | argument path: ``model/fitting_net[polar]/diag_shift``
+
+ The diagonal part of the polarizability matrix will be shifted by ``diag_shift``. The shift operation is carried out after ``scale``.
+
+ .. raw:: html
+
+
+ sel_type:
+ | type: ``int`` | ``NoneType`` | ``list``, optional
+ | argument path: ``model/fitting_net[polar]/sel_type``
+
+ The atom types for which the atomic polarizability will be provided. If not set, all types will be selected.
+
+ .. raw:: html
+
+
+ seed:
+ | type: ``int`` | ``NoneType``, optional
+ | argument path: ``model/fitting_net[polar]/seed``
+
+ Random seed for parameter initialization of the fitting net
+
+
+ .. raw:: html
+
+
+ When *type* is set to ``global_polar``:
+
+ .. raw:: html
+
+
+ neuron:
+ | type: ``list``, optional, default: ``[120, 120, 120]``
+ | argument path: ``model/fitting_net[global_polar]/neuron``
+
+ The number of neurons in each hidden layers of the fitting net. When two hidden layers are of the same size, a skip connection is built.
+
+ .. raw:: html
+
+
+ activation_function:
+ | type: ``str``, optional, default: ``tanh``
+ | argument path: ``model/fitting_net[global_polar]/activation_function``
+
+ The activation function in the fitting net. Supported activation functions are "relu", "relu6", "softplus", "sigmoid", "tanh", "gelu".
+
+ .. raw:: html
+
+
+ resnet_dt:
+ | type: ``bool``, optional, default: ``True``
+ | argument path: ``model/fitting_net[global_polar]/resnet_dt``
+
+ Whether to use a "Timestep" in the skip connection
+
+ .. raw:: html
+
+
+ precision:
+ | type: ``str``, optional, default: ``float64``
+ | argument path: ``model/fitting_net[global_polar]/precision``
+
+ The precision of the fitting net parameters, supported options are "float64", "float32", "float16".
+
+ .. raw:: html
+
+
+ fit_diag:
+ | type: ``bool``, optional, default: ``True``
+ | argument path: ``model/fitting_net[global_polar]/fit_diag``
+
+ Fit the diagonal part of the rotational invariant polarizability matrix, which will be converted to normal polarizability matrix by contracting with the rotation matrix.
+
+ .. raw:: html
+
+
+ scale:
+ | type: ``float`` | ``list``, optional, default: ``1.0``
+ | argument path: ``model/fitting_net[global_polar]/scale``
+
+ The output of the fitting net (polarizability matrix) will be scaled by ``scale``
+
+ .. raw:: html
+
+
+ diag_shift:
+ | type: ``float`` | ``list``, optional, default: ``0.0``
+ | argument path: ``model/fitting_net[global_polar]/diag_shift``
+
+ The diagonal part of the polarizability matrix will be shifted by ``diag_shift``. The shift operation is carried out after ``scale``.
+
+ .. raw:: html
+
+
+ sel_type:
+ | type: ``int`` | ``NoneType`` | ``list``, optional
+ | argument path: ``model/fitting_net[global_polar]/sel_type``
+
+ The atom types for which the atomic polarizability will be provided. If not set, all types will be selected.
+
+ .. raw:: html
+
+
+ seed:
+ | type: ``int`` | ``NoneType``, optional
+ | argument path: ``model/fitting_net[global_polar]/seed``
+
+ Random seed for parameter initialization of the fitting net
+
+
+.. raw:: html
+
+
+loss:
+ | type: ``dict``
+ | argument path: ``loss``
+
+ The definition of loss function. The type of the loss depends on the type of the fitting. For fitting type `ener`, the prefactors before energy, force, virial and atomic energy losses may be provided. For fitting type `dipole`, `polar` and `global_polar`, the loss may be an empty `dict` or unset.
+
+
+ Depending on the value of *type*, different sub args are accepted.
+
+ .. raw:: html
+
+
+ type:
+ | type: ``str`` (flag key), default: ``ener``
+ | argument path: ``loss/type``
+
+ The type of the loss. For fitting type `ener`, the loss type should be set to `ener` or left unset. For tensorial fitting types `dipole`, `polar` and `global_polar`, the type should be left unset.
+ \.
+
+
+ .. raw:: html
+
+
+ When *type* is set to ``ener``:
+
+ .. raw:: html
+
+
+ start_pref_e:
+ | type: ``float`` | ``int``, optional, default: ``0.02``
+ | argument path: ``loss[ener]/start_pref_e``
+
+ The prefactor of energy loss at the start of the training. Should be larger than or equal to 0. If set to none-zero value, the energy label should be provided by file energy.npy in each data system. If both start_pref_energy and limit_pref_energy are set to 0, then the energy will be ignored.
+
+ .. raw:: html
+
+
+ limit_pref_e:
+ | type: ``float`` | ``int``, optional, default: ``1.0``
+ | argument path: ``loss[ener]/limit_pref_e``
+
+ The prefactor of energy loss at the limit of the training, Should be larger than or equal to 0. i.e. the training step goes to infinity.
+
+ .. raw:: html
+
+
+ start_pref_f:
+ | type: ``float`` | ``int``, optional, default: ``1000``
+ | argument path: ``loss[ener]/start_pref_f``
+
+ The prefactor of force loss at the start of the training. Should be larger than or equal to 0. If set to none-zero value, the force label should be provided by file force.npy in each data system. If both start_pref_force and limit_pref_force are set to 0, then the force will be ignored.
+
+ .. raw:: html
+
+
+ limit_pref_f:
+ | type: ``float`` | ``int``, optional, default: ``1.0``
+ | argument path: ``loss[ener]/limit_pref_f``
+
+ The prefactor of force loss at the limit of the training, Should be larger than or equal to 0. i.e. the training step goes to infinity.
+
+ .. raw:: html
+
+
+ start_pref_v:
+ | type: ``float`` | ``int``, optional, default: ``0.0``
+ | argument path: ``loss[ener]/start_pref_v``
+
+ The prefactor of virial loss at the start of the training. Should be larger than or equal to 0. If set to none-zero value, the virial label should be provided by file virial.npy in each data system. If both start_pref_virial and limit_pref_virial are set to 0, then the virial will be ignored.
+
+ .. raw:: html
+
+
+ limit_pref_v:
+ | type: ``float`` | ``int``, optional, default: ``0.0``
+ | argument path: ``loss[ener]/limit_pref_v``
+
+ The prefactor of virial loss at the limit of the training, Should be larger than or equal to 0. i.e. the training step goes to infinity.
+
+ .. raw:: html
+
+
+ start_pref_ae:
+ | type: ``float`` | ``int``, optional, default: ``0.0``
+ | argument path: ``loss[ener]/start_pref_ae``
+
+ The prefactor of virial loss at the start of the training. Should be larger than or equal to 0. If set to none-zero value, the virial label should be provided by file virial.npy in each data system. If both start_pref_virial and limit_pref_virial are set to 0, then the virial will be ignored.
+
+ .. raw:: html
+
+
+ limit_pref_ae:
+ | type: ``float`` | ``int``, optional, default: ``0.0``
+ | argument path: ``loss[ener]/limit_pref_ae``
+
+ The prefactor of virial loss at the limit of the training, Should be larger than or equal to 0. i.e. the training step goes to infinity.
+
+ .. raw:: html
+
+
+ relative_f:
+ | type: ``float`` | ``NoneType``, optional
+ | argument path: ``loss[ener]/relative_f``
+
+ If provided, relative force error will be used in the loss. The difference of force will be normalized by the magnitude of the force in the label with a shift given by `relative_f`, i.e. DF_i / ( || F || + relative_f ) with DF denoting the difference between prediction and label and || F || denoting the L2 norm of the label.
+
+
+.. raw:: html
+
+
+learning_rate:
+ | type: ``dict``
+ | argument path: ``learning_rate``
+
+ The learning rate options
+
+ .. raw:: html
+
+
+ start_lr:
+ | type: ``float``, optional, default: ``0.001``
+ | argument path: ``learning_rate/start_lr``
+
+ The learning rate the start of the training.
+
+ .. raw:: html
+
+
+ stop_lr:
+ | type: ``float``, optional, default: ``1e-08``
+ | argument path: ``learning_rate/stop_lr``
+
+ The desired learning rate at the end of the training.
+
+ .. raw:: html
+
+
+ decay_steps:
+ | type: ``int``, optional, default: ``5000``
+ | argument path: ``learning_rate/decay_steps``
+
+ The learning rate is decaying every this number of training steps.
+
+
+.. raw:: html
+
+
+training:
+ | type: ``dict``
+ | argument path: ``training``
+
+ The training options
+
+ .. raw:: html
+
+
+ systems:
+ | type: ``list`` | ``str``
+ | argument path: ``training/systems``
+
+ The data systems. This key can be provided with a listthat specifies the systems, or be provided with a string by which the prefix of all systems are given and the list of the systems is automatically generated.
+
+ .. raw:: html
+
+
+ set_prefix:
+ | type: ``str``, optional, default: ``set``
+ | argument path: ``training/set_prefix``
+
+ The prefix of the sets in the systems.
+
+ .. raw:: html
+
+
+ stop_batch:
+ | type: ``int``
+ | argument path: ``training/stop_batch``
+
+ Number of training batch. Each training uses one batch of data.
+
+ .. raw:: html
+
+
+ batch_size:
+ | type: ``int`` | ``list`` | ``str``, optional, default: ``auto``
+ | argument path: ``training/batch_size``
+
+ This key can be
+
+ - list: the length of which is the same as the `systems`. The batch size of each system is given by the elements of the list.
+
+ - int: all `systems` uses the same batch size.
+
+ - string "auto": automatically determines the batch size os that the batch_size times the number of atoms in the system is no less than 32.
+
+ - string "auto:N": automatically determines the batch size os that the batch_size times the number of atoms in the system is no less than N.
+
+ .. raw:: html
+
+
+ seed:
+ | type: ``int`` | ``NoneType``, optional
+ | argument path: ``training/seed``
+
+ The random seed for training.
+
+ .. raw:: html
+
+
+ disp_file:
+ | type: ``str``, optional, default: ``lcueve.out``
+ | argument path: ``training/disp_file``
+
+ The file for printing learning curve.
+
+ .. raw:: html
+
+
+ disp_freq:
+ | type: ``int``, optional, default: ``1000``
+ | argument path: ``training/disp_freq``
+
+ The frequency of printing learning curve.
+
+ .. raw:: html
+
+
+ numb_test:
+ | type: ``int`` | ``list`` | ``str``, optional, default: ``1``
+ | argument path: ``training/numb_test``
+
+ Number of frames used for the test during training.
+
+ .. raw:: html
+
+
+ save_freq:
+ | type: ``int``, optional, default: ``1000``
+ | argument path: ``training/save_freq``
+
+ The frequency of saving check point.
+
+ .. raw:: html
+
+
+ save_ckpt:
+ | type: ``str``, optional, default: ``model.ckpt``
+ | argument path: ``training/save_ckpt``
+
+ The file name of saving check point.
+
+ .. raw:: html
+
+
+ disp_training:
+ | type: ``bool``, optional, default: ``True``
+ | argument path: ``training/disp_training``
+
+ Displaying verbose information during training.
+
+ .. raw:: html
+
+
+ time_training:
+ | type: ``bool``, optional, default: ``True``
+ | argument path: ``training/time_training``
+
+ Timing durining training.
+
+ .. raw:: html
+
+
+ profiling:
+ | type: ``bool``, optional, default: ``False``
+ | argument path: ``training/profiling``
+
+ Profiling during training.
+
+ .. raw:: html
+
+
+ profiling_file:
+ | type: ``str``, optional, default: ``timeline.json``
+ | argument path: ``training/profiling_file``
+
+ Output file for profiling.
+
diff --git a/doc/train-input.rst b/doc/train-input.rst
new file mode 100644
index 0000000000..aa6c7bd01e
--- /dev/null
+++ b/doc/train-input.rst
@@ -0,0 +1,3 @@
+Training parameters
+======================================
+.. include:: train-input-auto.rst
diff --git a/doc/use-deepmd-kit.md b/doc/use-deepmd-kit.md
new file mode 100644
index 0000000000..cbf7a91cbc
--- /dev/null
+++ b/doc/use-deepmd-kit.md
@@ -0,0 +1,352 @@
+- [Use DeePMD-kit](#use-deepmd-kit)
+ - [Prepare data](#prepare-data)
+ - [Train a model](#train-a-model)
+ - [The DeePMD model](#the-deepmd-model)
+ - [The DeepPot-SE model](#the-deeppot-se-model)
+ - [Freeze a model](#freeze-a-model)
+ - [Test a model](#test-a-model)
+ - [Model inference](#model-inference)
+ - [Run MD with Lammps](#run-md-with-lammps)
+ - [Include deepmd in the pair style](#include-deepmd-in-the-pair-style)
+ - [Long-range interaction](#long-range-interaction)
+ - [Run path-integral MD with i-PI](#run-path-integral-md-with-i-pi)
+ - [Use deep potential with ASE](#use-deep-potential-with-ase)
+
+# Use DeePMD-kit
+In this text, we will call the deep neural network that is used to represent the interatomic interactions (Deep Potential) the **model**. The typical procedure of using DeePMD-kit is
+
+1. Prepare data
+2. Train a model
+3. Freeze the model
+4. Test the model
+5. Inference with the model
+
+## Prepare data
+One needs to provide the following information to train a model: the atom type, the simulation box, the atom coordinate, the atom force, system energy and virial. A snapshot of a system that contains these information is called a **frame**. We use the following convention of units:
+
+Property| Unit
+--- | :---:
+Time | ps
+Length | Å
+Energy | eV
+Force | eV/Å
+Pressure| Bar
+
+The frames of the system are stored in two formats. A raw file is a plain text file with each information item written in one file and one frame written on one line. The default files that provide box, coordinate, force, energy and virial are `box.raw`, `coord.raw`, `force.raw`, `energy.raw` and `virial.raw`, respectively. *We recommend you use these file names*. Here is an example of force.raw:
+```bash
+$ cat force.raw
+-0.724 2.039 -0.951 0.841 -0.464 0.363
+ 6.737 1.554 -5.587 -2.803 0.062 2.222
+-1.968 -0.163 1.020 -0.225 -0.789 0.343
+```
+This `force.raw` contains 3 frames with each frame having the forces of 2 atoms, thus it has 3 lines and 6 columns. Each line provides all the 3 force components of 2 atoms in 1 frame. The first three numbers are the 3 force components of the first atom, while the second three numbers are the 3 force components of the second atom. The coordinate file `coord.raw` is organized similarly. In `box.raw`, the 9 components of the box vectors should be provided on each line. In `virial.raw`, the 9 components of the virial tensor should be provided on each line. The number of lines of all raw files should be identical.
+
+We assume that the atom types do not change in all frames. It is provided by `type.raw`, which has one line with the types of atoms written one by one. The atom types should be integers. For example the `type.raw` of a system that has 2 atoms with 0 and 1:
+```bash
+$ cat type.raw
+0 1
+```
+
+The second format is the data sets of `numpy` binary data that are directly used by the training program. User can use the script `$deepmd_source_dir/data/raw/raw_to_set.sh` to convert the prepared raw files to data sets. For example, if we have a raw file that contains 6000 frames,
+```bash
+$ ls
+box.raw coord.raw energy.raw force.raw type.raw virial.raw
+$ $deepmd_source_dir/data/raw/raw_to_set.sh 2000
+nframe is 6000
+nline per set is 2000
+will make 3 sets
+making set 0 ...
+making set 1 ...
+making set 2 ...
+$ ls
+box.raw coord.raw energy.raw force.raw set.000 set.001 set.002 type.raw virial.raw
+```
+It generates three sets `set.000`, `set.001` and `set.002`, with each set contains 2000 frames. The last set (`set.002`) is used as testing set, while the rest sets (`set.000` and `set.001`) are used as training sets. One do not need to take care of the binary data files in each of the `set.*` directories. The path containing `set.*` and `type.raw` is called a *system*.
+
+## Train a model
+
+### Write the input script
+
+The method of training is explained in our [DeePMD][2] and [DeepPot-SE][3] papers. With the source code we provide a small training dataset taken from 400 frames generated by NVT ab-initio water MD trajectory with 300 frames for training and 100 for testing. [An example training parameter file](./examples/water/train/water_se_a.json) is provided. One can try with the training by
+```bash
+$ cd $deepmd_source_dir/examples/water/train/
+$ dp train water_se_a.json
+```
+where `water_se_a.json` is the `json` format parameter file that controls the training. It is also possible to use `yaml` format file with the same keys as json (see `water_se_a.yaml` example). You can use script `json2yaml.py` in `data/json/` dir to convert your json files to yaml. The components of the `water.json` contains four parts, `model`, `learning_rate`, `loss` and `training`.
+
+The `model` section specify how the deep potential model is built. An example of the smooth-edition is provided as follows
+```json
+ "model": {
+ "type_map": ["O", "H"],
+ "descriptor" :{
+ "type": "se_a",
+ "rcut_smth": 5.80,
+ "rcut": 6.00,
+ "sel": [46, 92],
+ "neuron": [25, 50, 100],
+ "axis_neuron": 16,
+ "resnet_dt": false,
+ "seed": 1,
+ "_comment": " that's all"
+ },
+ "fitting_net" : {
+ "neuron": [240, 240, 240],
+ "resnet_dt": true,
+ "seed": 1,
+ "_comment": " that's all"
+ },
+ "_comment": " that's all"
+ }
+```
+The **`type_map`** is optional, which provide the element names (but not restricted to) for corresponding atom types.
+
+The construction of the descriptor is given by option **`descriptor`**. The **`type`** of the descriptor is set to `"se_a"`, which means smooth-edition, angular infomation. The **`rcut`** is the cut-off radius for neighbor searching, and the **`rcut_smth`** gives where the smoothing starts. **`sel`** gives the maximum possible number of neighbors in the cut-off radius. It is a list, the length of which is the same as the number of atom types in the system, and `sel[i]` denote the maximum possible number of neighbors with type `i`. The **`neuron`** specifies the size of the embedding net. From left to right the members denote the sizes of each hidden layers from input end to the output end, respectively. The **`axis_neuron`** specifies the size of submatrix of the embedding matrix, the axis matrix as explained in the [DeepPot-SE paper][3]. If the outer layer is of twice size as the inner layer, then the inner layer is copied and concatenated, then a [ResNet architecture](https://arxiv.org/abs/1512.03385) is build between them. If the option **`resnet_dt`** is set `true`, then a timestep is used in the ResNet. **`seed`** gives the random seed that is used to generate random numbers when initializing the model parameters.
+
+The construction of the fitting net is give by **`fitting_net`**. The key **`neuron`** specifies the size of the fitting net. If two neighboring layers are of the same size, then a [ResNet architecture](https://arxiv.org/abs/1512.03385) is build between them. If the option **`resnet_dt`** is set `true`, then a timestep is used in the ResNet. **`seed`** gives the random seed that is used to generate random numbers when initializing the model parameters.
+
+An example of the `learning_rate` is given as follows
+```json
+ "learning_rate" :{
+ "type": "exp",
+ "start_lr": 0.005,
+ "decay_steps": 5000,
+ "decay_rate": 0.95,
+ "_comment": "that's all"
+ }
+```
+The option **`start_lr`**, **`decay_rate`** and **`decay_steps`** specify how the learning rate changes. For example, the `t`th batch will be trained with learning rate:
+```math
+lr(t) = start_lr * decay_rate ^ ( t / decay_steps )
+```
+
+An example of the `loss` is
+```json
+ "loss" : {
+ "start_pref_e": 0.02,
+ "limit_pref_e": 1,
+ "start_pref_f": 1000,
+ "limit_pref_f": 1,
+ "start_pref_v": 0,
+ "limit_pref_v": 0,
+ "_comment": " that's all"
+ }
+```
+The options **`start_pref_e`**, **`limit_pref_e`**, **`start_pref_f`**, **`limit_pref_f`**, **`start_pref_v`** and **`limit_pref_v`** determine how the prefactors of energy error, force error and virial error changes in the loss function (see the appendix of the [DeePMD paper][2] for details). Taking the prefactor of force error for example, the prefactor at batch `t` is
+```math
+w_f(t) = start_pref_f * ( lr(t) / start_lr ) + limit_pref_f * ( 1 - lr(t) / start_lr )
+```
+Since we do not have virial data, the virial prefactors `start_pref_v` and `limit_pref_v` are set to 0.
+
+An example of `training` is
+```json
+ "training" : {
+ "systems": ["../data1/", "../data2/"],
+ "set_prefix": "set",
+ "stop_batch": 1000000,
+ "_comment": " batch_size can be supplied with, e.g. 1, or auto (string) or [10, 20]",
+ "batch_size": 1,
+
+ "seed": 1,
+
+ "_comment": " display and restart",
+ "_comment": " frequencies counted in batch",
+ "disp_file": "lcurve.out",
+ "disp_freq": 100,
+ "_comment": " numb_test can be supplied with, e.g. 1, or XX% (string) or [10, 20]",
+ "numb_test": 10,
+ "save_freq": 1000,
+ "save_ckpt": "model.ckpt",
+ "load_ckpt": "model.ckpt",
+ "disp_training":true,
+ "time_training":true,
+ "profiling": false,
+ "profiling_file":"timeline.json",
+ "_comment": "that's all"
+ }
+```
+The option **`systems`** provide location of the systems (path to `set.*` and `type.raw`). It is a vector, thus DeePMD-kit allows you to provide multiple systems. DeePMD-kit will train the model with the systems in the vector one by one in a cyclic manner. **It is warned that the example water data (in folder `examples/data/water`) is of very limited amount, is provided only for testing purpose, and should not be used to train a productive model.**
+
+The option **`batch_size`** specifies the number of frames in each batch. It can be set to `"auto"` to enable a automatic batch size or it can be input as a list setting batch size individually for each system.
+The option **`stop_batch`** specifies the total number of batches will be used in the training.
+
+The option **`numb_test`** specifies the number of tests that will be used for each system. If it is an integer each system will be tested with the same number of tests. It can be set to percentage `"XX%"` to use XX% of frames of each system for its testing or it can be input as a list setting numer of tests individually for each system (the order should correspond to ordering of the systems key in json).
+
+### Training
+
+The training can be invoked by
+```bash
+$ dp train water_se_a.json
+```
+
+During the training, the error of the model is tested every **`disp_freq`** batches with **`numb_test`** frames from the last set in the **`systems`** directory on the fly, and the results are output to **`disp_file`**. A typical `disp_file` looks like
+```bash
+# batch l2_tst l2_trn l2_e_tst l2_e_trn l2_f_tst l2_f_trn lr
+ 0 2.67e+01 2.57e+01 2.21e-01 2.22e-01 8.44e-01 8.12e-01 1.0e-03
+ 100 6.14e+00 5.40e+00 3.01e-01 2.99e-01 1.93e-01 1.70e-01 1.0e-03
+ 200 5.02e+00 4.49e+00 1.53e-01 1.53e-01 1.58e-01 1.42e-01 1.0e-03
+ 300 4.36e+00 3.71e+00 7.32e-02 7.27e-02 1.38e-01 1.17e-01 1.0e-03
+ 400 4.04e+00 3.29e+00 3.16e-02 3.22e-02 1.28e-01 1.04e-01 1.0e-03
+```
+The first column displays the number of batches. The second and third columns display the loss function evaluated by `numb_test` frames randomly chosen from the test set and that evaluated by the current training batch, respectively. The fourth and fifth columns display the RMS energy error (normalized by number of atoms) evaluated by `numb_test` frames randomly chosen from the test set and that evaluated by the current training batch, respectively. The sixth and seventh columns display the RMS force error (component-wise) evaluated by `numb_test` frames randomly chosen from the test set and that evaluated by the current training batch, respectively. The last column displays the current learning rate.
+
+Checkpoints will be written to files with prefix **`save_ckpt`** every **`save_freq`** batches. If **`restart`** is set to `true`, then the training will start from the checkpoint named **`load_ckpt`**, rather than from scratch.
+
+Several command line options can be passed to `dp train`, which can be checked with
+```bash
+$ dp train --help
+```
+An explanation will be provided
+```
+positional arguments:
+ INPUT the input json database
+
+optional arguments:
+ -h, --help show this help message and exit
+ --init-model INIT_MODEL
+ Initialize a model by the provided checkpoint
+ --restart RESTART Restart the training from the provided checkpoint
+```
+The keys `intra_op_parallelism_threads` and `inter_op_parallelism_threads` are Tensorflow configurations for multithreading, which are explained [here](https://www.tensorflow.org/performance/performance_guide#optimizing_for_cpu). Skipping `-t` and `OMP_NUM_THREADS` leads to the default setting of these keys in the Tensorflow.
+
+**`--init-model model.ckpt`**, for example, initializes the model training with an existing model that is stored in the checkpoint `model.ckpt`, the network architectures should match.
+
+**`--restart model.ckpt`**, continues the training from the checkpoint `model.ckpt`.
+
+On some resources limited machines, one may want to control the number of threads used by DeePMD-kit. This is achieved by three environmental variables: `OMP_NUM_THREADS`, `TF_INTRA_OP_PARALLELISM_THREADS` and `TF_INTER_OP_PARALLELISM_THREADS`. `OMP_NUM_THREADS` controls the multithreading of DeePMD-kit implemented operations. `TF_INTRA_OP_PARALLELISM_THREADS` and `TF_INTER_OP_PARALLELISM_THREADS` controls `intra_op_parallelism_threads` and `inter_op_parallelism_threads`, which are Tensorflow configurations for multithreading. An explanation is found [here](https://stackoverflow.com/questions/41233635/meaning-of-inter-op-parallelism-threads-and-intra-op-parallelism-threads).
+
+For example if you wish to use 3 cores of 2 CPUs on one node, you may set the environmental variables and run DeePMD-kit as follows:
+```bash
+export OMP_NUM_THREADS=6
+export TF_INTRA_OP_PARALLELISM_THREADS=3
+export TF_INTER_OP_PARALLELISM_THREADS=2
+dp train input.json
+```
+
+## Freeze a model
+
+The trained neural network is extracted from a checkpoint and dumped into a database. This process is called "freezing" a model. The idea and part of our code are from [Morgan](https://blog.metaflow.fr/tensorflow-how-to-freeze-a-model-and-serve-it-with-a-python-api-d4f3596b3adc). To freeze a model, typically one does
+```bash
+$ dp freeze -o graph.pb
+```
+in the folder where the model is trained. The output database is called `graph.pb`.
+
+
+## Test a model
+
+The frozen model can be used in many ways. The most straightforward test can be performed using `dp test`. A typical usage of `dp test` is
+```bash
+dp test -m graph.pb -s /path/to/system -n 30
+```
+where `-m` gives the tested model, `-s` the path to the tested system and `-n` the number of tested frames. Several other command line options can be passed to `dp test`, which can be checked with
+```bash
+$ dp test --help
+```
+An explanation will be provided
+```
+usage: dp test [-h] [-m MODEL] [-s SYSTEM] [-S SET_PREFIX] [-n NUMB_TEST]
+ [-r RAND_SEED] [--shuffle-test] [-d DETAIL_FILE]
+
+optional arguments:
+ -h, --help show this help message and exit
+ -m MODEL, --model MODEL
+ Frozen model file to import
+ -s SYSTEM, --system SYSTEM
+ The system dir
+ -S SET_PREFIX, --set-prefix SET_PREFIX
+ The set prefix
+ -n NUMB_TEST, --numb-test NUMB_TEST
+ The number of data for test
+ -r RAND_SEED, --rand-seed RAND_SEED
+ The random seed
+ --shuffle-test Shuffle test data
+ -d DETAIL_FILE, --detail-file DETAIL_FILE
+ The file containing details of energy force and virial
+ accuracy
+```
+
+## Model inference
+One may use the python interface of DeePMD-kit for model inference, an example is given as follows
+```python
+import deepmd.DeepPot as DP
+import numpy as np
+dp = DP('graph.pb')
+coord = np.array([[1,0,0], [0,0,1.5], [1,0,3]]).reshape([1, -1])
+cell = np.diag(10 * np.ones(3)).reshape([1, -1])
+atype = [1,0,1]
+e, f, v = dp.eval(coord, cell, atype)
+```
+where `e`, `f` and `v` are predicted energy, force and virial of the system, respectively.
+
+
+## Run MD with LAMMPS
+### Include deepmd in the pair style
+Running an MD simulation with LAMMPS is simpler. In the LAMMPS input file, one needs to specify the pair style as follows
+```bash
+pair_style deepmd graph.pb
+pair_coeff
+```
+where `graph.pb` is the file name of the frozen model. The `pair_coeff` should be left blank. It should be noted that LAMMPS counts atom types starting from 1, therefore, all LAMMPS atom type will be firstly subtracted by 1, and then passed into the DeePMD-kit engine to compute the interactions. [A detailed documentation of this pair style is available.](doc/lammps-pair-style-deepmd.md).
+
+### Long-range interaction
+The reciprocal space part of the long-range interaction can be calculated by LAMMPS command `kspace_style`. To use it with DeePMD-kit, one writes
+```bash
+pair_style deepmd graph.pb
+pair_coeff
+kspace_style pppm 1.0e-5
+kspace_modify gewald 0.45
+```
+Please notice that the DeePMD does nothing to the direct space part of the electrostatic interaction, because this part is assumed to be fitted in the DeePMD model (the direct space cut-off is thus the cut-off of the DeePMD model). The splitting parameter `gewald` is modified by the `kspace_modify` command.
+
+## Run path-integral MD with i-PI
+The i-PI works in a client-server model. The i-PI provides the server for integrating the replica positions of atoms, while the DeePMD-kit provides a client named `dp_ipi` that computes the interactions (including energy, force and virial). The server and client communicates via the Unix domain socket or the Internet socket. The client can be started by
+```bash
+$ dp_ipi water.json
+```
+It is noted that multiple instances of the client is allow for computing, in parallel, the interactions of multiple replica of the path-integral MD.
+
+`water.json` is the parameter file for the client `dp_ipi`, and [an example](./examples/ipi/water.json) is provided:
+```json
+{
+ "verbose": false,
+ "use_unix": true,
+ "port": 31415,
+ "host": "localhost",
+ "graph_file": "graph.pb",
+ "coord_file": "conf.xyz",
+ "atom_type" : {
+ "OW": 0,
+ "HW1": 1,
+ "HW2": 1
+ }
+}
+```
+The option **`use_unix`** is set to `true` to activate the Unix domain socket, otherwise, the Internet socket is used.
+
+The option **`graph_file`** provides the file name of the frozen model.
+
+The `dp_ipi` gets the atom names from an [XYZ file](https://en.wikipedia.org/wiki/XYZ_file_format) provided by **`coord_file`** (meanwhile ignores all coordinates in it), and translates the names to atom types by rules provided by **`atom_type`**.
+
+## Use deep potential with ASE
+
+Deep potential can be set up as a calculator with ASE to obtain potential energies and forces.
+```python
+from ase import Atoms
+from deepmd.calculator import DP
+
+water = Atoms('H2O',
+ positions=[(0.7601, 1.9270, 1),
+ (1.9575, 1, 1),
+ (1., 1., 1.)],
+ cell=[100, 100, 100],
+ calculator=DP(model="frozen_model.pb"))
+print(water.get_potential_energy())
+print(water.get_forces())
+```
+
+Optimization is also available:
+```python
+from ase.optimize import BFGS
+dyn = BFGS(water)
+dyn.run(fmax=1e-6)
+print(water.get_positions())
+```
diff --git a/examples/water/train/water_se_a.yaml b/examples/water/train/water_se_a.yaml
new file mode 100644
index 0000000000..b92bf2ab4e
--- /dev/null
+++ b/examples/water/train/water_se_a.yaml
@@ -0,0 +1,68 @@
+# model parameters
+model:
+ type_map:
+ - O
+ - H
+ descriptor:
+ type: se_a
+ sel:
+ - 46
+ - 92
+ rcut_smth: 5.8
+ rcut: 6.0
+ neuron:
+ - 25
+ - 50
+ - 100
+ resnet_dt: false
+ axis_neuron: 16
+ seed: 1
+ # that's all for descriptor
+ fitting_net:
+ neuron:
+ - 240
+ - 240
+ - 240
+ resnet_dt: true
+ seed: 1
+ # that's all for fitting net
+ # that's all for model
+
+learning_rate:
+ type: exp
+ decay_steps: 5000
+ start_lr: 0.001
+ stop_lr: 3.51e-08
+ # that's all for learnnig rate
+
+loss:
+ start_pref_e: 0.02
+ limit_pref_e: 1
+ start_pref_f: 1000
+ limit_pref_f: 1
+ start_pref_v: 0
+ limit_pref_v: 0
+ # that's all for loss
+
+# training contols
+training:
+ systems:
+ - ../data/
+ set_prefix: set
+ stop_batch: 1000000
+ batch_size: 1
+ seed: 1
+ # display and restart
+ # frequencies counted in batch
+ disp_file: lcurve.out
+ disp_freq: 100
+ numb_test: 10
+ save_freq: 1000
+ save_ckpt: model.ckpt
+ load_ckpt: model.ckpt
+ disp_training: true
+ time_training: true
+ profiling: false
+ profiling_file: timeline.json
+ # that's all for training
+# that's all
\ No newline at end of file
diff --git a/setup.py b/setup.py
index 858f84aa5a..885022d196 100644
--- a/setup.py
+++ b/setup.py
@@ -4,7 +4,9 @@
from setuptools_scm import get_version
from packaging.version import LegacyVersion
from os import path, makedirs
-import os, imp, sys, platform, sysconfig
+import os, importlib
+import pkg_resources
+from distutils.util import get_platform
readme_file = path.join(path.dirname(path.abspath(__file__)), 'README.md')
@@ -15,18 +17,32 @@
with open(readme_file) as f:
readme = f.read()
-try:
- tf_install_dir = imp.find_module('tensorflow')[1]
-except ImportError:
- site_packages_path = path.join(path.dirname(path.__file__), 'site-packages')
- tf_install_dir = imp.find_module('tensorflow', [site_packages_path])[1]
+install_requires=['numpy', 'scipy', 'pyyaml', 'dargs']
+setup_requires=['setuptools_scm', 'scikit-build']
-install_requires=['numpy', 'scipy']
-setup_requires=['setuptools_scm', 'scikit-build', 'cmake']
+tf_version = os.environ.get('TENSORFLOW_VERSION', '2.3')
+if LegacyVersion(tf_version) < LegacyVersion("1.15") or (LegacyVersion(tf_version) >= LegacyVersion("2.0") and LegacyVersion(tf_version) < LegacyVersion("2.1")):
+ extras_require = {"cpu": ["tensorflow==" + tf_version], "gpu": ["tensorflow-gpu==" + tf_version]}
+else:
+ extras_require = {"cpu": ["tensorflow-cpu==" + tf_version], "gpu": ["tensorflow==" + tf_version]}
+tf_spec = importlib.util.find_spec("tensorflow")
+if tf_spec:
+ tf_install_dir = tf_spec.submodule_search_locations[0]
+else:
+ site_packages_path = path.join(path.dirname(path.__file__), 'site-packages')
+ tf_spec = importlib.machinery.FileFinder(site_packages_path).find_spec("tensorflow")
+ if tf_spec:
+ tf_install_dir = tf_spec.submodule_search_locations[0]
+ else:
+ setup_requires.append("tensorflow==" + tf_version)
+ tf_install_dir = path.join(path.dirname(path.abspath(__file__)), '.egg',
+ pkg_resources.Distribution(project_name="tensorflow", version=tf_version,
+ platform=get_platform()).egg_name(),
+ 'tensorflow')
-# add cmake as a build requirement if cmake>3.0 is not installed
+# add cmake as a build requirement if cmake>3.7 is not installed
try:
- if LegacyVersion(get_cmake_version()) < LegacyVersion("3.0"):
+ if LegacyVersion(get_cmake_version()) < LegacyVersion("3.7"):
setup_requires.append('cmake')
except SKBuildError:
setup_requires.append('cmake')
@@ -64,6 +80,8 @@
cmake_minimum_required_version='3.0',
extras_require={
'test': ['dpdata>=0.1.9'],
+ 'docs': ['sphinx', 'recommonmark', 'sphinx_rtd_theme'],
+ **extras_require,
},
entry_points={
'console_scripts': ['dp = deepmd.main:main']
diff --git a/source/CMakeLists.txt b/source/CMakeLists.txt
index 84c3d326da..dc35ee5dc0 100644
--- a/source/CMakeLists.txt
+++ b/source/CMakeLists.txt
@@ -125,7 +125,7 @@ else()
endif()
endif()
if (USE_CUDA_TOOLKIT)
- add_definitions("-DUSE_CUDA_TOOLKIT")
+ add_definitions("-D GOOGLE_CUDA")
endif()
# define USE_TTM
diff --git a/source/lib/include/CustomeOperation.h b/source/lib/include/CustomeOperation.h
new file mode 100644
index 0000000000..c446db8130
--- /dev/null
+++ b/source/lib/include/CustomeOperation.h
@@ -0,0 +1,572 @@
+#pragma once
+#include
+#include
+#include
+#include
+#include "MathUtilities.h"
+#if GOOGLE_CUDA
+#include "DeviceFunctor.h"
+#endif // GOOGLE_CUDA
+
+using CPUDevice = Eigen::ThreadPoolDevice;
+using GPUDevice = Eigen::GpuDevice;
+
+struct NeighborInfo {
+ int type;
+ double dist;
+ int index;
+ NeighborInfo () : type (0), dist(0), index(0) {}
+ NeighborInfo (int tt, double dd, int ii) : type (tt), dist(dd), index(ii) {}
+
+ bool operator < (const NeighborInfo & b) const {
+ return (type < b.type || (type == b.type && (dist < b.dist || (dist == b.dist && index < b.index))));
+ }
+};
+
+template
+inline void spline5_switch (
+ FPTYPE & vv,
+ FPTYPE & dd,
+ const FPTYPE & xx,
+ const float & rmin,
+ const float & rmax)
+{
+ if (xx < rmin) {
+ dd = 0;
+ vv = 1;
+ }
+ else if (xx < rmax) {
+ FPTYPE uu = (xx - rmin) / (rmax - rmin) ;
+ FPTYPE du = 1. / (rmax - rmin) ;
+ vv = uu*uu*uu * (-6 * uu*uu + 15 * uu - 10) + 1;
+ dd = ( 3 * uu*uu * (-6 * uu*uu + 15 * uu - 10) + uu*uu*uu * (-12 * uu + 15) ) * du;
+ }
+ else {
+ dd = 0;
+ vv = 0;
+ }
+}
+
+template
+int format_nlist_fill_se_a_cpu (
+ vector & fmt_nei_idx_a,
+ const vector & posi,
+ const int & ntypes,
+ const vector & type,
+ const int & i_idx,
+ const vector & nei_idx_a,
+ const float & rcut,
+ const vector & sec_a)
+{
+ fmt_nei_idx_a.resize (sec_a.back());
+ fill (fmt_nei_idx_a.begin(), fmt_nei_idx_a.end(), -1);
+
+ // gether all neighbors
+ std::vector nei_idx (nei_idx_a);
+ // allocate the information for all neighbors
+ vector sel_nei;
+ sel_nei.reserve (nei_idx_a.size());
+ for (unsigned kk = 0; kk < nei_idx.size(); ++kk) {
+ FPTYPE diff[3];
+ const int & j_idx = nei_idx[kk];
+ for (int dd = 0; dd < 3; ++dd) {
+ diff[dd] = posi[j_idx * 3 + dd] - posi[i_idx * 3 + dd];
+ }
+ FPTYPE rr = sqrt(MathUtilities::dot (diff, diff));
+ if (rr <= rcut) {
+ sel_nei.push_back(NeighborInfo(type[j_idx], rr, j_idx));
+ }
+ }
+ sort(sel_nei.begin(), sel_nei.end());
+
+ std::vector nei_iter = sec_a;
+ int overflowed = -1;
+ for (unsigned kk = 0; kk < sel_nei.size(); ++kk) {
+ const int & nei_type = sel_nei[kk].type;
+ if (nei_iter[nei_type] < sec_a[nei_type+1]) {
+ fmt_nei_idx_a[nei_iter[nei_type] ++] = sel_nei[kk].index;
+ }
+ }
+ return overflowed;
+}
+
+template
+void compute_descriptor_se_a_cpu (
+ vector & descrpt_a,
+ vector & descrpt_a_deriv,
+ vector & rij_a,
+ const vector & posi,
+ const int & ntypes,
+ const vector & type,
+ const int & i_idx,
+ const vector & fmt_nlist_a,
+ const vector & sec_a,
+ const float & rmin,
+ const float & rmax)
+{
+ // compute the diff of the neighbors
+ rij_a.resize (sec_a.back() * 3);
+ fill (rij_a.begin(), rij_a.end(), 0.0);
+ for (int ii = 0; ii < int(sec_a.size()) - 1; ++ii) {
+ for (int jj = sec_a[ii]; jj < sec_a[ii + 1]; ++jj) {
+ if (fmt_nlist_a[jj] < 0) break;
+ const int & j_idx = fmt_nlist_a[jj];
+
+ for (int dd = 0; dd < 3; ++dd) {
+ rij_a[jj * 3 + dd] = posi[j_idx * 3 + dd] - posi[i_idx * 3 + dd];
+ }
+ }
+ }
+ // 1./rr, cos(theta), cos(phi), sin(phi)
+ descrpt_a.resize (sec_a.back() * 4);
+ fill (descrpt_a.begin(), descrpt_a.end(), 0.0);
+ // deriv wrt center: 3
+ descrpt_a_deriv.resize (sec_a.back() * 4 * 3);
+ fill (descrpt_a_deriv.begin(), descrpt_a_deriv.end(), 0.0);
+
+ for (int sec_iter = 0; sec_iter < int(sec_a.size()) - 1; ++sec_iter) {
+ for (int nei_iter = sec_a[sec_iter]; nei_iter < sec_a[sec_iter+1]; ++nei_iter) {
+ if (fmt_nlist_a[nei_iter] < 0) break;
+ const FPTYPE * rr = &rij_a[nei_iter * 3];
+ FPTYPE nr2 = MathUtilities::dot(rr, rr);
+ FPTYPE inr = 1./sqrt(nr2);
+ FPTYPE nr = nr2 * inr;
+ FPTYPE inr2 = inr * inr;
+ FPTYPE inr4 = inr2 * inr2;
+ FPTYPE inr3 = inr4 * nr;
+ FPTYPE sw, dsw;
+ spline5_switch(sw, dsw, nr, rmin, rmax);
+ int idx_deriv = nei_iter * 4 * 3; // 4 components time 3 directions
+ int idx_value = nei_iter * 4; // 4 components
+ // 4 value components
+ descrpt_a[idx_value + 0] = 1./nr;
+ descrpt_a[idx_value + 1] = rr[0] / nr2;
+ descrpt_a[idx_value + 2] = rr[1] / nr2;
+ descrpt_a[idx_value + 3] = rr[2] / nr2;
+ // deriv of component 1/r
+ descrpt_a_deriv[idx_deriv + 0] = rr[0] * inr3 * sw - descrpt_a[idx_value + 0] * dsw * rr[0] * inr;
+ descrpt_a_deriv[idx_deriv + 1] = rr[1] * inr3 * sw - descrpt_a[idx_value + 0] * dsw * rr[1] * inr;
+ descrpt_a_deriv[idx_deriv + 2] = rr[2] * inr3 * sw - descrpt_a[idx_value + 0] * dsw * rr[2] * inr;
+ // deriv of component x/r2
+ descrpt_a_deriv[idx_deriv + 3] = (2. * rr[0] * rr[0] * inr4 - inr2) * sw - descrpt_a[idx_value + 1] * dsw * rr[0] * inr;
+ descrpt_a_deriv[idx_deriv + 4] = (2. * rr[0] * rr[1] * inr4 ) * sw - descrpt_a[idx_value + 1] * dsw * rr[1] * inr;
+ descrpt_a_deriv[idx_deriv + 5] = (2. * rr[0] * rr[2] * inr4 ) * sw - descrpt_a[idx_value + 1] * dsw * rr[2] * inr;
+ // deriv of component y/r2
+ descrpt_a_deriv[idx_deriv + 6] = (2. * rr[1] * rr[0] * inr4 ) * sw - descrpt_a[idx_value + 2] * dsw * rr[0] * inr;
+ descrpt_a_deriv[idx_deriv + 7] = (2. * rr[1] * rr[1] * inr4 - inr2) * sw - descrpt_a[idx_value + 2] * dsw * rr[1] * inr;
+ descrpt_a_deriv[idx_deriv + 8] = (2. * rr[1] * rr[2] * inr4 ) * sw - descrpt_a[idx_value + 2] * dsw * rr[2] * inr;
+ // deriv of component z/r2
+ descrpt_a_deriv[idx_deriv + 9] = (2. * rr[2] * rr[0] * inr4 ) * sw - descrpt_a[idx_value + 3] * dsw * rr[0] * inr;
+ descrpt_a_deriv[idx_deriv +10] = (2. * rr[2] * rr[1] * inr4 ) * sw - descrpt_a[idx_value + 3] * dsw * rr[1] * inr;
+ descrpt_a_deriv[idx_deriv +11] = (2. * rr[2] * rr[2] * inr4 - inr2) * sw - descrpt_a[idx_value + 3] * dsw * rr[2] * inr;
+ // 4 value components
+ descrpt_a[idx_value + 0] *= sw;
+ descrpt_a[idx_value + 1] *= sw;
+ descrpt_a[idx_value + 2] *= sw;
+ descrpt_a[idx_value + 3] *= sw;
+ }
+ }
+}
+
+template
+void DescrptSeACPULauncher(const FPTYPE * coord, const int * type, const int * ilist, const int * jrange, const int * jlist, const FPTYPE * avg, const FPTYPE * std, FPTYPE * descrpt, FPTYPE * descrpt_deriv, FPTYPE * rij, int * nlist, const int nloc, const int nall, const int nnei, const int ntypes, const int ndescrpt, const float rcut_r, const float rcut_r_smth, const std::vector sec_a, const bool fill_nei_a, const int magic_number) {
+ // set & normalize coord
+ std::vector d_coord3(nall * 3);
+ for (int ii = 0; ii < nall; ++ii) {
+ for (int dd = 0; dd < 3; ++dd) {
+ d_coord3[ii * 3 + dd] = coord[ii * 3 + dd];
+ }
+ }
+
+ // set type
+ std::vector d_type (nall);
+ for (int ii = 0; ii < nall; ++ii) {
+ d_type[ii] = type[ii];
+ }
+
+ // build nlist
+ std::vector > d_nlist_a(nloc);
+
+ for (unsigned ii = 0; ii < nloc; ++ii) {
+ d_nlist_a.reserve (jrange[nloc] / nloc + 10);
+ }
+ for (unsigned ii = 0; ii < nloc; ++ii) {
+ int i_idx = ilist[ii];
+ for (unsigned jj = jrange[ii]; jj < jrange[ii+1]; ++jj) {
+ int j_idx = jlist[jj];
+ d_nlist_a[i_idx].push_back (j_idx);
+ }
+ }
+
+ #pragma omp parallel for
+ for (int ii = 0; ii < nloc; ++ii) {
+ vector fmt_nlist_a;
+ int ret = -1;
+ if (fill_nei_a) {
+ format_nlist_fill_se_a_cpu(fmt_nlist_a, d_coord3, ntypes, d_type, ii, d_nlist_a[ii], rcut_r, sec_a);
+ }
+ std::vector d_descrpt_a;
+ std::vector d_descrpt_a_deriv;
+ std::vector d_descrpt_r;
+ std::vector d_descrpt_r_deriv;
+ std::vector d_rij_a;
+ compute_descriptor_se_a_cpu (d_descrpt_a, d_descrpt_a_deriv, d_rij_a, d_coord3, ntypes, d_type, ii, fmt_nlist_a, sec_a, rcut_r_smth, rcut_r);
+
+ // check sizes
+ assert (d_descrpt_a.size() == ndescrpt);
+ assert (d_descrpt_a_deriv.size() == ndescrpt * 3);
+ assert (d_rij_a.size() == nnei * 3);
+ assert (fmt_nlist_a.size() == nnei);
+ // record outputs
+ for (int jj = 0; jj < ndescrpt; ++jj) {
+ descrpt[ii * ndescrpt + jj] = (d_descrpt_a[jj] - avg[d_type[ii] * ndescrpt + jj]) / std[d_type[ii] * ndescrpt + jj];
+ }
+ for (int jj = 0; jj < ndescrpt * 3; ++jj) {
+ descrpt_deriv[ii * ndescrpt * 3 + jj] = d_descrpt_a_deriv[jj] / std[d_type[ii] * ndescrpt + jj / 3];
+ }
+ for (int jj = 0; jj < nnei * 3; ++jj) {
+ rij[ii * nnei * 3 + jj] = d_rij_a[jj];
+ }
+ for (int jj = 0; jj < nnei; ++jj) {
+ nlist[ii * nnei + jj] = fmt_nlist_a[jj];
+ }
+ }
+}
+
+#if GOOGLE_CUDA
+template
+void DescrptSeAGPULauncher(const FPTYPE * coord, const int * type, const int * ilist, const int * jrange, const int * jlist, int * array_int, unsigned long long * array_longlong, const FPTYPE * avg, const FPTYPE * std, FPTYPE * descrpt, FPTYPE * descrpt_deriv, FPTYPE * rij, int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const float rcut_r, const float rcut_r_smth, const std::vector sec_a, const bool fill_nei_a, const int magic_number) {
+ DescrptSeAGPUExecuteFunctor()(coord, type, ilist, jrange, jlist, array_int, array_longlong, avg, std, descrpt, descrpt_deriv, rij, nlist, nloc, nall, nnei, ndescrpt, rcut_r, rcut_r_smth, sec_a, fill_nei_a, magic_number);
+}
+#endif // GOOGLE_CUDA
+// ******************************************************************************
+// end of custome op DescrptSeA
+// ******************************************************************************
+
+inline void make_descript_range (int & idx_start, int & idx_end, const int & nei_idx, const int& n_a_sel, const int n_a_shift) {
+ if (nei_idx < n_a_sel) {
+ idx_start = nei_idx * 4;
+ idx_end = nei_idx * 4 + 4;
+ }
+ else {
+ idx_start = n_a_shift + (nei_idx - n_a_sel);
+ idx_end = n_a_shift + (nei_idx - n_a_sel) + 1;
+ }
+}
+
+template
+void ProdForceSeACPULauncher(FPTYPE * force, const FPTYPE * net_deriv, const FPTYPE * in_deriv, const int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const int n_a_sel, const int n_a_shift) {
+ memset(force, 0.0, sizeof(FPTYPE) * nall * 3);
+ // compute force of a frame
+ for (int i_idx = 0; i_idx < nloc; ++i_idx) {
+ // deriv wrt center atom
+ for (int aa = 0; aa < ndescrpt; ++aa) {
+ force[i_idx * 3 + 0] -= net_deriv[i_idx * ndescrpt + aa] * in_deriv[i_idx * ndescrpt * 3 + aa * 3 + 0];
+ force[i_idx * 3 + 1] -= net_deriv[i_idx * ndescrpt + aa] * in_deriv[i_idx * ndescrpt * 3 + aa * 3 + 1];
+ force[i_idx * 3 + 2] -= net_deriv[i_idx * ndescrpt + aa] * in_deriv[i_idx * ndescrpt * 3 + aa * 3 + 2];
+ }
+ // deriv wrt neighbors
+ for (int jj = 0; jj < nnei; ++jj) {
+ int j_idx = nlist[i_idx * nnei + jj];
+ if (j_idx < 0) continue;
+ int aa_start, aa_end;
+ make_descript_range (aa_start, aa_end, jj, n_a_sel, n_a_shift);
+ for (int aa = aa_start; aa < aa_end; ++aa) {
+ force[j_idx * 3 + 0] += net_deriv[i_idx * ndescrpt + aa] * in_deriv[i_idx * ndescrpt * 3 + aa * 3 + 0];
+ force[j_idx * 3 + 1] += net_deriv[i_idx * ndescrpt + aa] * in_deriv[i_idx * ndescrpt * 3 + aa * 3 + 1];
+ force[j_idx * 3 + 2] += net_deriv[i_idx * ndescrpt + aa] * in_deriv[i_idx * ndescrpt * 3 + aa * 3 + 2];
+ }
+ }
+ }
+}
+
+#if GOOGLE_CUDA
+template
+void ProdForceSeAGPULauncher(FPTYPE * force, const FPTYPE * net_deriv, const FPTYPE * in_deriv, const int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const int n_a_sel, const int n_a_shift) {
+ ProdForceSeAGPUExecuteFunctor()(force, net_deriv, in_deriv, nlist, nloc, nall, nnei, ndescrpt, n_a_sel, n_a_shift);
+}
+#endif // GOOGLE_CUDA
+
+// ******************************************************************************
+// end of custome op ProdForceSeA
+// ******************************************************************************
+
+template
+void ProdVirialSeACPULauncher(FPTYPE * virial, FPTYPE * atom_virial, const FPTYPE * net_deriv, const FPTYPE * in_deriv, const FPTYPE * rij, const int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const int n_a_sel, const int n_a_shift) {
+ memset(virial, 0.0, sizeof(FPTYPE) * 9);
+ memset(atom_virial, 0.0, sizeof(FPTYPE) * nall * 9);
+
+ // compute virial of a frame
+ for (int i_idx = 0; i_idx < nloc; ++i_idx) {
+ // deriv wrt neighbors
+ for (int jj = 0; jj < nnei; ++jj) {
+ int j_idx = nlist[i_idx * nnei + jj];
+ if (j_idx < 0) continue;
+ int aa_start, aa_end;
+ make_descript_range (aa_start, aa_end, jj, n_a_sel, n_a_shift);
+ for (int aa = aa_start; aa < aa_end; ++aa) {
+ FPTYPE pref = -1.0 * net_deriv[i_idx * ndescrpt + aa];
+ for (int dd0 = 0; dd0 < 3; ++dd0)
+ for (int dd1 = 0; dd1 < 3; ++dd1) {
+ FPTYPE tmp_v = pref * rij[i_idx * nnei * 3 + jj * 3 + dd1] * in_deriv[i_idx * ndescrpt * 3 + aa * 3 + dd0];
+ virial[dd0 * 3 + dd1] -= tmp_v;
+ atom_virial[j_idx * 9 + dd0 * 3 + dd1] -= tmp_v;
+ }
+ }
+ }
+ }
+}
+
+#if GOOGLE_CUDA
+template
+void ProdVirialSeAGPULauncher(FPTYPE * virial, FPTYPE * atom_virial, const FPTYPE * net_deriv, const FPTYPE * in_deriv, const FPTYPE * rij, const int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const int n_a_sel, const int n_a_shift) {
+ ProdVirialSeAGPUExecuteFunctor()(virial, atom_virial, net_deriv, in_deriv, rij, nlist, nloc, nall, nnei, ndescrpt, n_a_sel, n_a_shift);
+}
+#endif // GOOGLE_CUDA
+// ******************************************************************************
+// end of custome op ProdVirialSeA
+// ******************************************************************************
+
+template
+void GeluCPULauncher(const FPTYPE * in, FPTYPE * out, int const size) {
+ for (int ii = 0; ii < size; ii++) {
+ out[ii] = in[ii] * 0.5 * (1.0 + tanh(SQRT_2_PI * (in[ii] + 0.044715 * in[ii] * in[ii] *in[ii])));
+ }
+}
+
+template
+void GeluGradCPULauncher(const FPTYPE * dy, const FPTYPE * in, FPTYPE * out, int const size) {
+ for (int ii = 0; ii < size; ii++) {
+ FPTYPE const var1 = tanh(SQRT_2_PI * (in[ii] + 0.044715 * in[ii] * in[ii] *in[ii]));
+ out[ii] = dy[ii] * (0.5 * SQRT_2_PI * in[ii] * (1 - var1 * var1) * (0.134145 * in[ii] * in[ii] + 1) + 0.5 * var1 + 0.5);
+ }
+}
+
+template
+void GeluGradGradCPULauncher(const FPTYPE * dy, const FPTYPE * dy_, const FPTYPE * in, FPTYPE * out, int const size) {
+ for (int ii = 0; ii < size; ii++) {
+ FPTYPE const var1 = tanh(SQRT_2_PI * (in[ii] + 0.044715 * in[ii] * in[ii] *in[ii]));
+ FPTYPE const var2 = SQRT_2_PI * (1 - var1 * var1) * (0.134145 * in[ii] * in[ii] + 1);
+ out[ii] = dy[ii] * dy_[ii] * (0.134145 * SQRT_2_PI * in[ii] * in[ii] * (1 - var1 * var1) - SQRT_2_PI * in[ii] * var2 * (0.134145 * in[ii] * in[ii] + 1) * var1 + var2);
+ }
+}
+
+#if GOOGLE_CUDA
+template
+void GeluGPULauncher(const FPTYPE * in, FPTYPE * out, int const size) {
+ GeluGPUExecuteFunctor()(in, out, size);
+}
+
+template
+void GeluGradGPULauncher(const FPTYPE * dy, const FPTYPE * in, FPTYPE * out, int const size) {
+ GeluGradGPUExecuteFunctor()(dy, in, out, size);
+}
+
+template
+void GeluGradGradGPULauncher(const FPTYPE * dy, const FPTYPE * dy_, const FPTYPE * in, FPTYPE * out, int const size) {
+ GeluGradGradGPUExecuteFunctor()(dy, dy_, in, out, size);
+}
+#endif // GOOGLE_CUDA
+// ******************************************************************************
+// end of custome op Gelu
+// ******************************************************************************
+
+template
+void compute_descriptor_se_r_cpu (
+ vector & descrpt_a,
+ vector & descrpt_a_deriv,
+ vector & rij_a,
+ const vector & posi,
+ const int & ntypes,
+ const vector & type,
+ const int & i_idx,
+ const vector & fmt_nlist_a,
+ const vector & sec_a,
+ const float & rmin,
+ const float & rmax)
+{
+ // compute the diff of the neighbors
+ rij_a.resize (sec_a.back() * 3);
+ fill (rij_a.begin(), rij_a.end(), 0.0);
+ for (int ii = 0; ii < int(sec_a.size()) - 1; ++ii) {
+ for (int jj = sec_a[ii]; jj < sec_a[ii + 1]; ++jj) {
+ if (fmt_nlist_a[jj] < 0) break;
+ const int & j_idx = fmt_nlist_a[jj];
+
+ for (int dd = 0; dd < 3; ++dd) {
+ rij_a[jj * 3 + dd] = posi[j_idx * 3 + dd] - posi[i_idx * 3 + dd];
+ }
+ }
+ }
+ // 1./rr, cos(theta), cos(phi), sin(phi)
+ descrpt_a.resize (sec_a.back());
+ fill (descrpt_a.begin(), descrpt_a.end(), 0.0);
+ // deriv wrt center: 3
+ descrpt_a_deriv.resize (sec_a.back() * 3);
+ fill (descrpt_a_deriv.begin(), descrpt_a_deriv.end(), 0.0);
+
+ for (int sec_iter = 0; sec_iter < int(sec_a.size()) - 1; ++sec_iter) {
+ for (int nei_iter = sec_a[sec_iter]; nei_iter < sec_a[sec_iter+1]; ++nei_iter) {
+ if (fmt_nlist_a[nei_iter] < 0) break;
+ const FPTYPE * rr = &rij_a[nei_iter * 3];
+ FPTYPE nr2 = MathUtilities::dot(rr, rr);
+ FPTYPE inr = 1./sqrt(nr2);
+ FPTYPE nr = nr2 * inr;
+ FPTYPE inr2 = inr * inr;
+ FPTYPE inr4 = inr2 * inr2;
+ FPTYPE inr3 = inr4 * nr;
+ FPTYPE sw, dsw;
+ spline5_switch(sw, dsw, nr, rmin, rmax);
+ int idx_deriv = nei_iter * 3; // 1 components time 3 directions
+ int idx_value = nei_iter; // 1 components
+ // 4 value components
+ descrpt_a[idx_value + 0] = 1./nr;
+ // deriv of component 1/r
+ descrpt_a_deriv[idx_deriv + 0] = rr[0] * inr3 * sw - descrpt_a[idx_value + 0] * dsw * rr[0] * inr;
+ descrpt_a_deriv[idx_deriv + 1] = rr[1] * inr3 * sw - descrpt_a[idx_value + 0] * dsw * rr[1] * inr;
+ descrpt_a_deriv[idx_deriv + 2] = rr[2] * inr3 * sw - descrpt_a[idx_value + 0] * dsw * rr[2] * inr;
+ // 4 value components
+ descrpt_a[idx_value + 0] *= sw;
+ }
+ }
+}
+
+template
+void DescrptSeRCPULauncher(const FPTYPE * coord, const int * type, const int * ilist, const int * jrange, const int * jlist, const FPTYPE * avg, const FPTYPE * std, FPTYPE * descrpt, FPTYPE * descrpt_deriv, FPTYPE * rij, int * nlist, const int nloc, const int nall, const int nnei, const int ntypes, const int ndescrpt, const float rcut_r, const float rcut_r_smth, const std::vector sec_a, const bool fill_nei_a, const int magic_number) {
+ // set & normalize coord
+ std::vector d_coord3(nall * 3);
+ for (int ii = 0; ii < nall; ++ii) {
+ for (int dd = 0; dd < 3; ++dd) {
+ d_coord3[ii * 3 + dd] = coord[ii * 3 + dd];
+ }
+ }
+
+ // set type
+ std::vector d_type (nall);
+ for (int ii = 0; ii < nall; ++ii) {
+ d_type[ii] = type[ii];
+ }
+
+ // build nlist
+ std::vector > d_nlist_a(nloc);
+
+ for (unsigned ii = 0; ii < nloc; ++ii) {
+ d_nlist_a.reserve (jrange[nloc] / nloc + 10);
+ }
+ for (unsigned ii = 0; ii < nloc; ++ii) {
+ int i_idx = ilist[ii];
+ for (unsigned jj = jrange[ii]; jj < jrange[ii+1]; ++jj) {
+ int j_idx = jlist[jj];
+ d_nlist_a[i_idx].push_back (j_idx);
+ }
+ }
+
+ #pragma omp parallel for
+ for (int ii = 0; ii < nloc; ++ii) {
+ vector fmt_nlist_a;
+ int ret = -1;
+ if (fill_nei_a) {
+ format_nlist_fill_se_a_cpu(fmt_nlist_a, d_coord3, ntypes, d_type, ii, d_nlist_a[ii], rcut_r, sec_a);
+ }
+ std::vector d_descrpt_a;
+ std::vector d_descrpt_a_deriv;
+ std::vector d_descrpt_r;
+ std::vector d_descrpt_r_deriv;
+ std::vector d_rij_a;
+ compute_descriptor_se_r_cpu (d_descrpt_a, d_descrpt_a_deriv, d_rij_a, d_coord3, ntypes, d_type, ii, fmt_nlist_a, sec_a, rcut_r_smth, rcut_r);
+
+ // check sizes
+ assert (d_descrpt_a.size() == ndescrpt);
+ assert (d_descrpt_a_deriv.size() == ndescrpt * 3);
+ assert (d_rij_a.size() == nnei * 3);
+ assert (fmt_nlist_a.size() == nnei);
+ // record outputs
+ for (int jj = 0; jj < ndescrpt; ++jj) {
+ descrpt[ii * ndescrpt + jj] = (d_descrpt_a[jj] - avg[d_type[ii] * ndescrpt + jj]) / std[d_type[ii] * ndescrpt + jj];
+ }
+ for (int jj = 0; jj < ndescrpt * 3; ++jj) {
+ descrpt_deriv[ii * ndescrpt * 3 + jj] = d_descrpt_a_deriv[jj] / std[d_type[ii] * ndescrpt + jj / 3];
+ }
+ for (int jj = 0; jj < nnei * 3; ++jj) {
+ rij[ii * nnei * 3 + jj] = d_rij_a[jj];
+ }
+ for (int jj = 0; jj < nnei; ++jj) {
+ nlist[ii * nnei + jj] = fmt_nlist_a[jj];
+ }
+ }
+}
+
+#if GOOGLE_CUDA
+template
+void DescrptSeRGPULauncher(const FPTYPE * coord, const int * type, const int * ilist, const int * jrange, const int * jlist, int * array_int, unsigned long long * array_longlong, const FPTYPE * avg, const FPTYPE * std, FPTYPE * descrpt, FPTYPE * descrpt_deriv, FPTYPE * rij, int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const float rcut_r, const float rcut_r_smth, const std::vector sec_a, const bool fill_nei_a, const int magic_number) {
+ DescrptSeRGPUExecuteFunctor()(coord, type, ilist, jrange, jlist, array_int, array_longlong, avg, std, descrpt, descrpt_deriv, rij, nlist, nloc, nall, nnei, ndescrpt, rcut_r, rcut_r_smth, sec_a, fill_nei_a, magic_number);
+}
+#endif // GOOGLE_CUDA
+// ******************************************************************************
+// end of custome op DescrptSeR
+// ******************************************************************************
+
+template
+void ProdForceSeRCPULauncher(FPTYPE * force, const FPTYPE * net_deriv, const FPTYPE * in_deriv, const int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt) {
+ memset(force, 0.0, sizeof(FPTYPE) * nall * 3);
+ // compute force of a frame
+ for (int i_idx = 0; i_idx < nloc; ++i_idx) {
+ // deriv wrt center atom
+ for (int aa = 0; aa < ndescrpt; ++aa) {
+ force[i_idx * 3 + 0] -= net_deriv[i_idx * ndescrpt + aa] * in_deriv[i_idx * ndescrpt * 3 + aa * 3 + 0];
+ force[i_idx * 3 + 1] -= net_deriv[i_idx * ndescrpt + aa] * in_deriv[i_idx * ndescrpt * 3 + aa * 3 + 1];
+ force[i_idx * 3 + 2] -= net_deriv[i_idx * ndescrpt + aa] * in_deriv[i_idx * ndescrpt * 3 + aa * 3 + 2];
+ }
+ // deriv wrt neighbors
+ for (int jj = 0; jj < nnei; ++jj) {
+ int j_idx = nlist[i_idx * nnei + jj];
+ if (j_idx < 0) continue;
+ force[j_idx * 3 + 0] += net_deriv[i_idx * ndescrpt + jj] * in_deriv[i_idx * ndescrpt * 3 + jj * 3 + 0];
+ force[j_idx * 3 + 1] += net_deriv[i_idx * ndescrpt + jj] * in_deriv[i_idx * ndescrpt * 3 + jj * 3 + 1];
+ force[j_idx * 3 + 2] += net_deriv[i_idx * ndescrpt + jj] * in_deriv[i_idx * ndescrpt * 3 + jj * 3 + 2];
+ }
+ }
+}
+
+#if GOOGLE_CUDA
+template
+void ProdForceSeRGPULauncher(FPTYPE * force, const FPTYPE * net_deriv, const FPTYPE * in_deriv, const int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt) {
+ ProdForceSeRGPUExecuteFunctor()(force, net_deriv, in_deriv, nlist, nloc, nall, nnei, ndescrpt);
+}
+#endif // GOOGLE_CUDA
+
+// ******************************************************************************
+// end of custome op ProdForceSeR
+// ******************************************************************************
+
+template
+void ProdVirialSeRCPULauncher(FPTYPE * virial, FPTYPE * atom_virial, const FPTYPE * net_deriv, const FPTYPE * in_deriv, const FPTYPE * rij, const int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt) {
+ memset(virial, 0.0, sizeof(FPTYPE) * 9);
+ memset(atom_virial, 0.0, sizeof(FPTYPE) * nall * 9);
+
+ // compute virial of a frame
+ for (int i_idx = 0; i_idx < nloc; ++i_idx) {
+ // deriv wrt neighbors
+ for (int jj = 0; jj < nnei; ++jj) {
+ int j_idx = nlist[i_idx * nnei + jj];
+ if (j_idx < 0) continue;
+ FPTYPE pref = -1.0 * net_deriv[i_idx * ndescrpt + jj];
+ for (int dd0 = 0; dd0 < 3; ++dd0)
+ for (int dd1 = 0; dd1 < 3; ++dd1) {
+ FPTYPE tmp_v = pref * rij[i_idx * nnei * 3 + jj * 3 + dd1] * in_deriv[i_idx * ndescrpt * 3 + jj * 3 + dd0];
+ virial[dd0 * 3 + dd1] -= tmp_v;
+ atom_virial[j_idx * 9 + dd0 * 3 + dd1] -= tmp_v;
+ }
+ }
+ }
+}
+
+#if GOOGLE_CUDA
+template
+void ProdVirialSeRGPULauncher(FPTYPE * virial, FPTYPE * atom_virial, const FPTYPE * net_deriv, const FPTYPE * in_deriv, const FPTYPE * rij, const int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt) {
+ ProdVirialSeRGPUExecuteFunctor()(virial, atom_virial, net_deriv, in_deriv, rij, nlist, nloc, nall, nnei, ndescrpt);
+}
+#endif // GOOGLE_CUDA
+// ******************************************************************************
+// end of custome op ProdVirialSeR
+// ******************************************************************************
diff --git a/source/lib/include/DeviceFunctor.h b/source/lib/include/DeviceFunctor.h
new file mode 100644
index 0000000000..d51d617f84
--- /dev/null
+++ b/source/lib/include/DeviceFunctor.h
@@ -0,0 +1,62 @@
+#pragma once
+#include
+#include
+#include
+#include
+#include
+
+typedef unsigned long long int_64;
+#define SQRT_2_PI 0.7978845608028654
+
+#define cudaErrcheck(res) {cudaAssert((res), __FILE__, __LINE__);}
+inline void cudaAssert(cudaError_t code, const char *file, int line, bool abort=true) {
+ if (code != cudaSuccess) {
+ fprintf(stderr,"cuda assert: %s %s %d\n", cudaGetErrorString(code), file, line);
+ if (abort) exit(code);
+ }
+}
+
+template
+struct DescrptSeAGPUExecuteFunctor {
+ void operator()(const FPTYPE * coord, const int * type, const int * ilist, const int * jrange, const int * jlist, int * array_int, unsigned long long * array_longlong, const FPTYPE * avg, const FPTYPE * std, FPTYPE * descript, FPTYPE * descript_deriv, FPTYPE * rij, int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const float rcut_r, const float rcut_r_smth, const std::vector sec_a, const bool fill_nei_a, const int MAGIC_NUMBER);
+};
+
+template
+struct DescrptSeRGPUExecuteFunctor {
+ void operator()(const FPTYPE * coord, const int * type, const int * ilist, const int * jrange, const int * jlist, int * array_int, unsigned long long * array_longlong, const FPTYPE * avg, const FPTYPE * std, FPTYPE * descript, FPTYPE * descript_deriv, FPTYPE * rij, int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const float rcut_r, const float rcut_r_smth, const std::vector sec_a, const bool fill_nei_a, const int MAGIC_NUMBER);
+};
+
+template
+struct ProdForceSeAGPUExecuteFunctor {
+ void operator()(FPTYPE * force, const FPTYPE * net_derive, const FPTYPE * in_deriv, const int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const int n_a_sel, const int n_a_shift);
+};
+
+template
+struct ProdForceSeRGPUExecuteFunctor {
+ void operator()(FPTYPE * force, const FPTYPE * net_derive, const FPTYPE * in_deriv, const int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt);
+};
+
+template
+struct ProdVirialSeAGPUExecuteFunctor {
+ void operator()(FPTYPE * virial, FPTYPE * atom_virial, const FPTYPE * net_deriv, const FPTYPE * in_deriv, const FPTYPE * rij, const int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const int n_a_sel, const int n_a_shift);
+};
+
+template
+struct ProdVirialSeRGPUExecuteFunctor {
+ void operator()(FPTYPE * virial, FPTYPE * atom_virial, const FPTYPE * net_deriv, const FPTYPE * in_deriv, const FPTYPE * rij, const int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt);
+};
+
+template
+struct GeluGPUExecuteFunctor {
+ void operator()(const FPTYPE * in, FPTYPE * out, const int size);
+};
+
+template
+struct GeluGradGPUExecuteFunctor {
+ void operator()(const FPTYPE * dy, const FPTYPE * in, FPTYPE * out, const int size);
+};
+
+template
+struct GeluGradGradGPUExecuteFunctor {
+ void operator()(const FPTYPE * dy, const FPTYPE * dy_, const FPTYPE * in, FPTYPE * out, const int size);
+};
\ No newline at end of file
diff --git a/source/lib/include/NNPInter.h b/source/lib/include/NNPInter.h
index a32940a738..6c37770758 100644
--- a/source/lib/include/NNPInter.h
+++ b/source/lib/include/NNPInter.h
@@ -98,9 +98,6 @@ class NNPInter
// function used for neighbor list copy
vector get_sel_a() const;
-#ifdef USE_CUDA_TOOLKIT
- void update_nbor(const InternalNeighborList & nlist, const int nloc);
-#endif
};
class NNPInterModelDevi
@@ -195,9 +192,6 @@ class NNPInterModelDevi
// function used for nborlist copy
vector > get_sel() const;
void cum_sum(const std::vector > n_sel);
-#ifdef USE_CUDA_TOOLKIT
- void update_nbor(const InternalNeighborList & nlist, const int nloc);
-#endif
};
diff --git a/source/lib/include/common.h b/source/lib/include/common.h
index 620c88eb6f..59167816fe 100644
--- a/source/lib/include/common.h
+++ b/source/lib/include/common.h
@@ -10,6 +10,8 @@
using namespace tensorflow;
using namespace std;
+#include
+#include
#if TF_MAJOR_VERSION >= 2 && TF_MINOR_VERSION >= 2
typedef tensorflow::tstring STRINGTYPE;
@@ -19,8 +21,12 @@ typedef std::string STRINGTYPE;
#include "NNPAtomMap.h"
#include
+#include
+#include
#include "version.h"
+using CPUDevice = Eigen::ThreadPoolDevice;
+using GPUDevice = Eigen::GpuDevice;
#ifdef HIGH_PREC
typedef double VALUETYPE;
typedef double ENERGYTYPE;
@@ -130,6 +136,20 @@ session_input_tensors (std::vector> & input_tensors,
const int nghost = 0,
const string scope = "");
+int
+session_input_tensors (std::vector> & input_tensors,
+ const vector & dcoord_,
+ const int & ntypes,
+ const vector & datype_,
+ const vector & dbox,
+ InternalNeighborList & dlist,
+ const vector & fparam_,
+ const vector & aparam_,
+ const NNPAtomMap& nnpmap,
+ const int nghost,
+ const int ago,
+ const string scope = "");
+
int
session_input_tensors (std::vector> & input_tensors,
const vector & dcoord_,
diff --git a/source/lib/src/NNPInter.cc b/source/lib/src/NNPInter.cc
index c1347504bb..8bedd88c9c 100644
--- a/source/lib/src/NNPInter.cc
+++ b/source/lib/src/NNPInter.cc
@@ -4,11 +4,8 @@
#include
-#ifdef USE_CUDA_TOOLKIT
+#if GOOGLE_CUDA
#include "cuda_runtime.h"
-#include
-#include
-#include
#define cudaErrcheck(res) { cudaAssert((res), __FILE__, __LINE__); }
inline void cudaAssert(cudaError_t code, const char *file, int line, bool abort=true)
@@ -57,7 +54,6 @@ run_model (ENERGYTYPE & dener,
return;
}
-#ifdef USE_CUDA_TOOLKIT
std::vector output_tensors;
checkStatus (session->Run(input_tensors,
{"o_energy", "o_force", "o_atom_virial"},
@@ -74,56 +70,23 @@ run_model (ENERGYTYPE & dener,
dener = oe(0);
vector dforce (3 * nall);
- vector datom_virial (9 * nall);
dvirial.resize (9);
for (unsigned ii = 0; ii < nall * 3; ++ii){
dforce[ii] = of(ii);
}
- for (int ii = 0; ii < nall * 9; ++ii) {
- datom_virial[ii] = oav(ii);
- }
for (int ii = 0; ii < nall; ++ii) {
- dvirial[0] += 1.0 * datom_virial[9*ii+0];
- dvirial[1] += 1.0 * datom_virial[9*ii+1];
- dvirial[2] += 1.0 * datom_virial[9*ii+2];
- dvirial[3] += 1.0 * datom_virial[9*ii+3];
- dvirial[4] += 1.0 * datom_virial[9*ii+4];
- dvirial[5] += 1.0 * datom_virial[9*ii+5];
- dvirial[6] += 1.0 * datom_virial[9*ii+6];
- dvirial[7] += 1.0 * datom_virial[9*ii+7];
- dvirial[8] += 1.0 * datom_virial[9*ii+8];
- }
-
- dforce_ = dforce;
- nnpmap.backward (dforce_.begin(), dforce.begin(), 3);
-#else
- std::vector output_tensors;
-
- checkStatus (session->Run(input_tensors,
- {"o_energy", "o_force", "o_virial"},
- {},
- &output_tensors));
-
- Tensor output_e = output_tensors[0];
- Tensor output_f = output_tensors[1];
- Tensor output_v = output_tensors[2];
-
- auto oe = output_e.flat ();
- auto of = output_f.flat ();
- auto ov = output_v.flat ();
-
- dener = oe(0);
- vector dforce (3 * nall);
- dvirial.resize (9);
- for (unsigned ii = 0; ii < nall * 3; ++ii){
- dforce[ii] = of(ii);
- }
- for (unsigned ii = 0; ii < 9; ++ii){
- dvirial[ii] = ov(ii);
+ dvirial[0] += 1.0 * oav(9*ii+0);
+ dvirial[1] += 1.0 * oav(9*ii+1);
+ dvirial[2] += 1.0 * oav(9*ii+2);
+ dvirial[3] += 1.0 * oav(9*ii+3);
+ dvirial[4] += 1.0 * oav(9*ii+4);
+ dvirial[5] += 1.0 * oav(9*ii+5);
+ dvirial[6] += 1.0 * oav(9*ii+6);
+ dvirial[7] += 1.0 * oav(9*ii+7);
+ dvirial[8] += 1.0 * oav(9*ii+8);
}
dforce_ = dforce;
nnpmap.backward (dforce_.begin(), dforce.begin(), 3);
-#endif
}
static void run_model (ENERGYTYPE & dener,
@@ -155,7 +118,6 @@ static void run_model (ENERGYTYPE & dener,
fill(datom_virial_.begin(), datom_virial_.end(), 0.0);
return;
}
-#ifdef USE_CUDA_TOOLKIT
std::vector output_tensors;
checkStatus (session->Run(input_tensors,
@@ -204,50 +166,6 @@ static void run_model (ENERGYTYPE & dener,
nnpmap.backward (dforce_.begin(), dforce.begin(), 3);
nnpmap.backward (datom_energy_.begin(), datom_energy.begin(), 1);
nnpmap.backward (datom_virial_.begin(), datom_virial.begin(), 9);
-#else
- std::vector output_tensors;
-
- checkStatus (session->Run(input_tensors,
- {"o_energy", "o_force", "o_virial", "o_atom_energy", "o_atom_virial"},
- {},
- &output_tensors));
-
- Tensor output_e = output_tensors[0];
- Tensor output_f = output_tensors[1];
- Tensor output_v = output_tensors[2];
- Tensor output_ae = output_tensors[3];
- Tensor output_av = output_tensors[4];
-
- auto oe = output_e.flat ();
- auto of = output_f.flat ();
- auto ov = output_v.flat ();
- auto oae = output_ae.flat ();
- auto oav = output_av.flat ();
-
- dener = oe(0);
- vector dforce (3 * nall);
- vector datom_energy (nall, 0);
- vector datom_virial (9 * nall);
- dvirial.resize (9);
- for (int ii = 0; ii < nall * 3; ++ii) {
- dforce[ii] = of(ii);
- }
- for (int ii = 0; ii < nloc; ++ii) {
- datom_energy[ii] = oae(ii);
- }
- for (int ii = 0; ii < nall * 9; ++ii) {
- datom_virial[ii] = oav(ii);
- }
- for (int ii = 0; ii < 9; ++ii) {
- dvirial[ii] = ov(ii);
- }
- dforce_ = dforce;
- datom_energy_ = datom_energy;
- datom_virial_ = datom_virial;
- nnpmap.backward (dforce_.begin(), dforce.begin(), 3);
- nnpmap.backward (datom_energy_.begin(), datom_energy.begin(), 1);
- nnpmap.backward (datom_virial_.begin(), datom_virial.begin(), 9);
-#endif
}
@@ -266,50 +184,8 @@ NNPInter (const string & model, const int & gpu_rank)
init(model, gpu_rank);
}
-NNPInter::~NNPInter() {
- #ifdef USE_CUDA_TOOLKIT
- if (init_nbor) {
- cudaErrcheck(cudaFree(ilist));
- cudaErrcheck(cudaFree(jrange));
- cudaErrcheck(cudaFree(jlist));
- }
- #endif
-}
-
-#ifdef USE_CUDA_TOOLKIT
-void NNPInter::update_nbor(const InternalNeighborList & nlist, const int nloc) {
- if (!init_nbor) {
- cudaErrcheck(cudaMalloc((void**)&ilist, sizeof(int) * nlist.ilist.size()));
- cudaErrcheck(cudaMalloc((void**)&jrange, sizeof(int) * nlist.jrange.size()));
- cudaErrcheck(cudaMalloc((void**)&jlist, sizeof(int) * nlist.jlist.size()));
- ilist_size = nlist.ilist.size();
- jrange_size = nlist.jrange.size();
- jlist_size = nlist.jlist.size();
- init_nbor = true;
- }
- if (ilist_size < nlist.ilist.size()) {
- cudaErrcheck(cudaFree(ilist));
- cudaErrcheck(cudaMalloc((void**)&ilist, sizeof(int) * nlist.ilist.size()));
- ilist_size = nlist.ilist.size();
- }
- if (jrange_size < nlist.jrange.size()) {
- cudaErrcheck(cudaFree(jrange));
- cudaErrcheck(cudaMalloc((void**)&jrange, sizeof(int) * nlist.jrange.size()));
- jrange_size = nlist.jrange.size();
- }
- if (jlist_size < nlist.jlist.size()) {
- cudaErrcheck(cudaFree(jlist));
- cudaErrcheck(cudaMalloc((void**)&jlist, sizeof(int) * nlist.jlist.size()));
- jlist_size = nlist.jlist.size();
- }
-
- cudaErrcheck(cudaMemcpy(ilist, &nlist.ilist[0], sizeof(int) * nlist.ilist.size(), cudaMemcpyHostToDevice));
- cudaErrcheck(cudaMemcpy(jrange, &nlist.jrange[0], sizeof(int) * nlist.jrange.size(), cudaMemcpyHostToDevice));
- cudaErrcheck(cudaMemcpy(jlist, &nlist.jlist[0], sizeof(int) * nlist.jlist.size(), cudaMemcpyHostToDevice));
-}
-#endif // USE_CUDA_TOOLKIT
+NNPInter::~NNPInter() {}
-#ifdef USE_CUDA_TOOLKIT
void
NNPInter::
init (const string & model, const int & gpu_rank)
@@ -318,21 +194,21 @@ init (const string & model, const int & gpu_rank)
SessionOptions options;
options.config.set_inter_op_parallelism_threads(num_inter_nthreads);
options.config.set_intra_op_parallelism_threads(num_intra_nthreads);
- options.config.set_allow_soft_placement(true);
- options.config.mutable_gpu_options()->set_per_process_gpu_memory_fraction(0.9);
- options.config.mutable_gpu_options()->set_allow_growth(true);
checkStatus (ReadBinaryProto(Env::Default(), model, &graph_def));
int gpu_num = -1;
- cudaGetDeviceCount(&gpu_num);
- // std::cout << "current number of devices: " << gpu_num << std::endl;
- // set device to GPU only when at least GPU is found
+ #if GOOGLE_CUDA
+ cudaGetDeviceCount(&gpu_num); // check current device environment
if (gpu_num > 0) {
+ options.config.set_allow_soft_placement(true);
+ options.config.mutable_gpu_options()->set_per_process_gpu_memory_fraction(0.9);
+ options.config.mutable_gpu_options()->set_allow_growth(true);
+ cudaErrcheck(cudaSetDevice(gpu_rank % gpu_num));
std::string str = "/gpu:";
str += std::to_string(gpu_rank % gpu_num);
graph::SetDefaultDevice(str, &graph_def);
- // std::cout << "current device rank: " << str << std::endl;
}
+ #endif // GOOGLE_CUDA
checkStatus (NewSession(options, &session));
checkStatus (session->Create(graph_def));
rcut = get_scalar("descrpt_attr/rcut");
@@ -340,8 +216,6 @@ init (const string & model, const int & gpu_rank)
ntypes = get_scalar("descrpt_attr/ntypes");
dfparam = get_scalar("fitting_attr/dfparam");
daparam = get_scalar("fitting_attr/daparam");
- // assert(rcut == get_rcut());
- // assert(ntypes == get_ntypes());
if (dfparam < 0) dfparam = 0;
if (daparam < 0) daparam = 0;
inited = true;
@@ -350,38 +224,6 @@ init (const string & model, const int & gpu_rank)
ilist = NULL; jrange = NULL; jlist = NULL;
ilist_size = 0; jrange_size = 0; jlist_size = 0;
}
-#else
-void
-NNPInter::
-init (const string & model, const int & gpu_rank)
-{
- assert (!inited);
- SessionOptions options;
- options.config.set_inter_op_parallelism_threads(num_inter_nthreads);
- options.config.set_intra_op_parallelism_threads(num_intra_nthreads);
- checkStatus (NewSession(options, &session));
- checkStatus (ReadBinaryProto(Env::Default(), model, &graph_def));
- checkStatus (session->Create(graph_def));
- rcut = get_scalar("descrpt_attr/rcut");
- cell_size = rcut;
- ntypes = get_scalar("descrpt_attr/ntypes");
- dfparam = get_scalar("fitting_attr/dfparam");
- daparam = get_scalar("fitting_attr/daparam");
- // assert(rcut == get_rcut());
- // assert(ntypes == get_ntypes());
- if (dfparam < 0) dfparam = 0;
- if (daparam < 0) daparam = 0;
- // rcut = get_rcut();
- // cell_size = rcut;
- // ntypes = get_ntypes();
- // dfparam = get_dfparam();
- inited = true;
-
- init_nbor = false;
- ilist = NULL; jrange = NULL; jlist = NULL;
- ilist_size = 0; jrange_size = 0; jlist_size = 0;
-}
-#endif
void
NNPInter::
@@ -554,17 +396,10 @@ compute_inner (ENERGYTYPE & dener,
nnpmap = NNPAtomMap (datype_.begin(), datype_.begin() + nloc);
assert (nloc == nnpmap.get_type().size());
- shuffle_nlist (nlist, nnpmap);
- #ifdef USE_CUDA_TOOLKIT
- update_nbor(nlist, nloc);
- #endif
+ shuffle_nlist (nlist, nnpmap);
}
- #ifdef USE_CUDA_TOOLKIT
- int ret = session_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, ilist, jrange, jlist, fparam, aparam, nnpmap, nghost);
- #else
- int ret = session_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, nlist, fparam, aparam, nnpmap, nghost);
- #endif
+ int ret = session_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, nlist, fparam, aparam, nnpmap, nghost, ago);
assert (nloc == ret);
run_model (dener, dforce_, dvirial, session, input_tensors, nnpmap, nghost);
}
@@ -622,16 +457,9 @@ compute (ENERGYTYPE & dener,
// InternalNeighborList nlist;
convert_nlist_lmp_internal (nlist, lmp_list);
shuffle_nlist (nlist, nnpmap);
- #ifdef USE_CUDA_TOOLKIT
- update_nbor(nlist, nloc);
- #endif
}
- #ifdef USE_CUDA_TOOLKIT
- int ret = session_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, ilist, jrange, jlist, fparam, aparam, nnpmap, nghost);
- #else
- int ret = session_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, nlist, fparam, aparam, nnpmap, nghost);
- #endif
+ int ret = session_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, nlist, fparam, aparam, nnpmap, nghost, ago);
assert (nloc == ret);
run_model (dener, dforce_, dvirial, datom_energy_, datom_virial_, session, input_tensors, nnpmap, nghost);
}
@@ -663,17 +491,8 @@ NNPInterModelDevi (const vector & models, const int & gpu_rank)
init(models, gpu_rank);
}
-NNPInterModelDevi::~NNPInterModelDevi() {
-#ifdef USE_CUDA_TOOLKIT
- if (init_nbor) {
- cudaErrcheck(cudaFree(ilist));
- cudaErrcheck(cudaFree(jrange));
- cudaErrcheck(cudaFree(jlist));
- }
-#endif
-}
+NNPInterModelDevi::~NNPInterModelDevi() {}
-#ifdef USE_CUDA_TOOLKIT
void
NNPInterModelDevi::
init (const vector & models, const int & gpu_rank)
@@ -682,26 +501,32 @@ init (const vector & models, const int & gpu_rank)
numb_models = models.size();
sessions.resize(numb_models);
graph_defs.resize(numb_models);
+
+ int gpu_num = -1;
+ #if GOOGLE_CUDA
+ cudaGetDeviceCount(&gpu_num);
+ #endif // GOOGLE_CUDA
+
SessionOptions options;
options.config.set_inter_op_parallelism_threads(num_inter_nthreads);
options.config.set_intra_op_parallelism_threads(num_intra_nthreads);
- options.config.set_allow_soft_placement(true);
- options.config.mutable_gpu_options()->set_per_process_gpu_memory_fraction(0.9);
- options.config.mutable_gpu_options()->set_allow_growth(true);
-
for (unsigned ii = 0; ii < numb_models; ++ii){
checkStatus (ReadBinaryProto(Env::Default(), models[ii], &graph_defs[ii]));
}
- int gpu_num = -1;
- cudaGetDeviceCount(&gpu_num);
- // std::cout << "current number of devices: " << gpu_num << std::endl;
- for (unsigned ii = 0; ii < numb_models; ++ii){
- // set device to GPU only when at least GPU is found
+ #if GOOGLE_CUDA
+ if (gpu_num > 0) {
+ options.config.set_allow_soft_placement(true);
+ options.config.mutable_gpu_options()->set_per_process_gpu_memory_fraction(0.9);
+ options.config.mutable_gpu_options()->set_allow_growth(true);
+ cudaErrcheck(cudaSetDevice(gpu_rank % gpu_num));
+ }
+ #endif // GOOGLE_CUDA
+
+ for (unsigned ii = 0; ii < numb_models; ++ii) {
if (gpu_num > 0) {
std::string str = "/gpu:";
str += std::to_string(gpu_rank % gpu_num);
graph::SetDefaultDevice(str, &graph_defs[ii]);
- // std::cout << "current device rank: " << str << std::endl;
}
checkStatus (NewSession(options, &(sessions[ii])));
checkStatus (sessions[ii]->Create(graph_defs[ii]));
@@ -722,40 +547,6 @@ init (const vector & models, const int & gpu_rank)
ilist = NULL; jrange = NULL; jlist = NULL;
ilist_size = 0; jrange_size = 0; jlist_size = 0;
}
-#else
-void
-NNPInterModelDevi::
-init (const vector & models, const int & gpu_rank)
-{
- assert (!inited);
- numb_models = models.size();
- sessions.resize(numb_models);
- graph_defs.resize(numb_models);
- SessionOptions options;
- options.config.set_inter_op_parallelism_threads(num_inter_nthreads);
- options.config.set_intra_op_parallelism_threads(num_intra_nthreads);
- for (unsigned ii = 0; ii < numb_models; ++ii){
- checkStatus (NewSession(options, &(sessions[ii])));
- checkStatus (ReadBinaryProto(Env::Default(), models[ii], &graph_defs[ii]));
- checkStatus (sessions[ii]->Create(graph_defs[ii]));
- }
- rcut = get_scalar("descrpt_attr/rcut");
- cell_size = rcut;
- ntypes = get_scalar("descrpt_attr/ntypes");
- dfparam = get_scalar("fitting_attr/dfparam");
- daparam = get_scalar("fitting_attr/daparam");
- if (dfparam < 0) dfparam = 0;
- if (daparam < 0) daparam = 0;
- // rcut = get_rcut();
- // cell_size = rcut;
- // ntypes = get_ntypes();
- inited = true;
-
- init_nbor = false;
- ilist = NULL; jrange = NULL; jlist = NULL;
- ilist_size = 0; jrange_size = 0; jlist_size = 0;
-}
-#endif
template
VT
@@ -821,42 +612,6 @@ cum_sum (const std::vector > n_sel)
}
}
-#ifdef USE_CUDA_TOOLKIT
-void
-NNPInterModelDevi::
-update_nbor(const InternalNeighborList & nlist, const int nloc)
-{
- if (!init_nbor) {
- cudaErrcheck(cudaMalloc((void**)&ilist, sizeof(int) * nlist.ilist.size()));
- cudaErrcheck(cudaMalloc((void**)&jrange, sizeof(int) * nlist.jrange.size()));
- cudaErrcheck(cudaMalloc((void**)&jlist, sizeof(int) * nlist.jlist.size()));
- ilist_size = nlist.ilist.size();
- jrange_size = nlist.jrange.size();
- jlist_size = nlist.jlist.size();
- init_nbor = true;
- }
- if (ilist_size < nlist.ilist.size()) {
- cudaErrcheck(cudaFree(ilist));
- cudaErrcheck(cudaMalloc((void**)&ilist, sizeof(int) * nlist.ilist.size()));
- ilist_size = nlist.ilist.size();
- }
- if (jrange_size < nlist.jrange.size()) {
- cudaErrcheck(cudaFree(jrange));
- cudaErrcheck(cudaMalloc((void**)&jrange, sizeof(int) * nlist.jrange.size()));
- jrange_size = nlist.jrange.size();
- }
- if (jlist_size < nlist.jlist.size()) {
- cudaErrcheck(cudaFree(jlist));
- cudaErrcheck(cudaMalloc((void**)&jlist, sizeof(int) * nlist.jlist.size()));
- jlist_size = nlist.jlist.size();
- }
-
- cudaErrcheck(cudaMemcpy(ilist, &nlist.ilist[0], sizeof(int) * nlist.ilist.size(), cudaMemcpyHostToDevice));
- cudaErrcheck(cudaMemcpy(jrange, &nlist.jrange[0], sizeof(int) * nlist.jrange.size(), cudaMemcpyHostToDevice));
- cudaErrcheck(cudaMemcpy(jlist, &nlist.jlist[0], sizeof(int) * nlist.jlist.size(), cudaMemcpyHostToDevice));
-}
-#endif //USE_CUDA_TOOLKIT
-
void
NNPInterModelDevi::
validate_fparam_aparam(const int & nloc,
@@ -946,16 +701,8 @@ compute (vector & all_energy,
// InternalNeighborList nlist;
convert_nlist_lmp_internal (nlist, lmp_list);
shuffle_nlist (nlist, nnpmap);
- #ifdef USE_CUDA_TOOLKIT
- update_nbor(nlist, nloc);
- #endif
-
}
- #ifdef USE_CUDA_TOOLKIT
- int ret = session_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, ilist, jrange, jlist, fparam, aparam, nnpmap, nghost);
- #else
- int ret = session_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, nlist, fparam, aparam, nnpmap, nghost);
- #endif
+ int ret = session_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, nlist, fparam, aparam, nnpmap, nghost, ago);
all_energy.resize (numb_models);
all_force.resize (numb_models);
@@ -996,16 +743,8 @@ compute (vector & all_energy,
// InternalNeighborList nlist;
convert_nlist_lmp_internal (nlist, lmp_list);
shuffle_nlist (nlist, nnpmap);
- #ifdef USE_CUDA_TOOLKIT
- update_nbor(nlist, nloc);
- #endif
-
}
- #ifdef USE_CUDA_TOOLKIT
- int ret = session_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, ilist, jrange, jlist, fparam, aparam, nnpmap, nghost);
- #else
- int ret = session_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, nlist, fparam, aparam, nnpmap, nghost);
- #endif
+ int ret = session_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, nlist, fparam, aparam, nnpmap, nghost, ago);
all_energy.resize (numb_models);
all_force .resize (numb_models);
diff --git a/source/lib/src/common.cc b/source/lib/src/common.cc
index dd4c3f672a..d990195e6c 100644
--- a/source/lib/src/common.cc
+++ b/source/lib/src/common.cc
@@ -340,6 +340,136 @@ session_input_tensors (std::vector> & input_tensors,
return nloc;
}
+int
+session_input_tensors (std::vector> & input_tensors,
+ const vector & dcoord_,
+ const int & ntypes,
+ const vector & datype_,
+ const vector & dbox,
+ InternalNeighborList & dlist,
+ const vector & fparam_,
+ const vector & aparam_,
+ const NNPAtomMap& nnpmap,
+ const int nghost,
+ const int ago,
+ const string scope)
+{
+ assert (dbox.size() == 9);
+
+ int nframes = 1;
+ int nall = dcoord_.size() / 3;
+ int nloc = nall - nghost;
+ assert (nall == datype_.size());
+
+ vector datype = nnpmap.get_type();
+ vector type_count (ntypes, 0);
+ for (unsigned ii = 0; ii < datype.size(); ++ii){
+ type_count[datype[ii]] ++;
+ }
+ datype.insert (datype.end(), datype_.begin() + nloc, datype_.end());
+
+ TensorShape coord_shape ;
+ coord_shape.AddDim (nframes);
+ coord_shape.AddDim (nall * 3);
+ TensorShape type_shape ;
+ type_shape.AddDim (nframes);
+ type_shape.AddDim (nall);
+ TensorShape box_shape ;
+ box_shape.AddDim (nframes);
+ box_shape.AddDim (9);
+ TensorShape mesh_shape ;
+ mesh_shape.AddDim (16);
+ TensorShape natoms_shape ;
+ natoms_shape.AddDim (2 + ntypes);
+ TensorShape fparam_shape ;
+ fparam_shape.AddDim (nframes);
+ fparam_shape.AddDim (fparam_.size());
+ TensorShape aparam_shape ;
+ aparam_shape.AddDim (nframes);
+ aparam_shape.AddDim (aparam_.size());
+
+#ifdef HIGH_PREC
+ Tensor coord_tensor (DT_DOUBLE, coord_shape);
+ Tensor box_tensor (DT_DOUBLE, box_shape);
+ Tensor fparam_tensor (DT_DOUBLE, fparam_shape);
+ Tensor aparam_tensor (DT_DOUBLE, aparam_shape);
+#else
+ Tensor coord_tensor (DT_FLOAT, coord_shape);
+ Tensor box_tensor (DT_FLOAT, box_shape);
+ Tensor fparam_tensor (DT_FLOAT, fparam_shape);
+ Tensor aparam_tensor (DT_FLOAT, aparam_shape);
+#endif
+ Tensor type_tensor (DT_INT32, type_shape);
+ Tensor mesh_tensor (DT_INT32, mesh_shape);
+ Tensor natoms_tensor (DT_INT32, natoms_shape);
+
+ auto coord = coord_tensor.matrix ();
+ auto type = type_tensor.matrix ();
+ auto box = box_tensor.matrix ();
+ auto mesh = mesh_tensor.flat ();
+ auto natoms = natoms_tensor.flat ();
+ auto fparam = fparam_tensor.matrix ();
+ auto aparam = aparam_tensor.matrix ();
+
+ vector dcoord (dcoord_);
+ nnpmap.forward (dcoord.begin(), dcoord_.begin(), 3);
+
+ for (int ii = 0; ii < nframes; ++ii){
+ for (int jj = 0; jj < nall * 3; ++jj){
+ coord(ii, jj) = dcoord[jj];
+ }
+ for (int jj = 0; jj < 9; ++jj){
+ box(ii, jj) = dbox[jj];
+ }
+ for (int jj = 0; jj < nall; ++jj){
+ type(ii, jj) = datype[jj];
+ }
+ for (int jj = 0; jj < fparam_.size(); ++jj){
+ fparam(ii, jj) = fparam_[jj];
+ }
+ for (int jj = 0; jj < aparam_.size(); ++jj){
+ aparam(ii, jj) = aparam_[jj];
+ }
+ }
+
+ for (int ii = 0; ii < 16; ++ii) mesh(ii) = 0;
+
+ const int stride = sizeof(int *) / sizeof(int);
+ assert (stride * sizeof(int) == sizeof(int *));
+ assert (stride <= 4);
+ mesh (0) = ago;
+ mesh (1) = dlist.ilist.size();
+ mesh (2) = dlist.jrange.size();
+ mesh (3) = dlist.jlist.size();
+ dlist.make_ptrs();
+ memcpy (&mesh(4), &(dlist.pilist), sizeof(int *));
+ memcpy (&mesh(8), &(dlist.pjrange), sizeof(int *));
+ memcpy (&mesh(12), &(dlist.pjlist), sizeof(int *));
+
+ natoms (0) = nloc;
+ natoms (1) = nall;
+ for (int ii = 0; ii < ntypes; ++ii) natoms(ii+2) = type_count[ii];
+
+ string prefix = "";
+ if (scope != ""){
+ prefix = scope + "/";
+ }
+ input_tensors = {
+ {prefix+"t_coord", coord_tensor},
+ {prefix+"t_type", type_tensor},
+ {prefix+"t_box", box_tensor},
+ {prefix+"t_mesh", mesh_tensor},
+ {prefix+"t_natoms",natoms_tensor},
+ };
+ if (fparam_.size() > 0) {
+ input_tensors.push_back({prefix+"t_fparam", fparam_tensor});
+ }
+ if (aparam_.size() > 0) {
+ input_tensors.push_back({prefix+"t_aparam", aparam_tensor});
+ }
+ return nloc;
+}
+
int
session_input_tensors (std::vector> & input_tensors,
const vector & dcoord_,
diff --git a/source/lmp/pair_nnp.cpp b/source/lmp/pair_nnp.cpp
index ba3dcd3286..e8cff008f5 100644
--- a/source/lmp/pair_nnp.cpp
+++ b/source/lmp/pair_nnp.cpp
@@ -816,8 +816,8 @@ void PairNNP::coeff(int narg, char **arg)
ihi = n;
jhi = n;
if (narg == 2) {
- force->bounds(FLERR,arg[0],atom->ntypes,ilo,ihi);
- force->bounds(FLERR,arg[1],atom->ntypes,jlo,jhi);
+ utils::bounds(FLERR,arg[0],1,atom->ntypes,ilo,ihi,error);
+ utils::bounds(FLERR,arg[1],1,atom->ntypes,jlo,jhi,error);
if (ilo != 1 || jlo != 1 || ihi != n || jhi != n) {
error->all(FLERR,"deepmd requires that the scale should be set to all atom types, i.e. pair_coeff * *.");
}
diff --git a/source/lmp/pppm_dplr.cpp b/source/lmp/pppm_dplr.cpp
index e5643e114f..da95f58c9d 100644
--- a/source/lmp/pppm_dplr.cpp
+++ b/source/lmp/pppm_dplr.cpp
@@ -1,6 +1,6 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
- http://lammps.sandia.gov, Sandia National Laboratories
+ https://lammps.sandia.gov/, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
@@ -38,6 +38,7 @@ enum{FORWARD_IK,FORWARD_AD,FORWARD_IK_PERATOM,FORWARD_AD_PERATOM};
#define ONEF 1.0
#endif
+
/* ---------------------------------------------------------------------- */
#ifdef OLD_LMP_PPPM
@@ -68,7 +69,6 @@ void PPPMDPLR::init()
fill(fele.begin(), fele.end(), 0.0);
}
-
/* ----------------------------------------------------------------------
compute the PPPM long-range force, energy, virial
------------------------------------------------------------------------- */
@@ -80,15 +80,9 @@ void PPPMDPLR::compute(int eflag, int vflag)
// set energy/virial flags
// invoke allocate_peratom() if needed for first time
- if (eflag || vflag) ev_setup(eflag,vflag);
- else evflag = evflag_atom = eflag_global = vflag_global =
- eflag_atom = vflag_atom = 0;
+ ev_init(eflag,vflag);
- if (evflag_atom && !peratom_allocate_flag) {
- allocate_peratom();
- cg_peratom->ghost_notify();
- cg_peratom->setup();
- }
+ if (evflag_atom && !peratom_allocate_flag) allocate_peratom();
// if atom count has changed, update qsum and qsqsum
@@ -127,7 +121,8 @@ void PPPMDPLR::compute(int eflag, int vflag)
// to fully sum contribution in their 3d bricks
// remap from 3d decomposition to FFT decomposition
- cg->reverse_comm(this,REVERSE_RHO);
+ gc->reverse_comm_kspace(this,1,sizeof(FFT_SCALAR),REVERSE_RHO,
+ gc_buf1,gc_buf2,MPI_FFT_SCALAR);
brick2fft();
// compute potential gradient on my FFT grid and
@@ -140,16 +135,22 @@ void PPPMDPLR::compute(int eflag, int vflag)
// all procs communicate E-field values
// to fill ghost cells surrounding their 3d bricks
- if (differentiation_flag == 1) cg->forward_comm(this,FORWARD_AD);
- else cg->forward_comm(this,FORWARD_IK);
+ if (differentiation_flag == 1)
+ gc->forward_comm_kspace(this,1,sizeof(FFT_SCALAR),FORWARD_AD,
+ gc_buf1,gc_buf2,MPI_FFT_SCALAR);
+ else
+ gc->forward_comm_kspace(this,3,sizeof(FFT_SCALAR),FORWARD_IK,
+ gc_buf1,gc_buf2,MPI_FFT_SCALAR);
// extra per-atom energy/virial communication
if (evflag_atom) {
if (differentiation_flag == 1 && vflag_atom)
- cg_peratom->forward_comm(this,FORWARD_AD_PERATOM);
+ gc->forward_comm_kspace(this,6,sizeof(FFT_SCALAR),FORWARD_AD_PERATOM,
+ gc_buf1,gc_buf2,MPI_FFT_SCALAR);
else if (differentiation_flag == 0)
- cg_peratom->forward_comm(this,FORWARD_IK_PERATOM);
+ gc->forward_comm_kspace(this,7,sizeof(FFT_SCALAR),FORWARD_IK_PERATOM,
+ gc_buf1,gc_buf2,MPI_FFT_SCALAR);
}
// calculate the force on my particles
@@ -183,14 +184,6 @@ void PPPMDPLR::compute(int eflag, int vflag)
MPI_Allreduce(virial,virial_all,6,MPI_DOUBLE,MPI_SUM,world);
for (i = 0; i < 6; i++) virial[i] = 0.5*qscale*volume*virial_all[i];
}
- // std::cout<< "energy in pppm -------------------" << std::endl;
- // std::cout << energy << " "
- // << std::endl;
- // std::cout<< "virial in pppm -------------------" << std::endl;
- // for (int ii = 0; ii < 6; ++ii){
- // std::cout << virial[ii] << " " ;
- // }
- // std::cout << std::endl;
// per-atom energy/virial
// energy includes self-energy correction
@@ -227,10 +220,10 @@ void PPPMDPLR::compute(int eflag, int vflag)
if (triclinic) domain->lamda2x(atom->nlocal);
}
-
/* ----------------------------------------------------------------------
interpolate from grid to get electric field & force on my particles for ik
------------------------------------------------------------------------- */
+
void PPPMDPLR::fieldforce_ik()
{
int i,l,m,n,nx,ny,nz,mx,my,mz;
@@ -288,21 +281,6 @@ void PPPMDPLR::fieldforce_ik()
fele[i*3+1] += qfactor*eky;
if (slabflag != 2) fele[i*3+2] += qfactor*ekz;
}
-
- // vector dcoord(nall*3), dbox(9);
- // vector dtype(nall);
- // {
- // double ** xx = atom->x;
- // for(int ii = 0; ii < nall; ++ii){
- // for (int dd = 0; dd < 3; +=dd){
- // dcoord[ii*3+dd] = xx[ii][dd];
- // }
- // }
- // int *type = atom->type;
- // for (int ii = 0; ii < nall; ++ii){
- // dtype[ii] = type[ii] - 1;
- // }
- // }
}
/* ----------------------------------------------------------------------
@@ -335,11 +313,14 @@ void PPPMDPLR::fieldforce_ad()
double *q = atom->q;
double **x = atom->x;
- double **f = atom->f;
+ // double **f = atom->f;
int nlocal = atom->nlocal;
+ int nghost = atom->nghost;
+ int nall = nlocal + nghost;
- vector fele(nlocal, 0.0);
+ fele.resize(nlocal*3);
+ fill(fele.begin(), fele.end(), 0.0);
for (i = 0; i < nlocal; i++) {
nx = part2grid[i][0];
@@ -369,7 +350,7 @@ void PPPMDPLR::fieldforce_ad()
eky *= hy_inv;
ekz *= hz_inv;
- // convert E-field to force and substract self forces
+ // convert E-field to force and subtract self forces
const double qfactor = qqrd2e * scale;
@@ -392,15 +373,5 @@ void PPPMDPLR::fieldforce_ad()
sf *= 2*q[i]*q[i];
if (slabflag != 2) fele[i*3+2] += qfactor*(ekz*q[i] - sf);
}
-
- // for (int ii = 0; ii < nlocal; ++ii){
- // cout << ii << "\t ";
- // for (int dd = 0; dd < 3; ++dd){
- // cout << fele[ii*3+dd] << " " ;
- // }
- // cout << endl;
- // }
}
-
-
diff --git a/source/op/CMakeLists.txt b/source/op/CMakeLists.txt
index e73e5dcb49..1ddaaf3255 100644
--- a/source/op/CMakeLists.txt
+++ b/source/op/CMakeLists.txt
@@ -4,8 +4,7 @@ set(OP_LIB ${PROJECT_SOURCE_DIR}/lib/src/SimulationRegion.cpp ${PROJECT_SOURCE_D
set (OP_CXX_FLAG -D_GLIBCXX_USE_CXX11_ABI=${OP_CXX_ABI} )
file(GLOB OP_SRC prod_force.cc prod_virial.cc descrpt.cc descrpt_se_a.cc descrpt_se_r.cc tab_inter.cc prod_force_se_a.cc prod_virial_se_a.cc prod_force_se_r.cc prod_virial_se_r.cc soft_min.cc soft_min_force.cc soft_min_virial.cc ewald_recp.cc gelu.cc)
-file(GLOB OP_PY_CUDA_SRC prod_force.cc prod_virial.cc descrpt.cc descrpt_se_a.cc descrpt_se_r.cc tab_inter.cc prod_force_se_a.cc prod_virial_se_a.cc prod_force_se_r.cc prod_virial_se_r.cc soft_min.cc soft_min_force.cc soft_min_virial.cc ewald_recp.cc gelu_gpu.cc)
-file(GLOB OP_CUDA_SRC prod_force.cc prod_virial.cc descrpt.cc descrpt_se_a_gpu.cc descrpt_se_r_gpu.cc tab_inter.cc prod_force_se_a_gpu.cc prod_virial_se_a_gpu.cc prod_force_se_r_gpu.cc prod_virial_se_r_gpu.cc soft_min.cc soft_min_force.cc soft_min_virial.cc gelu_gpu.cc)
+file(GLOB OP_CUDA_SRC prod_force.cc prod_virial.cc descrpt.cc descrpt_se_a_multi_device.cc descrpt_se_r_multi_device.cc tab_inter.cc prod_force_se_a_multi_device.cc prod_virial_se_a_multi_device.cc prod_force_se_r_multi_device.cc prod_virial_se_r_multi_device.cc soft_min.cc soft_min_force.cc soft_min_virial.cc gelu_multi_device.cc)
file(GLOB OP_GRADS_SRC prod_force_grad.cc prod_force_se_a_grad.cc prod_force_se_r_grad.cc prod_virial_grad.cc prod_virial_se_a_grad.cc prod_virial_se_r_grad.cc soft_min_force_grad.cc soft_min_virial_grad.cc )
file(GLOB OP_PY *.py)
@@ -27,7 +26,7 @@ if (BUILD_PY_IF)
set(CMAKE_BUILD_WITH_INSTALL_RPATH TRUE)
set(CMAKE_INSTALL_RPATH $ORIGIN)
if (USE_CUDA_TOOLKIT)
- add_library(op_abi SHARED ${OP_PY_CUDA_SRC} ${OP_LIB})
+ add_library(op_abi SHARED ${OP_SRC} ${OP_LIB})
add_library(op_grads SHARED ${OP_GRADS_SRC})
add_subdirectory(cuda)
find_package(CUDA REQUIRED)
diff --git a/source/op/cuda/descrpt_se_a.cu b/source/op/cuda/descrpt_se_a.cu
index 09b2ed2638..5965254111 100644
--- a/source/op/cuda/descrpt_se_a.cu
+++ b/source/op/cuda/descrpt_se_a.cu
@@ -1,42 +1,7 @@
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
- http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#define EIGEN_USE_GPU
-#include
-#include
-#include
#include
#include
#include
-#include
-
-#ifdef HIGH_PREC
- typedef double VALUETYPE;
-#else
- typedef float VALUETYPE;
-#endif
-
-typedef double compute_t;
-
-typedef unsigned long long int_64;
-
-#define cudaErrcheck(res) { cudaAssert((res), __FILE__, __LINE__); }
-inline void cudaAssert(cudaError_t code, const char *file, int line, bool abort=true)
-{
- if (code != cudaSuccess)
- {
- fprintf(stderr,"cuda assert: %s %s %d\n", cudaGetErrorString(code), file, line);
- if (abort) exit(code);
- }
-}
+#include "DeviceFunctor.h"
template <
typename Key,
@@ -72,8 +37,8 @@ __global__ void BlockSortKernel(
cub::StoreDirectStriped(threadIdx.x, d_out + block_offset, items);
}
-template
-__device__ inline T dev_dot(T * arr1, T * arr2) {
+template
+__device__ inline FPTYPE dev_dot(FPTYPE * arr1, FPTYPE * arr2) {
return arr1[0] * arr2[0] + arr1[1] * arr2[1] + arr1[2] * arr2[2];
}
@@ -111,7 +76,8 @@ __global__ void get_i_idx_se_a(const int nloc,
i_idx[ilist[idy]] = idy;
}
-__global__ void format_nlist_fill_a_se_a(const VALUETYPE * coord,
+template
+__global__ void format_nlist_fill_a_se_a(const FPTYPE * coord,
const int * type,
const int * jrange,
const int * jlist,
@@ -121,8 +87,8 @@ __global__ void format_nlist_fill_a_se_a(const VALUETYPE * coord,
const int MAGIC_NUMBER)
{
// <<>>
- const unsigned int idx = blockIdx.y;
- const unsigned int idy = blockIdx.x * blockDim.x + threadIdx.x;
+ const unsigned int idx = blockIdx.x;
+ const unsigned int idy = blockIdx.y * blockDim.y + threadIdx.y;
const int nsize = jrange[i_idx[idx] + 1] - jrange[i_idx[idx]];
if (idy >= nsize) {
@@ -134,12 +100,12 @@ __global__ void format_nlist_fill_a_se_a(const VALUETYPE * coord,
int_64 * key_in = key + idx * MAGIC_NUMBER;
- compute_t diff[3];
+ FPTYPE diff[3];
const int & j_idx = nei_idx[idy];
for (int dd = 0; dd < 3; dd++) {
diff[dd] = coord[j_idx * 3 + dd] - coord[idx * 3 + dd];
}
- compute_t rr = sqrt(dev_dot(diff, diff));
+ FPTYPE rr = sqrt(dev_dot(diff, diff));
if (rr <= rcut) {
key_in[idy] = type[j_idx] * 1E15+ (int_64)(rr * 1.0E13) / 100000 * 100000 + j_idx;
}
@@ -199,8 +165,8 @@ __global__ void compute_descriptor_se_a (FPTYPE* descript,
const int sec_a_size)
{
// <<>>
- const unsigned int idx = blockIdx.y;
- const unsigned int idy = blockIdx.x * blockDim.x + threadIdx.x;
+ const unsigned int idx = blockIdx.x;
+ const unsigned int idy = blockIdx.y * blockDim.y + threadIdx.y;
const int idx_deriv = idy * 4 * 3; // 4 components time 3 directions
const int idx_value = idy * 4; // 4 components
if (idy >= sec_a_size) {return;}
@@ -262,8 +228,9 @@ __global__ void compute_descriptor_se_a (FPTYPE* descript,
}
}
+template
void format_nbor_list_256 (
- const VALUETYPE* coord,
+ const FPTYPE* coord,
const int* type,
const int* jrange,
const int* jlist,
@@ -276,9 +243,10 @@ void format_nbor_list_256 (
const int LEN = 256;
const int MAGIC_NUMBER = 256;
const int nblock = (MAGIC_NUMBER + LEN - 1) / LEN;
- dim3 block_grid(nblock, nloc);
+ dim3 block_grid(nloc, nblock);
+ dim3 thread_grid(1, LEN);
format_nlist_fill_a_se_a
- <<>> (
+ <<>> (
coord,
type,
jrange,
@@ -294,8 +262,9 @@ void format_nbor_list_256 (
BlockSortKernel <<>> (key, key + nloc * MAGIC_NUMBER);
}
+template
void format_nbor_list_512 (
- const VALUETYPE* coord,
+ const FPTYPE* coord,
const int* type,
const int* jrange,
const int* jlist,
@@ -308,9 +277,10 @@ void format_nbor_list_512 (
const int LEN = 256;
const int MAGIC_NUMBER = 512;
const int nblock = (MAGIC_NUMBER + LEN - 1) / LEN;
- dim3 block_grid(nblock, nloc);
+ dim3 block_grid(nloc, nblock);
+ dim3 thread_grid(1, LEN);
format_nlist_fill_a_se_a
- <<>> (
+ <<>> (
coord,
type,
jrange,
@@ -326,8 +296,9 @@ void format_nbor_list_512 (
BlockSortKernel <<>> (key, key + nloc * MAGIC_NUMBER);
}
+template
void format_nbor_list_1024 (
- const VALUETYPE* coord,
+ const FPTYPE* coord,
const int* type,
const int* jrange,
const int* jlist,
@@ -340,9 +311,10 @@ void format_nbor_list_1024 (
const int LEN = 256;
const int MAGIC_NUMBER = 1024;
const int nblock = (MAGIC_NUMBER + LEN - 1) / LEN;
- dim3 block_grid(nblock, nloc);
+ dim3 block_grid(nloc, nblock);
+ dim3 thread_grid(1, LEN);
format_nlist_fill_a_se_a
- <<>> (
+ <<>> (
coord,
type,
jrange,
@@ -358,8 +330,9 @@ void format_nbor_list_1024 (
BlockSortKernel <<>> (key, key + nloc * MAGIC_NUMBER);
}
+template
void format_nbor_list_2048 (
- const VALUETYPE* coord,
+ const FPTYPE* coord,
const int* type,
const int* jrange,
const int* jlist,
@@ -372,9 +345,10 @@ void format_nbor_list_2048 (
const int LEN = 256;
const int MAGIC_NUMBER = 2048;
const int nblock = (MAGIC_NUMBER + LEN - 1) / LEN;
- dim3 block_grid(nblock, nloc);
+ dim3 block_grid(nloc, nblock);
+ dim3 thread_grid(1, LEN);
format_nlist_fill_a_se_a
- <<>> (
+ <<>> (
coord,
type,
jrange,
@@ -390,8 +364,9 @@ void format_nbor_list_2048 (
BlockSortKernel <<>> (key, key + nloc * MAGIC_NUMBER);
}
+template
void format_nbor_list_4096 (
- const VALUETYPE* coord,
+ const FPTYPE* coord,
const int* type,
const int* jrange,
const int* jlist,
@@ -404,9 +379,10 @@ void format_nbor_list_4096 (
const int LEN = 256;
const int MAGIC_NUMBER = 4096;
const int nblock = (MAGIC_NUMBER + LEN - 1) / LEN;
- dim3 block_grid(nblock, nloc);
+ dim3 block_grid(nloc, nblock);
+ dim3 thread_grid(1, LEN);
format_nlist_fill_a_se_a
- <<>> (
+ <<>> (
coord,
type,
jrange,
@@ -422,29 +398,8 @@ void format_nbor_list_4096 (
BlockSortKernel <<>> (key, key + nloc * MAGIC_NUMBER);
}
-void DescrptSeALauncher(const VALUETYPE* coord,
- const int* type,
- const int* ilist,
- const int* jrange,
- const int* jlist,
- int* array_int,
- unsigned long long* array_longlong,
- const VALUETYPE* avg,
- const VALUETYPE* std,
- VALUETYPE* descript,
- VALUETYPE* descript_deriv,
- VALUETYPE* rij,
- int* nlist,
- const int& nloc,
- const int& nnei,
- const float& rcut_r,
- const float& rcut_r_smth,
- const int& ndescrpt,
- const std::vector& sec_a,
- const bool& fill_nei_a,
- const int MAGIC_NUMBER
-)
-{
+template
+void DescrptSeAGPUExecuteFunctor::operator()(const FPTYPE * coord, const int * type, const int * ilist, const int * jrange, const int * jlist, int * array_int, unsigned long long * array_longlong, const FPTYPE * avg, const FPTYPE * std, FPTYPE * descript, FPTYPE * descript_deriv, FPTYPE * rij, int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const float rcut_r, const float rcut_r_smth, const std::vector sec_a, const bool fill_nei_a, const int MAGIC_NUMBER) {
const int LEN = 256;
int nblock = (nloc + LEN -1) / LEN;
int * sec_a_dev = array_int;
@@ -456,8 +411,8 @@ void DescrptSeALauncher(const VALUETYPE* coord,
res = cudaMemcpy(sec_a_dev, &sec_a[0], sizeof(int) * sec_a.size(), cudaMemcpyHostToDevice); cudaErrcheck(res);
res = cudaMemset(key, 0xffffffff, sizeof(int_64) * nloc * MAGIC_NUMBER); cudaErrcheck(res);
res = cudaMemset(nlist, -1, sizeof(int) * nloc * nnei); cudaErrcheck(res);
- res = cudaMemset(descript, 0.0, sizeof(VALUETYPE) * nloc * ndescrpt); cudaErrcheck(res);
- res = cudaMemset(descript_deriv, 0.0, sizeof(VALUETYPE) * nloc * ndescrpt * 3); cudaErrcheck(res);
+ res = cudaMemset(descript, 0.0, sizeof(FPTYPE) * nloc * ndescrpt); cudaErrcheck(res);
+ res = cudaMemset(descript_deriv, 0.0, sizeof(FPTYPE) * nloc * ndescrpt * 3); cudaErrcheck(res);
if (fill_nei_a) {
// ~~~
@@ -536,8 +491,9 @@ void DescrptSeALauncher(const VALUETYPE* coord,
}
const int nblock_ = (sec_a.back() + LEN -1) / LEN;
- dim3 block_grid(nblock_, nloc);
- compute_descriptor_se_a<<>> (
+ dim3 block_grid(nloc, nblock_);
+ dim3 thread_grid(1, LEN);
+ compute_descriptor_se_a<<>> (
descript,
ndescrpt,
descript_deriv,
@@ -554,4 +510,7 @@ void DescrptSeALauncher(const VALUETYPE* coord,
rcut_r,
sec_a.back()
);
-}
\ No newline at end of file
+}
+
+template struct DescrptSeAGPUExecuteFunctor;
+template struct DescrptSeAGPUExecuteFunctor;
\ No newline at end of file
diff --git a/source/op/cuda/descrpt_se_r.cu b/source/op/cuda/descrpt_se_r.cu
index fa9678be34..a65ba5887a 100644
--- a/source/op/cuda/descrpt_se_r.cu
+++ b/source/op/cuda/descrpt_se_r.cu
@@ -1,45 +1,7 @@
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
- http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#define EIGEN_USE_GPU
-#include
-#include
-#include
#include
#include
#include
-#include
-#include
-
-#define MAGIC_NUMBER 256
-
-#ifdef HIGH_PREC
- typedef double VALUETYPE;
-#else
- typedef float VALUETYPE;
-#endif
-
-typedef double compute_t;
-
-typedef unsigned long long int_64;
-
-#define cudaErrcheck(res) { cudaAssert((res), __FILE__, __LINE__); }
-inline void cudaAssert(cudaError_t code, const char *file, int line, bool abort=true)
-{
- if (code != cudaSuccess)
- {
- fprintf(stderr,"cuda assert: %s %s %d\n", cudaGetErrorString(code), file, line);
- if (abort) exit(code);
- }
-}
+#include "DeviceFunctor.h"
template <
typename Key,
@@ -48,7 +10,7 @@ template <
__launch_bounds__ (BLOCK_THREADS)
__global__ void BlockSortKernel(
Key * d_in,
- Key * d_out) // Tile of output
+ Key * d_out) // Tile of output
{
enum { TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD };
// Specialize BlockLoad type for our thread block (uses warp-striped loads for coalescing, then transposes in shared memory to a blocked arrangement)
@@ -75,24 +37,25 @@ __global__ void BlockSortKernel(
cub::StoreDirectStriped(threadIdx.x, d_out + block_offset, items);
}
-template
-__device__ inline T dev_dot(T * arr1, T * arr2) {
+template
+__device__ inline FPTYPE dev_dot(FPTYPE * arr1, FPTYPE * arr2) {
return arr1[0] * arr2[0] + arr1[1] * arr2[1] + arr1[2] * arr2[2];
}
-__device__ inline void spline5_switch(compute_t & vv,
- compute_t & dd,
- compute_t & xx,
- const compute_t & rmin,
- const compute_t & rmax)
+template
+__device__ inline void spline5_switch(FPTYPE & vv,
+ FPTYPE & dd,
+ FPTYPE & xx,
+ const float & rmin,
+ const float & rmax)
{
if (xx < rmin) {
dd = 0;
vv = 1;
}
else if (xx < rmax) {
- compute_t uu = (xx - rmin) / (rmax - rmin) ;
- compute_t du = 1. / (rmax - rmin) ;
+ FPTYPE uu = (xx - rmin) / (rmax - rmin) ;
+ FPTYPE du = 1. / (rmax - rmin) ;
vv = uu*uu*uu * (-6 * uu*uu + 15 * uu - 10) + 1;
dd = ( 3 * uu*uu * (-6 * uu*uu + 15 * uu - 10) + uu*uu*uu * (-12 * uu + 15) ) * du;
}
@@ -104,7 +67,7 @@ __device__ inline void spline5_switch(compute_t & vv,
__global__ void get_i_idx_se_r(const int nloc,
const int * ilist,
- int * i_idx)
+ int * i_idx)
{
const unsigned int idy = blockIdx.x * blockDim.x + threadIdx.x;
if(idy >= nloc) {
@@ -113,17 +76,19 @@ __global__ void get_i_idx_se_r(const int nloc,
i_idx[ilist[idy]] = idy;
}
-__global__ void format_nlist_fill_a_se_r(const VALUETYPE * coord,
+template
+__global__ void format_nlist_fill_a_se_r(const FPTYPE * coord,
const int * type,
const int * jrange,
const int * jlist,
const float rcut,
int_64 * key,
- int * i_idx)
+ int * i_idx,
+ const int MAGIC_NUMBER)
{
// <<>>
const unsigned int idx = blockIdx.x;
- const unsigned int idy = threadIdx.x;
+ const unsigned int idy = blockIdx.y * blockDim.y + threadIdx.y;
const int nsize = jrange[i_idx[idx] + 1] - jrange[i_idx[idx]];
if (idy >= nsize) {
@@ -135,12 +100,12 @@ __global__ void format_nlist_fill_a_se_r(const VALUETYPE * coord,
int_64 * key_in = key + idx * MAGIC_NUMBER;
- compute_t diff[3];
+ FPTYPE diff[3];
const int & j_idx = nei_idx[idy];
for (int dd = 0; dd < 3; dd++) {
diff[dd] = coord[j_idx * 3 + dd] - coord[idx * 3 + dd];
}
- compute_t rr = sqrt(dev_dot(diff, diff));
+ FPTYPE rr = sqrt(dev_dot(diff, diff));
if (rr <= rcut) {
key_in[idy] = type[j_idx] * 1E15+ (int_64)(rr * 1.0E13) / 100000 * 100000 + j_idx;
}
@@ -153,9 +118,10 @@ __global__ void format_nlist_fill_b_se_r(int * nlist,
const int * jrange,
const int * jlist,
int_64 * key,
- const int * sec,
+ const int * sec_a,
const int sec_a_size,
- int * nei_iter_dev)
+ int * nei_iter_dev,
+ const int MAGIC_NUMBER)
{
const unsigned int idy = blockIdx.x * blockDim.x + threadIdx.x;
@@ -169,137 +135,329 @@ __global__ void format_nlist_fill_b_se_r(int * nlist,
int_64 * key_out = key + nloc * MAGIC_NUMBER + idy * MAGIC_NUMBER;
for (int ii = 0; ii < sec_a_size; ii++) {
- nei_iter[ii] = sec[ii];
+ nei_iter[ii] = sec_a[ii];
}
for (unsigned int kk = 0; key_out[kk] != key_out[MAGIC_NUMBER - 1]; kk++) {
const int & nei_type = key_out[kk] / 1E15;
- if (nei_iter[nei_type] < sec[nei_type + 1]) {
+ if (nei_iter[nei_type] < sec_a[nei_type + 1]) {
row_nlist[nei_iter[nei_type]++] = key_out[kk] % 100000;
}
}
}
//it's ok!
-__global__ void compute_descriptor_se_r (VALUETYPE* descript,
+template
+__global__ void compute_descriptor_se_r (FPTYPE* descript,
const int ndescrpt,
- VALUETYPE* descript_deriv,
+ FPTYPE* descript_deriv,
const int descript_deriv_size,
- VALUETYPE* rij,
+ FPTYPE* rij,
const int rij_size,
const int* type,
- const VALUETYPE* avg,
- const VALUETYPE* std,
+ const FPTYPE* avg,
+ const FPTYPE* std,
int* nlist,
const int nlist_size,
- const VALUETYPE* coord,
- const VALUETYPE rmin,
- const VALUETYPE rmax,
- compute_t* sel_diff_dev,
- const int sec_size)
+ const FPTYPE* coord,
+ const float rmin,
+ const float rmax,
+ const int sec_a_size)
{
- // <<>>
- const unsigned int idx = blockIdx.y;
- const unsigned int idy = blockIdx.x * blockDim.x + threadIdx.x;
- const int idx_deriv = idy * 3; // 1 components time 3 directions
- const int idx_value = idy; // 1 components
- if (idy >= sec_size) {return;}
-
+ // <<>>
+ const unsigned int idx = blockIdx.x;
+ const unsigned int idy = blockIdx.y * blockDim.y + threadIdx.y;
+ const int idx_deriv = idy * 3; // 4 components time 3 directions
+ const int idx_value = idy; // 4 components
+ if (idy >= sec_a_size) {return;}
+
// else {return;}
- VALUETYPE * row_descript = descript + idx * ndescrpt;
- VALUETYPE * row_descript_deriv = descript_deriv + idx * descript_deriv_size;
- VALUETYPE * row_rij = rij + idx * rij_size;
- compute_t * sel_diff = sel_diff_dev + idx * nlist_size * 3;
+ FPTYPE * row_descript = descript + idx * ndescrpt;
+ FPTYPE * row_descript_deriv = descript_deriv + idx * descript_deriv_size;
+ FPTYPE * row_rij = rij + idx * rij_size;
int * row_nlist = nlist + idx * nlist_size;
-
+
if (row_nlist[idy] >= 0) {
const int & j_idx = row_nlist[idy];
for (int kk = 0; kk < 3; kk++) {
- sel_diff[idy * 3 + kk] = coord[j_idx * 3 + kk] - coord[idx * 3 + kk];
- row_rij[idy * 3 + kk] = sel_diff[idy * 3 + kk];
+ row_rij[idy * 3 + kk] = coord[j_idx * 3 + kk] - coord[idx * 3 + kk];
}
- const compute_t * rr = &sel_diff[idy * 3 + 0];
- compute_t nr2 = dev_dot(rr, rr);
- compute_t inr = 1./sqrt(nr2);
- compute_t nr = nr2 * inr;
- compute_t inr2 = inr * inr;
- compute_t inr4 = inr2 * inr2;
- compute_t inr3 = inr4 * nr;
- compute_t sw, dsw;
+ const FPTYPE * rr = &row_rij[idy * 3 + 0];
+ FPTYPE nr2 = dev_dot(rr, rr);
+ FPTYPE inr = 1./sqrt(nr2);
+ FPTYPE nr = nr2 * inr;
+ FPTYPE inr2 = inr * inr;
+ FPTYPE inr4 = inr2 * inr2;
+ FPTYPE inr3 = inr4 * nr;
+ FPTYPE sw, dsw;
spline5_switch(sw, dsw, nr, rmin, rmax);
row_descript[idx_value + 0] = (1./nr) ;//* sw;
row_descript_deriv[idx_deriv + 0] = (rr[0] * inr3 * sw - row_descript[idx_value + 0] * dsw * rr[0] * inr); // avg[type[(idx_deriv + 0) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 0) % (ndescrpt * 3)) / 3];
row_descript_deriv[idx_deriv + 1] = (rr[1] * inr3 * sw - row_descript[idx_value + 0] * dsw * rr[1] * inr); // avg[type[(idx_deriv + 1) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 1) % (ndescrpt * 3)) / 3];
row_descript_deriv[idx_deriv + 2] = (rr[2] * inr3 * sw - row_descript[idx_value + 0] * dsw * rr[2] * inr); // avg[type[(idx_deriv + 2) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 2) % (ndescrpt * 3)) / 3];
- // 1 value components
+ // 4 value components
row_descript[idx_value + 0] *= sw; // * descript[idx * ndescrpt + idx_value + 0]);// - avg[type[idx] * ndescrpt + idx_value + 0]) / std[type[idx] * ndescrpt + idx_value + 0];
}
for (int ii = 0; ii < 1; ii++) {
row_descript[idx_value + ii] = (row_descript[idx_value + ii] - avg[type[idx] * ndescrpt + idx_value + ii]) / std[type[idx] * ndescrpt + idx_value + ii];
}
+ // idy nloc, idx ndescrpt * 3
+ // descript_deriv[idy * ndescrpt * 3 + idx] = (descript_deriv_dev[idy * (ndescrpt * 3) + idx]) / std[type[idy] * ndescrpt + idx / 3];
for (int ii = 0; ii < 3; ii++) {
row_descript_deriv[idx_deriv + ii] /= std[type[idx] * ndescrpt + (idx_deriv + ii) / 3];
}
}
-void DescrptSeRLauncher(const VALUETYPE* coord,
- const int* type,
- const int* ilist,
- const int* jrange,
- const int* jlist,
- int* array_int,
- unsigned long long* array_longlong,
- compute_t* array_double,
- const VALUETYPE* avg,
- const VALUETYPE* std,
- VALUETYPE* descript,
- VALUETYPE* descript_deriv,
- VALUETYPE* rij,
- int* nlist,
- const int& ntypes,
- const int& nloc,
- const int& nall,
- const int& nnei,
- const float& rcut,
- const float& rcut_smth,
- const int& ndescrpt,
- const std::vector& sec,
- const bool& fill_nei_a
-)
+template
+void format_nbor_list_256 (
+ const FPTYPE* coord,
+ const int* type,
+ const int* jrange,
+ const int* jlist,
+ const int& nloc,
+ const float& rcut_r,
+ int * i_idx,
+ int_64 * key
+)
+{
+ const int LEN = 256;
+ const int MAGIC_NUMBER = 256;
+ const int nblock = (MAGIC_NUMBER + LEN - 1) / LEN;
+ dim3 block_grid(nloc, nblock);
+ dim3 thread_grid(1, LEN);
+ format_nlist_fill_a_se_r
+ <<>> (
+ coord,
+ type,
+ jrange,
+ jlist,
+ rcut_r,
+ key,
+ i_idx,
+ MAGIC_NUMBER
+ );
+ const int ITEMS_PER_THREAD = 4;
+ const int BLOCK_THREADS = MAGIC_NUMBER / ITEMS_PER_THREAD;
+ // BlockSortKernel<<>> (
+ BlockSortKernel <<>> (key, key + nloc * MAGIC_NUMBER);
+}
+
+template
+void format_nbor_list_512 (
+ const FPTYPE* coord,
+ const int* type,
+ const int* jrange,
+ const int* jlist,
+ const int& nloc,
+ const float& rcut_r,
+ int * i_idx,
+ int_64 * key
+)
+{
+ const int LEN = 256;
+ const int MAGIC_NUMBER = 512;
+ const int nblock = (MAGIC_NUMBER + LEN - 1) / LEN;
+ dim3 block_grid(nloc, nblock);
+ dim3 thread_grid(1, LEN);
+ format_nlist_fill_a_se_r
+ <<>> (
+ coord,
+ type,
+ jrange,
+ jlist,
+ rcut_r,
+ key,
+ i_idx,
+ MAGIC_NUMBER
+ );
+ const int ITEMS_PER_THREAD = 4;
+ const int BLOCK_THREADS = MAGIC_NUMBER / ITEMS_PER_THREAD;
+ // BlockSortKernel<<