From 73f6761a9f470baf2d1421ed21f332944e79c033 Mon Sep 17 00:00:00 2001
From: Zheyong Fan <brucenju@gmail.com>
Date: Tue, 8 Aug 2023 01:40:01 +0800
Subject: [PATCH 01/31] starting point of mcmd

---
 src/main_gpumd/run.cu            | 16 ++++++++--
 src/main_gpumd/run.cuh           |  2 ++
 src/makefile                     | 10 ++++--
 src/mc/mc.cu                     | 54 ++++++++++++++++++++++++++++++++
 src/mc/mc.cuh                    | 39 +++++++++++++++++++++++
 src/mc/mc_ensemble.cu            | 31 ++++++++++++++++++
 src/mc/mc_ensemble.cuh           | 32 +++++++++++++++++++
 src/mc/mc_ensemble_canonical.cu  | 35 +++++++++++++++++++++
 src/mc/mc_ensemble_canonical.cuh | 26 +++++++++++++++
 9 files changed, 241 insertions(+), 4 deletions(-)
 create mode 100644 src/mc/mc.cu
 create mode 100644 src/mc/mc.cuh
 create mode 100644 src/mc/mc_ensemble.cu
 create mode 100644 src/mc/mc_ensemble.cuh
 create mode 100644 src/mc/mc_ensemble_canonical.cu
 create mode 100644 src/mc/mc_ensemble_canonical.cuh

diff --git a/src/main_gpumd/run.cu b/src/main_gpumd/run.cu
index ed9579247..e952ee22d 100644
--- a/src/main_gpumd/run.cu
+++ b/src/main_gpumd/run.cu
@@ -158,6 +158,7 @@ void Run::execute_run_in()
 void Run::perform_a_run()
 {
   integrate.initialize(N, time_step, group, atom);
+  mc.initialize();
   measure.initialize(number_of_steps, time_step, integrate, group, atom, force);
 
 #ifdef USE_PLUMED
@@ -214,6 +215,8 @@ void Run::perform_a_run()
 
     integrate.compute2(time_step, double(step) / number_of_steps, group, box, atom, thermo);
 
+    mc.compute();
+
     measure.process(
       number_of_steps,
       step,
@@ -252,10 +255,17 @@ void Run::perform_a_run()
   printf("Speed of this run = %g atom*step/second.\n", run_speed);
   print_line_2();
 
-  measure.finalize(integrate, number_of_steps, time_step, integrate.temperature2, box.get_volume(),atom.number_of_beads);
+  measure.finalize(
+    integrate,
+    number_of_steps,
+    time_step,
+    integrate.temperature2,
+    box.get_volume(),
+    atom.number_of_beads);
 
   electron_stop.finalize();
   integrate.finalize();
+  mc.finalize();
   velocity.finalize();
   max_distance_per_step = 0.0;
 }
@@ -366,7 +376,7 @@ void Run::parse_one_keyword(std::vector<std::string>& tokens)
     measure.sdc.parse(param, num_param, group);
   } else if (strcmp(param[0], "compute_msd") == 0) {
     measure.msd.parse(param, num_param, group);
-  }  else if (strcmp(param[0], "compute_rdf") == 0) {
+  } else if (strcmp(param[0], "compute_rdf") == 0) {
     measure.rdf.parse(param, num_param, box, number_of_types, number_of_steps);
   } else if (strcmp(param[0], "compute_hac") == 0) {
     measure.hac.parse(param, num_param);
@@ -392,6 +402,8 @@ void Run::parse_one_keyword(std::vector<std::string>& tokens)
     integrate.parse_move(param, num_param, group);
   } else if (strcmp(param[0], "electron_stop") == 0) {
     electron_stop.parse(param, num_param, atom.number_of_atoms, number_of_types);
+  } else if (strcmp(param[0], "mc") == 0) {
+    mc.parse(param, num_param);
   } else if (strcmp(param[0], "run") == 0) {
     parse_run(param, num_param);
   } else {
diff --git a/src/main_gpumd/run.cuh b/src/main_gpumd/run.cuh
index 8cf91f7bf..32b75b54c 100644
--- a/src/main_gpumd/run.cuh
+++ b/src/main_gpumd/run.cuh
@@ -22,6 +22,7 @@ class Measure;
 #include "electron_stop.cuh"
 #include "force/force.cuh"
 #include "integrate/integrate.cuh"
+#include "mc/mc.cuh"
 #include "measure/measure.cuh"
 #include "model/atom.cuh"
 #include "model/box.cuh"
@@ -65,6 +66,7 @@ private:
 
   Force force;
   Integrate integrate;
+  MC mc;
   Measure measure;
   Electron_Stop electron_stop;
 };
diff --git a/src/makefile b/src/makefile
index 87fbac0c2..c7011287d 100755
--- a/src/makefile
+++ b/src/makefile
@@ -32,8 +32,9 @@ LIBS = -lcublas -lcusolver
 SOURCES_GPUMD =                   \
 	$(wildcard main_gpumd/*.cu)   \
 	$(wildcard minimize/*.cu)     \
-	$(wildcard phonon/*.cu)  \
+	$(wildcard phonon/*.cu)       \
 	$(wildcard integrate/*.cu)    \
+	$(wildcard mc/*.cu)           \
 	$(wildcard force/*.cu)        \
 	$(wildcard measure/*.cu)      \
 	$(wildcard model/*.cu)        \
@@ -62,11 +63,12 @@ HEADERS =                         \
 	$(wildcard utilities/*.cuh)   \
 	$(wildcard main_gpumd/*.cuh)  \
 	$(wildcard integrate/*.cuh)   \
+	$(wildcard mc/*.cuh)          \
 	$(wildcard minimize/*.cuh)    \
 	$(wildcard force/*.cuh)       \
 	$(wildcard measure/*.cuh)     \
 	$(wildcard model/*.cuh)       \
-	$(wildcard phonon/*.cuh) \
+	$(wildcard phonon/*.cuh)      \
 	$(wildcard main_nep/*.cuh)
 
 
@@ -86,6 +88,8 @@ nep: $(OBJ_NEP)
 ifdef OS # for Windows
 integrate/%.obj: integrate/%.cu $(HEADERS)
 	$(CC) $(CFLAGS) $(INC) -c $< -o $@
+mc/%.obj: mc/%.cu $(HEADERS)
+	$(CC) $(CFLAGS) $(INC) -c $< -o $@
 minimize/%.obj: minimize/%.cu $(HEADERS)
 	$(CC) $(CFLAGS) $(INC) -c $< -o $@
 force/%.obj: force/%.cu $(HEADERS)
@@ -105,6 +109,8 @@ main_nep/%.obj: main_nep/%.cu $(HEADERS)
 else # for Linux
 integrate/%.o: integrate/%.cu $(HEADERS)
 	$(CC) $(CFLAGS) $(INC) -c $< -o $@
+md/%.o: mc/%.cu $(HEADERS)
+	$(CC) $(CFLAGS) $(INC) -c $< -o $@
 minimize/%.o: minimize/%.cu $(HEADERS)
 	$(CC) $(CFLAGS) $(INC) -c $< -o $@
 force/%.o: force/%.cu $(HEADERS)
diff --git a/src/mc/mc.cu b/src/mc/mc.cu
new file mode 100644
index 000000000..c7277626e
--- /dev/null
+++ b/src/mc/mc.cu
@@ -0,0 +1,54 @@
+/*
+    Copyright 2017 Zheyong Fan, Ville Vierimaa, Mikko Ervasti, and Ari Harju
+    This file is part of GPUMD.
+    GPUMD is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+    GPUMD is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+    You should have received a copy of the GNU General Public License
+    along with GPUMD.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/*----------------------------------------------------------------------------80
+The driver class for the various MC ensembles.
+------------------------------------------------------------------------------*/
+
+#include "mc.cuh"
+#include "mc_ensemble_canonical.cuh"
+#include "model/atom.cuh"
+#include "utilities/common.cuh"
+#include "utilities/read_file.cuh"
+
+void MC::initialize(void)
+{
+  // todo
+}
+
+void MC::finalize(void) { do_mcmd = false; }
+
+void MC::compute(void)
+{
+  if (do_mcmd) {
+    printf("    Do MCMD.\n");
+  }
+}
+
+void MC::parse(const char** param, int num_param)
+{
+  if (num_param != 2) {
+    PRINT_INPUT_ERROR("mc should have 1 parameter.\n");
+  }
+
+  if (strcmp(param[1], "C") == 0 || strcmp(param[1], "c") == 0) {
+    mc_ensemble.reset(new MC_Ensemble_Canonical());
+  }
+
+  printf("Perform MCMD:\n");
+  printf("    Use the Canonical ensemble.\n");
+
+  do_mcmd = true;
+}
\ No newline at end of file
diff --git a/src/mc/mc.cuh b/src/mc/mc.cuh
new file mode 100644
index 000000000..1853c7fb6
--- /dev/null
+++ b/src/mc/mc.cuh
@@ -0,0 +1,39 @@
+/*
+    Copyright 2017 Zheyong Fan, Ville Vierimaa, Mikko Ervasti, and Ari Harju
+    This file is part of GPUMD.
+    GPUMD is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+    GPUMD is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+    You should have received a copy of the GNU General Public License
+    along with GPUMD.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include "mc_ensemble.cuh"
+#include "model/box.cuh"
+#include "model/group.cuh"
+#include <memory>
+#include <vector>
+
+class Atom;
+
+class MC
+{
+public:
+  std::unique_ptr<MC_Ensemble> mc_ensemble;
+
+  void initialize(void);
+  void finalize(void);
+  void compute(void);
+
+  void parse(const char** param, int num_param);
+
+private:
+  bool do_mcmd = false;
+};
diff --git a/src/mc/mc_ensemble.cu b/src/mc/mc_ensemble.cu
new file mode 100644
index 000000000..75fae353c
--- /dev/null
+++ b/src/mc/mc_ensemble.cu
@@ -0,0 +1,31 @@
+/*
+    Copyright 2017 Zheyong Fan, Ville Vierimaa, Mikko Ervasti, and Ari Harju
+    This file is part of GPUMD.
+    GPUMD is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+    GPUMD is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+    You should have received a copy of the GNU General Public License
+    along with GPUMD.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/*----------------------------------------------------------------------------80
+The abstract base class (ABC) for the MC_Ensemble classes.
+------------------------------------------------------------------------------*/
+
+#include "mc_ensemble.cuh"
+#include "utilities/common.cuh"
+
+MC_Ensemble::MC_Ensemble(void)
+{
+  // nothing now
+}
+
+MC_Ensemble::~MC_Ensemble(void)
+{
+  // nothing now
+}
diff --git a/src/mc/mc_ensemble.cuh b/src/mc/mc_ensemble.cuh
new file mode 100644
index 000000000..457086759
--- /dev/null
+++ b/src/mc/mc_ensemble.cuh
@@ -0,0 +1,32 @@
+/*
+    Copyright 2017 Zheyong Fan, Ville Vierimaa, Mikko Ervasti, and Ari Harju
+    This file is part of GPUMD.
+    GPUMD is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+    GPUMD is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+    You should have received a copy of the GNU General Public License
+    along with GPUMD.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+#include "model/atom.cuh"
+#include "model/box.cuh"
+#include "model/group.cuh"
+#include "utilities/gpu_vector.cuh"
+#include <vector>
+
+class MC_Ensemble
+{
+public:
+  MC_Ensemble(void);
+  virtual ~MC_Ensemble(void);
+
+  virtual void compute(void) = 0;
+
+protected:
+};
diff --git a/src/mc/mc_ensemble_canonical.cu b/src/mc/mc_ensemble_canonical.cu
new file mode 100644
index 000000000..3529c0793
--- /dev/null
+++ b/src/mc/mc_ensemble_canonical.cu
@@ -0,0 +1,35 @@
+/*
+    Copyright 2017 Zheyong Fan, Ville Vierimaa, Mikko Ervasti, and Ari Harju
+    This file is part of GPUMD.
+    GPUMD is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+    GPUMD is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+    You should have received a copy of the GNU General Public License
+    along with GPUMD.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/*----------------------------------------------------------------------------80
+The canonical ensemble for MCMD.
+------------------------------------------------------------------------------*/
+
+#include "mc_ensemble_canonical.cuh"
+
+MC_Ensemble_Canonical::MC_Ensemble_Canonical(void)
+{
+  // nothing now
+}
+
+MC_Ensemble_Canonical::~MC_Ensemble_Canonical(void)
+{
+  // nothing now
+}
+
+void MC_Ensemble_Canonical::compute(void)
+{
+  // TODO
+}
diff --git a/src/mc/mc_ensemble_canonical.cuh b/src/mc/mc_ensemble_canonical.cuh
new file mode 100644
index 000000000..4de7d5547
--- /dev/null
+++ b/src/mc/mc_ensemble_canonical.cuh
@@ -0,0 +1,26 @@
+/*
+    Copyright 2017 Zheyong Fan, Ville Vierimaa, Mikko Ervasti, and Ari Harju
+    This file is part of GPUMD.
+    GPUMD is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+    GPUMD is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+    You should have received a copy of the GNU General Public License
+    along with GPUMD.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+#include "mc_ensemble.cuh"
+
+class MC_Ensemble_Canonical : public MC_Ensemble
+{
+public:
+  MC_Ensemble_Canonical(void);
+  virtual ~MC_Ensemble_Canonical(void);
+
+  virtual void compute(void);
+};

From 1e4362937ce43a7e1f395a457806a73345142653 Mon Sep 17 00:00:00 2001
From: Zheyong Fan <brucenju@gmail.com>
Date: Tue, 8 Aug 2023 05:02:32 +0800
Subject: [PATCH 02/31] more complete input parsing

---
 src/main_gpumd/run.cu            |  6 ++---
 src/mc/mc.cu                     | 41 +++++++++++++++++++++++++-------
 src/mc/mc.cuh                    |  7 ++++--
 src/mc/mc_ensemble.cuh           |  4 +++-
 src/mc/mc_ensemble_canonical.cu  | 11 +++++----
 src/mc/mc_ensemble_canonical.cuh |  4 ++--
 6 files changed, 52 insertions(+), 21 deletions(-)

diff --git a/src/main_gpumd/run.cu b/src/main_gpumd/run.cu
index e952ee22d..ff9618e9b 100644
--- a/src/main_gpumd/run.cu
+++ b/src/main_gpumd/run.cu
@@ -215,7 +215,7 @@ void Run::perform_a_run()
 
     integrate.compute2(time_step, double(step) / number_of_steps, group, box, atom, thermo);
 
-    mc.compute();
+    mc.compute(step, atom);
 
     measure.process(
       number_of_steps,
@@ -402,8 +402,8 @@ void Run::parse_one_keyword(std::vector<std::string>& tokens)
     integrate.parse_move(param, num_param, group);
   } else if (strcmp(param[0], "electron_stop") == 0) {
     electron_stop.parse(param, num_param, atom.number_of_atoms, number_of_types);
-  } else if (strcmp(param[0], "mc") == 0) {
-    mc.parse(param, num_param);
+  } else if (strcmp(param[0], "cmc") == 0) {
+    mc.parse_cmc(param, num_param);
   } else if (strcmp(param[0], "run") == 0) {
     parse_run(param, num_param);
   } else {
diff --git a/src/mc/mc.cu b/src/mc/mc.cu
index c7277626e..ea0abfe0e 100644
--- a/src/mc/mc.cu
+++ b/src/mc/mc.cu
@@ -30,25 +30,48 @@ void MC::initialize(void)
 
 void MC::finalize(void) { do_mcmd = false; }
 
-void MC::compute(void)
+void MC::compute(int step, Atom& atom)
 {
   if (do_mcmd) {
-    printf("    Do MCMD.\n");
+    if ((step + 1) % num_steps_md == 0) {
+      printf("Now do MC after %d MD steps:\n", step + 1);
+      mc_ensemble->compute(atom);
+    }
   }
 }
 
-void MC::parse(const char** param, int num_param)
+void MC::parse_cmc(const char** param, int num_param)
 {
-  if (num_param != 2) {
-    PRINT_INPUT_ERROR("mc should have 1 parameter.\n");
+  if (num_param != 4) {
+    PRINT_INPUT_ERROR("cmc should have 3 parameter.\n");
   }
 
-  if (strcmp(param[1], "C") == 0 || strcmp(param[1], "c") == 0) {
-    mc_ensemble.reset(new MC_Ensemble_Canonical());
+  if (!is_valid_int(param[1], &num_steps_md)) {
+    PRINT_INPUT_ERROR("number of MD steps for cmc should be an integer.\n");
   }
+  if (num_steps_md <= 0) {
+    PRINT_INPUT_ERROR("number of MD steps for cmc should be positive.\n");
+  }
+
+  if (!is_valid_int(param[2], &num_steps_mc)) {
+    PRINT_INPUT_ERROR("number of MC steps for cmc should be an integer.\n");
+  }
+  if (num_steps_mc <= 0) {
+    PRINT_INPUT_ERROR("number of MC steps for cmc should be positive.\n");
+  }
+
+  if (!is_valid_real(param[3], &temperature)) {
+    PRINT_INPUT_ERROR("temperature for cmc should be a number.\n");
+  }
+  if (temperature <= 0) {
+    PRINT_INPUT_ERROR("temperature for cmc should be positive.\n");
+  }
+
+  printf("Perform canonical MC:\n");
+  printf("    after every %d MD steps, do %d MC trials.\n", num_steps_md, num_steps_mc);
+  printf("    with a temperature of %g K.\n", temperature);
 
-  printf("Perform MCMD:\n");
-  printf("    Use the Canonical ensemble.\n");
+  mc_ensemble.reset(new MC_Ensemble_Canonical(num_steps_mc, temperature));
 
   do_mcmd = true;
 }
\ No newline at end of file
diff --git a/src/mc/mc.cuh b/src/mc/mc.cuh
index 1853c7fb6..4e96f48a4 100644
--- a/src/mc/mc.cuh
+++ b/src/mc/mc.cuh
@@ -30,10 +30,13 @@ public:
 
   void initialize(void);
   void finalize(void);
-  void compute(void);
+  void compute(int step, Atom& atom);
 
-  void parse(const char** param, int num_param);
+  void parse_cmc(const char** param, int num_param);
 
 private:
   bool do_mcmd = false;
+  int num_steps_md = 0;
+  int num_steps_mc = 0;
+  double temperature = 0.0;
 };
diff --git a/src/mc/mc_ensemble.cuh b/src/mc/mc_ensemble.cuh
index 457086759..3db4c0d71 100644
--- a/src/mc/mc_ensemble.cuh
+++ b/src/mc/mc_ensemble.cuh
@@ -26,7 +26,9 @@ public:
   MC_Ensemble(void);
   virtual ~MC_Ensemble(void);
 
-  virtual void compute(void) = 0;
+  virtual void compute(Atom& atom) = 0;
 
 protected:
+  int num_steps_mc = 0;
+  double temperature = 0.0;
 };
diff --git a/src/mc/mc_ensemble_canonical.cu b/src/mc/mc_ensemble_canonical.cu
index 3529c0793..9b617516b 100644
--- a/src/mc/mc_ensemble_canonical.cu
+++ b/src/mc/mc_ensemble_canonical.cu
@@ -19,9 +19,10 @@ The canonical ensemble for MCMD.
 
 #include "mc_ensemble_canonical.cuh"
 
-MC_Ensemble_Canonical::MC_Ensemble_Canonical(void)
+MC_Ensemble_Canonical::MC_Ensemble_Canonical(int num_steps_mc_input, double temperature_input)
 {
-  // nothing now
+  num_steps_mc = num_steps_mc_input;
+  temperature = temperature_input;
 }
 
 MC_Ensemble_Canonical::~MC_Ensemble_Canonical(void)
@@ -29,7 +30,9 @@ MC_Ensemble_Canonical::~MC_Ensemble_Canonical(void)
   // nothing now
 }
 
-void MC_Ensemble_Canonical::compute(void)
+void MC_Ensemble_Canonical::compute(Atom& atom)
 {
-  // TODO
+  for (int step = 0; step < num_steps_mc; ++step) {
+    printf("    MC step %d, temperature = %g K.\n", step, temperature);
+  }
 }
diff --git a/src/mc/mc_ensemble_canonical.cuh b/src/mc/mc_ensemble_canonical.cuh
index 4de7d5547..b6666aa98 100644
--- a/src/mc/mc_ensemble_canonical.cuh
+++ b/src/mc/mc_ensemble_canonical.cuh
@@ -19,8 +19,8 @@
 class MC_Ensemble_Canonical : public MC_Ensemble
 {
 public:
-  MC_Ensemble_Canonical(void);
+  MC_Ensemble_Canonical(int num_steps_mc, double temperature);
   virtual ~MC_Ensemble_Canonical(void);
 
-  virtual void compute(void);
+  virtual void compute(Atom& atom);
 };

From be947e5c2717a905e4157ea3856e6ab3adf045ae Mon Sep 17 00:00:00 2001
From: Zheyong Fan <brucenju@gmail.com>
Date: Thu, 10 Aug 2023 18:26:49 +0800
Subject: [PATCH 03/31] find atom pairs to exchange

---
 src/main_gpumd/run.cu            |  2 +-
 src/mc/mc.cu                     |  4 +-
 src/mc/mc.cuh                    |  2 +-
 src/mc/mc_ensemble.cu            | 23 ++++++++++-
 src/mc/mc_ensemble.cuh           | 19 ++++++++-
 src/mc/mc_ensemble_canonical.cu  | 71 +++++++++++++++++++++++++++++++-
 src/mc/mc_ensemble_canonical.cuh |  2 +-
 7 files changed, 115 insertions(+), 8 deletions(-)

diff --git a/src/main_gpumd/run.cu b/src/main_gpumd/run.cu
index ff9618e9b..5ad098f4f 100644
--- a/src/main_gpumd/run.cu
+++ b/src/main_gpumd/run.cu
@@ -215,7 +215,7 @@ void Run::perform_a_run()
 
     integrate.compute2(time_step, double(step) / number_of_steps, group, box, atom, thermo);
 
-    mc.compute(step, atom);
+    mc.compute(step, atom, box);
 
     measure.process(
       number_of_steps,
diff --git a/src/mc/mc.cu b/src/mc/mc.cu
index ea0abfe0e..7b4de28de 100644
--- a/src/mc/mc.cu
+++ b/src/mc/mc.cu
@@ -30,12 +30,12 @@ void MC::initialize(void)
 
 void MC::finalize(void) { do_mcmd = false; }
 
-void MC::compute(int step, Atom& atom)
+void MC::compute(int step, Atom& atom, Box& box)
 {
   if (do_mcmd) {
     if ((step + 1) % num_steps_md == 0) {
       printf("Now do MC after %d MD steps:\n", step + 1);
-      mc_ensemble->compute(atom);
+      mc_ensemble->compute(atom, box);
     }
   }
 }
diff --git a/src/mc/mc.cuh b/src/mc/mc.cuh
index 4e96f48a4..06073dd43 100644
--- a/src/mc/mc.cuh
+++ b/src/mc/mc.cuh
@@ -30,7 +30,7 @@ public:
 
   void initialize(void);
   void finalize(void);
-  void compute(int step, Atom& atom);
+  void compute(int step, Atom& atom, Box& box);
 
   void parse_cmc(const char** param, int num_param);
 
diff --git a/src/mc/mc_ensemble.cu b/src/mc/mc_ensemble.cu
index 75fae353c..05d891391 100644
--- a/src/mc/mc_ensemble.cu
+++ b/src/mc/mc_ensemble.cu
@@ -19,10 +19,31 @@ The abstract base class (ABC) for the MC_Ensemble classes.
 
 #include "mc_ensemble.cuh"
 #include "utilities/common.cuh"
+#include <chrono>
 
 MC_Ensemble::MC_Ensemble(void)
 {
-  // nothing now
+  const int n_max = 1000;
+  NN_radial.resize(n_max);
+  NN_angular.resize(n_max);
+  NL_radial.resize(n_max * n_max);
+  NL_angular.resize(n_max * n_max);
+  type_before.resize(n_max);
+  type_after.resize(n_max);
+  x12_radial.resize(n_max * n_max);
+  y12_radial.resize(n_max * n_max);
+  z12_radial.resize(n_max * n_max);
+  x12_angular.resize(n_max * n_max);
+  y12_angular.resize(n_max * n_max);
+  z12_angular.resize(n_max * n_max);
+  pe_before.resize(n_max);
+  pe_after.resize(n_max);
+
+#ifdef DEBUG
+  rng = std::mt19937(13579);
+#else
+  rng = std::mt19937(std::chrono::system_clock::now().time_since_epoch().count());
+#endif
 }
 
 MC_Ensemble::~MC_Ensemble(void)
diff --git a/src/mc/mc_ensemble.cuh b/src/mc/mc_ensemble.cuh
index 3db4c0d71..de1eb211c 100644
--- a/src/mc/mc_ensemble.cuh
+++ b/src/mc/mc_ensemble.cuh
@@ -18,6 +18,7 @@
 #include "model/box.cuh"
 #include "model/group.cuh"
 #include "utilities/gpu_vector.cuh"
+#include <random>
 #include <vector>
 
 class MC_Ensemble
@@ -26,9 +27,25 @@ public:
   MC_Ensemble(void);
   virtual ~MC_Ensemble(void);
 
-  virtual void compute(Atom& atom) = 0;
+  virtual void compute(Atom& atom, Box& box) = 0;
 
 protected:
   int num_steps_mc = 0;
   double temperature = 0.0;
+  std::mt19937 rng;
+
+  GPU_Vector<int> NN_radial;
+  GPU_Vector<int> NN_angular;
+  GPU_Vector<int> NL_radial;
+  GPU_Vector<int> NL_angular;
+  GPU_Vector<int> type_before;
+  GPU_Vector<int> type_after;
+  GPU_Vector<float> x12_radial;
+  GPU_Vector<float> y12_radial;
+  GPU_Vector<float> z12_radial;
+  GPU_Vector<float> x12_angular;
+  GPU_Vector<float> y12_angular;
+  GPU_Vector<float> z12_angular;
+  GPU_Vector<float> pe_before;
+  GPU_Vector<float> pe_after;
 };
diff --git a/src/mc/mc_ensemble_canonical.cu b/src/mc/mc_ensemble_canonical.cu
index 9b617516b..92525d350 100644
--- a/src/mc/mc_ensemble_canonical.cu
+++ b/src/mc/mc_ensemble_canonical.cu
@@ -30,9 +30,78 @@ MC_Ensemble_Canonical::~MC_Ensemble_Canonical(void)
   // nothing now
 }
 
-void MC_Ensemble_Canonical::compute(Atom& atom)
+static __global__ void get_types(
+  const int N,
+  const int i,
+  const int j,
+  const int type_i,
+  const int type_j,
+  const int* g_type,
+  int* g_type_before,
+  int* g_type_after)
 {
+  int n = blockIdx.x * blockDim.x + threadIdx.x;
+  if (n < N) {
+    g_type_before[n] = g_type[n];
+    g_type_after[n] = g_type[n];
+    if (n == i) {
+      g_type_after[i] = type_j;
+    }
+    if (n == j) {
+      g_type_after[j] = type_i;
+    }
+  }
+}
+
+static int get_type(const int i, const int* g_type)
+{
+  int type_i = 0;
+  CHECK(cudaMemcpy(&type_i, &g_type[i], sizeof(int), cudaMemcpyDeviceToHost));
+  return type_i;
+}
+
+void MC_Ensemble_Canonical::compute(Atom& atom, Box& box)
+{
+  std::uniform_int_distribution<int> r1(0, atom.number_of_atoms - 1);
+
   for (int step = 0; step < num_steps_mc; ++step) {
     printf("    MC step %d, temperature = %g K.\n", step, temperature);
+
+    int i = r1(rng);
+    int type_i = get_type(i, atom.type.data());
+    printf("        get atom %d with type %d.\n", i, type_i);
+    int j = 0, type_j = type_i;
+    while (type_i == type_j) {
+      j = r1(rng);
+      type_j = get_type(j, atom.type.data());
+      printf("        get atom %d with type %d.\n", j, type_j);
+    }
+    printf(
+      "        try to exchange atom %d with type %d and atom %d with type %d.\n",
+      i,
+      type_i,
+      j,
+      type_j);
+
+    get_types<<<(atom.number_of_atoms - 1) / 64 + 1, 64>>>(
+      atom.number_of_atoms,
+      i,
+      j,
+      type_i,
+      type_j,
+      atom.type.data(),
+      type_before.data(),
+      type_after.data());
+
+    // std::vector<int> type_before_cpu(atom.number_of_atoms);
+    // std::vector<int> type_after_cpu(atom.number_of_atoms);
+    // type_before.copy_to_host(type_before_cpu.data(), atom.number_of_atoms);
+    // type_after.copy_to_host(type_after_cpu.data(), atom.number_of_atoms);
+    // for (int n = 0; n < atom.number_of_atoms; ++n) {
+    // if (type_before_cpu[n] != type_after_cpu[n])
+    // printf("%d\n", n);
+    // }
+
+    exit(1);
   }
 }
diff --git a/src/mc/mc_ensemble_canonical.cuh b/src/mc/mc_ensemble_canonical.cuh
index b6666aa98..51396ba01 100644
--- a/src/mc/mc_ensemble_canonical.cuh
+++ b/src/mc/mc_ensemble_canonical.cuh
@@ -22,5 +22,5 @@ public:
   MC_Ensemble_Canonical(int num_steps_mc, double temperature);
   virtual ~MC_Ensemble_Canonical(void);
 
-  virtual void compute(Atom& atom);
+  virtual void compute(Atom& atom, Box& box);
 };

From afabb029ce3dbbc28af6d96ea89c9852074f4f11 Mon Sep 17 00:00:00 2001
From: Zheyong Fan <brucenju@gmail.com>
Date: Thu, 10 Aug 2023 18:51:59 +0800
Subject: [PATCH 04/31] prepare data for energy calculator

---
 src/mc/mc_ensemble_canonical.cu | 92 ++++++++++++++++++++++++++++++---
 1 file changed, 84 insertions(+), 8 deletions(-)

diff --git a/src/mc/mc_ensemble_canonical.cu b/src/mc/mc_ensemble_canonical.cu
index 92525d350..2cec82ba0 100644
--- a/src/mc/mc_ensemble_canonical.cu
+++ b/src/mc/mc_ensemble_canonical.cu
@@ -53,6 +53,61 @@ static __global__ void get_types(
   }
 }
 
+static __global__ void create_inputs_for_energy_calculator(
+  const int N,
+  const Box box,
+  const float rc_radial_square,
+  const float rc_angular_square,
+  const double* __restrict__ g_x,
+  const double* __restrict__ g_y,
+  const double* __restrict__ g_z,
+  int* g_NN_radial,
+  int* g_NL_radial,
+  int* g_NN_angular,
+  int* g_NL_angular,
+  float* g_x12_radial,
+  float* g_y12_radial,
+  float* g_z12_radial,
+  float* g_x12_angular,
+  float* g_y12_angular,
+  float* g_z12_angular)
+{
+  int n1 = blockIdx.x * blockDim.x + threadIdx.x;
+  if (n1 < N) {
+    double x1 = g_x[n1];
+    double y1 = g_y[n1];
+    double z1 = g_z[n1];
+    int count_radial = 0;
+    int count_angular = 0;
+    for (int n2 = 0; n2 < N; ++n2) {
+      if (n2 == n1) {
+        continue;
+      }
+      double x12 = g_x[n2] - x1;
+      double y12 = g_y[n2] - y1;
+      double z12 = g_z[n2] - z1;
+      apply_mic(box, x12, y12, z12);
+      float distance_square = float(x12 * x12 + y12 * y12 + z12 * z12);
+      if (distance_square < rc_radial_square) {
+        g_NL_radial[count_radial * N + n1] = n2;
+        g_x12_radial[count_radial * N + n1] = float(x12);
+        g_y12_radial[count_radial * N + n1] = float(y12);
+        g_z12_radial[count_radial * N + n1] = float(z12);
+        count_radial++;
+      }
+      if (distance_square < rc_angular_square) {
+        g_NL_angular[count_angular * N + n1] = n2;
+        g_x12_angular[count_angular * N + n1] = float(x12);
+        g_y12_angular[count_angular * N + n1] = float(y12);
+        g_z12_angular[count_angular * N + n1] = float(z12);
+        count_angular++;
+      }
+    }
+    g_NN_radial[n1] = count_radial;
+    g_NN_angular[n1] = count_angular;
+  }
+}
+
 static int get_type(const int i, const int* g_type)
 {
   int type_i = 0;
@@ -92,16 +147,37 @@ void MC_Ensemble_Canonical::compute(Atom& atom, Box& box)
       atom.type.data(),
       type_before.data(),
       type_after.data());
+    CUDA_CHECK_KERNEL
 
-    // std::vector<int> type_before_cpu(atom.number_of_atoms);
-    // std::vector<int> type_after_cpu(atom.number_of_atoms);
-    // type_before.copy_to_host(type_before_cpu.data(), atom.number_of_atoms);
-    // type_after.copy_to_host(type_after_cpu.data(), atom.number_of_atoms);
-    // for (int n = 0; n < atom.number_of_atoms; ++n) {
-    // if (type_before_cpu[n] != type_after_cpu[n])
-    // printf("%d\n", n);
-    // }
+    create_inputs_for_energy_calculator<<<(atom.number_of_atoms - 1) / 64 + 1, 64>>>(
+      atom.number_of_atoms,
+      box,
+      64.0f,
+      16.0f,
+      atom.position_per_atom.data(),
+      atom.position_per_atom.data() + atom.number_of_atoms,
+      atom.position_per_atom.data() + atom.number_of_atoms * 2,
+      NN_radial.data(),
+      NL_radial.data(),
+      NN_angular.data(),
+      NL_angular.data(),
+      x12_radial.data(),
+      y12_radial.data(),
+      z12_radial.data(),
+      x12_angular.data(),
+      y12_angular.data(),
+      z12_angular.data());
+    CUDA_CHECK_KERNEL
 
+    /*
+        std::vector<int> NN_radial_cpu(atom.number_of_atoms);
+        std::vector<int> NN_angular_cpu(atom.number_of_atoms);
+        NN_radial.copy_to_host(NN_radial_cpu.data(), atom.number_of_atoms);
+        NN_angular.copy_to_host(NN_angular_cpu.data(), atom.number_of_atoms);
+        for (int n = 0; n < atom.number_of_atoms; ++n) {
+          printf("%d %d\n", NN_radial_cpu[n], NN_angular_cpu[n]);
+        }
+    */
     exit(1);
   }
 }

From fe1b0e7d88cc6ff905110f52f2e907cf8c0aadb3 Mon Sep 17 00:00:00 2001
From: Paul Erhart <erhart@chalmers.se>
Date: Thu, 10 Aug 2023 19:30:35 +0200
Subject: [PATCH 05/31] Fixed typo in makefile

---
 src/makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
 mode change 100755 => 100644 src/makefile

diff --git a/src/makefile b/src/makefile
old mode 100755
new mode 100644
index c7011287d..66917ba26
--- a/src/makefile
+++ b/src/makefile
@@ -109,7 +109,7 @@ main_nep/%.obj: main_nep/%.cu $(HEADERS)
 else # for Linux
 integrate/%.o: integrate/%.cu $(HEADERS)
 	$(CC) $(CFLAGS) $(INC) -c $< -o $@
-md/%.o: mc/%.cu $(HEADERS)
+mc/%.o: mc/%.cu $(HEADERS)
 	$(CC) $(CFLAGS) $(INC) -c $< -o $@
 minimize/%.o: minimize/%.cu $(HEADERS)
 	$(CC) $(CFLAGS) $(INC) -c $< -o $@

From 4ff0af1f5934f7584bad3ad087c34e8c08f723cc Mon Sep 17 00:00:00 2001
From: Zheyong Fan <brucenju@gmail.com>
Date: Thu, 10 Aug 2023 18:12:20 +0800
Subject: [PATCH 06/31] start to add nep energy calculator

---
 src/mc/mc_ensemble.cu           |   3 +
 src/mc/mc_ensemble.cuh          |   3 +
 src/mc/mc_ensemble_canonical.cu |  21 +-
 src/mc/nep_energy.cu            | 462 ++++++++++++++++++++++++++++++++
 src/mc/nep_energy.cuh           |  79 ++++++
 5 files changed, 557 insertions(+), 11 deletions(-)
 create mode 100644 src/mc/nep_energy.cu
 create mode 100644 src/mc/nep_energy.cuh

diff --git a/src/mc/mc_ensemble.cu b/src/mc/mc_ensemble.cu
index 05d891391..72ddb8853 100644
--- a/src/mc/mc_ensemble.cu
+++ b/src/mc/mc_ensemble.cu
@@ -39,6 +39,9 @@ MC_Ensemble::MC_Ensemble(void)
   pe_before.resize(n_max);
   pe_after.resize(n_max);
 
+  // TODO
+  nep_energy.initialize("../examples/11_NEP_potential_PbTe/nep.txt");
+
 #ifdef DEBUG
   rng = std::mt19937(13579);
 #else
diff --git a/src/mc/mc_ensemble.cuh b/src/mc/mc_ensemble.cuh
index de1eb211c..984db2d77 100644
--- a/src/mc/mc_ensemble.cuh
+++ b/src/mc/mc_ensemble.cuh
@@ -17,6 +17,7 @@
 #include "model/atom.cuh"
 #include "model/box.cuh"
 #include "model/group.cuh"
+#include "nep_energy.cuh"
 #include "utilities/gpu_vector.cuh"
 #include <random>
 #include <vector>
@@ -48,4 +49,6 @@ protected:
   GPU_Vector<float> z12_angular;
   GPU_Vector<float> pe_before;
   GPU_Vector<float> pe_after;
+
+  NEP_Energy nep_energy;
 };
diff --git a/src/mc/mc_ensemble_canonical.cu b/src/mc/mc_ensemble_canonical.cu
index 2cec82ba0..38a2f644e 100644
--- a/src/mc/mc_ensemble_canonical.cu
+++ b/src/mc/mc_ensemble_canonical.cu
@@ -152,8 +152,8 @@ void MC_Ensemble_Canonical::compute(Atom& atom, Box& box)
     create_inputs_for_energy_calculator<<<(atom.number_of_atoms - 1) / 64 + 1, 64>>>(
       atom.number_of_atoms,
       box,
-      64.0f,
-      16.0f,
+      nep_energy.paramb.rc_radial * nep_energy.paramb.rc_radial,
+      nep_energy.paramb.rc_angular * nep_energy.paramb.rc_angular,
       atom.position_per_atom.data(),
       atom.position_per_atom.data() + atom.number_of_atoms,
       atom.position_per_atom.data() + atom.number_of_atoms * 2,
@@ -169,15 +169,14 @@ void MC_Ensemble_Canonical::compute(Atom& atom, Box& box)
       z12_angular.data());
     CUDA_CHECK_KERNEL
 
-    /*
-        std::vector<int> NN_radial_cpu(atom.number_of_atoms);
-        std::vector<int> NN_angular_cpu(atom.number_of_atoms);
-        NN_radial.copy_to_host(NN_radial_cpu.data(), atom.number_of_atoms);
-        NN_angular.copy_to_host(NN_angular_cpu.data(), atom.number_of_atoms);
-        for (int n = 0; n < atom.number_of_atoms; ++n) {
-          printf("%d %d\n", NN_radial_cpu[n], NN_angular_cpu[n]);
-        }
-    */
+    std::vector<int> NN_radial_cpu(atom.number_of_atoms);
+    std::vector<int> NN_angular_cpu(atom.number_of_atoms);
+    NN_radial.copy_to_host(NN_radial_cpu.data(), atom.number_of_atoms);
+    NN_angular.copy_to_host(NN_angular_cpu.data(), atom.number_of_atoms);
+    for (int n = 0; n < atom.number_of_atoms; ++n) {
+      printf("%d %d\n", NN_radial_cpu[n], NN_angular_cpu[n]);
+    }
+
     exit(1);
   }
 }
diff --git a/src/mc/nep_energy.cu b/src/mc/nep_energy.cu
new file mode 100644
index 000000000..4e9cbc122
--- /dev/null
+++ b/src/mc/nep_energy.cu
@@ -0,0 +1,462 @@
+/*
+    Copyright 2017 Zheyong Fan, Ville Vierimaa, Mikko Ervasti, and Ari Harju
+    This file is part of GPUMD.
+    GPUMD is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+    GPUMD is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+    You should have received a copy of the GNU General Public License
+    along with GPUMD.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/*----------------------------------------------------------------------------80
+The neuroevolution potential (NEP)
+Ref: Zheyong Fan et al., Neuroevolution machine learning potentials:
+Combining high accuracy and low cost in atomistic simulations and application to
+heat transport, Phys. Rev. B. 104, 104309 (2021).
+------------------------------------------------------------------------------*/
+
+#include "nep_energy.cuh"
+#include "utilities/common.cuh"
+#include "utilities/error.cuh"
+#include "utilities/nep_utilities.cuh"
+#include <fstream>
+#include <iostream>
+#include <string>
+#include <vector>
+
+const std::string ELEMENTS[NUM_ELEMENTS] = {
+  "H",  "He", "Li", "Be", "B",  "C",  "N",  "O",  "F",  "Ne", "Na", "Mg", "Al", "Si", "P",
+  "S",  "Cl", "Ar", "K",  "Ca", "Sc", "Ti", "V",  "Cr", "Mn", "Fe", "Co", "Ni", "Cu", "Zn",
+  "Ga", "Ge", "As", "Se", "Br", "Kr", "Rb", "Sr", "Y",  "Zr", "Nb", "Mo", "Tc", "Ru", "Rh",
+  "Pd", "Ag", "Cd", "In", "Sn", "Sb", "Te", "I",  "Xe", "Cs", "Ba", "La", "Ce", "Pr", "Nd",
+  "Pm", "Sm", "Eu", "Gd", "Tb", "Dy", "Ho", "Er", "Tm", "Yb", "Lu", "Hf", "Ta", "W",  "Re",
+  "Os", "Ir", "Pt", "Au", "Hg", "Tl", "Pb", "Bi", "Po", "At", "Rn", "Fr", "Ra", "Ac", "Th",
+  "Pa", "U",  "Np", "Pu", "Am", "Cm", "Bk", "Cf", "Es", "Fm", "Md", "No", "Lr"};
+
+void NEP_Energy::initialize(const char* file_potential)
+{
+
+  std::ifstream input(file_potential);
+  if (!input.is_open()) {
+    std::cout << "Failed to open " << file_potential << std::endl;
+    exit(1);
+  }
+
+  // nep3 1 C
+  std::vector<std::string> tokens = get_tokens(input);
+  if (tokens.size() < 3) {
+    std::cout << "The first line of nep.txt should have at least 3 items." << std::endl;
+    exit(1);
+  }
+  if (tokens[0] == "nep") {
+    paramb.version = 2;
+    zbl.enabled = false;
+  } else if (tokens[0] == "nep3") {
+    paramb.version = 3;
+    zbl.enabled = false;
+  } else if (tokens[0] == "nep_zbl") {
+    paramb.version = 2;
+    zbl.enabled = true;
+  } else if (tokens[0] == "nep3_zbl") {
+    paramb.version = 3;
+    zbl.enabled = true;
+  } else if (tokens[0] == "nep4") {
+    paramb.version = 4;
+    zbl.enabled = false;
+  } else if (tokens[0] == "nep4_zbl") {
+    paramb.version = 4;
+    zbl.enabled = true;
+  }
+  paramb.num_types = get_int_from_token(tokens[1], __FILE__, __LINE__);
+  if (tokens.size() != 2 + paramb.num_types) {
+    std::cout << "The first line of nep.txt should have " << paramb.num_types << " atom symbols."
+              << std::endl;
+    exit(1);
+  }
+
+  if (paramb.num_types == 1) {
+    printf("Use the NEP%d potential with %d atom type.\n", paramb.version, paramb.num_types);
+  } else {
+    printf("Use the NEP%d potential with %d atom types.\n", paramb.version, paramb.num_types);
+  }
+
+  for (int n = 0; n < paramb.num_types; ++n) {
+    int atomic_number = 0;
+    for (int m = 0; m < NUM_ELEMENTS; ++m) {
+      if (tokens[2 + n] == ELEMENTS[m]) {
+        atomic_number = m + 1;
+        break;
+      }
+    }
+    zbl.atomic_numbers[n] = atomic_number;
+    printf("    type %d (%s with Z = %g).\n", n, tokens[2 + n].c_str(), zbl.atomic_numbers[n]);
+  }
+
+  // zbl 0.7 1.4
+  if (zbl.enabled) {
+    tokens = get_tokens(input);
+    if (tokens.size() != 3) {
+      std::cout << "This line should be zbl rc_inner rc_outer." << std::endl;
+      exit(1);
+    }
+    zbl.rc_inner = get_float_from_token(tokens[1], __FILE__, __LINE__);
+    zbl.rc_outer = get_float_from_token(tokens[2], __FILE__, __LINE__);
+    if (zbl.rc_inner == 0 && zbl.rc_outer == 0) {
+      zbl.flexibled = true;
+      printf("    has the flexible ZBL potential\n");
+    } else {
+      printf(
+        "    has the universal ZBL with inner cutoff %g A and outer cutoff %g A.\n",
+        zbl.rc_inner,
+        zbl.rc_outer);
+    }
+  }
+
+  // cutoff 4.2 3.7 80 47
+  tokens = get_tokens(input);
+  if (tokens.size() != 3 && tokens.size() != 5) {
+    std::cout << "This line should be cutoff rc_radial rc_angular [MN_radial] [MN_angular].\n";
+    exit(1);
+  }
+  paramb.rc_radial = get_float_from_token(tokens[1], __FILE__, __LINE__);
+  paramb.rc_angular = get_float_from_token(tokens[2], __FILE__, __LINE__);
+  printf("    radial cutoff = %g A.\n", paramb.rc_radial);
+  printf("    angular cutoff = %g A.\n", paramb.rc_angular);
+
+  paramb.MN_radial = 500;
+  paramb.MN_angular = 100;
+
+  if (tokens.size() == 5) {
+    int MN_radial = get_int_from_token(tokens[3], __FILE__, __LINE__);
+    int MN_angular = get_int_from_token(tokens[4], __FILE__, __LINE__);
+    printf("    MN_radial = %d.\n", MN_radial);
+    printf("    MN_angular = %d.\n", MN_angular);
+    paramb.MN_radial = int(ceil(MN_radial * 1.25));
+    paramb.MN_angular = int(ceil(MN_angular * 1.25));
+    printf("    enlarged MN_radial = %d.\n", paramb.MN_radial);
+    printf("    enlarged MN_angular = %d.\n", paramb.MN_angular);
+  }
+
+  // n_max 10 8
+  tokens = get_tokens(input);
+  if (tokens.size() != 3) {
+    std::cout << "This line should be n_max n_max_radial n_max_angular." << std::endl;
+    exit(1);
+  }
+  paramb.n_max_radial = get_int_from_token(tokens[1], __FILE__, __LINE__);
+  paramb.n_max_angular = get_int_from_token(tokens[2], __FILE__, __LINE__);
+  printf("    n_max_radial = %d.\n", paramb.n_max_radial);
+  printf("    n_max_angular = %d.\n", paramb.n_max_angular);
+
+  // basis_size 10 8
+  if (paramb.version >= 3) {
+    tokens = get_tokens(input);
+    if (tokens.size() != 3) {
+      std::cout << "This line should be basis_size basis_size_radial basis_size_angular."
+                << std::endl;
+      exit(1);
+    }
+    paramb.basis_size_radial = get_int_from_token(tokens[1], __FILE__, __LINE__);
+    paramb.basis_size_angular = get_int_from_token(tokens[2], __FILE__, __LINE__);
+    printf("    basis_size_radial = %d.\n", paramb.basis_size_radial);
+    printf("    basis_size_angular = %d.\n", paramb.basis_size_angular);
+  }
+
+  // l_max
+  tokens = get_tokens(input);
+  if (paramb.version == 2) {
+    if (tokens.size() != 2) {
+      std::cout << "This line should be l_max l_max_3body." << std::endl;
+      exit(1);
+    }
+  } else {
+    if (tokens.size() != 4) {
+      std::cout << "This line should be l_max l_max_3body l_max_4body l_max_5body." << std::endl;
+      exit(1);
+    }
+  }
+
+  paramb.L_max = get_int_from_token(tokens[1], __FILE__, __LINE__);
+  printf("    l_max_3body = %d.\n", paramb.L_max);
+  paramb.num_L = paramb.L_max;
+
+  if (paramb.version >= 3) {
+    int L_max_4body = get_int_from_token(tokens[2], __FILE__, __LINE__);
+    int L_max_5body = get_int_from_token(tokens[3], __FILE__, __LINE__);
+    printf("    l_max_4body = %d.\n", L_max_4body);
+    printf("    l_max_5body = %d.\n", L_max_5body);
+    if (L_max_4body == 2) {
+      paramb.num_L += 1;
+    }
+    if (L_max_5body == 1) {
+      paramb.num_L += 1;
+    }
+  }
+
+  paramb.dim_angular = (paramb.n_max_angular + 1) * paramb.num_L;
+
+  // ANN
+  tokens = get_tokens(input);
+  if (tokens.size() != 3) {
+    std::cout << "This line should be ANN num_neurons 0." << std::endl;
+    exit(1);
+  }
+  annmb.num_neurons1 = get_int_from_token(tokens[1], __FILE__, __LINE__);
+  annmb.dim = (paramb.n_max_radial + 1) + paramb.dim_angular;
+  printf("    ANN = %d-%d-1.\n", annmb.dim, annmb.num_neurons1);
+
+  // calculated parameters:
+  paramb.rcinv_radial = 1.0f / paramb.rc_radial;
+  paramb.rcinv_angular = 1.0f / paramb.rc_angular;
+  paramb.num_types_sq = paramb.num_types * paramb.num_types;
+
+  annmb.num_para =
+    (annmb.dim + 2) * annmb.num_neurons1 * (paramb.version == 4 ? paramb.num_types : 1) + 1;
+  printf("    number of neural network parameters = %d.\n", annmb.num_para);
+  int num_para_descriptor =
+    paramb.num_types_sq * ((paramb.n_max_radial + 1) * (paramb.basis_size_radial + 1) +
+                           (paramb.n_max_angular + 1) * (paramb.basis_size_angular + 1));
+  if (paramb.version == 2) {
+    num_para_descriptor =
+      (paramb.num_types == 1)
+        ? 0
+        : paramb.num_types_sq * (paramb.n_max_radial + paramb.n_max_angular + 2);
+  }
+  printf("    number of descriptor parameters = %d.\n", num_para_descriptor);
+  annmb.num_para += num_para_descriptor;
+  printf("    total number of parameters = %d.\n", annmb.num_para);
+
+  paramb.num_c_radial =
+    paramb.num_types_sq * (paramb.n_max_radial + 1) * (paramb.basis_size_radial + 1);
+  if (paramb.version == 2) {
+    paramb.num_c_radial =
+      (paramb.num_types == 1) ? 0 : paramb.num_types_sq * (paramb.n_max_radial + 1);
+  }
+
+  // NN and descriptor parameters
+  std::vector<float> parameters(annmb.num_para);
+  for (int n = 0; n < annmb.num_para; ++n) {
+    tokens = get_tokens(input);
+    parameters[n] = get_float_from_token(tokens[0], __FILE__, __LINE__);
+  }
+  nep_parameters.resize(annmb.num_para);
+  nep_parameters.copy_from_host(parameters.data());
+  update_potential(nep_parameters.data(), annmb);
+  for (int d = 0; d < annmb.dim; ++d) {
+    tokens = get_tokens(input);
+    paramb.q_scaler[d] = get_float_from_token(tokens[0], __FILE__, __LINE__);
+  }
+
+  // flexible zbl potential parameters
+  if (zbl.flexibled) {
+    int num_type_zbl = (paramb.num_types * (paramb.num_types + 1)) / 2;
+    for (int d = 0; d < num_type_zbl; ++d) {
+      tokens = get_tokens(input);
+      zbl.rc_flexible_inner[d] = get_float_from_token(tokens[0], __FILE__, __LINE__);
+    }
+    for (int d = 0; d < num_type_zbl; ++d) {
+      tokens = get_tokens(input);
+      zbl.rc_flexible_outer[d] = get_float_from_token(tokens[0], __FILE__, __LINE__);
+    }
+    for (int d = 0; d < 6 * num_type_zbl; ++d) {
+      tokens = get_tokens(input);
+      zbl.para[d] = get_float_from_token(tokens[0], __FILE__, __LINE__);
+    }
+    zbl.num_types = paramb.num_types;
+  }
+}
+
+NEP_Energy::NEP_Energy(void)
+{
+  // nothing
+}
+
+NEP_Energy::~NEP_Energy(void)
+{
+  // nothing
+}
+
+void NEP_Energy::update_potential(float* parameters, ANN& ann)
+{
+  float* pointer = parameters;
+  for (int t = 0; t < paramb.num_types; ++t) {
+    if (t > 0 && paramb.version != 4) { // Use the same set of NN parameters for NEP2 and NEP3
+      pointer -= (ann.dim + 2) * ann.num_neurons1;
+    }
+    ann.w0[t] = pointer;
+    pointer += ann.num_neurons1 * ann.dim;
+    ann.b0[t] = pointer;
+    pointer += ann.num_neurons1;
+    ann.w1[t] = pointer;
+    pointer += ann.num_neurons1;
+  }
+  ann.b1 = pointer;
+  ann.c = ann.b1 + 1;
+}
+
+static __global__ void find_energy_nep(
+  NEP_Energy::ParaMB paramb,
+  NEP_Energy::ANN annmb,
+  const int N,
+  const int* g_NN_radial,
+  const int* g_NL_radial,
+  const int* g_NN_angular,
+  const int* g_NL_angular,
+  const int* __restrict__ g_type,
+  const float* __restrict__ g_x12_radial,
+  const float* __restrict__ g_y12_radial,
+  const float* __restrict__ g_z12_radial,
+  const float* __restrict__ g_x12_angular,
+  const float* __restrict__ g_y12_angular,
+  const float* __restrict__ g_z12_angular,
+  double* g_pe)
+{
+  int n1 = blockIdx.x * blockDim.x + threadIdx.x;
+  if (n1 < N) {
+    int t1 = g_type[n1];
+    float q[MAX_DIM] = {0.0f};
+
+    // get radial descriptors
+    for (int i1 = 0; i1 < g_NN_radial[n1]; ++i1) {
+      int index = i1 * N + n1;
+      int n2 = g_NL_radial[index];
+      float r12[3] = {g_x12_radial[index], g_y12_radial[index], g_z12_radial[index]};
+      float d12 = sqrt(r12[0] * r12[0] + r12[1] * r12[1] + r12[2] * r12[2]);
+      float fc12;
+      find_fc(paramb.rc_radial, paramb.rcinv_radial, d12, fc12);
+      int t2 = g_type[n2];
+      float fn12[MAX_NUM_N];
+      if (paramb.version == 2) {
+        find_fn(paramb.n_max_radial, paramb.rcinv_radial, d12, fc12, fn12);
+        for (int n = 0; n <= paramb.n_max_radial; ++n) {
+          float c = (paramb.num_types == 1)
+                      ? 1.0f
+                      : annmb.c[(n * paramb.num_types + t1) * paramb.num_types + t2];
+          q[n] += fn12[n] * c;
+        }
+      } else {
+        find_fn(paramb.basis_size_radial, paramb.rcinv_radial, d12, fc12, fn12);
+        for (int n = 0; n <= paramb.n_max_radial; ++n) {
+          float gn12 = 0.0f;
+          for (int k = 0; k <= paramb.basis_size_radial; ++k) {
+            int c_index = (n * (paramb.basis_size_radial + 1) + k) * paramb.num_types_sq;
+            c_index += t1 * paramb.num_types + t2;
+            gn12 += fn12[k] * annmb.c[c_index];
+          }
+          q[n] += gn12;
+        }
+      }
+    }
+
+    // get angular descriptors
+    for (int n = 0; n <= paramb.n_max_angular; ++n) {
+      float s[NUM_OF_ABC] = {0.0f};
+      for (int i1 = 0; i1 < g_NN_angular[n1]; ++i1) {
+        int index = i1 * N + n1;
+        int n2 = g_NL_angular[index];
+        float r12[3] = {g_x12_angular[index], g_y12_angular[index], g_z12_angular[index]};
+        float d12 = sqrt(r12[0] * r12[0] + r12[1] * r12[1] + r12[2] * r12[2]);
+        float fc12;
+        find_fc(paramb.rc_angular, paramb.rcinv_angular, d12, fc12);
+        int t2 = g_type[n2];
+        if (paramb.version == 2) {
+          float fn;
+          find_fn(n, paramb.rcinv_angular, d12, fc12, fn);
+          fn *=
+            (paramb.num_types == 1)
+              ? 1.0f
+              : annmb.c
+                  [((paramb.n_max_radial + 1 + n) * paramb.num_types + t1) * paramb.num_types + t2];
+          accumulate_s(d12, r12[0], r12[1], r12[2], fn, s);
+        } else {
+          float fn12[MAX_NUM_N];
+          find_fn(paramb.basis_size_angular, paramb.rcinv_angular, d12, fc12, fn12);
+          float gn12 = 0.0f;
+          for (int k = 0; k <= paramb.basis_size_angular; ++k) {
+            int c_index = (n * (paramb.basis_size_angular + 1) + k) * paramb.num_types_sq;
+            c_index += t1 * paramb.num_types + t2 + paramb.num_c_radial;
+            gn12 += fn12[k] * annmb.c[c_index];
+          }
+          accumulate_s(d12, r12[0], r12[1], r12[2], gn12, s);
+        }
+      }
+      if (paramb.num_L == paramb.L_max) {
+        find_q(paramb.n_max_angular + 1, n, s, q + (paramb.n_max_radial + 1));
+      } else if (paramb.num_L == paramb.L_max + 1) {
+        find_q_with_4body(paramb.n_max_angular + 1, n, s, q + (paramb.n_max_radial + 1));
+      } else {
+        find_q_with_5body(paramb.n_max_angular + 1, n, s, q + (paramb.n_max_radial + 1));
+      }
+    }
+
+    // nomalize descriptor
+    for (int d = 0; d < annmb.dim; ++d) {
+      q[d] = q[d] * paramb.q_scaler[d];
+    }
+
+    // get energy and energy gradient
+    float F = 0.0f, Fp[MAX_DIM] = {0.0f};
+    apply_ann_one_layer(
+      annmb.dim, annmb.num_neurons1, annmb.w0[t1], annmb.b0[t1], annmb.w1[t1], annmb.b1, q, F, Fp);
+    g_pe[n1] = F;
+  }
+}
+
+static __global__ void find_energy_ZBL(
+  const int N,
+  const NEP_Energy::ZBL zbl,
+  const int* g_NN,
+  const int* g_NL,
+  const int* __restrict__ g_type,
+  const float* __restrict__ g_x12,
+  const float* __restrict__ g_y12,
+  const float* __restrict__ g_z12,
+  double* g_pe)
+{
+  int n1 = blockIdx.x * blockDim.x + threadIdx.x + N;
+  if (n1 < N) {
+    float s_pe = 0.0f;
+    int type1 = g_type[n1];
+    float zi = zbl.atomic_numbers[type1];
+    float pow_zi = pow(zi, 0.23f);
+    for (int i1 = 0; i1 < g_NN[n1]; ++i1) {
+      int index = i1 * N + n1;
+      int n2 = g_NL[index];
+      float r12[3] = {g_x12[index], g_y12[index], g_z12[index]};
+      float d12 = sqrt(r12[0] * r12[0] + r12[1] * r12[1] + r12[2] * r12[2]);
+      float d12inv = 1.0f / d12;
+      float f, fp;
+      int type2 = g_type[n2];
+      float zj = zbl.atomic_numbers[type2];
+      float a_inv = (pow_zi + pow(zj, 0.23f)) * 2.134563f;
+      float zizj = K_C_SP * zi * zj;
+      if (zbl.flexibled) {
+        int t1, t2;
+        if (type1 < type2) {
+          t1 = type1;
+          t2 = type2;
+        } else {
+          t1 = type2;
+          t2 = type1;
+        }
+        int zbl_index = t1 * zbl.num_types - (t1 * (t1 - 1)) / 2 + (t2 - t1);
+        float rc_inner = zbl.rc_flexible_inner[zbl_index];
+        float rc_outer = zbl.rc_flexible_outer[zbl_index];
+        float ZBL_para[6];
+        for (int i = 0; i < 6; ++i) {
+          ZBL_para[i] = zbl.para[6 * zbl_index + i];
+        }
+        find_f_and_fp_zbl(ZBL_para, zizj, a_inv, rc_inner, rc_outer, d12, d12inv, f, fp);
+      } else {
+        find_f_and_fp_zbl(zizj, a_inv, zbl.rc_inner, zbl.rc_outer, d12, d12inv, f, fp);
+      }
+      s_pe += f * 0.5f;
+    }
+    g_pe[n1] += s_pe;
+  }
+}
diff --git a/src/mc/nep_energy.cuh b/src/mc/nep_energy.cuh
new file mode 100644
index 000000000..2bb43e2bf
--- /dev/null
+++ b/src/mc/nep_energy.cuh
@@ -0,0 +1,79 @@
+/*
+    Copyright 2017 Zheyong Fan, Ville Vierimaa, Mikko Ervasti, and Ari Harju
+    This file is part of GPUMD.
+    GPUMD is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+    GPUMD is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+    You should have received a copy of the GNU General Public License
+    along with GPUMD.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+#include "model/box.cuh"
+#include "utilities/common.cuh"
+#include "utilities/gpu_vector.cuh"
+
+class NEP_Energy
+{
+public:
+  struct ParaMB {
+    int version = 2;            // NEP version, 2 for NEP2 and 3 for NEP3
+    float rc_radial = 0.0f;     // radial cutoff
+    float rc_angular = 0.0f;    // angular cutoff
+    float rcinv_radial = 0.0f;  // inverse of the radial cutoff
+    float rcinv_angular = 0.0f; // inverse of the angular cutoff
+    int MN_radial = 200;
+    int MN_angular = 100;
+    int n_max_radial = 0;  // n_radial = 0, 1, 2, ..., n_max_radial
+    int n_max_angular = 0; // n_angular = 0, 1, 2, ..., n_max_angular
+    int L_max = 0;         // l = 0, 1, 2, ..., L_max
+    int dim_angular;
+    int num_L;
+    int basis_size_radial = 8;  // for nep3
+    int basis_size_angular = 8; // for nep3
+    int num_types_sq = 0;       // for nep3
+    int num_c_radial = 0;       // for nep3
+    int num_types = 0;
+    float q_scaler[140];
+  };
+
+  struct ANN {
+    int dim = 0;          // dimension of the descriptor
+    int num_neurons1 = 0; // number of neurons in the 1st hidden layer
+    int num_para = 0;     // number of parameters
+    const float* w0[100]; // weight from the input layer to the hidden layer
+    const float* b0[100]; // bias for the hidden layer
+    const float* w1[100]; // weight from the hidden layer to the output layer
+    const float* b1;      // bias for the output layer
+    const float* c;
+  };
+
+  struct ZBL {
+    bool enabled = false;
+    bool flexibled = false;
+    float rc_inner = 1.0f;
+    float rc_outer = 2.0f;
+    float rc_flexible_inner[55];
+    float rc_flexible_outer[55];
+    float para[330];
+    float atomic_numbers[NUM_ELEMENTS];
+    int num_types;
+  };
+
+  ParaMB paramb;
+  ANN annmb;
+  ZBL zbl;
+
+  NEP_Energy(void);
+  ~NEP_Energy(void);
+  void initialize(const char* file_potential);
+
+private:
+  GPU_Vector<float> nep_parameters; // parameters to be optimized
+  void update_potential(float* parameters, ANN& ann);
+};

From 07b9add2d80ae21cc6d4cca2700e94b53b30de4a Mon Sep 17 00:00:00 2001
From: Zheyong Fan <brucenju@gmail.com>
Date: Thu, 10 Aug 2023 19:23:17 +0800
Subject: [PATCH 07/31] nep energy calculator finished

---
 src/mc/mc_ensemble_canonical.cu | 45 +++++++++++++++++++++++++++++----
 src/mc/nep_energy.cu            | 44 +++++++++++++++++++++++++++++---
 src/mc/nep_energy.cuh           | 14 ++++++++++
 3 files changed, 95 insertions(+), 8 deletions(-)

diff --git a/src/mc/mc_ensemble_canonical.cu b/src/mc/mc_ensemble_canonical.cu
index 38a2f644e..0a510e901 100644
--- a/src/mc/mc_ensemble_canonical.cu
+++ b/src/mc/mc_ensemble_canonical.cu
@@ -169,13 +169,48 @@ void MC_Ensemble_Canonical::compute(Atom& atom, Box& box)
       z12_angular.data());
     CUDA_CHECK_KERNEL
 
-    std::vector<int> NN_radial_cpu(atom.number_of_atoms);
-    std::vector<int> NN_angular_cpu(atom.number_of_atoms);
-    NN_radial.copy_to_host(NN_radial_cpu.data(), atom.number_of_atoms);
-    NN_angular.copy_to_host(NN_angular_cpu.data(), atom.number_of_atoms);
+    nep_energy.find_energy(
+      atom.number_of_atoms,
+      NN_radial.data(),
+      NL_radial.data(),
+      NN_angular.data(),
+      NL_angular.data(),
+      type_before.data(),
+      x12_radial.data(),
+      y12_radial.data(),
+      z12_radial.data(),
+      x12_angular.data(),
+      y12_angular.data(),
+      z12_angular.data(),
+      pe_before.data());
+
+    nep_energy.find_energy(
+      atom.number_of_atoms,
+      NN_radial.data(),
+      NL_radial.data(),
+      NN_angular.data(),
+      NL_angular.data(),
+      type_after.data(),
+      x12_radial.data(),
+      y12_radial.data(),
+      z12_radial.data(),
+      x12_angular.data(),
+      y12_angular.data(),
+      z12_angular.data(),
+      pe_after.data());
+
+    std::vector<float> pe_before_cpu(atom.number_of_atoms);
+    std::vector<float> pe_after_cpu(atom.number_of_atoms);
+    pe_before.copy_to_host(pe_before_cpu.data(), atom.number_of_atoms);
+    pe_after.copy_to_host(pe_after_cpu.data(), atom.number_of_atoms);
+    float pe_before_total = 0.0f;
+    float pe_after_total = 0.0f;
     for (int n = 0; n < atom.number_of_atoms; ++n) {
-      printf("%d %d\n", NN_radial_cpu[n], NN_angular_cpu[n]);
+      pe_before_total += pe_before_cpu[n];
+      pe_after_total += pe_after_cpu[n];
     }
+    printf("total energy before swapping = %g eV.\n", pe_before_total);
+    printf("total energy after swapping = %g eV.\n", pe_after_total);
 
     exit(1);
   }
diff --git a/src/mc/nep_energy.cu b/src/mc/nep_energy.cu
index 4e9cbc122..fb4c7c886 100644
--- a/src/mc/nep_energy.cu
+++ b/src/mc/nep_energy.cu
@@ -314,7 +314,7 @@ static __global__ void find_energy_nep(
   const float* __restrict__ g_x12_angular,
   const float* __restrict__ g_y12_angular,
   const float* __restrict__ g_z12_angular,
-  double* g_pe)
+  float* g_pe)
 {
   int n1 = blockIdx.x * blockDim.x + threadIdx.x;
   if (n1 < N) {
@@ -407,7 +407,7 @@ static __global__ void find_energy_nep(
   }
 }
 
-static __global__ void find_energy_ZBL(
+static __global__ void find_energy_zbl(
   const int N,
   const NEP_Energy::ZBL zbl,
   const int* g_NN,
@@ -416,7 +416,7 @@ static __global__ void find_energy_ZBL(
   const float* __restrict__ g_x12,
   const float* __restrict__ g_y12,
   const float* __restrict__ g_z12,
-  double* g_pe)
+  float* g_pe)
 {
   int n1 = blockIdx.x * blockDim.x + threadIdx.x + N;
   if (n1 < N) {
@@ -460,3 +460,41 @@ static __global__ void find_energy_ZBL(
     g_pe[n1] += s_pe;
   }
 }
+
+void NEP_Energy::find_energy(
+  const int N,
+  const int* g_NN_radial,
+  const int* g_NL_radial,
+  const int* g_NN_angular,
+  const int* g_NL_angular,
+  const int* g_type,
+  const float* g_x12_radial,
+  const float* g_y12_radial,
+  const float* g_z12_radial,
+  const float* g_x12_angular,
+  const float* g_y12_angular,
+  const float* g_z12_angular,
+  float* g_pe)
+{
+  find_energy_nep<<<(N - 1) / 64 + 1, 64>>>(
+    paramb,
+    annmb,
+    N,
+    g_NN_radial,
+    g_NL_radial,
+    g_NN_angular,
+    g_NL_angular,
+    g_type,
+    g_x12_radial,
+    g_y12_radial,
+    g_z12_radial,
+    g_x12_angular,
+    g_y12_angular,
+    g_z12_angular,
+    g_pe);
+  CUDA_CHECK_KERNEL
+
+  find_energy_zbl<<<(N - 1) / 64 + 1, 64>>>(
+    N, zbl, g_NN_angular, g_NL_angular, g_type, g_x12_angular, g_y12_angular, g_z12_angular, g_pe);
+  CUDA_CHECK_KERNEL
+}
diff --git a/src/mc/nep_energy.cuh b/src/mc/nep_energy.cuh
index 2bb43e2bf..02d03d3a6 100644
--- a/src/mc/nep_energy.cuh
+++ b/src/mc/nep_energy.cuh
@@ -72,6 +72,20 @@ public:
   NEP_Energy(void);
   ~NEP_Energy(void);
   void initialize(const char* file_potential);
+  void find_energy(
+    const int N,
+    const int* g_NN_radial,
+    const int* g_NL_radial,
+    const int* g_NN_angular,
+    const int* g_NL_angular,
+    const int* g_type,
+    const float* g_x12_radial,
+    const float* g_y12_radial,
+    const float* g_z12_radial,
+    const float* g_x12_angular,
+    const float* g_y12_angular,
+    const float* g_z12_angular,
+    float* g_pe);
 
 private:
   GPU_Vector<float> nep_parameters; // parameters to be optimized

From fc42c15843dd0ccfdc917411ca886e4ae60b1f99 Mon Sep 17 00:00:00 2001
From: Zheyong Fan <brucenju@gmail.com>
Date: Thu, 10 Aug 2023 19:48:36 +0800
Subject: [PATCH 08/31] accept or reject

---
 src/mc/mc_ensemble_canonical.cu | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/src/mc/mc_ensemble_canonical.cu b/src/mc/mc_ensemble_canonical.cu
index 0a510e901..b46c8ac51 100644
--- a/src/mc/mc_ensemble_canonical.cu
+++ b/src/mc/mc_ensemble_canonical.cu
@@ -211,7 +211,17 @@ void MC_Ensemble_Canonical::compute(Atom& atom, Box& box)
     }
     printf("total energy before swapping = %g eV.\n", pe_before_total);
     printf("total energy after swapping = %g eV.\n", pe_after_total);
-
-    exit(1);
+    float energy_difference = pe_after_total - pe_before_total;
+    std::uniform_real_distribution<float> r2(0, 1);
+    float random_number = r2(rng);
+    printf("random number = %g.\n", random_number);
+    float probability = exp(-energy_difference / (K_B * temperature));
+    printf("probability = %g.\n", probability);
+
+    if (random_number < probability) {
+      printf("the MC trail is accepted.\n");
+    } else {
+      printf("the MC trail is rejected.\n");
+    }
   }
 }

From 5ed52c2961dc6c61935f659af716eeac5c130f9e Mon Sep 17 00:00:00 2001
From: Zheyong Fan <brucenju@gmail.com>
Date: Sun, 13 Aug 2023 06:05:47 +0800
Subject: [PATCH 09/31] find the local atoms for energy calculations

---
 src/mc/mc_ensemble.cu            |  19 +++--
 src/mc/mc_ensemble_canonical.cu  | 140 ++++++++++++++++++++++++++++++-
 src/mc/mc_ensemble_canonical.cuh |   8 ++
 3 files changed, 156 insertions(+), 11 deletions(-)

diff --git a/src/mc/mc_ensemble.cu b/src/mc/mc_ensemble.cu
index 72ddb8853..e108d1695 100644
--- a/src/mc/mc_ensemble.cu
+++ b/src/mc/mc_ensemble.cu
@@ -23,19 +23,20 @@ The abstract base class (ABC) for the MC_Ensemble classes.
 
 MC_Ensemble::MC_Ensemble(void)
 {
-  const int n_max = 1000;
+  const int n_max = 16000;
+  const int m_max = 100;
   NN_radial.resize(n_max);
   NN_angular.resize(n_max);
-  NL_radial.resize(n_max * n_max);
-  NL_angular.resize(n_max * n_max);
+  NL_radial.resize(n_max * m_max);
+  NL_angular.resize(n_max * m_max);
   type_before.resize(n_max);
   type_after.resize(n_max);
-  x12_radial.resize(n_max * n_max);
-  y12_radial.resize(n_max * n_max);
-  z12_radial.resize(n_max * n_max);
-  x12_angular.resize(n_max * n_max);
-  y12_angular.resize(n_max * n_max);
-  z12_angular.resize(n_max * n_max);
+  x12_radial.resize(n_max * m_max);
+  y12_radial.resize(n_max * m_max);
+  z12_radial.resize(n_max * m_max);
+  x12_angular.resize(n_max * m_max);
+  y12_angular.resize(n_max * m_max);
+  z12_angular.resize(n_max * m_max);
   pe_before.resize(n_max);
   pe_after.resize(n_max);
 
diff --git a/src/mc/mc_ensemble_canonical.cu b/src/mc/mc_ensemble_canonical.cu
index b46c8ac51..f6cda9975 100644
--- a/src/mc/mc_ensemble_canonical.cu
+++ b/src/mc/mc_ensemble_canonical.cu
@@ -23,6 +23,12 @@ MC_Ensemble_Canonical::MC_Ensemble_Canonical(int num_steps_mc_input, double temp
 {
   num_steps_mc = num_steps_mc_input;
   temperature = temperature_input;
+  NN_i.resize(1);
+  NN_j.resize(1);
+  NN_ij.resize(1);
+  NL_i.resize(1000);
+  NL_j.resize(1000);
+  NL_ij.resize(1000);
 }
 
 MC_Ensemble_Canonical::~MC_Ensemble_Canonical(void)
@@ -53,6 +59,46 @@ static __global__ void get_types(
   }
 }
 
+static __global__ void get_neighbors_of_i_and_j(
+  const int N,
+  const Box box,
+  const int i,
+  const int j,
+  const float rc_radial_square,
+  const double* __restrict__ g_x,
+  const double* __restrict__ g_y,
+  const double* __restrict__ g_z,
+  int* g_NN_i,
+  int* g_NN_j,
+  int* g_NL_i,
+  int* g_NL_j)
+{
+  int n = blockIdx.x * blockDim.x + threadIdx.x;
+  if (n < N) {
+    double x0 = g_x[n];
+    double y0 = g_y[n];
+    double z0 = g_z[n];
+    double x0i = g_x[i] - x0;
+    double y0i = g_y[i] - y0;
+    double z0i = g_z[i] - z0;
+    double x0j = g_x[j] - x0;
+    double y0j = g_y[j] - y0;
+    double z0j = g_z[j] - z0;
+
+    apply_mic(box, x0i, y0i, z0i);
+    float distance_square_i = float(x0i * x0i + y0i * y0i + z0i * z0i);
+    apply_mic(box, x0j, y0j, z0j);
+    float distance_square_j = float(x0j * x0j + y0j * y0j + z0j * z0j);
+
+    if (distance_square_i < rc_radial_square) {
+      g_NL_i[atomicAdd(g_NN_i, 1)] = n;
+    }
+    if (distance_square_j < rc_radial_square) {
+      g_NL_j[atomicAdd(g_NN_j, 1)] = n;
+    }
+  }
+}
+
 static __global__ void create_inputs_for_energy_calculator(
   const int N,
   const Box box,
@@ -115,8 +161,32 @@ static int get_type(const int i, const int* g_type)
   return type_i;
 }
 
+static bool check_if_small_box(const double rc, const Box& box)
+{
+  double volume = box.get_volume();
+  double thickness_x = volume / box.get_area(0);
+  double thickness_y = volume / box.get_area(1);
+  double thickness_z = volume / box.get_area(2);
+  bool is_small_box = false;
+  if (box.pbc_x && thickness_x <= 2.0 * rc) {
+    is_small_box = true;
+  }
+  if (box.pbc_y && thickness_y <= 2.0 * rc) {
+    is_small_box = true;
+  }
+  if (box.pbc_z && thickness_z <= 2.0 * rc) {
+    is_small_box = true;
+  }
+  return is_small_box;
+}
+
 void MC_Ensemble_Canonical::compute(Atom& atom, Box& box)
 {
+  if (check_if_small_box(nep_energy.paramb.rc_radial, box)) {
+    printf("Cannot use small box for MCMD.\n");
+    exit(1);
+  }
+
   std::uniform_int_distribution<int> r1(0, atom.number_of_atoms - 1);
 
   for (int step = 0; step < num_steps_mc; ++step) {
@@ -138,6 +208,72 @@ void MC_Ensemble_Canonical::compute(Atom& atom, Box& box)
       j,
       type_j);
 
+    cudaMemset(NN_i.data(), 0, sizeof(int));
+    cudaMemset(NN_j.data(), 0, sizeof(int));
+    get_neighbors_of_i_and_j<<<(atom.number_of_atoms - 1) / 64 + 1, 64>>>(
+      atom.number_of_atoms,
+      box,
+      i,
+      j,
+      nep_energy.paramb.rc_radial * nep_energy.paramb.rc_radial,
+      atom.position_per_atom.data(),
+      atom.position_per_atom.data() + atom.number_of_atoms,
+      atom.position_per_atom.data() + atom.number_of_atoms * 2,
+      NN_i.data(),
+      NN_j.data(),
+      NL_i.data(),
+      NL_j.data());
+    CUDA_CHECK_KERNEL
+
+    // copy to host
+    int NN_i_cpu, NN_j_cpu;
+    int NL_i_cpu[1000], NL_j_cpu[1000];
+    NN_i.copy_to_host(&NN_i_cpu);
+    NN_j.copy_to_host(&NN_j_cpu);
+    NL_i.copy_to_host(NL_i_cpu, NN_i_cpu);
+    NL_j.copy_to_host(NL_j_cpu, NN_j_cpu);
+
+    printf("        atom %d has %d neighbors:\n", i, NN_i_cpu);
+    for (int k = 0; k < NN_i_cpu; ++k) {
+      printf(" %d", NL_i_cpu[k]);
+    }
+    printf("\n");
+    printf("        atom %d has %d neighbors:\n", j, NN_j_cpu);
+    for (int k = 0; k < NN_j_cpu; ++k) {
+      printf(" %d", NL_j_cpu[k]);
+    }
+    printf("\n");
+
+    // check in host
+    int NN_ij_cpu = 0;
+    int NL_ij_cpu[1000];
+    for (; NN_ij_cpu < NN_i_cpu; ++NN_ij_cpu) {
+      NL_ij_cpu[NN_ij_cpu] = NL_i_cpu[NN_ij_cpu];
+    }
+
+    for (int k = 0; k < NN_j_cpu; ++k) {
+      bool is_repeating = false;
+      for (int m = 0; m < NN_i_cpu; ++m) {
+        if (NL_j_cpu[k] == NL_i_cpu[m]) {
+          is_repeating = true;
+          break;
+        }
+      }
+      if (!is_repeating) {
+        NL_ij_cpu[NN_ij_cpu++] = NL_j_cpu[k];
+      }
+    }
+
+    printf("        i and j has %d neighbors in total:\n", NN_ij_cpu);
+    for (int k = 0; k < NN_ij_cpu; ++k) {
+      printf(" %d", NL_ij_cpu[k]);
+    }
+    printf("\n");
+
+    // copy to device
+    NN_ij.copy_from_host(&NN_ij_cpu);
+    NL_ij.copy_from_host(NL_ij_cpu, NN_ij_cpu);
+
     get_types<<<(atom.number_of_atoms - 1) / 64 + 1, 64>>>(
       atom.number_of_atoms,
       i,
@@ -209,8 +345,8 @@ void MC_Ensemble_Canonical::compute(Atom& atom, Box& box)
       pe_before_total += pe_before_cpu[n];
       pe_after_total += pe_after_cpu[n];
     }
-    printf("total energy before swapping = %g eV.\n", pe_before_total);
-    printf("total energy after swapping = %g eV.\n", pe_after_total);
+    printf("per-atom energy before swapping = %g eV.\n", pe_before_total / atom.number_of_atoms);
+    printf("per-atom energy after swapping = %g eV.\n", pe_after_total / atom.number_of_atoms);
     float energy_difference = pe_after_total - pe_before_total;
     std::uniform_real_distribution<float> r2(0, 1);
     float random_number = r2(rng);
diff --git a/src/mc/mc_ensemble_canonical.cuh b/src/mc/mc_ensemble_canonical.cuh
index 51396ba01..d614fa186 100644
--- a/src/mc/mc_ensemble_canonical.cuh
+++ b/src/mc/mc_ensemble_canonical.cuh
@@ -23,4 +23,12 @@ public:
   virtual ~MC_Ensemble_Canonical(void);
 
   virtual void compute(Atom& atom, Box& box);
+
+private:
+  GPU_Vector<int> NN_i;
+  GPU_Vector<int> NN_j;
+  GPU_Vector<int> NN_ij;
+  GPU_Vector<int> NL_i;
+  GPU_Vector<int> NL_j;
+  GPU_Vector<int> NL_ij;
 };

From 88ed7b36dd052a0e876d75d236b0136fd914ec2f Mon Sep 17 00:00:00 2001
From: Zheyong Fan <brucenju@gmail.com>
Date: Sun, 13 Aug 2023 06:25:54 +0800
Subject: [PATCH 10/31] simplify

---
 src/mc/mc_ensemble_canonical.cu  | 71 ++++++--------------------------
 src/mc/mc_ensemble_canonical.cuh |  4 --
 2 files changed, 13 insertions(+), 62 deletions(-)

diff --git a/src/mc/mc_ensemble_canonical.cu b/src/mc/mc_ensemble_canonical.cu
index f6cda9975..934f71ec1 100644
--- a/src/mc/mc_ensemble_canonical.cu
+++ b/src/mc/mc_ensemble_canonical.cu
@@ -23,11 +23,7 @@ MC_Ensemble_Canonical::MC_Ensemble_Canonical(int num_steps_mc_input, double temp
 {
   num_steps_mc = num_steps_mc_input;
   temperature = temperature_input;
-  NN_i.resize(1);
-  NN_j.resize(1);
   NN_ij.resize(1);
-  NL_i.resize(1000);
-  NL_j.resize(1000);
   NL_ij.resize(1000);
 }
 
@@ -68,10 +64,8 @@ static __global__ void get_neighbors_of_i_and_j(
   const double* __restrict__ g_x,
   const double* __restrict__ g_y,
   const double* __restrict__ g_z,
-  int* g_NN_i,
-  int* g_NN_j,
-  int* g_NL_i,
-  int* g_NL_j)
+  int* g_NN_ij,
+  int* g_NL_ij)
 {
   int n = blockIdx.x * blockDim.x + threadIdx.x;
   if (n < N) {
@@ -91,10 +85,11 @@ static __global__ void get_neighbors_of_i_and_j(
     float distance_square_j = float(x0j * x0j + y0j * y0j + z0j * z0j);
 
     if (distance_square_i < rc_radial_square) {
-      g_NL_i[atomicAdd(g_NN_i, 1)] = n;
-    }
-    if (distance_square_j < rc_radial_square) {
-      g_NL_j[atomicAdd(g_NN_j, 1)] = n;
+      g_NL_ij[atomicAdd(g_NN_ij, 1)] = n;
+    } else {
+      if (distance_square_j < rc_radial_square) {
+        g_NL_ij[atomicAdd(g_NN_ij, 1)] = n;
+      }
     }
   }
 }
@@ -208,8 +203,7 @@ void MC_Ensemble_Canonical::compute(Atom& atom, Box& box)
       j,
       type_j);
 
-    cudaMemset(NN_i.data(), 0, sizeof(int));
-    cudaMemset(NN_j.data(), 0, sizeof(int));
+    cudaMemset(NN_ij.data(), 0, sizeof(int));
     get_neighbors_of_i_and_j<<<(atom.number_of_atoms - 1) / 64 + 1, 64>>>(
       atom.number_of_atoms,
       box,
@@ -219,50 +213,15 @@ void MC_Ensemble_Canonical::compute(Atom& atom, Box& box)
       atom.position_per_atom.data(),
       atom.position_per_atom.data() + atom.number_of_atoms,
       atom.position_per_atom.data() + atom.number_of_atoms * 2,
-      NN_i.data(),
-      NN_j.data(),
-      NL_i.data(),
-      NL_j.data());
+      NN_ij.data(),
+      NL_ij.data());
     CUDA_CHECK_KERNEL
 
     // copy to host
-    int NN_i_cpu, NN_j_cpu;
-    int NL_i_cpu[1000], NL_j_cpu[1000];
-    NN_i.copy_to_host(&NN_i_cpu);
-    NN_j.copy_to_host(&NN_j_cpu);
-    NL_i.copy_to_host(NL_i_cpu, NN_i_cpu);
-    NL_j.copy_to_host(NL_j_cpu, NN_j_cpu);
-
-    printf("        atom %d has %d neighbors:\n", i, NN_i_cpu);
-    for (int k = 0; k < NN_i_cpu; ++k) {
-      printf(" %d", NL_i_cpu[k]);
-    }
-    printf("\n");
-    printf("        atom %d has %d neighbors:\n", j, NN_j_cpu);
-    for (int k = 0; k < NN_j_cpu; ++k) {
-      printf(" %d", NL_j_cpu[k]);
-    }
-    printf("\n");
-
-    // check in host
-    int NN_ij_cpu = 0;
+    int NN_ij_cpu;
     int NL_ij_cpu[1000];
-    for (; NN_ij_cpu < NN_i_cpu; ++NN_ij_cpu) {
-      NL_ij_cpu[NN_ij_cpu] = NL_i_cpu[NN_ij_cpu];
-    }
-
-    for (int k = 0; k < NN_j_cpu; ++k) {
-      bool is_repeating = false;
-      for (int m = 0; m < NN_i_cpu; ++m) {
-        if (NL_j_cpu[k] == NL_i_cpu[m]) {
-          is_repeating = true;
-          break;
-        }
-      }
-      if (!is_repeating) {
-        NL_ij_cpu[NN_ij_cpu++] = NL_j_cpu[k];
-      }
-    }
+    NN_ij.copy_to_host(&NN_ij_cpu);
+    NL_ij.copy_to_host(NL_ij_cpu, NN_ij_cpu);
 
     printf("        i and j has %d neighbors in total:\n", NN_ij_cpu);
     for (int k = 0; k < NN_ij_cpu; ++k) {
@@ -270,10 +229,6 @@ void MC_Ensemble_Canonical::compute(Atom& atom, Box& box)
     }
     printf("\n");
 
-    // copy to device
-    NN_ij.copy_from_host(&NN_ij_cpu);
-    NL_ij.copy_from_host(NL_ij_cpu, NN_ij_cpu);
-
     get_types<<<(atom.number_of_atoms - 1) / 64 + 1, 64>>>(
       atom.number_of_atoms,
       i,
diff --git a/src/mc/mc_ensemble_canonical.cuh b/src/mc/mc_ensemble_canonical.cuh
index d614fa186..d9ac0c9b6 100644
--- a/src/mc/mc_ensemble_canonical.cuh
+++ b/src/mc/mc_ensemble_canonical.cuh
@@ -25,10 +25,6 @@ public:
   virtual void compute(Atom& atom, Box& box);
 
 private:
-  GPU_Vector<int> NN_i;
-  GPU_Vector<int> NN_j;
   GPU_Vector<int> NN_ij;
-  GPU_Vector<int> NL_i;
-  GPU_Vector<int> NL_j;
   GPU_Vector<int> NL_ij;
 };

From 1257545caea4091e12508580f76ca29a94d1174d Mon Sep 17 00:00:00 2001
From: Zheyong Fan <brucenju@gmail.com>
Date: Mon, 14 Aug 2023 02:21:51 +0800
Subject: [PATCH 11/31] speed up using locality

---
 src/mc/mc_ensemble.cu           |  14 +--
 src/mc/mc_ensemble.cuh          |   8 +-
 src/mc/mc_ensemble_canonical.cu | 146 ++++++++++++++++++++------------
 src/mc/nep_energy.cu            |  27 +++---
 src/mc/nep_energy.cuh           |   4 +-
 5 files changed, 121 insertions(+), 78 deletions(-)

diff --git a/src/mc/mc_ensemble.cu b/src/mc/mc_ensemble.cu
index e108d1695..b62110fd5 100644
--- a/src/mc/mc_ensemble.cu
+++ b/src/mc/mc_ensemble.cu
@@ -23,14 +23,16 @@ The abstract base class (ABC) for the MC_Ensemble classes.
 
 MC_Ensemble::MC_Ensemble(void)
 {
-  const int n_max = 16000;
-  const int m_max = 100;
+  const int n_max = 1000;
+  const int m_max = 1000;
   NN_radial.resize(n_max);
   NN_angular.resize(n_max);
-  NL_radial.resize(n_max * m_max);
-  NL_angular.resize(n_max * m_max);
-  type_before.resize(n_max);
-  type_after.resize(n_max);
+  local_type_before.resize(n_max);
+  local_type_after.resize(n_max);
+  t2_radial_before.resize(n_max * m_max);
+  t2_radial_after.resize(n_max * m_max);
+  t2_angular_before.resize(n_max * m_max);
+  t2_angular_after.resize(n_max * m_max);
   x12_radial.resize(n_max * m_max);
   y12_radial.resize(n_max * m_max);
   z12_radial.resize(n_max * m_max);
diff --git a/src/mc/mc_ensemble.cuh b/src/mc/mc_ensemble.cuh
index 984db2d77..e3997be96 100644
--- a/src/mc/mc_ensemble.cuh
+++ b/src/mc/mc_ensemble.cuh
@@ -37,10 +37,14 @@ protected:
 
   GPU_Vector<int> NN_radial;
   GPU_Vector<int> NN_angular;
-  GPU_Vector<int> NL_radial;
-  GPU_Vector<int> NL_angular;
   GPU_Vector<int> type_before;
   GPU_Vector<int> type_after;
+  GPU_Vector<int> local_type_before;
+  GPU_Vector<int> local_type_after;
+  GPU_Vector<int> t2_radial_before;
+  GPU_Vector<int> t2_radial_after;
+  GPU_Vector<int> t2_angular_before;
+  GPU_Vector<int> t2_angular_after;
   GPU_Vector<float> x12_radial;
   GPU_Vector<float> y12_radial;
   GPU_Vector<float> z12_radial;
diff --git a/src/mc/mc_ensemble_canonical.cu b/src/mc/mc_ensemble_canonical.cu
index 934f71ec1..74702d722 100644
--- a/src/mc/mc_ensemble_canonical.cu
+++ b/src/mc/mc_ensemble_canonical.cu
@@ -55,6 +55,22 @@ static __global__ void get_types(
   }
 }
 
+static __global__ void find_local_types(
+  const int N_local,
+  const int* atom_local,
+  const int* g_type_before,
+  const int* g_type_after,
+  int* g_local_type_before,
+  int* g_local_type_after)
+{
+  int k = blockIdx.x * blockDim.x + threadIdx.x;
+  if (k < N_local) {
+    int n = atom_local[k];
+    g_local_type_before[k] = g_type_before[n];
+    g_local_type_after[k] = g_type_after[n];
+  }
+}
+
 static __global__ void get_neighbors_of_i_and_j(
   const int N,
   const Box box,
@@ -96,16 +112,22 @@ static __global__ void get_neighbors_of_i_and_j(
 
 static __global__ void create_inputs_for_energy_calculator(
   const int N,
+  const int N_local,
+  const int* atom_local,
   const Box box,
   const float rc_radial_square,
   const float rc_angular_square,
   const double* __restrict__ g_x,
   const double* __restrict__ g_y,
   const double* __restrict__ g_z,
+  const int* g_type_before,
+  const int* g_type_after,
   int* g_NN_radial,
-  int* g_NL_radial,
   int* g_NN_angular,
-  int* g_NL_angular,
+  int* g_t2_radial_before,
+  int* g_t2_radial_after,
+  int* g_t2_angular_before,
+  int* g_t2_angular_after,
   float* g_x12_radial,
   float* g_y12_radial,
   float* g_z12_radial,
@@ -113,39 +135,39 @@ static __global__ void create_inputs_for_energy_calculator(
   float* g_y12_angular,
   float* g_z12_angular)
 {
-  int n1 = blockIdx.x * blockDim.x + threadIdx.x;
-  if (n1 < N) {
-    double x1 = g_x[n1];
-    double y1 = g_y[n1];
-    double z1 = g_z[n1];
-    int count_radial = 0;
-    int count_angular = 0;
-    for (int n2 = 0; n2 < N; ++n2) {
-      if (n2 == n1) {
+  int n2 = blockIdx.x * blockDim.x + threadIdx.x;
+  if (n2 < N) {
+    double x2 = g_x[n2];
+    double y2 = g_y[n2];
+    double z2 = g_z[n2];
+
+    for (int k = 0; k < N_local; ++k) {
+      int n1 = atom_local[k];
+      if (n1 == n2) {
         continue;
       }
-      double x12 = g_x[n2] - x1;
-      double y12 = g_y[n2] - y1;
-      double z12 = g_z[n2] - z1;
+      double x12 = x2 - g_x[n1];
+      double y12 = y2 - g_y[n1];
+      double z12 = z2 - g_z[n1];
       apply_mic(box, x12, y12, z12);
       float distance_square = float(x12 * x12 + y12 * y12 + z12 * z12);
       if (distance_square < rc_radial_square) {
-        g_NL_radial[count_radial * N + n1] = n2;
-        g_x12_radial[count_radial * N + n1] = float(x12);
-        g_y12_radial[count_radial * N + n1] = float(y12);
-        g_z12_radial[count_radial * N + n1] = float(z12);
-        count_radial++;
+        int count_radial = atomicAdd(&g_NN_radial[k], 1);
+        g_t2_radial_before[count_radial * N_local + k] = g_type_before[n2];
+        g_t2_radial_after[count_radial * N_local + k] = g_type_after[n2];
+        g_x12_radial[count_radial * N_local + k] = float(x12);
+        g_y12_radial[count_radial * N_local + k] = float(y12);
+        g_z12_radial[count_radial * N_local + k] = float(z12);
       }
       if (distance_square < rc_angular_square) {
-        g_NL_angular[count_angular * N + n1] = n2;
-        g_x12_angular[count_angular * N + n1] = float(x12);
-        g_y12_angular[count_angular * N + n1] = float(y12);
-        g_z12_angular[count_angular * N + n1] = float(z12);
-        count_angular++;
+        int count_angular = atomicAdd(&g_NN_angular[k], 1);
+        g_t2_angular_before[count_angular * N_local + k] = g_type_before[n2];
+        g_t2_angular_after[count_angular * N_local + k] = g_type_after[n2];
+        g_x12_angular[count_angular * N_local + k] = float(x12);
+        g_y12_angular[count_angular * N_local + k] = float(y12);
+        g_z12_angular[count_angular * N_local + k] = float(z12);
       }
     }
-    g_NN_radial[n1] = count_radial;
-    g_NN_angular[n1] = count_angular;
   }
 }
 
@@ -182,6 +204,11 @@ void MC_Ensemble_Canonical::compute(Atom& atom, Box& box)
     exit(1);
   }
 
+  if (type_before.size() < atom.number_of_atoms) {
+    type_before.resize(atom.number_of_atoms);
+    type_after.resize(atom.number_of_atoms);
+  }
+
   std::uniform_int_distribution<int> r1(0, atom.number_of_atoms - 1);
 
   for (int step = 0; step < num_steps_mc; ++step) {
@@ -203,7 +230,7 @@ void MC_Ensemble_Canonical::compute(Atom& atom, Box& box)
       j,
       type_j);
 
-    cudaMemset(NN_ij.data(), 0, sizeof(int));
+    CHECK(cudaMemset(NN_ij.data(), 0, sizeof(int)));
     get_neighbors_of_i_and_j<<<(atom.number_of_atoms - 1) / 64 + 1, 64>>>(
       atom.number_of_atoms,
       box,
@@ -223,11 +250,7 @@ void MC_Ensemble_Canonical::compute(Atom& atom, Box& box)
     NN_ij.copy_to_host(&NN_ij_cpu);
     NL_ij.copy_to_host(NL_ij_cpu, NN_ij_cpu);
 
-    printf("        i and j has %d neighbors in total:\n", NN_ij_cpu);
-    for (int k = 0; k < NN_ij_cpu; ++k) {
-      printf(" %d", NL_ij_cpu[k]);
-    }
-    printf("\n");
+    printf("        i and j has %d neighbors in total.\n", NN_ij_cpu);
 
     get_types<<<(atom.number_of_atoms - 1) / 64 + 1, 64>>>(
       atom.number_of_atoms,
@@ -240,18 +263,35 @@ void MC_Ensemble_Canonical::compute(Atom& atom, Box& box)
       type_after.data());
     CUDA_CHECK_KERNEL
 
+    find_local_types<<<(NN_ij_cpu - 1) / 64 + 1, 64>>>(
+      NN_ij_cpu,
+      NL_ij.data(),
+      type_before.data(),
+      type_after.data(),
+      local_type_before.data(),
+      local_type_after.data());
+    CUDA_CHECK_KERNEL
+
+    CHECK(cudaMemset(NN_radial.data(), 0, sizeof(int) * NN_radial.size()));
+    CHECK(cudaMemset(NN_angular.data(), 0, sizeof(int) * NN_angular.size()));
     create_inputs_for_energy_calculator<<<(atom.number_of_atoms - 1) / 64 + 1, 64>>>(
       atom.number_of_atoms,
+      NN_ij_cpu,
+      NL_ij.data(),
       box,
       nep_energy.paramb.rc_radial * nep_energy.paramb.rc_radial,
       nep_energy.paramb.rc_angular * nep_energy.paramb.rc_angular,
       atom.position_per_atom.data(),
       atom.position_per_atom.data() + atom.number_of_atoms,
       atom.position_per_atom.data() + atom.number_of_atoms * 2,
+      type_before.data(),
+      type_after.data(),
       NN_radial.data(),
-      NL_radial.data(),
       NN_angular.data(),
-      NL_angular.data(),
+      t2_radial_before.data(),
+      t2_radial_after.data(),
+      t2_angular_before.data(),
+      t2_angular_after.data(),
       x12_radial.data(),
       y12_radial.data(),
       z12_radial.data(),
@@ -261,12 +301,12 @@ void MC_Ensemble_Canonical::compute(Atom& atom, Box& box)
     CUDA_CHECK_KERNEL
 
     nep_energy.find_energy(
-      atom.number_of_atoms,
+      NN_ij_cpu, // atom.number_of_atoms,
       NN_radial.data(),
-      NL_radial.data(),
       NN_angular.data(),
-      NL_angular.data(),
-      type_before.data(),
+      local_type_before.data(),
+      t2_radial_before.data(),
+      t2_angular_before.data(),
       x12_radial.data(),
       y12_radial.data(),
       z12_radial.data(),
@@ -276,12 +316,12 @@ void MC_Ensemble_Canonical::compute(Atom& atom, Box& box)
       pe_before.data());
 
     nep_energy.find_energy(
-      atom.number_of_atoms,
+      NN_ij_cpu, // atom.number_of_atoms,
       NN_radial.data(),
-      NL_radial.data(),
       NN_angular.data(),
-      NL_angular.data(),
-      type_after.data(),
+      local_type_after.data(),
+      t2_radial_after.data(),
+      t2_angular_after.data(),
       x12_radial.data(),
       y12_radial.data(),
       z12_radial.data(),
@@ -290,29 +330,29 @@ void MC_Ensemble_Canonical::compute(Atom& atom, Box& box)
       z12_angular.data(),
       pe_after.data());
 
-    std::vector<float> pe_before_cpu(atom.number_of_atoms);
-    std::vector<float> pe_after_cpu(atom.number_of_atoms);
-    pe_before.copy_to_host(pe_before_cpu.data(), atom.number_of_atoms);
-    pe_after.copy_to_host(pe_after_cpu.data(), atom.number_of_atoms);
+    std::vector<float> pe_before_cpu(NN_ij_cpu);
+    std::vector<float> pe_after_cpu(NN_ij_cpu);
+    pe_before.copy_to_host(pe_before_cpu.data(), NN_ij_cpu);
+    pe_after.copy_to_host(pe_after_cpu.data(), NN_ij_cpu);
     float pe_before_total = 0.0f;
     float pe_after_total = 0.0f;
-    for (int n = 0; n < atom.number_of_atoms; ++n) {
+    for (int n = 0; n < NN_ij_cpu; ++n) {
       pe_before_total += pe_before_cpu[n];
       pe_after_total += pe_after_cpu[n];
     }
-    printf("per-atom energy before swapping = %g eV.\n", pe_before_total / atom.number_of_atoms);
-    printf("per-atom energy after swapping = %g eV.\n", pe_after_total / atom.number_of_atoms);
+    printf("        per-atom energy before swapping = %g eV.\n", pe_before_total / NN_ij_cpu);
+    printf("        per-atom energy after swapping = %g eV.\n", pe_after_total / NN_ij_cpu);
     float energy_difference = pe_after_total - pe_before_total;
     std::uniform_real_distribution<float> r2(0, 1);
     float random_number = r2(rng);
-    printf("random number = %g.\n", random_number);
+    printf("        random number = %g.\n", random_number);
     float probability = exp(-energy_difference / (K_B * temperature));
-    printf("probability = %g.\n", probability);
+    printf("        probability = %g.\n", probability);
 
     if (random_number < probability) {
-      printf("the MC trail is accepted.\n");
+      printf("        the MC trail is accepted.\n");
     } else {
-      printf("the MC trail is rejected.\n");
+      printf("        the MC trail is rejected.\n");
     }
   }
 }
diff --git a/src/mc/nep_energy.cu b/src/mc/nep_energy.cu
index fb4c7c886..a9d3966a6 100644
--- a/src/mc/nep_energy.cu
+++ b/src/mc/nep_energy.cu
@@ -304,10 +304,10 @@ static __global__ void find_energy_nep(
   NEP_Energy::ANN annmb,
   const int N,
   const int* g_NN_radial,
-  const int* g_NL_radial,
   const int* g_NN_angular,
-  const int* g_NL_angular,
   const int* __restrict__ g_type,
+  const int* __restrict__ g_t2_radial,
+  const int* __restrict__ g_t2_angular,
   const float* __restrict__ g_x12_radial,
   const float* __restrict__ g_y12_radial,
   const float* __restrict__ g_z12_radial,
@@ -324,12 +324,11 @@ static __global__ void find_energy_nep(
     // get radial descriptors
     for (int i1 = 0; i1 < g_NN_radial[n1]; ++i1) {
       int index = i1 * N + n1;
-      int n2 = g_NL_radial[index];
       float r12[3] = {g_x12_radial[index], g_y12_radial[index], g_z12_radial[index]};
       float d12 = sqrt(r12[0] * r12[0] + r12[1] * r12[1] + r12[2] * r12[2]);
       float fc12;
       find_fc(paramb.rc_radial, paramb.rcinv_radial, d12, fc12);
-      int t2 = g_type[n2];
+      int t2 = g_t2_radial[index];
       float fn12[MAX_NUM_N];
       if (paramb.version == 2) {
         find_fn(paramb.n_max_radial, paramb.rcinv_radial, d12, fc12, fn12);
@@ -358,12 +357,11 @@ static __global__ void find_energy_nep(
       float s[NUM_OF_ABC] = {0.0f};
       for (int i1 = 0; i1 < g_NN_angular[n1]; ++i1) {
         int index = i1 * N + n1;
-        int n2 = g_NL_angular[index];
         float r12[3] = {g_x12_angular[index], g_y12_angular[index], g_z12_angular[index]};
         float d12 = sqrt(r12[0] * r12[0] + r12[1] * r12[1] + r12[2] * r12[2]);
         float fc12;
         find_fc(paramb.rc_angular, paramb.rcinv_angular, d12, fc12);
-        int t2 = g_type[n2];
+        int t2 = g_t2_angular[index];
         if (paramb.version == 2) {
           float fn;
           find_fn(n, paramb.rcinv_angular, d12, fc12, fn);
@@ -411,14 +409,14 @@ static __global__ void find_energy_zbl(
   const int N,
   const NEP_Energy::ZBL zbl,
   const int* g_NN,
-  const int* g_NL,
   const int* __restrict__ g_type,
+  const int* g_t2_angular,
   const float* __restrict__ g_x12,
   const float* __restrict__ g_y12,
   const float* __restrict__ g_z12,
   float* g_pe)
 {
-  int n1 = blockIdx.x * blockDim.x + threadIdx.x + N;
+  int n1 = blockIdx.x * blockDim.x + threadIdx.x;
   if (n1 < N) {
     float s_pe = 0.0f;
     int type1 = g_type[n1];
@@ -426,12 +424,11 @@ static __global__ void find_energy_zbl(
     float pow_zi = pow(zi, 0.23f);
     for (int i1 = 0; i1 < g_NN[n1]; ++i1) {
       int index = i1 * N + n1;
-      int n2 = g_NL[index];
       float r12[3] = {g_x12[index], g_y12[index], g_z12[index]};
       float d12 = sqrt(r12[0] * r12[0] + r12[1] * r12[1] + r12[2] * r12[2]);
       float d12inv = 1.0f / d12;
       float f, fp;
-      int type2 = g_type[n2];
+      int type2 = g_t2_angular[index];
       float zj = zbl.atomic_numbers[type2];
       float a_inv = (pow_zi + pow(zj, 0.23f)) * 2.134563f;
       float zizj = K_C_SP * zi * zj;
@@ -464,10 +461,10 @@ static __global__ void find_energy_zbl(
 void NEP_Energy::find_energy(
   const int N,
   const int* g_NN_radial,
-  const int* g_NL_radial,
   const int* g_NN_angular,
-  const int* g_NL_angular,
   const int* g_type,
+  const int* g_t2_radial,
+  const int* g_t2_angular,
   const float* g_x12_radial,
   const float* g_y12_radial,
   const float* g_z12_radial,
@@ -481,10 +478,10 @@ void NEP_Energy::find_energy(
     annmb,
     N,
     g_NN_radial,
-    g_NL_radial,
     g_NN_angular,
-    g_NL_angular,
     g_type,
+    g_t2_radial,
+    g_t2_angular,
     g_x12_radial,
     g_y12_radial,
     g_z12_radial,
@@ -495,6 +492,6 @@ void NEP_Energy::find_energy(
   CUDA_CHECK_KERNEL
 
   find_energy_zbl<<<(N - 1) / 64 + 1, 64>>>(
-    N, zbl, g_NN_angular, g_NL_angular, g_type, g_x12_angular, g_y12_angular, g_z12_angular, g_pe);
+    N, zbl, g_NN_angular, g_type, g_t2_angular, g_x12_angular, g_y12_angular, g_z12_angular, g_pe);
   CUDA_CHECK_KERNEL
 }
diff --git a/src/mc/nep_energy.cuh b/src/mc/nep_energy.cuh
index 02d03d3a6..1345186ec 100644
--- a/src/mc/nep_energy.cuh
+++ b/src/mc/nep_energy.cuh
@@ -75,10 +75,10 @@ public:
   void find_energy(
     const int N,
     const int* g_NN_radial,
-    const int* g_NL_radial,
     const int* g_NN_angular,
-    const int* g_NL_angular,
     const int* g_type,
+    const int* g_t2_radial,
+    const int* g_t2_angular,
     const float* g_x12_radial,
     const float* g_y12_radial,
     const float* g_z12_radial,

From 717c498d05a7add22d3c768a8e9779c057b92b0b Mon Sep 17 00:00:00 2001
From: Zheyong Fan <brucenju@gmail.com>
Date: Mon, 14 Aug 2023 02:35:49 +0800
Subject: [PATCH 12/31] only calculated zbl when it is enabled

---
 src/mc/mc_ensemble_canonical.cu |  3 ---
 src/mc/nep_energy.cu            | 16 +++++++++++++---
 2 files changed, 13 insertions(+), 6 deletions(-)

diff --git a/src/mc/mc_ensemble_canonical.cu b/src/mc/mc_ensemble_canonical.cu
index 74702d722..54469f837 100644
--- a/src/mc/mc_ensemble_canonical.cu
+++ b/src/mc/mc_ensemble_canonical.cu
@@ -246,10 +246,7 @@ void MC_Ensemble_Canonical::compute(Atom& atom, Box& box)
 
     // copy to host
     int NN_ij_cpu;
-    int NL_ij_cpu[1000];
     NN_ij.copy_to_host(&NN_ij_cpu);
-    NL_ij.copy_to_host(NL_ij_cpu, NN_ij_cpu);
-
     printf("        i and j has %d neighbors in total.\n", NN_ij_cpu);
 
     get_types<<<(atom.number_of_atoms - 1) / 64 + 1, 64>>>(
diff --git a/src/mc/nep_energy.cu b/src/mc/nep_energy.cu
index a9d3966a6..f2d3e4ad5 100644
--- a/src/mc/nep_energy.cu
+++ b/src/mc/nep_energy.cu
@@ -491,7 +491,17 @@ void NEP_Energy::find_energy(
     g_pe);
   CUDA_CHECK_KERNEL
 
-  find_energy_zbl<<<(N - 1) / 64 + 1, 64>>>(
-    N, zbl, g_NN_angular, g_type, g_t2_angular, g_x12_angular, g_y12_angular, g_z12_angular, g_pe);
-  CUDA_CHECK_KERNEL
+  if (zbl.enabled) {
+    find_energy_zbl<<<(N - 1) / 64 + 1, 64>>>(
+      N,
+      zbl,
+      g_NN_angular,
+      g_type,
+      g_t2_angular,
+      g_x12_angular,
+      g_y12_angular,
+      g_z12_angular,
+      g_pe);
+    CUDA_CHECK_KERNEL
+  }
 }

From a431c49342385eecb5eb78c9664dd9c626b7a303 Mon Sep 17 00:00:00 2001
From: Zheyong Fan <brucenju@gmail.com>
Date: Mon, 14 Aug 2023 05:03:27 +0800
Subject: [PATCH 13/31] small simplify

---
 src/mc/mc_ensemble_canonical.cu | 22 ++++++++++++----------
 1 file changed, 12 insertions(+), 10 deletions(-)

diff --git a/src/mc/mc_ensemble_canonical.cu b/src/mc/mc_ensemble_canonical.cu
index 54469f837..5e15ee2e4 100644
--- a/src/mc/mc_ensemble_canonical.cu
+++ b/src/mc/mc_ensemble_canonical.cu
@@ -153,19 +153,21 @@ static __global__ void create_inputs_for_energy_calculator(
       float distance_square = float(x12 * x12 + y12 * y12 + z12 * z12);
       if (distance_square < rc_radial_square) {
         int count_radial = atomicAdd(&g_NN_radial[k], 1);
-        g_t2_radial_before[count_radial * N_local + k] = g_type_before[n2];
-        g_t2_radial_after[count_radial * N_local + k] = g_type_after[n2];
-        g_x12_radial[count_radial * N_local + k] = float(x12);
-        g_y12_radial[count_radial * N_local + k] = float(y12);
-        g_z12_radial[count_radial * N_local + k] = float(z12);
+        int index_radial = count_radial * N_local + k;
+        g_t2_radial_before[index_radial] = g_type_before[n2];
+        g_t2_radial_after[index_radial] = g_type_after[n2];
+        g_x12_radial[index_radial] = float(x12);
+        g_y12_radial[index_radial] = float(y12);
+        g_z12_radial[index_radial] = float(z12);
       }
       if (distance_square < rc_angular_square) {
         int count_angular = atomicAdd(&g_NN_angular[k], 1);
-        g_t2_angular_before[count_angular * N_local + k] = g_type_before[n2];
-        g_t2_angular_after[count_angular * N_local + k] = g_type_after[n2];
-        g_x12_angular[count_angular * N_local + k] = float(x12);
-        g_y12_angular[count_angular * N_local + k] = float(y12);
-        g_z12_angular[count_angular * N_local + k] = float(z12);
+        int index_angular = count_angular * N_local + k;
+        g_t2_angular_before[index_angular] = g_type_before[n2];
+        g_t2_angular_after[index_angular] = g_type_after[n2];
+        g_x12_angular[index_angular] = float(x12);
+        g_y12_angular[index_angular] = float(y12);
+        g_z12_angular[index_angular] = float(z12);
       }
     }
   }

From 22aa0a07a36e90a38ff72d94c59bab8bacaad75e Mon Sep 17 00:00:00 2001
From: Zheyong Fan <brucenju@gmail.com>
Date: Mon, 14 Aug 2023 05:35:34 +0800
Subject: [PATCH 14/31] exchange in GPU

---
 src/mc/mc_ensemble_canonical.cu | 46 +++++++++++++++++++++++++++++++--
 1 file changed, 44 insertions(+), 2 deletions(-)

diff --git a/src/mc/mc_ensemble_canonical.cu b/src/mc/mc_ensemble_canonical.cu
index 5e15ee2e4..21b2c8579 100644
--- a/src/mc/mc_ensemble_canonical.cu
+++ b/src/mc/mc_ensemble_canonical.cu
@@ -173,6 +173,38 @@ static __global__ void create_inputs_for_energy_calculator(
   }
 }
 
+// a kernel with a single thread <<<1, 1>>>
+static __global__ void exchange(
+  const int i,
+  const int j,
+  const int type_i,
+  const int type_j,
+  int* g_type,
+  double* g_mass,
+  double* g_vx,
+  double* g_vy,
+  double* g_vz)
+{
+  g_type[i] = type_j;
+  g_type[j] = type_i;
+
+  double mass_i = g_mass[i];
+  g_mass[i] = g_mass[j];
+  g_mass[j] = mass_i;
+
+  double vx_i = g_vx[i];
+  g_vx[i] = g_vx[j];
+  g_vx[j] = vx_i;
+
+  double vy_i = g_vy[i];
+  g_vy[i] = g_vy[j];
+  g_vy[j] = vy_i;
+
+  double vz_i = g_vz[i];
+  g_vz[i] = g_vz[j];
+  g_vz[j] = vz_i;
+}
+
 static int get_type(const int i, const int* g_type)
 {
   int type_i = 0;
@@ -349,9 +381,19 @@ void MC_Ensemble_Canonical::compute(Atom& atom, Box& box)
     printf("        probability = %g.\n", probability);
 
     if (random_number < probability) {
-      printf("        the MC trail is accepted.\n");
+      printf("        the atom exchange is accepted.\n");
+      exchange<<<1, 1>>>(
+        i,
+        j,
+        type_i,
+        type_j,
+        atom.type.data(),
+        atom.mass.data(),
+        atom.velocity_per_atom.data(),
+        atom.velocity_per_atom.data() + atom.number_of_atoms,
+        atom.velocity_per_atom.data() + atom.number_of_atoms * 2);
     } else {
-      printf("        the MC trail is rejected.\n");
+      printf("        the atom exchange is rejected.\n");
     }
   }
 }

From f3c0ff8933e751c2795ad826cdf83d98175e9ed7 Mon Sep 17 00:00:00 2001
From: Zheyong Fan <brucenju@gmail.com>
Date: Mon, 14 Aug 2023 13:37:22 +0800
Subject: [PATCH 15/31] exchange in CPU

---
 src/mc/mc_ensemble_canonical.cu | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/src/mc/mc_ensemble_canonical.cu b/src/mc/mc_ensemble_canonical.cu
index 21b2c8579..604e54bb7 100644
--- a/src/mc/mc_ensemble_canonical.cu
+++ b/src/mc/mc_ensemble_canonical.cu
@@ -382,6 +382,13 @@ void MC_Ensemble_Canonical::compute(Atom& atom, Box& box)
 
     if (random_number < probability) {
       printf("        the atom exchange is accepted.\n");
+      atom.cpu_type[i] = type_j;
+      atom.cpu_type[j] = type_i;
+
+      double mass_i = atom.cpu_mass[i];
+      atom.cpu_mass[i] = atom.cpu_mass[j];
+      atom.cpu_mass[j] = mass_i;
+
       exchange<<<1, 1>>>(
         i,
         j,

From 0183f0307a7455f7cac642acde39f445c57d064d Mon Sep 17 00:00:00 2001
From: Zheyong Fan <brucenju@gmail.com>
Date: Mon, 14 Aug 2023 13:43:56 +0800
Subject: [PATCH 16/31] simplify type

---
 src/mc/mc_ensemble_canonical.cu | 15 ++++-----------
 1 file changed, 4 insertions(+), 11 deletions(-)

diff --git a/src/mc/mc_ensemble_canonical.cu b/src/mc/mc_ensemble_canonical.cu
index 604e54bb7..ddd50f9f3 100644
--- a/src/mc/mc_ensemble_canonical.cu
+++ b/src/mc/mc_ensemble_canonical.cu
@@ -205,13 +205,6 @@ static __global__ void exchange(
   g_vz[j] = vz_i;
 }
 
-static int get_type(const int i, const int* g_type)
-{
-  int type_i = 0;
-  CHECK(cudaMemcpy(&type_i, &g_type[i], sizeof(int), cudaMemcpyDeviceToHost));
-  return type_i;
-}
-
 static bool check_if_small_box(const double rc, const Box& box)
 {
   double volume = box.get_volume();
@@ -249,12 +242,12 @@ void MC_Ensemble_Canonical::compute(Atom& atom, Box& box)
     printf("    MC step %d, temperature = %g K.\n", step, temperature);
 
     int i = r1(rng);
-    int type_i = get_type(i, atom.type.data());
+    int type_i = atom.cpu_type[i];
     printf("        get atom %d with type %d.\n", i, type_i);
     int j = 0, type_j = type_i;
     while (type_i == type_j) {
       j = r1(rng);
-      type_j = get_type(j, atom.type.data());
+      type_j = atom.cpu_type[j];
       printf("        get atom %d with type %d.\n", j, type_j);
     }
     printf(
@@ -332,7 +325,7 @@ void MC_Ensemble_Canonical::compute(Atom& atom, Box& box)
     CUDA_CHECK_KERNEL
 
     nep_energy.find_energy(
-      NN_ij_cpu, // atom.number_of_atoms,
+      NN_ij_cpu,
       NN_radial.data(),
       NN_angular.data(),
       local_type_before.data(),
@@ -347,7 +340,7 @@ void MC_Ensemble_Canonical::compute(Atom& atom, Box& box)
       pe_before.data());
 
     nep_energy.find_energy(
-      NN_ij_cpu, // atom.number_of_atoms,
+      NN_ij_cpu,
       NN_radial.data(),
       NN_angular.data(),
       local_type_after.data(),

From cfe4382de1a29f66b1356cfc00c3d7041d3293d5 Mon Sep 17 00:00:00 2001
From: Zheyong Fan <brucenju@gmail.com>
Date: Mon, 14 Aug 2023 16:27:07 +0800
Subject: [PATCH 17/31] get potential file name

---
 src/mc/mc_ensemble.cu | 31 +++++++++++++++++++++++++++++--
 1 file changed, 29 insertions(+), 2 deletions(-)

diff --git a/src/mc/mc_ensemble.cu b/src/mc/mc_ensemble.cu
index b62110fd5..949dc41bf 100644
--- a/src/mc/mc_ensemble.cu
+++ b/src/mc/mc_ensemble.cu
@@ -20,6 +20,32 @@ The abstract base class (ABC) for the MC_Ensemble classes.
 #include "mc_ensemble.cuh"
 #include "utilities/common.cuh"
 #include <chrono>
+#include <fstream>
+#include <iostream>
+#include <sstream>
+#include <string>
+
+static std::string get_potential_file_name()
+{
+  std::ifstream input_run("run.in");
+  if (!input_run.is_open()) {
+    PRINT_INPUT_ERROR("Cannot open run.in.");
+  }
+  std::string potential_file_name;
+  std::string line;
+  while (std::getline(input_run, line)) {
+    std::vector<std::string> tokens = get_tokens(line);
+    if (tokens.size() != 0) {
+      if (tokens[0] == "potential") {
+        potential_file_name = tokens[1];
+        break;
+      }
+    }
+  }
+
+  input_run.close();
+  return potential_file_name;
+}
 
 MC_Ensemble::MC_Ensemble(void)
 {
@@ -42,8 +68,9 @@ MC_Ensemble::MC_Ensemble(void)
   pe_before.resize(n_max);
   pe_after.resize(n_max);
 
-  // TODO
-  nep_energy.initialize("../examples/11_NEP_potential_PbTe/nep.txt");
+  std::string potential_file_name = get_potential_file_name();
+  printf("potential file name = %s\n", potential_file_name.c_str());
+  nep_energy.initialize(potential_file_name.c_str());
 
 #ifdef DEBUG
   rng = std::mt19937(13579);

From 21edb7c13c618fbf2d78d6b0b0f7f375172bcdb0 Mon Sep 17 00:00:00 2001
From: Zheyong Fan <brucenju@gmail.com>
Date: Mon, 14 Aug 2023 16:44:41 +0800
Subject: [PATCH 18/31] check potential name

---
 src/mc/mc_ensemble.cu | 29 ++++++++++++++++++++++++++++-
 1 file changed, 28 insertions(+), 1 deletion(-)

diff --git a/src/mc/mc_ensemble.cu b/src/mc/mc_ensemble.cu
index 949dc41bf..72fe2d2c9 100644
--- a/src/mc/mc_ensemble.cu
+++ b/src/mc/mc_ensemble.cu
@@ -47,6 +47,26 @@ static std::string get_potential_file_name()
   return potential_file_name;
 }
 
+static bool is_nep(std::string& potential_file_name)
+{
+  std::ifstream input_potential(potential_file_name);
+  if (!input_potential.is_open()) {
+    PRINT_INPUT_ERROR("Cannot open potential file.");
+  }
+  std::string line;
+  std::getline(input_potential, line);
+  std::vector<std::string> tokens = get_tokens(line);
+  if (tokens[0].substr(0, 3) == "nep") {
+    return true;
+  } else if (tokens[0] == "eam_zhou_2004") {
+    return false;
+  } else {
+    PRINT_INPUT_ERROR("Unsupported potential for MCMD.");
+  }
+
+  input_potential.close();
+}
+
 MC_Ensemble::MC_Ensemble(void)
 {
   const int n_max = 1000;
@@ -70,7 +90,14 @@ MC_Ensemble::MC_Ensemble(void)
 
   std::string potential_file_name = get_potential_file_name();
   printf("potential file name = %s\n", potential_file_name.c_str());
-  nep_energy.initialize(potential_file_name.c_str());
+
+  if (is_nep(potential_file_name)) {
+    printf("potential is NEP.\n");
+    nep_energy.initialize(potential_file_name.c_str());
+  } else {
+    printf("EAM is not ready yet.\n");
+    exit(1);
+  }
 
 #ifdef DEBUG
   rng = std::mt19937(13579);

From 722b33774c6c44c15ed6ff79f7376b11a95ab980 Mon Sep 17 00:00:00 2001
From: Zheyong Fan <brucenju@gmail.com>
Date: Mon, 14 Aug 2023 21:39:43 +0800
Subject: [PATCH 19/31] no eam

---
 src/mc/mc_ensemble.cu | 21 +++++----------------
 1 file changed, 5 insertions(+), 16 deletions(-)

diff --git a/src/mc/mc_ensemble.cu b/src/mc/mc_ensemble.cu
index 72fe2d2c9..2dde9431b 100644
--- a/src/mc/mc_ensemble.cu
+++ b/src/mc/mc_ensemble.cu
@@ -47,7 +47,7 @@ static std::string get_potential_file_name()
   return potential_file_name;
 }
 
-static bool is_nep(std::string& potential_file_name)
+static void check_is_nep(std::string& potential_file_name)
 {
   std::ifstream input_potential(potential_file_name);
   if (!input_potential.is_open()) {
@@ -56,12 +56,8 @@ static bool is_nep(std::string& potential_file_name)
   std::string line;
   std::getline(input_potential, line);
   std::vector<std::string> tokens = get_tokens(line);
-  if (tokens[0].substr(0, 3) == "nep") {
-    return true;
-  } else if (tokens[0] == "eam_zhou_2004") {
-    return false;
-  } else {
-    PRINT_INPUT_ERROR("Unsupported potential for MCMD.");
+  if (tokens[0].substr(0, 3) != "nep") {
+    PRINT_INPUT_ERROR("MCMD only supports NEP models.");
   }
 
   input_potential.close();
@@ -89,15 +85,8 @@ MC_Ensemble::MC_Ensemble(void)
   pe_after.resize(n_max);
 
   std::string potential_file_name = get_potential_file_name();
-  printf("potential file name = %s\n", potential_file_name.c_str());
-
-  if (is_nep(potential_file_name)) {
-    printf("potential is NEP.\n");
-    nep_energy.initialize(potential_file_name.c_str());
-  } else {
-    printf("EAM is not ready yet.\n");
-    exit(1);
-  }
+  check_is_nep(potential_file_name);
+  nep_energy.initialize(potential_file_name.c_str());
 
 #ifdef DEBUG
   rng = std::mt19937(13579);

From e9bb98ed8973eedbf5c72c6e4976f0836ebb1297 Mon Sep 17 00:00:00 2001
From: Zheyong Fan <brucenju@gmail.com>
Date: Mon, 14 Aug 2023 21:45:30 +0800
Subject: [PATCH 20/31] clean up printf

---
 src/mc/mc.cu                    |  1 -
 src/mc/mc_ensemble_canonical.cu | 20 ++------------------
 2 files changed, 2 insertions(+), 19 deletions(-)

diff --git a/src/mc/mc.cu b/src/mc/mc.cu
index 7b4de28de..be0121ff5 100644
--- a/src/mc/mc.cu
+++ b/src/mc/mc.cu
@@ -34,7 +34,6 @@ void MC::compute(int step, Atom& atom, Box& box)
 {
   if (do_mcmd) {
     if ((step + 1) % num_steps_md == 0) {
-      printf("Now do MC after %d MD steps:\n", step + 1);
       mc_ensemble->compute(atom, box);
     }
   }
diff --git a/src/mc/mc_ensemble_canonical.cu b/src/mc/mc_ensemble_canonical.cu
index ddd50f9f3..52767d4d8 100644
--- a/src/mc/mc_ensemble_canonical.cu
+++ b/src/mc/mc_ensemble_canonical.cu
@@ -239,23 +239,14 @@ void MC_Ensemble_Canonical::compute(Atom& atom, Box& box)
   std::uniform_int_distribution<int> r1(0, atom.number_of_atoms - 1);
 
   for (int step = 0; step < num_steps_mc; ++step) {
-    printf("    MC step %d, temperature = %g K.\n", step, temperature);
 
     int i = r1(rng);
     int type_i = atom.cpu_type[i];
-    printf("        get atom %d with type %d.\n", i, type_i);
     int j = 0, type_j = type_i;
     while (type_i == type_j) {
       j = r1(rng);
       type_j = atom.cpu_type[j];
-      printf("        get atom %d with type %d.\n", j, type_j);
     }
-    printf(
-      "        try to exchange atom %d with type %d and atom %d with type %d.\n",
-      i,
-      type_i,
-      j,
-      type_j);
 
     CHECK(cudaMemset(NN_ij.data(), 0, sizeof(int)));
     get_neighbors_of_i_and_j<<<(atom.number_of_atoms - 1) / 64 + 1, 64>>>(
@@ -271,10 +262,8 @@ void MC_Ensemble_Canonical::compute(Atom& atom, Box& box)
       NL_ij.data());
     CUDA_CHECK_KERNEL
 
-    // copy to host
     int NN_ij_cpu;
     NN_ij.copy_to_host(&NN_ij_cpu);
-    printf("        i and j has %d neighbors in total.\n", NN_ij_cpu);
 
     get_types<<<(atom.number_of_atoms - 1) / 64 + 1, 64>>>(
       atom.number_of_atoms,
@@ -364,17 +353,14 @@ void MC_Ensemble_Canonical::compute(Atom& atom, Box& box)
       pe_before_total += pe_before_cpu[n];
       pe_after_total += pe_after_cpu[n];
     }
-    printf("        per-atom energy before swapping = %g eV.\n", pe_before_total / NN_ij_cpu);
-    printf("        per-atom energy after swapping = %g eV.\n", pe_after_total / NN_ij_cpu);
+    // printf("        per-atom energy before swapping = %g eV.\n", pe_before_total / NN_ij_cpu);
+    // printf("        per-atom energy after swapping = %g eV.\n", pe_after_total / NN_ij_cpu);
     float energy_difference = pe_after_total - pe_before_total;
     std::uniform_real_distribution<float> r2(0, 1);
     float random_number = r2(rng);
-    printf("        random number = %g.\n", random_number);
     float probability = exp(-energy_difference / (K_B * temperature));
-    printf("        probability = %g.\n", probability);
 
     if (random_number < probability) {
-      printf("        the atom exchange is accepted.\n");
       atom.cpu_type[i] = type_j;
       atom.cpu_type[j] = type_i;
 
@@ -392,8 +378,6 @@ void MC_Ensemble_Canonical::compute(Atom& atom, Box& box)
         atom.velocity_per_atom.data(),
         atom.velocity_per_atom.data() + atom.number_of_atoms,
         atom.velocity_per_atom.data() + atom.number_of_atoms * 2);
-    } else {
-      printf("        the atom exchange is rejected.\n");
     }
   }
 }

From e90c2ab05b9088d2c5fd211628a7002e77ec0780 Mon Sep 17 00:00:00 2001
From: Zheyong Fan <brucenju@gmail.com>
Date: Tue, 15 Aug 2023 02:55:19 +0800
Subject: [PATCH 21/31] output mcmd.out

---
 src/mc/mc.cu                     |  4 ++--
 src/mc/mc_ensemble.cu            |  5 +++++
 src/mc/mc_ensemble.cuh           |  8 +++++++-
 src/mc/mc_ensemble_canonical.cu  | 12 +++++++-----
 src/mc/mc_ensemble_canonical.cuh |  2 +-
 5 files changed, 22 insertions(+), 9 deletions(-)

diff --git a/src/mc/mc.cu b/src/mc/mc.cu
index be0121ff5..e78164277 100644
--- a/src/mc/mc.cu
+++ b/src/mc/mc.cu
@@ -33,8 +33,8 @@ void MC::finalize(void) { do_mcmd = false; }
 void MC::compute(int step, Atom& atom, Box& box)
 {
   if (do_mcmd) {
-    if ((step + 1) % num_steps_md == 0) {
-      mc_ensemble->compute(atom, box);
+    if ((step + 2) % num_steps_md == 0) {
+      mc_ensemble->compute(step + 2, atom, box);
     }
   }
 }
diff --git a/src/mc/mc_ensemble.cu b/src/mc/mc_ensemble.cu
index 2dde9431b..c89b696a1 100644
--- a/src/mc/mc_ensemble.cu
+++ b/src/mc/mc_ensemble.cu
@@ -65,6 +65,11 @@ static void check_is_nep(std::string& potential_file_name)
 
 MC_Ensemble::MC_Ensemble(void)
 {
+  num_accepted = 0;
+  num_attempted = 0;
+  mc_output.open("mcmd.out", std::ios::app);
+  mc_output << "# num_MD_steps  num_MC_accepted  num_MC_attempted" << std::endl;
+
   const int n_max = 1000;
   const int m_max = 1000;
   NN_radial.resize(n_max);
diff --git a/src/mc/mc_ensemble.cuh b/src/mc/mc_ensemble.cuh
index e3997be96..65fb25348 100644
--- a/src/mc/mc_ensemble.cuh
+++ b/src/mc/mc_ensemble.cuh
@@ -19,6 +19,8 @@
 #include "model/group.cuh"
 #include "nep_energy.cuh"
 #include "utilities/gpu_vector.cuh"
+#include <fstream>
+#include <iostream>
 #include <random>
 #include <vector>
 
@@ -28,13 +30,17 @@ public:
   MC_Ensemble(void);
   virtual ~MC_Ensemble(void);
 
-  virtual void compute(Atom& atom, Box& box) = 0;
+  virtual void compute(int md_step, Atom& atom, Box& box) = 0;
 
 protected:
   int num_steps_mc = 0;
   double temperature = 0.0;
   std::mt19937 rng;
 
+  int num_accepted = 0;
+  int num_attempted = 0;
+  std::ofstream mc_output;
+
   GPU_Vector<int> NN_radial;
   GPU_Vector<int> NN_angular;
   GPU_Vector<int> type_before;
diff --git a/src/mc/mc_ensemble_canonical.cu b/src/mc/mc_ensemble_canonical.cu
index 52767d4d8..405d77c13 100644
--- a/src/mc/mc_ensemble_canonical.cu
+++ b/src/mc/mc_ensemble_canonical.cu
@@ -27,10 +27,7 @@ MC_Ensemble_Canonical::MC_Ensemble_Canonical(int num_steps_mc_input, double temp
   NL_ij.resize(1000);
 }
 
-MC_Ensemble_Canonical::~MC_Ensemble_Canonical(void)
-{
-  // nothing now
-}
+MC_Ensemble_Canonical::~MC_Ensemble_Canonical(void) { mc_output.close(); }
 
 static __global__ void get_types(
   const int N,
@@ -224,7 +221,7 @@ static bool check_if_small_box(const double rc, const Box& box)
   return is_small_box;
 }
 
-void MC_Ensemble_Canonical::compute(Atom& atom, Box& box)
+void MC_Ensemble_Canonical::compute(int md_step, Atom& atom, Box& box)
 {
   if (check_if_small_box(nep_energy.paramb.rc_radial, box)) {
     printf("Cannot use small box for MCMD.\n");
@@ -361,6 +358,8 @@ void MC_Ensemble_Canonical::compute(Atom& atom, Box& box)
     float probability = exp(-energy_difference / (K_B * temperature));
 
     if (random_number < probability) {
+      ++num_accepted;
+
       atom.cpu_type[i] = type_j;
       atom.cpu_type[j] = type_i;
 
@@ -380,4 +379,7 @@ void MC_Ensemble_Canonical::compute(Atom& atom, Box& box)
         atom.velocity_per_atom.data() + atom.number_of_atoms * 2);
     }
   }
+
+  num_attempted += num_steps_mc;
+  mc_output << md_step << "  " << num_accepted << "  " << num_attempted << std::endl;
 }
diff --git a/src/mc/mc_ensemble_canonical.cuh b/src/mc/mc_ensemble_canonical.cuh
index d9ac0c9b6..b19a47983 100644
--- a/src/mc/mc_ensemble_canonical.cuh
+++ b/src/mc/mc_ensemble_canonical.cuh
@@ -22,7 +22,7 @@ public:
   MC_Ensemble_Canonical(int num_steps_mc, double temperature);
   virtual ~MC_Ensemble_Canonical(void);
 
-  virtual void compute(Atom& atom, Box& box);
+  virtual void compute(int md_step, Atom& atom, Box& box);
 
 private:
   GPU_Vector<int> NN_ij;

From cece1c8d115c6ab40062b2f72d135a0e7a66ae5e Mon Sep 17 00:00:00 2001
From: Zheyong Fan <brucenju@gmail.com>
Date: Tue, 15 Aug 2023 03:11:10 +0800
Subject: [PATCH 22/31] change order of output

---
 src/mc/mc_ensemble.cu           | 2 +-
 src/mc/mc_ensemble_canonical.cu | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/mc/mc_ensemble.cu b/src/mc/mc_ensemble.cu
index c89b696a1..ad8578b00 100644
--- a/src/mc/mc_ensemble.cu
+++ b/src/mc/mc_ensemble.cu
@@ -68,7 +68,7 @@ MC_Ensemble::MC_Ensemble(void)
   num_accepted = 0;
   num_attempted = 0;
   mc_output.open("mcmd.out", std::ios::app);
-  mc_output << "# num_MD_steps  num_MC_accepted  num_MC_attempted" << std::endl;
+  mc_output << "# num_MD_steps  num_MC_attempted  num_MC_accepted" << std::endl;
 
   const int n_max = 1000;
   const int m_max = 1000;
diff --git a/src/mc/mc_ensemble_canonical.cu b/src/mc/mc_ensemble_canonical.cu
index 405d77c13..4734dfadb 100644
--- a/src/mc/mc_ensemble_canonical.cu
+++ b/src/mc/mc_ensemble_canonical.cu
@@ -381,5 +381,5 @@ void MC_Ensemble_Canonical::compute(int md_step, Atom& atom, Box& box)
   }
 
   num_attempted += num_steps_mc;
-  mc_output << md_step << "  " << num_accepted << "  " << num_attempted << std::endl;
+  mc_output << md_step << "  " << num_attempted << "  " << num_accepted << std::endl;
 }

From 3ad036361aaf02a699e7696d30d0842d0bef7c57 Mon Sep 17 00:00:00 2001
From: Zheyong Fan <brucenju@gmail.com>
Date: Tue, 15 Aug 2023 18:48:05 +0800
Subject: [PATCH 23/31] exchange atom symbol (not only type)

---
 src/mc/mc_ensemble_canonical.cu | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/mc/mc_ensemble_canonical.cu b/src/mc/mc_ensemble_canonical.cu
index 4734dfadb..fea1358aa 100644
--- a/src/mc/mc_ensemble_canonical.cu
+++ b/src/mc/mc_ensemble_canonical.cu
@@ -363,6 +363,10 @@ void MC_Ensemble_Canonical::compute(int md_step, Atom& atom, Box& box)
       atom.cpu_type[i] = type_j;
       atom.cpu_type[j] = type_i;
 
+      auto atom_symbol_i = atom.cpu_atom_symbol[i];
+      atom.cpu_atom_symbol[i] = atom.cpu_atom_symbol[j];
+      atom.cpu_atom_symbol[j] = atom_symbol_i;
+
       double mass_i = atom.cpu_mass[i];
       atom.cpu_mass[i] = atom.cpu_mass[j];
       atom.cpu_mass[j] = mass_i;

From 749ab68bdef5b05d9557d4fd56289369410e0913 Mon Sep 17 00:00:00 2001
From: Zheyong Fan <brucenju@gmail.com>
Date: Wed, 16 Aug 2023 00:09:29 +0800
Subject: [PATCH 24/31] output acceptance ratio

---
 src/mc/mc_ensemble.cu           | 4 +---
 src/mc/mc_ensemble.cuh          | 2 --
 src/mc/mc_ensemble_canonical.cu | 4 ++--
 3 files changed, 3 insertions(+), 7 deletions(-)

diff --git a/src/mc/mc_ensemble.cu b/src/mc/mc_ensemble.cu
index ad8578b00..d5b1dbcc2 100644
--- a/src/mc/mc_ensemble.cu
+++ b/src/mc/mc_ensemble.cu
@@ -65,10 +65,8 @@ static void check_is_nep(std::string& potential_file_name)
 
 MC_Ensemble::MC_Ensemble(void)
 {
-  num_accepted = 0;
-  num_attempted = 0;
   mc_output.open("mcmd.out", std::ios::app);
-  mc_output << "# num_MD_steps  num_MC_attempted  num_MC_accepted" << std::endl;
+  mc_output << "# num_MD_steps  acceptance_ratio" << std::endl;
 
   const int n_max = 1000;
   const int m_max = 1000;
diff --git a/src/mc/mc_ensemble.cuh b/src/mc/mc_ensemble.cuh
index 65fb25348..105af2b2b 100644
--- a/src/mc/mc_ensemble.cuh
+++ b/src/mc/mc_ensemble.cuh
@@ -37,8 +37,6 @@ protected:
   double temperature = 0.0;
   std::mt19937 rng;
 
-  int num_accepted = 0;
-  int num_attempted = 0;
   std::ofstream mc_output;
 
   GPU_Vector<int> NN_radial;
diff --git a/src/mc/mc_ensemble_canonical.cu b/src/mc/mc_ensemble_canonical.cu
index fea1358aa..8b7a8cd3b 100644
--- a/src/mc/mc_ensemble_canonical.cu
+++ b/src/mc/mc_ensemble_canonical.cu
@@ -235,6 +235,7 @@ void MC_Ensemble_Canonical::compute(int md_step, Atom& atom, Box& box)
 
   std::uniform_int_distribution<int> r1(0, atom.number_of_atoms - 1);
 
+  int num_accepted = 0;
   for (int step = 0; step < num_steps_mc; ++step) {
 
     int i = r1(rng);
@@ -384,6 +385,5 @@ void MC_Ensemble_Canonical::compute(int md_step, Atom& atom, Box& box)
     }
   }
 
-  num_attempted += num_steps_mc;
-  mc_output << md_step << "  " << num_attempted << "  " << num_accepted << std::endl;
+  mc_output << md_step << "  " << num_accepted / double(num_steps_mc) << std::endl;
 }

From b47083d2790e32de33cb01fbf33a14940c100ab2 Mon Sep 17 00:00:00 2001
From: Zheyong Fan <brucenju@gmail.com>
Date: Thu, 17 Aug 2023 00:11:06 +0800
Subject: [PATCH 25/31] check number of types

---
 src/mc/mc_ensemble.cu | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/src/mc/mc_ensemble.cu b/src/mc/mc_ensemble.cu
index d5b1dbcc2..e84a500bb 100644
--- a/src/mc/mc_ensemble.cu
+++ b/src/mc/mc_ensemble.cu
@@ -60,6 +60,11 @@ static void check_is_nep(std::string& potential_file_name)
     PRINT_INPUT_ERROR("MCMD only supports NEP models.");
   }
 
+  int num_types = get_int_from_token(tokens[1], __FILE__, __LINE__);
+  if (num_types < 2) {
+    PRINT_INPUT_ERROR("MCMD only supports multi-component models.");
+  }
+
   input_potential.close();
 }
 

From f0c21847163f2cc3ae4ad6118fcbf8e893747c4c Mon Sep 17 00:00:00 2001
From: Zheyong Fan <brucenju@gmail.com>
Date: Sun, 20 Aug 2023 02:34:31 +0800
Subject: [PATCH 26/31] change syntax

---
 src/main_gpumd/run.cu            |  6 ++--
 src/mc/mc.cu                     | 62 ++++++++++++++++++++++----------
 src/mc/mc.cuh                    |  7 ++--
 src/mc/mc_ensemble.cuh           |  2 +-
 src/mc/mc_ensemble_canonical.cu  |  5 ++-
 src/mc/mc_ensemble_canonical.cuh |  4 +--
 src/mc/nep_energy.cu             | 44 +++++++++++------------
 7 files changed, 78 insertions(+), 52 deletions(-)

diff --git a/src/main_gpumd/run.cu b/src/main_gpumd/run.cu
index 5ad098f4f..70e66aca1 100644
--- a/src/main_gpumd/run.cu
+++ b/src/main_gpumd/run.cu
@@ -215,7 +215,7 @@ void Run::perform_a_run()
 
     integrate.compute2(time_step, double(step) / number_of_steps, group, box, atom, thermo);
 
-    mc.compute(step, atom, box);
+    mc.compute(step, number_of_steps, atom, box);
 
     measure.process(
       number_of_steps,
@@ -402,8 +402,8 @@ void Run::parse_one_keyword(std::vector<std::string>& tokens)
     integrate.parse_move(param, num_param, group);
   } else if (strcmp(param[0], "electron_stop") == 0) {
     electron_stop.parse(param, num_param, atom.number_of_atoms, number_of_types);
-  } else if (strcmp(param[0], "cmc") == 0) {
-    mc.parse_cmc(param, num_param);
+  } else if (strcmp(param[0], "mc") == 0) {
+    mc.parse_mc(param, num_param);
   } else if (strcmp(param[0], "run") == 0) {
     parse_run(param, num_param);
   } else {
diff --git a/src/mc/mc.cu b/src/mc/mc.cu
index e78164277..34df602fe 100644
--- a/src/mc/mc.cu
+++ b/src/mc/mc.cu
@@ -30,47 +30,73 @@ void MC::initialize(void)
 
 void MC::finalize(void) { do_mcmd = false; }
 
-void MC::compute(int step, Atom& atom, Box& box)
+void MC::compute(int step, int num_steps, Atom& atom, Box& box)
 {
   if (do_mcmd) {
     if ((step + 2) % num_steps_md == 0) {
-      mc_ensemble->compute(step + 2, atom, box);
+      double temperature =
+        temperature_initial + step * (temperature_final - temperature_initial) / num_steps;
+      mc_ensemble->compute(step + 2, temperature, atom, box);
     }
   }
 }
 
-void MC::parse_cmc(const char** param, int num_param)
+void MC::parse_mc(const char** param, int num_param)
 {
-  if (num_param != 4) {
-    PRINT_INPUT_ERROR("cmc should have 3 parameter.\n");
+  if (num_param < 6) {
+    PRINT_INPUT_ERROR("mc should have at least 5 parameters.\n");
   }
 
-  if (!is_valid_int(param[1], &num_steps_md)) {
-    PRINT_INPUT_ERROR("number of MD steps for cmc should be an integer.\n");
+  int mc_ensemble_type = 0;
+  if (strcmp(param[1], "canonical") == 0) {
+    mc_ensemble_type = 0;
+  } else if (strcmp(param[1], "sgc") == 0) {
+    PRINT_INPUT_ERROR("semi-grand canonical MCMD has not been implemented yet.\n");
+  } else if (strcmp(param[1], "vcsgc") == 0) {
+    PRINT_INPUT_ERROR(
+      "variance constrained semi-grand canonical MCMD has not been implemented yet.\n");
+  } else {
+    PRINT_INPUT_ERROR("invalid MC ensemble for MCMD.\n");
+  }
+
+  if (!is_valid_int(param[2], &num_steps_md)) {
+    PRINT_INPUT_ERROR("number of MD steps for MCMD should be an integer.\n");
   }
   if (num_steps_md <= 0) {
-    PRINT_INPUT_ERROR("number of MD steps for cmc should be positive.\n");
+    PRINT_INPUT_ERROR("number of MD steps for MCMD should be positive.\n");
   }
 
-  if (!is_valid_int(param[2], &num_steps_mc)) {
-    PRINT_INPUT_ERROR("number of MC steps for cmc should be an integer.\n");
+  if (!is_valid_int(param[3], &num_steps_mc)) {
+    PRINT_INPUT_ERROR("number of MC steps for MCMD should be an integer.\n");
   }
   if (num_steps_mc <= 0) {
-    PRINT_INPUT_ERROR("number of MC steps for cmc should be positive.\n");
+    PRINT_INPUT_ERROR("number of MC steps for MCMD should be positive.\n");
+  }
+
+  if (!is_valid_real(param[4], &temperature_initial)) {
+    PRINT_INPUT_ERROR("initial temperature for MCMD should be a number.\n");
+  }
+  if (temperature_initial <= 0) {
+    PRINT_INPUT_ERROR("initial temperature for MCMD should be positive.\n");
   }
 
-  if (!is_valid_real(param[3], &temperature)) {
-    PRINT_INPUT_ERROR("temperature for cmc should be a number.\n");
+  if (!is_valid_real(param[5], &temperature_final)) {
+    PRINT_INPUT_ERROR("final temperature for MCMD should be a number.\n");
   }
-  if (temperature <= 0) {
-    PRINT_INPUT_ERROR("temperature for cmc should be positive.\n");
+  if (temperature_final <= 0) {
+    PRINT_INPUT_ERROR("final temperature for MCMD should be positive.\n");
   }
 
-  printf("Perform canonical MC:\n");
+  if (mc_ensemble_type == 0) {
+    printf("Perform canonical MCMD:\n");
+  }
   printf("    after every %d MD steps, do %d MC trials.\n", num_steps_md, num_steps_mc);
-  printf("    with a temperature of %g K.\n", temperature);
+  printf(
+    "    with an initial temperature of %g K and a final temperature of %g K.\n",
+    temperature_initial,
+    temperature_final);
 
-  mc_ensemble.reset(new MC_Ensemble_Canonical(num_steps_mc, temperature));
+  mc_ensemble.reset(new MC_Ensemble_Canonical(num_steps_mc));
 
   do_mcmd = true;
 }
\ No newline at end of file
diff --git a/src/mc/mc.cuh b/src/mc/mc.cuh
index 06073dd43..697aee7da 100644
--- a/src/mc/mc.cuh
+++ b/src/mc/mc.cuh
@@ -30,13 +30,14 @@ public:
 
   void initialize(void);
   void finalize(void);
-  void compute(int step, Atom& atom, Box& box);
+  void compute(int step, int num_steps, Atom& atom, Box& box);
 
-  void parse_cmc(const char** param, int num_param);
+  void parse_mc(const char** param, int num_param);
 
 private:
   bool do_mcmd = false;
   int num_steps_md = 0;
   int num_steps_mc = 0;
-  double temperature = 0.0;
+  double temperature_initial = 0.0;
+  double temperature_final = 0.0;
 };
diff --git a/src/mc/mc_ensemble.cuh b/src/mc/mc_ensemble.cuh
index 105af2b2b..008efcaaa 100644
--- a/src/mc/mc_ensemble.cuh
+++ b/src/mc/mc_ensemble.cuh
@@ -30,7 +30,7 @@ public:
   MC_Ensemble(void);
   virtual ~MC_Ensemble(void);
 
-  virtual void compute(int md_step, Atom& atom, Box& box) = 0;
+  virtual void compute(int md_step, double temperature, Atom& atom, Box& box) = 0;
 
 protected:
   int num_steps_mc = 0;
diff --git a/src/mc/mc_ensemble_canonical.cu b/src/mc/mc_ensemble_canonical.cu
index 8b7a8cd3b..f884f085d 100644
--- a/src/mc/mc_ensemble_canonical.cu
+++ b/src/mc/mc_ensemble_canonical.cu
@@ -19,10 +19,9 @@ The canonical ensemble for MCMD.
 
 #include "mc_ensemble_canonical.cuh"
 
-MC_Ensemble_Canonical::MC_Ensemble_Canonical(int num_steps_mc_input, double temperature_input)
+MC_Ensemble_Canonical::MC_Ensemble_Canonical(int num_steps_mc_input)
 {
   num_steps_mc = num_steps_mc_input;
-  temperature = temperature_input;
   NN_ij.resize(1);
   NL_ij.resize(1000);
 }
@@ -221,7 +220,7 @@ static bool check_if_small_box(const double rc, const Box& box)
   return is_small_box;
 }
 
-void MC_Ensemble_Canonical::compute(int md_step, Atom& atom, Box& box)
+void MC_Ensemble_Canonical::compute(int md_step, double temperature, Atom& atom, Box& box)
 {
   if (check_if_small_box(nep_energy.paramb.rc_radial, box)) {
     printf("Cannot use small box for MCMD.\n");
diff --git a/src/mc/mc_ensemble_canonical.cuh b/src/mc/mc_ensemble_canonical.cuh
index b19a47983..a76676fb6 100644
--- a/src/mc/mc_ensemble_canonical.cuh
+++ b/src/mc/mc_ensemble_canonical.cuh
@@ -19,10 +19,10 @@
 class MC_Ensemble_Canonical : public MC_Ensemble
 {
 public:
-  MC_Ensemble_Canonical(int num_steps_mc, double temperature);
+  MC_Ensemble_Canonical(int num_steps_mc);
   virtual ~MC_Ensemble_Canonical(void);
 
-  virtual void compute(int md_step, Atom& atom, Box& box);
+  virtual void compute(int md_step, double temperature, Atom& atom, Box& box);
 
 private:
   GPU_Vector<int> NN_ij;
diff --git a/src/mc/nep_energy.cu b/src/mc/nep_energy.cu
index f2d3e4ad5..3f57efd6c 100644
--- a/src/mc/nep_energy.cu
+++ b/src/mc/nep_energy.cu
@@ -80,9 +80,9 @@ void NEP_Energy::initialize(const char* file_potential)
   }
 
   if (paramb.num_types == 1) {
-    printf("Use the NEP%d potential with %d atom type.\n", paramb.version, paramb.num_types);
+    printf("    Use the NEP%d potential with %d atom type.\n", paramb.version, paramb.num_types);
   } else {
-    printf("Use the NEP%d potential with %d atom types.\n", paramb.version, paramb.num_types);
+    printf("    Use the NEP%d potential with %d atom types.\n", paramb.version, paramb.num_types);
   }
 
   for (int n = 0; n < paramb.num_types; ++n) {
@@ -94,7 +94,7 @@ void NEP_Energy::initialize(const char* file_potential)
       }
     }
     zbl.atomic_numbers[n] = atomic_number;
-    printf("    type %d (%s with Z = %g).\n", n, tokens[2 + n].c_str(), zbl.atomic_numbers[n]);
+    printf("        type %d (%s with Z = %g).\n", n, tokens[2 + n].c_str(), zbl.atomic_numbers[n]);
   }
 
   // zbl 0.7 1.4
@@ -108,10 +108,10 @@ void NEP_Energy::initialize(const char* file_potential)
     zbl.rc_outer = get_float_from_token(tokens[2], __FILE__, __LINE__);
     if (zbl.rc_inner == 0 && zbl.rc_outer == 0) {
       zbl.flexibled = true;
-      printf("    has the flexible ZBL potential\n");
+      printf("        has the flexible ZBL potential\n");
     } else {
       printf(
-        "    has the universal ZBL with inner cutoff %g A and outer cutoff %g A.\n",
+        "        has the universal ZBL with inner cutoff %g A and outer cutoff %g A.\n",
         zbl.rc_inner,
         zbl.rc_outer);
     }
@@ -125,8 +125,8 @@ void NEP_Energy::initialize(const char* file_potential)
   }
   paramb.rc_radial = get_float_from_token(tokens[1], __FILE__, __LINE__);
   paramb.rc_angular = get_float_from_token(tokens[2], __FILE__, __LINE__);
-  printf("    radial cutoff = %g A.\n", paramb.rc_radial);
-  printf("    angular cutoff = %g A.\n", paramb.rc_angular);
+  printf("        radial cutoff = %g A.\n", paramb.rc_radial);
+  printf("        angular cutoff = %g A.\n", paramb.rc_angular);
 
   paramb.MN_radial = 500;
   paramb.MN_angular = 100;
@@ -134,12 +134,12 @@ void NEP_Energy::initialize(const char* file_potential)
   if (tokens.size() == 5) {
     int MN_radial = get_int_from_token(tokens[3], __FILE__, __LINE__);
     int MN_angular = get_int_from_token(tokens[4], __FILE__, __LINE__);
-    printf("    MN_radial = %d.\n", MN_radial);
-    printf("    MN_angular = %d.\n", MN_angular);
+    printf("        MN_radial = %d.\n", MN_radial);
+    printf("        MN_angular = %d.\n", MN_angular);
     paramb.MN_radial = int(ceil(MN_radial * 1.25));
     paramb.MN_angular = int(ceil(MN_angular * 1.25));
-    printf("    enlarged MN_radial = %d.\n", paramb.MN_radial);
-    printf("    enlarged MN_angular = %d.\n", paramb.MN_angular);
+    printf("        enlarged MN_radial = %d.\n", paramb.MN_radial);
+    printf("        enlarged MN_angular = %d.\n", paramb.MN_angular);
   }
 
   // n_max 10 8
@@ -150,8 +150,8 @@ void NEP_Energy::initialize(const char* file_potential)
   }
   paramb.n_max_radial = get_int_from_token(tokens[1], __FILE__, __LINE__);
   paramb.n_max_angular = get_int_from_token(tokens[2], __FILE__, __LINE__);
-  printf("    n_max_radial = %d.\n", paramb.n_max_radial);
-  printf("    n_max_angular = %d.\n", paramb.n_max_angular);
+  printf("        n_max_radial = %d.\n", paramb.n_max_radial);
+  printf("        n_max_angular = %d.\n", paramb.n_max_angular);
 
   // basis_size 10 8
   if (paramb.version >= 3) {
@@ -163,8 +163,8 @@ void NEP_Energy::initialize(const char* file_potential)
     }
     paramb.basis_size_radial = get_int_from_token(tokens[1], __FILE__, __LINE__);
     paramb.basis_size_angular = get_int_from_token(tokens[2], __FILE__, __LINE__);
-    printf("    basis_size_radial = %d.\n", paramb.basis_size_radial);
-    printf("    basis_size_angular = %d.\n", paramb.basis_size_angular);
+    printf("        basis_size_radial = %d.\n", paramb.basis_size_radial);
+    printf("        basis_size_angular = %d.\n", paramb.basis_size_angular);
   }
 
   // l_max
@@ -182,14 +182,14 @@ void NEP_Energy::initialize(const char* file_potential)
   }
 
   paramb.L_max = get_int_from_token(tokens[1], __FILE__, __LINE__);
-  printf("    l_max_3body = %d.\n", paramb.L_max);
+  printf("        l_max_3body = %d.\n", paramb.L_max);
   paramb.num_L = paramb.L_max;
 
   if (paramb.version >= 3) {
     int L_max_4body = get_int_from_token(tokens[2], __FILE__, __LINE__);
     int L_max_5body = get_int_from_token(tokens[3], __FILE__, __LINE__);
-    printf("    l_max_4body = %d.\n", L_max_4body);
-    printf("    l_max_5body = %d.\n", L_max_5body);
+    printf("        l_max_4body = %d.\n", L_max_4body);
+    printf("        l_max_5body = %d.\n", L_max_5body);
     if (L_max_4body == 2) {
       paramb.num_L += 1;
     }
@@ -208,7 +208,7 @@ void NEP_Energy::initialize(const char* file_potential)
   }
   annmb.num_neurons1 = get_int_from_token(tokens[1], __FILE__, __LINE__);
   annmb.dim = (paramb.n_max_radial + 1) + paramb.dim_angular;
-  printf("    ANN = %d-%d-1.\n", annmb.dim, annmb.num_neurons1);
+  printf("        ANN = %d-%d-1.\n", annmb.dim, annmb.num_neurons1);
 
   // calculated parameters:
   paramb.rcinv_radial = 1.0f / paramb.rc_radial;
@@ -217,7 +217,7 @@ void NEP_Energy::initialize(const char* file_potential)
 
   annmb.num_para =
     (annmb.dim + 2) * annmb.num_neurons1 * (paramb.version == 4 ? paramb.num_types : 1) + 1;
-  printf("    number of neural network parameters = %d.\n", annmb.num_para);
+  printf("        number of neural network parameters = %d.\n", annmb.num_para);
   int num_para_descriptor =
     paramb.num_types_sq * ((paramb.n_max_radial + 1) * (paramb.basis_size_radial + 1) +
                            (paramb.n_max_angular + 1) * (paramb.basis_size_angular + 1));
@@ -227,9 +227,9 @@ void NEP_Energy::initialize(const char* file_potential)
         ? 0
         : paramb.num_types_sq * (paramb.n_max_radial + paramb.n_max_angular + 2);
   }
-  printf("    number of descriptor parameters = %d.\n", num_para_descriptor);
+  printf("        number of descriptor parameters = %d.\n", num_para_descriptor);
   annmb.num_para += num_para_descriptor;
-  printf("    total number of parameters = %d.\n", annmb.num_para);
+  printf("        total number of parameters = %d.\n", annmb.num_para);
 
   paramb.num_c_radial =
     paramb.num_types_sq * (paramb.n_max_radial + 1) * (paramb.basis_size_radial + 1);

From 5cf0a0ae09b2f83eae8443e145c1702131862a1c Mon Sep 17 00:00:00 2001
From: Zheyong Fan <brucenju@gmail.com>
Date: Sun, 20 Aug 2023 03:18:18 +0800
Subject: [PATCH 27/31] start to add group in mcmd

---
 src/mc/mc.cu  | 33 +++++++++++++++++++++++++++++++++
 src/mc/mc.cuh |  2 ++
 2 files changed, 35 insertions(+)

diff --git a/src/mc/mc.cu b/src/mc/mc.cu
index 34df602fe..d616f443e 100644
--- a/src/mc/mc.cu
+++ b/src/mc/mc.cu
@@ -87,6 +87,31 @@ void MC::parse_mc(const char** param, int num_param)
     PRINT_INPUT_ERROR("final temperature for MCMD should be positive.\n");
   }
 
+  if (mc_ensemble_type == 0) {
+    if (num_param > 6) {
+      if (num_param != 9) {
+        PRINT_INPUT_ERROR("mc canonical must has 9 paramters when using a grouping method.\n");
+      }
+      if (strcmp(param[6], "group") != 0) {
+        PRINT_INPUT_ERROR("invalid option for mc.\n");
+      }
+      if (!is_valid_int(param[7], &grouping_method)) {
+        PRINT_INPUT_ERROR("grouping method of MCMD should be an integer.\n");
+      }
+      if (grouping_method < 0) {
+        PRINT_INPUT_ERROR("grouping method of MCMD should >= 0.\n");
+      }
+      // TODO check upper bound of grouping method
+      if (!is_valid_int(param[8], &group_id)) {
+        PRINT_INPUT_ERROR("group ID of MCMD should be an integer.\n");
+      }
+      if (grouping_method < 0) {
+        PRINT_INPUT_ERROR("group ID of MCMD should >= 0.\n");
+      }
+      // TODO check upper bound of group id
+    }
+  }
+
   if (mc_ensemble_type == 0) {
     printf("Perform canonical MCMD:\n");
   }
@@ -96,6 +121,14 @@ void MC::parse_mc(const char** param, int num_param)
     temperature_initial,
     temperature_final);
 
+  if (mc_ensemble_type == 0) {
+    if (num_param == 6) {
+      printf("    for all the atoms in the system.\n");
+    } else {
+      printf("    only for atoms in group %d of grouping method %d.\n", group_id, grouping_method);
+    }
+  }
+
   mc_ensemble.reset(new MC_Ensemble_Canonical(num_steps_mc));
 
   do_mcmd = true;
diff --git a/src/mc/mc.cuh b/src/mc/mc.cuh
index 697aee7da..ab9cfaca0 100644
--- a/src/mc/mc.cuh
+++ b/src/mc/mc.cuh
@@ -38,6 +38,8 @@ private:
   bool do_mcmd = false;
   int num_steps_md = 0;
   int num_steps_mc = 0;
+  int grouping_method = 0;
+  int group_id = 0;
   double temperature_initial = 0.0;
   double temperature_final = 0.0;
 };

From d039835f95ccb0e0a634ccab9a2f889ca425bf65 Mon Sep 17 00:00:00 2001
From: Zheyong Fan <brucenju@gmail.com>
Date: Thu, 24 Aug 2023 04:08:48 +0800
Subject: [PATCH 28/31] parse group

---
 src/main_gpumd/run.cu |  2 +-
 src/mc/mc.cu          | 12 ++++++++----
 src/mc/mc.cuh         |  2 +-
 3 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/src/main_gpumd/run.cu b/src/main_gpumd/run.cu
index 70e66aca1..d86ca30d8 100644
--- a/src/main_gpumd/run.cu
+++ b/src/main_gpumd/run.cu
@@ -403,7 +403,7 @@ void Run::parse_one_keyword(std::vector<std::string>& tokens)
   } else if (strcmp(param[0], "electron_stop") == 0) {
     electron_stop.parse(param, num_param, atom.number_of_atoms, number_of_types);
   } else if (strcmp(param[0], "mc") == 0) {
-    mc.parse_mc(param, num_param);
+    mc.parse_mc(param, num_param, group);
   } else if (strcmp(param[0], "run") == 0) {
     parse_run(param, num_param);
   } else {
diff --git a/src/mc/mc.cu b/src/mc/mc.cu
index d616f443e..f06391441 100644
--- a/src/mc/mc.cu
+++ b/src/mc/mc.cu
@@ -41,7 +41,7 @@ void MC::compute(int step, int num_steps, Atom& atom, Box& box)
   }
 }
 
-void MC::parse_mc(const char** param, int num_param)
+void MC::parse_mc(const char** param, int num_param, std::vector<Group>& groups)
 {
   if (num_param < 6) {
     PRINT_INPUT_ERROR("mc should have at least 5 parameters.\n");
@@ -101,14 +101,18 @@ void MC::parse_mc(const char** param, int num_param)
       if (grouping_method < 0) {
         PRINT_INPUT_ERROR("grouping method of MCMD should >= 0.\n");
       }
-      // TODO check upper bound of grouping method
+      if (grouping_method >= groups.size()) {
+        PRINT_INPUT_ERROR("Grouping method should < number of grouping methods.");
+      }
       if (!is_valid_int(param[8], &group_id)) {
         PRINT_INPUT_ERROR("group ID of MCMD should be an integer.\n");
       }
-      if (grouping_method < 0) {
+      if (group_id < 0) {
         PRINT_INPUT_ERROR("group ID of MCMD should >= 0.\n");
       }
-      // TODO check upper bound of group id
+      if (group_id >= groups[grouping_method].number) {
+        PRINT_INPUT_ERROR("Group ID should < number of groups.");
+      }
     }
   }
 
diff --git a/src/mc/mc.cuh b/src/mc/mc.cuh
index ab9cfaca0..dfaebbc22 100644
--- a/src/mc/mc.cuh
+++ b/src/mc/mc.cuh
@@ -32,7 +32,7 @@ public:
   void finalize(void);
   void compute(int step, int num_steps, Atom& atom, Box& box);
 
-  void parse_mc(const char** param, int num_param);
+  void parse_mc(const char** param, int num_param, std::vector<Group>& group);
 
 private:
   bool do_mcmd = false;

From e23d2954cdc527e05bd1c09672c5cd6fbdb7bb04 Mon Sep 17 00:00:00 2001
From: Zheyong Fan <brucenju@gmail.com>
Date: Thu, 24 Aug 2023 04:51:14 +0800
Subject: [PATCH 29/31] check group for mcmd

---
 src/main_gpumd/run.cu |  2 +-
 src/mc/mc.cu          | 21 ++++++++++++++++++++-
 src/mc/mc.cuh         |  3 ++-
 3 files changed, 23 insertions(+), 3 deletions(-)

diff --git a/src/main_gpumd/run.cu b/src/main_gpumd/run.cu
index d86ca30d8..261ab65b4 100644
--- a/src/main_gpumd/run.cu
+++ b/src/main_gpumd/run.cu
@@ -403,7 +403,7 @@ void Run::parse_one_keyword(std::vector<std::string>& tokens)
   } else if (strcmp(param[0], "electron_stop") == 0) {
     electron_stop.parse(param, num_param, atom.number_of_atoms, number_of_types);
   } else if (strcmp(param[0], "mc") == 0) {
-    mc.parse_mc(param, num_param, group);
+    mc.parse_mc(param, num_param, group, atom.cpu_type);
   } else if (strcmp(param[0], "run") == 0) {
     parse_run(param, num_param);
   } else {
diff --git a/src/mc/mc.cu b/src/mc/mc.cu
index f06391441..73de300fd 100644
--- a/src/mc/mc.cu
+++ b/src/mc/mc.cu
@@ -41,7 +41,8 @@ void MC::compute(int step, int num_steps, Atom& atom, Box& box)
   }
 }
 
-void MC::parse_mc(const char** param, int num_param, std::vector<Group>& groups)
+void MC::parse_mc(
+  const char** param, int num_param, std::vector<Group>& groups, std::vector<int>& cpu_type)
 {
   if (num_param < 6) {
     PRINT_INPUT_ERROR("mc should have at least 5 parameters.\n");
@@ -113,6 +114,24 @@ void MC::parse_mc(const char** param, int num_param, std::vector<Group>& groups)
       if (group_id >= groups[grouping_method].number) {
         PRINT_INPUT_ERROR("Group ID should < number of groups.");
       }
+
+      bool has_multi_types = false;
+      int type0 = 0;
+      for (int k = 0; k < groups[grouping_method].cpu_size[group_id]; ++k) {
+        int n =
+          groups[grouping_method].cpu_contents[groups[grouping_method].cpu_size_sum[group_id] + k];
+        if (k == 0) {
+          type0 = cpu_type[n];
+        } else {
+          if (cpu_type[n] != type0) {
+            has_multi_types = true;
+            break;
+          }
+        }
+      }
+      if (!has_multi_types) {
+        PRINT_INPUT_ERROR("Must have more than one atom type in the specified group.");
+      }
     }
   }
 
diff --git a/src/mc/mc.cuh b/src/mc/mc.cuh
index dfaebbc22..ea12bbc79 100644
--- a/src/mc/mc.cuh
+++ b/src/mc/mc.cuh
@@ -32,7 +32,8 @@ public:
   void finalize(void);
   void compute(int step, int num_steps, Atom& atom, Box& box);
 
-  void parse_mc(const char** param, int num_param, std::vector<Group>& group);
+  void parse_mc(
+    const char** param, int num_param, std::vector<Group>& group, std::vector<int>& cpu_type);
 
 private:
   bool do_mcmd = false;

From 479915346a6c1c55d1487f8933690ac3842f3c8f Mon Sep 17 00:00:00 2001
From: Zheyong Fan <brucenju@gmail.com>
Date: Thu, 24 Aug 2023 05:14:23 +0800
Subject: [PATCH 30/31] use group in mcmd

---
 src/main_gpumd/run.cu            |  2 +-
 src/mc/mc.cu                     |  4 ++--
 src/mc/mc.cuh                    |  2 +-
 src/mc/mc_ensemble.cuh           |  3 ++-
 src/mc/mc_ensemble_canonical.cu  | 10 ++++++----
 src/mc/mc_ensemble_canonical.cuh |  3 ++-
 6 files changed, 14 insertions(+), 10 deletions(-)

diff --git a/src/main_gpumd/run.cu b/src/main_gpumd/run.cu
index 261ab65b4..a250667e6 100644
--- a/src/main_gpumd/run.cu
+++ b/src/main_gpumd/run.cu
@@ -215,7 +215,7 @@ void Run::perform_a_run()
 
     integrate.compute2(time_step, double(step) / number_of_steps, group, box, atom, thermo);
 
-    mc.compute(step, number_of_steps, atom, box);
+    mc.compute(step, number_of_steps, atom, box, group);
 
     measure.process(
       number_of_steps,
diff --git a/src/mc/mc.cu b/src/mc/mc.cu
index 73de300fd..d80ebc3f0 100644
--- a/src/mc/mc.cu
+++ b/src/mc/mc.cu
@@ -30,13 +30,13 @@ void MC::initialize(void)
 
 void MC::finalize(void) { do_mcmd = false; }
 
-void MC::compute(int step, int num_steps, Atom& atom, Box& box)
+void MC::compute(int step, int num_steps, Atom& atom, Box& box, std::vector<Group>& group)
 {
   if (do_mcmd) {
     if ((step + 2) % num_steps_md == 0) {
       double temperature =
         temperature_initial + step * (temperature_final - temperature_initial) / num_steps;
-      mc_ensemble->compute(step + 2, temperature, atom, box);
+      mc_ensemble->compute(step + 2, temperature, atom, box, group[grouping_method], group_id);
     }
   }
 }
diff --git a/src/mc/mc.cuh b/src/mc/mc.cuh
index ea12bbc79..786f15f8f 100644
--- a/src/mc/mc.cuh
+++ b/src/mc/mc.cuh
@@ -30,7 +30,7 @@ public:
 
   void initialize(void);
   void finalize(void);
-  void compute(int step, int num_steps, Atom& atom, Box& box);
+  void compute(int step, int num_steps, Atom& atom, Box& box, std::vector<Group>& group);
 
   void parse_mc(
     const char** param, int num_param, std::vector<Group>& group, std::vector<int>& cpu_type);
diff --git a/src/mc/mc_ensemble.cuh b/src/mc/mc_ensemble.cuh
index 008efcaaa..ca6422141 100644
--- a/src/mc/mc_ensemble.cuh
+++ b/src/mc/mc_ensemble.cuh
@@ -30,7 +30,8 @@ public:
   MC_Ensemble(void);
   virtual ~MC_Ensemble(void);
 
-  virtual void compute(int md_step, double temperature, Atom& atom, Box& box) = 0;
+  virtual void
+  compute(int md_step, double temperature, Atom& atom, Box& box, Group& group, int group_id) = 0;
 
 protected:
   int num_steps_mc = 0;
diff --git a/src/mc/mc_ensemble_canonical.cu b/src/mc/mc_ensemble_canonical.cu
index f884f085d..dc0e5da18 100644
--- a/src/mc/mc_ensemble_canonical.cu
+++ b/src/mc/mc_ensemble_canonical.cu
@@ -220,7 +220,8 @@ static bool check_if_small_box(const double rc, const Box& box)
   return is_small_box;
 }
 
-void MC_Ensemble_Canonical::compute(int md_step, double temperature, Atom& atom, Box& box)
+void MC_Ensemble_Canonical::compute(
+  int md_step, double temperature, Atom& atom, Box& box, Group& group, int group_id)
 {
   if (check_if_small_box(nep_energy.paramb.rc_radial, box)) {
     printf("Cannot use small box for MCMD.\n");
@@ -232,16 +233,17 @@ void MC_Ensemble_Canonical::compute(int md_step, double temperature, Atom& atom,
     type_after.resize(atom.number_of_atoms);
   }
 
-  std::uniform_int_distribution<int> r1(0, atom.number_of_atoms - 1);
+  int group_size = group.cpu_size[group_id];
+  std::uniform_int_distribution<int> r1(0, group_size - 1);
 
   int num_accepted = 0;
   for (int step = 0; step < num_steps_mc; ++step) {
 
-    int i = r1(rng);
+    int i = group.cpu_contents[group.cpu_size_sum[group_id] + r1(rng)];
     int type_i = atom.cpu_type[i];
     int j = 0, type_j = type_i;
     while (type_i == type_j) {
-      j = r1(rng);
+      j = group.cpu_contents[group.cpu_size_sum[group_id] + r1(rng)];
       type_j = atom.cpu_type[j];
     }
 
diff --git a/src/mc/mc_ensemble_canonical.cuh b/src/mc/mc_ensemble_canonical.cuh
index a76676fb6..960183c4f 100644
--- a/src/mc/mc_ensemble_canonical.cuh
+++ b/src/mc/mc_ensemble_canonical.cuh
@@ -22,7 +22,8 @@ public:
   MC_Ensemble_Canonical(int num_steps_mc);
   virtual ~MC_Ensemble_Canonical(void);
 
-  virtual void compute(int md_step, double temperature, Atom& atom, Box& box);
+  virtual void
+  compute(int md_step, double temperature, Atom& atom, Box& box, Group& group, int group_id);
 
 private:
   GPU_Vector<int> NN_ij;

From 0b2e890178f0cb8eb1c8a928fc7a0f6a8b251ab2 Mon Sep 17 00:00:00 2001
From: Zheyong Fan <brucenju@gmail.com>
Date: Thu, 24 Aug 2023 19:10:08 +0800
Subject: [PATCH 31/31] fix bug related to group

---
 src/mc/mc.cu                     |  2 +-
 src/mc/mc_ensemble.cuh           | 10 ++++++++--
 src/mc/mc_ensemble_canonical.cu  | 21 +++++++++++++++++----
 src/mc/mc_ensemble_canonical.cuh | 10 ++++++++--
 4 files changed, 34 insertions(+), 9 deletions(-)

diff --git a/src/mc/mc.cu b/src/mc/mc.cu
index d80ebc3f0..6b74153bf 100644
--- a/src/mc/mc.cu
+++ b/src/mc/mc.cu
@@ -36,7 +36,7 @@ void MC::compute(int step, int num_steps, Atom& atom, Box& box, std::vector<Grou
     if ((step + 2) % num_steps_md == 0) {
       double temperature =
         temperature_initial + step * (temperature_final - temperature_initial) / num_steps;
-      mc_ensemble->compute(step + 2, temperature, atom, box, group[grouping_method], group_id);
+      mc_ensemble->compute(step + 2, temperature, atom, box, group, grouping_method, group_id);
     }
   }
 }
diff --git a/src/mc/mc_ensemble.cuh b/src/mc/mc_ensemble.cuh
index ca6422141..8499dd2e8 100644
--- a/src/mc/mc_ensemble.cuh
+++ b/src/mc/mc_ensemble.cuh
@@ -30,8 +30,14 @@ public:
   MC_Ensemble(void);
   virtual ~MC_Ensemble(void);
 
-  virtual void
-  compute(int md_step, double temperature, Atom& atom, Box& box, Group& group, int group_id) = 0;
+  virtual void compute(
+    int md_step,
+    double temperature,
+    Atom& atom,
+    Box& box,
+    std::vector<Group>& group,
+    int grouping_method,
+    int group_id) = 0;
 
 protected:
   int num_steps_mc = 0;
diff --git a/src/mc/mc_ensemble_canonical.cu b/src/mc/mc_ensemble_canonical.cu
index dc0e5da18..2f68b8a44 100644
--- a/src/mc/mc_ensemble_canonical.cu
+++ b/src/mc/mc_ensemble_canonical.cu
@@ -221,7 +221,13 @@ static bool check_if_small_box(const double rc, const Box& box)
 }
 
 void MC_Ensemble_Canonical::compute(
-  int md_step, double temperature, Atom& atom, Box& box, Group& group, int group_id)
+  int md_step,
+  double temperature,
+  Atom& atom,
+  Box& box,
+  std::vector<Group>& groups,
+  int grouping_method,
+  int group_id)
 {
   if (check_if_small_box(nep_energy.paramb.rc_radial, box)) {
     printf("Cannot use small box for MCMD.\n");
@@ -233,17 +239,24 @@ void MC_Ensemble_Canonical::compute(
     type_after.resize(atom.number_of_atoms);
   }
 
-  int group_size = group.cpu_size[group_id];
+  int group_size =
+    groups.size() > 0 ? groups[grouping_method].cpu_size[group_id] : atom.number_of_atoms;
   std::uniform_int_distribution<int> r1(0, group_size - 1);
 
   int num_accepted = 0;
   for (int step = 0; step < num_steps_mc; ++step) {
 
-    int i = group.cpu_contents[group.cpu_size_sum[group_id] + r1(rng)];
+    int i = groups.size() > 0
+              ? groups[grouping_method]
+                  .cpu_contents[groups[grouping_method].cpu_size_sum[group_id] + r1(rng)]
+              : r1(rng);
     int type_i = atom.cpu_type[i];
     int j = 0, type_j = type_i;
     while (type_i == type_j) {
-      j = group.cpu_contents[group.cpu_size_sum[group_id] + r1(rng)];
+      j = groups.size() > 0
+            ? groups[grouping_method]
+                .cpu_contents[groups[grouping_method].cpu_size_sum[group_id] + r1(rng)]
+            : r1(rng);
       type_j = atom.cpu_type[j];
     }
 
diff --git a/src/mc/mc_ensemble_canonical.cuh b/src/mc/mc_ensemble_canonical.cuh
index 960183c4f..e2b6d9a11 100644
--- a/src/mc/mc_ensemble_canonical.cuh
+++ b/src/mc/mc_ensemble_canonical.cuh
@@ -22,8 +22,14 @@ public:
   MC_Ensemble_Canonical(int num_steps_mc);
   virtual ~MC_Ensemble_Canonical(void);
 
-  virtual void
-  compute(int md_step, double temperature, Atom& atom, Box& box, Group& group, int group_id);
+  virtual void compute(
+    int md_step,
+    double temperature,
+    Atom& atom,
+    Box& box,
+    std::vector<Group>& group,
+    int grouping_method,
+    int group_id);
 
 private:
   GPU_Vector<int> NN_ij;