From d2baf5f8194303e1151d6c59ff37c0e9fd39b965 Mon Sep 17 00:00:00 2001
From: Simon Blackburn <simon.blackburn@mila.quebec>
Date: Mon, 5 Aug 2024 10:32:21 -0400
Subject: [PATCH 01/13] branch init

---
 README_active_learning.md | 1 +
 1 file changed, 1 insertion(+)
 create mode 100644 README_active_learning.md

diff --git a/README_active_learning.md b/README_active_learning.md
new file mode 100644
index 00000000..89f9f177
--- /dev/null
+++ b/README_active_learning.md
@@ -0,0 +1 @@
+Instructions to run the active learning benchmark.
\ No newline at end of file

From a69c6e56ef46e842cd248078d045a44308977a0c Mon Sep 17 00:00:00 2001
From: Simon Blackburn <simon.blackburn@mila.quebec>
Date: Mon, 5 Aug 2024 15:19:38 -0400
Subject: [PATCH 02/13] adding hydra to the requirements

---
 .../{train_mtp.py => mlip/mtp_train.py}       |  0
 crystal_diffusion/mlip/mtp_utils.py           | 59 +++++++++++++++++++
 requirements.txt                              |  3 +-
 3 files changed, 61 insertions(+), 1 deletion(-)
 rename crystal_diffusion/{train_mtp.py => mlip/mtp_train.py} (100%)
 create mode 100644 crystal_diffusion/mlip/mtp_utils.py

diff --git a/crystal_diffusion/train_mtp.py b/crystal_diffusion/mlip/mtp_train.py
similarity index 100%
rename from crystal_diffusion/train_mtp.py
rename to crystal_diffusion/mlip/mtp_train.py
diff --git a/crystal_diffusion/mlip/mtp_utils.py b/crystal_diffusion/mlip/mtp_utils.py
new file mode 100644
index 00000000..71076e52
--- /dev/null
+++ b/crystal_diffusion/mlip/mtp_utils.py
@@ -0,0 +1,59 @@
+from typing import Any, Dict, List, Tuple
+
+import numpy as np
+import yaml
+from pymatgen.core import Structure
+
+
+def extract_structure_and_forces_from_file(filename: str, atom_dict: Dict[int, Any]) -> \
+        Tuple[List[Structure], List[List[float]]]:
+    """Convert LAMMPS yaml output in a format compatible with MTP training and evaluation methods.
+
+    Args:
+        filename: path to LAMMPS output file in yaml format
+        atom_dict: mapping from LAMMPS atom indices to atom type (atomic number as int or atom name as str)
+
+    Returns:
+        list of pymatgen Structure containing the atoms and their positions
+        list of forces (n x 3) for each atom
+    """
+    structures = []
+    forces = []
+    with (open(filename, 'r') as f):
+        l_yaml = yaml.safe_load_all(f)
+        for d in l_yaml:  # loop over LAMMPS outputs and convert in pymatgen Structure objects
+            # lattice in yaml is 3 x 2 [0, x_lim]
+            # we assume a rectangular lattice for now with the 2nd coordinates as the lattice vectors
+            lattice = np.zeros((3, 3))
+            for i, x in enumerate(d['box']):
+                lattice[i, i] = x[1]
+            type_idx = d['keywords'].index('type')
+            species = [atom_dict[x[type_idx]] for x in d['data']]  # convert to atom type
+            coords_idx = [d['keywords'].index(x) for x in ['x', 'y', 'z']]
+            coords = [[x[i] for i in coords_idx] for x in d['data']]
+            pm_structure = Structure(lattice=lattice,
+                                     species=species,
+                                     coords=coords,
+                                     coords_are_cartesian=True)
+            structures.append(pm_structure)
+            force_idx = [d['keywords'].index(x) for x in ['fx', 'fy', 'fz']]
+            structure_forces = [[x[i] for i in force_idx] for x in d['data']]
+            forces.append(structure_forces)
+    return structures, forces
+
+
+def extract_energy_from_thermo_log(filename: str) -> List[float]:
+    """Read energies from LAMMPS thermodynamic output file.
+
+    Args:
+        filename: path to LAMMPS thermodynamic output file in yaml format.
+
+    Returns:
+        list of energies (1 value per configuration)
+    """
+    with open(filename, 'r') as f:
+        log_yaml = yaml.safe_load(f)
+        kin_idx = log_yaml['keywords'].index('KinEng')
+        pot_idx = log_yaml['keywords'].index('PotEng')
+        energies = [x[kin_idx] + x[pot_idx] for x in log_yaml['data']]
+    return energies
diff --git a/requirements.txt b/requirements.txt
index 34680d3e..7bc45fec 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,6 +2,7 @@ datasets==2.17.1
 flake8==4.0.1
 flake8-docstrings==1.6.0
 gitpython==3.1.27
+hydra-core==1.3.2
 isort==5.13.2
 jupyter==1.0.0
 jinja2==3.1.2
@@ -36,4 +37,4 @@ pykeops==2.2.3
 comet_ml
 einops==0.8.0
 torchode==0.2.0
-torchsde==0.2.6
\ No newline at end of file
+torchsde==0.2.6

From 44b06ffa4f8fed31440754b0620219a665ead743 Mon Sep 17 00:00:00 2001
From: Simon Blackburn <simon.blackburn@mila.quebec>
Date: Wed, 7 Aug 2024 10:47:53 -0400
Subject: [PATCH 03/13] wip: active learning loop with mostly placeholders

---
 .../active_learning_loop/benchmark.py         | 160 ++++++++++++++++++
 crystal_diffusion/mlip/mtp_train.py           |  99 +----------
 crystal_diffusion/mlip/mtp_utils.py           |  64 +++++++
 3 files changed, 231 insertions(+), 92 deletions(-)
 create mode 100644 crystal_diffusion/active_learning_loop/benchmark.py

diff --git a/crystal_diffusion/active_learning_loop/benchmark.py b/crystal_diffusion/active_learning_loop/benchmark.py
new file mode 100644
index 00000000..5e0e2ea1
--- /dev/null
+++ b/crystal_diffusion/active_learning_loop/benchmark.py
@@ -0,0 +1,160 @@
+import argparse
+import os
+from dataclasses import dataclass
+from typing import Dict, List, Optional
+
+import pandas as pd
+import yaml
+from hydra.utils import instantiate
+
+from crystal_diffusion.mlip.mtp_train import train_mtp, prepare_dataset, evaluate_mtp
+from crystal_diffusion.models.mtp import MTPWithMLIP3
+
+
+def get_arguments() -> argparse.Namespace:
+    """Parse arguments.
+
+    Returns:
+        args: arguments
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--mtp_config', help='path to data directory', required=True)
+    args = parser.parse_args()
+    return args
+
+
+@dataclass(kw_only=True)
+class MTPArguments:
+    training_data_dir: str  # training data directory
+    evaluation_data_dir: str  # evaluation data directory
+    mlip_dir: str  # directory with the mlp executable
+    output_dir: str  # directory where to save the MTP outputs
+
+
+@dataclass(kw_only=True)
+class StructureEvaluationArguments:
+    evaluation_criteria: str ='nbh_grades'
+    criteria_threshold: float = 10
+    number_of_structures: int = None
+    extraction_radius: float = 3
+
+
+def train_mlip(mtp_args: MTPArguments, atom_dict: Dict[int, str]) -> MTPWithMLIP3:
+    """Train a MTP model using the specified data.
+
+    Args:
+        mtp_args: MTPArguments data class
+        atom_dict: map between atom names and indices used by LAMMPS
+
+    Returns:
+        trained MTP model
+    """
+    # TODO make it more configurable
+    train_datasets = prepare_dataset(mtp_args.training_data_dir, atom_dict, mode="train")
+    # create the output directory if it doesn't exist already
+    trained_mtp = train_mtp(train_datasets, mlip_folder_path=mtp_args.mlip_dir, save_dir=mtp_args.output_dir)
+    return trained_mtp
+
+
+def evaluate_mlip(mtp_args: MTPArguments, atom_dict: Dict[int, str], mtp: MTPWithMLIP3) -> pd.DataFrame:
+    """Evaluate a MTP model using the specified data.
+
+    Args:
+        mtp_args: MTPArguments data class
+        atom_dict:  map between atom names and indices used by LAMMPS
+        mtp: trained MTP model
+
+    Returns:
+        dataframe with a column specifying the structure, a column with the atom index, 3 columns with the x,y,z
+        coordinates and a column with the MaxVol criteria (nbh_grades)
+    """
+    evaluation_datasets = prepare_dataset(mtp_args.evaluation_data_dir, atom_dict, mode="evaluation")
+    # TODO the current evaluation method also returns the ground truth information - this won't always be the case
+    # TODO make more configurable
+    _, prediction_df = evaluate_mtp(evaluation_datasets, mtp)
+    return prediction_df
+
+
+def get_structures_for_retraining(prediction_df: pd.DataFrame,
+                                  criteria_threshold: Optional[float] = None,
+                                  number_of_structures: Optional[int] = None,
+                                  evaluation_criteria: str = 'nbh_grades',
+                                  structure_index: str = 'structure_index'
+                                  ) -> List[pd.DataFrame]:
+    assert criteria_threshold is not None or number_of_structures is not None, \
+        "criteria_threshold or number_of_structures should be set."
+    # get the highest evaluation_criteria for each structure i.e. only the worst atom counts for structure selection
+    criteria_by_structure = prediction_df[[evaluation_criteria, structure_index]].groupby(structure_index).max()
+    # find the top number_of_structures
+    structures_indices = criteria_by_structure.sort_values(by=evaluation_criteria, ascending=False)
+    if number_of_structures is not None:
+        structures_indices = structures_indices[:number_of_structures]
+    else:  #  criteria_threshold is not None
+        structures_indices = structures_indices[structures_indices[evaluation_criteria] >= criteria_threshold]
+    structures_indices = structures_indices.index.to_list()
+    assert len(structures_indices) > 0, "No structure meet the criteria."
+    structures_to_retrain = []
+    for idx in structures_indices:
+        structures_to_retrain.append(prediction_df[prediction_df[structure_index] == idx])
+    return structures_to_retrain
+
+
+def extract_target_region(structure_df: pd.DataFrame,
+                          extraction_radius: float,
+                          evaluation_criteria: str ='nbh_grades') -> pd.DataFrame:
+    """Extract the atom with the worst evaluation criteria and all the atoms within a distance extraction_radius.
+
+    Args:
+        structure_df: dataframe with the atomic positions and the evaluation criteria (e.g. MaxVol value)
+        extraction_radius: include all atoms within this distance of the targeted atom
+        evaluation_criteria: name of the evaluation criteria. Defaults to nbh_grades (maxvol in MTP)
+
+    Returns:
+        dataframe with the atomic coordinates in columns x, y, z
+    """
+    # extract the worst atom and a region around it of radius extraction_radius
+    # TODO better method to determine radius: number of atoms ?
+    target_atom = structure_df[evaluation_criteria].idxmax()
+    target_position = structure_df.loc[target_atom][['x', 'y', 'z']]
+    structure_df.loc[:, 'distance_squared'] = structure_df.apply(
+        lambda x: sum([(x[i] - target_position[i]) ** 2 for i in ['x', 'y', 'z']]), axis=1)
+    atom_positions = structure_df.loc[structure_df['distance_squared'] <= extraction_radius ** 2, ['x', 'y', 'z']]
+    return atom_positions
+
+
+def generate_new_structures(fixed_atoms: List[pd.DataFrame]) -> None:
+    pass
+
+
+def main():
+    # args = get_arguments()
+    # TODO get mtp_config_path from the args
+    mtp_config_path = "/Users/simonb/ic-collab/courtois_collab/crystal_diffusion/experiments/active_learning_benchmark/"
+    mtp_config_path = os.path.join(mtp_config_path, "config", "mtp_training.yaml")
+    with open(mtp_config_path, 'r') as stream:
+        mtp_config = yaml.load(stream, Loader=yaml.FullLoader)
+    # use hydra to convert the yaml file in a dataclass format
+    mtp_config = instantiate(mtp_config)
+    os.makedirs(mtp_config.output_dir, exist_ok=True)
+    atom_dict = {1: "Si"}  # TODO this should be define somewhere smart
+    # STEP 1: train a MLIP
+    trained_mtp = train_mlip(mtp_config, atom_dict)
+    # STEP 2: evaluate the MLIP
+    prediction_df = evaluate_mlip(mtp_config, atom_dict, trained_mtp)
+    # STEP 3: identify the problematic structures
+    # TODO extraction_params should come from a config file with hydra instantiate
+    extraction_params = StructureEvaluationArguments()
+    structures_to_retrain = get_structures_for_retraining(prediction_df,
+                                                          criteria_threshold=extraction_params.criteria_threshold,
+                                                          number_of_structures=extraction_params.number_of_structures,
+                                                          evaluation_criteria=extraction_params.evaluation_criteria)
+    # STEP 4: extract the region
+    bad_regions = [extract_target_region(s, extraction_radius=extraction_params.extraction_radius)
+                   for s in structures_to_retrain]
+    # STEP 5: call the generative model to create new candidates
+
+    print('hello')
+
+
+if __name__ == '__main__':
+    main()
diff --git a/crystal_diffusion/mlip/mtp_train.py b/crystal_diffusion/mlip/mtp_train.py
index 59d65bf3..3153ccee 100644
--- a/crystal_diffusion/mlip/mtp_train.py
+++ b/crystal_diffusion/mlip/mtp_train.py
@@ -1,8 +1,9 @@
 """Script to train and evaluate a MTP.
 
-Running the main() runs a debugging example. Entry points are train_mtp and evaluate_mtp.
+Running the main() runs a debugging example. Entry points are train_mtp.
 """
 import argparse
+from dataclasses import dataclass
 from typing import Any, Dict, List, NamedTuple, Tuple
 
 import numpy as np
@@ -12,101 +13,15 @@
 from sklearn.metrics import mean_absolute_error
 
 from crystal_diffusion.models.mtp import MTPWithMLIP3
+from crystal_diffusion.mlip.mtp_utils import prepare_mtp_inputs_from_lammps, crawl_lammps_directory, MTP_Inputs
 
 atom_dict = {1: 'Si'}
 
 
-def extract_structure_and_forces_from_file(filename: str, atom_dict: Dict[int, Any]) -> \
-        Tuple[List[Structure], List[List[float]]]:
-    """Convert LAMMPS yaml output in a format compatible with MTP training and evaluation methods.
-
-    Args:
-        filename: path to LAMMPS output file in yaml format
-        atom_dict: mapping from LAMMPS atom indices to atom type (atomic number as int or atom name as str)
-
-    Returns:
-        list of pymatgen Structure containing the atoms and their positions
-        list of forces (n x 3) for each atom
-    """
-    structures = []
-    forces = []
-    with (open(filename, 'r') as f):
-        l_yaml = yaml.safe_load_all(f)
-        for d in l_yaml:  # loop over LAMMPS outputs and convert in pymatgen Structure objects
-            # lattice in yaml is 3 x 2 [0, x_lim]
-            # we assume a rectangular lattice for now with the 2nd coordinates as the lattice vectors
-            lattice = np.zeros((3, 3))
-            for i, x in enumerate(d['box']):
-                lattice[i, i] = x[1]
-            type_idx = d['keywords'].index('type')
-            species = [atom_dict[x[type_idx]] for x in d['data']]  # convert to atom type
-            coords_idx = [d['keywords'].index(x) for x in ['x', 'y', 'z']]
-            coords = [[x[i] for i in coords_idx] for x in d['data']]
-            pm_structure = Structure(lattice=lattice,
-                                     species=species,
-                                     coords=coords,
-                                     coords_are_cartesian=True)
-            structures.append(pm_structure)
-            force_idx = [d['keywords'].index(x) for x in ['fx', 'fy', 'fz']]
-            structure_forces = [[x[i] for i in force_idx] for x in d['data']]
-            forces.append(structure_forces)
-    return structures, forces
-
-
-def extract_energy_from_thermo_log(filename: str) -> List[float]:
-    """Read energies from LAMMPS thermodynamic output file.
-
-    Args:
-        filename: path to LAMMPS thermodynamic output file in yaml format.
-
-    Returns:
-        list of energies (1 value per configuration)
-    """
-    with open(filename, 'r') as f:
-        log_yaml = yaml.safe_load(f)
-        kin_idx = log_yaml['keywords'].index('KinEng')
-        pot_idx = log_yaml['keywords'].index('PotEng')
-        energies = [x[kin_idx] + x[pot_idx] for x in log_yaml['data']]
-    return energies
-
-
-class MTP_Inputs(NamedTuple):
-    """Create a namedtuple instance for MTP inputs."""
-
-    structure: List[Structure]
-    forces: List[List[float]]
-    energy: List[float]
-
-
-def prepare_mtp_inputs_from_lammps(output_yaml: List[str],
-                                   thermo_yaml: List[str],
-                                   atom_dict: Dict[int, Any]
-                                   ) -> MTP_Inputs:
-    """Convert a list of LAMMPS output files and thermodynamic output files to MTP input format.
-
-    Args:
-        output_yaml: list of LAMMPS output files as yaml.
-        thermo_yaml: list of LAMMPS thermodynamic output files as yaml.
-        atom_dict: mapping of LAMMPS indices to atom type.
-
-    Returns:
-        namedtuple with structure, energies and forces usable by MTP.
-    """
-    mtp_inputs = {
-        'structure': [],
-        'energy': [],
-        'forces': []
-    }
-    for filename in output_yaml:
-        structures, forces = extract_structure_and_forces_from_file(filename, atom_dict)
-        mtp_inputs['structure'] += structures
-        mtp_inputs['forces'] += forces
-    for filename in thermo_yaml:
-        mtp_inputs['energy'] += extract_energy_from_thermo_log(filename)
-    mtp_inputs = MTP_Inputs(structure=mtp_inputs['structure'],
-                            energy=mtp_inputs['energy'],
-                            forces=mtp_inputs['forces'])
-    return mtp_inputs
+def prepare_dataset(root_data_dir: str, atom_dict: Dict[int, str], mode: str = "train") -> MTP_Inputs:
+    lammps_outputs, thermo_outputs = crawl_lammps_directory(root_data_dir, mode)
+    mtp_dataset = prepare_mtp_inputs_from_lammps(lammps_outputs, thermo_outputs, atom_dict)
+    return mtp_dataset
 
 
 def train_mtp(train_inputs: MTP_Inputs, mlip_folder_path: str, save_dir: str) -> MTPWithMLIP3:
diff --git a/crystal_diffusion/mlip/mtp_utils.py b/crystal_diffusion/mlip/mtp_utils.py
index 71076e52..f717d8f6 100644
--- a/crystal_diffusion/mlip/mtp_utils.py
+++ b/crystal_diffusion/mlip/mtp_utils.py
@@ -1,3 +1,6 @@
+import os
+import re
+from dataclasses import dataclass
 from typing import Any, Dict, List, Tuple
 
 import numpy as np
@@ -5,6 +8,14 @@
 from pymatgen.core import Structure
 
 
+@dataclass(kw_only=True)
+class MTP_Inputs:
+    """Create a dataclass to train or evaluate a MTP model."""
+    structure: List[Structure]
+    forces: List[List[float]]
+    energy: List[float]
+
+
 def extract_structure_and_forces_from_file(filename: str, atom_dict: Dict[int, Any]) -> \
         Tuple[List[Structure], List[List[float]]]:
     """Convert LAMMPS yaml output in a format compatible with MTP training and evaluation methods.
@@ -57,3 +68,56 @@ def extract_energy_from_thermo_log(filename: str) -> List[float]:
         pot_idx = log_yaml['keywords'].index('PotEng')
         energies = [x[kin_idx] + x[pot_idx] for x in log_yaml['data']]
     return energies
+
+
+def prepare_mtp_inputs_from_lammps(output_yaml: List[str],
+                                   thermo_yaml: List[str],
+                                   atom_dict: Dict[int, Any]
+                                   ) -> MTP_Inputs:
+    """Convert a list of LAMMPS output files and thermodynamic output files to MTP input format.
+
+    Args:
+        output_yaml: list of LAMMPS output files as yaml.
+        thermo_yaml: list of LAMMPS thermodynamic output files as yaml.
+        atom_dict: mapping of LAMMPS indices to atom type.
+
+    Returns:
+        dataclass holding the structures, forces and energies used to train or evaluate an MTP model
+    """
+    mtp_inputs = {
+        'structure': [],
+        'energy': [],
+        'forces': []
+    }
+    for filename in output_yaml:
+        structures, forces = extract_structure_and_forces_from_file(filename, atom_dict)
+        mtp_inputs['structure'] += structures
+        mtp_inputs['forces'] += forces
+    for filename in thermo_yaml:
+        mtp_inputs['energy'] += extract_energy_from_thermo_log(filename)
+    mtp_inputs = MTP_Inputs(structure=mtp_inputs['structure'],
+                            energy=mtp_inputs['energy'],
+                            forces=mtp_inputs['forces'])
+    return mtp_inputs
+
+
+def crawl_lammps_directory(folder_name: str, folder_name_pattern: str="train") -> Tuple[List[str], List[str]]:
+    """Crawl through a folder and find the LAMMPS output files in folders containing a specified pattern in their name.
+
+    LAMMPS outputs should end with dump.yaml and Thermondynamics variables files should end with thermo.yaml
+
+    Args:
+        folder_name: folder to crawl
+        folder_name_pattern (optional): name of the subfolder to keep. Defaults to train.
+
+    Returns:
+        list of LAMMPS dump outputs and list of LAMMPS thermo outputs
+
+    """
+    assert os.path.exists(folder_name), "Invalid folder name provided."
+    lammps_output_files, thermo_output_files = [], []
+    for dirpath, _, filenames in os.walk(folder_name):
+        if re.search(folder_name_pattern, dirpath):
+            lammps_output_files.extend([os.path.join(dirpath, f) for f in filenames if f.endswith("dump.yaml")])
+            thermo_output_files.extend([os.path.join(dirpath, f) for f in filenames if f.endswith("thermo.yaml")])
+    return lammps_output_files, thermo_output_files

From 3c307d66d6e30437a5ff33ec88e1d662d641590f Mon Sep 17 00:00:00 2001
From: Simon Blackburn <simon.blackburn@mila.quebec>
Date: Fri, 9 Aug 2024 10:19:47 -0400
Subject: [PATCH 04/13] wip: active learning loop - training a MTP with modular
 classes

---
 .../active_learning_loop/benchmark.py         | 143 +++++++++++++----
 crystal_diffusion/mlip/mtp_train.py           |   4 +-
 crystal_diffusion/mlip/mtp_utils.py           |  12 +-
 crystal_diffusion/models/{ => mlip}/mtp.py    | 145 +++++++++---------
 .../config/mtp_training.yaml                  |  33 ++++
 5 files changed, 222 insertions(+), 115 deletions(-)
 rename crystal_diffusion/models/{ => mlip}/mtp.py (73%)
 create mode 100644 experiments/active_learning_benchmark/config/mtp_training.yaml

diff --git a/crystal_diffusion/active_learning_loop/benchmark.py b/crystal_diffusion/active_learning_loop/benchmark.py
index 5e0e2ea1..b17f6110 100644
--- a/crystal_diffusion/active_learning_loop/benchmark.py
+++ b/crystal_diffusion/active_learning_loop/benchmark.py
@@ -3,12 +3,103 @@
 from dataclasses import dataclass
 from typing import Dict, List, Optional
 
+import numpy as np
 import pandas as pd
 import yaml
 from hydra.utils import instantiate
 
-from crystal_diffusion.mlip.mtp_train import train_mtp, prepare_dataset, evaluate_mtp
-from crystal_diffusion.models.mtp import MTPWithMLIP3
+# from crystal_diffusion.mlip.mtp_train import prepare_dataset, evaluate_mtp
+from crystal_diffusion.models.mlip.mtp import MTPWithMLIP3
+from crystal_diffusion.mlip.mtp_utils import MTPInputs
+
+
+@dataclass(kw_only=True)
+class ActiveLearningDataArguments:
+    training_data_dir: str  # training data directory
+    evaluation_data_dir: str  # evaluation data directory
+    output_dir: str  # directory where to save the results
+
+
+@dataclass(kw_only=True)
+class StructureEvaluationArguments:
+    evaluation_criteria: str = 'nbh_grades'
+    criteria_threshold: float = 10
+    number_of_structures: int = None
+    extraction_radius: float = 3
+
+
+class ActiveLearningLoop:
+    def __init__(self,
+                 meta_config: str,
+                 evaluation_config: str):
+        assert os.path.exists(meta_config), "configuration file for active learning loop does not exist."
+        self.data_paths, self.mlip_model, self.evaluation_args = None, None, None
+        # use hydra to convert the yaml file in a dataclass format
+        self.parse_config(meta_config)
+        self.atom_dict = {1: "Si"}  # TODO this should be define somewhere smart
+        self.trained_mlips = []  # history of trained MLIPs (optional - not sure if we should keep this)
+
+    def parse_config(self, meta_config: str):
+        with open(meta_config, 'r') as stream:
+            meta_config = yaml.load(stream, Loader=yaml.FullLoader)
+        self.data_paths = instantiate(meta_config['active_learning_data'])
+        self.mlip_model = instantiate(meta_config['mlip'])
+        self.evaluation_args = instantiate(meta_config['structure_evaluation'])
+
+    def train_mlip(self, round: int = 1) -> MTPWithMLIP3:
+        train_dataset = self.mlip_model.prepare_dataset_from_lammps(
+            root_data_dir=self.data_paths.training_data_dir,
+            atom_dict=self.atom_dict,
+            mode="train"
+        )
+        trained_mtp = self.mlip_model.train(train_dataset, mlip_name=f'mlip_round_{round}')
+        self.trained_mlips.append(trained_mtp)  # history of trained MLIPs ... not sure if useful
+        return trained_mtp
+
+    def evaluate_mlip(self, trained_mtp: MTPWithMLIP3) -> pd.DataFrame:
+        prediction_df = evaluate_mlip(self.mtp_config, self.atom_dict, trained_mtp)
+        return prediction_df
+
+    def get_bad_structures(self, prediction_df: pd.DataFrame) -> List[pd.DataFrame]:
+        structures_to_retrain = get_structures_for_retraining(prediction_df,
+                                                              criteria_threshold=self.eval_config.criteria_threshold,
+                                                              number_of_structures=self.eval_config.number_of_structures,
+                                                              evaluation_criteria=self.eval_config.evaluation_criteria)
+        return structures_to_retrain
+
+    def excise_worst_atom(self, structures_to_retrain: List[pd.DataFrame]) -> List[pd.DataFrame]:
+        bad_regions = [extract_target_region(s, extraction_radius=self.eval_config.extraction_radius)
+                       for s in structures_to_retrain]
+        return bad_regions
+
+    def get_structure_candidate_from_generative_model(self,
+                                                      fixed_atoms: pd.DataFrame,
+                                                      number_of_candidates: int =1
+                                                      ) -> None:
+        # TODO: call the diffusion model and get number_of_candidates samples with repaint using the fixed_atoms
+        # the output should be a list of np.ndarray (?)
+        return None
+
+    def update_mlip_training_set(self, new_structures: List[np.ndarray]):
+        # TODO take the examples generated by the diffusion model and append them to the MLIP training set
+        return None
+
+    def round_of_active_learning_loop(self, trained_mtp: Optional[MTPWithMLIP3]=None, training_set=None):
+        # one round from a known mtp (or train from provided training set)
+        # evaluate, find candidates and update MTP
+        # return the updated MTP
+        if trained_mtp is None:
+            trained_mtp = self.train_mlip(training_set)
+        pred_df = self.evaluate_mlip(trained_mtp)
+        bad_structures = self.get_bad_structures(pred_df)
+        bad_regions = self.excise_worst_atom(bad_structures)
+        new_candidates = [self.get_structure_candidate_from_generative_model(x) for x in bad_regions]
+        new_mlip_training_set = self.update_mlip_training_set(self, new_candidates)
+        new_mtp = self.trained_mlip(new_mlip_training_set)
+        return new_mtp
+
+    def evaluate_mtp_update(self):
+        pass
 
 
 def get_arguments() -> argparse.Namespace:
@@ -23,23 +114,10 @@ def get_arguments() -> argparse.Namespace:
     return args
 
 
-@dataclass(kw_only=True)
-class MTPArguments:
-    training_data_dir: str  # training data directory
-    evaluation_data_dir: str  # evaluation data directory
-    mlip_dir: str  # directory with the mlp executable
-    output_dir: str  # directory where to save the MTP outputs
-
 
-@dataclass(kw_only=True)
-class StructureEvaluationArguments:
-    evaluation_criteria: str ='nbh_grades'
-    criteria_threshold: float = 10
-    number_of_structures: int = None
-    extraction_radius: float = 3
 
 
-def train_mlip(mtp_args: MTPArguments, atom_dict: Dict[int, str]) -> MTPWithMLIP3:
+def train_mlip(mtp_args, atom_dict: Dict[int, str]) -> MTPWithMLIP3:
     """Train a MTP model using the specified data.
 
     Args:
@@ -56,7 +134,7 @@ def train_mlip(mtp_args: MTPArguments, atom_dict: Dict[int, str]) -> MTPWithMLIP
     return trained_mtp
 
 
-def evaluate_mlip(mtp_args: MTPArguments, atom_dict: Dict[int, str], mtp: MTPWithMLIP3) -> pd.DataFrame:
+def evaluate_mlip(mtp_args, atom_dict: Dict[int, str], mtp: MTPWithMLIP3) -> pd.DataFrame:
     """Evaluate a MTP model using the specified data.
 
     Args:
@@ -129,30 +207,27 @@ def generate_new_structures(fixed_atoms: List[pd.DataFrame]) -> None:
 def main():
     # args = get_arguments()
     # TODO get mtp_config_path from the args
-    mtp_config_path = "/Users/simonb/ic-collab/courtois_collab/crystal_diffusion/experiments/active_learning_benchmark/"
-    mtp_config_path = os.path.join(mtp_config_path, "config", "mtp_training.yaml")
-    with open(mtp_config_path, 'r') as stream:
-        mtp_config = yaml.load(stream, Loader=yaml.FullLoader)
-    # use hydra to convert the yaml file in a dataclass format
-    mtp_config = instantiate(mtp_config)
-    os.makedirs(mtp_config.output_dir, exist_ok=True)
+    config_path = "/Users/simonb/ic-collab/courtois_collab/crystal_diffusion/experiments/active_learning_benchmark/"
+    config_path = os.path.join(config_path, "config", "mtp_training.yaml")
+    al_loop = ActiveLearningLoop(config_path, 'hello')
+    al_loop.train_mlip()
+
     atom_dict = {1: "Si"}  # TODO this should be define somewhere smart
     # STEP 1: train a MLIP
-    trained_mtp = train_mlip(mtp_config, atom_dict)
+    # trained_mtp = train_mlip(mtp_config, atom_dict)
     # STEP 2: evaluate the MLIP
-    prediction_df = evaluate_mlip(mtp_config, atom_dict, trained_mtp)
+    # prediction_df = evaluate_mlip(mtp_config, atom_dict, trained_mtp)
     # STEP 3: identify the problematic structures
     # TODO extraction_params should come from a config file with hydra instantiate
-    extraction_params = StructureEvaluationArguments()
-    structures_to_retrain = get_structures_for_retraining(prediction_df,
-                                                          criteria_threshold=extraction_params.criteria_threshold,
-                                                          number_of_structures=extraction_params.number_of_structures,
-                                                          evaluation_criteria=extraction_params.evaluation_criteria)
+    # extraction_params = StructureEvaluationArguments()
+    #structures_to_retrain = get_structures_for_retraining(prediction_df,
+    #                                                      criteria_threshold=extraction_params.criteria_threshold,
+    #                                                      number_of_structures=extraction_params.number_of_structures,
+    #                                                      evaluation_criteria=extraction_params.evaluation_criteria)
     # STEP 4: extract the region
-    bad_regions = [extract_target_region(s, extraction_radius=extraction_params.extraction_radius)
-                   for s in structures_to_retrain]
+    #bad_regions = [extract_target_region(s, extraction_radius=extraction_params.extraction_radius)
+    #               for s in structures_to_retrain]
     # STEP 5: call the generative model to create new candidates
-
     print('hello')
 
 
diff --git a/crystal_diffusion/mlip/mtp_train.py b/crystal_diffusion/mlip/mtp_train.py
index 3153ccee..8e449f80 100644
--- a/crystal_diffusion/mlip/mtp_train.py
+++ b/crystal_diffusion/mlip/mtp_train.py
@@ -12,8 +12,8 @@
 from pymatgen.core import Structure
 from sklearn.metrics import mean_absolute_error
 
-from crystal_diffusion.models.mtp import MTPWithMLIP3
-from crystal_diffusion.mlip.mtp_utils import prepare_mtp_inputs_from_lammps, crawl_lammps_directory, MTP_Inputs
+from crystal_diffusion.models.mlip.mtp import MTPWithMLIP3
+from crystal_diffusion.mlip.mtp_utils import prepare_mtp_inputs_from_lammps, crawl_lammps_directory, MTPInputs
 
 atom_dict = {1: 'Si'}
 
diff --git a/crystal_diffusion/mlip/mtp_utils.py b/crystal_diffusion/mlip/mtp_utils.py
index f717d8f6..9f2f974c 100644
--- a/crystal_diffusion/mlip/mtp_utils.py
+++ b/crystal_diffusion/mlip/mtp_utils.py
@@ -9,7 +9,7 @@
 
 
 @dataclass(kw_only=True)
-class MTP_Inputs:
+class MTPInputs:
     """Create a dataclass to train or evaluate a MTP model."""
     structure: List[Structure]
     forces: List[List[float]]
@@ -73,7 +73,7 @@ def extract_energy_from_thermo_log(filename: str) -> List[float]:
 def prepare_mtp_inputs_from_lammps(output_yaml: List[str],
                                    thermo_yaml: List[str],
                                    atom_dict: Dict[int, Any]
-                                   ) -> MTP_Inputs:
+                                   ) -> MTPInputs:
     """Convert a list of LAMMPS output files and thermodynamic output files to MTP input format.
 
     Args:
@@ -95,13 +95,13 @@ def prepare_mtp_inputs_from_lammps(output_yaml: List[str],
         mtp_inputs['forces'] += forces
     for filename in thermo_yaml:
         mtp_inputs['energy'] += extract_energy_from_thermo_log(filename)
-    mtp_inputs = MTP_Inputs(structure=mtp_inputs['structure'],
-                            energy=mtp_inputs['energy'],
-                            forces=mtp_inputs['forces'])
+    mtp_inputs = MTPInputs(structure=mtp_inputs['structure'],
+                           energy=mtp_inputs['energy'],
+                           forces=mtp_inputs['forces'])
     return mtp_inputs
 
 
-def crawl_lammps_directory(folder_name: str, folder_name_pattern: str="train") -> Tuple[List[str], List[str]]:
+def crawl_lammps_directory(folder_name: str, folder_name_pattern: str= "train") -> Tuple[List[str], List[str]]:
     """Crawl through a folder and find the LAMMPS output files in folders containing a specified pattern in their name.
 
     LAMMPS outputs should end with dump.yaml and Thermondynamics variables files should end with thermo.yaml
diff --git a/crystal_diffusion/models/mtp.py b/crystal_diffusion/models/mlip/mtp.py
similarity index 73%
rename from crystal_diffusion/models/mtp.py
rename to crystal_diffusion/models/mlip/mtp.py
index 67570495..474b6c25 100644
--- a/crystal_diffusion/models/mtp.py
+++ b/crystal_diffusion/models/mlip/mtp.py
@@ -10,6 +10,7 @@
 import shutil
 import subprocess
 from collections import defaultdict
+from dataclasses import dataclass
 from typing import Any, Dict, List, Optional, TextIO, Tuple
 
 import numpy as np
@@ -20,30 +21,47 @@
 from monty.tempfile import ScratchDir
 from pymatgen.core import Structure
 
+from crystal_diffusion.mlip.mtp_utils import MTPInputs, crawl_lammps_directory, prepare_mtp_inputs_from_lammps
+
+
+@dataclass(kw_only=True)
+class MTPArguments:
+    mlip_path: str  # path to MLIP3 library
+    name: Optional[str]= None  # MTP
+    param: Optional[Dict[Any, Any]] = None
+    unfitted_mtp: str = "08.almtp"  # Define the initial mtp file. Defaults to 08.almtp
+    fitted_mtp_savedir: str = '../'  # save directory for the fitted MTP. Defaults to '../' (current wd)
+    max_dist: float = 5  # The actual radial cutoff. Defaults to 5.
+    radial_basis_size: int = 8  # Relevant to number of radial basis function. Defaults to 8.
+    max_iter: int = 1000  # The number of maximum iteration. Defaults to 1000.
+    energy_weight: float = 1  # The weight of energy. Defaults to 1
+    force_weight: float = 1e-2  # The weight of forces. Defaults to 1e-2
+    stress_weight: float = 1e-3  # The weight of stresses. Zero-weight can be assigned. Defaults to 1e-3.
+    init_params: str = "same"  # how to initialize parameters if a potential was not pre-fitted: "same" or "random".
+    scale_by_force: float = 0  # If > 0 then configurations near equilibrium get more weight. Defaults to 0.
+    bfgs_conv_tol: float = 1e-3  # Stop training if error dropped by a factor smaller than this over 50 BFGS iterations.
+    weighting: str = "vibration"  # How to weight configuration with different sizes relative to each other.
+    # Choose from "vibrations", "molecules" and "structures". Defaults to "vibration".
+
 
 class MTPWithMLIP3(MTPotential):
     """MTP with MLIP-3."""
 
-    def __init__(self,
-                 mlip_path: str,
-                 name: Optional[str] = None,
-                 param: Optional[Dict[Any, Any]] = None,
-                 version: Optional[str] = None):
+    def __init__(self, mtp_args: MTPArguments):
         """Modifications to maml.apps.pes._mtp.MTPotential to be compatible with mlip-3.
 
         Args:
-            mlip_path: path to mlip3 library
-            name: MTPotential argument
-            param: MTPotential argument
-            version: MTPotential argument
+            mtp_args: MTP arguments from the class MTPArguments
         """
-        super().__init__(name, param, version)
-        self.mlp_command = os.path.join(mlip_path, "build", "mlp")
+        super().__init__(mtp_args.name, mtp_args.param)
+        self.mlp_command = os.path.join(mtp_args.mlip_path, "build", "mlp")
         assert os.path.exists(self.mlp_command), "mlp command not found in mlip-3 build folder"
-        self.mlp_templates = os.path.join(mlip_path, "MTP_templates")
+        self.mlp_templates = os.path.join(mtp_args.mlip_path, "MTP_templates")
         assert os.path.exists(self.mlp_templates), "MTP templates not found in mlip-3 folder"
         self.fitted_mtp = None
         self.elements = None
+        self.mtp_args = mtp_args
+        os.makedirs(mtp_args.fitted_mtp_savedir, exist_ok=True)
 
     def to_lammps_format(self):
         """Write the trained MTP in a LAMMPS compatible format."""
@@ -235,72 +253,50 @@ def _call_cmd_to_stdout(cmd: List[str], output_file: TextIO):
         with subprocess.Popen(cmd, stdout=output_file) as p:
             p.communicate()[0]
 
-    def train(
-            self,
-            train_structures: List[Structure],
-            train_energies: List[float],
-            train_forces: List[List[float]],
-            train_stresses: Optional[List[List[float]]] = None,
-            unfitted_mtp: str = "08.almtp",
-            fitted_mtp_savedir: str = '../',
-            max_dist: float = 5,
-            radial_basis_size: int = 8,
-            max_iter: int = 1000,  # TODO check the next kwargs in mlip3
-            energy_weight: float = 1,
-            force_weight: float = 1e-2,
-            stress_weight: float = 1e-3,
-            init_params: str = "same",
-            scale_by_force: float = 0,
-            bfgs_conv_tol: float = 1e-3,
-            weighting: str = "vibration",
-    ) -> int:
+    @staticmethod
+    def prepare_dataset_from_lammps(
+            root_data_dir: str,
+            atom_dict: Dict[int, str],
+            mode: str = "train"
+    ) -> MTPInputs:
+        lammps_outputs, thermo_outputs = crawl_lammps_directory(root_data_dir, mode)
+        mtp_dataset = prepare_mtp_inputs_from_lammps(lammps_outputs, thermo_outputs, atom_dict)
+        return mtp_dataset
+
+    def train(self, dataset: MTPInputs, mlip_name: str = 'mtp_fitted.almtp') -> str:
         """Training data with moment tensor method using MLIP-3.
 
         Override the base class method.
 
         Args:
-            train_structures: The list of Pymatgen Structure object.
-            train_energies: List of total energies of each structure in structures list.
-            train_forces: List of (m, 3) forces array of each structure with m atoms in structures list.
-                m can be varied with each single structure case.
-            train_stresses (optional): List of (6, ) virial stresses of each structure in structures list.
-                Defaults to None.
-            unfitted_mtp (optional): Define the initial mtp file. Default to 08g.amltp
-            fitted_mtp_savedir (optional): save directory for the fitted MTP. Defaults to '../' (current wd)
-            max_dist (optional): The actual radial cutoff. Defaults to 5.
-            radial_basis_size (optional): Relevant to number of radial basis function. Defaults to 8.
-            max_iter (optional): The number of maximum iteration. Defaults to 1000.
-            energy_weight (optional): The weight of energy. Defaults to 1
-            force_weight (optional): The weight of forces. Defaults to 1e-2
-            stress_weight (optional): The weight of stresses. Zero-weight can be assigned. Defaults to 1e-3.
-            init_params (optional): How to initialize parameters if a potential was not
-                pre-fitted. Choose from "same" and "random". Defaults to "same".
-            scale_by_force (optional): If >0 then configurations near equilibrium
-               (with roughly force < scale_by_force) get more weight. Defaults to 0.
-            bfgs_conv_tol (optional): Stop training if error dropped by a factor smaller than this
-                over 50 BFGS iterations. Defaults to 1e-3.
-            weighting (optional): How to weight configuration with different sizes relative to each other.
-                Choose from "vibrations", "molecules" and "structures". Defaults to "vibration".
+            dataset: MTPInputs dataclass with the following elements:
+                structures: The list of Pymatgen Structure object.
+                energies: List of total energies of each structure in structures list.
+                forces: List of (m, 3) forces array of each structure with m atoms in structures list.
+                    m can be varied with each single structure case.
+            mlip_name: str : filename for the trained MTP. Defaults to mtp_fitted.almtp
 
         Returns:
-            rc : return code of the mlp training script
+            fitted_mtp: path to the fitted MTP
         """
         train_structures, train_forces, train_stresses = check_structures_forces_stresses(
-            train_structures, train_forces, train_stresses
+            dataset.structure, dataset.forces, None
         )
-        train_pool = pool_from(train_structures, train_energies, train_forces, train_stresses)
+        # last argument is for stresses - not used currently
+        train_pool = pool_from(train_structures, dataset.energy, train_forces)
+
         elements = sorted(set(itertools.chain(*[struct.species for struct in train_structures])))
         self.elements = [str(element) for element in elements]  # TODO move to __init__
 
         atoms_filename = "train.cfgs"
 
-        with (ScratchDir(".")):  # create a tmpdir - deleted afterwards
+        with ((ScratchDir("."))):  # create a tmpdir - deleted afterwards
             atoms_filename = self.write_cfg(filename=atoms_filename, cfg_pool=train_pool)
 
-            if not unfitted_mtp:
+            if not self.mtp_args.unfitted_mtp:
                 raise RuntimeError("No specific parameter file provided.")
-            mtp_file_path = os.path.join(self.mlp_templates, unfitted_mtp)
-            shutil.copyfile(mtp_file_path, os.path.join(os.getcwd(), unfitted_mtp))
+            mtp_file_path = os.path.join(self.mlp_templates, self.mtp_args.unfitted_mtp)
+            shutil.copyfile(mtp_file_path, os.path.join(os.getcwd(), self.mtp_args.unfitted_mtp))
             commands = [self.mlp_command, "mindist", atoms_filename]
             with open("min_dist", "w") as f:
                 self._call_cmd_to_stdout(commands, f)
@@ -311,23 +307,26 @@ def train(
             # split_symbol = "="  # different for mlip-2 (":") and mlip-3 ("=")
             # min_dist = float(lines[-1].split(split_symbol)[1])
 
-            save_fitted_mtp = ".".join([unfitted_mtp.split(".")[0] + "_fitted", unfitted_mtp.split(".")[1]])
+            save_fitted_mtp = mlip_name
+            if not save_fitted_mtp.endswith('.almtp'):
+                save_fitted_mtp += '.almtp'
+
             cmds_list = [
                 self.mlp_command,
                 "train",
-                unfitted_mtp,
+                self.mtp_args.unfitted_mtp,
                 atoms_filename,
                 f"--save_to={save_fitted_mtp}",
-                f"--iteration_limit={max_iter}",
+                f"--iteration_limit={self.mtp_args.max_iter}",
                 "--al_mode=nbh",  # active learning mode - required to get extrapolation grade
-                # f"--curr-pot-name={unfitted_mtp}",  # TODO check those kwargs
-                # f"--energy-weight={energy_weight}",
-                # f"--force-weight={force_weight}",
-                # f"--stress-weight={stress_weight}",
-                # f"--init-params={init_params}",
-                # f"--scale-by-force={scale_by_force}",
-                # f"--bfgs-conv-tol={bfgs_conv_tol}",
-                # f"--weighting={weighting}",
+                f"--curr-pot-name={self.mtp_args.unfitted_mtp}",
+                f"--energy-weight={self.mtp_args.energy_weight}",
+                f"--force-weight={self.mtp_args.force_weight}",
+                f"--stress-weight={self.mtp_args.stress_weight}",
+                f"--init-params={self.mtp_args.init_params}",
+                f"--scale-by-force={self.mtp_args.scale_by_force}",
+                f"--bfgs-conv-tol={self.mtp_args.bfgs_conv_tol}",
+                f"--weighting={self.mtp_args.weighting}",
             ]
             stdout, rc = self._call_mlip(cmds_list)
             if rc != 0:
@@ -340,6 +339,6 @@ def train(
                     error_msg += msg[-1]
                 raise RuntimeError(error_msg)
             # copy the fitted mtp outside the working directory
-            self.fitted_mtp = os.path.join(fitted_mtp_savedir, save_fitted_mtp)
+            self.fitted_mtp = os.path.join(self.mtp_args.fitted_mtp_savedir, save_fitted_mtp)
             shutil.copyfile(save_fitted_mtp, self.fitted_mtp)
-        return rc
+        return self.fitted_mtp
diff --git a/experiments/active_learning_benchmark/config/mtp_training.yaml b/experiments/active_learning_benchmark/config/mtp_training.yaml
new file mode 100644
index 00000000..1c835a78
--- /dev/null
+++ b/experiments/active_learning_benchmark/config/mtp_training.yaml
@@ -0,0 +1,33 @@
+active_learning_data:
+  _target_: crystal_diffusion.active_learning_loop.benchmark.ActiveLearningDataArguments
+  training_data_dir: /Users/simonb/ic-collab/courtois_collab/crystal_diffusion/data/active_loop_debug/
+  evaluation_data_dir: /Users/simonb/ic-collab/courtois_collab/crystal_diffusion/data/active_loop_debug/
+  output_dir: /Users/simonb/ic-collab/courtois_collab/crystal_diffusion/experiments/benchmark/debug/
+
+mlip:
+  _target_: crystal_diffusion.models.mlip.mtp.MTPWithMLIP3
+  mtp_args:
+    _target_: crystal_diffusion.models.mlip.mtp.MTPArguments
+    mlip_path: /Users/simonb/ic-collab/courtois_collab/crystal_diffusion/mlip-3/  # path to MLIP3 library
+    name: null  # MTP
+    param: null
+    unfitted_mtp: 08.almtp  # Define the initial mtp file. Defaults to 08.almtp
+    fitted_mtp_savedir: /Users/simonb/ic-collab/courtois_collab/crystal_diffusion/experiments/benchmark/debug/mtp/  # save directory for the fitted MTP. Defaults to '../' (current wd)
+    max_dist: 5.0  # The actual radial cutoff. Defaults to 5.
+    radial_basis_size: 8  # Relevant to number of radial basis function. Defaults to 8.
+    max_iter: 1000  # The number of maximum iteration. Defaults to 1000.
+    energy_weight: 1.0  # The weight of energy. Defaults to 1
+    force_weight: 1e-2  # The weight of forces. Defaults to 1e-2
+    stress_weight: 1e-3  # The weight of stresses. Zero-weight can be assigned. Defaults to 1e-3.
+    init_params: same  # how to initialize parameters if a potential was not pre-fitted: "same" or "random".
+    scale_by_force: 0  # If > 0 then configurations near equilibrium get more weight. Defaults to 0.
+    bfgs_conv_tol: 1e-3  # Stop training if error dropped by a factor smaller than this over 50 BFGS iterations.
+    weighting: vibration  # How to weight configuration with different sizes relative to each other.
+    # Choose from "vibrations", "molecules" and "structures". Defaults to "vibration".
+
+structure_evaluation:
+  _target_: crystal_diffusion.active_learning_loop.benchmark.StructureEvaluationArguments
+  evaluation_criteria: 'nbh_grades'  # maxvol coefficient name in the mtp outputs
+  criteria_threshold: 10.0  # atoms with a MaxVol value under this value are considered bad
+  number_of_structures: null  # alternatively, take the N worst samples from the evaluation set
+  extraction_radius: 3.0  # extract atoms within this radius from the worst atom
\ No newline at end of file

From f3ac433ded5342b871b95566208917612eb5adc2 Mon Sep 17 00:00:00 2001
From: Simon Blackburn <simon.blackburn@mila.quebec>
Date: Thu, 15 Aug 2024 08:43:28 -0400
Subject: [PATCH 05/13] active learning loop basic implementation

---
 .../activelearning_dataclasses.py             |  21 +
 .../active_learning_loop/benchmark.py         | 365 ++++++++++--------
 .../active_learning_loop/oracle.py            |  53 +++
 .../active_learning_loop/utils.py             |  68 ++++
 crystal_diffusion/models/mlip/mtp.py          |  98 +++--
 5 files changed, 425 insertions(+), 180 deletions(-)
 create mode 100644 crystal_diffusion/active_learning_loop/activelearning_dataclasses.py
 create mode 100644 crystal_diffusion/active_learning_loop/oracle.py
 create mode 100644 crystal_diffusion/active_learning_loop/utils.py

diff --git a/crystal_diffusion/active_learning_loop/activelearning_dataclasses.py b/crystal_diffusion/active_learning_loop/activelearning_dataclasses.py
new file mode 100644
index 00000000..d4880f80
--- /dev/null
+++ b/crystal_diffusion/active_learning_loop/activelearning_dataclasses.py
@@ -0,0 +1,21 @@
+from dataclasses import dataclass
+
+
+@dataclass(kw_only=True)
+class ActiveLearningDataArguments:
+    training_data_dir: str  # training data directory
+    evaluation_data_dir: str  # evaluation data directory
+    output_dir: str  # directory where to save the results
+
+
+@dataclass(kw_only=True)
+class StructureEvaluationArguments:
+    evaluation_criteria: str ='nbh_grades'
+    criteria_threshold: float = 10
+    number_of_structures: int = None
+    extraction_radius: float = 3
+
+
+@dataclass(kw_only=True)
+class RepaintingArguments:
+    model: str = 'dev_dummy'
diff --git a/crystal_diffusion/active_learning_loop/benchmark.py b/crystal_diffusion/active_learning_loop/benchmark.py
index b17f6110..a01fb8c2 100644
--- a/crystal_diffusion/active_learning_loop/benchmark.py
+++ b/crystal_diffusion/active_learning_loop/benchmark.py
@@ -1,66 +1,122 @@
 import argparse
 import os
 from dataclasses import dataclass
-from typing import Dict, List, Optional
+from typing import Any, List, Optional, Tuple
 
 import numpy as np
 import pandas as pd
 import yaml
 from hydra.utils import instantiate
 
-# from crystal_diffusion.mlip.mtp_train import prepare_dataset, evaluate_mtp
-from crystal_diffusion.models.mlip.mtp import MTPWithMLIP3
-from crystal_diffusion.mlip.mtp_utils import MTPInputs
-
-
-@dataclass(kw_only=True)
-class ActiveLearningDataArguments:
-    training_data_dir: str  # training data directory
-    evaluation_data_dir: str  # evaluation data directory
-    output_dir: str  # directory where to save the results
-
-
-@dataclass(kw_only=True)
-class StructureEvaluationArguments:
-    evaluation_criteria: str = 'nbh_grades'
-    criteria_threshold: float = 10
-    number_of_structures: int = None
-    extraction_radius: float = 3
+from crystal_diffusion.active_learning_loop.utils import get_structures_for_retraining, extract_target_region
+from crystal_diffusion.models.mlip.mtp import MTPWithMLIP3, MTPInputs
 
 
 class ActiveLearningLoop:
     def __init__(self,
                  meta_config: str,
-                 evaluation_config: str):
+                 ):
+        """Active learning benchmark with methods to train & evaluate a MLIP, isolate bad sub-structures, repaint
+        new structures and retrain the MLIP
+
+        Args:
+            meta_config: path to a yaml configuration with the parameters for the modules in the class
+        """
         assert os.path.exists(meta_config), "configuration file for active learning loop does not exist."
-        self.data_paths, self.mlip_model, self.evaluation_args = None, None, None
-        # use hydra to convert the yaml file in a dataclass format
+        # define the modules in the __init__ function
+        self.data_paths, self.mlip_model, self.eval_config, self.structure_generation = None, None, None, None
+        self.oracle = None
+        # use hydra to convert the yaml into modules and other data classes
         self.parse_config(meta_config)
         self.atom_dict = {1: "Si"}  # TODO this should be define somewhere smart
         self.trained_mlips = []  # history of trained MLIPs (optional - not sure if we should keep this)
+        self.training_sets = []  # history of training sets
 
     def parse_config(self, meta_config: str):
+        """Read a configuration file and instantiate the different blocks with hydra.
+
+        The configuration file should have the following blocks of parameters:
+            active_learning_data: dataset paths
+            mlip: MLIP module training parameters
+            structure_evaluation: identification and isolation of the atomic regions to finetune the MLIP
+
+        Args:
+            meta_config: path to configuration yaml file
+        """
         with open(meta_config, 'r') as stream:
             meta_config = yaml.load(stream, Loader=yaml.FullLoader)
+        # paths to the training & evaluation datasets
         self.data_paths = instantiate(meta_config['active_learning_data'])
+        # MLIP model - for example MTP
         self.mlip_model = instantiate(meta_config['mlip'])
-        self.evaluation_args = instantiate(meta_config['structure_evaluation'])
+        # parameters to find and isolate the problematic regions in the evaluation dataset
+        self.eval_config = instantiate(meta_config['structure_evaluation'])
+        # structure generation module
+        self.structure_generation = instantiate(meta_config['repainting_model'])
+        # force labeling module
+        self.oracle = instantiate(meta_config['oracle'])
+
+    def train_mlip(self, round: int = 1, training_set: Optional[Any] = None) -> str:
+        """Train a MLIP using the parameters specified in the configuration file.
+
+        Args:
+            round (optional): current round of training. Used to track new configurations in the training set. A round
+                includes the initial training and the evaluation process.
+            training_set (optional): if specified, use this dataset for training. Otherwise, use the dataset from the
+               paths in the configuration file. Defaults to None.
+
+        Returns:
+            path to the trained MLIP model
+        """
+        if training_set is None:
+            if len(self.training_sets) == 0:
+                self.training_sets = [self.mlip_model.prepare_dataset_from_lammps(
+                    root_data_dir=self.data_paths.training_data_dir,
+                    atom_dict=self.atom_dict,
+                    mode="train"
+                )]
+            training_set = self.mlip_model.merge_inputs(self.training_sets)
+
+        trained_mtp = self.mlip_model.train(training_set, mlip_name=f'mlip_round_{round}')
+        self.trained_mlips.append(trained_mtp)  # history of trained MLIPs ... not sure if useful
+        return trained_mtp
+
+    def evaluate_mlip(self, round: int = 1, mlip_name: Optional[str] = None, forces_available: bool = True
+                      ) -> pd.DataFrame:
+        """Evaluate a MLIP using the parameters specified in the configuration file.
 
-    def train_mlip(self, round: int = 1) -> MTPWithMLIP3:
-        train_dataset = self.mlip_model.prepare_dataset_from_lammps(
-            root_data_dir=self.data_paths.training_data_dir,
+        Args:
+            round (optional): current round of training. Defaults to 1.
+            mlip_name (optional): if not None, use this MTP to evaluate the dataset.
+            forces_available (optional): if True, get the ground truth forces from the dataset.
+
+        Returns:
+            dataframe with the atomic indices, positions, forces and evaluation criteria
+        """
+        evaluation_dataset = self.mlip_model.prepare_dataset_from_lammps(
+            root_data_dir=self.data_paths.evaluation_data_dir,
             atom_dict=self.atom_dict,
-            mode="train"
+            mode="evaluation",
+            get_forces=forces_available
         )
-        trained_mtp = self.mlip_model.train(train_dataset, mlip_name=f'mlip_round_{round}')
-        self.trained_mlips.append(trained_mtp)  # history of trained MLIPs ... not sure if useful
-        return trained_mtp
+        # first returned element is the ground truth DF
+        # TODO make sure this works even if the GT is not available...
+        if mlip_name is None:
+            mlip_name = os.path.join(self.mlip_model.savedir, f'mlip_round_{round}.almtp')
+        _, prediction_df = self.mlip_model.evaluate(evaluation_dataset, mlip_name=mlip_name)
 
-    def evaluate_mlip(self, trained_mtp: MTPWithMLIP3) -> pd.DataFrame:
-        prediction_df = evaluate_mlip(self.mtp_config, self.atom_dict, trained_mtp)
         return prediction_df
 
     def get_bad_structures(self, prediction_df: pd.DataFrame) -> List[pd.DataFrame]:
+        """Find the structures with a high uncertainty based on the configuration file parameters.
+
+        Args:
+            prediction_df: evaluation outputs of the MLIP model. Should contain atomic positions, uncertainty criteria
+               and structure indices.
+
+        Returns:
+            list of structures with a high uncertainty criteria.
+        """
         structures_to_retrain = get_structures_for_retraining(prediction_df,
                                                               criteria_threshold=self.eval_config.criteria_threshold,
                                                               number_of_structures=self.eval_config.number_of_structures,
@@ -68,37 +124,138 @@ def get_bad_structures(self, prediction_df: pd.DataFrame) -> List[pd.DataFrame]:
         return structures_to_retrain
 
     def excise_worst_atom(self, structures_to_retrain: List[pd.DataFrame]) -> List[pd.DataFrame]:
-        bad_regions = [extract_target_region(s, extraction_radius=self.eval_config.extraction_radius)
+        """For a given structure, isolate the atom with the highest uncertainty criteria.
+
+        Args:
+            structures_to_retrain: list of dataframes with the atomic positions and evaluation criteria
+
+        Returns:
+            list of dataframes with only the targeted region
+        """
+        # we assume the extraction region to be a sphere of radius extraction_radius around the worst atoms
+        # if more than 1 atom are bad in a structure, we only extract the worst
+        # TODO implement other extraction methods
+        bad_regions = [extract_target_region(s,
+                                             extraction_radius=self.eval_config.extraction_radius,
+                                             evaluation_criteria=self.eval_config.evaluation_criteria)
                        for s in structures_to_retrain]
         return bad_regions
 
     def get_structure_candidate_from_generative_model(self,
                                                       fixed_atoms: pd.DataFrame,
-                                                      number_of_candidates: int =1
-                                                      ) -> None:
-        # TODO: call the diffusion model and get number_of_candidates samples with repaint using the fixed_atoms
-        # the output should be a list of np.ndarray (?)
-        return None
+                                                      number_of_candidates: int = 1
+                                                      ) -> pd.DataFrame:
+        """Generate new structures around the specified fixed atoms.
+
+        Args:
+            fixed_atoms: dataframe with the atom type, coordinates and unit cell information
+            number_of_candidates: how many structures to generate. Defaults to 1.
 
-    def update_mlip_training_set(self, new_structures: List[np.ndarray]):
-        # TODO take the examples generated by the diffusion model and append them to the MLIP training set
-        return None
+        Returns:
+            dataframe with the atom type, coordinates and unit cell
 
-    def round_of_active_learning_loop(self, trained_mtp: Optional[MTPWithMLIP3]=None, training_set=None):
+        """
+        # TODO: call the diffusion model and get number_of_candidates samples with repaint using the fixed_atoms
+        if self.structure_generation.model == 'dev_dummy':  # replace with a wrapper around the diffusion model
+            # and hydra instantiate
+            return fixed_atoms
+        else:
+            raise NotImplemented('Only dev_dummy is supported at the moment.')
+
+    def new_structure_to_csv(self, new_structures: List[pd.DataFrame], round: int = 1):
+        """Save the generated structures in a csv format in the output dir
+
+        Args:
+            new_structures: structures proposed by the generative model
+            round: current round of training. Defaults to 1.
+        """
+        root_data_dir = os.path.join(self.data_paths.output_dir, f'new_structures_round_{round}')
+        os.makedirs(root_data_dir, exist_ok=True)
+        for i, new_struc in enumerate(new_structures):
+            new_struc.to_csv(os.path.join(root_data_dir, f'structure_{i}.csv'), index=False)
+
+    def get_labels_from_oracle(self, round: int = 1) -> Any:
+        """Compute energy and forces from an oracle such as LAMMPS for the new candidates generated in a round of AL.
+
+        Args:
+            round (optional): round of retraining. Defaults to 1.
+
+        Returns:
+            mlip data input (for example, MTPInputs)
+        """
+        new_labeled_samples = []
+        for file in os.listdir(os.path.join(self.data_paths.output_dir, f'new_structures_round_{round}')):
+            if file.endswith('.csv'):
+                new_labeled_samples.append(self.call_oracle(
+                    os.path.join(self.data_paths.output_dir, f'new_structures_round_{round}', file)
+                ))
+        new_labeled_samples = self.mlip_model.merge_inputs(new_labeled_samples)
+        return new_labeled_samples
+
+    def call_oracle(self, path_to_file: str) -> Any:
+        """Compute energy and forces for a given atomic structure.
+
+        Args:
+            path_to_file: path to csv file containing the atomic positions and structure information
+
+        Returns:
+            mlip data inputs (for example, MTPInputs)
+        """
+        data = pd.read_csv(path_to_file)
+        cartesian_positions = data[['x', 'y', 'z']].to_numpy()
+        box = np.eye(3, 3) * 5.43  # TODO this is bad - fix this
+        atom_type = np.ones(cartesian_positions.shape[0], dtype=np.integer)  # TODO also bad
+        energy, forces = self.oracle(cartesian_positions, box, atom_type)
+        labels_as_mtp = self.mlip_model.prepare_dataset_from_numpy(
+            cartesian_positions,
+            box,
+            forces,
+            energy,
+            atom_type,
+        )
+        return labels_as_mtp
+
+    def round_of_active_learning_loop(self, trained_mlip: Optional[MTPWithMLIP3] = None
+                                      ) -> Tuple[pd.DataFrame, pd.DataFrame]:
+        """Do a full loop of active learning.
+
+        The following steps are done in sequence:
+            - train a MLIP from the training set specified in the config file if trained_mlip is not specified
+            - evaluate the MLIP with the evaluation set specified in the config file
+            - find the "bad" structures in the evaluation set based on the criteria from the config file
+            - excise the problematic regions
+            - generate new candidates based on these regions
+            - call the oracle to get the labels for the new generated candidates
+            - retrain the MLIP
+            - evaluate the MLIP again
+
+        Args:
+            trained_mlip (optional): if not None, use this MLIP as a starting point. If None, train a MLIP from scratch
+                using the training data specified in the config file.
+
+        Returns:
+            dataframe with the MLIP evaluation results before finetuning with the generated structures
+            dataframe with the MLIP evaluation results after finetuning with the generated structures
+        """
         # one round from a known mtp (or train from provided training set)
         # evaluate, find candidates and update MTP
         # return the updated MTP
-        if trained_mtp is None:
-            trained_mtp = self.train_mlip(training_set)
-        pred_df = self.evaluate_mlip(trained_mtp)
+        if trained_mlip is None:
+            trained_mlip = self.train_mlip()
+        pred_df = self.evaluate_mlip(mlip_name=trained_mlip)
         bad_structures = self.get_bad_structures(pred_df)
         bad_regions = self.excise_worst_atom(bad_structures)
         new_candidates = [self.get_structure_candidate_from_generative_model(x) for x in bad_regions]
-        new_mlip_training_set = self.update_mlip_training_set(self, new_candidates)
-        new_mtp = self.trained_mlip(new_mlip_training_set)
-        return new_mtp
+        self.new_structure_to_csv(new_candidates)
+        new_labeled_candidates = self.get_labels_from_oracle()
+        new_training_set = self.mlip_model.merge_inputs([self.training_sets[-1], new_labeled_candidates])
+        self.training_sets.append(new_training_set)
+        new_mtp = self.train_mlip()
+        new_pred_df = self.evaluate_mlip(mlip_name=new_mtp)
+        return pred_df, new_pred_df
 
     def evaluate_mtp_update(self):
+        # TODO for benchmarking, compare results before and after finetuning
         pass
 
 
@@ -114,121 +271,13 @@ def get_arguments() -> argparse.Namespace:
     return args
 
 
-
-
-
-def train_mlip(mtp_args, atom_dict: Dict[int, str]) -> MTPWithMLIP3:
-    """Train a MTP model using the specified data.
-
-    Args:
-        mtp_args: MTPArguments data class
-        atom_dict: map between atom names and indices used by LAMMPS
-
-    Returns:
-        trained MTP model
-    """
-    # TODO make it more configurable
-    train_datasets = prepare_dataset(mtp_args.training_data_dir, atom_dict, mode="train")
-    # create the output directory if it doesn't exist already
-    trained_mtp = train_mtp(train_datasets, mlip_folder_path=mtp_args.mlip_dir, save_dir=mtp_args.output_dir)
-    return trained_mtp
-
-
-def evaluate_mlip(mtp_args, atom_dict: Dict[int, str], mtp: MTPWithMLIP3) -> pd.DataFrame:
-    """Evaluate a MTP model using the specified data.
-
-    Args:
-        mtp_args: MTPArguments data class
-        atom_dict:  map between atom names and indices used by LAMMPS
-        mtp: trained MTP model
-
-    Returns:
-        dataframe with a column specified the structure, a column with the atom index, 3 columns with the x,y,z
-        coordinates and a column with the MaxVol criteria (nbh_grades)
-    """
-    evaluation_datasets = prepare_dataset(mtp_args.evaluation_data_dir, atom_dict, mode="evaluation")
-    # TODO the current evaluation method also returns the ground truth informations - this won't always be the case
-    # TODO make more configurable
-    _, prediction_df = evaluate_mtp(evaluation_datasets, mtp)
-    return prediction_df
-
-
-def get_structures_for_retraining(prediction_df: pd.DataFrame,
-                                  criteria_threshold: Optional[float] = None,
-                                  number_of_structures: Optional[int] = None,
-                                  evaluation_criteria: str = 'nbh_grades',
-                                  structure_index: str = 'structure_index'
-                                  ) -> List[pd.DataFrame]:
-    assert criteria_threshold is not None or number_of_structures is not None, \
-        "criteria_threshold or number_of_structures should be set."
-    # get the highest evaluation_criteria for each structure i.e. only the worst atom counts for structure selection
-    criteria_by_structure = prediction_df[[evaluation_criteria, structure_index]].groupby(structure_index).max()
-    # find the top number_of_structures
-    structures_indices = criteria_by_structure.sort_values(by=evaluation_criteria, ascending=False)
-    if number_of_structures is not None:
-        structures_indices = structures_indices[:number_of_structures]
-    else:  #  criteria_threshold is not None
-        structures_indices = structures_indices[structures_indices[evaluation_criteria] >= criteria_threshold]
-    structures_indices = structures_indices.index.to_list()
-    assert len(structures_indices) > 0, "No structure meet the criteria."
-    structures_to_retrain = []
-    for idx in structures_indices:
-        structures_to_retrain.append(prediction_df[prediction_df[structure_index] == idx])
-    return structures_to_retrain
-
-
-def extract_target_region(structure_df: pd.DataFrame,
-                          extraction_radius: float,
-                          evaluation_criteria: str ='nbh_grades') -> pd.DataFrame:
-    """Extract the atom with the worst evaluation criteria and all the atoms within a distance extraction_radious.
-
-    Args:
-        structure_df: dataframe with the atomic positions and the evaluation criteria (e.g. MaxVol value)
-        extraction_radius: include all atoms within this distance of the targeted atom
-        evaluation_criteria: name of the evaluation criteria. Defaults to nbh_grades (maxvol in MTP)
-
-    Returns:
-        dataframe with the atomic coordinates in columns x, y, z
-    """
-    # extract the worst ato and a region around of radius extraction_radius
-    # TODO better method to determine radius: number of atoms ?
-    target_atom = structure_df[evaluation_criteria].idxmax()
-    target_position = structure_df.loc[target_atom][['x', 'y', 'z']]
-    structure_df.loc[:, 'distance_squared'] = structure_df.apply(
-        lambda x: sum([(x[i] - target_position[i]) ** 2 for i in ['x', 'y', 'z']]), axis=1)
-    atom_positions = structure_df.loc[structure_df['distance_squared'] <= extraction_radius ** 2, ['x', 'y', 'z']]
-    return atom_positions
-
-
-def generate_new_structures(fixed_atoms: List[pd.DataFrame]) -> None:
-    pass
-
-
 def main():
     # args = get_arguments()
     # TODO get mtp_config_path from the args
     config_path = "/Users/simonb/ic-collab/courtois_collab/crystal_diffusion/experiments/active_learning_benchmark/"
     config_path = os.path.join(config_path, "config", "mtp_training.yaml")
-    al_loop = ActiveLearningLoop(config_path, 'hello')
-    al_loop.train_mlip()
-
-    atom_dict = {1: "Si"}  # TODO this should be define somewhere smart
-    # STEP 1: train a MLIP
-    # trained_mtp = train_mlip(mtp_config, atom_dict)
-    # STEP 2: evaluate the MLIP
-    # prediction_df = evaluate_mlip(mtp_config, atom_dict, trained_mtp)
-    # STEP 3: identify the problematic structures
-    # TODO extraction_params should come from a config file with hydra instantiate
-    # extraction_params = StructureEvaluationArguments()
-    #structures_to_retrain = get_structures_for_retraining(prediction_df,
-    #                                                      criteria_threshold=extraction_params.criteria_threshold,
-    #                                                      number_of_structures=extraction_params.number_of_structures,
-    #                                                      evaluation_criteria=extraction_params.evaluation_criteria)
-    # STEP 4: extract the region
-    #bad_regions = [extract_target_region(s, extraction_radius=extraction_params.extraction_radius)
-    #               for s in structures_to_retrain]
-    # STEP 5: call the generative model to create new candidates
-    print('hello')
+    al_loop = ActiveLearningLoop(config_path)
+    al_loop.round_of_active_learning_loop()
 
 
 if __name__ == '__main__':
diff --git a/crystal_diffusion/active_learning_loop/oracle.py b/crystal_diffusion/active_learning_loop/oracle.py
new file mode 100644
index 00000000..cb6edbe5
--- /dev/null
+++ b/crystal_diffusion/active_learning_loop/oracle.py
@@ -0,0 +1,53 @@
+from pathlib import Path
+from typing import Dict, Tuple
+
+import numpy as np
+
+from crystal_diffusion import DATA_DIR
+from crystal_diffusion.oracle.lammps import get_energy_and_forces_from_lammps
+
+
+class LAMMPS_for_active_learning:
+    def __init__(self):
+        pass
+
+    def __call__(self,
+                 cartesian_positions: np.ndarray,
+                 box: np.ndarray,
+                 atom_types: np.ndarray,
+                 atom_type_map: Dict[int, str] = {1: 'Si'},
+                 tmp_work_dir: str = './',
+                 pair_coeff_dir: Path = DATA_DIR) -> Tuple[float, np.ndarray]:
+        """Call LAMMPS to get energy and forces for a given set of atoms.
+
+        Args:
+            cartesian_positions: atomic positions as a n_atom x 3 array
+            box: unit cell definition as a 3x3 array. Assumed to be diagonal.
+            atom_types: integers defining each atom as an array of length n_atom
+            atom_type_map: map between indices and atom type. Defaults to {1: 'Si'}
+            tmp_work_dir: temporary work directory for LAMMPS. Defaults to ./
+            pair_coeff_dir: path to the Stillinger-Weber potential. Defaults to DATA_DIR.
+
+        Returns:
+            energy and forces on each atom (n_atom x 3)
+        """
+        shifted_positions = self.shift_positions(cartesian_positions, box)
+        energy, forces = get_energy_and_forces_from_lammps(shifted_positions, box, atom_types, atom_type_map,
+                                                           tmp_work_dir, pair_coeff_dir)
+        return energy, forces[['fx', 'fy', 'fz']].to_numpy()
+
+    def shift_positions(self, cartesian_positions: np.ndarray, box: np.ndarray) -> np.ndarray:
+        """Shift the positions of the atoms so all coordinates are positive.
+
+        This is because LAMMPS will ignore atoms with coordinates outside the [0, a] range (a = size of the unit cell).
+
+        Args:
+            cartesian_positions: atomic positions (n_atom x 3 array)
+            box: unit cell (3x3 array) - assumed to be diagonal
+
+        Returns:
+            array with shifted positions
+        """
+        for i, cell_size in enumerate(np.diag(box)):
+            cartesian_positions[:, i] = cartesian_positions[:, i] % cell_size
+        return cartesian_positions
diff --git a/crystal_diffusion/active_learning_loop/utils.py b/crystal_diffusion/active_learning_loop/utils.py
new file mode 100644
index 00000000..4fac563e
--- /dev/null
+++ b/crystal_diffusion/active_learning_loop/utils.py
@@ -0,0 +1,68 @@
+from typing import List, Optional
+
+import pandas as pd
+
+
+def get_structures_for_retraining(prediction_df: pd.DataFrame,
+                                  criteria_threshold: Optional[float] = None,
+                                  number_of_structures: Optional[int] = None,
+                                  evaluation_criteria: str = 'nbh_grades',
+                                  structure_index: str = 'structure_index'
+                                  ) -> List[pd.DataFrame]:
+    """Find the structures with the worst value of the evaluation criteria.
+
+    Args:
+        prediction_df: dataframe with the atom positions, forces, uncertainty criteria (e.g. MaxVol coefficient),
+            indices and structure indices
+        criteria_threshold: value above which the evaluation_criteria is considered bad. Either this or
+            number_of_structures should be specified. number_of_structures has priority if both are specified.
+            Defaults to None.
+        number_of_structures: number of structures to return. The top number_of_structures with the highest value of
+            evaluation_criteria are returned. Either this or criteria_threshold should be specified. Defaults to None.
+        evaluation_criteria: name of the evaluation criteria. Defaults to nbh_grades (MaxVol coefficient in MTP)
+        structure_index: name of the column in the dataframe with the index identifying the structure. Defaults to
+            structure_index.
+
+    Returns:
+        list of the structures with a bad evaluation criteria. Length of the list depends on criteria_threshold and
+            number_of_structures.
+    """
+    assert criteria_threshold is not None or number_of_structures is not None, \
+        "criteria_threshold or number_of_structures should be set."
+    # get the highest evaluation_criteria for each structure i.e. only the worst atom counts for structure selection
+    criteria_by_structure = prediction_df[[evaluation_criteria, structure_index]].groupby(structure_index).max()
+    # find the top number_of_structures
+    structures_indices = criteria_by_structure.sort_values(by=evaluation_criteria, ascending=False)
+    if criteria_threshold is not None:
+        structures_indices = structures_indices[structures_indices[evaluation_criteria] >= criteria_threshold]
+    structures_indices = structures_indices.index.to_list()
+    if number_of_structures is not None:
+        structures_indices = structures_indices[:number_of_structures]
+    structures_to_retrain = []
+    for idx in structures_indices:
+        structures_to_retrain.append(prediction_df[prediction_df[structure_index] == idx])
+    return structures_to_retrain
+
+
+def extract_target_region(structure_df: pd.DataFrame,
+                          extraction_radius: float,
+                          evaluation_criteria: str ='nbh_grades') -> pd.DataFrame:
+    """Extract the atom with the worst evaluation criteria and all the atoms within a distance extraction_radius.
+
+    Args:
+        structure_df: dataframe with the atomic positions and the evaluation criteria (e.g. MaxVol value)
+        extraction_radius: include all atoms within this distance of the targeted atom
+        evaluation_criteria: name of the evaluation criteria. Defaults to nbh_grades (maxvol in MTP)
+
+    Returns:
+        dataframe with the atomic coordinates in columns x, y, z
+    """
+    # extract the worst atom and a region of radius extraction_radius around it
+    # TODO better method to determine radius: number of atoms ?
+    target_atom = structure_df[evaluation_criteria].idxmax()
+    target_position = structure_df.loc[target_atom][['x', 'y', 'z']]
+    # TODO periodicity...
+    structure_df.loc[:, 'distance_squared'] = structure_df.apply(
+        lambda x: sum([(x[i] - target_position[i]) ** 2 for i in ['x', 'y', 'z']]), axis=1)
+    atom_positions = structure_df.loc[structure_df['distance_squared'] <= extraction_radius ** 2, ['x', 'y', 'z']]
+    return atom_positions
\ No newline at end of file
diff --git a/crystal_diffusion/models/mlip/mtp.py b/crystal_diffusion/models/mlip/mtp.py
index 474b6c25..5ff09839 100644
--- a/crystal_diffusion/models/mlip/mtp.py
+++ b/crystal_diffusion/models/mlip/mtp.py
@@ -21,7 +21,8 @@
 from monty.tempfile import ScratchDir
 from pymatgen.core import Structure
 
-from crystal_diffusion.mlip.mtp_utils import MTPInputs, crawl_lammps_directory, prepare_mtp_inputs_from_lammps
+from crystal_diffusion.mlip.mtp_utils import (MTPInputs, crawl_lammps_directory, prepare_mtp_inputs_from_lammps,
+                                              concat_mtp_inputs)
 
 
 @dataclass(kw_only=True)
@@ -61,7 +62,8 @@ def __init__(self, mtp_args: MTPArguments):
         self.fitted_mtp = None
         self.elements = None
         self.mtp_args = mtp_args
-        os.makedirs(mtp_args.fitted_mtp_savedir, exist_ok=True)
+        self.savedir = mtp_args.fitted_mtp_savedir
+        os.makedirs(self.savedir, exist_ok=True)
 
     def to_lammps_format(self):
         """Write the trained MTP in a LAMMPS compatible format."""
@@ -78,44 +80,51 @@ def to_lammps_format(self):
         pass
 
     def evaluate(self,
-                 test_structures: List[Structure],
-                 test_energies: List[float],
-                 test_forces: List[List[float]],
-                 test_stresses: Optional[List[List[float]]] = None,
+                 dataset: MTPInputs,
+                 mlip_name: str = 'mtp_fitted.almtp'
                  ) -> Tuple[pd.DataFrame, pd.DataFrame]:
         """Evaluate energies, forces, stresses and MaxVol gamma factor of structures with trained MTP.
 
         Args:
-            test_structures: evaluation set of pymatgen Structure Objects.
-            test_energies: list of total energies of each structure to evaluation in test_structures list.
-            test_forces: list of calculated (m, 3) forces of each evaluation structure with m atoms in structures list.
-                m can be varied with each single structure case.
-            test_stresses (optional): list of calculated (6, ) virial stresses of each evaluation structure in
-                test_structures list. If None, do not evaluate on stresses. Default to None.
+            dataset: MTPInputs dataclass with the following elements:
+                structure: list of pymatgen Structure objects.
+                energy: list of total energies, one per structure in the structure list.
+                forces: list of (m, 3) force arrays, one per structure, where m is the number of atoms.
+                    m can vary from one structure to another.
+            mlip_name: str : filename for the trained MTP. Defaults to mtp_fitted.almtp
 
         Returns:
             dataframe with ground truth energies, forces
             dataframe with predicted energies, forces, MaxVol gamma (nbh grades)
         """
-        if self.fitted_mtp is None:
-            raise AttributeError('MTP was not trained. Please call train() before evaluate().')
+        if not mlip_name.endswith('.almtp'):
+            mlip_name += '.almtp'
+        assert os.path.exists(mlip_name), f"Trained MTP does not exists: {mlip_name}"
 
         original_file = "original.cfgs"
         predict_file = "predict.cfgs"
-        test_structures, test_forces, test_stresses = check_structures_forces_stresses(
-            test_structures, test_forces, test_stresses
+
+        # TODO if forces are not available...
+        test_structures, test_forces, _ = check_structures_forces_stresses(
+            dataset.structure, dataset.forces, stresses=None
         )
-        predict_pool = pool_from(test_structures, test_energies, test_forces, test_stresses)
+        predict_pool = pool_from(test_structures, dataset.energy, test_forces)
+        local_mtp_name = "mtp.almtp"
 
         with ScratchDir("."):  # mlip needs a tmp_work_dir - we will manually copy relevant outputs elsewhere
             # write the structures to evaluate in a mlp compatible format
             original_file = self.write_cfg(original_file, cfg_pool=predict_pool)
+            # TODO how to handle when GT is not available
             df_orig = self.read_cfgs(original_file, nbh_grade=False)  # read original values as a DataFrame
 
+            # copy the trained mtp in the scratchdir
+            shutil.copyfile(mlip_name, os.path.join(os.getcwd(), local_mtp_name))
             # calculate_grade is the method to get the forces, energy & maxvol values
-            cmd = [self.mlp_command, "calculate_grade", self.fitted_mtp, original_file, predict_file]
+            cmd = [self.mlp_command, "calculate_grade", local_mtp_name, original_file, predict_file]
             predict_file += '.0'  # added by mlp...
             stdout, rc = self._call_mlip(cmd)
+
+            # check that MTP was called properly
             if rc != 0:
                 error_msg = f"mlp exited with return code {rc}"
                 msg = stdout.decode("utf-8").split("\n")[:-1]
@@ -125,7 +134,7 @@ def evaluate(self,
                 except Exception:
                     error_msg += msg[-1]
                 raise RuntimeError(error_msg)
-
+            # read the config
             df_predict = self.read_cfgs(predict_file, nbh_grade=True)
         return df_orig, df_predict
 
@@ -257,12 +266,57 @@ def _call_cmd_to_stdout(cmd: List[str], output_file: TextIO):
     def prepare_dataset_from_lammps(
             root_data_dir: str,
             atom_dict: Dict[int, str],
-            mode: str = "train"
+            mode: str = "train",
+            get_forces: bool = True,
     ) -> MTPInputs:
+        """Get the LAMMPS outputs in a folder and organize them as inputs for a MTP.
+
+        Args:
+            root_data_dir: folder to read. Each LAMMPS sample is expected to be in a subfolder.
+            atom_dict: map from LAMMPS index to atom name. e.g. {1: 'Si'}
+            mode: subset of samples to get. Data from root_data_dir/*mode*/ folders will be parsed. Defaults to train.
+            get_forces: if True, get the forces from the samples. Defaults to True.
+
+        Returns:
+            inputs for MTP in the MTPInputs dataclass
+        """
         lammps_outputs, thermo_outputs = crawl_lammps_directory(root_data_dir, mode)
-        mtp_dataset = prepare_mtp_inputs_from_lammps(lammps_outputs, thermo_outputs, atom_dict)
+        mtp_dataset = prepare_mtp_inputs_from_lammps(lammps_outputs, thermo_outputs, atom_dict, get_forces=get_forces)
         return mtp_dataset
 
+    @staticmethod
+    def prepare_dataset_from_numpy(
+            cartesian_positions: np.ndarray,
+            box: np.ndarray,
+            forces: np.ndarray,
+            energy: float,
+            atom_type: np.ndarray,
+            atom_dict: Dict[int, str] = {1: 'Si'}
+    ):
+        structure = Structure(
+            lattice=box,
+            species=[atom_dict[x] for x in atom_type],
+            coords=cartesian_positions,
+            coords_are_cartesian=True
+        )
+        forces = forces.tolist()  # from Nx3 np array to a list of length N where each element is a list of 3 forces
+        return MTPInputs(structure=[structure], forces=[forces], energy=[energy])
+
+    @staticmethod
+    def merge_inputs(mtp_inputs: List[MTPInputs]) -> MTPInputs:
+        """Merge a list of MTPInputs in a single MTPInputs.
+
+        Args:
+            mtp_inputs: list of MTPInputs
+
+        Returns:
+            merged MTPInputs
+        """
+        merged_inputs = MTPInputs(structure=[], forces=[], energy=[])
+        for x in mtp_inputs:
+            merged_inputs = concat_mtp_inputs(merged_inputs, x)
+        return merged_inputs
+
     def train(self, dataset: MTPInputs, mlip_name: str = 'mtp_fitted.almtp') -> str:
         """Training data with moment tensor method using MLIP-3.
 
@@ -339,6 +393,6 @@ def train(self, dataset: MTPInputs, mlip_name: str = 'mtp_fitted.almtp') -> str:
                     error_msg += msg[-1]
                 raise RuntimeError(error_msg)
             # copy the fitted mtp outside the working directory
-            self.fitted_mtp = os.path.join(self.mtp_args.fitted_mtp_savedir, save_fitted_mtp)
+            self.fitted_mtp = os.path.join(self.savedir, save_fitted_mtp)
             shutil.copyfile(save_fitted_mtp, self.fitted_mtp)
         return self.fitted_mtp

From a7707c2f8a97cbfad8957fb121ea036ce20d0efb Mon Sep 17 00:00:00 2001
From: Simon Blackburn <simon.blackburn@mila.quebec>
Date: Thu, 15 Aug 2024 08:46:56 -0400
Subject: [PATCH 06/13] missing files

---
 crystal_diffusion/mlip/mtp_utils.py           | 48 ++++++++++++++-----
 .../config/mtp_training.yaml                  | 15 ++++--
 2 files changed, 46 insertions(+), 17 deletions(-)

diff --git a/crystal_diffusion/mlip/mtp_utils.py b/crystal_diffusion/mlip/mtp_utils.py
index 9f2f974c..eedae3fe 100644
--- a/crystal_diffusion/mlip/mtp_utils.py
+++ b/crystal_diffusion/mlip/mtp_utils.py
@@ -1,7 +1,7 @@
 import os
 import re
 from dataclasses import dataclass
-from typing import Any, Dict, List, Tuple
+from typing import Any, Dict, List, Optional, Tuple
 
 import numpy as np
 import yaml
@@ -12,24 +12,25 @@
 class MTPInputs:
     """Create a dataclass to train or evaluate a MTP model."""
     structure: List[Structure]
-    forces: List[List[float]]
+    forces: List[List[List[float]]]  # num samples x num atoms x spatial dimension
     energy: List[float]
 
 
-def extract_structure_and_forces_from_file(filename: str, atom_dict: Dict[int, Any]) -> \
-        Tuple[List[Structure], List[List[float]]]:
+def extract_structure_and_forces_from_file(filename: str, atom_dict: Dict[int, Any], forces_avail: bool = True) -> \
+        Tuple[List[Structure], Optional[List[List[float]]]]:
     """Convert LAMMPS yaml output in a format compatible with MTP training and evaluation methods.
 
     Args:
         filename: path to LAMMPS output file in yaml format
         atom_dict: mapping from LAMMPS atom indices to atom type (atomic number as int or atom name as str)
+        forces_avail (optional): if True, get the forces from the LAMMPS output file. Defaults to True.
 
     Returns:
         list of pymatgen Structure containing the atoms and their positions
-        list of forces (n x 3) for each atom
+        list of forces (n x 3) for each atom. None if forces_avail is False
     """
     structures = []
-    forces = []
+    forces = [] if forces_avail else None
     with (open(filename, 'r') as f):
         l_yaml = yaml.safe_load_all(f)
         for d in l_yaml:  # loop over LAMMPS outputs and convert in pymatgen Structure objects
@@ -47,9 +48,10 @@ def extract_structure_and_forces_from_file(filename: str, atom_dict: Dict[int, A
                                      coords=coords,
                                      coords_are_cartesian=True)
             structures.append(pm_structure)
-            force_idx = [d['keywords'].index(x) for x in ['fx', 'fy', 'fz']]
-            structure_forces = [[x[i] for i in force_idx] for x in d['data']]
-            forces.append(structure_forces)
+            if forces_avail:
+                force_idx = [d['keywords'].index(x) for x in ['fx', 'fy', 'fz']]
+                structure_forces = [[x[i] for i in force_idx] for x in d['data']]
+                forces.append(structure_forces)
     return structures, forces
 
 
@@ -72,7 +74,8 @@ def extract_energy_from_thermo_log(filename: str) -> List[float]:
 
 def prepare_mtp_inputs_from_lammps(output_yaml: List[str],
                                    thermo_yaml: List[str],
-                                   atom_dict: Dict[int, Any]
+                                   atom_dict: Dict[int, Any],
+                                   get_forces: bool = True,
                                    ) -> MTPInputs:
     """Convert a list of LAMMPS output files and thermodynamic output files to MTP input format.
 
@@ -80,9 +83,10 @@ def prepare_mtp_inputs_from_lammps(output_yaml: List[str],
         output_yaml: list of LAMMPS output files as yaml.
         thermo_yaml: list of LAMMPS thermodynamic output files as yaml.
         atom_dict: mapping of LAMMPS indices to atom type.
+        get_forces (optional): if True, get the forces. Defaults to True.
 
     Returns:
-        dataclass used to
+        dataclass used as inputs to train and evaluate a MTP model
     """
     mtp_inputs = {
         'structure': [],
@@ -90,9 +94,9 @@ def prepare_mtp_inputs_from_lammps(output_yaml: List[str],
         'forces': []
     }
     for filename in output_yaml:
-        structures, forces = extract_structure_and_forces_from_file(filename, atom_dict)
+        structures, forces = extract_structure_and_forces_from_file(filename, atom_dict, get_forces)
         mtp_inputs['structure'] += structures
-        mtp_inputs['forces'] += forces
+        mtp_inputs['forces'] += forces  # FIXME: forces is None if get_forces is False -> TypeError on +=
     for filename in thermo_yaml:
         mtp_inputs['energy'] += extract_energy_from_thermo_log(filename)
     mtp_inputs = MTPInputs(structure=mtp_inputs['structure'],
@@ -121,3 +125,21 @@ def crawl_lammps_directory(folder_name: str, folder_name_pattern: str= "train")
             lammps_output_files.extend([os.path.join(dirpath, f) for f in filenames if f.endswith("dump.yaml")])
             thermo_output_files.extend([os.path.join(dirpath, f) for f in filenames if f.endswith("thermo.yaml")])
     return lammps_output_files, thermo_output_files
+
+
+def concat_mtp_inputs(input1: MTPInputs, input2: MTPInputs) -> MTPInputs:
+    """Merge two MTPInputs dataclasses.
+
+    Args:
+        input1: first MTPInputs dataset
+        input2: second MTPInputs dataset
+
+    Returns:
+        concatenated MTPInputs dataset
+    """
+    concat_inputs = MTPInputs(
+        structure=input1.structure + input2.structure,
+        forces=input1.forces + input2.forces,
+        energy=input1.energy + input2.energy
+    )
+    return concat_inputs
diff --git a/experiments/active_learning_benchmark/config/mtp_training.yaml b/experiments/active_learning_benchmark/config/mtp_training.yaml
index 1c835a78..b9e35944 100644
--- a/experiments/active_learning_benchmark/config/mtp_training.yaml
+++ b/experiments/active_learning_benchmark/config/mtp_training.yaml
@@ -1,5 +1,5 @@
 active_learning_data:
-  _target_: crystal_diffusion.active_learning_loop.benchmark.ActiveLearningDataArguments
+  _target_: crystal_diffusion.active_learning_loop.activelearning_dataclasses.ActiveLearningDataArguments
   training_data_dir: /Users/simonb/ic-collab/courtois_collab/crystal_diffusion/data/active_loop_debug/
   evaluation_data_dir: /Users/simonb/ic-collab/courtois_collab/crystal_diffusion/data/active_loop_debug/
   output_dir: /Users/simonb/ic-collab/courtois_collab/crystal_diffusion/experiments/benchmark/debug/
@@ -26,8 +26,15 @@ mlip:
     # Choose from "vibrations", "molecules" and "structures". Defaults to "vibration".
 
 structure_evaluation:
-  _target_: crystal_diffusion.active_learning_loop.benchmark.StructureEvaluationArguments
+  _target_: crystal_diffusion.active_learning_loop.activelearning_dataclasses.StructureEvaluationArguments
   evaluation_criteria: 'nbh_grades'  # maxvol coefficient name in the mtp outputs
   criteria_threshold: 10.0  # atoms with a MaxVol value under this value are considered bad
-  number_of_structures: None  # alternatively, take the N worst samples from the evaluation set
-  extraction_radius: float = 3  # extract atoms within this radius from the
\ No newline at end of file
+  # number_of_structures: None  # alternatively, take the N worst samples from the evaluation set
+  extraction_radius: 3  # extract atoms within this radius from the worst atom in a structure
+
+repainting_model:
+  _target_: crystal_diffusion.active_learning_loop.activelearning_dataclasses.RepaintingArguments
+  model: dev_dummy  # placeholder for development
+
+oracle:
+  _target_: crystal_diffusion.active_learning_loop.oracle.LAMMPS_for_active_learning

From 80eb9702ffe3b1ed57abddf2cef4bbdf7fb5781a Mon Sep 17 00:00:00 2001
From: Simon Blackburn <simon.blackburn@mila.quebec>
Date: Thu, 22 Aug 2024 09:29:44 -0400
Subject: [PATCH 07/13] add simple metric for MaxVol improvement

---
 .../active_learning_loop/benchmark.py         | 31 ++++++++++++++++---
 .../config/mtp_training.yaml                  |  4 +--
 2 files changed, 28 insertions(+), 7 deletions(-)

diff --git a/crystal_diffusion/active_learning_loop/benchmark.py b/crystal_diffusion/active_learning_loop/benchmark.py
index a01fb8c2..dc56e606 100644
--- a/crystal_diffusion/active_learning_loop/benchmark.py
+++ b/crystal_diffusion/active_learning_loop/benchmark.py
@@ -254,9 +254,26 @@ def round_of_active_learning_loop(self, trained_mlip: Optional[MTPWithMLIP3] = N
         new_pred_df = self.evaluate_mlip(mlip_name=new_mtp)
         return pred_df, new_pred_df
 
-    def evaluate_mtp_update(self):
-        # TODO for benchmarking, compare results before and after finetuning
-        pass
+    def evaluate_mtp_update(self, original_predictions: pd.DataFrame, updated_predictions) -> Tuple[float, float]:
+        """Find the evaluation criteria in the original predictions and the corresponding value after retraining.
+
+        Args:
+            original_predictions: MLIP predictions before retraining
+            updated_predictions: MLIP predictions after retraining
+
+        Returns:
+             worst evaluation_criteria (e.g. MaxVol) in the original evaluation
+             corresponding value after retraining with new samples. Not guaranteed to be the maximum value.
+        """
+        # find the highest MaxVol in the original predictions - identified by the atom index and structure index
+        # TODO we assume a max - but it could be a min for other evaluation criteria
+        criteria = self.eval_config.evaluation_criteria
+        atom_index, structure_index, original_value = original_predictions.iloc[
+            original_predictions[criteria].argmax()][['atom_index', 'structure_index', criteria]]
+        updated_value = updated_predictions.loc[
+            (updated_predictions['atom_index'] == atom_index) &
+            (updated_predictions['structure_index'] == structure_index), criteria].values.item()
+        return original_value, updated_value
 
 
 def get_arguments() -> argparse.Namespace:
@@ -277,8 +294,12 @@ def main():
     config_path = "/Users/simonb/ic-collab/courtois_collab/crystal_diffusion/experiments/active_learning_benchmark/"
     config_path = os.path.join(config_path, "config", "mtp_training.yaml")
     al_loop = ActiveLearningLoop(config_path)
-    al_loop.round_of_active_learning_loop()
-
+    # initial_df, new_df = al_loop.round_of_active_learning_loop()
+    # initial_df.to_csv(os.path.join(os.getcwd(), '../../experiments/active_learning_benchmark/debug/before_tuning.csv'), index=False)
+    # new_df.to_csv(os.path.join(os.getcwd(), '../../experiments/active_learning_benchmark/debug/after_tuning.csv'), index=False)
+    initial_df = pd.read_csv(os.path.join(os.getcwd(), '../../experiments/active_learning_benchmark/debug/before_tuning.csv'))
+    new_df = pd.read_csv(os.path.join(os.getcwd(), '../../experiments/active_learning_benchmark/debug/after_tuning.csv'))
+    al_loop.evaluate_mtp_update(initial_df, new_df)
 
 if __name__ == '__main__':
     main()
diff --git a/experiments/active_learning_benchmark/config/mtp_training.yaml b/experiments/active_learning_benchmark/config/mtp_training.yaml
index b9e35944..e6ab83b0 100644
--- a/experiments/active_learning_benchmark/config/mtp_training.yaml
+++ b/experiments/active_learning_benchmark/config/mtp_training.yaml
@@ -1,7 +1,7 @@
 active_learning_data:
   _target_: crystal_diffusion.active_learning_loop.activelearning_dataclasses.ActiveLearningDataArguments
-  training_data_dir: /Users/simonb/ic-collab/courtois_collab/crystal_diffusion/data/active_loop_debug/
-  evaluation_data_dir: /Users/simonb/ic-collab/courtois_collab/crystal_diffusion/data/active_loop_debug/
+  training_data_dir: /Users/simonb/ic-collab/courtois_collab/crystal_diffusion/data/al_baseline_dev/v1/
+  evaluation_data_dir: /Users/simonb/ic-collab/courtois_collab/crystal_diffusion/data/al_baseline_dev/v1/
   output_dir: /Users/simonb/ic-collab/courtois_collab/crystal_diffusion/experiments/benchmark/debug/
 
 mlip:

From 4102713aab4479a17d0d8d56b2a44043af8f2161 Mon Sep 17 00:00:00 2001
From: Simon Blackburn <simon.blackburn@mila.quebec>
Date: Wed, 28 Aug 2024 10:04:41 -0400
Subject: [PATCH 08/13] unit tests for al benchmark

---
 tests/active_learning_loop/test_benchmark.py | 147 +++++++++++++++++++
 1 file changed, 147 insertions(+)
 create mode 100644 tests/active_learning_loop/test_benchmark.py

diff --git a/tests/active_learning_loop/test_benchmark.py b/tests/active_learning_loop/test_benchmark.py
new file mode 100644
index 00000000..ac712970
--- /dev/null
+++ b/tests/active_learning_loop/test_benchmark.py
@@ -0,0 +1,147 @@
+import os
+from unittest.mock import MagicMock, mock_open
+
+import pandas as pd
+import pytest
+
+from crystal_diffusion.active_learning_loop.benchmark import ActiveLearningLoop
+
+
+class TestActiveLearningLoop:
+    @pytest.fixture
+    def mock_yaml_config(self):
+        return """
+        active_learning_data:
+            key1: value1
+        mlip:
+            key2: value2
+        structure_evaluation:
+            key3: value3
+        repainting_model:
+            key4: value4
+        oracle:
+            key5: value5
+        """
+
+    @pytest.fixture
+    def meta_config(self):  # mock a path to a meta_config yaml file
+        return "fake_config.yaml"
+
+    @pytest.fixture
+    def mock_al_loop(self, mocker, mock_yaml_config, meta_config):
+        # Mock the open function to simulate reading the YAML file
+        mocker.patch("builtins.open", mock_open(read_data=mock_yaml_config))
+        # Mock os.path.exists to always return True
+        mocker.patch("os.path.exists", return_value=True)
+        # Mock the instantiate function from hydra.utils
+        mock_instantiate = mocker.patch("crystal_diffusion.active_learning_loop.benchmark.instantiate")
+        mock_instantiate.side_effect = lambda x: x  # Return the config itself for simplicity
+
+        # Create an instance of ActiveLearningLoop
+        loop = ActiveLearningLoop(meta_config)
+        return loop
+
+    def test_parse_config(self, mock_al_loop, mock_yaml_config, meta_config):
+
+        # Assertions to verify that the attributes were correctly set
+        assert mock_al_loop.data_paths == {'key1': 'value1'}
+        assert mock_al_loop.mlip_model == {'key2': 'value2'}
+        assert mock_al_loop.eval_config == {'key3': 'value3'}
+        assert mock_al_loop.structure_generation == {'key4': 'value4'}
+        assert mock_al_loop.oracle == {'key5': 'value5'}
+
+        # Verify that the file was opened and the path was checked
+        open.assert_called_once_with(meta_config, 'r')
+        os.path.exists.assert_called_once_with(meta_config)
+
+    def test_train_mlip(self, mocker, mock_yaml_config, mock_al_loop):
+        # Mocking the mlip_model's methods
+        mock_mlip_model = MagicMock()
+        mock_mlip_model.prepare_dataset_from_lammps.return_value = "mock_training_set"
+        mock_mlip_model.train.return_value = "mock_trained_mlip_model"
+        mock_mlip_model.merge_inputs.return_value = "mock_training_set"
+
+        # Inject the mocked mlip_model into the loop instance
+        mock_al_loop.mlip_model = mock_mlip_model
+        mock_al_loop.data_paths = MagicMock(training_data_dir="mock_training_data_dir")
+
+        # Run the train_mlip method without providing a training_set
+        result = mock_al_loop.train_mlip(round=1)
+
+        # Verify the methods were called with expected parameters
+        mock_mlip_model.prepare_dataset_from_lammps.assert_called_once_with(
+            root_data_dir="mock_training_data_dir",
+            atom_dict=mock_al_loop.atom_dict,
+            mode="train"
+        )
+
+        mock_mlip_model.train.assert_called_once_with("mock_training_set", mlip_name="mlip_round_1")
+
+        # Verify the trained model path is correctly returned
+        assert result == "mock_trained_mlip_model"
+
+        # Verify that the trained model is appended to the history
+        assert mock_al_loop.trained_mlips == ["mock_trained_mlip_model"]
+
+        # Test when a training set is provided
+        custom_training_set = "custom_training_set"
+        result = mock_al_loop.train_mlip(round=2, training_set=custom_training_set)
+
+        # The prepare_dataset_from_lammps should not be called since we provided a training_set
+        mock_mlip_model.prepare_dataset_from_lammps.assert_called_once()  # No new call
+        mock_mlip_model.train.assert_called_with(custom_training_set, mlip_name="mlip_round_2")
+
+        assert result == "mock_trained_mlip_model"
+        assert mock_al_loop.trained_mlips == ["mock_trained_mlip_model", "mock_trained_mlip_model"]
+
+    def test_evaluate_mlip(self, mock_al_loop, tmpdir):
+        # Mocking the mlip_model's methods
+        mock_mlip_model = MagicMock()
+        mock_evaluation_dataset = "mock_evaluation_dataset"
+        mock_prediction_df = pd.DataFrame({"atom_index": [0, 1], "force": [1.0, 2.0]})
+
+        # Mocking return values for the prepare_dataset_from_lammps and evaluate methods
+        mock_mlip_model.prepare_dataset_from_lammps.return_value = mock_evaluation_dataset
+        mock_mlip_model.evaluate.return_value = (None, mock_prediction_df)
+
+        loop = mock_al_loop
+
+        # Inject the mocked mlip_model into the loop instance
+        loop.mlip_model = mock_mlip_model
+        loop.data_paths = MagicMock(evaluation_data_dir="mock_evaluation_data_dir")
+
+        # Run the evaluate_mlip method without specifying mlip_name
+        result_df = loop.evaluate_mlip(round=1)
+
+        # Verify the prepare_dataset_from_lammps method was called with expected parameters
+        mock_mlip_model.prepare_dataset_from_lammps.assert_called_once_with(
+            root_data_dir="mock_evaluation_data_dir",
+            atom_dict=loop.atom_dict,
+            mode="evaluation",
+            get_forces=True
+        )
+        # Verify the evaluate method was called with the correct parameters
+        expected_mlip_name = os.path.join(mock_mlip_model.savedir, 'mlip_round_1.almtp')
+        mock_mlip_model.evaluate.assert_called_once_with(mock_evaluation_dataset, mlip_name=expected_mlip_name)
+
+        # Verify the method returns the correct dataframe
+        pd.testing.assert_frame_equal(result_df, mock_prediction_df)
+
+        # Run the evaluate_mlip method with a custom mlip_name
+        custom_mlip_name = "custom_mlip.almtp"
+        result_df = loop.evaluate_mlip(round=2, mlip_name=custom_mlip_name)
+
+        # The evaluate method should be called with the custom mlip_name
+        mock_mlip_model.evaluate.assert_called_with(mock_evaluation_dataset, mlip_name=custom_mlip_name)
+
+        pd.testing.assert_frame_equal(result_df, mock_prediction_df)
+
+        # Test without forces_available
+        _ = loop.evaluate_mlip(round=3, forces_available=False)
+
+        mock_mlip_model.prepare_dataset_from_lammps.assert_called_with(
+            root_data_dir="mock_evaluation_data_dir",
+            atom_dict=loop.atom_dict,
+            mode="evaluation",
+            get_forces=False
+        )

From 713ebdce2a50e852e62e5a1e85ab42751d1197ce Mon Sep 17 00:00:00 2001
From: Simon Blackburn <simon.blackburn@mila.quebec>
Date: Wed, 28 Aug 2024 10:17:52 -0400
Subject: [PATCH 09/13] some flake8 and cleaning the main of benchmark

---
 .../active_learning_loop/benchmark.py         | 29 ++++++-------
 .../active_learning_loop/config.yaml          | 43 +++++++++++++++++++
 .../local/active_learning/run_benchmark.sh    |  5 +++
 3 files changed, 61 insertions(+), 16 deletions(-)
 create mode 100644 examples/config_files/active_learning_loop/config.yaml
 create mode 100644 examples/local/active_learning/run_benchmark.sh

diff --git a/crystal_diffusion/active_learning_loop/benchmark.py b/crystal_diffusion/active_learning_loop/benchmark.py
index dc56e606..af794e6e 100644
--- a/crystal_diffusion/active_learning_loop/benchmark.py
+++ b/crystal_diffusion/active_learning_loop/benchmark.py
@@ -1,6 +1,5 @@
 import argparse
 import os
-from dataclasses import dataclass
 from typing import Any, List, Optional, Tuple
 
 import numpy as np
@@ -8,8 +7,9 @@
 import yaml
 from hydra.utils import instantiate
 
-from crystal_diffusion.active_learning_loop.utils import get_structures_for_retraining, extract_target_region
-from crystal_diffusion.models.mlip.mtp import MTPWithMLIP3, MTPInputs
+from crystal_diffusion.active_learning_loop.utils import (
+    extract_target_region, get_structures_for_retraining)
+from crystal_diffusion.models.mlip.mtp import MTPWithMLIP3
 
 
 class ActiveLearningLoop:
@@ -117,9 +117,10 @@ def get_bad_structures(self, prediction_df: pd.DataFrame) -> List[pd.DataFrame]:
         Returns:
             list of structures with a high uncertainty criteria.
         """
+        num_structures = self.eval_config.number_of_structures
         structures_to_retrain = get_structures_for_retraining(prediction_df,
                                                               criteria_threshold=self.eval_config.criteria_threshold,
-                                                              number_of_structures=self.eval_config.number_of_structures,
+                                                              number_of_structures=num_structures,
                                                               evaluation_criteria=self.eval_config.evaluation_criteria)
         return structures_to_retrain
 
@@ -160,7 +161,7 @@ def get_structure_candidate_from_generative_model(self,
             # and hydra instantiate
             return fixed_atoms
         else:
-            raise NotImplemented('Only dev_dummy is supported at the moment.')
+            raise NotImplementedError('Only dev_dummy is supported at the moment.')
 
     def new_structure_to_csv(self, new_structures: List[pd.DataFrame], round: int = 1):
         """Save the generated structures in a csv format in the output dir
@@ -271,8 +272,8 @@ def evaluate_mtp_update(self, original_predictions: pd.DataFrame, updated_predic
         atom_index, structure_index, original_value = original_predictions.iloc[
             original_predictions[criteria].argmax()][['atom_index', 'structure_index', criteria]]
         updated_value = updated_predictions.loc[
-            (updated_predictions['atom_index'] == atom_index) &
-            (updated_predictions['structure_index'] == structure_index), criteria].values.item()
+            (updated_predictions['atom_index'] == atom_index)
+            & (updated_predictions['structure_index'] == structure_index), criteria].values.item()
         return original_value, updated_value
 
 
@@ -283,23 +284,19 @@ def get_arguments() -> argparse.Namespace:
         args: arguments
     """
     parser = argparse.ArgumentParser()
-    parser.add_argument('--mtp_config', help='path to data directory', required=True)
+    parser.add_argument('--config', help='path to data directory', required=True)
     args = parser.parse_args()
     return args
 
 
 def main():
-    # args = get_arguments()
+    args = get_arguments()
     # TODO get mtp_config_path from the args
-    config_path = "/Users/simonb/ic-collab/courtois_collab/crystal_diffusion/experiments/active_learning_benchmark/"
-    config_path = os.path.join(config_path, "config", "mtp_training.yaml")
+    config_path = args.config
     al_loop = ActiveLearningLoop(config_path)
-    # initial_df, new_df = al_loop.round_of_active_learning_loop()
-    # initial_df.to_csv(os.path.join(os.getcwd(), '../../experiments/active_learning_benchmark/debug/before_tuning.csv'), index=False)
-    # new_df.to_csv(os.path.join(os.getcwd(), '../../experiments/active_learning_benchmark/debug/after_tuning.csv'), index=False)
-    initial_df = pd.read_csv(os.path.join(os.getcwd(), '../../experiments/active_learning_benchmark/debug/before_tuning.csv'))
-    new_df = pd.read_csv(os.path.join(os.getcwd(), '../../experiments/active_learning_benchmark/debug/after_tuning.csv'))
+    initial_df, new_df = al_loop.round_of_active_learning_loop()
     al_loop.evaluate_mtp_update(initial_df, new_df)
 
+
 if __name__ == '__main__':
     main()
diff --git a/examples/config_files/active_learning_loop/config.yaml b/examples/config_files/active_learning_loop/config.yaml
new file mode 100644
index 00000000..04e70faf
--- /dev/null
+++ b/examples/config_files/active_learning_loop/config.yaml
@@ -0,0 +1,43 @@
+active_learning_data:
+  _target_: crystal_diffusion.active_learning_loop.activelearning_dataclasses.ActiveLearningDataArguments
+  training_data_dir: /Users/simonb/ic-collab/courtois_collab/crystal_diffusion/data/al_baseline_dev/v1/
+  evaluation_data_dir: /Users/simonb/ic-collab/courtois_collab/crystal_diffusion/data/al_baseline_dev/v1/
+  output_dir: /Users/simonb/ic-collab/courtois_collab/crystal_diffusion/experiments/benchmark/debug/
+
+mlip:
+  _target_: crystal_diffusion.models.mlip.mtp.MTPWithMLIP3
+  mtp_args:
+    _target_: crystal_diffusion.models.mlip.mtp.MTPArguments
+    mlip_path: /Users/simonb/ic-collab/courtois_collab/crystal_diffusion/mlip-3/  # path to MLIP3 library
+    name: None  # MTP
+    param: None
+    unfitted_mtp: 08.almtp  # Define the initial mtp file. Defaults to 08.almtp
+    fitted_mtp_savedir: /Users/simonb/ic-collab/courtois_collab/crystal_diffusion/experiments/benchmark/debug/mtp/  # save directory for the fitted MTP. Defaults to '../' (current wd)
+    max_dist: 5.0  # The actual radial cutoff. Defaults to 5.
+    radial_basis_size: 8  # Relevant to number of radial basis function. Defaults to 8.
+    max_iter: 1000  # The number of maximum iteration. Defaults to 1000.
+    energy_weight: 1.0  # The weight of energy. Defaults to 1
+    force_weight: 1e-2  # The weight of forces. Defaults to 1e-2
+    stress_weight: 1e-3  # The weight of stresses. Zero-weight can be assigned. Defaults to 1e-3.
+    init_params: same  # how to initialize parameters if a potential was not pre-fitted: "same" or "random".
+    scale_by_force: 0  # If > 0 then configurations near equilibrium get more weight. Defaults to 0.
+    bfgs_conv_tol: 1e-3  # Stop training if error dropped by a factor smaller than this over 50 BFGS iterations.
+    weighting: vibration  # How to weight configuration with different sizes relative to each other.
+    # Choose from "vibrations", "molecules" and "structures". Defaults to "vibration".
+
+structure_evaluation:
+  _target_: crystal_diffusion.active_learning_loop.activelearning_dataclasses.StructureEvaluationArguments
+  evaluation_criteria: 'nbh_grades'  # maxvol coefficient name in the mtp outputs
+  criteria_threshold: 10.0  # atoms with a MaxVol value above this value are considered bad
+  # number_of_structures: None  # alternatively, take the N worst samples from the evaluation set
+  extraction_radius: 3  # extract atoms within this radius from the worst atom in a structure
+
+repainting_model:
+  _target_: crystal_diffusion.active_learning_loop.activelearning_dataclasses.RepaintingArguments
+  model: dev_dummy  # placeholder for development
+
+oracle:
+  _target_: crystal_diffusion.active_learning_loop.oracle.LAMMPS_for_active_learning
+
+initial_df_save_path: /Users/simonb/ic-collab/courtois_collab/crystal_diffusion/experiments/active_learning_benchmark/debug/before_tuning.csv
+updated_df_save_path: /Users/simonb/ic-collab/courtois_collab/crystal_diffusion/experiments/active_learning_benchmark/debug/after_tuning.csv
\ No newline at end of file
diff --git a/examples/local/active_learning/run_benchmark.sh b/examples/local/active_learning/run_benchmark.sh
new file mode 100644
index 00000000..20c65901
--- /dev/null
+++ b/examples/local/active_learning/run_benchmark.sh
@@ -0,0 +1,5 @@
+#!/bin/bash
+
+CONFIG=../../config_files/active_learning_loop/config.yaml
+
+python ../../../crystal_diffusion/active_learning_loop/benchmark.py --config $CONFIG
\ No newline at end of file

From f83f766b78e4d794c49a86d6ebea44438a37d168 Mon Sep 17 00:00:00 2001
From: Simon Blackburn <simon.blackburn@mila.quebec>
Date: Wed, 28 Aug 2024 10:59:18 -0400
Subject: [PATCH 10/13] fixing mtp unit tests and some flake errors

---
 crystal_diffusion/mlip/mtp_utils.py  | 29 +++++++++++++-
 crystal_diffusion/models/mlip/mtp.py |  9 +++--
 tests/models/test_mtp.py             | 60 ++++++++++++++++------------
 3 files changed, 68 insertions(+), 30 deletions(-)

diff --git a/crystal_diffusion/mlip/mtp_utils.py b/crystal_diffusion/mlip/mtp_utils.py
index eedae3fe..d2cbf298 100644
--- a/crystal_diffusion/mlip/mtp_utils.py
+++ b/crystal_diffusion/mlip/mtp_utils.py
@@ -4,8 +4,10 @@
 from typing import Any, Dict, List, Optional, Tuple
 
 import numpy as np
+import pandas as pd
 import yaml
 from pymatgen.core import Structure
+from sklearn.metrics import mean_absolute_error
 
 
 @dataclass(kw_only=True)
@@ -105,7 +107,7 @@ def prepare_mtp_inputs_from_lammps(output_yaml: List[str],
     return mtp_inputs
 
 
-def crawl_lammps_directory(folder_name: str, folder_name_pattern: str= "train") -> Tuple[List[str], List[str]]:
+def crawl_lammps_directory(folder_name: str, folder_name_pattern: str = "train") -> Tuple[List[str], List[str]]:
     """Crawl through a folder and find the LAMMPS output files in folders containing a specified pattern in their name.
 
     LAMMPS outputs should end with dump.yaml and Thermondynamics variables files should end with thermo.yaml
@@ -143,3 +145,28 @@ def concat_mtp_inputs(input1: MTPInputs, input2: MTPInputs) -> MTPInputs:
         energy=input1.energy + input2.energy
     )
     return concat_inputs
+
+
+def get_metrics_from_pred(df_orig: pd.DataFrame, df_predict: pd.DataFrame) -> Tuple[float, float]:
+    """Get mean absolute error on energy and forces from the outputs of MTP.
+
+    Args:
+        df_orig: dataframe with ground truth values
+        df_predict: dataframe with MTP predictions
+
+    Returns:
+        MAE on energy in eV/atom and MAE on forces in eV/Å
+    """
+    # from demo in maml
+    # get a single predicted energy per structure
+    predicted_energy = df_predict.groupby('structure_index').agg({'energy': 'mean', 'atom_index': 'count'})
+    # normalize by number of atoms
+    predicted_energy = (predicted_energy['energy'] / predicted_energy['atom_index']).to_numpy()
+    # same for ground truth
+    gt_energy = df_orig.groupby('structure_index').agg({'energy': 'mean', 'atom_index': 'count'})
+    gt_energy = (gt_energy['energy'] / gt_energy['atom_index']).to_numpy()
+
+    predicted_forces = (df_predict[['fx', 'fy', 'fz']].to_numpy().flatten())
+    gt_forces = (df_orig[['fx', 'fy', 'fz']].to_numpy().flatten())
+
+    return mean_absolute_error(predicted_energy, gt_energy), mean_absolute_error(predicted_forces, gt_forces)
diff --git a/crystal_diffusion/models/mlip/mtp.py b/crystal_diffusion/models/mlip/mtp.py
index 5ff09839..31ce468f 100644
--- a/crystal_diffusion/models/mlip/mtp.py
+++ b/crystal_diffusion/models/mlip/mtp.py
@@ -21,14 +21,15 @@
 from monty.tempfile import ScratchDir
 from pymatgen.core import Structure
 
-from crystal_diffusion.mlip.mtp_utils import (MTPInputs, crawl_lammps_directory, prepare_mtp_inputs_from_lammps,
-                                              concat_mtp_inputs)
+from crystal_diffusion.mlip.mtp_utils import (MTPInputs, concat_mtp_inputs,
+                                              crawl_lammps_directory,
+                                              prepare_mtp_inputs_from_lammps)
 
 
 @dataclass(kw_only=True)
 class MTPArguments:
     mlip_path: str  # path to MLIP3 library
-    name: Optional[str]= None  # MTP
+    name: Optional[str] = None  # MTP
     param: Optional[Dict[Any, Any]] = None
     unfitted_mtp: str = "08.almtp"  # Define the initial mtp file. Default to 08g.amltp
     fitted_mtp_savedir: str = '../'  # save directory for the fitted MTP. Defaults to '../' (current wd)
@@ -106,7 +107,7 @@ def evaluate(self,
 
         # TODO if forces are not available...
         test_structures, test_forces, _ = check_structures_forces_stresses(
-            dataset.structure, dataset.forces, stresses=None
+            dataset.structure, dataset.forces, None
         )
         predict_pool = pool_from(test_structures, dataset.energy, test_forces)
         local_mtp_name = "mtp.almtp"
diff --git a/tests/models/test_mtp.py b/tests/models/test_mtp.py
index 5431dd15..13a661ae 100644
--- a/tests/models/test_mtp.py
+++ b/tests/models/test_mtp.py
@@ -8,10 +8,11 @@
 from pymatgen.core import Structure
 from sklearn.metrics import mean_absolute_error
 
-from crystal_diffusion.models.mtp import MTPWithMLIP3
-from crystal_diffusion.train_mtp import (
-    extract_energy_from_thermo_log, extract_structure_and_forces_from_file,
-    get_metrics_from_pred, prepare_mtp_inputs_from_lammps)
+from crystal_diffusion.mlip.mtp_utils import (
+    MTPInputs, extract_energy_from_thermo_log,
+    extract_structure_and_forces_from_file, get_metrics_from_pred,
+    prepare_mtp_inputs_from_lammps)
+from crystal_diffusion.models.mlip.mtp import MTPArguments, MTPWithMLIP3
 
 
 class FakeStructure:
@@ -38,15 +39,15 @@ def mock_popen(mocker):
 
 
 # Mock the external dependencies and method calls within the MTPWithMLIP3.train method
-def test_train(mocker, mock_popen):
+def test_train(mocker, mock_popen, tmpdir):
     # Mock os.path.exists to always return True
     mocker.patch("os.path.exists", return_value=True)
 
     # Mock check_structures_forces_stresses to return a value without needing real input
-    mocker.patch("crystal_diffusion.models.mtp.check_structures_forces_stresses", side_effect=passthrough)
+    mocker.patch("crystal_diffusion.models.mlip.mtp.check_structures_forces_stresses", side_effect=passthrough)
 
     # Mock pool_from to return a simplified pool object
-    mocker.patch("crystal_diffusion.models.mtp.pool_from", return_value="simple_pool_object")
+    mocker.patch("crystal_diffusion.models.mlip.mtp.pool_from", return_value="simple_pool_object")
 
     # Mock self.write_cfg to simulate creating a config file without file operations
     mocker.patch.object(MTPWithMLIP3, "write_cfg", return_value="mock_filename.cfg")
@@ -54,19 +55,23 @@ def test_train(mocker, mock_popen):
     mocker.patch("shutil.copyfile", return_value=None)
 
     # Initialize MTPWithMLIP3 with mock parameters
-    model = MTPWithMLIP3(mlip_path="/mock/path", name="test_model")
+    mtp_args = MTPArguments(
+        mlip_path="/mock/path",
+        name="test_model",
+        unfitted_mtp="08.almtp",
+        fitted_mtp_savedir=tmpdir
+    )
+    model = MTPWithMLIP3(mtp_args)
     # Call the train method
+    mtp_inputs = MTPInputs(
+        structure=[FakeStructure(['H', 'O']), FakeStructure(['Si'])],
+        forces=[],
+        energy=[1, 2]
+    )
 
-    return_code = model.train(
-        train_structures=[FakeStructure(['H', 'O']), FakeStructure(['Si'])],
-        train_energies=[1, 2],
-        train_forces=[],
-        train_stresses=[],
-        unfitted_mtp="08.almtp",
-        fitted_mtp_savedir="/mock/dir"
+    _ = model.train(
+        mtp_inputs,
     )
-    # Assert the expected results
-    assert return_code == 0  # The train method should return the mocked subprocess success return code from mock_open
 
     # Assert that mocked methods were called
     model.write_cfg.assert_called()
@@ -91,19 +96,17 @@ def mtp_instance(mocker):
     return instance
 
 
-# def test_evaluate(mocker, fake_structure, mtp_instance, mock_popen):
 def test_evaluate(mocker, fake_structure, mtp_instance, mock_popen):
     test_structures = [fake_structure]
     test_energies = [1.0]
     test_forces = [[[0, 0, 0]]]
-    test_stresses = None  # or appropriate mock stresses
 
     # Mock check_structures_forces_stresses to return the arguments unmodified
-    mocker.patch("crystal_diffusion.models.mtp.check_structures_forces_stresses",
+    mocker.patch("crystal_diffusion.models.mlip.mtp.check_structures_forces_stresses",
                  side_effect=lambda s, f, st: (s, f, st))
 
     # Mock pool_from to return a mocked value
-    mocker.patch("crystal_diffusion.models.mtp.pool_from", return_value="mock_pool")
+    mocker.patch("crystal_diffusion.models.mlip.mtp.pool_from", return_value="mock_pool")
 
     # Mock self.write_cfg to simulate creating a config file without file operations
     mocker.patch.object(MTPWithMLIP3, "write_cfg", return_value="mock_filename.cfg")
@@ -116,8 +119,14 @@ def test_evaluate(mocker, fake_structure, mtp_instance, mock_popen):
     mocker.patch("shutil.copyfile", return_value=None)
     mocker.patch("os.path.exists", return_value=True)
 
+    mtp_inputs = MTPInputs(
+        structure=test_structures,
+        forces=test_forces,
+        energy=test_energies
+    )
+
     # Perform the test
-    df_orig, df_predict = mtp_instance.evaluate(test_structures, test_energies, test_forces, test_stresses)
+    df_orig, df_predict = mtp_instance.evaluate(mtp_inputs)
 
     # Assertions can vary based on the real output of `read_cfgs`
     # Here's an example assertion assuming `read_cfgs` returns a string in this mocked scenario
@@ -209,12 +218,13 @@ def test_extract_energy_from_thermo_log(tmpdir):
 
 @pytest.fixture
 def mock_extract_energy_from_thermo_log(mocker):
-    return mocker.patch('crystal_diffusion.train_mtp.extract_energy_from_thermo_log', return_value=[])
+    return mocker.patch('crystal_diffusion.mlip.mtp_utils.extract_energy_from_thermo_log', return_value=[])
 
 
 @pytest.fixture
 def mock_extract_structure_and_forces(mocker):
-    return mocker.patch('crystal_diffusion.train_mtp.extract_structure_and_forces_from_file', return_value=([], []))
+    return mocker.patch('crystal_diffusion.mlip.mtp_utils.extract_structure_and_forces_from_file',
+                        return_value=([], []))
 
 
 def test_prepare_mtp_inputs_from_lammps(mock_extract_structure_and_forces, mock_extract_energy_from_thermo_log, tmpdir):
@@ -230,7 +240,7 @@ def test_prepare_mtp_inputs_from_lammps(mock_extract_structure_and_forces, mock_
 
     # Verify that the mocks were called correctly
     assert mock_extract_structure_and_forces.call_count == 2
-    mock_extract_structure_and_forces.assert_called_with(output_yaml_files[1], atom_dict)
+    mock_extract_structure_and_forces.assert_called_with(output_yaml_files[1], atom_dict, True)
 
     assert mock_extract_energy_from_thermo_log.call_count == 2
     mock_extract_energy_from_thermo_log.assert_called_with(thermo_yaml_files[1])

From 259c00307156ff1332de85953acc53770d9dc57e Mon Sep 17 00:00:00 2001
From: Simon Blackburn <simon.blackburn@mila.quebec>
Date: Tue, 3 Sep 2024 12:33:15 -0400
Subject: [PATCH 11/13] flake8 fixes

---
 .../activelearning_dataclasses.py                |  2 +-
 crystal_diffusion/active_learning_loop/utils.py  |  4 ++--
 crystal_diffusion/mlip/mtp_train.py              | 16 +++++++---------
 3 files changed, 10 insertions(+), 12 deletions(-)

diff --git a/crystal_diffusion/active_learning_loop/activelearning_dataclasses.py b/crystal_diffusion/active_learning_loop/activelearning_dataclasses.py
index d4880f80..f0ffcd89 100644
--- a/crystal_diffusion/active_learning_loop/activelearning_dataclasses.py
+++ b/crystal_diffusion/active_learning_loop/activelearning_dataclasses.py
@@ -10,7 +10,7 @@ class ActiveLearningDataArguments:
 
 @dataclass(kw_only=True)
 class StructureEvaluationArguments:
-    evaluation_criteria: str ='nbh_grades'
+    evaluation_criteria: str = 'nbh_grades'
     criteria_threshold: float = 10
     number_of_structures: int = None
     extraction_radius: float = 3
diff --git a/crystal_diffusion/active_learning_loop/utils.py b/crystal_diffusion/active_learning_loop/utils.py
index 4fac563e..02833f89 100644
--- a/crystal_diffusion/active_learning_loop/utils.py
+++ b/crystal_diffusion/active_learning_loop/utils.py
@@ -46,7 +46,7 @@ def get_structures_for_retraining(prediction_df: pd.DataFrame,
 
 def extract_target_region(structure_df: pd.DataFrame,
                           extraction_radius: float,
-                          evaluation_criteria: str ='nbh_grades') -> pd.DataFrame:
+                          evaluation_criteria: str = 'nbh_grades') -> pd.DataFrame:
     """Extract the atom with the worst evaluation criteria and all the atoms within a distance extraction_radious.
 
     Args:
@@ -65,4 +65,4 @@ def extract_target_region(structure_df: pd.DataFrame,
     structure_df.loc[:, 'distance_squared'] = structure_df.apply(
         lambda x: sum([(x[i] - target_position[i]) ** 2 for i in ['x', 'y', 'z']]), axis=1)
     atom_positions = structure_df.loc[structure_df['distance_squared'] <= extraction_radius ** 2, ['x', 'y', 'z']]
-    return atom_positions
\ No newline at end of file
+    return atom_positions
diff --git a/crystal_diffusion/mlip/mtp_train.py b/crystal_diffusion/mlip/mtp_train.py
index 8e449f80..ae70b0c9 100644
--- a/crystal_diffusion/mlip/mtp_train.py
+++ b/crystal_diffusion/mlip/mtp_train.py
@@ -3,28 +3,26 @@
 Running the main() runs a debugging example. Entry points are train_mtp.
 """
 import argparse
-from dataclasses import dataclass
-from typing import Any, Dict, List, NamedTuple, Tuple
+from typing import Dict, Tuple
 
-import numpy as np
 import pandas as pd
-import yaml
-from pymatgen.core import Structure
 from sklearn.metrics import mean_absolute_error
 
+from crystal_diffusion.mlip.mtp_utils import (MTPInputs,
+                                              crawl_lammps_directory,
+                                              prepare_mtp_inputs_from_lammps)
 from crystal_diffusion.models.mlip.mtp import MTPWithMLIP3
-from crystal_diffusion.mlip.mtp_utils import prepare_mtp_inputs_from_lammps, crawl_lammps_directory, MTPInputs
 
 atom_dict = {1: 'Si'}
 
 
-def prepare_dataset(root_data_dir: str, atom_dict: Dict[int, str], mode: str = "train") -> MTP_Inputs:
+def prepare_dataset(root_data_dir: str, atom_dict: Dict[int, str], mode: str = "train") -> MTPInputs:
     lammps_outputs, thermo_outputs = crawl_lammps_directory(root_data_dir, mode)
     mtp_dataset = prepare_mtp_inputs_from_lammps(lammps_outputs, thermo_outputs, atom_dict)
     return mtp_dataset
 
 
-def train_mtp(train_inputs: MTP_Inputs, mlip_folder_path: str, save_dir: str) -> MTPWithMLIP3:
+def train_mtp(train_inputs: MTPInputs, mlip_folder_path: str, save_dir: str) -> MTPWithMLIP3:
     """Create and train an MTP potential.
 
     Args:
@@ -52,7 +50,7 @@ def train_mtp(train_inputs: MTP_Inputs, mlip_folder_path: str, save_dir: str) ->
     return mtp
 
 
-def evaluate_mtp(eval_inputs: MTP_Inputs, mtp: MTPWithMLIP3) -> Tuple[pd.DataFrame, pd.DataFrame]:
+def evaluate_mtp(eval_inputs: MTPInputs, mtp: MTPWithMLIP3) -> Tuple[pd.DataFrame, pd.DataFrame]:
     """Evaluate a trained MTP potential.
 
     Args:

From 9f6eca4602959f7b9691d889eb05fdb229dcff49 Mon Sep 17 00:00:00 2001
From: Simon Blackburn <simon.blackburn@mila.quebec>
Date: Tue, 3 Sep 2024 12:52:59 -0400
Subject: [PATCH 12/13] some missing docstrings

---
 .../activelearning_dataclasses.py              |  3 +++
 .../active_learning_loop/benchmark.py          | 10 +++++++---
 .../active_learning_loop/oracle.py             |  2 ++
 crystal_diffusion/mlip/mtp_train.py            | 11 +++++++++++
 crystal_diffusion/models/mlip/mtp.py           | 18 ++++++++++++++++--
 5 files changed, 39 insertions(+), 5 deletions(-)

diff --git a/crystal_diffusion/active_learning_loop/activelearning_dataclasses.py b/crystal_diffusion/active_learning_loop/activelearning_dataclasses.py
index f0ffcd89..c5333d3c 100644
--- a/crystal_diffusion/active_learning_loop/activelearning_dataclasses.py
+++ b/crystal_diffusion/active_learning_loop/activelearning_dataclasses.py
@@ -3,6 +3,7 @@
 
 @dataclass(kw_only=True)
 class ActiveLearningDataArguments:
+    """Paths to the training and evaluation datasets and to the output directory."""
     training_data_dir: str  # training data directory
     evaluation_data_dir: str  # evaluation data directory
     output_dir: str  # directory where to save the results
@@ -10,6 +11,7 @@ class ActiveLearningDataArguments:
 
 @dataclass(kw_only=True)
 class StructureEvaluationArguments:
+    """Parameters related to the MLIP evaluation."""
     evaluation_criteria: str = 'nbh_grades'
     criteria_threshold: float = 10
     number_of_structures: int = None
@@ -18,4 +20,5 @@ class StructureEvaluationArguments:
 
 @dataclass(kw_only=True)
 class RepaintingArguments:
+    """Parameters related to the structure generation model."""
     model: str = 'dev_dummy'
diff --git a/crystal_diffusion/active_learning_loop/benchmark.py b/crystal_diffusion/active_learning_loop/benchmark.py
index af794e6e..7b231d3a 100644
--- a/crystal_diffusion/active_learning_loop/benchmark.py
+++ b/crystal_diffusion/active_learning_loop/benchmark.py
@@ -13,11 +13,14 @@
 
 
 class ActiveLearningLoop:
+    """Method to train, evaluate and fine-tune a MLIP."""
     def __init__(self,
                  meta_config: str,
                  ):
-        """Active learning benchmark with methods to train & evaluate a MLIP, isolate bad sub-structures, repaint
-        new structures and retrain the MLIP
+        """Active learning benchmark.
+
+        Includes methods to train & evaluate a MLIP, isolate bad sub-structures, repaint new structures and retrain
+        the MLIP.
 
         Args:
             meta_config: path to a yaml configuration with the parameters for the modules in the class
@@ -164,7 +167,7 @@ def get_structure_candidate_from_generative_model(self,
             raise NotImplementedError('Only dev_dummy is supported at the moment.')
 
     def new_structure_to_csv(self, new_structures: List[pd.DataFrame], round: int = 1):
-        """Save the generated structures in a csv format in the output dir
+        """Save the generated structures in a csv format in the output dir.
 
         Args:
             new_structures: structures proposed by the generative model
@@ -290,6 +293,7 @@ def get_arguments() -> argparse.Namespace:
 
 
 def main():
+    """Example to do an active learning loop once."""
     args = get_arguments()
     # TODO get mtp_config_path from the args
     config_path = args.config
diff --git a/crystal_diffusion/active_learning_loop/oracle.py b/crystal_diffusion/active_learning_loop/oracle.py
index cb6edbe5..a1893f5c 100644
--- a/crystal_diffusion/active_learning_loop/oracle.py
+++ b/crystal_diffusion/active_learning_loop/oracle.py
@@ -8,7 +8,9 @@
 
 
 class LAMMPS_for_active_learning:
+    """Oracle using LAMMPS to get the energy and forces on atoms."""
     def __init__(self):
+        """Initialize the class."""
         pass
 
     def __call__(self,
diff --git a/crystal_diffusion/mlip/mtp_train.py b/crystal_diffusion/mlip/mtp_train.py
index ae70b0c9..bcf527ed 100644
--- a/crystal_diffusion/mlip/mtp_train.py
+++ b/crystal_diffusion/mlip/mtp_train.py
@@ -17,6 +17,17 @@
 
 
 def prepare_dataset(root_data_dir: str, atom_dict: Dict[int, str], mode: str = "train") -> MTPInputs:
+    """Prepare the dataset in a given directory into a MTP format.
+
+    Args:
+        root_data_dir: root data directory to parse
+        atom_dict: map between an index and an atom type
+        mode: type of data to look for. e.g. train will only look at the folders in root_data_dir with "train" in their
+            name. Defaults to train.
+
+    Returns:
+        data in the MTPInputs dataclass
+    """
     lammps_outputs, thermo_outputs = crawl_lammps_directory(root_data_dir, mode)
     mtp_dataset = prepare_mtp_inputs_from_lammps(lammps_outputs, thermo_outputs, atom_dict)
     return mtp_dataset
diff --git a/crystal_diffusion/models/mlip/mtp.py b/crystal_diffusion/models/mlip/mtp.py
index 31ce468f..bc822cf6 100644
--- a/crystal_diffusion/models/mlip/mtp.py
+++ b/crystal_diffusion/models/mlip/mtp.py
@@ -28,6 +28,7 @@
 
 @dataclass(kw_only=True)
 class MTPArguments:
+    """Arguments to train an MTP with the MLIP3 library."""
     mlip_path: str  # path to MLIP3 library
     name: Optional[str] = None  # MTP
     param: Optional[Dict[Any, Any]] = None
@@ -270,7 +271,7 @@ def prepare_dataset_from_lammps(
             mode: str = "train",
             get_forces: bool = True,
     ) -> MTPInputs:
-        """Get the LAMMPS in a folder and organize them as inputs for a MTP
+        """Get the LAMMPS samples in a folder and organize them as inputs for a MTP.
 
         Args:
             root_data_dir: folder to read. Each LAMMPS sample is expected to be in a subfolder.
@@ -293,7 +294,20 @@ def prepare_dataset_from_numpy(
             energy: float,
             atom_type: np.ndarray,
             atom_dict: Dict[int, str] = {1: 'Si'}
-    ):
+    ) -> MTPInputs:
+        """Convert numpy array variables to a format compatible with MTP.
+
+        Args:
+            cartesian_positions: atomic positions in Angstrom as a (n_atom, 3) array.
+            box: unit cell description as a (3, 3) array.
+            forces: forces on each atom as a (n_atom, 3) array
+            energy: energy of the configuration
+            atom_type: indices for each atom in the structure as a (n_atom,) array
+            atom_dict: map between atom indices and atom types
+
+        Returns:
+            data formatted as an input for MTP.
+        """
         structure = Structure(
             lattice=box,
             species=[atom_dict[x] for x in atom_type],

From 73c647df9098e049877ebdca5335b26e979fcab7 Mon Sep 17 00:00:00 2001
From: Simon Blackburn <simon.blackburn@mila.quebec>
Date: Tue, 3 Sep 2024 13:19:37 -0400
Subject: [PATCH 13/13] fixing lightning version

---
 requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index 7bc45fec..d9681bfb 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -18,7 +18,7 @@ pyyaml==6.0.1
 pytest==7.1.2
 pytest-cov==3.0.0
 pytest-mock==3.12.0
-pytorch_lightning>=2.2.0
+pytorch_lightning==2.2.1
 pytype==2024.2.13
 sphinx==7.2.6
 sphinx-autoapi==3.0.0