diff --git a/README_active_learning.md b/README_active_learning.md
new file mode 100644
index 00000000..89f9f177
--- /dev/null
+++ b/README_active_learning.md
@@ -0,0 +1 @@
+Instructions to run the active learning benchmark.
\ No newline at end of file
diff --git a/crystal_diffusion/active_learning_loop/activelearning_dataclasses.py b/crystal_diffusion/active_learning_loop/activelearning_dataclasses.py
new file mode 100644
index 00000000..c5333d3c
--- /dev/null
+++ b/crystal_diffusion/active_learning_loop/activelearning_dataclasses.py
@@ -0,0 +1,24 @@
+from dataclasses import dataclass
+
+
+@dataclass(kw_only=True)
+class ActiveLearningDataArguments:
+    """Paths to the training, validaition datasets and output directory."""
+    training_data_dir: str  # training data directory
+    evaluation_data_dir: str  # evaluation data directory
+    output_dir: str  # directory where to save the results
+
+
+@dataclass(kw_only=True)
+class StructureEvaluationArguments:
+    """Parameters related to the MLIP evaluation."""
+    evaluation_criteria: str = 'nbh_grades'
+    criteria_threshold: float = 10
+    number_of_structures: int = None
+    extraction_radius: float = 3
+
+
+@dataclass(kw_only=True)
+class RepaintingArguments:
+    """Parameters related to the structure generation model."""
+    model: str = 'dev_dummy'
diff --git a/crystal_diffusion/active_learning_loop/benchmark.py b/crystal_diffusion/active_learning_loop/benchmark.py
new file mode 100644
index 00000000..7b231d3a
--- /dev/null
+++ b/crystal_diffusion/active_learning_loop/benchmark.py
@@ -0,0 +1,306 @@
+import argparse
+import os
+from typing import Any, List, Optional, Tuple
+
+import numpy as np
+import pandas as pd
+import yaml
+from hydra.utils import instantiate
+
+from crystal_diffusion.active_learning_loop.utils import (
+    extract_target_region, get_structures_for_retraining)
+from crystal_diffusion.models.mlip.mtp import MTPWithMLIP3
+
+
+class ActiveLearningLoop:
+    """Method to train, evaluate and fine-tune a MLIP."""
+    def __init__(self,
+                 meta_config: str,
+                 ):
+        """Active learning benchmark.
+
+        Includes methods to train & evaluate a MLIP, isolate bad sub-structures, repaint new structures and retrain
+        the MLIP.
+
+        Args:
+            meta_config: path to a yaml configuration with the parameters for the modules in the class
+        """
+        assert os.path.exists(meta_config), "configuration file for active learning loop does not exist."
+        # define the modules in the __init__ function
+        self.data_paths, self.mlip_model, self.eval_config, self.structure_generation = None, None, None, None
+        self.oracle = None
+        # use hydra to convert the yaml into modules and other data classes
+        self.parse_config(meta_config)
+        self.atom_dict = {1: "Si"}  # TODO this should be define somewhere smart
+        self.trained_mlips = []  # history of trained MLIPs (optional - not sure if we should keep this)
+        self.training_sets = []  # history of training sets
+
+    def parse_config(self, meta_config: str):
+        """Read a configuration file and instantiate the different blocks with hydra.
+
+        The configuration file should have the following blocks of parameters:
+            active_learning_data: dataset paths
+            mlip_model: MLIP module training parameters
+            structure_evaluation: identification and isolation of the atomic regions to finetune the MLIP
+
+        Args:
+            meta_config: path to configuration yaml file
+        """
+        with open(meta_config, 'r') as stream:
+            meta_config = yaml.load(stream, Loader=yaml.FullLoader)
+        # paths to the training & evaluation datasets
+        self.data_paths = instantiate(meta_config['active_learning_data'])
+        # MLIP model - for example MTP
+        self.mlip_model = instantiate(meta_config['mlip'])
+        # parameters to find and isolate the problematic regions in the evaluation dataset
+        self.eval_config = instantiate(meta_config['structure_evaluation'])
+        # structure generation module
+        self.structure_generation = instantiate(meta_config['repainting_model'])
+        # force labeling module
+        self.oracle = instantiate(meta_config['oracle'])
+
+    def train_mlip(self, round: int = 1, training_set: Optional[Any] = None) -> str:
+        """Train a MLIP using the parameters specified in the configuration file.
+
+        Args:
+            round (optional): current round of training. Used to track now configurations in the training set. A round
+                includes the initial training and the evaluation process.
+            training_set (optional): if specified, use this dataset for training. Otherwise, use the dataset from the
+               paths in the configuration file. Defaults to None.
+
+        Returns:
+            path to the trained MLIP model
+        """
+        if training_set is None:
+            if len(self.training_sets) == 0:
+                self.training_sets = [self.mlip_model.prepare_dataset_from_lammps(
+                    root_data_dir=self.data_paths.training_data_dir,
+                    atom_dict=self.atom_dict,
+                    mode="train"
+                )]
+            training_set = self.mlip_model.merge_inputs(self.training_sets)
+
+        trained_mtp = self.mlip_model.train(training_set, mlip_name=f'mlip_round_{round}')
+        self.trained_mlips.append(trained_mtp)  # history of trained MLIPs ... not sure if useful
+        return trained_mtp
+
+    def evaluate_mlip(self, round: int = 1, mlip_name: Optional[str] = None, forces_available: bool = True
+                      ) -> pd.DataFrame:
+        """Evaluate a MLIP using the parameters specified in the configuration file.
+
+        Args:
+            round (optional): current round of training. Defaults to 1.
+            mlip_name (optional): if not None, use this MTP to evaluate the dataset.
+            forces_available (optional): if True, get the ground truth forces from the dataset.
+
+        Returns:
+            dataframe with the atomic indices, positions, forces and evaluation criteria
+        """
+        evaluation_dataset = self.mlip_model.prepare_dataset_from_lammps(
+            root_data_dir=self.data_paths.evaluation_data_dir,
+            atom_dict=self.atom_dict,
+            mode="evaluation",
+            get_forces=forces_available
+        )
+        # first returned element is the ground truth DF
+        # TODO make sure this works even if the GT is not available...
+        if mlip_name is None:
+            mlip_name = os.path.join(self.mlip_model.savedir, f'mlip_round_{round}.almtp')
+        _, prediction_df = self.mlip_model.evaluate(evaluation_dataset, mlip_name=mlip_name)
+
+        return prediction_df
+
+    def get_bad_structures(self, prediction_df: pd.DataFrame) -> List[pd.DataFrame]:
+        """Find the structures with a high uncertainty based on the configuration file parameters.
+
+        Args:
+            prediction_df: evaluation outputs of the MLIP model. Should contain atomic positions, uncertainty criteria
+               and structure indices.
+
+        Returns:
+            list of structures with a high uncertainty criteria.
+        """
+        num_structures = self.eval_config.number_of_structures
+        structures_to_retrain = get_structures_for_retraining(prediction_df,
+                                                              criteria_threshold=self.eval_config.criteria_threshold,
+                                                              number_of_structures=num_structures,
+                                                              evaluation_criteria=self.eval_config.evaluation_criteria)
+        return structures_to_retrain
+
+    def excise_worst_atom(self, structures_to_retrain: List[pd.DataFrame]) -> List[pd.DataFrame]:
+        """For a given structure, isolate the atom with the highest uncertainty criteria.
+
+        Args:
+            structures_to_retrain: list of dataframes with the atomic positions and evaluate criteria
+
+        Returns:
+            list of dataframes with only the targeted region
+        """
+        # we assume the extraction region to be a sphere of radius extraction_radius around the worst atoms
+        # if more than 1 atom are bad in a structure, we only extract the worst
+        # TODO implement other extraction methods
+        bad_regions = [extract_target_region(s,
+                                             extraction_radius=self.eval_config.extraction_radius,
+                                             evaluation_criteria=self.eval_config.evaluation_criteria)
+                       for s in structures_to_retrain]
+        return bad_regions
+
+    def get_structure_candidate_from_generative_model(self,
+                                                      fixed_atoms: pd.DataFrame,
+                                                      number_of_candidates: int = 1
+                                                      ) -> pd.DataFrame:
+        """Generate new structures around the specified fixed atoms.
+
+        Args:
+            fixed_atoms: dataframe with the atom type, coordinates and unit cell information
+            number_of_candidates: how many structure to generate. Defaults to 1.
+
+        Returns:
+            dataframe with the atom type, coordinates and unit cell
+
+        """
+        # TODO: call the diffusion model and get number_of_candidates samples with repaint using the fixed_atoms
+        if self.structure_generation.model == 'dev_dummy':  # replace with a wrapper around the diffusion model
+            # and hydra instantiate
+            return fixed_atoms
+        else:
+            raise NotImplementedError('Only dev_dummy is supported at the moment.')
+
+    def new_structure_to_csv(self, new_structures: List[pd.DataFrame], round: int = 1):
+        """Save the generated structures in a csv format in the output dir.
+
+        Args:
+            new_structures: structures proposed by the generative model
+            round: current round of training. Defaults to 1.
+        """
+        root_data_dir = os.path.join(self.data_paths.output_dir, f'new_structures_round_{round}')
+        os.makedirs(root_data_dir, exist_ok=True)
+        for i, new_struc in enumerate(new_structures):
+            new_struc.to_csv(os.path.join(root_data_dir, f'structure_{i}.csv'), index=False)
+
+    def get_labels_from_oracle(self, round: int = 1) -> Any:
+        """Compute energy and forces from an oracle such as LAMMPS for the new candidates generated in a round of AL.
+
+        Args:
+            round (optional): round of retraining. Defaults to 1.
+
+        Returns:
+            mlip data input (for example, MTPInputs)
+        """
+        new_labeled_samples = []
+        for file in os.listdir(os.path.join(self.data_paths.output_dir, f'new_structures_round_{round}')):
+            if file.endswith('.csv'):
+                new_labeled_samples.append(self.call_oracle(
+                    os.path.join(self.data_paths.output_dir, f'new_structures_round_{round}', file)
+                ))
+        new_labeled_samples = self.mlip_model.merge_inputs(new_labeled_samples)
+        return new_labeled_samples
+
+    def call_oracle(self, path_to_file: str) -> Any:
+        """Compute energy and forces for a given atomic structure.
+
+        Args:
+            path_to_file: path to csv file containing the atomic positions and structure information
+
+        Returns:
+            mlip data inputs (for example, MTPInputs)
+        """
+        data = pd.read_csv(path_to_file)
+        cartesian_positions = data[['x', 'y', 'z']].to_numpy()
+        box = np.eye(3, 3) * 5.43  # TODO this is bad - fix this
+        atom_type = np.ones(cartesian_positions.shape[0], dtype=np.integer)  # TODO also bad
+        energy, forces = self.oracle(cartesian_positions, box, atom_type)
+        labels_as_mtp = self.mlip_model.prepare_dataset_from_numpy(
+            cartesian_positions,
+            box,
+            forces,
+            energy,
+            atom_type,
+        )
+        return labels_as_mtp
+
+    def round_of_active_learning_loop(self, trained_mlip: Optional[MTPWithMLIP3] = None
+                                      ) -> Tuple[pd.DataFrame, pd.DataFrame]:
+        """Do a full loop of activate learning.
+
+        The following steps are done in sequence:
+            - train a MLIP from the training set specified in the config file if trained_mlip is not specified
+            - evaluate the MLIP with the evaluation set specified in the config file
+            - find the "bad" structures in the evaluation set based on the criteria from the config file
+            - excise the problematic regions
+            - generate new candidates based on these regions
+            - call the oracle to get the labels for the new generated candidates
+            - retrain the MLIP
+            - evaluate the MLIP again
+
+        Args:
+            trained_mlip (optional): if not None, use this MLIP as a starting point. If None, train a MLIP from scratch
+                using the training data specified in the config file.
+
+        Returns:
+            dataframe with the MLIP evaluation results before finetuning with the generated structures
+            dataframe with the MLIP evaluation results after finetuning with the generated structures
+        """
+        # one round from a known mtp (or train from provided training set)
+        # evaluate, find candidates and update MTP
+        # return the updated MTP
+        if trained_mlip is None:
+            trained_mlip = self.train_mlip()
+        pred_df = self.evaluate_mlip(mlip_name=trained_mlip)
+        bad_structures = self.get_bad_structures(pred_df)
+        bad_regions = self.excise_worst_atom(bad_structures)
+        new_candidates = [self.get_structure_candidate_from_generative_model(x) for x in bad_regions]
+        self.new_structure_to_csv(new_candidates)
+        new_labeled_candidates = self.get_labels_from_oracle()
+        new_training_set = self.mlip_model.merge_inputs([self.training_sets[-1], new_labeled_candidates])
+        self.training_sets.append(new_training_set)
+        new_mtp = self.train_mlip()
+        new_pred_df = self.evaluate_mlip(mlip_name=new_mtp)
+        return pred_df, new_pred_df
+
+    def evaluate_mtp_update(self, original_predictions: pd.DataFrame, updated_predictions) -> Tuple[float, float]:
+        """Find the evaluation criteria in the original predictions and the corresponding value after retraining.
+
+        Args:
+            original_predictions: MLIP predictions before retraining
+            updated_predictions: MLIP predictions after retraining
+
+        Returns:
+             worst evaluation_criteria (e.g. MaxVol) in the original evaluation
+             corresponding value after retraining with new samples. Not guaranteed to be the maximum value.
+        """
+        # find the highest MaxVol in the original predictions - identified by the atom index and structure index
+        # TODO we assume a max - but it could be a min i
+        criteria = self.eval_config.evaluation_criteria
+        atom_index, structure_index, original_value = original_predictions.iloc[
+            original_predictions[criteria].argmax()][['atom_index', 'structure_index', criteria]]
+        updated_value = updated_predictions.loc[
+            (updated_predictions['atom_index'] == atom_index)
+            & (updated_predictions['structure_index'] == structure_index), criteria].values.item()
+        return original_value, updated_value
+
+
+def get_arguments() -> argparse.Namespace:
+    """Parse arguments.
+
+    Returns:
+        args: arguments
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--config', help='path to data directory', required=True)
+    args = parser.parse_args()
+    return args
+
+
+def main():
+    """Example to do an active learning loop once."""
+    args = get_arguments()
+    # TODO get mtp_config_path from the args
+    config_path = args.config
+    al_loop = ActiveLearningLoop(config_path)
+    initial_df, new_df = al_loop.round_of_active_learning_loop()
+    al_loop.evaluate_mtp_update(initial_df, new_df)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/crystal_diffusion/active_learning_loop/oracle.py b/crystal_diffusion/active_learning_loop/oracle.py
new file mode 100644
index 00000000..a1893f5c
--- /dev/null
+++ b/crystal_diffusion/active_learning_loop/oracle.py
@@ -0,0 +1,55 @@
+from pathlib import Path
+from typing import Dict, Tuple
+
+import numpy as np
+
+from crystal_diffusion import DATA_DIR
+from crystal_diffusion.oracle.lammps import get_energy_and_forces_from_lammps
+
+
+class LAMMPS_for_active_learning:
+    """Oracle using LAMMPS to get the energy and forces on atoms."""
+    def __init__(self):
+        """Initialize the class."""
+        pass
+
+    def __call__(self,
+                 cartesian_positions: np.ndarray,
+                 box: np.ndarray,
+                 atom_types: np.ndarray,
+                 atom_type_map: Dict[int, str] = {1: 'Si'},
+                 tmp_work_dir: str = './',
+                 pair_coeff_dir: Path = DATA_DIR) -> Tuple[float, np.ndarray]:
+        """Call LAMMPS to get energy and forces for a given set of atoms.
+
+        Args:
+            cartesian_positions: atomic positions as a n_atom x 3 array
+            box: unit cell definition as a 3x3 array. Assumed to be diagonal.
+            atom_types: integers defining each atoms as an array of length n_atom
+            atom_type_map: map between indices and atom type. Defaults to {1: 'Si'}
+            tmp_work_dir: temporary work directory for LAMMPS. Defaults to ./
+            pair_coeff_dir: path to stilinger-weber potential. Defaults to DATA_DIR.
+
+        Returns:
+            energy and forces on each atom (n_atom x 3)
+        """
+        shifted_positions = self.shift_positions(cartesian_positions, box)
+        energy, forces = get_energy_and_forces_from_lammps(shifted_positions, box, atom_types, atom_type_map,
+                                                           tmp_work_dir, pair_coeff_dir)
+        return energy, forces[['fx', 'fy', 'fz']].to_numpy()
+
+    def shift_positions(self, cartesian_positions: np.ndarray, box: np.ndarray) -> np.ndarray:
+        """Shift the positions of the atoms so all coordinates are positives.
+
+        This is because LAMMPS will ignore atoms with coordinates outside the [0, a] range (a = size of the unit cell).
+
+        Args:
+            cartesian_positions: atomic positions (n_atom x 3 array)
+            box: unit cell (3x3 array) - assumed to be diagonal
+
+        Returns:
+            array with shifted positions
+        """
+        for i, cell_size in enumerate(np.diag(box)):
+            cartesian_positions[:, i] = cartesian_positions[:, i] % cell_size
+        return cartesian_positions
diff --git a/crystal_diffusion/active_learning_loop/utils.py b/crystal_diffusion/active_learning_loop/utils.py
new file mode 100644
index 00000000..02833f89
--- /dev/null
+++ b/crystal_diffusion/active_learning_loop/utils.py
@@ -0,0 +1,68 @@
+from typing import List, Optional
+
+import pandas as pd
+
+
+def get_structures_for_retraining(prediction_df: pd.DataFrame,
+                                  criteria_threshold: Optional[float] = None,
+                                  number_of_structures: Optional[int] = None,
+                                  evaluation_criteria: str = 'nbh_grades',
+                                  structure_index: str = 'structure_index'
+                                  ) -> List[pd.DataFrame]:
+    """Find the structures with the worst value of the evaluation criteria.
+
+    Args:
+        prediction_df: dataframe with the atom positions, forces, uncertainty criteria (e.g. MaxVol coefficient),
+            indices and structure indices
+        criteria_threshold: value above which the evaluation_criteria is considered bad. Either this or
+            number_of_structures should be specified. number_of_structures has priority if both are specified.
+            Defaults to None.
+        number_of_structures: number of structures to return. The top number_of_structures with the highest value of
+            evaluation_criteria are returned. Either this or criteria_threshold should be specified. Defaults to None.
+        evaluation_criteria: name of the evaluation criteria. Defaults to nbh_grades (MaxVol coefficient in MTP)
+        structure_index: name of the column in the dataframe with the index identifying the structure. Defaults to
+            structure_index.
+
+    Returns:
+        list of the structures with a bad evaluation criteria. Length of the list depends on criteria_threhold and
+            number_of_structures.
+    """
+    assert criteria_threshold is not None or number_of_structures is not None, \
+        "criteria_threshold or number_of_structures should be set."
+    # get the highest evaluation_criteria for each structure i.e. only the worst atom counts for structure selection
+    criteria_by_structure = prediction_df[[evaluation_criteria, structure_index]].groupby(structure_index).max()
+    # find the top number_of_structures
+    structures_indices = criteria_by_structure.sort_values(by=evaluation_criteria, ascending=False)
+    if criteria_threshold is not None:
+        structures_indices = structures_indices[structures_indices[evaluation_criteria] >= criteria_threshold]
+    structures_indices = structures_indices.index.to_list()
+    if number_of_structures is not None:
+        structures_indices = structures_indices[:number_of_structures]
+    structures_to_retrain = []
+    for idx in structures_indices:
+        structures_to_retrain.append(prediction_df[prediction_df[structure_index] == idx])
+    return structures_to_retrain
+
+
+def extract_target_region(structure_df: pd.DataFrame,
+                          extraction_radius: float,
+                          evaluation_criteria: str = 'nbh_grades') -> pd.DataFrame:
+    """Extract the atom with the worst evaluation criteria and all the atoms within a distance extraction_radious.
+
+    Args:
+        structure_df: dataframe with the atomic positions and the evaluation criteria (e.g. MaxVol value)
+        extraction_radius: include all atoms within this distance of the targeted atom
+        evaluation_criteria: name of the evaluation criteria. Defaults to nbh_grades (maxvol in MTP)
+
+    Returns:
+        dataframe with the atomic coordinates in columns x, y, z
+    """
+    # extract the worst ato and a region around of radius extraction_radius
+    # TODO better method to determine radius: number of atoms ?
+    target_atom = structure_df[evaluation_criteria].idxmax()
+    target_position = structure_df.loc[target_atom][['x', 'y', 'z']]
+    # TODO periodicity...
+    structure_df.loc[:, 'distance_squared'] = structure_df.apply(
+        lambda x: sum([(x[i] - target_position[i]) ** 2 for i in ['x', 'y', 'z']]), axis=1)
+    atom_positions = structure_df.loc[structure_df['distance_squared'] <= extraction_radius ** 2, ['x', 'y', 'z']]
+    return atom_positions
diff --git a/crystal_diffusion/mlip/mtp_train.py b/crystal_diffusion/mlip/mtp_train.py
new file mode 100644
index 00000000..bcf527ed
--- /dev/null
+++ b/crystal_diffusion/mlip/mtp_train.py
@@ -0,0 +1,132 @@
+"""Script to train and evaluate a MTP.
+
+Running the main() runs a debugging example. Entry points are train_mtp.
+"""
+import argparse
+from typing import Dict, Tuple
+
+import pandas as pd
+from sklearn.metrics import mean_absolute_error
+
+from crystal_diffusion.mlip.mtp_utils import (MTPInputs,
+                                              crawl_lammps_directory,
+                                              prepare_mtp_inputs_from_lammps)
+from crystal_diffusion.models.mlip.mtp import MTPWithMLIP3
+
+atom_dict = {1: 'Si'}
+
+
+def prepare_dataset(root_data_dir: str, atom_dict: Dict[int, str], mode: str = "train") -> MTPInputs:
+    """Prepare the dataset in a given directory into a MTP format.
+
+    Args:
+        root_data_dir: root data directory to parse
+        atom_dict: map between an index and an atom type
+        mode: type of data to look for. e.g. train will only look at the folders in root_data_dirwith "train" in their
+            name. Defaults to train.
+
+    Returns:
+        data in the MTPInputs dataclass
+    """
+    lammps_outputs, thermo_outputs = crawl_lammps_directory(root_data_dir, mode)
+    mtp_dataset = prepare_mtp_inputs_from_lammps(lammps_outputs, thermo_outputs, atom_dict)
+    return mtp_dataset
+
+
+def train_mtp(train_inputs: MTPInputs, mlip_folder_path: str, save_dir: str) -> MTPWithMLIP3:
+    """Create and train an MTP potential.
+
+    Args:
+        train_inputs: inputs for training. Should contain structure, energies and forces
+        mlip_folder_path: path to MLIP-3 folder
+        save_dir: path to directory where to save the fitted model
+
+    Returns:
+       dataframe with original and predicted energies and forces.
+    """
+    # TODO more kwargs for MTP training. See maml and mlip-3 documentation.
+    # create MTP
+    mtp = MTPWithMLIP3(mlip_path=mlip_folder_path)
+    # train
+    mtp.train(
+        train_structures=train_inputs.structure,
+        train_energies=train_inputs.energy,
+        train_forces=train_inputs.forces,
+        train_stresses=None,
+        max_dist=5,
+        stress_weight=0,
+        fitted_mtp_savedir=save_dir,
+    )
+
+    return mtp
+
+
+def evaluate_mtp(eval_inputs: MTPInputs, mtp: MTPWithMLIP3) -> Tuple[pd.DataFrame, pd.DataFrame]:
+    """Evaluate a trained MTP potential.
+
+    Args:
+        eval_inputs: inputs to evaluate. Should contain structure, energies and forces
+        mtp: trained MTP potential.
+
+    Returns:
+       dataframe with original and predicted energies and forces.
+    """
+    # evaluate
+    df_orig, df_predict = mtp.evaluate(
+        test_structures=eval_inputs.structure,
+        test_energies=eval_inputs.energy,
+        test_forces=eval_inputs.forces,
+        test_stresses=None,
+    )
+    return df_orig, df_predict
+
+
+def get_metrics_from_pred(df_orig: pd.DataFrame, df_predict: pd.DataFrame) -> Tuple[float, float]:
+    """Get mean absolute error on energy and forces from the outputs of MTP.
+
+    Args:
+        df_orig: dataframe with ground truth values
+        df_predict: dataframe with MTP predictions
+
+    Returns:
+        MAE on energy in eV/atom and MAE on forces in eV/Å
+    """
+    # from demo in maml
+    # get a single predicted energy per structure
+    predicted_energy = df_predict.groupby('structure_index').agg({'energy': 'mean', 'atom_index': 'count'})
+    # normalize by number of atoms
+    predicted_energy = (predicted_energy['energy'] / predicted_energy['atom_index']).to_numpy()
+    # same for ground truth
+    gt_energy = df_orig.groupby('structure_index').agg({'energy': 'mean', 'atom_index': 'count'})
+    gt_energy = (gt_energy['energy'] / gt_energy['atom_index']).to_numpy()
+
+    predicted_forces = (df_predict[['fx', 'fy', 'fz']].to_numpy().flatten())
+    gt_forces = (df_orig[['fx', 'fy', 'fz']].to_numpy().flatten())
+
+    return mean_absolute_error(predicted_energy, gt_energy), mean_absolute_error(predicted_forces, gt_forces)
+
+
+def main():
+    """Train and evaluate an example for MTP."""
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--lammps_yaml', help='path to LAMMPS yaml file', required=True, nargs='+')
+    parser.add_argument('--lammps_thermo', help='path to LAMMPS thermo output', required=True, nargs='+')
+    parser.add_argument('--mlip_dir', help='directory to MLIP compilation folder', required=True)
+    parser.add_argument('--output_dir', help='path to folder where outputs will be saved', required=True)
+    args = parser.parse_args()
+
+    lammps_yaml = args.lammps_yaml
+    lammps_thermo_yaml = args.lammps_thermo
+    assert len(lammps_yaml) == len(lammps_thermo_yaml), "LAMMPS outputs yaml should match thermodynamics output."
+
+    mtp_inputs = prepare_mtp_inputs_from_lammps(lammps_yaml, lammps_thermo_yaml, atom_dict)
+    mtp = train_mtp(mtp_inputs, args.mlip_dir, args.output_dir)
+    print("Training is done")
+    df_orig, df_predict = evaluate_mtp(mtp_inputs, mtp)
+    energy_mae, force_mae = get_metrics_from_pred(df_orig, df_predict)
+    print(f"MAE of training energy prediction is {energy_mae * 1000} meV/atom")
+    print(f"MAE of training force prediction is {force_mae} eV/Å")
+
+
+if __name__ == '__main__':
+    main()
diff --git a/crystal_diffusion/train_mtp.py b/crystal_diffusion/mlip/mtp_utils.py
similarity index 52%
rename from crystal_diffusion/train_mtp.py
rename to crystal_diffusion/mlip/mtp_utils.py
index 59d65bf3..d2cbf298 100644
--- a/crystal_diffusion/train_mtp.py
+++ b/crystal_diffusion/mlip/mtp_utils.py
@@ -1,9 +1,7 @@
-"""Script to train and evaluate a MTP.
-
-Running the main() runs a debugging example. Entry points are train_mtp and evaluate_mtp.
-"""
-import argparse
-from typing import Any, Dict, List, NamedTuple, Tuple
+import os
+import re
+from dataclasses import dataclass
+from typing import Any, Dict, List, Optional, Tuple
 
 import numpy as np
 import pandas as pd
@@ -11,25 +9,30 @@
 from pymatgen.core import Structure
 from sklearn.metrics import mean_absolute_error
 
-from crystal_diffusion.models.mtp import MTPWithMLIP3
 
-atom_dict = {1: 'Si'}
+@dataclass(kw_only=True)
+class MTPInputs:
+    """Create a dataclass to train or evaluate a MTP model."""
+    structure: List[Structure]
+    forces: List[List[List[float]]]  # num samples x num atoms x spatial dimension
+    energy: List[float]
 
 
-def extract_structure_and_forces_from_file(filename: str, atom_dict: Dict[int, Any]) -> \
-        Tuple[List[Structure], List[List[float]]]:
+def extract_structure_and_forces_from_file(filename: str, atom_dict: Dict[int, Any], forces_avail: bool = True) -> \
+        Tuple[List[Structure], Optional[List[List[float]]]]:
     """Convert LAMMPS yaml output in a format compatible with MTP training and evaluation methods.
 
     Args:
         filename: path to LAMMPS output file in yaml format
         atom_dict: mapping from LAMMPS atom indices to atom type (atomic number as int or atom name as str)
+        forces_avail (optional): if True, get the forces from the LAMMPS output file. Defaults to True.
 
     Returns:
         list of pymatgen Structure containing the atoms and their positions
-        list of forces (n x 3) for each atom
+        list of forces (n x 3) for each atom. None if forces_avail is False
     """
     structures = []
-    forces = []
+    forces = [] if forces_avail else None
     with (open(filename, 'r') as f):
         l_yaml = yaml.safe_load_all(f)
         for d in l_yaml:  # loop over LAMMPS outputs and convert in pymatgen Structure objects
@@ -47,9 +50,10 @@ def extract_structure_and_forces_from_file(filename: str, atom_dict: Dict[int, A
                                      coords=coords,
                                      coords_are_cartesian=True)
             structures.append(pm_structure)
-            force_idx = [d['keywords'].index(x) for x in ['fx', 'fy', 'fz']]
-            structure_forces = [[x[i] for i in force_idx] for x in d['data']]
-            forces.append(structure_forces)
+            if forces_avail:
+                force_idx = [d['keywords'].index(x) for x in ['fx', 'fy', 'fz']]
+                structure_forces = [[x[i] for i in force_idx] for x in d['data']]
+                forces.append(structure_forces)
     return structures, forces
 
 
@@ -70,27 +74,21 @@ def extract_energy_from_thermo_log(filename: str) -> List[float]:
     return energies
 
 
-class MTP_Inputs(NamedTuple):
-    """Create a namedtuple instance for MTP inputs."""
-
-    structure: List[Structure]
-    forces: List[List[float]]
-    energy: List[float]
-
-
 def prepare_mtp_inputs_from_lammps(output_yaml: List[str],
                                    thermo_yaml: List[str],
-                                   atom_dict: Dict[int, Any]
-                                   ) -> MTP_Inputs:
+                                   atom_dict: Dict[int, Any],
+                                   get_forces: bool = True,
+                                   ) -> MTPInputs:
     """Convert a list of LAMMPS output files and thermodynamic output files to MTP input format.
 
     Args:
         output_yaml: list of LAMMPS output files as yaml.
         thermo_yaml: list of LAMMPS thermodynamic output files as yaml.
         atom_dict: mapping of LAMMPS indices to atom type.
+        get_forces (optional): if True, get the forces. Defaults to True.
 
     Returns:
-        namedtuple with structure, energies and forces usable by MTP.
+        dataclass used as inputs to train and evaluation a MTP model
     """
     mtp_inputs = {
         'structure': [],
@@ -98,63 +96,55 @@ def prepare_mtp_inputs_from_lammps(output_yaml: List[str],
         'forces': []
     }
     for filename in output_yaml:
-        structures, forces = extract_structure_and_forces_from_file(filename, atom_dict)
+        structures, forces = extract_structure_and_forces_from_file(filename, atom_dict, get_forces)
         mtp_inputs['structure'] += structures
-        mtp_inputs['forces'] += forces
+        mtp_inputs['forces'] += forces  # will be None if get_forces is False
     for filename in thermo_yaml:
         mtp_inputs['energy'] += extract_energy_from_thermo_log(filename)
-    mtp_inputs = MTP_Inputs(structure=mtp_inputs['structure'],
-                            energy=mtp_inputs['energy'],
-                            forces=mtp_inputs['forces'])
+    mtp_inputs = MTPInputs(structure=mtp_inputs['structure'],
+                           energy=mtp_inputs['energy'],
+                           forces=mtp_inputs['forces'])
     return mtp_inputs
 
 
-def train_mtp(train_inputs: MTP_Inputs, mlip_folder_path: str, save_dir: str) -> MTPWithMLIP3:
-    """Create and train an MTP potential.
+def crawl_lammps_directory(folder_name: str, folder_name_pattern: str = "train") -> Tuple[List[str], List[str]]:
+    """Crawl through a folder and find the LAMMPS output files in folders containing a specified pattern in their name.
+
+    LAMMPS outputs should end with dump.yaml and Thermondynamics variables files should end with thermo.yaml
 
     Args:
-        train_inputs: inputs for training. Should contain structure, energies and forces
-        mlip_folder_path: path to MLIP-3 folder
-        save_dir: path to directory where to save the fitted model
+        folder_name: folder to crawl
+        folder_name_pattern (optional): name of the subfolder to keep. Defaults to train.
 
     Returns:
-       dataframe with original and predicted energies and forces.
-    """
-    # TODO more kwargs for MTP training. See maml and mlip-3 documentation.
-    # create MTP
-    mtp = MTPWithMLIP3(mlip_path=mlip_folder_path)
-    # train
-    mtp.train(
-        train_structures=train_inputs.structure,
-        train_energies=train_inputs.energy,
-        train_forces=train_inputs.forces,
-        train_stresses=None,
-        max_dist=5,
-        stress_weight=0,
-        fitted_mtp_savedir=save_dir,
-    )
+        list of LAMMPS dump outputs and list of LAMMPS thermo outputs
 
-    return mtp
+    """
+    assert os.path.exists(folder_name), "Invalid folder name provided."
+    lammps_output_files, thermo_output_files = [], []
+    for dirpath, _, filenames in os.walk(folder_name):
+        if re.search(folder_name_pattern, dirpath):
+            lammps_output_files.extend([os.path.join(dirpath, f) for f in filenames if f.endswith("dump.yaml")])
+            thermo_output_files.extend([os.path.join(dirpath, f) for f in filenames if f.endswith("thermo.yaml")])
+    return lammps_output_files, thermo_output_files
 
 
-def evaluate_mtp(eval_inputs: MTP_Inputs, mtp: MTPWithMLIP3) -> Tuple[pd.DataFrame, pd.DataFrame]:
-    """Evaluate a trained MTP potential.
+def concat_mtp_inputs(input1: MTPInputs, input2: MTPInputs) -> MTPInputs:
+    """Merge two MTP inputs data class.
 
     Args:
-        eval_inputs: inputs to evaluate. Should contain structure, energies and forces
-        mtp: trained MTP potential.
+        input1: first MTPInputs dataset
+        input2: second MTPInputs dataset
 
     Returns:
-       dataframe with original and predicted energies and forces.
+        concatenated MTPInputs dataset
     """
-    # evaluate
-    df_orig, df_predict = mtp.evaluate(
-        test_structures=eval_inputs.structure,
-        test_energies=eval_inputs.energy,
-        test_forces=eval_inputs.forces,
-        test_stresses=None,
+    concat_inputs = MTPInputs(
+        structure=input1.structure + input2.structure,
+        forces=input1.forces + input2.forces,
+        energy=input1.energy + input2.energy
     )
-    return df_orig, df_predict
+    return concat_inputs
 
 
 def get_metrics_from_pred(df_orig: pd.DataFrame, df_predict: pd.DataFrame) -> Tuple[float, float]:
@@ -180,29 +170,3 @@ def get_metrics_from_pred(df_orig: pd.DataFrame, df_predict: pd.DataFrame) -> Tu
     gt_forces = (df_orig[['fx', 'fy', 'fz']].to_numpy().flatten())
 
     return mean_absolute_error(predicted_energy, gt_energy), mean_absolute_error(predicted_forces, gt_forces)
-
-
-def main():
-    """Train and evaluate an example for MTP."""
-    parser = argparse.ArgumentParser()
-    parser.add_argument('--lammps_yaml', help='path to LAMMPS yaml file', required=True, nargs='+')
-    parser.add_argument('--lammps_thermo', help='path to LAMMPS thermo output', required=True, nargs='+')
-    parser.add_argument('--mlip_dir', help='directory to MLIP compilation folder', required=True)
-    parser.add_argument('--output_dir', help='path to folder where outputs will be saved', required=True)
-    args = parser.parse_args()
-
-    lammps_yaml = args.lammps_yaml
-    lammps_thermo_yaml = args.lammps_thermo
-    assert len(lammps_yaml) == len(lammps_thermo_yaml), "LAMMPS outputs yaml should match thermodynamics output."
-
-    mtp_inputs = prepare_mtp_inputs_from_lammps(lammps_yaml, lammps_thermo_yaml, atom_dict)
-    mtp = train_mtp(mtp_inputs, args.mlip_dir, args.output_dir)
-    print("Training is done")
-    df_orig, df_predict = evaluate_mtp(mtp_inputs, mtp)
-    energy_mae, force_mae = get_metrics_from_pred(df_orig, df_predict)
-    print(f"MAE of training energy prediction is {energy_mae * 1000} meV/atom")
-    print(f"MAE of training force prediction is {force_mae} eV/Å")
-
-
-if __name__ == '__main__':
-    main()
diff --git a/crystal_diffusion/models/mtp.py b/crystal_diffusion/models/mlip/mtp.py
similarity index 56%
rename from crystal_diffusion/models/mtp.py
rename to crystal_diffusion/models/mlip/mtp.py
index 67570495..bc822cf6 100644
--- a/crystal_diffusion/models/mtp.py
+++ b/crystal_diffusion/models/mlip/mtp.py
@@ -10,6 +10,7 @@
 import shutil
 import subprocess
 from collections import defaultdict
+from dataclasses import dataclass
 from typing import Any, Dict, List, Optional, TextIO, Tuple
 
 import numpy as np
@@ -20,30 +21,51 @@
 from monty.tempfile import ScratchDir
 from pymatgen.core import Structure
 
+from crystal_diffusion.mlip.mtp_utils import (MTPInputs, concat_mtp_inputs,
+                                              crawl_lammps_directory,
+                                              prepare_mtp_inputs_from_lammps)
+
+
+@dataclass(kw_only=True)
+class MTPArguments:
+    """Arguments to train an MTP with the MLIP3 library."""
+    mlip_path: str  # path to MLIP3 library
+    name: Optional[str] = None  # MTP
+    param: Optional[Dict[Any, Any]] = None
+    unfitted_mtp: str = "08.almtp"  # Define the initial mtp file. Default to 08g.amltp
+    fitted_mtp_savedir: str = '../'  # save directory for the fitted MTP. Defaults to '../' (current wd)
+    max_dist: float = 5  # The actual radial cutoff. Defaults to 5.
+    radial_basis_size: int = 8  # Relevant to number of radial basis function. Defaults to 8.
+    max_iter: int = 1000  # The number of maximum iteration. Defaults to 1000.
+    energy_weight: float = 1  # The weight of energy. Defaults to 1
+    force_weight: float = 1e-2  # The weight of forces. Defaults to 1e-2
+    stress_weight: float = 1e-3  # The weight of stresses. Zero-weight can be assigned. Defaults to 1e-3.
+    init_params: str = "same"  # how to initialize parameters if a potential was not pre-fitted: "same" or "random".
+    scale_by_force: float = 0  # If > 0 then configurations near equilibrium get more weight. Defaults to 0.
+    bfgs_conv_tol: float = 1e-3  # Stop training if error dropped by a factor smaller than this over 50 BFGS iterations.
+    weighting: str = "vibration"  # How to weight configuration with different sizes relative to each other.
+    # Choose from "vibrations", "molecules" and "structures". Defaults to "vibration".
+
 
 class MTPWithMLIP3(MTPotential):
     """MTP with MLIP-3."""
 
-    def __init__(self,
-                 mlip_path: str,
-                 name: Optional[str] = None,
-                 param: Optional[Dict[Any, Any]] = None,
-                 version: Optional[str] = None):
+    def __init__(self, mtp_args: MTPArguments):
         """Modifications to maml.apps.pes._mtp.MTPotential to be compatible with mlip-3.
 
         Args:
-            mlip_path: path to mlip3 library
-            name: MTPotential argument
-            param: MTPotential argument
-            version: MTPotential argument
+            mtp_args: MTP arguments from the class MTPArguments
         """
-        super().__init__(name, param, version)
-        self.mlp_command = os.path.join(mlip_path, "build", "mlp")
+        super().__init__(mtp_args.name, mtp_args.param)
+        self.mlp_command = os.path.join(mtp_args.mlip_path, "build", "mlp")
         assert os.path.exists(self.mlp_command), "mlp command not found in mlip-3 build folder"
-        self.mlp_templates = os.path.join(mlip_path, "MTP_templates")
+        self.mlp_templates = os.path.join(mtp_args.mlip_path, "MTP_templates")
         assert os.path.exists(self.mlp_templates), "MTP templates not found in mlip-3 folder"
         self.fitted_mtp = None
         self.elements = None
+        self.mtp_args = mtp_args
+        self.savedir = mtp_args.fitted_mtp_savedir
+        os.makedirs(self.savedir, exist_ok=True)
 
     def to_lammps_format(self):
         """Write the trained MTP in a LAMMPS compatible format."""
@@ -60,44 +82,51 @@ def to_lammps_format(self):
         pass
 
     def evaluate(self,
-                 test_structures: List[Structure],
-                 test_energies: List[float],
-                 test_forces: List[List[float]],
-                 test_stresses: Optional[List[List[float]]] = None,
+                 dataset: MTPInputs,
+                 mlip_name: str = 'mtp_fitted.almtp'
                  ) -> Tuple[pd.DataFrame, pd.DataFrame]:
         """Evaluate energies, forces, stresses and MaxVol gamma factor of structures with trained MTP.
 
         Args:
-            test_structures: evaluation set of pymatgen Structure Objects.
-            test_energies: list of total energies of each structure to evaluation in test_structures list.
-            test_forces: list of calculated (m, 3) forces of each evaluation structure with m atoms in structures list.
-                m can be varied with each single structure case.
-            test_stresses (optional): list of calculated (6, ) virial stresses of each evaluation structure in
-                test_structures list. If None, do not evaluate on stresses. Default to None.
+            dataset: MTPInputs dataclass with the following elements:
+                structures: The list of Pymatgen Structure object.
+                energies: List of total energies of each structure in structures list.
+                forces: List of (m, 3) forces array of each structure with m atoms in structures list.
+                    m can be varied with each single structure case.
+            mlip_name: str : filename for the trained MTP. Defaults to mtp_fitted.almtp
 
         Returns:
             dataframe with ground truth energies, forces
             dataframe with predicted energies, forces, MaxVol gamma (nbh grades)
         """
-        if self.fitted_mtp is None:
-            raise AttributeError('MTP was not trained. Please call train() before evaluate().')
+        if not mlip_name.endswith('.almtp'):
+            mlip_name += '.almtp'
+        assert os.path.exists(mlip_name), f"Trained MTP does not exists: {mlip_name}"
 
         original_file = "original.cfgs"
         predict_file = "predict.cfgs"
-        test_structures, test_forces, test_stresses = check_structures_forces_stresses(
-            test_structures, test_forces, test_stresses
+
+        # TODO if forces are not available...
+        test_structures, test_forces, _ = check_structures_forces_stresses(
+            dataset.structure, dataset.forces, None
         )
-        predict_pool = pool_from(test_structures, test_energies, test_forces, test_stresses)
+        predict_pool = pool_from(test_structures, dataset.energy, test_forces)
+        local_mtp_name = "mtp.almtp"
 
         with ScratchDir("."):  # mlip needs a tmp_work_dir - we will manually copy relevant outputs elsewhere
             # write the structures to evaluate in a mlp compatible format
             original_file = self.write_cfg(original_file, cfg_pool=predict_pool)
+            # TODO how to handle when GT is not available
             df_orig = self.read_cfgs(original_file, nbh_grade=False)  # read original values as a DataFrame
 
+            # copy the trained mtp in the scratchdir
+            shutil.copyfile(mlip_name, os.path.join(os.getcwd(), local_mtp_name))
             # calculate_grade is the method to get the forces, energy & maxvol values
-            cmd = [self.mlp_command, "calculate_grade", self.fitted_mtp, original_file, predict_file]
+            cmd = [self.mlp_command, "calculate_grade", local_mtp_name, original_file, predict_file]
             predict_file += '.0'  # added by mlp...
             stdout, rc = self._call_mlip(cmd)
+
+            # check that MTP was called properly
             if rc != 0:
                 error_msg = f"mlp exited with return code {rc}"
                 msg = stdout.decode("utf-8").split("\n")[:-1]
@@ -107,7 +136,7 @@ def evaluate(self,
                 except Exception:
                     error_msg += msg[-1]
                 raise RuntimeError(error_msg)
-
+            # read the config
             df_predict = self.read_cfgs(predict_file, nbh_grade=True)
         return df_orig, df_predict
 
@@ -235,72 +264,108 @@ def _call_cmd_to_stdout(cmd: List[str], output_file: TextIO):
         with subprocess.Popen(cmd, stdout=output_file) as p:
             p.communicate()[0]
 
-    def train(
-            self,
-            train_structures: List[Structure],
-            train_energies: List[float],
-            train_forces: List[List[float]],
-            train_stresses: Optional[List[List[float]]] = None,
-            unfitted_mtp: str = "08.almtp",
-            fitted_mtp_savedir: str = '../',
-            max_dist: float = 5,
-            radial_basis_size: int = 8,
-            max_iter: int = 1000,  # TODO check the next kwargs in mlip3
-            energy_weight: float = 1,
-            force_weight: float = 1e-2,
-            stress_weight: float = 1e-3,
-            init_params: str = "same",
-            scale_by_force: float = 0,
-            bfgs_conv_tol: float = 1e-3,
-            weighting: str = "vibration",
-    ) -> int:
+    @staticmethod
+    def prepare_dataset_from_lammps(
+            root_data_dir: str,
+            atom_dict: Dict[int, str],
+            mode: str = "train",
+            get_forces: bool = True,
+    ) -> MTPInputs:
+        """Get the LAMMPS in a folder and organize them as inputs for a MTP.
+
+        Args:
+            root_data_dir: folder to read. Each LAMMPS sample is expected to be in a subfolder.
+            atom_dict: map from LAMMPS index to atom name. e.g. {1: 'Si'}
+            mode: subset of samples to get. Data from root_data_dir/*mode*/ folders will be parsed. Defaults to train.
+            get_forces: if True, get the forces from the samples. Defaults to True.
+
+        Returns:
+            inputs for MTP in the MTPInputs dataclass
+        """
+        lammps_outputs, thermo_outputs = crawl_lammps_directory(root_data_dir, mode)
+        mtp_dataset = prepare_mtp_inputs_from_lammps(lammps_outputs, thermo_outputs, atom_dict, get_forces=get_forces)
+        return mtp_dataset
+
+    @staticmethod
+    def prepare_dataset_from_numpy(
+            cartesian_positions: np.ndarray,
+            box: np.ndarray,
+            forces: np.ndarray,
+            energy: float,
+            atom_type: np.ndarray,
+            atom_dict: Dict[int, str] = {1: 'Si'}
+    ) -> MTPInputs:
+        """Convert numpy array variables to a format compatible with MTP.
+
+        Args:
+            cartesian_positions: atomic positions in Angstrom as a (n_atom, 3) array.
+            box: unit cell description as a (3, 3) array.
+            forces: forces on each atom as a (n_atom, 3) array
+            energy: energy of the configuration
+            atom_type: indices for each atom in the structure as a (n_atom,) array
+            atom_dict: map between atom indices and atom types
+
+        Returns:
+            data formatted at an input for MTP.
+        """
+        structure = Structure(
+            lattice=box,
+            species=[atom_dict[x] for x in atom_type],
+            coords=cartesian_positions,
+            coords_are_cartesian=True
+        )
+        forces = forces.tolist()  # from Nx3 np array to a list of length N where each element is a list of 3 forces
+        return MTPInputs(structure=[structure], forces=[forces], energy=[energy])
+
+    @staticmethod
+    def merge_inputs(mtp_inputs: List[MTPInputs]) -> MTPInputs:
+        """Merge a list of MTPInputs in a single MTPInputs.
+
+        Args:
+            mtp_inputs: list of MTPInputs
+
+        Returns:
+            merged MTPInputs
+        """
+        merged_inputs = MTPInputs(structure=[], forces=[], energy=[])
+        for x in mtp_inputs:
+            merged_inputs = concat_mtp_inputs(merged_inputs, x)
+        return merged_inputs
+
+    def train(self, dataset: MTPInputs, mlip_name: str = 'mtp_fitted.almtp') -> str:
         """Training data with moment tensor method using MLIP-3.
 
         Override the base class method.
 
         Args:
-            train_structures: The list of Pymatgen Structure object.
-            train_energies: List of total energies of each structure in structures list.
-            train_forces: List of (m, 3) forces array of each structure with m atoms in structures list.
-                m can be varied with each single structure case.
-            train_stresses (optional): List of (6, ) virial stresses of each structure in structures list.
-                Defaults to None.
-            unfitted_mtp (optional): Define the initial mtp file. Default to 08g.amltp
-            fitted_mtp_savedir (optional): save directory for the fitted MTP. Defaults to '../' (current wd)
-            max_dist (optional): The actual radial cutoff. Defaults to 5.
-            radial_basis_size (optional): Relevant to number of radial basis function. Defaults to 8.
-            max_iter (optional): The number of maximum iteration. Defaults to 1000.
-            energy_weight (optional): The weight of energy. Defaults to 1
-            force_weight (optional): The weight of forces. Defaults to 1e-2
-            stress_weight (optional): The weight of stresses. Zero-weight can be assigned. Defaults to 1e-3.
-            init_params (optional): How to initialize parameters if a potential was not
-                pre-fitted. Choose from "same" and "random". Defaults to "same".
-            scale_by_force (optional): If >0 then configurations near equilibrium
-               (with roughly force < scale_by_force) get more weight. Defaults to 0.
-            bfgs_conv_tol (optional): Stop training if error dropped by a factor smaller than this
-                over 50 BFGS iterations. Defaults to 1e-3.
-            weighting (optional): How to weight configuration with different sizes relative to each other.
-                Choose from "vibrations", "molecules" and "structures". Defaults to "vibration".
+            dataset: MTPInputs dataclass with the following elements:
+                structures: The list of Pymatgen Structure object.
+                energies: List of total energies of each structure in structures list.
+                forces: List of (m, 3) forces array of each structure with m atoms in structures list.
+                    m can be varied with each single structure case.
+            mlip_name: str : filename for the trained MTP. Defaults to mtp_fitted.almtp
 
         Returns:
-            rc : return code of the mlp training script
+            fitted_mtp: path to the fitted MTP
         """
         train_structures, train_forces, train_stresses = check_structures_forces_stresses(
-            train_structures, train_forces, train_stresses
+            dataset.structure, dataset.forces, None
         )
-        train_pool = pool_from(train_structures, train_energies, train_forces, train_stresses)
+        # last argument is for stresses - not used currently
+        train_pool = pool_from(train_structures, dataset.energy, train_forces)
+
         elements = sorted(set(itertools.chain(*[struct.species for struct in train_structures])))
         self.elements = [str(element) for element in elements]  # TODO move to __init__
 
         atoms_filename = "train.cfgs"
 
-        with (ScratchDir(".")):  # create a tmpdir - deleted afterwards
+        with ((ScratchDir("."))):  # create a tmpdir - deleted afterwards
             atoms_filename = self.write_cfg(filename=atoms_filename, cfg_pool=train_pool)
 
-            if not unfitted_mtp:
+            if not self.mtp_args.unfitted_mtp:
                 raise RuntimeError("No specific parameter file provided.")
-            mtp_file_path = os.path.join(self.mlp_templates, unfitted_mtp)
-            shutil.copyfile(mtp_file_path, os.path.join(os.getcwd(), unfitted_mtp))
+            mtp_file_path = os.path.join(self.mlp_templates, self.mtp_args.unfitted_mtp)
+            shutil.copyfile(mtp_file_path, os.path.join(os.getcwd(), self.mtp_args.unfitted_mtp))
             commands = [self.mlp_command, "mindist", atoms_filename]
             with open("min_dist", "w") as f:
                 self._call_cmd_to_stdout(commands, f)
@@ -311,23 +376,26 @@ def train(
             # split_symbol = "="  # different for mlip-2 (":") and mlip-3 ("=")
             # min_dist = float(lines[-1].split(split_symbol)[1])
 
-            save_fitted_mtp = ".".join([unfitted_mtp.split(".")[0] + "_fitted", unfitted_mtp.split(".")[1]])
+            save_fitted_mtp = mlip_name
+            if not save_fitted_mtp.endswith('.almtp'):
+                save_fitted_mtp += '.almtp'
+
             cmds_list = [
                 self.mlp_command,
                 "train",
-                unfitted_mtp,
+                self.mtp_args.unfitted_mtp,
                 atoms_filename,
                 f"--save_to={save_fitted_mtp}",
-                f"--iteration_limit={max_iter}",
+                f"--iteration_limit={self.mtp_args.max_iter}",
                 "--al_mode=nbh",  # active learning mode - required to get extrapolation grade
-                # f"--curr-pot-name={unfitted_mtp}",  # TODO check those kwargs
-                # f"--energy-weight={energy_weight}",
-                # f"--force-weight={force_weight}",
-                # f"--stress-weight={stress_weight}",
-                # f"--init-params={init_params}",
-                # f"--scale-by-force={scale_by_force}",
-                # f"--bfgs-conv-tol={bfgs_conv_tol}",
-                # f"--weighting={weighting}",
+                f"--curr-pot-name={self.mtp_args.unfitted_mtp}",
+                f"--energy-weight={self.mtp_args.energy_weight}",
+                f"--force-weight={self.mtp_args.force_weight}",
+                f"--stress-weight={self.mtp_args.stress_weight}",
+                f"--init-params={self.mtp_args.init_params}",
+                f"--scale-by-force={self.mtp_args.scale_by_force}",
+                f"--bfgs-conv-tol={self.mtp_args.bfgs_conv_tol}",
+                f"--weighting={self.mtp_args.weighting}",
             ]
             stdout, rc = self._call_mlip(cmds_list)
             if rc != 0:
@@ -340,6 +408,6 @@ def train(
                     error_msg += msg[-1]
                 raise RuntimeError(error_msg)
             # copy the fitted mtp outside the working directory
-            self.fitted_mtp = os.path.join(fitted_mtp_savedir, save_fitted_mtp)
+            self.fitted_mtp = os.path.join(self.savedir, save_fitted_mtp)
             shutil.copyfile(save_fitted_mtp, self.fitted_mtp)
-        return rc
+        return self.fitted_mtp
diff --git a/examples/config_files/active_learning_loop/config.yaml b/examples/config_files/active_learning_loop/config.yaml
new file mode 100644
index 00000000..04e70faf
--- /dev/null
+++ b/examples/config_files/active_learning_loop/config.yaml
@@ -0,0 +1,43 @@
+active_learning_data:
+  _target_: crystal_diffusion.active_learning_loop.activelearning_dataclasses.ActiveLearningDataArguments
+  training_data_dir: /Users/simonb/ic-collab/courtois_collab/crystal_diffusion/data/al_baseline_dev/v1/
+  evaluation_data_dir: /Users/simonb/ic-collab/courtois_collab/crystal_diffusion/data/al_baseline_dev/v1/
+  output_dir: /Users/simonb/ic-collab/courtois_collab/crystal_diffusion/experiments/benchmark/debug/
+
+mlip:
+  _target_: crystal_diffusion.models.mlip.mtp.MTPWithMLIP3
+  mtp_args:
+    _target_: crystal_diffusion.models.mlip.mtp.MTPArguments
+    mlip_path: /Users/simonb/ic-collab/courtois_collab/crystal_diffusion/mlip-3/  # path to MLIP3 library
+    name: None  # MTP
+    param: None
+    unfitted_mtp: 08.almtp  # Define the initial mtp file. Default to 08g.amltp
+    fitted_mtp_savedir: /Users/simonb/ic-collab/courtois_collab/crystal_diffusion/experiments/benchmark/debug/mtp/  # save directory for the fitted MTP. Defaults to '../' (current wd)
+    max_dist: 5.0  # The actual radial cutoff. Defaults to 5.
+    radial_basis_size: 8  # Relevant to number of radial basis function. Defaults to 8.
+    max_iter: 1000  # The number of maximum iteration. Defaults to 1000.
+    energy_weight: 1.0  # The weight of energy. Defaults to 1
+    force_weight: 1e-2  # The weight of forces. Defaults to 1e-2
+    stress_weight: 1e-3  # The weight of stresses. Zero-weight can be assigned. Defaults to 1e-3.
+    init_params: same  # how to initialize parameters if a potential was not pre-fitted: "same" or "random".
+    scale_by_force: 0  # If > 0 then configurations near equilibrium get more weight. Defaults to 0.
+    bfgs_conv_tol: 1e-3  # Stop training if error dropped by a factor smaller than this over 50 BFGS iterations.
+    weighting: vibration  # How to weight configuration with different sizes relative to each other.
+    # Choose from "vibrations", "molecules" and "structures". Defaults to "vibration".
+
+structure_evaluation:
+  _target_: crystal_diffusion.active_learning_loop.activelearning_dataclasses.StructureEvaluationArguments
+  evaluation_criteria: 'nbh_grades'  # maxvol coefficient name in the mtp outputs
+  criteria_threshold: 10.0  # atoms with a MaxVol value under this value are considered bad
+  # number_of_structures: None  # alternatively, take the N worst samples from the evaluation set
+  extraction_radius: 3  # extract atoms within this radius from the worst atom in a structure
+
+repainting_model:
+  _target_: crystal_diffusion.active_learning_loop.activelearning_dataclasses.RepaintingArguments
+  model: dev_dummy  # placeholder for development
+
+oracle:
+  _target_: crystal_diffusion.active_learning_loop.oracle.LAMMPS_for_active_learning
+
+initial_df_save_path: /Users/simonb/ic-collab/courtois_collab/crystal_diffusion/experiments/active_learning_benchmark/debug/before_tuning.csv
+updated_df_save_path: /Users/simonb/ic-collab/courtois_collab/crystal_diffusion/experiments/active_learning_benchmark/debug/after_tuning.csv
\ No newline at end of file
diff --git a/examples/local/active_learning/run_benchmark.sh b/examples/local/active_learning/run_benchmark.sh
new file mode 100644
index 00000000..20c65901
--- /dev/null
+++ b/examples/local/active_learning/run_benchmark.sh
@@ -0,0 +1,5 @@
+#!/bin/bash
+
+CONFIG=../../config_files/active_learning_loop/config.yaml
+
+python ../../../crystal_diffusion/active_learning_loop/benchmark.py --config $CONFIG
\ No newline at end of file
diff --git a/experiments/active_learning_benchmark/config/mtp_training.yaml b/experiments/active_learning_benchmark/config/mtp_training.yaml
new file mode 100644
index 00000000..e6ab83b0
--- /dev/null
+++ b/experiments/active_learning_benchmark/config/mtp_training.yaml
@@ -0,0 +1,40 @@
+active_learning_data:
+  _target_: crystal_diffusion.active_learning_loop.activelearning_dataclasses.ActiveLearningDataArguments
+  training_data_dir: /Users/simonb/ic-collab/courtois_collab/crystal_diffusion/data/al_baseline_dev/v1/
+  evaluation_data_dir: /Users/simonb/ic-collab/courtois_collab/crystal_diffusion/data/al_baseline_dev/v1/
+  output_dir: /Users/simonb/ic-collab/courtois_collab/crystal_diffusion/experiments/benchmark/debug/
+
+mlip:
+  _target_: crystal_diffusion.models.mlip.mtp.MTPWithMLIP3
+  mtp_args:
+    _target_: crystal_diffusion.models.mlip.mtp.MTPArguments
+    mlip_path: /Users/simonb/ic-collab/courtois_collab/crystal_diffusion/mlip-3/  # path to MLIP3 library
+    name: None  # MTP
+    param: None
+    unfitted_mtp: 08.almtp  # Define the initial mtp file. Default to 08g.amltp
+    fitted_mtp_savedir: /Users/simonb/ic-collab/courtois_collab/crystal_diffusion/experiments/benchmark/debug/mtp/  # save directory for the fitted MTP. Defaults to '../' (current wd)
+    max_dist: 5.0  # The actual radial cutoff. Defaults to 5.
+    radial_basis_size: 8  # Relevant to number of radial basis function. Defaults to 8.
+    max_iter: 1000  # The number of maximum iteration. Defaults to 1000.
+    energy_weight: 1.0  # The weight of energy. Defaults to 1
+    force_weight: 1e-2  # The weight of forces. Defaults to 1e-2
+    stress_weight: 1e-3  # The weight of stresses. Zero-weight can be assigned. Defaults to 1e-3.
+    init_params: same  # how to initialize parameters if a potential was not pre-fitted: "same" or "random".
+    scale_by_force: 0  # If > 0 then configurations near equilibrium get more weight. Defaults to 0.
+    bfgs_conv_tol: 1e-3  # Stop training if error dropped by a factor smaller than this over 50 BFGS iterations.
+    weighting: vibration  # How to weight configuration with different sizes relative to each other.
+    # Choose from "vibrations", "molecules" and "structures". Defaults to "vibration".
+
+structure_evaluation:
+  _target_: crystal_diffusion.active_learning_loop.activelearning_dataclasses.StructureEvaluationArguments
+  evaluation_criteria: 'nbh_grades'  # maxvol coefficient name in the mtp outputs
+  criteria_threshold: 10.0  # atoms with a MaxVol value under this value are considered bad
+  # number_of_structures: None  # alternatively, take the N worst samples from the evaluation set
+  extraction_radius: 3  # extract atoms within this radius from the worst atom in a structure
+
+repainting_model:
+  _target_: crystal_diffusion.active_learning_loop.activelearning_dataclasses.RepaintingArguments
+  model: dev_dummy  # placeholder for development
+
+oracle:
+  _target_: crystal_diffusion.active_learning_loop.oracle.LAMMPS_for_active_learning
diff --git a/requirements.txt b/requirements.txt
index 46421893..dd631c5b 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,6 +2,7 @@ datasets==2.17.1
 flake8==4.0.1
 flake8-docstrings==1.6.0
 gitpython==3.1.27
+hydra-core==1.3.2
 isort==5.13.2
 jupyter==1.0.0
 jinja2==3.1.2
@@ -38,3 +39,4 @@ einops==0.8.0
 torchode==0.2.0
 torchsde==0.2.6
 ovito==3.10.6.post2
+
diff --git a/tests/active_learning_loop/test_benchmark.py b/tests/active_learning_loop/test_benchmark.py
new file mode 100644
index 00000000..ac712970
--- /dev/null
+++ b/tests/active_learning_loop/test_benchmark.py
@@ -0,0 +1,147 @@
+import os
+from unittest.mock import MagicMock, mock_open
+
+import pandas as pd
+import pytest
+
+from crystal_diffusion.active_learning_loop.benchmark import ActiveLearningLoop
+
+
+class TestActiveLearningLoop:
+    @pytest.fixture
+    def mock_yaml_config(self):
+        return """
+        active_learning_data:
+            key1: value1
+        mlip:
+            key2: value2
+        structure_evaluation:
+            key3: value3
+        repainting_model:
+            key4: value4
+        oracle:
+            key5: value5
+        """
+
+    @pytest.fixture
+    def meta_config(self):  # mock a path to a meta_config yaml file
+        return "fake_config.yaml"
+
+    @pytest.fixture
+    def mock_al_loop(self, mocker, mock_yaml_config, meta_config):
+        # Mock the open function to simulate reading the YAML file
+        mocker.patch("builtins.open", mock_open(read_data=mock_yaml_config))
+        # Mock os.path.exists to always return True
+        mocker.patch("os.path.exists", return_value=True)
+        # Mock the instantiate function from hydra.utils
+        mock_instantiate = mocker.patch("crystal_diffusion.active_learning_loop.benchmark.instantiate")
+        mock_instantiate.side_effect = lambda x: x  # Return the config itself for simplicity
+
+        # Create an instance of ActiveLearningLoop
+        loop = ActiveLearningLoop(meta_config)
+        return loop
+
+    def test_parse_config(self, mock_al_loop, mock_yaml_config, meta_config):
+
+        # Assertions to verify that the attributes were correctly set
+        assert mock_al_loop.data_paths == {'key1': 'value1'}
+        assert mock_al_loop.mlip_model == {'key2': 'value2'}
+        assert mock_al_loop.eval_config == {'key3': 'value3'}
+        assert mock_al_loop.structure_generation == {'key4': 'value4'}
+        assert mock_al_loop.oracle == {'key5': 'value5'}
+
+        # Verify that the file was opened and the path was checked
+        open.assert_called_once_with(meta_config, 'r')
+        os.path.exists.assert_called_once_with(meta_config)
+
+    def test_train_mlip(self, mocker, mock_yaml_config, mock_al_loop):
+        # Mocking the mlip_model's methods
+        mock_mlip_model = MagicMock()
+        mock_mlip_model.prepare_dataset_from_lammps.return_value = "mock_training_set"
+        mock_mlip_model.train.return_value = "mock_trained_mlip_model"
+        mock_mlip_model.merge_inputs.return_value = "mock_training_set"
+
+        # Inject the mocked mlip_model into the loop instance
+        mock_al_loop.mlip_model = mock_mlip_model
+        mock_al_loop.data_paths = MagicMock(training_data_dir="mock_training_data_dir")
+
+        # Run the train_mlip method without providing a training_set
+        result = mock_al_loop.train_mlip(round=1)
+
+        # Verify the methods were called with expected parameters
+        mock_mlip_model.prepare_dataset_from_lammps.assert_called_once_with(
+            root_data_dir="mock_training_data_dir",
+            atom_dict=mock_al_loop.atom_dict,
+            mode="train"
+        )
+
+        mock_mlip_model.train.assert_called_once_with("mock_training_set", mlip_name="mlip_round_1")
+
+        # Verify the trained model path is correctly returned
+        assert result == "mock_trained_mlip_model"
+
+        # Verify that the trained model is appended to the history
+        assert mock_al_loop.trained_mlips == ["mock_trained_mlip_model"]
+
+        # Test when a training set is provided
+        custom_training_set = "custom_training_set"
+        result = mock_al_loop.train_mlip(round=2, training_set=custom_training_set)
+
+        # The prepare_dataset_from_lammps should not be called since we provided a training_set
+        mock_mlip_model.prepare_dataset_from_lammps.assert_called_once()  # No new call
+        mock_mlip_model.train.assert_called_with(custom_training_set, mlip_name="mlip_round_2")
+
+        assert result == "mock_trained_mlip_model"
+        assert mock_al_loop.trained_mlips == ["mock_trained_mlip_model", "mock_trained_mlip_model"]
+
+    def test_evaluate_mlip(self, mock_al_loop, tmpdir):
+        # Mocking the mlip_model's methods
+        mock_mlip_model = MagicMock()
+        mock_evaluation_dataset = "mock_evaluation_dataset"
+        mock_prediction_df = pd.DataFrame({"atom_index": [0, 1], "force": [1.0, 2.0]})
+
+        # Mocking return values for the prepare_dataset_from_lammps and evaluate methods
+        mock_mlip_model.prepare_dataset_from_lammps.return_value = mock_evaluation_dataset
+        mock_mlip_model.evaluate.return_value = (None, mock_prediction_df)
+
+        loop = mock_al_loop
+
+        # Inject the mocked mlip_model into the loop instance
+        loop.mlip_model = mock_mlip_model
+        loop.data_paths = MagicMock(evaluation_data_dir="mock_evaluation_data_dir")
+
+        # Run the evaluate_mlip method without specifying mlip_name
+        result_df = loop.evaluate_mlip(round=1)
+
+        # Verify the prepare_dataset_from_lammps method was called with expected parameters
+        mock_mlip_model.prepare_dataset_from_lammps.assert_called_once_with(
+            root_data_dir="mock_evaluation_data_dir",
+            atom_dict=loop.atom_dict,
+            mode="evaluation",
+            get_forces=True
+        )
+        # Verify the evaluate method was called with the correct parameters
+        expected_mlip_name = os.path.join(mock_mlip_model.savedir, 'mlip_round_1.almtp')
+        mock_mlip_model.evaluate.assert_called_once_with(mock_evaluation_dataset, mlip_name=expected_mlip_name)
+
+        # Verify the method returns the correct dataframe
+        pd.testing.assert_frame_equal(result_df, mock_prediction_df)
+
+        # Run the evaluate_mlip method with a custom mlip_name
+        custom_mlip_name = "custom_mlip.almtp"
+        result_df = loop.evaluate_mlip(round=2, mlip_name=custom_mlip_name)
+
+        # The evaluate method should be called with the custom mlip_name
+        mock_mlip_model.evaluate.assert_called_with(mock_evaluation_dataset, mlip_name=custom_mlip_name)
+
+        pd.testing.assert_frame_equal(result_df, mock_prediction_df)
+
+        # Test without forces_available
+        _ = loop.evaluate_mlip(round=3, forces_available=False)
+
+        mock_mlip_model.prepare_dataset_from_lammps.assert_called_with(
+            root_data_dir="mock_evaluation_data_dir",
+            atom_dict=loop.atom_dict,
+            mode="evaluation",
+            get_forces=False
+        )
diff --git a/tests/models/test_mtp.py b/tests/models/test_mtp.py
index 5431dd15..13a661ae 100644
--- a/tests/models/test_mtp.py
+++ b/tests/models/test_mtp.py
@@ -8,10 +8,11 @@
 from pymatgen.core import Structure
 from sklearn.metrics import mean_absolute_error
 
-from crystal_diffusion.models.mtp import MTPWithMLIP3
-from crystal_diffusion.train_mtp import (
-    extract_energy_from_thermo_log, extract_structure_and_forces_from_file,
-    get_metrics_from_pred, prepare_mtp_inputs_from_lammps)
+from crystal_diffusion.mlip.mtp_utils import (
+    MTPInputs, extract_energy_from_thermo_log,
+    extract_structure_and_forces_from_file, get_metrics_from_pred,
+    prepare_mtp_inputs_from_lammps)
+from crystal_diffusion.models.mlip.mtp import MTPArguments, MTPWithMLIP3
 
 
 class FakeStructure:
@@ -38,15 +39,15 @@ def mock_popen(mocker):
 
 
 # Mock the external dependencies and method calls within the MTPWithMLIP3.train method
-def test_train(mocker, mock_popen):
+def test_train(mocker, mock_popen, tmpdir):
     # Mock os.path.exists to always return True
     mocker.patch("os.path.exists", return_value=True)
 
     # Mock check_structures_forces_stresses to return a value without needing real input
-    mocker.patch("crystal_diffusion.models.mtp.check_structures_forces_stresses", side_effect=passthrough)
+    mocker.patch("crystal_diffusion.models.mlip.mtp.check_structures_forces_stresses", side_effect=passthrough)
 
     # Mock pool_from to return a simplified pool object
-    mocker.patch("crystal_diffusion.models.mtp.pool_from", return_value="simple_pool_object")
+    mocker.patch("crystal_diffusion.models.mlip.mtp.pool_from", return_value="simple_pool_object")
 
     # Mock self.write_cfg to simulate creating a config file without file operations
     mocker.patch.object(MTPWithMLIP3, "write_cfg", return_value="mock_filename.cfg")
@@ -54,19 +55,23 @@ def test_train(mocker, mock_popen):
     mocker.patch("shutil.copyfile", return_value=None)
 
     # Initialize MTPWithMLIP3 with mock parameters
-    model = MTPWithMLIP3(mlip_path="/mock/path", name="test_model")
+    mtp_args = MTPArguments(
+        mlip_path="/mock/path",
+        name="test_model",
+        unfitted_mtp="08.almtp",
+        fitted_mtp_savedir=tmpdir
+    )
+    model = MTPWithMLIP3(mtp_args)
     # Call the train method
+    mtp_inputs = MTPInputs(
+        structure=[FakeStructure(['H', 'O']), FakeStructure(['Si'])],
+        forces=[],
+        energy=[1, 2]
+    )
 
-    return_code = model.train(
-        train_structures=[FakeStructure(['H', 'O']), FakeStructure(['Si'])],
-        train_energies=[1, 2],
-        train_forces=[],
-        train_stresses=[],
-        unfitted_mtp="08.almtp",
-        fitted_mtp_savedir="/mock/dir"
+    _ = model.train(
+        mtp_inputs,
     )
-    # Assert the expected results
-    assert return_code == 0  # The train method should return the mocked subprocess success return code from mock_open
 
     # Assert that mocked methods were called
     model.write_cfg.assert_called()
@@ -91,19 +96,17 @@ def mtp_instance(mocker):
     return instance
 
 
-# def test_evaluate(mocker, fake_structure, mtp_instance, mock_popen):
 def test_evaluate(mocker, fake_structure, mtp_instance, mock_popen):
     test_structures = [fake_structure]
     test_energies = [1.0]
     test_forces = [[[0, 0, 0]]]
-    test_stresses = None  # or appropriate mock stresses
 
     # Mock check_structures_forces_stresses to return the arguments unmodified
-    mocker.patch("crystal_diffusion.models.mtp.check_structures_forces_stresses",
+    mocker.patch("crystal_diffusion.models.mlip.mtp.check_structures_forces_stresses",
                  side_effect=lambda s, f, st: (s, f, st))
 
     # Mock pool_from to return a mocked value
-    mocker.patch("crystal_diffusion.models.mtp.pool_from", return_value="mock_pool")
+    mocker.patch("crystal_diffusion.models.mlip.mtp.pool_from", return_value="mock_pool")
 
     # Mock self.write_cfg to simulate creating a config file without file operations
     mocker.patch.object(MTPWithMLIP3, "write_cfg", return_value="mock_filename.cfg")
@@ -116,8 +119,14 @@ def test_evaluate(mocker, fake_structure, mtp_instance, mock_popen):
     mocker.patch("shutil.copyfile", return_value=None)
     mocker.patch("os.path.exists", return_value=True)
 
+    mtp_inputs = MTPInputs(
+        structure=test_structures,
+        forces=test_forces,
+        energy=test_energies
+    )
+
     # Perform the test
-    df_orig, df_predict = mtp_instance.evaluate(test_structures, test_energies, test_forces, test_stresses)
+    df_orig, df_predict = mtp_instance.evaluate(mtp_inputs)
 
     # Assertions can vary based on the real output of `read_cfgs`
     # Here's an example assertion assuming `read_cfgs` returns a string in this mocked scenario
@@ -209,12 +218,13 @@ def test_extract_energy_from_thermo_log(tmpdir):
 
 @pytest.fixture
 def mock_extract_energy_from_thermo_log(mocker):
-    return mocker.patch('crystal_diffusion.train_mtp.extract_energy_from_thermo_log', return_value=[])
+    return mocker.patch('crystal_diffusion.mlip.mtp_utils.extract_energy_from_thermo_log', return_value=[])
 
 
 @pytest.fixture
 def mock_extract_structure_and_forces(mocker):
-    return mocker.patch('crystal_diffusion.train_mtp.extract_structure_and_forces_from_file', return_value=([], []))
+    return mocker.patch('crystal_diffusion.mlip.mtp_utils.extract_structure_and_forces_from_file',
+                        return_value=([], []))
 
 
 def test_prepare_mtp_inputs_from_lammps(mock_extract_structure_and_forces, mock_extract_energy_from_thermo_log, tmpdir):
@@ -230,7 +240,7 @@ def test_prepare_mtp_inputs_from_lammps(mock_extract_structure_and_forces, mock_
 
     # Verify that the mocks were called correctly
     assert mock_extract_structure_and_forces.call_count == 2
-    mock_extract_structure_and_forces.assert_called_with(output_yaml_files[1], atom_dict)
+    mock_extract_structure_and_forces.assert_called_with(output_yaml_files[1], atom_dict, True)
 
     assert mock_extract_energy_from_thermo_log.call_count == 2
     mock_extract_energy_from_thermo_log.assert_called_with(thermo_yaml_files[1])