diff --git a/README_active_learning.md b/README_active_learning.md new file mode 100644 index 00000000..89f9f177 --- /dev/null +++ b/README_active_learning.md @@ -0,0 +1 @@ +Instructions to run the active learning benchmark. \ No newline at end of file diff --git a/crystal_diffusion/active_learning_loop/activelearning_dataclasses.py b/crystal_diffusion/active_learning_loop/activelearning_dataclasses.py new file mode 100644 index 00000000..c5333d3c --- /dev/null +++ b/crystal_diffusion/active_learning_loop/activelearning_dataclasses.py @@ -0,0 +1,24 @@ +from dataclasses import dataclass + + +@dataclass(kw_only=True) +class ActiveLearningDataArguments: + """Paths to the training, validaition datasets and output directory.""" + training_data_dir: str # training data directory + evaluation_data_dir: str # evaluation data directory + output_dir: str # directory where to save the results + + +@dataclass(kw_only=True) +class StructureEvaluationArguments: + """Parameters related to the MLIP evaluation.""" + evaluation_criteria: str = 'nbh_grades' + criteria_threshold: float = 10 + number_of_structures: int = None + extraction_radius: float = 3 + + +@dataclass(kw_only=True) +class RepaintingArguments: + """Parameters related to the structure generation model.""" + model: str = 'dev_dummy' diff --git a/crystal_diffusion/active_learning_loop/benchmark.py b/crystal_diffusion/active_learning_loop/benchmark.py new file mode 100644 index 00000000..7b231d3a --- /dev/null +++ b/crystal_diffusion/active_learning_loop/benchmark.py @@ -0,0 +1,306 @@ +import argparse +import os +from typing import Any, List, Optional, Tuple + +import numpy as np +import pandas as pd +import yaml +from hydra.utils import instantiate + +from crystal_diffusion.active_learning_loop.utils import ( + extract_target_region, get_structures_for_retraining) +from crystal_diffusion.models.mlip.mtp import MTPWithMLIP3 + + +class ActiveLearningLoop: + """Method to train, evaluate and fine-tune a MLIP.""" + def __init__(self, + meta_config: str, + ): + """Active learning benchmark. + + Includes methods to train & evaluate a MLIP, isolate bad sub-structures, repaint new structures and retrain + the MLIP. + + Args: + meta_config: path to a yaml configuration with the parameters for the modules in the class + """ + assert os.path.exists(meta_config), "configuration file for active learning loop does not exist." + # define the modules in the __init__ function + self.data_paths, self.mlip_model, self.eval_config, self.structure_generation = None, None, None, None + self.oracle = None + # use hydra to convert the yaml into modules and other data classes + self.parse_config(meta_config) + self.atom_dict = {1: "Si"} # TODO this should be define somewhere smart + self.trained_mlips = [] # history of trained MLIPs (optional - not sure if we should keep this) + self.training_sets = [] # history of training sets + + def parse_config(self, meta_config: str): + """Read a configuration file and instantiate the different blocks with hydra. + + The configuration file should have the following blocks of parameters: + active_learning_data: dataset paths + mlip_model: MLIP module training parameters + structure_evaluation: identification and isolation of the atomic regions to finetune the MLIP + + Args: + meta_config: path to configuration yaml file + """ + with open(meta_config, 'r') as stream: + meta_config = yaml.load(stream, Loader=yaml.FullLoader) + # paths to the training & evaluation datasets + self.data_paths = instantiate(meta_config['active_learning_data']) + # MLIP model - for example MTP + self.mlip_model = instantiate(meta_config['mlip']) + # parameters to find and isolate the problematic regions in the evaluation dataset + self.eval_config = instantiate(meta_config['structure_evaluation']) + # structure generation module + self.structure_generation = instantiate(meta_config['repainting_model']) + # force labeling module + self.oracle = instantiate(meta_config['oracle']) + + def train_mlip(self, round: int = 1, training_set: Optional[Any] = None) -> str: + """Train a MLIP using the parameters specified in the configuration file. + + Args: + round (optional): current round of training. Used to track now configurations in the training set. A round + includes the initial training and the evaluation process. + training_set (optional): if specified, use this dataset for training. Otherwise, use the dataset from the + paths in the configuration file. Defaults to None. + + Returns: + path to the trained MLIP model + """ + if training_set is None: + if len(self.training_sets) == 0: + self.training_sets = [self.mlip_model.prepare_dataset_from_lammps( + root_data_dir=self.data_paths.training_data_dir, + atom_dict=self.atom_dict, + mode="train" + )] + training_set = self.mlip_model.merge_inputs(self.training_sets) + + trained_mtp = self.mlip_model.train(training_set, mlip_name=f'mlip_round_{round}') + self.trained_mlips.append(trained_mtp) # history of trained MLIPs ... not sure if useful + return trained_mtp + + def evaluate_mlip(self, round: int = 1, mlip_name: Optional[str] = None, forces_available: bool = True + ) -> pd.DataFrame: + """Evaluate a MLIP using the parameters specified in the configuration file. + + Args: + round (optional): current round of training. Defaults to 1. + mlip_name (optional): if not None, use this MTP to evaluate the dataset. + forces_available (optional): if True, get the ground truth forces from the dataset. + + Returns: + dataframe with the atomic indices, positions, forces and evaluation criteria + """ + evaluation_dataset = self.mlip_model.prepare_dataset_from_lammps( + root_data_dir=self.data_paths.evaluation_data_dir, + atom_dict=self.atom_dict, + mode="evaluation", + get_forces=forces_available + ) + # first returned element is the ground truth DF + # TODO make sure this works even if the GT is not available... + if mlip_name is None: + mlip_name = os.path.join(self.mlip_model.savedir, f'mlip_round_{round}.almtp') + _, prediction_df = self.mlip_model.evaluate(evaluation_dataset, mlip_name=mlip_name) + + return prediction_df + + def get_bad_structures(self, prediction_df: pd.DataFrame) -> List[pd.DataFrame]: + """Find the structures with a high uncertainty based on the configuration file parameters. + + Args: + prediction_df: evaluation outputs of the MLIP model. Should contain atomic positions, uncertainty criteria + and structure indices. + + Returns: + list of structures with a high uncertainty criteria. + """ + num_structures = self.eval_config.number_of_structures + structures_to_retrain = get_structures_for_retraining(prediction_df, + criteria_threshold=self.eval_config.criteria_threshold, + number_of_structures=num_structures, + evaluation_criteria=self.eval_config.evaluation_criteria) + return structures_to_retrain + + def excise_worst_atom(self, structures_to_retrain: List[pd.DataFrame]) -> List[pd.DataFrame]: + """For a given structure, isolate the atom with the highest uncertainty criteria. + + Args: + structures_to_retrain: list of dataframes with the atomic positions and evaluate criteria + + Returns: + list of dataframes with only the targeted region + """ + # we assume the extraction region to be a sphere of radius extraction_radius around the worst atoms + # if more than 1 atom are bad in a structure, we only extract the worst + # TODO implement other extraction methods + bad_regions = [extract_target_region(s, + extraction_radius=self.eval_config.extraction_radius, + evaluation_criteria=self.eval_config.evaluation_criteria) + for s in structures_to_retrain] + return bad_regions + + def get_structure_candidate_from_generative_model(self, + fixed_atoms: pd.DataFrame, + number_of_candidates: int = 1 + ) -> pd.DataFrame: + """Generate new structures around the specified fixed atoms. + + Args: + fixed_atoms: dataframe with the atom type, coordinates and unit cell information + number_of_candidates: how many structure to generate. Defaults to 1. + + Returns: + dataframe with the atom type, coordinates and unit cell + + """ + # TODO: call the diffusion model and get number_of_candidates samples with repaint using the fixed_atoms + if self.structure_generation.model == 'dev_dummy': # replace with a wrapper around the diffusion model + # and hydra instantiate + return fixed_atoms + else: + raise NotImplementedError('Only dev_dummy is supported at the moment.') + + def new_structure_to_csv(self, new_structures: List[pd.DataFrame], round: int = 1): + """Save the generated structures in a csv format in the output dir. + + Args: + new_structures: structures proposed by the generative model + round: current round of training. Defaults to 1. + """ + root_data_dir = os.path.join(self.data_paths.output_dir, f'new_structures_round_{round}') + os.makedirs(root_data_dir, exist_ok=True) + for i, new_struc in enumerate(new_structures): + new_struc.to_csv(os.path.join(root_data_dir, f'structure_{i}.csv'), index=False) + + def get_labels_from_oracle(self, round: int = 1) -> Any: + """Compute energy and forces from an oracle such as LAMMPS for the new candidates generated in a round of AL. + + Args: + round (optional): round of retraining. Defaults to 1. + + Returns: + mlip data input (for example, MTPInputs) + """ + new_labeled_samples = [] + for file in os.listdir(os.path.join(self.data_paths.output_dir, f'new_structures_round_{round}')): + if file.endswith('.csv'): + new_labeled_samples.append(self.call_oracle( + os.path.join(self.data_paths.output_dir, f'new_structures_round_{round}', file) + )) + new_labeled_samples = self.mlip_model.merge_inputs(new_labeled_samples) + return new_labeled_samples + + def call_oracle(self, path_to_file: str) -> Any: + """Compute energy and forces for a given atomic structure. + + Args: + path_to_file: path to csv file containing the atomic positions and structure information + + Returns: + mlip data inputs (for example, MTPInputs) + """ + data = pd.read_csv(path_to_file) + cartesian_positions = data[['x', 'y', 'z']].to_numpy() + box = np.eye(3, 3) * 5.43 # TODO this is bad - fix this + atom_type = np.ones(cartesian_positions.shape[0], dtype=np.integer) # TODO also bad + energy, forces = self.oracle(cartesian_positions, box, atom_type) + labels_as_mtp = self.mlip_model.prepare_dataset_from_numpy( + cartesian_positions, + box, + forces, + energy, + atom_type, + ) + return labels_as_mtp + + def round_of_active_learning_loop(self, trained_mlip: Optional[MTPWithMLIP3] = None + ) -> Tuple[pd.DataFrame, pd.DataFrame]: + """Do a full loop of activate learning. + + The following steps are done in sequence: + - train a MLIP from the training set specified in the config file if trained_mlip is not specified + - evaluate the MLIP with the evaluation set specified in the config file + - find the "bad" structures in the evaluation set based on the criteria from the config file + - excise the problematic regions + - generate new candidates based on these regions + - call the oracle to get the labels for the new generated candidates + - retrain the MLIP + - evaluate the MLIP again + + Args: + trained_mlip (optional): if not None, use this MLIP as a starting point. If None, train a MLIP from scratch + using the training data specified in the config file. + + Returns: + dataframe with the MLIP evaluation results before finetuning with the generated structures + dataframe with the MLIP evaluation results after finetuning with the generated structures + """ + # one round from a known mtp (or train from provided training set) + # evaluate, find candidates and update MTP + # return the updated MTP + if trained_mlip is None: + trained_mlip = self.train_mlip() + pred_df = self.evaluate_mlip(mlip_name=trained_mlip) + bad_structures = self.get_bad_structures(pred_df) + bad_regions = self.excise_worst_atom(bad_structures) + new_candidates = [self.get_structure_candidate_from_generative_model(x) for x in bad_regions] + self.new_structure_to_csv(new_candidates) + new_labeled_candidates = self.get_labels_from_oracle() + new_training_set = self.mlip_model.merge_inputs([self.training_sets[-1], new_labeled_candidates]) + self.training_sets.append(new_training_set) + new_mtp = self.train_mlip() + new_pred_df = self.evaluate_mlip(mlip_name=new_mtp) + return pred_df, new_pred_df + + def evaluate_mtp_update(self, original_predictions: pd.DataFrame, updated_predictions) -> Tuple[float, float]: + """Find the evaluation criteria in the original predictions and the corresponding value after retraining. + + Args: + original_predictions: MLIP predictions before retraining + updated_predictions: MLIP predictions after retraining + + Returns: + worst evaluation_criteria (e.g. MaxVol) in the original evaluation + corresponding value after retraining with new samples. Not guaranteed to be the maximum value. + """ + # find the highest MaxVol in the original predictions - identified by the atom index and structure index + # TODO we assume a max - but it could be a min i + criteria = self.eval_config.evaluation_criteria + atom_index, structure_index, original_value = original_predictions.iloc[ + original_predictions[criteria].argmax()][['atom_index', 'structure_index', criteria]] + updated_value = updated_predictions.loc[ + (updated_predictions['atom_index'] == atom_index) + & (updated_predictions['structure_index'] == structure_index), criteria].values.item() + return original_value, updated_value + + +def get_arguments() -> argparse.Namespace: + """Parse arguments. + + Returns: + args: arguments + """ + parser = argparse.ArgumentParser() + parser.add_argument('--config', help='path to data directory', required=True) + args = parser.parse_args() + return args + + +def main(): + """Example to do an active learning loop once.""" + args = get_arguments() + # TODO get mtp_config_path from the args + config_path = args.config + al_loop = ActiveLearningLoop(config_path) + initial_df, new_df = al_loop.round_of_active_learning_loop() + al_loop.evaluate_mtp_update(initial_df, new_df) + + +if __name__ == '__main__': + main() diff --git a/crystal_diffusion/active_learning_loop/oracle.py b/crystal_diffusion/active_learning_loop/oracle.py new file mode 100644 index 00000000..a1893f5c --- /dev/null +++ b/crystal_diffusion/active_learning_loop/oracle.py @@ -0,0 +1,55 @@ +from pathlib import Path +from typing import Dict, Tuple + +import numpy as np + +from crystal_diffusion import DATA_DIR +from crystal_diffusion.oracle.lammps import get_energy_and_forces_from_lammps + + +class LAMMPS_for_active_learning: + """Oracle using LAMMPS to get the energy and forces on atoms.""" + def __init__(self): + """Initialize the class.""" + pass + + def __call__(self, + cartesian_positions: np.ndarray, + box: np.ndarray, + atom_types: np.ndarray, + atom_type_map: Dict[int, str] = {1: 'Si'}, + tmp_work_dir: str = './', + pair_coeff_dir: Path = DATA_DIR) -> Tuple[float, np.ndarray]: + """Call LAMMPS to get energy and forces for a given set of atoms. + + Args: + cartesian_positions: atomic positions as a n_atom x 3 array + box: unit cell definition as a 3x3 array. Assumed to be diagonal. + atom_types: integers defining each atoms as an array of length n_atom + atom_type_map: map between indices and atom type. Defaults to {1: 'Si'} + tmp_work_dir: temporary work directory for LAMMPS. Defaults to ./ + pair_coeff_dir: path to stilinger-weber potential. Defaults to DATA_DIR. + + Returns: + energy and forces on each atom (n_atom x 3) + """ + shifted_positions = self.shift_positions(cartesian_positions, box) + energy, forces = get_energy_and_forces_from_lammps(shifted_positions, box, atom_types, atom_type_map, + tmp_work_dir, pair_coeff_dir) + return energy, forces[['fx', 'fy', 'fz']].to_numpy() + + def shift_positions(self, cartesian_positions: np.ndarray, box: np.ndarray) -> np.ndarray: + """Shift the positions of the atoms so all coordinates are positives. + + This is because LAMMPS will ignore atoms with coordinates outside the [0, a] range (a = size of the unit cell). + + Args: + cartesian_positions: atomic positions (n_atom x 3 array) + box: unit cell (3x3 array) - assumed to be diagonal + + Returns: + array with shifted positions + """ + for i, cell_size in enumerate(np.diag(box)): + cartesian_positions[:, i] = cartesian_positions[:, i] % cell_size + return cartesian_positions diff --git a/crystal_diffusion/active_learning_loop/utils.py b/crystal_diffusion/active_learning_loop/utils.py new file mode 100644 index 00000000..02833f89 --- /dev/null +++ b/crystal_diffusion/active_learning_loop/utils.py @@ -0,0 +1,68 @@ +from typing import List, Optional + +import pandas as pd + + +def get_structures_for_retraining(prediction_df: pd.DataFrame, + criteria_threshold: Optional[float] = None, + number_of_structures: Optional[int] = None, + evaluation_criteria: str = 'nbh_grades', + structure_index: str = 'structure_index' + ) -> List[pd.DataFrame]: + """Find the structures with the worst value of the evaluation criteria. + + Args: + prediction_df: dataframe with the atom positions, forces, uncertainty criteria (e.g. MaxVol coefficient), + indices and structure indices + criteria_threshold: value above which the evaluation_criteria is considered bad. Either this or + number_of_structures should be specified. number_of_structures has priority if both are specified. + Defaults to None. + number_of_structures: number of structures to return. The top number_of_structures with the highest value of + evaluation_criteria are returned. Either this or criteria_threshold should be specified. Defaults to None. + evaluation_criteria: name of the evaluation criteria. Defaults to nbh_grades (MaxVol coefficient in MTP) + structure_index: name of the column in the dataframe with the index identifying the structure. Defaults to + structure_index. + + Returns: + list of the structures with a bad evaluation criteria. Length of the list depends on criteria_threhold and + number_of_structures. + """ + assert criteria_threshold is not None or number_of_structures is not None, \ + "criteria_threshold or number_of_structures should be set." + # get the highest evaluation_criteria for each structure i.e. only the worst atom counts for structure selection + criteria_by_structure = prediction_df[[evaluation_criteria, structure_index]].groupby(structure_index).max() + # find the top number_of_structures + structures_indices = criteria_by_structure.sort_values(by=evaluation_criteria, ascending=False) + if criteria_threshold is not None: + structures_indices = structures_indices[structures_indices[evaluation_criteria] >= criteria_threshold] + structures_indices = structures_indices.index.to_list() + if number_of_structures is not None: + structures_indices = structures_indices[:number_of_structures] + structures_to_retrain = [] + for idx in structures_indices: + structures_to_retrain.append(prediction_df[prediction_df[structure_index] == idx]) + return structures_to_retrain + + +def extract_target_region(structure_df: pd.DataFrame, + extraction_radius: float, + evaluation_criteria: str = 'nbh_grades') -> pd.DataFrame: + """Extract the atom with the worst evaluation criteria and all the atoms within a distance extraction_radious. + + Args: + structure_df: dataframe with the atomic positions and the evaluation criteria (e.g. MaxVol value) + extraction_radius: include all atoms within this distance of the targeted atom + evaluation_criteria: name of the evaluation criteria. Defaults to nbh_grades (maxvol in MTP) + + Returns: + dataframe with the atomic coordinates in columns x, y, z + """ + # extract the worst ato and a region around of radius extraction_radius + # TODO better method to determine radius: number of atoms ? + target_atom = structure_df[evaluation_criteria].idxmax() + target_position = structure_df.loc[target_atom][['x', 'y', 'z']] + # TODO periodicity... + structure_df.loc[:, 'distance_squared'] = structure_df.apply( + lambda x: sum([(x[i] - target_position[i]) ** 2 for i in ['x', 'y', 'z']]), axis=1) + atom_positions = structure_df.loc[structure_df['distance_squared'] <= extraction_radius ** 2, ['x', 'y', 'z']] + return atom_positions diff --git a/crystal_diffusion/mlip/mtp_train.py b/crystal_diffusion/mlip/mtp_train.py new file mode 100644 index 00000000..bcf527ed --- /dev/null +++ b/crystal_diffusion/mlip/mtp_train.py @@ -0,0 +1,132 @@ +"""Script to train and evaluate a MTP. + +Running the main() runs a debugging example. Entry points are train_mtp. +""" +import argparse +from typing import Dict, Tuple + +import pandas as pd +from sklearn.metrics import mean_absolute_error + +from crystal_diffusion.mlip.mtp_utils import (MTPInputs, + crawl_lammps_directory, + prepare_mtp_inputs_from_lammps) +from crystal_diffusion.models.mlip.mtp import MTPWithMLIP3 + +atom_dict = {1: 'Si'} + + +def prepare_dataset(root_data_dir: str, atom_dict: Dict[int, str], mode: str = "train") -> MTPInputs: + """Prepare the dataset in a given directory into a MTP format. + + Args: + root_data_dir: root data directory to parse + atom_dict: map between an index and an atom type + mode: type of data to look for. e.g. train will only look at the folders in root_data_dirwith "train" in their + name. Defaults to train. + + Returns: + data in the MTPInputs dataclass + """ + lammps_outputs, thermo_outputs = crawl_lammps_directory(root_data_dir, mode) + mtp_dataset = prepare_mtp_inputs_from_lammps(lammps_outputs, thermo_outputs, atom_dict) + return mtp_dataset + + +def train_mtp(train_inputs: MTPInputs, mlip_folder_path: str, save_dir: str) -> MTPWithMLIP3: + """Create and train an MTP potential. + + Args: + train_inputs: inputs for training. Should contain structure, energies and forces + mlip_folder_path: path to MLIP-3 folder + save_dir: path to directory where to save the fitted model + + Returns: + dataframe with original and predicted energies and forces. + """ + # TODO more kwargs for MTP training. See maml and mlip-3 documentation. + # create MTP + mtp = MTPWithMLIP3(mlip_path=mlip_folder_path) + # train + mtp.train( + train_structures=train_inputs.structure, + train_energies=train_inputs.energy, + train_forces=train_inputs.forces, + train_stresses=None, + max_dist=5, + stress_weight=0, + fitted_mtp_savedir=save_dir, + ) + + return mtp + + +def evaluate_mtp(eval_inputs: MTPInputs, mtp: MTPWithMLIP3) -> Tuple[pd.DataFrame, pd.DataFrame]: + """Evaluate a trained MTP potential. + + Args: + eval_inputs: inputs to evaluate. Should contain structure, energies and forces + mtp: trained MTP potential. + + Returns: + dataframe with original and predicted energies and forces. + """ + # evaluate + df_orig, df_predict = mtp.evaluate( + test_structures=eval_inputs.structure, + test_energies=eval_inputs.energy, + test_forces=eval_inputs.forces, + test_stresses=None, + ) + return df_orig, df_predict + + +def get_metrics_from_pred(df_orig: pd.DataFrame, df_predict: pd.DataFrame) -> Tuple[float, float]: + """Get mean absolute error on energy and forces from the outputs of MTP. + + Args: + df_orig: dataframe with ground truth values + df_predict: dataframe with MTP predictions + + Returns: + MAE on energy in eV/atom and MAE on forces in eV/Å + """ + # from demo in maml + # get a single predicted energy per structure + predicted_energy = df_predict.groupby('structure_index').agg({'energy': 'mean', 'atom_index': 'count'}) + # normalize by number of atoms + predicted_energy = (predicted_energy['energy'] / predicted_energy['atom_index']).to_numpy() + # same for ground truth + gt_energy = df_orig.groupby('structure_index').agg({'energy': 'mean', 'atom_index': 'count'}) + gt_energy = (gt_energy['energy'] / gt_energy['atom_index']).to_numpy() + + predicted_forces = (df_predict[['fx', 'fy', 'fz']].to_numpy().flatten()) + gt_forces = (df_orig[['fx', 'fy', 'fz']].to_numpy().flatten()) + + return mean_absolute_error(predicted_energy, gt_energy), mean_absolute_error(predicted_forces, gt_forces) + + +def main(): + """Train and evaluate an example for MTP.""" + parser = argparse.ArgumentParser() + parser.add_argument('--lammps_yaml', help='path to LAMMPS yaml file', required=True, nargs='+') + parser.add_argument('--lammps_thermo', help='path to LAMMPS thermo output', required=True, nargs='+') + parser.add_argument('--mlip_dir', help='directory to MLIP compilation folder', required=True) + parser.add_argument('--output_dir', help='path to folder where outputs will be saved', required=True) + args = parser.parse_args() + + lammps_yaml = args.lammps_yaml + lammps_thermo_yaml = args.lammps_thermo + assert len(lammps_yaml) == len(lammps_thermo_yaml), "LAMMPS outputs yaml should match thermodynamics output." + + mtp_inputs = prepare_mtp_inputs_from_lammps(lammps_yaml, lammps_thermo_yaml, atom_dict) + mtp = train_mtp(mtp_inputs, args.mlip_dir, args.output_dir) + print("Training is done") + df_orig, df_predict = evaluate_mtp(mtp_inputs, mtp) + energy_mae, force_mae = get_metrics_from_pred(df_orig, df_predict) + print(f"MAE of training energy prediction is {energy_mae * 1000} meV/atom") + print(f"MAE of training force prediction is {force_mae} eV/Å") + + +if __name__ == '__main__': + main() diff --git a/crystal_diffusion/train_mtp.py b/crystal_diffusion/mlip/mtp_utils.py similarity index 52% rename from crystal_diffusion/train_mtp.py rename to crystal_diffusion/mlip/mtp_utils.py index 59d65bf3..d2cbf298 100644 --- a/crystal_diffusion/train_mtp.py +++ b/crystal_diffusion/mlip/mtp_utils.py @@ -1,9 +1,7 @@ -"""Script to train and evaluate a MTP. - -Running the main() runs a debugging example. Entry points are train_mtp and evaluate_mtp. -""" -import argparse -from typing import Any, Dict, List, NamedTuple, Tuple +import os +import re +from dataclasses import dataclass +from typing import Any, Dict, List, Optional, Tuple import numpy as np import pandas as pd @@ -11,25 +9,30 @@ from pymatgen.core import Structure from sklearn.metrics import mean_absolute_error -from crystal_diffusion.models.mtp import MTPWithMLIP3 -atom_dict = {1: 'Si'} +@dataclass(kw_only=True) +class MTPInputs: + """Create a dataclass to train or evaluate a MTP model.""" + structure: List[Structure] + forces: List[List[List[float]]] # num samples x num atoms x spatial dimension + energy: List[float] -def extract_structure_and_forces_from_file(filename: str, atom_dict: Dict[int, Any]) -> \ - Tuple[List[Structure], List[List[float]]]: +def extract_structure_and_forces_from_file(filename: str, atom_dict: Dict[int, Any], forces_avail: bool = True) -> \ + Tuple[List[Structure], Optional[List[List[float]]]]: """Convert LAMMPS yaml output in a format compatible with MTP training and evaluation methods. Args: filename: path to LAMMPS output file in yaml format atom_dict: mapping from LAMMPS atom indices to atom type (atomic number as int or atom name as str) + forces_avail (optional): if True, get the forces from the LAMMPS output file. Defaults to True. Returns: list of pymatgen Structure containing the atoms and their positions - list of forces (n x 3) for each atom + list of forces (n x 3) for each atom. None if forces_avail is False """ structures = [] - forces = [] + forces = [] if forces_avail else None with (open(filename, 'r') as f): l_yaml = yaml.safe_load_all(f) for d in l_yaml: # loop over LAMMPS outputs and convert in pymatgen Structure objects @@ -47,9 +50,10 @@ def extract_structure_and_forces_from_file(filename: str, atom_dict: Dict[int, A coords=coords, coords_are_cartesian=True) structures.append(pm_structure) - force_idx = [d['keywords'].index(x) for x in ['fx', 'fy', 'fz']] - structure_forces = [[x[i] for i in force_idx] for x in d['data']] - forces.append(structure_forces) + if forces_avail: + force_idx = [d['keywords'].index(x) for x in ['fx', 'fy', 'fz']] + structure_forces = [[x[i] for i in force_idx] for x in d['data']] + forces.append(structure_forces) return structures, forces @@ -70,27 +74,21 @@ def extract_energy_from_thermo_log(filename: str) -> List[float]: return energies -class MTP_Inputs(NamedTuple): - """Create a namedtuple instance for MTP inputs.""" - - structure: List[Structure] - forces: List[List[float]] - energy: List[float] - - def prepare_mtp_inputs_from_lammps(output_yaml: List[str], thermo_yaml: List[str], - atom_dict: Dict[int, Any] - ) -> MTP_Inputs: + atom_dict: Dict[int, Any], + get_forces: bool = True, + ) -> MTPInputs: """Convert a list of LAMMPS output files and thermodynamic output files to MTP input format. Args: output_yaml: list of LAMMPS output files as yaml. thermo_yaml: list of LAMMPS thermodynamic output files as yaml. atom_dict: mapping of LAMMPS indices to atom type. + get_forces (optional): if True, get the forces. Defaults to True. Returns: - namedtuple with structure, energies and forces usable by MTP. + dataclass used as inputs to train and evaluation a MTP model """ mtp_inputs = { 'structure': [], @@ -98,63 +96,55 @@ def prepare_mtp_inputs_from_lammps(output_yaml: List[str], 'forces': [] } for filename in output_yaml: - structures, forces = extract_structure_and_forces_from_file(filename, atom_dict) + structures, forces = extract_structure_and_forces_from_file(filename, atom_dict, get_forces) mtp_inputs['structure'] += structures - mtp_inputs['forces'] += forces + mtp_inputs['forces'] += forces # will be None if get_forces is False for filename in thermo_yaml: mtp_inputs['energy'] += extract_energy_from_thermo_log(filename) - mtp_inputs = MTP_Inputs(structure=mtp_inputs['structure'], - energy=mtp_inputs['energy'], - forces=mtp_inputs['forces']) + mtp_inputs = MTPInputs(structure=mtp_inputs['structure'], + energy=mtp_inputs['energy'], + forces=mtp_inputs['forces']) return mtp_inputs -def train_mtp(train_inputs: MTP_Inputs, mlip_folder_path: str, save_dir: str) -> MTPWithMLIP3: - """Create and train an MTP potential. +def crawl_lammps_directory(folder_name: str, folder_name_pattern: str = "train") -> Tuple[List[str], List[str]]: + """Crawl through a folder and find the LAMMPS output files in folders containing a specified pattern in their name. + + LAMMPS outputs should end with dump.yaml and Thermondynamics variables files should end with thermo.yaml Args: - train_inputs: inputs for training. Should contain structure, energies and forces - mlip_folder_path: path to MLIP-3 folder - save_dir: path to directory where to save the fitted model + folder_name: folder to crawl + folder_name_pattern (optional): name of the subfolder to keep. Defaults to train. Returns: - dataframe with original and predicted energies and forces. - """ - # TODO more kwargs for MTP training. See maml and mlip-3 documentation. - # create MTP - mtp = MTPWithMLIP3(mlip_path=mlip_folder_path) - # train - mtp.train( - train_structures=train_inputs.structure, - train_energies=train_inputs.energy, - train_forces=train_inputs.forces, - train_stresses=None, - max_dist=5, - stress_weight=0, - fitted_mtp_savedir=save_dir, - ) + list of LAMMPS dump outputs and list of LAMMPS thermo outputs - return mtp + """ + assert os.path.exists(folder_name), "Invalid folder name provided." + lammps_output_files, thermo_output_files = [], [] + for dirpath, _, filenames in os.walk(folder_name): + if re.search(folder_name_pattern, dirpath): + lammps_output_files.extend([os.path.join(dirpath, f) for f in filenames if f.endswith("dump.yaml")]) + thermo_output_files.extend([os.path.join(dirpath, f) for f in filenames if f.endswith("thermo.yaml")]) + return lammps_output_files, thermo_output_files -def evaluate_mtp(eval_inputs: MTP_Inputs, mtp: MTPWithMLIP3) -> Tuple[pd.DataFrame, pd.DataFrame]: - """Evaluate a trained MTP potential. +def concat_mtp_inputs(input1: MTPInputs, input2: MTPInputs) -> MTPInputs: + """Merge two MTP inputs data class. Args: - eval_inputs: inputs to evaluate. Should contain structure, energies and forces - mtp: trained MTP potential. + input1: first MTPInputs dataset + input2: second MTPInputs dataset Returns: - dataframe with original and predicted energies and forces. + concatenated MTPInputs dataset """ - # evaluate - df_orig, df_predict = mtp.evaluate( - test_structures=eval_inputs.structure, - test_energies=eval_inputs.energy, - test_forces=eval_inputs.forces, - test_stresses=None, + concat_inputs = MTPInputs( + structure=input1.structure + input2.structure, + forces=input1.forces + input2.forces, + energy=input1.energy + input2.energy ) - return df_orig, df_predict + return concat_inputs def get_metrics_from_pred(df_orig: pd.DataFrame, df_predict: pd.DataFrame) -> Tuple[float, float]: @@ -180,29 +170,3 @@ def get_metrics_from_pred(df_orig: pd.DataFrame, df_predict: pd.DataFrame) -> Tu gt_forces = (df_orig[['fx', 'fy', 'fz']].to_numpy().flatten()) return mean_absolute_error(predicted_energy, gt_energy), mean_absolute_error(predicted_forces, gt_forces) - - -def main(): - """Train and evaluate an example for MTP.""" - parser = argparse.ArgumentParser() - parser.add_argument('--lammps_yaml', help='path to LAMMPS yaml file', required=True, nargs='+') - parser.add_argument('--lammps_thermo', help='path to LAMMPS thermo output', required=True, nargs='+') - parser.add_argument('--mlip_dir', help='directory to MLIP compilation folder', required=True) - parser.add_argument('--output_dir', help='path to folder where outputs will be saved', required=True) - args = parser.parse_args() - - lammps_yaml = args.lammps_yaml - lammps_thermo_yaml = args.lammps_thermo - assert len(lammps_yaml) == len(lammps_thermo_yaml), "LAMMPS outputs yaml should match thermodynamics output." - - mtp_inputs = prepare_mtp_inputs_from_lammps(lammps_yaml, lammps_thermo_yaml, atom_dict) - mtp = train_mtp(mtp_inputs, args.mlip_dir, args.output_dir) - print("Training is done") - df_orig, df_predict = evaluate_mtp(mtp_inputs, mtp) - energy_mae, force_mae = get_metrics_from_pred(df_orig, df_predict) - print(f"MAE of training energy prediction is {energy_mae * 1000} meV/atom") - print(f"MAE of training force prediction is {force_mae} eV/Å") - - -if __name__ == '__main__': - main() diff --git a/crystal_diffusion/models/mtp.py b/crystal_diffusion/models/mlip/mtp.py similarity index 56% rename from crystal_diffusion/models/mtp.py rename to crystal_diffusion/models/mlip/mtp.py index 67570495..bc822cf6 100644 --- a/crystal_diffusion/models/mtp.py +++ b/crystal_diffusion/models/mlip/mtp.py @@ -10,6 +10,7 @@ import shutil import subprocess from collections import defaultdict +from dataclasses import dataclass from typing import Any, Dict, List, Optional, TextIO, Tuple import numpy as np @@ -20,30 +21,51 @@ from monty.tempfile import ScratchDir from pymatgen.core import Structure +from crystal_diffusion.mlip.mtp_utils import (MTPInputs, concat_mtp_inputs, + crawl_lammps_directory, + prepare_mtp_inputs_from_lammps) + + +@dataclass(kw_only=True) +class MTPArguments: + """Arguments to train an MTP with the MLIP3 library.""" + mlip_path: str # path to MLIP3 library + name: Optional[str] = None # MTP + param: Optional[Dict[Any, Any]] = None + unfitted_mtp: str = "08.almtp" # Define the initial mtp file. Default to 08g.amltp + fitted_mtp_savedir: str = '../' # save directory for the fitted MTP. Defaults to '../' (current wd) + max_dist: float = 5 # The actual radial cutoff. Defaults to 5. + radial_basis_size: int = 8 # Relevant to number of radial basis function. Defaults to 8. + max_iter: int = 1000 # The number of maximum iteration. Defaults to 1000. + energy_weight: float = 1 # The weight of energy. Defaults to 1 + force_weight: float = 1e-2 # The weight of forces. Defaults to 1e-2 + stress_weight: float = 1e-3 # The weight of stresses. Zero-weight can be assigned. Defaults to 1e-3. + init_params: str = "same" # how to initialize parameters if a potential was not pre-fitted: "same" or "random". + scale_by_force: float = 0 # If > 0 then configurations near equilibrium get more weight. Defaults to 0. + bfgs_conv_tol: float = 1e-3 # Stop training if error dropped by a factor smaller than this over 50 BFGS iterations. + weighting: str = "vibration" # How to weight configuration with different sizes relative to each other. + # Choose from "vibrations", "molecules" and "structures". Defaults to "vibration". + class MTPWithMLIP3(MTPotential): """MTP with MLIP-3.""" - def __init__(self, - mlip_path: str, - name: Optional[str] = None, - param: Optional[Dict[Any, Any]] = None, - version: Optional[str] = None): + def __init__(self, mtp_args: MTPArguments): """Modifications to maml.apps.pes._mtp.MTPotential to be compatible with mlip-3. Args: - mlip_path: path to mlip3 library - name: MTPotential argument - param: MTPotential argument - version: MTPotential argument + mtp_args: MTP arguments from the class MTPArguments """ - super().__init__(name, param, version) - self.mlp_command = os.path.join(mlip_path, "build", "mlp") + super().__init__(mtp_args.name, mtp_args.param) + self.mlp_command = os.path.join(mtp_args.mlip_path, "build", "mlp") assert os.path.exists(self.mlp_command), "mlp command not found in mlip-3 build folder" - self.mlp_templates = os.path.join(mlip_path, "MTP_templates") + self.mlp_templates = os.path.join(mtp_args.mlip_path, "MTP_templates") assert os.path.exists(self.mlp_templates), "MTP templates not found in mlip-3 folder" self.fitted_mtp = None self.elements = None + self.mtp_args = mtp_args + self.savedir = mtp_args.fitted_mtp_savedir + os.makedirs(self.savedir, exist_ok=True) def to_lammps_format(self): """Write the trained MTP in a LAMMPS compatible format.""" @@ -60,44 +82,51 @@ def to_lammps_format(self): pass def evaluate(self, - test_structures: List[Structure], - test_energies: List[float], - test_forces: List[List[float]], - test_stresses: Optional[List[List[float]]] = None, + dataset: MTPInputs, + mlip_name: str = 'mtp_fitted.almtp' ) -> Tuple[pd.DataFrame, pd.DataFrame]: """Evaluate energies, forces, stresses and MaxVol gamma factor of structures with trained MTP. Args: - test_structures: evaluation set of pymatgen Structure Objects. - test_energies: list of total energies of each structure to evaluation in test_structures list. - test_forces: list of calculated (m, 3) forces of each evaluation structure with m atoms in structures list. - m can be varied with each single structure case. - test_stresses (optional): list of calculated (6, ) virial stresses of each evaluation structure in - test_structures list. If None, do not evaluate on stresses. Default to None. + dataset: MTPInputs dataclass with the following elements: + structures: The list of Pymatgen Structure object. + energies: List of total energies of each structure in structures list. + forces: List of (m, 3) forces array of each structure with m atoms in structures list. + m can be varied with each single structure case. + mlip_name: str : filename for the trained MTP. Defaults to mtp_fitted.almtp Returns: dataframe with ground truth energies, forces dataframe with predicted energies, forces, MaxVol gamma (nbh grades) """ - if self.fitted_mtp is None: - raise AttributeError('MTP was not trained. Please call train() before evaluate().') + if not mlip_name.endswith('.almtp'): + mlip_name += '.almtp' + assert os.path.exists(mlip_name), f"Trained MTP does not exists: {mlip_name}" original_file = "original.cfgs" predict_file = "predict.cfgs" - test_structures, test_forces, test_stresses = check_structures_forces_stresses( - test_structures, test_forces, test_stresses + + # TODO if forces are not available... + test_structures, test_forces, _ = check_structures_forces_stresses( + dataset.structure, dataset.forces, None ) - predict_pool = pool_from(test_structures, test_energies, test_forces, test_stresses) + predict_pool = pool_from(test_structures, dataset.energy, test_forces) + local_mtp_name = "mtp.almtp" with ScratchDir("."): # mlip needs a tmp_work_dir - we will manually copy relevant outputs elsewhere # write the structures to evaluate in a mlp compatible format original_file = self.write_cfg(original_file, cfg_pool=predict_pool) + # TODO how to handle when GT is not available df_orig = self.read_cfgs(original_file, nbh_grade=False) # read original values as a DataFrame + # copy the trained mtp in the scratchdir + shutil.copyfile(mlip_name, os.path.join(os.getcwd(), local_mtp_name)) # calculate_grade is the method to get the forces, energy & maxvol values - cmd = [self.mlp_command, "calculate_grade", self.fitted_mtp, original_file, predict_file] + cmd = [self.mlp_command, "calculate_grade", local_mtp_name, original_file, predict_file] predict_file += '.0' # added by mlp... stdout, rc = self._call_mlip(cmd) + + # check that MTP was called properly if rc != 0: error_msg = f"mlp exited with return code {rc}" msg = stdout.decode("utf-8").split("\n")[:-1] @@ -107,7 +136,7 @@ def evaluate(self, except Exception: error_msg += msg[-1] raise RuntimeError(error_msg) - + # read the config df_predict = self.read_cfgs(predict_file, nbh_grade=True) return df_orig, df_predict @@ -235,72 +264,108 @@ def _call_cmd_to_stdout(cmd: List[str], output_file: TextIO): with subprocess.Popen(cmd, stdout=output_file) as p: p.communicate()[0] - def train( - self, - train_structures: List[Structure], - train_energies: List[float], - train_forces: List[List[float]], - train_stresses: Optional[List[List[float]]] = None, - unfitted_mtp: str = "08.almtp", - fitted_mtp_savedir: str = '../', - max_dist: float = 5, - radial_basis_size: int = 8, - max_iter: int = 1000, # TODO check the next kwargs in mlip3 - energy_weight: float = 1, - force_weight: float = 1e-2, - stress_weight: float = 1e-3, - init_params: str = "same", - scale_by_force: float = 0, - bfgs_conv_tol: float = 1e-3, - weighting: str = "vibration", - ) -> int: + @staticmethod + def prepare_dataset_from_lammps( + root_data_dir: str, + atom_dict: Dict[int, str], + mode: str = "train", + get_forces: bool = True, + ) -> MTPInputs: + """Get the LAMMPS in a folder and organize them as inputs for a MTP. + + Args: + root_data_dir: folder to read. Each LAMMPS sample is expected to be in a subfolder. + atom_dict: map from LAMMPS index to atom name. e.g. {1: 'Si'} + mode: subset of samples to get. Data from root_data_dir/*mode*/ folders will be parsed. Defaults to train. + get_forces: if True, get the forces from the samples. Defaults to True. + + Returns: + inputs for MTP in the MTPInputs dataclass + """ + lammps_outputs, thermo_outputs = crawl_lammps_directory(root_data_dir, mode) + mtp_dataset = prepare_mtp_inputs_from_lammps(lammps_outputs, thermo_outputs, atom_dict, get_forces=get_forces) + return mtp_dataset + + @staticmethod + def prepare_dataset_from_numpy( + cartesian_positions: np.ndarray, + box: np.ndarray, + forces: np.ndarray, + energy: float, + atom_type: np.ndarray, + atom_dict: Dict[int, str] = {1: 'Si'} + ) -> MTPInputs: + """Convert numpy array variables to a format compatible with MTP. + + Args: + cartesian_positions: atomic positions in Angstrom as a (n_atom, 3) array. + box: unit cell description as a (3, 3) array. + forces: forces on each atom as a (n_atom, 3) array + energy: energy of the configuration + atom_type: indices for each atom in the structure as a (n_atom,) array + atom_dict: map between atom indices and atom types + + Returns: + data formatted at an input for MTP. + """ + structure = Structure( + lattice=box, + species=[atom_dict[x] for x in atom_type], + coords=cartesian_positions, + coords_are_cartesian=True + ) + forces = forces.tolist() # from Nx3 np array to a list of length N where each element is a list of 3 forces + return MTPInputs(structure=[structure], forces=[forces], energy=[energy]) + + @staticmethod + def merge_inputs(mtp_inputs: List[MTPInputs]) -> MTPInputs: + """Merge a list of MTPInputs in a single MTPInputs. + + Args: + mtp_inputs: list of MTPInputs + + Returns: + merged MTPInputs + """ + merged_inputs = MTPInputs(structure=[], forces=[], energy=[]) + for x in mtp_inputs: + merged_inputs = concat_mtp_inputs(merged_inputs, x) + return merged_inputs + + def train(self, dataset: MTPInputs, mlip_name: str = 'mtp_fitted.almtp') -> str: """Training data with moment tensor method using MLIP-3. Override the base class method. Args: - train_structures: The list of Pymatgen Structure object. - train_energies: List of total energies of each structure in structures list. - train_forces: List of (m, 3) forces array of each structure with m atoms in structures list. - m can be varied with each single structure case. - train_stresses (optional): List of (6, ) virial stresses of each structure in structures list. - Defaults to None. - unfitted_mtp (optional): Define the initial mtp file. Default to 08g.amltp - fitted_mtp_savedir (optional): save directory for the fitted MTP. Defaults to '../' (current wd) - max_dist (optional): The actual radial cutoff. Defaults to 5. - radial_basis_size (optional): Relevant to number of radial basis function. Defaults to 8. - max_iter (optional): The number of maximum iteration. Defaults to 1000. - energy_weight (optional): The weight of energy. Defaults to 1 - force_weight (optional): The weight of forces. Defaults to 1e-2 - stress_weight (optional): The weight of stresses. Zero-weight can be assigned. Defaults to 1e-3. - init_params (optional): How to initialize parameters if a potential was not - pre-fitted. Choose from "same" and "random". Defaults to "same". - scale_by_force (optional): If >0 then configurations near equilibrium - (with roughly force < scale_by_force) get more weight. Defaults to 0. - bfgs_conv_tol (optional): Stop training if error dropped by a factor smaller than this - over 50 BFGS iterations. Defaults to 1e-3. - weighting (optional): How to weight configuration with different sizes relative to each other. - Choose from "vibrations", "molecules" and "structures". Defaults to "vibration". + dataset: MTPInputs dataclass with the following elements: + structures: The list of Pymatgen Structure object. + energies: List of total energies of each structure in structures list. + forces: List of (m, 3) forces array of each structure with m atoms in structures list. + m can be varied with each single structure case. + mlip_name: str : filename for the trained MTP. Defaults to mtp_fitted.almtp Returns: - rc : return code of the mlp training script + fitted_mtp: path to the fitted MTP """ train_structures, train_forces, train_stresses = check_structures_forces_stresses( - train_structures, train_forces, train_stresses + dataset.structure, dataset.forces, None ) - train_pool = pool_from(train_structures, train_energies, train_forces, train_stresses) + # last argument is for stresses - not used currently + train_pool = pool_from(train_structures, dataset.energy, train_forces) + elements = sorted(set(itertools.chain(*[struct.species for struct in train_structures]))) self.elements = [str(element) for element in elements] # TODO move to __init__ atoms_filename = "train.cfgs" - with (ScratchDir(".")): # create a tmpdir - deleted afterwards + with ((ScratchDir("."))): # create a tmpdir - deleted afterwards atoms_filename = self.write_cfg(filename=atoms_filename, cfg_pool=train_pool) - if not unfitted_mtp: + if not self.mtp_args.unfitted_mtp: raise RuntimeError("No specific parameter file provided.") - mtp_file_path = os.path.join(self.mlp_templates, unfitted_mtp) - shutil.copyfile(mtp_file_path, os.path.join(os.getcwd(), unfitted_mtp)) + mtp_file_path = os.path.join(self.mlp_templates, self.mtp_args.unfitted_mtp) + shutil.copyfile(mtp_file_path, os.path.join(os.getcwd(), self.mtp_args.unfitted_mtp)) commands = [self.mlp_command, "mindist", atoms_filename] with open("min_dist", "w") as f: self._call_cmd_to_stdout(commands, f) @@ -311,23 +376,26 @@ def train( # split_symbol = "=" # different for mlip-2 (":") and mlip-3 ("=") # min_dist = float(lines[-1].split(split_symbol)[1]) - save_fitted_mtp = ".".join([unfitted_mtp.split(".")[0] + "_fitted", unfitted_mtp.split(".")[1]]) + save_fitted_mtp = mlip_name + if not save_fitted_mtp.endswith('.almtp'): + save_fitted_mtp += '.almtp' + cmds_list = [ self.mlp_command, "train", - unfitted_mtp, + self.mtp_args.unfitted_mtp, atoms_filename, f"--save_to={save_fitted_mtp}", - f"--iteration_limit={max_iter}", + f"--iteration_limit={self.mtp_args.max_iter}", "--al_mode=nbh", # active learning mode - required to get extrapolation grade - # f"--curr-pot-name={unfitted_mtp}", # TODO check those kwargs - # f"--energy-weight={energy_weight}", - # f"--force-weight={force_weight}", - # f"--stress-weight={stress_weight}", - # f"--init-params={init_params}", - # f"--scale-by-force={scale_by_force}", - # f"--bfgs-conv-tol={bfgs_conv_tol}", - # f"--weighting={weighting}", + f"--curr-pot-name={self.mtp_args.unfitted_mtp}", + f"--energy-weight={self.mtp_args.energy_weight}", + f"--force-weight={self.mtp_args.force_weight}", + f"--stress-weight={self.mtp_args.stress_weight}", + f"--init-params={self.mtp_args.init_params}", + f"--scale-by-force={self.mtp_args.scale_by_force}", + f"--bfgs-conv-tol={self.mtp_args.bfgs_conv_tol}", + f"--weighting={self.mtp_args.weighting}", ] stdout, rc = self._call_mlip(cmds_list) if rc != 0: @@ -340,6 +408,6 @@ def train( error_msg += msg[-1] raise RuntimeError(error_msg) # copy the fitted mtp outside the working directory - self.fitted_mtp = os.path.join(fitted_mtp_savedir, save_fitted_mtp) + self.fitted_mtp = os.path.join(self.savedir, save_fitted_mtp) shutil.copyfile(save_fitted_mtp, self.fitted_mtp) - return rc + return self.fitted_mtp diff --git a/examples/config_files/active_learning_loop/config.yaml b/examples/config_files/active_learning_loop/config.yaml new file mode 100644 index 00000000..04e70faf --- /dev/null +++ b/examples/config_files/active_learning_loop/config.yaml @@ -0,0 +1,43 @@ +active_learning_data: + _target_: crystal_diffusion.active_learning_loop.activelearning_dataclasses.ActiveLearningDataArguments + training_data_dir: /Users/simonb/ic-collab/courtois_collab/crystal_diffusion/data/al_baseline_dev/v1/ + evaluation_data_dir: /Users/simonb/ic-collab/courtois_collab/crystal_diffusion/data/al_baseline_dev/v1/ + output_dir: /Users/simonb/ic-collab/courtois_collab/crystal_diffusion/experiments/benchmark/debug/ + +mlip: + _target_: crystal_diffusion.models.mlip.mtp.MTPWithMLIP3 + mtp_args: + _target_: crystal_diffusion.models.mlip.mtp.MTPArguments + mlip_path: /Users/simonb/ic-collab/courtois_collab/crystal_diffusion/mlip-3/ # path to MLIP3 library + name: None # MTP + param: None + unfitted_mtp: 08.almtp # Define the initial mtp file. Default to 08g.amltp + fitted_mtp_savedir: /Users/simonb/ic-collab/courtois_collab/crystal_diffusion/experiments/benchmark/debug/mtp/ # save directory for the fitted MTP. Defaults to '../' (current wd) + max_dist: 5.0 # The actual radial cutoff. Defaults to 5. + radial_basis_size: 8 # Relevant to number of radial basis function. Defaults to 8. + max_iter: 1000 # The number of maximum iteration. Defaults to 1000. + energy_weight: 1.0 # The weight of energy. Defaults to 1 + force_weight: 1e-2 # The weight of forces. Defaults to 1e-2 + stress_weight: 1e-3 # The weight of stresses. Zero-weight can be assigned. Defaults to 1e-3. + init_params: same # how to initialize parameters if a potential was not pre-fitted: "same" or "random". + scale_by_force: 0 # If > 0 then configurations near equilibrium get more weight. Defaults to 0. + bfgs_conv_tol: 1e-3 # Stop training if error dropped by a factor smaller than this over 50 BFGS iterations. + weighting: vibration # How to weight configuration with different sizes relative to each other. + # Choose from "vibrations", "molecules" and "structures". Defaults to "vibration". + +structure_evaluation: + _target_: crystal_diffusion.active_learning_loop.activelearning_dataclasses.StructureEvaluationArguments + evaluation_criteria: 'nbh_grades' # maxvol coefficient name in the mtp outputs + criteria_threshold: 10.0 # atoms with a MaxVol value under this value are considered bad + # number_of_structures: None # alternatively, take the N worst samples from the evaluation set + extraction_radius: 3 # extract atoms within this radius from the worst atom in a structure + +repainting_model: + _target_: crystal_diffusion.active_learning_loop.activelearning_dataclasses.RepaintingArguments + model: dev_dummy # placeholder for development + +oracle: + _target_: crystal_diffusion.active_learning_loop.oracle.LAMMPS_for_active_learning + +initial_df_save_path: /Users/simonb/ic-collab/courtois_collab/crystal_diffusion/experiments/active_learning_benchmark/debug/before_tuning.csv +updated_df_save_path: /Users/simonb/ic-collab/courtois_collab/crystal_diffusion/experiments/active_learning_benchmark/debug/after_tuning.csv \ No newline at end of file diff --git a/examples/local/active_learning/run_benchmark.sh b/examples/local/active_learning/run_benchmark.sh new file mode 100644 index 00000000..20c65901 --- /dev/null +++ b/examples/local/active_learning/run_benchmark.sh @@ -0,0 +1,5 @@ +#!/bin/bash + +CONFIG=../../config_files/active_learning_loop/config.yaml + +python ../../../crystal_diffusion/active_learning_loop/benchmark.py --config $CONFIG \ No newline at end of file diff --git a/experiments/active_learning_benchmark/config/mtp_training.yaml b/experiments/active_learning_benchmark/config/mtp_training.yaml new file mode 100644 index 00000000..e6ab83b0 --- /dev/null +++ b/experiments/active_learning_benchmark/config/mtp_training.yaml @@ -0,0 +1,40 @@ +active_learning_data: + _target_: crystal_diffusion.active_learning_loop.activelearning_dataclasses.ActiveLearningDataArguments + training_data_dir: /Users/simonb/ic-collab/courtois_collab/crystal_diffusion/data/al_baseline_dev/v1/ + evaluation_data_dir: /Users/simonb/ic-collab/courtois_collab/crystal_diffusion/data/al_baseline_dev/v1/ + output_dir: /Users/simonb/ic-collab/courtois_collab/crystal_diffusion/experiments/benchmark/debug/ + +mlip: + _target_: crystal_diffusion.models.mlip.mtp.MTPWithMLIP3 + mtp_args: + _target_: crystal_diffusion.models.mlip.mtp.MTPArguments + mlip_path: /Users/simonb/ic-collab/courtois_collab/crystal_diffusion/mlip-3/ # path to MLIP3 library + name: None # MTP + param: None + unfitted_mtp: 08.almtp # Define the initial mtp file. Default to 08g.amltp + fitted_mtp_savedir: /Users/simonb/ic-collab/courtois_collab/crystal_diffusion/experiments/benchmark/debug/mtp/ # save directory for the fitted MTP. Defaults to '../' (current wd) + max_dist: 5.0 # The actual radial cutoff. Defaults to 5. + radial_basis_size: 8 # Relevant to number of radial basis function. Defaults to 8. + max_iter: 1000 # The number of maximum iteration. Defaults to 1000. + energy_weight: 1.0 # The weight of energy. Defaults to 1 + force_weight: 1e-2 # The weight of forces. Defaults to 1e-2 + stress_weight: 1e-3 # The weight of stresses. Zero-weight can be assigned. Defaults to 1e-3. + init_params: same # how to initialize parameters if a potential was not pre-fitted: "same" or "random". + scale_by_force: 0 # If > 0 then configurations near equilibrium get more weight. Defaults to 0. + bfgs_conv_tol: 1e-3 # Stop training if error dropped by a factor smaller than this over 50 BFGS iterations. + weighting: vibration # How to weight configuration with different sizes relative to each other. + # Choose from "vibrations", "molecules" and "structures". Defaults to "vibration". + +structure_evaluation: + _target_: crystal_diffusion.active_learning_loop.activelearning_dataclasses.StructureEvaluationArguments + evaluation_criteria: 'nbh_grades' # maxvol coefficient name in the mtp outputs + criteria_threshold: 10.0 # atoms with a MaxVol value under this value are considered bad + # number_of_structures: None # alternatively, take the N worst samples from the evaluation set + extraction_radius: 3 # extract atoms within this radius from the worst atom in a structure + +repainting_model: + _target_: crystal_diffusion.active_learning_loop.activelearning_dataclasses.RepaintingArguments + model: dev_dummy # placeholder for development + +oracle: + _target_: crystal_diffusion.active_learning_loop.oracle.LAMMPS_for_active_learning diff --git a/requirements.txt b/requirements.txt index 46421893..dd631c5b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,6 +2,7 @@ datasets==2.17.1 flake8==4.0.1 flake8-docstrings==1.6.0 gitpython==3.1.27 +hydra-core==1.3.2 isort==5.13.2 jupyter==1.0.0 jinja2==3.1.2 @@ -38,3 +39,4 @@ einops==0.8.0 torchode==0.2.0 torchsde==0.2.6 ovito==3.10.6.post2 + diff --git a/tests/active_learning_loop/test_benchmark.py b/tests/active_learning_loop/test_benchmark.py new file mode 100644 index 00000000..ac712970 --- /dev/null +++ b/tests/active_learning_loop/test_benchmark.py @@ -0,0 +1,147 @@ +import os +from unittest.mock import MagicMock, mock_open + +import pandas as pd +import pytest + +from crystal_diffusion.active_learning_loop.benchmark import ActiveLearningLoop + + +class TestActiveLearningLoop: + @pytest.fixture + def mock_yaml_config(self): + return """ + active_learning_data: + key1: value1 + mlip: + key2: value2 + structure_evaluation: + key3: value3 + repainting_model: + key4: value4 + oracle: + key5: value5 + """ + + @pytest.fixture + def meta_config(self): # mock a path to a meta_config yaml file + return "fake_config.yaml" + + @pytest.fixture + def mock_al_loop(self, mocker, mock_yaml_config, meta_config): + # Mock the open function to simulate reading the YAML file + mocker.patch("builtins.open", mock_open(read_data=mock_yaml_config)) + # Mock os.path.exists to always return True + mocker.patch("os.path.exists", return_value=True) + # Mock the instantiate function from hydra.utils + mock_instantiate = mocker.patch("crystal_diffusion.active_learning_loop.benchmark.instantiate") + mock_instantiate.side_effect = lambda x: x # Return the config itself for simplicity + + # Create an instance of ActiveLearningLoop + loop = ActiveLearningLoop(meta_config) + return loop + + def test_parse_config(self, mock_al_loop, mock_yaml_config, meta_config): + + # Assertions to verify that the attributes were correctly set + assert mock_al_loop.data_paths == {'key1': 'value1'} + assert mock_al_loop.mlip_model == {'key2': 'value2'} + assert mock_al_loop.eval_config == {'key3': 'value3'} + assert mock_al_loop.structure_generation == {'key4': 'value4'} + assert mock_al_loop.oracle == {'key5': 'value5'} + + # Verify that the file was opened and the path was checked + open.assert_called_once_with(meta_config, 'r') + os.path.exists.assert_called_once_with(meta_config) + + def test_train_mlip(self, mocker, mock_yaml_config, mock_al_loop): + # Mocking the mlip_model's methods + mock_mlip_model = MagicMock() + mock_mlip_model.prepare_dataset_from_lammps.return_value = "mock_training_set" + mock_mlip_model.train.return_value = "mock_trained_mlip_model" + mock_mlip_model.merge_inputs.return_value = "mock_training_set" + + # Inject the mocked mlip_model into the loop instance + mock_al_loop.mlip_model = mock_mlip_model + mock_al_loop.data_paths = MagicMock(training_data_dir="mock_training_data_dir") + + # Run the train_mlip method without providing a training_set + result = mock_al_loop.train_mlip(round=1) + + # Verify the methods were called with expected parameters + mock_mlip_model.prepare_dataset_from_lammps.assert_called_once_with( + root_data_dir="mock_training_data_dir", + atom_dict=mock_al_loop.atom_dict, + mode="train" + ) + + mock_mlip_model.train.assert_called_once_with("mock_training_set", mlip_name="mlip_round_1") + + # Verify the trained model path is correctly returned + assert result == "mock_trained_mlip_model" + + # Verify that the trained model is appended to the history + assert mock_al_loop.trained_mlips == ["mock_trained_mlip_model"] + + # Test when a training set is provided + custom_training_set = "custom_training_set" + result = mock_al_loop.train_mlip(round=2, training_set=custom_training_set) + + # The prepare_dataset_from_lammps should not be called since we provided a training_set + mock_mlip_model.prepare_dataset_from_lammps.assert_called_once() # No new call + mock_mlip_model.train.assert_called_with(custom_training_set, mlip_name="mlip_round_2") + + assert result == "mock_trained_mlip_model" + assert mock_al_loop.trained_mlips == ["mock_trained_mlip_model", "mock_trained_mlip_model"] + + def test_evaluate_mlip(self, mock_al_loop, tmpdir): + # Mocking the mlip_model's methods + mock_mlip_model = MagicMock() + mock_evaluation_dataset = "mock_evaluation_dataset" + mock_prediction_df = pd.DataFrame({"atom_index": [0, 1], "force": [1.0, 2.0]}) + + # Mocking return values for the prepare_dataset_from_lammps and evaluate methods + mock_mlip_model.prepare_dataset_from_lammps.return_value = mock_evaluation_dataset + mock_mlip_model.evaluate.return_value = (None, mock_prediction_df) + + loop = mock_al_loop + + # Inject the mocked mlip_model into the loop instance + loop.mlip_model = mock_mlip_model + loop.data_paths = MagicMock(evaluation_data_dir="mock_evaluation_data_dir") + + # Run the evaluate_mlip method without specifying mlip_name + result_df = loop.evaluate_mlip(round=1) + + # Verify the prepare_dataset_from_lammps method was called with expected parameters + mock_mlip_model.prepare_dataset_from_lammps.assert_called_once_with( + root_data_dir="mock_evaluation_data_dir", + atom_dict=loop.atom_dict, + mode="evaluation", + get_forces=True + ) + # Verify the evaluate method was called with the correct parameters + expected_mlip_name = os.path.join(mock_mlip_model.savedir, 'mlip_round_1.almtp') + mock_mlip_model.evaluate.assert_called_once_with(mock_evaluation_dataset, mlip_name=expected_mlip_name) + + # Verify the method returns the correct dataframe + pd.testing.assert_frame_equal(result_df, mock_prediction_df) + + # Run the evaluate_mlip method with a custom mlip_name + custom_mlip_name = "custom_mlip.almtp" + result_df = loop.evaluate_mlip(round=2, mlip_name=custom_mlip_name) + + # The evaluate method should be called with the custom mlip_name + mock_mlip_model.evaluate.assert_called_with(mock_evaluation_dataset, mlip_name=custom_mlip_name) + + pd.testing.assert_frame_equal(result_df, mock_prediction_df) + + # Test without forces_available + _ = loop.evaluate_mlip(round=3, forces_available=False) + + mock_mlip_model.prepare_dataset_from_lammps.assert_called_with( + root_data_dir="mock_evaluation_data_dir", + atom_dict=loop.atom_dict, + mode="evaluation", + get_forces=False + ) diff --git a/tests/models/test_mtp.py b/tests/models/test_mtp.py index 5431dd15..13a661ae 100644 --- a/tests/models/test_mtp.py +++ b/tests/models/test_mtp.py @@ -8,10 +8,11 @@ from pymatgen.core import Structure from sklearn.metrics import mean_absolute_error -from crystal_diffusion.models.mtp import MTPWithMLIP3 -from crystal_diffusion.train_mtp import ( - extract_energy_from_thermo_log, extract_structure_and_forces_from_file, - get_metrics_from_pred, prepare_mtp_inputs_from_lammps) +from crystal_diffusion.mlip.mtp_utils import ( + MTPInputs, extract_energy_from_thermo_log, + extract_structure_and_forces_from_file, get_metrics_from_pred, + prepare_mtp_inputs_from_lammps) +from crystal_diffusion.models.mlip.mtp import MTPArguments, MTPWithMLIP3 class FakeStructure: @@ -38,15 +39,15 @@ def mock_popen(mocker): # Mock the external dependencies and method calls within the MTPWithMLIP3.train method -def test_train(mocker, mock_popen): +def test_train(mocker, mock_popen, tmpdir): # Mock os.path.exists to always return True mocker.patch("os.path.exists", return_value=True) # Mock check_structures_forces_stresses to return a value without needing real input - mocker.patch("crystal_diffusion.models.mtp.check_structures_forces_stresses", side_effect=passthrough) + mocker.patch("crystal_diffusion.models.mlip.mtp.check_structures_forces_stresses", side_effect=passthrough) # Mock pool_from to return a simplified pool object - mocker.patch("crystal_diffusion.models.mtp.pool_from", return_value="simple_pool_object") + mocker.patch("crystal_diffusion.models.mlip.mtp.pool_from", return_value="simple_pool_object") # Mock self.write_cfg to simulate creating a config file without file operations mocker.patch.object(MTPWithMLIP3, "write_cfg", return_value="mock_filename.cfg") @@ -54,19 +55,23 @@ def test_train(mocker, mock_popen): mocker.patch("shutil.copyfile", return_value=None) # Initialize MTPWithMLIP3 with mock parameters - model = MTPWithMLIP3(mlip_path="/mock/path", name="test_model") + mtp_args = MTPArguments( + mlip_path="/mock/path", + name="test_model", + unfitted_mtp="08.almtp", + fitted_mtp_savedir=tmpdir + ) + model = MTPWithMLIP3(mtp_args) # Call the train method + mtp_inputs = MTPInputs( + structure=[FakeStructure(['H', 'O']), FakeStructure(['Si'])], + forces=[], + energy=[1, 2] + ) - return_code = model.train( - train_structures=[FakeStructure(['H', 'O']), FakeStructure(['Si'])], - train_energies=[1, 2], - train_forces=[], - train_stresses=[], - unfitted_mtp="08.almtp", - fitted_mtp_savedir="/mock/dir" + _ = model.train( + mtp_inputs, ) - # Assert the expected results - assert return_code == 0 # The train method should return the mocked subprocess success return code from mock_open # Assert that mocked methods were called model.write_cfg.assert_called() @@ -91,19 +96,17 @@ def mtp_instance(mocker): return instance -# def test_evaluate(mocker, fake_structure, mtp_instance, mock_popen): def test_evaluate(mocker, fake_structure, mtp_instance, mock_popen): test_structures = [fake_structure] test_energies = [1.0] test_forces = [[[0, 0, 0]]] - test_stresses = None # or appropriate mock stresses # Mock check_structures_forces_stresses to return the arguments unmodified - mocker.patch("crystal_diffusion.models.mtp.check_structures_forces_stresses", + mocker.patch("crystal_diffusion.models.mlip.mtp.check_structures_forces_stresses", side_effect=lambda s, f, st: (s, f, st)) # Mock pool_from to return a mocked value - mocker.patch("crystal_diffusion.models.mtp.pool_from", return_value="mock_pool") + mocker.patch("crystal_diffusion.models.mlip.mtp.pool_from", return_value="mock_pool") # Mock self.write_cfg to simulate creating a config file without file operations mocker.patch.object(MTPWithMLIP3, "write_cfg", return_value="mock_filename.cfg") @@ -116,8 +119,14 @@ def test_evaluate(mocker, fake_structure, mtp_instance, mock_popen): mocker.patch("shutil.copyfile", return_value=None) mocker.patch("os.path.exists", return_value=True) + mtp_inputs = MTPInputs( + structure=test_structures, + forces=test_forces, + energy=test_energies + ) + # Perform the test - df_orig, df_predict = mtp_instance.evaluate(test_structures, test_energies, test_forces, test_stresses) + df_orig, df_predict = mtp_instance.evaluate(mtp_inputs) # Assertions can vary based on the real output of `read_cfgs` # Here's an example assertion assuming `read_cfgs` returns a string in this mocked scenario @@ -209,12 +218,13 @@ def test_extract_energy_from_thermo_log(tmpdir): @pytest.fixture def mock_extract_energy_from_thermo_log(mocker): - return mocker.patch('crystal_diffusion.train_mtp.extract_energy_from_thermo_log', return_value=[]) + return mocker.patch('crystal_diffusion.mlip.mtp_utils.extract_energy_from_thermo_log', return_value=[]) @pytest.fixture def mock_extract_structure_and_forces(mocker): - return mocker.patch('crystal_diffusion.train_mtp.extract_structure_and_forces_from_file', return_value=([], [])) + return mocker.patch('crystal_diffusion.mlip.mtp_utils.extract_structure_and_forces_from_file', + return_value=([], [])) def test_prepare_mtp_inputs_from_lammps(mock_extract_structure_and_forces, mock_extract_energy_from_thermo_log, tmpdir): @@ -230,7 +240,7 @@ def test_prepare_mtp_inputs_from_lammps(mock_extract_structure_and_forces, mock_ # Verify that the mocks were called correctly assert mock_extract_structure_and_forces.call_count == 2 - mock_extract_structure_and_forces.assert_called_with(output_yaml_files[1], atom_dict) + mock_extract_structure_and_forces.assert_called_with(output_yaml_files[1], atom_dict, True) assert mock_extract_energy_from_thermo_log.call_count == 2 mock_extract_energy_from_thermo_log.assert_called_with(thermo_yaml_files[1])